From 2060f13de121a4a1f31d3656c63db429450e0da0 Mon Sep 17 00:00:00 2001 From: Sam Ford <1584702+samford@users.noreply.github.com> Date: Mon, 21 Dec 2020 00:48:31 -0500 Subject: [PATCH] Refactor livecheck strategies around match regex --- Library/Homebrew/livecheck/strategy/apache.rb | 23 ++++++------- Library/Homebrew/livecheck/strategy/cpan.rb | 21 ++++++------ .../livecheck/strategy/github_latest.rb | 10 ++++-- Library/Homebrew/livecheck/strategy/gnome.rb | 12 ++++--- Library/Homebrew/livecheck/strategy/gnu.rb | 33 ++++--------------- .../Homebrew/livecheck/strategy/hackage.rb | 21 +++++++++--- .../Homebrew/livecheck/strategy/launchpad.rb | 9 +++-- Library/Homebrew/livecheck/strategy/npm.rb | 11 ++++--- Library/Homebrew/livecheck/strategy/pypi.rb | 33 ++++++++++++------- .../livecheck/strategy/sourceforge.rb | 19 +++++------ Library/Homebrew/livecheck/strategy/xorg.rb | 18 +++++++--- 11 files changed, 116 insertions(+), 94 deletions(-) diff --git a/Library/Homebrew/livecheck/strategy/apache.rb b/Library/Homebrew/livecheck/strategy/apache.rb index 1ab87092d5..c96f0b903c 100644 --- a/Library/Homebrew/livecheck/strategy/apache.rb +++ b/Library/Homebrew/livecheck/strategy/apache.rb @@ -22,7 +22,14 @@ module Homebrew # @api public class Apache # The `Regexp` used to determine if the strategy applies to the URL. - URL_MATCH_REGEX = %r{www\.apache\.org/dyn/.+path=.+}i.freeze + URL_MATCH_REGEX = %r{ + ^https?://www\.apache\.org + /dyn/.+path= + (?.+?)/ # Path to directory of files or version directories + (?[^/]*?) # Any text in filename or directory before version + v?\d+(?:\.\d+)+ # The numeric version + (?/|[^/]*) # Any text in filename or directory after version + }ix.freeze # Whether the strategy can be applied to the provided URL. # @@ -39,25 +46,19 @@ module Homebrew # @param regex [Regexp] a regex used for matching versions in content # @return [Hash] def self.find_versions(url, regex = nil, &block) - %r{ - path= - (?.+?)/ # Path to directory of files or version directories - (?[^/]*?) # Any text in filename or directory before version - v?\d+(?:\.\d+)+ # The numeric version - (?/|[^/]*) # Any text in filename or directory after version - }ix =~ url + match = url.match(URL_MATCH_REGEX) # Use `\.t` instead of specific tarball extensions (e.g. .tar.gz) - suffix.sub!(/\.t(?:ar\..+|[a-z0-9]+)$/i, "\.t") + suffix = match[:suffix].sub(/\.t(?:ar\..+|[a-z0-9]+)$/i, "\.t") # Example URL: `https://archive.apache.org/dist/example/` - page_url = "https://archive.apache.org/dist/#{path}/" + page_url = "https://archive.apache.org/dist/#{match[:path]}/" # Example directory regex: `%r{href=["']?v?(\d+(?:\.\d+)+)/}i` # Example file regexes: # * `/href=["']?example-v?(\d+(?:\.\d+)+)\.t/i` # * `/href=["']?example-v?(\d+(?:\.\d+)+)-bin\.zip/i` - regex ||= /href=["']?#{Regexp.escape(prefix)}v?(\d+(?:\.\d+)+)#{Regexp.escape(suffix)}/i + regex ||= /href=["']?#{Regexp.escape(match[:prefix])}v?(\d+(?:\.\d+)+)#{Regexp.escape(suffix)}/i PageMatch.find_versions(page_url, regex, &block) end diff --git a/Library/Homebrew/livecheck/strategy/cpan.rb b/Library/Homebrew/livecheck/strategy/cpan.rb index 5bf242830b..235254254c 100644 --- a/Library/Homebrew/livecheck/strategy/cpan.rb +++ b/Library/Homebrew/livecheck/strategy/cpan.rb @@ -21,7 +21,13 @@ module Homebrew NICE_NAME = "CPAN" # The `Regexp` used to determine if the strategy applies to the URL. - URL_MATCH_REGEX = %r{^https?://cpan\.metacpan\.org/authors/id(?:/[^/]+){3,}/[^/]+}i.freeze + URL_MATCH_REGEX = %r{ + ^https?://cpan\.metacpan\.org + (?/authors/id(?:/[^/]+){3,}/) # Path before the filename + (?[^/]+) # Filename text before the version + -v?\d+(?:\.\d+)* # The numeric version + (?[^/]+) # Filename text after the version + }ix.freeze # Whether the strategy can be applied to the provided URL. # @@ -38,21 +44,16 @@ module Homebrew # @param regex [Regexp] a regex used for matching versions in content # @return [Hash] def self.find_versions(url, regex = nil, &block) - %r{ - (?/authors/id(?:/[^/]+){3,}/) # Path before the filename - (?[^/]+) # Filename text before the version - -v?\d+(?:\.\d+)* # The numeric version - (?[^/]+) # Filename text after the version - }ix =~ url + match = url.match(URL_MATCH_REGEX) # Use `\.t` instead of specific tarball extensions (e.g. .tar.gz) - suffix.sub!(/\.t(?:ar\..+|[a-z0-9]+)$/i, "\.t") + suffix = match[:suffix].sub(/\.t(?:ar\..+|[a-z0-9]+)$/i, "\.t") # The directory listing page where the archive files are found - page_url = "https://cpan.metacpan.org#{path}" + page_url = "https://cpan.metacpan.org#{match[:path]}" # Example regex: `/href=.*?Brew[._-]v?(\d+(?:\.\d+)*)\.t/i` - regex ||= /href=.*?#{prefix}[._-]v?(\d+(?:\.\d+)*)#{Regexp.escape(suffix)}/i + regex ||= /href=.*?#{match[:prefix]}[._-]v?(\d+(?:\.\d+)*)#{Regexp.escape(suffix)}/i PageMatch.find_versions(page_url, regex, &block) end diff --git a/Library/Homebrew/livecheck/strategy/github_latest.rb b/Library/Homebrew/livecheck/strategy/github_latest.rb index 714e761fd5..8197e635ad 100644 --- a/Library/Homebrew/livecheck/strategy/github_latest.rb +++ b/Library/Homebrew/livecheck/strategy/github_latest.rb @@ -40,7 +40,11 @@ module Homebrew PRIORITY = 0 # The `Regexp` used to determine if the strategy applies to the URL. - URL_MATCH_REGEX = %r{//github\.com(?:/downloads)?(?:/[^/]+){2}}i.freeze + URL_MATCH_REGEX = %r{ + ^https?://github\.com + /(?:downloads/)?(?[^/]+) # The GitHub username + /(?[^/]+) # The GitHub repository name + }ix.freeze # Whether the strategy can be applied to the provided URL. # @@ -57,10 +61,10 @@ module Homebrew # @param regex [Regexp] a regex used for matching versions in content # @return [Hash] def self.find_versions(url, regex = nil, &block) - %r{github\.com/(?:downloads/)?(?[^/]+)/(?[^/]+)}i =~ url.sub(/\.git$/i, "") + match = url.sub(/\.git$/i, "").match(URL_MATCH_REGEX) # Example URL: `https://github.com/example/example/releases/latest` - page_url = "https://github.com/#{username}/#{repository}/releases/latest" + page_url = "https://github.com/#{match[:username]}/#{match[:repository]}/releases/latest" # The default regex is the same for all URLs using this strategy regex ||= %r{href=.*?/tag/v?(\d+(?:\.\d+)+)["' >]}i diff --git a/Library/Homebrew/livecheck/strategy/gnome.rb b/Library/Homebrew/livecheck/strategy/gnome.rb index e30e61372f..19ea4ed70c 100644 --- a/Library/Homebrew/livecheck/strategy/gnome.rb +++ b/Library/Homebrew/livecheck/strategy/gnome.rb @@ -20,7 +20,11 @@ module Homebrew NICE_NAME = "GNOME" # The `Regexp` used to determine if the strategy applies to the URL. - URL_MATCH_REGEX = %r{^https?://download\.gnome\.org/sources/[^/]+/}i.freeze + URL_MATCH_REGEX = %r{ + ^https?://download\.gnome\.org + /sources + /(?[^/]+)/ # The GNOME package name + }ix.freeze # Whether the strategy can be applied to the provided URL. # @@ -37,9 +41,9 @@ module Homebrew # @param regex [Regexp] a regex used for matching versions in content # @return [Hash] def self.find_versions(url, regex = nil, &block) - %r{/sources/(?[^/]+)/}i =~ url + match = url.match(URL_MATCH_REGEX) - page_url = "https://download.gnome.org/sources/#{package_name}/cache.json" + page_url = "https://download.gnome.org/sources/#{match[:package_name]}/cache.json" # GNOME archive files seem to use a standard filename format, so we # count on the delimiter between the package name and numeric version @@ -51,7 +55,7 @@ module Homebrew # development versions. See: https://www.gnome.org/gnome-3/source/ # # Example regex: `/example-(\d+\.([0-8]\d*?)?[02468](?:\.\d+)*?)\.t/i` - regex ||= /#{Regexp.escape(package_name)}-(\d+\.([0-8]\d*?)?[02468](?:\.\d+)*?)\.t/i + regex ||= /#{Regexp.escape(match[:package_name])}-(\d+\.([0-8]\d*?)?[02468](?:\.\d+)*?)\.t/i PageMatch.find_versions(page_url, regex, &block) end diff --git a/Library/Homebrew/livecheck/strategy/gnu.rb b/Library/Homebrew/livecheck/strategy/gnu.rb index 35fb951475..8f2a5bcdca 100644 --- a/Library/Homebrew/livecheck/strategy/gnu.rb +++ b/Library/Homebrew/livecheck/strategy/gnu.rb @@ -33,18 +33,11 @@ module Homebrew # The `Regexp` used to determine if the strategy applies to the URL. URL_MATCH_REGEX = %r{ - //.+?\.gnu\.org$| - gnu\.org/(?:gnu|software)/ + ^https?:// + (?:(?:[^/]+?\.)*gnu\.org/(?:gnu|software)/(?[^/]+)/ + |(?[^/]+)\.gnu\.org/?$) }ix.freeze - # The `Regexp` used to parse the project name from the provided URL. - # The strategy uses this information to create the URL to check and - # the default regex. - PROJECT_NAME_REGEXES = [ - %r{/(?:gnu|software)/(?.+?)/}i, - %r{//(?.+?)\.gnu\.org(?:/)?$}i, - ].freeze - # Whether the strategy can be applied to the provided URL. # # @param url [String] the URL to match against @@ -60,24 +53,10 @@ module Homebrew # @param regex [Regexp] a regex used for matching versions in content # @return [Hash] def self.find_versions(url, regex = nil, &block) - project_names = PROJECT_NAME_REGEXES.map do |project_name_regex| - m = url.match(project_name_regex) - m["project_name"] if m - end.compact - return { matches: {}, regex: regex, url: url } if project_names.blank? - - if project_names.length > 1 - odebug <<~EOS - - Multiple project names found: #{match_list} - - EOS - end - - project_name = project_names.first + match = url.match(URL_MATCH_REGEX) # The directory listing page for the project's files - page_url = "http://ftp.gnu.org/gnu/#{project_name}/?C=M&O=D" + page_url = "http://ftp.gnu.org/gnu/#{match[:project_name]}/?C=M&O=D" # The default regex consists of the following parts: # * `href=.*?`: restricts matching to URLs in `href` attributes @@ -87,7 +66,7 @@ module Homebrew # * `(?:\.[a-z]+|/)`: the file extension (a trailing delimiter) # # Example regex: `%r{href=.*?example[._-]v?(\d+(?:\.\d+)*)(?:\.[a-z]+|/)}i` - regex ||= %r{href=.*?#{project_name}[._-]v?(\d+(?:\.\d+)*)(?:\.[a-z]+|/)}i + regex ||= %r{href=.*?#{match[:project_name]}[._-]v?(\d+(?:\.\d+)*)(?:\.[a-z]+|/)}i PageMatch.find_versions(page_url, regex, &block) end diff --git a/Library/Homebrew/livecheck/strategy/hackage.rb b/Library/Homebrew/livecheck/strategy/hackage.rb index 04cfddcee2..cb986e34ba 100644 --- a/Library/Homebrew/livecheck/strategy/hackage.rb +++ b/Library/Homebrew/livecheck/strategy/hackage.rb @@ -17,8 +17,19 @@ module Homebrew # # @api public class Hackage - # The `Regexp` used to determine if the strategy applies to the URL. - URL_MATCH_REGEX = %r{^https?://(?:downloads|hackage)\.haskell\.org(?:/[^/]+){3}}i.freeze + # A `Regexp` used in determining if the strategy applies to the URL and + # also as part of extracting the package name from the URL basename. + PACKAGE_NAME_REGEX = /(?.+?)-\d+/i.freeze + + # A `Regexp` used to extract the package name from the URL basename. + FILENAME_REGEX = /^#{PACKAGE_NAME_REGEX.source.strip}/i.freeze + + # A `Regexp` used in determining if the strategy applies to the URL. + URL_MATCH_REGEX = %r{ + ^https?://(?:downloads|hackage)\.haskell\.org + (?:/[^/]+)+ # Path before the filename + #{PACKAGE_NAME_REGEX.source.strip} + }ix.freeze # Whether the strategy can be applied to the provided URL. # @@ -35,13 +46,13 @@ module Homebrew # @param regex [Regexp] a regex used for matching versions in content # @return [Hash] def self.find_versions(url, regex = nil, &block) - /^(?.+?)-\d+/i =~ File.basename(url) + match = File.basename(url).match(FILENAME_REGEX) # A page containing a directory listing of the latest source tarball - page_url = "https://hackage.haskell.org/package/#{package_name}/src/" + page_url = "https://hackage.haskell.org/package/#{match[:package_name]}/src/" # Example regex: `%r{

example-(.*?)/?

}i` - regex ||= %r{

#{Regexp.escape(package_name)}-(.*?)/?

}i + regex ||= %r{

#{Regexp.escape(match[:package_name])}-(.*?)/?

}i PageMatch.find_versions(page_url, regex, &block) end diff --git a/Library/Homebrew/livecheck/strategy/launchpad.rb b/Library/Homebrew/livecheck/strategy/launchpad.rb index cf5c286c54..a871973b03 100644 --- a/Library/Homebrew/livecheck/strategy/launchpad.rb +++ b/Library/Homebrew/livecheck/strategy/launchpad.rb @@ -24,7 +24,10 @@ module Homebrew # @api public class Launchpad # The `Regexp` used to determine if the strategy applies to the URL. - URL_MATCH_REGEX = /launchpad\.net/i.freeze + URL_MATCH_REGEX = %r{ + ^https?://(?:[^/]+?\.)*launchpad\.net + /(?[^/]+) # The Launchpad project name + }ix.freeze # Whether the strategy can be applied to the provided URL. # @@ -41,10 +44,10 @@ module Homebrew # @param regex [Regexp] a regex used for matching versions in content # @return [Hash] def self.find_versions(url, regex = nil, &block) - %r{launchpad\.net/(?[^/]+)}i =~ url + match = url.match(URL_MATCH_REGEX) # The main page for the project on Launchpad - page_url = "https://launchpad.net/#{project_name}" + page_url = "https://launchpad.net/#{match[:project_name]}" # The default regex is the same for all URLs using this strategy regex ||= %r{class="[^"]*version[^"]*"[^>]*>\s*Latest version is (.+)\s*.+?)/-/ # The npm package name + }ix.freeze # Whether the strategy can be applied to the provided URL. # @@ -37,14 +40,14 @@ module Homebrew # @param regex [Regexp] a regex used for matching versions in content # @return [Hash] def self.find_versions(url, regex = nil, &block) - %r{registry\.npmjs\.org/(?(?:[^/]+/)?[^/]+)/-/}i =~ url + match = url.match(URL_MATCH_REGEX) - page_url = "https://www.npmjs.com/package/#{package_name}?activeTab=versions" + page_url = "https://www.npmjs.com/package/#{match[:package_name]}?activeTab=versions" # Example regexes: # * `%r{href=.*?/package/example/v/(\d+(?:\.\d+)+)"}i` # * `%r{href=.*?/package/@example/example/v/(\d+(?:\.\d+)+)"}i` - regex ||= %r{href=.*?/package/#{Regexp.escape(package_name)}/v/(\d+(?:\.\d+)+)"}i + regex ||= %r{href=.*?/package/#{Regexp.escape(match[:package_name])}/v/(\d+(?:\.\d+)+)"}i PageMatch.find_versions(page_url, regex, &block) end diff --git a/Library/Homebrew/livecheck/strategy/pypi.rb b/Library/Homebrew/livecheck/strategy/pypi.rb index ba03c41a6d..45a07374c5 100644 --- a/Library/Homebrew/livecheck/strategy/pypi.rb +++ b/Library/Homebrew/livecheck/strategy/pypi.rb @@ -19,8 +19,21 @@ module Homebrew class Pypi NICE_NAME = "PyPI" + # The `Regexp` used to extract the package name and suffix (e.g., file + # extension) from the URL basename. + FILENAME_REGEX = / + (?.+)- # The package name followed by a hyphen + .*? # The version string + (?\.tar\.[a-z0-9]+|\.[a-z0-9]+)$ # Filename extension + /ix.freeze + # The `Regexp` used to determine if the strategy applies to the URL. - URL_MATCH_REGEX = %r{^https?://files\.pythonhosted\.org/packages(?:/[^/]+){4}i}.freeze + URL_MATCH_REGEX = %r{ + ^https?://files\.pythonhosted\.org + /packages + (?:/[^/]+)+ # The hexadecimal paths before the filename + /#{FILENAME_REGEX.source.strip} # The filename + }ix.freeze # Whether the strategy can be applied to the provided URL. # @@ -37,23 +50,19 @@ module Homebrew # @param regex [Regexp] a regex used for matching versions in content # @return [Hash] def self.find_versions(url, regex = nil, &block) - / - (?.+)- # The package name followed by a hyphen - .*? # The version string - (?\.tar\.[a-z0-9]+|\.[a-z0-9]+)$ # Filename extension - /ix =~ File.basename(url) + match = File.basename(url).match(FILENAME_REGEX) # Use `\.t` instead of specific tarball extensions (e.g. .tar.gz) - suffix.sub!(/\.t(?:ar\..+|[a-z0-9]+)$/i, "\.t") + suffix = match[:suffix].sub(/\.t(?:ar\..+|[a-z0-9]+)$/i, "\.t") # It's not technically necessary to have the `#files` fragment at the # end of the URL but it makes the debug output a bit more useful. - page_url = "https://pypi.org/project/#{package_name.gsub(/%20|_/, "-")}/#files" + page_url = "https://pypi.org/project/#{match[:package_name].gsub(/%20|_/, "-")}/#files" - # Example regex: `%r{href=.*?/packages.*?/example[._-]v?(\d+(?:\.\d+)*).t}i`. - regex ||= - %r{href=.*?/packages.*?/#{Regexp.escape(package_name)}[._-] - v?(\d+(?:\.\d+)*(.post\d+)?)#{Regexp.escape(suffix)}}ix + # Example regex: `%r{href=.*?/packages.*?/example[._-]v?(\d+(?:\.\d+)*(?:[._-]post\d+)?)\.t}i` + re_package_name = Regexp.escape(match[:package_name]) + re_suffix = Regexp.escape(suffix) + regex ||= %r{href=.*?/packages.*?/#{re_package_name}[._-]v?(\d+(?:\.\d+)*(?:[._-]post\d+)?)#{re_suffix}}i PageMatch.find_versions(page_url, regex, &block) end diff --git a/Library/Homebrew/livecheck/strategy/sourceforge.rb b/Library/Homebrew/livecheck/strategy/sourceforge.rb index e58a118ae5..f3a91dd66c 100644 --- a/Library/Homebrew/livecheck/strategy/sourceforge.rb +++ b/Library/Homebrew/livecheck/strategy/sourceforge.rb @@ -34,7 +34,12 @@ module Homebrew NICE_NAME = "SourceForge" # The `Regexp` used to determine if the strategy applies to the URL. - URL_MATCH_REGEX = /(?:sourceforge|sf)\.net/i.freeze + URL_MATCH_REGEX = %r{ + ^https?://(?:[^/]+?\.)*(?:sourceforge|sf)\.net + (?:/projects?/(?[^/]+)/ + |/p/(?[^/]+)/ + |(?::/cvsroot)?/(?[^/]+)) + }ix.freeze # Whether the strategy can be applied to the provided URL. # @@ -51,20 +56,14 @@ module Homebrew # @param regex [Regexp] a regex used for matching versions in content # @return [Hash] def self.find_versions(url, regex = nil, &block) - if url.include?("/project") - %r{/projects?/(?[^/]+)/}i =~ url - elsif url.include?(".net/p/") - %r{\.net/p/(?[^/]+)/}i =~ url - else - %r{\.net(?::/cvsroot)?/(?[^/]+)}i =~ url - end + match = url.match(URL_MATCH_REGEX) - page_url = "https://sourceforge.net/projects/#{project_name}/rss" + page_url = "https://sourceforge.net/projects/#{match[:project_name]}/rss" # It may be possible to improve the default regex but there's quite a # bit of variation between projects and it can be challenging to # create something that works for most URLs. - regex ||= %r{url=.*?/#{Regexp.escape(project_name)}/files/.*?[-_/](\d+(?:[-.]\d+)+)[-_/%.]}i + regex ||= %r{url=.*?/#{Regexp.escape(match[:project_name])}/files/.*?[-_/](\d+(?:[-.]\d+)+)[-_/%.]}i PageMatch.find_versions(page_url, regex, &block) end diff --git a/Library/Homebrew/livecheck/strategy/xorg.rb b/Library/Homebrew/livecheck/strategy/xorg.rb index 7800d96696..79dddc0b23 100644 --- a/Library/Homebrew/livecheck/strategy/xorg.rb +++ b/Library/Homebrew/livecheck/strategy/xorg.rb @@ -40,10 +40,18 @@ module Homebrew class Xorg NICE_NAME = "X.Org" + # A `Regexp` used in determining if the strategy applies to the URL and + # also as part of extracting the module name from the URL basename. + MODULE_REGEX = /(?.+)-\d+/i.freeze + + # A `Regexp` used to extract the module name from the URL basename. + FILENAME_REGEX = /^#{MODULE_REGEX.source.strip}/i.freeze + # The `Regexp` used to determine if the strategy applies to the URL. URL_MATCH_REGEX = %r{ - [/.]x\.org.*?/individual/| - freedesktop\.org/(?:archive|dist|software)/ + ^https?://(?:[^/]+?\.)* # Scheme and any leading subdomains + (?:x\.org/(?:[^/]+/)*individual/(?:[^/]+/)*#{MODULE_REGEX.source.strip} + |freedesktop\.org/(?:archive|dist|software)/(?:[^/]+/)*#{MODULE_REGEX.source.strip}) }ix.freeze # Used to cache page content, so we don't fetch the same pages @@ -72,15 +80,15 @@ module Homebrew # @return [Hash] def self.find_versions(url, regex = nil, &block) file_name = File.basename(url) - /^(?.+)-\d+/i =~ file_name + match = file_name.match(FILENAME_REGEX) # /pub/ URLs redirect to the same URL with /archive/, so we replace # it to avoid the redirection. Removing the filename from the end of # the URL gives us the relevant directory listing page. page_url = url.sub("x.org/pub/", "x.org/archive/").delete_suffix(file_name) - # Example regex: /href=.*?example[._-]v?(\d+(?:\.\d+)+)\.t/i - regex ||= /href=.*?#{Regexp.escape(module_name)}[._-]v?(\d+(?:\.\d+)+)\.t/i + # Example regex: `/href=.*?example[._-]v?(\d+(?:\.\d+)+)\.t/i` + regex ||= /href=.*?#{Regexp.escape(match[:module_name])}[._-]v?(\d+(?:\.\d+)+)\.t/i # Use the cached page content to avoid duplicate fetches cached_content = @page_data[page_url]