Merge pull request #9519 from vladimyr/livecheck-url-patterns

livecheck: update URL patterns
This commit is contained in:
Sam Ford 2020-12-31 12:03:29 -05:00 committed by GitHub
commit 425a2a675b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 120 additions and 94 deletions

View File

@ -22,7 +22,14 @@ module Homebrew
# @api public
class Apache
# The `Regexp` used to determine if the strategy applies to the URL.
URL_MATCH_REGEX = %r{www\.apache\.org/dyn/.+path=.+}i.freeze
URL_MATCH_REGEX = %r{
^https?://www\.apache\.org
/dyn/.+path=
(?<path>.+?)/ # Path to directory of files or version directories
(?<prefix>[^/]*?) # Any text in filename or directory before version
v?\d+(?:\.\d+)+ # The numeric version
(?<suffix>/|[^/]*) # Any text in filename or directory after version
}ix.freeze
# Whether the strategy can be applied to the provided URL.
#
@ -39,25 +46,19 @@ module Homebrew
# @param regex [Regexp] a regex used for matching versions in content
# @return [Hash]
def self.find_versions(url, regex = nil, &block)
# Parses `url` for the directory path and the text surrounding the version,
# then passes an archive.apache.org listing URL and a regex (the caller's,
# or a default built from the parsed prefix/suffix) to
# `PageMatch.find_versions`, whose result is returned.
#
# NOTE(review): this span appears to be a diff hunk shown without +/-
# markers, so two forms of each step are listed back to back: an inline
# `%r{...}ix =~ url` (a regex literal on the left of `=~` assigns its named
# captures to the locals `path`, `prefix` and `suffix`) and a
# `url.match(URL_MATCH_REGEX)` form that reads the captures from `match`.
%r{
path=
(?<path>.+?)/ # Path to directory of files or version directories
(?<prefix>[^/]*?) # Any text in filename or directory before version
v?\d+(?:\.\d+)+ # The numeric version
(?<suffix>/|[^/]*) # Any text in filename or directory after version
}ix =~ url
# NOTE(review): `String#match` returns nil when `url` does not match, and
# the `match[...]` lookups below would then raise NoMethodError. No guard is
# visible in this block - presumably callers check `match?` first; confirm.
match = url.match(URL_MATCH_REGEX)
# Use `\.t` instead of specific tarball extensions (e.g. .tar.gz)
# (In a double-quoted string `"\.t"` is simply `.t`; it is escaped again by
# `Regexp.escape` when the default regex is built.)
suffix.sub!(/\.t(?:ar\..+|[a-z0-9]+)$/i, "\.t")
suffix = match[:suffix].sub(/\.t(?:ar\..+|[a-z0-9]+)$/i, "\.t")
# Example URL: `https://archive.apache.org/dist/example/`
page_url = "https://archive.apache.org/dist/#{path}/"
page_url = "https://archive.apache.org/dist/#{match[:path]}/"
# Example directory regex: `%r{href=["']?v?(\d+(?:\.\d+)+)/}i`
# Example file regexes:
# * `/href=["']?example-v?(\d+(?:\.\d+)+)\.t/i`
# * `/href=["']?example-v?(\d+(?:\.\d+)+)-bin\.zip/i`
# `||=` keeps a caller-supplied regex and only builds the default otherwise.
regex ||= /href=["']?#{Regexp.escape(prefix)}v?(\d+(?:\.\d+)+)#{Regexp.escape(suffix)}/i
regex ||= /href=["']?#{Regexp.escape(match[:prefix])}v?(\d+(?:\.\d+)+)#{Regexp.escape(suffix)}/i
PageMatch.find_versions(page_url, regex, &block)
end

View File

@ -21,7 +21,13 @@ module Homebrew
NICE_NAME = "CPAN"
# The `Regexp` used to determine if the strategy applies to the URL.
URL_MATCH_REGEX = %r{^https?://cpan\.metacpan\.org/authors/id(?:/[^/]+){3,}/[^/]+}i.freeze
URL_MATCH_REGEX = %r{
^https?://cpan\.metacpan\.org
(?<path>/authors/id(?:/[^/]+){3,}/) # Path before the filename
(?<prefix>[^/]+) # Filename text before the version
-v?\d+(?:\.\d+)* # The numeric version
(?<suffix>[^/]+) # Filename text after the version
}ix.freeze
# Whether the strategy can be applied to the provided URL.
#
@ -38,21 +44,16 @@ module Homebrew
# @param regex [Regexp] a regex used for matching versions in content
# @return [Hash]
def self.find_versions(url, regex = nil, &block)
# Splits `url` into the `/authors/id/...` directory path and the filename
# text before and after the version, then passes the cpan.metacpan.org
# directory listing URL and a regex (the caller's, or a default built from
# the parsed prefix/suffix) to `PageMatch.find_versions`, whose result is
# returned.
#
# NOTE(review): this span appears to be a diff hunk shown without +/-
# markers, so an inline `%r{...}ix =~ url` form (which assigns the locals
# `path`, `prefix` and `suffix`) is listed alongside a
# `url.match(URL_MATCH_REGEX)` form for each step.
%r{
(?<path>/authors/id(?:/[^/]+){3,}/) # Path before the filename
(?<prefix>[^/]+) # Filename text before the version
-v?\d+(?:\.\d+)* # The numeric version
(?<suffix>[^/]+) # Filename text after the version
}ix =~ url
# NOTE(review): `String#match` returns nil when `url` does not match, and
# the `match[...]` lookups below would then raise NoMethodError. No guard is
# visible in this block - presumably callers check `match?` first; confirm.
match = url.match(URL_MATCH_REGEX)
# Use `\.t` instead of specific tarball extensions (e.g. .tar.gz)
suffix.sub!(/\.t(?:ar\..+|[a-z0-9]+)$/i, "\.t")
suffix = match[:suffix].sub(/\.t(?:ar\..+|[a-z0-9]+)$/i, "\.t")
# The directory listing page where the archive files are found
page_url = "https://cpan.metacpan.org#{path}"
page_url = "https://cpan.metacpan.org#{match[:path]}"
# Example regex: `/href=.*?Brew[._-]v?(\d+(?:\.\d+)*)\.t/i`
# NOTE(review): the prefix is interpolated without `Regexp.escape` (the
# suffix is escaped), so any regex metacharacters in the filename prefix are
# treated as pattern syntax rather than literal text.
regex ||= /href=.*?#{prefix}[._-]v?(\d+(?:\.\d+)*)#{Regexp.escape(suffix)}/i
regex ||= /href=.*?#{match[:prefix]}[._-]v?(\d+(?:\.\d+)*)#{Regexp.escape(suffix)}/i
PageMatch.find_versions(page_url, regex, &block)
end

View File

@ -40,7 +40,11 @@ module Homebrew
PRIORITY = 0
# The `Regexp` used to determine if the strategy applies to the URL.
URL_MATCH_REGEX = %r{//github\.com(?:/downloads)?(?:/[^/]+){2}}i.freeze
URL_MATCH_REGEX = %r{
^https?://github\.com
/(?:downloads/)?(?<username>[^/]+) # The GitHub username
/(?<repository>[^/]+) # The GitHub repository name
}ix.freeze
# Whether the strategy can be applied to the provided URL.
#
@ -57,10 +61,10 @@ module Homebrew
# @param regex [Regexp] a regex used for matching versions in content
# @return [Hash]
def self.find_versions(url, regex = nil, &block)
%r{github\.com/(?:downloads/)?(?<username>[^/]+)/(?<repository>[^/]+)}i =~ url.sub(/\.git$/i, "")
match = url.sub(/\.git$/i, "").match(URL_MATCH_REGEX)
# Example URL: `https://github.com/example/example/releases/latest`
page_url = "https://github.com/#{username}/#{repository}/releases/latest"
page_url = "https://github.com/#{match[:username]}/#{match[:repository]}/releases/latest"
# The default regex is the same for all URLs using this strategy
regex ||= %r{href=.*?/tag/v?(\d+(?:\.\d+)+)["' >]}i

View File

@ -20,7 +20,11 @@ module Homebrew
NICE_NAME = "GNOME"
# The `Regexp` used to determine if the strategy applies to the URL.
URL_MATCH_REGEX = /download\.gnome\.org/i.freeze
URL_MATCH_REGEX = %r{
^https?://download\.gnome\.org
/sources
/(?<package_name>[^/]+)/ # The GNOME package name
}ix.freeze
# Whether the strategy can be applied to the provided URL.
#
@ -37,9 +41,9 @@ module Homebrew
# @param regex [Regexp] a regex used for matching versions in content
# @return [Hash]
def self.find_versions(url, regex = nil, &block)
%r{/sources/(?<package_name>.*?)/}i =~ url
match = url.match(URL_MATCH_REGEX)
page_url = "https://download.gnome.org/sources/#{package_name}/cache.json"
page_url = "https://download.gnome.org/sources/#{match[:package_name]}/cache.json"
# GNOME archive files seem to use a standard filename format, so we
# count on the delimiter between the package name and numeric version
@ -51,7 +55,7 @@ module Homebrew
# development versions. See: https://www.gnome.org/gnome-3/source/
#
# Example regex: `/example-(\d+\.([0-8]\d*?)?[02468](?:\.\d+)*?)\.t/i`
regex ||= /#{Regexp.escape(package_name)}-(\d+\.([0-8]\d*?)?[02468](?:\.\d+)*?)\.t/i
regex ||= /#{Regexp.escape(match[:package_name])}-(\d+\.([0-8]\d*?)?[02468](?:\.\d+)*?)\.t/i
PageMatch.find_versions(page_url, regex, &block)
end

View File

@ -33,18 +33,11 @@ module Homebrew
# The `Regexp` used to determine if the strategy applies to the URL.
URL_MATCH_REGEX = %r{
//.+?\.gnu\.org$|
gnu\.org/(?:gnu|software)/
^https?://
(?:(?:[^/]+?\.)*gnu\.org/(?:gnu|software)/(?<project_name>[^/]+)/
|(?<project_name>[^/]+)\.gnu\.org/?$)
}ix.freeze
# The `Regexp` used to parse the project name from the provided URL.
# The strategy uses this information to create the URL to check and
# the default regex.
PROJECT_NAME_REGEXES = [
%r{/(?:gnu|software)/(?<project_name>.+?)/}i,
%r{//(?<project_name>.+?)\.gnu\.org(?:/)?$}i,
].freeze
# Whether the strategy can be applied to the provided URL.
#
# @param url [String] the URL to match against
@ -60,24 +53,10 @@ module Homebrew
# @param regex [Regexp] a regex used for matching versions in content
# @return [Hash]
def self.find_versions(url, regex = nil, &block)
project_names = PROJECT_NAME_REGEXES.map do |project_name_regex|
m = url.match(project_name_regex)
m["project_name"] if m
end.compact
return { matches: {}, regex: regex, url: url } if project_names.blank?
if project_names.length > 1
odebug <<~EOS
Multiple project names found: #{match_list}
EOS
end
project_name = project_names.first
match = url.match(URL_MATCH_REGEX)
# The directory listing page for the project's files
page_url = "http://ftp.gnu.org/gnu/#{project_name}/?C=M&O=D"
page_url = "http://ftp.gnu.org/gnu/#{match[:project_name]}/?C=M&O=D"
# The default regex consists of the following parts:
# * `href=.*?`: restricts matching to URLs in `href` attributes
@ -87,7 +66,7 @@ module Homebrew
# * `(?:\.[a-z]+|/)`: the file extension (a trailing delimiter)
#
# Example regex: `%r{href=.*?example[._-]v?(\d+(?:\.\d+)*)(?:\.[a-z]+|/)}i`
regex ||= %r{href=.*?#{project_name}[._-]v?(\d+(?:\.\d+)*)(?:\.[a-z]+|/)}i
regex ||= %r{href=.*?#{match[:project_name]}[._-]v?(\d+(?:\.\d+)*)(?:\.[a-z]+|/)}i
PageMatch.find_versions(page_url, regex, &block)
end

View File

@ -17,8 +17,19 @@ module Homebrew
#
# @api public
class Hackage
# The `Regexp` used to determine if the strategy applies to the URL.
URL_MATCH_REGEX = /(?:downloads|hackage)\.haskell\.org/i.freeze
# A `Regexp` used in determining if the strategy applies to the URL and
# also as part of extracting the package name from the URL basename.
PACKAGE_NAME_REGEX = /(?<package_name>.+?)-\d+/i.freeze
# A `Regexp` used to extract the package name from the URL basename.
FILENAME_REGEX = /^#{PACKAGE_NAME_REGEX.source.strip}/i.freeze
# A `Regexp` used in determining if the strategy applies to the URL.
URL_MATCH_REGEX = %r{
^https?://(?:downloads|hackage)\.haskell\.org
(?:/[^/]+)+ # Path before the filename
#{PACKAGE_NAME_REGEX.source.strip}
}ix.freeze
# Whether the strategy can be applied to the provided URL.
#
@ -35,13 +46,13 @@ module Homebrew
# @param regex [Regexp] a regex used for matching versions in content
# @return [Hash]
def self.find_versions(url, regex = nil, &block)
# Extracts the package name from the basename of `url` (the text before the
# first `-<digit>`), then passes the package's `/src/` page on
# hackage.haskell.org and a regex (the caller's, or a default built from the
# package name) to `PageMatch.find_versions`, whose result is returned.
#
# NOTE(review): this span appears to be a diff hunk shown without +/-
# markers, so an inline `/.../i =~ File.basename(url)` form (which assigns
# the local `package_name`) is listed alongside a
# `File.basename(url).match(FILENAME_REGEX)` form for each step.
/^(?<package_name>.+?)-\d+/i =~ File.basename(url)
# NOTE(review): `String#match` returns nil when the basename does not match,
# and `match[:package_name]` below would then raise NoMethodError. No guard
# is visible in this block - confirm callers rule that case out.
match = File.basename(url).match(FILENAME_REGEX)
# A page containing a directory listing of the latest source tarball
page_url = "https://hackage.haskell.org/package/#{package_name}/src/"
page_url = "https://hackage.haskell.org/package/#{match[:package_name]}/src/"
# Example regex: `%r{<h3>example-(.*?)/?</h3>}i`
# `||=` keeps a caller-supplied regex and only builds the default otherwise.
regex ||= %r{<h3>#{Regexp.escape(package_name)}-(.*?)/?</h3>}i
regex ||= %r{<h3>#{Regexp.escape(match[:package_name])}-(.*?)/?</h3>}i
PageMatch.find_versions(page_url, regex, &block)
end

View File

@ -24,7 +24,10 @@ module Homebrew
# @api public
class Launchpad
# The `Regexp` used to determine if the strategy applies to the URL.
URL_MATCH_REGEX = /launchpad\.net/i.freeze
URL_MATCH_REGEX = %r{
^https?://(?:[^/]+?\.)*launchpad\.net
/(?<project_name>[^/]+) # The Launchpad project name
}ix.freeze
# Whether the strategy can be applied to the provided URL.
#
@ -41,10 +44,10 @@ module Homebrew
# @param regex [Regexp] a regex used for matching versions in content
# @return [Hash]
def self.find_versions(url, regex = nil, &block)
%r{launchpad\.net/(?<project_name>[^/]+)}i =~ url
match = url.match(URL_MATCH_REGEX)
# The main page for the project on Launchpad
page_url = "https://launchpad.net/#{project_name}"
page_url = "https://launchpad.net/#{match[:project_name]}"
# The default regex is the same for all URLs using this strategy
regex ||= %r{class="[^"]*version[^"]*"[^>]*>\s*Latest version is (.+)\s*</}

View File

@ -20,7 +20,10 @@ module Homebrew
NICE_NAME = "npm"
# The `Regexp` used to determine if the strategy applies to the URL.
URL_MATCH_REGEX = /registry\.npmjs\.org/i.freeze
URL_MATCH_REGEX = %r{
^https?://registry\.npmjs\.org
/(?<package_name>.+?)/-/ # The npm package name
}ix.freeze
# Whether the strategy can be applied to the provided URL.
#
@ -37,14 +40,14 @@ module Homebrew
# @param regex [Regexp] a regex used for matching versions in content
# @return [Hash]
def self.find_versions(url, regex = nil, &block)
# Extracts the package name from `url` (the text between the registry host
# and `/-/`), then passes the package's versions tab on www.npmjs.com and a
# regex (the caller's, or a default built from the package name) to
# `PageMatch.find_versions`, whose result is returned.
#
# NOTE(review): this span appears to be a diff hunk shown without +/-
# markers, so an inline `%r{...}i =~ url` form (which assigns the local
# `package_name`) is listed alongside a `url.match(URL_MATCH_REGEX)` form
# for each step.
%r{registry\.npmjs\.org/(?<package_name>.+)/-/}i =~ url
# NOTE(review): `String#match` returns nil when `url` does not match, and
# `match[:package_name]` below would then raise NoMethodError. No guard is
# visible in this block - presumably callers check `match?` first; confirm.
match = url.match(URL_MATCH_REGEX)
page_url = "https://www.npmjs.com/package/#{package_name}?activeTab=versions"
page_url = "https://www.npmjs.com/package/#{match[:package_name]}?activeTab=versions"
# Example regexes:
# * `%r{href=.*?/package/example/v/(\d+(?:\.\d+)+)"}i`
# * `%r{href=.*?/package/@example/example/v/(\d+(?:\.\d+)+)"}i`
# `||=` keeps a caller-supplied regex and only builds the default otherwise.
regex ||= %r{href=.*?/package/#{Regexp.escape(package_name)}/v/(\d+(?:\.\d+)+)"}i
regex ||= %r{href=.*?/package/#{Regexp.escape(match[:package_name])}/v/(\d+(?:\.\d+)+)"}i
PageMatch.find_versions(page_url, regex, &block)
end

View File

@ -19,8 +19,21 @@ module Homebrew
class Pypi
NICE_NAME = "PyPI"
# The `Regexp` used to extract the package name and suffix (e.g., file
# extension) from the URL basename.
FILENAME_REGEX = /
(?<package_name>.+)- # The package name followed by a hyphen
.*? # The version string
(?<suffix>\.tar\.[a-z0-9]+|\.[a-z0-9]+)$ # Filename extension
/ix.freeze
# The `Regexp` used to determine if the strategy applies to the URL.
URL_MATCH_REGEX = /files\.pythonhosted\.org/i.freeze
URL_MATCH_REGEX = %r{
^https?://files\.pythonhosted\.org
/packages
(?:/[^/]+)+ # The hexadecimal paths before the filename
/#{FILENAME_REGEX.source.strip} # The filename
}ix.freeze
# Whether the strategy can be applied to the provided URL.
#
@ -37,23 +50,19 @@ module Homebrew
# @param regex [Regexp] a regex used for matching versions in content
# @return [Hash]
def self.find_versions(url, regex = nil, &block)
# Extracts the package name and filename extension from the basename of
# `url`, then passes the project's pypi.org page and a regex (the caller's,
# or a default built from the package name and suffix) to
# `PageMatch.find_versions`, whose result is returned.
#
# NOTE(review): this span appears to be a diff hunk shown without +/-
# markers, so an inline `/.../ix =~ File.basename(url)` form (which assigns
# the locals `package_name` and `suffix`) is listed alongside a
# `File.basename(url).match(FILENAME_REGEX)` form for each step.
/
(?<package_name>.+)- # The package name followed by a hyphen
.*? # The version string
(?<suffix>\.tar\.[a-z0-9]+|\.[a-z0-9]+)$ # Filename extension
/ix =~ File.basename(url)
# NOTE(review): `String#match` returns nil when the basename does not match,
# and the `match[...]` lookups below would then raise NoMethodError. No
# guard is visible in this block - confirm callers rule that case out.
match = File.basename(url).match(FILENAME_REGEX)
# Use `\.t` instead of specific tarball extensions (e.g. .tar.gz)
suffix.sub!(/\.t(?:ar\..+|[a-z0-9]+)$/i, "\.t")
suffix = match[:suffix].sub(/\.t(?:ar\..+|[a-z0-9]+)$/i, "\.t")
# It's not technically necessary to have the `#files` fragment at the
# end of the URL but it makes the debug output a bit more useful.
# `%20` and `_` in the package name are replaced with `-` for the page URL.
page_url = "https://pypi.org/project/#{package_name.gsub(/%20|_/, "-")}/#files"
page_url = "https://pypi.org/project/#{match[:package_name].gsub(/%20|_/, "-")}/#files"
# Example regex: `%r{href=.*?/packages.*?/example[._-]v?(\d+(?:\.\d+)*).t}i`.
# NOTE(review): the two default-regex forms listed here are not equivalent:
# the first uses the `x` flag and `(.post\d+)?` (unescaped `.`), the second
# uses `(?:[._-]post\d+)?`. Both assign with `||=`, so whichever runs first
# is the one that takes effect.
regex ||=
%r{href=.*?/packages.*?/#{Regexp.escape(package_name)}[._-]
v?(\d+(?:\.\d+)*(.post\d+)?)#{Regexp.escape(suffix)}}ix
# Example regex: `%r{href=.*?/packages.*?/example[._-]v?(\d+(?:\.\d+)*(?:[._-]post\d+)?)\.t}i`
re_package_name = Regexp.escape(match[:package_name])
re_suffix = Regexp.escape(suffix)
regex ||= %r{href=.*?/packages.*?/#{re_package_name}[._-]v?(\d+(?:\.\d+)*(?:[._-]post\d+)?)#{re_suffix}}i
PageMatch.find_versions(page_url, regex, &block)
end

View File

@ -34,7 +34,12 @@ module Homebrew
NICE_NAME = "SourceForge"
# The `Regexp` used to determine if the strategy applies to the URL.
URL_MATCH_REGEX = /(?:sourceforge|sf)\.net/i.freeze
URL_MATCH_REGEX = %r{
^https?://(?:[^/]+?\.)*(?:sourceforge|sf)\.net
(?:/projects?/(?<project_name>[^/]+)/
|/p/(?<project_name>[^/]+)/
|(?::/cvsroot)?/(?<project_name>[^/]+))
}ix.freeze
# Whether the strategy can be applied to the provided URL.
#
@ -51,20 +56,14 @@ module Homebrew
# @param regex [Regexp] a regex used for matching versions in content
# @return [Hash]
def self.find_versions(url, regex = nil, &block)
# Extracts the project name from `url`, then passes the project's RSS feed
# on sourceforge.net and a regex (the caller's, or a default built from the
# project name) to `PageMatch.find_versions`, whose result is returned.
#
# NOTE(review): this span appears to be a diff hunk shown without +/-
# markers, so two ways of finding the project name are listed back to back:
# an `if`/`elsif`/`else` that picks one of three inline regexes (each
# assigning the local `project_name`) and a single
# `url.match(URL_MATCH_REGEX)` call.
if url.include?("/project")
# Matches both `/project/<name>/` and `/projects/<name>/`.
%r{/projects?/(?<project_name>[^/]+)/}i =~ url
elsif url.include?(".net/p/")
%r{\.net/p/(?<project_name>[^/]+)/}i =~ url
else
# Fallback: first path segment after `.net`, optionally behind `:/cvsroot`.
%r{\.net(?::/cvsroot)?/(?<project_name>[^/]+)}i =~ url
end
# NOTE(review): `String#match` returns nil when `url` does not match, and
# `match[:project_name]` below would then raise NoMethodError. No guard is
# visible in this block - presumably callers check `match?` first; confirm.
match = url.match(URL_MATCH_REGEX)
page_url = "https://sourceforge.net/projects/#{project_name}/rss"
page_url = "https://sourceforge.net/projects/#{match[:project_name]}/rss"
# It may be possible to improve the default regex but there's quite a
# bit of variation between projects and it can be challenging to
# create something that works for most URLs.
# `||=` keeps a caller-supplied regex and only builds the default otherwise.
regex ||= %r{url=.*?/#{Regexp.escape(project_name)}/files/.*?[-_/](\d+(?:[-.]\d+)+)[-_/%.]}i
regex ||= %r{url=.*?/#{Regexp.escape(match[:project_name])}/files/.*?[-_/](\d+(?:[-.]\d+)+)[-_/%.]}i
PageMatch.find_versions(page_url, regex, &block)
end

View File

@ -40,10 +40,18 @@ module Homebrew
class Xorg
NICE_NAME = "X.Org"
# A `Regexp` used in determining if the strategy applies to the URL and
# also as part of extracting the module name from the URL basename.
MODULE_REGEX = /(?<module_name>.+)-\d+/i.freeze
# A `Regexp` used to extract the module name from the URL basename.
FILENAME_REGEX = /^#{MODULE_REGEX.source.strip}/i.freeze
# The `Regexp` used to determine if the strategy applies to the URL.
URL_MATCH_REGEX = %r{
[/.]x\.org.*?/individual/|
freedesktop\.org/(?:archive|dist|software)/
^https?://(?:[^/]+?\.)* # Scheme and any leading subdomains
(?:x\.org/(?:[^/]+/)*individual/(?:[^/]+/)*#{MODULE_REGEX.source.strip}
|freedesktop\.org/(?:archive|dist|software)/(?:[^/]+/)*#{MODULE_REGEX.source.strip})
}ix.freeze
# Used to cache page content, so we don't fetch the same pages
@ -72,15 +80,15 @@ module Homebrew
# @return [Hash]
def self.find_versions(url, regex = nil, &block)
file_name = File.basename(url)
/^(?<module_name>.+)-\d+/i =~ file_name
match = file_name.match(FILENAME_REGEX)
# /pub/ URLs redirect to the same URL with /archive/, so we replace
# it to avoid the redirection. Removing the filename from the end of
# the URL gives us the relevant directory listing page.
page_url = url.sub("x.org/pub/", "x.org/archive/").delete_suffix(file_name)
# Example regex: /href=.*?example[._-]v?(\d+(?:\.\d+)+)\.t/i
regex ||= /href=.*?#{Regexp.escape(module_name)}[._-]v?(\d+(?:\.\d+)+)\.t/i
# Example regex: `/href=.*?example[._-]v?(\d+(?:\.\d+)+)\.t/i`
regex ||= /href=.*?#{Regexp.escape(match[:module_name])}[._-]v?(\d+(?:\.\d+)+)\.t/i
# Use the cached page content to avoid duplicate fetches
cached_content = @page_data[page_url]

View File

@ -7,11 +7,13 @@ describe Homebrew::Livecheck::Strategy::Hackage do
subject(:hackage) { described_class }
let(:hackage_url) { "https://hackage.haskell.org/package/abc-1.2.3/def-1.2.3.tar.gz" }
let(:hackage_downloads_url) { "https://downloads.haskell.org/~abc/1.2.3/def-1.2.3-src.tar.xz" }
let(:non_hackage_url) { "https://brew.sh/test" }
describe "::match?" do
it "returns true if the argument provided is a Hackage URL" do
expect(hackage.match?(hackage_url)).to be true
expect(hackage.match?(hackage_downloads_url)).to be true
end
it "returns false if the argument provided is not a Hackage URL" do

View File

@ -7,11 +7,13 @@ describe Homebrew::Livecheck::Strategy::Npm do
subject(:npm) { described_class }
let(:npm_url) { "https://registry.npmjs.org/abc/-/def-1.2.3.tgz" }
let(:npm_scoped_url) { "https://registry.npmjs.org/@example/abc/-/def-1.2.3.tgz" }
let(:non_npm_url) { "https://brew.sh/test" }
describe "::match?" do
it "returns true if the argument provided is an npm URL" do
expect(npm.match?(npm_url)).to be true
expect(npm.match?(npm_scoped_url)).to be true
end
it "returns false if the argument provided is not an npm URL" do