Merge pull request #14845 from samford/livecheck/add-xml-strategy

livecheck: Add Xml strategy
This commit is contained in:
Mike McQuaid 2023-03-01 22:21:13 +00:00 committed by GitHub
commit 97fbd89a57
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 379 additions and 20 deletions

View File

@ -52,6 +52,7 @@ Style/Documentation:
- livecheck/strategy/pypi.rb - livecheck/strategy/pypi.rb
- livecheck/strategy/sourceforge.rb - livecheck/strategy/sourceforge.rb
- livecheck/strategy/sparkle.rb - livecheck/strategy/sparkle.rb
- livecheck/strategy/xml.rb
- livecheck/strategy/xorg.rb - livecheck/strategy/xorg.rb
- os.rb - os.rb
- resource.rb - resource.rb

View File

@ -156,7 +156,7 @@ module Homebrew
# Only treat the strategy as usable if the `livecheck` block # Only treat the strategy as usable if the `livecheck` block
# contains a regex and/or `strategy` block # contains a regex and/or `strategy` block
next if !regex_provided && !block_provided next if !regex_provided && !block_provided
elsif strategy == Json elsif [Json, Xml].include?(strategy)
# Only treat the strategy as usable if the `livecheck` block # Only treat the strategy as usable if the `livecheck` block
# specifies the strategy and contains a `strategy` block # specifies the strategy and contains a `strategy` block
next if (livecheck_strategy != strategy_symbol) || !block_provided next if (livecheck_strategy != strategy_symbol) || !block_provided
@ -284,4 +284,5 @@ require_relative "strategy/page_match"
require_relative "strategy/pypi" require_relative "strategy/pypi"
require_relative "strategy/sourceforge" require_relative "strategy/sourceforge"
require_relative "strategy/sparkle" require_relative "strategy/sparkle"
require_relative "strategy/xml"
require_relative "strategy/xorg" require_relative "strategy/xorg"

View File

@ -66,25 +66,8 @@ module Homebrew
# @return [Item, nil] # @return [Item, nil]
sig { params(content: String).returns(T::Array[Item]) } sig { params(content: String).returns(T::Array[Item]) }
def self.items_from_content(content) def self.items_from_content(content)
require "rexml/document" xml = Xml.parse_xml(content)
return [] if xml.blank?
parsing_tries = 0
xml = begin
REXML::Document.new(content)
rescue REXML::UndefinedNamespaceException => e
undefined_prefix = e.to_s[/Undefined prefix ([^ ]+) found/i, 1]
raise if undefined_prefix.blank?
# Only retry parsing once after removing prefix from content
parsing_tries += 1
raise if parsing_tries > 1
# When an XML document contains a prefix without a corresponding
# namespace, it's necessary to remove the prefix from the content
# to be able to successfully parse it using REXML
content = content.gsub(%r{(</?| )#{Regexp.escape(undefined_prefix)}:}, '\1')
retry
end
# Remove prefixes, so we can reliably identify elements and attributes # Remove prefixes, so we can reliably identify elements and attributes
xml.root&.each_recursive do |node| xml.root&.each_recursive do |node|

View File

@ -0,0 +1,153 @@
# typed: true
# frozen_string_literal: true
module Homebrew
module Livecheck
module Strategy
# The {Xml} strategy fetches content at a URL, parses it as XML using
# `REXML`, and provides the `REXML::Document` to a `strategy` block.
# If a regex is present in the `livecheck` block, it should be passed
# as the second argument to the `strategy` block.
#
# This is a generic strategy that doesn't contain any logic for finding
# versions, as the structure of XML data varies. Instead, a `strategy`
# block must be used to extract version information from the XML data.
# For more information on how to work with an `REXML::Document` object,
# please refer to the [`REXML::Document`](https://ruby.github.io/rexml/REXML/Document.html)
# and [`REXML::Element`](https://ruby.github.io/rexml/REXML/Element.html)
# documentation.
#
# This strategy is not applied automatically and it is necessary to use
# `strategy :xml` in a `livecheck` block (in conjunction with a
# `strategy` block) to use it.
#
# This strategy's {find_versions} method can be used in other strategies
# that work with XML content, so it should only be necessary to write
# the version-finding logic that works with the parsed XML data.
#
# @api public
class Xml
extend T::Sig
NICE_NAME = "XML"
# A priority of zero causes livecheck to skip the strategy. We do this
# for {Xml} so we can selectively apply it only when a strategy block
# is provided in a `livecheck` block.
PRIORITY = 0
# The `Regexp` used to determine if the strategy applies to the URL.
URL_MATCH_REGEX = %r{^https?://}i.freeze
# Whether the strategy can be applied to the provided URL.
# {Xml} will technically match any HTTP URL but is only usable with
# a `livecheck` block containing a `strategy` block.
#
# @param url [String] the URL to match against
# @return [Boolean]
sig { params(url: String).returns(T::Boolean) }
def self.match?(url)
# Case-insensitive test for a leading `http://` or `https://` scheme.
# NOTE: `URL_MATCH_REGEX` anchors with `^`, which in Ruby matches at the
# start of any line, not only at the start of the string.
URL_MATCH_REGEX.match?(url)
end
# Parses XML text and returns an `REXML::Document` object.
# @param content [String] the XML text to parse
# @return [REXML::Document, nil]
sig { params(content: String).returns(T.nilable(REXML::Document)) }
def self.parse_xml(content)
  # REXML is required here rather than when the file is loaded.
  require "rexml/document"

  # Raises a `RuntimeError` when the undefined prefix cannot be identified
  # from the exception message or when parsing still fails after the
  # prefix has been removed once.
  parsing_tries = 0
  begin
    REXML::Document.new(content)
  rescue REXML::UndefinedNamespaceException => e
    # The offending prefix is only available through the exception text
    undefined_prefix = e.to_s[/Undefined prefix ([^ ]+) found/i, 1]
    raise "Could not identify undefined prefix." if undefined_prefix.blank?

    # Only retry parsing once after removing prefix from content
    parsing_tries += 1
    raise "Could not parse XML after removing undefined prefix." if parsing_tries > 1

    # When an XML document contains a prefix without a corresponding
    # namespace, it's necessary to remove the prefix from the content
    # to be able to successfully parse it using REXML. Element names
    # follow `<` or `</`, while attribute names follow whitespace, which
    # may be a tab or newline (not only a space) when attributes are
    # placed on their own lines.
    content = content.gsub(%r{(</?|\s)#{Regexp.escape(undefined_prefix)}:}, '\1')
    retry
  end
end
# Parses XML text and identifies versions using a `strategy` block.
# If a regex is provided, it will be passed as the second argument to
# the `strategy` block (after the parsed XML data).
# @param content [String] the XML text to parse and check
# @param regex [Regexp, nil] a regex used for matching versions in the
# content
# @return [Array]
sig {
params(
content: String,
regex: T.nilable(Regexp),
block: T.untyped,
).returns(T::Array[String])
}
def self.versions_from_content(content, regex = nil, &block)
  # Nothing can be extracted without both content and a `strategy` block
  return [] if content.blank? || block.blank?

  require "rexml"
  document = parse_xml(content)
  return [] if document.blank?

  # A two-argument block expects a regex, so refuse to call it without one
  if regex.blank? && block.arity == 2
    raise "Two arguments found in `strategy` block but no regex provided."
  end

  # Only hand the regex to the block when one was supplied
  extracted = regex.present? ? yield(document, regex) : yield(document)
  Strategy.handle_block_return(extracted)
end
# Checks the XML content at the URL for versions, using the provided
# `strategy` block to extract version information.
#
# @param url [String] the URL of the content to check
# @param regex [Regexp, nil] a regex used for matching versions
# @param provided_content [String, nil] page content to use in place of
# fetching via `Strategy#page_content`
# @param homebrew_curl [Boolean] whether to use brewed curl with the URL
# @return [Hash]
sig {
params(
url: String,
regex: T.nilable(Regexp),
provided_content: T.nilable(String),
homebrew_curl: T::Boolean,
_unused: T.nilable(T::Hash[Symbol, T.untyped]),
block: T.untyped,
).returns(T::Hash[Symbol, T.untyped])
}
def self.find_versions(url:, regex: nil, provided_content: nil, homebrew_curl: false, **_unused, &block)
  # A `strategy` block is mandatory, so fail loudly when it is missing
  raise ArgumentError, "#{Utils.demodulize(T.must(name))} requires a `strategy` block" if block.blank?

  match_data = { matches: {}, regex: regex, url: url }
  # The block is known to be present at this point, so only the URL needs
  # to be checked before doing any work
  return match_data if url.blank?

  content = if provided_content.is_a?(String)
    # Content supplied by the caller is used as-is and flagged as cached
    match_data[:cached] = true
    provided_content
  else
    # Merge whatever `Strategy.page_content` returns into the result and
    # read the text to check from its `:content` key
    match_data.merge!(Strategy.page_content(url, homebrew_curl: homebrew_curl))
    match_data[:content]
  end
  return match_data if content.blank?

  # Key each extracted version string to its `Version` object
  versions_from_content(content, regex, &block).each do |match_text|
    match_data[:matches][match_text] = Version.new(match_text)
  end

  match_data
end
end
end
end
end

View File

@ -0,0 +1,205 @@
# typed: false
# frozen_string_literal: true
require "livecheck/strategy"
require "rexml/document"
describe Homebrew::Livecheck::Strategy::Xml do
# The strategy class itself is the subject, as all of its methods are
# class methods.
subject(:xml) { described_class }
let(:http_url) { "https://brew.sh/blog/" }
let(:non_http_url) { "ftp://brew.sh/" }
# Matches a purely numeric dotted version (optionally prefixed with `v`) and
# captures the numeric part; values with suffixes like `-rc1` do not match.
let(:regex) { /^v?(\d+(?:\.\d+)+)$/i }
# XML fixture with versions stored as element text, mixing values that match
# `regex` with suffixed values that do not.
let(:content_version_text) {
<<~EOS
<?xml version="1.0" encoding="utf-8"?>
<versions>
<version>1.1.2</version>
<version>1.1.2b</version>
<version>1.1.2a</version>
<version>1.1.1</version>
<version>1.1.0</version>
<version>1.1.0-rc3</version>
<version>1.1.0-rc2</version>
<version>1.1.0-rc1</version>
<version>1.0.x-last</version>
<version>1.0.3</version>
<version>1.0.3-rc3</version>
<version>1.0.3-rc2</version>
<version>1.0.3-rc1</version>
<version>1.0.2</version>
<version>1.0.2-rc1</version>
<version>1.0.1</version>
<version>1.0.1-rc1</version>
<version>1.0.0</version>
<version>1.0.0-rc1</version>
</versions>
EOS
}
# The same version values as above but stored in a `version` attribute.
let(:content_version_attr) {
<<~EOS
<?xml version="1.0" encoding="utf-8"?>
<items>
<item version="1.1.2" />
<item version="1.1.2b" />
<item version="1.1.2a" />
<item version="1.1.1" />
<item version="1.1.0" />
<item version="1.1.0-rc3" />
<item version="1.1.0-rc2" />
<item version="1.1.0-rc1" />
<item version="1.0.x-last" />
<item version="1.0.3" />
<item version="1.0.3-rc3" />
<item version="1.0.3-rc2" />
<item version="1.0.3-rc1" />
<item version="1.0.2" />
<item version="1.0.2-rc1" />
<item version="1.0.1" />
<item version="1.0.1-rc1" />
<item version="1.0.0" />
<item version="1.0.0-rc1" />
</items>
EOS
}
# Minimal document whose root element holds a single version.
let(:content_simple) {
<<~EOS
<?xml version="1.0" encoding="utf-8"?>
<version>1.2.3</version>
EOS
}
# Uses the `something:` prefix without declaring a namespace for it.
let(:content_undefined_namespace) {
<<~EOS
<?xml version="1.0" encoding="utf-8"?>
<something:version>1.2.3</something:version>
EOS
}
# Versions expected from the list fixtures once `regex` has been applied.
let(:content_matches) { ["1.1.2", "1.1.1", "1.1.0", "1.0.3", "1.0.2", "1.0.1", "1.0.0"] }
let(:content_simple_matches) { ["1.2.3"] }
# Expected `find_versions` result for the list fixtures when content is
# fetched rather than provided.
let(:find_versions_return_hash) {
{
matches: {
"1.1.2" => Version.new("1.1.2"),
"1.1.1" => Version.new("1.1.1"),
"1.1.0" => Version.new("1.1.0"),
"1.0.3" => Version.new("1.0.3"),
"1.0.2" => Version.new("1.0.2"),
"1.0.1" => Version.new("1.0.1"),
"1.0.0" => Version.new("1.0.0"),
},
regex: regex,
url: http_url,
}
}
# `find_versions` adds `cached: true` when `provided_content` is used.
let(:find_versions_cached_return_hash) {
find_versions_return_hash.merge({ cached: true })
}
# `match?` only checks the URL scheme, so an `https://` URL matches and an
# `ftp://` URL does not.
describe "::match?" do
it "returns true for an HTTP URL" do
expect(xml.match?(http_url)).to be true
end
it "returns false for a non-HTTP URL" do
expect(xml.match?(non_http_url)).to be false
end
end
# These examples only check the returned object's class, not its contents.
describe "::parse_xml" do
# TODO: Should we be comparing against an actual REXML::Document object?
it "returns an REXML::Document when given XML content" do
expect(xml.parse_xml(content_version_text)).to be_an_instance_of(REXML::Document)
end
# Parsing succeeds here because `parse_xml` removes the undeclared
# `something:` prefix from the content and retries.
it "returns an REXML::Document when given XML content with an undefined namespace" do
expect(xml.parse_xml(content_undefined_namespace)).to be_an_instance_of(REXML::Document)
end
end
# Exercises block handling in `versions_from_content`: string, array and nil
# return values, plus the two error paths.
describe "::versions_from_content" do
it "returns an empty array when given a block but content is blank" do
expect(xml.versions_from_content("", regex) { "1.2.3" }).to eq([])
end
it "returns an array of version strings when given content and a block" do
# Returning a string from block
expect(xml.versions_from_content(content_simple) do |xml|
xml.elements["version"]&.text
end).to eq(content_simple_matches)
# A one-argument block is still accepted when a regex is given; `regex`
# inside this block refers to the `let` helper, not a block argument.
expect(xml.versions_from_content(content_simple, regex) do |xml|
version = xml.elements["version"]&.text
next if version.blank?
version[regex, 1]
end).to eq(content_simple_matches)
# Returning an array of strings from block
expect(xml.versions_from_content(content_version_text, regex) do |xml, regex|
xml.get_elements("versions//version").map { |item| item.text[regex, 1] }
end).to eq(content_matches)
expect(xml.versions_from_content(content_version_attr, regex) do |xml, regex|
xml.get_elements("items//item").map do |item|
version = item["version"]
next if version.blank?
version[regex, 1]
end
end).to eq(content_matches)
end
it "allows a nil return from a block" do
expect(xml.versions_from_content(content_simple, regex) { next }).to eq([])
end
# The error is raised before the block is called, so the block body below
# is never evaluated.
it "errors if a block uses two arguments but a regex is not given" do
expect { xml.versions_from_content(content_simple) { |xml, regex| xml["version"][regex, 1] } }
.to raise_error("Two arguments found in `strategy` block but no regex provided.")
end
it "errors on an invalid return type from a block" do
expect { xml.versions_from_content(content_simple, regex) { 123 } }
.to raise_error(TypeError, Homebrew::Livecheck::Strategy::INVALID_BLOCK_RETURN_VALUE_MSG)
end
end
# Covers `find_versions` with provided (cached) content, a missing block and
# the early returns for a blank URL or blank content.
describe "::find_versions" do
  it "finds versions in provided_content using a block" do
    expect(xml.find_versions(url: http_url, regex: regex, provided_content: content_version_text) do |xml, regex|
      xml.get_elements("versions//version").map { |item| item.text[regex, 1] }
    end).to eq(find_versions_cached_return_hash)

    # NOTE: A regex should be provided using the `#regex` method in a
    # `livecheck` block but we're using a regex literal in the `strategy`
    # block here simply to ensure this method works as expected when a
    # regex isn't provided.
    expect(xml.find_versions(url: http_url, provided_content: content_version_text) do |xml|
      regex = /^v?(\d+(?:\.\d+)+)$/i.freeze
      xml.get_elements("versions//version").map { |item| item.text[regex, 1] }
    end).to eq(find_versions_cached_return_hash.merge({ regex: nil }))
  end

  it "errors if a block is not provided" do
    expect { xml.find_versions(url: http_url, provided_content: content_simple) }
      .to raise_error(ArgumentError, "Xml requires a `strategy` block")
  end

  it "returns default match_data when url is blank" do
    expect(xml.find_versions(url: "") { "1.2.3" })
      .to eq({ matches: {}, regex: nil, url: "" })
  end

  # Provided content is flagged as cached before the blank check, so the
  # result still carries `cached: true`.
  it "returns default match_data when content is blank" do
    expect(xml.find_versions(url: http_url, provided_content: "") { "1.2.3" })
      .to eq({ matches: {}, regex: nil, url: http_url, cached: true })
  end
end
end

View File

@ -171,6 +171,22 @@ livecheck do
end end
``` ```
#### `Xml` `strategy` block
A `strategy` block for `Xml` receives an `REXML::Document` object and, if provided, a regex. For example, if the XML contains a `versions` element with nested `version` elements and their inner text contains the version string, we could extract it using a regex as follows:
```ruby
livecheck do
url "https://www.example.com/example.xml"
regex(/v?(\d+(?:\.\d+)+)/i)
strategy :xml do |xml, regex|
xml.get_elements("versions//version").map { |item| item.text[regex, 1] }
end
end
```
For more information on how to work with an `REXML::Document` object, please refer to the [`REXML::Document`](https://ruby.github.io/rexml/REXML/Document.html) and [`REXML::Element`](https://ruby.github.io/rexml/REXML/Element.html) documentation.
### `skip` ### `skip`
Livecheck automatically skips some formulae/casks for a number of reasons (deprecated, disabled, discontinued, etc.). However, on rare occasions we need to use a `livecheck` block to do a manual skip. The `skip` method takes a string containing a very brief reason for skipping. Livecheck automatically skips some formulae/casks for a number of reasons (deprecated, disabled, discontinued, etc.). However, on rare occasions we need to use a `livecheck` block to do a manual skip. The `skip` method takes a string containing a very brief reason for skipping.