From ccfd01ba389ed638f0c67bbf82e01b3108314449 Mon Sep 17 00:00:00 2001 From: Sam Ford <1584702+samford@users.noreply.github.com> Date: Mon, 14 Dec 2020 13:03:10 -0500 Subject: [PATCH 1/3] Strategy: Replace open-uri with curl --- Library/Homebrew/livecheck/strategy.rb | 68 ++++++++++++++++--- .../Homebrew/livecheck/strategy/page_match.rb | 2 - 2 files changed, 57 insertions(+), 13 deletions(-) diff --git a/Library/Homebrew/livecheck/strategy.rb b/Library/Homebrew/livecheck/strategy.rb index 76b1edfd79..3a1f2a423f 100644 --- a/Library/Homebrew/livecheck/strategy.rb +++ b/Library/Homebrew/livecheck/strategy.rb @@ -100,7 +100,8 @@ module Homebrew *args, url, print_stdout: false, print_stderr: false, debug: false, verbose: false, - user_agent: user_agent, retry: false + user_agent: user_agent, timeout: 20, + retry: false ) while stdout.match?(/\AHTTP.*\r$/) @@ -119,6 +120,8 @@ module Homebrew # Fetches the content at the URL and returns a hash containing the # content and, if there are any redirections, the final URL. + # If `curl` encounters an error, the hash will contain a `:messages` + # array with the error message instead. # # @param url [String] the URL of the content to check # @return [Hash] @@ -126,18 +129,61 @@ module Homebrew def self.page_content(url) original_url = url - # Manually handling `URI#open` redirections allows us to detect the - # resolved URL while also supporting HTTPS to HTTP redirections (which - # are normally forbidden by `OpenURI`). - begin - content = URI.parse(url).open(redirect: false, &:read) - rescue OpenURI::HTTPRedirect => e - url = e.uri.to_s - retry + args = curl_args( + "--compressed", + # Include HTTP response headers in output, so we can identify the + # final URL after any redirections + "--include", + # Follow redirections to handle mirrors, relocations, etc. + "--location", + # cURL's default timeout can be up to two minutes, so we need to + # set our own timeout settings to avoid a lengthy wait + "--connect-timeout", "10", + "--max-time", "15" + ) + + stdout, stderr, status = curl_with_workarounds( + *args, url, + print_stdout: false, print_stderr: false, + debug: false, verbose: false, + user_agent: :default, timeout: 20, + retry: false + ) + + unless status.success? + /^(?curl: \(\d+\) .+)/ =~ stderr + return { + messages: [error_msg.presence || "cURL failed without an error"], + } end - data = { content: content } - data[:final_url] = url unless url == original_url + # stdout contains the header information followed by the page content. + # We use #scrub here to avoid "invalid byte sequence in UTF-8" errors. + output = stdout.scrub + + # Separate the head(s)/body and identify the final URL (after any + # redirections) + max_iterations = 5 + iterations = 0 + output = output.lstrip + while output.match?(%r{\AHTTP/[\d.]+ \d+}) && output.include?("\r\n\r\n") + iterations += 1 + raise "Too many redirects (max = #{max_iterations})" if iterations > max_iterations + + head_text, _, output = output.partition("\r\n\r\n") + output = output.lstrip + + location = head_text[/^Location:\s*(.*)$/i, 1] + next if location.blank? + + location.chomp! + # Convert a relative redirect URL to an absolute URL + location = URI.join(url, location) unless location.match?(PageMatch::URL_MATCH_REGEX) + final_url = location + end + + data = { content: output } + data[:final_url] = final_url if final_url.present? && final_url != original_url data end end diff --git a/Library/Homebrew/livecheck/strategy/page_match.rb b/Library/Homebrew/livecheck/strategy/page_match.rb index 322fe997bc..ac8750416f 100644 --- a/Library/Homebrew/livecheck/strategy/page_match.rb +++ b/Library/Homebrew/livecheck/strategy/page_match.rb @@ -1,8 +1,6 @@ # typed: true # frozen_string_literal: true -require "open-uri" - module Homebrew module Livecheck module Strategy From 793ea17749a9a12d9d76b07dc10e47a7e9d3c201 Mon Sep 17 00:00:00 2001 From: Sam Ford <1584702+samford@users.noreply.github.com> Date: Fri, 4 Jun 2021 13:12:01 -0400 Subject: [PATCH 2/3] Strategy: Create constant for HTTP separator --- Library/Homebrew/livecheck/strategy.rb | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/Library/Homebrew/livecheck/strategy.rb b/Library/Homebrew/livecheck/strategy.rb index 3a1f2a423f..b69de957cb 100644 --- a/Library/Homebrew/livecheck/strategy.rb +++ b/Library/Homebrew/livecheck/strategy.rb @@ -19,7 +19,15 @@ module Homebrew # the middle of this range. Strategies with a priority of 0 (or lower) # are ignored. DEFAULT_PRIORITY = 5 - private_constant :DEFAULT_PRIORITY + + # HTTP response head(s) and body are typically separated by a double + # `CRLF` (whereas HTTP header lines are separated by a single `CRLF`). + # In rare cases, this can also be a double newline (`\n\n`). + HTTP_HEAD_BODY_SEPARATOR = "\r\n\r\n" + + # The `#strategies` method expects `Strategy` constants to be strategies, + # so constants we create need to be private for this to work properly. + private_constant :DEFAULT_PRIORITY, :HTTP_HEAD_BODY_SEPARATOR # Creates and/or returns a `@strategies` `Hash`, which maps a snake # case strategy name symbol (e.g. `:page_match`) to the associated @@ -166,11 +174,11 @@ module Homebrew max_iterations = 5 iterations = 0 output = output.lstrip - while output.match?(%r{\AHTTP/[\d.]+ \d+}) && output.include?("\r\n\r\n") + while output.match?(%r{\AHTTP/[\d.]+ \d+}) && output.include?(HTTP_HEAD_BODY_SEPARATOR) iterations += 1 raise "Too many redirects (max = #{max_iterations})" if iterations > max_iterations - head_text, _, output = output.partition("\r\n\r\n") + head_text, _, output = output.partition(HTTP_HEAD_BODY_SEPARATOR) output = output.lstrip location = head_text[/^Location:\s*(.*)$/i, 1] From a6769ae7cdf50e99f8b5c45064958bc8a7798453 Mon Sep 17 00:00:00 2001 From: Sam Ford <1584702+samford@users.noreply.github.com> Date: Fri, 4 Jun 2021 13:16:02 -0400 Subject: [PATCH 3/3] Strategy: Better align curl usage in methods --- Library/Homebrew/livecheck/strategy.rb | 92 +++++++++++++++++--------- 1 file changed, 59 insertions(+), 33 deletions(-) diff --git a/Library/Homebrew/livecheck/strategy.rb b/Library/Homebrew/livecheck/strategy.rb index b69de957cb..9d992a5011 100644 --- a/Library/Homebrew/livecheck/strategy.rb +++ b/Library/Homebrew/livecheck/strategy.rb @@ -20,6 +20,56 @@ module Homebrew # are ignored. DEFAULT_PRIORITY = 5 + # cURL's default `--connect-timeout` value can be up to two minutes, so + # we need to use a more reasonable duration (in seconds) to avoid a + # lengthy wait when a connection can't be established. + CURL_CONNECT_TIMEOUT = 10 + + # cURL does not set a default `--max-time` value, so we provide a value + # to ensure cURL will time out in a reasonable amount of time. + CURL_MAX_TIME = CURL_CONNECT_TIMEOUT + 5 + + # The `curl` process will sometimes hang indefinitely (despite setting + # the `--max-time` argument) and it needs to be quit for livecheck to + # continue. This value is used to set the `timeout` argument on + # `Utils::Curl` method calls in `Strategy`. + CURL_PROCESS_TIMEOUT = CURL_MAX_TIME + 5 + + # Baseline `curl` arguments used in `Strategy` methods. + DEFAULT_CURL_ARGS = [ + # Follow redirections to handle mirrors, relocations, etc. + "--location", + "--connect-timeout", CURL_CONNECT_TIMEOUT, + "--max-time", CURL_MAX_TIME + ].freeze + + # `curl` arguments used in `Strategy#page_headers` method. + PAGE_HEADERS_CURL_ARGS = ([ + # We only need the response head (not the body) + "--head", + # Some servers may not allow a HEAD request, so we use GET + "--request", "GET", + "--silent" + ] + DEFAULT_CURL_ARGS).freeze + + # `curl` arguments used in `Strategy#page_content` method. + PAGE_CONTENT_CURL_ARGS = ([ + "--compressed", + # Include HTTP response headers in output, so we can identify the + # final URL after any redirections + "--include", + ] + DEFAULT_CURL_ARGS).freeze + + # Baseline `curl` options used in `Strategy` methods. + DEFAULT_CURL_OPTIONS = { + print_stdout: false, + print_stderr: false, + debug: false, + verbose: false, + timeout: CURL_PROCESS_TIMEOUT, + retry: false, + }.freeze + # HTTP response head(s) and body are typically separated by a double # `CRLF` (whereas HTTP header lines are separated by a single `CRLF`). # In rare cases, this can also be a double newline (`\n\n`). @@ -27,7 +77,10 @@ module Homebrew # The `#strategies` method expects `Strategy` constants to be strategies, # so constants we create need to be private for this to work properly. - private_constant :DEFAULT_PRIORITY, :HTTP_HEAD_BODY_SEPARATOR + private_constant :DEFAULT_PRIORITY, :CURL_CONNECT_TIMEOUT, :CURL_MAX_TIME, + :CURL_PROCESS_TIMEOUT, :DEFAULT_CURL_ARGS, + :PAGE_HEADERS_CURL_ARGS, :PAGE_CONTENT_CURL_ARGS, + :DEFAULT_CURL_OPTIONS, :HTTP_HEAD_BODY_SEPARATOR # Creates and/or returns a `@strategies` `Hash`, which maps a snake # case strategy name symbol (e.g. `:page_match`) to the associated @@ -95,21 +148,10 @@ module Homebrew headers = [] [:default, :browser].each do |user_agent| - args = [ - "--head", # Only work with the response headers - "--request", "GET", # Use a GET request (instead of HEAD) - "--silent", # Silent mode - "--location", # Follow redirects - "--connect-timeout", "5", # Max time allowed for connection (secs) - "--max-time", "10" # Max time allowed for transfer (secs) - ] - stdout, _, status = curl_with_workarounds( - *args, url, - print_stdout: false, print_stderr: false, - debug: false, verbose: false, - user_agent: user_agent, timeout: 20, - retry: false + *PAGE_HEADERS_CURL_ARGS, url, + **DEFAULT_CURL_OPTIONS, + user_agent: user_agent ) while stdout.match?(/\AHTTP.*\r$/) @@ -137,25 +179,9 @@ module Homebrew def self.page_content(url) original_url = url - args = curl_args( - "--compressed", - # Include HTTP response headers in output, so we can identify the - # final URL after any redirections - "--include", - # Follow redirections to handle mirrors, relocations, etc. - "--location", - # cURL's default timeout can be up to two minutes, so we need to - # set our own timeout settings to avoid a lengthy wait - "--connect-timeout", "10", - "--max-time", "15" - ) - stdout, stderr, status = curl_with_workarounds( - *args, url, - print_stdout: false, print_stderr: false, - debug: false, verbose: false, - user_agent: :default, timeout: 20, - retry: false + *PAGE_CONTENT_CURL_ARGS, url, + **DEFAULT_CURL_OPTIONS ) unless status.success?