Merge pull request #11517 from n-thumann/page_content_both_headers

Try getting page content with both headers
This commit is contained in:
Mike McQuaid 2021-06-15 12:44:38 +01:00 committed by GitHub
commit 06124a4ca6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -179,46 +179,49 @@ module Homebrew
def self.page_content(url)
original_url = url
stdout, stderr, status = curl_with_workarounds(
*PAGE_CONTENT_CURL_ARGS, url,
**DEFAULT_CURL_OPTIONS
)
stderr = nil
[:default, :browser].each do |user_agent|
stdout, stderr, status = curl_with_workarounds(
*PAGE_CONTENT_CURL_ARGS, url,
**DEFAULT_CURL_OPTIONS,
user_agent: user_agent
)
next unless status.success?
unless status.success?
/^(?<error_msg>curl: \(\d+\) .+)/ =~ stderr
return {
messages: [error_msg.presence || "cURL failed without an error"],
}
end
# stdout contains the header information followed by the page content.
# We use #scrub here to avoid "invalid byte sequence in UTF-8" errors.
output = stdout.scrub
# stdout contains the header information followed by the page content.
# We use #scrub here to avoid "invalid byte sequence in UTF-8" errors.
output = stdout.scrub
# Separate the head(s)/body and identify the final URL (after any
# redirections)
max_iterations = 5
iterations = 0
output = output.lstrip
while output.match?(%r{\AHTTP/[\d.]+ \d+}) && output.include?(HTTP_HEAD_BODY_SEPARATOR)
iterations += 1
raise "Too many redirects (max = #{max_iterations})" if iterations > max_iterations
head_text, _, output = output.partition(HTTP_HEAD_BODY_SEPARATOR)
# Separate the head(s)/body and identify the final URL (after any
# redirections)
max_iterations = 5
iterations = 0
output = output.lstrip
while output.match?(%r{\AHTTP/[\d.]+ \d+}) && output.include?(HTTP_HEAD_BODY_SEPARATOR)
iterations += 1
raise "Too many redirects (max = #{max_iterations})" if iterations > max_iterations
location = head_text[/^Location:\s*(.*)$/i, 1]
next if location.blank?
head_text, _, output = output.partition(HTTP_HEAD_BODY_SEPARATOR)
output = output.lstrip
location.chomp!
# Convert a relative redirect URL to an absolute URL
location = URI.join(url, location) unless location.match?(PageMatch::URL_MATCH_REGEX)
final_url = location
location = head_text[/^Location:\s*(.*)$/i, 1]
next if location.blank?
location.chomp!
# Convert a relative redirect URL to an absolute URL
location = URI.join(url, location) unless location.match?(PageMatch::URL_MATCH_REGEX)
final_url = location
end
data = { content: output }
data[:final_url] = final_url if final_url.present? && final_url != original_url
return data
end
data = { content: output }
data[:final_url] = final_url if final_url.present? && final_url != original_url
data
/^(?<error_msg>curl: \(\d+\) .+)/ =~ stderr
{
messages: [error_msg.presence || "cURL failed without an error"],
}
end
end
end