Try getting page content with both headers

This commit is contained in:
nthumann 2021-06-09 23:54:56 +02:00
parent cc7b62c829
commit e6a18803ea
No known key found for this signature in database
GPG Key ID: 91AE26F52DE54B76

View File

@@ -179,46 +179,49 @@ module Homebrew
# Fetches the content at `url` with cURL and separates the response
# head(s) from the body. The request is attempted once per user agent
# (`:default`, then `:browser`) and the first successful attempt is used.
#
# @param url [String] the URL of the content to fetch
# @return [Hash] `{ content: ... }` on success, with `:final_url` added
#   when a `Location` header led somewhere other than `url`;
#   `{ messages: [...] }` when every attempt failed
# @raise [RuntimeError] if the output contains more than five response heads
def self.page_content(url)
  original_url = url

  # Declared outside the block so the error output of the last failed
  # attempt is still readable once the loop has finished.
  stderr = nil
  [:default, :browser].each do |user_agent|
    stdout, stderr, status = curl_with_workarounds(
      *PAGE_CONTENT_CURL_ARGS, url,
      **DEFAULT_CURL_OPTIONS,
      user_agent: user_agent
    )
    # Fall through to the next user agent when this attempt failed.
    next unless status.success?

    # stdout contains the header information followed by the page content.
    # We use #scrub here to avoid "invalid byte sequence in UTF-8" errors.
    output = stdout.scrub

    # Separate the head(s)/body and identify the final URL (after any
    # redirections)
    max_iterations = 5
    iterations = 0
    final_url = nil
    output = output.lstrip
    while output.match?(%r{\AHTTP/[\d.]+ \d+}) && output.include?(HTTP_HEAD_BODY_SEPARATOR)
      iterations += 1
      raise "Too many redirects (max = #{max_iterations})" if iterations > max_iterations

      head_text, _, output = output.partition(HTTP_HEAD_BODY_SEPARATOR)
      output = output.lstrip

      location = head_text[/^Location:\s*(.*)$/i, 1]
      next if location.blank?

      # The capture can end in "\r" when the head uses CRLF line endings.
      location.chomp!
      # Convert a relative redirect URL to an absolute URL. It is resolved
      # against the previous URL in the redirect chain (not always the
      # original one) and converted to a String so that `final_url` has the
      # same type whether or not the redirect was relative.
      location = URI.join(final_url || url, location).to_s unless location.match?(PageMatch::URL_MATCH_REGEX)
      final_url = location
    end

    data = { content: output }
    data[:final_url] = final_url if final_url.present? && final_url != original_url
    return data
  end

  # Every attempt failed: report the cURL error from the last attempt.
  /^(?<error_msg>curl: \(\d+\) .+)/ =~ stderr
  {
    messages: [error_msg.presence || "cURL failed without an error"],
  }
end
end end
end end