From 4575ddf9094831be79f2f577696e116f835868ab Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 2 May 2022 13:57:51 +0100 Subject: [PATCH 1/2] utils/curl: force utf-8 encoding for text content --- Library/Homebrew/utils/curl.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/Library/Homebrew/utils/curl.rb b/Library/Homebrew/utils/curl.rb index d182397618..5d4ec86190 100644 --- a/Library/Homebrew/utils/curl.rb +++ b/Library/Homebrew/utils/curl.rb @@ -359,6 +359,7 @@ module Utils if status.success? file_contents = File.read(file.path) + file_contents.encode!(Encoding::UTF_8, invalid: :replace) if headers["content-type"]&.start_with?("text/") file_hash = Digest::SHA2.hexdigest(file_contents) if hash_needed end From 6643f58b49d7ddc99435a6f71aa037a29557edce Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 4 May 2022 00:13:56 +0100 Subject: [PATCH 2/2] utils/curl: get encoding from header --- Library/Homebrew/utils/curl.rb | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/Library/Homebrew/utils/curl.rb b/Library/Homebrew/utils/curl.rb index 5d4ec86190..fe71682353 100644 --- a/Library/Homebrew/utils/curl.rb +++ b/Library/Homebrew/utils/curl.rb @@ -296,8 +296,8 @@ module Utils return unless check_content no_protocol_file_contents = %r{https?:\\?/\\?/} - http_content = details[:file]&.gsub(no_protocol_file_contents, "/") - https_content = secure_details[:file]&.gsub(no_protocol_file_contents, "/") + http_content = details[:file]&.scrub&.gsub(no_protocol_file_contents, "/") + https_content = secure_details[:file]&.scrub&.gsub(no_protocol_file_contents, "/") # Check for the same content after removing all protocols if (http_content && https_content) && (http_content == https_content) && http_with_https_available @@ -358,8 +358,19 @@ module Utils content_length = headers["content-length"] if status.success? 
- file_contents = File.read(file.path) - file_contents.encode!(Encoding::UTF_8, invalid: :replace) if headers["content-type"]&.start_with?("text/") + open_args = {} + # Try to get encoding from Content-Type header + # TODO: add guessing encoding by <meta http-equiv="Content-Type" ...> tag + if (content_type = headers["content-type"]) && + (match = content_type.match(/;\s*charset\s*=\s*([^\s]+)/)) && + (charset = match[1]) + begin + open_args[:encoding] = Encoding.find(charset) + rescue ArgumentError + # Unknown charset in Content-Type header + end + end + file_contents = File.read(file.path, open_args) file_hash = Digest::SHA2.hexdigest(file_contents) if hash_needed end