Use Mechanize to parse Content-Disposition.

This commit is contained in:
Markus Reiter 2019-01-30 04:01:00 +01:00
parent 0f270d8115
commit de5b35876f
7 changed files with 276 additions and 3 deletions

19
.gitignore vendored
View File

@ -25,6 +25,7 @@
**/vendor/bundle
**/vendor/ruby
**/vendor/bundle-standalone/ruby/*/bin
**/vendor/bundle-standalone/ruby/*/build_info/
**/vendor/bundle-standalone/ruby/*/cache
**/vendor/bundle-standalone/ruby/*/extensions
**/vendor/bundle-standalone/ruby/*/gems/*/*
@ -93,15 +94,33 @@
# Ignore rubocop's (and other) dependencies we don't wish to vendor
**/vendor/bundle-standalone/ruby/*/gems/ast-*/
**/vendor/bundle-standalone/ruby/*/gems/connection_pool-*/lib
**/vendor/bundle-standalone/ruby/*/gems/domain_name-*/lib
**/vendor/bundle-standalone/ruby/*/gems/http-cookie-*/lib
**/vendor/bundle-standalone/ruby/*/gems/jaro_winkler-*/
**/vendor/bundle-standalone/ruby/*/gems/mime-types-data-*/lib
**/vendor/bundle-standalone/ruby/*/gems/mime-types-*/lib
**/vendor/bundle-standalone/ruby/*/gems/mini_portile2-*/lib
**/vendor/bundle-standalone/ruby/*/gems/minitest-*/lib
**/vendor/bundle-standalone/ruby/*/gems/net-http-digest_auth-*/lib
**/vendor/bundle-standalone/ruby/*/gems/net-http-persistent-*/lib
**/vendor/bundle-standalone/ruby/*/gems/nokogiri-*/lib
**/vendor/bundle-standalone/ruby/*/gems/ntlm-http-*/lib
**/vendor/bundle-standalone/ruby/*/gems/parallel-*/
**/vendor/bundle-standalone/ruby/*/gems/parser-*/
**/vendor/bundle-standalone/ruby/*/gems/powerpack-*/
**/vendor/bundle-standalone/ruby/*/gems/rainbow-*/
**/vendor/bundle-standalone/ruby/*/gems/rubocop-0*/
**/vendor/bundle-standalone/ruby/*/gems/ruby-progressbar-*/
**/vendor/bundle-standalone/ruby/*/gems/unf_ext-*/lib
**/vendor/bundle-standalone/ruby/*/gems/unf-*/lib
**/vendor/bundle-standalone/ruby/*/gems/unicode-display_width-*/
**/vendor/bundle-standalone/ruby/*/gems/webrobots-*/lib
# Only include `Mechanize::HTTP::ContentDispositionParser` and `Mechanize::VERSION`.
**/vendor/bundle-standalone/ruby/*/gems/mechanize-*/lib
!**/vendor/bundle-standalone/ruby/*/gems/mechanize-*/lib/mechanize/http/content_disposition_parser.rb
!**/vendor/bundle-standalone/ruby/*/gems/mechanize-*/lib/mechanize/version.rb
# Ignore `bin` contents (again).
/bin

View File

@ -5,6 +5,9 @@ require "unpack_strategy"
require "lazy_object"
require "cgi"
require "mechanize/version"
require "mechanize/http/content_disposition_parser"
class AbstractDownloadStrategy
extend Forwardable
include FileUtils
@ -363,9 +366,20 @@ class CurlDownloadStrategy < AbstractFileDownloadStrategy
end
end
filenames =
lines.map { |line| line[/^Content\-Disposition:\s*(?:inline|attachment);\s*filename=(["']?)([^;]+)\1/i, 2] }
.compact
content_disposition_parser = Mechanize::HTTP::ContentDispositionParser.new
# Extracts the file name from a single response header line. Returns nil when
# the line is not a parseable Content-Disposition header.
#
# An extended `filename*` parameter (RFC 5987 ext-value) is preferred over the
# plain `filename`; if it is malformed or cannot be decoded, the plain
# `filename` is used instead.
parse_content_disposition = lambda do |line|
  next unless (content_disposition = content_disposition_parser.parse(line, true))

  filename_with_encoding = content_disposition.parameters["filename*"]
  next content_disposition.filename unless filename_with_encoding

  # ext-value = charset "'" [ language ] "'" percent-encoded-value
  # The language tag is optional, so split on single quotes rather than "''".
  charset, _language, encoded_filename = filename_with_encoding.split("'", 3)
  next content_disposition.filename if charset.to_s.empty? || encoded_filename.nil?

  begin
    # "+" is a literal character in an ext-value (unlike in form encoding),
    # so protect it before decoding. The percent-encoded bytes are in
    # `charset`; decode them as such and normalise the result to UTF-8.
    URI.decode_www_form_component(encoded_filename.gsub("+", "%2B"), charset)
       .encode(Encoding::UTF_8)
  rescue ArgumentError, EncodingError
    # Unknown charset, invalid %-escape or undecodable bytes.
    content_disposition.filename
  end
end
filenames = lines.map(&parse_content_disposition).compact
time =
lines.map { |line| line[/^Last\-Modified:\s*(.+)/i, 1] }

View File

@ -3,6 +3,7 @@ source "https://rubygems.org"
gem "activesupport"
gem "concurrent-ruby"
gem "backports"
gem "mechanize"
gem "plist"
gem "ruby-macho"
gem "rubocop-rspec"

View File

@ -9,10 +9,34 @@ GEM
ast (2.4.0)
backports (3.11.4)
concurrent-ruby (1.1.4)
connection_pool (2.2.2)
domain_name (0.5.20180417)
unf (>= 0.0.5, < 1.0.0)
http-cookie (1.0.3)
domain_name (~> 0.5)
i18n (1.5.3)
concurrent-ruby (~> 1.0)
jaro_winkler (1.5.2)
mechanize (2.7.6)
domain_name (~> 0.5, >= 0.5.1)
http-cookie (~> 1.0)
mime-types (>= 1.17.2)
net-http-digest_auth (~> 1.1, >= 1.1.1)
net-http-persistent (>= 2.5.2)
nokogiri (~> 1.6)
ntlm-http (~> 0.1, >= 0.1.1)
webrobots (>= 0.0.9, < 0.2)
mime-types (3.2.2)
mime-types-data (~> 3.2015)
mime-types-data (3.2018.0812)
mini_portile2 (2.4.0)
minitest (5.11.3)
net-http-digest_auth (1.4.1)
net-http-persistent (3.0.0)
connection_pool (~> 2.2)
nokogiri (1.10.1)
mini_portile2 (~> 2.4.0)
ntlm-http (0.1.1)
parallel (1.13.0)
parser (2.6.0.0)
ast (~> 2.4.0)
@ -34,7 +58,11 @@ GEM
thread_safe (0.3.6)
tzinfo (1.2.5)
thread_safe (~> 0.1)
unf (0.1.4)
unf_ext
unf_ext (0.0.7.5)
unicode-display_width (1.4.1)
webrobots (0.1.2)
PLATFORMS
ruby
@ -43,6 +71,7 @@ DEPENDENCIES
activesupport
backports
concurrent-ruby
mechanize
plist
rubocop
rubocop-rspec

View File

@ -12,8 +12,24 @@ $:.unshift "#{path}/../#{ruby_engine}/#{ruby_version}/gems/activesupport-5.2.2/l
$:.unshift "#{path}/../#{ruby_engine}/#{ruby_version}/gems/ast-2.4.0/lib"
$:.unshift "#{path}/../#{ruby_engine}/#{ruby_version}/gems/backports-3.11.4/lib"
$:.unshift "#{path}/"
$:.unshift "#{path}/../#{ruby_engine}/#{ruby_version}/gems/connection_pool-2.2.2/lib"
$:.unshift "#{path}/../#{ruby_engine}/#{ruby_version}/extensions/universal-darwin-18/2.3.0/unf_ext-0.0.7.5"
$:.unshift "#{path}/../#{ruby_engine}/#{ruby_version}/gems/unf_ext-0.0.7.5/lib"
$:.unshift "#{path}/../#{ruby_engine}/#{ruby_version}/gems/unf-0.1.4/lib"
$:.unshift "#{path}/../#{ruby_engine}/#{ruby_version}/gems/domain_name-0.5.20180417/lib"
$:.unshift "#{path}/../#{ruby_engine}/#{ruby_version}/gems/http-cookie-1.0.3/lib"
$:.unshift "#{path}/../#{ruby_engine}/#{ruby_version}/extensions/universal-darwin-18/2.3.0/jaro_winkler-1.5.2"
$:.unshift "#{path}/../#{ruby_engine}/#{ruby_version}/gems/jaro_winkler-1.5.2/lib"
$:.unshift "#{path}/../#{ruby_engine}/#{ruby_version}/gems/mime-types-data-3.2018.0812/lib"
$:.unshift "#{path}/../#{ruby_engine}/#{ruby_version}/gems/mime-types-3.2.2/lib"
$:.unshift "#{path}/../#{ruby_engine}/#{ruby_version}/gems/net-http-digest_auth-1.4.1/lib"
$:.unshift "#{path}/../#{ruby_engine}/#{ruby_version}/gems/net-http-persistent-3.0.0/lib"
$:.unshift "#{path}/../#{ruby_engine}/#{ruby_version}/gems/mini_portile2-2.4.0/lib"
$:.unshift "#{path}/../#{ruby_engine}/#{ruby_version}/extensions/universal-darwin-18/2.3.0/nokogiri-1.10.1"
$:.unshift "#{path}/../#{ruby_engine}/#{ruby_version}/gems/nokogiri-1.10.1/lib"
$:.unshift "#{path}/../#{ruby_engine}/#{ruby_version}/gems/ntlm-http-0.1.1/lib"
$:.unshift "#{path}/../#{ruby_engine}/#{ruby_version}/gems/webrobots-0.1.2/lib"
$:.unshift "#{path}/../#{ruby_engine}/#{ruby_version}/gems/mechanize-2.7.6/lib"
$:.unshift "#{path}/../#{ruby_engine}/#{ruby_version}/gems/parallel-1.13.0/lib"
$:.unshift "#{path}/../#{ruby_engine}/#{ruby_version}/gems/parser-2.6.0.0/lib"
$:.unshift "#{path}/../#{ruby_engine}/#{ruby_version}/gems/plist-3.5.0/lib"

View File

@ -0,0 +1,191 @@
# coding: BINARY
require 'strscan'
require 'time'
class Mechanize::HTTP
  # Result of parsing a Content-Disposition header. The named members hold
  # the disposition type and the well-known parameters; +parameters+ holds
  # the remaining parameters.
  ContentDisposition = Struct.new(
    :type,
    :filename,
    :creation_date,
    :modification_date,
    :read_date,
    :size,
    :parameters
  )
end
##
# Parser for Content-Disposition headers that loosely follows RFC 2183.
#
# Beyond RFC 2183, this parser allows:
#
# * Missing disposition-type
# * Multiple semicolons
# * Whitespace around semicolons
class Mechanize::HTTP::ContentDispositionParser
attr_accessor :scanner # :nodoc:
@parser = nil
##
# Convenience entry point: parses the disposition type and params in the
# +content_disposition+ string using a parser instance that is created on
# first use and reused afterwards. The "Content-Disposition:" field name
# must already be removed.
def self.parse(content_disposition)
  shared_parser = (@parser ||= new)
  shared_parser.parse(content_disposition)
end
##
# Creates a new parser for Content-Disposition headers. The scanner starts
# out unset; #parse assigns a fresh one for each string it is given.
def initialize
@scanner = nil
end
##
# Parses +content_disposition+ into a Mechanize::HTTP::ContentDisposition.
#
# When +header+ is true the string must begin with the "Content-Disposition:"
# field name (matched case-insensitively), which is consumed before the value
# is parsed. When +header+ is false the field name must already be removed.
#
# Returns nil if the string is empty, if the expected field name is missing,
# or if the parameters cannot be parsed.
def parse(content_disposition, header = false)
  return nil if content_disposition.empty?

  @scanner = StringScanner.new(content_disposition)

  if header
    return nil unless @scanner.scan(/Content-Disposition/i)
    return nil unless @scanner.scan(/:/)
    spaces
  end

  # Where the header value begins: 0 without a field name, just past
  # "Content-Disposition:" and its trailing spaces with one.
  value_start = @scanner.pos

  type = rfc_2045_token
  @scanner.scan(/;+/)

  # A "=" right after the first token means the disposition-type is missing
  # and that token was really a parameter name. Rewind to the start of the
  # value, not to the start of the string: with +header+ set, position 0
  # would make the field name itself be read as a parameter and fail.
  if @scanner.peek(1) == '='
    @scanner.pos = value_start
    type = nil
  end

  disposition = Mechanize::HTTP::ContentDisposition.new(type)
  spaces

  return nil unless (parameters = parse_parameters)

  # Well-known parameters get their own members; whatever is left stays in
  # the +parameters+ hash.
  disposition.filename          = parameters.delete('filename')
  disposition.creation_date     = parameters.delete('creation-date')
  disposition.modification_date = parameters.delete('modification-date')
  disposition.read_date         = parameters.delete('read-date')
  disposition.size              = parameters.delete('size')
  disposition.parameters        = parameters

  disposition
end
##
# Extracts the disposition-parm list at the current scanner position and
# returns it as a Hash keyed by lower-cased parameter name.
#
# Values are converted by parameter name:
#
# * creation-date, modification-date, read-date: a quoted RFC 822 date,
#   returned as a Time
# * size: an Integer
# * anything else (including filename): the token or unquoted string
#
# Returns nil if any parameter is malformed: missing name, missing "=",
# missing value, an unparseable date or an empty size.
def parse_parameters
  parameters = {}

  loop do
    return nil unless (param = rfc_2045_token)

    param.downcase!

    return nil unless @scanner.scan(/=/)

    value =
      case param
      when /^(creation|modification|read)-date$/
        date = rfc_2045_quoted_string
        begin
          # Time.rfc822 raises ArgumentError on anything that is not a valid
          # date; treat that like every other malformed value.
          date && Time.rfc822(date)
        rescue ArgumentError
          nil
        end
      when /^size$/
        # Guard against a missing value: nil.to_i(10) would raise.
        size = rfc_2045_value
        size && size.to_i(10)
      else
        rfc_2045_value
      end

    return nil unless value

    parameters[param] = value

    spaces

    # Stop at the end of input or when no further ";" separator follows.
    break if @scanner.eos? || !@scanner.scan(/;+/)

    spaces
  end

  parameters
end
##
# Parses an RFC 2045 quoted-string at the current scanner position and
# returns its content with the surrounding quotes and escapes removed.
#
#   quoted-string = <"> *(qtext/quoted-pair) <">
#   qtext         = <any CHAR excepting <">, "\" & CR,
#                   and including linear-white-space>
#   quoted-pair   = "\" CHAR
#
# A CRLF followed by tabs or spaces (folded whitespace) becomes one space.
#
# Returns nil if the scanner is not at an opening quote, if the string is
# unterminated, or if it contains a byte the grammar does not allow (a bare
# CR or a non-ASCII byte).
def rfc_2045_quoted_string
  return nil unless @scanner.scan(/"/)

  text = ''

  # Each pass consumes exactly one grammar element, so quoted-pairs and
  # folds are accepted anywhere in the string, including at the very start
  # and directly after one another.
  loop do
    if (chunk = @scanner.scan(/[\000-\014\016-\041\043-\133\135-\177]+/)) # not \r "
      text << chunk
    elsif @scanner.skip(/\\/)
      # quoted-pair: the byte after the backslash is taken literally.
      return nil if @scanner.eos?

      text << @scanner.get_byte
    elsif @scanner.skip(/\r\n[\t ]+/)
      text << " "
    elsif @scanner.skip(/"/)
      return text
    else
      return nil
    end
  end
end
##
# Scans an RFC 2045 token at the current scanner position.
#
#   token := 1*<any (US-ASCII) CHAR except SPACE, CTLs, or tspecials>
#
# Returns the token text, or nil when no token starts here.
def rfc_2045_token
  token_pattern = /[^\000-\037\177()<>@,;:\\"\/\[\]?= ]+/
  @scanner.scan(token_pattern)
end
##
# Parses an RFC 2045 value at the current scanner position.
#
#   value := token / quoted-string
#
# A leading double quote selects the quoted-string form; anything else is
# read as a token.
def rfc_2045_value
  return rfc_2045_quoted_string if @scanner.peek(1) == '"'

  rfc_2045_token
end
##
# Consumes a run of one or more space characters (1*SP) at the current
# scanner position. Returns the consumed text, or nil when the next
# character is not a space.
def spaces
  space_run = / +/
  @scanner.scan(space_run)
end
end

View File

@ -0,0 +1,3 @@
class Mechanize
  # Version of the Mechanize gem this file belongs to.
  VERSION = "2.7.6"
end