diff --git a/.gitignore b/.gitignore index daba77c..e1ef93d 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,13 @@ Gemfile.lock .yardoc doc/ +# mac junk +._* + +# annoying emacs backups +.\#* +\#*\# + vendor # don't include generated files diff --git a/lib/mimemagic.rb b/lib/mimemagic.rb index 8fe8372..26a8d2b 100644 --- a/lib/mimemagic.rb +++ b/lib/mimemagic.rb @@ -9,107 +9,405 @@ # Mime type detection class MimeMagic - attr_reader :type, :mediatype, :subtype + attr_reader :type, :mediatype, :subtype, :params - # Mime type by type string + # Initialize a new MIME type by its string representation. + # + # @param type [#to_s] the type to parse. NOTE(review): a blank string leaves @type nil, so `downcase!` raises NoMethodError. + # def initialize(type) - @type = type - @mediatype, @subtype = type.split('/', 2) + @type, *params = type.to_s.strip.split(/(?:\s*;\s*)+/) # chop off params + @type.downcase! # normalize the case + # split parameter-value pairs if present (@params stays nil when there are none) + @params = params.map { |x| x.split(/\s*=\s*/, 2) } unless params.empty? + @mediatype, @subtype = @type.split ?/, 2 # split major and minor + end + + # Syntactic sugar alias for constructor. No-op if `type` is already + # a {MimeMagic} object. + # + # @param type [#to_s] a string-like object representing a MIME type + # or file extension. + # + # @return [MimeMagic, nil] the instantiated object; `nil` when `type` has no slash and is not a known file extension. + # + def self.[] type + # try noop first + return type if type.is_a? self + + # now we handle the string + type = type.to_s.strip + return by_extension type unless type.include? ?/ + + # otherwise pass to constructor + new type end - # Add custom mime type. Arguments: - # * type: Mime type - # * options: Options hash + # Add a custom MIME type to the internal dictionary. 
+ # + # @param type [#to_s] the type + # @param extensions [Array<#to_s>] file extensions; an extension that is already registered keeps its existing type + # @param parents [Array<#to_s>] parent types + # @param magic [Array] MIME "magic" specification (NOTE(review): the `[]` default is truthy, so `if magic` still unshifts an empty entry onto MAGIC -- confirm intended) + # @param aliases [Array<#to_s>] alternative names for the type; each becomes a TYPES key sharing the same record + # @param comment [#to_s] a comment # - # Option keys: - # * :extensions: String list or single string of file extensions - # * :parents: String list or single string of parent mime types - # * :magic: Mime magic specification - # * :comment: Comment string - def self.add(type, options) - extensions = [options[:extensions]].flatten.compact - TYPES[type] = [extensions, - [options[:parents]].flatten.compact, - options[:comment]] - extensions.each {|ext| EXTENSIONS[ext] = type } - MAGIC.unshift [type, options[:magic]] if options[:magic] + def self.add type, + extensions: [], parents: [], magic: [], comment: nil, aliases: [] + type = type.to_s.strip.downcase + extensions = [extensions].flatten.compact + aliases = [[aliases] || []].flatten.compact + t = TYPES[type] = [extensions, [parents].flatten.compact, + comment, type, aliases] + aliases.each { |a| TYPES[a] = t } + extensions.each {|ext| EXTENSIONS[ext] ||= type } + + MAGIC.unshift [type, magic] if magic + + true # the return value carries no information end - # Removes a mime type from the dictionary. You might want to do this if + # Removes a MIME type from the dictionary. You might want to do this if # you're seeing impossible conflicts (for instance, application/x-gmc-link). - # * type: The mime type to remove. All associated extensions and magic are removed too. + # + # @note All associated extensions and magic are removed too; alias keys that {.add} stored in TYPES are left in place (NOTE(review): confirm intended). 
+ # + # @param type [#to_s] the type to remove; compared as given, without normalization. + # def self.remove(type) EXTENSIONS.delete_if {|ext, t| t == type } MAGIC.delete_if {|t, m| t == type } TYPES.delete(type) + + true # the return value carries no information here either end - # Returns true if type is a text format - def text?; mediatype == 'text' || child_of?('text/plain'); end + # Returns true if type is a text format. 
+ def text?; mediatype == 'text' || descendant_of?('text/plain'); end - # Mediatype shortcuts + # Determine if the type is an image. def image?; mediatype == 'image'; end + + # Determine if the type is audio. def audio?; mediatype == 'audio'; end - def video?; mediatype == 'video'; end - # Returns true if type is child of parent type - def child_of?(parent) - MimeMagic.child?(type, parent) - end + # Determine if the type is video. + def video?; mediatype == 'video'; end - # Get string list of file extensions + # Get string list of file extensions. + # + # @return [Array<String>] copies of the associated file extensions (empty if the type is unknown). + # def extensions - TYPES.key?(type) ? TYPES[type][0] : [] + TYPES.fetch(type, [[]]).first.map { |e| e.to_s.dup } end - # Get mime comment + # Get MIME comment. + # + # @return [String] a copy of the comment (empty string if there is none) + # def comment - (TYPES.key?(type) ? TYPES[type][2] : nil).to_s + TYPES.fetch(type, [nil, nil, nil])[2].to_s.dup + end + + # Return the canonical type. Returns `nil` if the type is unknown to + # the registry. + # + # @return [MimeMagic, nil] the canonical type (`self` when already canonical), if present. + # + def canonical + t = TYPES[type.downcase] or return + return self if type == t[3] + self.class.new t[3] + end + + # Return the type's aliases. + # + # @return [Array<MimeMagic>] the aliases, if any. + # + def aliases + TYPES.fetch(type.downcase, [nil, nil, nil, nil, []])[4].map do |t| + self.class.new t + end end - # Lookup mime type by file extension - def self.by_extension(ext) - ext = ext.to_s.downcase - mime = ext[0..0] == '.' ? EXTENSIONS[ext[1..-1]] : EXTENSIONS[ext] - mime && new(mime) + # Determine if the type is an alias. + # + # @return [false, true] whether the type is an alias. NOTE(review): raises NoMethodError for a type unknown to the registry, because #canonical returns nil. + # + def alias? 
+ type != canonical.type end - # Lookup mime type by filename - def self.by_path(path) - by_extension(File.extname(path)) + # Returns true if the ancestor type is anywhere in the subject + # type's lineage. Returns `nil` (falsy) if `self` is unknown to the + # type registry; an unknown `ancestor` is meant to yield `false` (NOTE(review): verify). 
+ # + # @param ancestor [MimeMagic, #to_s] the candidate ancestor type + # + # @return [true, false, nil] whether `self` is a descendant of `ancestor` (`nil` when `self` is unknown) + # + def descendant_of? ancestor + # falsy (nil) if we don't know what this is + return unless c = canonical + + # ancestor canonical could be nil, intended to give false. NOTE(review): self.class[ancestor] is itself nil for a slash-less string that is not a known extension (NoMethodError here), and include?(nil) goes through #== -- verify. + c.lineage.include? self.class[ancestor].canonical end - # Lookup mime type by magic content analysis. - # This is a slow operation. - def self.by_magic(io) - mime = magic_match(io, :find) - mime && new(mime[0]) + # Returns true if type is child of parent type. Behaves the same as + # #descendant_of? if `recurse` is true, which is the default. + # + # @param parent [#to_s] a candidate parent type + # @param recurse [true, false] whether to search the whole lineage rather than only immediate parents + # + # @return [true, false, nil] whether `self` is a child of `parent` (`nil` when `self` is unknown) + # + def child_of?(parent, recurse: true) + return descendant_of? parent if recurse + return unless c = canonical + c.parents.include? self.class[parent].canonical end - # Lookup all mime types by magic content analysis. - # This is a slower operation. - def self.all_by_magic(io) - magic_match(io, :select).map { |mime| new(mime[0]) } + # Fetches the immediate parent types. + # + # @return [Array<MimeMagic>] the type's parents + # + def parents + out = TYPES.fetch(type.to_s.downcase, [nil, []])[1].map do |x| + self.class.new x + end + # a type with no listed parents gets application/octet-stream, unless it is application/octet-stream itself + out << self.class.new('application/octet-stream') if + out.empty? 
and type.downcase != 'application/octet-stream' + + out.uniq end - # Return type as string - def to_s - type + # Fetches the entire inheritance hierarchy for the given MIME type. + # + # @return [Array<MimeMagic>] the type's lineage, starting with the canonical type (or `self` if unknown) + # + def lineage + ([canonical || self] + parents.map { |t| t.lineage }.flatten).uniq + end + + alias_method :ancestor_types, :lineage + + # Determine if the _type_ is binary, i.e. _not_ a descendant of `text/plain`. Not to be 
+ # + # @return [true, false, nil] whether the type is binary. + # + def binary? + not lineage.include? 'text/plain' end - # Allow comparison with string + # Compare the equality of the type with another (or plain string). + # + # @param other [#to_s] the other to test + # + # @return [false, true] whether the two are equal. + # def eql?(other) - type == other.to_s + # coerce the rhs + other = self.class[other] + + # check for an exact match + return true if type == other.type + + # now canonicalize both sides and check + lhs = canonical + rhs = other.canonical + + lhs && rhs && lhs.type == rhs.type end + alias_method :==, :eql? + + # Return the object's (the underlying type string) hash. + # + # @return [Integer] the hash value. + # def hash type.hash end - alias == eql? + # Return the type as a string. + # + # @return [String] the type, as a string. + # + def to_s + type + end + + # Return a diagnostic representation of the object. + # + # @return [String] a string representing the object. + # + def inspect + out = @type + out = [out, @params.map { |x| x.join ?= }].join ?; if + @params and !@params.empty? + %q[<%s "%s">] % [self.class, out] + end + + # Look up MIME type by file extension. When `default` is true or a + # value, this method will always return a value. + # + # @param path [#to_s] + # @param default [false, true, #to_s, MimeMagic] a default fallback type + # + # @return [nil, MimeMagic] the type, if found. + # + def self.by_extension ext, default: false + ext = ext.to_s.downcase.delete_prefix ?. + default = coerce_default '', default + mime = EXTENSIONS[ext] + mime ? new(mime) : default + end + + # Look up MIME type by file path. When `default` is true or a value, + # this method will always return a value. + # + # @param path [#to_s] the file/path to check + # @param default [false, true, #to_s, MimeMagic] a default fallback type + # + # @return [nil, MimeMagic] the type, if found. 
+ # + def self.by_path path, default: false + by_extension(File.extname(path), default: default) + end + + # Look up MIME type by magic content analysis. When `default` is true or a + # value, this method will always return a value. + # + # @note This is a relatively slow operation. + # + # @param io [#read, #to_s] the IO/String-like object to check for magic + # @param default [false, true, #to_s, MimeMagic] a default fallback type + # + # @return [nil, MimeMagic] a matching type, if found; otherwise the coerced default. + # + def self.by_magic io, default: false + default = coerce_default io, default + mime = magic_match(io, :find) or return default + new mime.first + end + + # Return all matching MIME types by magic content analysis. When + # `default` is true or a value, the result will never be empty. + # + # @note This is a relatively slow operation. + # + # @param io [#read, #to_s] the IO/String-like object to check for magic + # @param default [false, true, #to_s, MimeMagic] a default fallback type + # + # @return [Array<MimeMagic>] all matching types + # + def self.all_by_magic io, default: false + default = coerce_default io, default + out = magic_match(io, :select).map { |mime| new mime.first } + out << default if out.empty? and default + out + end - def self.child?(child, parent) - child == parent || TYPES.key?(child) && TYPES[child][1].any? {|p| child?(p, parent) } + # Returns true if `child` is a child of `parent`; delegates to #child_of?, passing `recurse` through. + # + # @param child [#to_s] a candidate child type + # @param parent [#to_s] a candidate parent type + # + # @return [true, false] whether `child` is a child of `parent` + # + def self.child?(child, parent, recurse: true) + self[child].child_of? parent, recurse: recurse + end + + # Return the canonical form of `type`. + # + # @param type [#to_s] the type to canonicalize + # + # @return [MimeMagic, nil] the canonical type, if present. 
+ # + def self.canonical type + self[type].canonical + end + + # Return the type's aliases. 
+ # + # @param type [#to_s] the type to check + # + # @return [Array<MimeMagic>] the aliases, if any. + # + def self.aliases type + self[type].aliases + end + + # Determine if an _input_ is binary. Not to be confused with the + # instance method {#binary?}, which concerns the _type_. + # + # @param thing [#read, #to_s] the IO-like or String-like thing to + # test; can also be a file name/path/extension or MIME type. + # + # @return [true, false, nil] whether the input is binary (`nil` if + # indeterminate). + # + def self.binary? thing + sample = '' + + # sample up to 256 units from the IO (restoring its position afterwards) or from the string + if thing.is_a? MimeMagic + return thing.binary? + elsif %i[seek tell read].all? { |m| thing.respond_to? m } + pos = thing.tell + thing.seek 0, 0 + sample = thing.read(256).to_s # read gives nil at EOF; to_s turns that into '' + thing.seek pos + elsif thing.respond_to? :to_s + str = thing.to_s + # if it contains a slash it could be either a path or mimetype + test = if str.include? ?/ + canonical(str) || by_extension(str.split(?.).last) + else + by_extension str.split(?.).last + end + + return test.binary? if test + + sample = str[0, 256] + else + # nil if we don't know what this thing is + return + end + + # consider this to be 'binary' if empty + return true if sample.empty? + # binary if it contains any control code other than ordinary whitespace (bytes 0x09-0x0d are allowed) + /[\x0-\x8\xe-\x1f\x7f]/n.match? sample.b + end + + # Return either `application/octet-stream` or `text/plain` depending + # on whether the thing is binary. + # + # @param thing [#read, #to_s] the thing (IO-like, String-like, MIME type, or file name/extension) to test + # + # @return [MimeMagic] the default type + # + def self.default_type thing + new(binary?(thing) ? 
'application/octet-stream' : 'text/plain') + end + + private # NOTE(review): bare `private` does not apply to `def self.` methods, so the class methods below stay public + + def self.coerce_default thing, default + case default + when nil, false then nil + when true then default_type thing + when MimeMagic then default + when String, -> x { x.respond_to? 
:to_s } then new default # NOTE(review): virtually every object responds to to_s, so the else branch is effectively unreachable + else default_type thing + end end def self.magic_match(io, method) @@ -138,5 +436,4 @@ def self.magic_match_io(io, matches, buffer) end end - private_class_method :magic_match, :magic_match_io end diff --git a/lib/mimemagic/tables.rb b/lib/mimemagic/tables.rb index 11da9f6..8b55bc7 100644 --- a/lib/mimemagic/tables.rb +++ b/lib/mimemagic/tables.rb @@ -80,17 +80,23 @@ def self.parse_database comments = Hash[*(mime/'comment').map {|comment| [comment['xml:lang'], comment.inner_text] }.flatten] type = mime['type'] subclass = (mime/'sub-class-of').map{|x| x['type']} - exts = (mime/'glob').map{|x| x['pattern'] =~ /^\*\.([^\[\]]+)$/ ? $1.downcase : nil }.compact + exts = (mime/'glob').map do |x| + x['pattern'] =~ /^\*\.([^\[\]]+)$/ ? $1.downcase : nil + end.compact + (mime/'magic').each do |magic| priority = magic['priority'].to_i matches = get_matches(magic) magics << [priority, type, matches] end - if !exts.empty? - exts.each{|x| - extensions[x] = type if !extensions.include?(x) - } - types[type] = [exts,subclass,comments[nil]] + + aliases = (mime/'alias/@type').map { |a| a.value.downcase.strip.freeze } + + # XXX: a type with no usable glob/extension is skipped entirely (no record, no aliases) -- is that intended? + unless exts.empty? + exts.each { |x| extensions[x] ||= type } + types[type] = [exts, subclass, comments[nil], type, aliases] + # alias keys are not added here; they are registered when TYPES is populated below end end @@ -141,13 +147,20 @@ def self.parse_database extensions.keys.sort.each do |key| EXTENSIONS[key] = extensions[key] end + types.keys.sort.each do |key| - exts = types[key][0] - parents = types[key][1].sort - comment = types[key][2] + exts, parents, comment, canon, aliases = *types[key] - TYPES[key] = [exts, parents, comment] + parents.sort! + aliases.sort! 
+ + # store the record, frozen, under the type's own key + t = TYPES[key] = [exts, parents, comment, canon, aliases].freeze + + # register each alias as a key pointing at the same record; alias keys are therefore not inserted in sorted order + aliases.each { |a| TYPES[a] = t } end + magics.each do |priority, type, matches| MAGIC << [type, matches] end diff --git a/lib/mimemagic/version.rb b/lib/mimemagic/version.rb index 56e3ee1..68702d2 100644 --- a/lib/mimemagic/version.rb +++ b/lib/mimemagic/version.rb @@ -1,5 +1,5 @@ class MimeMagic # MimeMagic version string # @api public - VERSION = '0.4.3' + VERSION = '0.5.3' end diff --git a/test/mimemagic_test.rb b/test/mimemagic_test.rb index 8aa48b1..db54224 100644 --- a/test/mimemagic_test.rb +++ b/test/mimemagic_test.rb @@ -25,6 +25,12 @@ def test_have_type_mediatype_and_subtype assert_equal 'text/html', MimeMagic.new('text/html').type assert_equal 'text', MimeMagic.new('text/html').mediatype assert_equal 'html', MimeMagic.new('text/html').subtype + + # equality ignores letter case and any parameters after the type + assert MimeMagic['TEXT/HTML'] == 'TeXT/HtML;charset=utf-8' + + # regression check: comparing against a type with no canonical (unknown to the registry) used to crash + assert MimeMagic['text/html'] != 'application/x-bogus' end def test_have_mediatype_helpers @@ -40,7 +46,10 @@ def test_have_mediatype_helpers def test_have_hierarchy assert MimeMagic.new('text/html').child_of?('text/plain') - assert MimeMagic.new('text/x-java').child_of?('text/plain') + # no: text/plain is an ancestor but not an immediate parent + refute MimeMagic.new('text/x-java').child_of?('text/plain', recurse: false) + # yes: it is found when the whole lineage is searched + assert MimeMagic.new('text/x-java').descendant_of?('text/plain') end def test_have_extensions @@ -52,28 +61,37 @@ def test_have_comment end def test_recognize_extensions - assert true + assert MimeMagic.by_extension('html') - # Unknown if this test failure 
is expected. Commenting out for now. 
+ # these resolve to application/xhtml+xml instead of text/html + # because of ambiguities in file extension associations; the data + # file lists the former first, and the first association for an extension wins. # # assert_equal 'text/html', MimeMagic.by_extension('.html').to_s # assert_equal 'text/html', MimeMagic.by_extension('html').to_s # assert_equal 'text/html', MimeMagic.by_extension(:html).to_s - # assert_equal 'application/x-ruby', MimeMagic.by_extension('rb').to_s - # assert_nil MimeMagic.by_extension('crazy') - # assert_nil MimeMagic.by_extension('') + + assert_equal 'application/x-ruby', MimeMagic.by_extension('rb').to_s + assert_nil MimeMagic.by_extension('crazy') + assert_nil MimeMagic.by_extension('') + # with default: true an unknown extension falls back to a default type + assert_equal 'application/octet-stream', + MimeMagic.by_extension('crazy', default: true).to_s end def test_recognize_by_a_path - assert true - # Unknown if this test failure is expected. Commenting out for now. + # the same extension ambiguity applies to path lookups. # # assert_equal 'text/html', MimeMagic.by_path('/adsjkfa/kajsdfkadsf/kajsdfjasdf.html').to_s # assert_equal 'text/html', MimeMagic.by_path('something.html').to_s - # assert_equal 'application/x-ruby', MimeMagic.by_path('wtf.rb').to_s - # assert_nil MimeMagic.by_path('where/am.html/crazy') - # assert_nil MimeMagic.by_path('') + + assert_equal 'application/x-ruby', MimeMagic.by_path('wtf.rb').to_s + assert_nil MimeMagic.by_path('where/am.html/crazy') + assert_nil MimeMagic.by_path('') + + assert_equal 'application/octet-stream', + MimeMagic.by_path('', default: true).to_s end def test_recognize_xlsx_as_zip_without_magic @@ -118,7 +136,7 @@ def test_have_add assert_equal 'application/mimemagic-test', MimeMagic.by_extension('ext2').to_s assert_equal 'Comment', MimeMagic.by_extension('ext2').comment assert_equal %w(ext1 ext2), MimeMagic.new('application/mimemagic-test').extensions - assert 
MimeMagic.new('application/mimemagic-test').child_of?('text/plain') + assert 
MimeMagic.new('application/mimemagic-test').descendant_of?('text/plain') end def test_process_magic @@ -151,6 +169,16 @@ def test_process_magic assert_nil MimeMagic.by_magic(StringIO.new 'Z MAGICTEST') end + def test_type_is_binary + assert MimeMagic.binary? 'psd' + refute MimeMagic.binary? 'html' + end + + def test_fancy_constructor + assert_equal 'text/html', MimeMagic['text/html'].to_s + assert_equal 'application/pdf', MimeMagic['pdf'].to_s + end + class IOObject def initialize @io = StringIO.new('MAGICTEST')