Skip to content

Commit

Permalink
Merge pull request rouge-ruby#489 from jneen/refactor.guessers
Browse files Browse the repository at this point in the history
Refactor.guessers
  • Loading branch information
jneen authored Jun 14, 2016
2 parents c0fde3b + f0f6b47 commit f3dea84
Show file tree
Hide file tree
Showing 11 changed files with 311 additions and 88 deletions.
7 changes: 7 additions & 0 deletions lib/rouge.rb
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,13 @@ def highlight(text, lexer, formatter, &b)
load load_dir.join('rouge/text_analyzer.rb')
load load_dir.join('rouge/token.rb')

load load_dir.join('rouge/guesser.rb')
load load_dir.join('rouge/guessers/glob_mapping.rb')
load load_dir.join('rouge/guessers/modeline.rb')
load load_dir.join('rouge/guessers/filename.rb')
load load_dir.join('rouge/guessers/mimetype.rb')
load load_dir.join('rouge/guessers/source.rb')

load load_dir.join('rouge/lexer.rb')
load load_dir.join('rouge/regex_lexer.rb')
load load_dir.join('rouge/template_lexer.rb')
Expand Down
46 changes: 46 additions & 0 deletions lib/rouge/guesser.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
module Rouge
  # Base class for lexer guessers. A guesser takes a list of candidate
  # lexers and returns a (hopefully shorter) list; subclasses implement
  # #filter to do so.
  class Guesser
    # Runs every guesser, in order, over the candidate list.
    #
    # Each entry of +guessers+ is either a Guesser (its #filter is used) or
    # any object that responds to #call. When a guesser returns nil/false or
    # a collection with no truthy element, its result is discarded and the
    # candidates are passed on unchanged.
    #
    # Returns the narrowed candidates, or [] when the list never shrank.
    # Raises RuntimeError for an entry that is neither kind of guesser.
    def self.guess(guessers, lexers)
      starting_count = lexers.size

      remaining = guessers.inject(lexers) do |candidates, guesser|
        narrowed =
          if guesser.is_a?(Guesser)
            guesser.filter(candidates)
          elsif guesser.respond_to?(:call)
            guesser.call(candidates)
          else
            raise "bad guesser: #{guesser}"
          end

        narrowed && narrowed.any? ? narrowed : candidates
      end

      # Nothing was ruled out, so there is no basis for a guess at all.
      remaining.size < starting_count ? remaining : []
    end

    # Scores each lexer with the given block and returns every lexer that
    # shares the highest score, in their original order.
    #
    # A nil score skips the lexer. When opts[:threshold] is given it is the
    # starting score to beat: lexers scoring below it are dropped, while a
    # lexer scoring exactly the threshold is kept.
    def collect_best(lexers, opts={}, &scorer)
      high_score = opts[:threshold]

      lexers.each_with_object([]) do |lexer, winners|
        score = scorer.call(lexer)
        next if score.nil?

        if high_score.nil? || score > high_score
          # a new leader replaces everything collected so far
          high_score = score
          winners.replace([lexer])
        elsif score == high_score
          winners << lexer
        end
      end
    end

    # Narrows +lexers+; must be overridden by subclasses.
    def filter(lexers)
      raise 'abstract'
    end
  end
end
25 changes: 25 additions & 0 deletions lib/rouge/guessers/filename.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
module Rouge
  module Guessers
    # Narrows candidate lexers by matching a file name against the filename
    # globs each lexer declares.
    class Filename < Guesser
      # The file name (or path) given to the constructor.
      attr_reader :filename

      # Kept for backward compatibility: `fname` was the declared reader, but
      # no `@fname` was ever assigned, so it always returned nil. It now
      # returns the stored file name.
      alias_method :fname, :filename

      def initialize(filename)
        @filename = filename
      end

      # returns a list of lexers that match the given filename with
      # equal specificity (i.e. number of wildcards in the pattern).
      # This helps disambiguate between, e.g. the Nginx lexer, which
      # matches `nginx.conf`, and the Conf lexer, which matches `*.conf`.
      # In this case, nginx will win because the pattern has no wildcards,
      # while `*.conf` has one.
      def filter(lexers)
        # lexer name => its filename globs (an empty list when it has none)
        mapping = lexers.each_with_object({}) do |lexer, globs|
          globs[lexer.name] = lexer.filenames || []
        end

        GlobMapping.new(mapping, @filename).filter(lexers)
      end
    end
  end
end
46 changes: 46 additions & 0 deletions lib/rouge/guessers/glob_mapping.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
module Rouge
  module Guessers
    # This class allows for custom behavior
    # with glob -> lexer name mappings
    class GlobMapping < Guesser
      # Builds a GlobMapping from an enumerable of [glob, lexer_name] pairs.
      # Each name is resolved with Lexer.find; pairs naming an unknown lexer
      # are dropped, and globs are grouped under the resolved lexer's name.
      def self.by_pairs(mapping, filename)
        glob_map = {}
        mapping.each do |(glob, lexer_name)|
          lexer = Lexer.find(lexer_name)

          # ignore unknown lexers
          next unless lexer

          (glob_map[lexer.name] ||= []) << glob
        end

        new(glob_map, filename)
      end

      attr_reader :glob_map, :filename

      # glob_map: Hash of lexer name => Array of glob patterns.
      # filename: the file name (or path) to test; only its basename is used.
      def initialize(glob_map, filename)
        @glob_map = glob_map
        @filename = filename
      end

      # Returns the lexers whose best matching glob is the most specific one
      # seen, or [] when no glob matches the file's basename.
      def filter(lexers)
        basename = File.basename(filename)

        collect_best(lexers) do |lexer|
          scores = (@glob_map[lexer.name] || []).map do |pattern|
            next unless test_pattern(pattern, basename)

            # specificity is better the fewer wildcards there are, so the
            # score is the negated wildcard count (0 is the best possible)
            -pattern.scan(/[*?\[]/).size
          end

          # Scores are negated, so a lexer's most specific matching glob is
          # the maximum. Taking the minimum would rank the lexer by its least
          # specific match instead.
          scores.compact.max
        end
      end

      private

      # Glob match that also matches leading dots and ignores case.
      def test_pattern(pattern, path)
        File.fnmatch?(pattern, path, File::FNM_DOTMATCH | File::FNM_CASEFOLD)
      end
    end
  end
end
14 changes: 14 additions & 0 deletions lib/rouge/guessers/mimetype.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
module Rouge
  module Guessers
    # Keeps only the lexers that list the given MIME type among their
    # mimetypes.
    class Mimetype < Guesser
      # The MIME type given to the constructor.
      attr_reader :mimetype

      def initialize(mimetype)
        @mimetype = mimetype
      end

      # Returns the subset of +lexers+ declaring the stored MIME type
      # (possibly empty).
      def filter(lexers)
        lexers.select do |candidate|
          candidate.mimetypes.include?(@mimetype)
        end
      end
    end
  end
end
42 changes: 42 additions & 0 deletions lib/rouge/guessers/modeline.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
module Rouge
  module Guessers
    # Narrows candidate lexers using an editor modeline (Emacs or Vim) found
    # near the top or bottom of the source text.
    class Modeline < Guesser
      # [jneen] regexen stolen from linguist
      EMACS_MODELINE = /-\*-\s*(?:(?!mode)[\w-]+\s*:\s*(?:[\w+-]+)\s*;?\s*)*(?:mode\s*:)?\s*([\w+-]+)\s*(?:;\s*(?!mode)[\w-]+\s*:\s*[\w+-]+\s*)*;?\s*-\*-/i

      # First form vim modeline
      # [text]{white}{vi:|vim:|ex:}[white]{options}
      # ex: 'vim: syntax=ruby'
      VIM_MODELINE_1 = /(?:vim|vi|ex):\s*(?:ft|filetype|syntax)=(\w+)\s?/i

      # Second form vim modeline (compatible with some versions of Vi)
      # [text]{white}{vi:|vim:|Vim:|ex:}[white]se[t] {options}:[text]
      # ex: 'vim: set syntax=ruby:'
      VIM_MODELINE_2 = /(?:vim|vi|Vim|ex):\s*se(?:t)?.*\s(?:ft|filetype|syntax)=(\w+)\s?.*:/i

      MODELINES = [EMACS_MODELINE, VIM_MODELINE_1, VIM_MODELINE_2].freeze

      # source: a String, or an object responding to #read.
      # opts[:lines]: how many lines to inspect at each end of the text
      # (defaults to 5).
      def initialize(source, opts={})
        @source = source
        @lines = opts[:lines] || 5
      end

      # Returns the lexers whose tag or one of whose aliases equals a language
      # name captured from a modeline; [] when no modeline names a candidate.
      def filter(lexers)
        # don't bother reading the stream if we've already decided
        return lexers if lexers.size == 1

        # NOTE(review): reading consumes an IO-like source, so a later
        # guesser handed the same object would start at EOF -- confirm
        # callers pass a String or a rewindable/fresh stream.
        source_text = @source
        source_text = source_text.read if source_text.respond_to?(:read)

        lines = source_text.split(/\r?\n/)

        # only the first and last @lines lines are searched
        search_space = (lines.first(@lines) + lines.last(@lines)).join("\n")

        # each pattern contributes at most its first match; group 1 is the
        # language name
        matches = MODELINES.map { |re| re.match(search_space) }.compact
        names = matches.map { |m| m[1] }.uniq

        # A plain Array intersection gives the same answer as the Set-based
        # check without needing the `set` library to be loaded.
        lexers.select { |l| (([l.tag] + l.aliases) & names).any? }
      end
    end
  end
end
39 changes: 39 additions & 0 deletions lib/rouge/guessers/source.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
module Rouge
  module Guessers
    # Narrows candidate lexers by asking each one to analyze the source text
    # and keeping the highest scorers.
    class Source < Guesser
      # The String or readable object given to the constructor.
      attr_reader :source

      def initialize(source)
        @source = source
      end

      # Returns the lexers sharing the best analyze_text score that reaches
      # the confidence threshold (possibly []).
      #
      # Raises RuntimeError ('invalid source') when the source is neither a
      # String nor an object responding to #read.
      def filter(lexers)
        # a single remaining candidate needs no analysis, so the input is
        # never read in that case
        return lexers if lexers.size == 1

        # With a long candidate list only confident analyze_text results are
        # accepted; once the list is short (fewer than 10) any score at or
        # above zero is trusted.
        threshold =
          if lexers.size < 10
            0
          else
            0.5
          end

        raw_text =
          if @source.is_a?(String)
            @source
          elsif @source.respond_to?(:read)
            @source.read
          else
            raise 'invalid source'
          end

        Lexer.assert_utf8!(raw_text)

        analyzer = TextAnalyzer.new(raw_text)

        collect_best(lexers, threshold: threshold) do |lexer|
          # only lexers defining analyze_text directly on themselves are
          # scored; the rest yield nil and are skipped by collect_best
          lexer.analyze_text(analyzer) if lexer.methods(false).include?(:analyze_text)
        end
      end
    end
  end
end
96 changes: 11 additions & 85 deletions lib/rouge/lexer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -109,26 +109,17 @@ def all
# to use.
# Guesses which lexers apply, from the :mimetype, :filename and :source
# entries of `info` (all optional).
#
# NOTE(review): this span comes from a diff view and appears to interleave
# the removed and the added implementation. As written, every branch of the
# if/elsif/else below returns, so the guesser-based code after it is
# unreachable -- confirm which side is intended before editing.
def guesses(info={})
mimetype, filename, source = info.values_at(:mimetype, :filename, :source)
# --- apparently the removed (pre-refactor) implementation starts here ---
lexers = registry.values.uniq
total_size = lexers.size

lexers = filter_by_mimetype(lexers, mimetype) if mimetype
return lexers if lexers.size == 1

lexers = filter_by_filename(lexers, filename) if filename
return lexers if lexers.size == 1

if source
# If we're filtering against *all* lexers, we only use confident return
# values from analyze_text. But if we've filtered down already, we can trust
# the analysis more.
source_threshold = lexers.size < total_size ? 0 : 0.5
return [best_by_source(lexers, source, source_threshold)].compact
elsif lexers.size < total_size
return lexers
else
return []
end
# --- apparently the added (guesser-based) implementation starts here ---
# Optional [glob, lexer name] pairs; only used when a filename is also given.
custom_globs = info[:custom_globs]

# Caller-supplied guessers run first; dup so the caller's array is not mutated.
guessers = (info[:guessers] || []).dup

# Order matters: each guesser narrows the list left by the previous one.
guessers << Guessers::Mimetype.new(mimetype) if mimetype
guessers << Guessers::GlobMapping.by_pairs(custom_globs, filename) if custom_globs && filename
guessers << Guessers::Filename.new(filename) if filename
guessers << Guessers::Modeline.new(source) if source
guessers << Guessers::Source.new(source) if source

Guesser.guess(guessers, Lexer.all)
end

class AmbiguousGuess < StandardError
Expand Down Expand Up @@ -175,71 +166,6 @@ def guess_by_source(source)
end

private
# Returns the lexers that declare the MIME type +mt+, or the untouched
# input list when none of them do.
def filter_by_mimetype(lexers, mt)
  matching = lexers.select { |candidate| candidate.mimetypes.include?(mt) }
  return lexers unless matching.any?

  matching
end

# Picks the lexers whose filename globs match +fname+ most specifically,
# where specificity means the fewest wildcards in the matching pattern.
# For example the Nginx lexer matches `nginx.conf` literally while the
# Conf lexer matches it through `*.conf`; Nginx wins because its pattern
# has no wildcard. Lexers that tie on specificity are all returned, and
# when nothing matches the input list is returned unchanged.
def filter_by_filename(lexers, fname)
  basename = File.basename(fname)

  # pair every lexer with the wildcard count of its most specific match
  scored = lexers.map do |lexer|
    wildcard_counts = lexer.filenames
      .select { |pattern| File.fnmatch?(pattern, basename, File::FNM_DOTMATCH) }
      .map { |pattern| pattern.scan(/[*?\[]/).size }

    [lexer, wildcard_counts.min]
  end

  matched = scored.reject { |(_lexer, count)| count.nil? }
  return lexers if matched.empty?

  fewest = matched.map { |(_lexer, count)| count }.min
  matched.select { |(_lexer, count)| count == fewest }.map { |(lexer, _count)| lexer }
end

# Returns the lexer whose analyze_text score for +source+ is highest and
# strictly above +threshold+, or nil when no lexer beats the threshold.
# A lexer scoring exactly 1 is returned immediately.
#
# +source+ may be a String or an object responding to #read; anything else
# raises RuntimeError ('invalid source').
def best_by_source(lexers, source, threshold=0)
  raw_text =
    if source.is_a?(String)
      source
    elsif source.respond_to?(:read)
      source.read
    else
      raise 'invalid source'
    end

  assert_utf8!(raw_text)

  analyzer = TextAnalyzer.new(raw_text)

  leader = nil
  leading_score = threshold

  lexers.each do |candidate|
    # a nil/false analysis counts as a score of zero
    score = candidate.analyze_text(analyzer) || 0

    # a score of exactly 1 short-circuits the search
    return candidate if score == 1

    next unless score > leading_score

    leader = candidate
    leading_score = score
  end

  leader
end

protected
# @private
Expand Down
6 changes: 3 additions & 3 deletions lib/rouge/lexers/http.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ class HTTP < RegexLexer
title "HTTP"
desc 'http requests and responses'

# NOTE(review): these lines come from a diff view and appear to interleave
# the removed definition (`methods`) with the added one (`http_methods`),
# which is why two `def`s share a single `end`.
# Presumably renamed because a zero-argument `self.methods` shadows the
# built-in `methods`, which the Source guesser calls with an argument as
# `lexer.methods(false)` -- confirm.
def self.methods
@methods ||= %w(GET POST PUT DELETE HEAD OPTIONS TRACE PATCH)
# Memoized list of HTTP request verbs.
def self.http_methods
@http_methods ||= %w(GET POST PUT DELETE HEAD OPTIONS TRACE PATCH)
end

def content_lexer
Expand All @@ -24,7 +24,7 @@ def content_lexer
state :root do
# request
rule %r(
(#{HTTP.methods.join('|')})([ ]+) # method
(#{HTTP.http_methods.join('|')})([ ]+) # method
([^ ]+)([ ]+) # path
(HTTPS?)(/)(1[.][01])(\r?\n|$) # http version
)ox do
Expand Down
Loading

0 comments on commit f3dea84

Please sign in to comment.