diff --git a/lib/rdoc/generator/template/rails/resources/js/search.js b/lib/rdoc/generator/template/rails/resources/js/search.js index af7529a6..0e8e7357 100644 --- a/lib/rdoc/generator/template/rails/resources/js/search.js +++ b/lib/rdoc/generator/template/rails/resources/js/search.js @@ -37,17 +37,17 @@ export class Search { } compileQuery(query) { + query = ` ${query} `; const bitPositions = []; - for (let i = 0, len = query.length; i < len; i += 1) { - const bigram = i === 0 ? (" " + query[0]) : query.substring(i - 1, i + 1); - const position = searchIndex.bigrams[bigram]; + for (let i = 0, upto = query.length - 2; i < upto; i += 1) { + const ngram = query.substring(i, i + 3); + const position = searchIndex.ngrams[ngram]; if (position) { bitPositions.push(position); } } - return bitPositions; } diff --git a/lib/sdoc/search_index.rb b/lib/sdoc/search_index.rb index 1b3063f6..97bde191 100644 --- a/lib/sdoc/search_index.rb +++ b/lib/sdoc/search_index.rb @@ -1,26 +1,33 @@ -require "base64" require "nokogiri" require_relative "helpers" module SDoc::SearchIndex extend self + class Uint8Array < Array + # This doesn't generate valid JSON, but it is suitable as an export from an + # ES6 module. + def to_json(*) + "(new Uint8Array(#{super}))" + end + end + def generate(rdoc_modules) # RDoc duplicates RDoc::MethodAttr instances when modules are aliased by # assigning to a constant. For example, `MyBar = Foo::Bar` will duplicate # all of Foo::Bar's RDoc::MethodAttr instances. rdoc_objects = rdoc_modules + rdoc_modules.flat_map(&:method_list).uniq - bigram_sets = rdoc_objects.map { |rdoc_object| derive_bigrams(rdoc_object.full_name) } - bigram_bit_positions = compile_bigrams(bigram_sets) - bit_weights = compute_bit_weights(bigram_bit_positions) + ngram_sets = rdoc_objects.map { |rdoc_object| derive_ngrams(rdoc_object.full_name) } + ngram_bit_positions = compile_ngrams(ngram_sets) + bit_weights = compute_bit_weights(ngram_bit_positions) - entries = rdoc_objects.zip(bigram_sets).map do |rdoc_object, bigrams| + entries = rdoc_objects.zip(ngram_sets).map do |rdoc_object, ngrams| rdoc_module, rdoc_method = rdoc_object.is_a?(RDoc::ClassModule) ? [rdoc_object] : [rdoc_object.parent, rdoc_object] description = rdoc_object.description [ - generate_fingerprint(bigrams, bigram_bit_positions), + generate_fingerprint(ngrams, ngram_bit_positions), compute_tiebreaker_bonus(rdoc_module.full_name, rdoc_method&.name, description), rdoc_object.path, rdoc_module.full_name, @@ -29,44 +36,41 @@ def generate(rdoc_modules) ] end - { "bigrams" => bigram_bit_positions, "weights" => bit_weights, "entries" => entries } + { "ngrams" => ngram_bit_positions, "weights" => bit_weights, "entries" => entries } end - def derive_bigrams(name) + def derive_ngrams(name) # Example: "ActiveSupport::Cache::Store" => ":ActiveSupport:Cache:Store" strings = [":#{name}".gsub("::", ":")] + # Example: ":ActiveSupport:Cache:lookup_store" => ":ActiveSupport:Cache.lookup_store(" + strings.concat(strings.map { |string| string.gsub(/[:#]([^A-Z].+)/, '.\1(') }) # Example: ":ActiveModel:API" => ":activemodel:api" strings.concat(strings.map(&:downcase)) # Example: ":ActiveSupport:HashWithIndifferentAccess" => ":AS:HWIA" strings.concat(strings.map { |string| string.gsub(/([A-Z])[a-z]+/, '\1') }) # Example: ":AbstractController:Base#action_name" => " AbstractController Base action_name" strings.concat(strings.map { |string| string.tr(":#", " ") }) - # Example: ":AbstractController:Base#action_name" => ":AbstractController:Base#actionname" + # Example: ":ActiveRecord:Querying#find_by_sql" => ":ActiveRecord:Querying#findbysql" strings.concat(strings.map { |string| string.tr("_", "") }) # Example: ":ActiveModel:Name#<=>" => [":ActiveModel", ":Name", "#<=>"] - strings.map! { |string| string.split(/(?=[ :#])/) }.flatten! + strings.map! { |string| string.split(/(?=[ :#.])/) }.flatten!.uniq! + # Example: ":ActiveModel" => ":A " + strings.concat(strings.map { |string| "#{string[0, 2]} " }) - if method_name_first_char = name[/(?:#|::)([^A-Z])/, 1] - # Example: "AbstractController::Base::controller_path" => ".c" - strings << ".#{method_name_first_char}" - # Example: "AbstractController::Base::controller_path" => "h(" - strings << "#{name[-1]}(" - end - - strings.flat_map { |string| string.each_char.each_cons(2).map(&:join) }.uniq + strings.flat_map { |string| string.each_char.each_cons(3).map(&:join) }.uniq end - def compile_bigrams(bigram_sets) - # Assign each bigram a bit position based on its rarity. More common bigrams + def compile_ngrams(ngram_sets) + # Assign each ngram a bit position based on its rarity. More common ngrams # come first. This reduces the average number of bytes required to store a # fingerprint. - bigram_sets.flatten.tally.sort_by(&:last).reverse.map(&:first).each_with_index.to_h + ngram_sets.flatten.tally.sort_by(&:last).reverse.map(&:first).each_with_index.to_h end - def generate_fingerprint(bigrams, bigram_bit_positions) - bit_positions = bigrams.map(&bigram_bit_positions) + def generate_fingerprint(ngrams, ngram_bit_positions) + bit_positions = ngrams.map(&ngram_bit_positions) byte_count = ((bit_positions.max + 1) / 8.0).ceil bytes = [0] * byte_count @@ -74,27 +78,29 @@ def generate_fingerprint(bigrams, bigram_bit_positions) bytes[position / 8] |= 1 << (position % 8) end - bytes + Uint8Array.new(bytes) end - BIGRAM_PATTERN_WEIGHTS = { + NGRAM_PATTERN_WEIGHTS = { /[^a-z]/ => 2, # Bonus point for non-lowercase-alpha chars because they show intentionality. /^ / => 3, # More points for matching generic start of token. /^:/ => 4, # Even more points for explicit start of token. /[#.(]/ => 50, # Strongly prefer methods when query includes "#", ".", or "(". } - def compute_bit_weights(bigram_bit_positions) - bigram_bit_positions.uniq(&:last).sort_by(&:last).map do |bigram, _position| - BIGRAM_PATTERN_WEIGHTS.map { |pattern, weight| bigram.match?(pattern) ? weight : 1 }.max + def compute_bit_weights(ngram_bit_positions) + weights = ngram_bit_positions.uniq(&:last).sort_by(&:last).map do |ngram, _position| + NGRAM_PATTERN_WEIGHTS.map { |pattern, weight| ngram.match?(pattern) ? weight : 1 }.max end + + Uint8Array.new(weights) end def compute_tiebreaker_bonus(module_name, method_name, description) method_name ||= "" - # Bonus is per matching bigram and is very small so it does not outweigh - # points from other matches. Longer names have smaller per-bigram bonuses, + # Bonus is per matching ngram and is very small so it does not outweigh + # points from other matches. Longer names have smaller per-ngram bonuses, # but the value scales down very slowly. bonus = 0.01 / (module_name.length + method_name.length) ** 0.025 diff --git a/spec/rdoc_generator_spec.rb b/spec/rdoc_generator_spec.rb index 34e1d384..ba504e42 100644 --- a/spec/rdoc_generator_spec.rb +++ b/spec/rdoc_generator_spec.rb @@ -35,9 +35,10 @@ def parse_options(*options) Dir.mktmpdir do |dir| Dir.chdir(dir) do rdoc_run("--files", "#{__dir__}/../README.md", "#{__dir__}/../lib/sdoc/version.rb") - contents = File.read("doc/js/search-index.js") - index = JSON.parse(contents.delete_prefix!("export default ").delete_suffix!(";")) - _(index.keys.sort).must_equal ["bigrams", "entries", "weights"] + index = File.read("doc/js/search-index.js") + index.delete_prefix!("export default ").delete_suffix!(";") + index.gsub!(/\(new Uint8Array\((.+?)\)\)/, '\1') + _(JSON.parse(index).keys.sort).must_equal ["ngrams", "weights", "entries"].sort end end end diff --git a/spec/search_index_spec.rb b/spec/search_index_spec.rb index d240d71f..07d2ede8 100644 --- a/spec/search_index_spec.rb +++ b/spec/search_index_spec.rb @@ -11,19 +11,19 @@ def hoge_fuga; end end RUBY - bigrams = SDoc::SearchIndex.derive_bigrams("FooBar#hoge_fuga") + ngrams = SDoc::SearchIndex.derive_ngrams("FooBar#hoge_fuga") search_index = SDoc::SearchIndex.generate(top_level.classes_and_modules) - _(search_index.keys.sort).must_equal ["bigrams", "entries", "weights"] + _(search_index.keys.sort).must_equal ["ngrams", "weights", "entries"].sort - _(search_index["bigrams"].keys.sort).must_equal bigrams.sort - _(search_index["bigrams"].values.max).must_equal search_index["weights"].length - 1 + _(search_index["ngrams"].keys.sort).must_equal ngrams.sort + _(search_index["ngrams"].values.max).must_equal search_index["weights"].length - 1 _(search_index["entries"].length).must_equal 2 search_index["entries"].each do |entry| _(entry.length).must_be :<=, 6 - _(entry[0]).must_be_instance_of Array # Fingerprint + _(entry[0]).must_be_kind_of Array # Fingerprint _(entry[1]).must_be :<, 1.0 # Tiebreaker bonus end @@ -47,125 +47,143 @@ def hoge_fuga; end end end - describe "#derive_bigrams" do - it "returns bigrams for a given string" do - expected = %w[ab bc cx xy yz] - _(SDoc::SearchIndex.derive_bigrams("abcxyz") & expected).must_equal expected + describe "#derive_ngrams" do + it "returns ngrams for a given string" do + expected = %w[abc bcx cxy xyz] + _(SDoc::SearchIndex.derive_ngrams("abcxyz") & expected).must_equal expected end - it "includes module-related bigrams" do - bigrams = SDoc::SearchIndex.derive_bigrams("Abc::Xyz") + it "includes module-related ngrams" do + ngrams = SDoc::SearchIndex.derive_ngrams("Abc::Xyz") - _(bigrams).must_include ":A" - _(bigrams).must_include ":X" + _(ngrams).must_include ":Ab" + _(ngrams).must_include ":Xy" - _(bigrams).wont_include "c:" - _(bigrams).wont_include "::" + _(ngrams.grep(/c:|::|[.(]/)).must_be_empty end - it "includes method-related bigrams" do - bigrams = SDoc::SearchIndex.derive_bigrams("Abc#def_xyz") + it "includes method-related ngrams for instance methods" do + ngrams = SDoc::SearchIndex.derive_ngrams("Abc#def_xyz") - _(bigrams).must_include "#d" - _(bigrams).must_include ".d" - _(bigrams).must_include "z(" + _(ngrams).must_include "#de" + _(ngrams).must_include ".de" + _(ngrams).must_include "yz(" - _(bigrams).wont_include "c#" + _(ngrams).must_include "f_x" + _(ngrams).must_include "efx" + _(ngrams).must_include "fxy" - _(bigrams).must_include "f_" - _(bigrams).must_include "_x" - _(bigrams).must_include "fx" + _(ngrams.grep(/c#/)).must_be_empty end - it "includes space delimiter bigrams" do - bigrams = SDoc::SearchIndex.derive_bigrams("Abc::Def#xyz") + it "includes method-related ngrams for singleton methods" do + ngrams = SDoc::SearchIndex.derive_ngrams("Abc::def_xyz") - _(bigrams).must_include " A" - _(bigrams).must_include " D" - _(bigrams).must_include " x" + _(ngrams).must_include ":de" + _(ngrams).must_include ".de" + _(ngrams).must_include "yz(" - _(bigrams).wont_include "c " - _(bigrams).wont_include "f " + _(ngrams.grep(/c:/)).must_be_empty end - it "includes acronym bigrams" do - bigrams = SDoc::SearchIndex.derive_bigrams("AbcDefGhi::RstUvwXyz") + it "includes space delimiter ngrams" do + ngrams = SDoc::SearchIndex.derive_ngrams("Abc::Def#xyz") - _(bigrams).must_include "AD" - _(bigrams).must_include "DG" - _(bigrams).must_include "RU" - _(bigrams).must_include "UX" + _(ngrams).must_include " Ab" + _(ngrams).must_include " A " + _(ngrams).must_include ":A " - _(bigrams).wont_include "GR" + _(ngrams).must_include " De" + _(ngrams).must_include " D " + _(ngrams).must_include ":D " + + _(ngrams).must_include " xy" + _(ngrams).must_include " x " + _(ngrams).must_include "#x " + + _(ngrams.grep(/[cfz] $/)).must_be_empty + end + + it "includes acronym ngrams" do + ngrams = SDoc::SearchIndex.derive_ngrams("AbcDef::StUvWxYz") + + _(ngrams).must_include ":AD" + _(ngrams).must_include " AD" + _(ngrams).must_include ":SU" + _(ngrams).must_include " SU" + _(ngrams).must_include "SUW" + _(ngrams).must_include "UWY" + + _(ngrams.grep(/DS/)).must_be_empty end - it "includes downcased bigrams except for acronym bigrams" do - bigrams = SDoc::SearchIndex.derive_bigrams("AbcDefGhi::RstUvwXyz") + it "includes downcased ngrams except for acronym ngrams" do + ngrams = SDoc::SearchIndex.derive_ngrams("AbcDef::StUvWxYz") - bigrams.grep(/[A-Z]/).grep_v(/[A-Z]{2}/).each do |uppercase| - _(bigrams).must_include uppercase.downcase + ngrams.grep(/[A-Z]/).grep_v(/[A-Z]{2}/).each do |uppercase| + _(ngrams).must_include uppercase.downcase end end end - describe "#compile_bigrams" do - it "assigns bigram bit positions based on bigram rarity" do - base_bigrams = ("aa".."zz").take(4) - bigram_sets = (0..3).map { |n| base_bigrams.drop(n) } + describe "#compile_ngrams" do + it "assigns ngram bit positions based on ngram rarity" do + base_ngrams = ("aaa".."zzz").take(4) + ngram_sets = (0..3).map { |n| base_ngrams.drop(n) } - _(SDoc::SearchIndex.compile_bigrams(bigram_sets)). - must_equal base_bigrams.reverse.each_with_index.to_h + _(SDoc::SearchIndex.compile_ngrams(ngram_sets)). + must_equal base_ngrams.reverse.each_with_index.to_h end end describe "#generate_fingerprint" do - it "returns an array of bytes with bits set for the given bigrams" do - bigrams = ("aa".."zz").take(8) + it "returns an array of bytes with bits set for the given ngrams" do + ngrams = ("aaa".."zzz").take(8) - packed_positions = bigrams.each_with_index.to_h - _(SDoc::SearchIndex.generate_fingerprint(bigrams, packed_positions)).must_equal [0b11111111] + packed_positions = ngrams.each_with_index.to_h + _(SDoc::SearchIndex.generate_fingerprint(ngrams, packed_positions)).must_equal [0b11111111] - sparse_positions = bigrams.each_with_index.to_h { |bigram, i| [bigram, i * 8] } - _(SDoc::SearchIndex.generate_fingerprint(bigrams, sparse_positions)).must_equal [1] * 8 + sparse_positions = ngrams.each_with_index.to_h { |ngram, i| [ngram, i * 8] } + _(SDoc::SearchIndex.generate_fingerprint(ngrams, sparse_positions)).must_equal [1] * 8 end it "omits trailing zero bytes" do - _(SDoc::SearchIndex.generate_fingerprint(["xx"], { "xx" => 0, "yy" => 100 })).must_equal [1] + _(SDoc::SearchIndex.generate_fingerprint(["xxx"], { "xxx" => 0, "yyy" => 100 })).must_equal [1] end end describe "#compute_bit_weights" do it "returns an array of weights" do - _(SDoc::SearchIndex.compute_bit_weights({ "xx" => 0, "yy" => 1 })).must_equal [1, 1] + _(SDoc::SearchIndex.compute_bit_weights({ "xxx" => 0, "yyy" => 1 })).must_equal [1, 1] end - it "computes weights based on bigram content" do - bigram_bit_positions = { "xx" => 0, " x" => 1, ":X" => 2, "#x" => 3 } - bit_weights = SDoc::SearchIndex.compute_bit_weights(bigram_bit_positions) + it "computes weights based on ngram content" do + ngram_bit_positions = { "xxx" => 0, " xx" => 1, ":Xx" => 2, "#xx" => 3 } + bit_weights = SDoc::SearchIndex.compute_bit_weights(ngram_bit_positions) - _(bit_weights.length).must_equal bigram_bit_positions.length + _(bit_weights.length).must_equal ngram_bit_positions.length _(bit_weights.uniq).must_equal bit_weights _(bit_weights.sort).must_equal bit_weights end it "orders weights by bit position" do - bigram_bit_positions = { "xx" => 0, " x" => 1, ":X" => 2, "#x" => 3 } - bit_weights = SDoc::SearchIndex.compute_bit_weights(bigram_bit_positions) + ngram_bit_positions = { "xxx" => 0, " xx" => 1, ":Xx" => 2, "#xx" => 3 } + bit_weights = SDoc::SearchIndex.compute_bit_weights(ngram_bit_positions) - reversed = bigram_bit_positions.reverse_each.to_h + reversed = ngram_bit_positions.reverse_each.to_h _(SDoc::SearchIndex.compute_bit_weights(reversed)).must_equal bit_weights - inverted = bigram_bit_positions.transform_values { |pos| -pos + bit_weights.length } + inverted = ngram_bit_positions.transform_values { |pos| -pos + bit_weights.length } _(SDoc::SearchIndex.compute_bit_weights(inverted)).must_equal bit_weights.reverse end - it "ignores alias bigrams" do - _(SDoc::SearchIndex.compute_bit_weights({ "#x" => 0, ".x" => 0}).length).must_equal 1 + it "ignores alias ngrams" do + _(SDoc::SearchIndex.compute_bit_weights({ "#xx" => 0, ".xx" => 0}).length).must_equal 1 end end describe "#compute_tiebreaker_bonus" do - it "returns a value much smaller than 1 (the value of a single matching bigram)" do + it "returns a value much smaller than 1 (the value of a single matching ngram)" do _(SDoc::SearchIndex.compute_tiebreaker_bonus("X", nil, "")).must_be :<=, 0.1 end