Skip to content

Commit

Permalink
Switch from bigrams to trigrams for search
Browse files Browse the repository at this point in the history
Trigrams can provide more accurate search results than bigrams.  For
example, using bigrams, searching for "sel" would attempt to match the
ngrams " s", "se", and "el".  For the Rails API (at `7c65a4b83b583f4f`),
the top result is `ActiveModel::Serializers` due to "Model" matching
"el" and ":Serial" matching " s" and "se".  However, using trigrams,
"sel" would attempt to match " se" and "sel".  In that case, for the
Rails API, the top result is `ActiveRecord::QueryMethods#select`.

The downside to using trigrams is that the size of the search index
increases from 2.9 MB to 8.6 MB.  But the data compresses well, so when
gzipped the size only increases from 474 kB to 670 kB.  And browser heap
snapshot size stays reasonably small, increasing from 6.8 MB to 11.1 MB
in Firefox and 8.0 MB to 22.2 MB in Chrome.
  • Loading branch information
jonathanhefner committed Oct 23, 2023
1 parent f9e496a commit 6795062
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 45 deletions.
6 changes: 3 additions & 3 deletions lib/rdoc/generator/template/rails/resources/js/search.js
Original file line number Diff line number Diff line change
Expand Up @@ -37,17 +37,17 @@ export class Search {
}

compileQuery(query) {
query = ` ${query} `;
const bitPositions = [];

for (let i = 0, len = query.length; i < len; i += 1) {
const ngram = i === 0 ? (" " + query[0]) : query.substring(i - 1, i + 1);
for (let i = 0, upto = query.length - 2; i < upto; i += 1) {
const ngram = query.substring(i, i + 3);
const position = searchIndex.ngrams[ngram];

if (position) {
bitPositions.push(position);
}
}

return bitPositions;
}

Expand Down
17 changes: 7 additions & 10 deletions lib/sdoc/search_index.rb
Original file line number Diff line number Diff line change
Expand Up @@ -43,26 +43,23 @@ def derive_ngrams(name)
# Example: "ActiveSupport::Cache::Store" => ":ActiveSupport:Cache:Store"
strings = [":#{name}".gsub("::", ":")]

# Example: ":ActiveSupport:Cache:lookup_store" => ":ActiveSupport:Cache.lookup_store("
strings.concat(strings.map { |string| string.gsub(/[:#]([^A-Z].+)/, '.\1(') })
# Example: ":ActiveModel:API" => ":activemodel:api"
strings.concat(strings.map(&:downcase))
# Example: ":ActiveSupport:HashWithIndifferentAccess" => ":AS:HWIA"
strings.concat(strings.map { |string| string.gsub(/([A-Z])[a-z]+/, '\1') })
# Example: ":AbstractController:Base#action_name" => " AbstractController Base action_name"
strings.concat(strings.map { |string| string.tr(":#", " ") })
# Example: ":AbstractController:Base#action_name" => ":AbstractController:Base#actionname"
# Example: ":ActiveRecord:Querying#find_by_sql" => ":ActiveRecord:Querying#findbysql"
strings.concat(strings.map { |string| string.tr("_", "") })

# Example: ":ActiveModel:Name#<=>" => [":ActiveModel", ":Name", "#<=>"]
strings.map! { |string| string.split(/(?=[ :#])/) }.flatten!
strings.map! { |string| string.split(/(?=[ :#.])/) }.flatten!.uniq!
# Example: ":ActiveModel" => ":A "
strings.concat(strings.map { |string| "#{string[0, 2]} " })

if method_name_first_char = name[/(?:#|::)([^A-Z])/, 1]
# Example: "AbstractController::Base::controller_path" => ".c"
strings << ".#{method_name_first_char}"
# Example: "AbstractController::Base::controller_path" => "h("
strings << "#{name[-1]}("
end

strings.flat_map { |string| string.each_char.each_cons(2).map(&:join) }.uniq
strings.flat_map { |string| string.each_char.each_cons(3).map(&:join) }.uniq
end

def compile_ngrams(ngram_sets)
Expand Down
82 changes: 50 additions & 32 deletions spec/search_index_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -49,58 +49,76 @@ def hoge_fuga; end

describe "#derive_ngrams" do
it "returns ngrams for a given string" do
expected = %w[ab bc cx xy yz]
expected = %w[abc bcx cxy xyz]
_(SDoc::SearchIndex.derive_ngrams("abcxyz") & expected).must_equal expected
end

it "includes module-related ngrams" do
ngrams = SDoc::SearchIndex.derive_ngrams("Abc::Xyz")

_(ngrams).must_include ":A"
_(ngrams).must_include ":X"
_(ngrams).must_include ":Ab"
_(ngrams).must_include ":Xy"

_(ngrams).wont_include "c:"
_(ngrams).wont_include "::"
_(ngrams.grep(/c:|::|[.(]/)).must_be_empty
end

it "includes method-related ngrams" do
it "includes method-related ngrams for instance methods" do
ngrams = SDoc::SearchIndex.derive_ngrams("Abc#def_xyz")

_(ngrams).must_include "#d"
_(ngrams).must_include ".d"
_(ngrams).must_include "z("
_(ngrams).must_include "#de"
_(ngrams).must_include ".de"
_(ngrams).must_include "yz("

_(ngrams).wont_include "c#"
_(ngrams).must_include "f_x"
_(ngrams).must_include "efx"
_(ngrams).must_include "fxy"

_(ngrams).must_include "f_"
_(ngrams).must_include "_x"
_(ngrams).must_include "fx"
_(ngrams.grep(/c#/)).must_be_empty
end

it "includes method-related ngrams for singleton methods" do
ngrams = SDoc::SearchIndex.derive_ngrams("Abc::def_xyz")

_(ngrams).must_include ":de"
_(ngrams).must_include ".de"
_(ngrams).must_include "yz("

_(ngrams.grep(/c:/)).must_be_empty
end

it "includes space delimiter ngrams" do
ngrams = SDoc::SearchIndex.derive_ngrams("Abc::Def#xyz")

_(ngrams).must_include " A"
_(ngrams).must_include " D"
_(ngrams).must_include " x"
_(ngrams).must_include " Ab"
_(ngrams).must_include " A "
_(ngrams).must_include ":A "

_(ngrams).must_include " De"
_(ngrams).must_include " D "
_(ngrams).must_include ":D "

_(ngrams).must_include " xy"
_(ngrams).must_include " x "
_(ngrams).must_include "#x "

_(ngrams).wont_include "c "
_(ngrams).wont_include "f "
_(ngrams.grep(/[cfz] $/)).must_be_empty
end

it "includes acronym ngrams" do
ngrams = SDoc::SearchIndex.derive_ngrams("AbcDefGhi::RstUvwXyz")
ngrams = SDoc::SearchIndex.derive_ngrams("AbcDef::StUvWxYz")

_(ngrams).must_include "AD"
_(ngrams).must_include "DG"
_(ngrams).must_include "RU"
_(ngrams).must_include "UX"
_(ngrams).must_include ":AD"
_(ngrams).must_include " AD"
_(ngrams).must_include ":SU"
_(ngrams).must_include " SU"
_(ngrams).must_include "SUW"
_(ngrams).must_include "UWY"

_(ngrams).wont_include "GR"
_(ngrams.grep(/DS/)).must_be_empty
end

it "includes downcased ngrams except for acronym ngrams" do
ngrams = SDoc::SearchIndex.derive_ngrams("AbcDefGhi::RstUvwXyz")
ngrams = SDoc::SearchIndex.derive_ngrams("AbcDef::StUvWxYz")

ngrams.grep(/[A-Z]/).grep_v(/[A-Z]{2}/).each do |uppercase|
_(ngrams).must_include uppercase.downcase
Expand All @@ -110,7 +128,7 @@ def hoge_fuga; end

describe "#compile_ngrams" do
it "assigns ngram bit positions based on ngram rarity" do
base_ngrams = ("aa".."zz").take(4)
base_ngrams = ("aaa".."zzz").take(4)
ngram_sets = (0..3).map { |n| base_ngrams.drop(n) }

_(SDoc::SearchIndex.compile_ngrams(ngram_sets)).
Expand All @@ -120,7 +138,7 @@ def hoge_fuga; end

describe "#generate_fingerprint" do
it "returns an array of bytes with bits set for the given ngrams" do
ngrams = ("aa".."zz").take(8)
ngrams = ("aaa".."zzz").take(8)

packed_positions = ngrams.each_with_index.to_h
_(SDoc::SearchIndex.generate_fingerprint(ngrams, packed_positions)).must_equal [0b11111111]
Expand All @@ -130,17 +148,17 @@ def hoge_fuga; end
end

it "omits trailing zero bytes" do
_(SDoc::SearchIndex.generate_fingerprint(["xx"], { "xx" => 0, "yy" => 100 })).must_equal [1]
_(SDoc::SearchIndex.generate_fingerprint(["xxx"], { "xxx" => 0, "yyy" => 100 })).must_equal [1]
end
end

describe "#compute_bit_weights" do
it "returns an array of weights" do
_(SDoc::SearchIndex.compute_bit_weights({ "xx" => 0, "yy" => 1 })).must_equal [1, 1]
_(SDoc::SearchIndex.compute_bit_weights({ "xxx" => 0, "yyy" => 1 })).must_equal [1, 1]
end

it "computes weights based on ngram content" do
ngram_bit_positions = { "xx" => 0, " x" => 1, ":X" => 2, "#x" => 3 }
ngram_bit_positions = { "xxx" => 0, " xx" => 1, ":Xx" => 2, "#xx" => 3 }
bit_weights = SDoc::SearchIndex.compute_bit_weights(ngram_bit_positions)

_(bit_weights.length).must_equal ngram_bit_positions.length
Expand All @@ -149,7 +167,7 @@ def hoge_fuga; end
end

it "orders weights by bit position" do
ngram_bit_positions = { "xx" => 0, " x" => 1, ":X" => 2, "#x" => 3 }
ngram_bit_positions = { "xxx" => 0, " xx" => 1, ":Xx" => 2, "#xx" => 3 }
bit_weights = SDoc::SearchIndex.compute_bit_weights(ngram_bit_positions)

reversed = ngram_bit_positions.reverse_each.to_h
Expand All @@ -160,7 +178,7 @@ def hoge_fuga; end
end

it "ignores alias ngrams" do
_(SDoc::SearchIndex.compute_bit_weights({ "#x" => 0, ".x" => 0}).length).must_equal 1
_(SDoc::SearchIndex.compute_bit_weights({ "#xx" => 0, ".xx" => 0}).length).must_equal 1
end
end

Expand Down

0 comments on commit 6795062

Please sign in to comment.