Skip to content

Commit

Permalink
Merge pull request #342 from jonathanhefner/search-trigrams
Browse files Browse the repository at this point in the history
Switch from bigrams to trigrams for search
  • Loading branch information
jonathanhefner authored Oct 25, 2023
2 parents 5546703 + 6795062 commit de49d9e
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 104 deletions.
8 changes: 4 additions & 4 deletions lib/rdoc/generator/template/rails/resources/js/search.js
Original file line number Diff line number Diff line change
Expand Up @@ -37,17 +37,17 @@ export class Search {
}

// Translates a search query into the list of bit positions recorded in the
// search index for each 3-character window (ngram) of the query. Windows that
// have no entry in the index contribute nothing.
// NOTE(review): this listing is a diff view that interleaves the removed
// bigram lines with the added ngram lines of the commit.
compileQuery(query) {
// Surround the query with single spaces; the ngram loop below then visits
// every 3-character window of the padded string.
query = ` ${query} `;
const bitPositions = [];

for (let i = 0, len = query.length; i < len; i += 1) {
const bigram = i === 0 ? (" " + query[0]) : query.substring(i - 1, i + 1);
const position = searchIndex.bigrams[bigram];
for (let i = 0, upto = query.length - 2; i < upto; i += 1) {
const ngram = query.substring(i, i + 3);
const position = searchIndex.ngrams[ngram];

// NOTE(review): a position of 0 is falsy and would be skipped by this check;
// confirm whether the index can assign bit position 0 to an ngram.
if (position) {
bitPositions.push(position);
}
}

return bitPositions;
}

Expand Down
66 changes: 36 additions & 30 deletions lib/sdoc/search_index.rb
Original file line number Diff line number Diff line change
@@ -1,26 +1,33 @@
require "base64"
require "nokogiri"
require_relative "helpers"

module SDoc::SearchIndex
extend self

class Uint8Array < Array
  # Wraps the array's regular JSON form in a JavaScript
  # `new Uint8Array(...)` expression. The output is not valid JSON; it is
  # meant to be emitted as part of an ES6 module export.
  def to_json(*)
    array_literal = super
    "(new Uint8Array(#{array_literal}))"
  end
end

def generate(rdoc_modules)
# RDoc duplicates RDoc::MethodAttr instances when modules are aliased by
# assigning to a constant. For example, `MyBar = Foo::Bar` will duplicate
# all of Foo::Bar's RDoc::MethodAttr instances.
rdoc_objects = rdoc_modules + rdoc_modules.flat_map(&:method_list).uniq

bigram_sets = rdoc_objects.map { |rdoc_object| derive_bigrams(rdoc_object.full_name) }
bigram_bit_positions = compile_bigrams(bigram_sets)
bit_weights = compute_bit_weights(bigram_bit_positions)
ngram_sets = rdoc_objects.map { |rdoc_object| derive_ngrams(rdoc_object.full_name) }
ngram_bit_positions = compile_ngrams(ngram_sets)
bit_weights = compute_bit_weights(ngram_bit_positions)

entries = rdoc_objects.zip(bigram_sets).map do |rdoc_object, bigrams|
entries = rdoc_objects.zip(ngram_sets).map do |rdoc_object, ngrams|
rdoc_module, rdoc_method = rdoc_object.is_a?(RDoc::ClassModule) ? [rdoc_object] : [rdoc_object.parent, rdoc_object]
description = rdoc_object.description

[
generate_fingerprint(bigrams, bigram_bit_positions),
generate_fingerprint(ngrams, ngram_bit_positions),
compute_tiebreaker_bonus(rdoc_module.full_name, rdoc_method&.name, description),
rdoc_object.path,
rdoc_module.full_name,
Expand All @@ -29,72 +36,71 @@ def generate(rdoc_modules)
]
end

{ "bigrams" => bigram_bit_positions, "weights" => bit_weights, "entries" => entries }
{ "ngrams" => ngram_bit_positions, "weights" => bit_weights, "entries" => entries }
end

# Derives the unique ngrams (3-character substrings) used to index +name+.
# Several variants of the name are accumulated in +strings+ (each step below
# appends transformed copies of everything collected so far), the variants
# are split into tokens, and every token contributes its consecutive
# 3-character windows. Returns an Array of unique ngram Strings.
# NOTE(review): this listing is a diff view that interleaves the removed
# bigram lines with the added ngram lines of the commit.
def derive_bigrams(name)
def derive_ngrams(name)
# Example: "ActiveSupport::Cache::Store" => ":ActiveSupport:Cache:Store"
strings = [":#{name}".gsub("::", ":")]

# Example: ":ActiveSupport:Cache:lookup_store" => ":ActiveSupport:Cache.lookup_store("
strings.concat(strings.map { |string| string.gsub(/[:#]([^A-Z].+)/, '.\1(') })
# Example: ":ActiveModel:API" => ":activemodel:api"
strings.concat(strings.map(&:downcase))
# Example: ":ActiveSupport:HashWithIndifferentAccess" => ":AS:HWIA"
strings.concat(strings.map { |string| string.gsub(/([A-Z])[a-z]+/, '\1') })
# Example: ":AbstractController:Base#action_name" => " AbstractController Base action_name"
strings.concat(strings.map { |string| string.tr(":#", " ") })
# Example: ":AbstractController:Base#action_name" => ":AbstractController:Base#actionname"
# Example: ":ActiveRecord:Querying#find_by_sql" => ":ActiveRecord:Querying#findbysql"
strings.concat(strings.map { |string| string.tr("_", "") })

# The lookahead splits *before* each delimiter, so every token keeps its
# leading delimiter character.
# Example: ":ActiveModel:Name#<=>" => [":ActiveModel", ":Name", "#<=>"]
strings.map! { |string| string.split(/(?=[ :#])/) }.flatten!
strings.map! { |string| string.split(/(?=[ :#.])/) }.flatten!.uniq!
# Example: ":ActiveModel" => ":A "
strings.concat(strings.map { |string| "#{string[0, 2]} " })

# Only applies when a "#" or "::" in the name is followed by a character
# other than A-Z; that character is captured as the method name's first char.
if method_name_first_char = name[/(?:#|::)([^A-Z])/, 1]
# Example: "AbstractController::Base::controller_path" => ".c"
strings << ".#{method_name_first_char}"
# Example: "AbstractController::Base::controller_path" => "h("
strings << "#{name[-1]}("
end

# Slide a fixed-size window over the characters of every string and keep
# each distinct window once.
strings.flat_map { |string| string.each_char.each_cons(2).map(&:join) }.uniq
strings.flat_map { |string| string.each_char.each_cons(3).map(&:join) }.uniq
end

# Builds a Hash that maps every distinct ngram found in +ngram_sets+ (an
# Array of ngram Arrays) to an Integer bit position. Positions are the
# 0-based indexes of the ngrams after ordering them by descending occurrence
# count.
# NOTE(review): this listing is a diff view that interleaves the removed
# bigram lines with the added ngram lines of the commit.
def compile_bigrams(bigram_sets)
# Assign each bigram a bit position based on its rarity. More common bigrams
def compile_ngrams(ngram_sets)
# Assign each ngram a bit position based on its rarity. More common ngrams
# come first. This reduces the average number of bytes required to store a
# fingerprint.
bigram_sets.flatten.tally.sort_by(&:last).reverse.map(&:first).each_with_index.to_h
ngram_sets.flatten.tally.sort_by(&:last).reverse.map(&:first).each_with_index.to_h
end

# Packs the bit positions of +ngrams+ into an array of bytes: for every ngram,
# bit (position % 8) of byte (position / 8) is set. The array is just long
# enough to hold the highest bit position.
#
# ngrams              - Array of ngram Strings.
# ngram_bit_positions - Hash mapping each ngram to its Integer bit position.
#
# NOTE(review): +bit_positions.max + 1+ assumes +ngrams+ is non-empty and that
# every ngram has an entry in the Hash; confirm against the caller.
# NOTE(review): this listing is a diff view that interleaves the removed
# bigram lines with the added ngram lines of the commit.
def generate_fingerprint(bigrams, bigram_bit_positions)
bit_positions = bigrams.map(&bigram_bit_positions)
def generate_fingerprint(ngrams, ngram_bit_positions)
# Passing the Hash as a block looks up each ngram's bit position.
bit_positions = ngrams.map(&ngram_bit_positions)
# Number of whole bytes needed to cover bits 0..max position.
byte_count = ((bit_positions.max + 1) / 8.0).ceil
bytes = [0] * byte_count

bit_positions.each do |position|
bytes[position / 8] |= 1 << (position % 8)
end

bytes
Uint8Array.new(bytes)
end

# Maps a Regexp to the weight given to an ngram that matches it. In
# compute_bit_weights a non-matching pattern counts as 1 and the largest
# value across all patterns is used.
# NOTE(review): this listing is a diff view that interleaves the removed
# bigram constant name with the added ngram constant name.
BIGRAM_PATTERN_WEIGHTS = {
NGRAM_PATTERN_WEIGHTS = {
/[^a-z]/ => 2, # Bonus point for non-lowercase-alpha chars because they show intentionality.
/^ / => 3, # More points for matching generic start of token.
/^:/ => 4, # Even more points for explicit start of token.
/[#.(]/ => 50, # Strongly prefer methods when query includes "#", ".", or "(".
}

# Computes one weight per bit position, ordered by ascending position.
# Each weight is the largest value among the pattern weights whose pattern the
# ngram matches, or 1 when it matches none. When several ngrams share a
# position, only the first one encountered is considered.
#
# ngram_bit_positions - Hash mapping each ngram String to its Integer bit
#                       position.
#
# NOTE(review): this listing is a diff view that interleaves the removed
# bigram lines with the added ngram lines of the commit.
def compute_bit_weights(bigram_bit_positions)
bigram_bit_positions.uniq(&:last).sort_by(&:last).map do |bigram, _position|
BIGRAM_PATTERN_WEIGHTS.map { |pattern, weight| bigram.match?(pattern) ? weight : 1 }.max
def compute_bit_weights(ngram_bit_positions)
weights = ngram_bit_positions.uniq(&:last).sort_by(&:last).map do |ngram, _position|
NGRAM_PATTERN_WEIGHTS.map { |pattern, weight| ngram.match?(pattern) ? weight : 1 }.max
end

# Wrapped so the weights serialize as a JavaScript Uint8Array expression.
Uint8Array.new(weights)
end

def compute_tiebreaker_bonus(module_name, method_name, description)
method_name ||= ""

# Bonus is per matching bigram and is very small so it does not outweigh
# points from other matches. Longer names have smaller per-bigram bonuses,
# Bonus is per matching ngram and is very small so it does not outweigh
# points from other matches. Longer names have smaller per-ngram bonuses,
# but the value scales down very slowly.
bonus = 0.01 / (module_name.length + method_name.length) ** 0.025

Expand Down
7 changes: 4 additions & 3 deletions spec/rdoc_generator_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,10 @@ def parse_options(*options)
Dir.mktmpdir do |dir|
Dir.chdir(dir) do
rdoc_run("--files", "#{__dir__}/../README.md", "#{__dir__}/../lib/sdoc/version.rb")
contents = File.read("doc/js/search-index.js")
index = JSON.parse(contents.delete_prefix!("export default ").delete_suffix!(";"))
_(index.keys.sort).must_equal ["bigrams", "entries", "weights"]
index = File.read("doc/js/search-index.js")
index.delete_prefix!("export default ").delete_suffix!(";")
index.gsub!(/\(new Uint8Array\((.+?)\)\)/, '\1')
_(JSON.parse(index).keys.sort).must_equal ["ngrams", "weights", "entries"].sort
end
end
end
Expand Down
Loading

0 comments on commit de49d9e

Please sign in to comment.