Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switch from bigrams to trigrams for search #342

Merged
merged 3 commits into from
Oct 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions lib/rdoc/generator/template/rails/resources/js/search.js
Original file line number Diff line number Diff line change
Expand Up @@ -37,17 +37,17 @@ export class Search {
}

compileQuery(query) {
query = ` ${query} `;
const bitPositions = [];

for (let i = 0, len = query.length; i < len; i += 1) {
const bigram = i === 0 ? (" " + query[0]) : query.substring(i - 1, i + 1);
const position = searchIndex.bigrams[bigram];
for (let i = 0, upto = query.length - 2; i < upto; i += 1) {
const ngram = query.substring(i, i + 3);
const position = searchIndex.ngrams[ngram];

if (position) {
bitPositions.push(position);
}
}

return bitPositions;
}

Expand Down
66 changes: 36 additions & 30 deletions lib/sdoc/search_index.rb
Original file line number Diff line number Diff line change
@@ -1,26 +1,33 @@
require "base64"
require "nokogiri"
require_relative "helpers"

module SDoc::SearchIndex
extend self

class Uint8Array < Array
  # Serializes the array as a JavaScript `Uint8Array` constructor expression
  # wrapped around the ordinary JSON array literal. The result is a JavaScript
  # expression, not valid JSON, so it is only suitable for embedding in an
  # ES6 module export.
  def to_json(*)
    # Bare `super` forwards whatever arguments the JSON generator passed in.
    json_array = super
    "(new Uint8Array(#{json_array}))"
  end
end

def generate(rdoc_modules)
# RDoc duplicates RDoc::MethodAttr instances when modules are aliased by
# assigning to a constant. For example, `MyBar = Foo::Bar` will duplicate
# all of Foo::Bar's RDoc::MethodAttr instances.
rdoc_objects = rdoc_modules + rdoc_modules.flat_map(&:method_list).uniq

bigram_sets = rdoc_objects.map { |rdoc_object| derive_bigrams(rdoc_object.full_name) }
bigram_bit_positions = compile_bigrams(bigram_sets)
bit_weights = compute_bit_weights(bigram_bit_positions)
ngram_sets = rdoc_objects.map { |rdoc_object| derive_ngrams(rdoc_object.full_name) }
ngram_bit_positions = compile_ngrams(ngram_sets)
bit_weights = compute_bit_weights(ngram_bit_positions)

entries = rdoc_objects.zip(bigram_sets).map do |rdoc_object, bigrams|
entries = rdoc_objects.zip(ngram_sets).map do |rdoc_object, ngrams|
rdoc_module, rdoc_method = rdoc_object.is_a?(RDoc::ClassModule) ? [rdoc_object] : [rdoc_object.parent, rdoc_object]
description = rdoc_object.description

[
generate_fingerprint(bigrams, bigram_bit_positions),
generate_fingerprint(ngrams, ngram_bit_positions),
compute_tiebreaker_bonus(rdoc_module.full_name, rdoc_method&.name, description),
rdoc_object.path,
rdoc_module.full_name,
Expand All @@ -29,72 +36,71 @@ def generate(rdoc_modules)
]
end

{ "bigrams" => bigram_bit_positions, "weights" => bit_weights, "entries" => entries }
{ "ngrams" => ngram_bit_positions, "weights" => bit_weights, "entries" => entries }
end

def derive_bigrams(name)
def derive_ngrams(name)
# Example: "ActiveSupport::Cache::Store" => ":ActiveSupport:Cache:Store"
strings = [":#{name}".gsub("::", ":")]

# Example: ":ActiveSupport:Cache:lookup_store" => ":ActiveSupport:Cache.lookup_store("
strings.concat(strings.map { |string| string.gsub(/[:#]([^A-Z].+)/, '.\1(') })
# Example: ":ActiveModel:API" => ":activemodel:api"
strings.concat(strings.map(&:downcase))
# Example: ":ActiveSupport:HashWithIndifferentAccess" => ":AS:HWIA"
strings.concat(strings.map { |string| string.gsub(/([A-Z])[a-z]+/, '\1') })
# Example: ":AbstractController:Base#action_name" => " AbstractController Base action_name"
strings.concat(strings.map { |string| string.tr(":#", " ") })
# Example: ":AbstractController:Base#action_name" => ":AbstractController:Base#actionname"
# Example: ":ActiveRecord:Querying#find_by_sql" => ":ActiveRecord:Querying#findbysql"
strings.concat(strings.map { |string| string.tr("_", "") })

# Example: ":ActiveModel:Name#<=>" => [":ActiveModel", ":Name", "#<=>"]
strings.map! { |string| string.split(/(?=[ :#])/) }.flatten!
strings.map! { |string| string.split(/(?=[ :#.])/) }.flatten!.uniq!
# Example: ":ActiveModel" => ":A "
strings.concat(strings.map { |string| "#{string[0, 2]} " })

if method_name_first_char = name[/(?:#|::)([^A-Z])/, 1]
# Example: "AbstractController::Base::controller_path" => ".c"
strings << ".#{method_name_first_char}"
# Example: "AbstractController::Base::controller_path" => "h("
strings << "#{name[-1]}("
end

strings.flat_map { |string| string.each_char.each_cons(2).map(&:join) }.uniq
strings.flat_map { |string| string.each_char.each_cons(3).map(&:join) }.uniq
end

def compile_bigrams(bigram_sets)
# Assign each bigram a bit position based on its rarity. More common bigrams
def compile_ngrams(ngram_sets)
# Assign each ngram a bit position based on its rarity. More common ngrams
# come first. This reduces the average number of bytes required to store a
# fingerprint.
bigram_sets.flatten.tally.sort_by(&:last).reverse.map(&:first).each_with_index.to_h
ngram_sets.flatten.tally.sort_by(&:last).reverse.map(&:first).each_with_index.to_h
end

def generate_fingerprint(bigrams, bigram_bit_positions)
bit_positions = bigrams.map(&bigram_bit_positions)
def generate_fingerprint(ngrams, ngram_bit_positions)
bit_positions = ngrams.map(&ngram_bit_positions)
byte_count = ((bit_positions.max + 1) / 8.0).ceil
bytes = [0] * byte_count

bit_positions.each do |position|
bytes[position / 8] |= 1 << (position % 8)
end

bytes
Uint8Array.new(bytes)
end

BIGRAM_PATTERN_WEIGHTS = {
NGRAM_PATTERN_WEIGHTS = {
/[^a-z]/ => 2, # Bonus point for non-lowercase-alpha chars because they show intentionality.
/^ / => 3, # More points for matching generic start of token.
/^:/ => 4, # Even more points for explicit start of token.
/[#.(]/ => 50, # Strongly prefer methods when query includes "#", ".", or "(".
}

def compute_bit_weights(bigram_bit_positions)
bigram_bit_positions.uniq(&:last).sort_by(&:last).map do |bigram, _position|
BIGRAM_PATTERN_WEIGHTS.map { |pattern, weight| bigram.match?(pattern) ? weight : 1 }.max
def compute_bit_weights(ngram_bit_positions)
weights = ngram_bit_positions.uniq(&:last).sort_by(&:last).map do |ngram, _position|
NGRAM_PATTERN_WEIGHTS.map { |pattern, weight| ngram.match?(pattern) ? weight : 1 }.max
end

Uint8Array.new(weights)
end

def compute_tiebreaker_bonus(module_name, method_name, description)
method_name ||= ""

# Bonus is per matching bigram and is very small so it does not outweigh
# points from other matches. Longer names have smaller per-bigram bonuses,
# Bonus is per matching ngram and is very small so it does not outweigh
# points from other matches. Longer names have smaller per-ngram bonuses,
# but the value scales down very slowly.
bonus = 0.01 / (module_name.length + method_name.length) ** 0.025

Expand Down
7 changes: 4 additions & 3 deletions spec/rdoc_generator_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,10 @@ def parse_options(*options)
Dir.mktmpdir do |dir|
Dir.chdir(dir) do
rdoc_run("--files", "#{__dir__}/../README.md", "#{__dir__}/../lib/sdoc/version.rb")
contents = File.read("doc/js/search-index.js")
index = JSON.parse(contents.delete_prefix!("export default ").delete_suffix!(";"))
_(index.keys.sort).must_equal ["bigrams", "entries", "weights"]
index = File.read("doc/js/search-index.js")
index.delete_prefix!("export default ").delete_suffix!(";")
index.gsub!(/\(new Uint8Array\((.+?)\)\)/, '\1')
_(JSON.parse(index).keys.sort).must_equal ["ngrams", "weights", "entries"].sort
end
end
end
Expand Down
Loading