Skip to content

Commit

Permalink
Replace occurrences of "bigram" with "ngram"
Browse files (browse the repository at this point in the history)
This is in preparation for switching from bigrams to trigrams, reducing
the size of the subsequent diff.
  • Loading branch information
jonathanhefner committed Oct 23, 2023
1 parent 36b184c commit f9e496a
Show file tree
Hide file tree
Showing 4 changed files with 83 additions and 83 deletions.
4 changes: 2 additions & 2 deletions lib/rdoc/generator/template/rails/resources/js/search.js
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ export class Search {
const bitPositions = [];

for (let i = 0, len = query.length; i < len; i += 1) {
const bigram = i === 0 ? (" " + query[0]) : query.substring(i - 1, i + 1);
const position = searchIndex.bigrams[bigram];
const ngram = i === 0 ? (" " + query[0]) : query.substring(i - 1, i + 1);
const position = searchIndex.ngrams[ngram];

if (position) {
bitPositions.push(position);
Expand Down
36 changes: 18 additions & 18 deletions lib/sdoc/search_index.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,16 @@ def generate(rdoc_modules)
# all of Foo::Bar's RDoc::MethodAttr instances.
rdoc_objects = rdoc_modules + rdoc_modules.flat_map(&:method_list).uniq

bigram_sets = rdoc_objects.map { |rdoc_object| derive_bigrams(rdoc_object.full_name) }
bigram_bit_positions = compile_bigrams(bigram_sets)
bit_weights = compute_bit_weights(bigram_bit_positions)
ngram_sets = rdoc_objects.map { |rdoc_object| derive_ngrams(rdoc_object.full_name) }
ngram_bit_positions = compile_ngrams(ngram_sets)
bit_weights = compute_bit_weights(ngram_bit_positions)

entries = rdoc_objects.zip(bigram_sets).map do |rdoc_object, bigrams|
entries = rdoc_objects.zip(ngram_sets).map do |rdoc_object, ngrams|
rdoc_module, rdoc_method = rdoc_object.is_a?(RDoc::ClassModule) ? [rdoc_object] : [rdoc_object.parent, rdoc_object]
description = rdoc_object.description

[
generate_fingerprint(bigrams, bigram_bit_positions),
generate_fingerprint(ngrams, ngram_bit_positions),
compute_tiebreaker_bonus(rdoc_module.full_name, rdoc_method&.name, description),
rdoc_object.path,
rdoc_module.full_name,
Expand All @@ -36,10 +36,10 @@ def generate(rdoc_modules)
]
end

{ "bigrams" => bigram_bit_positions, "weights" => bit_weights, "entries" => entries }
{ "ngrams" => ngram_bit_positions, "weights" => bit_weights, "entries" => entries }
end

def derive_bigrams(name)
def derive_ngrams(name)
# Example: "ActiveSupport::Cache::Store" => ":ActiveSupport:Cache:Store"
strings = [":#{name}".gsub("::", ":")]

Expand All @@ -65,15 +65,15 @@ def derive_bigrams(name)
strings.flat_map { |string| string.each_char.each_cons(2).map(&:join) }.uniq
end

# Builds the lookup table that maps every distinct ngram to a bit position.
# (Rendered diff: each renamed line appears twice below, first with the old
# "bigram" spelling and then with the new "ngram" spelling.)
def compile_bigrams(bigram_sets)
# Assign each bigram a bit position based on its rarity. More common bigrams
def compile_ngrams(ngram_sets)
# Assign each ngram a bit position based on its rarity. More common ngrams
# come first. This reduces the average number of bytes required to store a
# fingerprint.
#
# Steps of the chain: flatten the per-object sets into one list, count how
# often each ngram occurs (tally), sort ascending by that count and reverse so
# the highest counts lead, keep only the ngram strings, then number them
# starting at 0. The result is a Hash of ngram => Integer position.
# Low positions matter because generate_fingerprint sizes its byte array from
# the highest bit position it has to set.
bigram_sets.flatten.tally.sort_by(&:last).reverse.map(&:first).each_with_index.to_h
ngram_sets.flatten.tally.sort_by(&:last).reverse.map(&:first).each_with_index.to_h
end

def generate_fingerprint(bigrams, bigram_bit_positions)
bit_positions = bigrams.map(&bigram_bit_positions)
def generate_fingerprint(ngrams, ngram_bit_positions)
bit_positions = ngrams.map(&ngram_bit_positions)
byte_count = ((bit_positions.max + 1) / 8.0).ceil
bytes = [0] * byte_count

Expand All @@ -84,16 +84,16 @@ def generate_fingerprint(bigrams, bigram_bit_positions)
Uint8Array.new(bytes)
end

# Regexp => weight table consulted by compute_bit_weights. Every pattern is
# tried against an ngram; a pattern that does not match contributes 1, and the
# largest value across all patterns becomes that ngram's weight.
# (Rendered diff: the constant's old and new names both appear below.)
BIGRAM_PATTERN_WEIGHTS = {
NGRAM_PATTERN_WEIGHTS = {
/[^a-z]/ => 2, # Bonus point for non-lowercase-alpha chars because they show intentionality.
/^ / => 3, # More points for matching generic start of token.
/^:/ => 4, # Even more points for explicit start of token.
/[#.(]/ => 50, # Strongly prefer methods when query includes "#", ".", or "(".
}

def compute_bit_weights(bigram_bit_positions)
weights = bigram_bit_positions.uniq(&:last).sort_by(&:last).map do |bigram, _position|
BIGRAM_PATTERN_WEIGHTS.map { |pattern, weight| bigram.match?(pattern) ? weight : 1 }.max
def compute_bit_weights(ngram_bit_positions)
weights = ngram_bit_positions.uniq(&:last).sort_by(&:last).map do |ngram, _position|
NGRAM_PATTERN_WEIGHTS.map { |pattern, weight| ngram.match?(pattern) ? weight : 1 }.max
end

Uint8Array.new(weights)
Expand All @@ -102,8 +102,8 @@ def compute_bit_weights(bigram_bit_positions)
def compute_tiebreaker_bonus(module_name, method_name, description)
method_name ||= ""

# Bonus is per matching bigram and is very small so it does not outweigh
# points from other matches. Longer names have smaller per-bigram bonuses,
# Bonus is per matching ngram and is very small so it does not outweigh
# points from other matches. Longer names have smaller per-ngram bonuses,
# but the value scales down very slowly.
bonus = 0.01 / (module_name.length + method_name.length) ** 0.025

Expand Down
2 changes: 1 addition & 1 deletion spec/rdoc_generator_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def parse_options(*options)
index = File.read("doc/js/search-index.js")
index.delete_prefix!("export default ").delete_suffix!(";")
index.gsub!(/\(new Uint8Array\((.+?)\)\)/, '\1')
_(JSON.parse(index).keys.sort).must_equal ["bigrams", "entries", "weights"]
_(JSON.parse(index).keys.sort).must_equal ["ngrams", "weights", "entries"].sort
end
end
end
Expand Down
124 changes: 62 additions & 62 deletions spec/search_index_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,14 @@ def hoge_fuga; end
end
RUBY

bigrams = SDoc::SearchIndex.derive_bigrams("FooBar#hoge_fuga")
ngrams = SDoc::SearchIndex.derive_ngrams("FooBar#hoge_fuga")

search_index = SDoc::SearchIndex.generate(top_level.classes_and_modules)

_(search_index.keys.sort).must_equal ["bigrams", "entries", "weights"]
_(search_index.keys.sort).must_equal ["ngrams", "weights", "entries"].sort

_(search_index["bigrams"].keys.sort).must_equal bigrams.sort
_(search_index["bigrams"].values.max).must_equal search_index["weights"].length - 1
_(search_index["ngrams"].keys.sort).must_equal ngrams.sort
_(search_index["ngrams"].values.max).must_equal search_index["weights"].length - 1

_(search_index["entries"].length).must_equal 2
search_index["entries"].each do |entry|
Expand Down Expand Up @@ -47,86 +47,86 @@ def hoge_fuga; end
end
end

# Specs for SDoc::SearchIndex.derive_ngrams, which turns a fully qualified
# name into the list of two-character ngrams used for search matching.
# (Rendered diff: each renamed line appears twice, old "bigram" spelling first
# and new "ngram" spelling second.)
describe "#derive_bigrams" do
it "returns bigrams for a given string" do
describe "#derive_ngrams" do
it "returns ngrams for a given string" do
expected = %w[ab bc cx xy yz]
# Array#& keeps only the expected ngrams, so any extra ngrams the method
# derives are ignored and only presence and relative order are checked.
_(SDoc::SearchIndex.derive_bigrams("abcxyz") & expected).must_equal expected
_(SDoc::SearchIndex.derive_ngrams("abcxyz") & expected).must_equal expected
end

it "includes module-related bigrams" do
bigrams = SDoc::SearchIndex.derive_bigrams("Abc::Xyz")
it "includes module-related ngrams" do
ngrams = SDoc::SearchIndex.derive_ngrams("Abc::Xyz")

# A ":" is expected in front of the first character of each namespace segment.
_(bigrams).must_include ":A"
_(bigrams).must_include ":X"
_(ngrams).must_include ":A"
_(ngrams).must_include ":X"

# The separator must not pair with the end of the preceding segment, and the
# doubled "::" must not survive as an ngram of its own.
_(bigrams).wont_include "c:"
_(bigrams).wont_include "::"
_(ngrams).wont_include "c:"
_(ngrams).wont_include "::"
end

it "includes method-related bigrams" do
bigrams = SDoc::SearchIndex.derive_bigrams("Abc#def_xyz")
it "includes method-related ngrams" do
ngrams = SDoc::SearchIndex.derive_ngrams("Abc#def_xyz")

# The method name should be reachable via "#", via "." and via a trailing "(".
_(bigrams).must_include "#d"
_(bigrams).must_include ".d"
_(bigrams).must_include "z("
_(ngrams).must_include "#d"
_(ngrams).must_include ".d"
_(ngrams).must_include "z("

# The "#" must not pair with the end of the module name.
_(bigrams).wont_include "c#"
_(ngrams).wont_include "c#"

# Underscore-adjacent pairs are kept, and "fx" is expected as well.
# NOTE(review): "fx" presumably lets a query skip the underscore; the code
# that derives it is not visible in this diff, so confirm against the method.
_(bigrams).must_include "f_"
_(bigrams).must_include "_x"
_(bigrams).must_include "fx"
_(ngrams).must_include "f_"
_(ngrams).must_include "_x"
_(ngrams).must_include "fx"
end

it "includes space delimiter bigrams" do
bigrams = SDoc::SearchIndex.derive_bigrams("Abc::Def#xyz")
it "includes space delimiter ngrams" do
ngrams = SDoc::SearchIndex.derive_ngrams("Abc::Def#xyz")

# A leading space is expected before the first character of every token
# (both module segments and the method name).
_(bigrams).must_include " A"
_(bigrams).must_include " D"
_(bigrams).must_include " x"
_(ngrams).must_include " A"
_(ngrams).must_include " D"
_(ngrams).must_include " x"

# The space only ever leads a token; it never trails one.
_(bigrams).wont_include "c "
_(bigrams).wont_include "f "
_(ngrams).wont_include "c "
_(ngrams).wont_include "f "
end

it "includes acronym bigrams" do
bigrams = SDoc::SearchIndex.derive_bigrams("AbcDefGhi::RstUvwXyz")
it "includes acronym ngrams" do
ngrams = SDoc::SearchIndex.derive_ngrams("AbcDefGhi::RstUvwXyz")

# Consecutive capital letters inside one namespace segment are paired up.
_(bigrams).must_include "AD"
_(bigrams).must_include "DG"
_(bigrams).must_include "RU"
_(bigrams).must_include "UX"
_(ngrams).must_include "AD"
_(ngrams).must_include "DG"
_(ngrams).must_include "RU"
_(ngrams).must_include "UX"

# Acronym pairs must not bridge two different segments.
_(bigrams).wont_include "GR"
_(ngrams).wont_include "GR"
end

it "includes downcased bigrams except for acronym bigrams" do
bigrams = SDoc::SearchIndex.derive_bigrams("AbcDefGhi::RstUvwXyz")
it "includes downcased ngrams except for acronym ngrams" do
ngrams = SDoc::SearchIndex.derive_ngrams("AbcDefGhi::RstUvwXyz")

# Select ngrams that contain an uppercase letter but not two uppercase
# letters in a row (those are the acronym pairs), and require that the
# lowercase form of each one is present too.
bigrams.grep(/[A-Z]/).grep_v(/[A-Z]{2}/).each do |uppercase|
_(bigrams).must_include uppercase.downcase
ngrams.grep(/[A-Z]/).grep_v(/[A-Z]{2}/).each do |uppercase|
_(ngrams).must_include uppercase.downcase
end
end
end

# Spec for SDoc::SearchIndex.compile_ngrams: the most frequent ngram must
# receive bit position 0.
# (Rendered diff: each renamed line appears twice, old "bigram" spelling first
# and new "ngram" spelling second.)
describe "#compile_bigrams" do
it "assigns bigram bit positions based on bigram rarity" do
describe "#compile_ngrams" do
it "assigns ngram bit positions based on ngram rarity" do
# Four base strings: "aa", "ab", "ac", "ad". Dropping 0..3 leading elements
# yields sets in which "aa" occurs once, "ab" twice, "ac" three times and
# "ad" four times, so every ngram has a distinct frequency.
base_bigrams = ("aa".."zz").take(4)
bigram_sets = (0..3).map { |n| base_bigrams.drop(n) }
base_ngrams = ("aa".."zz").take(4)
ngram_sets = (0..3).map { |n| base_ngrams.drop(n) }

# Highest frequency first means the reversed base list, numbered from 0.
_(SDoc::SearchIndex.compile_bigrams(bigram_sets)).
must_equal base_bigrams.reverse.each_with_index.to_h
_(SDoc::SearchIndex.compile_ngrams(ngram_sets)).
must_equal base_ngrams.reverse.each_with_index.to_h
end
end

describe "#generate_fingerprint" do
it "returns an array of bytes with bits set for the given bigrams" do
bigrams = ("aa".."zz").take(8)
it "returns an array of bytes with bits set for the given ngrams" do
ngrams = ("aa".."zz").take(8)

packed_positions = bigrams.each_with_index.to_h
_(SDoc::SearchIndex.generate_fingerprint(bigrams, packed_positions)).must_equal [0b11111111]
packed_positions = ngrams.each_with_index.to_h
_(SDoc::SearchIndex.generate_fingerprint(ngrams, packed_positions)).must_equal [0b11111111]

sparse_positions = bigrams.each_with_index.to_h { |bigram, i| [bigram, i * 8] }
_(SDoc::SearchIndex.generate_fingerprint(bigrams, sparse_positions)).must_equal [1] * 8
sparse_positions = ngrams.each_with_index.to_h { |ngram, i| [ngram, i * 8] }
_(SDoc::SearchIndex.generate_fingerprint(ngrams, sparse_positions)).must_equal [1] * 8
end

it "omits trailing zero bytes" do
Expand All @@ -139,33 +139,33 @@ def hoge_fuga; end
_(SDoc::SearchIndex.compute_bit_weights({ "xx" => 0, "yy" => 1 })).must_equal [1, 1]
end

it "computes weights based on bigram content" do
bigram_bit_positions = { "xx" => 0, " x" => 1, ":X" => 2, "#x" => 3 }
bit_weights = SDoc::SearchIndex.compute_bit_weights(bigram_bit_positions)
it "computes weights based on ngram content" do
ngram_bit_positions = { "xx" => 0, " x" => 1, ":X" => 2, "#x" => 3 }
bit_weights = SDoc::SearchIndex.compute_bit_weights(ngram_bit_positions)

_(bit_weights.length).must_equal bigram_bit_positions.length
_(bit_weights.length).must_equal ngram_bit_positions.length
_(bit_weights.uniq).must_equal bit_weights
_(bit_weights.sort).must_equal bit_weights
end

it "orders weights by bit position" do
bigram_bit_positions = { "xx" => 0, " x" => 1, ":X" => 2, "#x" => 3 }
bit_weights = SDoc::SearchIndex.compute_bit_weights(bigram_bit_positions)
ngram_bit_positions = { "xx" => 0, " x" => 1, ":X" => 2, "#x" => 3 }
bit_weights = SDoc::SearchIndex.compute_bit_weights(ngram_bit_positions)

reversed = bigram_bit_positions.reverse_each.to_h
reversed = ngram_bit_positions.reverse_each.to_h
_(SDoc::SearchIndex.compute_bit_weights(reversed)).must_equal bit_weights

inverted = bigram_bit_positions.transform_values { |pos| -pos + bit_weights.length }
inverted = ngram_bit_positions.transform_values { |pos| -pos + bit_weights.length }
_(SDoc::SearchIndex.compute_bit_weights(inverted)).must_equal bit_weights.reverse
end

it "ignores alias bigrams" do
it "ignores alias ngrams" do
_(SDoc::SearchIndex.compute_bit_weights({ "#x" => 0, ".x" => 0}).length).must_equal 1
end
end

describe "#compute_tiebreaker_bonus" do
it "returns a value much smaller than 1 (the value of a single matching bigram)" do
it "returns a value much smaller than 1 (the value of a single matching ngram)" do
_(SDoc::SearchIndex.compute_tiebreaker_bonus("X", nil, "")).must_be :<=, 0.1
end

Expand Down

0 comments on commit f9e496a

Please sign in to comment.