From f9e496a9f6b3332dc653ddfd04176fad38d03df5 Mon Sep 17 00:00:00 2001 From: Jonathan Hefner Date: Sun, 22 Oct 2023 12:29:20 -0500 Subject: [PATCH] Replace occurrences of "bigram" with "ngram" This is in preparation for switching from bigrams to trigrams, reducing the size of the subsequent diff. --- .../template/rails/resources/js/search.js | 4 +- lib/sdoc/search_index.rb | 36 ++--- spec/rdoc_generator_spec.rb | 2 +- spec/search_index_spec.rb | 124 +++++++++--------- 4 files changed, 83 insertions(+), 83 deletions(-) diff --git a/lib/rdoc/generator/template/rails/resources/js/search.js b/lib/rdoc/generator/template/rails/resources/js/search.js index af7529a6..f02a330f 100644 --- a/lib/rdoc/generator/template/rails/resources/js/search.js +++ b/lib/rdoc/generator/template/rails/resources/js/search.js @@ -40,8 +40,8 @@ export class Search { const bitPositions = []; for (let i = 0, len = query.length; i < len; i += 1) { - const bigram = i === 0 ? (" " + query[0]) : query.substring(i - 1, i + 1); - const position = searchIndex.bigrams[bigram]; + const ngram = i === 0 ? (" " + query[0]) : query.substring(i - 1, i + 1); + const position = searchIndex.ngrams[ngram]; if (position) { bitPositions.push(position); diff --git a/lib/sdoc/search_index.rb b/lib/sdoc/search_index.rb index dcbd836a..50dd6538 100644 --- a/lib/sdoc/search_index.rb +++ b/lib/sdoc/search_index.rb @@ -18,16 +18,16 @@ def generate(rdoc_modules) # all of Foo::Bar's RDoc::MethodAttr instances. rdoc_objects = rdoc_modules + rdoc_modules.flat_map(&:method_list).uniq - bigram_sets = rdoc_objects.map { |rdoc_object| derive_bigrams(rdoc_object.full_name) } - bigram_bit_positions = compile_bigrams(bigram_sets) - bit_weights = compute_bit_weights(bigram_bit_positions) + ngram_sets = rdoc_objects.map { |rdoc_object| derive_ngrams(rdoc_object.full_name) } + ngram_bit_positions = compile_ngrams(ngram_sets) + bit_weights = compute_bit_weights(ngram_bit_positions) - entries = rdoc_objects.zip(bigram_sets).map do |rdoc_object, bigrams| + entries = rdoc_objects.zip(ngram_sets).map do |rdoc_object, ngrams| rdoc_module, rdoc_method = rdoc_object.is_a?(RDoc::ClassModule) ? [rdoc_object] : [rdoc_object.parent, rdoc_object] description = rdoc_object.description [ - generate_fingerprint(bigrams, bigram_bit_positions), + generate_fingerprint(ngrams, ngram_bit_positions), compute_tiebreaker_bonus(rdoc_module.full_name, rdoc_method&.name, description), rdoc_object.path, rdoc_module.full_name, @@ -36,10 +36,10 @@ def generate(rdoc_modules) ] end - { "bigrams" => bigram_bit_positions, "weights" => bit_weights, "entries" => entries } + { "ngrams" => ngram_bit_positions, "weights" => bit_weights, "entries" => entries } end - def derive_bigrams(name) + def derive_ngrams(name) # Example: "ActiveSupport::Cache::Store" => ":ActiveSupport:Cache:Store" strings = [":#{name}".gsub("::", ":")] @@ -65,15 +65,15 @@ def derive_bigrams(name) strings.flat_map { |string| string.each_char.each_cons(2).map(&:join) }.uniq end - def compile_bigrams(bigram_sets) - # Assign each bigram a bit position based on its rarity. More common bigrams + def compile_ngrams(ngram_sets) + # Assign each ngram a bit position based on its rarity. More common ngrams # come first. This reduces the average number of bytes required to store a # fingerprint. - bigram_sets.flatten.tally.sort_by(&:last).reverse.map(&:first).each_with_index.to_h + ngram_sets.flatten.tally.sort_by(&:last).reverse.map(&:first).each_with_index.to_h end - def generate_fingerprint(bigrams, bigram_bit_positions) - bit_positions = bigrams.map(&bigram_bit_positions) + def generate_fingerprint(ngrams, ngram_bit_positions) + bit_positions = ngrams.map(&ngram_bit_positions) byte_count = ((bit_positions.max + 1) / 8.0).ceil bytes = [0] * byte_count @@ -84,16 +84,16 @@ def generate_fingerprint(bigrams, bigram_bit_positions) Uint8Array.new(bytes) end - BIGRAM_PATTERN_WEIGHTS = { + NGRAM_PATTERN_WEIGHTS = { /[^a-z]/ => 2, # Bonus point for non-lowercase-alpha chars because they show intentionality. /^ / => 3, # More points for matching generic start of token. /^:/ => 4, # Even more points for explicit start of token. /[#.(]/ => 50, # Strongly prefer methods when query includes "#", ".", or "(". } - def compute_bit_weights(bigram_bit_positions) - weights = bigram_bit_positions.uniq(&:last).sort_by(&:last).map do |bigram, _position| - BIGRAM_PATTERN_WEIGHTS.map { |pattern, weight| bigram.match?(pattern) ? weight : 1 }.max + def compute_bit_weights(ngram_bit_positions) + weights = ngram_bit_positions.uniq(&:last).sort_by(&:last).map do |ngram, _position| + NGRAM_PATTERN_WEIGHTS.map { |pattern, weight| ngram.match?(pattern) ? weight : 1 }.max end Uint8Array.new(weights) @@ -102,8 +102,8 @@ def compute_bit_weights(bigram_bit_positions) def compute_tiebreaker_bonus(module_name, method_name, description) method_name ||= "" - # Bonus is per matching bigram and is very small so it does not outweigh - # points from other matches. Longer names have smaller per-bigram bonuses, + # Bonus is per matching ngram and is very small so it does not outweigh + # points from other matches. Longer names have smaller per-ngram bonuses, # but the value scales down very slowly. bonus = 0.01 / (module_name.length + method_name.length) ** 0.025 diff --git a/spec/rdoc_generator_spec.rb b/spec/rdoc_generator_spec.rb index d36fd2b6..ba504e42 100644 --- a/spec/rdoc_generator_spec.rb +++ b/spec/rdoc_generator_spec.rb @@ -38,7 +38,7 @@ def parse_options(*options) index = File.read("doc/js/search-index.js") index.delete_prefix!("export default ").delete_suffix!(";") index.gsub!(/\(new Uint8Array\((.+?)\)\)/, '\1') - _(JSON.parse(index).keys.sort).must_equal ["bigrams", "entries", "weights"] + _(JSON.parse(index).keys.sort).must_equal ["ngrams", "weights", "entries"].sort end end end diff --git a/spec/search_index_spec.rb b/spec/search_index_spec.rb index c19aca37..0c992341 100644 --- a/spec/search_index_spec.rb +++ b/spec/search_index_spec.rb @@ -11,14 +11,14 @@ def hoge_fuga; end end RUBY - bigrams = SDoc::SearchIndex.derive_bigrams("FooBar#hoge_fuga") + ngrams = SDoc::SearchIndex.derive_ngrams("FooBar#hoge_fuga") search_index = SDoc::SearchIndex.generate(top_level.classes_and_modules) - _(search_index.keys.sort).must_equal ["bigrams", "entries", "weights"] + _(search_index.keys.sort).must_equal ["ngrams", "weights", "entries"].sort - _(search_index["bigrams"].keys.sort).must_equal bigrams.sort - _(search_index["bigrams"].values.max).must_equal search_index["weights"].length - 1 + _(search_index["ngrams"].keys.sort).must_equal ngrams.sort + _(search_index["ngrams"].values.max).must_equal search_index["weights"].length - 1 _(search_index["entries"].length).must_equal 2 search_index["entries"].each do |entry| @@ -47,86 +47,86 @@ def hoge_fuga; end end end - describe "#derive_bigrams" do - it "returns bigrams for a given string" do + describe "#derive_ngrams" do + it "returns ngrams for a given string" do expected = %w[ab bc cx xy yz] - _(SDoc::SearchIndex.derive_bigrams("abcxyz") & expected).must_equal expected + _(SDoc::SearchIndex.derive_ngrams("abcxyz") & expected).must_equal expected end - it "includes module-related bigrams" do - bigrams = SDoc::SearchIndex.derive_bigrams("Abc::Xyz") + it "includes module-related ngrams" do + ngrams = SDoc::SearchIndex.derive_ngrams("Abc::Xyz") - _(bigrams).must_include ":A" - _(bigrams).must_include ":X" + _(ngrams).must_include ":A" + _(ngrams).must_include ":X" - _(bigrams).wont_include "c:" - _(bigrams).wont_include "::" + _(ngrams).wont_include "c:" + _(ngrams).wont_include "::" end - it "includes method-related bigrams" do - bigrams = SDoc::SearchIndex.derive_bigrams("Abc#def_xyz") + it "includes method-related ngrams" do + ngrams = SDoc::SearchIndex.derive_ngrams("Abc#def_xyz") - _(bigrams).must_include "#d" - _(bigrams).must_include ".d" - _(bigrams).must_include "z(" + _(ngrams).must_include "#d" + _(ngrams).must_include ".d" + _(ngrams).must_include "z(" - _(bigrams).wont_include "c#" + _(ngrams).wont_include "c#" - _(bigrams).must_include "f_" - _(bigrams).must_include "_x" - _(bigrams).must_include "fx" + _(ngrams).must_include "f_" + _(ngrams).must_include "_x" + _(ngrams).must_include "fx" end - it "includes space delimiter bigrams" do - bigrams = SDoc::SearchIndex.derive_bigrams("Abc::Def#xyz") + it "includes space delimiter ngrams" do + ngrams = SDoc::SearchIndex.derive_ngrams("Abc::Def#xyz") - _(bigrams).must_include " A" - _(bigrams).must_include " D" - _(bigrams).must_include " x" + _(ngrams).must_include " A" + _(ngrams).must_include " D" + _(ngrams).must_include " x" - _(bigrams).wont_include "c " - _(bigrams).wont_include "f " + _(ngrams).wont_include "c " + _(ngrams).wont_include "f " end - it "includes acronym bigrams" do - bigrams = SDoc::SearchIndex.derive_bigrams("AbcDefGhi::RstUvwXyz") + it "includes acronym ngrams" do + ngrams = SDoc::SearchIndex.derive_ngrams("AbcDefGhi::RstUvwXyz") - _(bigrams).must_include "AD" - _(bigrams).must_include "DG" - _(bigrams).must_include "RU" - _(bigrams).must_include "UX" + _(ngrams).must_include "AD" + _(ngrams).must_include "DG" + _(ngrams).must_include "RU" + _(ngrams).must_include "UX" - _(bigrams).wont_include "GR" + _(ngrams).wont_include "GR" end - it "includes downcased bigrams except for acronym bigrams" do - bigrams = SDoc::SearchIndex.derive_bigrams("AbcDefGhi::RstUvwXyz") + it "includes downcased ngrams except for acronym ngrams" do + ngrams = SDoc::SearchIndex.derive_ngrams("AbcDefGhi::RstUvwXyz") - bigrams.grep(/[A-Z]/).grep_v(/[A-Z]{2}/).each do |uppercase| - _(bigrams).must_include uppercase.downcase + ngrams.grep(/[A-Z]/).grep_v(/[A-Z]{2}/).each do |uppercase| + _(ngrams).must_include uppercase.downcase end end end - describe "#compile_bigrams" do - it "assigns bigram bit positions based on bigram rarity" do - base_bigrams = ("aa".."zz").take(4) - bigram_sets = (0..3).map { |n| base_bigrams.drop(n) } + describe "#compile_ngrams" do + it "assigns ngram bit positions based on ngram rarity" do + base_ngrams = ("aa".."zz").take(4) + ngram_sets = (0..3).map { |n| base_ngrams.drop(n) } - _(SDoc::SearchIndex.compile_bigrams(bigram_sets)). - must_equal base_bigrams.reverse.each_with_index.to_h + _(SDoc::SearchIndex.compile_ngrams(ngram_sets)). + must_equal base_ngrams.reverse.each_with_index.to_h end end describe "#generate_fingerprint" do - it "returns an array of bytes with bits set for the given bigrams" do - bigrams = ("aa".."zz").take(8) + it "returns an array of bytes with bits set for the given ngrams" do + ngrams = ("aa".."zz").take(8) - packed_positions = bigrams.each_with_index.to_h - _(SDoc::SearchIndex.generate_fingerprint(bigrams, packed_positions)).must_equal [0b11111111] + packed_positions = ngrams.each_with_index.to_h + _(SDoc::SearchIndex.generate_fingerprint(ngrams, packed_positions)).must_equal [0b11111111] - sparse_positions = bigrams.each_with_index.to_h { |bigram, i| [bigram, i * 8] } - _(SDoc::SearchIndex.generate_fingerprint(bigrams, sparse_positions)).must_equal [1] * 8 + sparse_positions = ngrams.each_with_index.to_h { |ngram, i| [ngram, i * 8] } + _(SDoc::SearchIndex.generate_fingerprint(ngrams, sparse_positions)).must_equal [1] * 8 end it "omits trailing zero bytes" do @@ -139,33 +139,33 @@ def hoge_fuga; end _(SDoc::SearchIndex.compute_bit_weights({ "xx" => 0, "yy" => 1 })).must_equal [1, 1] end - it "computes weights based on bigram content" do - bigram_bit_positions = { "xx" => 0, " x" => 1, ":X" => 2, "#x" => 3 } - bit_weights = SDoc::SearchIndex.compute_bit_weights(bigram_bit_positions) + it "computes weights based on ngram content" do + ngram_bit_positions = { "xx" => 0, " x" => 1, ":X" => 2, "#x" => 3 } + bit_weights = SDoc::SearchIndex.compute_bit_weights(ngram_bit_positions) - _(bit_weights.length).must_equal bigram_bit_positions.length + _(bit_weights.length).must_equal ngram_bit_positions.length _(bit_weights.uniq).must_equal bit_weights _(bit_weights.sort).must_equal bit_weights end it "orders weights by bit position" do - bigram_bit_positions = { "xx" => 0, " x" => 1, ":X" => 2, "#x" => 3 } - bit_weights = SDoc::SearchIndex.compute_bit_weights(bigram_bit_positions) + ngram_bit_positions = { "xx" => 0, " x" => 1, ":X" => 2, "#x" => 3 } + bit_weights = SDoc::SearchIndex.compute_bit_weights(ngram_bit_positions) - reversed = bigram_bit_positions.reverse_each.to_h + reversed = ngram_bit_positions.reverse_each.to_h _(SDoc::SearchIndex.compute_bit_weights(reversed)).must_equal bit_weights - inverted = bigram_bit_positions.transform_values { |pos| -pos + bit_weights.length } + inverted = ngram_bit_positions.transform_values { |pos| -pos + bit_weights.length } _(SDoc::SearchIndex.compute_bit_weights(inverted)).must_equal bit_weights.reverse end - it "ignores alias bigrams" do + it "ignores alias ngrams" do _(SDoc::SearchIndex.compute_bit_weights({ "#x" => 0, ".x" => 0}).length).must_equal 1 end end describe "#compute_tiebreaker_bonus" do - it "returns a value much smaller than 1 (the value of a single matching bigram)" do + it "returns a value much smaller than 1 (the value of a single matching ngram)" do _(SDoc::SearchIndex.compute_tiebreaker_bonus("X", nil, "")).must_be :<=, 0.1 end