Skip to content

Commit

Permalink
Include stop word lists from Lucene and the Snowball project (#1666)
Browse files Browse the repository at this point in the history
  • Loading branch information
adamreichold authored Nov 9, 2022
1 parent 3e9c806 commit a4b759d
Show file tree
Hide file tree
Showing 6 changed files with 2,243 additions and 6 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ jobs:
strategy:
matrix:
features: [
{ label: "all", flags: "mmap,brotli-compression,lz4-compression,snappy-compression,zstd-compression,failpoints" },
{ label: "all", flags: "mmap,stopwords,brotli-compression,lz4-compression,snappy-compression,zstd-compression,failpoints" },
{ label: "quickwit", flags: "mmap,quickwit,failpoints" }
]

Expand Down
3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,9 @@ debug-assertions = true
overflow-checks = true

[features]
default = ["mmap", "lz4-compression" ]
default = ["mmap", "stopwords", "lz4-compression"]
mmap = ["fs2", "tempfile", "memmap2"]
stopwords = []

brotli-compression = ["brotli"]
lz4-compression = ["lz4_flex"]
Expand Down
2 changes: 1 addition & 1 deletion src/fieldnorm/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use crate::DocId;
/// The `FieldNormsWriter` is in charge of tracking the fieldnorm byte
/// of each document for each field with field norms.
///
/// `FieldNormsWriter` stores a Vec<u8> for each tracked field, using a
/// `FieldNormsWriter` stores a `Vec<u8>` for each tracked field, using a
/// byte per document per field.
pub struct FieldNormsWriter {
fieldnorms_buffers: Vec<Option<Vec<u8>>>,
Expand Down
42 changes: 42 additions & 0 deletions src/tokenizer/stop_word_filter/gen_stopwords.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import requests

# Languages whose Snowball stop word lists are fetched and embedded
# into the generated `stopwords.rs` module.
LANGUAGES = [
    "danish",
    "dutch",
    "finnish",
    "french",
    "german",
    "italian",
    "norwegian",
    "portuguese",
    "russian",
    "spanish",
    "swedish",
]

# Abort a hung HTTP request after this many seconds instead of blocking forever.
TIMEOUT = 30

# The generated file embeds non-ASCII words (e.g. Russian), so force UTF-8
# regardless of the platform's default encoding.
with requests.Session() as sess, open("stopwords.rs", "w", encoding="utf-8") as mod:
    # Emit the Snowball license header as a Rust block comment.
    mod.write("/*\n")
    mod.write(
        "These stop word lists are from the Snowball project (https://snowballstem.org/)\nwhich carries the following copyright and license:\n\n"
    )

    resp = sess.get(
        "https://raw.githubusercontent.com/snowballstem/snowball/master/COPYING",
        timeout=TIMEOUT,
    )
    resp.raise_for_status()
    mod.write(resp.text)
    mod.write("*/\n\n")

    # One `pub const <LANG>: &[&str]` per language, filled from the
    # project's published stop word list.
    for lang in LANGUAGES:
        resp = sess.get(
            f"https://snowballstem.org/algorithms/{lang}/stop.txt",
            timeout=TIMEOUT,
        )
        resp.raise_for_status()

        mod.write(f"pub const {lang.upper()}: &[&str] = &[\n")

        for line in resp.text.splitlines():
            # Snowball stop.txt files use `|` to introduce comments;
            # keep only the text before it.
            line, _, _ = line.partition("|")

            # Remaining whitespace-separated tokens are the stop words.
            for word in line.split():
                mod.write(f'    "{word}",\n')

        mod.write("];\n\n")
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@
//! assert_eq!(stream.next().unwrap().text, "crafty");
//! assert!(stream.next().is_none());
//! ```
#[cfg(feature = "stopwords")]
#[rustfmt::skip]
mod stopwords;

use std::sync::Arc;

use rustc_hash::FxHashSet;
Expand All @@ -31,14 +35,87 @@ impl StopWordFilter {
}
}

fn english() -> StopWordFilter {
let words: [&'static str; 33] = [
fn from_word_list(words: &[&str]) -> Self {
Self::remove(words.iter().map(|&word| word.to_owned()))
}

#[cfg(feature = "stopwords")]
/// Create a `StopWordFilter` for the Danish language
pub fn danish() -> Self {
    Self::from_word_list(stopwords::DANISH)
}

#[cfg(feature = "stopwords")]
/// Create a `StopWordFilter` for the Dutch language
pub fn dutch() -> Self {
    Self::from_word_list(stopwords::DUTCH)
}

/// Create a `StopWordFilter` for the English language
pub fn english() -> Self {
    // This is the same list of words used by the Apache-licensed Lucene project,
    // c.f. https://github.com/apache/lucene/blob/d5d6dc079395c47cd6d12dcce3bcfdd2c7d9dc63/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java#L46
    const WORDS: &[&str] = &[
        "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into",
        "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then",
        "there", "these", "they", "this", "to", "was", "will", "with",
    ];

    Self::from_word_list(WORDS)
}

#[cfg(feature = "stopwords")]
/// Create a `StopWordFilter` for the Finnish language
pub fn finnish() -> Self {
    Self::from_word_list(stopwords::FINNISH)
}

#[cfg(feature = "stopwords")]
/// Create a `StopWordFilter` for the French language
pub fn french() -> Self {
    Self::from_word_list(stopwords::FRENCH)
}

#[cfg(feature = "stopwords")]
/// Create a `StopWordFilter` for the German language
pub fn german() -> Self {
    Self::from_word_list(stopwords::GERMAN)
}

#[cfg(feature = "stopwords")]
/// Create a `StopWordFilter` for the Italian language
pub fn italian() -> Self {
    Self::from_word_list(stopwords::ITALIAN)
}

#[cfg(feature = "stopwords")]
/// Create a `StopWordFilter` for the Norwegian language
pub fn norwegian() -> Self {
    Self::from_word_list(stopwords::NORWEGIAN)
}

#[cfg(feature = "stopwords")]
/// Create a `StopWordFilter` for the Portuguese language
pub fn portuguese() -> Self {
    Self::from_word_list(stopwords::PORTUGUESE)
}

#[cfg(feature = "stopwords")]
/// Create a `StopWordFilter` for the Russian language
pub fn russian() -> Self {
    Self::from_word_list(stopwords::RUSSIAN)
}

#[cfg(feature = "stopwords")]
/// Create a `StopWordFilter` for the Spanish language
pub fn spanish() -> Self {
    Self::from_word_list(stopwords::SPANISH)
}

#[cfg(feature = "stopwords")]
/// Create a `StopWordFilter` for the Swedish language
pub fn swedish() -> Self {
    Self::from_word_list(stopwords::SWEDISH)
}
}

Expand Down
Loading

0 comments on commit a4b759d

Please sign in to comment.