Commit 69c9277

simplify api

PSeitz committed Jun 2, 2023
1 parent 26a0e7b commit 69c9277

Showing 17 changed files with 51 additions and 51 deletions.
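
Every file below makes the same mechanical change: the `Tokenizer` trait's generic associated type drops its second lifetime parameter. Previously `'a` named the borrow of the input text and `'b` the borrow of the tokenizer itself; now a single `'a` covers both, since the token stream may borrow the tokenizer's internal `Token` buffer as well as the text. The resulting trait, from tokenizer-api/src/lib.rs at the end of this diff, with the old signatures kept as comments for comparison:

    pub trait Tokenizer: 'static + Clone + Send + Sync {
        // Before: type TokenStream<'a, 'b>: TokenStream;
        // Before: fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> Self::TokenStream<'a, 'b>;

        /// The token stream returned by this Tokenizer.
        type TokenStream<'a>: TokenStream;
        /// Creates a token stream for a given `str`.
        fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a>;
    }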
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -77,7 +77,7 @@ proptest = "1.0.0"
 criterion = "0.5"
 test-log = "0.2.10"
 env_logger = "0.10.0"
-pprof = { version = "0.11.0", features = ["flamegraph", "criterion"] }
+pprof = { git = "https://github.com/PSeitz/pprof-rs/", rev = "7a92207", features = ["flamegraph", "criterion"] } # temp fork that works with criterion 0.5
 futures = "0.3.21"
 paste = "1.0.11"
 more-asserts = "0.3.1"
4 changes: 2 additions & 2 deletions src/tokenizer/alphanum_only.rs
@@ -50,9 +50,9 @@ impl TokenFilter for AlphaNumOnlyFilter {
 pub struct AlphaNumOnlyFilterWrapper<T>(T);
 
 impl<T: Tokenizer> Tokenizer for AlphaNumOnlyFilterWrapper<T> {
-    type TokenStream<'a, 'b> = AlphaNumOnlyFilterStream<T::TokenStream<'a, 'b>>;
+    type TokenStream<'a> = AlphaNumOnlyFilterStream<T::TokenStream<'a>>;
 
-    fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> Self::TokenStream<'a, 'b> {
+    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
         AlphaNumOnlyFilterStream {
             tail: self.0.token_stream(text),
         }
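
Every token filter in this commit follows the wrapper pattern above. As a self-contained sketch of that pattern under the new single-lifetime signature (the import path and the `MyFilterWrapper`/`MyFilterStream` names are assumptions for illustration, not part of this commit):

    use tantivy_tokenizer_api::{Token, TokenStream, Tokenizer};

    // A pass-through filter: wraps an inner tokenizer and forwards its stream.
    #[derive(Clone)]
    pub struct MyFilterWrapper<T>(T);

    pub struct MyFilterStream<S> {
        tail: S,
    }

    impl<T: Tokenizer> Tokenizer for MyFilterWrapper<T> {
        // The wrapped stream inherits the inner stream's single lifetime 'a.
        type TokenStream<'a> = MyFilterStream<T::TokenStream<'a>>;

        fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
            MyFilterStream {
                tail: self.0.token_stream(text),
            }
        }
    }

    impl<S: TokenStream> TokenStream for MyFilterStream<S> {
        fn advance(&mut self) -> bool {
            // A real filter would inspect or rewrite the token here.
            self.tail.advance()
        }
        fn token(&self) -> &Token {
            self.tail.token()
        }
        fn token_mut(&mut self) -> &mut Token {
            self.tail.token_mut()
        }
    }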
4 changes: 2 additions & 2 deletions src/tokenizer/ascii_folding_filter.rs
@@ -20,9 +20,9 @@ impl TokenFilter for AsciiFoldingFilter {
 pub struct AsciiFoldingFilterWrapper<T>(T);
 
 impl<T: Tokenizer> Tokenizer for AsciiFoldingFilterWrapper<T> {
-    type TokenStream<'a, 'b> = AsciiFoldingFilterTokenStream<T::TokenStream<'a, 'b>>;
+    type TokenStream<'a> = AsciiFoldingFilterTokenStream<T::TokenStream<'a>>;
 
-    fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> Self::TokenStream<'a, 'b> {
+    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
         AsciiFoldingFilterTokenStream {
             buffer: String::with_capacity(100),
             tail: self.0.token_stream(text),
2 changes: 1 addition & 1 deletion src/tokenizer/empty_tokenizer.rs
@@ -4,7 +4,7 @@ use crate::tokenizer::{Token, TokenStream, Tokenizer};
 pub(crate) struct EmptyTokenizer;
 
 impl Tokenizer for EmptyTokenizer {
-    type TokenStream<'a, 'b> = EmptyTokenStream;
+    type TokenStream<'a> = EmptyTokenStream;
     fn token_stream(&mut self, _text: &str) -> EmptyTokenStream {
         EmptyTokenStream::default()
     }
10 changes: 5 additions & 5 deletions src/tokenizer/facet_tokenizer.rs
@@ -21,15 +21,15 @@ enum State {
     Terminated,
 }
 
-pub struct FacetTokenStream<'a, 'b> {
+pub struct FacetTokenStream<'a> {
     text: &'a str,
     state: State,
-    token: &'b mut Token,
+    token: &'a mut Token,
 }
 
 impl Tokenizer for FacetTokenizer {
-    type TokenStream<'a, 'b> = FacetTokenStream<'a, 'b>;
-    fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> FacetTokenStream<'a, 'b> {
+    type TokenStream<'a> = FacetTokenStream<'a>;
+    fn token_stream<'a>(&'a mut self, text: &'a str) -> FacetTokenStream<'a> {
         self.token.reset();
         self.token.position = 0;
         FacetTokenStream {
@@ -40,7 +40,7 @@ impl Tokenizer for FacetTokenizer {
     }
 }
 
-impl<'a, 'b> TokenStream for FacetTokenStream<'a, 'b> {
+impl<'a> TokenStream for FacetTokenStream<'a> {
    fn advance(&mut self) -> bool {
        match self.state {
            State::RootFacetNotEmitted => {
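
Note what the unified lifetime means at the call site: `FacetTokenStream` holds `token: &'a mut Token` borrowed from the tokenizer itself, so the tokenizer stays mutably borrowed for as long as the stream lives. A sketch (assuming `FacetTokenizer` implements `Default`):

    let mut tokenizer = FacetTokenizer::default();
    let mut stream = tokenizer.token_stream("/category/subcategory");
    // A second call while `stream` is alive would not compile:
    // the first stream still holds the &'a mut borrow of `tokenizer`.
    // let other = tokenizer.token_stream("/other");
    while stream.advance() {
        println!("{}", stream.token().text);
    }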
4 changes: 2 additions & 2 deletions src/tokenizer/lower_caser.rs
@@ -18,9 +18,9 @@ impl TokenFilter for LowerCaser {
 pub struct LowerCaserFilter<T>(T);
 
 impl<T: Tokenizer> Tokenizer for LowerCaserFilter<T> {
-    type TokenStream<'a, 'b> = LowerCaserTokenStream<T::TokenStream<'a, 'b>>;
+    type TokenStream<'a> = LowerCaserTokenStream<T::TokenStream<'a>>;
 
-    fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> Self::TokenStream<'a, 'b> {
+    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
         LowerCaserTokenStream {
             tail: self.0.token_stream(text),
             buffer: String::new(), // TODO move to global buffer
10 changes: 5 additions & 5 deletions src/tokenizer/ngram_tokenizer.rs
@@ -121,20 +121,20 @@ impl NgramTokenizer {
 }
 
 /// TokenStream associate to the `NgramTokenizer`
-pub struct NgramTokenStream<'a, 'b> {
+pub struct NgramTokenStream<'a> {
     /// parameters
     ngram_charidx_iterator: StutteringIterator<CodepointFrontiers<'a>>,
     /// true if the NgramTokenStream is in prefix mode.
     prefix_only: bool,
     /// input
     text: &'a str,
     /// output
-    token: &'b mut Token,
+    token: &'a mut Token,
 }
 
 impl Tokenizer for NgramTokenizer {
-    type TokenStream<'a, 'b> = NgramTokenStream<'a, 'b>;
-    fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> NgramTokenStream<'a, 'b> {
+    type TokenStream<'a> = NgramTokenStream<'a>;
+    fn token_stream<'a>(&'a mut self, text: &'a str) -> NgramTokenStream<'a> {
         self.token.reset();
         NgramTokenStream {
             ngram_charidx_iterator: StutteringIterator::new(
@@ -149,7 +149,7 @@ impl Tokenizer for NgramTokenizer {
     }
 }
 
-impl<'a, 'b> TokenStream for NgramTokenStream<'a, 'b> {
+impl<'a> TokenStream for NgramTokenStream<'a> {
     fn advance(&mut self) -> bool {
         if let Some((offset_from, offset_to)) = self.ngram_charidx_iterator.next() {
             if self.prefix_only && offset_from > 0 {
2 changes: 1 addition & 1 deletion src/tokenizer/raw_tokenizer.rs
@@ -12,7 +12,7 @@ pub struct RawTokenStream<'a> {
 }
 
 impl Tokenizer for RawTokenizer {
-    type TokenStream<'b, 'a> = RawTokenStream<'a>;
+    type TokenStream<'a> = RawTokenStream<'a>;
     fn token_stream<'a>(&'a mut self, text: &str) -> RawTokenStream<'a> {
         self.token.reset();
         self.token.position = 0;
14 changes: 7 additions & 7 deletions src/tokenizer/regex_tokenizer.rs
@@ -64,8 +64,8 @@ impl RegexTokenizer {
 }
 
 impl Tokenizer for RegexTokenizer {
-    type TokenStream<'a, 'b> = RegexTokenStream<'a, 'b>;
-    fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> RegexTokenStream<'a, 'b> {
+    type TokenStream<'a> = RegexTokenStream<'a>;
+    fn token_stream<'a>(&'a mut self, text: &'a str) -> RegexTokenStream<'a> {
         self.token.reset();
         RegexTokenStream {
             regex: self.regex.clone(),
@@ -76,14 +76,14 @@ impl Tokenizer for RegexTokenizer {
     }
 }
 
-pub struct RegexTokenStream<'a, 'b> {
+pub struct RegexTokenStream<'a> {
     regex: Regex,
     text: &'a str,
-    token: &'b mut Token,
+    token: &'a mut Token,
     cursor: usize,
 }
 
-impl<'a, 'b> TokenStream for RegexTokenStream<'a, 'b> {
+impl<'a> TokenStream for RegexTokenStream<'a> {
     fn advance(&mut self) -> bool {
         let Some(regex_match) = self.regex.find(self.text) else {
             return false;
@@ -105,11 +105,11 @@ impl<'a, 'b> TokenStream for RegexTokenStream<'a, 'b> {
     }
 
     fn token(&self) -> &Token {
-        &self.token
+        self.token
     }
 
     fn token_mut(&mut self) -> &mut Token {
-        &mut self.token
+        self.token
     }
 }
4 changes: 2 additions & 2 deletions src/tokenizer/remove_long.rs
@@ -55,9 +55,9 @@ pub struct RemoveLongFilterWrapper<T: Tokenizer> {
 }
 
 impl<T: Tokenizer> Tokenizer for RemoveLongFilterWrapper<T> {
-    type TokenStream<'a, 'b> = RemoveLongFilterStream<T::TokenStream<'a, 'b>>;
+    type TokenStream<'a> = RemoveLongFilterStream<T::TokenStream<'a>>;
 
-    fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> Self::TokenStream<'a, 'b> {
+    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
         RemoveLongFilterStream {
             token_length_limit: self.length_limit,
             tail: self.inner.token_stream(text),
12 changes: 6 additions & 6 deletions src/tokenizer/simple_tokenizer.rs
@@ -9,15 +9,15 @@ pub struct SimpleTokenizer {
 }
 
 /// TokenStream produced by the `SimpleTokenizer`.
-pub struct SimpleTokenStream<'a, 'b> {
+pub struct SimpleTokenStream<'a> {
     text: &'a str,
     chars: CharIndices<'a>,
-    token: &'b mut Token,
+    token: &'a mut Token,
 }
 
 impl Tokenizer for SimpleTokenizer {
-    type TokenStream<'a, 'b> = SimpleTokenStream<'a, 'b>;
-    fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> SimpleTokenStream<'a, 'b> {
+    type TokenStream<'a> = SimpleTokenStream<'a>;
+    fn token_stream<'a>(&'a mut self, text: &'a str) -> SimpleTokenStream<'a> {
         self.token.reset();
         SimpleTokenStream {
             text,
@@ -27,7 +27,7 @@ impl Tokenizer for SimpleTokenizer {
     }
 }
 
-impl<'a, 'b> SimpleTokenStream<'a, 'b> {
+impl<'a> SimpleTokenStream<'a> {
     // search for the end of the current token.
     fn search_token_end(&mut self) -> usize {
         (&mut self.chars)
@@ -38,7 +38,7 @@ impl<'a, 'b> SimpleTokenStream<'a, 'b> {
     }
 }
 
-impl<'a, 'b> TokenStream for SimpleTokenStream<'a, 'b> {
+impl<'a> TokenStream for SimpleTokenStream<'a> {
     fn advance(&mut self) -> bool {
         self.token.text.clear();
         self.token.position = self.token.position.wrapping_add(1);
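
Driving a tokenizer looks the same as before, only the lifetimes changed; each `advance()` rewrites the tokenizer's shared `Token` buffer in place instead of allocating per token. A usage sketch (assuming `SimpleTokenizer` implements `Default`):

    let mut tokenizer = SimpleTokenizer::default();
    let mut stream = tokenizer.token_stream("Hello, happy tax payer!");
    while stream.advance() {
        let token = stream.token();
        println!("{:?} at bytes {}..{}", token.text, token.offset_from, token.offset_to);
    }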
4 changes: 2 additions & 2 deletions src/tokenizer/split_compound_words.rs
@@ -97,9 +97,9 @@ pub struct SplitCompoundWordsFilter<T> {
 }
 
 impl<T: Tokenizer> Tokenizer for SplitCompoundWordsFilter<T> {
-    type TokenStream<'a, 'b> = SplitCompoundWordsTokenStream<T::TokenStream<'a, 'b>>;
+    type TokenStream<'a> = SplitCompoundWordsTokenStream<T::TokenStream<'a>>;
 
-    fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> Self::TokenStream<'a, 'b> {
+    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
         SplitCompoundWordsTokenStream {
             dict: self.dict.clone(),
             tail: self.inner.token_stream(text),
4 changes: 2 additions & 2 deletions src/tokenizer/stemmer.rs
@@ -98,9 +98,9 @@ pub struct StemmerFilter<T> {
 }
 
 impl<T: Tokenizer> Tokenizer for StemmerFilter<T> {
-    type TokenStream<'a, 'b> = StemmerTokenStream<T::TokenStream<'a, 'b>>;
+    type TokenStream<'a> = StemmerTokenStream<T::TokenStream<'a>>;
 
-    fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> Self::TokenStream<'a, 'b> {
+    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
         let stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm);
         StemmerTokenStream {
             tail: self.inner.token_stream(text),
4 changes: 2 additions & 2 deletions src/tokenizer/stop_word_filter/mod.rs
@@ -88,9 +88,9 @@ pub struct StopWordFilterWrapper<T> {
 }
 
 impl<T: Tokenizer> Tokenizer for StopWordFilterWrapper<T> {
-    type TokenStream<'a, 'b> = StopWordFilterStream<T::TokenStream<'a, 'b>>;
+    type TokenStream<'a> = StopWordFilterStream<T::TokenStream<'a>>;
 
-    fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> Self::TokenStream<'a, 'b> {
+    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
         StopWordFilterStream {
             words: self.words.clone(),
             tail: self.inner.token_stream(text),
6 changes: 3 additions & 3 deletions src/tokenizer/tokenizer.rs
@@ -12,13 +12,13 @@ pub struct TextAnalyzer {
 /// A boxable `Tokenizer`, with its `TokenStream` type erased.
 trait BoxableTokenizer: 'static + Send + Sync {
     /// Creates a boxed token stream for a given `str`.
-    fn box_token_stream<'a, 'b: 'a>(&'b mut self, text: &'a str) -> BoxTokenStream<'a>;
+    fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a>;
     /// Clone this tokenizer.
     fn box_clone(&self) -> Box<dyn BoxableTokenizer>;
 }
 
 impl<T: Tokenizer> BoxableTokenizer for T {
-    fn box_token_stream<'a, 'b: 'a>(&'b mut self, text: &'a str) -> BoxTokenStream<'a> {
+    fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
         self.token_stream(text).into()
     }
     fn box_clone(&self) -> Box<dyn BoxableTokenizer> {
@@ -53,7 +53,7 @@ impl TextAnalyzer {
     }
 
     /// Creates a token stream for a given `str`.
-    pub fn token_stream<'a, 'b: 'a>(&'b mut self, text: &'a str) -> BoxTokenStream<'a> {
+    pub fn token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
         self.tokenizer.box_token_stream(text)
    }
 }
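
With one lifetime, the extra `'b: 'a` outlives-bound disappears and the boxed, type-erased stream simply borrows the analyzer for `'a`. A usage sketch (the construction is hypothetical; how a `TextAnalyzer` is built is not part of this diff):

    let mut analyzer = TextAnalyzer::from(SimpleTokenizer::default());
    let mut stream = analyzer.token_stream("Hello, tokenizers!");
    while stream.advance() {
        println!("{:?}", stream.token());
    }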
12 changes: 6 additions & 6 deletions src/tokenizer/whitespace_tokenizer.rs
@@ -8,15 +8,15 @@ pub struct WhitespaceTokenizer {
     token: Token,
 }
 
-pub struct WhitespaceTokenStream<'a, 'b> {
+pub struct WhitespaceTokenStream<'a> {
     text: &'a str,
     chars: CharIndices<'a>,
-    token: &'b mut Token,
+    token: &'a mut Token,
 }
 
 impl Tokenizer for WhitespaceTokenizer {
-    type TokenStream<'a, 'b> = WhitespaceTokenStream<'a, 'b>;
-    fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> WhitespaceTokenStream<'a, 'b> {
+    type TokenStream<'a> = WhitespaceTokenStream<'a>;
+    fn token_stream<'a>(&'a mut self, text: &'a str) -> WhitespaceTokenStream<'a> {
         self.token.reset();
         WhitespaceTokenStream {
             text,
@@ -26,7 +26,7 @@ impl Tokenizer for WhitespaceTokenizer {
     }
 }
 
-impl<'a, 'b> WhitespaceTokenStream<'a, 'b> {
+impl<'a> WhitespaceTokenStream<'a> {
     // search for the end of the current token.
     fn search_token_end(&mut self) -> usize {
         (&mut self.chars)
@@ -37,7 +37,7 @@ impl<'a, 'b> WhitespaceTokenStream<'a, 'b> {
     }
 }
 
-impl<'a, 'b> TokenStream for WhitespaceTokenStream<'a, 'b> {
+impl<'a> TokenStream for WhitespaceTokenStream<'a> {
     fn advance(&mut self) -> bool {
         self.token.text.clear();
         self.token.position = self.token.position.wrapping_add(1);
4 changes: 2 additions & 2 deletions tokenizer-api/src/lib.rs
@@ -55,9 +55,9 @@ impl Token {
 /// before indexing.
 pub trait Tokenizer: 'static + Clone + Send + Sync {
     /// The token stream returned by this Tokenizer.
-    type TokenStream<'a, 'b>: TokenStream;
+    type TokenStream<'a>: TokenStream;
     /// Creates a token stream for a given `str`.
-    fn token_stream<'a, 'b>(&'b mut self, text: &'a str) -> Self::TokenStream<'a, 'b>;
+    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a>;
 }
 
 /// Simple wrapper of `Box<dyn TokenStream + 'a>`.
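
For a leaf tokenizer, the new signature looks as follows. This is a minimal sketch, not part of the commit; the names and import path are hypothetical, and it mirrors the `Token`-buffer-reuse pattern the tokenizers above use:

    use tantivy_tokenizer_api::{Token, TokenStream, Tokenizer};

    /// Emits the entire input as a single token, reusing one Token buffer.
    #[derive(Clone, Default)]
    struct WholeInputTokenizer {
        token: Token,
    }

    struct WholeInputTokenStream<'a> {
        text: &'a str,
        token: &'a mut Token,
        done: bool,
    }

    impl Tokenizer for WholeInputTokenizer {
        type TokenStream<'a> = WholeInputTokenStream<'a>;

        fn token_stream<'a>(&'a mut self, text: &'a str) -> WholeInputTokenStream<'a> {
            self.token.reset();
            WholeInputTokenStream {
                text,
                token: &mut self.token,
                done: false,
            }
        }
    }

    impl<'a> TokenStream for WholeInputTokenStream<'a> {
        fn advance(&mut self) -> bool {
            if self.done {
                return false;
            }
            self.done = true;
            // Write the single token into the shared buffer.
            self.token.text.push_str(self.text);
            self.token.offset_from = 0;
            self.token.offset_to = self.text.len();
            true
        }

        fn token(&self) -> &Token {
            self.token
        }

        fn token_mut(&mut self) -> &mut Token {
            self.token
        }
    }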
