Add median token length as limit (#47)
stephantul authored Sep 29, 2024
1 parent 6b8a1be commit 9a887a3
Showing 1 changed file with 5 additions and 0 deletions.
5 changes: 5 additions & 0 deletions model2vec/model.py
@@ -56,6 +56,7 @@ def __init__(
         else:
             self.unk_token_id = None
 
+        self.median_token_length = int(np.median([len(token) for token in self.tokens]))
         self.config = config
         self.base_model_name = base_model_name
         self.language = language
@@ -123,6 +124,10 @@ def tokenize(self, sentences: list[str], max_length: int | None = None) -> tuple
         :param max_length: The maximum length of the sentence.
         :return: The tokens.
         """
+        if max_length is not None:
+            m = max_length * self.median_token_length
+            sentences = [sentence[:m] for sentence in sentences]
+
         encodings: list[Encoding] = self.tokenizer.encode_batch(sentences, add_special_tokens=False)
         encodings_ids = [encoding.ids for encoding in encodings]
 
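For readers skimming the change, here is a minimal, self-contained sketch of the heuristic this commit introduces: before encoding, each sentence is cut to at most max_length * median_token_length characters, so encode_batch never processes far more text than could plausibly fit in max_length tokens. The class name MedianLengthTruncator and its structure below are illustrative assumptions, not the model2vec API.

import numpy as np

# Hypothetical stand-in illustrating the heuristic from this commit;
# the class name and structure are assumptions, not the model2vec API.
class MedianLengthTruncator:
    def __init__(self, tokens: list[str]) -> None:
        # Median character length across the vocabulary tokens.
        self.median_token_length = int(np.median([len(token) for token in tokens]))

    def truncate(self, sentences: list[str], max_length: int | None) -> list[str]:
        # With a token budget, cap each sentence at roughly
        # max_length tokens' worth of characters before encoding.
        if max_length is None:
            return sentences
        m = max_length * self.median_token_length
        return [sentence[:m] for sentence in sentences]

# Example: these tokens have a median length of 3 characters, so with
# max_length=4 each sentence is cut to 12 characters before tokenization.
truncator = MedianLengthTruncator(tokens=["the", "cat", "sat", "mat", "hello"])
print(truncator.truncate(["the cat sat on the mat"], max_length=4))
# -> ['the cat sat ']

The character-level cut is only an approximation, since the character-to-token ratio varies from sentence to sentence, but it cheaply bounds the work the tokenizer does on very long inputs without walking the text token by token.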
