MinishLab · stephantul · Oct 3, 2024 · Oct 3, 2024 · Oct 3, 2024 · Oct 3, 2024
diff --git a/model2vec/distill/distillation.py b/model2vec/distill/distillation.py
@@ -3,6 +3,7 @@
 import numpy as np
 from huggingface_hub import model_info
 from sklearn.decomposition import PCA
+from tokenizers.models import BPE, Unigram, WordPiece
 from transformers import AutoModel, AutoTokenizer, PreTrainedModel, PreTrainedTokenizerFast
 
 from model2vec.distill.inference import (
@@ -50,6 +51,14 @@
 
     # Load original tokenizer. We need to keep this to tokenize any tokens in the vocabulary.
     original_tokenizer: PreTrainedTokenizerFast = AutoTokenizer.from_pretrained(model_name)
+
+    if vocabulary and isinstance(original_tokenizer.backend_tokenizer.model, (BPE, Unigram)):
+        raise ValueError(
+            "You passed a vocabulary, but the model you are using does not use a WordPiece tokenizer. "
+            "This is not supported yet. "
+            "Feel free to open an issue if this is a blocker: https://github.com/MinishLab/model2vec/issues"
+        )
+
     original_model: PreTrainedModel = AutoModel.from_pretrained(model_name)
     # Make a base list of tokens.
     tokens: list[str] = []
@@ -79,7 +88,7 @@
         # We need to set embeddings to None because we don't know the dimensions of the embeddings yet.
         embeddings = None
 
-    if vocabulary is not None:
+    if vocabulary:
         # Preprocess the vocabulary with the original tokenizer.
         preprocessed_vocabulary = preprocess_vocabulary(original_tokenizer.backend_tokenizer, vocabulary)
         n_tokens_before = len(preprocessed_vocabulary)

diff --git a/model2vec/distill/tokenizer.py b/model2vec/distill/tokenizer.py
@@ -23,7 +23,19 @@
     :param tokenizer: The tokenizer to remove tokens from.
     :param tokens_to_remove: The tokens to remove.
     :return: The modified tokenizer.
+    :raises ValueError: If the tokenizer model type is not supported.
     """
+    model_vocab = set(tokenizer.get_vocab())
+    # This triggers when tokens_to_remove is empty or when there is no overlap
+    # between the tokens to remove and the model vocabulary.
+    if not set(tokens_to_remove).intersection(model_vocab):
+        # NOTE: return a copy.
+        if tokens_to_remove:
+            logger.info("No tokens to remove, none of the tokens were in the vocabulary.")
+        else:
+            logger.info("No tokens to remove.")
+        return Tokenizer.from_str(tokenizer.to_str())
+
     tokenizer_data = json.loads(tokenizer.to_str())
 
     # Find all added tokens
@@ -35,20 +47,31 @@
     tokens_to_remove = [token for token in tokens_to_remove if token not in added_tokens_str]
 
     # Load the vocabulary.
-    vocab: dict[str, int] = tokenizer_data["model"]["vocab"]
-    n_tokens = len(vocab)
-
-    # Remove the tokens.
-    for token in tokens_to_remove:
-        if vocab.pop(token, None) is None:
-            logger.warning(f"Token {token} was not in the vocabulary.")
-
-    n_removed = n_tokens - len(vocab)
-    logger.info(f"Removed {n_removed} tokens from the vocabulary.")
-
-    # Reindex the vocabulary so that it is contiguous.
-    reindexed = {token: idx for idx, (token, _) in enumerate(sorted(vocab.items(), key=lambda x: x[1]))}
-    tokenizer_data["model"]["vocab"] = reindexed
+    model_type = tokenizer_data["model"]["type"]
+
+    match model_type:
+        case "WordPiece":
+            # Vocab is a dictionary.
+            vocab: dict[str, int] = tokenizer_data["model"]["vocab"]
+            n_tokens = len(vocab)
+
+            # Remove the tokens.
+            for token in tokens_to_remove:
+                if vocab.pop(token, None) is None:
+                    logger.warning(f"Token {token} was not in the vocabulary.")
+
+            n_removed = n_tokens - len(vocab)
+            logger.info(f"Removed {n_removed} tokens from the vocabulary.")
+
+            # Reindex the vocabulary so that it is contiguous.
+            reindexed = {token: idx for idx, (token, _) in enumerate(sorted(vocab.items(), key=lambda x: x[1]))}
+            tokenizer_data["model"]["vocab"] = reindexed
+        case "Unigram":
+            raise ValueError("Removing tokens from a unigram tokenizer is not supported.")
+        case "BPE":
+            raise ValueError("Removing tokens from a bpe tokenizer is not supported.")
+        case _:
+            raise ValueError(f"Unknown model type {model_type}")
 
     # Reindex the special tokens (i.e., CLS and SEP for BertTokenizers.)
     special_tokens_post_processor: dict[str, dict] = tokenizer_data["post_processor"]["special_tokens"]
@@ -68,12 +91,24 @@
     :param tokenizer: The tokenizer to add tokens to.
     :param tokens_to_add: The tokens to add.
     :return: The modified tokenizer.
+    :raises ValueError: If the tokenizer model type is not supported.
     """
     data = json.loads(tokenizer.to_str())
 
-    vocab: dict[str, int] = data["model"]["vocab"]
-    for token in tokens_to_add:
-        vocab[token] = len(vocab)
+    model = data["model"]["type"]
+
+    match model:
+        case "WordPiece":
+            wordpiece_vocab: dict[str, int] = data["model"]["vocab"]
+            for token in tokens_to_add:
+                if token not in wordpiece_vocab:
+                    wordpiece_vocab[token] = len(wordpiece_vocab)
+        case "Unigram":
+            raise ValueError("Adding tokens to a unigram tokenizer is not supported.")
+        case "BPE":
+            raise ValueError("Adding tokens to a bpe tokenizer is not supported.")
+        case _:
+            raise ValueError(f"Unknown model type {model}")
 
     tokenizer = Tokenizer.from_str(json.dumps(data))