
Commit 8822c4b: Resolved conflict

Pringled committed Nov 2, 2024
2 parents d92390b + 54a6460
Showing 27 changed files with 1,787 additions and 979 deletions.
12 changes: 9 additions & 3 deletions .github/workflows/ci.yaml
@@ -10,7 +10,14 @@ jobs:
     strategy:
       matrix:
         os: ["ubuntu-latest", "windows-latest"]
-        python-version: ["3.10"]
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
+        exclude:
+          - os: windows-latest
+            python-version: "3.9"
+          - os: windows-latest
+            python-version: "3.11"
+          - os: windows-latest
+            python-version: "3.12"
       fail-fast: false

     steps:
@@ -42,8 +49,7 @@ jobs:
       # Install dependencies using uv pip
       - name: Install dependencies
-        run: make install
-        # run: uv pip install -e ".[pytest]"
+        run: make install-no-pre-commit

       # Run tests with coverage
       - name: Run tests under coverage
1 change: 0 additions & 1 deletion .gitignore
@@ -168,7 +168,6 @@ models
 checkpoints/*
 features/*
 model2vec_models
-results/*
 counts/*
 results_old/*
 local/*
3 changes: 3 additions & 0 deletions Makefile
@@ -8,6 +8,9 @@ install:
 	uv sync --all-extras
 	uv run pre-commit install

+install-no-pre-commit:
+	uv pip install ".[dev,distill]"
+
 install-base:
 	uv sync --extra dev
245 changes: 174 additions & 71 deletions README.md

Large diffs are not rendered by default.

Binary file added assets/images/speed_vs_accuracy_v4.png
Binary file added assets/images/speed_vs_mteb_score.png
Binary file added assets/images/speed_vs_mteb_score_v2.png
4 changes: 2 additions & 2 deletions model2vec/__init__.py
@@ -1,4 +1,4 @@
-from model2vec.distill import distill
 from model2vec.model import StaticModel
+from model2vec.version import __version__

-__all__ = ["distill", "StaticModel"]
+__all__ = ["StaticModel", "__version__"]
7 changes: 7 additions & 0 deletions model2vec/distill/__init__.py
@@ -1,3 +1,10 @@
+from model2vec.utils import get_package_extras, importable
+
+_REQUIRED_EXTRA = "distill"
+
+for extra_dependency in get_package_extras("model2vec", _REQUIRED_EXTRA):
+    importable(extra_dependency, _REQUIRED_EXTRA)
+
 from model2vec.distill.distillation import distill, distill_from_model

 __all__ = ["distill", "distill_from_model"]
53 changes: 0 additions & 53 deletions model2vec/distill/__main__.py

This file was deleted.

14 changes: 12 additions & 2 deletions model2vec/distill/distillation.py
@@ -1,5 +1,7 @@
+from __future__ import annotations
+
 import logging
-from typing import Literal
+from typing import Literal, Union

 import numpy as np
 from huggingface_hub import model_info
@@ -26,7 +28,7 @@
 logger = logging.getLogger(__name__)


-PCADimType = int | None | Literal["auto"]
+PCADimType = Union[int, None, Literal["auto"]]


 def distill_from_model(
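Both type-alias changes in this commit (`PCADimType` here, `PathLike` in inference.py) swap PEP 604 syntax for `typing.Union`. The reason: `from __future__ import annotations` only defers evaluation of annotations, while a module-level alias assignment runs eagerly, so `int | None` raises a `TypeError` on Python 3.9, which this commit re-adds to the CI matrix:

from __future__ import annotations

from typing import Literal, Union

# Fine on Python 3.9+: Union is evaluated through typing, not PEP 604.
PCADimType = Union[int, None, Literal["auto"]]

# TypeError on Python 3.9: the alias is a runtime expression, and the
# future import does not make `int | None` legal outside annotations.
# PCADimType = int | None | Literal["auto"]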
@@ -214,9 +216,17 @@ def _post_process_embeddings(embeddings: np.ndarray, pca_dims: PCADimType, apply
     elif pca_dims <= embeddings.shape[1]:
         logger.info(f"Applying PCA with n_components {pca_dims}")

+        orig_dims = embeddings.shape[1]
         p = PCA(n_components=pca_dims, whiten=False)
         embeddings = p.fit_transform(embeddings)

+        if embeddings.shape[1] < orig_dims:
+            explained_variance_ratio = np.sum(p.explained_variance_ratio_)
+            explained_variance = np.sum(p.explained_variance_)
+            logger.info(f"Reduced dimensionality from {orig_dims} to {embeddings.shape[1]}.")
+            logger.info(f"Explained variance ratio: {explained_variance_ratio:.3f}.")
+            logger.info(f"Explained variance: {explained_variance:.3f}.")
+
     if apply_zipf:
         logger.info("Applying Zipf weighting")
         embeddings *= np.log(1 + np.arange(embeddings.shape[0]))[:, None]
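The new logging reports how much variance survives the reduction. A self-contained sketch of the same post-processing steps on random data (all shapes and values illustrative):

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(seed=0)
embeddings = rng.normal(size=(5_000, 768))

orig_dims = embeddings.shape[1]
p = PCA(n_components=256, whiten=False)
embeddings = p.fit_transform(embeddings)

print(f"Reduced dimensionality from {orig_dims} to {embeddings.shape[1]}.")
print(f"Explained variance ratio: {np.sum(p.explained_variance_ratio_):.3f}.")

# Zipf weighting assumes the row index correlates with token rank, so
# multiplying by log(1 + rank) damps frequent (low-rank) tokens relative
# to rare ones.
embeddings *= np.log(1 + np.arange(embeddings.shape[0]))[:, None]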
19 changes: 15 additions & 4 deletions model2vec/distill/inference.py
@@ -1,8 +1,10 @@
 # -*- coding: utf-8 -*-
+from __future__ import annotations

+import inspect
 import logging
 from pathlib import Path
-from typing import Protocol
+from typing import Protocol, Union

 import numpy as np
 import torch
@@ -13,7 +15,7 @@
 logger = logging.getLogger(__name__)


-PathLike = str | Path
+PathLike = Union[Path, str]

 _DEFAULT_BATCH_SIZE = 1024

@@ -113,7 +115,15 @@ def create_output_embeddings_from_model_name(
     :return: The tokens and output embeddings.
     """
     model = model.to(device)
-    ids = torch.arange(tokenizer.vocab_size)
+
+    # Quick check to see if the tokenizer is consistent.
+    vocab_length = len(tokenizer.get_vocab())
+    if vocab_length != tokenizer.vocab_size:
+        logger.warning(
+            f"Reported vocab size {tokenizer.vocab_size} is inconsistent with the vocab size {vocab_length}."
+        )
+
+    ids = torch.arange(vocab_length)

     # Work-around to get the eos and bos token ids without having to go into tokenizer internals.
     dummy_encoding = tokenizer.encode("A")
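On Hugging Face tokenizers, `vocab_size` reports only the base vocabulary, while `len(tokenizer.get_vocab())` also counts added tokens, so the two can drift apart; sizing `ids` from the full vocab avoids missing embeddings for added tokens. A quick demonstration (model name illustrative):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenizer.add_tokens(["<new_token>"])  # makes the two counts diverge

vocab_length = len(tokenizer.get_vocab())
if vocab_length != tokenizer.vocab_size:
    print(
        f"Reported vocab size {tokenizer.vocab_size} is inconsistent "
        f"with the vocab size {vocab_length}."
    )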
@@ -122,7 +132,8 @@
     bos = torch.full([len(ids)], fill_value=bos_token_id)
     eos = torch.full([len(ids)], fill_value=eos_token_id)

-    stacked = torch.stack([bos, ids, eos], dim=1)
+    # NOTE: reversing the bos and eos tokens works better on our benchmarks.
+    stacked = torch.stack([eos, ids, bos], dim=1)

     intermediate_weights: list[np.ndarray] = []
     for batch_idx in tqdm(range(0, len(stacked), _DEFAULT_BATCH_SIZE)):
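Each vocabulary id becomes a three-token pseudo-sequence. A sketch of the reversed stacking with illustrative BERT-style special token ids:

import torch

vocab_length, bos_token_id, eos_token_id = 30_522, 101, 102  # illustrative

ids = torch.arange(vocab_length)
bos = torch.full([len(ids)], fill_value=bos_token_id)
eos = torch.full([len(ids)], fill_value=eos_token_id)

# Reversed [eos, id, bos] order, per the NOTE above: it benchmarked
# better than the natural [bos, id, eos] order.
stacked = torch.stack([eos, ids, bos], dim=1)
print(stacked.shape)  # torch.Size([30522, 3])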
87 changes: 46 additions & 41 deletions model2vec/distill/tokenizer.py
@@ -2,6 +2,7 @@

 import json
 import logging
+from typing import Any

 from tokenizers import Tokenizer

@@ -36,11 +37,11 @@ def remove_tokens(tokenizer: Tokenizer, tokens_to_remove: list[str]) -> Tokenize
         logger.info("No tokens to remove.")
         return Tokenizer.from_str(tokenizer.to_str())

-    tokenizer_data = json.loads(tokenizer.to_str())
+    tokenizer_data: dict[str, Any] = json.loads(tokenizer.to_str())

     # Find all added tokens
-    added_tokens = tokenizer_data["added_tokens"]
-    added_tokens_str = {token["content"] for token in added_tokens}
+    added_tokens: list[dict[str, Any]] = tokenizer_data.get("added_tokens", [])
+    added_tokens_str: set[str] = {token["content"] for token in added_tokens}

     # Remove all added tokens from the list of tokens to remove.
     # Things will go bad if we keep them.
@@ -49,34 +50,36 @@
     # Load the vocabulary.
     model_type = tokenizer_data["model"]["type"]

-    match model_type:
-        case "WordPiece":
-            # Vocab is a dictionary.
-            vocab: dict[str, int] = tokenizer_data["model"]["vocab"]
-            n_tokens = len(vocab)
-
-            # Remove the tokens.
-            for token in tokens_to_remove:
-                if vocab.pop(token, None) is None:
-                    logger.warning(f"Token {token} was not in the vocabulary.")
-
-            n_removed = n_tokens - len(vocab)
-            logger.info(f"Removed {n_removed} tokens from the vocabulary.")
-
-            # Reindex the vocabulary so that it is contiguous.
-            reindexed = {token: idx for idx, (token, _) in enumerate(sorted(vocab.items(), key=lambda x: x[1]))}
-            tokenizer_data["model"]["vocab"] = reindexed
-        case "Unigram":
-            raise ValueError("Removing tokens from a unigram tokenizer is not supported.")
-        case "BPE":
-            raise ValueError("Removing tokens from a bpe tokenizer is not supported.")
-        case _:
-            raise ValueError(f"Unknown model type {model_type}")
+    if model_type == "WordPiece":
+        # Vocab is a dictionary.
+        vocab: dict[str, int] = tokenizer_data["model"]["vocab"]
+        n_tokens = len(vocab)
+
+        # Remove the tokens.
+        for token in tokens_to_remove:
+            if vocab.pop(token, None) is None:
+                logger.warning(f"Token {token} was not in the vocabulary.")
+
+        n_removed = n_tokens - len(vocab)
+        logger.info(f"Removed {n_removed} tokens from the vocabulary.")
+
+        # Reindex the vocabulary so that it is contiguous.
+        reindexed = {token: idx for idx, (token, _) in enumerate(sorted(vocab.items(), key=lambda x: x[1]))}
+        tokenizer_data["model"]["vocab"] = reindexed
+
+    elif model_type == "Unigram":
+        raise ValueError("Removing tokens from a unigram tokenizer is not supported.")
+
+    elif model_type == "BPE":
+        raise ValueError("Removing tokens from a BPE tokenizer is not supported.")
+
+    else:
+        raise ValueError(f"Unknown model type {model_type}")

     # Reindex the special tokens (i.e., CLS and SEP for BertTokenizers.)
     special_tokens_post_processor: dict[str, dict] = tokenizer_data["post_processor"]["special_tokens"]
     for token, token_data in special_tokens_post_processor.items():
         token_data["ids"] = [reindexed[token] for token in token_data["tokens"]]
     added_tokens = tokenizer_data.get("added_tokens", [])
     for token_data in added_tokens:
         token_data["id"] = reindexed[token_data["content"]]

     # Reinitialize the tokenizer from the json.
     tokenizer = Tokenizer.from_str(json.dumps(tokenizer_data))
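The reindexing step in miniature: after tokens are popped, the remaining ids are compacted in their original order so the vocabulary stays contiguous (toy vocabulary, purely illustrative):

vocab = {"[PAD]": 0, "[CLS]": 1, "hello": 2, "world": 3, "##ly": 4}

for token in ["hello"]:
    vocab.pop(token, None)

# Compact the surviving ids while preserving their relative order.
reindexed = {token: idx for idx, (token, _) in enumerate(sorted(vocab.items(), key=lambda x: x[1]))}
print(reindexed)  # {'[PAD]': 0, '[CLS]': 1, 'world': 2, '##ly': 3}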
@@ -97,18 +100,20 @@ def add_tokens(tokenizer: Tokenizer, tokens_to_add: list[str]) -> Tokenizer:

     model = data["model"]["type"]

-    match model:
-        case "WordPiece":
-            wordpiece_vocab: dict[str, int] = data["model"]["vocab"]
-            for token in tokens_to_add:
-                if token not in wordpiece_vocab:
-                    wordpiece_vocab[token] = len(wordpiece_vocab)
-        case "Unigram":
-            raise ValueError("Adding tokens to a unigram tokenizer is not supported.")
-        case "BPE":
-            raise ValueError("Adding tokens to a bpe tokenizer is not supported.")
-        case _:
-            raise ValueError(f"Unknown model type {model}")
+    if model == "WordPiece":
+        wordpiece_vocab: dict[str, int] = data["model"]["vocab"]
+        for token in tokens_to_add:
+            if token not in wordpiece_vocab:
+                wordpiece_vocab[token] = len(wordpiece_vocab)
+
+    elif model == "Unigram":
+        raise ValueError("Adding tokens to a unigram tokenizer is not supported.")
+
+    elif model == "BPE":
+        raise ValueError("Adding tokens to a BPE tokenizer is not supported.")
+
+    else:
+        raise ValueError(f"Unknown model type {model}")

     tokenizer = Tokenizer.from_str(json.dumps(data))
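The `match` statements here were rewritten as `if`/`elif` chains for the same reason as the `Union` changes: `match` requires Python 3.10, and this commit restores Python 3.9 support in CI. A hypothetical usage sketch of the two helpers (tokenizer and tokens illustrative; only WordPiece models are supported):

from tokenizers import Tokenizer

from model2vec.distill.tokenizer import add_tokens, remove_tokens

tokenizer = Tokenizer.from_pretrained("bert-base-uncased")

smaller = remove_tokens(tokenizer, tokens_to_remove=["walking", "##ly"])
larger = add_tokens(tokenizer, tokens_to_add=["model2vec"])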
2 changes: 2 additions & 0 deletions model2vec/distill/utils.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 from logging import getLogger

 import torch