Patch Llama 3 tokenizer, add LMs to tests
kddubey committed Nov 2, 2024
1 parent 4b61684 commit 5e721b8
Showing 9 changed files with 266 additions and 33 deletions.
7 changes: 5 additions & 2 deletions docs/source/select_a_language_model.rst
@@ -49,11 +49,14 @@ Here's a quick example (which will download a small GPT-2 model to your computer
So far, CAPPr has been tested for code correctness on the following architectures:

+- Llama, Llama 2
+- Mistral
+- Gemma 2
+- Phi
 - GPT-2
 - GPT-J
 - GPT-NeoX (including StableLM)
-- Llama, Llama 2
-- Mistral.
 - (Q)LoRA models whose base model is one of the above.

You'll need access to beefier hardware to run models from the Hugging Face hub, as
:mod:`cappr.huggingface` currently assumes you've locally loaded the model. Hugging Face
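
For context on "locally loaded", here's a minimal, hedged sketch using one of the
listed architectures. The prompt and completions are invented, and the call mirrors
cappr's documented predict interface; check the cappr docs for the exact signature.

from transformers import AutoModelForCausalLM, AutoTokenizer
from cappr.huggingface.classify import predict

# GPT-2 is one of the tested architectures and is small enough to run locally
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

prompt = "Which planet is closer to the Sun: Mercury or Mars?"
completions = ("Mercury", "Mars")
pred = predict(prompt, completions, model_and_tokenizer=(model, tokenizer))
print(pred)  # one of the two completions, e.g., "Mercury"
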
3 changes: 1 addition & 2 deletions docs/source/select_a_prompt_completion_format.rst
@@ -22,8 +22,7 @@ any string processing for you; it just concatenates the three strings and sends
**It's on you to format the prompt according to the model's instruction/chat format**,
assuming that's applicable and beneficial. Before calling any CAPPr functions, consider
printing ``{prompt}{end_of_prompt}{completion}`` for each completion in your list of
-possible completions/choices, and ensure that each passes the eye test. If you had to
-apply a chat format, ensure that the ``prompt`` doesn't end with the EOS token.
+possible completions/choices, and ensure that each passes the eye test.

And yes, you'll likely need to do a bit of prompt engineering. But if you can write a
sentence, you can write a prompt. It's mostly a matter of trial and error. Here's an
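
To make the "eye test" above concrete, here's a small sketch; the prompt,
end_of_prompt, and completions are invented for illustration.

prompt = "Is the following review positive or negative?\nReview: I loved it.\nAnswer:"
end_of_prompt = " "
completions = ("positive", "negative")
for completion in completions:
    # Print exactly the string whose probability CAPPr will estimate
    print(f"{prompt}{end_of_prompt}{completion}")
    print("-" * 40)
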
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -34,9 +34,10 @@ hf = ["transformers[torch]>=4.31.0"]
llama-cpp = ["llama-cpp-python>=0.2.11"]
all = ["cappr[openai,hf,llama-cpp]"]
hf-dev = [
-    "transformers[torch]>=4.35.0", # to test AutoGPTQ on CPU and AutoAWQ with caching
    "huggingface-hub>=0.16.4",
+    "peft>=0.13.2",
    "sentencepiece>=0.1.99",
+    "transformers[torch]>=4.35.0", # to test AutoGPTQ on CPU and AutoAWQ with caching
]
llama-cpp-dev = ["llama-cpp-python>=0.2.13"]
demos = [
4 changes: 2 additions & 2 deletions src/cappr/huggingface/__init__.py
@@ -8,6 +8,6 @@
https://cappr.readthedocs.io/en/latest/select_a_language_model.html#hugging-face
"""

-from . import _utils, classify, classify_no_cache
+from . import _patch_tokenizer, _utils, classify, classify_no_cache

-__all__ = ["_utils", "classify", "classify_no_cache"]
+__all__ = ["_patch_tokenizer", "_utils", "classify", "classify_no_cache"]
148 changes: 148 additions & 0 deletions src/cappr/huggingface/_patch_tokenizer.py
@@ -0,0 +1,148 @@
"""
This module only exists to ensure that Llama 3's tokenizer supports
`tokenizer.add_bos_token = False`. In the future, it'd be nice to delete this.
Issue: https://github.com/huggingface/transformers/issues/30947
"""

from functools import lru_cache
from typing import Literal

from tokenizers import processors
from transformers import PreTrainedTokenizerBase, PreTrainedTokenizerFast

from cappr.utils.classify import _setattr


@lru_cache(maxsize=5)
def does_disabling_add_token_disable_adding_token(
    tokenizer: PreTrainedTokenizerBase, token_name: Literal["bos_token", "eos_token"]
) -> bool:
    # NOTE: this function should only return False for Llama 3's BOS token. This fact is
    # tested via:
    #
    # python -m pytest \
    #     tests/huggingface/test_huggingface_classify.py \
    #     -k test__does_disabling_add_token_disable_adding_token \
    #     -x

    if token_name == "bos_token":
        position = 0
    elif token_name == "eos_token":
        position = -1
    else:
        raise ValueError(
            'token_name must be either "bos_token" or "eos_token"'
        )  # pragma: no cover

    text = "a"
    tokens_default: list[int] = tokenizer(text)["input_ids"]
    is_token_added = tokens_default[position] == getattr(
        tokenizer, f"{token_name}_id", None
    )
    if not is_token_added:
        # Disabling vacuously works b/c, by default, the token wasn't added
        return True

    with _setattr(tokenizer, f"add_{token_name}", False):
        tokens_after_disabling: list[int] = tokenizer(text)["input_ids"]

    tokens_default_wo_token = tokens_default[:]
    tokens_default_wo_token.pop(position)
    if tokens_after_disabling == tokens_default_wo_token:
        return True
    else:
        # At this point, disabling didn't simply remove the token. Ensure that it had
        # no effect at all; any other change is an unexpected side effect.
        condition = tokens_after_disabling == tokens_default
        msg = (
            f"There was an unexpected side effect from disabling add_{token_name}. "
            f"The default setting caused 'a' to be tokenized as {tokens_default}. "
            f"Disabling caused 'a' to be tokenized as {tokens_after_disabling}. "
            "Please raise an issue here: https://github.com/kddubey/cappr/issues"
        )
        assert condition, msg
        return False


def force_support(tokenizer: PreTrainedTokenizerFast) -> None:
    """
    Hack to incorporate:
    https://github.com/huggingface/transformers/pull/31316
    """

    text = "a"
    tokens_default: list[int] = tokenizer(text)["input_ids"]

    # We need to initialize these correctly, not None. The reason is that if we set
    # add_eos/bos_token later, and then reset it back to None, we'll always have
    # False-y values instead of the original behavior.
    tokenizer._add_eos_token = tokens_default[-1] == getattr(
        tokenizer, "eos_token_id", None
    )
    tokenizer._add_bos_token = tokens_default[0] == getattr(
        tokenizer, "bos_token_id", None
    )

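    # Subclass the tokenizer's own class so that add_bos_token / add_eos_token become
    # properties whose setters rebuild the post processor, mirroring the fix in the
    # transformers PR linked above.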
    class _PreTrainedTokenizerFastPatched(type(tokenizer)):
        @property
        def add_eos_token(self):
            return self._add_eos_token

        @property
        def add_bos_token(self):
            return self._add_bos_token

        @add_eos_token.setter
        def add_eos_token(self, value: bool):
            self._add_eos_token = value
            self.update_post_processor()

        @add_bos_token.setter
        def add_bos_token(self, value: bool):
            self._add_bos_token = value
            self.update_post_processor()

        def update_post_processor(self):
            """
            Overwrites the underlying post processor with the current `bos_token` and
            `eos_token`.
            """
            if not isinstance(
                self._tokenizer.post_processor, processors.TemplateProcessing
            ) and not isinstance(self._tokenizer.post_processor, processors.Sequence):
                return

            bos = self.bos_token
            bos_token_id = self.bos_token_id
            if bos is None and self.add_bos_token:
                raise ValueError("add_bos_token = True but bos_token = None")

            eos = self.eos_token
            eos_token_id = self.eos_token_id
            if eos is None and self.add_eos_token:
                raise ValueError("add_eos_token = True but eos_token = None")

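            # In TemplateProcessing templates, "$A" and "$B" stand for the first and
            # second input sequence, and ":0" / ":1" are token type IDs. E.g., with a
            # BOS token "<s>" and add_bos_token=True, `single` becomes "<s>:0 $A:0".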
            single = (
                f"{(bos + ':0 ') if self.add_bos_token else ''}"
                "$A:0"
                f"{(' ' + eos + ':0') if self.add_eos_token else ''}"
            )
            pair = (
                f"{single}{(' ' + bos + ':1') if self.add_bos_token else ''} "
                "$B:1"
                f"{(' ' + eos + ':1') if self.add_eos_token else ''}"
            )

            special_tokens = []
            if self.add_bos_token:
                special_tokens.append((bos, bos_token_id))
            if self.add_eos_token:
                special_tokens.append((eos, eos_token_id))
            self._tokenizer.post_processor = processors.TemplateProcessing(
                single=single, pair=pair, special_tokens=special_tokens
            )

    # https://stackoverflow.com/questions/31590152/monkey-patching-a-property
    tokenizer.__class__ = _PreTrainedTokenizerFastPatched
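
A hedged usage sketch of the two functions above, outside of CAPPr's internals. The
checkpoint name is an assumption for illustration (it's gated), and the assertion
reflects the intended post-patch behavior.

from transformers import AutoTokenizer
from cappr.huggingface import _patch_tokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")

# Patch only if flipping add_bos_token wouldn't actually take effect
if not _patch_tokenizer.does_disabling_add_token_disable_adding_token(
    tokenizer, "bos_token"
):
    _patch_tokenizer.force_support(tokenizer)

tokenizer.add_bos_token = False  # now routes through the patched setter
ids = tokenizer("a")["input_ids"]
assert ids[0] != tokenizer.bos_token_id  # BOS is no longer silently prepended
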
27 changes: 25 additions & 2 deletions src/cappr/huggingface/_utils.py
@@ -14,6 +14,8 @@
from cappr.utils import _check
from cappr.utils.classify import _setattr

from cappr.huggingface import _patch_tokenizer


BatchEncodingPT = Mapping[str, torch.Tensor]
"""
@@ -117,6 +119,11 @@ def dont_add_eos_token(tokenizer: PreTrainedTokenizerBase):
    """
    In this context, don't add an end-of-sentence token.
    """
    if not _patch_tokenizer.does_disabling_add_token_disable_adding_token(
        tokenizer, "eos_token"
    ):
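        # Flipping the flag wouldn't actually stop the tokenizer from adding the token
        # (Llama 3's tokenizer had this problem for its BOS token), so patch the
        # tokenizer so that the setattr below takes effect.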
        _patch_tokenizer.force_support(tokenizer)

    with _setattr(tokenizer, "add_eos_token", False):
        yield

@@ -154,6 +161,11 @@ def dont_add_bos_token(tokenizer: PreTrainedTokenizerBase):
    """
    In this context, don't add a beginning-of-sentence token.
    """
    if not _patch_tokenizer.does_disabling_add_token_disable_adding_token(
        tokenizer, "bos_token"
    ):
        _patch_tokenizer.force_support(tokenizer)

    with _setattr(tokenizer, "add_bos_token", False):
        yield

@@ -163,10 +175,15 @@ def dont_add_bos_token(tokenizer: PreTrainedTokenizerBase):
########################################################################################


-@lru_cache()
+@lru_cache(maxsize=5)
def does_tokenizer_need_prepended_space(
    tokenizer: PreTrainedTokenizerBase,
) -> bool:
    # There's an (inaccessible?) add_prefix_space attr in
    # tokenizer._tokenizer.post_processor's ByteLevel (first processor) that probably
    # answers the same question as this function. But even if we can access it, not
    # every tokenizer is BPE. And I doubt every tokenizer will make this info easily
    # accessible. Instead, let's check it by running the tokenizer.
    with dont_add_eos_token(tokenizer):
        tokenize = lambda text: tokenizer(text)["input_ids"]
        bos_token_id = getattr(tokenizer, "bos_token_id", None)
@@ -199,6 +216,12 @@ def _batched_model_call(
    return CausalLMOutput(logits=logits)


def is_bos_token_added(
    tokenizer: PreTrainedTokenizerBase, encodings: BatchEncodingPT
) -> bool:
    return encodings["input_ids"][0][0] == getattr(tokenizer, "bos_token_id", None)


def logits_texts(
    texts: Sequence[str],
    model_and_tokenizer: tuple[ModelForCausalLM, PreTrainedTokenizerBase],
@@ -222,7 +245,7 @@ def logits_texts(
    else:
        with set_up_model(model):
            out: CausalLMOutput = model(**encodings)
-    if drop_bos_token and getattr(tokenizer, "add_bos_token", False):
+    if drop_bos_token and is_bos_token_added(tokenizer, encodings):
        # Drop the first/bos token after we're done encoding so that the shape is
        # consistent w/ other tokenizers. For CAPPr, we'll never be interested in
        # Pr(token | <bos>). We're only interested in completion tokens
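
For intuition about what does_tokenizer_need_prepended_space is probing, here's a
standalone sketch; GPT-2 is an assumed example, and the function itself decides by
running the tokenizer, as the comment above notes.

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")  # BPE: a leading space is part of the token
together = tok("Hello world")["input_ids"]
no_space = tok("Hello")["input_ids"] + tok("world")["input_ids"]
prepended = tok("Hello")["input_ids"] + tok(" world")["input_ids"]
print(together == prepended)  # True: prepending a space reproduces the joint tokenization
print(together == no_space)   # False: "world" without the space is a different token
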
8 changes: 4 additions & 4 deletions src/cappr/huggingface/classify_no_cache.py
@@ -106,14 +106,14 @@ def _prompts_offsets(
    prompts = list(prompts)
    padding = len(prompts) > 1
    with hf._utils.set_up_tokenizer(tokenizer):
-        encoding = tokenizer(prompts, return_tensors="pt", padding=padding)
-    encoding = cast(BatchEncodingPT, encoding)
+        encodings = tokenizer(prompts, return_tensors="pt", padding=padding)
+    encodings = cast(BatchEncodingPT, encodings)
    offsets: torch.Tensor = (
-        encoding["attention_mask"]
+        encodings["attention_mask"]
        .sum(dim=1)
        .repeat_interleave(num_completions_per_prompt, dim=0)
    )
-    if getattr(tokenizer, "add_bos_token", False):
+    if hf._utils.is_bos_token_added(tokenizer, encodings):
        # Drop the first <s> token after we're done encoding so that the shape is
        # consistent w/ other tokenizers
        offsets -= 1
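
To make the offsets arithmetic above concrete, here's a self-contained sketch with
invented values: two prompts padded to length 5, three completions per prompt, and a
tokenizer that prepends a BOS token.

import torch

attention_mask = torch.tensor([[1, 1, 1, 1, 0],
                               [1, 1, 1, 0, 0]])
num_completions_per_prompt = 3
bos_was_added = True

# Each prompt's true token count, repeated once per completion
offsets = attention_mask.sum(dim=1).repeat_interleave(num_completions_per_prompt, dim=0)
if bos_was_added:
    offsets -= 1  # don't count the BOS token, for consistency across tokenizers
print(offsets)  # tensor([3, 3, 3, 2, 2, 2])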