Patch Llama 3 tokenizer, add LMs to tests
kddubey committed Nov 2, 2024
1 parent 4b61684 commit 5e721b8
Showing 9 changed files with 266 additions and 33 deletions.
7 changes: 5 additions & 2 deletions docs/source/select_a_language_model.rst
@@ -49,11 +49,14 @@ Here's a quick example (which will download a small GPT-2 model to your computer
So far, CAPPr has been tested for code correctness on the following architectures:

+- Llama, Llama 2
+- Mistral
+- Gemma 2
+- Phi
 - GPT-2
 - GPT-J
 - GPT-NeoX (including StableLM)
-- Llama, Llama 2
-- Mistral.
 - (Q)LoRA models whose base model is one of the above.

You'll need access to beefier hardware to run models from the Hugging Face hub, as
:mod:`cappr.huggingface` currently assumes you've locally loaded the model. Hugging Face
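
For context on "locally loaded", here's a minimal, hedged sketch using one of the
listed architectures. The prompt and completions are invented, and the call mirrors
cappr's documented predict interface; check the cappr docs for the exact signature.

from transformers import AutoModelForCausalLM, AutoTokenizer
from cappr.huggingface.classify import predict

# GPT-2 is one of the tested architectures and is small enough to run locally
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

prompt = "Which planet is closer to the Sun: Mercury or Mars?"
completions = ("Mercury", "Mars")
pred = predict(prompt, completions, model_and_tokenizer=(model, tokenizer))
print(pred)  # one of the two completions, e.g., "Mercury"
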
3 changes: 1 addition & 2 deletions docs/source/select_a_prompt_completion_format.rst
@@ -22,8 +22,7 @@ any string processing for you; it just concatenates the three strings and sends
**It's on you to format the prompt according to the model's instruction/chat format**,
assuming that's applicable and beneficial. Before calling any CAPPr functions, consider
printing ``{prompt}{end_of_prompt}{completion}`` for each completion in your list of
-possible completions/choices, and ensure that each passes the eye test. If you had to
-apply a chat format, ensure that the ``prompt`` doesn't end with the EOS token.
+possible completions/choices, and ensure that each passes the eye test.

And yes, you'll likely need to do a bit of prompt engineering. But if you can write a
sentence, you can write a prompt. It's mostly a matter of trial and error. Here's an
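
To make the "eye test" above concrete, here's a small sketch; the prompt,
end_of_prompt, and completions are invented for illustration.

prompt = "Is the following review positive or negative?\nReview: I loved it.\nAnswer:"
end_of_prompt = " "
completions = ("positive", "negative")
for completion in completions:
    # Print exactly the string whose probability CAPPr will estimate
    print(f"{prompt}{end_of_prompt}{completion}")
    print("-" * 40)
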
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -34,9 +34,10 @@ hf = ["transformers[torch]>=4.31.0"]
llama-cpp = ["llama-cpp-python>=0.2.11"]
all = ["cappr[openai,hf,llama-cpp]"]
hf-dev = [
-    "transformers[torch]>=4.35.0", # to test AutoGPTQ on CPU and AutoAWQ with caching
    "huggingface-hub>=0.16.4",
+    "peft>=0.13.2",
    "sentencepiece>=0.1.99",
+    "transformers[torch]>=4.35.0", # to test AutoGPTQ on CPU and AutoAWQ with caching
]
llama-cpp-dev = ["llama-cpp-python>=0.2.13"]
demos = [
4 changes: 2 additions & 2 deletions src/cappr/huggingface/__init__.py
@@ -8,6 +8,6 @@
https://cappr.readthedocs.io/en/latest/select_a_language_model.html#hugging-face
"""

-from . import _utils, classify, classify_no_cache
+from . import _patch_tokenizer, _utils, classify, classify_no_cache

-__all__ = ["_utils", "classify", "classify_no_cache"]
+__all__ = ["_patch_tokenizer", "_utils", "classify", "classify_no_cache"]
148 changes: 148 additions & 0 deletions src/cappr/huggingface/_patch_tokenizer.py
@@ -0,0 +1,148 @@
"""
This module only exists to ensure that Llama 3's tokenizer supports
`tokenizer.add_bos_token = False`. In the future, it'd be nice to delete this.
Issue: https://github.com/huggingface/transformers/issues/30947
"""

from functools import lru_cache
from typing import Literal

from tokenizers import processors
from transformers import PreTrainedTokenizerBase, PreTrainedTokenizerFast

from cappr.utils.classify import _setattr


@lru_cache(maxsize=5)
def does_disabling_add_token_disable_adding_token(
    tokenizer: PreTrainedTokenizerBase, token_name: Literal["bos_token", "eos_token"]
) -> bool:
    # NOTE: this function should only return False for Llama 3's BOS token. This fact is
    # tested via:
    #
    # python -m pytest \
    #     tests/huggingface/test_huggingface_classify.py \
    #     -k test__does_disabling_add_token_disable_adding_token \
    #     -x

    if token_name == "bos_token":
        position = 0
    elif token_name == "eos_token":
        position = -1
    else:
        raise ValueError(
            'token_name must be either "bos_token" or "eos_token"'
        )  # pragma: no cover

    text = "a"
    tokens_default: list[int] = tokenizer(text)["input_ids"]
    is_token_added = tokens_default[position] == getattr(
        tokenizer, f"{token_name}_id", None
    )
    if not is_token_added:
        # Disabling vacuously works b/c, by default, the token wasn't added
        return True

    with _setattr(tokenizer, f"add_{token_name}", False):
        tokens_after_disabling: list[int] = tokenizer(text)["input_ids"]

    tokens_default_wo_token = tokens_default[:]
    tokens_default_wo_token.pop(position)
    if tokens_after_disabling == tokens_default_wo_token:
        return True
    else:
        # At this point, disabling didn't simply remove the token. Ensure that it had
        # no effect at all; any other change is an unexpected side effect.
        condition = tokens_after_disabling == tokens_default
        msg = (
            f"There was an unexpected side effect from disabling add_{token_name}. "
            f"The default setting caused 'a' to be tokenized as {tokens_default}. "
            f"Disabling caused 'a' to be tokenized as {tokens_after_disabling}. "
            "Please raise an issue here: https://github.com/kddubey/cappr/issues"
        )
        assert condition, msg
        return False


def force_support(tokenizer: PreTrainedTokenizerFast) -> None:
    """
    Hack to incorporate:
    https://github.com/huggingface/transformers/pull/31316
    """

    text = "a"
    tokens_default: list[int] = tokenizer(text)["input_ids"]

    # We need to initialize these correctly, not None. The reason is that if we set
    # add_eos/bos_token later, and then reset it back to None, we'll always have
    # False-y values instead of the original behavior.
    tokenizer._add_eos_token = tokens_default[-1] == getattr(
        tokenizer, "eos_token_id", None
    )
    tokenizer._add_bos_token = tokens_default[0] == getattr(
        tokenizer, "bos_token_id", None
    )

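    # Subclass the tokenizer's own class so that add_bos_token / add_eos_token become
    # properties whose setters rebuild the post processor, mirroring the fix in the
    # transformers PR linked above.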
    class _PreTrainedTokenizerFastPatched(type(tokenizer)):
        @property
        def add_eos_token(self):
            return self._add_eos_token

        @property
        def add_bos_token(self):
            return self._add_bos_token

        @add_eos_token.setter
        def add_eos_token(self, value: bool):
            self._add_eos_token = value
            self.update_post_processor()

        @add_bos_token.setter
        def add_bos_token(self, value: bool):
            self._add_bos_token = value
            self.update_post_processor()

        def update_post_processor(self):
            """
            Overwrites the underlying post processor with the current `bos_token` and
            `eos_token`.
            """
            if not isinstance(
                self._tokenizer.post_processor, processors.TemplateProcessing
            ) and not isinstance(self._tokenizer.post_processor, processors.Sequence):
                return

            bos = self.bos_token
            bos_token_id = self.bos_token_id
            if bos is None and self.add_bos_token:
                raise ValueError("add_bos_token = True but bos_token = None")

            eos = self.eos_token
            eos_token_id = self.eos_token_id
            if eos is None and self.add_eos_token:
                raise ValueError("add_eos_token = True but eos_token = None")

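            # In TemplateProcessing templates, "$A" and "$B" stand for the first and
            # second input sequence, and ":0" / ":1" are token type IDs. E.g., with a
            # BOS token "<s>" and add_bos_token=True, `single` becomes "<s>:0 $A:0".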
            single = (
                f"{(bos + ':0 ') if self.add_bos_token else ''}"
                "$A:0"
                f"{(' ' + eos + ':0') if self.add_eos_token else ''}"
            )
            pair = (
                f"{single}{(' ' + bos + ':1') if self.add_bos_token else ''} "
                "$B:1"
                f"{(' ' + eos + ':1') if self.add_eos_token else ''}"
            )

            special_tokens = []
            if self.add_bos_token:
                special_tokens.append((bos, bos_token_id))
            if self.add_eos_token:
                special_tokens.append((eos, eos_token_id))
            self._tokenizer.post_processor = processors.TemplateProcessing(
                single=single, pair=pair, special_tokens=special_tokens
            )

    # https://stackoverflow.com/questions/31590152/monkey-patching-a-property
    tokenizer.__class__ = _PreTrainedTokenizerFastPatched
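
A hedged usage sketch of the two functions above, outside of CAPPr's internals. The
checkpoint name is an assumption for illustration (it's gated), and the assertion
reflects the intended post-patch behavior.

from transformers import AutoTokenizer
from cappr.huggingface import _patch_tokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")

# Patch only if flipping add_bos_token wouldn't actually take effect
if not _patch_tokenizer.does_disabling_add_token_disable_adding_token(
    tokenizer, "bos_token"
):
    _patch_tokenizer.force_support(tokenizer)

tokenizer.add_bos_token = False  # now routes through the patched setter
ids = tokenizer("a")["input_ids"]
assert ids[0] != tokenizer.bos_token_id  # BOS is no longer silently prepended
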
27 changes: 25 additions & 2 deletions src/cappr/huggingface/_utils.py
@@ -14,6 +14,8 @@
from cappr.utils import _check
from cappr.utils.classify import _setattr

from cappr.huggingface import _patch_tokenizer


BatchEncodingPT = Mapping[str, torch.Tensor]
"""
@@ -117,6 +119,11 @@ def dont_add_eos_token(tokenizer: PreTrainedTokenizerBase):
    """
    In this context, don't add an end-of-sentence token.
    """
    if not _patch_tokenizer.does_disabling_add_token_disable_adding_token(
        tokenizer, "eos_token"
    ):
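        # Flipping the flag wouldn't actually stop the tokenizer from adding the token
        # (Llama 3's tokenizer had this problem for its BOS token), so patch the
        # tokenizer so that the setattr below takes effect.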
        _patch_tokenizer.force_support(tokenizer)

    with _setattr(tokenizer, "add_eos_token", False):
        yield

@@ -154,6 +161,11 @@ def dont_add_bos_token(tokenizer: PreTrainedTokenizerBase):
    """
    In this context, don't add a beginning-of-sentence token.
    """
    if not _patch_tokenizer.does_disabling_add_token_disable_adding_token(
        tokenizer, "bos_token"
    ):
        _patch_tokenizer.force_support(tokenizer)

    with _setattr(tokenizer, "add_bos_token", False):
        yield

@@ -163,10 +175,15 @@ def dont_add_bos_token(tokenizer: PreTrainedTokenizerBase):
########################################################################################


-@lru_cache()
+@lru_cache(maxsize=5)
def does_tokenizer_need_prepended_space(
    tokenizer: PreTrainedTokenizerBase,
) -> bool:
    # There's an (inaccessible?) add_prefix_space attr in
    # tokenizer._tokenizer.post_processor's ByteLevel (first processor) that probably
    # answers the same question as this function. But even if we can access it, not
    # every tokenizer is BPE. And I doubt every tokenizer will make this info easily
    # accessible. Instead, let's check it by running the tokenizer.
    with dont_add_eos_token(tokenizer):
        tokenize = lambda text: tokenizer(text)["input_ids"]
        bos_token_id = getattr(tokenizer, "bos_token_id", None)
@@ -199,6 +216,12 @@ def _batched_model_call(
    return CausalLMOutput(logits=logits)


def is_bos_token_added(
    tokenizer: PreTrainedTokenizerBase, encodings: BatchEncodingPT
) -> bool:
    return encodings["input_ids"][0][0] == getattr(tokenizer, "bos_token_id", None)


def logits_texts(
    texts: Sequence[str],
    model_and_tokenizer: tuple[ModelForCausalLM, PreTrainedTokenizerBase],
@@ -222,7 +245,7 @@ def logits_texts(
    else:
        with set_up_model(model):
            out: CausalLMOutput = model(**encodings)
-    if drop_bos_token and getattr(tokenizer, "add_bos_token", False):
+    if drop_bos_token and is_bos_token_added(tokenizer, encodings):
        # Drop the first/bos token after we're done encoding so that the shape is
        # consistent w/ other tokenizers. For CAPPr, we'll never be interested in
        # Pr(token | <bos>). We're only interested in completion tokens
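
For intuition about what does_tokenizer_need_prepended_space is probing, here's a
standalone sketch; GPT-2 is an assumed example, and the function itself decides by
running the tokenizer, as the comment above notes.

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")  # BPE: a leading space is part of the token
together = tok("Hello world")["input_ids"]
no_space = tok("Hello")["input_ids"] + tok("world")["input_ids"]
prepended = tok("Hello")["input_ids"] + tok(" world")["input_ids"]
print(together == prepended)  # True: prepending a space reproduces the joint tokenization
print(together == no_space)   # False: "world" without the space is a different token
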
8 changes: 4 additions & 4 deletions src/cappr/huggingface/classify_no_cache.py
@@ -106,14 +106,14 @@ def _prompts_offsets(
    prompts = list(prompts)
    padding = len(prompts) > 1
    with hf._utils.set_up_tokenizer(tokenizer):
-        encoding = tokenizer(prompts, return_tensors="pt", padding=padding)
-    encoding = cast(BatchEncodingPT, encoding)
+        encodings = tokenizer(prompts, return_tensors="pt", padding=padding)
+    encodings = cast(BatchEncodingPT, encodings)
    offsets: torch.Tensor = (
-        encoding["attention_mask"]
+        encodings["attention_mask"]
        .sum(dim=1)
        .repeat_interleave(num_completions_per_prompt, dim=0)
    )
-    if getattr(tokenizer, "add_bos_token", False):
+    if hf._utils.is_bos_token_added(tokenizer, encodings):
        # Drop the first <s> token after we're done encoding so that the shape is
        # consistent w/ other tokenizers
        offsets -= 1
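
To make the offsets arithmetic above concrete, here's a self-contained sketch with
invented values: two prompts padded to length 5, three completions per prompt, and a
tokenizer that prepends a BOS token.

import torch

attention_mask = torch.tensor([[1, 1, 1, 1, 0],
                               [1, 1, 1, 0, 0]])
num_completions_per_prompt = 3
bos_was_added = True

# Each prompt's true token count, repeated once per completion
offsets = attention_mask.sum(dim=1).repeat_interleave(num_completions_per_prompt, dim=0)
if bos_was_added:
    offsets -= 1  # don't count the BOS token, for consistency across tokenizers
print(offsets)  # tensor([3, 3, 3, 2, 2, 2])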