Commit

fix llama answer choice tokenization
AlexTMallen committed Nov 15, 2023
1 parent 6e07819 commit 688a796
Showing 2 changed files with 21 additions and 3 deletions.
8 changes: 5 additions & 3 deletions ccs/extraction/extraction.py
@@ -264,12 +264,14 @@ def extract_hiddens(
             if is_enc_dec:
                 answer = labels = assert_type(Tensor, encoding.labels)
             else:
-                a_id = tokenizer.encode(" " + choice["answer"], add_special_tokens=False)
+                a_id = tokenizer.encode(
+                    " " + choice["answer"], add_special_tokens=False
+                )
 
                 # the Llama tokenizer splits off leading spaces
                 if tokenizer.decode(a_id[0]).strip() == "":
                     a_id_without_space = tokenizer.encode(
-                        choice, add_special_tokens=False
+                        choice["answer"], add_special_tokens=False
                     )
                     assert a_id_without_space == a_id[1:]
                     a_id = a_id_without_space
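For readers unfamiliar with the quirk this hunk handles, here is a minimal standalone sketch of the behavior, assuming a Llama-style SentencePiece tokenizer loaded through Hugging Face transformers (the checkpoint name below is only illustrative and not part of this commit): encoding " " + answer can yield a leading token that decodes to bare whitespace, and dropping it leaves exactly the ids produced by encoding the answer without the space.

from transformers import AutoTokenizer

# Illustrative checkpoint; Llama-family SentencePiece tokenizers show the same quirk.
tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")

answer = "True"
a_id = tokenizer.encode(" " + answer, add_special_tokens=False)

# If the first token decodes to whitespace only, drop it, mirroring the check in the
# diff above; the remaining ids match encoding the answer with no leading space.
if tokenizer.decode(a_id[0]).strip() == "":
    a_id_without_space = tokenizer.encode(answer, add_special_tokens=False)
    assert a_id_without_space == a_id[1:]
    a_id = a_id_without_space

print(a_id)  # token ids for the answer, with no stray space token at the front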
@@ -303,7 +305,7 @@ def extract_hiddens(
                    # probs near 1 will be somewhat imprecise
                    # log(p/(1-p)) = log(p) - log(1-p) = logp - log(1 - exp(logp))
                    lm_log_odds[i, j] = logprob - torch.log1p(-logprob.exp())

                hiddens = (
                    outputs.get("decoder_hidden_states") or outputs["hidden_states"]
                )
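As a quick sanity check of the identity quoted in the comment above, the expression logprob - torch.log1p(-logprob.exp()) is the log-odds computed from a log-probability in a form that stays precise when the probability is close to 1; the sketch below uses a made-up probability purely for illustration.

import torch

p = torch.tensor(0.9)    # made-up probability of the answer's first token
logprob = p.log()        # in extract_hiddens this comes from a log-softmax over the vocab

# log(p / (1 - p)) = log(p) - log(1 - exp(log(p))) = logprob - log1p(-exp(logprob))
log_odds = logprob - torch.log1p(-logprob.exp())

assert torch.allclose(log_odds, torch.log(p / (1 - p)))
print(log_odds.item())   # ~2.1972, i.e. log(9)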
16 changes: 16 additions & 0 deletions ccs/promptsource/templates/qm_grader_first/templates.yaml
@@ -0,0 +1,16 @@
dataset: None
templates:
  41e13d03004a458abeab1e5c22f83e68: !Template
    answer_choices: False ||| True
    id: 41e13d03004a458abeab1e5c22f83e68
    jinja: 'Grader: {{ character }}

      {{ summand1 }} + {{ summand2 }} = {{ sum }}

      Score: ||| {{answer_choices[label]}}'
    metadata: !TemplateMetadata
      languages:
      - en
      metrics:
      - Accuracy
    name: "grader_first"
