Enable testing of pad-to-max-length for tokenizer in embeddings
kcirred committed Oct 8, 2024
1 parent 79e1366 commit 14d3afc
Showing 2 changed files with 20 additions and 1 deletion.
2 changes: 1 addition & 1 deletion caikit_nlp/modules/text_embedding/embedding.py
@@ -1052,7 +1052,7 @@ def _get_tokenized(self, texts, **kwargs):
             )
 
         if pad_to_max_length:
-            return tokenizer(
+            return tokenizer(
                 texts,
                 return_attention_mask=True,  # Used for determining token count
                 return_token_type_ids=False,
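The hunk above shows only the first few arguments of the tokenizer call. As background, here is a minimal sketch of what padding to the model's maximum length means at the Hugging Face tokenizer level. It is not the caikit_nlp implementation: the checkpoint name is arbitrary, and mapping pad_to_max_length onto padding="max_length" is an assumption based on the kwarg name.

# Sketch only: illustrates max-length padding with a Hugging Face tokenizer.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
texts = ["short text", "a slightly longer example sentence"]

# Default behavior: pad only up to the longest sequence in the batch.
batch = tokenizer(texts, return_attention_mask=True, padding=True, truncation=True)

# Assumed pad_to_max_length analogue: pad every row out to the model max length.
padded = tokenizer(
    texts,
    return_attention_mask=True,
    padding="max_length",
    truncation=True,
    max_length=tokenizer.model_max_length,
)

# The attention mask marks only real tokens, so per-text token counts
# (the mask sums) are identical with or without max-length padding.
for short_mask, long_mask in zip(batch["attention_mask"], padded["attention_mask"]):
    assert sum(short_mask) == sum(long_mask)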
19 changes: 19 additions & 0 deletions tests/modules/text_embedding/test_embedding.py
@@ -1143,3 +1143,22 @@ def test_same_same(loaded_model: EmbeddingModule, truncate_input_tokens):
     assert not np.allclose(
         separate_vectors[1], separate_vectors[2], rtol=1e-05, atol=1e-08
     )
+
+@pytest.mark.parametrize("pad_to_max_length", [None, False, True, 0, 1])
+def test_pad_to_max_length(pad_to_max_length, loaded_model):
+    """Test that the pad_to_max_length tokenization kwarg modifies the tokenizer but gives the same embeddings"""
+    model_max = loaded_model.model.max_seq_length
+
+    tokenizer_kwargs = {"pad_to_max_length": pad_to_max_length}
+    max_seq = "x " * (model_max - 2)  # Subtract 2 for begin/end tokens
+    max_seq_minus_one = "x " * (model_max - 3)  # 1 token shorter than max_seq_length
+    short = "x "
+
+    normal_result = loaded_model._encode_with_retry(
+        [max_seq_minus_one, max_seq, short], return_token_count=True
+    )
+    padded_result = loaded_model._encode_with_retry(
+        [max_seq_minus_one, max_seq, short], return_token_count=True, tokenizer_kwargs=tokenizer_kwargs
+    )
+
+    assert np.all(normal_result.embedding == padded_result.embedding)
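The equality assertion relies on padding tokens being excluded from the pooled embedding by the attention mask. A toy numpy sketch of that pooling argument (not the caikit_nlp or sentence-transformers implementation; mean pooling, shapes, and values are all assumptions for illustration):

import numpy as np

rng = np.random.default_rng(0)
hidden_real = rng.normal(size=(3, 8))  # hidden states for the 3 real tokens

def mean_pool(states, mask):
    """Average token states, weighting each position by its attention mask."""
    mask = mask[:, None].astype(states.dtype)
    return (states * mask).sum(axis=0) / mask.sum()

# Same real tokens, padded to the batch's longest sequence (5 positions) vs.
# an assumed model max (10 positions). Padded positions carry mask == 0, so
# they never enter the pooled mean, and the embedding is unchanged.
pad_to_batch = np.vstack([hidden_real, np.zeros((2, 8))])
pad_to_max = np.vstack([hidden_real, np.zeros((7, 8))])
mask_batch = np.array([1, 1, 1, 0, 0])
mask_max = np.array([1, 1, 1] + [0] * 7)

assert np.allclose(mean_pool(pad_to_batch, mask_batch), mean_pool(pad_to_max, mask_max))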
