Upstream/1 add unit tests and regression test #171

Open · wants to merge 32 commits into base: master

Commits (32):
aa05578  Create Python package & bump to Python 3.8 (Sep 14, 2022)
03ab20e  Address pr comments (Sep 14, 2022)
ad1d4c0  Address pr comments (Sep 14, 2022)
47b9779  Merge pull request #2 from EducationalTestingService/create_python_pa… (damien2012eng, Sep 14, 2022)
4a2ec3d  add unit tests for tokenization file (Sep 7, 2022)
87a4997  Address pr comments (Sep 14, 2022)
67d2256  add unit tests for token_indexer (Sep 14, 2022)
6e57109  Address pr comments (Sep 15, 2022)
f311813  Address pr comments (Sep 15, 2022)
5c0cf10  Merge pull request #1 from EducationalTestingService/features/add_uni… (damien2012eng, Sep 16, 2022)
b00e946  Add unit tests for pretrained BERT embedder (Frost45, Sep 9, 2022)
05407f3  Add unit tests for pretrained RoBERTa embedder (Frost45, Sep 14, 2022)
b649aeb  Add unit tests for seq2labels model (Frost45, Sep 20, 2022)
6801c42  Update tokenization tests to use AllenNLP test modules (Frost45, Sep 23, 2022)
2964b05  Address PR comments (Frost45, Sep 26, 2022)
a3c74c2  Add CI plan (Frost45, Sep 26, 2022)
dbc39e2  Addressed PR comments (Frost45, Sep 27, 2022)
ca950c3  Unit test for GecModel Prediction. (Sep 27, 2022)
e3ea139  Removing duplicate import. (Sep 27, 2022)
8291809  Adding readme file so that fixtures dir exists for downloading gector (ksteimel, Sep 27, 2022)
40619fe  Merge pull request #5 from EducationalTestingService/feature/add_inte… (ksteimel, Sep 28, 2022)
c535c53  Add how to run unit tests on README (Sep 29, 2022)
86a7aff  Add regression data for raw and predictions (Sep 29, 2022)
469d162  Merge pull request #7 from EducationalTestingService/features/add_reg… (damien2012eng, Oct 3, 2022)
bf4e209  Add regression test file (Frost45, Oct 6, 2022)
2b4513e  Update CI plan to run regression tests (Frost45, Oct 6, 2022)
68d900e  Addressed PR comments (Frost45, Oct 10, 2022)
1b60b1b  Addressed PR comments (Frost45, Oct 12, 2022)
f1db9a9  Addressed PR comments (Frost45, Oct 12, 2022)
323d61a  Add environment.yml (damien2012eng, Oct 17, 2022)
21d496c  versioning starting with 1.0.0 (damien2012eng, Oct 17, 2022)
5da5955  Address PR comments (damien2012eng, Oct 17, 2022)

Changes from 1 commit:
Update tokenization tests to use AllenNLP test modules
Frost45 committed Sep 27, 2022

commit 6801c427b75117066a1fbcd781e537343f790583
289 changes: 223 additions & 66 deletions tests/test_token_indexer.py
@@ -1,92 +1,249 @@ — tests/test_token_indexer.py after this commit:

"""Tests for the PretrainedBertIndexer module."""
import unittest
import torch

from allennlp.data.tokenizers import WordTokenizer
from allennlp.common.testing import ModelTestCase
from gector.tokenizer_indexer import PretrainedBertIndexer
from allennlp.data.tokenizers.word_splitter import BertBasicWordSplitter
from allennlp.data.vocabulary import Vocabulary


class TestPretrainedTransformerIndexer(ModelTestCase):
    """A test case that tests PretrainedBertIndexer methods."""

    def setUp(self):
        """Set up tokenizer and indexer."""
        super().setUp()

        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
        sentence = "the Quick brown fox jumped over the laziest lazy elmo"
        vocab_path = "data/output_vocabulary"
        self.tokens = tokenizer.tokenize(sentence)
        self.vocab = Vocabulary.from_files(vocab_path)
        self.model_name = "roberta-base"

    def test_do_lowercase(self):
        """Test tokenizer to handle setting do_lowercase to be True"""
        token_indexer = PretrainedBertIndexer(
            pretrained_model=self.model_name,
            max_pieces_per_token=5,
            do_lowercase=True,
            max_pieces=512,
            special_tokens_fix=1,
        )
        indexed_tokens = token_indexer.tokens_to_indices(
            self.tokens, self.vocab, self.model_name
        )
        assert indexed_tokens["bert"] == [
            627, 2119, 6219, 23602, 4262, 81, 5, 40154, 7098, 22414, 1615, 4992,
        ]
        assert indexed_tokens["bert-offsets"] == [0, 1, 2, 3, 4, 5, 6, 7, 9, 10]
        assert indexed_tokens["mask"] == [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

    def test_toggle_special_tokens_fix(self):
        """Test toggling special_tokens_fix to be False"""
        token_indexer = PretrainedBertIndexer(
            pretrained_model=self.model_name,
            max_pieces_per_token=5,
            do_lowercase=True,
            max_pieces=512,
            special_tokens_fix=0,
        )
        indexed_tokens = token_indexer.tokens_to_indices(
            self.tokens, self.vocab, self.model_name
        )

        assert indexed_tokens["bert"] == [
            627, 2119, 6219, 23602, 4262, 81, 5, 40154, 7098, 22414, 1615, 4992,
        ]
        assert indexed_tokens["bert-offsets"] == [0, 1, 2, 3, 4, 5, 6, 7, 9, 10]
        assert indexed_tokens["mask"] == [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

    def test_truncate_window(self):
        """Test the functionality of truncating word pieces"""
        token_indexer = PretrainedBertIndexer(
            pretrained_model=self.model_name,
            max_pieces_per_token=5,
            do_lowercase=True,
            max_pieces=5,
            special_tokens_fix=1,
        )
        indexed_tokens = token_indexer.tokens_to_indices(
            self.tokens, self.vocab, self.model_name
        )

        assert indexed_tokens["bert"] == []
        assert indexed_tokens["bert-offsets"] == [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]
        assert indexed_tokens["mask"] == [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

    def test_as_padded_tensor_dict(self):
        """Test the method as_padded_tensor_dict"""
        tokens = {
            "bert": [
                50265, 37158, 15, 1012, 2156, 89, 32, 460, 5, 12043, 268, 8,
                4131, 22761, 13659, 49, 1351, 8, 11360, 7, 12043, 7768, 479,
            ],
            "bert-offsets": [
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11,
                12, 13, 15, 16, 17, 18, 19, 20, 21, 22,
            ],
            "mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        }
        padding_lengths = {
            "bert": 42,
            "bert-offsets": 41,
            "mask": 41,
            "num_tokens": 42,
        }
        desired_num_tokens = {"bert": 42, "bert-offsets": 41, "mask": 41}

        token_indexer = PretrainedBertIndexer(
            pretrained_model=self.model_name,
            max_pieces_per_token=5,
            do_lowercase=True,
            max_pieces=512,
            special_tokens_fix=1,
        )

        padded_tensor = token_indexer.pad_token_sequence(
            tokens, desired_num_tokens, padding_lengths
        )
        assert padded_tensor["bert"] == [
            50265, 37158, 15, 1012, 2156, 89, 32, 460, 5, 12043, 268, 8, 4131,
            22761, 13659, 49, 1351, 8, 11360, 7, 12043, 7768, 479, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        ]
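
To try just this module locally, a minimal sketch using Python's standard unittest loader follows; it assumes the repository root as the working directory (so that the relative path data/output_vocabulary used in setUp resolves) and is only an illustrative invocation, not the project's documented procedure (the PR adds that to the README separately).

# Illustrative only: discover and run the token-indexer tests with the standard
# unittest loader. Assumes the current working directory is the repository root.
import unittest

suite = unittest.defaultTestLoader.discover("tests", pattern="test_token_indexer.py")
unittest.TextTestRunner(verbosity=2).run(suite)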