Feature/fix gec predictor #13

Merged 18 commits on Oct 24, 2022. The diff below shows changes from 14 commits.
7 changes: 5 additions & 2 deletions .github/workflows/python-test.yml
@@ -7,7 +7,6 @@ on:
push:
branches: [ "master" ]
pull_request:
branches: [ "master" ]

jobs:
build:
@@ -28,6 +27,10 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install -e .
- name: Test
- name: Unit Testing
run: |
pytest -v tests
- name: Regression Testing
run: |
python regression_tests/test_gector_roberta.py
python regression_tests/test_regression_data_predictor.py
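Splitting the CI job into separate "Unit Testing" and "Regression Testing" steps means a failed build now points directly at the kind of check that broke. The same checks can be reproduced locally (assuming the package has been installed with `pip install -e .`) by running `pytest -v tests`, then `python regression_tests/test_gector_roberta.py` and `python regression_tests/test_regression_data_predictor.py`.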
4 changes: 4 additions & 0 deletions README.md
@@ -16,6 +16,10 @@ pip install -e .
```
The project was tested using Python 3.8.

## Unit tests
After activating the conda environment, simply run the command below:
`pytest -v tests`

## Datasets
All the public GEC datasets used in the paper can be downloaded from [here](https://www.cl.cam.ac.uk/research/nl/bea2019st/#data).<br>
Synthetically created datasets can be generated/downloaded [here](https://github.com/awasthiabhijeet/PIE/tree/master/errorify).<br>
13 changes: 13 additions & 0 deletions environment.yml
@@ -0,0 +1,13 @@
name: gector
dependencies:
  - python=3.8
  - pytorch=1.10.0
  - python-Levenshtein
  - transformers
  - scikit-learn
  - sentencepiece
  - overrides=4.1.2
  - numpy
  - pip:
    - allennlp==0.9.0

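Assuming conda is available, this environment can be created with `conda env create -f environment.yml` and activated with `conda activate gector` (the name declared above) before running the unit and regression tests.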
90 changes: 58 additions & 32 deletions gector/datareader.py
@@ -6,7 +6,12 @@

from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import TextField, SequenceLabelField, MetadataField, Field
from allennlp.data.fields import (
TextField,
SequenceLabelField,
MetadataField,
Field,
)
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
@@ -37,23 +42,28 @@ class Seq2LabelsDatasetReader(DatasetReader):
are pre-tokenised in the data file.
max_len: if set then will truncate long sentences
"""

# fix broken sentences mostly in Lang8
BROKEN_SENTENCES_REGEXP = re.compile(r'\.[a-zA-RT-Z]')

def __init__(self,
token_indexers: Dict[str, TokenIndexer] = None,
delimeters: dict = SEQ_DELIMETERS,
skip_correct: bool = False,
skip_complex: int = 0,
lazy: bool = False,
max_len: int = None,
test_mode: bool = False,
tag_strategy: str = "keep_one",
tn_prob: float = 0,
tp_prob: float = 0,
broken_dot_strategy: str = "keep") -> None:
BROKEN_SENTENCES_REGEXP = re.compile(r"\.[a-zA-RT-Z]")

def __init__(
self,
token_indexers: Dict[str, TokenIndexer] = None,
delimeters: dict = SEQ_DELIMETERS,
skip_correct: bool = False,
skip_complex: int = 0,
lazy: bool = False,
max_len: int = None,
test_mode: bool = False,
tag_strategy: str = "keep_one",
tn_prob: float = 0,
tp_prob: float = 0,
broken_dot_strategy: str = "keep",
) -> None:
super().__init__(lazy)
self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
self._token_indexers = token_indexers or {
"tokens": SingleIdTokenIndexer()
}
self._delimeters = delimeters
self._max_len = max_len
self._skip_correct = skip_correct
@@ -69,16 +79,23 @@ def _read(self, file_path):
# if `file_path` is a URL, redirect to the cache
file_path = cached_path(file_path)
with open(file_path, "r") as data_file:
logger.info("Reading instances from lines in file at: %s", file_path)
logger.info(
"Reading instances from lines in file at: %s", file_path
)
for line in data_file:
line = line.strip("\n")
# skip blank and broken lines
if not line or (not self._test_mode and self._broken_dot_strategy == 'skip'
and self.BROKEN_SENTENCES_REGEXP.search(line) is not None):
if not line or (
not self._test_mode
and self._broken_dot_strategy == "skip"
and self.BROKEN_SENTENCES_REGEXP.search(line) is not None
):
continue

tokens_and_tags = [pair.rsplit(self._delimeters['labels'], 1)
for pair in line.split(self._delimeters['tokens'])]
tokens_and_tags = [
pair.rsplit(self._delimeters["labels"], 1)
for pair in line.split(self._delimeters["tokens"])
]
try:
tokens = [Token(token) for token, tag in tokens_and_tags]
tags = [tag for token, tag in tokens_and_tags]
@@ -91,14 +108,14 @@ def _read(self, file_path):

words = [x.text for x in tokens]
if self._max_len is not None:
tokens = tokens[:self._max_len]
tags = None if tags is None else tags[:self._max_len]
tokens = tokens[: self._max_len]
tags = None if tags is None else tags[: self._max_len]
instance = self.text_to_instance(tokens, tags, words)
if instance:
yield instance

def extract_tags(self, tags: List[str]):
op_del = self._delimeters['operations']
op_del = self._delimeters["operations"]

labels = [x.split(op_del) for x in tags]

@@ -117,17 +134,24 @@ def extract_tags(self, tags: List[str]):
else:
raise Exception("Incorrect tag strategy")

detect_tags = ["CORRECT" if label == "$KEEP" else "INCORRECT" for label in labels]
detect_tags = [
"CORRECT" if label == "$KEEP" else "INCORRECT" for label in labels
]
return labels, detect_tags, comlex_flag_dict

def text_to_instance(self, tokens: List[Token], tags: List[str] = None,
words: List[str] = None) -> Instance: # type: ignore
def text_to_instance(
self,
tokens: List[Token],
tags: List[str] = None,
words: List[str] = None,
) -> Instance: # type: ignore
"""
We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
"""
# pylint: disable=arguments-differ
fields: Dict[str, Field] = {}
sequence = TextField(tokens, self._token_indexers)
# Set size of tokens to _max_len + 1 since $START token is being added
sequence = TextField(tokens[: self._max_len + 1], self._token_indexers)
fields["tokens"] = sequence
# If words has not been explicitly passed in, create it from tokens.
if words is None:
@@ -147,8 +171,10 @@ def text_to_instance(self, tokens: List[Token], tags: List[str] = None,
if rnd > self._tp_prob:
return None

fields["labels"] = SequenceLabelField(labels, sequence,
label_namespace="labels")
fields["d_tags"] = SequenceLabelField(detect_tags, sequence,
label_namespace="d_tags")
fields["labels"] = SequenceLabelField(
labels, sequence, label_namespace="labels"
)
fields["d_tags"] = SequenceLabelField(
detect_tags, sequence, label_namespace="d_tags"
)
return Instance(fields)
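For readers unfamiliar with the broken-sentence filter touched above: when `broken_dot_strategy` is `"skip"` and the reader is not in test mode, any line matching `BROKEN_SENTENCES_REGEXP` is dropped before parsing. A minimal standalone sketch of that pattern (the example strings are illustrative only, not from the PR):

```python
import re

# Same pattern as Seq2LabelsDatasetReader.BROKEN_SENTENCES_REGEXP:
# a period immediately followed by a letter. Uppercase "S" is the one
# letter excluded from the class, presumably so strings like "U.S."
# are not treated as broken sentences.
BROKEN_SENTENCES_REGEXP = re.compile(r"\.[a-zA-RT-Z]")

for line in ["He left.she stayed", "He left. She stayed", "Born in the U.S."]:
    is_broken = BROKEN_SENTENCES_REGEXP.search(line) is not None
    print(f"{line!r}: {'skip' if is_broken else 'keep'}")
# 'He left.she stayed': skip
# 'He left. She stayed': keep
# 'Born in the U.S.': keep
```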