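Fix bug: post-process each document before building seq2seq examples, and rewrite the reading "う゛ぁいおりん" to "ゔぁいおりん" before normalization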

ku-nlp · Jun 27, 2024 · 4bca62b · 4bca62b
1 parent 758dd42
commit 4bca62b
Show file tree

Hide file tree

Showing 2 changed files with 3 additions and 1 deletion.
diff --git a/src/kwja/datamodule/datasets/seq2seq.py b/src/kwja/datamodule/datasets/seq2seq.py
@@ -57,6 +57,7 @@ def _load_examples(self, documents: List[Document], is_train: bool) -> List[Seq2
         examples: List[Seq2SeqExample] = []
         example_id: int = 0
         for document in track(documents, description="Loading examples"):
+            document = self._postprocess_document(document)
             for sentence in document.sentences:
                 src_tokens: List[str] = self.formatter.get_src_tokens(sentence)
                 src_input_ids: List[int] = self.tokenizer.convert_tokens_to_ids(src_tokens) + [

diff --git a/src/kwja/utils/normalization.py b/src/kwja/utils/normalization.py
@@ -11,7 +11,8 @@ def normalize_text(text: str) -> str:
 
 def normalize_morpheme(morpheme: Morpheme) -> None:
     morpheme.text = normalize_text(morpheme.text)
-    morpheme.reading = normalize_text(morpheme.reading)
+    reading = "ゔぁいおりん" if morpheme.reading == "う゛ぁいおりん" else morpheme.reading
+    morpheme.reading = normalize_text(reading)
     morpheme.lemma = normalize_text(morpheme.lemma)
     canon = morpheme.semantics.get("代表表記")
     if isinstance(canon, str):