augmented ontonotes script to be able to read singleton conlls
Jemoka committed Jan 25, 2025
1 parent 1abc2b6 commit 3d356f3
Showing 1 changed file with 152 additions and 5 deletions.
157 changes: 152 additions & 5 deletions stanza/utils/datasets/coref/convert_ontonotes.py
@@ -1,13 +1,48 @@
"""
convert_ontonotes.py
This script converts the OntoNotes dataset into a format that can be used by Stanza's coreference resolution model. It uses the `datasets` package to download OntoNotes and then tags the text with a Stanza pipeline (tokenize, POS, lemma, depparse). The processed dataset is saved as JSON files.

If you want to simply process the official OntoNotes dataset...
1. install the `datasets` package: `pip install datasets`
2. make the following folders (or adjust them to taste through scripts/config.sh)
   - extern_data/coref/english/en_ontonotes
   - data/coref
3. run this script: python convert_ontonotes.py

If you happen to have singleton-annotated coref chains...
1. install the `datasets` package: `pip install datasets`
2. make the following folders (or adjust them to taste through scripts/config.sh)
   - extern_data/coref/english/en_ontonotes
   - data/coref
3. get the singleton-annotated coref chains in CoNLL format from the Splice repo
   https://github.com/yilunzhu/splice/blob/main/data/ontonotes5_mentions.zip
4. place the singleton-annotated coref chains in the folder `extern_data/coref/english/en_ontonotes`

   $ ls ./extern_data/coref/english/en_ontonotes
   dev_sg_pred.english.v4_gold_conll
   test_sg_pred.english.v4_gold_conll
   train_sg.english.v4_gold_conll

5. run this script: python convert_ontonotes.py
Your results will appear in ./data/coref/, and you can be off to the races with training!
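
For example, with the en_ontonotes short name, a successful run should leave one JSON
file per split in ./data/coref, named "<short_name>.<section>.json" following the
pattern used when the files are written out at the bottom of this script, so roughly:

$ ls ./data/coref
en_ontonotes.dev.json
en_ontonotes.test.json
en_ontonotes.train.json
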
Note that this script invokes Stanza itself to run some tagging.
"""

import json
import os

from pathlib import Path

import stanza

from stanza.models.constituency import tree_reader
from stanza.utils.default_paths import get_default_paths
from stanza.utils.get_tqdm import get_tqdm
from stanza.utils.datasets.coref.utils import process_document

from stanza.utils.conll import CoNLL
from collections import defaultdict

tqdm = get_tqdm()

def read_paragraphs(section):
@@ -25,22 +60,121 @@ def read_paragraphs(section):
        if paragraph != []:
            yield doc['document_id'], part_id, paragraph

def convert_dataset_section(pipe, section):

def convert_dataset_section(pipe, section, override_coref_chains=None):
    processed_section = []
    section = list(x for x in read_paragraphs(section))

    for idx, (doc_id, part_id, paragraph) in enumerate(tqdm(section)):
        sentences = [x['words'] for x in paragraph]
        coref_spans = [x['coref_spans'] for x in paragraph]
        coref_spans = ([x['coref_spans'] for x in paragraph]
                       if not override_coref_chains
                       else override_coref_chains[doc_id][part_id])
        sentence_speakers = [x['speaker'] for x in paragraph]

        processed = process_document(pipe, doc_id, part_id, sentences, coref_spans, sentence_speakers)
        processed_section.append(processed)
    return processed_section

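In either branch, coref_spans carries one entry per sentence, each entry a list of [cluster_id, first_token, last_token] triples (inclusive and zero-based, matching the convention described in extract_chains_from_chunk below, and assuming the HuggingFace conll2012_ontonotesv5 coref_spans field follows the same layout); the override path simply substitutes the chains recovered from the singleton CoNLL files. A minimal sketch of the shape, with invented values:

# Invented example for a two-sentence paragraph (not part of the committed script):
example_coref_spans = [
    [[0, 0, 0], [1, 3, 4]],  # sentence 0: cluster 0 covers token 0, cluster 1 covers tokens 3-4
    [[0, 2, 2]],             # sentence 1: cluster 0 covers token 2
]
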
def extract_chains_from_chunk(chunk):
    """given a chunk of the gold conll, extract the coref chains

    remember, the indices are front and back *inclusive*, zero indexed,
    and a span that covers one word only is annotated [id, n, n] (i.e. we
    don't fencepost by +1)

    Arguments
    ---------
    chunk : List[str]
        list of strings, each string is a line in the conll file

    Returns
    -------
    final_chains : List[Tuple[int, int, int]]
        list of chains, each chain is a list of [id, open_location, close_location]
    """

    # the coreference annotation is the last whitespace-separated column of each line
    chains = [sentence.split(" ")[-1].strip()
              for sentence in chunk]
    chains = [[] if i == '-' else i.split("|")
              for i in chains]

    opens = defaultdict(list)
    closes = defaultdict(list)

    for indx, elem in enumerate(chains):
        # for each annotation, check whether it's an open, a close, or both
        for i in elem:
            id = int(i.strip("(").strip(")"))
            if i[0] == "(":
                opens[id].append(indx)
            if i[-1] == ")":
                closes[id].append(indx)

    # and now, we pair each id's opens and closes together
    # into the shape of [id, open_location, close_location]
    opens = dict(opens)
    closes = dict(closes)

    final_chains = []
    for key, open_indx in opens.items():
        for o, c in zip(sorted(open_indx), sorted(closes[key])):
            final_chains.append([key, o, c])

    return final_chains

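As a quick sanity check (not part of the committed script), here is how extract_chains_from_chunk behaves on a made-up five-token sentence in the v4_gold_conll layout, where the coreference annotation is the last whitespace-separated field; every token row below is invented for illustration:

toy_chunk = [
    "bc/cctv/00/cctv_0000 0 0 John NNP * - - - Speaker1 * (0)",
    "bc/cctv/00/cctv_0000 0 1 met VBD * - - - Speaker1 * -",
    "bc/cctv/00/cctv_0000 0 2 his PRP$ * - - - Speaker1 * (0)",
    "bc/cctv/00/cctv_0000 0 3 old JJ * - - - Speaker1 * (1",
    "bc/cctv/00/cctv_0000 0 4 friend NN * - - - Speaker1 * 1)",
]
print(extract_chains_from_chunk(toy_chunk))
# -> [[0, 0, 0], [0, 2, 2], [1, 3, 4]]
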
def extract_chains_from_conll(gold_coref_conll):
    """extract the coref chains from the gold conll file

    Arguments
    ---------
    gold_coref_conll : str
        path to the gold conll file, with coreference chains

    Returns
    -------
    final_chunks : Dict[str, List[List[List[Tuple[int, int, int]]]]]
        dictionary keyed by document ID; each value is a list of document
        parts, each part a list of per-sentence coref chains in OntoNotes style
    """
    with open(gold_coref_conll, 'r') as df:
        gold_coref_conll = df.readlines()

    # we want to first separate the document into sentence-level
    # chunks; we assume that the ordering of the sentences is correct in the
    # gold document
    sections = []
    section = []
    chunk = []
    for i in gold_coref_conll:
        if len(i.split(" ")) < 10:
            # header, comment, or blank line: close off the current sentence chunk
            if len(chunk) > 0:
                section.append(chunk)
                chunk = []
            if i.startswith("#end document"):  # this closes a document part
                sections.append(section)
                section = []
                chunk = []
        else:
            chunk.append(i)

    # finally, we process each chunk and *index them by ID*
    # (the document ID is the first column of every line, so the last chunk
    # of the section is as good a place as any to read it from)
    final_chunks = defaultdict(list)
    for section in sections:
        section_chains = []
        for chunk in section:
            section_chains.append(extract_chains_from_chunk(chunk))
        final_chunks[chunk[0].split(" ")[0]].append(section_chains)
    final_chunks = dict(final_chunks)

    return final_chunks

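The structure returned here is exactly what convert_dataset_section expects as override_coref_chains: indexing by document ID and then by part gives the per-sentence chain lists. A usage sketch (the document ID is illustrative, and the file path assumes the layout from the module docstring):

# Sketch only: assumes the singleton conll described in the module docstring is in place.
dev_conll = "extern_data/coref/english/en_ontonotes/dev_sg_pred.english.v4_gold_conll"
dev_chains = extract_chains_from_conll(dev_conll)
# dev_chains["bc/cctv/00/cctv_0000"][0] -> list of per-sentence [id, open, close] chains
#                                          for part 0 of that document
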
SECTION_NAMES = {"train": "train",
                 "dev": "validation",
                 "test": "test"}
OVERRIDE_CONLL_PATHS = {"en_ontonotes": {
    "train": "train_sg.english.v4_gold_conll",
    "dev": "dev_sg_pred.english.v4_gold_conll",
    "test": "test_sg_pred.english.v4_gold_conll"
}}

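These filenames are resolved relative to the OntoNotes directory, mirroring the existence check in process_dataset below. With the folder layout from the module docstring, the dev entry resolves as follows (illustration only; in the real script the directory is supplied by the caller, and Path is already imported at the top of the file):

ontonotes_dir = Path("extern_data/coref/english/en_ontonotes")
print(ontonotes_dir / OVERRIDE_CONLL_PATHS["en_ontonotes"]["dev"])
# extern_data/coref/english/en_ontonotes/dev_sg_pred.english.v4_gold_conll
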
def process_dataset(short_name, ontonotes_path, coref_output_path):
    try:
@@ -58,11 +192,24 @@ def process_dataset(short_name, ontonotes_path, coref_output_path):
        raise ValueError("Unknown short name for downloading ontonotes: %s" % short_name)

    pipe = stanza.Pipeline("en", processors="tokenize,pos,lemma,depparse", package="default_accurate", tokenize_pretokenized=True)
    dataset = load_dataset("conll2012_ontonotesv5", config_name, cache_dir=ontonotes_path)

    # if the cache directory doesn't yet exist, we make it
    # we store the cache in a separate subfolder to distinguish it from the
    # singleton conlls that may be in the folder
    (Path(ontonotes_path) / "cache").mkdir(exist_ok=True)

    dataset = load_dataset("conll2012_ontonotesv5", config_name, cache_dir=str(Path(ontonotes_path) / "cache"))
    for section, hf_name in SECTION_NAMES.items():
    #for section, hf_name in [("test", "test")]:
    # for section, hf_name in [("test", "test")]:
        print("Processing %s" % section)
        converted_section = convert_dataset_section(pipe, dataset[hf_name])
        if (Path(ontonotes_path) / OVERRIDE_CONLL_PATHS[short_name][section]).exists():
            # if, for instance, Amir has given us singleton-annotated coref chains in conll files,
            # we will use those instead of the ones that OntoNotes has
            converted_section = convert_dataset_section(pipe, dataset[hf_name], extract_chains_from_conll(
                str(Path(ontonotes_path) / OVERRIDE_CONLL_PATHS[short_name][section])
            ))
        else:
            converted_section = convert_dataset_section(pipe, dataset[hf_name])
        output_filename = os.path.join(coref_output_path, "%s.%s.json" % (short_name, section))
        with open(output_filename, "w", encoding="utf-8") as fout:
            json.dump(converted_section, fout, indent=2)
