augmented ontonotes script to be able to read singleton conlls
Jemoka committed Jan 25, 2025
1 parent 1abc2b6 commit 3d356f3
Showing 1 changed file with 152 additions and 5 deletions.
157 changes: 152 additions & 5 deletions stanza/utils/datasets/coref/convert_ontonotes.py
@@ -1,13 +1,48 @@
"""
convert_ontonotes.py
This script converts the OntoNotes dataset into a format that can be used by Stanza's coreference resolution model. It uses the `datasets` package to download OntoNotes and then tags the text with a Stanza pipeline (tokenize, POS, lemma, depparse). The processed dataset is saved as JSON files.

If you want to simply process the official OntoNotes dataset...
1. install the `datasets` package: `pip install datasets`
2. make the following folders (or adjust them to taste through scripts/config.sh)
   - extern_data/coref/english/en_ontonotes
   - data/coref
3. run this script: python convert_ontonotes.py

If you happen to have singleton-annotated coref chains...
1. install the `datasets` package: `pip install datasets`
2. make the following folders (or adjust them to taste through scripts/config.sh)
   - extern_data/coref/english/en_ontonotes
   - data/coref
3. get the singleton-annotated coref chains in CoNLL format from the Splice repo
   https://github.com/yilunzhu/splice/blob/main/data/ontonotes5_mentions.zip
4. place the singleton-annotated coref chains in the folder `extern_data/coref/english/en_ontonotes`

   $ ls ./extern_data/coref/english/en_ontonotes
   dev_sg_pred.english.v4_gold_conll
   test_sg_pred.english.v4_gold_conll
   train_sg.english.v4_gold_conll

5. run this script: python convert_ontonotes.py
Your results will appear in ./data/coref/, and you can be off to the races with training!
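
For example, with the en_ontonotes short name, a successful run should leave one JSON
file per split in ./data/coref, named "<short_name>.<section>.json" following the
pattern used when the files are written out at the bottom of this script, so roughly:

$ ls ./data/coref
en_ontonotes.dev.json
en_ontonotes.test.json
en_ontonotes.train.json
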
Note that this script invokes Stanza itself to run some tagging.
"""

import json
import os

from pathlib import Path

import stanza

from stanza.models.constituency import tree_reader
from stanza.utils.default_paths import get_default_paths
from stanza.utils.get_tqdm import get_tqdm
from stanza.utils.datasets.coref.utils import process_document

from stanza.utils.conll import CoNLL
from collections import defaultdict

tqdm = get_tqdm()

def read_paragraphs(section):
@@ -25,22 +60,121 @@ def read_paragraphs(section):
        if paragraph != []:
            yield doc['document_id'], part_id, paragraph

def convert_dataset_section(pipe, section):

def convert_dataset_section(pipe, section, override_coref_chains=None):
    processed_section = []
    section = list(x for x in read_paragraphs(section))

    for idx, (doc_id, part_id, paragraph) in enumerate(tqdm(section)):
        sentences = [x['words'] for x in paragraph]
        coref_spans = [x['coref_spans'] for x in paragraph]
        coref_spans = ([x['coref_spans'] for x in paragraph]
                       if not override_coref_chains
                       else override_coref_chains[doc_id][part_id])
        sentence_speakers = [x['speaker'] for x in paragraph]

        processed = process_document(pipe, doc_id, part_id, sentences, coref_spans, sentence_speakers)
        processed_section.append(processed)
    return processed_section

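In either branch, coref_spans carries one entry per sentence, each entry a list of [cluster_id, first_token, last_token] triples (inclusive and zero-based, matching the convention described in extract_chains_from_chunk below, and assuming the HuggingFace conll2012_ontonotesv5 coref_spans field follows the same layout); the override path simply substitutes the chains recovered from the singleton CoNLL files. A minimal sketch of the shape, with invented values:

# Invented example for a two-sentence paragraph (not part of the committed script):
example_coref_spans = [
    [[0, 0, 0], [1, 3, 4]],  # sentence 0: cluster 0 covers token 0, cluster 1 covers tokens 3-4
    [[0, 2, 2]],             # sentence 1: cluster 0 covers token 2
]
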
def extract_chains_from_chunk(chunk):
    """given a chunk of the gold conll, extract the coref chains

    remember, the indices are front and back *inclusive*, zero indexed,
    and a span that covers one word only is annotated [id, n, n] (i.e. we
    don't fencepost by +1)

    Arguments
    ---------
    chunk : List[str]
        list of strings, each string is a line in the conll file

    Returns
    -------
    final_chains : List[Tuple[int, int, int]]
        list of chains, each chain is a list of [id, open_location, close_location]
    """

    # the coreference annotation is the last whitespace-separated column of each line
    chains = [sentence.split(" ")[-1].strip()
              for sentence in chunk]
    chains = [[] if i == '-' else i.split("|")
              for i in chains]

    opens = defaultdict(list)
    closes = defaultdict(list)

    for indx, elem in enumerate(chains):
        # for each annotation, check whether it's an open, a close, or both
        for i in elem:
            id = int(i.strip("(").strip(")"))
            if i[0] == "(":
                opens[id].append(indx)
            if i[-1] == ")":
                closes[id].append(indx)

    # and now, we pair each id's opens and closes together
    # into the shape of [id, open_location, close_location]
    opens = dict(opens)
    closes = dict(closes)

    final_chains = []
    for key, open_indx in opens.items():
        for o, c in zip(sorted(open_indx), sorted(closes[key])):
            final_chains.append([key, o, c])

    return final_chains

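As a quick sanity check (not part of the committed script), here is how extract_chains_from_chunk behaves on a made-up five-token sentence in the v4_gold_conll layout, where the coreference annotation is the last whitespace-separated field; every token row below is invented for illustration:

toy_chunk = [
    "bc/cctv/00/cctv_0000 0 0 John NNP * - - - Speaker1 * (0)",
    "bc/cctv/00/cctv_0000 0 1 met VBD * - - - Speaker1 * -",
    "bc/cctv/00/cctv_0000 0 2 his PRP$ * - - - Speaker1 * (0)",
    "bc/cctv/00/cctv_0000 0 3 old JJ * - - - Speaker1 * (1",
    "bc/cctv/00/cctv_0000 0 4 friend NN * - - - Speaker1 * 1)",
]
print(extract_chains_from_chunk(toy_chunk))
# -> [[0, 0, 0], [0, 2, 2], [1, 3, 4]]
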
def extract_chains_from_conll(gold_coref_conll):
    """extract the coref chains from the gold conll file

    Arguments
    ---------
    gold_coref_conll : str
        path to the gold conll file, with coreference chains

    Returns
    -------
    final_chunks : Dict[str, List[List[List[Tuple[int, int, int]]]]]
        dictionary keyed by document ID; each value is a list of document
        parts, each part a list of per-sentence coref chains in OntoNotes style
    """
    with open(gold_coref_conll, 'r') as df:
        gold_coref_conll = df.readlines()

    # we want to first separate the document into sentence-level
    # chunks; we assume that the ordering of the sentences is correct in the
    # gold document
    sections = []
    section = []
    chunk = []
    for i in gold_coref_conll:
        if len(i.split(" ")) < 10:
            # header, comment, or blank line: close off the current sentence chunk
            if len(chunk) > 0:
                section.append(chunk)
                chunk = []
            if i.startswith("#end document"):  # this closes a document part
                sections.append(section)
                section = []
                chunk = []
        else:
            chunk.append(i)

    # finally, we process each chunk and *index them by ID*
    # (the document ID is the first column of every line, so the last chunk
    # of the section is as good a place as any to read it from)
    final_chunks = defaultdict(list)
    for section in sections:
        section_chains = []
        for chunk in section:
            section_chains.append(extract_chains_from_chunk(chunk))
        final_chunks[chunk[0].split(" ")[0]].append(section_chains)
    final_chunks = dict(final_chunks)

    return final_chunks

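The structure returned here is exactly what convert_dataset_section expects as override_coref_chains: indexing by document ID and then by part gives the per-sentence chain lists. A usage sketch (the document ID is illustrative, and the file path assumes the layout from the module docstring):

# Sketch only: assumes the singleton conll described in the module docstring is in place.
dev_conll = "extern_data/coref/english/en_ontonotes/dev_sg_pred.english.v4_gold_conll"
dev_chains = extract_chains_from_conll(dev_conll)
# dev_chains["bc/cctv/00/cctv_0000"][0] -> list of per-sentence [id, open, close] chains
#                                          for part 0 of that document
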
SECTION_NAMES = {"train": "train",
                 "dev": "validation",
                 "test": "test"}
OVERRIDE_CONLL_PATHS = {"en_ontonotes": {
    "train": "train_sg.english.v4_gold_conll",
    "dev": "dev_sg_pred.english.v4_gold_conll",
    "test": "test_sg_pred.english.v4_gold_conll"
}}

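These filenames are resolved relative to the OntoNotes directory, mirroring the existence check in process_dataset below. With the folder layout from the module docstring, the dev entry resolves as follows (illustration only; in the real script the directory is supplied by the caller, and Path is already imported at the top of the file):

ontonotes_dir = Path("extern_data/coref/english/en_ontonotes")
print(ontonotes_dir / OVERRIDE_CONLL_PATHS["en_ontonotes"]["dev"])
# extern_data/coref/english/en_ontonotes/dev_sg_pred.english.v4_gold_conll
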
def process_dataset(short_name, ontonotes_path, coref_output_path):
    try:
@@ -58,11 +192,24 @@ def process_dataset(short_name, ontonotes_path, coref_output_path):
        raise ValueError("Unknown short name for downloading ontonotes: %s" % short_name)

    pipe = stanza.Pipeline("en", processors="tokenize,pos,lemma,depparse", package="default_accurate", tokenize_pretokenized=True)
    dataset = load_dataset("conll2012_ontonotesv5", config_name, cache_dir=ontonotes_path)

    # if the cache directory doesn't yet exist, we make it
    # we store the cache in a separate subfolder to distinguish it from the
    # singleton conlls that may be in the folder
    (Path(ontonotes_path) / "cache").mkdir(exist_ok=True)

    dataset = load_dataset("conll2012_ontonotesv5", config_name, cache_dir=str(Path(ontonotes_path) / "cache"))
    for section, hf_name in SECTION_NAMES.items():
    #for section, hf_name in [("test", "test")]:
    # for section, hf_name in [("test", "test")]:
        print("Processing %s" % section)
        converted_section = convert_dataset_section(pipe, dataset[hf_name])
        if (Path(ontonotes_path) / OVERRIDE_CONLL_PATHS[short_name][section]).exists():
            # if, for instance, Amir has given us singleton-annotated coref chains in conll files,
            # we will use those instead of the ones that OntoNotes has
            converted_section = convert_dataset_section(pipe, dataset[hf_name], extract_chains_from_conll(
                str(Path(ontonotes_path) / OVERRIDE_CONLL_PATHS[short_name][section])
            ))
        else:
            converted_section = convert_dataset_section(pipe, dataset[hf_name])
        output_filename = os.path.join(coref_output_path, "%s.%s.json" % (short_name, section))
        with open(output_filename, "w", encoding="utf-8") as fout:
            json.dump(converted_section, fout, indent=2)
