Add _PUBMED to new datasets #687

Open · wants to merge 1 commit into main
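This PR adds a module-level _PUBMED flag to several dataset loader scripts: True for the PubMed-derived corpora (GAD and the GENIA PTM event corpus) and False for the non-PubMed ones (medical_data and n2c2_2014_risk_factors), plus formatting cleanup in gad.py (double quotes, trailing commas, line wrapping). Below is a minimal sketch of how downstream tooling could read the flag; the function name, the importlib lookup, and the example module path are illustrative assumptions, not part of this PR.

import importlib


def is_pubmed_dataset(module_name: str) -> bool:
    """Return a loader module's _PUBMED flag, treating a missing flag as False."""
    module = importlib.import_module(module_name)
    return bool(getattr(module, "_PUBMED", False))


# Example with a hypothetical import path for the GAD loader:
# is_pubmed_dataset("bigbio.biodatasets.gad.gad")  # -> True after this change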
39 changes: 21 additions & 18 deletions bigbio/biodatasets/gad/gad.py
@@ -34,22 +34,23 @@
annotation procedure based on the Genetic Association Database
"""

_HOMEPAGE = "https://github.com/dmis-lab/biobert" # This data source is used by the BLURB benchmark
_PUBMED = True

_HOMEPAGE = "https://github.com/dmis-lab/biobert" # This data source is used by the BLURB benchmark

_LICENSE = "Creative Common Attribution 4.0 International"

_URLs = {
"source": "https://drive.google.com/uc?export=download&id=1-jDKGcXREb2X9xTFnuiJ36PvsqoyHWcw",
"bigbio_text": "https://drive.google.com/uc?export=download&id=1-jDKGcXREb2X9xTFnuiJ36PvsqoyHWcw"
"bigbio_text": "https://drive.google.com/uc?export=download&id=1-jDKGcXREb2X9xTFnuiJ36PvsqoyHWcw",
}

_SUPPORTED_TASKS = [
Tasks.TEXT_CLASSIFICATION
]
_SUPPORTED_TASKS = [Tasks.TEXT_CLASSIFICATION]

_SOURCE_VERSION = "1.0.0"
_BIGBIO_VERSION = "1.0.0"


class GAD(datasets.GeneratorBasedBuilder):
"""GAD is a weakly labeled dataset for Entity Relations (REL) task which is treated as a sentence classification task."""

@@ -61,7 +62,8 @@ class GAD(datasets.GeneratorBasedBuilder):
description="GAD source schema",
schema="source",
subset_id=f"gad_fold{i}",
) for i in range(10)
)
for i in range(10)
] + [
# 10-fold bigbio schema
BigBioConfig(
@@ -70,7 +72,8 @@ class GAD(datasets.GeneratorBasedBuilder):
description="GAD BigBio schema",
schema="bigbio_text",
subset_id=f"gad_fold{i}",
) for i in range(10)
)
for i in range(10)
]

DEFAULT_CONFIG_NAME = "gad_fold0_source"
@@ -81,7 +84,7 @@ def _info(self):
{
"index": datasets.Value("string"),
"sentence": datasets.Value("string"),
"label": datasets.Value("int32")
"label": datasets.Value("int32"),
}
)
elif self.config.schema == "bigbio_text":
@@ -99,12 +102,12 @@ def _split_generators(
self, dl_manager: datasets.DownloadManager
) -> List[datasets.SplitGenerator]:
fold_id = int(self.config.subset_id.split("_fold")[1][0]) + 1

my_urls = _URLs[self.config.schema]
data_dir = Path(dl_manager.download_and_extract(my_urls))
data_files = {
"train": data_dir / "GAD" / str(fold_id) / "train.tsv",
"test": data_dir / "GAD" / str(fold_id) / "test.tsv"
"test": data_dir / "GAD" / str(fold_id) / "test.tsv",
}

return [
@@ -119,28 +122,28 @@ def _split_generators(
]

def _generate_examples(self, filepath: Path):
if 'train.tsv' in str(filepath):
df = pd.read_csv(filepath, sep='\t', header=None).reset_index()
if "train.tsv" in str(filepath):
df = pd.read_csv(filepath, sep="\t", header=None).reset_index()
else:
df = pd.read_csv(filepath, sep='\t')
df.columns = ['id', 'sentence', 'label']
df = pd.read_csv(filepath, sep="\t")
df.columns = ["id", "sentence", "label"]

if self.config.schema == "source":
for id, row in enumerate(df.itertuples()):
ex = {
"index": row.id,
"sentence": row.sentence,
"label": int(row.label)
}
"label": int(row.label),
}
yield id, ex
elif self.config.schema == "bigbio_text":
for id, row in enumerate(df.itertuples()):
ex = {
"id": id,
"document_id": row.id,
"text": row.sentence,
"labels": [str(row.label)]
"labels": [str(row.label)],
}
yield id, ex
else:
raise ValueError(f"Invalid config: {self.config.name}")
@@ -61,6 +61,8 @@
multiple PTM types at once in a unified framework.
"""

_PUBMED = True

_HOMEPAGE = "http://www.geniaproject.org/other-corpora/ptm-event-corpus"

_LICENSE = "GENIA Project License for Annotated Corpora"
@@ -69,7 +71,11 @@
_DATASETNAME: "http://www.geniaproject.org/other-corpora/ptm-event-corpus/post-translational_modifications_training_data.tar.gz?attredirects=0&d=1",
}

_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION, Tasks.COREFERENCE_RESOLUTION, Tasks.EVENT_EXTRACTION]
_SUPPORTED_TASKS = [
Tasks.NAMED_ENTITY_RECOGNITION,
Tasks.COREFERENCE_RESOLUTION,
Tasks.EVENT_EXTRACTION,
]

_SOURCE_VERSION = "1.0.0"

@@ -119,7 +125,9 @@ def _info(self) -> datasets.DatasetInfo:
"events": [ # E line in brat
{
"id": datasets.Value("string"),
"type": datasets.Value("string"), # refers to the text_bound_annotation of the trigger
"type": datasets.Value(
"string"
), # refers to the text_bound_annotation of the trigger
"trigger": datasets.Value("string"),
"arguments": [
{
@@ -183,12 +191,16 @@ def _generate_examples(self, data_dir) -> Tuple[int, Dict]:
if filename.endswith(".txt"):
txt_file_path = Path(dirpath, filename)
if self.config.schema == "source":
example = parsing.parse_brat_file(txt_file_path, annotation_file_suffixes=[".a1", ".a2"])
example = parsing.parse_brat_file(
txt_file_path, annotation_file_suffixes=[".a1", ".a2"]
)
example["id"] = str(guid)
for key in ["attributes", "normalizations"]:
del example[key]
yield guid, example
elif self.config.schema == "bigbio_kb":
example = parsing.brat_parse_to_bigbio_kb(parsing.parse_brat_file(txt_file_path))
example = parsing.brat_parse_to_bigbio_kb(
parsing.parse_brat_file(txt_file_path)
)
example["id"] = str(guid)
yield guid, example
2 changes: 2 additions & 0 deletions bigbio/biodatasets/medical_data/medical_data.py
@@ -42,6 +42,8 @@

_LICENSE = ""

_PUBMED = False

_URLS = {}

_SUPPORTED_TASKS = [Tasks.TEXTUAL_ENTAILMENT]
@@ -78,6 +78,8 @@
}
"""

_PUBMED = False

_DATASETNAME = "n2c2_2014_risk_factors"

_DESCRIPTION = """\
@@ -268,4 +270,4 @@ def _read_task2_file(self, file_object, file_name):
risk_factors.append(risk_factor)

document = {"document_id": file_name, "text": text, "cardiac_risk_factors": risk_factors}
return document