Skip to content

Commit

Permalink
Template for Alzheimer's Disease extractions from literature; expose …
Browse files Browse the repository at this point in the history
…the `max-text-length` option for pubmed_annotate (#392)
  • Loading branch information
caufieldjh authored Jun 3, 2024
2 parents c1ef136 + d42001e commit 886c4db
Show file tree
Hide file tree
Showing 4 changed files with 275 additions and 4 deletions.
8 changes: 7 additions & 1 deletion src/ontogpt/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -563,9 +563,14 @@ def pubmed_extract(model, pmid, template, output, output_format, get_pmc, show_p
default=False,
help="Attempt to parse PubMed Central full text(s) instead of abstract(s) alone.",
)
@click.option(
"--max-text-length",
default=3000,
help="Maximum text length for each input chunk. Dependent on context size of model used."
)
@click.argument("search")
def pubmed_annotate(
model, search, template, output, output_format, limit, get_pmc, show_prompt, **kwargs
model, search, template, output, output_format, limit, get_pmc, show_prompt, max_text_length, **kwargs
):
"""Retrieve a collection of PubMed IDs for a search term; annotate them using a template.
Expand Down Expand Up @@ -595,6 +600,7 @@ def pubmed_annotate(

pubmed_annotate_limit = limit
pmc = PubmedClient()
pmc.max_text_length = max_text_length
pmids = pmc.get_pmids(search)
if get_pmc:
logging.info("Will try to retrieve PubMed Central texts.")
Expand Down
7 changes: 4 additions & 3 deletions src/ontogpt/clients/pubmed_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,10 @@ class PubmedClient:
This class is a wrapper around the Entrez API.
"""

# TODO: this doesn't need to be hardcoded
# and may vary based on the model in use
max_text_length = 10000
# The maximum length of text, in characters, to include in
# a single input chunk. This may be set from the CLI
# with the --max-text-length option.
max_text_length: int = 10000

try:
email = get_apikey_value("ncbi-email")
Expand Down
168 changes: 168 additions & 0 deletions src/ontogpt/templates/alz_treat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
from __future__ import annotations
from datetime import (
datetime,
date
)
from decimal import Decimal
from enum import Enum
import re
import sys
from typing import (
Any,
List,
Literal,
Dict,
Optional,
Union
)
from pydantic.version import VERSION as PYDANTIC_VERSION
if int(PYDANTIC_VERSION[0])>=2:
from pydantic import (
BaseModel,
ConfigDict,
Field,
field_validator
)
else:
from pydantic import (
BaseModel,
Field,
validator
)

# Module-level version markers. Both hold the literal string "None",
# not the ``None`` singleton.
metamodel_version = "None"
version = "None"


class ConfiguredBaseModel(BaseModel):
    # Common base class: every other model in this module derives from it
    # and therefore shares the pydantic configuration declared here.
    # NOTE(review): ConfigDict is only imported on the pydantic >= 2 branch of
    # the import guard at the top of the module — confirm pydantic 1.x is not
    # expected to work with this file.
    model_config = ConfigDict(
        validate_assignment=True,
        validate_default=True,
        extra="forbid",
        arbitrary_types_allowed=True,
        use_enum_values=True,
        strict=False,
    )


class NullDataOptions(str, Enum):
    # String-valued enumeration: each member's value is identical to its
    # name, and the ``str`` mixin lets members compare equal to plain strings.
    UNSPECIFIED_METHOD_OF_ADMINISTRATION = "UNSPECIFIED_METHOD_OF_ADMINISTRATION"
    NOT_APPLICABLE = "NOT_APPLICABLE"
    NOT_MENTIONED = "NOT_MENTIONED"


class ExtractionResult(ConfiguredBaseModel):
    """
    A result of extracting knowledge on text
    """
    # Every field is optional: the scalar fields default to None and
    # ``named_entities`` defaults to a fresh empty list.
    input_id: Optional[str] = Field(default=None)
    input_title: Optional[str] = Field(default=None)
    input_text: Optional[str] = Field(default=None)
    raw_completion_output: Optional[str] = Field(default=None)
    prompt: Optional[str] = Field(default=None)
    extracted_object: Optional[Any] = Field(
        default=None,
        description="The complex objects extracted from the text",
    )
    named_entities: Optional[List[Any]] = Field(
        default_factory=list,
        description="Named entities extracted from the text",
    )


class NamedEntity(ConfiguredBaseModel):
    # Entity carrying a required ``id`` and an optional ``label``.
    id: str = Field(
        ...,
        description="A unique identifier for the named entity",
    )
    label: Optional[str] = Field(
        default=None,
        description="The label (name) of the named thing",
    )


class CompoundExpression(ConfiguredBaseModel):
    # Field-less base class; Triple and DocumentSection derive from it.
    pass


class Triple(CompoundExpression):
    """
    Abstract parent for Relation Extraction tasks
    """
    # Core statement parts; all are optional strings defaulting to None.
    subject: Optional[str] = Field(default=None)
    predicate: Optional[str] = Field(default=None)
    object: Optional[str] = Field(default=None)
    # Optional qualifiers on the whole statement, the subject and the object.
    qualifier: Optional[str] = Field(
        default=None,
        description='A qualifier for the statements, e.g. "NOT" for negation',
    )
    subject_qualifier: Optional[str] = Field(
        default=None,
        description=(
            "An optional qualifier or modifier for the subject of the "
            'statement, e.g. "high dose" or "intravenously administered"'
        ),
    )
    object_qualifier: Optional[str] = Field(
        default=None,
        description=(
            "An optional qualifier or modifier for the object of the "
            'statement, e.g. "severe" or "with additional complications"'
        ),
    )


class TextWithTriples(ConfiguredBaseModel):
    """
    A text containing one or more relations of the Triple type.
    """
    # ``Publication`` is declared further down the module, so this
    # annotation is a forward reference.
    publication: Optional[Publication] = Field(default=None)
    triples: Optional[List[Triple]] = Field(default_factory=list)


class TextWithEntity(ConfiguredBaseModel):
    """
    A text containing one or more instances of a single type of entity.
    """
    # ``Publication`` is declared further down the module, so this
    # annotation is a forward reference.
    publication: Optional[Publication] = Field(default=None)
    entities: Optional[List[str]] = Field(default_factory=list)


class RelationshipType(NamedEntity):
    # Re-declares ``id`` and ``label`` with the same types, defaults and
    # descriptions that NamedEntity gives them; no further fields are added.
    id: str = Field(
        ...,
        description="A unique identifier for the named entity",
    )
    label: Optional[str] = Field(
        default=None,
        description="The label (name) of the named thing",
    )


class Publication(ConfiguredBaseModel):
    # All fields are optional strings defaulting to None; ``combined_text``
    # is the only one declared without a description.
    id: Optional[str] = Field(
        default=None,
        description="The publication identifier",
    )
    title: Optional[str] = Field(
        default=None,
        description="The title of the publication",
    )
    abstract: Optional[str] = Field(
        default=None,
        description="The abstract of the publication",
    )
    combined_text: Optional[str] = Field(default=None)
    full_text: Optional[str] = Field(
        default=None,
        description="The full text of the publication",
    )


class AnnotatorResult(ConfiguredBaseModel):
    # Three optional string fields, each defaulting to None.
    subject_text: Optional[str] = Field(default=None)
    object_id: Optional[str] = Field(default=None)
    object_text: Optional[str] = Field(default=None)


class Document(NamedEntity):
    # Adds ``sections`` on top of NamedEntity and re-declares ``id`` and
    # ``label``; the declaration order of the original is kept.
    # ``DocumentSection`` is declared further down the module, so the
    # annotation below is a forward reference.
    sections: Optional[List[DocumentSection]] = Field(
        default_factory=list,
        description=(
            "A semicolon-separated list of full sections of the document. "
            "If semicolons are present in the section text, they should be "
            "replaced with (SEMICOLON) to avoid parsing errors. "
            "A section is a major division of the document, such as an "
            "abstract, introduction, methods, results, discussion, or "
            "conclusion, or a subsection of one of these. "
            "The text should include the section title. "
            "A single phrase or ID is not a section. "
            "Do not format in Markdown."
        ),
    )
    id: str = Field(
        ...,
        description="A unique identifier for the named entity",
    )
    label: Optional[str] = Field(
        default=None,
        description="The label (name) of the named thing",
    )


class DocumentSection(CompoundExpression):
    # One optional summary string plus three lists of strings (symptoms,
    # diagnostics, treatments), each defaulting to an empty list.
    summary: Optional[str] = Field(
        default=None,
        description=(
            "A brief summary of the section, suitable for display in a table "
            "of contents or search results. This should be a single sentence "
            "or phrase, not a full paragraph. Do not format in Markdown."
        ),
    )
    symptoms: Optional[List[str]] = Field(
        default_factory=list,
        description=(
            "A semicolon-separated list of symptoms mentioned in the section."
        ),
    )
    diagnostics: Optional[List[str]] = Field(
        default_factory=list,
        description=(
            "A semicolon-separated list of diagnostic procedures mentioned "
            "in the section."
        ),
    )
    treatments: Optional[List[str]] = Field(
        default_factory=list,
        description=(
            "A semicolon-separated list of treatments mentioned in the "
            "section. These may be drugs or other therapeutic procedures."
        ),
    )


class Symptom(NamedEntity):
    # Re-declares ``id`` and ``label`` exactly as NamedEntity defines them;
    # no further fields are added.
    id: str = Field(
        ...,
        description="A unique identifier for the named entity",
    )
    label: Optional[str] = Field(
        default=None,
        description="The label (name) of the named thing",
    )


class Diagnostic(NamedEntity):
    # Re-declares ``id`` and ``label`` exactly as NamedEntity defines them;
    # no further fields are added.
    id: str = Field(
        ...,
        description="A unique identifier for the named entity",
    )
    label: Optional[str] = Field(
        default=None,
        description="The label (name) of the named thing",
    )


class Treatment(NamedEntity):
    # Re-declares ``id`` and ``label`` exactly as NamedEntity defines them;
    # no further fields are added.
    id: str = Field(
        ...,
        description="A unique identifier for the named entity",
    )
    label: Optional[str] = Field(
        default=None,
        description="The label (name) of the named thing",
    )


# Model rebuild
# Runs once every class above exists; some annotations name classes that are
# declared later in the module (Publication, DocumentSection).
# see https://pydantic-docs.helpmanual.io/usage/models/#rebuilding-a-model
for _model_cls in (
    ExtractionResult,
    NamedEntity,
    CompoundExpression,
    Triple,
    TextWithTriples,
    TextWithEntity,
    RelationshipType,
    Publication,
    AnnotatorResult,
    Document,
    DocumentSection,
    Symptom,
    Diagnostic,
    Treatment,
):
    _model_cls.model_rebuild()
# Do not leave the loop variable behind in the module namespace.
del _model_cls

96 changes: 96 additions & 0 deletions src/ontogpt/templates/alz_treat.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# Schema for the alz_treat extraction template.
id: http://w3id.org/ontogpt/alz_treat
name: alz_treat
title: Template for extracting Alzheimer's Disease Treatments
description: >-
  Template for extracting Alzheimer's disease treatments and related concepts.
  Assumes a large input text, on the order of a full scientific article or
  review. Try with this review - PMID:33302541
license: https://creativecommons.org/publicdomain/zero/1.0/
# NOTE(review): the alz_treat prefix URI below has no trailing "/" or "#",
# unlike the rdf and linkml prefixes — confirm that prefix expansion yields
# the intended URIs.
prefixes:
  rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
  alz_treat: http://w3id.org/ontogpt/alz_treat
  linkml: https://w3id.org/linkml/

default_prefix: alz_treat
default_range: string

# NamedEntity and CompoundExpression are not defined in this file;
# presumably they come from the `core` import — TODO confirm.
imports:
  - linkml:types
  - core

classes:
  # Root class (tree_root: true): a document made up of sections.
  Document:
    tree_root: true
    is_a: NamedEntity
    attributes:
      sections:
        range: DocumentSection
        multivalued: true
        description: >-
          A semicolon-separated list of full sections of the document.
          If semicolons are present in the section text, they should
          be replaced with (SEMICOLON) to avoid parsing errors.
          A section is a major division of the document, such as an abstract,
          introduction, methods, results, discussion, or conclusion,
          or a subsection of one of these. The text should include the section
          title. A single phrase or ID is not a section.
          Do not format in Markdown.
  # One section of a Document: a string summary plus multivalued references
  # to the Symptom, Diagnostic and Treatment classes defined below.
  DocumentSection:
    is_a: CompoundExpression
    attributes:
      summary:
        range: string
        description: >-
          A brief summary of the section, suitable for display in a table of
          contents or search results. This should be a single sentence or
          phrase, not a full paragraph. Do not format in Markdown.
      symptoms:
        range: Symptom
        multivalued: true
        description: >-
          A semicolon-separated list of symptoms mentioned in the section.
      diagnostics:
        range: Diagnostic
        multivalued: true
        description: >-
          A semicolon-separated list of diagnostic procedures mentioned in the
          section.
      treatments:
        range: Treatment
        multivalued: true
        description: >-
          A semicolon-separated list of treatments mentioned in the section.
          These may be drugs or other therapeutic procedures.
  # Entity classes. Each declares its accepted identifier prefixes, a
  # comma-separated list of annotators, and a prompt describing the entity.
  Symptom:
    is_a: NamedEntity
    id_prefixes:
      - HP
    annotations:
      annotators: sqlite:obo:hp, sqlite:obo:mondo, sqlite:obo:mesh, sqlite:obo:ncit
      prompt: >-
        the name of a human phenotype or symptom.
        Examples are ascites, fever, pain, seizure, increased intracranial
        pressure, lactic acidosis.
  Diagnostic:
    is_a: NamedEntity
    id_prefixes:
      - MAXO
    annotations:
      annotators: sqlite:obo:maxo, sqlite:obo:mesh, bioportal:SNOMEDCT, sqlite:obo:ncit
      prompt: >-
        the name of a diagnostic procedure or test.
        Examples are MRI, PET scan, lumbar puncture, blood test, biopsy.
  Treatment:
    is_a: NamedEntity
    id_prefixes:
      - DRUGBANK
      - MAXO
    annotations:
      annotators: sqlite:obo:drugbank, sqlite:obo:maxo, sqlite:obo:mesh, sqlite:obo:ncit
      prompt: >-
        the name of a drug or therapeutic procedure.
        Examples are aspirin, chemotherapy, radiation therapy, surgery.

0 comments on commit 886c4db

Please sign in to comment.