From f2f10d63d56bb67044f225d674415b0b895efa39 Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Mon, 3 Jun 2024 13:16:09 -0400 Subject: [PATCH 1/3] Add alzheimers info extraction template --- src/ontogpt/templates/alz_treat.py | 169 +++++++++++++++++++++++++++ src/ontogpt/templates/alz_treat.yaml | 85 ++++++++++++++ 2 files changed, 254 insertions(+) create mode 100644 src/ontogpt/templates/alz_treat.py create mode 100644 src/ontogpt/templates/alz_treat.yaml diff --git a/src/ontogpt/templates/alz_treat.py b/src/ontogpt/templates/alz_treat.py new file mode 100644 index 000000000..1a2e4ff1e --- /dev/null +++ b/src/ontogpt/templates/alz_treat.py @@ -0,0 +1,169 @@ +from __future__ import annotations +from datetime import ( + datetime, + date +) +from decimal import Decimal +from enum import Enum +import re +import sys +from typing import ( + Any, + List, + Literal, + Dict, + Optional, + Union +) +from pydantic.version import VERSION as PYDANTIC_VERSION +if int(PYDANTIC_VERSION[0])>=2: + from pydantic import ( + BaseModel, + ConfigDict, + Field, + field_validator + ) +else: + from pydantic import ( + BaseModel, + Field, + validator + ) + +metamodel_version = "None" +version = "None" + + +class ConfiguredBaseModel(BaseModel): + model_config = ConfigDict( + validate_assignment = True, + validate_default = True, + extra = "forbid", + arbitrary_types_allowed = True, + use_enum_values = True, + strict = False, + ) + pass + + +class NullDataOptions(str, Enum): + UNSPECIFIED_METHOD_OF_ADMINISTRATION = "UNSPECIFIED_METHOD_OF_ADMINISTRATION" + NOT_APPLICABLE = "NOT_APPLICABLE" + NOT_MENTIONED = "NOT_MENTIONED" + + +class ExtractionResult(ConfiguredBaseModel): + """ + A result of extracting knowledge on text + """ + input_id: Optional[str] = Field(None) + input_title: Optional[str] = Field(None) + input_text: Optional[str] = Field(None) + raw_completion_output: Optional[str] = Field(None) + prompt: Optional[str] = Field(None) + extracted_object: Optional[Any] = Field(None, description="""The complex objects extracted from the text""") + named_entities: Optional[List[Any]] = Field(default_factory=list, description="""Named entities extracted from the text""") + + +class NamedEntity(ConfiguredBaseModel): + id: str = Field(..., description="""A unique identifier for the named entity""") + label: Optional[str] = Field(None, description="""The label (name) of the named thing""") + + +class CompoundExpression(ConfiguredBaseModel): + pass + + +class Triple(CompoundExpression): + """ + Abstract parent for Relation Extraction tasks + """ + subject: Optional[str] = Field(None) + predicate: Optional[str] = Field(None) + object: Optional[str] = Field(None) + qualifier: Optional[str] = Field(None, description="""A qualifier for the statements, e.g. \"NOT\" for negation""") + subject_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the subject of the statement, e.g. \"high dose\" or \"intravenously administered\"""") + object_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the object of the statement, e.g. \"severe\" or \"with additional complications\"""") + + +class TextWithTriples(ConfiguredBaseModel): + """ + A text containing one or more relations of the Triple type. + """ + publication: Optional[Publication] = Field(None) + triples: Optional[List[Triple]] = Field(default_factory=list) + + +class TextWithEntity(ConfiguredBaseModel): + """ + A text containing one or more instances of a single type of entity. + """ + publication: Optional[Publication] = Field(None) + entities: Optional[List[str]] = Field(default_factory=list) + + +class RelationshipType(NamedEntity): + id: str = Field(..., description="""A unique identifier for the named entity""") + label: Optional[str] = Field(None, description="""The label (name) of the named thing""") + + +class Publication(ConfiguredBaseModel): + id: Optional[str] = Field(None, description="""The publication identifier""") + title: Optional[str] = Field(None, description="""The title of the publication""") + abstract: Optional[str] = Field(None, description="""The abstract of the publication""") + combined_text: Optional[str] = Field(None) + full_text: Optional[str] = Field(None, description="""The full text of the publication""") + + +class AnnotatorResult(ConfiguredBaseModel): + subject_text: Optional[str] = Field(None) + object_id: Optional[str] = Field(None) + object_text: Optional[str] = Field(None) + + +class Document(NamedEntity): + sections: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of full sections of the document. If semicolons are present in the section text, they should be replaced with (SEMICOLON) to avoid parsing errors.""") + id: str = Field(..., description="""A unique identifier for the named entity""") + label: Optional[str] = Field(None, description="""The label (name) of the named thing""") + + +class DocumentSection(NamedEntity): + symptoms: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of symptoms mentioned in the section.""") + diagnostics: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of diagnostic procedures mentioned in the section.""") + treatments: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of treatments mentioned in the section. These may be drugs or other therapeutic procedures.""") + id: str = Field(..., description="""A unique identifier for the named entity""") + label: Optional[str] = Field(None, description="""The label (name) of the named thing""") + + +class Symptom(NamedEntity): + id: str = Field(..., description="""A unique identifier for the named entity""") + label: Optional[str] = Field(None, description="""The label (name) of the named thing""") + + +class Diagnostic(NamedEntity): + id: str = Field(..., description="""A unique identifier for the named entity""") + label: Optional[str] = Field(None, description="""The label (name) of the named thing""") + + +class Treatment(NamedEntity): + id: str = Field(..., description="""A unique identifier for the named entity""") + label: Optional[str] = Field(None, description="""The label (name) of the named thing""") + + +# Model rebuild +# see https://pydantic-docs.helpmanual.io/usage/models/#rebuilding-a-model +ExtractionResult.model_rebuild() +NamedEntity.model_rebuild() +CompoundExpression.model_rebuild() +Triple.model_rebuild() +TextWithTriples.model_rebuild() +TextWithEntity.model_rebuild() +RelationshipType.model_rebuild() +Publication.model_rebuild() +AnnotatorResult.model_rebuild() +Document.model_rebuild() +DocumentSection.model_rebuild() +Symptom.model_rebuild() +Diagnostic.model_rebuild() +Treatment.model_rebuild() + diff --git a/src/ontogpt/templates/alz_treat.yaml b/src/ontogpt/templates/alz_treat.yaml new file mode 100644 index 000000000..baf61a53d --- /dev/null +++ b/src/ontogpt/templates/alz_treat.yaml @@ -0,0 +1,85 @@ +id: http://w3id.org/ontogpt/alz_treat +name: alz_treat +title: Template for extracting Alzheimer's Disease Treatments +description: >- + Template for extracting Alzheimer's disease treatments and related concepts. + Assumes a large input text, on the order of a full scientific article or + review. +license: https://creativecommons.org/publicdomain/zero/1.0/ +prefixes: + rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns# + alz_treat: http://w3id.org/ontogpt/alz_treat + linkml: https://w3id.org/linkml/ + +default_prefix: alz_treat +default_range: string + +imports: + - linkml:types + - core + +classes: + Document: + tree_root: true + is_a: NamedEntity + attributes: + sections: + range: DocumentSection + multivalued: true + description: >- + A semicolon-separated list of full sections of the document. + If semicolons are present in the section text, they should + be replaced with (SEMICOLON) to avoid parsing errors. + + DocumentSection: + is_a: NamedEntity + attributes: + symptoms: + range: Symptom + multivalued: true + description: >- + A semicolon-separated list of symptoms mentioned in the section. + diagnostics: + range: Diagnostic + multivalued: true + description: >- + A semicolon-separated list of diagnostic procedures mentioned in the + section. + treatments: + range: Treatment + multivalued: true + description: >- + A semicolon-separated list of treatments mentioned in the section. + These may be drugs or other therapeutic procedures. + + Symptom: + is_a: NamedEntity + id_prefixes: + - HP + annotations: + annotators: sqlite:obo:hp, sqlite:obo:mondo, sqlite:obo:mesh, sqlite:obo:ncit + prompt: >- + the name of a human phenotype or symptom. + Examples are ascites, fever, pain, seizure, increased intracranial + pressure, lactic acidosis. + + Diagnostic: + is_a: NamedEntity + id_prefixes: + - MAXO + annotations: + annotators: sqlite:obo:maxo, sqlite:obo:mesh, sqlite:obo:snomedct, sqlite:obo:ncit + prompt: >- + the name of a diagnostic procedure or test. + Examples are MRI, PET scan, lumbar puncture, blood test, biopsy. + + Treatment: + is_a: NamedEntity + id_prefixes: + - DRUGBANK + - MAXO + annotations: + annotators: sqlite:obo:drugbank, sqlite:obo:maxo, sqlite:obo:mesh, sqlite:obo:ncit + prompt: >- + the name of a drug or therapeutic procedure. + Examples are aspirin, chemotherapy, radiation therapy, surgery. From ec92b2bdadec02cbfc9af429227cad58e5446c4c Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Mon, 3 Jun 2024 14:08:26 -0400 Subject: [PATCH 2/3] Enable passing max_text_length as arg --- src/ontogpt/cli.py | 8 +++++++- src/ontogpt/clients/pubmed_client.py | 7 ++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/ontogpt/cli.py b/src/ontogpt/cli.py index ac10d1b1e..db4f3a1e0 100644 --- a/src/ontogpt/cli.py +++ b/src/ontogpt/cli.py @@ -563,9 +563,14 @@ def pubmed_extract(model, pmid, template, output, output_format, get_pmc, show_p default=False, help="Attempt to parse PubMed Central full text(s) instead of abstract(s) alone.", ) +@click.option( + "--max-text-length", + default=3000, + help="Maximum text length for each input chunk. Dependent on context size of model used." +) @click.argument("search") def pubmed_annotate( - model, search, template, output, output_format, limit, get_pmc, show_prompt, **kwargs + model, search, template, output, output_format, limit, get_pmc, show_prompt, max_text_length, **kwargs ): """Retrieve a collection of PubMed IDs for a search term; annotate them using a template. @@ -595,6 +600,7 @@ def pubmed_annotate( pubmed_annotate_limit = limit pmc = PubmedClient() + pmc.max_text_length = max_text_length pmids = pmc.get_pmids(search) if get_pmc: logging.info("Will try to retrieve PubMed Central texts.") diff --git a/src/ontogpt/clients/pubmed_client.py b/src/ontogpt/clients/pubmed_client.py index fb3acce81..d02fe6404 100644 --- a/src/ontogpt/clients/pubmed_client.py +++ b/src/ontogpt/clients/pubmed_client.py @@ -80,9 +80,10 @@ class PubmedClient: This class is a wrapper around the Entrez API. """ - # TODO: this doesn't need to be hardcoded - # and may vary based on the model in use - max_text_length = 10000 + # The maximum length of text, in characters, to include in + # a single input chunk. This may be set in the CLI + # with the max_text_length option. + max_text_length: int = 10000 try: email = get_apikey_value("ncbi-email") From d42001eefbea4904f2a62dd7c0aecfe81ac7302c Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Mon, 3 Jun 2024 14:08:38 -0400 Subject: [PATCH 3/3] Update alz template --- src/ontogpt/templates/alz_treat.py | 7 +++---- src/ontogpt/templates/alz_treat.yaml | 17 ++++++++++++++--- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/src/ontogpt/templates/alz_treat.py b/src/ontogpt/templates/alz_treat.py index 1a2e4ff1e..e12808967 100644 --- a/src/ontogpt/templates/alz_treat.py +++ b/src/ontogpt/templates/alz_treat.py @@ -122,17 +122,16 @@ class AnnotatorResult(ConfiguredBaseModel): class Document(NamedEntity): - sections: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of full sections of the document. If semicolons are present in the section text, they should be replaced with (SEMICOLON) to avoid parsing errors.""") + sections: Optional[List[DocumentSection]] = Field(default_factory=list, description="""A semicolon-separated list of full sections of the document. If semicolons are present in the section text, they should be replaced with (SEMICOLON) to avoid parsing errors. A section is a major division of the document, such as an abstract, introduction, methods, results, discussion, or conclusion, or a subsection of one of these. The text should include the section title. A single phrase or ID is not a section. Do not format in Markdown.""") id: str = Field(..., description="""A unique identifier for the named entity""") label: Optional[str] = Field(None, description="""The label (name) of the named thing""") -class DocumentSection(NamedEntity): +class DocumentSection(CompoundExpression): + summary: Optional[str] = Field(None, description="""A brief summary of the section, suitable for display in a table of contents or search results. This should be a single sentence or phrase, not a full paragraph. Do not format in Markdown.""") symptoms: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of symptoms mentioned in the section.""") diagnostics: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of diagnostic procedures mentioned in the section.""") treatments: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of treatments mentioned in the section. These may be drugs or other therapeutic procedures.""") - id: str = Field(..., description="""A unique identifier for the named entity""") - label: Optional[str] = Field(None, description="""The label (name) of the named thing""") class Symptom(NamedEntity): diff --git a/src/ontogpt/templates/alz_treat.yaml b/src/ontogpt/templates/alz_treat.yaml index baf61a53d..037969e1c 100644 --- a/src/ontogpt/templates/alz_treat.yaml +++ b/src/ontogpt/templates/alz_treat.yaml @@ -4,7 +4,7 @@ title: Template for extracting Alzheimer's Disease Treatments description: >- Template for extracting Alzheimer's disease treatments and related concepts. Assumes a large input text, on the order of a full scientific article or - review. + review. Try with this review - PMID:33302541 license: https://creativecommons.org/publicdomain/zero/1.0/ prefixes: rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns# @@ -30,10 +30,21 @@ classes: A semicolon-separated list of full sections of the document. If semicolons are present in the section text, they should be replaced with (SEMICOLON) to avoid parsing errors. + A section is a major division of the document, such as an abstract, + introduction, methods, results, discussion, or conclusion, + or a subsection of one of these. The text should include the section + title. A single phrase or ID is not a section. + Do not format in Markdown. DocumentSection: - is_a: NamedEntity + is_a: CompoundExpression attributes: + summary: + range: string + description: >- + A brief summary of the section, suitable for display in a table of + contents or search results. This should be a single sentence or + phrase, not a full paragraph. Do not format in Markdown. symptoms: range: Symptom multivalued: true @@ -68,7 +79,7 @@ classes: id_prefixes: - MAXO annotations: - annotators: sqlite:obo:maxo, sqlite:obo:mesh, sqlite:obo:snomedct, sqlite:obo:ncit + annotators: sqlite:obo:maxo, sqlite:obo:mesh, bioportal:SNOMEDCT, sqlite:obo:ncit prompt: >- the name of a diagnostic procedure or test. Examples are MRI, PET scan, lumbar puncture, blood test, biopsy.