From 67494b8eaeaf2974aed8eb841660638e7bc61b7d Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Fri, 28 Jun 2024 15:13:03 -0400 Subject: [PATCH 1/9] Annotator tuning --- src/ontogpt/templates/alzrd.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ontogpt/templates/alzrd.yaml b/src/ontogpt/templates/alzrd.yaml index 7e9d0905e..5c9f87dc4 100644 --- a/src/ontogpt/templates/alzrd.yaml +++ b/src/ontogpt/templates/alzrd.yaml @@ -155,9 +155,9 @@ classes: is_a: NamedEntity id_prefixes: - MAXO - - SNOMEDCT + - MESH annotations: - annotators: sqlite:obo:maxo, sqlite:obo:mesh, bioportal:SNOMEDCT, sqlite:obo:ncit + annotators: sqlite:obo:maxo, sqlite:obo:mesh, sqlite:obo:ncit prompt: >- The name of a diagnostic procedure or test. Examples are MRI, PET scan, lumbar puncture, blood test, biopsy. From b0289c51ac8bfc5e510fd8a2545909c582ac87ac Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Fri, 28 Jun 2024 15:15:32 -0400 Subject: [PATCH 2/9] Return NOT FOUND instead of NONE --- src/ontogpt/templates/alzrd.py | 12 ++++++------ src/ontogpt/templates/alzrd.yaml | 12 +++++++----- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/ontogpt/templates/alzrd.py b/src/ontogpt/templates/alzrd.py index 3f18cb697..c1a79b571 100644 --- a/src/ontogpt/templates/alzrd.py +++ b/src/ontogpt/templates/alzrd.py @@ -128,14 +128,14 @@ class Document(NamedEntity): class DocumentSection(CompoundExpression): - taxon: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of taxa or species of organisms mentioned in the section. Where possible, translate to the binomial species name (e.g., change \"mouse\" to \"Mus musculus\"), unless a different species name is provided in the text.""") + taxon: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of taxa or species of organisms mentioned in the section. 
Where possible, translate to the binomial species name (e.g., change \"mouse\" to \"Mus musculus\"), unless a different species name is provided in the text. If no taxon is mentioned, return NOT FOUND.""") part_of: Optional[str] = Field(None, description="""The major document division that this section is a part of. Examples are \"introduction\", \"methods\", \"results\", \"discussion\", or \"conclusions\". Do not format in Markdown.""") summary: Optional[str] = Field(None, description="""A brief summary of the section, suitable for display in a table of contents or search results. This should be a single sentence or phrase, not a full paragraph. Do not format in Markdown.""") - diagnostics: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of diagnostic procedures mentioned in the section. If no diagnostic procedures are mentioned, return NONE.""") - diseases: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of diseases or conditions mentioned in the section. If no diseases are mentioned, return NONE.""") - chemical: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of chemicals, drugs, or other substances mentioned in the section. If no chemicals are mentioned, return NONE.""") - environmental_exposures: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of environmental exposures mentioned in the section. These may include exposure to general classes of materials, e.g., \"exposure to pesticides\", or other phenomena, e.g., \"chronic stress\". If no environmental exposures are mentioned, return NONE.""") - experimental_metrics_and_indicators: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of of a experimental metrics, signs, symptoms, or outcomes used to measure the progression of Alzheimer's disease and related dementias. 
These may be quantitative or qualitative measures, including biomolecular assays. In experimental animal models these are analogues of cognitive impairment or indicators of disease progression modeling those observed in humans. Examples are Amyloid beta (Aβ) levels, Morris water maze test, tau phosphorylation, neurofibrillary tangles, and cognitive decline. If no experimental metrics are mentioned, return NONE.""") + diagnostics: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of diagnostic procedures mentioned in the section. If no diagnostic procedures are mentioned, return NOT FOUND.""") + diseases: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of diseases or conditions mentioned in the section. If no diseases are mentioned, return NOT FOUND.""") + chemical: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of chemicals, drugs, or other substances mentioned in the section. If no chemicals are mentioned, return NOT FOUND.""") + environmental_exposures: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of environmental exposures mentioned in the section. These may include exposure to general classes of materials, e.g., \"exposure to pesticides\", or other phenomena, e.g., \"chronic stress\". If no environmental exposures are mentioned, return NOT FOUND.""") + experimental_metrics_and_indicators: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of of a experimental metrics, signs, symptoms, or outcomes used to measure the progression of Alzheimer's disease and related dementias. These may be quantitative or qualitative measures, including biomolecular assays. In experimental animal models these are analogues of cognitive impairment or indicators of disease progression modeling those observed in humans. 
Examples are Amyloid beta (Aβ) levels, Morris water maze test, tau phosphorylation, neurofibrillary tangles, and cognitive decline. If no experimental metrics are mentioned, return NOT FOUND.""") experimental_metrics_to_taxon_relationships: Optional[List[ExperimentalMetricToTaxonRelationship]] = Field(default_factory=list, description="""Semicolon-separated list of relationships between a specific experimental metric, sign, symptom, or outcome and a taxon, where the relationship is used to measure progression of Alzheimer's disease and related dementias, or an experimental analogue, in the taxon. For example, \"Amyloid beta (Aβ) levels are measured in Mus musculus\" or \"Morris water maze test is measured with Rattus norvegicus\".""") experimental_metric_to_disease_relationships: Optional[List[ExperimentalMetricToDiseaseRelationship]] = Field(default_factory=list, description="""Semicolon-separated list of relationships between a specific experimental metric, sign, symptom, or outcome and a disease or condition, where the relationship is used as an experimental model of progression or presence of a disease. For example, \"Amyloid beta (Aβ) levels are used to model Alzheimer's disease\" or \"Morris water maze test is used to model Parkinson's disease\".""") diff --git a/src/ontogpt/templates/alzrd.yaml b/src/ontogpt/templates/alzrd.yaml index 5c9f87dc4..fb2d0e80c 100644 --- a/src/ontogpt/templates/alzrd.yaml +++ b/src/ontogpt/templates/alzrd.yaml @@ -53,6 +53,7 @@ classes: mentioned in the section. Where possible, translate to the binomial species name (e.g., change "mouse" to "Mus musculus"), unless a different species name is provided in the text. + If no taxon is mentioned, return NOT FOUND. part_of: range: string description: >- @@ -70,19 +71,20 @@ classes: multivalued: true description: >- A semicolon-separated list of diagnostic procedures mentioned in the - section. If no diagnostic procedures are mentioned, return NONE. + section. 
If no diagnostic procedures are mentioned, return NOT FOUND. diseases: range: Disease multivalued: true description: >- A semicolon-separated list of diseases or conditions mentioned in the - section. If no diseases are mentioned, return NONE. + section. If no diseases are mentioned, return NOT FOUND. chemical: range: Chemical multivalued: true description: >- A semicolon-separated list of chemicals, drugs, or other substances - mentioned in the section. If no chemicals are mentioned, return NONE. + mentioned in the section. If no chemicals are mentioned, return NOT + FOUND. environmental_exposures: range: EnvironmentalExposure multivalued: true @@ -91,7 +93,7 @@ classes: the section. These may include exposure to general classes of materials, e.g., "exposure to pesticides", or other phenomena, e.g., "chronic stress". If no environmental exposures are mentioned, - return NONE. + return NOT FOUND. experimental_metrics_and_indicators: range: MetricOrIndicator multivalued: true @@ -104,7 +106,7 @@ classes: or indicators of disease progression modeling those observed in humans. Examples are Amyloid beta (Aβ) levels, Morris water maze test, tau phosphorylation, neurofibrillary tangles, and cognitive decline. - If no experimental metrics are mentioned, return NONE. + If no experimental metrics are mentioned, return NOT FOUND. 
experimental_metrics_to_taxon_relationships: description: >- Semicolon-separated list of relationships between From 4c529872c66493a379aa3079313d4716753e5f53 Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Fri, 28 Jun 2024 15:21:14 -0400 Subject: [PATCH 3/9] Prompt tuning for section extraction --- src/ontogpt/templates/alzrd.py | 2 +- src/ontogpt/templates/alzrd.yaml | 15 ++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/ontogpt/templates/alzrd.py b/src/ontogpt/templates/alzrd.py index c1a79b571..8199f6c46 100644 --- a/src/ontogpt/templates/alzrd.py +++ b/src/ontogpt/templates/alzrd.py @@ -122,7 +122,7 @@ class AnnotatorResult(ConfiguredBaseModel): class Document(NamedEntity): - sections: Optional[List[DocumentSection]] = Field(default_factory=list, description="""A semicolon-separated list of full sections of the document. If semicolons are present in the section text, they should be replaced with (SEMICOLON) to avoid parsing errors. A section is a major division of the document, such as an abstract, introduction, methods, results, discussion, or conclusion, or a subsection of one of these. The text should include the section title. A single phrase or ID is not a section. Do not format in Markdown.""") + sections: Optional[List[DocumentSection]] = Field(default_factory=list, description="""A semicolon-separated list of full sections of the document, including the full text of that section alone, beginning with the major division of the document, such as ABSTRACT, INTRODUCTION, METHODS, RESULTS, DISCUSSION, CONCLUSIONS, or a similar heading used by the text. The text should include the section title. If semicolons are present in the section text, they must be replaced with (SEMICOLON) to avoid parsing errors. A single phrase or ID is not a section. 
Do not format in Markdown.""") id: str = Field(..., description="""A unique identifier for the named entity""") label: Optional[str] = Field(None, description="""The label (name) of the named thing""") diff --git a/src/ontogpt/templates/alzrd.yaml b/src/ontogpt/templates/alzrd.yaml index fb2d0e80c..9997cf0e8 100644 --- a/src/ontogpt/templates/alzrd.yaml +++ b/src/ontogpt/templates/alzrd.yaml @@ -31,13 +31,14 @@ classes: range: DocumentSection multivalued: true description: >- - A semicolon-separated list of full sections of the document. - If semicolons are present in the section text, they should - be replaced with (SEMICOLON) to avoid parsing errors. - A section is a major division of the document, such as an abstract, - introduction, methods, results, discussion, or conclusion, - or a subsection of one of these. The text should include the section - title. A single phrase or ID is not a section. + A semicolon-separated list of full sections of the document, + including the full text of that section alone, beginning with the + major division of the document, such as ABSTRACT, + INTRODUCTION, METHODS, RESULTS, DISCUSSION, CONCLUSIONS, + or a similar heading used by the text. The text should include the + section title. If semicolons are present in the section text, they + must be replaced with (SEMICOLON) to avoid parsing errors. + A single phrase or ID is not a section. Do not format in Markdown. 
DocumentSection: From 319d9eceba025c11fbae164a2734777d77af4abe Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Fri, 28 Jun 2024 16:08:40 -0400 Subject: [PATCH 4/9] Change order of slots --- src/ontogpt/templates/alzrd.py | 2 +- src/ontogpt/templates/alzrd.yaml | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/ontogpt/templates/alzrd.py b/src/ontogpt/templates/alzrd.py index 8199f6c46..36ca6c2b6 100644 --- a/src/ontogpt/templates/alzrd.py +++ b/src/ontogpt/templates/alzrd.py @@ -128,9 +128,9 @@ class Document(NamedEntity): class DocumentSection(CompoundExpression): - taxon: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of taxa or species of organisms mentioned in the section. Where possible, translate to the binomial species name (e.g., change \"mouse\" to \"Mus musculus\"), unless a different species name is provided in the text. If no taxon is mentioned, return NOT FOUND.""") part_of: Optional[str] = Field(None, description="""The major document division that this section is a part of. Examples are \"introduction\", \"methods\", \"results\", \"discussion\", or \"conclusions\". Do not format in Markdown.""") summary: Optional[str] = Field(None, description="""A brief summary of the section, suitable for display in a table of contents or search results. This should be a single sentence or phrase, not a full paragraph. Do not format in Markdown.""") + taxon: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of taxa or species of organisms mentioned in the section. Where possible, translate to the binomial species name (e.g., change \"mouse\" to \"Mus musculus\"), unless a different species name is provided in the text. If no taxon is mentioned, return NOT FOUND.""") diagnostics: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of diagnostic procedures mentioned in the section. 
If no diagnostic procedures are mentioned, return NOT FOUND.""") diseases: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of diseases or conditions mentioned in the section. If no diseases are mentioned, return NOT FOUND.""") chemical: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of chemicals, drugs, or other substances mentioned in the section. If no chemicals are mentioned, return NOT FOUND.""") diff --git a/src/ontogpt/templates/alzrd.yaml b/src/ontogpt/templates/alzrd.yaml index 9997cf0e8..7e0b57599 100644 --- a/src/ontogpt/templates/alzrd.yaml +++ b/src/ontogpt/templates/alzrd.yaml @@ -44,17 +44,6 @@ classes: DocumentSection: is_a: CompoundExpression attributes: - taxon: - range: Taxon - multivalued: true - # NOTE: Don't ask the LLM to return NONE here or it may get - # grounded to NCBITaxon:32644 (unidentified) - description: >- - A semicolon-separated list of taxa or species of organisms - mentioned in the section. Where possible, translate to the - binomial species name (e.g., change "mouse" to "Mus musculus"), - unless a different species name is provided in the text. - If no taxon is mentioned, return NOT FOUND. part_of: range: string description: >- @@ -67,6 +56,17 @@ classes: A brief summary of the section, suitable for display in a table of contents or search results. This should be a single sentence or phrase, not a full paragraph. Do not format in Markdown. + taxon: + range: Taxon + multivalued: true + # NOTE: Don't ask the LLM to return NONE here or it may get + # grounded to NCBITaxon:32644 (unidentified) + description: >- + A semicolon-separated list of taxa or species of organisms + mentioned in the section. Where possible, translate to the + binomial species name (e.g., change "mouse" to "Mus musculus"), + unless a different species name is provided in the text. + If no taxon is mentioned, return NOT FOUND. 
diagnostics: range: Diagnostic multivalued: true From 91ea70bc8ed57d51bc65b3d0d6a8f2458e0f2e7e Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Fri, 28 Jun 2024 16:11:42 -0400 Subject: [PATCH 5/9] Repair taxon range --- src/ontogpt/templates/alzrd.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ontogpt/templates/alzrd.yaml b/src/ontogpt/templates/alzrd.yaml index 7e0b57599..c178fa97a 100644 --- a/src/ontogpt/templates/alzrd.yaml +++ b/src/ontogpt/templates/alzrd.yaml @@ -230,12 +230,12 @@ classes: or indicators of disease progression modeling those observed in humans. Examples are Amyloid beta (Aβ) levels, Morris water maze test, tau phosphorylation, neurofibrillary tangles, and cognitive decline. - range: Taxon + range: MetricOrIndicator object: description: >- The taxon or species of the model organism in which the experimental metric is measured. For example, Mus musculus, Rattus norvegicus. - range: MetricOrIndicator + range: Taxon predicate: range: NamedEntity description: >- From 223e470f38fc62668cb87efd55baa2b4fa599ecf Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Fri, 28 Jun 2024 16:15:33 -0400 Subject: [PATCH 6/9] Split templates into two parsing approaches --- src/ontogpt/templates/alzrd.yaml | 3 +- src/ontogpt/templates/alzrd_section.py | 218 +++++++++++++++++ src/ontogpt/templates/alzrd_section.yaml | 294 +++++++++++++++++++++++ 3 files changed, 514 insertions(+), 1 deletion(-) create mode 100644 src/ontogpt/templates/alzrd_section.py create mode 100644 src/ontogpt/templates/alzrd_section.yaml diff --git a/src/ontogpt/templates/alzrd.yaml b/src/ontogpt/templates/alzrd.yaml index c178fa97a..b18f3af47 100644 --- a/src/ontogpt/templates/alzrd.yaml +++ b/src/ontogpt/templates/alzrd.yaml @@ -6,7 +6,8 @@ description: >- dementias along with experimental metrics and model organisms. Assumes a large input text, on the order of a full scientific article or review. 
Focus is on extracting the methods and metrics used - with different model organisms. Try with this review - PMID:33302541 + with different model organisms. This version of the template + attempts to process the entirety of the input text at once. license: https://creativecommons.org/publicdomain/zero/1.0/ prefixes: rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns# diff --git a/src/ontogpt/templates/alzrd_section.py b/src/ontogpt/templates/alzrd_section.py new file mode 100644 index 000000000..36ca6c2b6 --- /dev/null +++ b/src/ontogpt/templates/alzrd_section.py @@ -0,0 +1,218 @@ +from __future__ import annotations +from datetime import ( + datetime, + date +) +from decimal import Decimal +from enum import Enum +import re +import sys +from typing import ( + Any, + List, + Literal, + Dict, + Optional, + Union +) +from pydantic.version import VERSION as PYDANTIC_VERSION +if int(PYDANTIC_VERSION[0])>=2: + from pydantic import ( + BaseModel, + ConfigDict, + Field, + field_validator + ) +else: + from pydantic import ( + BaseModel, + Field, + validator + ) + +metamodel_version = "None" +version = "None" + + +class ConfiguredBaseModel(BaseModel): + model_config = ConfigDict( + validate_assignment = True, + validate_default = True, + extra = "forbid", + arbitrary_types_allowed = True, + use_enum_values = True, + strict = False, + ) + pass + + +class NullDataOptions(str, Enum): + UNSPECIFIED_METHOD_OF_ADMINISTRATION = "UNSPECIFIED_METHOD_OF_ADMINISTRATION" + NOT_APPLICABLE = "NOT_APPLICABLE" + NOT_MENTIONED = "NOT_MENTIONED" + + +class ExtractionResult(ConfiguredBaseModel): + """ + A result of extracting knowledge on text + """ + input_id: Optional[str] = Field(None) + input_title: Optional[str] = Field(None) + input_text: Optional[str] = Field(None) + raw_completion_output: Optional[str] = Field(None) + prompt: Optional[str] = Field(None) + extracted_object: Optional[Any] = Field(None, description="""The complex objects extracted from the text""") + named_entities: 
Optional[List[Any]] = Field(default_factory=list, description="""Named entities extracted from the text""") + + +class NamedEntity(ConfiguredBaseModel): + id: str = Field(..., description="""A unique identifier for the named entity""") + label: Optional[str] = Field(None, description="""The label (name) of the named thing""") + + +class CompoundExpression(ConfiguredBaseModel): + pass + + +class Triple(CompoundExpression): + """ + Abstract parent for Relation Extraction tasks + """ + subject: Optional[str] = Field(None) + predicate: Optional[str] = Field(None) + object: Optional[str] = Field(None) + qualifier: Optional[str] = Field(None, description="""A qualifier for the statements, e.g. \"NOT\" for negation""") + subject_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the subject of the statement, e.g. \"high dose\" or \"intravenously administered\"""") + object_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the object of the statement, e.g. \"severe\" or \"with additional complications\"""") + + +class TextWithTriples(ConfiguredBaseModel): + """ + A text containing one or more relations of the Triple type. + """ + publication: Optional[Publication] = Field(None) + triples: Optional[List[Triple]] = Field(default_factory=list) + + +class TextWithEntity(ConfiguredBaseModel): + """ + A text containing one or more instances of a single type of entity. 
+ """ + publication: Optional[Publication] = Field(None) + entities: Optional[List[str]] = Field(default_factory=list) + + +class RelationshipType(NamedEntity): + id: str = Field(..., description="""A unique identifier for the named entity""") + label: Optional[str] = Field(None, description="""The label (name) of the named thing""") + + +class Publication(ConfiguredBaseModel): + id: Optional[str] = Field(None, description="""The publication identifier""") + title: Optional[str] = Field(None, description="""The title of the publication""") + abstract: Optional[str] = Field(None, description="""The abstract of the publication""") + combined_text: Optional[str] = Field(None) + full_text: Optional[str] = Field(None, description="""The full text of the publication""") + + +class AnnotatorResult(ConfiguredBaseModel): + subject_text: Optional[str] = Field(None) + object_id: Optional[str] = Field(None) + object_text: Optional[str] = Field(None) + + +class Document(NamedEntity): + sections: Optional[List[DocumentSection]] = Field(default_factory=list, description="""A semicolon-separated list of full sections of the document, including the full text of that section alone, beginning with the major division of the document, such as ABSTRACT, INTRODUCTION, METHODS, RESULTS, DISCUSSION, CONCLUSIONS, or a similar heading used by the text. The text should include the section title. If semicolons are present in the section text, they must be replaced with (SEMICOLON) to avoid parsing errors. A single phrase or ID is not a section. Do not format in Markdown.""") + id: str = Field(..., description="""A unique identifier for the named entity""") + label: Optional[str] = Field(None, description="""The label (name) of the named thing""") + + +class DocumentSection(CompoundExpression): + part_of: Optional[str] = Field(None, description="""The major document division that this section is a part of. 
Examples are \"introduction\", \"methods\", \"results\", \"discussion\", or \"conclusions\". Do not format in Markdown.""") + summary: Optional[str] = Field(None, description="""A brief summary of the section, suitable for display in a table of contents or search results. This should be a single sentence or phrase, not a full paragraph. Do not format in Markdown.""") + taxon: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of taxa or species of organisms mentioned in the section. Where possible, translate to the binomial species name (e.g., change \"mouse\" to \"Mus musculus\"), unless a different species name is provided in the text. If no taxon is mentioned, return NOT FOUND.""") + diagnostics: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of diagnostic procedures mentioned in the section. If no diagnostic procedures are mentioned, return NOT FOUND.""") + diseases: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of diseases or conditions mentioned in the section. If no diseases are mentioned, return NOT FOUND.""") + chemical: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of chemicals, drugs, or other substances mentioned in the section. If no chemicals are mentioned, return NOT FOUND.""") + environmental_exposures: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of environmental exposures mentioned in the section. These may include exposure to general classes of materials, e.g., \"exposure to pesticides\", or other phenomena, e.g., \"chronic stress\". 
If no environmental exposures are mentioned, return NOT FOUND.""") + experimental_metrics_and_indicators: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of of a experimental metrics, signs, symptoms, or outcomes used to measure the progression of Alzheimer's disease and related dementias. These may be quantitative or qualitative measures, including biomolecular assays. In experimental animal models these are analogues of cognitive impairment or indicators of disease progression modeling those observed in humans. Examples are Amyloid beta (Aβ) levels, Morris water maze test, tau phosphorylation, neurofibrillary tangles, and cognitive decline. If no experimental metrics are mentioned, return NOT FOUND.""") + experimental_metrics_to_taxon_relationships: Optional[List[ExperimentalMetricToTaxonRelationship]] = Field(default_factory=list, description="""Semicolon-separated list of relationships between a specific experimental metric, sign, symptom, or outcome and a taxon, where the relationship is used to measure progression of Alzheimer's disease and related dementias, or an experimental analogue, in the taxon. For example, \"Amyloid beta (Aβ) levels are measured in Mus musculus\" or \"Morris water maze test is measured with Rattus norvegicus\".""") + experimental_metric_to_disease_relationships: Optional[List[ExperimentalMetricToDiseaseRelationship]] = Field(default_factory=list, description="""Semicolon-separated list of relationships between a specific experimental metric, sign, symptom, or outcome and a disease or condition, where the relationship is used as an experimental model of progression or presence of a disease. 
For example, \"Amyloid beta (Aβ) levels are used to model Alzheimer's disease\" or \"Morris water maze test is used to model Parkinson's disease\".""") + + +class MetricOrIndicator(NamedEntity): + id: str = Field(..., description="""A unique identifier for the named entity""") + label: Optional[str] = Field(None, description="""The label (name) of the named thing""") + + +class Diagnostic(NamedEntity): + id: str = Field(..., description="""A unique identifier for the named entity""") + label: Optional[str] = Field(None, description="""The label (name) of the named thing""") + + +class Disease(NamedEntity): + id: str = Field(..., description="""A unique identifier for the named entity""") + label: Optional[str] = Field(None, description="""The label (name) of the named thing""") + + +class Taxon(NamedEntity): + id: str = Field(..., description="""A unique identifier for the named entity""") + label: Optional[str] = Field(None, description="""The label (name) of the named thing""") + + +class Chemical(NamedEntity): + id: str = Field(..., description="""A unique identifier for the named entity""") + label: Optional[str] = Field(None, description="""The label (name) of the named thing""") + + +class EnvironmentalExposure(NamedEntity): + id: str = Field(..., description="""A unique identifier for the named entity""") + label: Optional[str] = Field(None, description="""The label (name) of the named thing""") + + +class ExperimentalMetricToTaxonRelationship(Triple): + """ + A triple where the subject is an experimental metric, the object is an taxon, metric, and the predicate describes the relationship between the metric and the taxon, usually MEASURED_IN. + """ + subject: Optional[str] = Field(None, description="""The name of an experimental metric, sign, symptom, or outcome used to measure the effects of treatments on symptoms or diagnostics, or of the progression of Alzheimer's disease and related dementias. 
In experimental animal models these are analogues of cognitive impairment or indicators of disease progression modeling those observed in humans. Examples are Amyloid beta (Aβ) levels, Morris water maze test, tau phosphorylation, neurofibrillary tangles, and cognitive decline.""") + predicate: Optional[str] = Field(None, description="""The relationship type, generally MEASURED_IN to indicate a metric is measured in a taxon.""") + object: Optional[str] = Field(None, description="""The taxon or species of the model organism in which the experimental metric is measured. For example, Mus musculus, Rattus norvegicus.""") + qualifier: Optional[str] = Field(None, description="""A qualifier for the statements, e.g. \"NOT\" for negation""") + subject_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the taxon. This may include a strain or genetic background of the model organism.""") + object_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the experimental metric. This may include the method of measurement or the specific assay used.""") + + +class ExperimentalMetricToDiseaseRelationship(Triple): + """ + A triple where the subject is an experimental metric, the object is a disease or condition, and the predicate describes the relationship between the metric and the disease, usually USED_TO_MODEL. + """ + subject: Optional[str] = Field(None, description="""The name of an experimental metric, sign, symptom, or outcome used to measure the effects of treatments on symptoms or diagnostics, or of the progression of Alzheimer's disease and related dementias. In experimental animal models these are analogues of cognitive impairment or indicators of disease progression modeling those observed in humans. 
Examples are Amyloid beta (Aβ) levels, Morris water maze test, tau phosphorylation, neurofibrillary tangles, and cognitive decline.""") + predicate: Optional[str] = Field(None, description="""The relationship type, generally USED_TO_MODEL to indicate a metric is used to model a disease or condition.""") + object: Optional[str] = Field(None, description="""The name of a disease or condition. Examples are Alzheimer's disease, Parkinson's disease, Huntington's disease.""") + qualifier: Optional[str] = Field(None, description="""A qualifier for the statements, e.g. \"NOT\" for negation""") + subject_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the experimental metric. This may include the method of measurement or the specific assay used.""") + object_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the disease or condition. This may include the stage or subtype of the disease.""") + + +# Model rebuild +# see https://pydantic-docs.helpmanual.io/usage/models/#rebuilding-a-model +ExtractionResult.model_rebuild() +NamedEntity.model_rebuild() +CompoundExpression.model_rebuild() +Triple.model_rebuild() +TextWithTriples.model_rebuild() +TextWithEntity.model_rebuild() +RelationshipType.model_rebuild() +Publication.model_rebuild() +AnnotatorResult.model_rebuild() +Document.model_rebuild() +DocumentSection.model_rebuild() +MetricOrIndicator.model_rebuild() +Diagnostic.model_rebuild() +Disease.model_rebuild() +Taxon.model_rebuild() +Chemical.model_rebuild() +EnvironmentalExposure.model_rebuild() +ExperimentalMetricToTaxonRelationship.model_rebuild() +ExperimentalMetricToDiseaseRelationship.model_rebuild() + diff --git a/src/ontogpt/templates/alzrd_section.yaml b/src/ontogpt/templates/alzrd_section.yaml new file mode 100644 index 000000000..7d85cd71d --- /dev/null +++ b/src/ontogpt/templates/alzrd_section.yaml @@ -0,0 +1,294 @@ +id: http://w3id.org/ontogpt/alzrd_section +name: alzrd_section 
+title: Template for extracting Alzheimer's Disease Phenotypes by section +description: >- + Template for extracting phenotypes of Alzheimer's disease and related + dementias along with experimental metrics and model organisms. + Assumes a large input text, on the order of a full scientific article or + review. Focus is on extracting the methods and metrics used + with different model organisms. This template will attempt to break + up the input text by section before parsing, as opposed to parsing + the entire text at once. +license: https://creativecommons.org/publicdomain/zero/1.0/ +prefixes: + rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns# + alzrd: http://w3id.org/ontogpt/alzrd_section + linkml: https://w3id.org/linkml/ + +default_prefix: alzrd +default_range: string + +imports: + - linkml:types + - core + +# TODO: add more behavioral metrics extraction + +classes: + Document: + tree_root: true + is_a: NamedEntity + attributes: + sections: + range: DocumentSection + multivalued: true + description: >- + A semicolon-separated list of full sections of the document, + including the full text of that section alone, beginning with the + major division of the document, such as ABSTRACT, + INTRODUCTION, METHODS, RESULTS, DISCUSSION, CONCLUSIONS, + or a similar heading used by the text. The text should include the + section title. If semicolons are present in the section text, they + must be replaced with (SEMICOLON) to avoid parsing errors. + A single phrase or ID is not a section. + Do not format in Markdown. + + DocumentSection: + is_a: CompoundExpression + attributes: + part_of: + range: string + description: >- + The major document division that this section is a part of. + Examples are "introduction", "methods", "results", "discussion", + or "conclusions". Do not format in Markdown. + summary: + range: string + description: >- + A brief summary of the section, suitable for display in a table of + contents or search results. 
This should be a single sentence or + phrase, not a full paragraph. Do not format in Markdown. + taxon: + range: Taxon + multivalued: true + # NOTE: Don't ask the LLM to return NONE here or it may get + # grounded to NCBITaxon:32644 (unidentified) + description: >- + A semicolon-separated list of taxa or species of organisms + mentioned in the section. Where possible, translate to the + binomial species name (e.g., change "mouse" to "Mus musculus"), + unless a different species name is provided in the text. + If no taxon is mentioned, return NOT FOUND. + diagnostics: + range: Diagnostic + multivalued: true + description: >- + A semicolon-separated list of diagnostic procedures mentioned in the + section. If no diagnostic procedures are mentioned, return NOT FOUND. + diseases: + range: Disease + multivalued: true + description: >- + A semicolon-separated list of diseases or conditions mentioned in the + section. If no diseases are mentioned, return NOT FOUND. + chemical: + range: Chemical + multivalued: true + description: >- + A semicolon-separated list of chemicals, drugs, or other substances + mentioned in the section. If no chemicals are mentioned, return NOT + FOUND. + environmental_exposures: + range: EnvironmentalExposure + multivalued: true + description: >- + A semicolon-separated list of environmental exposures mentioned in + the section. These may include exposure to general classes of + materials, e.g., "exposure to pesticides", or other phenomena, + e.g., "chronic stress". If no environmental exposures are mentioned, + return NOT FOUND. + experimental_metrics_and_indicators: + range: MetricOrIndicator + multivalued: true + description: >- + A semicolon-separated list of experimental metrics, signs, + symptoms, or outcomes used to measure the progression of Alzheimer's + disease and related dementias. These may be quantitative or + qualitative measures, including biomolecular assays.
In + experimental animal models these are analogues of cognitive impairment + or indicators of disease progression modeling those observed in + humans. Examples are Amyloid beta (Aβ) levels, Morris water maze test, + tau phosphorylation, neurofibrillary tangles, and cognitive decline. + If no experimental metrics are mentioned, return NOT FOUND. + experimental_metrics_to_taxon_relationships: + description: >- + Semicolon-separated list of relationships between + a specific experimental metric, sign, symptom, + or outcome and a taxon, where the relationship is used to measure + progression of Alzheimer's disease and + related dementias, or an experimental analogue, in the taxon. + For example, "Amyloid beta (Aβ) levels are measured in Mus musculus" + or "Morris water maze test is measured with Rattus norvegicus". + multivalued: true + range: ExperimentalMetricToTaxonRelationship + experimental_metric_to_disease_relationships: + description: >- + Semicolon-separated list of relationships between + a specific experimental metric, sign, symptom, + or outcome and a disease or condition, where the relationship is used + as an experimental model of progression or presence of a disease. + For example, "Amyloid beta (Aβ) levels are used to model Alzheimer's + disease" or "Morris water maze test is used to model Parkinson's + disease". + multivalued: true + range: ExperimentalMetricToDiseaseRelationship + + # Entities + + # TODO: make signs species-specific if possible + MetricOrIndicator: + is_a: NamedEntity + id_prefixes: + - MAXO + - MESH + - HP + - MP + - NBO + - SNOMEDCT + annotations: + annotators: sqlite:obo:hp, sqlite:obo:mondo, sqlite:obo:mp, sqlite:obo:nbo, sqlite:obo:maxo, sqlite:obo:mesh, bioportal:SNOMEDCT, sqlite:obo:ncit + prompt: >- + The name of an experimental metric, sign, symptom, or outcome used to + measure the effects of treatments on symptoms or diagnostics, or of the + progression of Alzheimer's disease and related dementias. 
In + experimental animal models these are analogues of cognitive impairment + or indicators of disease progression modeling those observed in humans. + Examples are Amyloid beta (Aβ) levels, Morris water maze test, tau + phosphorylation, neurofibrillary tangles, and cognitive decline. + + Diagnostic: + is_a: NamedEntity + id_prefixes: + - MAXO + - MESH + annotations: + annotators: sqlite:obo:maxo, sqlite:obo:mesh, sqlite:obo:ncit + prompt: >- + The name of a diagnostic procedure or test. + Examples are MRI, PET scan, lumbar puncture, blood test, biopsy. + + Disease: + is_a: NamedEntity + id_prefixes: + - MONDO + annotations: + annotators: sqlite:obo:mondo + prompt: >- + The name of a disease or condition. + Examples are Alzheimer's disease, Parkinson's disease, Huntington's disease. + + # TODO: use a taxon slim + # TODO: Consider making this an enum of known model organisms for AD/RD + Taxon: + is_a: NamedEntity + id_prefixes: + - NCBITaxon + annotations: + annotators: sqlite:obo:ncbitaxon + prompt: >- + The taxonomic group or species of a model organism. + Examples are "human", "mouse", "rat", "Rhesus macaque", + "canine", "marmoset", "fruit fly", "C. elegans", "S. cerevisiae". + + Chemical: + is_a: NamedEntity + id_prefixes: + - CHEBI + - MESH + annotations: + annotators: sqlite:obo:chebi, sqlite:obo:mesh + prompt: >- + The name of a chemical, drug, or other substance. + Examples are "donepezil", "Aβ42", "Aβ40", "tau", "insulin", + "caffeine", "nicotine", "alcohol". + + EnvironmentalExposure: + is_a: NamedEntity + id_prefixes: + - ENVO + - MESH + annotations: + annotators: sqlite:obo:envo, sqlite:obo:mesh + prompt: >- + The name of an environmental exposure or condition. + Examples are "pesticides", "chronic stress", "air pollution", + "heavy metals", "radiation", "heat stress". 
+ + # Relationships + + ExperimentalMetricToTaxonRelationship: + is_a: Triple + description: >- + A triple where the subject is an experimental metric, the object is a + taxon, and the predicate describes the relationship between the + metric and the taxon, usually MEASURED_IN. + slot_usage: + subject: + description: >- + The name of an experimental metric, sign, symptom, or outcome used to + measure the effects of treatments on symptoms or diagnostics, or of + the progression of Alzheimer's disease and related dementias. In + experimental animal models these are analogues of cognitive impairment + or indicators of disease progression modeling those observed in + humans. Examples are Amyloid beta (Aβ) levels, Morris water maze test, + tau phosphorylation, neurofibrillary tangles, and cognitive decline. + range: MetricOrIndicator + object: + description: >- + The taxon or species of the model organism in which the experimental + metric is measured. For example, Mus musculus, Rattus norvegicus. + range: Taxon + predicate: + range: NamedEntity + description: >- + The relationship type, generally MEASURED_IN to indicate a metric + is measured in a taxon. + subject_qualifier: + range: NamedEntity + description: >- + An optional qualifier or modifier for the taxon. + This may include a strain or genetic background of the model organism. + object_qualifier: + range: NamedEntity + description: >- + An optional qualifier or modifier for the experimental metric. + This may include the method of measurement or the specific assay used. + + ExperimentalMetricToDiseaseRelationship: + is_a: Triple + description: >- + A triple where the subject is an experimental metric, the object is a + disease or condition, and the predicate describes the relationship between + the metric and the disease, usually USED_TO_MODEL.
+ slot_usage: + subject: + description: >- + The name of an experimental metric, sign, symptom, or outcome used to + measure the effects of treatments on symptoms or diagnostics, or of + the progression of Alzheimer's disease and related dementias. In + experimental animal models these are analogues of cognitive impairment + or indicators of disease progression modeling those observed in + humans. Examples are Amyloid beta (Aβ) levels, Morris water maze test, + tau phosphorylation, neurofibrillary tangles, and cognitive decline. + range: MetricOrIndicator + object: + description: >- + The name of a disease or condition. + Examples are Alzheimer's disease, Parkinson's disease, Huntington's + disease. + range: Disease + predicate: + range: NamedEntity + description: >- + The relationship type, generally USED_TO_MODEL to indicate a metric + is used to model a disease or condition. + subject_qualifier: + range: NamedEntity + description: >- + An optional qualifier or modifier for the experimental metric. + This may include the method of measurement or the specific assay used. + object_qualifier: + range: NamedEntity + description: >- + An optional qualifier or modifier for the disease or condition. + This may include the stage or subtype of the disease. 
From 6a82bdfe451c6d6ef4b83234c2cf112cb3670d1f Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Fri, 28 Jun 2024 16:21:31 -0400 Subject: [PATCH 7/9] Restructure the main alzrd template to do full text at once --- src/ontogpt/templates/alzrd.py | 32 +++++------- src/ontogpt/templates/alzrd.yaml | 85 +++++++++++++------------------- 2 files changed, 47 insertions(+), 70 deletions(-) diff --git a/src/ontogpt/templates/alzrd.py b/src/ontogpt/templates/alzrd.py index 36ca6c2b6..9d3f6f977 100644 --- a/src/ontogpt/templates/alzrd.py +++ b/src/ontogpt/templates/alzrd.py @@ -122,24 +122,19 @@ class AnnotatorResult(ConfiguredBaseModel): class Document(NamedEntity): - sections: Optional[List[DocumentSection]] = Field(default_factory=list, description="""A semicolon-separated list of full sections of the document, including the full text of that section alone, beginning with the major division of the document, such as ABSTRACT, INTRODUCTION, METHODS, RESULTS, DISCUSSION, CONCLUSIONS, or a similar heading used by the text. The text should include the section title. If semicolons are present in the section text, they must be replaced with (SEMICOLON) to avoid parsing errors. A single phrase or ID is not a section. Do not format in Markdown.""") + summary: Optional[str] = Field(None, description="""A brief summary of the input text, suitable for display in a table of contents or search results. This should be no more than three sentences. Do not format in Markdown.""") + taxon: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of taxa or species of organisms mentioned in the input text. Where possible, translate to the binomial species name (e.g., change \"mouse\" to \"Mus musculus\"), unless a different species name is provided in the text. 
If no taxon is mentioned, return NOT FOUND.""") + diagnostics: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of diagnostic procedures mentioned in the input text. If no diagnostic procedures are mentioned, return NOT FOUND.""") + diseases: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of diseases or conditions mentioned in the input text. If no diseases are mentioned, return NOT FOUND.""") + chemical: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of chemicals, drugs, or other substances mentioned in the input text. If no chemicals are mentioned, return NOT FOUND.""") + environmental_exposures: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of environmental exposures mentioned in the input text. These may include exposure to general classes of materials, e.g., \"exposure to pesticides\", or other phenomena, e.g., \"chronic stress\". If no environmental exposures are mentioned, return NOT FOUND.""") + experimental_metrics_and_indicators: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of of experimental metrics, signs, symptoms, or outcomes used to measure the progression of Alzheimer's disease and related dementias, mentioned in the input text. These may be quantitative or qualitative measures, including biomolecular assays. In experimental animal models these are analogues of cognitive impairment or indicators of disease progression modeling those observed in humans. Examples are Amyloid beta (Aβ) levels, Morris water maze test, tau phosphorylation, neurofibrillary tangles, and cognitive decline. 
If no experimental metrics are mentioned, return NOT FOUND.""") + experimental_metrics_to_taxon_relationships: Optional[List[ExperimentalMetricToTaxonRelationship]] = Field(default_factory=list, description="""Semicolon-separated list of relationships between a specific experimental metric, sign, symptom, or outcome and a taxon, as described in the input text. These are cases in which the relationship is used to measure progression of Alzheimer's disease and related dementias, or an experimental analogue, in the taxon. For example, \"Amyloid beta (Aβ) levels are measured in Mus musculus\" or \"Morris water maze test is measured with Rattus norvegicus\".""") + experimental_metric_to_disease_relationships: Optional[List[ExperimentalMetricToDiseaseRelationship]] = Field(default_factory=list, description="""Semicolon-separated list of relationships between a specific experimental metric, sign, symptom, or outcome and a disease or condition, as described in the input text. These are cases in which the relationship is used as an experimental model of progression or presence of a disease. For example, \"Amyloid beta (Aβ) levels are used to model Alzheimer's disease\" or \"Morris water maze test is used to model Parkinson's disease\".""") id: str = Field(..., description="""A unique identifier for the named entity""") label: Optional[str] = Field(None, description="""The label (name) of the named thing""") -class DocumentSection(CompoundExpression): - part_of: Optional[str] = Field(None, description="""The major document division that this section is a part of. Examples are \"introduction\", \"methods\", \"results\", \"discussion\", or \"conclusions\". Do not format in Markdown.""") - summary: Optional[str] = Field(None, description="""A brief summary of the section, suitable for display in a table of contents or search results. This should be a single sentence or phrase, not a full paragraph. 
Do not format in Markdown.""") - taxon: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of taxa or species of organisms mentioned in the section. Where possible, translate to the binomial species name (e.g., change \"mouse\" to \"Mus musculus\"), unless a different species name is provided in the text. If no taxon is mentioned, return NOT FOUND.""") - diagnostics: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of diagnostic procedures mentioned in the section. If no diagnostic procedures are mentioned, return NOT FOUND.""") - diseases: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of diseases or conditions mentioned in the section. If no diseases are mentioned, return NOT FOUND.""") - chemical: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of chemicals, drugs, or other substances mentioned in the section. If no chemicals are mentioned, return NOT FOUND.""") - environmental_exposures: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of environmental exposures mentioned in the section. These may include exposure to general classes of materials, e.g., \"exposure to pesticides\", or other phenomena, e.g., \"chronic stress\". If no environmental exposures are mentioned, return NOT FOUND.""") - experimental_metrics_and_indicators: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of of a experimental metrics, signs, symptoms, or outcomes used to measure the progression of Alzheimer's disease and related dementias. These may be quantitative or qualitative measures, including biomolecular assays. In experimental animal models these are analogues of cognitive impairment or indicators of disease progression modeling those observed in humans. 
Examples are Amyloid beta (Aβ) levels, Morris water maze test, tau phosphorylation, neurofibrillary tangles, and cognitive decline. If no experimental metrics are mentioned, return NOT FOUND.""") - experimental_metrics_to_taxon_relationships: Optional[List[ExperimentalMetricToTaxonRelationship]] = Field(default_factory=list, description="""Semicolon-separated list of relationships between a specific experimental metric, sign, symptom, or outcome and a taxon, where the relationship is used to measure progression of Alzheimer's disease and related dementias, or an experimental analogue, in the taxon. For example, \"Amyloid beta (Aβ) levels are measured in Mus musculus\" or \"Morris water maze test is measured with Rattus norvegicus\".""") - experimental_metric_to_disease_relationships: Optional[List[ExperimentalMetricToDiseaseRelationship]] = Field(default_factory=list, description="""Semicolon-separated list of relationships between a specific experimental metric, sign, symptom, or outcome and a disease or condition, where the relationship is used as an experimental model of progression or presence of a disease. For example, \"Amyloid beta (Aβ) levels are used to model Alzheimer's disease\" or \"Morris water maze test is used to model Parkinson's disease\".""") - - class MetricOrIndicator(NamedEntity): id: str = Field(..., description="""A unique identifier for the named entity""") label: Optional[str] = Field(None, description="""The label (name) of the named thing""") @@ -178,8 +173,8 @@ class ExperimentalMetricToTaxonRelationship(Triple): predicate: Optional[str] = Field(None, description="""The relationship type, generally MEASURED_IN to indicate a metric is measured in a taxon.""") object: Optional[str] = Field(None, description="""The taxon or species of the model organism in which the experimental metric is measured. For example, Mus musculus, Rattus norvegicus.""") qualifier: Optional[str] = Field(None, description="""A qualifier for the statements, e.g. 
\"NOT\" for negation""") - subject_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the taxon. This may include a strain or genetic background of the model organism.""") - object_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the experimental metric. This may include the method of measurement or the specific assay used.""") + subject_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the taxon, as described in the input text. This may include a strain or genetic background of the model organism.""") + object_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the experimental metric, as described in the input text. This may include the method of measurement or the specific assay used.""") class ExperimentalMetricToDiseaseRelationship(Triple): @@ -190,8 +185,8 @@ class ExperimentalMetricToDiseaseRelationship(Triple): predicate: Optional[str] = Field(None, description="""The relationship type, generally USED_TO_MODEL to indicate a metric is used to model a disease or condition.""") object: Optional[str] = Field(None, description="""The name of a disease or condition. Examples are Alzheimer's disease, Parkinson's disease, Huntington's disease.""") qualifier: Optional[str] = Field(None, description="""A qualifier for the statements, e.g. \"NOT\" for negation""") - subject_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the experimental metric. This may include the method of measurement or the specific assay used.""") - object_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the disease or condition. This may include the stage or subtype of the disease.""") + subject_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the experimental metric, as described in the input text. 
This may include the method of measurement or the specific assay used.""") + object_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the disease or condition, as described in the input text. This may include the stage or subtype of the disease.""") # Model rebuild @@ -206,7 +201,6 @@ class ExperimentalMetricToDiseaseRelationship(Triple): Publication.model_rebuild() AnnotatorResult.model_rebuild() Document.model_rebuild() -DocumentSection.model_rebuild() MetricOrIndicator.model_rebuild() Diagnostic.model_rebuild() Disease.model_rebuild() diff --git a/src/ontogpt/templates/alzrd.yaml b/src/ontogpt/templates/alzrd.yaml index b18f3af47..4d322fab7 100644 --- a/src/ontogpt/templates/alzrd.yaml +++ b/src/ontogpt/templates/alzrd.yaml @@ -28,43 +28,18 @@ classes: tree_root: true is_a: NamedEntity attributes: - sections: - range: DocumentSection - multivalued: true - description: >- - A semicolon-separated list of full sections of the document, - including the full text of that section alone, beginning with the - major division of the document, such as ABSTRACT, - INTRODUCTION, METHODS, RESULTS, DISCUSSION, CONCLUSIONS, - or a similar heading used by the text. The text should include the - section title. If semicolons are present in the section text, they - must be replaced with (SEMICOLON) to avoid parsing errors. - A single phrase or ID is not a section. - Do not format in Markdown. - - DocumentSection: - is_a: CompoundExpression - attributes: - part_of: - range: string - description: >- - The major document division that this section is a part of. - Examples are "introduction", "methods", "results", "discussion", - or "conclusions". Do not format in Markdown. summary: range: string description: >- - A brief summary of the section, suitable for display in a table of - contents or search results. This should be a single sentence or - phrase, not a full paragraph. Do not format in Markdown. 
+ A brief summary of the input text, suitable for display in a table of + contents or search results. This should be no more than three + sentences. Do not format in Markdown. taxon: range: Taxon multivalued: true - # NOTE: Don't ask the LLM to return NONE here or it may get - # grounded to NCBITaxon:32644 (unidentified) description: >- A semicolon-separated list of taxa or species of organisms - mentioned in the section. Where possible, translate to the + mentioned in the input text. Where possible, translate to the binomial species name (e.g., change "mouse" to "Mus musculus"), unless a different species name is provided in the text. If no taxon is mentioned, return NOT FOUND. @@ -73,26 +48,27 @@ classes: multivalued: true description: >- A semicolon-separated list of diagnostic procedures mentioned in the - section. If no diagnostic procedures are mentioned, return NOT FOUND. + input text. If no diagnostic procedures are mentioned, return NOT + FOUND. diseases: range: Disease multivalued: true description: >- A semicolon-separated list of diseases or conditions mentioned in the - section. If no diseases are mentioned, return NOT FOUND. + input text. If no diseases are mentioned, return NOT FOUND. chemical: range: Chemical multivalued: true description: >- A semicolon-separated list of chemicals, drugs, or other substances - mentioned in the section. If no chemicals are mentioned, return NOT + mentioned in the input text. If no chemicals are mentioned, return NOT FOUND. environmental_exposures: range: EnvironmentalExposure multivalued: true description: >- A semicolon-separated list of environmental exposures mentioned in - the section. These may include exposure to general classes of + the input text. These may include exposure to general classes of materials, e.g., "exposure to pesticides", or other phenomena, e.g., "chronic stress". If no environmental exposures are mentioned, return NOT FOUND. 
@@ -100,31 +76,34 @@ classes: range: MetricOrIndicator multivalued: true description: >- - A semicolon-separated list of of a experimental metrics, signs, + A semicolon-separated list of experimental metrics, signs, symptoms, or outcomes used to measure the progression of Alzheimer's - disease and related dementias. These may be quantitative or - qualitative measures, including biomolecular assays. In - experimental animal models these are analogues of cognitive impairment - or indicators of disease progression modeling those observed in - humans. Examples are Amyloid beta (Aβ) levels, Morris water maze test, - tau phosphorylation, neurofibrillary tangles, and cognitive decline. - If no experimental metrics are mentioned, return NOT FOUND. + disease and related dementias, mentioned in the input text. These may + be quantitative or qualitative measures, including biomolecular + assays. In experimental animal models these are analogues of + cognitive impairment or indicators of disease progression modeling + those observed in humans. Examples are Amyloid beta (Aβ) levels, + Morris water maze test, tau phosphorylation, neurofibrillary tangles, + and cognitive decline. If no experimental metrics are mentioned, + return NOT FOUND. experimental_metrics_to_taxon_relationships: description: >- Semicolon-separated list of relationships between a specific experimental metric, sign, symptom, - or outcome and a taxon, where the relationship is used to measure - progression of Alzheimer's disease and - related dementias, or an experimental analogue, in the taxon. - For example, "Amyloid beta (Aβ) levels are measured in Mus musculus" - or "Morris water maze test is measured with Rattus norvegicus". + or outcome and a taxon, as described in the input text. + These are cases in which the relationship is used to measure + progression of Alzheimer's disease and related dementias, or an + experimental analogue, in the taxon.
For example, "Amyloid beta (Aβ) + levels are measured in Mus musculus" or "Morris water maze test is + measured with Rattus norvegicus". multivalued: true range: ExperimentalMetricToTaxonRelationship experimental_metric_to_disease_relationships: description: >- Semicolon-separated list of relationships between a specific experimental metric, sign, symptom, - or outcome and a disease or condition, where the relationship is used + or outcome and a disease or condition, as described in the input + text. These are cases in which the relationship is used as an experimental model of progression or presence of a disease. For example, "Amyloid beta (Aβ) levels are used to model Alzheimer's disease" or "Morris water maze test is used to model Parkinson's @@ -245,12 +224,14 @@ classes: subject_qualifier: range: NamedEntity description: >- - An optional qualifier or modifier for the taxon. + An optional qualifier or modifier for the taxon, as described + in the input text. This may include a strain or genetic background of the model organism. object_qualifier: range: NamedEntity description: >- - An optional qualifier or modifier for the experimental metric. + An optional qualifier or modifier for the experimental metric, + as described in the input text. This may include the method of measurement or the specific assay used. ExperimentalMetricToDiseaseRelationship: @@ -284,10 +265,12 @@ classes: subject_qualifier: range: NamedEntity description: >- - An optional qualifier or modifier for the experimental metric. + An optional qualifier or modifier for the experimental metric, + as described in the input text. This may include the method of measurement or the specific assay used. object_qualifier: range: NamedEntity description: >- - An optional qualifier or modifier for the disease or condition. + An optional qualifier or modifier for the disease or condition, + as described in the input text. This may include the stage or subtype of the disease. 
From dab6a6bb566da637e4751285f9fa97b333249f7e Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Fri, 28 Jun 2024 16:34:57 -0400 Subject: [PATCH 8/9] Add more slots to document extraction --- src/ontogpt/templates/alzrd.py | 3 +++ src/ontogpt/templates/alzrd.yaml | 22 ++++++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/src/ontogpt/templates/alzrd.py b/src/ontogpt/templates/alzrd.py index 9d3f6f977..7b8de9ba0 100644 --- a/src/ontogpt/templates/alzrd.py +++ b/src/ontogpt/templates/alzrd.py @@ -123,6 +123,9 @@ class AnnotatorResult(ConfiguredBaseModel): class Document(NamedEntity): summary: Optional[str] = Field(None, description="""A brief summary of the input text, suitable for display in a table of contents or search results. This should be no more than three sentences. Do not format in Markdown.""") + article_type: Optional[str] = Field(None, description="""The type of article, e.g., \"research article\", \"review\", \"case report\".""") + modeling_approach: Optional[str] = Field(None, description="""A brief description of the modeling approach used in the input text, e.g., \"experimental\", \"observational\", \"computational\", \"review\".""") + modeling_summary: Optional[str] = Field(None, description="""A brief summary of the modeling approach used in the input text, suitable for display in a table of contents or search results. Include any details about how a model of disease is defined, including the use of model organisms, cell lines, or in silico models, as well as the experimental metrics used to model human disease. If this is a study of human subjects, include details about the study design and the human subjects involved. This should be no more than three sentences. Do not format in Markdown.""") taxon: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of taxa or species of organisms mentioned in the input text. 
Where possible, translate to the binomial species name (e.g., change \"mouse\" to \"Mus musculus\"), unless a different species name is provided in the text. If no taxon is mentioned, return NOT FOUND.""") diagnostics: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of diagnostic procedures mentioned in the input text. If no diagnostic procedures are mentioned, return NOT FOUND.""") diseases: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of diseases or conditions mentioned in the input text. If no diseases are mentioned, return NOT FOUND.""") diff --git a/src/ontogpt/templates/alzrd.yaml b/src/ontogpt/templates/alzrd.yaml index 4d322fab7..7d7424436 100644 --- a/src/ontogpt/templates/alzrd.yaml +++ b/src/ontogpt/templates/alzrd.yaml @@ -34,6 +34,28 @@ classes: A brief summary of the input text, suitable for display in a table of contents or search results. This should be no more than three sentences. Do not format in Markdown. + article_type: + range: string + description: >- + The type of article, e.g., "research article", "review", "case + report". + modeling_approach: + range: string + description: >- + A brief description of the modeling approach used in the input text, + e.g., "experimental", "observational", "computational", "review". + modeling_summary: + range: string + description: >- + A brief summary of the modeling approach used in the input text, + suitable for display in a table of contents or search results. + Include any details about how a model of disease is defined, + including the use of model organisms, cell lines, or in silico + models, as well as the experimental metrics used to model human + disease. If this is a study of human subjects, include details about + the study design and the human subjects involved. + This should be no more than three sentences. Do not format in + Markdown. 
taxon: range: Taxon multivalued: true From d3ac2a4be102399f4a792917f46ada62fc988c6b Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Fri, 28 Jun 2024 17:07:28 -0400 Subject: [PATCH 9/9] taxon -> taxa --- src/ontogpt/templates/alzrd.py | 2 +- src/ontogpt/templates/alzrd.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ontogpt/templates/alzrd.py b/src/ontogpt/templates/alzrd.py index 7b8de9ba0..e73e3995d 100644 --- a/src/ontogpt/templates/alzrd.py +++ b/src/ontogpt/templates/alzrd.py @@ -126,7 +126,7 @@ class Document(NamedEntity): article_type: Optional[str] = Field(None, description="""The type of article, e.g., \"research article\", \"review\", \"case report\".""") modeling_approach: Optional[str] = Field(None, description="""A brief description of the modeling approach used in the input text, e.g., \"experimental\", \"observational\", \"computational\", \"review\".""") modeling_summary: Optional[str] = Field(None, description="""A brief summary of the modeling approach used in the input text, suitable for display in a table of contents or search results. Include any details about how a model of disease is defined, including the use of model organisms, cell lines, or in silico models, as well as the experimental metrics used to model human disease. If this is a study of human subjects, include details about the study design and the human subjects involved. This should be no more than three sentences. Do not format in Markdown.""") - taxon: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of taxa or species of organisms mentioned in the input text. Where possible, translate to the binomial species name (e.g., change \"mouse\" to \"Mus musculus\"), unless a different species name is provided in the text. 
If no taxon is mentioned, return NOT FOUND.""") + taxa: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of taxa or species of organisms mentioned in the input text. Where possible, translate to the binomial species name (e.g., change \"mouse\" to \"Mus musculus\"), unless a different species name is provided in the text. If no taxon is mentioned, return NOT FOUND.""") diagnostics: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of diagnostic procedures mentioned in the input text. If no diagnostic procedures are mentioned, return NOT FOUND.""") diseases: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of diseases or conditions mentioned in the input text. If no diseases are mentioned, return NOT FOUND.""") chemical: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of chemicals, drugs, or other substances mentioned in the input text. If no chemicals are mentioned, return NOT FOUND.""") diff --git a/src/ontogpt/templates/alzrd.yaml b/src/ontogpt/templates/alzrd.yaml index 7d7424436..e91207eb2 100644 --- a/src/ontogpt/templates/alzrd.yaml +++ b/src/ontogpt/templates/alzrd.yaml @@ -56,7 +56,7 @@ classes: the study design and the human subjects involved. This should be no more than three sentences. Do not format in Markdown. - taxon: + taxa: range: Taxon multivalued: true description: >-