From f2f10d63d56bb67044f225d674415b0b895efa39 Mon Sep 17 00:00:00 2001
From: caufieldjh <j.harry.caufield@gmail.com>
Date: Mon, 3 Jun 2024 13:16:09 -0400
Subject: [PATCH 1/3] Add alzheimers info extraction template

---
 src/ontogpt/templates/alz_treat.py   | 169 +++++++++++++++++++++++++++
 src/ontogpt/templates/alz_treat.yaml |  85 ++++++++++++++
 2 files changed, 254 insertions(+)
 create mode 100644 src/ontogpt/templates/alz_treat.py
 create mode 100644 src/ontogpt/templates/alz_treat.yaml

diff --git a/src/ontogpt/templates/alz_treat.py b/src/ontogpt/templates/alz_treat.py
new file mode 100644
index 000000000..1a2e4ff1e
--- /dev/null
+++ b/src/ontogpt/templates/alz_treat.py
@@ -0,0 +1,169 @@
+from __future__ import annotations 
+from datetime import (
+    datetime,
+    date
+)
+from decimal import Decimal 
+from enum import Enum 
+import re
+import sys
+from typing import (
+    Any,
+    List,
+    Literal,
+    Dict,
+    Optional,
+    Union
+)
+from pydantic.version import VERSION  as PYDANTIC_VERSION 
+if int(PYDANTIC_VERSION[0])>=2:
+    from pydantic import (
+        BaseModel,
+        ConfigDict,
+        Field,
+        field_validator
+    )
+else:
+    from pydantic import (
+        BaseModel,
+        Field,
+        validator
+    )
+
+metamodel_version = "None"
+version = "None"
+
+
+class ConfiguredBaseModel(BaseModel):  # common base: every model class below inherits this configuration
+    model_config = ConfigDict(  # NOTE(review): ConfigDict is only imported on the pydantic>=2 branch above
+        validate_assignment = True,  # re-validate a field whenever it is assigned after construction
+        validate_default = True,  # run validation on default values as well
+        extra = "forbid",  # reject input keys that are not declared fields
+        arbitrary_types_allowed = True,  # permit field types pydantic has no built-in schema for
+        use_enum_values = True,  # store an enum member's .value rather than the member itself
+        strict = False,  # lax mode: compatible input types are coerced
+    )
+    pass  # redundant: the class body is already non-empty
+
+
+class NullDataOptions(str, Enum):
+    UNSPECIFIED_METHOD_OF_ADMINISTRATION = "UNSPECIFIED_METHOD_OF_ADMINISTRATION"
+    NOT_APPLICABLE = "NOT_APPLICABLE"
+    NOT_MENTIONED = "NOT_MENTIONED"
+
+
+class ExtractionResult(ConfiguredBaseModel):
+    """
+    A result of extracting knowledge on text
+    """
+    input_id: Optional[str] = Field(None)
+    input_title: Optional[str] = Field(None)
+    input_text: Optional[str] = Field(None)
+    raw_completion_output: Optional[str] = Field(None)
+    prompt: Optional[str] = Field(None)
+    extracted_object: Optional[Any] = Field(None, description="""The complex objects extracted from the text""")
+    named_entities: Optional[List[Any]] = Field(default_factory=list, description="""Named entities extracted from the text""")
+
+
+class NamedEntity(ConfiguredBaseModel):
+    id: str = Field(..., description="""A unique identifier for the named entity""")
+    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
+
+
+class CompoundExpression(ConfiguredBaseModel):
+    pass
+
+
+class Triple(CompoundExpression):
+    """
+    Abstract parent for Relation Extraction tasks
+    """
+    subject: Optional[str] = Field(None)
+    predicate: Optional[str] = Field(None)
+    object: Optional[str] = Field(None)
+    qualifier: Optional[str] = Field(None, description="""A qualifier for the statements, e.g. \"NOT\" for negation""")
+    subject_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the subject of the statement, e.g. \"high dose\" or \"intravenously administered\"""")
+    object_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the object of the statement, e.g. \"severe\" or \"with additional complications\"""")
+
+
+class TextWithTriples(ConfiguredBaseModel):
+    """
+    A text containing one or more relations of the Triple type.
+    """
+    publication: Optional[Publication] = Field(None)  # forward reference: Publication is defined later in this module
+    triples: Optional[List[Triple]] = Field(default_factory=list)  # default_factory gives each instance its own empty list
+
+
+class TextWithEntity(ConfiguredBaseModel):
+    """
+    A text containing one or more instances of a single type of entity.
+    """
+    publication: Optional[Publication] = Field(None)  # forward reference: Publication is defined later in this module
+    entities: Optional[List[str]] = Field(default_factory=list)  # default_factory gives each instance its own empty list
+
+
+class RelationshipType(NamedEntity):
+    id: str = Field(..., description="""A unique identifier for the named entity""")
+    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
+
+
+class Publication(ConfiguredBaseModel):
+    id: Optional[str] = Field(None, description="""The publication identifier""")
+    title: Optional[str] = Field(None, description="""The title of the publication""")
+    abstract: Optional[str] = Field(None, description="""The abstract of the publication""")
+    combined_text: Optional[str] = Field(None)
+    full_text: Optional[str] = Field(None, description="""The full text of the publication""")
+
+
+class AnnotatorResult(ConfiguredBaseModel):
+    subject_text: Optional[str] = Field(None)
+    object_id: Optional[str] = Field(None)
+    object_text: Optional[str] = Field(None)
+
+
+class Document(NamedEntity):
+    sections: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of full sections of the document. If semicolons are present in the section text, they should be replaced with (SEMICOLON) to avoid parsing errors.""")
+    id: str = Field(..., description="""A unique identifier for the named entity""")
+    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
+
+
+class DocumentSection(NamedEntity):
+    symptoms: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of symptoms mentioned in the section.""")
+    diagnostics: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of diagnostic procedures mentioned in the section.""")
+    treatments: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of treatments mentioned in the section. These may be drugs or other therapeutic procedures.""")
+    id: str = Field(..., description="""A unique identifier for the named entity""")
+    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
+
+
+class Symptom(NamedEntity):
+    id: str = Field(..., description="""A unique identifier for the named entity""")
+    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
+
+
+class Diagnostic(NamedEntity):
+    id: str = Field(..., description="""A unique identifier for the named entity""")
+    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
+
+
+class Treatment(NamedEntity):
+    id: str = Field(..., description="""A unique identifier for the named entity""")
+    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
+
+
+# Model rebuild: re-resolve annotations now that every class exists (e.g. Publication is referenced before its definition)
+# see https://pydantic-docs.helpmanual.io/usage/models/#rebuilding-a-model
+ExtractionResult.model_rebuild()
+NamedEntity.model_rebuild()
+CompoundExpression.model_rebuild()
+Triple.model_rebuild()
+TextWithTriples.model_rebuild()
+TextWithEntity.model_rebuild()
+RelationshipType.model_rebuild()
+Publication.model_rebuild()
+AnnotatorResult.model_rebuild()
+Document.model_rebuild()
+DocumentSection.model_rebuild()
+Symptom.model_rebuild()
+Diagnostic.model_rebuild()
+Treatment.model_rebuild()
+
diff --git a/src/ontogpt/templates/alz_treat.yaml b/src/ontogpt/templates/alz_treat.yaml
new file mode 100644
index 000000000..baf61a53d
--- /dev/null
+++ b/src/ontogpt/templates/alz_treat.yaml
@@ -0,0 +1,85 @@
+id: http://w3id.org/ontogpt/alz_treat
+name: alz_treat
+title: Template for extracting Alzheimer's Disease Treatments
+description: >-
+  Template for extracting Alzheimer's disease treatments and related concepts.
+  Assumes a large input text, on the order of a full scientific article or
+  review.
+license: https://creativecommons.org/publicdomain/zero/1.0/
+prefixes:
+  rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
+  alz_treat: http://w3id.org/ontogpt/alz_treat
+  linkml: https://w3id.org/linkml/
+
+default_prefix: alz_treat
+default_range: string
+
+imports:
+  - linkml:types
+  - core
+
+classes:
+  Document:
+    tree_root: true
+    is_a: NamedEntity
+    attributes:
+      sections:
+        range: DocumentSection
+        multivalued: true
+        description: >-
+          A semicolon-separated list of full sections of the document.
+          If semicolons are present in the section text, they should
+          be replaced with (SEMICOLON) to avoid parsing errors.
+
+  DocumentSection:
+    is_a: NamedEntity
+    attributes:
+      symptoms:
+        range: Symptom
+        multivalued: true
+        description: >-
+          A semicolon-separated list of symptoms mentioned in the section.
+      diagnostics:
+        range: Diagnostic
+        multivalued: true
+        description: >-
+          A semicolon-separated list of diagnostic procedures mentioned in the
+          section.
+      treatments:
+        range: Treatment
+        multivalued: true
+        description: >-
+          A semicolon-separated list of treatments mentioned in the section.
+          These may be drugs or other therapeutic procedures.
+
+  Symptom:
+    is_a: NamedEntity
+    id_prefixes:
+      - HP
+    annotations:
+      annotators: sqlite:obo:hp, sqlite:obo:mondo, sqlite:obo:mesh, sqlite:obo:ncit
+      prompt: >-
+        the name of a human phenotype or symptom.
+        Examples are ascites, fever, pain, seizure, increased intracranial
+        pressure, lactic acidosis.
+
+  Diagnostic:
+    is_a: NamedEntity
+    id_prefixes:
+      - MAXO
+    annotations:
+      annotators: sqlite:obo:maxo, sqlite:obo:mesh, sqlite:obo:snomedct, sqlite:obo:ncit
+      prompt: >-
+        the name of a diagnostic procedure or test.
+        Examples are MRI, PET scan, lumbar puncture, blood test, biopsy.
+
+  Treatment:
+    is_a: NamedEntity
+    id_prefixes:
+      - DRUGBANK
+      - MAXO
+    annotations:
+      annotators: sqlite:obo:drugbank, sqlite:obo:maxo, sqlite:obo:mesh, sqlite:obo:ncit
+      prompt: >-
+        the name of a drug or therapeutic procedure.
+        Examples are aspirin, chemotherapy, radiation therapy, surgery.

From ec92b2bdadec02cbfc9af429227cad58e5446c4c Mon Sep 17 00:00:00 2001
From: caufieldjh <j.harry.caufield@gmail.com>
Date: Mon, 3 Jun 2024 14:08:26 -0400
Subject: [PATCH 2/3] Enable passing max_text_length as arg

---
 src/ontogpt/cli.py                   | 8 +++++++-
 src/ontogpt/clients/pubmed_client.py | 7 ++++---
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/ontogpt/cli.py b/src/ontogpt/cli.py
index ac10d1b1e..db4f3a1e0 100644
--- a/src/ontogpt/cli.py
+++ b/src/ontogpt/cli.py
@@ -563,9 +563,14 @@ def pubmed_extract(model, pmid, template, output, output_format, get_pmc, show_p
     default=False,
     help="Attempt to parse PubMed Central full text(s) instead of abstract(s) alone.",
 )
+@click.option(
+    "--max-text-length",
+    default=3000,
+    help="Maximum text length for each input chunk. Dependent on context size of model used."
+)
 @click.argument("search")
 def pubmed_annotate(
-    model, search, template, output, output_format, limit, get_pmc, show_prompt, **kwargs
+    model, search, template, output, output_format, limit, get_pmc, show_prompt, max_text_length, **kwargs
 ):
     """Retrieve a collection of PubMed IDs for a search term; annotate them using a template.
 
@@ -595,6 +600,7 @@ def pubmed_annotate(
 
     pubmed_annotate_limit = limit
     pmc = PubmedClient()
+    pmc.max_text_length = max_text_length
     pmids = pmc.get_pmids(search)
     if get_pmc:
         logging.info("Will try to retrieve PubMed Central texts.")
diff --git a/src/ontogpt/clients/pubmed_client.py b/src/ontogpt/clients/pubmed_client.py
index fb3acce81..d02fe6404 100644
--- a/src/ontogpt/clients/pubmed_client.py
+++ b/src/ontogpt/clients/pubmed_client.py
@@ -80,9 +80,10 @@ class PubmedClient:
     This class is a wrapper around the Entrez API.
     """
 
-    # TODO: this doesn't need to be hardcoded
-    # and may vary based on the model in use
-    max_text_length = 10000
+    # The maximum length of text, in characters, to include in
+    # a single input chunk. This class default may be overridden from the
+    # CLI with --max-text-length, whose own default (3000) differs.
+    max_text_length: int = 10000
 
     try:
         email = get_apikey_value("ncbi-email")

From d42001eefbea4904f2a62dd7c0aecfe81ac7302c Mon Sep 17 00:00:00 2001
From: caufieldjh <j.harry.caufield@gmail.com>
Date: Mon, 3 Jun 2024 14:08:38 -0400
Subject: [PATCH 3/3] Update alz template

---
 src/ontogpt/templates/alz_treat.py   |  7 +++----
 src/ontogpt/templates/alz_treat.yaml | 17 ++++++++++++++---
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/src/ontogpt/templates/alz_treat.py b/src/ontogpt/templates/alz_treat.py
index 1a2e4ff1e..e12808967 100644
--- a/src/ontogpt/templates/alz_treat.py
+++ b/src/ontogpt/templates/alz_treat.py
@@ -122,17 +122,16 @@ class AnnotatorResult(ConfiguredBaseModel):
 
 
 class Document(NamedEntity):
-    sections: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of full sections of the document. If semicolons are present in the section text, they should be replaced with (SEMICOLON) to avoid parsing errors.""")
+    sections: Optional[List[DocumentSection]] = Field(default_factory=list, description="""A semicolon-separated list of full sections of the document. If semicolons are present in the section text, they should be replaced with (SEMICOLON) to avoid parsing errors. A section is a major division of the document, such as an abstract, introduction, methods, results, discussion, or conclusion, or a subsection of one of these. The text should include the section title. A single phrase or ID is not a section. Do not format in Markdown.""")
     id: str = Field(..., description="""A unique identifier for the named entity""")
     label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
 
 
-class DocumentSection(NamedEntity):
+class DocumentSection(CompoundExpression):
+    summary: Optional[str] = Field(None, description="""A brief summary of the section, suitable for display in a table of contents or search results. This should be a single sentence or phrase, not a full paragraph. Do not format in Markdown.""")
     symptoms: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of symptoms mentioned in the section.""")
     diagnostics: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of diagnostic procedures mentioned in the section.""")
     treatments: Optional[List[str]] = Field(default_factory=list, description="""A semicolon-separated list of treatments mentioned in the section. These may be drugs or other therapeutic procedures.""")
-    id: str = Field(..., description="""A unique identifier for the named entity""")
-    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
 
 
 class Symptom(NamedEntity):
diff --git a/src/ontogpt/templates/alz_treat.yaml b/src/ontogpt/templates/alz_treat.yaml
index baf61a53d..037969e1c 100644
--- a/src/ontogpt/templates/alz_treat.yaml
+++ b/src/ontogpt/templates/alz_treat.yaml
@@ -4,7 +4,7 @@ title: Template for extracting Alzheimer's Disease Treatments
 description: >-
   Template for extracting Alzheimer's disease treatments and related concepts.
   Assumes a large input text, on the order of a full scientific article or
-  review.
+  review. Example input to try: the review with PMID:33302541.
 license: https://creativecommons.org/publicdomain/zero/1.0/
 prefixes:
   rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
@@ -30,10 +30,21 @@ classes:
           A semicolon-separated list of full sections of the document.
           If semicolons are present in the section text, they should
           be replaced with (SEMICOLON) to avoid parsing errors.
+          A section is a major division of the document, such as an abstract,
+          introduction, methods, results, discussion, or conclusion,
+          or a subsection of one of these. The text should include the section
+          title. A single phrase or ID is not a section.
+          Do not format in Markdown.
 
   DocumentSection:
-    is_a: NamedEntity
+    is_a: CompoundExpression
     attributes:
+      summary:
+        range: string
+        description: >-
+          A brief summary of the section, suitable for display in a table of
+          contents or search results. This should be a single sentence or
+          phrase, not a full paragraph. Do not format in Markdown.
       symptoms:
         range: Symptom
         multivalued: true
@@ -68,7 +79,7 @@ classes:
     id_prefixes:
       - MAXO
     annotations:
-      annotators: sqlite:obo:maxo, sqlite:obo:mesh, sqlite:obo:snomedct, sqlite:obo:ncit
+      annotators: sqlite:obo:maxo, sqlite:obo:mesh, bioportal:SNOMEDCT, sqlite:obo:ncit
       prompt: >-
         the name of a diagnostic procedure or test.
         Examples are MRI, PET scan, lumbar puncture, blood test, biopsy.