Try extraction to nmdc-schema (#148)

This PR targets the nmdc-schema (https://github.com/microbiomedata/nmdc-schema). Study/sample metadata should ideally be represented by ENVO terms, but this is not always the case. This PR introduces a template for extracting nmdc-schema-compatible content from study/sample data entries.
monarch-initiative · Dec 22, 2023 · 4355202 · 4355202
2 parents cdd896b + 79c7b06
commit 4355202
Show file tree

Hide file tree

Showing 2 changed files with 197 additions and 0 deletions.
diff --git a/src/ontogpt/templates/nmdc_schema_data.py b/src/ontogpt/templates/nmdc_schema_data.py
@@ -0,0 +1,139 @@
+from __future__ import annotations
+from datetime import datetime, date
+from enum import Enum
+from typing import List, Dict, Optional, Any, Union, Literal
+from pydantic import BaseModel as BaseModel, Field
+from linkml_runtime.linkml_model import Decimal
+
+metamodel_version = "None"  # the literal string "None", not the None singleton
+version = "None"  # likewise a string placeholder rather than a real version
+
+class WeakRefShimBaseModel(BaseModel):
+   __slots__ = '__weakref__'
+
+class ConfiguredBaseModel(WeakRefShimBaseModel,
+                validate_assignment = True, 
+                validate_all = True, 
+                underscore_attrs_are_private = True, 
+                extra = 'forbid', 
+                arbitrary_types_allowed = True):
+    pass                    
+
+
+class NullDataOptions(str, Enum):
+    """String-valued enumeration whose members each equal their own name."""
+    UNSPECIFIED_METHOD_OF_ADMINISTRATION = "UNSPECIFIED_METHOD_OF_ADMINISTRATION"
+    NOT_APPLICABLE = "NOT_APPLICABLE"
+    NOT_MENTIONED = "NOT_MENTIONED"
+
+
+
+class Dataset(ConfiguredBaseModel):
+
+    packageid: Optional[str] = Field(None, description="""The internal identifier for the dataset""")
+    environmental_material: Optional[List[str]] = Field(default_factory=list, description="""the environmental material that was sampled""")
+    environments: Optional[List[str]] = Field(default_factory=list, description="""the environmental context in which the study was conducted""")
+
+
+
+class ExtractionResult(ConfiguredBaseModel):
+    """
+    A result of extracting knowledge on text
+    """
+    input_id: Optional[str] = Field(None)
+    input_title: Optional[str] = Field(None)
+    input_text: Optional[str] = Field(None)
+    raw_completion_output: Optional[str] = Field(None)
+    prompt: Optional[str] = Field(None)
+    extracted_object: Optional[Any] = Field(None, description="""The complex objects extracted from the text""")
+    named_entities: Optional[List[Any]] = Field(default_factory=list, description="""Named entities extracted from the text""")
+
+
+
+class NamedEntity(ConfiguredBaseModel):
+
+    id: str = Field(None, description="""A unique identifier for the named entity""")
+    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
+
+
+
+class EnvironmentalMaterial(NamedEntity):
+
+    id: str = Field(None, description="""A unique identifier for the named entity""")
+    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
+
+
+
+class Environment(NamedEntity):
+
+    id: str = Field(None, description="""A unique identifier for the named entity""")
+    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
+
+
+
+class CompoundExpression(ConfiguredBaseModel):
+
+    None
+
+
+
+class Triple(CompoundExpression):
+    """
+    Abstract parent for Relation Extraction tasks
+    """
+    subject: Optional[str] = Field(None)
+    predicate: Optional[str] = Field(None)
+    object: Optional[str] = Field(None)
+    qualifier: Optional[str] = Field(None, description="""A qualifier for the statements, e.g. \"NOT\" for negation""")
+    subject_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the subject of the statement, e.g. \"high dose\" or \"intravenously administered\"""")
+    object_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the object of the statement, e.g. \"severe\" or \"with additional complications\"""")
+
+
+
+class TextWithTriples(ConfiguredBaseModel):
+
+    publication: Optional[Publication] = Field(None)
+    triples: Optional[List[Triple]] = Field(default_factory=list)
+
+
+
+class RelationshipType(NamedEntity):
+
+    id: str = Field(None, description="""A unique identifier for the named entity""")
+    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
+
+
+
+class Publication(ConfiguredBaseModel):
+
+    id: Optional[str] = Field(None, description="""The publication identifier""")
+    title: Optional[str] = Field(None, description="""The title of the publication""")
+    abstract: Optional[str] = Field(None, description="""The abstract of the publication""")
+    combined_text: Optional[str] = Field(None)
+    full_text: Optional[str] = Field(None, description="""The full text of the publication""")
+
+
+
+class AnnotatorResult(ConfiguredBaseModel):
+
+    subject_text: Optional[str] = Field(None)
+    object_id: Optional[str] = Field(None)
+    object_text: Optional[str] = Field(None)
+
+
+
+
+# Update forward refs
+# see https://pydantic-docs.helpmanual.io/usage/postponed_annotations/
+Dataset.update_forward_refs()
+ExtractionResult.update_forward_refs()
+NamedEntity.update_forward_refs()
+EnvironmentalMaterial.update_forward_refs()
+Environment.update_forward_refs()
+CompoundExpression.update_forward_refs()
+Triple.update_forward_refs()
+TextWithTriples.update_forward_refs()
+RelationshipType.update_forward_refs()
+Publication.update_forward_refs()
+AnnotatorResult.update_forward_refs()
+
diff --git a/src/ontogpt/templates/nmdc_schema_data.yaml b/src/ontogpt/templates/nmdc_schema_data.yaml
@@ -0,0 +1,58 @@
+id: http://w3id.org/ontogpt/nmdc-schema-data
+name: nmdc-schema-data
+title: nmdc-schema-data
+description: >-
+  A template for populating nmdc-schema required slots
+  from data entries. Primarily, this involves three different
+  levels corresponding to ENVO terms, as well as conversion
+  of NLCD values and FAO soil taxonomy classes to ENVO.
+license: https://creativecommons.org/publicdomain/zero/1.0/
+prefixes:
+  linkml: https://w3id.org/linkml/
+  nmdcsd: http://w3id.org/ontogpt/nmdc-schema-data/  # trailing slash: expansion is prepended to local names
+
+default_prefix: nmdcsd
+default_range: string
+
+imports:
+  - linkml:types
+  - core
+
+classes:
+  Dataset:
+    tree_root: true  # marks this class as the root of the instance tree
+    attributes:
+      packageid:
+        description: The internal identifier for the dataset
+        annotations:  # presumably the per-slot instruction for the extractor -- confirm
+          prompt: single unique identifier for the dataset
+        range: string  # same as default_range
+        multivalued: false
+
+      environmental_material:
+        description: the environmental material that was sampled
+        annotations:
+          prompt: semicolon-separated list of environmental materials
+        range: EnvironmentalMaterial  # class defined later in this file
+        multivalued: true
+
+      environments:
+        description: the environmental context in which the study was conducted
+        annotations:
+          prompt: semicolon-separated list of environmental contexts in which the study was conducted
+        range: Environment  # class defined later in this file
+        multivalued: true
+
+  EnvironmentalMaterial:
+    is_a: NamedEntity  # not defined in this file; presumably supplied by the `core` import
+    id_prefixes:  # identifiers are expected to use the ENVO prefix
+      - ENVO
+    annotations:
+      annotators: sqlite:obo:envo  # presumably the ENVO source used to ground terms -- confirm
+
+  Environment:
+    is_a: NamedEntity  # not defined in this file; presumably supplied by the `core` import
+    id_prefixes:  # identifiers are expected to use the ENVO prefix
+      - ENVO
+    annotations:
+      annotators: sqlite:obo:envo  # presumably the ENVO source used to ground terms -- confirm