Skip to content

Commit

Permalink
Try extraction to nmdc-schema (#148)
Browse files Browse the repository at this point in the history
This is the nmdc-schema: https://github.com/microbiomedata/nmdc-schema
Study/sample metadata should ideally be represented by ENVO terms, but
this is not always the case.
This PR introduces a template for extracting nmdc-schema-compatible
content from study/sample data entries.
  • Loading branch information
caufieldjh authored Dec 22, 2023
2 parents cdd896b + 79c7b06 commit 4355202
Show file tree
Hide file tree
Showing 2 changed files with 197 additions and 0 deletions.
139 changes: 139 additions & 0 deletions src/ontogpt/templates/nmdc_schema_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
from __future__ import annotations
from datetime import datetime, date
from enum import Enum
from typing import List, Dict, Optional, Any, Union, Literal
from pydantic import BaseModel as BaseModel, Field
from linkml_runtime.linkml_model import Decimal

# Version metadata for this module. Both values are the literal string "None"
# (not the ``None`` object) -- presumably placeholders written by the code
# generator when the source schema declares no version; TODO confirm.
metamodel_version = "None"
version = "None"

class WeakRefShimBaseModel(BaseModel):
    # Declares a single ``__weakref__`` slot on top of pydantic's BaseModel.
    # NOTE(review): judging by the class name this is a shim so that model
    # instances can be weakly referenced -- confirm against the generator.
    __slots__ = "__weakref__"

# Common base class for every model in this module. The pydantic (v1-style)
# configuration is passed as class keyword arguments instead of an inner
# ``Config`` class. Per the pydantic v1 configuration reference:
#   validate_assignment          -- re-validate values on attribute assignment
#   validate_all                 -- validate field defaults as well
#   underscore_attrs_are_private -- underscore-prefixed attributes are private
#   extra="forbid"               -- reject fields not declared on the model
#   arbitrary_types_allowed      -- permit non-pydantic types as field types
class ConfiguredBaseModel(
    WeakRefShimBaseModel,
    validate_assignment=True,
    validate_all=True,
    underscore_attrs_are_private=True,
    extra="forbid",
    arbitrary_types_allowed=True,
):
    pass


class NullDataOptions(str, Enum):
    # String-valued enumeration: each member's value is identical to its
    # name, and members compare equal to that plain string because the
    # class also derives from ``str``.

    UNSPECIFIED_METHOD_OF_ADMINISTRATION = "UNSPECIFIED_METHOD_OF_ADMINISTRATION"
    NOT_APPLICABLE = "NOT_APPLICABLE"
    NOT_MENTIONED = "NOT_MENTIONED"



class Dataset(ConfiguredBaseModel):
    # One dataset record: an optional identifier plus two lists of plain
    # strings. Both lists default to a fresh empty list per instance.

    packageid: Optional[str] = Field(
        default=None,
        description="The internal identifier for the dataset",
    )
    environmental_material: Optional[List[str]] = Field(
        default_factory=list,
        description="the environmental material that was sampled",
    )
    environments: Optional[List[str]] = Field(
        default_factory=list,
        description=(
            "the environmental context in which the study was conducted"
        ),
    )



class ExtractionResult(ConfiguredBaseModel):
    """
    A result of extracting knowledge on text
    """

    # Every field is optional. The five text fields default to None and
    # carry no description of their own.
    input_id: Optional[str] = Field(default=None)
    input_title: Optional[str] = Field(default=None)
    input_text: Optional[str] = Field(default=None)
    raw_completion_output: Optional[str] = Field(default=None)
    prompt: Optional[str] = Field(default=None)

    # Untyped payloads: any value is accepted for the extracted object and
    # for each element of the named-entity list.
    extracted_object: Optional[Any] = Field(
        default=None,
        description="The complex objects extracted from the text",
    )
    named_entities: Optional[List[Any]] = Field(
        default_factory=list,
        description="Named entities extracted from the text",
    )



class NamedEntity(ConfiguredBaseModel):
    # Base class for identified, labelled things; EnvironmentalMaterial,
    # Environment and RelationshipType in this module derive from it.

    # ``id`` defaults to None, so its annotation is Optional[str]. The
    # previous ``str`` annotation contradicted the default: pydantic v1
    # silently treats a None-defaulted field as optional, so runtime
    # behaviour is unchanged and only the declared type is now accurate.
    id: Optional[str] = Field(
        default=None,
        description="A unique identifier for the named entity",
    )
    label: Optional[str] = Field(
        default=None,
        description="The label (name) of the named thing",
    )



class EnvironmentalMaterial(NamedEntity):
    # Named entity for an environmental material. It re-declares the same
    # two fields as NamedEntity and adds nothing new.

    # ``id`` defaults to None, so it is annotated Optional[str] rather than
    # ``str``; pydantic v1 already treats a None-defaulted field as
    # optional, so only the declared type changes, not validation.
    id: Optional[str] = Field(
        default=None,
        description="A unique identifier for the named entity",
    )
    label: Optional[str] = Field(
        default=None,
        description="The label (name) of the named thing",
    )



class Environment(NamedEntity):
    # Named entity for an environment. It re-declares the same two fields
    # as NamedEntity and adds nothing new.

    # ``id`` defaults to None, so it is annotated Optional[str] rather than
    # ``str``; pydantic v1 already treats a None-defaulted field as
    # optional, so only the declared type changes, not validation.
    id: Optional[str] = Field(
        default=None,
        description="A unique identifier for the named entity",
    )
    label: Optional[str] = Field(
        default=None,
        description="The label (name) of the named thing",
    )



class CompoundExpression(ConfiguredBaseModel):
    # Declares no fields of its own; Triple in this module derives from it.
    pass



class Triple(CompoundExpression):
    """
    Abstract parent for Relation Extraction tasks
    """

    # The statement itself: three optional strings.
    subject: Optional[str] = Field(default=None)
    predicate: Optional[str] = Field(default=None)
    object: Optional[str] = Field(default=None)

    # Optional qualifiers on the whole statement, its subject and its object.
    qualifier: Optional[str] = Field(
        default=None,
        description='A qualifier for the statements, e.g. "NOT" for negation',
    )
    subject_qualifier: Optional[str] = Field(
        default=None,
        description=(
            "An optional qualifier or modifier for the subject of the "
            'statement, e.g. "high dose" or "intravenously administered"'
        ),
    )
    object_qualifier: Optional[str] = Field(
        default=None,
        description=(
            "An optional qualifier or modifier for the object of the "
            'statement, e.g. "severe" or "with additional complications"'
        ),
    )



class TextWithTriples(ConfiguredBaseModel):
    # Pairs an optional publication with a list of triples.
    # ``Publication`` is defined further down in this module; the name is
    # only a postponed annotation here and is resolved by the module-level
    # ``update_forward_refs()`` calls.

    publication: Optional[Publication] = Field(default=None)
    triples: Optional[List[Triple]] = Field(default_factory=list)



class RelationshipType(NamedEntity):
    # Named entity for a relationship type. It re-declares the same two
    # fields as NamedEntity and adds nothing new.

    # ``id`` defaults to None, so it is annotated Optional[str] rather than
    # ``str``; pydantic v1 already treats a None-defaulted field as
    # optional, so only the declared type changes, not validation.
    id: Optional[str] = Field(
        default=None,
        description="A unique identifier for the named entity",
    )
    label: Optional[str] = Field(
        default=None,
        description="The label (name) of the named thing",
    )



class Publication(ConfiguredBaseModel):
    # A publication record; every field is an optional string defaulting
    # to None.

    id: Optional[str] = Field(
        default=None,
        description="The publication identifier",
    )
    title: Optional[str] = Field(
        default=None,
        description="The title of the publication",
    )
    abstract: Optional[str] = Field(
        default=None,
        description="The abstract of the publication",
    )
    # No description is declared for this field.
    combined_text: Optional[str] = Field(default=None)
    full_text: Optional[str] = Field(
        default=None,
        description="The full text of the publication",
    )



class AnnotatorResult(ConfiguredBaseModel):
    # Three optional strings, all defaulting to None.
    # NOTE(review): presumably the output of an ontology annotator mapping a
    # text span to an object id/label -- confirm against the caller.

    subject_text: Optional[str] = Field(default=None)
    object_id: Optional[str] = Field(default=None)
    object_text: Optional[str] = Field(default=None)



# Resolve postponed annotations on every model declared in this module.
# Models are processed in their order of definition.
# see https://pydantic-docs.helpmanual.io/usage/postponed_annotations/
for _model in (
    Dataset,
    ExtractionResult,
    NamedEntity,
    EnvironmentalMaterial,
    Environment,
    CompoundExpression,
    Triple,
    TextWithTriples,
    RelationshipType,
    Publication,
    AnnotatorResult,
):
    _model.update_forward_refs()
del _model  # keep the loop variable out of the module namespace

58 changes: 58 additions & 0 deletions src/ontogpt/templates/nmdc_schema_data.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# LinkML schema describing a template for pulling nmdc-schema-related
# environmental annotations out of study/sample data entries.
id: http://w3id.org/ontogpt/nmdc-schema-data
name: nmdc-schema-data
title: nmdc-schema-data
# NOTE(review): the description mentions three ENVO levels and NLCD/FAO
# conversion, but this schema defines only two ENVO-ranged attributes
# (environmental_material, environments) -- confirm intent.
description: >-
  A template for populating nmdc-schema required slots
  from data entries. Primarily, this involves three different
  levels corresponding to ENVO terms, as well as conversion
  of NLCD values and FAO soil taxonomy classes to ENVO.
license: https://creativecommons.org/publicdomain/zero/1.0/
prefixes:
  linkml: https://w3id.org/linkml/
  nmdcsd: http://w3id.org/ontogpt/nmdc-schema-data

default_prefix: nmdcsd
# Attributes without an explicit range are strings.
default_range: string

imports:
  - linkml:types
  # presumably the shared core template that supplies NamedEntity -- verify
  - core

classes:
  # Root class of the template (tree_root: true).
  Dataset:
    tree_root: true
    attributes:
      # Single-valued string identifier.
      packageid:
        description: The internal identifier for the dataset
        annotations:
          prompt: single unique identifier for the dataset
        range: string
        multivalued: false

      # Multivalued; values are EnvironmentalMaterial entities.
      environmental_material:
        description: the environmental material that was sampled
        annotations:
          prompt: semicolon-separated list of environmental materials
        range: EnvironmentalMaterial
        multivalued: true

      # Multivalued; values are Environment entities.
      environments:
        description: the environmental context in which the study was conducted
        annotations:
          prompt: semicolon-separated list of environmental contexts in which the study was conducted
        range: Environment
        multivalued: true

  # Named entity whose identifiers use the ENVO prefix; annotated with the
  # sqlite:obo:envo annotator.
  EnvironmentalMaterial:
    is_a: NamedEntity
    id_prefixes:
      - ENVO
    annotations:
      annotators: sqlite:obo:envo

  # Same ENVO prefix and annotator configuration as EnvironmentalMaterial.
  Environment:
    is_a: NamedEntity
    id_prefixes:
      - ENVO
    annotations:
      annotators: sqlite:obo:envo

0 comments on commit 4355202

Please sign in to comment.