-
Notifications
You must be signed in to change notification settings - Fork 81
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Try extraction to nmdc-schema (#148)
The nmdc-schema is available at https://github.com/microbiomedata/nmdc-schema. Study/sample metadata should ideally be represented by ENVO terms, but this is not always the case. This PR introduces a template for extracting nmdc-schema-compatible content from study/sample data entries.
- Loading branch information
Showing
2 changed files
with
197 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,139 @@ | ||
from __future__ import annotations | ||
from datetime import datetime, date | ||
from enum import Enum | ||
from typing import List, Dict, Optional, Any, Union, Literal | ||
from pydantic import BaseModel as BaseModel, Field | ||
from linkml_runtime.linkml_model import Decimal | ||
|
||
# Version markers for this module. Both hold the literal string "None",
# not the ``None`` singleton.
metamodel_version = "None"
version = "None"
|
||
class WeakRefShimBaseModel(BaseModel):
    """Pydantic ``BaseModel`` subclass that declares a ``__weakref__`` slot."""

    # One-element tuple rather than a bare string: it names the same single
    # slot, but cannot be misread as an iterable of characters.
    __slots__ = ('__weakref__',)
|
||
class ConfiguredBaseModel(
    WeakRefShimBaseModel,
    validate_assignment=True,
    validate_all=True,
    underscore_attrs_are_private=True,
    extra='forbid',
    arbitrary_types_allowed=True,
):
    """Common base class for every model defined in this module.

    It adds no fields of its own. Its only job is to carry the model
    configuration, which is supplied through the class keyword arguments
    above and inherited by all subclasses.
    """
|
||
|
||
class NullDataOptions(str, Enum):
    """String-valued markers for data that is absent.

    Members are also ``str`` instances, and each member's value is
    identical to its name.
    """

    UNSPECIFIED_METHOD_OF_ADMINISTRATION = "UNSPECIFIED_METHOD_OF_ADMINISTRATION"
    NOT_APPLICABLE = "NOT_APPLICABLE"
    NOT_MENTIONED = "NOT_MENTIONED"
|
||
|
||
|
||
class Dataset(ConfiguredBaseModel):
    """A dataset entry: an identifier plus its environmental annotations.

    Every field is optional. The two list fields default to a fresh empty
    list per instance and hold plain strings.
    """

    packageid: Optional[str] = Field(
        None,
        description="""The internal identifier for the dataset""",
    )
    environmental_material: Optional[List[str]] = Field(
        default_factory=list,
        description="""the environmental material that was sampled""",
    )
    environments: Optional[List[str]] = Field(
        default_factory=list,
        description="""the environmental context in which the study was conducted""",
    )
|
||
|
||
|
||
class ExtractionResult(ConfiguredBaseModel):
    """
    A result of extracting knowledge on text

    Holds the input (id, title, text), the prompt, the raw completion
    output, and what was extracted from it. All fields are optional.
    """

    input_id: Optional[str] = Field(None)
    input_title: Optional[str] = Field(None)
    input_text: Optional[str] = Field(None)
    raw_completion_output: Optional[str] = Field(None)
    prompt: Optional[str] = Field(None)
    # Typed as ``Any``: the shape of the extracted object is not constrained here.
    extracted_object: Optional[Any] = Field(
        None,
        description="""The complex objects extracted from the text""",
    )
    named_entities: Optional[List[Any]] = Field(
        default_factory=list,
        description="""Named entities extracted from the text""",
    )
|
||
|
||
|
||
class NamedEntity(ConfiguredBaseModel):
    """An entity with an identifier and an optional label."""

    # NOTE(review): annotated ``str`` but given a default of None rather than
    # being marked required -- confirm whether ``id`` is meant to be mandatory.
    id: str = Field(None, description="""A unique identifier for the named entity""")
    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
|
||
|
||
|
||
class EnvironmentalMaterial(NamedEntity):
    """Named entity subclass for environmental materials.

    It adds no new fields; ``id`` and ``label`` are redeclared with the
    same types, defaults, and descriptions as on ``NamedEntity``.
    """

    id: str = Field(None, description="""A unique identifier for the named entity""")
    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
|
||
|
||
|
||
class Environment(NamedEntity):
    """Named entity subclass for environments.

    It adds no new fields; ``id`` and ``label`` are redeclared with the
    same types, defaults, and descriptions as on ``NamedEntity``.
    """

    id: str = Field(None, description="""A unique identifier for the named entity""")
    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
|
||
|
||
|
||
class CompoundExpression(ConfiguredBaseModel):
    """Base class that declares no fields; ``Triple`` in this module derives from it."""
|
||
|
||
|
||
class Triple(CompoundExpression):
    """
    Abstract parent for Relation Extraction tasks

    A subject/predicate/object statement with optional qualifiers. Every
    field is an optional string defaulting to None.
    """

    subject: Optional[str] = Field(None)
    predicate: Optional[str] = Field(None)
    object: Optional[str] = Field(None)
    qualifier: Optional[str] = Field(None, description="""A qualifier for the statements, e.g. \"NOT\" for negation""")
    subject_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the subject of the statement, e.g. \"high dose\" or \"intravenously administered\"""")
    object_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the object of the statement, e.g. \"severe\" or \"with additional complications\"""")
|
||
|
||
|
||
class TextWithTriples(ConfiguredBaseModel):
    """A publication paired with the triples associated with it."""

    # ``Publication`` is defined further down the module; the annotation is
    # resolved later by the ``update_forward_refs()`` call at the end of the file.
    publication: Optional[Publication] = Field(None)
    triples: Optional[List[Triple]] = Field(default_factory=list)
|
||
|
||
|
||
class RelationshipType(NamedEntity):
    """Named entity subclass for relationship types.

    It adds no new fields; ``id`` and ``label`` are redeclared with the
    same types, defaults, and descriptions as on ``NamedEntity``.
    """

    id: str = Field(None, description="""A unique identifier for the named entity""")
    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
|
||
|
||
|
||
class Publication(ConfiguredBaseModel):
    """A publication and its text content; every field is an optional string."""

    id: Optional[str] = Field(None, description="""The publication identifier""")
    title: Optional[str] = Field(None, description="""The title of the publication""")
    abstract: Optional[str] = Field(None, description="""The abstract of the publication""")
    # NOTE(review): no description is given for this field -- presumably the
    # title and abstract joined together; confirm against the schema.
    combined_text: Optional[str] = Field(None)
    full_text: Optional[str] = Field(None, description="""The full text of the publication""")
|
||
|
||
|
||
class AnnotatorResult(ConfiguredBaseModel):
    """A single annotator result.

    Pairs a subject text with an object id and object text. All three
    fields are optional strings defaulting to None.
    """

    subject_text: Optional[str] = Field(None)
    object_id: Optional[str] = Field(None)
    object_text: Optional[str] = Field(None)
|
||
|
||
|
||
|
||
# Update forward refs
# see https://pydantic-docs.helpmanual.io/usage/postponed_annotations/
# Annotations in this module are postponed, so each model resolves its
# annotations here, after every class has been defined. The order matches
# the order in which the classes are declared.
for _model in (
    Dataset,
    ExtractionResult,
    NamedEntity,
    EnvironmentalMaterial,
    Environment,
    CompoundExpression,
    Triple,
    TextWithTriples,
    RelationshipType,
    Publication,
    AnnotatorResult,
):
    _model.update_forward_refs()
del _model  # keep the loop variable out of the module namespace
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
# LinkML schema: template for extracting nmdc-schema-oriented fields
# (dataset identifier, environmental materials, environments) from data entries.
id: http://w3id.org/ontogpt/nmdc-schema-data
name: nmdc-schema-data
title: nmdc-schema-data
# NOTE(review): the description mentions three ENVO levels plus NLCD / FAO
# conversion, but only two ENVO-annotated attributes are defined below --
# confirm whether more slots are planned.
description: >-
  A template for populating nmdc-schema required slots
  from data entries. Primarily, this involves three different
  levels corresponding to ENVO terms, as well as conversion
  of NLCD values and FAO soil taxonomy classes to ENVO.
license: https://creativecommons.org/publicdomain/zero/1.0/
prefixes:
  linkml: https://w3id.org/linkml/
  # NOTE(review): unlike the linkml prefix, this expansion has no trailing
  # "/" or "#" separator -- confirm the intended URI form.
  nmdcsd: http://w3id.org/ontogpt/nmdc-schema-data

default_prefix: nmdcsd
default_range: string

imports:
  - linkml:types
  - core

classes:
  Dataset:
    tree_root: true
    attributes:
      packageid:
        description: The internal identifier for the dataset
        annotations:
          prompt: single unique identifier for the dataset
        range: string
        multivalued: false

      environmental_material:
        description: the environmental material that was sampled
        annotations:
          prompt: semicolon-separated list of environmental materials
        range: EnvironmentalMaterial
        multivalued: true

      environments:
        description: the environmental context in which the study was conducted
        annotations:
          prompt: semicolon-separated list of environmental contexts in which the study was conducted
        range: Environment
        multivalued: true

  # Both entity classes below use the ENVO id prefix and the same annotator.
  EnvironmentalMaterial:
    is_a: NamedEntity
    id_prefixes:
      - ENVO
    annotations:
      annotators: sqlite:obo:envo

  Environment:
    is_a: NamedEntity
    id_prefixes:
      - ENVO
    annotations:
      annotators: sqlite:obo:envo