Skip to content

Commit

Permalink
Add template for alternate gene symbol extraction strategy (#390)
Browse files Browse the repository at this point in the history
  • Loading branch information
caufieldjh authored May 31, 2024
2 parents c25f581 + 0cbeae9 commit c1ef136
Show file tree
Hide file tree
Showing 2 changed files with 195 additions and 0 deletions.
150 changes: 150 additions & 0 deletions src/ontogpt/templates/gene_extraction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
from __future__ import annotations
from datetime import (
datetime,
date
)
from decimal import Decimal
from enum import Enum
import re
import sys
from typing import (
Any,
List,
Literal,
Dict,
Optional,
Union
)
from pydantic.version import VERSION as PYDANTIC_VERSION

# Select the import set by pydantic major version. The major component is
# taken as everything before the first dot rather than only the first
# character, so a version such as "10.0.0" is not misread as major version 1.
if int(PYDANTIC_VERSION.split(".")[0]) >= 2:
    from pydantic import (
        BaseModel,
        ConfigDict,
        Field,
        field_validator,
    )
else:
    from pydantic import (
        BaseModel,
        Field,
        validator,
    )

# Both version markers hold the literal string "None" (not the None object).
metamodel_version = version = "None"


class ConfiguredBaseModel(BaseModel):
    # Common base class for the models declared in this module; it only
    # carries the shared pydantic configuration below.
    # NOTE(review): ConfigDict is imported only on the pydantic >= 2 branch
    # of this file's imports, so this class presumably requires pydantic v2.
    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        extra="forbid",
        strict=False,
        use_enum_values=True,
        validate_assignment=True,
        validate_default=True,
    )


class NullDataOptions(str, Enum):
    # String-backed enumeration: every member's value is identical to its
    # name, and members compare equal to the corresponding plain string.
    UNSPECIFIED_METHOD_OF_ADMINISTRATION = "UNSPECIFIED_METHOD_OF_ADMINISTRATION"
    NOT_APPLICABLE = "NOT_APPLICABLE"
    NOT_MENTIONED = "NOT_MENTIONED"


class ExtractionResult(ConfiguredBaseModel):
    """
    A result of extracting knowledge on text
    """
    # All fields are optional. The string fields default to None, while
    # named_entities defaults to a fresh empty list per instance.
    input_id: Optional[str] = Field(default=None)
    input_title: Optional[str] = Field(default=None)
    input_text: Optional[str] = Field(default=None)
    raw_completion_output: Optional[str] = Field(default=None)
    prompt: Optional[str] = Field(default=None)
    extracted_object: Optional[Any] = Field(
        default=None,
        description="The complex objects extracted from the text",
    )
    named_entities: Optional[List[Any]] = Field(
        default_factory=list,
        description="Named entities extracted from the text",
    )


class NamedEntity(ConfiguredBaseModel):
    # Model with a required identifier and an optional label.
    id: str = Field(
        ...,
        description="A unique identifier for the named entity",
    )
    label: Optional[str] = Field(
        default=None,
        description="The label (name) of the named thing",
    )


class CompoundExpression(ConfiguredBaseModel):
    # Declares no fields of its own; Triple in this module derives from it.
    ...


class Triple(CompoundExpression):
    """
    Abstract parent for Relation Extraction tasks
    """
    # Every slot is an optional string defaulting to None.
    subject: Optional[str] = Field(default=None)
    predicate: Optional[str] = Field(default=None)
    object: Optional[str] = Field(default=None)
    qualifier: Optional[str] = Field(
        default=None,
        description='A qualifier for the statements, e.g. "NOT" for negation',
    )
    subject_qualifier: Optional[str] = Field(
        default=None,
        description=(
            "An optional qualifier or modifier for the subject of the "
            'statement, e.g. "high dose" or "intravenously administered"'
        ),
    )
    object_qualifier: Optional[str] = Field(
        default=None,
        description=(
            "An optional qualifier or modifier for the object of the "
            'statement, e.g. "severe" or "with additional complications"'
        ),
    )


class TextWithTriples(ConfiguredBaseModel):
    """
    A text containing one or more relations of the Triple type.
    """
    # Publication is declared later in this module, so this annotation is a
    # forward reference.
    publication: Optional[Publication] = Field(default=None)
    # Defaults to a new empty list for each instance.
    triples: Optional[List[Triple]] = Field(default_factory=list)


class TextWithEntity(ConfiguredBaseModel):
    """
    A text containing one or more instances of a single type of entity.
    """
    # Publication is declared later in this module, so this annotation is a
    # forward reference.
    publication: Optional[Publication] = Field(default=None)
    # Entities are held as plain strings; defaults to a new empty list.
    entities: Optional[List[str]] = Field(default_factory=list)


class RelationshipType(NamedEntity):
    # Re-declares the same id/label fields as NamedEntity, with identical
    # types, defaults and descriptions.
    id: str = Field(
        ...,
        description="A unique identifier for the named entity",
    )
    label: Optional[str] = Field(
        default=None,
        description="The label (name) of the named thing",
    )


class Publication(ConfiguredBaseModel):
    # All fields are optional strings defaulting to None.
    id: Optional[str] = Field(
        default=None,
        description="The publication identifier",
    )
    title: Optional[str] = Field(
        default=None,
        description="The title of the publication",
    )
    abstract: Optional[str] = Field(
        default=None,
        description="The abstract of the publication",
    )
    combined_text: Optional[str] = Field(default=None)
    full_text: Optional[str] = Field(
        default=None,
        description="The full text of the publication",
    )


class AnnotatorResult(ConfiguredBaseModel):
    # Three optional string fields, each defaulting to None.
    subject_text: Optional[str] = Field(default=None)
    object_id: Optional[str] = Field(default=None)
    object_text: Optional[str] = Field(default=None)


class AcronymList(NamedEntity):
    # Genes are stored as strings; defaults to a new empty list per instance.
    genes: Optional[List[str]] = Field(default_factory=list)
    # id and label repeat the NamedEntity declarations unchanged.
    id: str = Field(
        ...,
        description="A unique identifier for the named entity",
    )
    label: Optional[str] = Field(
        default=None,
        description="The label (name) of the named thing",
    )


class Gene(NamedEntity):
    """
    a single gene symbol.
    """
    # id and label repeat the NamedEntity declarations unchanged.
    id: str = Field(
        ...,
        description="A unique identifier for the named entity",
    )
    label: Optional[str] = Field(
        default=None,
        description="The label (name) of the named thing",
    )


# Model rebuild
# see https://pydantic-docs.helpmanual.io/usage/models/#rebuilding-a-model
# Each model is rebuilt once, in the order the classes are declared above.
for _model in (
    ExtractionResult,
    NamedEntity,
    CompoundExpression,
    Triple,
    TextWithTriples,
    TextWithEntity,
    RelationshipType,
    Publication,
    AnnotatorResult,
    AcronymList,
    Gene,
):
    _model.model_rebuild()
del _model  # keep the loop variable out of the module namespace
45 changes: 45 additions & 0 deletions src/ontogpt/templates/gene_extraction.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# LinkML schema for the gene extraction template.
id: http://w3id.org/ontogpt/gene_extraction
name: gene_extraction
title: Gene Extraction Template
description: >-
  A template demonstrating a general strategy for extracting gene symbols from
  ambiguous context.
license: https://creativecommons.org/publicdomain/zero/1.0/
prefixes:
  rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
  HGNC: http://identifiers.org/hgnc/
  # The prefix URI ends with a separator so that CURIEs under the default
  # prefix expand to .../gene_extraction/<Name> instead of being glued
  # directly onto "gene_extraction".
  geneextraction: http://w3id.org/ontogpt/gene_extraction/
  linkml: https://w3id.org/linkml/

default_prefix: geneextraction
default_range: string

imports:
  - linkml:types
  - core

classes:
  # Root class of the template: holds the list of candidate gene symbols.
  AcronymList:
    tree_root: true
    is_a: NamedEntity
    attributes:
      genes:
        annotations:
          prompt: >-
            A semicolon-delimited list of potential gene symbols within
            the text. Include all acronyms that could be gene symbols,
            i.e., any string of capital letters, particularly if it is
            followed by a number. Examples of gene symbols include:
            BRCA1, TP53, EGR2, ITGB6, PRKCD. Gene symbols may resemble
            acronyms referring to diseases or phenotypes,
            and may be surrounded by punctuation or other text.
        range: Gene
        multivalued: true

  Gene:
    is_a: NamedEntity
    description: a single gene symbol.
    id_prefixes:
      - HGNC
    annotations:
      annotators: bioportal:hgnc-nr, bioportal:gexo

0 comments on commit c1ef136

Please sign in to comment.