-
Notifications
You must be signed in to change notification settings - Fork 81
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add template for alternate gene symbol extraction strategy (#390)
- Loading branch information
Showing
2 changed files
with
195 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
from __future__ import annotations | ||
from datetime import ( | ||
datetime, | ||
date | ||
) | ||
from decimal import Decimal | ||
from enum import Enum | ||
import re | ||
import sys | ||
from typing import ( | ||
Any, | ||
List, | ||
Literal, | ||
Dict, | ||
Optional, | ||
Union | ||
) | ||
from pydantic.version import VERSION as PYDANTIC_VERSION | ||
if int(PYDANTIC_VERSION[0])>=2: | ||
from pydantic import ( | ||
BaseModel, | ||
ConfigDict, | ||
Field, | ||
field_validator | ||
) | ||
else: | ||
from pydantic import ( | ||
BaseModel, | ||
Field, | ||
validator | ||
) | ||
|
||
# Version markers for this module. Both hold the literal string "None"
# (not the None singleton) -- presumably because the source schema declares
# no version; confirm against the generator before relying on them.
metamodel_version = "None"
version = "None"
|
||
|
||
class ConfiguredBaseModel(BaseModel):
    # Shared base class for every model in this module; it carries only the
    # common pydantic configuration and declares no fields of its own.
    #
    # ConfigDict is imported at the top of the file only when pydantic >= 2,
    # so the v2-style ``model_config`` is built on that branch alone. On
    # pydantic 1.x the same settings are expressed through an inner ``Config``
    # class instead of raising NameError at import time.
    if int(PYDANTIC_VERSION.split(".")[0]) >= 2:
        model_config = ConfigDict(
            validate_assignment=True,
            validate_default=True,
            extra="forbid",
            arbitrary_types_allowed=True,
            use_enum_values=True,
            strict=False,
        )
    else:
        class Config:
            validate_assignment = True
            validate_all = True  # pydantic 1.x spelling of validate_default
            extra = "forbid"
            arbitrary_types_allowed = True
            use_enum_values = True
            # ``strict`` has no 1.x counterpart; 1.x always coerces, which
            # matches strict=False above.
|
||
|
||
class NullDataOptions(str, Enum):
    # String-valued enum: every member's value equals its own name, and the
    # ``str`` mixin makes members compare equal to plain strings.
    UNSPECIFIED_METHOD_OF_ADMINISTRATION = "UNSPECIFIED_METHOD_OF_ADMINISTRATION"
    NOT_APPLICABLE = "NOT_APPLICABLE"
    NOT_MENTIONED = "NOT_MENTIONED"
|
||
|
||
class ExtractionResult(ConfiguredBaseModel):
    """
    A result of extracting knowledge on text
    """
    # Every field is optional and defaults to None, except named_entities.
    input_id: Optional[str] = Field(None)
    input_title: Optional[str] = Field(None)
    input_text: Optional[str] = Field(None)
    raw_completion_output: Optional[str] = Field(None)
    prompt: Optional[str] = Field(None)
    extracted_object: Optional[Any] = Field(None, description="""The complex objects extracted from the text""")
    # default_factory gives each instance its own fresh empty list.
    named_entities: Optional[List[Any]] = Field(default_factory=list, description="""Named entities extracted from the text""")
|
||
|
||
class NamedEntity(ConfiguredBaseModel):
    # Base model for identified things: ``id`` is required (Field(...)),
    # ``label`` is optional and defaults to None.
    id: str = Field(..., description="""A unique identifier for the named entity""")
    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
|
||
|
||
class CompoundExpression(ConfiguredBaseModel):
    # Field-less parent class; in this module it is subclassed by Triple.
    pass
|
||
|
||
class Triple(CompoundExpression):
    """
    Abstract parent for Relation Extraction tasks
    """
    # All six slots are optional strings that default to None.
    subject: Optional[str] = Field(None)
    predicate: Optional[str] = Field(None)
    # NOTE: the field name ``object`` shadows the builtin inside this class
    # body only; it is part of the model's interface and must not be renamed.
    object: Optional[str] = Field(None)
    qualifier: Optional[str] = Field(None, description="""A qualifier for the statements, e.g. \"NOT\" for negation""")
    subject_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the subject of the statement, e.g. \"high dose\" or \"intravenously administered\"""")
    object_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the object of the statement, e.g. \"severe\" or \"with additional complications\"""")
|
||
|
||
class TextWithTriples(ConfiguredBaseModel):
    """
    A text containing one or more relations of the Triple type.
    """
    # ``Publication`` is defined later in this module, so this annotation is a
    # forward reference; it relies on the lazy annotations enabled by the
    # ``from __future__ import annotations`` line at the top of the file.
    publication: Optional[Publication] = Field(None)
    # default_factory gives each instance its own fresh empty list.
    triples: Optional[List[Triple]] = Field(default_factory=list)
|
||
|
||
class TextWithEntity(ConfiguredBaseModel):
    """
    A text containing one or more instances of a single type of entity.
    """
    # ``Publication`` is defined later in this module, so this annotation is a
    # forward reference; it relies on the lazy annotations enabled by the
    # ``from __future__ import annotations`` line at the top of the file.
    publication: Optional[Publication] = Field(None)
    # Entities are held as plain strings; each instance gets its own list.
    entities: Optional[List[str]] = Field(default_factory=list)
|
||
|
||
class RelationshipType(NamedEntity):
    # Adds no new fields: ``id`` and ``label`` are re-declared here with the
    # same types, defaults and descriptions as on NamedEntity.
    id: str = Field(..., description="""A unique identifier for the named entity""")
    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
|
||
|
||
class Publication(ConfiguredBaseModel):
    # Record describing a publication; every field is an optional string
    # that defaults to None.
    id: Optional[str] = Field(None, description="""The publication identifier""")
    title: Optional[str] = Field(None, description="""The title of the publication""")
    abstract: Optional[str] = Field(None, description="""The abstract of the publication""")
    # No description is declared for combined_text -- presumably title and
    # abstract joined together; confirm against the producer of this field.
    combined_text: Optional[str] = Field(None)
    full_text: Optional[str] = Field(None, description="""The full text of the publication""")
|
||
|
||
class AnnotatorResult(ConfiguredBaseModel):
    # Three optional string fields, all defaulting to None. No descriptions
    # are declared; presumably this pairs a matched text span with the
    # identifier/text it was matched to -- confirm with the code that fills it.
    subject_text: Optional[str] = Field(None)
    object_id: Optional[str] = Field(None)
    object_text: Optional[str] = Field(None)
|
||
|
||
class AcronymList(NamedEntity):
    # NamedEntity extended with ``genes``, a list of strings that defaults to
    # a fresh empty list per instance.
    genes: Optional[List[str]] = Field(default_factory=list)
    # ``id`` and ``label`` are re-declared with the same types, defaults and
    # descriptions as on NamedEntity.
    id: str = Field(..., description="""A unique identifier for the named entity""")
    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
|
||
|
||
class Gene(NamedEntity):
    """
    a single gene symbol.
    """
    # Adds no new fields: ``id`` and ``label`` are re-declared with the same
    # types, defaults and descriptions as on NamedEntity.
    id: str = Field(..., description="""A unique identifier for the named entity""")
    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
|
||
|
||
# Model rebuild
# see https://pydantic-docs.helpmanual.io/usage/models/#rebuilding-a-model
#
# Annotations in this module are lazy, and some models refer to classes that
# are defined after them, so each model is rebuilt once every class exists.
# ``model_rebuild`` is the pydantic 2.x API; the import block at the top of
# the file also admits pydantic 1.x, where the equivalent call is
# ``update_forward_refs``.
for _model in (
    ExtractionResult,
    NamedEntity,
    CompoundExpression,
    Triple,
    TextWithTriples,
    TextWithEntity,
    RelationshipType,
    Publication,
    AnnotatorResult,
    AcronymList,
    Gene,
):
    if hasattr(_model, "model_rebuild"):
        _model.model_rebuild()
    else:
        _model.update_forward_refs()
del _model  # keep the loop variable out of the module namespace
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
# LinkML schema: template for extracting candidate gene symbols from text.
id: http://w3id.org/ontogpt/gene_extraction
name: gene_extraction
title: Gene Extraction Template
description: >-
  A template demonstrating a general strategy for extracting gene symbols from
  ambiguous context.
license: https://creativecommons.org/publicdomain/zero/1.0/
prefixes:
  rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
  HGNC: http://identifiers.org/hgnc/
  # Trailing slash added: a prefix expansion is concatenated directly with the
  # local name, so without a separator geneextraction:Gene would expand to
  # ".../gene_extractionGene". Every other prefix here ends in "/" or "#".
  geneextraction: http://w3id.org/ontogpt/gene_extraction/
  linkml: https://w3id.org/linkml/

default_prefix: geneextraction
default_range: string

imports:
  - linkml:types
  - core

classes:
  # Root class of the template: the list of candidate gene symbols.
  AcronymList:
    tree_root: true
    is_a: NamedEntity
    attributes:
      genes:
        annotations:
          prompt: >-
            A semicolon-delimited list of potential gene symbols within
            the text. Include all acronyms that could be gene symbols,
            i.e., any string of capital letters, particularly if it is
            followed by a number. Examples of gene symbols include:
            BRCA1, TP53, EGR2, ITGB6, PRKCD. Gene symbols may resemble
            acronyms referring to diseases or phenotypes,
            and may be surrounded by punctuation or other text.
        range: Gene
        multivalued: true

  Gene:
    is_a: NamedEntity
    description: a single gene symbol.
    id_prefixes:
      - HGNC
    annotations:
      annotators: bioportal:hgnc-nr, bioportal:gexo