Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expose annotation through biolexica.Grounder and improve type hints #10

Merged
merged 2 commits into from
Feb 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ prune tests/.pytest_cache
prune docs/build
prune docs/source/api
prune lexica
prune scenarios

recursive-include docs/source *.py
recursive-include docs/source *.rst
Expand Down
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,11 @@ Load a pre-defined grounder like this:
import biolexica

grounder = biolexica.load_grounder("phenotype")

>>> grounder.get_best_match("Alzheimer's disease")
Match(reference=Reference(prefix='doid', identifier='10652'), name="Alzheimer's disease", score=0.7777777777777778)

>>> grounder.annotate("""Clinical trials for reducing beta amyloid levels in Alzheimer's disease have been controversial.""")
```

Note: Biolexica constructs an extended version of `gilda.Grounder` that has convenience functions and a more
Expand Down
37 changes: 36 additions & 1 deletion src/biolexica/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import logging
import tempfile
from pathlib import Path
from typing import TYPE_CHECKING, Iterable, List, Literal, Optional, Union
from typing import TYPE_CHECKING, Any, Iterable, List, Literal, Optional, Union
from urllib.request import urlretrieve

import bioregistry
Expand All @@ -26,6 +26,8 @@
"iter_terms_by_prefix",
"load_grounder",
"get_mesh_category_curies",
"Annotation",
"Match",
]

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -80,6 +82,30 @@
)


class Annotation(BaseModel):
    """A match paired with the location in a text where it was found."""

    # The text that ``start`` and ``end`` index into (see ``substr``).
    text: str
    # Slice bounds of the matched span within ``text``.
    start: int
    end: int
    # The grounding result attached to this span.
    match: Match

    @property
    def reference(self) -> Reference:
        """Return the reference carried by the underlying match."""
        underlying = self.match
        return underlying.reference

    @property
    def name(self) -> str:
        """Return the entry name carried by the underlying match."""
        underlying = self.match
        return underlying.name

    @property
    def substr(self) -> str:
        """Return the slice of ``text`` running from ``start`` to ``end``."""
        span = slice(self.start, self.end)
        return self.text[span]

Check warning on line 106 in src/biolexica/api.py

View check run for this annotation

Codecov / codecov/patch

src/biolexica/api.py#L106

Added line #L106 was not covered by tests


class Grounder(gilda.Grounder):
"""Wrap a Gilda grounder with additional functionality."""

Expand Down Expand Up @@ -113,6 +139,15 @@
return None
return Match.from_gilda(scored_matches[0])

def annotate(self, text: str, **kwargs: Any) -> List[Annotation]:
    """Annotate the text using this grounder.

    :param text: The full text in which to look for entities.
    :param kwargs: Extra keyword arguments passed through to
        :func:`gilda.ner.annotate`.
    :returns: A list with one :class:`Annotation` per tuple yielded by
        :func:`gilda.ner.annotate`, each carrying the full input ``text``,
        the converted match, and the ``start``/``end`` offsets.
    """
    # Imported inside the method, as in the original implementation.
    import gilda.ner

    # gilda yields its own per-entity text as the first tuple element. It is
    # bound to a throwaway name so it does not shadow the ``text`` parameter:
    # ``Annotation.substr`` slices ``text[start:end]``, which is only
    # meaningful when ``text`` is the full input.
    return [
        Annotation(text=text, match=Match.from_gilda(match), start=start, end=end)
        for _span, match, start, end in gilda.ner.annotate(text, grounder=self, **kwargs)
    ]


def load_grounder(grounder: GrounderHint) -> Grounder:
"""Load a gilda grounder, potentially from a remote location."""
Expand Down
43 changes: 6 additions & 37 deletions src/biolexica/literature/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,17 @@
from collections import Counter
from typing import List, Optional, Union

import gilda
import gilda.ner
from curies import Reference
from more_itertools import batched
from pydantic import BaseModel
from tqdm.auto import tqdm

from biolexica.api import Annotation, GrounderHint, load_grounder
from biolexica.literature.retrieve import get_pubmed_dataframe
from biolexica.literature.search import query_pubmed

__all__ = [
"AnnotatedArticle",
"Annotation",
"annotate_abstracts_from_search",
"annotate_abstracts_from_pubmeds",
]
Expand All @@ -29,22 +27,6 @@
logger = logging.getLogger(__name__)


class Annotation(BaseModel):
    """A single grounded entity mention located in a text."""

    # The text that ``start`` and ``end`` index into (see ``substr``).
    text: str
    # Identifier of the grounded entity.
    reference: Reference
    # Score of the match that produced this annotation.
    score: float
    # Slice bounds of the matched span within ``text``.
    start: int
    end: int
    # Entry name of the grounded entity.
    name: str

    @property
    def substr(self) -> str:
        """Return the slice of ``text`` running from ``start`` to ``end``."""
        span = slice(self.start, self.end)
        return self.text[span]


class AnnotatedArticle(BaseModel):
"""A data model representing an annotated article from PubMed."""

Expand All @@ -60,7 +42,7 @@ def count_references(self) -> t.Counter[t.Tuple[Reference, str]]:

def annotate_abstracts_from_search(
pubmed_query: str,
grounder: gilda.Grounder,
grounder: GrounderHint,
*,
use_indra_db: bool = True,
limit: Optional[int] = None,
Expand All @@ -78,7 +60,7 @@ def annotate_abstracts_from_search(

def annotate_abstracts_from_pubmeds(
pubmed_ids: t.Collection[Union[str, int]],
grounder: gilda.Grounder,
grounder: GrounderHint,
*,
use_indra_db: bool = True,
batch_size: int = 20_000,
Expand All @@ -89,6 +71,8 @@ def annotate_abstracts_from_pubmeds(

rv: List[AnnotatedArticle] = []

grounder = load_grounder(grounder)

outer_it = tqdm(
batched(pubmed_ids, batch_size),
total=1 + n_pmids // batch_size,
Expand Down Expand Up @@ -119,23 +103,8 @@ def annotate_abstracts_from_pubmeds(
pubmed=pmid,
title=title,
abstract=abstract,
annotations=annotate(abstract, grounder=grounder),
annotations=grounder.annotate(abstract),
)
)

return rv


def annotate(text: str, grounder: gilda.Grounder) -> List[Annotation]:
    """Annotate text using the given Gilda grounder.

    :param text: The full text in which to look for entities.
    :param grounder: The Gilda grounder handed to :func:`gilda.ner.annotate`.
    :returns: A list with one :class:`Annotation` per tuple yielded by
        :func:`gilda.ner.annotate`, each carrying the full input ``text``,
        the match's reference, entry name and score, and the
        ``start``/``end`` offsets.
    """
    # gilda yields its own per-entity text as the first tuple element. It is
    # bound to a throwaway name so it does not shadow the ``text`` parameter:
    # ``Annotation.substr`` slices ``text[start:end]``, which is only
    # meaningful when ``text`` is the full input.
    return [
        Annotation(
            text=text,
            reference=Reference(prefix=match.term.db, identifier=match.term.id),
            name=match.term.entry_name,
            score=match.score,
            start=start,
            end=end,
        )
        for _span, match, start, end in gilda.ner.annotate(text, grounder=grounder)
    ]
Loading