From fc8aefdc5d0e1eb65e1b6511a2d62493f4b39e49 Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Fri, 29 Sep 2023 13:59:14 -0400 Subject: [PATCH 1/9] Use model_dump instead of dict --- src/ontogpt/evaluation/ctd/eval_ctd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ontogpt/evaluation/ctd/eval_ctd.py b/src/ontogpt/evaluation/ctd/eval_ctd.py index b9853c2a7..eabd59e7e 100644 --- a/src/ontogpt/evaluation/ctd/eval_ctd.py +++ b/src/ontogpt/evaluation/ctd/eval_ctd.py @@ -249,10 +249,10 @@ def included(t: ChemicalToDiseaseRelationship): pred = PredictionRE(predicted_object=predicted_obj, test_object=doc) pred.named_entities = named_entities logger.info("PRED") - logger.info(yaml.dump(pred.dict())) + logger.info(yaml.dump(pred.model_dump())) logger.info("Calc scores") pred.calculate_scores(labelers=[labeler]) - logger.info(yaml.dump(pred.dict())) + logger.info(yaml.dump(pred.model_dump())) eos.predictions.append(pred) self.calc_stats(eos) return eos From e991a658684ddbb6cea73426b83440a1b00d9a1e Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Fri, 29 Sep 2023 15:28:42 -0400 Subject: [PATCH 2/9] use get_adapter with OAK --- src/ontogpt/evaluation/ctd/eval_ctd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ontogpt/evaluation/ctd/eval_ctd.py b/src/ontogpt/evaluation/ctd/eval_ctd.py index eabd59e7e..e66bf5a2e 100644 --- a/src/ontogpt/evaluation/ctd/eval_ctd.py +++ b/src/ontogpt/evaluation/ctd/eval_ctd.py @@ -26,7 +26,7 @@ import yaml from bioc import biocxml -from oaklib import BasicOntologyInterface, get_implementation_from_shorthand +from oaklib import BasicOntologyInterface, get_adapter from pydantic import BaseModel from ontogpt.engines.knowledge_engine import chunk_text @@ -181,7 +181,7 @@ def create_training_set(self, num=100): def eval(self) -> EvaluationObjectSetRE: """Evaluate the ability to extract relations.""" - labeler = get_implementation_from_shorthand("sqlite:obo:mesh") + labeler = get_adapter("sqlite:obo:mesh") num_test = self.num_tests ke = self.extractor docs = list(self.load_test_cases()) From 7d84e890113a47c4d71575cf31b8143f342948d0 Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Tue, 3 Oct 2023 15:50:59 -0400 Subject: [PATCH 3/9] Fix validation error for Publication --- src/ontogpt/evaluation/ctd/eval_ctd.py | 30 +++++++++++++++++++------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/src/ontogpt/evaluation/ctd/eval_ctd.py b/src/ontogpt/evaluation/ctd/eval_ctd.py index e66bf5a2e..428cca2ee 100644 --- a/src/ontogpt/evaluation/ctd/eval_ctd.py +++ b/src/ontogpt/evaluation/ctd/eval_ctd.py @@ -32,11 +32,12 @@ from ontogpt.engines.knowledge_engine import chunk_text from ontogpt.engines.spires_engine import SPIRESEngine from ontogpt.evaluation.evaluation_engine import SimilarityScore, SPIRESEvaluationEngine -from ontogpt.templates.core import Publication, Triple from ontogpt.templates.ctd import ( ChemicalToDiseaseDocument, ChemicalToDiseaseRelationship, + Publication, TextWithTriples, + Triple, ) THIS_DIR = Path(__file__).parent @@ -158,16 +159,29 @@ def load_cases(self, path: Path) -> Iterable[ChemicalToDiseaseDocument]: # text = f"Title: {title} Abstract: {abstract}" for r in document.relations: i = r.infons - t = Triple( - subject=f"{self.subject_prefix}:{i['Chemical']}", - predicate=RMAP[i["relation"]], - object=f"{self.object_prefix}:{i['Disease']}", + t = Triple.model_validate( + { + "subject": f"{self.subject_prefix}:{i['Chemical']}", + "predicate": RMAP[i["relation"]], + "object": f"{self.object_prefix}:{i['Disease']}", + } ) triples_by_text[(title, abstract)].append(t) + i = 0 for (title, abstract), triples in triples_by_text.items(): - pub = Publication(title=title, abstract=abstract) - logger.debug(f"Triples: {len(triples)} for Title: {title} Abstract: {abstract}") - yield ChemicalToDiseaseDocument(publication=pub, triples=triples) + i = str(i + 1) + pub = Publication.model_validate( + { + "id": i, + "title": title, + "abstract": abstract, + "combined_text": "N/A", + "full_text": "N/A", + } + ) + # logger.debug(f"Triples: {len(triples)} for Title: {title} Abstract: {abstract}") + print(f"Triples: {len(triples)} for Title: {title} Abstract: {abstract}") + yield ChemicalToDiseaseDocument.model_validate({"publication": pub, "triples": triples}) def create_training_set(self, num=100): ke = self.extractor From f9ef35604d659cb131a189300866136af10c5807 Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Tue, 3 Oct 2023 15:58:30 -0400 Subject: [PATCH 4/9] Fix validation error on Triple vs ChemicalToDiseaseRelationship --- src/ontogpt/evaluation/ctd/eval_ctd.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ontogpt/evaluation/ctd/eval_ctd.py b/src/ontogpt/evaluation/ctd/eval_ctd.py index 428cca2ee..b8099f009 100644 --- a/src/ontogpt/evaluation/ctd/eval_ctd.py +++ b/src/ontogpt/evaluation/ctd/eval_ctd.py @@ -159,7 +159,7 @@ def load_cases(self, path: Path) -> Iterable[ChemicalToDiseaseDocument]: # text = f"Title: {title} Abstract: {abstract}" for r in document.relations: i = r.infons - t = Triple.model_validate( + t = ChemicalToDiseaseRelationship.model_validate( { "subject": f"{self.subject_prefix}:{i['Chemical']}", "predicate": RMAP[i["relation"]], @@ -169,10 +169,10 @@ def load_cases(self, path: Path) -> Iterable[ChemicalToDiseaseDocument]: triples_by_text[(title, abstract)].append(t) i = 0 for (title, abstract), triples in triples_by_text.items(): - i = str(i + 1) + i = i + 1 pub = Publication.model_validate( { - "id": i, + "id": str(i), "title": title, "abstract": abstract, "combined_text": "N/A", From 37e2d6cdf71e018c6f853104f8532e7404446000 Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Tue, 3 Oct 2023 16:00:10 -0400 Subject: [PATCH 5/9] Restore original logger output --- src/ontogpt/evaluation/ctd/eval_ctd.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/ontogpt/evaluation/ctd/eval_ctd.py b/src/ontogpt/evaluation/ctd/eval_ctd.py index b8099f009..feb597f4c 100644 --- a/src/ontogpt/evaluation/ctd/eval_ctd.py +++ b/src/ontogpt/evaluation/ctd/eval_ctd.py @@ -179,8 +179,7 @@ def load_cases(self, path: Path) -> Iterable[ChemicalToDiseaseDocument]: "full_text": "N/A", } ) - # logger.debug(f"Triples: {len(triples)} for Title: {title} Abstract: {abstract}") - print(f"Triples: {len(triples)} for Title: {title} Abstract: {abstract}") + logger.debug(f"Triples: {len(triples)} for Title: {title} Abstract: {abstract}") yield ChemicalToDiseaseDocument.model_validate({"publication": pub, "triples": triples}) def create_training_set(self, num=100): From 7b38c3969f6eb0dbf3f7416adf83d7b6cca50d81 Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Tue, 3 Oct 2023 16:04:45 -0400 Subject: [PATCH 6/9] Repair negation --- src/ontogpt/evaluation/ctd/eval_ctd.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/ontogpt/evaluation/ctd/eval_ctd.py b/src/ontogpt/evaluation/ctd/eval_ctd.py index feb597f4c..340ea427a 100644 --- a/src/ontogpt/evaluation/ctd/eval_ctd.py +++ b/src/ontogpt/evaluation/ctd/eval_ctd.py @@ -37,7 +37,6 @@ ChemicalToDiseaseRelationship, Publication, TextWithTriples, - Triple, ) THIS_DIR = Path(__file__).parent @@ -50,8 +49,11 @@ logger = logging.getLogger(__name__) -def negated(Triple) -> bool: - return Triple.qualifier and Triple.qualifier.lower() == "not" +def negated(ChemicalToDiseaseRelationship) -> bool: + return ( + ChemicalToDiseaseRelationship.qualifier + and ChemicalToDiseaseRelationship.qualifier.lower() == "not" + ) class PredictionRE(BaseModel): From ec3a151c76e0ab7fcbb0755b5072939049117bea Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Wed, 4 Oct 2023 13:46:43 -0400 Subject: [PATCH 7/9] Remove extra fields from Publication --- src/ontogpt/evaluation/ctd/eval_ctd.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/ontogpt/evaluation/ctd/eval_ctd.py b/src/ontogpt/evaluation/ctd/eval_ctd.py index 340ea427a..276f5e6a7 100644 --- a/src/ontogpt/evaluation/ctd/eval_ctd.py +++ b/src/ontogpt/evaluation/ctd/eval_ctd.py @@ -177,8 +177,6 @@ def load_cases(self, path: Path) -> Iterable[ChemicalToDiseaseDocument]: "id": str(i), "title": title, "abstract": abstract, - "combined_text": "N/A", - "full_text": "N/A", } ) logger.debug(f"Triples: {len(triples)} for Title: {title} Abstract: {abstract}") From de98bc5ece0cfc9d9dd3618b0e25264f95f41c87 Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Wed, 4 Oct 2023 13:52:41 -0400 Subject: [PATCH 8/9] Cleanup and linting --- src/ontogpt/evaluation/ctd/eval_ctd.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/src/ontogpt/evaluation/ctd/eval_ctd.py b/src/ontogpt/evaluation/ctd/eval_ctd.py index 276f5e6a7..0ca679ae4 100644 --- a/src/ontogpt/evaluation/ctd/eval_ctd.py +++ b/src/ontogpt/evaluation/ctd/eval_ctd.py @@ -132,7 +132,6 @@ class EvaluationObjectSetRE(BaseModel): @dataclass class EvalCTD(SPIRESEvaluationEngine): - # ontology: OboGraphInterface = None subject_prefix = "MESH" object_prefix = "MESH" @@ -158,7 +157,7 @@ def load_cases(self, path: Path) -> Iterable[ChemicalToDiseaseDocument]: doc[p.infons["type"]] = p.text title = doc["title"] abstract = doc["abstract"] - # text = f"Title: {title} Abstract: {abstract}" + logger.debug(f"Title: {title} Abstract: {abstract}") for r in document.relations: i = r.infons t = ChemicalToDiseaseRelationship.model_validate( @@ -189,7 +188,7 @@ def create_training_set(self, num=100): for doc in docs[0:num]: text = doc.text prompt = ke.get_completion_prompt(None, text) - completion = ke.serialize_object(m) + completion = ke.serialize_object() yield dict(prompt=prompt, completion=completion) def eval(self) -> EvaluationObjectSetRE: @@ -230,18 +229,6 @@ def eval(self) -> EvaluationObjectSetRE: logger.debug(f"concatenated triples: {predicted_obj.triples}") named_entities.extend(extraction.named_entities) - # title_extraction = ke.extract_from_text(doc.publication.title) - # logger.info(f"{len(title_extraction.extracted_object.triples)}\ - # triples from: Title {doc.publication.title}") - # abstract_extraction = ke.extract_from_text(doc.publication.abstract) - # logger.info(f"{len(abstract_extraction.extracted_object.triples)}\ - # triples from: Abstract {doc.publication.abstract}") - # ke.merge_resultsets([results, results2]) - # predicted_obj = title_extraction.extracted_object - # predicted_obj.triples.extend(abstract_extraction.extracted_object.triples) - # logger.info(f"{len(predicted_obj.triples)} total triples, after concatenation") - # logger.debug(f"concatenated triples: {predicted_obj.triples}") - def included(t: ChemicalToDiseaseRelationship): if not [var for var in (t.subject, t.object, t.predicate) if var is None]: return ( From 96aa7e5575f887dad043db17dd73cff11dba827d Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Thu, 5 Oct 2023 10:20:28 -0400 Subject: [PATCH 9/9] Name a param --- src/ontogpt/evaluation/ctd/eval_ctd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ontogpt/evaluation/ctd/eval_ctd.py b/src/ontogpt/evaluation/ctd/eval_ctd.py index 0ca679ae4..019cbc241 100644 --- a/src/ontogpt/evaluation/ctd/eval_ctd.py +++ b/src/ontogpt/evaluation/ctd/eval_ctd.py @@ -249,10 +249,10 @@ def included(t: ChemicalToDiseaseRelationship): pred = PredictionRE(predicted_object=predicted_obj, test_object=doc) pred.named_entities = named_entities logger.info("PRED") - logger.info(yaml.dump(pred.model_dump())) + logger.info(yaml.dump(data=pred.model_dump())) logger.info("Calc scores") pred.calculate_scores(labelers=[labeler]) - logger.info(yaml.dump(pred.model_dump())) + logger.info(yaml.dump(data=pred.model_dump())) eos.predictions.append(pred) self.calc_stats(eos) return eos