From cf2821c96ed8adb64c40ac7216376cc03bd2bc3b Mon Sep 17 00:00:00 2001 From: yashlamba Date: Thu, 5 Dec 2024 13:56:43 +0100 Subject: [PATCH 1/4] curation: add new rules for EU curation --- site/zenodo_rdm/curation/config.py | 20 +++++++++- site/zenodo_rdm/curation/rules.py | 62 ++++++++++++++++++++++++++++-- 2 files changed, 78 insertions(+), 4 deletions(-) diff --git a/site/zenodo_rdm/curation/config.py b/site/zenodo_rdm/curation/config.py index a9efae57..1b265409 100644 --- a/site/zenodo_rdm/curation/config.py +++ b/site/zenodo_rdm/curation/config.py @@ -10,13 +10,21 @@ from .rules import ( award_acronym_in_description, award_acronym_in_title, + contains_high_conf_keywords, + contains_low_conf_keywords, + published_before_award_start, test_phrases_in_record, + user_verified, ) CURATION_EU_RULES = { "award_acronym_in_title": award_acronym_in_title, "award_acronym_in_description": award_acronym_in_description, "test_phrases_in_record": test_phrases_in_record, + "published_before_award_start": published_before_award_start, + "user_verified": user_verified, + "contains_low_conf_keywords": contains_low_conf_keywords, + "contains_high_conf_keywords": contains_high_conf_keywords, } """Rules to run for EU Curation.""" @@ -24,13 +32,23 @@ "award_acronym_in_title": 5, "award_acronym_in_description": 10, "test_phrases_in_record": False, + "published_before_award_start": False, + "user_verified": 5, + "contains_low_conf_keywords": 5, + "contains_high_conf_keywords": 10, } """Rule scores for EU Curation.""" -CURATION_THRESHOLDS = {"EU_RECORDS_CURATION": 10} +CURATION_THRESHOLDS = {"EU_RECORDS_CURATION": 15} """Threshold values for curators/rules.""" CURATION_ENABLE_EU_CURATOR = False """Controls whether to dry run EU Curation.""" + +CURATION_LOW_CONF_KEYWORDS_EU = [] +"""Low confidence keywords for EU records.""" + +CURATION_HIGH_CONF_KEYWORDS_EU = [] +"""High confidence keywords for EU records.""" diff --git a/site/zenodo_rdm/curation/rules.py 
b/site/zenodo_rdm/curation/rules.py index 1e1f5c72..11c62ffb 100644 --- a/site/zenodo_rdm/curation/rules.py +++ b/site/zenodo_rdm/curation/rules.py @@ -4,9 +4,10 @@ # # ZenodoRDM is free software; you can redistribute it and/or modify # it under the terms of the MIT License; see LICENSE file for more details. - """Rules for curation.""" +from datetime import datetime + from flask import current_app from invenio_records_resources.proxies import current_service_registry @@ -14,9 +15,11 @@ def award_acronym_in_description(record): """Check if EU award name in record description.""" award_service = current_service_registry.get("awards") - description = record.metadata["description"] - funding = record.metadata["funding"] + description = record.metadata.get("description") + if not description: + return False + funding = record.metadata["funding"] for f in funding: if f["funder"]["id"] == "00k4n6c32": if award_id := f.get("award", {}).get("id"): @@ -56,3 +59,56 @@ def test_phrases_in_record(record): if word.lower() in record_data.lower(): return True return False + + +def published_before_award_start(record): + """Check if published before award start date.""" + award_service = current_service_registry.get("awards") + + for f in record.metadata["funding"]: + if f["funder"]["id"] == "00k4n6c32": + if award_id := f.get("award", {}).get("id"): + award = award_service.record_cls.pid.resolve(award_id) + if award.get("start_date") and ( + record.created < datetime.fromisoformat(award.get("start_date")) + ): + return True + return False + + +def user_verified(record): + """Check if user is verified.""" + is_verified = ( + getattr(record.parent, "is_verified", None) + if hasattr(record, "parent") + else getattr(record, "is_verified", False) + ) + return is_verified + + +def contains_low_conf_keywords(record): + """Check if record contains low confidence keywords.""" + low_conf_keywords_eu = current_app.config.get("CURATION_LOW_CONF_KEYWORDS_EU") + record_data = ( + 
record.metadata["title"] + " " + record.metadata.get("description", "") + ) + + for word in low_conf_keywords_eu: + # TODO could possibly return a number for higher conf + if word.lower() in record_data.lower(): + return True + return False + + +def contains_high_conf_keywords(record): + """Check if record contains high confidence keywords.""" + high_conf_keywords_eu = current_app.config.get("CURATION_HIGH_CONF_KEYWORDS_EU") + record_data = ( + record.metadata["title"] + " " + record.metadata.get("description", "") + ) + + for word in high_conf_keywords_eu: + # TODO could possibly return a number for higher conf + if word.lower() in record_data.lower(): + return True + return False From a2a1ee66b020dcb74a0344a0fa7a6f7439abcb30 Mon Sep 17 00:00:00 2001 From: Yash Lamba Date: Fri, 6 Dec 2024 11:03:46 +0100 Subject: [PATCH 2/4] curation: rules: funding failsafe Co-authored-by: Alex Ioannidis --- site/zenodo_rdm/curation/rules.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site/zenodo_rdm/curation/rules.py b/site/zenodo_rdm/curation/rules.py index 11c62ffb..f414e4a7 100644 --- a/site/zenodo_rdm/curation/rules.py +++ b/site/zenodo_rdm/curation/rules.py @@ -19,7 +19,7 @@ def award_acronym_in_description(record): if not description: return False - funding = record.metadata["funding"] + funding = record.metadata.get("funding", []) for f in funding: if f["funder"]["id"] == "00k4n6c32": if award_id := f.get("award", {}).get("id"): From 4b11adbcd748f31ff06416900f00bc488de47c6f Mon Sep 17 00:00:00 2001 From: Yash Lamba Date: Fri, 6 Dec 2024 11:04:26 +0100 Subject: [PATCH 3/4] curation: rules: funder failsafe Co-authored-by: Alex Ioannidis --- site/zenodo_rdm/curation/rules.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site/zenodo_rdm/curation/rules.py b/site/zenodo_rdm/curation/rules.py index f414e4a7..40bce82b 100644 --- a/site/zenodo_rdm/curation/rules.py +++ b/site/zenodo_rdm/curation/rules.py @@ -21,7 +21,7 @@ def 
award_acronym_in_description(record): funding = record.metadata.get("funding", []) for f in funding: - if f["funder"]["id"] == "00k4n6c32": + if f["funder"].get("id") == "00k4n6c32": if award_id := f.get("award", {}).get("id"): award = award_service.record_cls.pid.resolve(award_id) if award.get("acronym") and ( From b0e5a376535375617f285f601f26d3b82a1c75d9 Mon Sep 17 00:00:00 2001 From: yashlamba Date: Fri, 6 Dec 2024 11:09:57 +0100 Subject: [PATCH 4/4] curation: rules: use arrow to parse award start date --- site/zenodo_rdm/curation/rules.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/site/zenodo_rdm/curation/rules.py b/site/zenodo_rdm/curation/rules.py index 40bce82b..bb7b5db1 100644 --- a/site/zenodo_rdm/curation/rules.py +++ b/site/zenodo_rdm/curation/rules.py @@ -6,8 +6,7 @@ # it under the terms of the MIT License; see LICENSE file for more details. """Rules for curation.""" -from datetime import datetime - +import arrow from flask import current_app from invenio_records_resources.proxies import current_service_registry @@ -70,7 +69,7 @@ def published_before_award_start(record): if award_id := f.get("award", {}).get("id"): award = award_service.record_cls.pid.resolve(award_id) if award.get("start_date") and ( - record.created < datetime.fromisoformat(award.get("start_date")) + record.created < arrow.get(award.get("start_date")).datetime ): return True return False