Skip to content

Commit

Permalink
curation: add new rules for EU curation
Browse files Browse the repository at this point in the history
  • Loading branch information
yashlamba committed Dec 5, 2024
1 parent 0e13f59 commit cf2821c
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 4 deletions.
20 changes: 19 additions & 1 deletion site/zenodo_rdm/curation/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,27 +10,45 @@
from .rules import (
award_acronym_in_description,
award_acronym_in_title,
contains_high_conf_keywords,
contains_low_conf_keywords,
published_before_award_start,
test_phrases_in_record,
user_verified,
)

# Registry of the EU curation rules: each key is the rule's name and each
# value is the callable implementing it. The same names are used as keys in
# the scores mapping (``CURATION_SCORES``).
CURATION_EU_RULES = {
    "award_acronym_in_title": award_acronym_in_title,
    "award_acronym_in_description": award_acronym_in_description,
    "test_phrases_in_record": test_phrases_in_record,
    "published_before_award_start": published_before_award_start,
    "user_verified": user_verified,
    "contains_low_conf_keywords": contains_low_conf_keywords,
    "contains_high_conf_keywords": contains_high_conf_keywords,
}
"""Rules to run for EU Curation."""

# Score attached to each EU curation rule, keyed by rule name.
# NOTE(review): presumably integer scores are added to a record's total when
# the rule matches, while a ``False`` score makes a matching rule reject the
# record outright -- confirm against the curator that consumes this mapping.
CURATION_SCORES = {
    "award_acronym_in_title": 5,
    "award_acronym_in_description": 10,
    "test_phrases_in_record": False,
    "published_before_award_start": False,
    "user_verified": 5,
    "contains_low_conf_keywords": 5,
    "contains_high_conf_keywords": 10,
}
"""Rule scores for EU Curation."""


# Threshold per curator, keyed by curator name. The value was previously
# assigned twice in a row (10, then 15); only the last assignment ever took
# effect, so the dead first assignment is dropped.
# NOTE(review): presumably the total rule score a record must reach for the
# EU curator to act on it -- confirm against the curator implementation.
CURATION_THRESHOLDS = {"EU_RECORDS_CURATION": 15}
"""Threshold values for curators/rules."""


# Off by default.
# NOTE(review): the name reads as an on/off switch for the EU curator while
# the docstring below describes a dry-run mode -- confirm which one applies
# against the code that reads this flag.
CURATION_ENABLE_EU_CURATOR = False
"""Controls whether to dry run EU Curation."""

# Keyword lists looked up through the application config by the
# ``contains_low_conf_keywords`` / ``contains_high_conf_keywords`` rules.
# Both are empty by default, so those rules cannot match until a deployment
# provides keywords.
CURATION_LOW_CONF_KEYWORDS_EU = []
"""Low confidence keywords for EU records."""

CURATION_HIGH_CONF_KEYWORDS_EU = []
"""High confidence keywords for EU records."""
62 changes: 59 additions & 3 deletions site/zenodo_rdm/curation/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,22 @@
#
# ZenodoRDM is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.

"""Rules for curation."""

from datetime import datetime

from flask import current_app
from invenio_records_resources.proxies import current_service_registry


def award_acronym_in_description(record):
"""Check if EU award name in record description."""
award_service = current_service_registry.get("awards")
description = record.metadata["description"]
funding = record.metadata["funding"]
description = record.metadata.get("description")
if not description:
return False

funding = record.metadata["funding"]
for f in funding:
if f["funder"]["id"] == "00k4n6c32":
if award_id := f.get("award", {}).get("id"):
Expand Down Expand Up @@ -56,3 +59,56 @@ def test_phrases_in_record(record):
if word.lower() in record_data.lower():
return True
return False


def published_before_award_start(record):
    """Check if the record was created before one of its awards started.

    Only funding entries whose funder id is ``00k4n6c32`` (the funder id the
    EU curation rules match on) and that carry an award id are considered.
    Each such award is resolved through the ``awards`` service and its
    ``start_date`` is compared with ``record.created``.

    :param record: Record exposing a ``metadata`` mapping and a ``created``
        datetime.
    :returns: ``True`` if ``record.created`` is earlier than the
        ``start_date`` of at least one matching award; ``False`` otherwise,
        including when the record has no funding, no matching funder, or no
        award id.
    """
    # Read the funding defensively: ``funding`` may be absent and a funder or
    # award may come without an ``id``; none of these should raise KeyError.
    award_ids = [
        award_id
        for f in record.metadata.get("funding") or []
        if (f.get("funder") or {}).get("id") == "00k4n6c32"
        and (award_id := (f.get("award") or {}).get("id"))
    ]
    if not award_ids:
        # Nothing to resolve, so the awards service is not needed at all.
        return False

    award_service = current_service_registry.get("awards")
    for award_id in award_ids:
        award = award_service.record_cls.pid.resolve(award_id)
        start_date = award.get("start_date")
        # NOTE(review): assumes ``record.created`` and the parsed start date
        # are both naive (or both timezone-aware) datetimes -- confirm.
        if start_date and record.created < datetime.fromisoformat(start_date):
            return True
    return False


def user_verified(record):
    """Return the ``is_verified`` flag associated with the record.

    When the record has a ``parent`` attribute the flag is read from the
    parent (``None`` if the parent does not define it); otherwise it is read
    from the record itself (``False`` if the record does not define it).

    :param record: Object that may expose ``parent`` and/or ``is_verified``.
    :returns: The ``is_verified`` value found, or the fallback described
        above.
    """
    if not hasattr(record, "parent"):
        return getattr(record, "is_verified", False)
    return getattr(record.parent, "is_verified", None)


def contains_low_conf_keywords(record):
    """Check if record contains low confidence keywords.

    Each keyword from the ``CURATION_LOW_CONF_KEYWORDS_EU`` application
    config is searched, case-insensitively and as a plain substring, in the
    record title followed by a space and the record description.

    :param record: Record exposing a ``metadata`` mapping with a ``title``
        and an optional ``description``.
    :returns: ``True`` if at least one keyword occurs in the text, ``False``
        otherwise (including when no keywords are configured).
    :raises KeyError: If the record metadata has no ``title``.
    """
    # ``or []`` / ``or ""``: the config key may be unset and the description
    # may be stored as ``None``; neither should crash the rule.
    keywords = current_app.config.get("CURATION_LOW_CONF_KEYWORDS_EU") or []
    description = record.metadata.get("description") or ""
    # Lower-case the searched text once instead of once per keyword.
    record_data = (record.metadata["title"] + " " + description).lower()

    # TODO could possibly return a number for higher conf
    return any(word.lower() in record_data for word in keywords)


def contains_high_conf_keywords(record):
    """Check if record contains high confidence keywords.

    Each keyword from the ``CURATION_HIGH_CONF_KEYWORDS_EU`` application
    config is searched, case-insensitively and as a plain substring, in the
    record title followed by a space and the record description.

    :param record: Record exposing a ``metadata`` mapping with a ``title``
        and an optional ``description``.
    :returns: ``True`` if at least one keyword occurs in the text, ``False``
        otherwise (including when no keywords are configured).
    :raises KeyError: If the record metadata has no ``title``.
    """
    # ``or []`` / ``or ""``: the config key may be unset and the description
    # may be stored as ``None``; neither should crash the rule.
    keywords = current_app.config.get("CURATION_HIGH_CONF_KEYWORDS_EU") or []
    description = record.metadata.get("description") or ""
    # Lower-case the searched text once instead of once per keyword.
    record_data = (record.metadata["title"] + " " + description).lower()

    # TODO could possibly return a number for higher conf
    return any(word.lower() in record_data for word in keywords)

0 comments on commit cf2821c

Please sign in to comment.