Skip to content

Commit

Permalink
[recs] Recommendations for individuals modified after a date
Browse files Browse the repository at this point in the history
Adds the `last_modified` parameter to the jobs and mutations
for the merge and affiliation recommendations. This generates
recommendations only for the individuals that have been
updated after that date.

Signed-off-by: Eva Millán <[email protected]>
  • Loading branch information
evamillan committed Sep 8, 2023
1 parent 8959129 commit b72a8c6
Show file tree
Hide file tree
Showing 7 changed files with 390 additions and 59 deletions.
9 changes: 9 additions & 0 deletions releases/unreleased/recommendations-last-modified.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
title: Recommendations for individuals modified after a given date
category: added
author: Eva Millan <[email protected]>
issue: 813
notes: >
Users can generate merge and affiliation recommendations for
individuals that have been created or modified after a date
specified with the `last_modified` parameter.
26 changes: 17 additions & 9 deletions sortinghat/core/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@
AffiliationRecommendation,
MergeRecommendation,
GenderRecommendation,
ScheduledTask)
ScheduledTask,
MIN_PERIOD_DATE)
from .recommendations.engine import RecommendationEngine


Expand Down Expand Up @@ -126,7 +127,7 @@ def job_in_tenant(job, tenant):

@django_rq.job
@job_using_tenant
def recommend_affiliations(ctx, uuids=None):
def recommend_affiliations(ctx, uuids=None, last_modified=MIN_PERIOD_DATE):
"""Generate a list of affiliation recommendations from a set of individuals.
This function generates a list of recommendations which include the
Expand All @@ -140,6 +141,8 @@ def recommend_affiliations(ctx, uuids=None):
:param ctx: context where this job is run
:param uuids: list of individuals identifiers
:param last_modified: generate recommendations only for individuals modified after
this date
:returns: a dictionary with which individuals are recommended to be
affiliated to which organization.
Expand All @@ -148,7 +151,7 @@ def recommend_affiliations(ctx, uuids=None):

if not uuids:
logger.info(f"Running job {job.id} 'recommend affiliations'; uuids='all'; ...")
uuids = Individual.objects.values_list('mk', flat=True).iterator()
uuids = Individual.objects.filter(last_modified__gte=last_modified).values_list('mk', flat=True).iterator()
else:
logger.info(f"Running job {job.id} 'recommend affiliations'; uuids={uuids}; ...")
uuids = iter(uuids)
Expand Down Expand Up @@ -196,7 +199,7 @@ def recommend_affiliations(ctx, uuids=None):

@django_rq.job
@job_using_tenant
def recommend_matches(ctx, source_uuids, target_uuids, criteria, exclude=True, verbose=False, strict=True):
def recommend_matches(ctx, source_uuids, target_uuids, criteria, exclude=True, verbose=False, strict=True, last_modified=MIN_PERIOD_DATE):
"""Generate a list of affiliation recommendations from a set of individuals.
This function generates a list of recommendations which include the
Expand All @@ -222,6 +225,8 @@ def recommend_matches(ctx, source_uuids, target_uuids, criteria, exclude=True, v
RecommenderExclusionTerm table. Otherwise, results will not ignore them.
:param verbose: if set to `True`, the match results will be composed by individual
identities (even belonging to the same individual).
:param last_modified: generate recommendations only for individuals modified after
this date
:returns: a dictionary with which individuals are recommended to be
merged to which individual or which identities.
Expand All @@ -243,7 +248,7 @@ def recommend_matches(ctx, source_uuids, target_uuids, criteria, exclude=True, v

trxl = TransactionsLog.open('recommend_matches', job_ctx)

for rec in engine.recommend('matches', source_uuids, target_uuids, criteria, exclude, verbose, strict):
for rec in engine.recommend('matches', source_uuids, target_uuids, criteria, exclude, verbose, strict, last_modified):
results[rec.key] = list(rec.options)
# Store matches in the database
for match in rec.options:
Expand Down Expand Up @@ -333,7 +338,7 @@ def recommend_gender(ctx, uuids, exclude=True, no_strict_matching=False):

@django_rq.job
@job_using_tenant
def affiliate(ctx, uuids=None):
def affiliate(ctx, uuids=None, last_modified=MIN_PERIOD_DATE):
"""Affiliate a set of individuals using recommendations.
This function automates the affiliation process obtaining
Expand All @@ -348,6 +353,8 @@ def affiliate(ctx, uuids=None):
:param ctx: context where this job is run
:param uuids: list of individuals identifiers
:param last_modified: only affiliate individuals that have been
modified after this date
:returns: a dictionary with which individuals were enrolled
and the errors found running the job
Expand All @@ -356,7 +363,7 @@ def affiliate(ctx, uuids=None):

if not uuids:
logger.info(f"Running job {job.id} 'affiliate'; uuids='all'; ...")
uuids = Individual.objects.values_list('mk', flat=True).iterator()
uuids = Individual.objects.filter(last_modified__gte=last_modified).values_list('mk', flat=True).iterator()
else:
logger.info(f"Running job {job.id} 'affiliate'; uuids={uuids}; ...")
uuids = iter(uuids)
Expand Down Expand Up @@ -401,7 +408,7 @@ def affiliate(ctx, uuids=None):

@django_rq.job
@job_using_tenant
def unify(ctx, criteria, source_uuids=None, target_uuids=None, exclude=True, strict=True):
def unify(ctx, criteria, source_uuids=None, target_uuids=None, exclude=True, strict=True, last_modified=MIN_PERIOD_DATE):
"""Unify a set of individuals by merging them using matching recommendations.
This function automates the identities unify process obtaining
Expand All @@ -425,6 +432,7 @@ def unify(ctx, criteria, source_uuids=None, target_uuids=None, exclude=True, str
:param exclude: if set to `True`, the results list will ignore individual identities
if any value from the `email`, `name`, or `username` fields are found in the
RecommenderExclusionTerm table. Otherwise, results will not ignore them.
:param last_modified: only unify individuals that have been modified after this date
:returns: a list with the individuals resulting from merge operations
and the errors found running the job
Expand Down Expand Up @@ -471,7 +479,7 @@ def _group_recommendations(recs):
trxl = TransactionsLog.open('unify', job_ctx)

match_recs = {}
for rec in engine.recommend('matches', source_uuids, target_uuids, criteria, exclude=exclude, strict=strict):
for rec in engine.recommend('matches', source_uuids, target_uuids, criteria, exclude=exclude, strict=strict, last_modified=last_modified):
match_recs[rec.key] = list(rec.options)

match_groups = _group_recommendations(match_recs)
Expand Down
9 changes: 5 additions & 4 deletions sortinghat/core/recommendations/matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@

from ..db import (find_individual_by_uuid)
from ..errors import NotFoundError
from ..models import Identity
from ..models import Identity, MIN_PERIOD_DATE

from .exclusion import fetch_recommender_exclusion_list

Expand All @@ -39,8 +39,7 @@
EMAIL_ADDRESS_REGEX = r"^(?P<email>[^\s@]+@[^\s@.]+\.[^\s@]+)$"
NAME_REGEX = r"^\w+\s\w+"


def recommend_matches(source_uuids, target_uuids, criteria, exclude=True, verbose=False, strict=True):
def recommend_matches(source_uuids, target_uuids, criteria, exclude=True, verbose=False, strict=True, last_modified=MIN_PERIOD_DATE):
"""Recommend identity matches for a list of individuals.
Returns a generator of identity matches recommendations
Expand Down Expand Up @@ -75,6 +74,8 @@ def recommend_matches(source_uuids, target_uuids, criteria, exclude=True, verbos
:param verbose: if set to `True`, the list of results will include individual
identities. Otherwise, results will include main keys from individuals
:param strict: strict matching with well-formed email addresses and names
:param last_modified: generate recommendations only for individuals modified after
this date
:returns: a generator of recommendations
"""
Expand Down Expand Up @@ -106,7 +107,7 @@ def _get_identities(uuid):
aliases[uuid] = [identity.uuid for identity in identities]
input_set.update(identities)
else:
identities = Identity.objects.select_related('individual').all()
identities = Identity.objects.select_related('individual').filter(individual__last_modified__gte=last_modified)
input_set.update(identities)
for identity in identities:
aliases[identity.individual.mk].append(identity.uuid)
Expand Down
35 changes: 26 additions & 9 deletions sortinghat/core/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,8 @@
AffiliationRecommendation,
MergeRecommendation,
GenderRecommendation,
ScheduledTask)
ScheduledTask,
MIN_PERIOD_DATE)
from .recommendations.exclusion import delete_recommend_exclusion_term, add_recommender_exclusion_term


Expand Down Expand Up @@ -1047,17 +1048,18 @@ class RecommendAffiliations(graphene.Mutation):
class Arguments:
uuids = graphene.List(graphene.String,
required=False)
last_modified = graphene.DateTime(required=False)

job_id = graphene.Field(lambda: graphene.String)

@check_permissions('core.execute_job')
@check_auth
def mutate(self, info, uuids=None):
def mutate(self, info, uuids=None, last_modified=MIN_PERIOD_DATE):
user = info.context.user
tenant = get_db_tenant()
ctx = SortingHatContext(user=user, tenant=tenant)

job = enqueue(recommend_affiliations, ctx, uuids, job_timeout=-1)
job = enqueue(recommend_affiliations, ctx, uuids, last_modified, job_timeout=-1)

return RecommendAffiliations(
job_id=job.id
Expand All @@ -1074,17 +1076,30 @@ class Arguments:
verbose = graphene.Boolean(required=False)
exclude = graphene.Boolean(required=False)
strict = graphene.Boolean(required=False)
last_modified = graphene.DateTime(required=False)

job_id = graphene.Field(lambda: graphene.String)

@check_permissions('core.execute_job')
@check_auth
def mutate(self, info, criteria, source_uuids=None, target_uuids=None, exclude=True, verbose=False, strict=True):
def mutate(self, info, criteria,
source_uuids=None, target_uuids=None,
exclude=True, verbose=False, strict=True,
last_modified=MIN_PERIOD_DATE):
user = info.context.user
tenant = get_db_tenant()
ctx = SortingHatContext(user=user, tenant=tenant)

job = enqueue(recommend_matches, ctx, source_uuids, target_uuids, criteria, exclude, verbose, strict, job_timeout=-1)
job = enqueue(recommend_matches,
ctx,
source_uuids,
target_uuids,
criteria,
exclude,
verbose,
strict,
last_modified,
job_timeout=-1)

return RecommendMatches(
job_id=job.id
Expand Down Expand Up @@ -1117,17 +1132,18 @@ class Affiliate(graphene.Mutation):
class Arguments:
uuids = graphene.List(graphene.String,
required=False)
last_modified = graphene.DateTime(required=False)

job_id = graphene.Field(lambda: graphene.String)

@check_permissions('core.execute_job')
@check_auth
def mutate(self, info, uuids=None):
def mutate(self, info, uuids=None, last_modified=MIN_PERIOD_DATE):
user = info.context.user
tenant = get_db_tenant()
ctx = SortingHatContext(user=user, tenant=tenant)

job = enqueue(affiliate, ctx, uuids, job_timeout=-1)
job = enqueue(affiliate, ctx, uuids, last_modified, job_timeout=-1)

return Affiliate(
job_id=job.id
Expand All @@ -1143,17 +1159,18 @@ class Arguments:
criteria = graphene.List(graphene.String)
exclude = graphene.Boolean(required=False)
strict = graphene.Boolean(required=False)
last_modified = graphene.DateTime(required=False)

job_id = graphene.Field(lambda: graphene.String)

@check_permissions('core.execute_job')
@check_auth
def mutate(self, info, criteria, source_uuids=None, target_uuids=None, exclude=True, strict=True):
def mutate(self, info, criteria, source_uuids=None, target_uuids=None, exclude=True, strict=True, last_modified=MIN_PERIOD_DATE):
user = info.context.user
tenant = get_db_tenant()
ctx = SortingHatContext(user=user, tenant=tenant)

job = enqueue(unify, ctx, criteria, source_uuids, target_uuids, exclude, strict, job_timeout=-1)
job = enqueue(unify, ctx, criteria, source_uuids, target_uuids, exclude, strict, last_modified, job_timeout=-1)

return Unify(
job_id=job.id
Expand Down
32 changes: 32 additions & 0 deletions tests/rec/test_matches.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
from django.contrib.auth import get_user_model
from django.test import TestCase

from grimoirelab_toolkit.datetime import datetime_utcnow

from sortinghat.core import api
from sortinghat.core.context import SortingHatContext
from sortinghat.core.recommendations.matching import recommend_matches
Expand Down Expand Up @@ -239,6 +241,36 @@ def test_recommend_matches_exclude(self):
self.assertEqual(len(result), 3)
self.assertDictEqual(result, expected)

def test_recommend_matches_last_modified(self):
"""Check if recommendations are obtained for individuals modified after a date"""

timestamp = datetime_utcnow()

api.add_identity(self.ctx,
username='john_smith',
source='mls',
uuid=self.js_alt.uuid)
# Test
expected = {
self.js_alt.uuid: sorted([self.jsmith.uuid])
}

criteria = ['email', 'name', 'username']

# Identities which don't have the fields in `criteria` won't be returned
recs = dict(recommend_matches(None,
None,
criteria,
last_modified=timestamp))

# Preserve results order for the comparison against the expected results
result = {}
for key in recs:
result[key] = sorted(recs[key])

self.assertEqual(len(result), 1)
self.assertDictEqual(result, expected)

def test_recommend_matches_verbose(self):
"""Check if recommendations are obtained for the specified individuals, at identity level"""

Expand Down
Loading

0 comments on commit b72a8c6

Please sign in to comment.