Skip to content

Commit

Permalink
Merge branch 'match-source-option' of 'https://github.com/jjmerchante…
Browse files Browse the repository at this point in the history
  • Loading branch information
sduenas authored Jan 4, 2024
2 parents 7a33fbe + 0a7f1d3 commit 919dc53
Show file tree
Hide file tree
Showing 11 changed files with 381 additions and 22 deletions.
9 changes: 9 additions & 0 deletions releases/unreleased/unify-identities-with-same-source.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
title: Unify identities with same source
category: added
author: Jose Javier Merchante <[email protected]>
issue: null
notes: >
Include a new option to only recommend or unify identities
from trusted sources like GitHub or GitLab that have the same
username and backend.
20 changes: 17 additions & 3 deletions sortinghat/core/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,8 @@ def recommend_affiliations(ctx, uuids=None, last_modified=MIN_PERIOD_DATE):
def recommend_matches(ctx, source_uuids,
target_uuids, criteria,
exclude=True, verbose=False,
strict=True, last_modified=MIN_PERIOD_DATE):
strict=True, match_source=False,
last_modified=MIN_PERIOD_DATE):
"""Generate a list of affiliation recommendations from a set of individuals.
This function generates a list of recommendations which include the
Expand All @@ -231,6 +232,7 @@ def recommend_matches(ctx, source_uuids,
RecommenderExclusionTerm table. Otherwise, results will not ignore them.
:param verbose: if set to `True`, the match results will be composed by individual
identities (even belonging to the same individual).
:param match_source: only unify individuals that share the same source
:param last_modified: generate recommendations only for individuals modified after
this date
Expand All @@ -254,7 +256,16 @@ def recommend_matches(ctx, source_uuids,

trxl = TransactionsLog.open('recommend_matches', job_ctx)

for rec in engine.recommend('matches', source_uuids, target_uuids, criteria, exclude, verbose, strict, last_modified):
recommendations = engine.recommend('matches',
source_uuids,
target_uuids,
criteria,
exclude,
verbose,
strict,
match_source,
last_modified)
for rec in recommendations:
results[rec.key] = list(rec.options)
# Store matches in the database
for match in rec.options:
Expand Down Expand Up @@ -423,7 +434,8 @@ def affiliate(ctx, uuids=None, last_modified=MIN_PERIOD_DATE):

@django_rq.job
@job_using_tenant
def unify(ctx, criteria, source_uuids=None, target_uuids=None, exclude=True, strict=True, last_modified=MIN_PERIOD_DATE):
def unify(ctx, criteria, source_uuids=None, target_uuids=None, exclude=True,
strict=True, match_source=False, last_modified=MIN_PERIOD_DATE):
"""Unify a set of individuals by merging them using matching recommendations.
This function automates the identities unify process obtaining
Expand All @@ -447,6 +459,7 @@ def unify(ctx, criteria, source_uuids=None, target_uuids=None, exclude=True, str
:param exclude: if set to `True`, the results list will ignore individual identities
if any value from the `email`, `name`, or `username` fields are found in the
RecommenderExclusionTerm table. Otherwise, results will not ignore them.
:param match_source: only unify individuals that share the same source
:param last_modified: only unify individuals that have been modified after this date
:returns: a list with the individuals resulting from merge operations
Expand Down Expand Up @@ -512,6 +525,7 @@ def _group_recommendations(recs):
criteria,
exclude=exclude,
strict=strict,
match_source=match_source,
last_modified=last_modified):
match_recs[rec.mk] = list(rec.options)

Expand Down
28 changes: 21 additions & 7 deletions sortinghat/core/recommendations/matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,13 @@
EMAIL_ADDRESS_REGEX = r"^(?P<email>[^\s@]+@[^\s@.]+\.[^\s@]+)$"
NAME_REGEX = r"^\w+\s\w+"

MATCH_USERNAME_SOURCES = ['github', 'gitlab', 'slack']


def recommend_matches(source_uuids, target_uuids,
criteria, exclude=True,
verbose=False, strict=True,
match_source=False,
last_modified=MIN_PERIOD_DATE):
"""Recommend identity matches for a list of individuals.
Expand Down Expand Up @@ -78,6 +81,7 @@ def recommend_matches(source_uuids, target_uuids,
:param verbose: if set to `True`, the list of results will include individual
identities. Otherwise, results will include main keys from individuals
:param strict: strict matching with well-formed email addresses and names
:param match_source: only matching for identities with the same source
:param last_modified: generate recommendations only for individuals modified after
this date
Expand Down Expand Up @@ -128,7 +132,8 @@ def _get_identities(uuid):
identities = Identity.objects.all()
target_set.update(identities)

matched = _find_matches(input_set, target_set, criteria, exclude=exclude, verbose=verbose, strict=strict)
matched = _find_matches(input_set, target_set, criteria, exclude=exclude, verbose=verbose, strict=strict,
match_source=match_source)
# Return filtered results
for uuid in source_uuids:
result = set()
Expand All @@ -148,7 +153,7 @@ def _get_identities(uuid):
logger.info(f"Matching recommendations generated; criteria='{criteria}'")


def _find_matches(set_x, set_y, criteria, exclude, verbose, strict):
def _find_matches(set_x, set_y, criteria, exclude, verbose, strict, match_source=False):
"""Find identities matches between two sets using Pandas' library.
This method find matches for the identities in `set_x` looking at
Expand All @@ -170,6 +175,7 @@ def _find_matches(set_x, set_y, criteria, exclude, verbose, strict):
:param verbose: if set to `True`, the list of results will include individual
identities. Otherwise, results will include main keys from individuals.
:param strict: strict matching with well-formed email addresses and names
:param match_source: only find matches for the same source
:returns: a dictionary including the set of matches found for each
identity from `set_x`.
Expand All @@ -187,10 +193,15 @@ def _apply_recommender_exclusion_list(df):
df_excluded = df[~df['username'].isin(excluded) & ~df['email'].isin(excluded) & ~df['name'].isin(excluded)]
return df_excluded

def _filter_criteria(df, c, strict=True):
def _filter_criteria(df, c, strict=True, match_source=False):
"""Filter dataframe creating a basic subset including a given column"""
cols = ['uuid', 'individual', c]
cdf = df[cols]
if match_source:
cols += ['source']
cdf = df[cols]
cdf = cdf[cdf['source'].isin(MATCH_USERNAME_SOURCES)]
else:
cdf = df[cols]
cdf = cdf.dropna(subset=[c])

if strict and c == 'email':
Expand All @@ -216,9 +227,12 @@ def _filter_criteria(df, c, strict=True):
cdfs = []

for c in criteria:
cdf_x = _filter_criteria(df_x, c, strict)
cdf_y = _filter_criteria(df_y, c, strict)
cdf = pandas.merge(cdf_x, cdf_y, on=c, how='inner')
cdf_x = _filter_criteria(df_x, c, strict, match_source)
cdf_y = _filter_criteria(df_y, c, strict, match_source)
if match_source:
cdf = pandas.merge(cdf_x, cdf_y, on=[c, 'source'], how='inner')
else:
cdf = pandas.merge(cdf_x, cdf_y, on=c, how='inner')
cdf = cdf[['individual_x', 'uuid_x', 'individual_y', 'uuid_y']]
cdfs.append(cdf)

Expand Down
17 changes: 15 additions & 2 deletions sortinghat/core/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -1075,6 +1075,7 @@ class Arguments:
verbose = graphene.Boolean(required=False)
exclude = graphene.Boolean(required=False)
strict = graphene.Boolean(required=False)
match_source = graphene.Boolean(required=False)
last_modified = graphene.DateTime(required=False)

job_id = graphene.Field(lambda: graphene.String)
Expand All @@ -1084,7 +1085,7 @@ class Arguments:
def mutate(self, info, criteria,
source_uuids=None, target_uuids=None,
exclude=True, verbose=False, strict=True,
last_modified=MIN_PERIOD_DATE):
match_source=False, last_modified=MIN_PERIOD_DATE):
user = info.context.user
tenant = get_db_tenant()
ctx = SortingHatContext(user=user, tenant=tenant)
Expand All @@ -1097,6 +1098,7 @@ def mutate(self, info, criteria,
exclude,
verbose,
strict,
match_source,
last_modified,
job_timeout=-1)

Expand Down Expand Up @@ -1158,6 +1160,7 @@ class Arguments:
criteria = graphene.List(graphene.String)
exclude = graphene.Boolean(required=False)
strict = graphene.Boolean(required=False)
match_source = graphene.Boolean(required=False)
last_modified = graphene.DateTime(required=False)

job_id = graphene.Field(lambda: graphene.String)
Expand All @@ -1167,12 +1170,22 @@ class Arguments:
def mutate(self, info, criteria,
source_uuids=None, target_uuids=None,
exclude=True, strict=True,
match_source=False,
last_modified=MIN_PERIOD_DATE):
user = info.context.user
tenant = get_db_tenant()
ctx = SortingHatContext(user=user, tenant=tenant)

job = enqueue(unify, ctx, criteria, source_uuids, target_uuids, exclude, strict, last_modified, job_timeout=-1)
job = enqueue(unify,
ctx,
criteria,
source_uuids,
target_uuids,
exclude,
strict,
match_source,
last_modified,
job_timeout=-1)

return Unify(
job_id=job.id
Expand Down
98 changes: 98 additions & 0 deletions tests/rec/test_matches.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,3 +402,101 @@ def test_not_found_uuid_error(self):
self.assertEqual(rec[0], '1234567890abcdefg')
self.assertEqual(rec[1], '1234567890abcdefg')
self.assertEqual(rec[2], [])

def test_recommend_match_source(self):
    """Recommendations are created for identities that share username and a trusted source.

    Adds a second 'github' identity to Jane Rae's individual and a new
    stand-alone 'github' identity with the same username, then checks that
    `recommend_matches(..., match_source=True)` recommends merging only
    those two (GitHub is in the trusted-sources list), while the other
    individuals get empty recommendation lists.
    """
    # Attach a 'github' identity to Jane Rae's existing individual; the
    # returned identity is not needed, only the side effect matters.
    api.add_identity(self.ctx,
                     name='J. Rae',
                     username='jane_rae',
                     source='github',
                     uuid=self.jane_rae.uuid)
    # Stand-alone individual with the same username and source: the only
    # expected match under `match_source=True`.
    jrae_github = api.add_identity(self.ctx,
                                   name='Jane Rae',
                                   username='jane_rae',
                                   source='github')

    source_uuids = [self.john_smith.uuid, self.jrae_no_name.uuid, self.jr2.uuid]
    # NOTE(review): `jrae_github` is passed as an Identity object while every
    # other entry is a uuid string — confirm recommend_matches accepts both.
    target_uuids = [self.john_smith.uuid, self.js2.uuid, self.js3.uuid,
                    self.js4.uuid,
                    self.jsmith.uuid, self.jsm2.uuid, self.jsm3.uuid,
                    self.jane_rae.uuid, self.jr2.uuid,
                    self.js_alt.uuid, self.js_alt2.uuid,
                    self.js_alt3.uuid, self.js_alt4.uuid,
                    self.jrae.uuid, self.jrae2.uuid,
                    self.jrae_no_name.uuid, self.jsmith_no_email.uuid,
                    jrae_github]

    criteria = ['email', 'name', 'username']

    # Recommend identities which match the fields in `criteria` for the same `source`
    recs = list(recommend_matches(source_uuids,
                                  target_uuids,
                                  criteria,
                                  match_source=True))

    # One recommendation entry per source uuid, in input order.
    self.assertEqual(len(recs), 3)

    # John Smith: no identities share username+trusted source -> no matches.
    rec = recs[0]
    self.assertEqual(rec[0], self.john_smith.uuid)
    self.assertEqual(rec[1], self.john_smith.individual.mk)
    self.assertEqual(rec[2], [])

    # jrae_no_name: no trusted-source overlap either.
    rec = recs[1]
    self.assertEqual(rec[0], self.jrae_no_name.uuid)
    self.assertEqual(rec[1], self.jrae_no_name.individual.mk)
    self.assertEqual(rec[2], [])

    # jr2 (Jane Rae's individual) matches the new stand-alone GitHub identity.
    rec = recs[2]
    self.assertEqual(rec[0], self.jr2.uuid)
    self.assertEqual(rec[1], self.jr2.individual.mk)
    self.assertEqual(rec[2], sorted([jrae_github.individual.mk]))

def test_recommend_same_source_not_trusted(self):
    """No matches are created for identities sharing a source outside the trusted list.

    Mirrors `test_recommend_match_source` but uses 'git' — a source that is
    not in the trusted-sources list — so `match_source=True` must yield an
    empty recommendation list for every individual, including Jane Rae's.
    """
    # Attach a 'git' identity to Jane Rae's existing individual; only the
    # side effect matters, the returned identity is unused.
    api.add_identity(self.ctx,
                     name='J. Rae',
                     username='jane_rae',
                     source='git',
                     uuid=self.jane_rae.uuid)
    # Stand-alone individual with the same username but an untrusted source:
    # it must NOT be recommended as a match.
    jrae_git = api.add_identity(self.ctx,
                                name='Jane Rae',
                                username='jane_rae',
                                source='git')

    source_uuids = [self.john_smith.uuid, self.jrae_no_name.uuid, self.jr2.uuid]
    # NOTE(review): `jrae_git` is passed as an Identity object while every
    # other entry is a uuid string — confirm recommend_matches accepts both.
    target_uuids = [self.john_smith.uuid, self.js2.uuid, self.js3.uuid,
                    self.js4.uuid,
                    self.jsmith.uuid, self.jsm2.uuid, self.jsm3.uuid,
                    self.jane_rae.uuid, self.jr2.uuid,
                    self.js_alt.uuid, self.js_alt2.uuid,
                    self.js_alt3.uuid, self.js_alt4.uuid,
                    self.jrae.uuid, self.jrae2.uuid,
                    self.jrae_no_name.uuid, self.jsmith_no_email.uuid,
                    jrae_git]

    criteria = ['email', 'name', 'username']

    # Recommend identities which match the fields in `criteria` for the same `source`
    recs = list(recommend_matches(source_uuids,
                                  target_uuids,
                                  criteria,
                                  match_source=True))

    # One recommendation entry per source uuid, in input order.
    self.assertEqual(len(recs), 3)

    rec = recs[0]
    self.assertEqual(rec[0], self.john_smith.uuid)
    self.assertEqual(rec[1], self.john_smith.individual.mk)
    self.assertEqual(rec[2], [])

    rec = recs[1]
    self.assertEqual(rec[0], self.jrae_no_name.uuid)
    self.assertEqual(rec[1], self.jrae_no_name.individual.mk)
    self.assertEqual(rec[2], [])

    # 'git' is not a trusted source, so even the username match yields nothing.
    rec = recs[2]
    self.assertEqual(rec[0], self.jr2.uuid)
    self.assertEqual(rec[1], self.jr2.individual.mk)
    self.assertEqual(rec[2], [])
Loading

0 comments on commit 919dc53

Please sign in to comment.