Skip to content

Commit

Permalink
Merge branch 'match-source-option' of 'https://github.com/jjmerchante…
Browse files Browse the repository at this point in the history
  • Loading branch information
sduenas authored Jan 4, 2024
2 parents 7a33fbe + 0a7f1d3 commit 919dc53
Show file tree
Hide file tree
Showing 11 changed files with 381 additions and 22 deletions.
9 changes: 9 additions & 0 deletions releases/unreleased/unify-identities-with-same-source.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
title: Unify identities with same source
category: added
author: Jose Javier Merchante <[email protected]>
issue: null
notes: >
Include a new option to only recommend or unify identities
from trusted sources like GitHub or GitLab that have the same
username and backend.
20 changes: 17 additions & 3 deletions sortinghat/core/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,8 @@ def recommend_affiliations(ctx, uuids=None, last_modified=MIN_PERIOD_DATE):
def recommend_matches(ctx, source_uuids,
target_uuids, criteria,
exclude=True, verbose=False,
strict=True, last_modified=MIN_PERIOD_DATE):
strict=True, match_source=False,
last_modified=MIN_PERIOD_DATE):
"""Generate a list of affiliation recommendations from a set of individuals.
This function generates a list of recommendations which include the
Expand All @@ -231,6 +232,7 @@ def recommend_matches(ctx, source_uuids,
RecommenderExclusionTerm table. Otherwise, results will not ignore them.
:param verbose: if set to `True`, the match results will be composed by individual
identities (even belonging to the same individual).
:param match_source: only unify individuals that share the same source
:param last_modified: generate recommendations only for individuals modified after
this date
Expand All @@ -254,7 +256,16 @@ def recommend_matches(ctx, source_uuids,

trxl = TransactionsLog.open('recommend_matches', job_ctx)

for rec in engine.recommend('matches', source_uuids, target_uuids, criteria, exclude, verbose, strict, last_modified):
recommendations = engine.recommend('matches',
source_uuids,
target_uuids,
criteria,
exclude,
verbose,
strict,
match_source,
last_modified)
for rec in recommendations:
results[rec.key] = list(rec.options)
# Store matches in the database
for match in rec.options:
Expand Down Expand Up @@ -423,7 +434,8 @@ def affiliate(ctx, uuids=None, last_modified=MIN_PERIOD_DATE):

@django_rq.job
@job_using_tenant
def unify(ctx, criteria, source_uuids=None, target_uuids=None, exclude=True, strict=True, last_modified=MIN_PERIOD_DATE):
def unify(ctx, criteria, source_uuids=None, target_uuids=None, exclude=True,
strict=True, match_source=False, last_modified=MIN_PERIOD_DATE):
"""Unify a set of individuals by merging them using matching recommendations.
This function automates the identities unify process obtaining
Expand All @@ -447,6 +459,7 @@ def unify(ctx, criteria, source_uuids=None, target_uuids=None, exclude=True, str
:param exclude: if set to `True`, the results list will ignore individual identities
if any value from the `email`, `name`, or `username` fields are found in the
RecommenderExclusionTerm table. Otherwise, results will not ignore them.
:param match_source: only unify individuals that share the same source
:param last_modified: only unify individuals that have been modified after this date
:returns: a list with the individuals resulting from merge operations
Expand Down Expand Up @@ -512,6 +525,7 @@ def _group_recommendations(recs):
criteria,
exclude=exclude,
strict=strict,
match_source=match_source,
last_modified=last_modified):
match_recs[rec.mk] = list(rec.options)

Expand Down
28 changes: 21 additions & 7 deletions sortinghat/core/recommendations/matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,13 @@
EMAIL_ADDRESS_REGEX = r"^(?P<email>[^\s@]+@[^\s@.]+\.[^\s@]+)$"
NAME_REGEX = r"^\w+\s\w+"

MATCH_USERNAME_SOURCES = ['github', 'gitlab', 'slack']


def recommend_matches(source_uuids, target_uuids,
criteria, exclude=True,
verbose=False, strict=True,
match_source=False,
last_modified=MIN_PERIOD_DATE):
"""Recommend identity matches for a list of individuals.
Expand Down Expand Up @@ -78,6 +81,7 @@ def recommend_matches(source_uuids, target_uuids,
:param verbose: if set to `True`, the list of results will include individual
identities. Otherwise, results will include main keys from individuals
:param strict: strict matching with well-formed email addresses and names
:param match_source: only matching for identities with the same source
:param last_modified: generate recommendations only for individuals modified after
this date
Expand Down Expand Up @@ -128,7 +132,8 @@ def _get_identities(uuid):
identities = Identity.objects.all()
target_set.update(identities)

matched = _find_matches(input_set, target_set, criteria, exclude=exclude, verbose=verbose, strict=strict)
matched = _find_matches(input_set, target_set, criteria, exclude=exclude, verbose=verbose, strict=strict,
match_source=match_source)
# Return filtered results
for uuid in source_uuids:
result = set()
Expand All @@ -148,7 +153,7 @@ def _get_identities(uuid):
logger.info(f"Matching recommendations generated; criteria='{criteria}'")


def _find_matches(set_x, set_y, criteria, exclude, verbose, strict):
def _find_matches(set_x, set_y, criteria, exclude, verbose, strict, match_source=False):
"""Find identities matches between two sets using Pandas' library.
This method find matches for the identities in `set_x` looking at
Expand All @@ -170,6 +175,7 @@ def _find_matches(set_x, set_y, criteria, exclude, verbose, strict):
:param verbose: if set to `True`, the list of results will include individual
identities. Otherwise, results will include main keys from individuals.
:param strict: strict matching with well-formed email addresses and names
:param match_source: only find matches for the same source
:returns: a dictionary including the set of matches found for each
identity from `set_x`.
Expand All @@ -187,10 +193,15 @@ def _apply_recommender_exclusion_list(df):
df_excluded = df[~df['username'].isin(excluded) & ~df['email'].isin(excluded) & ~df['name'].isin(excluded)]
return df_excluded

def _filter_criteria(df, c, strict=True):
def _filter_criteria(df, c, strict=True, match_source=False):
"""Filter dataframe creating a basic subset including a given column"""
cols = ['uuid', 'individual', c]
cdf = df[cols]
if match_source:
cols += ['source']
cdf = df[cols]
cdf = cdf[cdf['source'].isin(MATCH_USERNAME_SOURCES)]
else:
cdf = df[cols]
cdf = cdf.dropna(subset=[c])

if strict and c == 'email':
Expand All @@ -216,9 +227,12 @@ def _filter_criteria(df, c, strict=True):
cdfs = []

for c in criteria:
cdf_x = _filter_criteria(df_x, c, strict)
cdf_y = _filter_criteria(df_y, c, strict)
cdf = pandas.merge(cdf_x, cdf_y, on=c, how='inner')
cdf_x = _filter_criteria(df_x, c, strict, match_source)
cdf_y = _filter_criteria(df_y, c, strict, match_source)
if match_source:
cdf = pandas.merge(cdf_x, cdf_y, on=[c, 'source'], how='inner')
else:
cdf = pandas.merge(cdf_x, cdf_y, on=c, how='inner')
cdf = cdf[['individual_x', 'uuid_x', 'individual_y', 'uuid_y']]
cdfs.append(cdf)

Expand Down
17 changes: 15 additions & 2 deletions sortinghat/core/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -1075,6 +1075,7 @@ class Arguments:
verbose = graphene.Boolean(required=False)
exclude = graphene.Boolean(required=False)
strict = graphene.Boolean(required=False)
match_source = graphene.Boolean(required=False)
last_modified = graphene.DateTime(required=False)

job_id = graphene.Field(lambda: graphene.String)
Expand All @@ -1084,7 +1085,7 @@ class Arguments:
def mutate(self, info, criteria,
source_uuids=None, target_uuids=None,
exclude=True, verbose=False, strict=True,
last_modified=MIN_PERIOD_DATE):
match_source=False, last_modified=MIN_PERIOD_DATE):
user = info.context.user
tenant = get_db_tenant()
ctx = SortingHatContext(user=user, tenant=tenant)
Expand All @@ -1097,6 +1098,7 @@ def mutate(self, info, criteria,
exclude,
verbose,
strict,
match_source,
last_modified,
job_timeout=-1)

Expand Down Expand Up @@ -1158,6 +1160,7 @@ class Arguments:
criteria = graphene.List(graphene.String)
exclude = graphene.Boolean(required=False)
strict = graphene.Boolean(required=False)
match_source = graphene.Boolean(required=False)
last_modified = graphene.DateTime(required=False)

job_id = graphene.Field(lambda: graphene.String)
Expand All @@ -1167,12 +1170,22 @@ class Arguments:
def mutate(self, info, criteria,
source_uuids=None, target_uuids=None,
exclude=True, strict=True,
match_source=False,
last_modified=MIN_PERIOD_DATE):
user = info.context.user
tenant = get_db_tenant()
ctx = SortingHatContext(user=user, tenant=tenant)

job = enqueue(unify, ctx, criteria, source_uuids, target_uuids, exclude, strict, last_modified, job_timeout=-1)
job = enqueue(unify,
ctx,
criteria,
source_uuids,
target_uuids,
exclude,
strict,
match_source,
last_modified,
job_timeout=-1)

return Unify(
job_id=job.id
Expand Down
98 changes: 98 additions & 0 deletions tests/rec/test_matches.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,3 +402,101 @@ def test_not_found_uuid_error(self):
self.assertEqual(rec[0], '1234567890abcdefg')
self.assertEqual(rec[1], '1234567890abcdefg')
self.assertEqual(rec[2], [])

def test_recommend_match_source(self):
    """Recommendations are created for identities that share username and a trusted source.

    Adds a second 'github' identity to Jane Rae's individual and a new
    stand-alone 'github' identity with the same username, then checks that
    `recommend_matches(..., match_source=True)` recommends merging only
    those two (GitHub is in the trusted-sources list), while the other
    individuals get empty recommendation lists.
    """
    # Attach a 'github' identity to Jane Rae's existing individual; the
    # returned identity is not needed, only the side effect matters.
    api.add_identity(self.ctx,
                     name='J. Rae',
                     username='jane_rae',
                     source='github',
                     uuid=self.jane_rae.uuid)
    # Stand-alone individual with the same username and source: the only
    # expected match under `match_source=True`.
    jrae_github = api.add_identity(self.ctx,
                                   name='Jane Rae',
                                   username='jane_rae',
                                   source='github')

    source_uuids = [self.john_smith.uuid, self.jrae_no_name.uuid, self.jr2.uuid]
    # NOTE(review): `jrae_github` is passed as an Identity object while every
    # other entry is a uuid string — confirm recommend_matches accepts both.
    target_uuids = [self.john_smith.uuid, self.js2.uuid, self.js3.uuid,
                    self.js4.uuid,
                    self.jsmith.uuid, self.jsm2.uuid, self.jsm3.uuid,
                    self.jane_rae.uuid, self.jr2.uuid,
                    self.js_alt.uuid, self.js_alt2.uuid,
                    self.js_alt3.uuid, self.js_alt4.uuid,
                    self.jrae.uuid, self.jrae2.uuid,
                    self.jrae_no_name.uuid, self.jsmith_no_email.uuid,
                    jrae_github]

    criteria = ['email', 'name', 'username']

    # Recommend identities which match the fields in `criteria` for the same `source`
    recs = list(recommend_matches(source_uuids,
                                  target_uuids,
                                  criteria,
                                  match_source=True))

    # One recommendation entry per source uuid, in input order.
    self.assertEqual(len(recs), 3)

    # John Smith: no identities share username+trusted source -> no matches.
    rec = recs[0]
    self.assertEqual(rec[0], self.john_smith.uuid)
    self.assertEqual(rec[1], self.john_smith.individual.mk)
    self.assertEqual(rec[2], [])

    # jrae_no_name: no trusted-source overlap either.
    rec = recs[1]
    self.assertEqual(rec[0], self.jrae_no_name.uuid)
    self.assertEqual(rec[1], self.jrae_no_name.individual.mk)
    self.assertEqual(rec[2], [])

    # jr2 (Jane Rae's individual) matches the new stand-alone GitHub identity.
    rec = recs[2]
    self.assertEqual(rec[0], self.jr2.uuid)
    self.assertEqual(rec[1], self.jr2.individual.mk)
    self.assertEqual(rec[2], sorted([jrae_github.individual.mk]))

def test_recommend_same_source_not_trusted(self):
    """No matches are created for identities sharing a source outside the trusted list.

    Mirrors `test_recommend_match_source` but uses 'git' — a source that is
    not in the trusted-sources list — so `match_source=True` must yield an
    empty recommendation list for every individual, including Jane Rae's.
    """
    # Attach a 'git' identity to Jane Rae's existing individual; only the
    # side effect matters, the returned identity is unused.
    api.add_identity(self.ctx,
                     name='J. Rae',
                     username='jane_rae',
                     source='git',
                     uuid=self.jane_rae.uuid)
    # Stand-alone individual with the same username but an untrusted source:
    # it must NOT be recommended as a match.
    jrae_git = api.add_identity(self.ctx,
                                name='Jane Rae',
                                username='jane_rae',
                                source='git')

    source_uuids = [self.john_smith.uuid, self.jrae_no_name.uuid, self.jr2.uuid]
    # NOTE(review): `jrae_git` is passed as an Identity object while every
    # other entry is a uuid string — confirm recommend_matches accepts both.
    target_uuids = [self.john_smith.uuid, self.js2.uuid, self.js3.uuid,
                    self.js4.uuid,
                    self.jsmith.uuid, self.jsm2.uuid, self.jsm3.uuid,
                    self.jane_rae.uuid, self.jr2.uuid,
                    self.js_alt.uuid, self.js_alt2.uuid,
                    self.js_alt3.uuid, self.js_alt4.uuid,
                    self.jrae.uuid, self.jrae2.uuid,
                    self.jrae_no_name.uuid, self.jsmith_no_email.uuid,
                    jrae_git]

    criteria = ['email', 'name', 'username']

    # Recommend identities which match the fields in `criteria` for the same `source`
    recs = list(recommend_matches(source_uuids,
                                  target_uuids,
                                  criteria,
                                  match_source=True))

    # One recommendation entry per source uuid, in input order.
    self.assertEqual(len(recs), 3)

    rec = recs[0]
    self.assertEqual(rec[0], self.john_smith.uuid)
    self.assertEqual(rec[1], self.john_smith.individual.mk)
    self.assertEqual(rec[2], [])

    rec = recs[1]
    self.assertEqual(rec[0], self.jrae_no_name.uuid)
    self.assertEqual(rec[1], self.jrae_no_name.individual.mk)
    self.assertEqual(rec[2], [])

    # 'git' is not a trusted source, so even the username match yields nothing.
    rec = recs[2]
    self.assertEqual(rec[0], self.jr2.uuid)
    self.assertEqual(rec[1], self.jr2.individual.mk)
    self.assertEqual(rec[2], [])
Loading

0 comments on commit 919dc53

Please sign in to comment.