diff --git a/releases/unreleased/unify-identities-with-same-source.yml b/releases/unreleased/unify-identities-with-same-source.yml new file mode 100644 index 00000000..b902032c --- /dev/null +++ b/releases/unreleased/unify-identities-with-same-source.yml @@ -0,0 +1,9 @@ +--- +title: Unify identities with same source +category: added +author: Jose Javier Merchante +issue: null +notes: > + Include a new option to only recommend or unify identities + from trusted sources like GitHub or GitLab that have the same + username and backend. diff --git a/sortinghat/core/jobs.py b/sortinghat/core/jobs.py index 02e70026..b5ca566d 100644 --- a/sortinghat/core/jobs.py +++ b/sortinghat/core/jobs.py @@ -205,7 +205,8 @@ def recommend_affiliations(ctx, uuids=None, last_modified=MIN_PERIOD_DATE): def recommend_matches(ctx, source_uuids, target_uuids, criteria, exclude=True, verbose=False, - strict=True, last_modified=MIN_PERIOD_DATE): + strict=True, match_source=False, + last_modified=MIN_PERIOD_DATE): """Generate a list of affiliation recommendations from a set of individuals. This function generates a list of recommendations which include the @@ -231,6 +232,7 @@ def recommend_matches(ctx, source_uuids, RecommenderExclusionTerm table. Otherwise, results will not ignore them. :param verbose: if set to `True`, the match results will be composed by individual identities (even belonging to the same individual). + :param match_source: only unify individuals that share the same source :param last_modified: generate recommendations only for individuals modified after this date @@ -254,7 +256,16 @@ def recommend_matches(ctx, source_uuids, trxl = TransactionsLog.open('recommend_matches', job_ctx) - for rec in engine.recommend('matches', source_uuids, target_uuids, criteria, exclude, verbose, strict, last_modified): + recommendations = engine.recommend('matches', + source_uuids, + target_uuids, + criteria, + exclude, + verbose, + strict, + match_source, + last_modified) + for rec in recommendations: results[rec.key] = list(rec.options) # Store matches in the database for match in rec.options: @@ -423,7 +434,8 @@ def affiliate(ctx, uuids=None, last_modified=MIN_PERIOD_DATE): @django_rq.job @job_using_tenant -def unify(ctx, criteria, source_uuids=None, target_uuids=None, exclude=True, strict=True, last_modified=MIN_PERIOD_DATE): +def unify(ctx, criteria, source_uuids=None, target_uuids=None, exclude=True, + strict=True, match_source=False, last_modified=MIN_PERIOD_DATE): """Unify a set of individuals by merging them using matching recommendations. This function automates the identities unify process obtaining @@ -447,6 +459,7 @@ def unify(ctx, criteria, source_uuids=None, target_uuids=None, exclude=True, str :param exclude: if set to `True`, the results list will ignore individual identities if any value from the `email`, `name`, or `username` fields are found in the RecommenderExclusionTerm table. Otherwise, results will not ignore them. + :param match_source: only unify individuals that share the same source :param last_modified: only unify individuals that have been modified after this date :returns: a list with the individuals resulting from merge operations @@ -512,6 +525,7 @@ def _group_recommendations(recs): criteria, exclude=exclude, strict=strict, + match_source=match_source, last_modified=last_modified): match_recs[rec.mk] = list(rec.options) diff --git a/sortinghat/core/recommendations/matching.py b/sortinghat/core/recommendations/matching.py index 47dba4b9..939b6fde 100644 --- a/sortinghat/core/recommendations/matching.py +++ b/sortinghat/core/recommendations/matching.py @@ -39,10 +39,13 @@ EMAIL_ADDRESS_REGEX = r"^(?P[^\s@]+@[^\s@.]+\.[^\s@]+)$" NAME_REGEX = r"^\w+\s\w+" +MATCH_USERNAME_SOURCES = ['github', 'gitlab', 'slack'] + def recommend_matches(source_uuids, target_uuids, criteria, exclude=True, verbose=False, strict=True, + match_source=False, last_modified=MIN_PERIOD_DATE): """Recommend identity matches for a list of individuals. @@ -78,6 +81,7 @@ def recommend_matches(source_uuids, target_uuids, :param verbose: if set to `True`, the list of results will include individual identities. Otherwise, results will include main keys from individuals :param strict: strict matching with well-formed email addresses and names + :param match_source: only matching for identities with the same source :param last_modified: generate recommendations only for individuals modified after this date @@ -128,7 +132,8 @@ def _get_identities(uuid): identities = Identity.objects.all() target_set.update(identities) - matched = _find_matches(input_set, target_set, criteria, exclude=exclude, verbose=verbose, strict=strict) + matched = _find_matches(input_set, target_set, criteria, exclude=exclude, verbose=verbose, strict=strict, + match_source=match_source) # Return filtered results for uuid in source_uuids: result = set() @@ -148,7 +153,7 @@ def _get_identities(uuid): logger.info(f"Matching recommendations generated; criteria='{criteria}'") -def _find_matches(set_x, set_y, criteria, exclude, verbose, strict): +def _find_matches(set_x, set_y, criteria, exclude, verbose, strict, match_source=False): """Find identities matches between two sets using Pandas' library. This method find matches for the identities in `set_x` looking at @@ -170,6 +175,7 @@ def _find_matches(set_x, set_y, criteria, exclude, verbose, strict): :param verbose: if set to `True`, the list of results will include individual identities. Otherwise, results will include main keys from individuals. :param strict: strict matching with well-formed email addresses and names + :param match_source: only find matches for the same source :returns: a dictionary including the set of matches found for each identity from `set_x`. @@ -187,10 +193,15 @@ def _apply_recommender_exclusion_list(df): df_excluded = df[~df['username'].isin(excluded) & ~df['email'].isin(excluded) & ~df['name'].isin(excluded)] return df_excluded - def _filter_criteria(df, c, strict=True): + def _filter_criteria(df, c, strict=True, match_source=False): """Filter dataframe creating a basic subset including a given column""" cols = ['uuid', 'individual', c] - cdf = df[cols] + if match_source: + cols += ['source'] + cdf = df[cols] + cdf = cdf[cdf['source'].isin(MATCH_USERNAME_SOURCES)] + else: + cdf = df[cols] cdf = cdf.dropna(subset=[c]) if strict and c == 'email': @@ -216,9 +227,12 @@ def _filter_criteria(df, c, strict=True): cdfs = [] for c in criteria: - cdf_x = _filter_criteria(df_x, c, strict) - cdf_y = _filter_criteria(df_y, c, strict) - cdf = pandas.merge(cdf_x, cdf_y, on=c, how='inner') + cdf_x = _filter_criteria(df_x, c, strict, match_source) + cdf_y = _filter_criteria(df_y, c, strict, match_source) + if match_source: + cdf = pandas.merge(cdf_x, cdf_y, on=[c, 'source'], how='inner') + else: + cdf = pandas.merge(cdf_x, cdf_y, on=c, how='inner') cdf = cdf[['individual_x', 'uuid_x', 'individual_y', 'uuid_y']] cdfs.append(cdf) diff --git a/sortinghat/core/schema.py b/sortinghat/core/schema.py index af41b656..38eaca48 100644 --- a/sortinghat/core/schema.py +++ b/sortinghat/core/schema.py @@ -1075,6 +1075,7 @@ class Arguments: verbose = graphene.Boolean(required=False) exclude = graphene.Boolean(required=False) strict = graphene.Boolean(required=False) + match_source = graphene.Boolean(required=False) last_modified = graphene.DateTime(required=False) job_id = graphene.Field(lambda: graphene.String) @@ -1084,7 +1085,7 @@ class Arguments: def mutate(self, info, criteria, source_uuids=None, target_uuids=None, exclude=True, verbose=False, strict=True, - last_modified=MIN_PERIOD_DATE): + match_source=False, last_modified=MIN_PERIOD_DATE): user = info.context.user tenant = get_db_tenant() ctx = SortingHatContext(user=user, tenant=tenant) @@ -1097,6 +1098,7 @@ def mutate(self, info, criteria, exclude, verbose, strict, + match_source, last_modified, job_timeout=-1) @@ -1158,6 +1160,7 @@ class Arguments: criteria = graphene.List(graphene.String) exclude = graphene.Boolean(required=False) strict = graphene.Boolean(required=False) + match_source = graphene.Boolean(required=False) last_modified = graphene.DateTime(required=False) job_id = graphene.Field(lambda: graphene.String) @@ -1167,12 +1170,22 @@ class Arguments: def mutate(self, info, criteria, source_uuids=None, target_uuids=None, exclude=True, strict=True, + match_source=False, last_modified=MIN_PERIOD_DATE): user = info.context.user tenant = get_db_tenant() ctx = SortingHatContext(user=user, tenant=tenant) - job = enqueue(unify, ctx, criteria, source_uuids, target_uuids, exclude, strict, last_modified, job_timeout=-1) + job = enqueue(unify, + ctx, + criteria, + source_uuids, + target_uuids, + exclude, + strict, + match_source, + last_modified, + job_timeout=-1) return Unify( job_id=job.id diff --git a/tests/rec/test_matches.py b/tests/rec/test_matches.py index 389a7eb3..5533e92a 100644 --- a/tests/rec/test_matches.py +++ b/tests/rec/test_matches.py @@ -402,3 +402,101 @@ def test_not_found_uuid_error(self): self.assertEqual(rec[0], '1234567890abcdefg') self.assertEqual(rec[1], '1234567890abcdefg') self.assertEqual(rec[2], []) + + def test_recommend_match_source(self): + """Test if recommendations are created between same identities with same source""" + + jr3 = api.add_identity(self.ctx, + name='J. Rae', + username='jane_rae', + source='github', + uuid=self.jane_rae.uuid) + jrae_github = api.add_identity(self.ctx, + name='Jane Rae', + username='jane_rae', + source='github') + + source_uuids = [self.john_smith.uuid, self.jrae_no_name.uuid, self.jr2.uuid] + target_uuids = [self.john_smith.uuid, self.js2.uuid, self.js3.uuid, + self.js4.uuid, + self.jsmith.uuid, self.jsm2.uuid, self.jsm3.uuid, + self.jane_rae.uuid, self.jr2.uuid, + self.js_alt.uuid, self.js_alt2.uuid, + self.js_alt3.uuid, self.js_alt4.uuid, + self.jrae.uuid, self.jrae2.uuid, + self.jrae_no_name.uuid, self.jsmith_no_email.uuid, + jrae_github] + + criteria = ['email', 'name', 'username'] + + # Recommend identities which match the fields in `criteria` for the same `source` + recs = list(recommend_matches(source_uuids, + target_uuids, + criteria, + match_source=True)) + + self.assertEqual(len(recs), 3) + + rec = recs[0] + self.assertEqual(rec[0], self.john_smith.uuid) + self.assertEqual(rec[1], self.john_smith.individual.mk) + self.assertEqual(rec[2], []) + + rec = recs[1] + self.assertEqual(rec[0], self.jrae_no_name.uuid) + self.assertEqual(rec[1], self.jrae_no_name.individual.mk) + self.assertEqual(rec[2], []) + + rec = recs[2] + self.assertEqual(rec[0], self.jr2.uuid) + self.assertEqual(rec[1], self.jr2.individual.mk) + self.assertEqual(rec[2], sorted([jrae_github.individual.mk])) + + def test_recommend_same_source_not_trusted(self): + """Matches are not created for ids with same source but not github or gitlab""" + + jr3 = api.add_identity(self.ctx, + name='J. Rae', + username='jane_rae', + source='git', + uuid=self.jane_rae.uuid) + jrae_git = api.add_identity(self.ctx, + name='Jane Rae', + username='jane_rae', + source='git') + + source_uuids = [self.john_smith.uuid, self.jrae_no_name.uuid, self.jr2.uuid] + target_uuids = [self.john_smith.uuid, self.js2.uuid, self.js3.uuid, + self.js4.uuid, + self.jsmith.uuid, self.jsm2.uuid, self.jsm3.uuid, + self.jane_rae.uuid, self.jr2.uuid, + self.js_alt.uuid, self.js_alt2.uuid, + self.js_alt3.uuid, self.js_alt4.uuid, + self.jrae.uuid, self.jrae2.uuid, + self.jrae_no_name.uuid, self.jsmith_no_email.uuid, + jrae_git] + + criteria = ['email', 'name', 'username'] + + # Recommend identities which match the fields in `criteria` for the same `source` + recs = list(recommend_matches(source_uuids, + target_uuids, + criteria, + match_source=True)) + + self.assertEqual(len(recs), 3) + + rec = recs[0] + self.assertEqual(rec[0], self.john_smith.uuid) + self.assertEqual(rec[1], self.john_smith.individual.mk) + self.assertEqual(rec[2], []) + + rec = recs[1] + self.assertEqual(rec[0], self.jrae_no_name.uuid) + self.assertEqual(rec[1], self.jrae_no_name.individual.mk) + self.assertEqual(rec[2], []) + + rec = recs[2] + self.assertEqual(rec[0], self.jr2.uuid) + self.assertEqual(rec[1], self.jr2.individual.mk) + self.assertEqual(rec[2], []) diff --git a/tests/test_jobs.py b/tests/test_jobs.py index f743d1cb..afbaacdc 100644 --- a/tests/test_jobs.py +++ b/tests/test_jobs.py @@ -1002,6 +1002,66 @@ def test_recommend_matches_verbose_no_strict(self): MergeRecommendation.objects.filter(individual1=rec[0], individual2=rec[1]).exists()) + def test_recommend_matches_match_source(self): + """Check if recommendations are obtained when the identities share the source""" + + ctx = SortingHatContext(self.user) + + jr3 = api.add_identity(self.ctx, + name='J. Rae', + username='jane_rae', + source='github', + uuid=self.jane_rae.uuid) + jrae_github = api.add_identity(self.ctx, + name='Jane Rae', + username='jane_rae', + source='github') + + # Test + expected = { + 'results': { + self.john_smith.uuid: [], + self.jrae_no_name.uuid: [], + self.jr2.uuid: sorted([jrae_github.individual.mk]) + } + } + + recommendations_expected = [ + sorted([self.jr2.individual.mk, jrae_github.individual.mk]) + ] + + source_uuids = [self.john_smith.uuid, self.jrae_no_name.uuid, self.jr2.uuid] + target_uuids = [self.john_smith.uuid, self.js2.uuid, self.js3.uuid, + self.jsmith.uuid, self.jsm2.uuid, self.jsm3.uuid, + self.jane_rae.uuid, self.jr2.uuid, + self.js_alt.uuid, self.js_alt2.uuid, + self.js_alt3.uuid, self.js_alt4.uuid, + self.jrae.uuid, self.jrae2.uuid, self.jrae_no_name.uuid, + jrae_github] + + criteria = ['email', 'name', 'username'] + + # Identities which don't have the fields in `criteria` or no matches won't be returned + job = recommend_matches.delay(ctx, + source_uuids, + target_uuids, + criteria, + strict=False, + match_source=True) + # Preserve job results order for the comparison against the expected results + result = job.result + for key in result['results']: + result['results'][key] = sorted(result['results'][key]) + + self.assertDictEqual(result, expected) + + self.assertEqual(MergeRecommendation.objects.count(), 1) + + for rec in recommendations_expected: + self.assertTrue( + MergeRecommendation.objects.filter(individual1=rec[0], + individual2=rec[1]).exists()) + def test_recommend_source_not_mk(self): """Check if recommendations work when the provided uuid is not an Individual's main key""" @@ -1342,6 +1402,45 @@ def test_unify_all_individuals(self): identities = individual_2.identities.all() self.assertEqual(len(identities), 5) + def test_unify_match_source(self): + """Check if unify is applied when the identities share the source""" + + jr3 = api.add_identity(self.ctx, + name='J. Rae', + username='jane_rae', + source='github', + uuid=self.jane_rae.uuid) + jrae_github = api.add_identity(self.ctx, + name='Jane Rae', + username='jane_rae', + source='github') + + ctx = SortingHatContext(self.user) + + # Test + expected = { + 'results': [jrae_github.uuid], + 'errors': [] + } + + criteria = ['email', 'name', 'username'] + + # Identities which don't have the fields in `criteria` or no matches won't be returned + job = unify.delay(ctx, + criteria, + None, + None, + match_source=True) + + result = job.result + self.assertDictEqual(result, expected) + + # Checking if the identities have been merged + + individual = Individual.objects.get(mk=jrae_github.uuid) + identities = individual.identities.all() + self.assertEqual(len(identities), 4) + def test_unify_last_modified(self): """Check if unify is applied only for individuals updated after a given date""" diff --git a/tests/test_schema.py b/tests/test_schema.py index 56768e01..13c7b3ab 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -9728,11 +9728,13 @@ class TestUnifyMutation(django.test.TestCase): $targetUuids: [String], $criteria: [String], $strict: Boolean, + $matchSource: Boolean, $lastModified: DateTime) { unify(sourceUuids: $sourceUuids, targetUuids: $targetUuids, criteria: $criteria, strict: $strict, + matchSource: $matchSource lastModified: $lastModified) { jobId } @@ -10164,6 +10166,68 @@ def test_unify_strict(self, mock_job_id_gen): self.assertEqual(len(identities), 7) + @unittest.mock.patch('sortinghat.core.jobs.rq.job.uuid4') + def test_unify_match_source(self, mock_job_id_gen): + """Check if unify is applied for the specified individuals""" + + mock_job_id_gen.return_value = "1234-5678-90AB-CDEF" + + client = graphene.test.Client(schema) + + jr3 = api.add_identity(self.ctx, + name='J. Rae', + username='jane_rae', + source='github', + uuid=self.jane_rae.uuid) + jrae_github = api.add_identity(self.ctx, + name='Jane Rae', + username='jane_rae', + source='github') + + params = { + 'sourceUuids': [self.john_smith.uuid, self.jrae3.uuid, self.jr2.uuid], + 'targetUuids': [self.john_smith.uuid, self.js2.uuid, self.js3.uuid, + self.jsmith.uuid, self.jsm2.uuid, self.jsm3.uuid, + self.jane_rae.uuid, self.jr2.uuid, + self.js_alt.uuid, self.js_alt2.uuid, + self.js_alt3.uuid, self.js_alt4.uuid, + self.jrae.uuid, self.jrae2.uuid, self.jrae3.uuid, + jrae_github.uuid], + 'criteria': ['email', 'name', 'username'], + 'strict': False, + 'matchSource': True + } + + executed = client.execute(self.SH_UNIFY, + context_value=self.context_value, + variables=params) + + # Check if the job was run and individuals were merged + job_id = executed['data']['unify']['jobId'] + self.assertEqual(job_id, "1234-5678-90AB-CDEF") + + # Individual 1 no changed + individual_db_1 = Individual.objects.get(mk=self.jsmith.uuid) + identities = individual_db_1.identities.all() + self.assertEqual(len(identities), 3) + + # Individual 3 with new identities merged + individual_db_2 = Individual.objects.get(mk=jrae_github.uuid) + identities = individual_db_2.identities.all() + self.assertEqual(len(identities), 4) + + id1 = identities[0] + self.assertEqual(id1, jrae_github) + + id2 = identities[1] + self.assertEqual(id2, jr3) + + id3 = identities[2] + self.assertEqual(id3, self.jane_rae) + + id4 = identities[3] + self.assertEqual(id4, self.jr2) + @unittest.mock.patch('sortinghat.core.jobs.rq.job.uuid4') def test_unify_source_not_mk(self, mock_job_id_gen): """Check if unify works when the provided uuid is not an Individual's main key""" diff --git a/ui/src/apollo/mutations.js b/ui/src/apollo/mutations.js index cf6768c6..bea5a9c7 100644 --- a/ui/src/apollo/mutations.js +++ b/ui/src/apollo/mutations.js @@ -288,8 +288,18 @@ const GENDERIZE = gql` `; const UNIFY = gql` - mutation unify($criteria: [String], $exclude: Boolean, $strict: Boolean) { - unify(criteria: $criteria, exclude: $exclude, strict: $strict) { + mutation unify( + $criteria: [String] + $exclude: Boolean + $strict: Boolean + $matchSource: Boolean + ) { + unify( + criteria: $criteria + exclude: $exclude + strict: $strict + matchSource: $matchSource + ) { jobId } } @@ -312,12 +322,14 @@ const RECOMMEND_MATCHES = gql` $exclude: Boolean $sourceUuids: [String] $strict: Boolean + $matchSource: Boolean ) { recommendMatches( criteria: $criteria exclude: $exclude sourceUuids: $sourceUuids strict: $strict + matchSource: $matchSource ) { jobId } @@ -670,13 +682,14 @@ const genderize = (apollo, exclude, noStrictMatching, uuids) => { }); }; -const unify = (apollo, criteria, exclude, strict) => { +const unify = (apollo, criteria, exclude, strict, matchSource) => { return apollo.mutate({ mutation: UNIFY, variables: { criteria: criteria, exclude: exclude, strict: strict, + matchSource: matchSource, }, }); }; @@ -692,7 +705,14 @@ const manageMergeRecommendation = (apollo, id, apply) => { }); }; -const recommendMatches = (apollo, criteria, exclude, strict, sourceUuids) => { +const recommendMatches = ( + apollo, + criteria, + exclude, + strict, + sourceUuids, + matchSource +) => { return apollo.mutate({ mutation: RECOMMEND_MATCHES, variables: { @@ -700,6 +720,7 @@ const recommendMatches = (apollo, criteria, exclude, strict, sourceUuids) => { exclude: exclude, sourceUuids: sourceUuids, strict: strict, + matchSource: matchSource, }, }); }; diff --git a/ui/src/components/JobModal.vue b/ui/src/components/JobModal.vue index 8e920341..c56e053c 100644 --- a/ui/src/components/JobModal.vue +++ b/ui/src/components/JobModal.vue @@ -54,7 +54,13 @@ /> + @@ -106,6 +112,12 @@ dense hide-details /> + @@ -191,11 +203,13 @@ const defaultForms = { criteria: ["name", "email", "username"], exclude: true, strict: true, + matchSource: false, }, recommendMatches: { criteria: ["name", "email", "username"], exclude: true, strict: true, + matchSource: false, }, }; diff --git a/ui/src/views/Jobs.vue b/ui/src/views/Jobs.vue index 656a7e84..bcba4306 100644 --- a/ui/src/views/Jobs.vue +++ b/ui/src/views/Jobs.vue @@ -62,9 +62,9 @@ export default { }); } }, - async unify({ criteria, exclude, strict }) { + async unify({ criteria, exclude, strict, matchSource }) { try { - await unify(this.$apollo, criteria, exclude, strict); + await unify(this.$apollo, criteria, exclude, strict, matchSource); this.$refs.table.getPaginatedJobs(); } catch (error) { this.snackbar = Object.assign(this.snackbar, { @@ -73,9 +73,15 @@ export default { }); } }, - async recommendMatches({ criteria, exclude, strict }) { + async recommendMatches({ criteria, exclude, strict, matchSource }) { try { - await recommendMatches(this.$apollo, criteria, exclude, strict); + await recommendMatches( + this.$apollo, + criteria, + exclude, + strict, + matchSource + ); this.$refs.table.getPaginatedJobs(); } catch (error) { this.snackbar = Object.assign(this.snackbar, { diff --git a/ui/src/views/SettingsGeneral.vue b/ui/src/views/SettingsGeneral.vue index ee45e3e0..c5fa4e61 100644 --- a/ui/src/views/SettingsGeneral.vue +++ b/ui/src/views/SettingsGeneral.vue @@ -141,7 +141,13 @@ /> + @@ -258,6 +264,7 @@ export default { criteria: ["name", "email", "username"], exclude: true, strict: true, + match_source: false, }, }, },