From 4b1ed4c36e6a047512d185dbede05cc62a2fc5ce Mon Sep 17 00:00:00 2001 From: Jose Javier Merchante Date: Tue, 21 Nov 2023 13:50:34 +0100 Subject: [PATCH] [gitdm] Gitdm identities importer Create a new SortingHat identities importer for Gitdm format. This backend is configured with three parameters: a URL pointing to the file that matches emails with organizations, an optional URL for an aliases file that associates emails, and a flag for email validation to verify the validity of the provided email addresses. Signed-off-by: Jose Javier Merchante --- .../unreleased/gitdm-identities-importer.yml | 13 + sortinghat/core/importer/backends/gitdm.py | 406 ++++++++++ sortinghat/core/jobs.py | 9 +- .../data/gitdm/gitdm_email_aliases_valid.txt | 9 + .../gitdm/gitdm_email_to_employer_invalid.txt | 9 + .../gitdm/gitdm_email_to_employer_valid.txt | 9 + tests/importer/test_gitdm.py | 735 ++++++++++++++++++ 7 files changed, 1184 insertions(+), 6 deletions(-) create mode 100644 releases/unreleased/gitdm-identities-importer.yml create mode 100644 sortinghat/core/importer/backends/gitdm.py create mode 100644 tests/importer/data/gitdm/gitdm_email_aliases_valid.txt create mode 100644 tests/importer/data/gitdm/gitdm_email_to_employer_invalid.txt create mode 100644 tests/importer/data/gitdm/gitdm_email_to_employer_valid.txt create mode 100644 tests/importer/test_gitdm.py diff --git a/releases/unreleased/gitdm-identities-importer.yml b/releases/unreleased/gitdm-identities-importer.yml new file mode 100644 index 000000000..7114f6fb5 --- /dev/null +++ b/releases/unreleased/gitdm-identities-importer.yml @@ -0,0 +1,13 @@ +--- +title: Gitdm identities importer +category: added +author: Jose Javier Merchante +issue: null +notes: > + New SortingHat identities importer for Gitdm format. 
+ This backend is configured with three parameters: a URL pointing + to the file that matches emails with organizations, an optional + URL for an aliases file that associates emails, and a flag for + email validation to verify the validity of the provided email + addresses. + diff --git a/sortinghat/core/importer/backends/gitdm.py b/sortinghat/core/importer/backends/gitdm.py new file mode 100644 index 000000000..976dfe8f6 --- /dev/null +++ b/sortinghat/core/importer/backends/gitdm.py @@ -0,0 +1,406 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2023 Bitergia +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. 
+# +# Authors: +# Jose Javier Merchante +# + +from urllib.request import urlopen + +import dateutil.parser +import dateutil.tz +import logging +import re + +from ..backend import IdentitiesImporter +from sortinghat.core.importer.models import (Individual, + Identity, + Enrollment, + Organization) +from sortinghat.core.errors import InvalidFormatError +from sortinghat.core.models import MIN_PERIOD_DATE, MAX_PERIOD_DATE + +logger = logging.getLogger(__name__) + + +class GitdmImporter(IdentitiesImporter): + + NAME = 'gitdm' + + def __init__(self, ctx, url, aliases_url=None, email_validation=True): + super().__init__(ctx, url) + self.aliases_url = aliases_url + if isinstance(email_validation, str): + email_validation = email_validation.lower() in ('true', '1') + self.email_validation = email_validation + + def get_individuals(self): + """Get the individuals for the given url""" + + data = self._fetch_data(self.url) + # Some files include '!' instead of '@' + data = data.replace('!', '@') + + aliases = None + if self.aliases_url: + aliases = self._fetch_data(self.aliases_url) + + parser = GitdmParser(aliases=aliases, email_to_employer=data, + email_validation=self.email_validation) + return parser.individuals + + def _fetch_data(self, url): + with urlopen(url) as fd: + content = fd.read().decode() + return content + + +class GitdmParser(object): + """Parse identities and organizations using Gitdm files. + + Gitdm provides several files that include information about + identities, organizations and affiliations. This parser is able + to parse anyone of these file formats, together or separate. + + The individuals are stored in an object named 'individuals'. + The keys of this object are the UUID of the individuals. + Each individual object stores a list of identities and + enrollments. Email addresses will not be validated when + `email_validation` is set to `False`. + + Organizations are stored in 'organizations' object. 
Its keys + are the name of the organizations and each organization object is + related to a list of domains. + + :param aliases: aliases stream + :param email_to_employer: enrollments stream + :param domain_to_employer: organizations stream + :param source: source of the data + :param email_validation: validate email addresses; set to True by default + + :raises InvalidFormatError: raised when the format of any of the + given streams is not valid. + """ + + # Common Gitdm patterns + VALID_LINE_REGEX = r"^(\S+)[ \t]+([^#\n\r\f\v]+[^#\s]*)(?:([ \t]+#.*)?|\s*)$" + LINES_TO_IGNORE_REGEX = r"^\s*(?:#.*)?\s*$" + EMAIL_ADDRESS_REGEX = r"^(?P<email>[^\s@]+@[^\s@.]+\.[^\s@]+)$" + ORGANIZATION_REGEX = r"^(?P<organization>[^#<\t\n\r\f\v]*[^#<\t\n\r\f\v\s])?$" + DOMAIN_REGEX = r"^(?P<domain>\w\S+)$" + ENROLLMENT_REGEX = r"^(?P<organization>[^#<\n\r\f\v]*[^#<\t\n\r\f\v\s])(?:[ \t]+<[ \t]+(?P<date>\d{4}\-\d{2}\-\d{2}))?$" + + def __init__(self, aliases=None, email_to_employer=None, domain_to_employer=None, + source='gitdm', email_validation=True): + self._individuals = {} + self._organizations = {} + self.source = source + self.email_validation = email_validation + + # Raw data + self.__raw_identities = {} + self.__raw_aliases = {} + self.__raw_orgs = {} + + self.__parse(aliases, email_to_employer, + domain_to_employer) + + @property + def individuals(self): + uids = [u for u in self._individuals.values()] + return uids + + @property + def organizations(self): + orgs = [o for o in self._organizations.values()] + return orgs + + def __parse(self, aliases, email_to_employer, domain_to_employer): + """Parse Gitdm streams""" + self.__parse_organizations(domain_to_employer) + self.__parse_identities(aliases, email_to_employer) + + def __parse_identities(self, aliases, email_to_employer): + """Parse Gitdm identities""" + + # Parse streams + self.__parse_aliases_stream(aliases) + self.__parse_email_to_employer_stream(email_to_employer) + + # Create individuals from aliases list + for alias, email in self.__raw_aliases.items(): + 
individual = self._individuals.get(email, None) + + if not individual: + individual = Individual(uuid=email) + e = re.match(self.EMAIL_ADDRESS_REGEX, email, re.UNICODE) + if e: + identity = Identity(email=email, source=self.source) + else: + identity = Identity(username=email, source=self.source) + + individual.identities.append(identity) + + self._individuals[email] = individual + + # Create identity with alias + e = re.match(self.EMAIL_ADDRESS_REGEX, alias, re.UNICODE) + if e: + identity = Identity(email=alias, source=self.source) + else: + identity = Identity(username=alias, source=self.source) + individual.identities.append(identity) + + # Create individuals from enrollments list + for email, enrs in self.__raw_identities.items(): + + if email in self._individuals: + individual = self._individuals[email] + elif email in self.__raw_aliases: + canonical = self.__raw_aliases[email] + individual = self._individuals[canonical] + else: + individual = Individual(uuid=email) + identity = Identity(email=email, source=self.source) + individual.identities.append(identity) + self._individuals[email] = individual + + # Assign enrollments + enrs.sort(key=lambda r: r[1]) + start_date = MIN_PERIOD_DATE + + for rol in enrs: + name = rol[0] + org = self._organizations.get(name, None) + + if not org: + org = Organization(name=name) + self._organizations[name] = org + + end_date = rol[1] + + enrollment = Enrollment(start=start_date, end=end_date, + organization=org) + individual.enrollments.append(enrollment) + + if end_date != MAX_PERIOD_DATE: + start_date = end_date + + def __parse_organizations(self, domain_to_employer): + """Parse Gitdm organizations""" + + # Parse streams + self.__parse_domain_to_employer_stream(domain_to_employer) + + for org, doms in self.__raw_orgs.items(): + o = Organization(name=org) + for dom in doms: + o.domains.append(dom) + self._organizations[org] = o + + def __parse_aliases_stream(self, stream): + """Parse aliases stream. 
+ + The stream contains a list of usernames (they can be email addresses) and + their username aliases. Each line has a username and an alias separated + by tabs. Comment lines start with the hash character (#). + + Example: + + # List of email aliases + jsmith@example.com jsmith@example.net + jsmith@example.net johnsmith@example.com + jdoe@example.com john_doe@example.com + jdoe@example john_doe@example.com + """ + if not stream: + return + + f = self.__parse_aliases_line + + for alias_entries in self.__parse_stream(stream, f): + alias = alias_entries[0] + username = alias_entries[1] + + self.__raw_aliases[alias] = username + + def __parse_email_to_employer_stream(self, stream): + """Parse email to employer stream. + + The stream contains a list of email addresses and their employers. + Each line has an email address and an organization name separated by + tabs. Optionally, the date when the identity withdrew from the + organization can be included, preceded by a '<' character. Comment + lines start with the hash character (#). + + Example: + + # List of enrollments + jsmith@example.com Example Company # John Smith + jdoe@example.com Example Company # John Doe + jsmith@example.com Bitergia < 2015-01-01 # John Smith - Bitergia + """ + if not stream: + return + + f = self.__parse_email_to_employer_line + + for rol in self.__parse_stream(stream, f): + email = rol[0] + org = rol[1] + rol_date = rol[2] + + if org not in self.__raw_orgs: + self.__raw_orgs[org] = [] + + if email not in self.__raw_identities: + self.__raw_identities[email] = [(org, rol_date)] + else: + self.__raw_identities[email].append((org, rol_date)) + + def __parse_domain_to_employer_stream(self, stream): + """Parse domain to employer stream. + + Each line of the stream has to contain a domain and an organization, + or employer, separated by tabs. 
Comment lines start with the hash + character (#) + + Example: + + # Domains from domains.txt + example.org Example + example.com Example + bitergia.com Bitergia + libresoft.es LibreSoft + example.org LibreSoft + """ + if not stream: + return + + f = self.__parse_domain_to_employer_line + + for o in self.__parse_stream(stream, f): + org = o[0] + dom = o[1] + + if org not in self.__raw_orgs: + self.__raw_orgs[org] = [] + + self.__raw_orgs[org].append(dom) + + def __parse_stream(self, stream, parse_line): + """Generic method to parse gitdm streams""" + + if not stream: + raise InvalidFormatError(cause='stream cannot be empty or None') + + nline = 0 + lines = stream.split('\n') + + for line in lines: + nline += 1 + + # Ignore blank lines and comments + m = re.match(self.LINES_TO_IGNORE_REGEX, line, re.UNICODE) + if m: + continue + + m = re.match(self.VALID_LINE_REGEX, line, re.UNICODE) + if not m: + cause = "Skip: '%s' -> line %s: invalid line format" % (line, str(nline)) + logger.warning(cause) + continue + + try: + result = parse_line(m.group(1), m.group(2)) + yield result + except InvalidFormatError as e: + cause = "Skip: '%s' -> line %s: %s" % (line, str(nline), e) + logger.warning(cause) + continue + + def __parse_aliases_line(self, raw_alias, raw_username): + """Parse aliases lines""" + + alias = self.__encode(raw_alias) + username = self.__encode(raw_username) + + return alias, username + + def __parse_email_to_employer_line(self, raw_email, raw_enrollment): + """Parse email to employer lines""" + + e = re.match(self.EMAIL_ADDRESS_REGEX, raw_email, re.UNICODE) + if not e and self.email_validation: + cause = "invalid email format: '%s'" % raw_email + raise InvalidFormatError(cause=cause) + + if self.email_validation: + email = e.group('email').strip() + else: + email = raw_email + + raw_enrollment = raw_enrollment.strip() if raw_enrollment != ' ' else raw_enrollment + r = re.match(self.ENROLLMENT_REGEX, raw_enrollment, re.UNICODE) + if not r: + cause = "invalid 
enrollment format: '%s'" % raw_enrollment + raise InvalidFormatError(cause=cause) + + org = r.group('organization').strip() + date = r.group('date') + + if date: + try: + dt = dateutil.parser.parse(r.group('date')) + dt = dt.replace(tzinfo=dateutil.tz.tzutc()) + except Exception as e: + cause = "invalid date: '%s'" % date + logger.warning(cause) + dt = MAX_PERIOD_DATE + else: + dt = MAX_PERIOD_DATE + + email = self.__encode(email) + org = self.__encode(org) + + return email, org, dt + + def __parse_domain_to_employer_line(self, raw_domain, raw_org): + """Parse domain to employer lines""" + + d = re.match(self.DOMAIN_REGEX, raw_domain, re.UNICODE) + if not d: + cause = "invalid domain format: '%s'" % raw_domain + raise InvalidFormatError(cause=cause) + + dom = d.group('domain').strip() + + raw_org = raw_org.strip() if raw_org != ' ' else raw_org + o = re.match(self.ORGANIZATION_REGEX, raw_org, re.UNICODE) + if not o: + cause = "invalid organization format: '%s'" % raw_org + raise InvalidFormatError(cause=cause) + + org = o.group('organization').strip() + + org = self.__encode(org) + dom = self.__encode(dom) + + return org, dom + + def __encode(self, s): + return s if s else None diff --git a/sortinghat/core/jobs.py b/sortinghat/core/jobs.py index 5d6810056..4f6536007 100644 --- a/sortinghat/core/jobs.py +++ b/sortinghat/core/jobs.py @@ -609,7 +609,7 @@ def genderize(ctx, uuids=None, exclude=True, no_strict_matching=False): @django_rq.job @job_using_tenant -def import_identities(ctx, backend_name, url, params=None): +def import_identities(ctx, backend_name, url, **kwargs): """Import identities to SortingHat. 
This job imports identities to SortingHat using the @@ -618,7 +618,7 @@ def import_identities(ctx, backend_name, url, params=None): :param ctx: context where this job is run :param backend_name: name of the importer backend :param url: URL of a file or API to fetch the identities from - :param params: specific arguments for the importer backend + :param kwargs: specific arguments for the importer backend :returns: number of identities imported """ @@ -630,15 +630,12 @@ def import_identities(ctx, backend_name, url, params=None): backends = find_import_identities_backends() klass = backends[backend_name]['class'] - if not params: - params = {} - # Create a new context to include the reference # to the job id that will perform the transaction. job_ctx = SortingHatContext(ctx.user, job.id, ctx.tenant) trxl = TransactionsLog.open('import_identities', job_ctx) - importer = klass(ctx=job_ctx, url=url, **params) + importer = klass(ctx=job_ctx, url=url, **kwargs) nidentities = importer.import_identities() trxl.close() diff --git a/tests/importer/data/gitdm/gitdm_email_aliases_valid.txt b/tests/importer/data/gitdm/gitdm_email_aliases_valid.txt new file mode 100644 index 000000000..4b1a0c43c --- /dev/null +++ b/tests/importer/data/gitdm/gitdm_email_aliases_valid.txt @@ -0,0 +1,9 @@ +# +# Gitdm aliases file +# +# alias - canonical form +# + +john_doe@example.net jdoe@example.com +jrae@example.com jrae@example.net +jrae@laptop jrae@mylaptop diff --git a/tests/importer/data/gitdm/gitdm_email_to_employer_invalid.txt b/tests/importer/data/gitdm/gitdm_email_to_employer_invalid.txt new file mode 100644 index 000000000..6e5715785 --- /dev/null +++ b/tests/importer/data/gitdm/gitdm_email_to_employer_invalid.txt @@ -0,0 +1,9 @@ +# +# Gitdm enrollments example +# + +jsmith.example.com Example Company # John Smith +jdoe$example.com Example Company # John Doe +jsmith!example.com Bitergia < 2015-01-01 # John Smith - Bitergia +jrae-example-net Bitergia +john_doeexample LibreSoft diff --git 
a/tests/importer/data/gitdm/gitdm_email_to_employer_valid.txt b/tests/importer/data/gitdm/gitdm_email_to_employer_valid.txt new file mode 100644 index 000000000..78e1282e1 --- /dev/null +++ b/tests/importer/data/gitdm/gitdm_email_to_employer_valid.txt @@ -0,0 +1,9 @@ +# +# Gitdm enrollments example +# + +jsmith@example.com Example Company # John Smith +jdoe@example.com Example Company # John Doe +jsmith@example.com Bitergia < 2015-01-01 # John Smith - Bitergia +jrae@example.net Bitergia +john_doe@example.net LibreSoft diff --git a/tests/importer/test_gitdm.py b/tests/importer/test_gitdm.py new file mode 100644 index 000000000..e530e6a5b --- /dev/null +++ b/tests/importer/test_gitdm.py @@ -0,0 +1,735 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2023 Bitergia +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. 
+# +# Authors: +# Jose Javier Merchante +# + +import datetime +import os +import re +import unittest.mock + +from dateutil.tz import tzutc +from django.contrib.auth import get_user_model +from django.test import TestCase + +from sortinghat.core.context import SortingHatContext +from sortinghat.core.importer.backends.gitdm import GitdmImporter, GitdmParser +from sortinghat.core.models import Individual, Identity, MAX_PERIOD_DATE, MIN_PERIOD_DATE + + +DOMAINS_INVALID_FORMAT_ERROR = "line %(line)s: invalid format" + + +def read_file(filename, mode='r'): + with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), filename), mode) as f: + content = f.read() + return content + + +def mock_fetch(cls, url): + if url == 'valid_aliases': + return read_file('data/gitdm/gitdm_email_aliases_valid.txt') + elif url == 'email_employer': + return read_file('data/gitdm/gitdm_email_to_employer_valid.txt') + elif url == 'invalid_email_employer': + return read_file('data/gitdm/gitdm_email_to_employer_invalid.txt') + + +class TestGitdmImporter(TestCase): + """Test Gitdm importer""" + + def setUp(self): + """Initialize database""" + + self.user = get_user_model().objects.create(username='test') + self.ctx = SortingHatContext(self.user) + + def test_initialized(self): + """Test whether the importer is initialized""" + + importer = GitdmImporter(self.ctx, 'foo.url') + self.assertEqual(importer.ctx, self.ctx) + self.assertEqual(importer.url, 'foo.url') + self.assertEqual(importer.aliases_url, None) + self.assertEqual(importer.email_validation, True) + + def test_initialized_extra(self): + """Test whether the importer is initialized""" + + importer = GitdmImporter(self.ctx, url='foo.url', aliases_url='aliases.url', email_validation=False) + self.assertEqual(importer.ctx, self.ctx) + self.assertEqual(importer.url, 'foo.url') + self.assertEqual(importer.aliases_url, 'aliases.url') + self.assertEqual(importer.email_validation, False) + + def test_initialize_email_verification(self): + 
"""Test whether the importer detects email_verification as string""" + + importer = GitdmImporter(self.ctx, url='foo.url', email_validation='False') + self.assertEqual(importer.email_validation, False) + + importer = GitdmImporter(self.ctx, url='foo.url', email_validation='True') + self.assertEqual(importer.email_validation, True) + + @unittest.mock.patch.object(GitdmImporter, '_fetch_data', mock_fetch) + def test_email_employer_parser(self): + """Test whether the importer detects all the enrollments in the file""" + + importer = GitdmImporter(ctx=self.ctx, url='email_employer') + individuals = importer.get_individuals() + + self.assertEqual(len(individuals), 4) + + # Individual 1 + ind1 = individuals[0] + + identities = ind1.identities + identities.sort(key=lambda x: x.email) + self.assertEqual(len(identities), 1) + + identity = identities[0] + self.assertEqual(identity.source, 'gitdm') + self.assertEqual(identity.email, 'jsmith@example.com') + self.assertEqual(identity.name, None) + self.assertEqual(identity.username, None) + + enrollments = ind1.enrollments + enrollments.sort(key=lambda x: x.organization.name) + self.assertEqual(len(enrollments), 2) + + org = enrollments[0] + self.assertEqual(org.organization.name, 'Bitergia') + self.assertEqual(org.start, MIN_PERIOD_DATE) + self.assertEqual(org.end, datetime.datetime(2015, 1, 1, tzinfo=tzutc())) + + org = enrollments[1] + self.assertEqual(org.organization.name, 'Example Company') + self.assertEqual(org.start, datetime.datetime(2015, 1, 1, tzinfo=tzutc())) + self.assertEqual(org.end, MAX_PERIOD_DATE) + + # Individual 2 + ind2 = individuals[1] + + identities = ind2.identities + identities.sort(key=lambda x: x.email) + self.assertEqual(len(identities), 1) + + identity = identities[0] + self.assertEqual(identity.source, 'gitdm') + self.assertEqual(identity.email, 'jdoe@example.com') + self.assertEqual(identity.name, None) + self.assertEqual(identity.username, None) + + enrollments = ind2.enrollments + 
enrollments.sort(key=lambda x: x.organization.name) + self.assertEqual(len(enrollments), 1) + + org = enrollments[0] + self.assertEqual(org.start, MIN_PERIOD_DATE) + self.assertEqual(org.end, MAX_PERIOD_DATE) + self.assertEqual(org.organization.name, 'Example Company') + + # Individual 3 + ind3 = individuals[2] + + identities = ind3.identities + identities.sort(key=lambda x: x.email) + self.assertEqual(len(identities), 1) + + identity = identities[0] + self.assertEqual(identity.source, 'gitdm') + self.assertEqual(identity.email, 'jrae@example.net') + self.assertEqual(identity.name, None) + self.assertEqual(identity.username, None) + + enrollments = ind3.enrollments + enrollments.sort(key=lambda x: x.organization.name) + self.assertEqual(len(enrollments), 1) + + org = enrollments[0] + self.assertEqual(org.start, MIN_PERIOD_DATE) + self.assertEqual(org.end, MAX_PERIOD_DATE) + self.assertEqual(org.organization.name, 'Bitergia') + + # Individual 4 + ind4 = individuals[3] + + identities = ind4.identities + identities.sort(key=lambda x: x.email) + self.assertEqual(len(identities), 1) + + identity = identities[0] + self.assertEqual(identity.source, 'gitdm') + self.assertEqual(identity.email, 'john_doe@example.net') + self.assertEqual(identity.name, None) + self.assertEqual(identity.username, None) + + enrollments = ind4.enrollments + enrollments.sort(key=lambda x: x.organization.name) + self.assertEqual(len(enrollments), 1) + + org = enrollments[0] + self.assertEqual(org.start, MIN_PERIOD_DATE) + self.assertEqual(org.end, MAX_PERIOD_DATE) + self.assertEqual(org.organization.name, 'LibreSoft') + + @unittest.mock.patch.object(GitdmImporter, '_fetch_data', mock_fetch) + def test_email_employer_aliases_parser(self): + """Test whether the importer detects all the enrollments in the file""" + + importer = GitdmImporter(ctx=self.ctx, + url='email_employer', + aliases_url='valid_aliases') + individuals = importer.get_individuals() + + self.assertEqual(len(individuals), 4) + + # 
jdoe@example.com & john_doe@example.net + ind1 = individuals[0] + + identities = ind1.identities + identities.sort(key=lambda x: x.email) + self.assertEqual(len(identities), 2) + + identity = identities[0] + self.assertEqual(identity.source, 'gitdm') + self.assertEqual(identity.email, 'jdoe@example.com') + self.assertEqual(identity.name, None) + self.assertEqual(identity.username, None) + + identity = identities[1] + self.assertEqual(identity.source, 'gitdm') + self.assertEqual(identity.email, 'john_doe@example.net') + self.assertEqual(identity.name, None) + self.assertEqual(identity.username, None) + + enrollments = ind1.enrollments + enrollments.sort(key=lambda x: x.organization.name) + self.assertEqual(len(enrollments), 2) + + org = enrollments[0] + self.assertEqual(org.organization.name, 'Example Company') + self.assertEqual(org.start, MIN_PERIOD_DATE) + self.assertEqual(org.end, MAX_PERIOD_DATE) + + org = enrollments[1] + self.assertEqual(org.organization.name, 'LibreSoft') + self.assertEqual(org.start, MIN_PERIOD_DATE) + self.assertEqual(org.end, MAX_PERIOD_DATE) + + # jrae@example.net & jrae@example.com + ind2 = individuals[1] + + identities = ind2.identities + identities.sort(key=lambda x: x.email) + self.assertEqual(len(identities), 2) + + identity = identities[0] + self.assertEqual(identity.source, 'gitdm') + self.assertEqual(identity.email, 'jrae@example.com') + self.assertEqual(identity.name, None) + self.assertEqual(identity.username, None) + + identity = identities[1] + self.assertEqual(identity.source, 'gitdm') + self.assertEqual(identity.email, 'jrae@example.net') + self.assertEqual(identity.name, None) + self.assertEqual(identity.username, None) + + enrollments = ind2.enrollments + self.assertEqual(len(enrollments), 1) + + org = enrollments[0] + self.assertEqual(org.organization.name, 'Bitergia') + self.assertEqual(org.start, MIN_PERIOD_DATE) + self.assertEqual(org.end, MAX_PERIOD_DATE) + + # jrae@laptop & jrae@mylaptop + ind3 = individuals[2] + 
identities = ind3.identities + identities.sort(key=lambda x: x.username) + self.assertEqual(len(identities), 2) + + identity = identities[0] + self.assertEqual(identity.source, 'gitdm') + self.assertEqual(identity.email, None) + self.assertEqual(identity.name, None) + self.assertEqual(identity.username, 'jrae@laptop') + + identity = identities[1] + self.assertEqual(identity.source, 'gitdm') + self.assertEqual(identity.email, None) + self.assertEqual(identity.name, None) + self.assertEqual(identity.username, 'jrae@mylaptop') + + enrollments = ind3.enrollments + self.assertEqual(len(enrollments), 0) + + # jsmith@example.com + ind4 = individuals[3] + + identities = ind4.identities + self.assertEqual(len(identities), 1) + + identity = identities[0] + self.assertEqual(identity.source, 'gitdm') + self.assertEqual(identity.email, 'jsmith@example.com') + self.assertEqual(identity.name, None) + self.assertEqual(identity.username, None) + + enrollments = ind4.enrollments + enrollments.sort(key=lambda x: x.organization.name) + self.assertEqual(len(enrollments), 2) + + org = enrollments[0] + self.assertEqual(org.organization.name, 'Bitergia') + self.assertEqual(org.start, MIN_PERIOD_DATE) + self.assertEqual(org.end, datetime.datetime(2015, 1, 1, tzinfo=tzutc())) + + org = enrollments[1] + self.assertEqual(org.organization.name, 'Example Company') + self.assertEqual(org.start, datetime.datetime(2015, 1, 1, tzinfo=tzutc())) + self.assertEqual(org.end, MAX_PERIOD_DATE) + + @unittest.mock.patch.object(GitdmImporter, '_fetch_data', mock_fetch) + def test_email_validation(self): + """Test whether the importer validates the emails""" + + importer = GitdmImporter(ctx=self.ctx, + url='invalid_email_employer') + individuals = importer.get_individuals() + + self.assertEqual(len(individuals), 1) + + # Only 1 valid individual of 5 + ind1 = individuals[0] + + identities = ind1.identities + self.assertEqual(len(identities), 1) + + identity = identities[0] + self.assertEqual(identity.source, 
'gitdm') + self.assertEqual(identity.email, 'jsmith@example.com') + self.assertEqual(identity.name, None) + self.assertEqual(identity.username, None) + + enrollments = ind1.enrollments + self.assertEqual(len(enrollments), 1) + + org = enrollments[0] + self.assertEqual(org.organization.name, 'Bitergia') + self.assertEqual(org.start, MIN_PERIOD_DATE) + self.assertEqual(org.end, datetime.datetime(2015, 1, 1, tzinfo=tzutc())) + + @unittest.mock.patch.object(GitdmImporter, '_fetch_data', mock_fetch) + def test_supress_email_validation(self): + """Test whether the importer can supress email validation""" + + importer = GitdmImporter(ctx=self.ctx, + url='invalid_email_employer', + email_validation=False) + individuals = importer.get_individuals() + + self.assertEqual(len(individuals), 5) + + expected_emails = ['jsmith.example.com', 'jdoe$example.com', 'jsmith@example.com', + 'jrae-example-net', 'john_doeexample'] + + for uid in individuals: + identity = uid.identities[0] + self.assertIn(identity.email, expected_emails) + self.assertEqual(identity.name, None) + self.assertEqual(identity.username, None) + self.assertEqual(identity.source, 'gitdm') + self.assertEqual(identity.uuid, None) + + @unittest.mock.patch.object(GitdmImporter, '_fetch_data', mock_fetch) + def test_load_individuals(self): + """Test the import_identities method works""" + + expected = { + 'd5b277340e6b8a7166e219b3d104f9a2b2c3f9ac': { + 'profile': { + 'email': 'jsmith@example.com' + }, + 'identities': [ + { + 'uuid': 'd5b277340e6b8a7166e219b3d104f9a2b2c3f9ac', + 'name': None, + 'email': 'jsmith@example.com', + 'username': None + } + ] + }, + '3c5927fa7c7ad2b1276f98eabd603efeb061b089': { + 'profile': { + 'email': 'jdoe@example.com' + }, + 'identities': [ + { + 'uuid': '3c5927fa7c7ad2b1276f98eabd603efeb061b089', + 'name': None, + 'email': 'jdoe@example.com', + 'username': None + } + ] + }, + '998862fc7300c96d1962565d738fa2481d371c5e': { + 'profile': { + 'email': 'jrae@example.net' + }, + 'identities': [ + 
{ + 'uuid': '998862fc7300c96d1962565d738fa2481d371c5e', + 'name': None, + 'email': 'jrae@example.net', + 'username': None + } + ] + }, + '1da47c3655012673aef3a7f14ddf18a851fb0e5d': { + 'profile': { + 'email': 'john_doe@example.net' + }, + 'identities': [ + { + 'uuid': '1da47c3655012673aef3a7f14ddf18a851fb0e5d', + 'name': None, + 'email': 'john_doe@example.net', + 'username': None + } + ] + }, + } + + importer = GitdmImporter(self.ctx, 'email_employer') + importer.import_identities() + + individuals = Individual.objects.all() + identities = Identity.objects.all() + self.assertEqual(len(individuals), 4) + self.assertEqual(len(identities), 4) + + # Individual 1 + for individual in individuals: + self.assertIn(individual.mk, expected) + self.assertEqual(individual.profile.email, expected[individual.mk]['profile']['email']) + self.assertEqual(individual.identities.count(), len(expected[individual.mk]['identities'])) + self.assertEqual(individual.identities.first().name, expected[individual.mk]['identities'][0]['name']) + self.assertEqual(individual.identities.first().username, expected[individual.mk]['identities'][0]['username']) + self.assertEqual(individual.identities.first().email, expected[individual.mk]['identities'][0]['email']) + self.assertEqual(individual.identities.first().source, 'gitdm') + + @unittest.mock.patch.object(GitdmImporter, '_fetch_data', mock_fetch) + def test_load_existing_individuals(self): + """Test that running import_identities twice keeps the same individuals and identities""" + + importer = GitdmImporter(self.ctx, 'email_employer') + importer.import_identities() + + individuals = Individual.objects.all() + identities = Identity.objects.all() + self.assertEqual(len(individuals), 4) + self.assertEqual(len(identities), 4) + + mks_before = list(Individual.objects.values_list('mk', flat=True))  # evaluate now: QuerySets are lazy + uuids_before = list(Identity.objects.values_list('uuid', flat=True))  # otherwise read only after the re-import + + importer.import_identities() + + individuals = Individual.objects.all() + identities = Identity.objects.all() + 
self.assertEqual(len(individuals), 4) + self.assertEqual(len(identities), 4) + + mks_after = Individual.objects.values_list('mk', flat=True) + uuids_after = Identity.objects.values_list('uuid', flat=True) + + self.assertListEqual(sorted(mks_before), sorted(mks_after)) + self.assertListEqual(sorted(uuids_before), sorted(uuids_after)) + + +class TestGitdmRegEx(unittest.TestCase): + """Test regular expressions used while parsing Gitdm inputs""" + + def test_valid_lines(self): + """Check whether it parses valid lines""" + + parser = re.compile(GitdmParser.VALID_LINE_REGEX, re.UNICODE) + + # Parse some valid lines + m = parser.match("jdoe@example.com\tExample Company\t# John Doe") + self.assertIsNotNone(m) + + m = parser.match("jdoe@example.com\t\tExample < 2010-01-01\t\t# John Doe") + self.assertIsNotNone(m) + + m = parser.match("jdoe@example.com\tExample Company") + self.assertIsNotNone(m) + + m = parser.match("jdoe@example.com\t\t\tjohndoe@example.com") + self.assertIsNotNone(m) + + m = parser.match("example.org\t\tExample/n' Co. 
") + self.assertIsNotNone(m) + + m = parser.match("jdoe@example.org Example") + self.assertIsNotNone(m) + + # Parse some lines with valid comments + m = parser.match("example.org\torganization\t### comment") + self.assertIsNotNone(m) + + m = parser.match("jonhdoe@exampl.com\torganization\t# \t\r") + self.assertIsNotNone(m) + + m = parser.match("domain\torganization\t#\tcomment #1\r\n") + self.assertIsNotNone(m) + + m = parser.match(u"example.org\tExamplé") + self.assertIsNotNone(m) + + # It's weird but it's a valid line + m = parser.match("jdoe@example.org\tjdoe@exa\tmple.com") + self.assertIsNotNone(m) + + # These are examples of invalid lines + m = parser.match("\texample.org\t\tExample") + self.assertIsNone(m) + + m = parser.match(" example.org Example") + self.assertIsNone(m) + + m = parser.match("jdoe@example.org\nExample\t\n") + self.assertIsNone(m) + + m = parser.match("example.org\t\n\tExample") + self.assertIsNone(m) + + m = parser.match("example.org\tExa\nmple") + self.assertIsNone(m) + + m = parser.match("domain organization\t # comment\n\t") + self.assertIsNone(m) + + def test_lines_to_ignore(self): + """Check whether it matches blank or comment lines and rejects lines with content""" + + parser = re.compile(GitdmParser.LINES_TO_IGNORE_REGEX, re.UNICODE) + + # Parse some valid blank lines + m = parser.match("") + self.assertIsNotNone(m) + + m = parser.match("\n\n\n") + self.assertIsNotNone(m) + + m = parser.match(" \t \r\n ") + self.assertIsNotNone(m) + + m = parser.match("\t\t \n \t\n") + self.assertIsNotNone(m) + + # Do not parse invalid blank lines + m = parser.match("\ndomain organization\n\n") + self.assertIsNone(m) + + m = parser.match(" domain \t organization \r\n ") + self.assertIsNone(m) + + m = parser.match("\t domain organization\t \n \n") + self.assertIsNone(m) + + # Parse some valid comments + m = parser.match("# \t\n\r") + self.assertIsNotNone(m) + + m = parser.match("#|tcomment #1\r\n") + self.assertIsNotNone(m) + + def test_email(self): + """Check email address 
pattern""" + + parser = re.compile(GitdmParser.EMAIL_ADDRESS_REGEX, re.UNICODE) + + # Parse some valid email addresses + m = parser.match("johndoe@example.com") + self.assertIsNotNone(m) + + m = parser.match("jonh.doe@exampel.com") + self.assertIsNotNone(m) + + m = parser.match("?¡~,123@example.com") + self.assertIsNotNone(m) + + # Do not parse invalid email addresses + m = parser.match("jonh@doe@example.com") + self.assertIsNone(m) + + m = parser.match(" johndoe@example.com") + self.assertIsNone(m) + + m = parser.match("johndoe@example.com ") + self.assertIsNone(m) + + m = parser.match("johndoe@example.com\t") + self.assertIsNone(m) + + m = parser.match("johndoe@.com") + self.assertIsNone(m) + + def test_organization(self): + """Check organization pattern""" + + parser = re.compile(GitdmParser.ORGANIZATION_REGEX, re.UNICODE) + + # Organizations may start with alphanumerics, underscores or + # symbols such as ', / and -. They can contain spaces, but cannot + # end with one or include separators like tabs or # + + # These must work + m = parser.match("Example") + self.assertIsNotNone(m) + + m = parser.match("0Example") + self.assertIsNotNone(m) + + m = parser.match("_Example") + self.assertIsNotNone(m) + + m = parser.match("My Example") + self.assertIsNotNone(m) + + m = parser.match("Example\n") + self.assertIsNotNone(m) + self.assertEqual(m.group(1), "Example") + + m = parser.match("'Example") + self.assertIsNotNone(m) + + m = parser.match("/Example") + self.assertIsNotNone(m) + + m = parser.match("-Example") + self.assertIsNotNone(m) + + # While these won't work + m = parser.match("Example ") + self.assertIsNone(m) + + m = parser.match("Exa\tmple") + self.assertIsNone(m) + + m = parser.match("Example #") + self.assertIsNone(m) + + m = parser.match(" ") + self.assertIsNone(m) + + def test_domain(self): + """Check domain pattern""" + + parser = re.compile(GitdmParser.DOMAIN_REGEX, re.UNICODE) + + # Domains must start with alphanumeric or underscore + # characters. 
+ + # These must work + m = parser.match("__example.org") + self.assertIsNotNone(m) + + m = parser.match("9example.org") + self.assertIsNotNone(m) + + # While these won't work + m = parser.match("'_example.org") + self.assertIsNone(m) + + m = parser.match("/example.org") + self.assertIsNone(m) + + m = parser.match("exa\tmple.org") + self.assertIsNone(m) + + m = parser.match(" example.org") + self.assertIsNone(m) + + def test_enrollment(self): + """Check enrollment pattern (an organization, optionally followed by '< YYYY-MM-DD')""" + + parser = re.compile(GitdmParser.ENROLLMENT_REGEX, re.UNICODE) + + # These must work (plain names, names with symbols or tabs, and dated entries) + m = parser.match("Example") + self.assertIsNotNone(m) + + m = parser.match("0Example") + self.assertIsNotNone(m) + + m = parser.match("_Example") + self.assertIsNotNone(m) + + m = parser.match("My Example") + self.assertIsNotNone(m) + + m = parser.match("Example < 2012-01-01") + self.assertIsNotNone(m) + + m = parser.match("Example, Inc.") + self.assertIsNotNone(m) + + m = parser.match("'Example") + self.assertIsNotNone(m) + + m = parser.match("/Example") + self.assertIsNotNone(m) + + m = parser.match("Example < 2012-01-01") + self.assertIsNotNone(m) + + m = parser.match("Exa\tmple") + self.assertIsNotNone(m) + + # While these won't work (comment marker, blank name, missing or malformed date) + m = parser.match("Example #") + self.assertIsNone(m) + + m = parser.match(" ") + self.assertIsNone(m) + + m = parser.match("Example <") + self.assertIsNone(m) + + m = parser.match("Example<") + self.assertIsNone(m) + + m = parser.match("Example < 200-01-01") + self.assertIsNone(m) + + m = parser.match("Example < 2012-1-1") + self.assertIsNone(m) + + m = parser.match("Example < 2012-01-1") + self.assertIsNone(m) + + m = parser.match("Example < 1-1-2001") + self.assertIsNone(m) + + m = parser.match("Example < 2012-01-01 <") + self.assertIsNone(m)