Skip to content

Commit

Permalink
Merge branch 'improve_enrichment' of 'https://github.com/zhquan/Grimo…
Browse files Browse the repository at this point in the history
…ireELK'

Merges #1110
Closes #1110
  • Loading branch information
sduenas authored Jul 11, 2023
2 parents 39c9119 + 2eafa75 commit ebb35e7
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 48 deletions.
2 changes: 1 addition & 1 deletion grimoire_elk/elk.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,7 +371,7 @@ def load_identities(ocean_backend, enrich_backend):
return identities_count


@lru_cache(1024)
@lru_cache(4096)
def add_sh_identity_cache(identity_tuple, sh_db, backend):
"""Cache add_sh_identity calls. Identity must be in tuple format"""

Expand Down
59 changes: 32 additions & 27 deletions grimoire_elk/enriched/enrich.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
# Quan Zhou <[email protected]>
# Miguel Ángel Fernández <[email protected]>
#

import datetime
import json
import functools
import logging
Expand Down Expand Up @@ -121,7 +121,7 @@ def __init__(self, db_sortinghat=None, json_projects_map=None, db_user='',

perceval_backend = None
super().__init__(perceval_backend, insecure=insecure)

self._connector_name = None
self.sortinghat = False
if db_user == '':
db_user = DEFAULT_DB_USER
Expand Down Expand Up @@ -399,7 +399,9 @@ def add_metadata_filter_raw(self, eitem):
def get_connector_name(self):
""" Find the name for the current connector """
from ..utils import get_connector_name
return get_connector_name(type(self))
if not self._connector_name:
self._connector_name = get_connector_name(type(self))
return self._connector_name

def get_field_author(self):
""" Field with the author information """
Expand Down Expand Up @@ -468,10 +470,13 @@ def get_grimoire_fields(self, creation_date, item_name):
""" Return common grimoire fields for all data sources """

grimoire_date = None
try:
grimoire_date = str_to_datetime(creation_date).isoformat()
except Exception as ex:
pass
if isinstance(creation_date, datetime.datetime):
grimoire_date = creation_date.isoformat()
else:
try:
grimoire_date = str_to_datetime(creation_date).isoformat()
except Exception as ex:
pass

name = "is_" + self.get_connector_name() + "_" + item_name

Expand Down Expand Up @@ -771,7 +776,7 @@ def get_item_sh_fields(self, identity=None, item_date=None, sh_id=None,

return eitem_sh

@lru_cache()
@lru_cache(4096)
def get_sh_item_from_id(self, sh_id):
"""Get all the identity information from SortingHat using the individual id"""

Expand Down Expand Up @@ -802,7 +807,7 @@ def get_sh_item_from_identity(self, identity, backend_name):
sh_item = self.get_sh_item_from_identity_cache(identity_tuple, backend_name)
return sh_item

@lru_cache()
@lru_cache(4096)
def get_sh_item_from_identity_cache(self, identity_tuple, backend_name):
"""Get a SortingHat item with all the information related with an identity"""
sh_item = {}
Expand Down Expand Up @@ -862,15 +867,21 @@ def get_sh_item_from_identity_cache(self, identity_tuple, backend_name):

return sh_item

def get_sh_item_multi_enrollments(self, enrollments, item_date):
def get_sh_item_multi_enrollments(self, enrollments, item_date_str):
""" Get the enrollments for the uuid when the item was done """

enrolls = []
enrollments = enrollments if enrollments else []

# item_date must be offset-naive (utc)
if item_date and item_date.tzinfo:
item_date = (item_date - item_date.utcoffset()).replace(tzinfo=None)
if enrollments:
if item_date_str:
item_date = str_to_datetime(item_date_str)
else:
item_date = None

# item_date must be offset-naive (utc)
if item_date and item_date.tzinfo:
item_date = (item_date - item_date.utcoffset()).replace(tzinfo=None)

for enrollment in enrollments:
group = enrollment['group']
Expand Down Expand Up @@ -1032,9 +1043,9 @@ def get_item_sh(self, item, roles=None, date_field=None):
roles = [author_field]

if not date_field:
item_date = str_to_datetime(item[self.get_field_date()])
item_date = item[self.get_field_date()]
else:
item_date = str_to_datetime(item[date_field])
item_date = item[date_field]

users_data = self.get_users_data(item)

Expand Down Expand Up @@ -1092,29 +1103,23 @@ def generate_uuid(self, source, email=None, name=None, username=None):
args_without_empty = {k: v for k, v in args.items() if v}
return generate_uuid(**args_without_empty)

@lru_cache()
@lru_cache(4096)
def get_entity(self, id):
return SortingHat.get_entity(self.sh_db, id)

@lru_cache()
@lru_cache(4096)
def is_bot(self, uuid):
return SortingHat.is_bot(self.sh_db, uuid)

@lru_cache()
def get_uuid(self, backend_name, email=None, name=None, username=None):
# SortingHat GraphQL has no query that, given the backend_name, email, name, and username,
# returns the uuid. That is why we use add_id to get the uuid
return SortingHat.add_id(self.sh_db, backend_name, email=email, name=name, username=username)

@lru_cache()
@lru_cache(4096)
def get_enrollments(self, uuid):
return SortingHat.get_enrollments(self.sh_db, uuid)

@lru_cache()
@lru_cache(4096)
def get_unique_identity(self, uuid):
return SortingHat.get_unique_identity(self.sh_db, uuid)

@lru_cache()
@lru_cache(4096)
def get_uuid_from_id(self, sh_id):
""" Get the SH identity uuid from the id """
return SortingHat.get_uuid_from_id(self.sh_db, sh_id)
Expand All @@ -1123,7 +1128,7 @@ def add_sh_identities(self, identities):
SortingHat.add_identities(self.sh_db, identities,
self.get_connector_name())

@lru_cache()
@lru_cache(4096)
def add_sh_identity_cache(self, identity_tuple):
"""Cache add_sh_identity calls. Identity must be in tuple format"""

Expand Down
34 changes: 21 additions & 13 deletions grimoire_elk/enriched/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,9 +211,6 @@ def get_rich_item(self, item):
# The real data
commit = item['data']

self.__fix_field_date(commit, 'AuthorDate')
self.__fix_field_date(commit, 'CommitDate')

# data fields to copy
copy_fields = ["message"]
for f in copy_fields:
Expand All @@ -237,8 +234,9 @@ def get_rich_item(self, item):

eitem['hash_short'] = eitem['hash'][0:6]
# Enrich dates
author_date = str_to_datetime(commit["AuthorDate"])
commit_date = str_to_datetime(commit["CommitDate"])

author_date = self.__cast_str_to_datetime(commit, 'AuthorDate')
commit_date = self.__cast_str_to_datetime(commit, 'CommitDate')

eitem["author_date"] = author_date.replace(tzinfo=None).isoformat()
eitem["commit_date"] = commit_date.replace(tzinfo=None).isoformat()
Expand Down Expand Up @@ -324,10 +322,11 @@ def get_rich_item(self, item):
author_domain = self.get_identity_domain(self.get_sh_identity(item, 'Author'))
eitem['git_author_domain'] = author_domain

eitem.update(self.get_grimoire_fields(commit["AuthorDate"], "commit"))
grimoire_fields = self.get_grimoire_fields(author_date, "commit")
eitem.update(grimoire_fields)

# grimoire_creation_date is needed in the item
item.update(self.get_grimoire_fields(commit["AuthorDate"], "commit"))
item.update(grimoire_fields)
eitem.update(self.get_item_sh(item, self.roles))

if self.prjs_map:
Expand All @@ -341,8 +340,8 @@ def get_rich_item(self, item):
self.__add_commit_meta_fields(eitem, commit)
return eitem

def __fix_field_date(self, item, attribute):
"""Fix possible errors in the field date"""
def __cast_str_to_datetime(self, item, attribute):
"""Convert str to datetime fixing possible errors"""

field_date = str_to_datetime(item[attribute])

Expand All @@ -351,7 +350,8 @@ def __fix_field_date(self, item, attribute):
except ValueError:
logger.warning("[git] {} in commit {} has a wrong format".format(
attribute, item['commit']))
item[attribute] = field_date.replace(tzinfo=None).isoformat()
return field_date.replace(tzinfo=None).isoformat()
return field_date

def __add_commit_meta_fields(self, eitem, commit):
"""Add commit meta fields as signed_off_by, reviwed_by, tested_by, etc."""
Expand Down Expand Up @@ -382,9 +382,17 @@ def __add_commit_meta_fields(self, eitem, commit):

if self.sortinghat:
# Create SH identity if it does not exist
identity_tuple = tuple(identity.items())
self.add_sh_identity_cache(identity_tuple)
item_date = str_to_datetime(eitem[self.get_field_date()])
backend_name = self.get_connector_name()
identity_id = self.generate_uuid(backend_name,
email=identity['email'],
name=identity['name'],
username=identity['username'])
individual = self.get_entity(identity_id)
if not individual:
identity_tuple = tuple(identity.items())
self.add_sh_identity_cache(identity_tuple)
logger.debug("Create a new individual {} in commit_meta_fields".format(identity_id))
item_date = eitem[self.get_field_date()]
sh_fields = self.get_item_sh_fields(identity, item_date, rol=meta_field)
else:
sh_fields = self.get_item_no_sh_fields(identity, rol=meta_field)
Expand Down
7 changes: 0 additions & 7 deletions grimoire_elk/enriched/sortinghat_gelk.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,13 +102,6 @@ def add_identity(cls, db, identity, backend):
try:
uuid = cls.add_id(db, backend, email=identity['email'],
name=identity['name'], username=identity['username'])

profile = {"name": identity['name'] if identity['name'] else identity['username'],
"email": identity['email']}
profile_without_empty = {k: v for k, v in profile.items() if v}

cls.update_profile(db, uuid, profile_without_empty)

except UnicodeEncodeError:
logger.warning("[sortinghat] UnicodeEncodeError. Ignoring it. {} {} {}".format(
identity['email'], identity['name'], identity['username']))
Expand Down
12 changes: 12 additions & 0 deletions releases/unreleased/enrichment-process-performance-improved.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
---
title: Enrichment processing time reduced by 50%
category: performance
author: Quan Zhou <[email protected]>
issue: null
notes: |
The general performance was improved by reducing the number of calls
to the identities manager (i.e. `SortingHat`). There were some deprecated
calls that weren't needed any longer and also, we increased the cache of
individuals in ELKs.
We were also able to reduce the processing time of the Git backend by
converting commit dates only once.

0 comments on commit ebb35e7

Please sign in to comment.