From 80ee395cef3f3aef36e67034f4dd300eff761073 Mon Sep 17 00:00:00 2001 From: Kevin Schoedel <67607049+kpschoedel@users.noreply.github.com> Date: Fri, 10 Dec 2021 11:05:41 -0500 Subject: [PATCH] [Size reports] Script improvements (1/3) (#12886) #### Problem Recent memory size investigations suggest some improvements: - Scripts use PR==0 to distinguish pull requests from master commits (push events), but it would be useful to record the associated PR along with commits. - Sometimes push events run on pull requests, and those are not currently distinguishable from master push events. - Sorting by build timestamp is inaccurate, since CI runs may finish in a different order than the commits. #### Change overview This is the first of three steps. - Use `event` rather than PR number to distinguish PR builds from master commit builds. - Record GitHub `event` and `ref` in the database representation, and add `ref` to the JSON (`event` was already present). - For master commits, use the commit timestamp rather than the build time. The second step will add the event to artifact names, so that it can be used to filter artifact downloads instead of PR==0. The third step will add the PR number from master commits, extracted from the commit message. #### Testing Manually run offline, but final confirmation requires live CI runs. 
--- scripts/tools/memory/gh_report.py | 75 +++++++++++++------- scripts/tools/memory/gh_sizes.py | 26 +++++-- scripts/tools/memory/gh_sizes_environment.py | 44 ++++++++---- 3 files changed, 99 insertions(+), 46 deletions(-) diff --git a/scripts/tools/memory/gh_report.py b/scripts/tools/memory/gh_report.py index 0669be621eef6e..0dc82bd809c4cb 100755 --- a/scripts/tools/memory/gh_report.py +++ b/scripts/tools/memory/gh_report.py @@ -170,7 +170,9 @@ class SizeDatabase(memdf.util.sqlite.Database): pr INTEGER DEFAULT 0, -- Github PR number time INTEGER NOT NULL, -- Unix-epoch timestamp artifact INTEGER DEFAULT 0, -- Github artifact ID - commented INTEGER DEFAULT 0, + commented INTEGER DEFAULT 0, -- 1 if recorded in a GH comment + ref TEXT, -- Target git ref + event TEXT, -- Github build trigger event UNIQUE(thing_id, hash, parent, pr, time, artifact) ) """, """ @@ -194,13 +196,15 @@ def add_sizes(self, **kwargs): """ Add a size report to the database. - The incoming arguments must contain the non-ID column names from - ‘thing’ and ‘build’ tables, plus a 'sizes' entry that is a sequence - of mappings containing 'name' and 'size'. + The incoming arguments must contain the required non-ID column names + from ‘thing’ and ‘build’ tables, plus a 'sizes' entry that is a + sequence of mappings containing 'name' and 'size'. 
""" td = {k: kwargs[k] for k in ('platform', 'config', 'target')} thing = self.store_and_return_id('thing', **td) - bd = {k: kwargs[k] for k in ('hash', 'parent', 'time')} + bd = {k: kwargs[k] for k in ('hash', 'parent', 'time', 'event')} + if 'ref' in kwargs: + bd['ref'] = kwargs['ref'] cd = {k: kwargs.get(k, 0) for k in ('pr', 'artifact', 'commented')} build = self.store_and_return_id('build', thing_id=thing, **bd, **cd) if build is None: @@ -213,11 +217,11 @@ def add_sizes_from_json(self, s: Union[bytes, str], origin: Dict): """Add sizes from a JSON size report.""" r = origin.copy() r.update(json.loads(s)) - by = r.get('by', 'section') - r['sizes'] = [{ - 'name': i[by], - 'size': i['size'] - } for i in r['frames'][by]] + r['sizes'] = [] + # Add section sizes. + for i in r['frames'].get('section', []): + r['sizes'].append({'name': i['section'], 'size': i['size']}) + # Add segment sizes. for i in r['frames'].get('wr', []): r['sizes'].append({ 'name': ('(read only)', '(read/write)')[int(i['wr'])], @@ -257,7 +261,7 @@ def add_sizes_from_github(self): artifact_pages = self.config['github.limit-artifact-pages'] # Size artifacts have names of the form: - # Size,{group},{pr},{commit_hash},{parent_hash} + # Size,{group},{pr},{commit_hash},{parent_hash}[,{event}] # Record them keyed by group and commit_hash to match them up # after we have the entire list. page = 0 @@ -266,12 +270,17 @@ def add_sizes_from_github(self): if not i.artifacts: break for a in i.artifacts: - if a.name.startswith('Size,'): - _, group, pr, commit, parent, *_ = (a.name + ',').split( - ',', 5) + if a.name.startswith('Size,') and a.name.count(',') >= 4: + _, group, pr, commit, parent, *etc = a.name.split(',') a.parent = parent a.pr = pr a.created_at = dateutil.parser.isoparse(a.created_at) + # Old artifact names don't include the event. 
+ if etc: + event = etc[0] + else: + event = 'push' if pr == '0' else 'pull_request' + a.event = event if group not in size_artifacts: size_artifacts[group] = {} size_artifacts[group][commit] = a @@ -286,7 +295,7 @@ def add_sizes_from_github(self): for group, group_reports in size_artifacts.items(): logging.debug('ASG: group %s', group) for report in group_reports.values(): - if self.config['report.pr' if int(report.pr) else 'report.push']: + if self.should_report(report.event): if report.parent not in group_reports: logging.debug('ASN: No match for %s', report.name) continue @@ -308,8 +317,10 @@ def add_sizes_from_github(self): try: blob = self.gh.actions.download_artifact(i, 'zip') except Exception as e: + blob = None logging.error('Failed to download artifact %d: %s', i, e) - self.add_sizes_from_zipfile(io.BytesIO(blob), {'artifact': i}) + if blob: + self.add_sizes_from_zipfile(io.BytesIO(blob), {'artifact': i}) def read_inputs(self): """Read size report from github and/or local files.""" @@ -321,11 +332,15 @@ def read_inputs(self): def select_matching_commits(self): """Find matching builds, where one's commit is the other's parent.""" return self.execute(''' - SELECT DISTINCT c.pr AS pr, c.hash AS hash, p.hash AS parent - FROM build c - INNER JOIN build p ON p.hash = c.parent - WHERE c.commented = 0 - ORDER BY c.time DESC, c.pr, c.hash, p.hash + SELECT DISTINCT + c.event as event, + c.pr AS pr, + c.hash AS hash, + p.hash AS parent + FROM build c + INNER JOIN build p ON p.hash = c.parent + WHERE c.commented = 0 + ORDER BY c.time DESC, c.pr, c.hash, p.hash ''') def set_commented(self, build_ids: Iterable[int]): @@ -363,6 +378,14 @@ def delete_stale_artifacts(self, stale_artifacts: Iterable[int]): logging.info('DSA: deleting obsolete artifact %d', artifact_id) self.delete_artifact(artifact_id) + def should_report(self, event: Optional[str] = None) -> bool: + """Return true if reporting is enabled for the event.""" + if event is None: + return 
self.config['report.pr'] or self.config['report.push'] + if event == 'pull_request': + return self.config['report.pr'] + return self.config['report.push'] + def gh_open(config: Config) -> Optional[ghapi.core.GhApi]: """Return a GhApi, if so configured.""" @@ -647,7 +670,7 @@ def v1_comment_summary(df: pd.DataFrame) -> str: def report_matching_commits(db: SizeDatabase) -> Dict[str, pd.DataFrame]: """Report on all new comparable commits.""" - if not (db.config['report.pr'] or db.config['report.push']): + if not db.should_report(): return {} comment_count = 0 @@ -655,13 +678,11 @@ def report_matching_commits(db: SizeDatabase) -> Dict[str, pd.DataFrame]: comment_enabled = (db.config['github.comment'] or db.config['github.dryrun-comment']) - report_push = db.config['report.push'] - report_pr = db.config['report.pr'] only_pr = db.config['github.limit-pr'] dfs = {} - for pr, commit, parent in db.select_matching_commits().fetchall(): - if not (report_pr if pr else report_push): + for event, pr, commit, parent in db.select_matching_commits().fetchall(): + if not db.should_report(event): continue # Github doesn't have a way to fetch artifacts associated with a @@ -672,7 +693,7 @@ def report_matching_commits(db: SizeDatabase) -> Dict[str, pd.DataFrame]: df = changes_for_commit(db, pr, commit, parent) dfs[df.attrs['name']] = df - if (pr and comment_enabled + if (event == 'pull_request' and comment_enabled and (comment_limit == 0 or comment_limit > comment_count)): if gh_send_change_report(db, df): # Mark the originating builds, and remove the originating diff --git a/scripts/tools/memory/gh_sizes.py b/scripts/tools/memory/gh_sizes.py index d6648481e86910..0468d0bc82dc28 100755 --- a/scripts/tools/memory/gh_sizes.py +++ b/scripts/tools/memory/gh_sizes.py @@ -31,11 +31,11 @@ This script also expects certain environment variables, which can be set in a github workflow as follows: - env: - BUILD_TYPE: nrfconnect - GH_EVENT_PR: ${{ github.event_name == 'pull_request' && 
github.event.number || 0 }} - GH_EVENT_HASH: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - GH_EVENT_PARENT: ${{ github.event_name == 'pull_request' && github.event.pull_request.base.sha || github.event.before }} + - name: Set up environment for size reports + if: ${{ !env.ACT }} + env: + GH_CONTEXT: ${{ toJson(github) }} + run: gh_sizes_environment.py "${GH_CONTEXT}" Default output file is {platform}-{configname}-{buildname}-sizes.json in the binary's directory. This file has the form: @@ -51,11 +51,16 @@ "parent": "20796f752061726520746f6f20636c6f73652e0a", "pr": 12345, "by": "section", + "ref": "refs/pull/12345/merge", "frames": { "section": [ {"section": ".bss", "size": 260496}, {"section": ".data", "size": 1648}, {"section": ".text", "size": 740236} + ], + "wr": [ + {"wr": 0, "size": 740236}, + {"wr": 1, "size": 262144} ] } } @@ -100,10 +105,17 @@ 'metavar': 'HASH', 'default': os.environ.get('GH_EVENT_PARENT'), }, + 'ref': { + 'help': 'Target ref', + 'metavar': 'REF', + 'default': os.environ.get('GH_EVENT_REF'), + }, 'timestamp': { 'help': 'Build timestamp', 'metavar': 'TIME', - 'default': int(datetime.datetime.now().timestamp()), + 'default': int( + os.environ.get('GH_EVENT_TIMESTAMP') + or datetime.datetime.now().timestamp()), }, } @@ -165,7 +177,7 @@ def main(argv): config.put('output.metadata.time', config['timestamp']) config.put('output.metadata.input', binary) config.put('output.metadata.by', 'section') - for key in ['event', 'hash', 'parent', 'pr']: + for key in ['event', 'hash', 'parent', 'pr', 'ref']: if value := config[key]: config.putl(['output', 'metadata', key], value) diff --git a/scripts/tools/memory/gh_sizes_environment.py b/scripts/tools/memory/gh_sizes_environment.py index 4b38a53e12c2db..89950f387bdab3 100755 --- a/scripts/tools/memory/gh_sizes_environment.py +++ b/scripts/tools/memory/gh_sizes_environment.py @@ -16,10 +16,12 @@ Sets the following environment variables: -- `GH_EVENT_PR` For a 
pull request, the PR number; otherwise 0. -- `GH_EVENT_HASH` SHA of the commit under test. -- `GH_EVENT_PARENT` SHA of the parent commit to which the commit under - test is applied. +- `GH_EVENT_PR` For a pull request, the PR number; otherwise 0. +- `GH_EVENT_HASH` SHA of the commit under test. +- `GH_EVENT_PARENT` SHA of the parent commit to which the commit under + test is applied. +- `GH_EVENT_REF` The branch or tag ref that triggered the workflow run. +- `GH_EVENT_TIMESTAMP` For `push` events only, the timestamp of the commit. """ import json @@ -28,11 +30,21 @@ import subprocess import sys +import dateutil.parser + github = json.loads(sys.argv[1]) +commit = None +timestamp = None +ref = github['ref'] + if github['event_name'] == 'pull_request': + pr = github['event']['number'] commit = github['event']['pull_request']['head']['sha'] + + # Try to find the actual commit against which the current PR compares + # by scraping the HEAD commit message. r = subprocess.run(['git', 'show', '--no-patch', '--format=%s', 'HEAD'], capture_output=True, text=True, check=True) m = re.fullmatch('Merge [0-9a-f]+ into ([0-9a-f]+)', r.stdout) @@ -40,17 +52,25 @@ parent = m.group(1) else: parent = github['event']['pull_request']['base']['sha'] -else: - pr = 0 + +elif github['event_name'] == 'push': + commit = github['sha'] parent = github['event']['before'] + timestamp = dateutil.parser.isoparse( + github['event']['head_commit']['timestamp']).timestamp() + pr = 0 # Environment variables for subsequent workflow steps are set by # writing to the file named by `$GITHUB_ENV`. 
-env = os.environ.get('GITHUB_ENV') -assert env -with open(env, 'at') as out: - print(f'GH_EVENT_PR={pr}', file=out) - print(f'GH_EVENT_HASH={commit}', file=out) - print(f'GH_EVENT_PARENT={parent}', file=out) +if commit is not None: + env = os.environ.get('GITHUB_ENV') + assert env + with open(env, 'at') as out: + print(f'GH_EVENT_PR={pr}', file=out) + print(f'GH_EVENT_HASH={commit}', file=out) + print(f'GH_EVENT_PARENT={parent}', file=out) + print(f'GH_EVENT_REF={ref}', file=out) + if timestamp: + print(f'GH_EVENT_TIMESTAMP={timestamp}', file=out)