Skip to content

Commit

Permalink
[Size reports] Script improvements (1/3) (#12886)
Browse files Browse the repository at this point in the history
#### Problem

Recent memory size investigations suggest some improvements:

- Scripts use PR==0 to distinguish pull requests from master commits
  (push events), but it would be useful to record the associated PR
  along with commits.
- Sometimes push events run on pull requests, and those are not
  currently distinguishable from master push events.
- Sorting by build timestamp is inaccurate, since CI runs may finish
  in a different order than the commits.

#### Change overview

This is the first of three steps.

- Use `event` rather than PR number to distinguish PR builds from
  master commit builds.
- Record GitHub `event` and `ref` in the database representation, and
  add `ref` to the JSON (`event` was already present).
- For master commits, use the commit timestamp rather than the build
  time.

The second step will add the event to artifact names, so that it can
be used to filter artifact downloads instead of PR==0.

The third step will add the PR number from master commits, extracted
from the commit message.

#### Testing

Manually run offline, but final confirmation requires live CI runs.
  • Loading branch information
kpschoedel authored Dec 10, 2021
1 parent 4c5d042 commit 7e6b25f
Show file tree
Hide file tree
Showing 3 changed files with 99 additions and 46 deletions.
75 changes: 48 additions & 27 deletions scripts/tools/memory/gh_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,9 @@ class SizeDatabase(memdf.util.sqlite.Database):
pr INTEGER DEFAULT 0, -- Github PR number
time INTEGER NOT NULL, -- Unix-epoch timestamp
artifact INTEGER DEFAULT 0, -- Github artifact ID
commented INTEGER DEFAULT 0,
commented INTEGER DEFAULT 0, -- 1 if recorded in a GH comment
ref TEXT, -- Target git ref
event TEXT, -- Github build trigger event
UNIQUE(thing_id, hash, parent, pr, time, artifact)
)
""", """
Expand All @@ -194,13 +196,15 @@ def add_sizes(self, **kwargs):
"""
Add a size report to the database.
The incoming arguments must contain the non-ID column names from
‘thing’ and ‘build’ tables, plus a 'sizes' entry that is a sequence
of mappings containing 'name' and 'size'.
The incoming arguments must contain the required non-ID column names
from ‘thing’ and ‘build’ tables, plus a 'sizes' entry that is a
sequence of mappings containing 'name' and 'size'.
"""
td = {k: kwargs[k] for k in ('platform', 'config', 'target')}
thing = self.store_and_return_id('thing', **td)
bd = {k: kwargs[k] for k in ('hash', 'parent', 'time')}
bd = {k: kwargs[k] for k in ('hash', 'parent', 'time', 'event')}
if 'ref' in kwargs:
bd['ref'] = kwargs['ref']
cd = {k: kwargs.get(k, 0) for k in ('pr', 'artifact', 'commented')}
build = self.store_and_return_id('build', thing_id=thing, **bd, **cd)
if build is None:
Expand All @@ -213,11 +217,11 @@ def add_sizes_from_json(self, s: Union[bytes, str], origin: Dict):
"""Add sizes from a JSON size report."""
r = origin.copy()
r.update(json.loads(s))
by = r.get('by', 'section')
r['sizes'] = [{
'name': i[by],
'size': i['size']
} for i in r['frames'][by]]
r['sizes'] = []
# Add section sizes.
for i in r['frames'].get('section', []):
r['sizes'].append({'name': i['section'], 'size': i['size']})
# Add segment sizes.
for i in r['frames'].get('wr', []):
r['sizes'].append({
'name': ('(read only)', '(read/write)')[int(i['wr'])],
Expand Down Expand Up @@ -257,7 +261,7 @@ def add_sizes_from_github(self):
artifact_pages = self.config['github.limit-artifact-pages']

# Size artifacts have names of the form:
# Size,{group},{pr},{commit_hash},{parent_hash}
# Size,{group},{pr},{commit_hash},{parent_hash}[,{event}]
# Record them keyed by group and commit_hash to match them up
# after we have the entire list.
page = 0
Expand All @@ -266,12 +270,17 @@ def add_sizes_from_github(self):
if not i.artifacts:
break
for a in i.artifacts:
if a.name.startswith('Size,'):
_, group, pr, commit, parent, *_ = (a.name + ',').split(
',', 5)
if a.name.startswith('Size,') and a.name.count(',') >= 4:
_, group, pr, commit, parent, *etc = a.name.split(',')
a.parent = parent
a.pr = pr
a.created_at = dateutil.parser.isoparse(a.created_at)
# Old artifact names don't include the event.
if etc:
event = etc[0]
else:
event = 'push' if pr == '0' else 'pull_request'
a.event = event
if group not in size_artifacts:
size_artifacts[group] = {}
size_artifacts[group][commit] = a
Expand All @@ -286,7 +295,7 @@ def add_sizes_from_github(self):
for group, group_reports in size_artifacts.items():
logging.debug('ASG: group %s', group)
for report in group_reports.values():
if self.config['report.pr' if int(report.pr) else 'report.push']:
if self.should_report(report.event):
if report.parent not in group_reports:
logging.debug('ASN: No match for %s', report.name)
continue
Expand All @@ -308,8 +317,10 @@ def add_sizes_from_github(self):
try:
blob = self.gh.actions.download_artifact(i, 'zip')
except Exception as e:
blob = None
logging.error('Failed to download artifact %d: %s', i, e)
self.add_sizes_from_zipfile(io.BytesIO(blob), {'artifact': i})
if blob:
self.add_sizes_from_zipfile(io.BytesIO(blob), {'artifact': i})

def read_inputs(self):
"""Read size report from github and/or local files."""
Expand All @@ -321,11 +332,15 @@ def read_inputs(self):
def select_matching_commits(self):
"""Find matching builds, where one's commit is the other's parent."""
return self.execute('''
SELECT DISTINCT c.pr AS pr, c.hash AS hash, p.hash AS parent
FROM build c
INNER JOIN build p ON p.hash = c.parent
WHERE c.commented = 0
ORDER BY c.time DESC, c.pr, c.hash, p.hash
SELECT DISTINCT
c.event as event,
c.pr AS pr,
c.hash AS hash,
p.hash AS parent
FROM build c
INNER JOIN build p ON p.hash = c.parent
WHERE c.commented = 0
ORDER BY c.time DESC, c.pr, c.hash, p.hash
''')

def set_commented(self, build_ids: Iterable[int]):
Expand Down Expand Up @@ -363,6 +378,14 @@ def delete_stale_artifacts(self, stale_artifacts: Iterable[int]):
logging.info('DSA: deleting obsolete artifact %d', artifact_id)
self.delete_artifact(artifact_id)

def should_report(self, event: Optional[str] = None) -> bool:
"""Return true if reporting is enabled for the event."""
if event is None:
return self.config['report.pr'] or self.config['report.push']
if event == 'pull_request':
return self.config['report.pr']
return self.config['report.push']


def gh_open(config: Config) -> Optional[ghapi.core.GhApi]:
"""Return a GhApi, if so configured."""
Expand Down Expand Up @@ -647,21 +670,19 @@ def v1_comment_summary(df: pd.DataFrame) -> str:

def report_matching_commits(db: SizeDatabase) -> Dict[str, pd.DataFrame]:
"""Report on all new comparable commits."""
if not (db.config['report.pr'] or db.config['report.push']):
if not db.should_report():
return {}

comment_count = 0
comment_limit = db.config['github.limit-comments']
comment_enabled = (db.config['github.comment']
or db.config['github.dryrun-comment'])

report_push = db.config['report.push']
report_pr = db.config['report.pr']
only_pr = db.config['github.limit-pr']

dfs = {}
for pr, commit, parent in db.select_matching_commits().fetchall():
if not (report_pr if pr else report_push):
for event, pr, commit, parent in db.select_matching_commits().fetchall():
if not db.should_report(event):
continue

# Github doesn't have a way to fetch artifacts associated with a
Expand All @@ -672,7 +693,7 @@ def report_matching_commits(db: SizeDatabase) -> Dict[str, pd.DataFrame]:
df = changes_for_commit(db, pr, commit, parent)
dfs[df.attrs['name']] = df

if (pr and comment_enabled
if (event == 'pull_request' and comment_enabled
and (comment_limit == 0 or comment_limit > comment_count)):
if gh_send_change_report(db, df):
# Mark the originating builds, and remove the originating
Expand Down
26 changes: 19 additions & 7 deletions scripts/tools/memory/gh_sizes.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,11 @@
This script also expects certain environment variables, which can be set in a
github workflow as follows:
env:
BUILD_TYPE: nrfconnect
GH_EVENT_PR: ${{ github.event_name == 'pull_request' && github.event.number || 0 }}
GH_EVENT_HASH: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
GH_EVENT_PARENT: ${{ github.event_name == 'pull_request' && github.event.pull_request.base.sha || github.event.before }}
- name: Set up environment for size reports
if: ${{ !env.ACT }}
env:
GH_CONTEXT: ${{ toJson(github) }}
run: gh_sizes_environment.py "${GH_CONTEXT}"
Default output file is {platform}-{configname}-{buildname}-sizes.json in the
binary's directory. This file has the form:
Expand All @@ -51,11 +51,16 @@
"parent": "20796f752061726520746f6f20636c6f73652e0a",
"pr": 12345,
"by": "section",
"ref": "refs/pull/12345/merge"
"frames": {
"section": [
{"section": ".bss", "size": 260496},
{"section": ".data", "size": 1648},
{"section": ".text", "size": 740236}
],
"wr": [
{"wr": 0, "size": 262144},
{"wr": 1, "size": 74023}
]
}
}
Expand Down Expand Up @@ -100,10 +105,17 @@
'metavar': 'HASH',
'default': os.environ.get('GH_EVENT_PARENT'),
},
'ref': {
'help': 'Target ref',
'metavar': 'REF',
'default': os.environ.get('GH_EVENT_REF'),
},
'timestamp': {
'help': 'Build timestamp',
'metavar': 'TIME',
'default': int(datetime.datetime.now().timestamp()),
'default': int(
os.environ.get('GH_EVENT_TIMESTAMP')
or datetime.datetime.now().timestamp()),
},
}

Expand Down Expand Up @@ -165,7 +177,7 @@ def main(argv):
config.put('output.metadata.time', config['timestamp'])
config.put('output.metadata.input', binary)
config.put('output.metadata.by', 'section')
for key in ['event', 'hash', 'parent', 'pr']:
for key in ['event', 'hash', 'parent', 'pr', 'ref']:
if value := config[key]:
config.putl(['output', 'metadata', key], value)

Expand Down
44 changes: 32 additions & 12 deletions scripts/tools/memory/gh_sizes_environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,12 @@
Sets the following environment variables:
- `GH_EVENT_PR` For a pull request, the PR number; otherwise 0.
- `GH_EVENT_HASH` SHA of the commit under test.
- `GH_EVENT_PARENT` SHA of the parent commit to which the commit under
test is applied.
- `GH_EVENT_PR` For a pull request, the PR number; otherwise 0.
- `GH_EVENT_HASH` SHA of the commit under test.
- `GH_EVENT_PARENT` SHA of the parent commit to which the commit under
test is applied.
- `GH_EVENT_REF` The branch or tag ref that triggered the workflow run.
- `GH_EVENT_TIMESTAMP` For `push` events only, the timestamp of the commit.
"""

import json
Expand All @@ -28,29 +30,47 @@
import subprocess
import sys

import dateutil.parser

github = json.loads(sys.argv[1])

commit = None
timestamp = None
ref = github['ref']

if github['event_name'] == 'pull_request':

pr = github['event']['number']
commit = github['event']['pull_request']['head']['sha']

# Try to find the actual commit against which the current PR compares
# by scraping the HEAD commit message.
r = subprocess.run(['git', 'show', '--no-patch', '--format=%s', 'HEAD'],
capture_output=True, text=True, check=True)
m = re.fullmatch('Merge [0-9a-f]+ into ([0-9a-f]+)', r.stdout)
if m:
parent = m.group(1)
else:
parent = github['event']['pull_request']['base']['sha']
else:
pr = 0

elif github['event_name'] == 'push':

commit = github['sha']
parent = github['event']['before']
timestamp = dateutil.parser.isoparse(
github['event']['head_commit']['timestamp']).timestamp()
pr = 0

# Environment variables for subsequent workflow steps are set by
# writing to the file named by `$GITHUB_ENV`.

env = os.environ.get('GITHUB_ENV')
assert env
with open(env, 'at') as out:
print(f'GH_EVENT_PR={pr}', file=out)
print(f'GH_EVENT_HASH={commit}', file=out)
print(f'GH_EVENT_PARENT={parent}', file=out)
if commit is not None:
env = os.environ.get('GITHUB_ENV')
assert env
with open(env, 'at') as out:
print(f'GH_EVENT_PR={pr}', file=out)
print(f'GH_EVENT_HASH={commit}', file=out)
print(f'GH_EVENT_PARENT={parent}', file=out)
print(f'GH_EVENT_REF={ref}', file=out)
if timestamp:
print(f'GH_EVENT_TIMESTAMP={timestamp}', file=out)

0 comments on commit 7e6b25f

Please sign in to comment.