From d571fb99094d7f5a713879df6a6496ab10f1dd8c Mon Sep 17 00:00:00 2001 From: Kevin Schoedel Date: Fri, 3 Sep 2021 10:29:49 -0400 Subject: [PATCH] report --- scripts/tools/memory/gh_report.py | 410 ++++++++++++++++++++++ scripts/tools/memory/memdf/report.py | 18 +- scripts/tools/memory/memdf/util/sqlite.py | 108 ++++++ 3 files changed, 531 insertions(+), 5 deletions(-) create mode 100755 scripts/tools/memory/gh_report.py create mode 100644 scripts/tools/memory/memdf/util/sqlite.py diff --git a/scripts/tools/memory/gh_report.py b/scripts/tools/memory/gh_report.py new file mode 100755 index 00000000000000..68cb74072608db --- /dev/null +++ b/scripts/tools/memory/gh_report.py @@ -0,0 +1,410 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2021 Project CHIP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Generate reports from size artifacts. 
class SizeDatabase(memdf.util.sqlite.Database):
    """A database for recording and comparing size reports.

    Three tables are created on a writable database: ``thing`` (what was
    built), ``build`` (one built instance of a thing at a given commit) and
    ``size`` (one section size of one build).
    """
    on_open = ["PRAGMA foreign_keys = ON", "PRAGMA encoding = 'UTF-8'"]
    on_writable = [
        """
        -- A ‘thing’ identifies the kind of built object.
        -- Builds of the same thing are comparable.
        CREATE TABLE IF NOT EXISTS thing (
            id          INTEGER PRIMARY KEY,
            platform    TEXT NOT NULL,      -- Build platform
            config      TEXT NOT NULL,      -- Build configuration discriminator
            target      TEXT NOT NULL,      -- Build target
            UNIQUE(platform, config, target)
        )
        """, """
        -- A ‘build’ identifies a built instance of a thing at some point.
        CREATE TABLE IF NOT EXISTS build (
            id          INTEGER PRIMARY KEY,
            thing_id    INTEGER REFERENCES thing(id),
            hash        TEXT NOT NULL,      -- Commit hash
            parent      TEXT NOT NULL,      -- Parent commit hash
            pr          INTEGER DEFAULT -1, -- Github PR number
            time        INTEGER NOT NULL,   -- Unix-epoch timestamp
            artifact    INTEGER DEFAULT -1, -- Github artifact ID
            commented   INTEGER DEFAULT 0,
            UNIQUE(thing_id, hash, parent, pr, time, artifact)
        )
        """, """
        -- A ‘size’ entry gives the size of a section for a particular build.
        CREATE TABLE IF NOT EXISTS size (
            build_id    INTEGER REFERENCES build(id),
            section     TEXT NOT NULL,      -- Section name
            size        INTEGER NOT NULL,   -- Section size in bytes
            PRIMARY KEY (build_id, section)
        )
        """
    ]

    def __init__(self, config: Config):
        """Bind to the file named by ``config['database.file']``.

        Also opens the GitHub API handle via ``gh_open(config)``; ``self.gh``
        is None when GitHub access is not configured.
        """
        super().__init__(config['database.file'])
        self.config = config
        self.gh = gh_open(config)
        # IDs already passed to delete_artifact(), so each is deleted once.
        self.deleted_artifacts: set[int] = set()

    def add_sizes(self, **kwargs):
        """
        Add a size report to the database.

        The incoming arguments must contain the non-ID column names from
        ‘thing’ and ‘build’ tables, plus a 'sizes' entry that is a sequence
        of mappings containing 'section' and 'size'.

        'pr' and 'artifact' are optional and default to -1. Any other
        keyword arguments are ignored.
        """
        td = {k: kwargs[k] for k in ('platform', 'config', 'target')}
        thing = self.store_and_return_id('thing', **td)
        bd = {k: kwargs[k] for k in ('hash', 'parent', 'time')}
        cd = {k: kwargs.get(k, v) for k, v in (('pr', -1), ('artifact', -1))}
        build = self.store_and_return_id('build', thing_id=thing, **bd, **cd)
        for d in kwargs['sizes']:
            self.store('size', build_id=build, **d)

    def add_sizes_from_json(self, s: Union[bytes, str], origin: Dict):
        """Add sizes from a JSON size report.

        ``origin`` supplies defaults (e.g. the artifact ID) that the decoded
        report may override; it is not modified. The section list is taken
        from ``report['frames']['sizes']``.
        """
        r = origin.copy()
        r.update(json.loads(s))
        # Assign rather than pass a separate keyword so that a top-level
        # 'sizes' key in the report cannot cause a duplicate-keyword error.
        r['sizes'] = r['frames']['sizes']
        self.add_sizes(**r)

    def add_sizes_from_zipfile(self, f: Union[IO, Path], origin: Dict):
        """Add size reports from every ``*-sizes.json`` member of a zip."""
        with zipfile.ZipFile(f, 'r') as zip_file:
            for i in zip_file.namelist():
                if i.endswith('-sizes.json'):
                    # Build a per-member origin instead of mutating the
                    # caller's dictionary.
                    member_origin = {**origin, 'member': i}
                    with zip_file.open(i) as member:
                        self.add_sizes_from_json(member.read(), member_origin)

    def add_sizes_from_file(self, filename: str):
        """Add size reports from a ``.json`` or ``.zip`` file.

        Files with any other suffix are logged and ignored.
        """
        origin = {'file': filename}
        path = Path(filename)
        if path.suffix == '.json':
            logging.info('Reading JSON %s', path)
            # JSON is UTF-8 by specification; do not depend on the locale.
            with open(path, encoding='utf-8') as f:
                self.add_sizes_from_json(f.read(), origin)
        elif path.suffix == '.zip':
            logging.info('Reading ZIP %s', path)
            self.add_sizes_from_zipfile(path, origin)
        else:
            logging.warning('Unknown file type "%s" ignored', filename)

    def add_sizes_from_github(self):
        """Read size report artifacts from github.

        Does nothing when no GitHub handle is available. Only artifacts
        whose parent commit also has an artifact in the same group are
        downloaded, since a lone report has nothing to be compared against.
        """
        if not self.gh:
            return

        # Size artifacts have names of the form
        #   Sizes,{group},{commit_hash},{parent_hash}
        # Record them keyed by group and commit_hash to match them up
        # after we have the entire list.
        size_artifacts: Dict[str, Dict[str, fastcore.basics.AttrDict]] = {}
        for i in ghapi.all.paged(self.gh.actions.list_artifacts_for_repo):
            if not i.artifacts:
                break
            for a in i.artifacts:
                if not a.name.startswith('Sizes,'):
                    continue
                fields = a.name.split(',', 4)
                if len(fields) < 3:
                    # Without a group and a commit hash there is nothing to
                    # key on; skip rather than abort the whole scan.
                    logging.warning('Ignoring malformed artifact name "%s"',
                                    a.name)
                    continue
                group, sha = fields[1], fields[2]
                # A missing parent field is recorded as the empty string.
                a.parent = fields[3] if len(fields) > 3 else ''
                a.created_at = dateutil.parser.isoparse(a.created_at)
                size_artifacts.setdefault(group, {})[sha] = a

        # Determine required size artifacts.
        required_artifact_ids: set[int] = set()
        for group, group_reports in size_artifacts.items():
            logging.info('Group %s', group)
            for report in group_reports.values():
                if report.parent not in group_reports:
                    logging.info('  No match for %s', report.name)
                    continue
                # We have size information for both this report and its
                # parent, so ensure that both artifacts are downloaded.
                parent = group_reports[report.parent]
                required_artifact_ids.add(report.id)
                required_artifact_ids.add(parent.id)
                logging.info('  Match %s', report.parent)
                logging.info('    %s %s', report.id, report.name)
                logging.info('    %s %s', parent.id, parent.name)

        # Download and add required artifacts.
        for i in required_artifact_ids:
            logging.debug('Download artifact %d', i)
            try:
                blob = self.gh.actions.download_artifact(i, 'zip')
            except Exception as e:
                logging.error('Failed to download artifact %d: %s', i, e)
                # Nothing was downloaded for this ID; do not fall through
                # with an unbound (or previous iteration's) blob.
                continue
            self.add_sizes_from_zipfile(io.BytesIO(blob), {'artifact': i})

    def select_matching_commits(self):
        """Find matching builds, where one's commit is the other's parent.

        Returns a cursor over distinct (pr, hash, parent) rows for builds
        whose ``commented`` flag is 0.
        """
        return self.execute('''
            SELECT DISTINCT c.pr AS pr, c.hash AS hash, p.hash AS parent
                FROM build c
                INNER JOIN build p ON p.hash = c.parent
                WHERE c.commented = 0
                ORDER BY c.pr, c.hash, p.hash ASC
            ''')

    def delete_build(self, build_id):
        """Delete a build and its size rows (sizes first, for the FK)."""
        self.execute('DELETE FROM size WHERE build_id = ?', (build_id,))
        self.execute('DELETE FROM build WHERE id = ?', (build_id,))

    def delete_artifact(self, artifact_id):
        """Delete an artifact on GitHub, at most once per artifact ID."""
        if self.gh and artifact_id not in self.deleted_artifacts:
            self.deleted_artifacts.add(artifact_id)
            self.gh.actions.delete_artifact(artifact_id)
AS parent_build, + cb.id AS commit_build, + t.platform, t.config, t.target, + cs.section, + ps.size AS parent_size, + cs.size AS commit_size, + cs.size - ps.size AS change + FROM thing t + INNER JOIN build cb ON cb.thing_id = t.id + INNER JOIN build pb ON pb.thing_id = t.id AND pb.hash = cb.parent + INNER JOIN size cs ON cs.build_id = cb.id + INNER JOIN size ps ON ps.build_id = pb.id AND cs.section = ps.section + WHERE cb.hash = ? AND pb.hash = ? + ORDER BY t.platform, t.config, t.target, + cs.section, cb.time DESC, pb.time DESC + ''', (commit, parent)) + + stale_builds = set() + stale_artifacts = set() + previous: Optional[sqlite3.Row] = None + keep = ('platform', 'target', 'config', 'section', 'parent_size', + 'commit_size', 'change') + rows = [['platform', 'target', 'config', 'section', parent[:8], commit[:8], + 'change']] + + things: set[int] = set() + artifacts: set[int] = set() + builds: set[int] = set() + for row in cur.fetchall(): + row = sqlite3.Row(cur, row) + things.add(row['thing']) + if (previous is not None and row['thing'] == previous['thing'] + and row['section'] == previous['section']): + # This is duplicate build, older because we sort descending, + # presumably from a partial workflow re-run. 
def gh_send_change_report(gh: ghapi.core.GhApi, pr: int, title: str,
                          table: List[List], count: int) -> bool:
    """Post a size-change table as a comment on a pull request.

    If the PR already has a comment whose first line equals ``title``, the
    new table is appended to that comment; otherwise a new comment starting
    with ``title`` is created.

    Args:
        gh: GitHub API handle.
        pr: Pull request number.
        title: First line of the comment; also the key used to find an
            existing comment to extend.
        table: Rows to render, with the header as the first row.
        count: Number shown in the collapsible section's summary line.

    Returns:
        True if the comment was created or updated, False if the GitHub
        call raised an exception.
    """
    initial_text = title
    existing_comment_id = -1
    for comment in gh_get_comments_for_pr(gh, pr):
        if comment.body.partition('\n')[0] == title:
            existing_comment_id = comment.id
            initial_text = comment.body
            break

    # Build the text by concatenation. io.StringIO(initial_text) is
    # positioned at offset 0, so writing to it would overwrite the title /
    # existing body instead of appending, and the comment could then never
    # be found again by its first line.
    #
    # NOTE(review): the <details>/<summary> markup below is reconstructed;
    # the tag text appears to have been stripped from the reviewed copy of
    # these two literals. Confirm against the repository.
    text = ''.join((
        initial_text,
        f'\n\n<details>\n<summary>{count} builds</summary>\n',
        tabulate.tabulate(table, headers="firstrow", tablefmt='pipe'),
        '</details>\n',
    ))

    try:
        if existing_comment_id >= 0:
            gh.issues.update_comment(existing_comment_id, text)
        else:
            gh.issues.create_comment(pr, text)
    except Exception:
        # Best effort: report failure to the caller, but leave a trace and
        # let KeyboardInterrupt / SystemExit propagate.
        logging.exception('Failed to send report comment for PR #%d', pr)
        return False
    return True
[None] * len(table[0]) + for row in table: new_persist = [] new_row = [] changed = False @@ -115,6 +117,12 @@ def hierify(df: pd.DataFrame) -> pd.DataFrame: new_persist.append(new) rows.append(new_row) persist = new_persist + return rows + + +def hierify(df: pd.DataFrame) -> pd.DataFrame: + columns = list(df.columns) + rows = hierify_rows(df.itertuples(index=False)) return pd.DataFrame(rows, columns=columns) diff --git a/scripts/tools/memory/memdf/util/sqlite.py b/scripts/tools/memory/memdf/util/sqlite.py new file mode 100644 index 00000000000000..f196da91b8ccc9 --- /dev/null +++ b/scripts/tools/memory/memdf/util/sqlite.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2021 Project CHIP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class Database:
    """Thin convenience wrapper around a ``sqlite3`` connection.

    Subclasses may set ``on_open`` (statements run on every open) and
    ``on_writable`` (statements run on open when the database is writable,
    e.g. ``CREATE TABLE IF NOT EXISTS ...``).

    Instances are context managers: entering opens the connection and
    exiting closes it. Exiting does not commit; call ``commit()``.

    Table and column names given to ``store()`` / ``get_matching()`` are
    interpolated into the SQL text (identifiers cannot be bound as
    parameters), so they must come from trusted code. Values are always
    bound as parameters.
    """
    on_open: Optional[List[str]] = None
    on_writable: Optional[List[str]] = None

    def __init__(self, filename: str, writable: bool = True):
        self.filename = filename
        self.writable = writable
        self.con: Optional[sqlite3.Connection] = None

    def __enter__(self):
        return self.open()

    def __exit__(self, et, ev, traceback):
        self.close()
        return False  # never suppress exceptions

    def open(self):
        """Open the connection if it is not already open; return self."""
        if not self.con:
            # Opened as a URI so that read-only mode can be requested.
            db = 'file:' + self.filename
            if not self.writable:
                db += '?mode=ro'
            self.con = sqlite3.connect(db, uri=True)
            for statement in self.on_open or ():
                self.con.execute(statement)
            if self.writable:
                for statement in self.on_writable or ():
                    self.con.execute(statement)
        return self

    def close(self):
        """Close the connection if open; return self."""
        if self.con:
            self.con.close()
            self.con = None
        return self

    def connection(self) -> sqlite3.Connection:
        """Return the open connection; asserts that the database is open."""
        assert self.con is not None, 'database is not open'
        return self.con

    def execute(self, query, parameters=None):
        """Execute ``query``, with ``parameters`` if any, and return the cursor."""
        # Go through connection() so that use on a closed database fails
        # the same way for every method.
        con = self.connection()
        if parameters:
            return con.execute(query, parameters)
        return con.execute(query)

    def commit(self):
        """Commit the current transaction; return self."""
        self.connection().commit()
        return self

    def store(self, table: str, **kwargs):
        """Insert the data if it does not already exist."""
        q = (f"INSERT INTO {table} ({','.join(kwargs.keys())})"
             f" VALUES ({','.join('?' * len(kwargs))})"
             f" ON CONFLICT DO NOTHING")
        v = list(kwargs.values())
        self.connection().execute(q, v)

    def get_matching(self, table: str, columns: List[str], **kwargs):
        """Select ``columns`` from rows whose columns equal ``kwargs``.

        A value of None matches NULL. With no keyword criteria, every row
        is selected. Returns the cursor.
        """
        clauses = []
        values = []
        for column, value in kwargs.items():
            if value is None:
                # `column = NULL` is never true in SQL.
                clauses.append(f'{column} IS NULL')
            else:
                clauses.append(f'{column}=?')
                values.append(value)
        query = f"SELECT {','.join(columns)} FROM {table}"
        if clauses:
            query += ' WHERE ' + ' AND '.join(clauses)
        return self.connection().execute(query, values)

    def get_matching_id(self, table: str, **kwargs):
        """Return the ``id`` of the first matching row.

        Raises TypeError if no row matches (``fetchone()`` returns None).
        """
        return self.get_matching(table, ['id'], **kwargs).fetchone()[0]

    def store_and_return_id(self, table: str, **kwargs) -> int:
        """Insert the row if absent, then return the matching row's ``id``."""
        self.store(table, **kwargs)
        return self.get_matching_id(table, **kwargs)