Skip to content

Commit

Permalink
Add scripts to misc/ for computing and applying diffs of mypy caches (#8906)
Browse files Browse the repository at this point in the history

This can (with some infrastructure) allow for much faster distribution
of cache artifacts.
  • Loading branch information
msullivan authored May 28, 2020
1 parent c48624a commit dae2ae8
Show file tree
Hide file tree
Showing 2 changed files with 207 additions and 0 deletions.
60 changes: 60 additions & 0 deletions misc/apply-cache-diff.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#!/usr/bin/env python3
"""Script for applying a cache diff.
With some infrastructure, this can allow for distributing small cache diffs to users in
many cases instead of full cache artifacts.
"""

import argparse
import json
import os
import sys

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from mypy.metastore import MetadataStore, FilesystemMetadataStore, SqliteMetadataStore


def make_cache(input_dir: str, sqlite: bool) -> MetadataStore:
    """Open the cache at input_dir, backed by sqlite or plain files."""
    store_cls = SqliteMetadataStore if sqlite else FilesystemMetadataStore
    return store_cls(input_dir)


def apply_diff(cache_dir: str, diff_file: str, sqlite: bool = False) -> None:
    """Apply a cache diff (produced by diff-cache.py) to the cache in cache_dir.

    Diff entries mapped to None are deleted from the cache; every other
    entry is written verbatim.  The @deps.meta.json snapshot is patched
    up by hand here, since the diff deliberately omits it for size reasons.
    """
    store = make_cache(cache_dir, sqlite)
    with open(diff_file, "r") as f:
        changes = json.load(f)

    deps_meta = json.loads(store.read("@deps.meta.json"))

    for path, contents in changes.items():
        if contents is None:
            store.remove(path)
            continue
        store.write(path, contents)
        # Keep the deps snapshot in sync with every meta file we rewrite.
        if path.endswith('.meta.json') and "@deps" not in path:
            new_meta = json.loads(contents)
            deps_meta["snapshot"][new_meta["id"]] = new_meta["hash"]

    store.write("@deps.meta.json", json.dumps(deps_meta))

    store.commit()


def main() -> None:
    """Parse the command line and apply the requested cache diff."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--sqlite', action='store_true', default=False,
        help='Use a sqlite cache',
    )
    parser.add_argument('cache_dir', help="Directory for the cache")
    parser.add_argument('diff', help="Cache diff file")

    args = parser.parse_args()
    apply_diff(args.cache_dir, args.diff, args.sqlite)


# Entry point when run directly as a script (misc/apply-cache-diff.py).
if __name__ == '__main__':
    main()
147 changes: 147 additions & 0 deletions misc/diff-cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
#!/usr/bin/env python3
"""Produce a diff between mypy caches.
With some infrastructure, this can allow for distributing small cache diffs to users in
many cases instead of full cache artifacts.
"""

import argparse
import json
import os
import sys

from collections import defaultdict
from typing import Any, Dict, Optional, Set

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from mypy.metastore import FilesystemMetadataStore, MetadataStore, SqliteMetadataStore


def make_cache(input_dir: str, sqlite: bool) -> MetadataStore:
    """Construct a metadata store for input_dir (sqlite- or file-backed)."""
    if not sqlite:
        return FilesystemMetadataStore(input_dir)
    return SqliteMetadataStore(input_dir)


def merge_deps(all: Dict[str, Set[str]], new: Dict[str, Set[str]]) -> None:
    """Merge the dependency map `new` into `all`, mutating `all` in place."""
    for trigger, targets in new.items():
        all.setdefault(trigger, set()).update(targets)


def load(cache: MetadataStore, s: str) -> Any:
    """Read entry `s` from `cache` and normalize it for comparison.

    Meta files get their mtimes zeroed and their dependency/suppressed
    lists sorted (with dep_prios/dep_lines permuted to match); deps files
    get each trigger list sorted.  This avoids spurious diffs between
    otherwise-equivalent caches.
    """
    obj = json.loads(cache.read(s))
    if s.endswith(".meta.json"):
        obj["mtime"] = 0
        obj["data_mtime"] = 0
        if "dependencies" in obj:
            num_deps = len(obj["dependencies"])
            # Pair every dep (real + suppressed) with its priority and line
            # so the three parallel lists stay aligned while sorting.
            combined = list(
                zip(obj["dependencies"] + obj["suppressed"],
                    obj["dep_prios"], obj["dep_lines"])
            )

            def unzip(triples: Any) -> Any:
                return zip(*triples) if triples else ((), (), ())

            obj["dependencies"], dep_prios, dep_lines = unzip(sorted(combined[:num_deps]))
            obj["suppressed"], sup_prios, sup_lines = unzip(sorted(combined[num_deps:]))
            obj["dep_prios"] = dep_prios + sup_prios
            obj["dep_lines"] = dep_lines + sup_lines
    if s.endswith(".deps.json"):
        for targets in obj.values():
            targets.sort()
    return obj


def main() -> None:
    """Diff two mypy caches and write the changes to a JSON diff file.

    The diff maps cache entry names to their new contents, or to None for
    entries that should be deleted.  Changed .deps.json files are not
    diffed entry-by-entry; instead all newly added dependencies are merged
    into a single @root.deps.json entry, which keeps the diff small.
    Dependencies *removed* in the new cache are left in place: stale extra
    dependencies only cost performance, not correctness.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--verbose", action="store_true", default=False, help="Increase verbosity"
    )
    parser.add_argument(
        "--sqlite", action="store_true", default=False, help="Use a sqlite cache"
    )
    parser.add_argument("input_dir1", help="Input directory for the cache")
    parser.add_argument("input_dir2", help="Input directory for the cache")
    parser.add_argument("output", help="Output file")
    args = parser.parse_args()

    cache1 = make_cache(args.input_dir1, args.sqlite)
    cache2 = make_cache(args.input_dir2, args.sqlite)

    # Per-file-type hit/miss counts, reported under --verbose.
    type_misses: Dict[str, int] = defaultdict(int)
    type_hits: Dict[str, int] = defaultdict(int)

    # Map from cache entry name to its new contents (None means delete).
    updates: Dict[str, Optional[str]] = {}

    # Dependency maps accumulated from the old (1) and new (2) deps files.
    deps1: Dict[str, Set[str]] = {}
    deps2: Dict[str, Set[str]] = {}

    misses = hits = 0
    cache1_all = list(cache1.list_all())
    for s in cache1_all:
        obj1 = load(cache1, s)
        try:
            obj2 = load(cache2, s)
        except FileNotFoundError:
            obj2 = None

        typ = s.split(".")[-2]
        if obj1 != obj2:
            misses += 1
            type_misses[typ] += 1

            # Collect the dependencies instead of including them directly in the diff
            # so we can produce a much smaller direct diff of them.
            if ".deps." not in s:
                if obj2 is not None:
                    updates[s] = json.dumps(obj2)
                else:
                    updates[s] = None
            elif obj2:
                merge_deps(deps1, obj1)
                merge_deps(deps2, obj2)
        else:
            hits += 1
            type_hits[typ] += 1

    # Entries that only exist in the new cache are copied over wholesale.
    cache1_all_set = set(cache1_all)
    for s in cache2.list_all():
        if s not in cache1_all_set:
            updates[s] = cache2.read(s)

    # Compute what deps have been added and merge them all into the
    # @root deps file.  Bug fix: "added" deps are those present in the
    # *new* cache (deps2) but not the old one (deps1), so the difference
    # must be deps2 - deps1, not deps1 - deps2 (which would drop every
    # newly added dependency from the diff).
    new_deps = {k: deps2.get(k, set()) - deps1.get(k, set()) for k in deps2}
    new_deps = {k: v for k, v in new_deps.items() if v}
    try:
        root_deps = load(cache1, "@root.deps.json")
    except FileNotFoundError:
        root_deps = {}
    merge_deps(new_deps, root_deps)

    new_deps_json = {k: list(v) for k, v in new_deps.items() if v}
    updates["@root.deps.json"] = json.dumps(new_deps_json)

    # Drop updates to deps.meta.json for size reasons. The diff
    # applier will manually fix it up.
    updates.pop("./@deps.meta.json", None)
    updates.pop("@deps.meta.json", None)

    print("Generated incremental cache:", hits, "hits,", misses, "misses")
    if args.verbose:
        print("hits", type_hits)
        print("misses", type_misses)

    with open(args.output, "w") as f:
        json.dump(updates, f)


# Entry point when run directly as a script (misc/diff-cache.py).
if __name__ == "__main__":
    main()

0 comments on commit dae2ae8

Please sign in to comment.