Skip to content

Commit

Permalink
Correct/improve sampling script
Browse files: browse the repository at this point in the history
  • Loading branch information
bensteinberg committed Nov 21, 2024
1 parent 2356776 commit 1c1fd71
Showing 1 changed file with 19 additions and 7 deletions.
26 changes: 19 additions & 7 deletions perma_web/tasks/dev.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -1470,7 +1470,8 @@ def get_etag(bucket, path):
f.write("import hashlib\n")
f.write("import math\n")
f.write("import sys\n")
f.write("from pathlib import Path\n\n")
f.write("from pathlib import Path\n")
f.write("from statistics import NormalDist\n\n")
f.write(f"objects = {objects}\n")
f.write(inspect.getsource(calculate_s3_etag))
f.write(inspect.getsource(check_mirror))
Expand All @@ -1493,9 +1494,13 @@ def check_mirror():
directories = sys.argv[2:]

n = len(objects) # noqa
if n * p < 10 or n - (n * p) < 10:
print(f"Sample size of {n} does not satisfy the success/failure condition for p of {p}.") # noqa
return

successes = 0
failures = 0
blocksize = 2 ** 20
blocksize = 2 ** 20 * 8

for o in objects: # noqa
success = 0
Expand All @@ -1506,32 +1511,39 @@ def check_mirror():
full_path = Path(d) / "generated" / o[archive]["path"]
if full_path.exists():
with open(full_path, "rb") as f:
etag = calculate_s3_etag(f, blocksize)
multipart = "-" in o[archive]["etag"]
etag = calculate_s3_etag(f, blocksize, multipart)
if etag != o[archive]["etag"]:
failure += 1
print(
f'etag mismatch for {o[archive]["path"]}'
)
else:
success += 1
if not success:
print(f'no file found for {o[archive]["path"]}')
if failure or not success:
failures += 1
elif not success:
failures += 1
print(f'no file found for {o[archive]["path"]}')
else:
successes += 1

assert successes + failures == n

# observed proportion
p_hat = failures / n

# standard deviation
sd = math.sqrt((p * (1 - p)) / n) # noqa

# z-score
z = (p_hat - p) / sd

# area under the standard Normal curve
probability = NormalDist().cdf(z) # noqa

print(f"From a sample of {n} links:")
print(f"{successes} successes, {failures} failures")
print(f"Expected proportion is {p}")
print(f"Standard deviation is {sd}")
print(f"Observed proportion is {p_hat}")
print(f"z-score is {z}")
print(f"Chance of this result is {probability*100:.3f}%")

0 comments on commit 1c1fd71

Please sign in to comment.