From 1c1fd7166e0e4e900e7003ef5d699d3c993ca511 Mon Sep 17 00:00:00 2001 From: bensteinberg Date: Thu, 21 Nov 2024 09:53:37 -0500 Subject: [PATCH] Correct/improve sampling script --- perma_web/tasks/dev.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/perma_web/tasks/dev.py b/perma_web/tasks/dev.py index 738bb4909..badb12fa6 100644 --- a/perma_web/tasks/dev.py +++ b/perma_web/tasks/dev.py @@ -1470,7 +1470,8 @@ def get_etag(bucket, path): f.write("import hashlib\n") f.write("import math\n") f.write("import sys\n") - f.write("from pathlib import Path\n\n") + f.write("from pathlib import Path\n") + f.write("from statistics import NormalDist\n\n") f.write(f"objects = {objects}\n") f.write(inspect.getsource(calculate_s3_etag)) f.write(inspect.getsource(check_mirror)) @@ -1493,9 +1494,13 @@ def check_mirror(): directories = sys.argv[2:] n = len(objects) # noqa + if n * p < 10 or n - (n * p) < 10: + print(f"Sample size of {n} does not satisfy the success/failure condition for p of {p}.") # noqa + return + successes = 0 failures = 0 - blocksize = 2 ** 20 + blocksize = 2 ** 20 * 8 for o in objects: # noqa success = 0 @@ -1506,7 +1511,8 @@ def check_mirror(): full_path = Path(d) / "generated" / o[archive]["path"] if full_path.exists(): with open(full_path, "rb") as f: - etag = calculate_s3_etag(f, blocksize) + multipart = "-" in o[archive]["etag"] + etag = calculate_s3_etag(f, blocksize, multipart) if etag != o[archive]["etag"]: failure += 1 print( @@ -1514,24 +1520,30 @@ def check_mirror(): ) else: success += 1 - if not success: - print(f'no file found for {o[archive]["path"]}') if failure or not success: failures += 1 + elif not success: + failures += 1 + print(f'no file found for {o[archive]["path"]}') else: successes += 1 - assert successes + failures == n - + # observed proportion p_hat = failures / n + # standard deviation sd = math.sqrt((p * (1 - p)) / n) # noqa + # z-score z = (p_hat - p) / sd + # area under the standard Normal curve + probability = NormalDist().cdf(z) # noqa + print(f"From a sample of {n} links:") print(f"{successes} successes, {failures} failures") print(f"Expected proportion is {p}") print(f"Standard deviation is {sd}") print(f"Observed proportion is {p_hat}") print(f"z-score is {z}") + print(f"Chance of this result is {probability*100:.3f}%")