From e5a2caf333ab17ea1b27591a6a5d5e15c99e4e22 Mon Sep 17 00:00:00 2001 From: Chris Peterson Date: Wed, 20 Nov 2024 14:20:14 -0800 Subject: [PATCH 1/3] scripts: mirror-checker check all mirrors --- apt-deps.txt | 3 + scripts/mirror-checker.py | 471 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 450 insertions(+), 24 deletions(-) diff --git a/apt-deps.txt b/apt-deps.txt index 636a199bc..6c6d00f7b 100644 --- a/apt-deps.txt +++ b/apt-deps.txt @@ -29,6 +29,7 @@ python3-debian python3-dev python3-distro-info python3-distutils-extra +python3-dnspython python3-flake8 python3-gi python3-jsonschema @@ -37,6 +38,8 @@ python3-mypy python3-nose python3-parameterized python3-pip +python3-prettytable +python3-pycountry python3-pyflakes python3-pyroute2 python3-pytest diff --git a/scripts/mirror-checker.py b/scripts/mirror-checker.py index 4d8fa5f0d..afb1204b3 100755 --- a/scripts/mirror-checker.py +++ b/scripts/mirror-checker.py @@ -1,17 +1,26 @@ #!/usr/bin/env python3 import argparse +import concurrent.futures import os import shlex import subprocess import sys import tempfile +from datetime import datetime, timezone +from typing import TextIO +from pathlib import Path +from typing import Any +import dns.resolver +from prettytable import PrettyTable +from pycountry import countries -def run_cmd(cmd: list[str]) -> int: + +def run_cmd(cmd: list[str], output: TextIO = sys.stdout) -> int: """Run a cmd list. Return output.""" # Print out command bash -x style - print(f"+ {shlex.join(cmd)}") + print(f"+ {shlex.join(cmd)}", file=output) with subprocess.Popen( cmd, stdout=subprocess.PIPE, @@ -19,59 +28,473 @@ def run_cmd(cmd: list[str]) -> int: encoding="utf-8", ) as proc: try: - for line in iter(proc.stdout.readline, ""): - print(line, end="") + for line in iter(proc.stdout.readline, ""): # type: ignore + print(line, end="", file=output, flush=True) except KeyboardInterrupt: - print("Killed by user.") + print("Killed by user.", file=output, flush=True) finally: - proc.stdout.close() + proc.stdout.close() # type: ignore proc.wait() return proc.returncode -def parse_args() -> argparse.Namespace: +def check_mirror( + mirror_url: str, + release: str, + output: TextIO = sys.stdout, +) -> int: + """Check the health of a single mirror for a particular release. + + Uses debootstrap to create a schroot using the specified mirror_url. + Returns a 0 on success and a 1 on failure. + """ + # Default to 1 so if run_cmd fails then we still get a failure. + ret = 1 + with tempfile.TemporaryDirectory() as tempdir: + + print(datetime.now(timezone.utc), file=output) + print(f"Creating schroot in: {tempdir}", file=output) + print(f"Targeting release: {release}", file=output) + print(f"Using mirror: {mirror_url}", file=output) + + debootstrap_cmd = ["debootstrap", release, tempdir, mirror_url] + ret = run_cmd(debootstrap_cmd, output=output) + + return ret + + +def check_country_mirror( + country_code: str, + release: str, + output: TextIO = sys.stdout, +) -> int: + """Check the health of a single country mirror for a particular release. + + Uses debootstrap to create a schroot using the specified mirror. Returns + a 0 on success and a 1 on failure. + """ + mirror: str = f"http://{country_code}.archive.ubuntu.com/ubuntu/" + return check_mirror(mirror, release, output=output) + + +def check_primary_archive( + release: str, + output: TextIO = sys.stdout, +) -> int: + mirror: str = "http://archive.ubuntu.com/ubuntu/" + return check_mirror(mirror, release, output=output) + + +def get_cname(domain: str) -> str: + """Get CNAME records for domain.""" + try: + answers = dns.resolver.Resolver().resolve(domain, "CNAME") + # There can only ever be one CNAME record + return answers[0].target.to_text() + except dns.resolver.NoAnswer: + return "" + + +def get_arecords(domain: str) -> set[str]: + """Get A records (ip addrs) for domain.""" + try: + answers = dns.resolver.Resolver().resolve(domain, "A") + return {data.address for data in answers} + except dns.resolver.NoAnswer: + return set() + + +def get_dns_information(country_codes: list[str]) -> dict[str, Any]: + """Return a dictionary of CNAMES and A records for all country mirrors.""" + # Don't put protocol (i.e. 'http://') in url. Only pass domain name. + country_code_to_dns_data: dict[str, Any] = {} + for cc in country_codes: + domain = f"{cc}.archive.ubuntu.com" + country_code_to_dns_data[cc] = { + "domain": domain, + "A": get_arecords(domain), + "CNAME": get_cname(domain), + } + + return country_code_to_dns_data + + +def get_multi_country_mirrors(cc_to_dns: dict[str, Any]) -> dict[str, set[str]]: + """Calculate which mirrors are an alias of another archive mirror. + + Returns a dictionary where the keys are the real archive mirrors and the + values are the set of aliases for that mirror. + """ + mirror_map = {cc: None for cc in cc_to_dns} + for cc, data in cc_to_dns.items(): + + if (cname := data["CNAME"]) != "": + parts = cname.split(".") + if parts[1:4] == ["archive", "ubuntu", "com"]: + mirror_map[cc] = parts[0] + + mirror_to_aliases: dict[str, set[str]] = {} + for alias, target in mirror_map.items(): + + if target is None: + if alias not in mirror_to_aliases: + mirror_to_aliases[alias] = set() + # else: nothing to do + continue + + aliases = [] + curr_alias = alias + curr_target = target + while curr_target is not None: + aliases.append(curr_alias) + curr_alias = target + curr_target = mirror_map[curr_alias] + + known_aliases = mirror_to_aliases.get(curr_alias, set()) + mirror_to_aliases[curr_alias] = known_aliases.union(set(aliases)) + + # Remove mirrors with no aliases + ret = {c: a for c, a in mirror_to_aliases.items() if len(a) > 0} + + return ret + + +def _print_dns_information(country_code_to_dns_data: dict[str, Any]) -> None: + + for cc, data in country_code_to_dns_data.items(): + print(f"{data["domain"]}:") + print(f"\tCNAME: {data['CNAME']!r}") + print("\tA records:") + for r in data["A"]: + print(f"\t\t{r!r}") + + +def filter_mirrors( + country_code_to_dns_data: dict[str, Any], + primary_archive: dict[str, Any], +) -> dict[str, list[str]]: + """Filter mirrors by country mirrors, canonical mirrors, non mirrors. + + Returns a dictionary whose keys are the mirror types and the values are + a list of country codes for that mirror type. + + Mirror Types: + + non_mirrors - For mirrors which don't exist, cc.archive.ubuntu.com will + just fallback to the primary archive. So these mirrors are those with + identical A records (IPs) to the primary archive. + + cname_mirrors - For actual country mirrors, these have CNAME records that + point cc.archive.ubuntu.com domain lookups to another domain + (e.g. ubuntu.mymirror.com). + + canonical_mirrors - There are a few special mirrors, usually "us" and "gb", + that have no CNAME records and do not have identical A records with the + primary archive. These appear to be Canonical hosted country mirrors. + The union of A records for these mirrors should be the same as the A + record results of archive.ubuntu.com. + """ + + cname_mirrors = [] + non_mirrors = [] + canonical_mirrors = [] + + for cc, data in country_code_to_dns_data.items(): + if data["CNAME"] != "": + cname_mirrors.append(cc) + elif data["A"] == primary_archive["A"]: + non_mirrors.append(cc) + else: + canonical_mirrors.append(cc) + + return { + "non_mirrors": non_mirrors, + "cname_mirrors": cname_mirrors, + "canonical_mirrors": canonical_mirrors, + } + + +def _print_mirror_stats( + filtered_mirrors: dict[str, Any], + country_codes: list[str], + country_code_to_dns_data: dict[str, Any], + mirror_aliases: dict[str, Any], +) -> None: + + cname_mirrors = filtered_mirrors["cname_mirrors"] + canonical_mirrors = filtered_mirrors["canonical_mirrors"] + non_mirrors = filtered_mirrors["non_mirrors"] + print(f"Total country codes checked: {len(country_codes)}") + print(f"Total mirrors with a CNAME record: {len(cname_mirrors)}") + print(f"Total mirrors hosted by Canonical: {len(canonical_mirrors)}") + print(f"Total country codes with no mirror: {len(non_mirrors)}") + + print("Found mirrors:") + print("\tCanonical mirrors:") + for mirror in canonical_mirrors: + print(f"\t\t{mirror!r}") + print("\tRegistered mirrors (have CNAME):") + for mirror in cname_mirrors: + print(f"\t\t{mirror!r} -> {country_code_to_dns_data[mirror]['CNAME']}") + print("Alias information:") + + for mirror, aliases in mirror_aliases.items(): + if len(aliases) == 0: + continue + print(f"\tMirror {mirror!r} is aliased by:") + for m in aliases: + print(f"\t\t{m!r}") + + +def _multi_mirror_check( + mirrors_to_check: set[tuple[str, str]], + release: str, + output_dir: Path, + n_jobs: int, + mirror_status: dict[str, tuple[str, str, str | Path]], +) -> None: + + def _run_check(cc: str, release: str) -> int: + + output_path: Path = output_dir / f"{cc}.txt" + with output_path.open("w") as f: + if cc == "primary": + mirror = "http://archive.ubuntu.com/ubuntu/" + print(f"checking {mirror} -> {output_path}") + return check_primary_archive(release, output=f) + else: + mirror = f"http://{cc}.archive.ubuntu.com/ubuntu/" + print(f"checking {mirror} -> {output_path}") + return check_country_mirror(cc, release, output=f) + + with concurrent.futures.ThreadPoolExecutor(max_workers=n_jobs) as executor: + future_to_result = { + executor.submit(_run_check, cc, release): (cc, d) + for (cc, d) in mirrors_to_check + } + for future in concurrent.futures.as_completed(future_to_result): + (cc, d) = future_to_result[future] + exitcode = -1 + try: + exitcode = future.result() + except Exception as exc: + print(f"{cc} generated an exception: {exc}") + + status_text = "ok" if exitcode == 0 else "not ok" + + print(f"{cc}: {status_text}") + + mirror_status[cc] = ( + status_text, + d, + output_dir / f"{cc}.txt", + ) + + +def check_all_mirrors( + n_jobs: int, + release: str, + show_non_mirrors: bool, + output_location: str, +) -> int: + """Check the health of all country mirrors.""" + + print(datetime.now(timezone.utc)) + print("Checking all mirrors...") + country_codes: list[str] = [c.alpha_2.lower() for c in countries] + + print("Checking DNS information for mirrors...") + country_code_to_dns_data = get_dns_information(country_codes) + + # Also collect IPs for the primary archive + primary_archive = { + "domain": "archive.ubuntu.com", + "A": get_arecords("archive.ubuntu.com"), + "CNAME": "", # No CNAME for primary archive + } + + # Print out all the mirror information found + _print_dns_information( + { + **{"primary": primary_archive}, + **country_code_to_dns_data, + } + ) + + filtered_mirrors = filter_mirrors(country_code_to_dns_data, primary_archive) + + mirror_aliases = get_multi_country_mirrors(country_code_to_dns_data) + + _print_mirror_stats( + filtered_mirrors, + country_codes, + country_code_to_dns_data, + mirror_aliases, + ) + + # Get ready to check only active mirrors + mirror_status: dict[str, tuple[str, str, str | Path]] = {} + + header = ["Mirror", "Status", "Domain", "Log"] + + # Generate NA status result for non mirrors + for cc in filtered_mirrors["non_mirrors"]: + mirror_status[cc] = ( + "NA", + "archive.ubuntu.com", + "-", + ) + + # Generate stub for mirror aliases + for mirror, aliases in mirror_aliases.items(): + for cc in aliases: + mirror_status[cc] = ( + "-", # To be updated + f"{mirror}.archive.ubuntu.com", + "-", # To be updated + ) + + # Remaining mirrors are real mirrors + mirrors_to_check = set() + for cc in country_codes: + if cc not in mirror_status: + domain = country_code_to_dns_data[cc]["CNAME"] + if domain == "": + domain = f"{cc}.archive.ubuntu.com" + mirrors_to_check.add((cc, domain)) + + # Also check the primary archive + mirrors_to_check.add(("primary", "archive.ubuntu.com")) + + # Setup output directories + output_dir = Path(output_location) + output_dir.mkdir(exist_ok=True) + + _multi_mirror_check( + mirrors_to_check, + release, + output_dir, + n_jobs, + mirror_status, + ) + + # Fill in results for aliases + for mirror, aliases in mirror_aliases.items(): + for cc in aliases: + mirror_status[cc] = mirror_status[mirror] + + table = PrettyTable(header) + table.add_row(["primary", *mirror_status.pop("primary")]) + for cc, status in sorted(mirror_status.items()): + if cc in filtered_mirrors["non_mirrors"] and not show_non_mirrors: + continue + table.add_row([cc, *status]) + + print(table) + + return 0 + + +def check_handler(args: argparse.Namespace) -> int: + """Handle the "check" subcommand and return exit code.""" + return check_country_mirror(args.country_code, args.release) + + +def check_all_handler(args: argparse.Namespace) -> int: + """Handle the "check-all" subcommand and return exit code.""" + return check_all_mirrors( + n_jobs=args.n, + release=args.release, + show_non_mirrors=args.all, + output_location=args.output, + ) + + +def parse_args() -> tuple[argparse.Namespace, argparse.ArgumentParser]: parser = argparse.ArgumentParser( description="Use debootstrap to test mirror health. Requires root.", formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) - parser.add_argument( + # Set up subcommand parser. + # set required=False so we can show help on empty command. + subparsers = parser.add_subparsers( + title="subcommands", + required=False, + dest="command", + ) + + # create parser for single mirror mode + check = subparsers.add_parser( + "check", + help="check a single mirror", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + + check.add_argument( "country_code", help="- two letter country code", ) - parser.add_argument( + check.add_argument( "release", nargs="?", default="noble", help="- release", ) + check.set_defaults(func=check_handler) - return parser.parse_args() + # create parser for check all mirrors mode + check_all = subparsers.add_parser( + "check-all", + help="check all country mirrors", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + check_all.add_argument( + "release", + nargs="?", + default="noble", + help="- release", + ) + check_all.add_argument( + "--output", + "-o", + default="mirror_checker_output", + help="- output location", + ) + check_all.add_argument( + "--number-jobs", + "-n", + type=int, + default=1, + help="- number of parallel jobs", + dest="n", + ) + check_all.add_argument( + "--all", + "-a", + action="store_true", + help="- show country codes with no mirror", + ) + check_all.set_defaults(func=check_all_handler) + + return parser.parse_args(), parser def main() -> int: - args = parse_args() + args, parser = parse_args() + + if args.command is None: + parser.print_help() + return 1 if os.geteuid() != 0: print("mirror-checker requires sudo") return 1 - release: str = args.release - mirror: str = f"http://{args.country_code}.archive.ubuntu.com/ubuntu/" - - with tempfile.TemporaryDirectory() as tempdir: - - print(f"Creating schroot in: {tempdir}") - print(f"Targeting release: {release}") - print(f"Using mirror: {mirror}") - - debootstrap_cmd = ["debootstrap", release, tempdir, mirror] - ret = run_cmd(debootstrap_cmd) - - return ret + return args.func(args) if __name__ == "__main__": From cbc022a2f9df158140a020bf908e278b8bc4e35a Mon Sep 17 00:00:00 2001 From: Chris Peterson Date: Thu, 21 Nov 2024 12:46:31 -0800 Subject: [PATCH 2/3] scripts: mirror-checker - stricter CNAME lookup Theoretically a domain could have multiple CNAME records, even if it's out of spec. Let's not try to debootstrap from these mirrors and report them bad at the point of configuration. Co-authored-by: Dan Bungert --- scripts/mirror-checker.py | 54 ++++++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/scripts/mirror-checker.py b/scripts/mirror-checker.py index afb1204b3..76c9ba644 100755 --- a/scripts/mirror-checker.py +++ b/scripts/mirror-checker.py @@ -8,9 +8,8 @@ import sys import tempfile from datetime import datetime, timezone -from typing import TextIO from pathlib import Path -from typing import Any +from typing import Any, TextIO import dns.resolver from prettytable import PrettyTable @@ -89,11 +88,14 @@ def check_primary_archive( def get_cname(domain: str) -> str: """Get CNAME records for domain.""" try: - answers = dns.resolver.Resolver().resolve(domain, "CNAME") - # There can only ever be one CNAME record - return answers[0].target.to_text() - except dns.resolver.NoAnswer: + # There should only ever be one CNAME record + [answer] = dns.resolver.Resolver().resolve(domain, "CNAME") + return answer.target.to_text() + except dns.resolver.NoAnswer: # No record is OK return "" + except ValueError as ve: # Unpack error. + print(f"Error: More than 1 CNAME record for {domain!r}") + raise ve def get_arecords(domain: str) -> set[str]: @@ -105,19 +107,39 @@ def get_arecords(domain: str) -> set[str]: return set() -def get_dns_information(country_codes: list[str]) -> dict[str, Any]: - """Return a dictionary of CNAMES and A records for all country mirrors.""" - # Don't put protocol (i.e. 'http://') in url. Only pass domain name. +def get_dns_information(country_codes: list[str]) -> tuple[dict[str, Any], list[str]]: + """Return a dns information (CNAMES and A records) for all country mirrors. + + Returns a 2-tuple containing the following information: + - A dictionary keyed on the 2-letter country code whose value is + another dictionary containing the domain, A records, and CNAME record, + if any. + - A list of 2-letter country codes which resulted in CNAME lookups + that returned more than 1 result. This is not compliant with DNS + specification and should be considered a mirror configuration error. + """ country_code_to_dns_data: dict[str, Any] = {} + + cname_errors: list[str] = [] + for cc in country_codes: + # Don't put protocol (i.e. 'http://') in url. Only pass domain name. domain = f"{cc}.archive.ubuntu.com" + + cname: str = "" + try: + cname = get_cname(domain) + except ValueError: + cname_errors.append(cc) + continue + country_code_to_dns_data[cc] = { "domain": domain, "A": get_arecords(domain), - "CNAME": get_cname(domain), + "CNAME": cname, } - return country_code_to_dns_data + return country_code_to_dns_data, cname_errors def get_multi_country_mirrors(cc_to_dns: dict[str, Any]) -> dict[str, set[str]]: @@ -305,7 +327,7 @@ def check_all_mirrors( country_codes: list[str] = [c.alpha_2.lower() for c in countries] print("Checking DNS information for mirrors...") - country_code_to_dns_data = get_dns_information(country_codes) + country_code_to_dns_data, cname_errors = get_dns_information(country_codes) # Also collect IPs for the primary archive primary_archive = { @@ -355,6 +377,14 @@ def check_all_mirrors( "-", # To be updated ) + # Generate "not ok" status for mirrors with bad cname + for cc in cname_errors: + mirror_status[cc] = ( + "not ok", + "more than 1 CNAME record", + "-", + ) + # Remaining mirrors are real mirrors mirrors_to_check = set() for cc in country_codes: From 60fdeffa50d33e32d63ac5b48eed3ff8e0fc7dbb Mon Sep 17 00:00:00 2001 From: Chris Peterson Date: Thu, 21 Nov 2024 14:01:16 -0800 Subject: [PATCH 3/3] scripts: mirror-checker sort by status Print mirrors with "not ok" status at the bottom so they're easier to find when reading the logs. Co-authored-by: Dan Bungert --- scripts/mirror-checker.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/scripts/mirror-checker.py b/scripts/mirror-checker.py index 76c9ba644..2cafa821a 100755 --- a/scripts/mirror-checker.py +++ b/scripts/mirror-checker.py @@ -414,12 +414,30 @@ def check_all_mirrors( for cc in aliases: mirror_status[cc] = mirror_status[mirror] + # Print the table so that bad mirrors are at the bottom and are easier to + # find. + table = PrettyTable(header) - table.add_row(["primary", *mirror_status.pop("primary")]) + + primary_status = mirror_status.pop("primary") + + if primary_status[0] == "ok": + table.add_row(["primary", *primary_status]) + + # "ok" or "NA" statuses first for cc, status in sorted(mirror_status.items()): if cc in filtered_mirrors["non_mirrors"] and not show_non_mirrors: continue - table.add_row([cc, *status]) + if status[0] != "not ok": + table.add_row([cc, *status]) + + # Print "not ok" status second + if primary_status[0] == "not ok": + table.add_row(["primary", *primary_status]) + + for cc, status in sorted(mirror_status.items()): + if status[0] == "not ok": + table.add_row([cc, *status]) print(table)