From 154cd1468b6bb2f9b5c0b60d69ec692ff68bcfe2 Mon Sep 17 00:00:00 2001 From: Gary White Jr <7660110+GaryPWhite@users.noreply.github.com> Date: Thu, 2 May 2024 10:21:00 -0400 Subject: [PATCH] handle rate limiting and some more messages when loading repos Signed-off-by: Gary White Jr <7660110+GaryPWhite@users.noreply.github.com> --- augur/application/cli/__init__.py | 25 ++++--- augur/application/cli/db.py | 82 ++++++++++++----------- augur/application/db/models/augur_data.py | 21 ++++++ 3 files changed, 81 insertions(+), 47 deletions(-) diff --git a/augur/application/cli/__init__.py b/augur/application/cli/__init__.py index e07e880bd9..f15758c9cf 100644 --- a/augur/application/cli/__init__.py +++ b/augur/application/cli/__init__.py @@ -3,9 +3,9 @@ from functools import update_wrapper import os import sys -import socket import re import json +import httpx from augur.application.db.engine import DatabaseEngine from augur.application.db import get_engine, dispose_database_engine @@ -16,13 +16,22 @@ def test_connection(function_internet_connection): @click.pass_context def new_func(ctx, *args, **kwargs): usage = re.search(r"Usage:\s(.*)\s\[OPTIONS\]", str(ctx.get_usage())).groups()[0] - try: - #try to ping google's dns server - socket.create_connection(("8.8.8.8",53)) - return ctx.invoke(function_internet_connection, *args, **kwargs) - except OSError as e: - print(e) - print(f"\n\n{usage} command setup failed\nYou are not connect to the internet. 
Please connect to the internet to run Augur\n") + with httpx.Client() as client: + try: + _ = client.request( + method="GET", url="http://chaoss.community", timeout=10, follow_redirects=True) + + return ctx.invoke(function_internet_connection, *args, **kwargs) + except (TimeoutError, httpx.TimeoutException): + print("Request timed out.") + except httpx.NetworkError as e: + print(f"Network Error: {e}") + except httpx.ProtocolError as e: + print(f"Protocol Error: {e}") + print(f"\n\n{usage} command setup failed\n \ + You are not connected to the internet.\n \ + Please connect to the internet to run Augur\n \ + Consider setting http_proxy variables for limited access installations.") sys.exit() return update_wrapper(new_func, function_internet_connection) diff --git a/augur/application/cli/db.py b/augur/application/cli/db.py index c2ffc9463e..c20fcf0b2e 100644 --- a/augur/application/cli/db.py +++ b/augur/application/cli/db.py @@ -14,7 +14,12 @@ import re import stat as stat_module -from augur.application.cli import test_connection, test_db_connection, with_database, DatabaseContext +from augur.application.cli import ( + test_connection, + test_db_connection, + with_database, + DatabaseContext, +) from augur.application.db.session import DatabaseSession from sqlalchemy import update @@ -23,8 +28,9 @@ logger = logging.getLogger(__name__) + @click.group("db", short_help="Database utilities") -@click.pass_context +@click.pass_context def cli(ctx): ctx.obj = DatabaseContext() @@ -36,36 +42,43 @@ def cli(ctx): @with_database @click.pass_context def add_repos(ctx, filename): - """Add repositories to Augur's database. + """Add repositories to Augur's database. The .csv file format should be repo_url,group_id NOTE: The Group ID must already exist in the REPO_Groups Table. 
- If you want to add an entire GitHub organization, refer to the command: augur db add-github-org""" + If you want to add an entire GitHub organization, refer to the command: augur db add-github-org""" from augur.tasks.github.util.github_task_session import GithubTaskSession from augur.util.repo_load_controller import RepoLoadController with GithubTaskSession(logger, engine=ctx.obj.engine) as session: - controller = RepoLoadController(session) + line_total = len(open(filename).readlines()) with open(filename) as upload_repos_file: data = csv.reader(upload_repos_file, delimiter=",") - for row in data: - + for line_num, row in enumerate(data, start=1): repo_data = {} repo_data["url"] = row[0] try: repo_data["repo_group_id"] = int(row[1]) except ValueError: - print(f"Invalid repo group_id: {row[1]} for Git url: `{repo_data['url']}`") + print( + f"Invalid repo group_id: {row[1]} for Git url: `{repo_data['url']}`" + ) continue - + print( - f"Inserting repo with Git URL `{repo_data['url']}` into repo group {repo_data['repo_group_id']}") - controller.add_cli_repo(repo_data) + f"Inserting repo {line_num}/{line_total} with Git URL `{repo_data['url']}` into repo group {repo_data['repo_group_id']}" + ) + succeeded, message = controller.add_cli_repo(repo_data) + if not succeeded: + logger.error(f"insert repo failed with error: {message['status']}") + else: + logger.info(f"Repo added: {repo_data}") + print("Success") @cli.command("get-repo-groups") @@ -101,7 +114,6 @@ def add_repo_groups(ctx, filename): Create new repo groups in Augur's database """ with ctx.obj.engine.begin() as connection: - df = pd.read_sql( s.sql.text("SELECT repo_group_id FROM augur_data.repo_groups"), connection, @@ -117,7 +129,6 @@ def add_repo_groups(ctx, filename): with open(filename) as create_repo_groups_file: data = csv.reader(create_repo_groups_file, delimiter=",") for row in data: - # Handle case where there's a hanging empty row. 
if not row: logger.info("Skipping empty data...") @@ -137,6 +148,7 @@ def add_repo_groups(ctx, filename): f"Repo group with ID {row[1]} for repo group {row[1]} already exists, skipping..." ) + @cli.command("add-github-org") @click.argument("organization_name") @test_connection @@ -151,14 +163,13 @@ def add_github_org(ctx, organization_name): from augur.util.repo_load_controller import RepoLoadController with GithubTaskSession(logger, engine=ctx.obj.engine) as session: - controller = RepoLoadController(session) controller.add_cli_org(organization_name) + # get_db_version is a helper function to print_db_version and upgrade_db_version def get_db_version(engine): - db_version_sql = s.sql.text( """ SELECT * FROM augur_operations.augur_settings WHERE setting = 'augur_data_version' @@ -166,14 +177,12 @@ def get_db_version(engine): ) with engine.connect() as connection: - result = int(connection.execute(db_version_sql).fetchone()[2]) engine.dispose() return result - @cli.command("print-db-version") @test_connection @test_db_connection @@ -252,10 +261,10 @@ def update_api_key(ctx, api_key): ) with ctx.obj.engine.begin() as connection: - connection.execute(update_api_key_sql, api_key=api_key) logger.info(f"Updated Augur API key to: {api_key}") + @cli.command("get-api-key") @test_connection @test_db_connection @@ -282,20 +291,21 @@ def get_api_key(ctx): def check_pgpass(): augur_db_env_var = getenv("AUGUR_DB") if augur_db_env_var: - # gets the user, passowrd, host, port, and database_name out of environment variable # assumes database string of structure //:@:/ # it returns a tuple like (, , , , bool: continue data = result.json() + if result.status_code == 403: #GH Rate limiting + wait_until = int(result.headers.get("x-ratelimit-reset", time())) + # seconds until the reset epoch; clamped at 0 so sleep() never gets a negative value + wait_in_seconds = max( + int(wait_until - time()), + 0, + ) + wait_until_time = localtime(wait_until) + logger.error(f"rate limited fetching {url}") + 
logger.error(f"sleeping until {wait_until_time.tm_hour}:{wait_until_time.tm_min} ({wait_in_seconds} seconds)") + sleep(wait_in_seconds) + attempts += 1 + continue # if there was an error return False if "message" in data.keys(): @@ -934,6 +948,8 @@ def is_valid_github_repo(gh_session, url: str) -> bool: return True, {"status": "Valid repo", "repo_type": data["owner"]["type"]} + return False, {"status": "Failed to validate repo after multiple attempts"} + @staticmethod def is_valid_gitlab_repo(gl_session, url: str) -> bool: """Determine whether a GitLab repo URL is valid. @@ -961,6 +977,11 @@ def is_valid_gitlab_repo(gl_session, url: str) -> bool: while attempts < 10: response = hit_api(gl_session.oauths, url, logger) + if (wait_in_seconds := response.headers.get("Retry-After")) is not None: + logger.info(f"rate limited fetching {url}, sleeping for {wait_in_seconds}") + print(f"rate limited fetching {url}, sleeping for {wait_in_seconds}") + sleep(int(wait_in_seconds)) + if response.status_code == 404: return False, {"status": "Invalid repo"} @@ -968,6 +989,8 @@ def is_valid_gitlab_repo(gl_session, url: str) -> bool: return True, {"status": "Valid repo"} attempts += 1 + logger.info(f"could not validate {url}, will attempt again in {attempts*3} seconds") + sleep(attempts*3) return False, {"status": "Failed to validate repo after multiple attempts"}