From 724ff741f8e983168674683325f6b417779353c4 Mon Sep 17 00:00:00 2001
From: Sanchit Ram Arvind
Date: Mon, 14 Oct 2024 16:37:48 -0500
Subject: [PATCH 01/41] big reorg

---
 src/pipeline/utils/pg.py => core/db | 0
 {src/pipeline => core}/models/__init__.py | 0
 {db => docker/db}/CHAI_ERD.png | Bin
 {db => docker/db}/queries.md | 0
 {alembic => docker/migrations}/.pkgx.yaml | 0
 {alembic => docker/migrations}/Dockerfile | 0
 {alembic => docker/migrations}/alembic.ini | 0
 {alembic => docker/migrations}/env.py | 0
 .../migrations}/init-script.sql | 0
 .../migrations}/run_migrations.sh | 0
 {alembic => docker/migrations}/script.py.mako | 0
 .../20240918_1200-initial_migration.py | 0
 .../20240923_0821-add_load_history.py | 0
 .../versions/20240925_0808-add_users_urls.py | 0
 ...2034-link_tables_add_unique_constraints.py | 0
 ...241002_1456-new_data_models_and_indexes.py | 0
 .../20241003_0040-import_id_for_versions.py | 0
 .../versions/20241003_1554-redo_data_model.py | 0
 ...9_0915-modify_users_username_uniqueness.py | 0
 .../20241009_1713-url_is_not_unique.py | 0
 ...1010_1347-add_license_name_and_id_index.py | 0
 ...0241018_0815-check_default_server_types.py | 0
 {monitor => docker/monitor}/Dockerfile | 0
 {monitor => docker/monitor}/main.py | 0
 {monitor => docker/monitor}/requirements.txt | 0
 {monitor => docker/monitor}/run_monitor.sh | 0
 .../crates | 0
 {src => package_managers}/run_scheduler.py | 0
 pyproject.toml | 30 ---
 src/.pkgx.yaml | 6 -
 src/pipeline/pkgx.py | 58 -----
 src/pipeline/utils/crates/structures.py | 26 ---
 src/pipeline/utils/crates/transformer.py | 201 ------------------
 src/pipeline/utils/fetcher.py | 108 ----------
 src/pipeline/utils/transformer.py | 50 -----
 35 files changed, 479 deletions(-)
 rename src/pipeline/utils/pg.py => core/db (100%)
 rename {src/pipeline => core}/models/__init__.py (100%)
 rename {db => docker/db}/CHAI_ERD.png (100%)
 rename {db => docker/db}/queries.md (100%)
 rename {alembic => docker/migrations}/.pkgx.yaml (100%)
 rename {alembic => docker/migrations}/Dockerfile (100%)
 rename {alembic => docker/migrations}/alembic.ini (100%)
 rename {alembic => docker/migrations}/env.py (100%)
 rename {alembic => docker/migrations}/init-script.sql (100%)
 rename {alembic => docker/migrations}/run_migrations.sh (100%)
 rename {alembic => docker/migrations}/script.py.mako (100%)
 rename {alembic => docker/migrations}/versions/20240918_1200-initial_migration.py (100%)
 rename {alembic => docker/migrations}/versions/20240923_0821-add_load_history.py (100%)
 rename {alembic => docker/migrations}/versions/20240925_0808-add_users_urls.py (100%)
 rename {alembic => docker/migrations}/versions/20240930_2034-link_tables_add_unique_constraints.py (100%)
 rename {alembic => docker/migrations}/versions/20241002_1456-new_data_models_and_indexes.py (100%)
 rename {alembic => docker/migrations}/versions/20241003_0040-import_id_for_versions.py (100%)
 rename {alembic => docker/migrations}/versions/20241003_1554-redo_data_model.py (100%)
 rename {alembic => docker/migrations}/versions/20241009_0915-modify_users_username_uniqueness.py (100%)
 rename {alembic => docker/migrations}/versions/20241009_1713-url_is_not_unique.py (100%)
 rename {alembic => docker/migrations}/versions/20241010_1347-add_license_name_and_id_index.py (100%)
 rename {alembic => docker/migrations}/versions/20241018_0815-check_default_server_types.py (100%)
 rename {monitor => docker/monitor}/Dockerfile (100%)
 rename {monitor => docker/monitor}/main.py (100%)
 rename {monitor => docker/monitor}/requirements.txt (100%)
 rename {monitor =>
docker/monitor}/run_monitor.sh (100%) rename src/run_pipeline.sh => package_managers/crates (100%) rename {src => package_managers}/run_scheduler.py (100%) delete mode 100644 pyproject.toml delete mode 100644 src/.pkgx.yaml delete mode 100644 src/pipeline/pkgx.py delete mode 100644 src/pipeline/utils/crates/structures.py delete mode 100644 src/pipeline/utils/crates/transformer.py delete mode 100644 src/pipeline/utils/fetcher.py delete mode 100644 src/pipeline/utils/transformer.py diff --git a/src/pipeline/utils/pg.py b/core/db similarity index 100% rename from src/pipeline/utils/pg.py rename to core/db diff --git a/src/pipeline/models/__init__.py b/core/models/__init__.py similarity index 100% rename from src/pipeline/models/__init__.py rename to core/models/__init__.py diff --git a/db/CHAI_ERD.png b/docker/db/CHAI_ERD.png similarity index 100% rename from db/CHAI_ERD.png rename to docker/db/CHAI_ERD.png diff --git a/db/queries.md b/docker/db/queries.md similarity index 100% rename from db/queries.md rename to docker/db/queries.md diff --git a/alembic/.pkgx.yaml b/docker/migrations/.pkgx.yaml similarity index 100% rename from alembic/.pkgx.yaml rename to docker/migrations/.pkgx.yaml diff --git a/alembic/Dockerfile b/docker/migrations/Dockerfile similarity index 100% rename from alembic/Dockerfile rename to docker/migrations/Dockerfile diff --git a/alembic/alembic.ini b/docker/migrations/alembic.ini similarity index 100% rename from alembic/alembic.ini rename to docker/migrations/alembic.ini diff --git a/alembic/env.py b/docker/migrations/env.py similarity index 100% rename from alembic/env.py rename to docker/migrations/env.py diff --git a/alembic/init-script.sql b/docker/migrations/init-script.sql similarity index 100% rename from alembic/init-script.sql rename to docker/migrations/init-script.sql diff --git a/alembic/run_migrations.sh b/docker/migrations/run_migrations.sh similarity index 100% rename from alembic/run_migrations.sh rename to docker/migrations/run_migrations.sh diff --git a/alembic/script.py.mako b/docker/migrations/script.py.mako similarity index 100% rename from alembic/script.py.mako rename to docker/migrations/script.py.mako diff --git a/alembic/versions/20240918_1200-initial_migration.py b/docker/migrations/versions/20240918_1200-initial_migration.py similarity index 100% rename from alembic/versions/20240918_1200-initial_migration.py rename to docker/migrations/versions/20240918_1200-initial_migration.py diff --git a/alembic/versions/20240923_0821-add_load_history.py b/docker/migrations/versions/20240923_0821-add_load_history.py similarity index 100% rename from alembic/versions/20240923_0821-add_load_history.py rename to docker/migrations/versions/20240923_0821-add_load_history.py diff --git a/alembic/versions/20240925_0808-add_users_urls.py b/docker/migrations/versions/20240925_0808-add_users_urls.py similarity index 100% rename from alembic/versions/20240925_0808-add_users_urls.py rename to docker/migrations/versions/20240925_0808-add_users_urls.py diff --git a/alembic/versions/20240930_2034-link_tables_add_unique_constraints.py b/docker/migrations/versions/20240930_2034-link_tables_add_unique_constraints.py similarity index 100% rename from alembic/versions/20240930_2034-link_tables_add_unique_constraints.py rename to docker/migrations/versions/20240930_2034-link_tables_add_unique_constraints.py diff --git a/alembic/versions/20241002_1456-new_data_models_and_indexes.py b/docker/migrations/versions/20241002_1456-new_data_models_and_indexes.py similarity index 100% 
rename from alembic/versions/20241002_1456-new_data_models_and_indexes.py rename to docker/migrations/versions/20241002_1456-new_data_models_and_indexes.py diff --git a/alembic/versions/20241003_0040-import_id_for_versions.py b/docker/migrations/versions/20241003_0040-import_id_for_versions.py similarity index 100% rename from alembic/versions/20241003_0040-import_id_for_versions.py rename to docker/migrations/versions/20241003_0040-import_id_for_versions.py diff --git a/alembic/versions/20241003_1554-redo_data_model.py b/docker/migrations/versions/20241003_1554-redo_data_model.py similarity index 100% rename from alembic/versions/20241003_1554-redo_data_model.py rename to docker/migrations/versions/20241003_1554-redo_data_model.py diff --git a/alembic/versions/20241009_0915-modify_users_username_uniqueness.py b/docker/migrations/versions/20241009_0915-modify_users_username_uniqueness.py similarity index 100% rename from alembic/versions/20241009_0915-modify_users_username_uniqueness.py rename to docker/migrations/versions/20241009_0915-modify_users_username_uniqueness.py diff --git a/alembic/versions/20241009_1713-url_is_not_unique.py b/docker/migrations/versions/20241009_1713-url_is_not_unique.py similarity index 100% rename from alembic/versions/20241009_1713-url_is_not_unique.py rename to docker/migrations/versions/20241009_1713-url_is_not_unique.py diff --git a/alembic/versions/20241010_1347-add_license_name_and_id_index.py b/docker/migrations/versions/20241010_1347-add_license_name_and_id_index.py similarity index 100% rename from alembic/versions/20241010_1347-add_license_name_and_id_index.py rename to docker/migrations/versions/20241010_1347-add_license_name_and_id_index.py diff --git a/alembic/versions/20241018_0815-check_default_server_types.py b/docker/migrations/versions/20241018_0815-check_default_server_types.py similarity index 100% rename from alembic/versions/20241018_0815-check_default_server_types.py rename to docker/migrations/versions/20241018_0815-check_default_server_types.py diff --git a/monitor/Dockerfile b/docker/monitor/Dockerfile similarity index 100% rename from monitor/Dockerfile rename to docker/monitor/Dockerfile diff --git a/monitor/main.py b/docker/monitor/main.py similarity index 100% rename from monitor/main.py rename to docker/monitor/main.py diff --git a/monitor/requirements.txt b/docker/monitor/requirements.txt similarity index 100% rename from monitor/requirements.txt rename to docker/monitor/requirements.txt diff --git a/monitor/run_monitor.sh b/docker/monitor/run_monitor.sh similarity index 100% rename from monitor/run_monitor.sh rename to docker/monitor/run_monitor.sh diff --git a/src/run_pipeline.sh b/package_managers/crates similarity index 100% rename from src/run_pipeline.sh rename to package_managers/crates diff --git a/src/run_scheduler.py b/package_managers/run_scheduler.py similarity index 100% rename from src/run_scheduler.py rename to package_managers/run_scheduler.py diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index bdea2ec..0000000 --- a/pyproject.toml +++ /dev/null @@ -1,30 +0,0 @@ -[project] -name = "pipeline" -version = "0.1.0" -dependencies = [ - "alembic~=1.12", - "pyyaml~=6.0", - "psycopg2~=2.9", - "requests~=2.31", - "ruff~=0.6", - "sqlalchemy~=2.0", -] -requires-python = "~=3.11" -authors = [{ name = "Sanchit Ram Arvind", email = "sanchit@pkgx.dev" }] -maintainers = [{ name = "Sanchit Ram Arvind", email = "sanchit@pkgx.dev" }] -description = "the open-source package data pipeline" -readme = "README.md" 
-keywords = ["tea", "package managerment", "data pipeline"] - -[project.urls] -Homepage = "https://tea.xyz" -Repository = "https://github.com/teaxyz/chai.git" - -[project.scripts] -# only useful for local development -chai-pipeline-fetch = "pipeline.main:main" - -[tool.pytest.ini_options] -addopts = ["--import-mode=importlib"] -pythonpath = ["src", "."] -testpaths = ["tests"] diff --git a/src/.pkgx.yaml b/src/.pkgx.yaml deleted file mode 100644 index 0124ee4..0000000 --- a/src/.pkgx.yaml +++ /dev/null @@ -1,6 +0,0 @@ -# this is the pkgx config for the pipeline - -dependencies: - python.org: ~3.11 - astral.sh/uv: 0 - postgresql.org: 16 diff --git a/src/pipeline/pkgx.py b/src/pipeline/pkgx.py deleted file mode 100644 index 1775560..0000000 --- a/src/pipeline/pkgx.py +++ /dev/null @@ -1,58 +0,0 @@ -import json -import os - -from requests import RequestException, get -from requests.exceptions import HTTPError -from src.pipeline.utils.logger import Logger - -# env vars -PANTRY_URL = "https://api.github.com/repos/pkgxdev/pantry" -OUTPUT_DIR = "data/pkgx" - -# setup -headers = { - "Authorization": f"Bearer {os.getenv('GITHUB_TOKEN')}", -} - - -def get_contents(path: str) -> list[dict]: - url = f"{PANTRY_URL}/contents/{path}" - response = get(url, headers=headers) - response.raise_for_status() - return response.json() - - -def get_pkgx_packages(logger: Logger) -> None: - try: - packages = {} - projects = get_contents("projects") - - for project in projects: - logger.debug(f"project: {project}") - if project["type"] == "dir": - project_contents = get_contents(project["path"]) - for item in project_contents: - if item["name"] == "package.yml": - response = get(item["download_url"], headers=headers) - packages[project["name"]] = response.text - - # Ensure output directory exists - os.makedirs(OUTPUT_DIR, exist_ok=True) - - # Write packages to JSON file - output_file = os.path.join(OUTPUT_DIR, "pkgx_packages.json") - with open(output_file, "w") as f: - json.dump(packages, f, indent=2) - - except HTTPError as e: - if e.response.status_code == 404: - logger.error("404, probs bad url") - elif e.response.status_code == 401: - logger.error("401, probs bad token") - raise e - except RequestException as e: - logger.error(f"RequestException: {e}") - raise e - except Exception as e: - logger.error(f"Exception: {e}") - raise e diff --git a/src/pipeline/utils/crates/structures.py b/src/pipeline/utils/crates/structures.py deleted file mode 100644 index 06b2e6c..0000000 --- a/src/pipeline/utils/crates/structures.py +++ /dev/null @@ -1,26 +0,0 @@ -from enum import IntEnum -from dataclasses import dataclass -from sqlalchemy import UUID - - -class DependencyType(IntEnum): - NORMAL = 0 - BUILD = 1 # used for build scripts - DEV = 2 # used for testing or benchmarking - OPTIONAL = 3 - - def __str__(self): - return self.name.lower() - - -@dataclass -class URLTypes: - homepage: UUID - repository: UUID - documentation: UUID - - -@dataclass -class UserTypes: - crates: UUID - github: UUID diff --git a/src/pipeline/utils/crates/transformer.py b/src/pipeline/utils/crates/transformer.py deleted file mode 100644 index ded4ff5..0000000 --- a/src/pipeline/utils/crates/transformer.py +++ /dev/null @@ -1,201 +0,0 @@ -import csv -from typing import Dict, Generator - -from src.pipeline.utils.utils import safe_int -from src.pipeline.utils.crates.structures import DependencyType, URLTypes, UserTypes -from src.pipeline.utils.transformer import Transformer - - -# crates provides homepage and repository urls, so we'll initialize 
this transformer -# with the ids for those url types -class CratesTransformer(Transformer): - def __init__(self, url_types: URLTypes, user_types: UserTypes): - super().__init__("crates") - self.files = { - "projects": "crates.csv", - "versions": "versions.csv", - "dependencies": "dependencies.csv", - "users": "users.csv", - "urls": "crates.csv", - "user_packages": "crate_owners.csv", - "user_versions": "versions.csv", - } - self.url_types = url_types - self.user_types = user_types - - def packages(self) -> Generator[Dict[str, str], None, None]: - projects_path = self.finder(self.files["projects"]) - - with open(projects_path) as f: - reader = csv.DictReader(f) - for row in reader: - crate_id = row["id"] - name = row["name"] - readme = row["readme"] - - yield {"name": name, "import_id": crate_id, "readme": readme} - - def versions(self) -> Generator[Dict[str, str], None, None]: - versions_path = self.finder(self.files["versions"]) - - with open(versions_path) as f: - reader = csv.DictReader(f) - for row in reader: - crate_id = row["crate_id"] - version_num = row["num"] - version_id = row["id"] - crate_size = safe_int(row["crate_size"]) - created_at = row["created_at"] - license = row["license"] - downloads = safe_int(row["downloads"]) - checksum = row["checksum"] - - yield { - "crate_id": crate_id, - "version": version_num, - "import_id": version_id, - "size": crate_size, - "published_at": created_at, - "license": license, - "downloads": downloads, - "checksum": checksum, - } - - def dependencies(self) -> Generator[Dict[str, str], None, None]: - dependencies_path = self.finder(self.files["dependencies"]) - - with open(dependencies_path) as f: - reader = csv.DictReader(f) - for row in reader: - start_id = row["version_id"] - end_id = row["crate_id"] - req = row["req"] - kind = int(row["kind"]) - - # map string to enum - dependency_type = DependencyType(kind) - - yield { - "version_id": start_id, - "crate_id": end_id, - "semver_range": req, - "dependency_type": dependency_type, - } - - # gh_id is unique to github, and is from GitHub - # our users table is unique on import_id and source_id - # so, we actually get some github data for free here! - def users(self) -> Generator[Dict[str, str], None, None]: - users_path = self.finder(self.files["users"]) - usernames = set() - - with open(users_path) as f: - reader = csv.DictReader(f) - for row in reader: - gh_login = row["gh_login"] - id = row["id"] - - # deduplicate - if gh_login in usernames: - self.logger.warn(f"duplicate username: {id}, {gh_login}") - continue - usernames.add(gh_login) - - # gh_login is a non-nullable column in crates, so we'll always be - # able to load this - source_id = self.user_types.github - yield {"import_id": id, "username": gh_login, "source_id": source_id} - - # for crate_owners, owner_id and created_by are foreign keys on users.id - # and owner_kind is 0 for user and 1 for team - # secondly, created_at is nullable. 
we'll ignore for now and focus on owners - def user_packages(self) -> Generator[Dict[str, str], None, None]: - user_packages_path = self.finder(self.files["user_packages"]) - - with open(user_packages_path) as f: - reader = csv.DictReader(f) - for row in reader: - owner_kind = int(row["owner_kind"]) - if owner_kind == 1: - continue - - crate_id = row["crate_id"] - owner_id = row["owner_id"] - - yield { - "crate_id": crate_id, - "owner_id": owner_id, - } - - # TODO: reopening files: versions.csv contains all the published_by ids - def user_versions(self) -> Generator[Dict[str, str], None, None]: - user_versions_path = self.finder(self.files["user_versions"]) - - with open(user_versions_path) as f: - reader = csv.DictReader(f) - for row in reader: - version_id = row["id"] - published_by = row["published_by"] - - if published_by == "": - continue - - yield {"version_id": version_id, "published_by": published_by} - - # crates provides three urls for each crate: homepage, repository, and documentation - # however, any of these could be null, so we should check for that - # also, we're not going to deduplicate here - def urls(self) -> Generator[Dict[str, str], None, None]: - urls_path = self.finder(self.files["urls"]) - - with open(urls_path) as f: - reader = csv.DictReader(f) - for row in reader: - homepage = row["homepage"] - repository = row["repository"] - documentation = row["documentation"] - - if homepage: - yield {"url": homepage, "url_type_id": self.url_types.homepage} - - if repository: - yield {"url": repository, "url_type_id": self.url_types.repository} - - if documentation: - yield { - "url": documentation, - "url_type_id": self.url_types.documentation, - } - - # TODO: reopening files: crates.csv contains all the urls - def package_urls(self) -> Generator[Dict[str, str], None, None]: - urls_path = self.finder(self.files["urls"]) - - with open(urls_path) as f: - reader = csv.DictReader(f) - for row in reader: - crate_id = row["id"] - homepage = row["homepage"] - repository = row["repository"] - documentation = row["documentation"] - - if homepage: - yield { - "import_id": crate_id, - "url": homepage, - "url_type_id": self.url_types.homepage, - } - - if repository: - yield { - "import_id": crate_id, - "url": repository, - "url_type_id": self.url_types.repository, - } - - if documentation: - yield { - "import_id": crate_id, - "url": documentation, - "url_type_id": self.url_types.documentation, - } diff --git a/src/pipeline/utils/fetcher.py b/src/pipeline/utils/fetcher.py deleted file mode 100644 index da25064..0000000 --- a/src/pipeline/utils/fetcher.py +++ /dev/null @@ -1,108 +0,0 @@ -import os -import tarfile -from dataclasses import dataclass -from datetime import datetime -from io import BytesIO -from typing import Any - -from requests import get -from src.pipeline.utils.logger import Logger - - -@dataclass -class Data: - file_path: str - file_name: str - content: Any # json or bytes - - -class Fetcher: - def __init__(self, name: str, source: str): - self.name = name - self.source = source - self.output = f"data/{name}" - self.logger = Logger(f"{name}_fetcher") - - def write(self, files: list[Data]): - """generic write function for some collection of files""" - - # prep the file location - now = datetime.now().strftime("%Y-%m-%d") - root_path = f"{self.output}/{now}" - - # write - # it can be anything - json, tarball, etc. 
- for item in files: - file_path = item.file_path - file_name = item.file_name - file_content = item.content - full_path = os.path.join(root_path, file_path) - - # make sure the path exists - os.makedirs(full_path, exist_ok=True) - - with open(os.path.join(full_path, file_name), "wb") as f: - self.logger.debug(f"writing {full_path}") - f.write(file_content) - - # update the latest symlink - self.update_symlink(now) - - def update_symlink(self, latest_path: str): - latest_symlink = f"{self.output}/latest" - if os.path.islink(latest_symlink): - self.logger.debug(f"removing existing symlink {latest_symlink}") - os.remove(latest_symlink) - - self.logger.debug(f"creating symlink {latest_symlink} -> {latest_path}") - os.symlink(latest_path, latest_symlink) - - def fetch(self): - response = get(self.source) - try: - response.raise_for_status() - except Exception as e: - self.logger.error(f"error fetching {self.source}: {e}") - raise e - - return response.content - - -class TarballFetcher(Fetcher): - def __init__(self, name: str, source: str): - super().__init__(name, source) - - def fetch(self) -> list[Data]: - content = super().fetch() - - bytes_io_object = BytesIO(content) - bytes_io_object.seek(0) - - files = [] - with tarfile.open(fileobj=bytes_io_object, mode="r:gz") as tar: - for member in tar.getmembers(): - if member.isfile(): - bytes_io_file = BytesIO(tar.extractfile(member).read()) - destination_key = member.name - file_name = destination_key.split("/")[-1] - file_path = "/".join(destination_key.split("/")[:-1]) - self.logger.debug(f"file_path/file_name: {file_path}/{file_name}") - files.append(Data(file_path, file_name, bytes_io_file.read())) - - return files - - -class JSONFetcher(Fetcher): - def __init__(self, name: str, source: str): - super().__init__(name, source) - - def fetch(self): - pass - - -class YAMLFetcher(Fetcher): - def __init__(self, name: str, source: str): - super().__init__(name, source) - - def fetch(self): - pass diff --git a/src/pipeline/utils/transformer.py b/src/pipeline/utils/transformer.py deleted file mode 100644 index 47f0cf8..0000000 --- a/src/pipeline/utils/transformer.py +++ /dev/null @@ -1,50 +0,0 @@ -import csv -import os -from typing import Dict - -from sqlalchemy import UUID - -from src.pipeline.utils.logger import Logger - -# this is a temporary fix, but sometimes the raw files have weird characters -# and lots of data within certain fields -# this fix allows us to read the files with no hassles -csv.field_size_limit(10000000) - - -# the transformer class knows what files to open, and provide a generic wrapper -# for the data within the files -# each package manager will have its own transformer, that knows what data needs to be -# extracted for our data model -class Transformer: - def __init__(self, name: str): - self.name = name - self.input = f"data/{name}/latest" - self.logger = Logger(f"{name}_transformer") - self.files: Dict[str, str] = { - "projects": "", - "versions": "", - "dependencies": "", - "users": "", - "urls": "", - } - self.url_types: Dict[str, UUID] = {} - - def finder(self, file_name: str) -> str: - input_dir = os.path.realpath(self.input) - - for root, _, files in os.walk(input_dir): - if file_name in files: - return os.path.join(root, file_name) - else: - self.logger.error(f"{file_name} not found in {input_dir}") - raise FileNotFoundError(f"Missing {file_name} file") - - def packages(self): - pass - - def versions(self): - pass - - def dependencies(self): - pass From 94b5f3037f1942b03329334e023d1cafc9e1ed6d Mon Sep 17 00:00:00 
2001 From: Sanchit Ram Arvind Date: Mon, 14 Oct 2024 18:20:06 -0500 Subject: [PATCH 02/41] docker builds db and alembic --- .dockerignore | 1 - docker-compose.yml | 100 ++++++++---------- docker/{migrations => alembic}/.pkgx.yaml | 0 docker/{migrations => alembic}/Dockerfile | 3 +- docker/{migrations => alembic}/alembic.ini | 0 docker/{migrations => alembic}/env.py | 0 .../{migrations => alembic}/init-script.sql | 0 .../{migrations => alembic}/run_migrations.sh | 0 docker/{migrations => alembic}/script.py.mako | 0 .../20240918_1200-initial_migration.py | 0 .../20240923_0821-add_load_history.py | 0 .../versions/20240925_0808-add_users_urls.py | 0 ...2034-link_tables_add_unique_constraints.py | 0 ...241002_1456-new_data_models_and_indexes.py | 0 .../20241003_0040-import_id_for_versions.py | 0 .../versions/20241003_1554-redo_data_model.py | 0 ...9_0915-modify_users_username_uniqueness.py | 0 .../20241009_1713-url_is_not_unique.py | 0 ...1010_1347-add_license_name_and_id_index.py | 0 ...0241018_0815-check_default_server_types.py | 0 20 files changed, 45 insertions(+), 59 deletions(-) rename docker/{migrations => alembic}/.pkgx.yaml (100%) rename docker/{migrations => alembic}/Dockerfile (95%) rename docker/{migrations => alembic}/alembic.ini (100%) rename docker/{migrations => alembic}/env.py (100%) rename docker/{migrations => alembic}/init-script.sql (100%) rename docker/{migrations => alembic}/run_migrations.sh (100%) rename docker/{migrations => alembic}/script.py.mako (100%) rename docker/{migrations => alembic}/versions/20240918_1200-initial_migration.py (100%) rename docker/{migrations => alembic}/versions/20240923_0821-add_load_history.py (100%) rename docker/{migrations => alembic}/versions/20240925_0808-add_users_urls.py (100%) rename docker/{migrations => alembic}/versions/20240930_2034-link_tables_add_unique_constraints.py (100%) rename docker/{migrations => alembic}/versions/20241002_1456-new_data_models_and_indexes.py (100%) rename docker/{migrations => alembic}/versions/20241003_0040-import_id_for_versions.py (100%) rename docker/{migrations => alembic}/versions/20241003_1554-redo_data_model.py (100%) rename docker/{migrations => alembic}/versions/20241009_0915-modify_users_username_uniqueness.py (100%) rename docker/{migrations => alembic}/versions/20241009_1713-url_is_not_unique.py (100%) rename docker/{migrations => alembic}/versions/20241010_1347-add_license_name_and_id_index.py (100%) rename docker/{migrations => alembic}/versions/20241018_0815-check_default_server_types.py (100%) diff --git a/.dockerignore b/.dockerignore index 0384b02..9a4f821 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,6 +1,5 @@ # directories data/ -db/ .venv/ # other files diff --git a/docker-compose.yml b/docker-compose.yml index 8e152f6..f169f1d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -16,8 +16,10 @@ services: alembic: build: - context: ./ - dockerfile: ./alembic/Dockerfile + context: ./docker/alembic + dockerfile: Dockerfile + args: + BUILDKIT_PROGRESS: plain environment: - CHAI_DATABASE_URL=postgresql://postgres:s3cr3t@db:5432/chai - PGPASSWORD=s3cr3t @@ -27,60 +29,44 @@ services: working_dir: /alembic entrypoint: ["./run_migrations.sh"] - pipeline: - build: - context: ./ - dockerfile: ./src/Dockerfile - args: - BUILDKIT_PROGRESS: plain - environment: - - CHAI_DATABASE_URL=postgresql://postgres:s3cr3t@db:5432/chai - - FREQUENCY=${FREQUENCY:-24} - - PYTHONPATH=/ - - PKG_MANAGER=${PKG_MANAGER:-crates} - - TEST=${TEST:-false} - - DEBUG=${DEBUG:-true} - - 
FETCH=${FETCH:-true} - depends_on: - db: - condition: service_healthy - alembic: - condition: service_completed_successfully - working_dir: /src - entrypoint: ["./run_pipeline.sh"] + # crates: + # build: + # context: ./package_managers/crates + # dockerfile: Dockerfile + # environment: + # - CHAI_DATABASE_URL=postgresql://postgres:s3cr3t@db:5432/chai + # volumes: + # - ./core:/app/core + # - ./package_managers/crates:/app/crates + # depends_on: + # db: + # condition: service_healthy + # alembic: + # condition: service_completed_successfully - api: - build: - context: ./api - dockerfile: Dockerfile - environment: - - DATABASE_URL=postgresql://postgres:s3cr3t@db:5432/chai - - HOST=0.0.0.0 - - PORT=8080 - ports: - - "8080:8080" - depends_on: - db: - condition: service_healthy - alembic: - condition: service_completed_successfully - restart: unless-stopped - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:8080/heartbeat"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 5s + # homebrew: + # build: + # context: ./package_managers/homebrew + # dockerfile: Dockerfile + # environment: + # - CHAI_DATABASE_URL=postgresql://postgres:s3cr3t@db:5432/chai + # volumes: + # - ./core:/app/core + # - ./package_managers/homebrew:/app/homebrew + # depends_on: + # db: + # condition: service_healthy + # alembic: + # condition: service_completed_successfully - monitor: - build: monitor - environment: - - DOCKER_HOST=${DOCKER_HOST:-unix:///var/run/docker.sock} - volumes: - - ./monitor:/app - - /var/run/docker.sock:/var/run/docker.sock - depends_on: - pipeline: - condition: service_started - working_dir: /usr/src/monitor - entrypoint: ["./run_monitor.sh"] + # monitor: + # build: + # context: ./docker/monitor + # dockerfile: Dockerfile + # environment: + # - DOCKER_HOST=${DOCKER_HOST:-unix:///var/run/docker.sock} + # volumes: + # - /var/run/docker.sock:/var/run/docker.sock + # depends_on: + # - crates + # - homebrew \ No newline at end of file diff --git a/docker/migrations/.pkgx.yaml b/docker/alembic/.pkgx.yaml similarity index 100% rename from docker/migrations/.pkgx.yaml rename to docker/alembic/.pkgx.yaml diff --git a/docker/migrations/Dockerfile b/docker/alembic/Dockerfile similarity index 95% rename from docker/migrations/Dockerfile rename to docker/alembic/Dockerfile index 24b6261..2708121 100644 --- a/docker/migrations/Dockerfile +++ b/docker/alembic/Dockerfile @@ -5,7 +5,8 @@ RUN apt -y install postgresql RUN apt -y install alembic RUN apt -y install python3-psycopg2 RUN apt -y install python3-sqlalchemy python3-sqlalchemy-ext -COPY . . WORKDIR /alembic +COPY . . 
RUN chmod +x run_migrations.sh ENTRYPOINT ["/alembic/run_migrations.sh"] +CMD ["/bin/bash"] \ No newline at end of file diff --git a/docker/migrations/alembic.ini b/docker/alembic/alembic.ini similarity index 100% rename from docker/migrations/alembic.ini rename to docker/alembic/alembic.ini diff --git a/docker/migrations/env.py b/docker/alembic/env.py similarity index 100% rename from docker/migrations/env.py rename to docker/alembic/env.py diff --git a/docker/migrations/init-script.sql b/docker/alembic/init-script.sql similarity index 100% rename from docker/migrations/init-script.sql rename to docker/alembic/init-script.sql diff --git a/docker/migrations/run_migrations.sh b/docker/alembic/run_migrations.sh similarity index 100% rename from docker/migrations/run_migrations.sh rename to docker/alembic/run_migrations.sh diff --git a/docker/migrations/script.py.mako b/docker/alembic/script.py.mako similarity index 100% rename from docker/migrations/script.py.mako rename to docker/alembic/script.py.mako diff --git a/docker/migrations/versions/20240918_1200-initial_migration.py b/docker/alembic/versions/20240918_1200-initial_migration.py similarity index 100% rename from docker/migrations/versions/20240918_1200-initial_migration.py rename to docker/alembic/versions/20240918_1200-initial_migration.py diff --git a/docker/migrations/versions/20240923_0821-add_load_history.py b/docker/alembic/versions/20240923_0821-add_load_history.py similarity index 100% rename from docker/migrations/versions/20240923_0821-add_load_history.py rename to docker/alembic/versions/20240923_0821-add_load_history.py diff --git a/docker/migrations/versions/20240925_0808-add_users_urls.py b/docker/alembic/versions/20240925_0808-add_users_urls.py similarity index 100% rename from docker/migrations/versions/20240925_0808-add_users_urls.py rename to docker/alembic/versions/20240925_0808-add_users_urls.py diff --git a/docker/migrations/versions/20240930_2034-link_tables_add_unique_constraints.py b/docker/alembic/versions/20240930_2034-link_tables_add_unique_constraints.py similarity index 100% rename from docker/migrations/versions/20240930_2034-link_tables_add_unique_constraints.py rename to docker/alembic/versions/20240930_2034-link_tables_add_unique_constraints.py diff --git a/docker/migrations/versions/20241002_1456-new_data_models_and_indexes.py b/docker/alembic/versions/20241002_1456-new_data_models_and_indexes.py similarity index 100% rename from docker/migrations/versions/20241002_1456-new_data_models_and_indexes.py rename to docker/alembic/versions/20241002_1456-new_data_models_and_indexes.py diff --git a/docker/migrations/versions/20241003_0040-import_id_for_versions.py b/docker/alembic/versions/20241003_0040-import_id_for_versions.py similarity index 100% rename from docker/migrations/versions/20241003_0040-import_id_for_versions.py rename to docker/alembic/versions/20241003_0040-import_id_for_versions.py diff --git a/docker/migrations/versions/20241003_1554-redo_data_model.py b/docker/alembic/versions/20241003_1554-redo_data_model.py similarity index 100% rename from docker/migrations/versions/20241003_1554-redo_data_model.py rename to docker/alembic/versions/20241003_1554-redo_data_model.py diff --git a/docker/migrations/versions/20241009_0915-modify_users_username_uniqueness.py b/docker/alembic/versions/20241009_0915-modify_users_username_uniqueness.py similarity index 100% rename from docker/migrations/versions/20241009_0915-modify_users_username_uniqueness.py rename to 
docker/alembic/versions/20241009_0915-modify_users_username_uniqueness.py diff --git a/docker/migrations/versions/20241009_1713-url_is_not_unique.py b/docker/alembic/versions/20241009_1713-url_is_not_unique.py similarity index 100% rename from docker/migrations/versions/20241009_1713-url_is_not_unique.py rename to docker/alembic/versions/20241009_1713-url_is_not_unique.py diff --git a/docker/migrations/versions/20241010_1347-add_license_name_and_id_index.py b/docker/alembic/versions/20241010_1347-add_license_name_and_id_index.py similarity index 100% rename from docker/migrations/versions/20241010_1347-add_license_name_and_id_index.py rename to docker/alembic/versions/20241010_1347-add_license_name_and_id_index.py diff --git a/docker/migrations/versions/20241018_0815-check_default_server_types.py b/docker/alembic/versions/20241018_0815-check_default_server_types.py similarity index 100% rename from docker/migrations/versions/20241018_0815-check_default_server_types.py rename to docker/alembic/versions/20241018_0815-check_default_server_types.py From 697b244582cca7d90d1e1d5fb98e8da67d626837 Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Mon, 14 Oct 2024 18:24:16 -0500 Subject: [PATCH 03/41] alembic and db docker actually builds now also, migrations run --- {docker/alembic => alembic}/.pkgx.yaml | 0 {docker/alembic => alembic}/Dockerfile | 7 +++---- {docker/alembic => alembic}/alembic.ini | 0 {docker/alembic => alembic}/env.py | 12 +++--------- {docker/alembic => alembic}/init-script.sql | 0 {docker/alembic => alembic}/run_migrations.sh | 0 {docker/alembic => alembic}/script.py.mako | 0 .../versions/20240918_1200-initial_migration.py | 0 .../versions/20240923_0821-add_load_history.py | 0 .../versions/20240925_0808-add_users_urls.py | 0 ...40930_2034-link_tables_add_unique_constraints.py | 0 .../20241002_1456-new_data_models_and_indexes.py | 0 .../20241003_0040-import_id_for_versions.py | 0 .../versions/20241003_1554-redo_data_model.py | 0 ...0241009_0915-modify_users_username_uniqueness.py | 0 .../versions/20241009_1713-url_is_not_unique.py | 0 .../20241010_1347-add_license_name_and_id_index.py | 0 .../20241018_0815-check_default_server_types.py | 0 core/{db => db.py} | 0 {docker/db => db}/CHAI_ERD.png | Bin {docker/db => db}/queries.md | 0 docker-compose.yml | 4 ++-- {docker/monitor => monitor}/Dockerfile | 0 {docker/monitor => monitor}/main.py | 0 {docker/monitor => monitor}/requirements.txt | 0 {docker/monitor => monitor}/run_monitor.sh | 0 26 files changed, 8 insertions(+), 15 deletions(-) rename {docker/alembic => alembic}/.pkgx.yaml (100%) rename {docker/alembic => alembic}/Dockerfile (77%) rename {docker/alembic => alembic}/alembic.ini (100%) rename {docker/alembic => alembic}/env.py (88%) rename {docker/alembic => alembic}/init-script.sql (100%) rename {docker/alembic => alembic}/run_migrations.sh (100%) rename {docker/alembic => alembic}/script.py.mako (100%) rename {docker/alembic => alembic}/versions/20240918_1200-initial_migration.py (100%) rename {docker/alembic => alembic}/versions/20240923_0821-add_load_history.py (100%) rename {docker/alembic => alembic}/versions/20240925_0808-add_users_urls.py (100%) rename {docker/alembic => alembic}/versions/20240930_2034-link_tables_add_unique_constraints.py (100%) rename {docker/alembic => alembic}/versions/20241002_1456-new_data_models_and_indexes.py (100%) rename {docker/alembic => alembic}/versions/20241003_0040-import_id_for_versions.py (100%) rename {docker/alembic => alembic}/versions/20241003_1554-redo_data_model.py 
(100%) rename {docker/alembic => alembic}/versions/20241009_0915-modify_users_username_uniqueness.py (100%) rename {docker/alembic => alembic}/versions/20241009_1713-url_is_not_unique.py (100%) rename {docker/alembic => alembic}/versions/20241010_1347-add_license_name_and_id_index.py (100%) rename {docker/alembic => alembic}/versions/20241018_0815-check_default_server_types.py (100%) rename core/{db => db.py} (100%) rename {docker/db => db}/CHAI_ERD.png (100%) rename {docker/db => db}/queries.md (100%) rename {docker/monitor => monitor}/Dockerfile (100%) rename {docker/monitor => monitor}/main.py (100%) rename {docker/monitor => monitor}/requirements.txt (100%) rename {docker/monitor => monitor}/run_monitor.sh (100%) diff --git a/docker/alembic/.pkgx.yaml b/alembic/.pkgx.yaml similarity index 100% rename from docker/alembic/.pkgx.yaml rename to alembic/.pkgx.yaml diff --git a/docker/alembic/Dockerfile b/alembic/Dockerfile similarity index 77% rename from docker/alembic/Dockerfile rename to alembic/Dockerfile index 2708121..8749c8b 100644 --- a/docker/alembic/Dockerfile +++ b/alembic/Dockerfile @@ -5,8 +5,7 @@ RUN apt -y install postgresql RUN apt -y install alembic RUN apt -y install python3-psycopg2 RUN apt -y install python3-sqlalchemy python3-sqlalchemy-ext -WORKDIR /alembic COPY . . -RUN chmod +x run_migrations.sh -ENTRYPOINT ["/alembic/run_migrations.sh"] -CMD ["/bin/bash"] \ No newline at end of file +WORKDIR /alembic +RUN chmod +x /alembic/run_migrations.sh +ENTRYPOINT ["/alembic/run_migrations.sh"] \ No newline at end of file diff --git a/docker/alembic/alembic.ini b/alembic/alembic.ini similarity index 100% rename from docker/alembic/alembic.ini rename to alembic/alembic.ini diff --git a/docker/alembic/env.py b/alembic/env.py similarity index 88% rename from docker/alembic/env.py rename to alembic/env.py index ca8b713..2608372 100644 --- a/docker/alembic/env.py +++ b/alembic/env.py @@ -1,10 +1,9 @@ import os from logging.config import fileConfig -from sqlalchemy import engine_from_config, pool - from alembic import context -from src.pipeline.models import Base +from sqlalchemy import engine_from_config, pool +from core.models import Base # this is the Alembic Config object, which provides # access to the values within the .ini file in use. 
@@ -40,7 +39,6 @@ def run_migrations_offline() -> None: target_metadata=target_metadata, literal_binds=True, dialect_opts={"paramstyle": "named"}, - compare_server_default=True, ) with context.begin_transaction(): @@ -60,11 +58,7 @@ def run_migrations_online() -> None: ) with connectable.connect() as connection: - context.configure( - connection=connection, - target_metadata=target_metadata, - compare_server_default=True, - ) + context.configure(connection=connection, target_metadata=target_metadata) with context.begin_transaction(): context.run_migrations() diff --git a/docker/alembic/init-script.sql b/alembic/init-script.sql similarity index 100% rename from docker/alembic/init-script.sql rename to alembic/init-script.sql diff --git a/docker/alembic/run_migrations.sh b/alembic/run_migrations.sh similarity index 100% rename from docker/alembic/run_migrations.sh rename to alembic/run_migrations.sh diff --git a/docker/alembic/script.py.mako b/alembic/script.py.mako similarity index 100% rename from docker/alembic/script.py.mako rename to alembic/script.py.mako diff --git a/docker/alembic/versions/20240918_1200-initial_migration.py b/alembic/versions/20240918_1200-initial_migration.py similarity index 100% rename from docker/alembic/versions/20240918_1200-initial_migration.py rename to alembic/versions/20240918_1200-initial_migration.py diff --git a/docker/alembic/versions/20240923_0821-add_load_history.py b/alembic/versions/20240923_0821-add_load_history.py similarity index 100% rename from docker/alembic/versions/20240923_0821-add_load_history.py rename to alembic/versions/20240923_0821-add_load_history.py diff --git a/docker/alembic/versions/20240925_0808-add_users_urls.py b/alembic/versions/20240925_0808-add_users_urls.py similarity index 100% rename from docker/alembic/versions/20240925_0808-add_users_urls.py rename to alembic/versions/20240925_0808-add_users_urls.py diff --git a/docker/alembic/versions/20240930_2034-link_tables_add_unique_constraints.py b/alembic/versions/20240930_2034-link_tables_add_unique_constraints.py similarity index 100% rename from docker/alembic/versions/20240930_2034-link_tables_add_unique_constraints.py rename to alembic/versions/20240930_2034-link_tables_add_unique_constraints.py diff --git a/docker/alembic/versions/20241002_1456-new_data_models_and_indexes.py b/alembic/versions/20241002_1456-new_data_models_and_indexes.py similarity index 100% rename from docker/alembic/versions/20241002_1456-new_data_models_and_indexes.py rename to alembic/versions/20241002_1456-new_data_models_and_indexes.py diff --git a/docker/alembic/versions/20241003_0040-import_id_for_versions.py b/alembic/versions/20241003_0040-import_id_for_versions.py similarity index 100% rename from docker/alembic/versions/20241003_0040-import_id_for_versions.py rename to alembic/versions/20241003_0040-import_id_for_versions.py diff --git a/docker/alembic/versions/20241003_1554-redo_data_model.py b/alembic/versions/20241003_1554-redo_data_model.py similarity index 100% rename from docker/alembic/versions/20241003_1554-redo_data_model.py rename to alembic/versions/20241003_1554-redo_data_model.py diff --git a/docker/alembic/versions/20241009_0915-modify_users_username_uniqueness.py b/alembic/versions/20241009_0915-modify_users_username_uniqueness.py similarity index 100% rename from docker/alembic/versions/20241009_0915-modify_users_username_uniqueness.py rename to alembic/versions/20241009_0915-modify_users_username_uniqueness.py diff --git 
a/docker/alembic/versions/20241009_1713-url_is_not_unique.py b/alembic/versions/20241009_1713-url_is_not_unique.py similarity index 100% rename from docker/alembic/versions/20241009_1713-url_is_not_unique.py rename to alembic/versions/20241009_1713-url_is_not_unique.py diff --git a/docker/alembic/versions/20241010_1347-add_license_name_and_id_index.py b/alembic/versions/20241010_1347-add_license_name_and_id_index.py similarity index 100% rename from docker/alembic/versions/20241010_1347-add_license_name_and_id_index.py rename to alembic/versions/20241010_1347-add_license_name_and_id_index.py diff --git a/docker/alembic/versions/20241018_0815-check_default_server_types.py b/alembic/versions/20241018_0815-check_default_server_types.py similarity index 100% rename from docker/alembic/versions/20241018_0815-check_default_server_types.py rename to alembic/versions/20241018_0815-check_default_server_types.py diff --git a/core/db b/core/db.py similarity index 100% rename from core/db rename to core/db.py diff --git a/docker/db/CHAI_ERD.png b/db/CHAI_ERD.png similarity index 100% rename from docker/db/CHAI_ERD.png rename to db/CHAI_ERD.png diff --git a/docker/db/queries.md b/db/queries.md similarity index 100% rename from docker/db/queries.md rename to db/queries.md diff --git a/docker-compose.yml b/docker-compose.yml index f169f1d..bdca9dc 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -16,8 +16,8 @@ services: alembic: build: - context: ./docker/alembic - dockerfile: Dockerfile + context: . + dockerfile: ./alembic/Dockerfile args: BUILDKIT_PROGRESS: plain environment: diff --git a/docker/monitor/Dockerfile b/monitor/Dockerfile similarity index 100% rename from docker/monitor/Dockerfile rename to monitor/Dockerfile diff --git a/docker/monitor/main.py b/monitor/main.py similarity index 100% rename from docker/monitor/main.py rename to monitor/main.py diff --git a/docker/monitor/requirements.txt b/monitor/requirements.txt similarity index 100% rename from docker/monitor/requirements.txt rename to monitor/requirements.txt diff --git a/docker/monitor/run_monitor.sh b/monitor/run_monitor.sh similarity index 100% rename from docker/monitor/run_monitor.sh rename to monitor/run_monitor.sh From 45decedaaaba5f85a895d284fac3e2f20fbc3fdb Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Mon, 14 Oct 2024 18:33:13 -0500 Subject: [PATCH 04/41] reloaded the crates stuff, and fixed imports --- core/db.py | 6 +- core/fetcher.py | 108 ++++++++++++ core/logger.py | 55 +++++++ core/transformer.py | 50 ++++++ core/utils.py | 18 ++ package_managers/crates | 7 - package_managers/crates/Dockerfile | 6 + package_managers/crates/main.py | 100 +++++++++++ package_managers/crates/requirements.txt | 32 ++++ package_managers/crates/structs.py | 26 +++ package_managers/crates/transformer.py | 201 +++++++++++++++++++++++ 11 files changed, 599 insertions(+), 10 deletions(-) create mode 100644 core/fetcher.py create mode 100644 core/logger.py create mode 100644 core/transformer.py create mode 100644 core/utils.py delete mode 100755 package_managers/crates create mode 100644 package_managers/crates/Dockerfile create mode 100644 package_managers/crates/main.py create mode 100644 package_managers/crates/requirements.txt create mode 100644 package_managers/crates/structs.py create mode 100644 package_managers/crates/transformer.py diff --git a/core/db.py b/core/db.py index 5867d97..d3d52bd 100644 --- a/core/db.py +++ b/core/db.py @@ -1,12 +1,12 @@ import os from typing import Any, Dict, Iterable, List, Type -from 
src.pipeline.utils.utils import build_query_params +from core.utils import build_query_params from sqlalchemy import UUID, create_engine from sqlalchemy.dialects import postgresql from sqlalchemy.dialects.postgresql import insert from sqlalchemy.orm import sessionmaker from sqlalchemy.orm.decl_api import DeclarativeMeta -from src.pipeline.models import ( +from core.models import ( DependsOn, License, LoadHistory, @@ -21,7 +21,7 @@ UserVersion, Version, ) -from src.pipeline.utils.logger import Logger +from core.logger import Logger CHAI_DATABASE_URL = os.getenv("CHAI_DATABASE_URL") DEFAULT_BATCH_SIZE = 10000 diff --git a/core/fetcher.py b/core/fetcher.py new file mode 100644 index 0000000..edfd741 --- /dev/null +++ b/core/fetcher.py @@ -0,0 +1,108 @@ +import os +import tarfile +from dataclasses import dataclass +from datetime import datetime +from io import BytesIO +from typing import Any + +from requests import get +from core.logger import Logger + + +@dataclass +class Data: + file_path: str + file_name: str + content: Any # json or bytes + + +class Fetcher: + def __init__(self, name: str, source: str): + self.name = name + self.source = source + self.output = f"data/{name}" + self.logger = Logger(f"{name}_fetcher") + + def write(self, files: list[Data]): + """generic write function for some collection of files""" + + # prep the file location + now = datetime.now().strftime("%Y-%m-%d") + root_path = f"{self.output}/{now}" + + # write + # it can be anything - json, tarball, etc. + for item in files: + file_path = item.file_path + file_name = item.file_name + file_content = item.content + full_path = os.path.join(root_path, file_path) + + # make sure the path exists + os.makedirs(full_path, exist_ok=True) + + with open(os.path.join(full_path, file_name), "wb") as f: + self.logger.debug(f"writing {full_path}") + f.write(file_content) + + # update the latest symlink + self.update_symlink(now) + + def update_symlink(self, latest_path: str): + latest_symlink = f"{self.output}/latest" + if os.path.islink(latest_symlink): + self.logger.debug(f"removing existing symlink {latest_symlink}") + os.remove(latest_symlink) + + self.logger.debug(f"creating symlink {latest_symlink} -> {latest_path}") + os.symlink(latest_path, latest_symlink) + + def fetch(self): + response = get(self.source) + try: + response.raise_for_status() + except Exception as e: + self.logger.error(f"error fetching {self.source}: {e}") + raise e + + return response.content + + +class TarballFetcher(Fetcher): + def __init__(self, name: str, source: str): + super().__init__(name, source) + + def fetch(self) -> list[Data]: + content = super().fetch() + + bytes_io_object = BytesIO(content) + bytes_io_object.seek(0) + + files = [] + with tarfile.open(fileobj=bytes_io_object, mode="r:gz") as tar: + for member in tar.getmembers(): + if member.isfile(): + bytes_io_file = BytesIO(tar.extractfile(member).read()) + destination_key = member.name + file_name = destination_key.split("/")[-1] + file_path = "/".join(destination_key.split("/")[:-1]) + self.logger.debug(f"file_path/file_name: {file_path}/{file_name}") + files.append(Data(file_path, file_name, bytes_io_file.read())) + + return files + + +class JSONFetcher(Fetcher): + def __init__(self, name: str, source: str): + super().__init__(name, source) + + def fetch(self): + pass + + +class YAMLFetcher(Fetcher): + def __init__(self, name: str, source: str): + super().__init__(name, source) + + def fetch(self): + pass diff --git a/core/logger.py b/core/logger.py new file mode 100644 index 
0000000..e66e081
--- /dev/null
+++ b/core/logger.py
@@ -0,0 +1,55 @@
+from os import getenv
+import time
+import sys
+import traceback
+
+DEBUG = getenv("DEBUG", "false").lower() == "true"
+
+# use inspect to print the line of code as well?
+# caller = inspect.currentframe().f_back
+# filename = caller.f_code.co_filename, lineno = caller.f_lineno
+
+
+def as_minutes(seconds: float) -> float:
+    return seconds / 60
+
+
+class Logger:
+    SILENT = 0
+    NORMAL = 1
+    VERBOSE = 2
+
+    def __init__(self, name: str, mode=NORMAL, start=time.time()) -> None:
+        self.name = name
+        self.start = start
+        self.mode = Logger.VERBOSE if DEBUG else mode
+
+    def print(self, msg: str):
+        print(f"{self.time_diff():.2f}: [{self.name}]: {msg}")
+
+    def error(self, message):
+        self.print(f"[ERROR]: {message}")
+
+    def log(self, message):
+        if self.mode >= Logger.NORMAL:
+            self.print(f"{message}")
+
+    def debug(self, message):
+        if self.mode >= Logger.VERBOSE:
+            self.print(f"[DEBUG]: {message}")
+
+    def warn(self, message):
+        if self.mode >= Logger.NORMAL:
+            self.print(f"[WARN]: {message}")
+
+    def is_verbose(self):
+        return self.mode >= Logger.VERBOSE
+
+    def time_diff(self):
+        return time.time() - self.start
+
+    def exception(self):
+        exc_type, exc_value, exc_traceback = sys.exc_info()
+        self.print(f"{exc_type.__name__}: {exc_value}")
+        self.print("***** TRACEBACK *****")
+        print(f"{''.join(traceback.format_tb(exc_traceback))}")
diff --git a/core/transformer.py b/core/transformer.py
new file mode 100644
index 0000000..0de6147
--- /dev/null
+++ b/core/transformer.py
@@ -0,0 +1,50 @@
+import csv
+import os
+from typing import Dict
+
+from sqlalchemy import UUID
+
+from core.logger import Logger
+
+# this is a temporary fix, but sometimes the raw files have weird characters
+# and lots of data within certain fields
+# this fix allows us to read the files with no hassles
+csv.field_size_limit(10000000)
+
+
+# the transformer class knows what files to open, and provide a generic wrapper
+# for the data within the files
+# each package manager will have its own transformer, that knows what data needs to be
+# extracted for our data model
+class Transformer:
+    def __init__(self, name: str):
+        self.name = name
+        self.input = f"data/{name}/latest"
+        self.logger = Logger(f"{name}_transformer")
+        self.files: Dict[str, str] = {
+            "projects": "",
+            "versions": "",
+            "dependencies": "",
+            "users": "",
+            "urls": "",
+        }
+        self.url_types: Dict[str, UUID] = {}
+
+    def finder(self, file_name: str) -> str:
+        input_dir = os.path.realpath(self.input)
+
+        for root, _, files in os.walk(input_dir):
+            if file_name in files:
+                return os.path.join(root, file_name)
+        else:
+            self.logger.error(f"{file_name} not found in {input_dir}")
+            raise FileNotFoundError(f"Missing {file_name} file")
+
+    def packages(self):
+        pass
+
+    def versions(self):
+        pass
+
+    def dependencies(self):
+        pass
diff --git a/core/utils.py b/core/utils.py
new file mode 100644
index 0000000..3d7a1bb
--- /dev/null
+++ b/core/utils.py
@@ -0,0 +1,18 @@
+from typing import List, Dict
+
+
+def safe_int(val: str) -> int | None:
+    if val == "":
+        return None
+    return int(val)
+
+
+# TODO: needs explanation or simplification
+def build_query_params(
+    items: List[Dict[str, str]], cache: dict, attr: str
+) -> List[str]:
+    params = set()
+    for item in items:
+        if item[attr] not in cache:
+            params.add(item[attr])
+    return list(params)
diff --git a/package_managers/crates b/package_managers/crates
deleted file mode 100755
index 0b3818e..0000000
--- a/package_managers/crates
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/bin/bash
-
-# make the data directory
-mkdir -p data/{crates,pkgx,homebrew,npm,pypi,rubys}
-
-# run the pipeline
-python -u /src/run_scheduler.py
diff --git a/package_managers/crates/Dockerfile b/package_managers/crates/Dockerfile
new file mode 100644
index 0000000..b49d7fc
--- /dev/null
+++ b/package_managers/crates/Dockerfile
@@ -0,0 +1,6 @@
+FROM python:3.11
+WORKDIR /app/crates
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+COPY . .
+CMD ["python", "main.py"]
\ No newline at end of file
diff --git a/package_managers/crates/main.py b/package_managers/crates/main.py
new file mode 100644
index 0000000..ece4555
--- /dev/null
+++ b/package_managers/crates/main.py
@@ -0,0 +1,100 @@
+from os import getenv
+
+from dataclasses import dataclass
+
+from package_managers.crates.structs import URLTypes, UserTypes
+from core.fetcher import TarballFetcher
+from core.logger import Logger
+from core.db import DB
+from core.transformer import CratesTransformer
+
+logger = Logger("crates_orchestrator")
+
+
+# TODO: global config class
+@dataclass
+class Config:
+    file_location: str
+    test: bool
+    fetch: bool
+    package_manager_id: str
+    url_types: URLTypes
+    user_types: UserTypes
+
+    def __str__(self):
+        return f"Config(file_location={self.file_location}, test={self.test}, \
+        fetch={self.fetch}, package_manager_id={self.package_manager_id}, \
+        url_types={self.url_types}, user_types={self.user_types})"
+
+
+def initialize(db: DB) -> Config:
+    file_location = "https://static.crates.io/db-dump.tar.gz"
+    test = getenv("TEST", "false").lower() == "true"
+    fetch = getenv("FETCH", "true").lower() == "true"
+    package_manager = db.select_package_manager_by_name("crates", create=True)
+    homepage_url = db.select_url_types_homepage(create=True)
+    repository_url = db.select_url_types_repository(create=True)
+    documentation_url = db.select_url_types_documentation(create=True)
+    crates_source = db.select_source_by_name("crates", create=True)
+    github_source = db.select_source_by_name("github", create=True)
+    url_types = URLTypes(
+        homepage=homepage_url.id,
+        repository=repository_url.id,
+        documentation=documentation_url.id,
+    )
+    user_types = UserTypes(crates=crates_source.id, github=github_source.id)
+
+    logger.debug("initialized config")
+
+    return Config(
+        file_location=file_location,
+        test=test,
+        fetch=fetch,
+        package_manager_id=package_manager.id,
+        url_types=url_types,
+        user_types=user_types,
+    )
+
+
+def fetch(config: Config) -> None:
+    fetcher = TarballFetcher("crates", config.file_location)
+    files = fetcher.fetch()
+    fetcher.write(files)
+
+
+def load(db: DB, transformer: CratesTransformer, config: Config) -> None:
+    db.insert_packages(transformer.packages(), config.package_manager_id, "crates")
+    db.insert_versions(transformer.versions())
+    db.insert_users(transformer.users(), config.user_types.crates)
+    db.insert_user_packages(transformer.user_packages())
+    db.insert_urls(transformer.urls())
+
+    if not config.test:
+        # these are bigger files, so we skip them in tests
+        db.insert_user_versions(transformer.user_versions(), config.user_types.github)
+        # db.insert_package_urls(transformer.package_urls()) FIXME
+        db.insert_dependencies(transformer.dependencies())
+
+    db.insert_load_history(config.package_manager_id)
+    logger.log("✅ crates")
+
+
+def main(db: DB) -> None:
+    config = initialize(db)
+    logger.debug(config)
+    if config.fetch:
+        fetch(config)
+
+    transformer = CratesTransformer(config.url_types, config.user_types)
+    load(db, transformer,
config) + + coda = """ + validate by running + `psql "postgresql://postgres:s3cr3t@localhost:5435/chai" \ + -c "SELECT * FROM load_history;"` + """ + logger.log(coda) + + +if __name__ == "__main__": + main() diff --git a/package_managers/crates/requirements.txt b/package_managers/crates/requirements.txt new file mode 100644 index 0000000..6b37d4f --- /dev/null +++ b/package_managers/crates/requirements.txt @@ -0,0 +1,32 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile pyproject.toml -o src/requirements.txt +alembic==1.13.2 + # via pipeline (pyproject.toml) +certifi==2024.8.30 + # via requests +charset-normalizer==3.3.2 + # via requests +idna==3.8 + # via requests +mako==1.3.5 + # via alembic +markupsafe==2.1.5 + # via mako +psycopg2==2.9.9 + # via pipeline (pyproject.toml) +pyyaml==6.0.2 + # via pipeline (pyproject.toml) +requests==2.32.3 + # via pipeline (pyproject.toml) +ruff==0.6.5 + # via pipeline (pyproject.toml) +sqlalchemy==2.0.34 + # via + # pipeline (pyproject.toml) + # alembic +typing-extensions==4.12.2 + # via + # alembic + # sqlalchemy +urllib3==2.2.2 + # via requests \ No newline at end of file diff --git a/package_managers/crates/structs.py b/package_managers/crates/structs.py new file mode 100644 index 0000000..06b2e6c --- /dev/null +++ b/package_managers/crates/structs.py @@ -0,0 +1,26 @@ +from enum import IntEnum +from dataclasses import dataclass +from sqlalchemy import UUID + + +class DependencyType(IntEnum): + NORMAL = 0 + BUILD = 1 # used for build scripts + DEV = 2 # used for testing or benchmarking + OPTIONAL = 3 + + def __str__(self): + return self.name.lower() + + +@dataclass +class URLTypes: + homepage: UUID + repository: UUID + documentation: UUID + + +@dataclass +class UserTypes: + crates: UUID + github: UUID diff --git a/package_managers/crates/transformer.py b/package_managers/crates/transformer.py new file mode 100644 index 0000000..7d41375 --- /dev/null +++ b/package_managers/crates/transformer.py @@ -0,0 +1,201 @@ +import csv +from typing import Dict, Generator + +from core.utils import safe_int +from package_managers.crates.structs import DependencyType, URLTypes, UserTypes +from core.transformer import Transformer + + +# crates provides homepage and repository urls, so we'll initialize this transformer +# with the ids for those url types +class CratesTransformer(Transformer): + def __init__(self, url_types: URLTypes, user_types: UserTypes): + super().__init__("crates") + self.files = { + "projects": "crates.csv", + "versions": "versions.csv", + "dependencies": "dependencies.csv", + "users": "users.csv", + "urls": "crates.csv", + "user_packages": "crate_owners.csv", + "user_versions": "versions.csv", + } + self.url_types = url_types + self.user_types = user_types + + def packages(self) -> Generator[Dict[str, str], None, None]: + projects_path = self.finder(self.files["projects"]) + + with open(projects_path) as f: + reader = csv.DictReader(f) + for row in reader: + crate_id = row["id"] + name = row["name"] + readme = row["readme"] + + yield {"name": name, "import_id": crate_id, "readme": readme} + + def versions(self) -> Generator[Dict[str, str], None, None]: + versions_path = self.finder(self.files["versions"]) + + with open(versions_path) as f: + reader = csv.DictReader(f) + for row in reader: + crate_id = row["crate_id"] + version_num = row["num"] + version_id = row["id"] + crate_size = safe_int(row["crate_size"]) + created_at = row["created_at"] + license = row["license"] + downloads = safe_int(row["downloads"]) + 
checksum = row["checksum"] + + yield { + "crate_id": crate_id, + "version": version_num, + "import_id": version_id, + "size": crate_size, + "published_at": created_at, + "license": license, + "downloads": downloads, + "checksum": checksum, + } + + def dependencies(self) -> Generator[Dict[str, str], None, None]: + dependencies_path = self.finder(self.files["dependencies"]) + + with open(dependencies_path) as f: + reader = csv.DictReader(f) + for row in reader: + start_id = row["version_id"] + end_id = row["crate_id"] + req = row["req"] + kind = int(row["kind"]) + + # map string to enum + dependency_type = DependencyType(kind) + + yield { + "version_id": start_id, + "crate_id": end_id, + "semver_range": req, + "dependency_type": dependency_type, + } + + # gh_id is unique to github, and is from GitHub + # our users table is unique on import_id and source_id + # so, we actually get some github data for free here! + def users(self) -> Generator[Dict[str, str], None, None]: + users_path = self.finder(self.files["users"]) + usernames = set() + + with open(users_path) as f: + reader = csv.DictReader(f) + for row in reader: + gh_login = row["gh_login"] + id = row["id"] + + # deduplicate + if gh_login in usernames: + self.logger.warn(f"duplicate username: {id}, {gh_login}") + continue + usernames.add(gh_login) + + # gh_login is a non-nullable column in crates, so we'll always be + # able to load this + source_id = self.user_types.github + yield {"import_id": id, "username": gh_login, "source_id": source_id} + + # for crate_owners, owner_id and created_by are foreign keys on users.id + # and owner_kind is 0 for user and 1 for team + # secondly, created_at is nullable. we'll ignore for now and focus on owners + def user_packages(self) -> Generator[Dict[str, str], None, None]: + user_packages_path = self.finder(self.files["user_packages"]) + + with open(user_packages_path) as f: + reader = csv.DictReader(f) + for row in reader: + owner_kind = int(row["owner_kind"]) + if owner_kind == 1: + continue + + crate_id = row["crate_id"] + owner_id = row["owner_id"] + + yield { + "crate_id": crate_id, + "owner_id": owner_id, + } + + # TODO: reopening files: versions.csv contains all the published_by ids + def user_versions(self) -> Generator[Dict[str, str], None, None]: + user_versions_path = self.finder(self.files["user_versions"]) + + with open(user_versions_path) as f: + reader = csv.DictReader(f) + for row in reader: + version_id = row["id"] + published_by = row["published_by"] + + if published_by == "": + continue + + yield {"version_id": version_id, "published_by": published_by} + + # crates provides three urls for each crate: homepage, repository, and documentation + # however, any of these could be null, so we should check for that + # also, we're not going to deduplicate here + def urls(self) -> Generator[Dict[str, str], None, None]: + urls_path = self.finder(self.files["urls"]) + + with open(urls_path) as f: + reader = csv.DictReader(f) + for row in reader: + homepage = row["homepage"] + repository = row["repository"] + documentation = row["documentation"] + + if homepage: + yield {"url": homepage, "url_type_id": self.url_types.homepage} + + if repository: + yield {"url": repository, "url_type_id": self.url_types.repository} + + if documentation: + yield { + "url": documentation, + "url_type_id": self.url_types.documentation, + } + + # TODO: reopening files: crates.csv contains all the urls + def package_urls(self) -> Generator[Dict[str, str], None, None]: + urls_path = self.finder(self.files["urls"]) 
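All of these transformer methods follow the same pattern: stream csv.DictReader rows and yield plain dicts, so the loader can insert in batches without materialising the whole dump in memory. A rough sketch of that consumption side is below; the 10,000 batch size mirrors DEFAULT_BATCH_SIZE in core/db.py, while fake_versions() and the print standing in for a real core.db.DB bulk insert are illustrative assumptions:

from itertools import islice
from typing import Dict, Generator, Iterable, List


def fake_versions() -> Generator[Dict[str, str], None, None]:
    # stand-in for CratesTransformer.versions(); yields one dict per CSV row
    for i in range(25_000):
        yield {"crate_id": str(i), "version": f"0.{i}.0"}


def batched(rows: Iterable[Dict[str, str]], size: int = 10_000) -> Iterable[List[Dict[str, str]]]:
    it = iter(rows)
    while chunk := list(islice(it, size)):
        yield chunk


for batch in batched(fake_versions()):
    # the real pipeline hands each batch to a bulk insert on core.db.DB
    print(f"would insert {len(batch)} versions; first crate_id={batch[0]['crate_id']}")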
+ + with open(urls_path) as f: + reader = csv.DictReader(f) + for row in reader: + crate_id = row["id"] + homepage = row["homepage"] + repository = row["repository"] + documentation = row["documentation"] + + if homepage: + yield { + "import_id": crate_id, + "url": homepage, + "url_type_id": self.url_types.homepage, + } + + if repository: + yield { + "import_id": crate_id, + "url": repository, + "url_type_id": self.url_types.repository, + } + + if documentation: + yield { + "import_id": crate_id, + "url": documentation, + "url_type_id": self.url_types.documentation, + } From e72e2c69bd202b53fb02cf6cd851e20082b499e5 Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Tue, 15 Oct 2024 15:36:16 -0500 Subject: [PATCH 05/41] crates image is building and running --- .dockerignore | 5 +- docker-compose.yml | 78 ++++++++++++-------------- package_managers/crates/Dockerfile | 7 +-- package_managers/crates/main.py | 4 +- package_managers/crates/transformer.py | 3 +- 5 files changed, 48 insertions(+), 49 deletions(-) diff --git a/.dockerignore b/.dockerignore index 9a4f821..a00944d 100644 --- a/.dockerignore +++ b/.dockerignore @@ -4,4 +4,7 @@ data/ # other files .gitignore -docker-compose.yml \ No newline at end of file +docker-compose.yml +.DS_Store +.git +README.md \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index bdca9dc..969334e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -18,8 +18,6 @@ services: build: context: . dockerfile: ./alembic/Dockerfile - args: - BUILDKIT_PROGRESS: plain environment: - CHAI_DATABASE_URL=postgresql://postgres:s3cr3t@db:5432/chai - PGPASSWORD=s3cr3t @@ -29,44 +27,42 @@ services: working_dir: /alembic entrypoint: ["./run_migrations.sh"] - # crates: - # build: - # context: ./package_managers/crates - # dockerfile: Dockerfile - # environment: - # - CHAI_DATABASE_URL=postgresql://postgres:s3cr3t@db:5432/chai - # volumes: - # - ./core:/app/core - # - ./package_managers/crates:/app/crates - # depends_on: - # db: - # condition: service_healthy - # alembic: - # condition: service_completed_successfully + crates: + build: + context: . 
+ dockerfile: ./package_managers/crates/Dockerfile + environment: + - CHAI_DATABASE_URL=postgresql://postgres:s3cr3t@db:5432/chai + - PYTHONPATH=/ + depends_on: + db: + condition: service_healthy + alembic: + condition: service_completed_successfully - # homebrew: - # build: - # context: ./package_managers/homebrew - # dockerfile: Dockerfile - # environment: - # - CHAI_DATABASE_URL=postgresql://postgres:s3cr3t@db:5432/chai - # volumes: - # - ./core:/app/core - # - ./package_managers/homebrew:/app/homebrew - # depends_on: - # db: - # condition: service_healthy - # alembic: - # condition: service_completed_successfully + homebrew: + build: + context: ./package_managers/homebrew + dockerfile: Dockerfile + environment: + - CHAI_DATABASE_URL=postgresql://postgres:s3cr3t@db:5432/chai + volumes: + - ./core:/app/core + - ./package_managers/homebrew:/app/homebrew + depends_on: + db: + condition: service_healthy + alembic: + condition: service_completed_successfully - # monitor: - # build: - # context: ./docker/monitor - # dockerfile: Dockerfile - # environment: - # - DOCKER_HOST=${DOCKER_HOST:-unix:///var/run/docker.sock} - # volumes: - # - /var/run/docker.sock:/var/run/docker.sock - # depends_on: - # - crates - # - homebrew \ No newline at end of file + monitor: + build: + context: ./docker/monitor + dockerfile: Dockerfile + environment: + - DOCKER_HOST=${DOCKER_HOST:-unix:///var/run/docker.sock} + volumes: + - /var/run/docker.sock:/var/run/docker.sock + depends_on: + - crates + - homebrew \ No newline at end of file diff --git a/package_managers/crates/Dockerfile b/package_managers/crates/Dockerfile index b49d7fc..f599716 100644 --- a/package_managers/crates/Dockerfile +++ b/package_managers/crates/Dockerfile @@ -1,6 +1,5 @@ FROM python:3.11 -WORKDIR /app/crates -COPY requirements.txt . -RUN pip install --no-cache-dir -r requirements.txt COPY . . 
-CMD ["python", "main.py"] \ No newline at end of file +WORKDIR /package_managers/crates +RUN pip install --no-cache-dir -r requirements.txt +CMD ["python", "/package_managers/crates/main.py"] \ No newline at end of file diff --git a/package_managers/crates/main.py b/package_managers/crates/main.py index ece4555..438e25b 100644 --- a/package_managers/crates/main.py +++ b/package_managers/crates/main.py @@ -2,11 +2,11 @@ from dataclasses import dataclass -from package_managers.crates.structs import URLTypes, UserTypes from core.fetcher import TarballFetcher from core.logger import Logger from core.db import DB -from core.transformer import CratesTransformer +from package_managers.crates.structs import URLTypes, UserTypes +from package_managers.crates.transformer import CratesTransformer logger = Logger("crates_orchestrator") diff --git a/package_managers/crates/transformer.py b/package_managers/crates/transformer.py index 7d41375..edb9f16 100644 --- a/package_managers/crates/transformer.py +++ b/package_managers/crates/transformer.py @@ -1,8 +1,9 @@ import csv from typing import Dict, Generator -from core.utils import safe_int from package_managers.crates.structs import DependencyType, URLTypes, UserTypes + +from core.utils import safe_int from core.transformer import Transformer From 7585d51e7da2b2128e6405b9090507badfe07a33 Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Tue, 15 Oct 2024 15:48:38 -0500 Subject: [PATCH 06/41] crates works --- package_managers/crates/main.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/package_managers/crates/main.py b/package_managers/crates/main.py index 438e25b..1e9b9a7 100644 --- a/package_managers/crates/main.py +++ b/package_managers/crates/main.py @@ -79,22 +79,23 @@ def load(db: DB, transformer: CratesTransformer, config: Config) -> None: logger.log("✅ crates") -def main(db: DB) -> None: - config = initialize(db) - logger.debug(config) +def main(db: DB, config: Config) -> None: if config.fetch: fetch(config) transformer = CratesTransformer(config.url_types, config.user_types) load(db, transformer, config) - coda = """ - validate by running - `psql "postgresql://postgres:s3cr3t@localhost:5435/chai" \ - -c "SELECT * FROM load_history;"` - """ + coda = ( + "validate by running " + + '`psql "postgresql://postgres:s3cr3t@localhost:5435/chai" ' + + '-c "SELECT * FROM load_history;"`' + ) logger.log(coda) if __name__ == "__main__": - main() + db = DB() + config = initialize(db) + logger.debug(config) + main(db, config) From cbca4dfb4bed0615af20a9141f145057c35bad34 Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Tue, 15 Oct 2024 15:53:53 -0500 Subject: [PATCH 07/41] wait on homebrew --- docker-compose.yml | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 969334e..4ef7c3b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -40,20 +40,20 @@ services: alembic: condition: service_completed_successfully - homebrew: - build: - context: ./package_managers/homebrew - dockerfile: Dockerfile - environment: - - CHAI_DATABASE_URL=postgresql://postgres:s3cr3t@db:5432/chai - volumes: - - ./core:/app/core - - ./package_managers/homebrew:/app/homebrew - depends_on: - db: - condition: service_healthy - alembic: - condition: service_completed_successfully + # homebrew: + # build: + # context: ./package_managers/homebrew + # dockerfile: Dockerfile + # environment: + # - 
CHAI_DATABASE_URL=postgresql://postgres:s3cr3t@db:5432/chai + # volumes: + # - ./core:/app/core + # - ./package_managers/homebrew:/app/homebrew + # depends_on: + # db: + # condition: service_healthy + # alembic: + # condition: service_completed_successfully monitor: build: @@ -65,4 +65,4 @@ services: - /var/run/docker.sock:/var/run/docker.sock depends_on: - crates - - homebrew \ No newline at end of file + # - homebrew \ No newline at end of file From ca3d9475335147054b85da262b256837bcaccd0d Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Tue, 15 Oct 2024 15:58:17 -0500 Subject: [PATCH 08/41] monitor service is building --- docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index 4ef7c3b..e3624cd 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -57,7 +57,7 @@ services: monitor: build: - context: ./docker/monitor + context: ./monitor dockerfile: Dockerfile environment: - DOCKER_HOST=${DOCKER_HOST:-unix:///var/run/docker.sock} From d3f3b89a66434602c23b27c5ce4513150841fd36 Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Tue, 15 Oct 2024 16:11:16 -0500 Subject: [PATCH 09/41] env vars for crates --- docker-compose.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docker-compose.yml b/docker-compose.yml index e3624cd..7b84a9b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -34,6 +34,9 @@ services: environment: - CHAI_DATABASE_URL=postgresql://postgres:s3cr3t@db:5432/chai - PYTHONPATH=/ + - DEBUG=${DEBUG:-true} + - TEST=${TEST:-false} + - FETCH=${FETCH:-true} depends_on: db: condition: service_healthy From efceae2caf64965fab174d9126ae16fbcecd71b1 Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Tue, 15 Oct 2024 16:23:58 -0500 Subject: [PATCH 10/41] monitor service is running --- docker-compose.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 7b84a9b..98a7687 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -67,5 +67,7 @@ services: volumes: - /var/run/docker.sock:/var/run/docker.sock depends_on: - - crates - # - homebrew \ No newline at end of file + crates: + condition: service_started + # homebrew: + # condition: service_started From f0d283e8931362a4a14f59389109c824a52f3c41 Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Wed, 16 Oct 2024 17:14:42 -0500 Subject: [PATCH 11/41] delete some old stuff --- src/Dockerfile | 5 -- src/pipeline/crates.py | 122 ----------------------------------- src/pipeline/main.py | 39 ----------- src/pipeline/utils/logger.py | 55 ---------------- src/pipeline/utils/utils.py | 21 ------ src/requirements.txt | 34 ---------- 6 files changed, 276 deletions(-) delete mode 100644 src/Dockerfile delete mode 100644 src/pipeline/crates.py delete mode 100644 src/pipeline/main.py delete mode 100644 src/pipeline/utils/logger.py delete mode 100644 src/pipeline/utils/utils.py delete mode 100644 src/requirements.txt diff --git a/src/Dockerfile b/src/Dockerfile deleted file mode 100644 index 80413d0..0000000 --- a/src/Dockerfile +++ /dev/null @@ -1,5 +0,0 @@ -FROM python:3.11 -COPY . . 
-WORKDIR /src -RUN pip install --no-cache-dir -r requirements.txt -RUN chmod +x run_pipeline.sh diff --git a/src/pipeline/crates.py b/src/pipeline/crates.py deleted file mode 100644 index 73f1d18..0000000 --- a/src/pipeline/crates.py +++ /dev/null @@ -1,122 +0,0 @@ -from os import getenv - -from dataclasses import dataclass - -from src.pipeline.utils.crates.structures import URLTypes, UserTypes -from src.pipeline.utils.fetcher import TarballFetcher -from src.pipeline.utils.logger import Logger -from src.pipeline.utils.pg import DB -from src.pipeline.utils.crates.transformer import CratesTransformer - -logger = Logger("crates_orchestrator") - - -# TODO: global config class -@dataclass -class Config: - file_location: str - test: bool - fetch: bool - package_manager_id: str - url_types: URLTypes - user_types: UserTypes - - def __str__(self): - return f"Config(file_location={self.file_location}, test={self.test}, \ - fetch={self.fetch}, package_manager_id={self.package_manager_id}, \ - url_types={self.url_types}, user_types={self.user_types})" - - -def initialize(db: DB) -> Config: - file_location = "https://static.crates.io/db-dump.tar.gz" - test = getenv("TEST", "false").lower() == "true" - fetch = getenv("FETCH", "true").lower() == "true" - package_manager = db.select_package_manager_by_name("crates", create=True) - homepage_url = db.select_url_types_homepage(create=True) - repository_url = db.select_url_types_repository(create=True) - documentation_url = db.select_url_types_documentation(create=True) - crates_source = db.select_source_by_name("crates", create=True) - github_source = db.select_source_by_name("github", create=True) - url_types = URLTypes( - homepage=homepage_url.id, - repository=repository_url.id, - documentation=documentation_url.id, - ) - user_types = UserTypes(crates=crates_source.id, github=github_source.id) - - logger.debug("initialized config") - - return Config( - file_location=file_location, - test=test, - fetch=fetch, - package_manager_id=package_manager.id, - url_types=url_types, - user_types=user_types, - ) - - -def fetch(config: Config) -> None: - fetcher = TarballFetcher("crates", config.file_location) - files = fetcher.fetch() - fetcher.write(files) - - -def load(db: DB, transformer: CratesTransformer, config: Config) -> None: - logger.log("loading crates packages...this should take a minute") - db.insert_packages(transformer.packages(), config.package_manager_id, "crates") - logger.log("✅ inserted packages") - - logger.log("loading crates urls...this should take a minute") - db.insert_urls(transformer.urls()) - logger.log("✅ inserted urls") - - logger.log("loading crates package urls...this should take ~3 minutes") - db.insert_package_urls(transformer.package_urls()) - logger.log("✅ inserted package urls") - - logger.log("loading crates versions...this should take ~5 minutes") - db.insert_versions(transformer.versions()) - logger.log("✅ inserted versions") - - logger.log("loading crates users...this should take a minute") - db.insert_users(transformer.users(), config.user_types.crates) - logger.log("✅ inserted users") - - logger.log("loading crates user packages...this should take a few seconds") - db.insert_user_packages(transformer.user_packages()) - logger.log("✅ inserted user packages") - - if not config.test: - # these are bigger files, so we skip them in tests - logger.log("loading crates user versions...this should take ~5 minutes") - db.insert_user_versions(transformer.user_versions(), config.user_types.github) - logger.log("✅ inserted user versions") 
- - logger.log("loading crates dependencies...this should take ~1 hour") - db.insert_dependencies(transformer.dependencies()) - logger.log("✅ inserted dependencies") - - db.insert_load_history(config.package_manager_id) - logger.log("✅ crates") - - -def main(db: DB) -> None: - config = initialize(db) - logger.debug(config) - if config.fetch: - fetch(config) - - transformer = CratesTransformer(config.url_types, config.user_types) - load(db, transformer, config) - - coda = """ - validate by running - `psql "postgresql://postgres:s3cr3t@localhost:5435/chai" \ - -c "SELECT * FROM load_history;"` - """ - logger.log(coda) - - -if __name__ == "__main__": - main() diff --git a/src/pipeline/main.py b/src/pipeline/main.py deleted file mode 100644 index 211b1d9..0000000 --- a/src/pipeline/main.py +++ /dev/null @@ -1,39 +0,0 @@ -import sys -from os import getenv - -from src.pipeline.crates import main as crates_main -from src.pipeline.utils.logger import Logger -from src.pipeline.utils.pg import DB - -logger = Logger("main_pipeline") - - -def main(): - try: - if len(sys.argv) != 2: - raise ValueError("usage: python main.py ") - - if getenv("CHAI_DATABASE_URL") is None: - raise ValueError("CHAI_DATABASE_URL is not set") - - # initialize the db and handoff everywhere - db = DB() - package_manager = sys.argv[1] - - print(f"[main] Running pipeline for {package_manager}...") - - # run the pipeline for the specified package manager - if package_manager == "crates": - print("[main] Running crates pipeline...") - crates_main(db) - else: - raise ValueError("invalid package manager") - except Exception: - logger.exception() - sys.exit(1) - - sys.exit(0) - - -if __name__ == "__main__": - main() diff --git a/src/pipeline/utils/logger.py b/src/pipeline/utils/logger.py deleted file mode 100644 index 0c2cc88..0000000 --- a/src/pipeline/utils/logger.py +++ /dev/null @@ -1,55 +0,0 @@ -from os import getenv -import time -import sys -import traceback - -DEBUG = getenv("DEBUG", "false").lower() == "true" - -# use inspect to print the line of code as well? 
-# caller = inspect.currentframe().f_back -# filename = caller.f_code.co_filename, lineno = caller.f_lineno - - -def as_minutes(seconds: float) -> float: - return seconds / 60 - - -class Logger: - SILENT = 0 - NORMAL = 1 - VERBOSE = 2 - - def __init__(self, name: str, mode=NORMAL, start=time.time()) -> None: - self.name = name - self.start = start - self.mode = Logger.VERBOSE if DEBUG else mode - - def print(self, msg: str): - print(f"{self.time_diff():.2f}: [{self.name}]: {msg}", flush=True) - - def error(self, message): - self.print(f"[ERROR]: {message}") - - def log(self, message): - if self.mode >= Logger.NORMAL: - self.print(f"{message}") - - def debug(self, message): - if self.mode >= Logger.VERBOSE: - self.print(f"[DEBUG]: {message}") - - def warn(self, message): - if self.mode >= Logger.NORMAL: - self.print(f"[WARN]: {message}") - - def is_verbose(self): - return self.mode >= Logger.VERBOSE - - def time_diff(self): - return time.time() - self.start - - def exception(self): - exc_type, exc_value, exc_traceback = sys.exc_info() - self.print(f"{exc_type.__name__}: {exc_value}") - self.print("***** TRACEBACK *****") - print(f"{''.join(traceback.format_tb(exc_traceback))}") diff --git a/src/pipeline/utils/utils.py b/src/pipeline/utils/utils.py deleted file mode 100644 index 3de9bee..0000000 --- a/src/pipeline/utils/utils.py +++ /dev/null @@ -1,21 +0,0 @@ -from typing import List, Dict - - -def safe_int(val: str) -> int | None: - if val == "": - return None - return int(val) - - -# given some items and a cache, this returns a list of attributes that are not in the -# cache so that we can use them in a query -# attr has to be an attribute in the item -# item[attr] is a key in the cache -def build_query_params( - items: List[Dict[str, str]], cache: dict, attr: str -) -> List[str]: - params = set() - for item in items: - if item[attr] not in cache: - params.add(item[attr]) - return list(params) diff --git a/src/requirements.txt b/src/requirements.txt deleted file mode 100644 index 63fff01..0000000 --- a/src/requirements.txt +++ /dev/null @@ -1,34 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile pyproject.toml -o src/requirements.txt -alembic==1.13.2 - # via pipeline (pyproject.toml) -certifi==2024.8.30 - # via requests -charset-normalizer==3.3.2 - # via requests -idna==3.8 - # via requests -mako==1.3.5 - # via alembic -markupsafe==2.1.5 - # via mako -psycopg2==2.9.9 - # via pipeline (pyproject.toml) -pyyaml==6.0.2 - # via pipeline (pyproject.toml) -requests==2.32.3 - # via pipeline (pyproject.toml) -ruff==0.6.5 - # via pipeline (pyproject.toml) -schedule==1.2.0 - # via pipeline (pyproject.toml) -sqlalchemy==2.0.34 - # via - # pipeline (pyproject.toml) - # alembic -typing-extensions==4.12.2 - # via - # alembic - # sqlalchemy -urllib3==2.2.2 - # via requests From 1bc9a77cd66f1318b66cb9ae1680ee3975886762 Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Wed, 16 Oct 2024 17:14:56 -0500 Subject: [PATCH 12/41] the scheduler class --- core/scheduler.py | 39 ++++++++++++++++++++++++ docker-compose.yml | 1 + package_managers/crates/main.py | 24 +++++++++++++-- package_managers/crates/requirements.txt | 2 ++ 4 files changed, 63 insertions(+), 3 deletions(-) create mode 100644 core/scheduler.py diff --git a/core/scheduler.py b/core/scheduler.py new file mode 100644 index 0000000..c144a53 --- /dev/null +++ b/core/scheduler.py @@ -0,0 +1,39 @@ +from os import getenv +import schedule +import time +from threading import Thread +from typing import Callable 
+from core.logger import Logger + +FREQUENCY = int(getenv("FREQUENCY", 24)) + + +class Scheduler: + def __init__(self, name: str, frequency: int = FREQUENCY): + self.name = name + self.frequency = frequency + self.logger = Logger(f"{name}_scheduler") + self.job = None + self.is_running = False + + def start(self, task: Callable, *args): + self.job = schedule.every(self.frequency).hours.do(task, *args) + self.is_running = True + self.logger.log(f"scheduled {self.name} to run every {self.frequency} hours") + + def run_schedule(): + while self.is_running: + schedule.run_pending() + time.sleep(1) + + Thread(target=run_schedule, daemon=True).start() + + def stop(self): + if self.job: + schedule.cancel_job(self.job) + self.is_running = False + self.logger.log(f"stopped {self.name} scheduler") + + def run_now(self, task: Callable, *args): + self.logger.log(f"running {self.name} now") + task(*args) diff --git a/docker-compose.yml b/docker-compose.yml index 98a7687..70e134e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -37,6 +37,7 @@ services: - DEBUG=${DEBUG:-true} - TEST=${TEST:-false} - FETCH=${FETCH:-true} + - FREQUENCY=${FREQUENCY:-24} depends_on: db: condition: service_healthy diff --git a/package_managers/crates/main.py b/package_managers/crates/main.py index 1e9b9a7..5c7a291 100644 --- a/package_managers/crates/main.py +++ b/package_managers/crates/main.py @@ -1,3 +1,4 @@ +import time from os import getenv from dataclasses import dataclass @@ -5,6 +6,7 @@ from core.fetcher import TarballFetcher from core.logger import Logger from core.db import DB +from core.scheduler import Scheduler from package_managers.crates.structs import URLTypes, UserTypes from package_managers.crates.transformer import CratesTransformer @@ -79,7 +81,7 @@ def load(db: DB, transformer: CratesTransformer, config: Config) -> None: logger.log("✅ crates") -def main(db: DB, config: Config) -> None: +def run_pipeline(db: DB, config: Config) -> None: if config.fetch: fetch(config) @@ -94,8 +96,24 @@ def main(db: DB, config: Config) -> None: logger.log(coda) -if __name__ == "__main__": +def main(): db = DB() config = initialize(db) logger.debug(config) - main(db, config) + + scheduler = Scheduler("crates") + scheduler.start(run_pipeline, db, config) + + # run immediately + scheduler.run_now(run_pipeline, db, config) + + # keep the main thread alive so we can terminate the program with Ctrl+C + try: + while True: + time.sleep(1) + except KeyboardInterrupt: + scheduler.stop() + + +if __name__ == "__main__": + main() diff --git a/package_managers/crates/requirements.txt b/package_managers/crates/requirements.txt index 6b37d4f..7357d56 100644 --- a/package_managers/crates/requirements.txt +++ b/package_managers/crates/requirements.txt @@ -20,6 +20,8 @@ requests==2.32.3 # via pipeline (pyproject.toml) ruff==0.6.5 # via pipeline (pyproject.toml) +schedule==1.2.0 + # via pipeline (pyproject.toml) sqlalchemy==2.0.34 # via # pipeline (pyproject.toml) From 739eeeb8e18a181561bbc88cdad005da3fce1040 Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Wed, 16 Oct 2024 17:15:22 -0500 Subject: [PATCH 13/41] remove old scheduler --- package_managers/run_scheduler.py | 52 ------------------------------- 1 file changed, 52 deletions(-) delete mode 100644 package_managers/run_scheduler.py diff --git a/package_managers/run_scheduler.py b/package_managers/run_scheduler.py deleted file mode 100644 index 61f207d..0000000 --- a/package_managers/run_scheduler.py +++ /dev/null @@ -1,52 +0,0 @@ -import schedule -import time -import 
subprocess -import sys -import os - -PKG_MANAGER = os.getenv("PKG_MANAGER", "crates") -FREQUENCY = int(os.getenv("FREQUENCY", 24)) - - -def run_pipeline(): - # using Popen so we can continuously capture output - process = subprocess.Popen( - [sys.executable, "/src/pipeline/main.py", PKG_MANAGER], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - bufsize=1, - universal_newlines=True, - text=True, - ) - # this is hacky, but ensures we capture all output - while True: - output = process.stdout.readline() - if output == "" and process.poll() is not None: - break - if output: - print(output.strip()) - rc = process.poll() - if rc != 0: - print(process.stderr.read(), file=sys.stderr) - raise Exception(f"Pipeline failed with return code {rc}") - - -def main(): - # make sure we're in the correct directory - os.chdir("/src") - - # schedule - print(f"scheduling pipeline to run every {FREQUENCY} hours...") - schedule.every(FREQUENCY).hours.do(run_pipeline) - - # run now - run_pipeline() - - # keep running - while True: - schedule.run_pending() - time.sleep(1) - - -if __name__ == "__main__": - main() From eecee8573f2d72874be5847ff6ab06a36d8494d0 Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Thu, 17 Oct 2024 22:38:42 -0500 Subject: [PATCH 14/41] crates working with docker compose --- core/config.py | 83 ++++++++++++++++++++++++++ core/structs.py | 16 +++++ docker-compose.yml | 29 ++++----- package_managers/crates/main.py | 55 ++--------------- package_managers/crates/structs.py | 15 ----- package_managers/crates/transformer.py | 5 +- 6 files changed, 120 insertions(+), 83 deletions(-) create mode 100644 core/config.py create mode 100644 core/structs.py diff --git a/core/config.py b/core/config.py new file mode 100644 index 0000000..0928a8c --- /dev/null +++ b/core/config.py @@ -0,0 +1,83 @@ +from dataclasses import dataclass +from enum import Enum +from os import getenv + +from core.db import DB +from core.logger import Logger +from core.structs import URLTypes, UserTypes + +logger = Logger("config") + +TEST = getenv("TEST", "false").lower() == "true" +FETCH = getenv("FETCH", "true").lower() == "true" + + +class PackageManager(Enum): + CRATES = "crates" + HOMEBREW = "homebrew" + + +class Sources: + crates: str = "https://static.crates.io/db-dump.tar.gz" + homebrew: str = "https://github.com/Homebrew/homebrew-core/tree/master/Formula" + + +@dataclass +class Config: + file_location: str + test: bool + fetch: bool + package_manager_id: str + url_types: URLTypes + user_types: UserTypes + + def __str__(self): + return f"Config(file_location={self.file_location}, test={self.test}, \ + fetch={self.fetch}, package_manager_id={self.package_manager_id}, \ + url_types={self.url_types}, user_types={self.user_types})" + + +def load_url_types(db: DB) -> URLTypes: + logger.debug("loading url types, and creating if not exists") + homepage_url = db.select_url_types_homepage(create=True) + repository_url = db.select_url_types_repository(create=True) + documentation_url = db.select_url_types_documentation(create=True) + return URLTypes( + homepage=homepage_url.id, + repository=repository_url.id, + documentation=documentation_url.id, + ) + + +def load_user_types(db: DB) -> UserTypes: + logger.debug("loading user types, and creating if not exists") + crates_source = db.select_source_by_name("crates", create=True) + github_source = db.select_source_by_name("github", create=True) + return UserTypes( + crates=crates_source.id, + github=github_source.id, + ) + + +def initialize(package_manager: 
PackageManager, db: DB) -> Config: + url_types = load_url_types(db) + user_types = load_user_types(db) + + if package_manager == PackageManager.CRATES: + return Config( + file_location=Sources.crates, + test=False, + fetch=True, + package_manager_id=PackageManager.CRATES.value, + url_types=url_types, + user_types=user_types, + ) + elif package_manager == PackageManager.HOMEBREW: + return Config( + file_location=Sources.homebrew, + test=False, + fetch=True, + package_manager_id=PackageManager.HOMEBREW.value, + url_types=url_types, + user_types=user_types, + ) diff --git a/core/structs.py b/core/structs.py new file mode 100644 index 0000000..fcfb672 --- /dev/null +++ b/core/structs.py @@ -0,0 +1,16 @@ +from dataclasses import dataclass + +from sqlalchemy import UUID + + +@dataclass +class URLTypes: + homepage: UUID + repository: UUID + documentation: UUID + + +@dataclass +class UserTypes: + crates: UUID + github: UUID diff --git a/docker-compose.yml b/docker-compose.yml index 70e134e..13cd8c9 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -44,20 +44,21 @@ services: alembic: condition: service_completed_successfully - # homebrew: - # build: - # context: ./package_managers/homebrew - # dockerfile: Dockerfile - # environment: - # - CHAI_DATABASE_URL=postgresql://postgres:s3cr3t@db:5432/chai - # volumes: - # - ./core:/app/core - # - ./package_managers/homebrew:/app/homebrew - # depends_on: - # db: - # condition: service_healthy - # alembic: - # condition: service_completed_successfully + homebrew: + build: + context: . + dockerfile: ./package_managers/homebrew/Dockerfile + environment: + - CHAI_DATABASE_URL=postgresql://postgres:s3cr3t@db:5432/chai + - DEBUG=${DEBUG:-true} + - TEST=${TEST:-false} + - FETCH=${FETCH:-true} + - FREQUENCY=${FREQUENCY:-24} + depends_on: + db: + condition: service_healthy + alembic: + condition: service_completed_successfully monitor: build: diff --git a/package_managers/crates/main.py b/package_managers/crates/main.py index 5c7a291..e8a030c 100644 --- a/package_managers/crates/main.py +++ b/package_managers/crates/main.py @@ -1,61 +1,14 @@ import time -from os import getenv - -from dataclasses import dataclass +from core.config import Config, PackageManager, initialize +from core.db import DB from core.fetcher import TarballFetcher from core.logger import Logger -from core.db import DB from core.scheduler import Scheduler -from package_managers.crates.structs import URLTypes, UserTypes from package_managers.crates.transformer import CratesTransformer logger = Logger("crates_orchestrator") - - -# TODO: global config class -@dataclass -class Config: - file_location: str - test: bool - fetch: bool - package_manager_id: str - url_types: URLTypes - user_types: UserTypes - - def __str__(self): - return f"Config(file_location={self.file_location}, test={self.test}, \ - fetch={self.fetch}, package_manager_id={self.package_manager_id}, \ - url_types={self.url_types}, user_types={self.user_types})" - - -def initialize(db: DB) -> Config: - file_location = "https://static.crates.io/db-dump.tar.gz" - test = getenv("TEST", "false").lower() == "true" - fetch = getenv("FETCH", "true").lower() == "true" - package_manager = db.select_package_manager_by_name("crates", create=True) - homepage_url = db.select_url_types_homepage(create=True) - repository_url = db.select_url_types_repository(create=True) - documentation_url = db.select_url_types_documentation(create=True) - crates_source = db.select_source_by_name("crates", create=True) - github_source = 
db.select_source_by_name("github", create=True) - url_types = URLTypes( - homepage=homepage_url.id, - repository=repository_url.id, - documentation=documentation_url.id, - ) - user_types = UserTypes(crates=crates_source.id, github=github_source.id) - - logger.debug("initialized config") - - return Config( - file_location=file_location, - test=test, - fetch=fetch, - package_manager_id=package_manager.id, - url_types=url_types, - user_types=user_types, - ) +crates = PackageManager.CRATES def fetch(config: Config) -> None: @@ -98,7 +51,7 @@ def run_pipeline(db: DB, config: Config) -> None: def main(): db = DB() - config = initialize(db) + config = initialize(crates, db) logger.debug(config) scheduler = Scheduler("crates") diff --git a/package_managers/crates/structs.py b/package_managers/crates/structs.py index 06b2e6c..60f557e 100644 --- a/package_managers/crates/structs.py +++ b/package_managers/crates/structs.py @@ -1,6 +1,4 @@ from enum import IntEnum -from dataclasses import dataclass -from sqlalchemy import UUID class DependencyType(IntEnum): @@ -11,16 +9,3 @@ class DependencyType(IntEnum): def __str__(self): return self.name.lower() - - -@dataclass -class URLTypes: - homepage: UUID - repository: UUID - documentation: UUID - - -@dataclass -class UserTypes: - crates: UUID - github: UUID diff --git a/package_managers/crates/transformer.py b/package_managers/crates/transformer.py index edb9f16..229286c 100644 --- a/package_managers/crates/transformer.py +++ b/package_managers/crates/transformer.py @@ -1,10 +1,9 @@ import csv from typing import Dict, Generator -from package_managers.crates.structs import DependencyType, URLTypes, UserTypes - -from core.utils import safe_int from core.transformer import Transformer +from core.utils import safe_int +from package_managers.crates.structs import DependencyType, URLTypes, UserTypes # crates provides homepage and repository urls, so we'll initialize this transformer From f8bd30707893ab1100d74d0d49e14b321c522ad0 Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Thu, 17 Oct 2024 22:39:02 -0500 Subject: [PATCH 15/41] jq formulae for homebrew --- package_managers/homebrew/jq/dependencies.jq | 30 ++++++++++++++++++++ package_managers/homebrew/jq/packages.jq | 2 ++ package_managers/homebrew/jq/urls.jq | 3 ++ package_managers/homebrew/jq/versions.jq | 16 +++++++++++ 4 files changed, 51 insertions(+) create mode 100644 package_managers/homebrew/jq/dependencies.jq create mode 100644 package_managers/homebrew/jq/packages.jq create mode 100644 package_managers/homebrew/jq/urls.jq create mode 100644 package_managers/homebrew/jq/versions.jq diff --git a/package_managers/homebrew/jq/dependencies.jq b/package_managers/homebrew/jq/dependencies.jq new file mode 100644 index 0000000..9ec1e2b --- /dev/null +++ b/package_managers/homebrew/jq/dependencies.jq @@ -0,0 +1,30 @@ +# build_dependencies +# dependencies +# test_dependencies +# optional_dependencies +# uses_from_macos +# variations + +.[] | +.name as $name | +( + (.uses_from_macos // []) | + map({package: $name, dependency_type: "uses_from_macos", dependency: .}) +), +( + (.dependencies // []) | + map({package: $name, dependency_type: "dependency", dependency: .}) +), +( + (.test_dependencies // []) | + map({package: $name, dependency_type: "test_dependency", dependency: .}) +), +( + (.optional_dependencies // []) | + map({package: $name, dependency_type: "optional_dependency", dependency: .}) +), +( + (.build_dependencies // []) | + map({package: $name, dependency_type: "build_dependency", dependency: 
.}) +) +| .[] \ No newline at end of file diff --git a/package_managers/homebrew/jq/packages.jq b/package_managers/homebrew/jq/packages.jq new file mode 100644 index 0000000..3890512 --- /dev/null +++ b/package_managers/homebrew/jq/packages.jq @@ -0,0 +1,2 @@ +# we just need the name for the packages models +'[.[] | {name: .name, derived_id: ("homebrew/" + .name), import_id: .name, readme: null}]' \ No newline at end of file diff --git a/package_managers/homebrew/jq/urls.jq b/package_managers/homebrew/jq/urls.jq new file mode 100644 index 0000000..7aceac3 --- /dev/null +++ b/package_managers/homebrew/jq/urls.jq @@ -0,0 +1,3 @@ +# homepage is at the main key +# source is inside stable, and it's the tarball +[.[] | {homepage: .homepage, source: .urls.stable.url}] \ No newline at end of file diff --git a/package_managers/homebrew/jq/versions.jq b/package_managers/homebrew/jq/versions.jq new file mode 100644 index 0000000..62ee5d0 --- /dev/null +++ b/package_managers/homebrew/jq/versions.jq @@ -0,0 +1,16 @@ +# homebrew has the problem where there are no versions +# we're gonna assume the version available is the latest +# and we'll deal with that later + +# TODO: `downloads: .analytics.install_on_request."365d".[$name]` +# above gives us the downloads for the last 365 days +# not available in the full JSON API + +# TODO: there are also a problem of versioned formulae +.[] | +.name as $name | +{ + version: .versions.stable, + import_id: .name, + license: .license +} \ No newline at end of file From 977adf393598f429e166959aa32ca7e9bd5eb47d Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Thu, 17 Oct 2024 22:55:21 -0500 Subject: [PATCH 16/41] fix packages script --- package_managers/homebrew/jq/packages.jq | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package_managers/homebrew/jq/packages.jq b/package_managers/homebrew/jq/packages.jq index 3890512..b070648 100644 --- a/package_managers/homebrew/jq/packages.jq +++ b/package_managers/homebrew/jq/packages.jq @@ -1,2 +1,2 @@ # we just need the name for the packages models -'[.[] | {name: .name, derived_id: ("homebrew/" + .name), import_id: .name, readme: null}]' \ No newline at end of file +[.[] | {name: .name, derived_id: ("homebrew/" + .name), import_id: .name, readme: null}] \ No newline at end of file From c26d61759f0f87a3d45c4c0847c052b40b369d38 Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Thu, 17 Oct 2024 22:55:33 -0500 Subject: [PATCH 17/41] start of orchestrator for homebrew...that works! 
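Each of the jq formulae above flattens one slice of the Homebrew formula JSON into uniform records (packages, versions, urls, dependencies). For readers who follow Python more easily than jq, a rough equivalent of the packages.jq and versions.jq shapes looks like this; the two-formula sample payload is invented, whereas the real pipeline fetches the full document from https://formulae.brew.sh/api/formula.json:

import json

# tiny stand-in for the formula.json payload fetched in pipeline.sh
sample = json.loads("""
[
  {"name": "jq", "homepage": "https://example.org/jq", "versions": {"stable": "1.7.1"}, "license": "MIT"},
  {"name": "wget", "homepage": "https://example.org/wget", "versions": {"stable": "1.24.5"}, "license": "GPL-3.0-or-later"}
]
""")

# same shape as packages.jq: name, derived_id ("homebrew/<name>"), import_id, readme
packages = [
    {"name": f["name"], "derived_id": f"homebrew/{f['name']}", "import_id": f["name"], "readme": None}
    for f in sample
]

# same shape as versions.jq: the stable version keyed by the formula name
versions = [
    {"version": f["versions"]["stable"], "import_id": f["name"], "license": f.get("license")}
    for f in sample
]

print(packages[0], versions[0])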
--- package_managers/homebrew/pipeline.sh | 31 +++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100755 package_managers/homebrew/pipeline.sh diff --git a/package_managers/homebrew/pipeline.sh b/package_managers/homebrew/pipeline.sh new file mode 100755 index 0000000..1da181e --- /dev/null +++ b/package_managers/homebrew/pipeline.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +set -exu + +export SOURCE="https://formulae.brew.sh/api/formula.json" +export NOW=$(date -u +"%Y-%m-%dT%H:%M:%SZ") +export JQ_DIR="package_managers/homebrew/jq" +mkdir -p data/homebrew/$NOW + +# extract +curl -s $SOURCE > data/homebrew/$NOW/source.json + +# make a symlink called latest, pointing to $NOW +ln -sfn $NOW data/homebrew/latest + +# transform +for x in $JQ_DIR/*.jq; do + echo $x + filename=$(basename "$x" .jq) + pkgx jq -f $x data/homebrew/latest/source.json > data/homebrew/latest/${filename}.json + # | json2csv > data/homebrew/latest/${x%.jq}.csv +done + +# load + + + +# make it all csv +# jq -r '(map(keys) | add | unique) as $cols | map(. as $row | $cols | map($row[.])) as $rows | $cols, $rows[] | @csv' + +# psql to load raw csv files \ No newline at end of file From b9dfbaf95aa74702dd7ea73959482f775030d724 Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Fri, 18 Oct 2024 00:27:18 -0500 Subject: [PATCH 18/41] jq transforms --- package_managers/homebrew/jq/dependencies.jq | 4 ++-- package_managers/homebrew/jq/package_url.jq | 8 ++++++++ package_managers/homebrew/jq/urls.jq | 11 ++++++++++- package_managers/homebrew/jq/versions.jq | 4 ++-- 4 files changed, 22 insertions(+), 5 deletions(-) create mode 100644 package_managers/homebrew/jq/package_url.jq diff --git a/package_managers/homebrew/jq/dependencies.jq b/package_managers/homebrew/jq/dependencies.jq index 9ec1e2b..93b469f 100644 --- a/package_managers/homebrew/jq/dependencies.jq +++ b/package_managers/homebrew/jq/dependencies.jq @@ -5,7 +5,7 @@ # uses_from_macos # variations -.[] | +[.[] | .name as $name | ( (.uses_from_macos // []) | @@ -27,4 +27,4 @@ (.build_dependencies // []) | map({package: $name, dependency_type: "build_dependency", dependency: .}) ) -| .[] \ No newline at end of file +| .[]] \ No newline at end of file diff --git a/package_managers/homebrew/jq/package_url.jq b/package_managers/homebrew/jq/package_url.jq new file mode 100644 index 0000000..f1f767a --- /dev/null +++ b/package_managers/homebrew/jq/package_url.jq @@ -0,0 +1,8 @@ +[.[] | { + package_name: .name, + homepage_url: .homepage, + source_url: .urls.stable.url +} | [ + {package_name: .package_name, url: .homepage_url}, + {package_name: .package_name, url: .source_url} +] | .[]] diff --git a/package_managers/homebrew/jq/urls.jq b/package_managers/homebrew/jq/urls.jq index 7aceac3..c51c2fc 100644 --- a/package_managers/homebrew/jq/urls.jq +++ b/package_managers/homebrew/jq/urls.jq @@ -1,3 +1,12 @@ # homepage is at the main key # source is inside stable, and it's the tarball -[.[] | {homepage: .homepage, source: .urls.stable.url}] \ No newline at end of file +[.[] | { + homepage: .homepage, + source: .urls.stable.url +} | to_entries | map({ + name: .key, + url: .value +}) | .[] | { + url: .url, + url_type: .name +}] \ No newline at end of file diff --git a/package_managers/homebrew/jq/versions.jq b/package_managers/homebrew/jq/versions.jq index 62ee5d0..8d76b0d 100644 --- a/package_managers/homebrew/jq/versions.jq +++ b/package_managers/homebrew/jq/versions.jq @@ -7,10 +7,10 @@ # not available in the full JSON API # TODO: there are also a problem of versioned 
formulae -.[] | +[.[] | .name as $name | { version: .versions.stable, import_id: .name, license: .license -} \ No newline at end of file +}] \ No newline at end of file From 745dd2241d26f582f81e783d5ae7850c11908bdd Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Fri, 18 Oct 2024 01:21:43 -0500 Subject: [PATCH 19/41] volume mapping, docker compose --- docker-compose.yml | 8 +++++++- package_managers/homebrew/Dockerfile | 9 +++++++++ package_managers/homebrew/pipeline.sh | 17 +++++++---------- 3 files changed, 23 insertions(+), 11 deletions(-) create mode 100644 package_managers/homebrew/Dockerfile diff --git a/docker-compose.yml b/docker-compose.yml index 13cd8c9..2f261c3 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -50,10 +50,16 @@ services: dockerfile: ./package_managers/homebrew/Dockerfile environment: - CHAI_DATABASE_URL=postgresql://postgres:s3cr3t@db:5432/chai - - DEBUG=${DEBUG:-true} - TEST=${TEST:-false} - FETCH=${FETCH:-true} - FREQUENCY=${FREQUENCY:-24} + - SOURCE=https://formulae.brew.sh/api/formula.json + - JQ_DIR=/package_managers/homebrew/jq + - DATA_DIR=/data/homebrew + # TODO: can I reference DATA_DIR in the volumes key? + # https://stackoverflow.com/questions/29377853/how-can-i-use-environment-variables-in-docker-compose + volumes: + - ./data/homebrew:/data/homebrew depends_on: db: condition: service_healthy diff --git a/package_managers/homebrew/Dockerfile b/package_managers/homebrew/Dockerfile new file mode 100644 index 0000000..90dc1a3 --- /dev/null +++ b/package_managers/homebrew/Dockerfile @@ -0,0 +1,9 @@ +FROM ubuntu:24.10 +RUN apt-get update && \ + apt-get install -y jq curl && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* +COPY . . +WORKDIR /package_managers/homebrew +RUN chmod +x /package_managers/homebrew/pipeline.sh +CMD ["/package_managers/homebrew/pipeline.sh"] diff --git a/package_managers/homebrew/pipeline.sh b/package_managers/homebrew/pipeline.sh index 1da181e..f2e9e60 100755 --- a/package_managers/homebrew/pipeline.sh +++ b/package_managers/homebrew/pipeline.sh @@ -1,23 +1,20 @@ #!/bin/bash set -exu - -export SOURCE="https://formulae.brew.sh/api/formula.json" -export NOW=$(date -u +"%Y-%m-%dT%H:%M:%SZ") -export JQ_DIR="package_managers/homebrew/jq" -mkdir -p data/homebrew/$NOW +NOW=$(date -u +"%Y-%m-%dT%H:%M:%SZ") +mkdir -p "$DATA_DIR"/"$NOW" # extract -curl -s $SOURCE > data/homebrew/$NOW/source.json +curl -s "$SOURCE" > "$DATA_DIR"/"$NOW"/source.json # make a symlink called latest, pointing to $NOW -ln -sfn $NOW data/homebrew/latest +ln -sfn "$NOW" "$DATA_DIR"/latest # transform -for x in $JQ_DIR/*.jq; do - echo $x +echo "$JQ_DIR" +for x in "$JQ_DIR"/*.jq; do filename=$(basename "$x" .jq) - pkgx jq -f $x data/homebrew/latest/source.json > data/homebrew/latest/${filename}.json + jq -f "$x" "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".json # | json2csv > data/homebrew/latest/${x%.jq}.csv done From 55d9614ca081fd248b58138b1f598701478a9454 Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Fri, 18 Oct 2024 01:28:54 -0500 Subject: [PATCH 20/41] fix crates import --- package_managers/crates/transformer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/package_managers/crates/transformer.py b/package_managers/crates/transformer.py index 229286c..33097fb 100644 --- a/package_managers/crates/transformer.py +++ b/package_managers/crates/transformer.py @@ -1,9 +1,10 @@ import csv from typing import Dict, Generator +from core.structs import URLTypes, UserTypes from core.transformer import 
Transformer from core.utils import safe_int -from package_managers.crates.structs import DependencyType, URLTypes, UserTypes +from package_managers.crates.structs import DependencyType # crates provides homepage and repository urls, so we'll initialize this transformer From 2e1066fb0e20dfb6a8de4770cd754b22231748df Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Fri, 18 Oct 2024 01:48:39 -0500 Subject: [PATCH 21/41] lint on db.py --- core/db.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/core/db.py b/core/db.py index d3d52bd..44d7e1a 100644 --- a/core/db.py +++ b/core/db.py @@ -1,12 +1,15 @@ import os from typing import Any, Dict, Iterable, List, Type -from core.utils import build_query_params + from sqlalchemy import UUID, create_engine from sqlalchemy.dialects import postgresql from sqlalchemy.dialects.postgresql import insert from sqlalchemy.orm import sessionmaker from sqlalchemy.orm.decl_api import DeclarativeMeta + +from core.logger import Logger from core.models import ( + URL, DependsOn, License, LoadHistory, @@ -16,12 +19,11 @@ Source, URLType, User, - URL, UserPackage, UserVersion, Version, ) -from core.logger import Logger +from core.utils import build_query_params CHAI_DATABASE_URL = os.getenv("CHAI_DATABASE_URL") DEFAULT_BATCH_SIZE = 10000 From 7c103b642459acb79ba4f1d52861de18e1517f47 Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Fri, 18 Oct 2024 01:48:47 -0500 Subject: [PATCH 22/41] config for crates --- core/config.py | 44 ++++++++++++++++++++++++++++---------------- core/structs.py | 11 +++++++++++ 2 files changed, 39 insertions(+), 16 deletions(-) diff --git a/core/config.py b/core/config.py index 0928a8c..1c4354e 100644 --- a/core/config.py +++ b/core/config.py @@ -1,10 +1,9 @@ from dataclasses import dataclass -from enum import Enum from os import getenv from core.db import DB from core.logger import Logger -from core.structs import URLTypes, UserTypes +from core.structs import PackageManager, PackageManagerIDs, Sources, URLTypes, UserTypes logger = Logger("config") @@ -12,16 +11,6 @@ FETCH = getenv("FETCH", "true").lower() == "true" -class PackageManager(Enum): - CRATES = "crates" - HOMEBREW = "homebrew" - - -class Sources: - crates: str = "https://static.crates.io/db-dump.tar.gz" - homebrew: str = "https://github.com/Homebrew/homebrew-core/tree/master/Formula" - - @dataclass class Config: file_location: str @@ -59,25 +48,48 @@ def load_user_types(db: DB) -> UserTypes: ) +def load_package_manager_ids(db: DB) -> PackageManagerIDs: + logger.debug("loading package manager ids, and creating if not exists") + crates_package_manager = db.select_package_manager_by_name("crates", create=True) + homebrew_package_manager = db.select_package_manager_by_name( + "homebrew", create=True + ) + return { + PackageManager.CRATES: crates_package_manager.id, + PackageManager.HOMEBREW: homebrew_package_manager.id, + } + + +def load_sources() -> Sources: + return { + PackageManager.CRATES: "https://static.crates.io/db-dump.tar.gz", + PackageManager.HOMEBREW: ( + "https://github.com/Homebrew/homebrew-core/tree/master/Formula" + ), + } + + def initialize(package_manager: PackageManager, db: DB) -> Config: url_types = load_url_types(db) user_types = load_user_types(db) + package_manager_ids = load_package_manager_ids(db) + sources = load_sources() if package_manager == PackageManager.CRATES: return Config( - file_location=Sources.crates, + file_location=sources[PackageManager.CRATES], test=False, fetch=True, - 
package_manager_id=PackageManager.CRATES.value, + package_manager_id=package_manager_ids[PackageManager.CRATES], url_types=url_types, user_types=user_types, ) elif package_manager == PackageManager.HOMEBREW: return Config( - file_location=Sources.homebrew, + file_location=sources[PackageManager.HOMEBREW], test=False, fetch=True, - package_manager_id=PackageManager.HOMEBREW.value, + package_manager_id=package_manager_ids[PackageManager.HOMEBREW], url_types=url_types, user_types=user_types, ) diff --git a/core/structs.py b/core/structs.py index fcfb672..e7d72b7 100644 --- a/core/structs.py +++ b/core/structs.py @@ -1,8 +1,19 @@ from dataclasses import dataclass +from enum import Enum +from typing import Dict from sqlalchemy import UUID +class PackageManager(Enum): + CRATES = "crates" + HOMEBREW = "homebrew" + + +PackageManagerIDs = Dict[PackageManager, UUID] +Sources = Dict[PackageManager, str] + + @dataclass class URLTypes: homepage: UUID From c9d8fd771fd0fe21609004cacc769eb9cd685a28 Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Fri, 18 Oct 2024 01:55:29 -0500 Subject: [PATCH 23/41] Dockerfile? --- package_managers/homebrew/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/package_managers/homebrew/Dockerfile b/package_managers/homebrew/Dockerfile index 90dc1a3..8a33889 100644 --- a/package_managers/homebrew/Dockerfile +++ b/package_managers/homebrew/Dockerfile @@ -7,3 +7,4 @@ COPY . . WORKDIR /package_managers/homebrew RUN chmod +x /package_managers/homebrew/pipeline.sh CMD ["/package_managers/homebrew/pipeline.sh"] + From 8b5fcdd152145285c4dbdfa1ddc410df10a7dbc5 Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Fri, 18 Oct 2024 02:07:29 -0500 Subject: [PATCH 24/41] docker compose yml todo --- docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index 2f261c3..3d16a9c 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -56,7 +56,7 @@ services: - SOURCE=https://formulae.brew.sh/api/formula.json - JQ_DIR=/package_managers/homebrew/jq - DATA_DIR=/data/homebrew - # TODO: can I reference DATA_DIR in the volumes key? + # TODO: can I reference DATA_DIR in the volumes key? do I need it? # https://stackoverflow.com/questions/29377853/how-can-i-use-environment-variables-in-docker-compose volumes: - ./data/homebrew:/data/homebrew From 1ff756b5f8d5258133f8708c85883c393cccaf89 Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Fri, 18 Oct 2024 02:07:38 -0500 Subject: [PATCH 25/41] we've got csvs! --- package_managers/homebrew/jq/dependencies.jq | 11 +++++++++-- package_managers/homebrew/pipeline.sh | 14 +++++++++++--- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/package_managers/homebrew/jq/dependencies.jq b/package_managers/homebrew/jq/dependencies.jq index 93b469f..9e7181a 100644 --- a/package_managers/homebrew/jq/dependencies.jq +++ b/package_managers/homebrew/jq/dependencies.jq @@ -3,13 +3,20 @@ # test_dependencies # optional_dependencies # uses_from_macos -# variations +# TODO: variations (linux only, by architecture) +# all of the above are the fields that contain dependency info for Homebrew + +# uses from macos sometimes specifies build / test -- right now logging that as macos only [.[] | .name as $name | ( (.uses_from_macos // []) | - map({package: $name, dependency_type: "uses_from_macos", dependency: .}) + map({ + package: $name, + dependency_type: "uses_from_macos", + dependency: (if type == "object" then keys[0] else . 
end) + }) ), ( (.dependencies // []) | diff --git a/package_managers/homebrew/pipeline.sh b/package_managers/homebrew/pipeline.sh index f2e9e60..445eee2 100755 --- a/package_managers/homebrew/pipeline.sh +++ b/package_managers/homebrew/pipeline.sh @@ -14,8 +14,15 @@ ln -sfn "$NOW" "$DATA_DIR"/latest echo "$JQ_DIR" for x in "$JQ_DIR"/*.jq; do filename=$(basename "$x" .jq) - jq -f "$x" "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".json - # | json2csv > data/homebrew/latest/${x%.jq}.csv + # first jq line uses the formulas defined in the jq folder to get the fields we need + # second jq line transforms the json into csv + jq -f "$x" "$DATA_DIR"/latest/source.json \ + | jq -r ' + (map(keys) | add | unique) as $cols | + map(. as $row | $cols | map($row[.])) as $rows | + $cols, $rows[] | @csv + ' \ + > "$DATA_DIR"/latest/"${filename}".csv done # load @@ -23,6 +30,7 @@ done # make it all csv -# jq -r '(map(keys) | add | unique) as $cols | map(. as $row | $cols | map($row[.])) as $rows | $cols, $rows[] | @csv' +# +# > "$DATA_DIR"/latest/"${filename}".json # psql to load raw csv files \ No newline at end of file From 51c5b060d806fa0f50bbd27b1cd9555fa4974d9b Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Fri, 18 Oct 2024 02:49:23 -0500 Subject: [PATCH 26/41] everything but load --- docker-compose.yml | 1 + package_managers/homebrew/Dockerfile | 2 +- package_managers/homebrew/pipeline.sh | 61 +++++++++++-------- package_managers/homebrew/sed/package_url.sed | 4 ++ package_managers/homebrew/sed/packages.sed | 2 + package_managers/homebrew/sed/urls.sed | 2 + 6 files changed, 44 insertions(+), 28 deletions(-) create mode 100644 package_managers/homebrew/sed/package_url.sed create mode 100644 package_managers/homebrew/sed/packages.sed create mode 100644 package_managers/homebrew/sed/urls.sed diff --git a/docker-compose.yml b/docker-compose.yml index 3d16a9c..459e674 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -55,6 +55,7 @@ services: - FREQUENCY=${FREQUENCY:-24} - SOURCE=https://formulae.brew.sh/api/formula.json - JQ_DIR=/package_managers/homebrew/jq + - SED_DIR=/package_managers/homebrew/sed - DATA_DIR=/data/homebrew # TODO: can I reference DATA_DIR in the volumes key? do I need it? # https://stackoverflow.com/questions/29377853/how-can-i-use-environment-variables-in-docker-compose diff --git a/package_managers/homebrew/Dockerfile b/package_managers/homebrew/Dockerfile index 8a33889..9551376 100644 --- a/package_managers/homebrew/Dockerfile +++ b/package_managers/homebrew/Dockerfile @@ -1,6 +1,6 @@ FROM ubuntu:24.10 RUN apt-get update && \ - apt-get install -y jq curl && \ + apt-get install -y jq curl postgresql-client && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* COPY . . diff --git a/package_managers/homebrew/pipeline.sh b/package_managers/homebrew/pipeline.sh index 445eee2..79e6b9d 100755 --- a/package_managers/homebrew/pipeline.sh +++ b/package_managers/homebrew/pipeline.sh @@ -5,32 +5,39 @@ NOW=$(date -u +"%Y-%m-%dT%H:%M:%SZ") mkdir -p "$DATA_DIR"/"$NOW" # extract -curl -s "$SOURCE" > "$DATA_DIR"/"$NOW"/source.json - -# make a symlink called latest, pointing to $NOW -ln -sfn "$NOW" "$DATA_DIR"/latest - -# transform -echo "$JQ_DIR" -for x in "$JQ_DIR"/*.jq; do - filename=$(basename "$x" .jq) - # first jq line uses the formulas defined in the jq folder to get the fields we need - # second jq line transforms the json into csv - jq -f "$x" "$DATA_DIR"/latest/source.json \ - | jq -r ' - (map(keys) | add | unique) as $cols | - map(. 
as $row | $cols | map($row[.])) as $rows | - $cols, $rows[] | @csv - ' \ - > "$DATA_DIR"/latest/"${filename}".csv -done +if [ "$FETCH" = true ]; then + curl -s "$SOURCE" > "$DATA_DIR"/"$NOW"/source.json + + # make a symlink called latest, pointing to $NOW + ln -sfn "$NOW" "$DATA_DIR"/latest + + # transform + echo "$JQ_DIR" + for x in "$JQ_DIR"/*.jq; do + filename=$(basename "$x" .jq) + # first jq line uses the formulas defined in the jq folder to get the fields we need + # second jq line transforms the json into csv + jq -f "$x" "$DATA_DIR"/latest/source.json \ + | jq -r ' + (map(keys) | add | unique) as $cols | + map(. as $row | $cols | map($row[.])) as $rows | + $cols, $rows[] | @csv + ' \ + > "$DATA_DIR"/latest/"${filename}".csv + done +fi # load - - - -# make it all csv -# -# > "$DATA_DIR"/latest/"${filename}".json - -# psql to load raw csv files \ No newline at end of file +# TODO: put in a sed folder +# sed -f "$SED_DIR/packages.sed" "$DATA_DIR/latest/packages.csv" > "$DATA_DIR/latest/package_inserts.sql" +sed '1d;s/"\([^"]*\)","\([^"]*\)","\([^"]*\)",*/INSERT INTO packages (derived_id, import_id, name) VALUES ('\''\1'\'', '\''\2'\'', '\''\3'\'');/' "$DATA_DIR"/latest/packages.csv > "$DATA_DIR"/latest/package_inserts.sql +psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/package_inserts.sql + +# sed -f "$SED_DIR"/urls.sed "$DATA_DIR"/latest/urls.csv \ +# > "$DATA_DIR"/latest/url_inserts.sql +# psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/url_inserts.sql + +# loading package_urls is a bit more complicated, because we need to do it in batches +# and we need to get the ids from the package and url tables +# sed -f "$SED_DIR"/package_url.sed "$DATA_DIR"/latest/package_url.csv > "$DATA_DIR"/latest/package_url_inserts.sql +# psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/package_url_inserts.sql diff --git a/package_managers/homebrew/sed/package_url.sed b/package_managers/homebrew/sed/package_url.sed new file mode 100644 index 0000000..2811c32 --- /dev/null +++ b/package_managers/homebrew/sed/package_url.sed @@ -0,0 +1,4 @@ +sed -E ' + 1d; + s/^"?([^,"]*),"?([^,"]*)"?$/INSERT INTO package_url (package_id, url_id) SELECT (SELECT id FROM package WHERE name = '\''\\1'\''), (SELECT id FROM url WHERE url = '\''\\2'\'');/g +' package_url.csv > package_url_inserts.sql \ No newline at end of file diff --git a/package_managers/homebrew/sed/packages.sed b/package_managers/homebrew/sed/packages.sed new file mode 100644 index 0000000..be2c29e --- /dev/null +++ b/package_managers/homebrew/sed/packages.sed @@ -0,0 +1,2 @@ +1d +'s/"\([^"]*\)","\([^"]*\)","\([^"]*\)",*/INSERT INTO package (derived_id, import_id, name, readme) VALUES ("\1", "\2", "\3", NULL);/' \ No newline at end of file diff --git a/package_managers/homebrew/sed/urls.sed b/package_managers/homebrew/sed/urls.sed new file mode 100644 index 0000000..3517258 --- /dev/null +++ b/package_managers/homebrew/sed/urls.sed @@ -0,0 +1,2 @@ +1d +s/^"?([^,"]*),"?([^,"]*)"?$/INSERT INTO url (url) VALUES ('\''\\1'\'');/g \ No newline at end of file From 705949a7c3adec0067b46e249345b61fea9a120f Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Fri, 18 Oct 2024 07:59:12 -0500 Subject: [PATCH 27/41] cleanup --- package_managers/homebrew/pipeline.sh | 4 ++-- package_managers/homebrew/sed/packages.sed | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/package_managers/homebrew/pipeline.sh b/package_managers/homebrew/pipeline.sh index 79e6b9d..3b770b0 100755 --- a/package_managers/homebrew/pipeline.sh +++ 
b/package_managers/homebrew/pipeline.sh @@ -4,8 +4,8 @@ set -exu NOW=$(date -u +"%Y-%m-%dT%H:%M:%SZ") mkdir -p "$DATA_DIR"/"$NOW" -# extract if [ "$FETCH" = true ]; then + # extract curl -s "$SOURCE" > "$DATA_DIR"/"$NOW"/source.json # make a symlink called latest, pointing to $NOW @@ -16,7 +16,7 @@ if [ "$FETCH" = true ]; then for x in "$JQ_DIR"/*.jq; do filename=$(basename "$x" .jq) # first jq line uses the formulas defined in the jq folder to get the fields we need - # second jq line transforms the json into csv + # second jq line transforms the json into csv so we can use sed to clean it up jq -f "$x" "$DATA_DIR"/latest/source.json \ | jq -r ' (map(keys) | add | unique) as $cols | diff --git a/package_managers/homebrew/sed/packages.sed b/package_managers/homebrew/sed/packages.sed index be2c29e..db7b364 100644 --- a/package_managers/homebrew/sed/packages.sed +++ b/package_managers/homebrew/sed/packages.sed @@ -1,2 +1,2 @@ 1d -'s/"\([^"]*\)","\([^"]*\)","\([^"]*\)",*/INSERT INTO package (derived_id, import_id, name, readme) VALUES ("\1", "\2", "\3", NULL);/' \ No newline at end of file +'s/"\([^"]*\)","\([^"]*\)","\([^"]*\)",*/INSERT INTO packages (derived_id, import_id, name) VALUES ("\1", "\2", "\3");/' From 45bb38756068ee52a5783dd9969b0994770522cc Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Fri, 18 Oct 2024 08:49:33 -0500 Subject: [PATCH 28/41] inserting packages works! --- package_managers/homebrew/homebrew_id.sql | 9 +++++++++ package_managers/homebrew/pipeline.sh | 12 +++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) create mode 100644 package_managers/homebrew/homebrew_id.sql diff --git a/package_managers/homebrew/homebrew_id.sql b/package_managers/homebrew/homebrew_id.sql new file mode 100644 index 0000000..578c43b --- /dev/null +++ b/package_managers/homebrew/homebrew_id.sql @@ -0,0 +1,9 @@ +WITH source_id AS ( + INSERT INTO sources ("type") + VALUES ('homebrew') + ON CONFLICT ("type") DO UPDATE SET "type" = EXCLUDED."type" + RETURNING id +) +SELECT id +FROM package_managers +WHERE source_id = (SELECT id FROM source_id); \ No newline at end of file diff --git a/package_managers/homebrew/pipeline.sh b/package_managers/homebrew/pipeline.sh index 3b770b0..8afa507 100755 --- a/package_managers/homebrew/pipeline.sh +++ b/package_managers/homebrew/pipeline.sh @@ -4,6 +4,12 @@ set -exu NOW=$(date -u +"%Y-%m-%dT%H:%M:%SZ") mkdir -p "$DATA_DIR"/"$NOW" +# get the ID for Homebrew from our database +HOMEBREW_ID=$(psql "$CHAI_DATABASE_URL" -f homebrew_id.sql -v "ON_ERROR_STOP=1" -tA) + +# if you've already pulled the Homebrew data, you can `export FETCH=false` to skip the +# download, and just work off the latest symlink +# Note that this only works if the volumes are mounted if [ "$FETCH" = true ]; then # extract curl -s "$SOURCE" > "$DATA_DIR"/"$NOW"/source.json @@ -15,8 +21,8 @@ if [ "$FETCH" = true ]; then echo "$JQ_DIR" for x in "$JQ_DIR"/*.jq; do filename=$(basename "$x" .jq) - # first jq line uses the formulas defined in the jq folder to get the fields we need - # second jq line transforms the json into csv so we can use sed to clean it up + # first jq line uses the formulas defined in the jq folder for each data model + # second jq line transforms the json into csv so we can use sed to prep psql stmts jq -f "$x" "$DATA_DIR"/latest/source.json \ | jq -r ' (map(keys) | add | unique) as $cols | @@ -30,7 +36,7 @@ fi # load # TODO: put in a sed folder # sed -f "$SED_DIR/packages.sed" "$DATA_DIR/latest/packages.csv" > "$DATA_DIR/latest/package_inserts.sql" -sed 
'1d;s/"\([^"]*\)","\([^"]*\)","\([^"]*\)",*/INSERT INTO packages (derived_id, import_id, name) VALUES ('\''\1'\'', '\''\2'\'', '\''\3'\'');/' "$DATA_DIR"/latest/packages.csv > "$DATA_DIR"/latest/package_inserts.sql +sed '1d;s/"\([^"]*\)","\([^"]*\)","\([^"]*\)",*/INSERT INTO packages (derived_id, import_id, name, package_manager_id) VALUES ('\''\1'\'', '\''\2'\'', '\''\3'\'', '\'''"$HOMEBREW_ID"''\'');/' "$DATA_DIR"/latest/packages.csv > "$DATA_DIR"/latest/package_inserts.sql psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/package_inserts.sql # sed -f "$SED_DIR"/urls.sed "$DATA_DIR"/latest/urls.csv \ From e8630b22f161278c0e14d52691add5dd415ed5dc Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Fri, 18 Oct 2024 10:23:24 -0500 Subject: [PATCH 29/41] all homebrew except dependencies --- .../homebrew/create_url_types.sql | 13 +++++ package_managers/homebrew/pipeline.sh | 47 ++++++++++++++----- .../homebrew/sed/dependencies.sed | 2 + package_managers/homebrew/sed/package_url.sed | 6 +-- package_managers/homebrew/sed/packages.sed | 2 +- package_managers/homebrew/sed/urls.sed | 4 +- package_managers/homebrew/sed/versions.sed | 2 + 7 files changed, 58 insertions(+), 18 deletions(-) create mode 100644 package_managers/homebrew/create_url_types.sql create mode 100644 package_managers/homebrew/sed/dependencies.sed create mode 100644 package_managers/homebrew/sed/versions.sed diff --git a/package_managers/homebrew/create_url_types.sql b/package_managers/homebrew/create_url_types.sql new file mode 100644 index 0000000..44a15c0 --- /dev/null +++ b/package_managers/homebrew/create_url_types.sql @@ -0,0 +1,13 @@ +WITH inserted_rows AS ( + INSERT INTO url_types (name) + VALUES ('source'), ('homepage') + ON CONFLICT (name) DO NOTHING + RETURNING id, name +) +SELECT id, name +FROM inserted_rows +UNION ALL +SELECT id, name +FROM url_types +WHERE name IN ('source', 'homepage') + AND name NOT IN (SELECT name FROM inserted_rows); \ No newline at end of file diff --git a/package_managers/homebrew/pipeline.sh b/package_managers/homebrew/pipeline.sh index 8afa507..c92a679 100755 --- a/package_managers/homebrew/pipeline.sh +++ b/package_managers/homebrew/pipeline.sh @@ -1,16 +1,20 @@ #!/bin/bash set -exu -NOW=$(date -u +"%Y-%m-%dT%H:%M:%SZ") -mkdir -p "$DATA_DIR"/"$NOW" # get the ID for Homebrew from our database HOMEBREW_ID=$(psql "$CHAI_DATABASE_URL" -f homebrew_id.sql -v "ON_ERROR_STOP=1" -tA) +# homebrew provides `source` and `homepage` url types - let's create them ahead of time +psql "$CHAI_DATABASE_URL" -f create_url_types.sql + # if you've already pulled the Homebrew data, you can `export FETCH=false` to skip the # download, and just work off the latest symlink # Note that this only works if the volumes are mounted if [ "$FETCH" = true ]; then + NOW=$(date -u +"%Y-%m-%dT%H:%M:%SZ") + mkdir -p "$DATA_DIR"/"$NOW" + # extract curl -s "$SOURCE" > "$DATA_DIR"/"$NOW"/source.json @@ -34,16 +38,35 @@ if [ "$FETCH" = true ]; then fi # load -# TODO: put in a sed folder -# sed -f "$SED_DIR/packages.sed" "$DATA_DIR/latest/packages.csv" > "$DATA_DIR/latest/package_inserts.sql" -sed '1d;s/"\([^"]*\)","\([^"]*\)","\([^"]*\)",*/INSERT INTO packages (derived_id, import_id, name, package_manager_id) VALUES ('\''\1'\'', '\''\2'\'', '\''\3'\'', '\'''"$HOMEBREW_ID"''\'');/' "$DATA_DIR"/latest/packages.csv > "$DATA_DIR"/latest/package_inserts.sql +# TODO: loop? 
+ +# packages +# pass HOMEBREW_ID to sed to replace the @@HOMEBREW_ID@@ placeholder +sed \ + -f "$SED_DIR/packages.sed" "$DATA_DIR/latest/packages.csv" | \ + sed "s/@@HOMEBREW_ID@@/$HOMEBREW_ID/" \ + > "$DATA_DIR/latest/package_inserts.sql" psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/package_inserts.sql -# sed -f "$SED_DIR"/urls.sed "$DATA_DIR"/latest/urls.csv \ -# > "$DATA_DIR"/latest/url_inserts.sql -# psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/url_inserts.sql +# urls +sed \ + -f "$SED_DIR/urls.sed" "$DATA_DIR/latest/urls.csv" \ + > "$DATA_DIR/latest/url_inserts.sql" +psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/url_inserts.sql + +# versions +# TODO: licenses (license id is annoying) +# TODO: some random parsing errors happening in versions.csv +sed \ + -f "$SED_DIR/versions.sed" "$DATA_DIR/latest/versions.csv" \ + > "$DATA_DIR/latest/version_inserts.sql" +psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/version_inserts.sql + +# package_urls +# TODO: ERROR: more than one row returned by a subquery used as an expression +sed \ + -f "$SED_DIR/package_url.sed" "$DATA_DIR/latest/package_url.csv" \ + > "$DATA_DIR/latest/package_url_inserts.sql" +psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/package_url_inserts.sql -# loading package_urls is a bit more complicated, because we need to do it in batches -# and we need to get the ids from the package and url tables -# sed -f "$SED_DIR"/package_url.sed "$DATA_DIR"/latest/package_url.csv > "$DATA_DIR"/latest/package_url_inserts.sql -# psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/package_url_inserts.sql +# TODO: dependencies -> dependency_type is annoying \ No newline at end of file diff --git a/package_managers/homebrew/sed/dependencies.sed b/package_managers/homebrew/sed/dependencies.sed new file mode 100644 index 0000000..3a4e234 --- /dev/null +++ b/package_managers/homebrew/sed/dependencies.sed @@ -0,0 +1,2 @@ +1d +s%"\([^"]*\)","\([^"]*\)",*%INSERT INTO dependencies (version_id, dependency_id, dependency_type_id) SELECT (SELECT id FROM versions WHERE import_id = '\3'), (SELECT id FROM packages WHERE derived_id = 'homebrew/\1'), (SELECT id FROM dependency_types WHERE name = '\2');% diff --git a/package_managers/homebrew/sed/package_url.sed b/package_managers/homebrew/sed/package_url.sed index 2811c32..ecc6f33 100644 --- a/package_managers/homebrew/sed/package_url.sed +++ b/package_managers/homebrew/sed/package_url.sed @@ -1,4 +1,2 @@ -sed -E ' - 1d; - s/^"?([^,"]*),"?([^,"]*)"?$/INSERT INTO package_url (package_id, url_id) SELECT (SELECT id FROM package WHERE name = '\''\\1'\''), (SELECT id FROM url WHERE url = '\''\\2'\'');/g -' package_url.csv > package_url_inserts.sql \ No newline at end of file +1d +s%"\([^"]*\)","\([^"]*\)",*%INSERT INTO package_urls (package_id, url_id) SELECT (SELECT id FROM packages WHERE derived_id = 'homebrew/\1'), (SELECT id FROM urls WHERE url = '\2') ON CONFLICT ("package_id", "url_id") DO NOTHING;% diff --git a/package_managers/homebrew/sed/packages.sed b/package_managers/homebrew/sed/packages.sed index db7b364..6373b11 100644 --- a/package_managers/homebrew/sed/packages.sed +++ b/package_managers/homebrew/sed/packages.sed @@ -1,2 +1,2 @@ 1d -'s/"\([^"]*\)","\([^"]*\)","\([^"]*\)",*/INSERT INTO packages (derived_id, import_id, name) VALUES ("\1", "\2", "\3");/' +s/"\([^"]*\)","\([^"]*\)","\([^"]*\)",*/INSERT INTO packages (derived_id, import_id, name, package_manager_id) VALUES ('\1', '\2', '\3', '@@HOMEBREW_ID@@') ON CONFLICT ("derived_id") DO NOTHING;/ \ No newline at end of file diff --git 
a/package_managers/homebrew/sed/urls.sed b/package_managers/homebrew/sed/urls.sed index 3517258..16f7639 100644 --- a/package_managers/homebrew/sed/urls.sed +++ b/package_managers/homebrew/sed/urls.sed @@ -1,2 +1,4 @@ + + 1d -s/^"?([^,"]*),"?([^,"]*)"?$/INSERT INTO url (url) VALUES ('\''\\1'\'');/g \ No newline at end of file +s/"\([^"]*\)","\([^"]*\)",*/INSERT INTO urls (url, url_type_id) VALUES ('\1', (SELECT id FROM url_types WHERE "name" = '\2')) ON CONFLICT ("url", "url_type_id") DO NOTHING;/ \ No newline at end of file diff --git a/package_managers/homebrew/sed/versions.sed b/package_managers/homebrew/sed/versions.sed new file mode 100644 index 0000000..d3453b7 --- /dev/null +++ b/package_managers/homebrew/sed/versions.sed @@ -0,0 +1,2 @@ +1d +s%"\([^"]*\)","\([^"]*\)","\([^"]*\)",*%INSERT INTO versions (import_id, package_id, "version") VALUES ('\1', (SELECT id FROM packages WHERE derived_id = 'homebrew/\1'), '\3') ON CONFLICT ("package_id", "version") DO NOTHING;% From de82b367e5ce30be3ff274979b45f5ab9d9f9695 Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Fri, 18 Oct 2024 10:56:04 -0500 Subject: [PATCH 30/41] correctly create homebrew package manager row --- package_managers/homebrew/homebrew_id.sql | 9 ++++++--- package_managers/homebrew/pipeline.sh | 6 ++++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/package_managers/homebrew/homebrew_id.sql b/package_managers/homebrew/homebrew_id.sql index 578c43b..b2ec9f1 100644 --- a/package_managers/homebrew/homebrew_id.sql +++ b/package_managers/homebrew/homebrew_id.sql @@ -1,9 +1,12 @@ -WITH source_id AS ( +WITH homebrew_source_id AS ( INSERT INTO sources ("type") VALUES ('homebrew') ON CONFLICT ("type") DO UPDATE SET "type" = EXCLUDED."type" RETURNING id +), package_manager_id AS ( + INSERT INTO package_managers (source_id) + VALUES ((SELECT id FROM homebrew_source_id)) + RETURNING id ) SELECT id -FROM package_managers -WHERE source_id = (SELECT id FROM source_id); \ No newline at end of file +FROM package_manager_id; \ No newline at end of file diff --git a/package_managers/homebrew/pipeline.sh b/package_managers/homebrew/pipeline.sh index c92a679..d11db5e 100755 --- a/package_managers/homebrew/pipeline.sh +++ b/package_managers/homebrew/pipeline.sh @@ -5,6 +5,12 @@ set -exu # get the ID for Homebrew from our database HOMEBREW_ID=$(psql "$CHAI_DATABASE_URL" -f homebrew_id.sql -v "ON_ERROR_STOP=1" -tA) +# fail if HOMEBREW_ID is empty +if [ -z "$HOMEBREW_ID" ]; then + echo "Error: Failed to retrieve Homebrew ID from the database." 
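The lookup-and-bail-out step added here has a straightforward Python analogue; again purely illustrative, assuming psql is on the PATH and CHAI_DATABASE_URL is exported as in docker-compose.yml.

# Rough Python equivalent of the HOMEBREW_ID retrieval above (illustrative only).
import os
import subprocess


def fetch_homebrew_id() -> str:
    result = subprocess.run(
        [
            "psql",
            os.environ["CHAI_DATABASE_URL"],
            "-f", "homebrew_id.sql",
            "-v", "ON_ERROR_STOP=1",
            "-tA",
        ],
        capture_output=True,
        text=True,
        check=True,
    )
    homebrew_id = result.stdout.strip()
    if not homebrew_id:
        raise SystemExit("Error: Failed to retrieve Homebrew ID from the database.")
    return homebrew_id
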
+ exit 1 +fi + # homebrew provides `source` and `homepage` url types - let's create them ahead of time psql "$CHAI_DATABASE_URL" -f create_url_types.sql From 1f9c2cc2d1980154d4b22b3a7d7940538a2a579e Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Fri, 18 Oct 2024 11:01:40 -0500 Subject: [PATCH 31/41] yep, much faster --- package_managers/homebrew/pipeline.sh | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/package_managers/homebrew/pipeline.sh b/package_managers/homebrew/pipeline.sh index d11db5e..c9e5be0 100755 --- a/package_managers/homebrew/pipeline.sh +++ b/package_managers/homebrew/pipeline.sh @@ -2,8 +2,15 @@ set -exu +# Set PSQL_FLAGS based on DEBUG environment variable +if [ "${DEBUG:-false}" = false ]; then + PSQL_FLAGS="-q" +else + PSQL_FLAGS="" +fi + # get the ID for Homebrew from our database -HOMEBREW_ID=$(psql "$CHAI_DATABASE_URL" -f homebrew_id.sql -v "ON_ERROR_STOP=1" -tA) +HOMEBREW_ID=$(psql $PSQL_FLAGS "$CHAI_DATABASE_URL" -f homebrew_id.sql -v "ON_ERROR_STOP=1" -tA) # fail if HOMEBREW_ID is empty if [ -z "$HOMEBREW_ID" ]; then @@ -12,7 +19,7 @@ if [ -z "$HOMEBREW_ID" ]; then fi # homebrew provides `source` and `homepage` url types - let's create them ahead of time -psql "$CHAI_DATABASE_URL" -f create_url_types.sql +psql $PSQL_FLAGS "$CHAI_DATABASE_URL" -f create_url_types.sql # if you've already pulled the Homebrew data, you can `export FETCH=false` to skip the # download, and just work off the latest symlink @@ -52,13 +59,13 @@ sed \ -f "$SED_DIR/packages.sed" "$DATA_DIR/latest/packages.csv" | \ sed "s/@@HOMEBREW_ID@@/$HOMEBREW_ID/" \ > "$DATA_DIR/latest/package_inserts.sql" -psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/package_inserts.sql +psql $PSQL_FLAGS "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/package_inserts.sql # urls sed \ -f "$SED_DIR/urls.sed" "$DATA_DIR/latest/urls.csv" \ > "$DATA_DIR/latest/url_inserts.sql" -psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/url_inserts.sql +psql $PSQL_FLAGS "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/url_inserts.sql # versions # TODO: licenses (license id is annoying) @@ -66,13 +73,13 @@ psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/url_inserts.sql sed \ -f "$SED_DIR/versions.sed" "$DATA_DIR/latest/versions.csv" \ > "$DATA_DIR/latest/version_inserts.sql" -psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/version_inserts.sql +psql $PSQL_FLAGS "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/version_inserts.sql # package_urls # TODO: ERROR: more than one row returned by a subquery used as an expression sed \ -f "$SED_DIR/package_url.sed" "$DATA_DIR/latest/package_url.csv" \ > "$DATA_DIR/latest/package_url_inserts.sql" -psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/package_url_inserts.sql +psql $PSQL_FLAGS "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/package_url_inserts.sql -# TODO: dependencies -> dependency_type is annoying \ No newline at end of file +# TODO: dependencies -> dependency_type is annoying From 29e4853822ab5fa00b37a99c17db2b2bc776a674 Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Fri, 18 Oct 2024 11:17:24 -0500 Subject: [PATCH 32/41] crates fix to create source and package manager --- core/db.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/core/db.py b/core/db.py index 44d7e1a..eaf7555 100644 --- a/core/db.py +++ b/core/db.py @@ -373,8 +373,13 @@ def process_package_url(item: Dict[str, str]): PackageURL, self._process_batch(batch, process_package_url) ) - def insert_source(self, name: str) -> UUID: + def insert_source(self, name: 
str) -> Source: with self.session() as session: + existing_source = session.query(Source).filter_by(type=name).first() + if existing_source: + self.logger.warn(f"Source '{name}' already exists") + return existing_source + session.add(Source(type=name)) session.commit() return session.query(Source).filter_by(type=name).first() From 1d0791a92088a2dc4d75ce4e3919288c2f845aa4 Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Mon, 21 Oct 2024 00:29:13 -0500 Subject: [PATCH 33/41] remove sed --- docker-compose.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 459e674..bef45d1 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -53,10 +53,11 @@ services: - TEST=${TEST:-false} - FETCH=${FETCH:-true} - FREQUENCY=${FREQUENCY:-24} + - DEBUG=${DEBUG:-true} - SOURCE=https://formulae.brew.sh/api/formula.json - - JQ_DIR=/package_managers/homebrew/jq - - SED_DIR=/package_managers/homebrew/sed + - CODE_DIR=/package_managers/homebrew - DATA_DIR=/data/homebrew + - PYTHONPATH=/ # TODO: can I reference DATA_DIR in the volumes key? do I need it? # https://stackoverflow.com/questions/29377853/how-can-i-use-environment-variables-in-docker-compose volumes: From a5ab8b3f6697801abe51a4fd3a03beab531890fa Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Mon, 21 Oct 2024 00:29:48 -0500 Subject: [PATCH 34/41] config changes to load dependency types --- core/config.py | 36 ++++++++++++++++++++++++++-- core/db.py | 21 ++++++++++++++++ core/logger.py | 11 ++++----- core/structs.py | 11 +++++++++ core/utils.py | 2 +- package_managers/homebrew/Dockerfile | 5 ++-- 6 files changed, 74 insertions(+), 12 deletions(-) diff --git a/core/config.py b/core/config.py index 1c4354e..82eb0f4 100644 --- a/core/config.py +++ b/core/config.py @@ -3,7 +3,14 @@ from core.db import DB from core.logger import Logger -from core.structs import PackageManager, PackageManagerIDs, Sources, URLTypes, UserTypes +from core.structs import ( + DependencyTypes, + PackageManager, + PackageManagerIDs, + Sources, + URLTypes, + UserTypes, +) logger = Logger("config") @@ -19,11 +26,13 @@ class Config: package_manager_id: str url_types: URLTypes user_types: UserTypes + dependency_types: DependencyTypes def __str__(self): return f"Config(file_location={self.file_location}, test={self.test}, \ fetch={self.fetch}, package_manager_id={self.package_manager_id}, \ - url_types={self.url_types}, user_types={self.user_types})" + url_types={self.url_types}, user_types={self.user_types}, \ + dependency_types={self.dependency_types})" def load_url_types(db: DB) -> URLTypes: @@ -31,10 +40,12 @@ def load_url_types(db: DB) -> URLTypes: homepage_url = db.select_url_types_homepage(create=True) repository_url = db.select_url_types_repository(create=True) documentation_url = db.select_url_types_documentation(create=True) + source_url = db.select_url_types_source(create=True) return URLTypes( homepage=homepage_url.id, repository=repository_url.id, documentation=documentation_url.id, + source=source_url.id, ) @@ -60,6 +71,24 @@ def load_package_manager_ids(db: DB) -> PackageManagerIDs: } +def load_dependency_types(db: DB) -> DependencyTypes: + logger.debug("loading dependency types, and creating if not exists") + build_dep_type = db.select_dependency_type_by_name("build", create=True) + dev_dep_type = db.select_dependency_type_by_name("development", create=True) + runtime_dep_type = db.select_dependency_type_by_name("runtime", create=True) + test_dep_type = 
db.select_dependency_type_by_name("test", create=True) + optional_dep_type = db.select_dependency_type_by_name("optional", create=True) + recommended_dep_type = db.select_dependency_type_by_name("recommended", create=True) + return DependencyTypes( + build=build_dep_type.id, + development=dev_dep_type.id, + runtime=runtime_dep_type.id, + test=test_dep_type.id, + optional=optional_dep_type.id, + recommended=recommended_dep_type.id, + ) + + def load_sources() -> Sources: return { PackageManager.CRATES: "https://static.crates.io/db-dump.tar.gz", @@ -73,6 +102,7 @@ def initialize(package_manager: PackageManager, db: DB) -> Config: url_types = load_url_types(db) user_types = load_user_types(db) package_manager_ids = load_package_manager_ids(db) + dependency_types = load_dependency_types(db) sources = load_sources() if package_manager == PackageManager.CRATES: @@ -83,6 +113,7 @@ def initialize(package_manager: PackageManager, db: DB) -> Config: package_manager_id=package_manager_ids[PackageManager.CRATES], url_types=url_types, user_types=user_types, + dependency_types=dependency_types, ) elif package_manager == PackageManager.HOMEBREW: return Config( @@ -92,4 +123,5 @@ def initialize(package_manager: PackageManager, db: DB) -> Config: package_manager_id=package_manager_ids[PackageManager.HOMEBREW], url_types=url_types, user_types=user_types, + dependency_types=dependency_types, ) diff --git a/core/db.py b/core/db.py index eaf7555..9efe13d 100644 --- a/core/db.py +++ b/core/db.py @@ -11,6 +11,7 @@ from core.models import ( URL, DependsOn, + DependsOnType, License, LoadHistory, Package, @@ -426,6 +427,9 @@ def select_url_types_repository(self, create: bool = False) -> URLType | None: def select_url_types_documentation(self, create: bool = False) -> URLType | None: return self.select_url_type("documentation", create) + def select_url_types_source(self, create: bool = False) -> URLType | None: + return self.select_url_type("source", create) + def select_package_manager_by_name( self, package_manager: str, create: bool = False ) -> PackageManager | None: @@ -521,3 +525,20 @@ def select_packages_by_import_ids(self, iids: Iterable[str]) -> List[Package]: def select_licenses_by_name(self, names: Iterable[str]) -> List[License]: with self.session() as session: return session.query(License).filter(License.name.in_(names)).all() + + def select_dependency_type_by_name( + self, name: str, create: bool = False + ) -> DependsOnType: + with self.session() as session: + result = session.query(DependsOnType).filter_by(name=name).first() + if result: + return result + if create: + return self.insert_dependency_type(name) + return None + + def insert_dependency_type(self, name: str) -> DependsOnType: + with self.session() as session: + session.add(DependsOnType(name=name)) + session.commit() + return session.query(DependsOnType).filter_by(name=name).first() diff --git a/core/logger.py b/core/logger.py index e66e081..bef9e82 100644 --- a/core/logger.py +++ b/core/logger.py @@ -1,13 +1,10 @@ -from os import getenv -import time import sys +import time import traceback +from os import getenv -DEBUG = getenv("DEBUG", "false").lower() == "true" - -# use inspect to print the line of code as well? 
-# caller = inspect.currentframe().f_back -# filename = caller.f_code.co_filename, lineno = caller.f_lineno +debug = getenv("DEBUG", "false").lower() +DEBUG = debug == "true" or debug == "1" def as_minutes(seconds: float) -> float: diff --git a/core/structs.py b/core/structs.py index e7d72b7..d926438 100644 --- a/core/structs.py +++ b/core/structs.py @@ -19,9 +19,20 @@ class URLTypes: homepage: UUID repository: UUID documentation: UUID + source: UUID @dataclass class UserTypes: crates: UUID github: UUID + + +@dataclass +class DependencyTypes: + build: UUID + development: UUID + runtime: UUID + test: UUID + optional: UUID + recommended: UUID diff --git a/core/utils.py b/core/utils.py index 3d7a1bb..f622cf9 100644 --- a/core/utils.py +++ b/core/utils.py @@ -1,4 +1,4 @@ -from typing import List, Dict +from typing import Dict, List def safe_int(val: str) -> int | None: diff --git a/package_managers/homebrew/Dockerfile b/package_managers/homebrew/Dockerfile index 9551376..c9c0a56 100644 --- a/package_managers/homebrew/Dockerfile +++ b/package_managers/homebrew/Dockerfile @@ -1,10 +1,11 @@ -FROM ubuntu:24.10 +FROM python:3.11 RUN apt-get update && \ - apt-get install -y jq curl postgresql-client && \ + apt-get install -y jq curl && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* COPY . . WORKDIR /package_managers/homebrew RUN chmod +x /package_managers/homebrew/pipeline.sh +RUN pip install --no-cache-dir -r requirements.txt CMD ["/package_managers/homebrew/pipeline.sh"] From 78325994d2399e3bf58773957ccce3b3f59c5a6a Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Mon, 21 Oct 2024 11:42:37 -0500 Subject: [PATCH 35/41] create the data types upfront --- alembic/load-values.sql | 26 +++++++++++++++++ alembic/run_migrations.sh | 3 ++ ..._0803-package_managers_should_be_unique.py | 29 +++++++++++++++++++ core/models/__init__.py | 4 ++- 4 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 alembic/load-values.sql create mode 100644 alembic/versions/20241021_0803-package_managers_should_be_unique.py diff --git a/alembic/load-values.sql b/alembic/load-values.sql new file mode 100644 index 0000000..1bdde5d --- /dev/null +++ b/alembic/load-values.sql @@ -0,0 +1,26 @@ +-- url types +INSERT INTO "url_types" ("name") +VALUES ('source'), ('homepage'), ('documentation'), ('repository') +ON CONFLICT (name) DO NOTHING; + +-- dependency types +INSERT INTO "depends_on_types" ("name") +VALUES +('build'), +('development'), +('runtime'), +('test'), +('optional'), +('recommended'), +('uses_from_macos') +ON CONFLICT (name) DO NOTHING; + +-- sources +INSERT INTO "sources" ("type") +VALUES ('crates'), ('npm'), ('pypi'), ('rubygems'), ('github'), ('homebrew') +ON CONFLICT (type) DO NOTHING; + +INSERT INTO "package_managers" ("source_id") +SELECT id +FROM "sources" +WHERE "type" IN ('crates', 'npm', 'pypi', 'rubygems', 'github', 'homebrew'); diff --git a/alembic/run_migrations.sh b/alembic/run_migrations.sh index 88e081a..6e6beee 100755 --- a/alembic/run_migrations.sh +++ b/alembic/run_migrations.sh @@ -25,3 +25,6 @@ else echo "migrations failed" exit 1 fi + +# load values +psql -U postgres -h db -d chai -f load-values.sql -a diff --git a/alembic/versions/20241021_0803-package_managers_should_be_unique.py b/alembic/versions/20241021_0803-package_managers_should_be_unique.py new file mode 100644 index 0000000..01bf5df --- /dev/null +++ b/alembic/versions/20241021_0803-package_managers_should_be_unique.py @@ -0,0 +1,29 @@ +"""package managers should be unique + +Revision ID: 38cc41599874 +Revises: 
2481138a729a +Create Date: 2024-10-21 08:03:43.647535 + +""" + +from typing import Sequence, Union + +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = "38cc41599874" +down_revision: Union[str, None] = "2481138a729a" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.create_unique_constraint( + op.f("uq_package_managers_source_id"), "package_managers", ["source_id"] + ) + + +def downgrade() -> None: + op.drop_constraint( + op.f("uq_package_managers_source_id"), "package_managers", type_="unique" + ) diff --git a/core/models/__init__.py b/core/models/__init__.py index 1dcfbcc..b7a11b3 100644 --- a/core/models/__init__.py +++ b/core/models/__init__.py @@ -78,7 +78,9 @@ class PackageManager(Base): default=func.uuid_generate_v4(), server_default=func.uuid_generate_v4(), ) - source_id = Column(UUID(as_uuid=True), ForeignKey("sources.id"), nullable=False) + source_id = Column( + UUID(as_uuid=True), ForeignKey("sources.id"), nullable=False, unique=True + ) created_at = Column( DateTime, nullable=False, default=func.now(), server_default=func.now() ) From 7ca5710f1061f8d5652ba98b2feb292534e77136 Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Mon, 21 Oct 2024 11:43:10 -0500 Subject: [PATCH 36/41] get homebrew env vars --- .../homebrew/create_url_types.sql | 13 --------- package_managers/homebrew/homebrew_id.sql | 12 -------- .../homebrew/sql/homebrew_vars.sql | 28 +++++++++++++++++++ 3 files changed, 28 insertions(+), 25 deletions(-) delete mode 100644 package_managers/homebrew/create_url_types.sql delete mode 100644 package_managers/homebrew/homebrew_id.sql create mode 100644 package_managers/homebrew/sql/homebrew_vars.sql diff --git a/package_managers/homebrew/create_url_types.sql b/package_managers/homebrew/create_url_types.sql deleted file mode 100644 index 44a15c0..0000000 --- a/package_managers/homebrew/create_url_types.sql +++ /dev/null @@ -1,13 +0,0 @@ -WITH inserted_rows AS ( - INSERT INTO url_types (name) - VALUES ('source'), ('homepage') - ON CONFLICT (name) DO NOTHING - RETURNING id, name -) -SELECT id, name -FROM inserted_rows -UNION ALL -SELECT id, name -FROM url_types -WHERE name IN ('source', 'homepage') - AND name NOT IN (SELECT name FROM inserted_rows); \ No newline at end of file diff --git a/package_managers/homebrew/homebrew_id.sql b/package_managers/homebrew/homebrew_id.sql deleted file mode 100644 index b2ec9f1..0000000 --- a/package_managers/homebrew/homebrew_id.sql +++ /dev/null @@ -1,12 +0,0 @@ -WITH homebrew_source_id AS ( - INSERT INTO sources ("type") - VALUES ('homebrew') - ON CONFLICT ("type") DO UPDATE SET "type" = EXCLUDED."type" - RETURNING id -), package_manager_id AS ( - INSERT INTO package_managers (source_id) - VALUES ((SELECT id FROM homebrew_source_id)) - RETURNING id -) -SELECT id -FROM package_manager_id; \ No newline at end of file diff --git a/package_managers/homebrew/sql/homebrew_vars.sql b/package_managers/homebrew/sql/homebrew_vars.sql new file mode 100644 index 0000000..35ec82f --- /dev/null +++ b/package_managers/homebrew/sql/homebrew_vars.sql @@ -0,0 +1,28 @@ +SELECT + pm.id AS package_manager_id, + (SELECT id FROM url_types WHERE name = 'homepage') AS homepage_url_type_id, + (SELECT id FROM url_types WHERE name = 'source') AS source_url_type_id, + ( + SELECT id FROM depends_on_types WHERE name = 'build' + ) AS build_depends_on_type_id, + ( + SELECT id FROM depends_on_types WHERE name = 'runtime' + ) AS 
runtime_depends_on_type_id, + ( + SELECT id FROM depends_on_types WHERE name = 'recommended' + ) AS recommended_depends_on_type_id, + ( + SELECT id FROM depends_on_types WHERE name = 'optional' + ) AS optional_depends_on_type_id, + ( + SELECT id FROM depends_on_types WHERE name = 'test' + ) AS test_depends_on_type_id, + ( + SELECT id FROM depends_on_types WHERE name = 'uses_from_macos' + ) AS uses_from_macos_depends_on_type_id +FROM + package_managers AS pm +INNER JOIN + sources AS s ON pm.source_id = s.id +WHERE + s.type = 'homebrew'; From 824aeab01a9b9156ab876b699bbbc7c0e1965ddb Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Mon, 21 Oct 2024 11:43:21 -0500 Subject: [PATCH 37/41] remove python --- package_managers/homebrew/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/package_managers/homebrew/Dockerfile b/package_managers/homebrew/Dockerfile index c9c0a56..bf1a4d8 100644 --- a/package_managers/homebrew/Dockerfile +++ b/package_managers/homebrew/Dockerfile @@ -1,11 +1,11 @@ FROM python:3.11 RUN apt-get update && \ - apt-get install -y jq curl && \ + apt-get install -y jq curl postgresql-client && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* COPY . . WORKDIR /package_managers/homebrew RUN chmod +x /package_managers/homebrew/pipeline.sh -RUN pip install --no-cache-dir -r requirements.txt +# RUN pip install --no-cache-dir -r requirements.txt CMD ["/package_managers/homebrew/pipeline.sh"] From e349474d40ff5ab04d90def91461cbb61322fe8f Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: Mon, 21 Oct 2024 11:43:39 -0500 Subject: [PATCH 38/41] replace sed with jq --- package_managers/homebrew/jq/dependencies.jq | 63 +++++++++---------- package_managers/homebrew/jq/package_url.jq | 18 ++++-- package_managers/homebrew/jq/packages.jq | 15 ++++- package_managers/homebrew/jq/urls.jq | 18 ++++-- package_managers/homebrew/jq/versions.jq | 13 ++-- .../homebrew/sed/dependencies.sed | 2 - package_managers/homebrew/sed/package_url.sed | 2 - package_managers/homebrew/sed/packages.sed | 2 - package_managers/homebrew/sed/urls.sed | 4 -- package_managers/homebrew/sed/versions.sed | 2 - 10 files changed, 76 insertions(+), 63 deletions(-) delete mode 100644 package_managers/homebrew/sed/dependencies.sed delete mode 100644 package_managers/homebrew/sed/package_url.sed delete mode 100644 package_managers/homebrew/sed/packages.sed delete mode 100644 package_managers/homebrew/sed/urls.sed delete mode 100644 package_managers/homebrew/sed/versions.sed diff --git a/package_managers/homebrew/jq/dependencies.jq b/package_managers/homebrew/jq/dependencies.jq index 9e7181a..56b9055 100644 --- a/package_managers/homebrew/jq/dependencies.jq +++ b/package_managers/homebrew/jq/dependencies.jq @@ -1,37 +1,30 @@ -# build_dependencies -# dependencies -# test_dependencies -# optional_dependencies -# uses_from_macos # TODO: variations (linux only, by architecture) -# all of the above are the fields that contain dependency info for Homebrew -# uses from macos sometimes specifies build / test -- right now logging that as macos only - -[.[] | -.name as $name | -( - (.uses_from_macos // []) | - map({ - package: $name, - dependency_type: "uses_from_macos", - dependency: (if type == "object" then keys[0] else . 
end) - }) -), -( - (.dependencies // []) | - map({package: $name, dependency_type: "dependency", dependency: .}) -), -( - (.test_dependencies // []) | - map({package: $name, dependency_type: "test_dependency", dependency: .}) -), -( - (.optional_dependencies // []) | - map({package: $name, dependency_type: "optional_dependency", dependency: .}) -), -( - (.build_dependencies // []) | - map({package: $name, dependency_type: "build_dependency", dependency: .}) -) -| .[]] \ No newline at end of file +[.[] | { + package_name: .name, + build_deps: .build_dependencies, + runtime_deps: .dependencies, + recommended_deps: .recommended_dependencies, + test_deps: .test_dependencies, + optional_deps: .optional_dependencies, + uses_from_macos: .uses_from_macos +} | + # here's where we'd substitute the depends_on_type ids, for each depends_on type ids + # the `[]` at the end is to ensure that we're exploding the arrays, so each dependency gets its own row! + {package_name: .package_name, depends_on_type: $build_deps_type_id, depends_on: .build_deps[]}, + {package_name: .package_name, depends_on_type: $runtime_deps_type_id, depends_on: .runtime_deps[]}, + {package_name: .package_name, depends_on_type: $recommended_deps_type_id, depends_on: .recommended_deps[]}, + {package_name: .package_name, depends_on_type: $test_deps_type_id, depends_on: .test_deps[]}, + {package_name: .package_name, depends_on_type: $optional_deps_type_id, depends_on: .optional_deps[]}, + {package_name: .package_name, depends_on_type: $uses_from_macos_type_id, depends_on: .uses_from_macos[]} + | + # now, filter out the null dependencies + select(.depends_on != null) | + # and only look at the ones that are strings TODO: some are JSONs? + select(.depends_on | type == "string") | + # generate the sql statements! 
+ "INSERT INTO dependencies (version_id, dependency_id, depends_on_type_id) VALUES ( + (SELECT id FROM versions WHERE import_id = '" + .package_name + "'), + (SELECT id FROM packages WHERE name = '" + .depends_on + "'), + '" + .depends_on_type + "');" +] | join("\n") \ No newline at end of file diff --git a/package_managers/homebrew/jq/package_url.jq b/package_managers/homebrew/jq/package_url.jq index f1f767a..04b50df 100644 --- a/package_managers/homebrew/jq/package_url.jq +++ b/package_managers/homebrew/jq/package_url.jq @@ -1,8 +1,18 @@ +# mapping package to urls is straightforward +# but, in the first normal form we've gotta do the mapping ourselves +# luckily, homebrew is small enough that we can push some of that work to the db + [.[] | { package_name: .name, homepage_url: .homepage, source_url: .urls.stable.url -} | [ - {package_name: .package_name, url: .homepage_url}, - {package_name: .package_name, url: .source_url} -] | .[]] +} | + # here's where we substitute the url type ids, for each url type + {package_name: .package_name, type: $homepage_url_type_id, url: .homepage_url}, + {package_name: .package_name, type: $source_url_type_id, url: .source_url} + | + # and here we say "for each url, generate an insert statement" + "INSERT INTO package_urls (package_id, url_id) VALUES ( + (SELECT id FROM packages WHERE name = '" + .package_name + "'), + (SELECT id FROM urls WHERE url = '" + .url + "' AND url_type_id = '" + .type + "'));" +] | join("\n") diff --git a/package_managers/homebrew/jq/packages.jq b/package_managers/homebrew/jq/packages.jq index b070648..42cef0c 100644 --- a/package_managers/homebrew/jq/packages.jq +++ b/package_managers/homebrew/jq/packages.jq @@ -1,2 +1,13 @@ -# we just need the name for the packages models -[.[] | {name: .name, derived_id: ("homebrew/" + .name), import_id: .name, readme: null}] \ No newline at end of file + +[.[] | + "INSERT INTO packages (name, derived_id, import_id, package_manager_id) VALUES ('" + + # for every single row, extract the name => it's the only key we need from Homebrew + (.name) + "', '" + + # the derived_id is the package manager name + "/" + the package name, which enforces + # uniqueness on the packages table + ("homebrew/" + .name) + "', '" + + # the import_id is the same as the package name (used for joins) + .name + "', '" + + # the package manager ID is passed in as a variable + $package_manager_id + "');" +] | join("\n") diff --git a/package_managers/homebrew/jq/urls.jq b/package_managers/homebrew/jq/urls.jq index c51c2fc..3d904c5 100644 --- a/package_managers/homebrew/jq/urls.jq +++ b/package_managers/homebrew/jq/urls.jq @@ -1,12 +1,18 @@ -# homepage is at the main key -# source is inside stable, and it's the tarball +# from our sources.json, we're extracting homepage and source: + # homepage is at the main key + # source is inside stable, and it's the tarball + +# for every single row, extract the homepage and source: [.[] | { homepage: .homepage, source: .urls.stable.url } | to_entries | map({ +# `map` basically explodes the json, creating two rows for each JSON object name: .key, url: .value -}) | .[] | { - url: .url, - url_type: .name -}] \ No newline at end of file +}) | .[] | +# and here, we can generate our SQL statement! 
+ "INSERT INTO urls (url, url_type_id) VALUES ('" + + .url + "', '" + + if .name == "source" then $source_url_type_id else $homepage_url_type_id end + "');" +] | join("\n") diff --git a/package_managers/homebrew/jq/versions.jq b/package_managers/homebrew/jq/versions.jq index 8d76b0d..413d05c 100644 --- a/package_managers/homebrew/jq/versions.jq +++ b/package_managers/homebrew/jq/versions.jq @@ -1,16 +1,21 @@ # homebrew has the problem where there are no versions # we're gonna assume the version available is the latest -# and we'll deal with that later # TODO: `downloads: .analytics.install_on_request."365d".[$name]` # above gives us the downloads for the last 365 days # not available in the full JSON API # TODO: there are also a problem of versioned formulae + +# TODO: licenses is in source.json, but we need a long-term mapping solution + [.[] | .name as $name | { version: .versions.stable, - import_id: .name, - license: .license -}] \ No newline at end of file + import_id: .name +} | +"INSERT INTO versions (version, package_id) VALUES ('" + + .version + "', '" + + .import_id + "');" +] | join("\n") diff --git a/package_managers/homebrew/sed/dependencies.sed b/package_managers/homebrew/sed/dependencies.sed deleted file mode 100644 index 3a4e234..0000000 --- a/package_managers/homebrew/sed/dependencies.sed +++ /dev/null @@ -1,2 +0,0 @@ -1d -s%"\([^"]*\)","\([^"]*\)",*%INSERT INTO dependencies (version_id, dependency_id, dependency_type_id) SELECT (SELECT id FROM versions WHERE import_id = '\3'), (SELECT id FROM packages WHERE derived_id = 'homebrew/\1'), (SELECT id FROM dependency_types WHERE name = '\2');% diff --git a/package_managers/homebrew/sed/package_url.sed b/package_managers/homebrew/sed/package_url.sed deleted file mode 100644 index ecc6f33..0000000 --- a/package_managers/homebrew/sed/package_url.sed +++ /dev/null @@ -1,2 +0,0 @@ -1d -s%"\([^"]*\)","\([^"]*\)",*%INSERT INTO package_urls (package_id, url_id) SELECT (SELECT id FROM packages WHERE derived_id = 'homebrew/\1'), (SELECT id FROM urls WHERE url = '\2') ON CONFLICT ("package_id", "url_id") DO NOTHING;% diff --git a/package_managers/homebrew/sed/packages.sed b/package_managers/homebrew/sed/packages.sed deleted file mode 100644 index 6373b11..0000000 --- a/package_managers/homebrew/sed/packages.sed +++ /dev/null @@ -1,2 +0,0 @@ -1d -s/"\([^"]*\)","\([^"]*\)","\([^"]*\)",*/INSERT INTO packages (derived_id, import_id, name, package_manager_id) VALUES ('\1', '\2', '\3', '@@HOMEBREW_ID@@') ON CONFLICT ("derived_id") DO NOTHING;/ \ No newline at end of file diff --git a/package_managers/homebrew/sed/urls.sed b/package_managers/homebrew/sed/urls.sed deleted file mode 100644 index 16f7639..0000000 --- a/package_managers/homebrew/sed/urls.sed +++ /dev/null @@ -1,4 +0,0 @@ - - -1d -s/"\([^"]*\)","\([^"]*\)",*/INSERT INTO urls (url, url_type_id) VALUES ('\1', (SELECT id FROM url_types WHERE "name" = '\2')) ON CONFLICT ("url", "url_type_id") DO NOTHING;/ \ No newline at end of file diff --git a/package_managers/homebrew/sed/versions.sed b/package_managers/homebrew/sed/versions.sed deleted file mode 100644 index d3453b7..0000000 --- a/package_managers/homebrew/sed/versions.sed +++ /dev/null @@ -1,2 +0,0 @@ -1d -s%"\([^"]*\)","\([^"]*\)","\([^"]*\)",*%INSERT INTO versions (import_id, package_id, "version") VALUES ('\1', (SELECT id FROM packages WHERE derived_id = 'homebrew/\1'), '\3') ON CONFLICT ("package_id", "version") DO NOTHING;% From 57096074a85381e8043889f12ff02d9af85844ce Mon Sep 17 00:00:00 2001 From: Sanchit Ram Arvind Date: 
Mon, 21 Oct 2024 12:00:37 -0500 Subject: [PATCH 39/41] jq corrections, pipeline.sh fixes, dockerfile --- package_managers/homebrew/Dockerfile | 2 - package_managers/homebrew/jq/dependencies.jq | 6 +- package_managers/homebrew/jq/package_url.jq | 3 +- package_managers/homebrew/jq/packages.jq | 2 +- package_managers/homebrew/jq/urls.jq | 3 +- package_managers/homebrew/jq/versions.jq | 8 +- package_managers/homebrew/pipeline.sh | 121 ++++++++++--------- 7 files changed, 74 insertions(+), 71 deletions(-) diff --git a/package_managers/homebrew/Dockerfile b/package_managers/homebrew/Dockerfile index bf1a4d8..895fccc 100644 --- a/package_managers/homebrew/Dockerfile +++ b/package_managers/homebrew/Dockerfile @@ -6,6 +6,4 @@ RUN apt-get update && \ COPY . . WORKDIR /package_managers/homebrew RUN chmod +x /package_managers/homebrew/pipeline.sh -# RUN pip install --no-cache-dir -r requirements.txt CMD ["/package_managers/homebrew/pipeline.sh"] - diff --git a/package_managers/homebrew/jq/dependencies.jq b/package_managers/homebrew/jq/dependencies.jq index 56b9055..811c0b4 100644 --- a/package_managers/homebrew/jq/dependencies.jq +++ b/package_managers/homebrew/jq/dependencies.jq @@ -23,8 +23,8 @@ # and only look at the ones that are strings TODO: some are JSONs? select(.depends_on | type == "string") | # generate the sql statements! - "INSERT INTO dependencies (version_id, dependency_id, depends_on_type_id) VALUES ( + "INSERT INTO dependencies (version_id, dependency_id, dependency_type_id) VALUES ( (SELECT id FROM versions WHERE import_id = '" + .package_name + "'), - (SELECT id FROM packages WHERE name = '" + .depends_on + "'), - '" + .depends_on_type + "');" + (SELECT id FROM packages WHERE import_id = '" + .depends_on + "'), + '" + .depends_on_type + "') ON CONFLICT DO NOTHING;" ] | join("\n") \ No newline at end of file diff --git a/package_managers/homebrew/jq/package_url.jq b/package_managers/homebrew/jq/package_url.jq index 04b50df..9e63306 100644 --- a/package_managers/homebrew/jq/package_url.jq +++ b/package_managers/homebrew/jq/package_url.jq @@ -14,5 +14,6 @@ # and here we say "for each url, generate an insert statement" "INSERT INTO package_urls (package_id, url_id) VALUES ( (SELECT id FROM packages WHERE name = '" + .package_name + "'), - (SELECT id FROM urls WHERE url = '" + .url + "' AND url_type_id = '" + .type + "'));" + (SELECT id FROM urls WHERE url = '" + .url + "' AND url_type_id = '" + .type + "')) + ON CONFLICT DO NOTHING;" ] | join("\n") diff --git a/package_managers/homebrew/jq/packages.jq b/package_managers/homebrew/jq/packages.jq index 42cef0c..db44a61 100644 --- a/package_managers/homebrew/jq/packages.jq +++ b/package_managers/homebrew/jq/packages.jq @@ -9,5 +9,5 @@ # the import_id is the same as the package name (used for joins) .name + "', '" + # the package manager ID is passed in as a variable - $package_manager_id + "');" + $package_manager_id + "') ON CONFLICT DO NOTHING;" ] | join("\n") diff --git a/package_managers/homebrew/jq/urls.jq b/package_managers/homebrew/jq/urls.jq index 3d904c5..8f74533 100644 --- a/package_managers/homebrew/jq/urls.jq +++ b/package_managers/homebrew/jq/urls.jq @@ -14,5 +14,6 @@ # and here, we can generate our SQL statement! 
"INSERT INTO urls (url, url_type_id) VALUES ('" + .url + "', '" + - if .name == "source" then $source_url_type_id else $homepage_url_type_id end + "');" + if .name == "source" then $source_url_type_id else $homepage_url_type_id end + "') + ON CONFLICT DO NOTHING;" ] | join("\n") diff --git a/package_managers/homebrew/jq/versions.jq b/package_managers/homebrew/jq/versions.jq index 413d05c..4338034 100644 --- a/package_managers/homebrew/jq/versions.jq +++ b/package_managers/homebrew/jq/versions.jq @@ -15,7 +15,9 @@ version: .versions.stable, import_id: .name } | -"INSERT INTO versions (version, package_id) VALUES ('" + - .version + "', '" + - .import_id + "');" +"INSERT INTO versions (version, import_id, package_id) VALUES ( + '" + .version + "', + '" + .import_id + "', + (SELECT id FROM packages WHERE import_id = '" + .import_id + "') + ) ON CONFLICT DO NOTHING;" ] | join("\n") diff --git a/package_managers/homebrew/pipeline.sh b/package_managers/homebrew/pipeline.sh index c9e5be0..b69dc25 100755 --- a/package_managers/homebrew/pipeline.sh +++ b/package_managers/homebrew/pipeline.sh @@ -1,29 +1,39 @@ #!/bin/bash -set -exu +set -exuo pipefail -# Set PSQL_FLAGS based on DEBUG environment variable -if [ "${DEBUG:-false}" = false ]; then - PSQL_FLAGS="-q" -else - PSQL_FLAGS="" -fi +# get all the required IDs and URLs from the database +IDS=$(psql "$CHAI_DATABASE_URL" -f sql/homebrew_vars.sql -t -A -F'|') + +# Parse the results and export variables +IFS='|' read -r PACKAGE_MANAGER_ID HOMEPAGE_URL_TYPE_ID SOURCE_URL_TYPE_ID \ + BUILD_DEPENDS_ON_TYPE_ID RUNTIME_DEPENDS_ON_TYPE_ID \ + RECOMMENDED_DEPENDS_ON_TYPE_ID OPTIONAL_DEPENDS_ON_TYPE_ID \ + TEST_DEPENDS_ON_TYPE_ID USES_FROM_MACOS_DEPENDS_ON_TYPE_ID <<< "$IDS" -# get the ID for Homebrew from our database -HOMEBREW_ID=$(psql $PSQL_FLAGS "$CHAI_DATABASE_URL" -f homebrew_id.sql -v "ON_ERROR_STOP=1" -tA) +export PACKAGE_MANAGER_ID +export HOMEPAGE_URL_TYPE_ID +export SOURCE_URL_TYPE_ID +export BUILD_DEPENDS_ON_TYPE_ID +export RUNTIME_DEPENDS_ON_TYPE_ID +export RECOMMENDED_DEPENDS_ON_TYPE_ID +export OPTIONAL_DEPENDS_ON_TYPE_ID +export TEST_DEPENDS_ON_TYPE_ID +export USES_FROM_MACOS_DEPENDS_ON_TYPE_ID -# fail if HOMEBREW_ID is empty -if [ -z "$HOMEBREW_ID" ]; then - echo "Error: Failed to retrieve Homebrew ID from the database." +# if any of the IDs are empty, exit +if [ -z "$PACKAGE_MANAGER_ID" ] || [ -z "$HOMEPAGE_URL_TYPE_ID" ] || [ -z "$SOURCE_URL_TYPE_ID" ] || [ -z "$BUILD_DEPENDS_ON_TYPE_ID" ] || [ -z "$RUNTIME_DEPENDS_ON_TYPE_ID" ] || [ -z "$RECOMMENDED_DEPENDS_ON_TYPE_ID" ] || [ -z "$OPTIONAL_DEPENDS_ON_TYPE_ID" ] || [ -z "$TEST_DEPENDS_ON_TYPE_ID" ] || [ -z "$USES_FROM_MACOS_DEPENDS_ON_TYPE_ID" ]; then + echo "One or more IDs are empty. Exiting." 
exit 1 fi -# homebrew provides `source` and `homepage` url types - let's create them ahead of time -psql $PSQL_FLAGS "$CHAI_DATABASE_URL" -f create_url_types.sql - # if you've already pulled the Homebrew data, you can `export FETCH=false` to skip the # download, and just work off the latest symlink -# Note that this only works if the volumes are mounted + +# > [!IMPORTANT] +# > +# > ONLY WORKS IF THE VOLUMES ARE MOUNTED + if [ "$FETCH" = true ]; then NOW=$(date -u +"%Y-%m-%dT%H:%M:%SZ") mkdir -p "$DATA_DIR"/"$NOW" @@ -35,51 +45,42 @@ if [ "$FETCH" = true ]; then ln -sfn "$NOW" "$DATA_DIR"/latest # transform - echo "$JQ_DIR" - for x in "$JQ_DIR"/*.jq; do + for x in "$CODE_DIR"/jq/*.jq; do filename=$(basename "$x" .jq) - # first jq line uses the formulas defined in the jq folder for each data model - # second jq line transforms the json into csv so we can use sed to prep psql stmts - jq -f "$x" "$DATA_DIR"/latest/source.json \ - | jq -r ' - (map(keys) | add | unique) as $cols | - map(. as $row | $cols | map($row[.])) as $rows | - $cols, $rows[] | @csv - ' \ - > "$DATA_DIR"/latest/"${filename}".csv + # use the formulas defined in the jq folder for each data model + if [ "$filename" = "packages" ]; then + jq -f "$x" -r --arg package_manager_id "$PACKAGE_MANAGER_ID" "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql + elif [ "$filename" = "urls" ]; then + jq -f "$x" -r \ + --arg homepage_url_type_id "$HOMEPAGE_URL_TYPE_ID" \ + --arg source_url_type_id "$SOURCE_URL_TYPE_ID" \ + "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql + elif [ "$filename" = "versions" ]; then + jq -f "$x" -r \ + "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql + elif [ "$filename" = "package_url" ]; then + jq -f "$x" -r \ + --arg homepage_url_type_id "$HOMEPAGE_URL_TYPE_ID" \ + --arg source_url_type_id "$SOURCE_URL_TYPE_ID" \ + "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql + elif [ "$filename" = "dependencies" ]; then + jq -f "$x" -r \ + --arg build_deps_type_id "$BUILD_DEPENDS_ON_TYPE_ID" \ + --arg runtime_deps_type_id "$RUNTIME_DEPENDS_ON_TYPE_ID" \ + --arg recommended_deps_type_id "$RECOMMENDED_DEPENDS_ON_TYPE_ID" \ + --arg optional_deps_type_id "$OPTIONAL_DEPENDS_ON_TYPE_ID" \ + --arg test_deps_type_id "$TEST_DEPENDS_ON_TYPE_ID" \ + --arg uses_from_macos_type_id "$USES_FROM_MACOS_DEPENDS_ON_TYPE_ID" \ + "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql + else + echo "skipping $filename" + fi done fi -# load -# TODO: loop? 
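One possible tidy-up of the per-file if/elif dispatch above is a table that pairs each jq program with the --arg names it needs. This is a hypothetical alternative sketched for clarity, not what the script does; the argument names follow the jq files in this patch.

# Hypothetical table-driven version of the jq dispatch (illustrative only).
JQ_ARGS = {
    "packages": ["package_manager_id"],
    "urls": ["homepage_url_type_id", "source_url_type_id"],
    "versions": [],
    "package_url": ["homepage_url_type_id", "source_url_type_id"],
    "dependencies": [
        "build_deps_type_id",
        "runtime_deps_type_id",
        "recommended_deps_type_id",
        "optional_deps_type_id",
        "test_deps_type_id",
        "uses_from_macos_type_id",
    ],
}


def jq_command(name: str, ids: dict[str, str], source_json: str) -> list[str]:
    # ids maps each --arg name to its value, e.g. {"package_manager_id": "..."};
    # run the returned command with subprocess.run(cmd, stdout=out_file, check=True)
    cmd = ["jq", "-f", f"jq/{name}.jq", "-r"]
    for arg in JQ_ARGS[name]:
        cmd += ["--arg", arg, ids[arg]]
    return cmd + [source_json]
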
-
-# packages
-# pass HOMEBREW_ID to sed to replace the @@HOMEBREW_ID@@ placeholder
-sed \
-  -f "$SED_DIR/packages.sed" "$DATA_DIR/latest/packages.csv" | \
-  sed "s/@@HOMEBREW_ID@@/$HOMEBREW_ID/" \
-  > "$DATA_DIR/latest/package_inserts.sql"
-psql $PSQL_FLAGS "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/package_inserts.sql
-
-# urls
-sed \
-  -f "$SED_DIR/urls.sed" "$DATA_DIR/latest/urls.csv" \
-  > "$DATA_DIR/latest/url_inserts.sql"
-psql $PSQL_FLAGS "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/url_inserts.sql
-
-# versions
-# TODO: licenses (license id is annoying)
-# TODO: some random parsing errors happening in versions.csv
-sed \
-  -f "$SED_DIR/versions.sed" "$DATA_DIR/latest/versions.csv" \
-  > "$DATA_DIR/latest/version_inserts.sql"
-psql $PSQL_FLAGS "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/version_inserts.sql
-
-# package_urls
-# TODO: ERROR: more than one row returned by a subquery used as an expression
-sed \
-  -f "$SED_DIR/package_url.sed" "$DATA_DIR/latest/package_url.csv" \
-  > "$DATA_DIR/latest/package_url_inserts.sql"
-psql $PSQL_FLAGS "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/package_url_inserts.sql
-
-# TODO: dependencies -> dependency_type is annoying
+# load - order matters
+psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/packages.sql
+psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/urls.sql
+psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/versions.sql
+psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/package_url.sql
+psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/dependencies.sql

From 4aee155e83b57abd62367cdfd15e26436cf773e9 Mon Sep 17 00:00:00 2001
From: Jacob Heider
Date: Mon, 21 Oct 2024 13:19:49 -0400
Subject: [PATCH 40/41] pipeline.sh improvements

---
 package_managers/homebrew/pipeline.sh | 183 ++++++++++++++++----
 1 file changed, 113 insertions(+), 70 deletions(-)

diff --git a/package_managers/homebrew/pipeline.sh b/package_managers/homebrew/pipeline.sh
index b69dc25..6cf8487 100755
--- a/package_managers/homebrew/pipeline.sh
+++ b/package_managers/homebrew/pipeline.sh
@@ -1,86 +1,129 @@
 #!/bin/bash
+# Homebrew Pipeline Script
+# This script fetches, transforms, and loads Homebrew package data into a PostgreSQL database.
+
+# Set bash options:
+# -e: Exit immediately if a command exits with a non-zero status.
+# -x: Print commands and their arguments as they are executed.
+# -u: Treat unset variables as an error when substituting.
+# -o pipefail: Return value of a pipeline is the status of the last command to exit with a non-zero status.
 set -exuo pipefail
-# get all the required IDs and URLs from the database
-IDS=$(psql "$CHAI_DATABASE_URL" -f sql/homebrew_vars.sql -t -A -F'|')
+# Function to log messages with timestamps
+log() {
+  echo "[$(date +'%Y-%m-%d %H:%M:%S')] $1"
+}
-# Parse the results and export variables
-IFS='|' read -r PACKAGE_MANAGER_ID HOMEPAGE_URL_TYPE_ID SOURCE_URL_TYPE_ID \
-  BUILD_DEPENDS_ON_TYPE_ID RUNTIME_DEPENDS_ON_TYPE_ID \
-  RECOMMENDED_DEPENDS_ON_TYPE_ID OPTIONAL_DEPENDS_ON_TYPE_ID \
-  TEST_DEPENDS_ON_TYPE_ID USES_FROM_MACOS_DEPENDS_ON_TYPE_ID <<< "$IDS"
+log "Starting Homebrew pipeline script"
-export PACKAGE_MANAGER_ID
-export HOMEPAGE_URL_TYPE_ID
-export SOURCE_URL_TYPE_ID
-export BUILD_DEPENDS_ON_TYPE_ID
-export RUNTIME_DEPENDS_ON_TYPE_ID
-export RECOMMENDED_DEPENDS_ON_TYPE_ID
-export OPTIONAL_DEPENDS_ON_TYPE_ID
-export TEST_DEPENDS_ON_TYPE_ID
-export USES_FROM_MACOS_DEPENDS_ON_TYPE_ID
+# Fetch required IDs and URLs from the database
+log "Fetching required IDs and URLs from the database"
+IDS=$(psql "$CHAI_DATABASE_URL" -f sql/homebrew_vars.sql -t -A -F'|')
-# if any of the IDs are empty, exit
-if [ -z "$PACKAGE_MANAGER_ID" ] || [ -z "$HOMEPAGE_URL_TYPE_ID" ] || [ -z "$SOURCE_URL_TYPE_ID" ] || [ -z "$BUILD_DEPENDS_ON_TYPE_ID" ] || [ -z "$RUNTIME_DEPENDS_ON_TYPE_ID" ] || [ -z "$RECOMMENDED_DEPENDS_ON_TYPE_ID" ] || [ -z "$OPTIONAL_DEPENDS_ON_TYPE_ID" ] || [ -z "$TEST_DEPENDS_ON_TYPE_ID" ] || [ -z "$USES_FROM_MACOS_DEPENDS_ON_TYPE_ID" ]; then
-  echo "One or more IDs are empty. Exiting."
-  exit 1
-fi
+# Parse the results
+IFS='|' read -r \
+  PACKAGE_MANAGER_ID \
+  HOMEPAGE_URL_TYPE_ID \
+  SOURCE_URL_TYPE_ID \
+  BUILD_DEPENDS_ON_TYPE_ID \
+  RUNTIME_DEPENDS_ON_TYPE_ID \
+  RECOMMENDED_DEPENDS_ON_TYPE_ID \
+  OPTIONAL_DEPENDS_ON_TYPE_ID \
+  TEST_DEPENDS_ON_TYPE_ID \
+  USES_FROM_MACOS_DEPENDS_ON_TYPE_ID <<< "$IDS"
-# if you've already pulled the Homebrew data, you can `export FETCH=false` to skip the
-# download, and just work off the latest symlink
+# Validate that all required IDs are present and export them
+required_vars=(
+  PACKAGE_MANAGER_ID
+  HOMEPAGE_URL_TYPE_ID
+  SOURCE_URL_TYPE_ID
+  BUILD_DEPENDS_ON_TYPE_ID
+  RUNTIME_DEPENDS_ON_TYPE_ID
+  RECOMMENDED_DEPENDS_ON_TYPE_ID
+  OPTIONAL_DEPENDS_ON_TYPE_ID
+  TEST_DEPENDS_ON_TYPE_ID
+  USES_FROM_MACOS_DEPENDS_ON_TYPE_ID
+)
-# > [!IMPORTANT]
-# >
-# > ONLY WORKS IF THE VOLUMES ARE MOUNTED
+for var in "${required_vars[@]}"; do
+  if [ -z "${!var}" ]; then
+    log "ERROR: Required variable $var is empty or unset. Exiting."
+    exit 1
+  fi
+  # shellcheck disable=SC2163
+  export "$var"
+done
+# Data fetching and processing
 if [ "$FETCH" = true ]; then
-  NOW=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
-  mkdir -p "$DATA_DIR"/"$NOW"
+  log "Fetching new data from Homebrew"
-  # extract
-  curl -s "$SOURCE" > "$DATA_DIR"/"$NOW"/source.json
+  # Create timestamped directory for this run
+  NOW=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
+  mkdir -p "$DATA_DIR"/"$NOW"
-  # make a symlink called latest, pointing to $NOW
-  ln -sfn "$NOW" "$DATA_DIR"/latest
+  # Download source data
+  log "Downloading source data"
+  curl -s "$SOURCE" > "$DATA_DIR"/"$NOW"/source.json
-  # transform
-  for x in "$CODE_DIR"/jq/*.jq; do
-    filename=$(basename "$x" .jq)
-    # use the formulas defined in the jq folder for each data model
-    if [ "$filename" = "packages" ]; then
-      jq -f "$x" -r --arg package_manager_id "$PACKAGE_MANAGER_ID" "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql
-    elif [ "$filename" = "urls" ]; then
-      jq -f "$x" -r \
-        --arg homepage_url_type_id "$HOMEPAGE_URL_TYPE_ID" \
-        --arg source_url_type_id "$SOURCE_URL_TYPE_ID" \
-        "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql
-    elif [ "$filename" = "versions" ]; then
-      jq -f "$x" -r \
-        "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql
-    elif [ "$filename" = "package_url" ]; then
-      jq -f "$x" -r \
-        --arg homepage_url_type_id "$HOMEPAGE_URL_TYPE_ID" \
-        --arg source_url_type_id "$SOURCE_URL_TYPE_ID" \
-        "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql
-    elif [ "$filename" = "dependencies" ]; then
-      jq -f "$x" -r \
-        --arg build_deps_type_id "$BUILD_DEPENDS_ON_TYPE_ID" \
-        --arg runtime_deps_type_id "$RUNTIME_DEPENDS_ON_TYPE_ID" \
-        --arg recommended_deps_type_id "$RECOMMENDED_DEPENDS_ON_TYPE_ID" \
-        --arg optional_deps_type_id "$OPTIONAL_DEPENDS_ON_TYPE_ID" \
-        --arg test_deps_type_id "$TEST_DEPENDS_ON_TYPE_ID" \
-        --arg uses_from_macos_type_id "$USES_FROM_MACOS_DEPENDS_ON_TYPE_ID" \
-        "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql
-    else
-      echo "skipping $filename"
-    fi
-  done
+  # Update 'latest' symlink
+  ln -sfn "$NOW" "$DATA_DIR"/latest
+
+  # Transform data using jq scripts
+  log "Transforming data"
+  for x in "$CODE_DIR"/jq/*.jq; do
+    filename=$(basename "$x" .jq)
+    log "Processing $filename"
+    case "$filename" in
+      packages)
+        jq -f "$x" -r \
+          --arg package_manager_id "$PACKAGE_MANAGER_ID" \
+          "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql
+        ;;
+      urls)
+        jq -f "$x" -r \
+          --arg homepage_url_type_id "$HOMEPAGE_URL_TYPE_ID" \
+          --arg source_url_type_id "$SOURCE_URL_TYPE_ID" \
+          "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql
+        ;;
+      versions)
+        jq -f "$x" -r \
+          "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql
+        ;;
+      package_url)
+        jq -f "$x" -r \
+          --arg homepage_url_type_id "$HOMEPAGE_URL_TYPE_ID" \
+          --arg source_url_type_id "$SOURCE_URL_TYPE_ID" \
+          "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql
+        ;;
+      dependencies)
+        jq -f "$x" -r \
+          --arg build_deps_type_id "$BUILD_DEPENDS_ON_TYPE_ID" \
+          --arg runtime_deps_type_id "$RUNTIME_DEPENDS_ON_TYPE_ID" \
+          --arg recommended_deps_type_id "$RECOMMENDED_DEPENDS_ON_TYPE_ID" \
+          --arg optional_deps_type_id "$OPTIONAL_DEPENDS_ON_TYPE_ID" \
+          --arg test_deps_type_id "$TEST_DEPENDS_ON_TYPE_ID" \
+          --arg uses_from_macos_type_id "$USES_FROM_MACOS_DEPENDS_ON_TYPE_ID" \
+          "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql
+        ;;
+      *)
+        log "Skipping unknown file: $filename"
+        ;;
+    esac
+  done
+else
+  log "Skipping data fetch (FETCH=false)"
 fi
-# load - order matters
-psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/packages.sql
-psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/urls.sql
-psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/versions.sql
-psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/package_url.sql
-psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/dependencies.sql
+# Load data into database
+log "Loading data into database"
+psql "$CHAI_DATABASE_URL" <

Date: Mon, 21 Oct 2024 13:17:20 -0500
Subject: [PATCH 41/41] some cleanups

---
 docker-compose.yml                    | 8 ++------
 package_managers/homebrew/pipeline.sh | 2 +-
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index bef45d1..99cad6b 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -53,13 +53,9 @@ services:
       - TEST=${TEST:-false}
       - FETCH=${FETCH:-true}
       - FREQUENCY=${FREQUENCY:-24}
-      - DEBUG=${DEBUG:-true}
       - SOURCE=https://formulae.brew.sh/api/formula.json
       - CODE_DIR=/package_managers/homebrew
       - DATA_DIR=/data/homebrew
-      - PYTHONPATH=/
-      # TODO: can I reference DATA_DIR in the volumes key? do I need it?
-      # https://stackoverflow.com/questions/29377853/how-can-i-use-environment-variables-in-docker-compose
     volumes:
       - ./data/homebrew:/data/homebrew
     depends_on:
@@ -79,5 +75,5 @@ services:
     depends_on:
       crates:
         condition: service_started
-      # homebrew:
-      #   condition: service_started
+      homebrew:
+        condition: service_started
diff --git a/package_managers/homebrew/pipeline.sh b/package_managers/homebrew/pipeline.sh
index 6cf8487..bb33efb 100755
--- a/package_managers/homebrew/pipeline.sh
+++ b/package_managers/homebrew/pipeline.sh
@@ -118,7 +118,7 @@ fi
 # Load data into database
 log "Loading data into database"
-psql "$CHAI_DATABASE_URL" <
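Note: for reference, the sketch below shows one idempotent way to apply the generated SQL files in dependency order, which is the load pattern these patches move toward. It is illustrative only and not code from any commit above: the helper name load_generated_sql, the ON_ERROR_STOP flag, and the file-name list are assumptions, and it relies on CHAI_DATABASE_URL and DATA_DIR being set as in pipeline.sh.

# Illustrative sketch (not part of the patches): load the generated SQL files
# in dependency order, stopping on the first SQL error.
load_generated_sql() {
  local dir="$1"
  # packages must be loaded first so that urls, versions, package_url and
  # dependencies can resolve their package references
  for f in packages urls versions package_url dependencies; do
    psql "$CHAI_DATABASE_URL" -v ON_ERROR_STOP=1 -f "$dir/$f.sql"
  done
}

load_generated_sql "$DATA_DIR/latest"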