Skip to content

Commit

Permalink
homebrew (#8)
Browse files Browse the repository at this point in the history
* big reorg

* docker builds db and alembic

* alembic and db docker actually builds now

also, migrations run

* reloaded the crates stuff, and fixed imports

* crates image is building and running

* crates works

* wait on homebrew

* monitor service is building

* env vars for crates

* monitor service is running

* delete some old stuff

* the scheduler class

* remove old scheduler

* crates working with docker compose

* jq formulae for homebrew

* fix packages script

* start of orchestrator for homebrew...that works!

* jq transforms

* volume mapping, docker compose

* fix crates import

* lint on db.py

* config for crates

* Dockerfile?

* docker compose yml todo

* we've got csvs!

* everything but load

* cleanup

* inserting packages works!

* all homebrew except dependencies

* correctly create homebrew package manager row

* yep, much faster

* crates fix to create source and package manager

* remove sed

* config changes to load dependency types

* create the data types upfront

* get homebrew env vars

* remove python

* replace sed with jq

* jq corrections, pipeline.sh fixes, dockerfile

* pipeline.sh improvements

* some cleanups

---------

Co-authored-by: Jacob Heider <[email protected]>
  • Loading branch information
sanchitram1 and jhheider authored Oct 21, 2024
1 parent 189e43c commit cffd6fb
Show file tree
Hide file tree
Showing 37 changed files with 692 additions and 403 deletions.
6 changes: 4 additions & 2 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
# directories
data/
db/
.venv/

# other files
.gitignore
docker-compose.yml
docker-compose.yml
.DS_Store
.git
README.md
4 changes: 2 additions & 2 deletions alembic/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ RUN apt -y install python3-psycopg2
RUN apt -y install python3-sqlalchemy python3-sqlalchemy-ext
COPY . .
WORKDIR /alembic
RUN chmod +x run_migrations.sh
ENTRYPOINT ["/alembic/run_migrations.sh"]
RUN chmod +x /alembic/run_migrations.sh
ENTRYPOINT ["/alembic/run_migrations.sh"]
12 changes: 3 additions & 9 deletions alembic/env.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
import os
from logging.config import fileConfig

from sqlalchemy import engine_from_config, pool

from alembic import context
from src.pipeline.models import Base
from sqlalchemy import engine_from_config, pool
from core.models import Base

# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
Expand Down Expand Up @@ -40,7 +39,6 @@ def run_migrations_offline() -> None:
target_metadata=target_metadata,
literal_binds=True,
dialect_opts={"paramstyle": "named"},
compare_server_default=True,
)

with context.begin_transaction():
Expand All @@ -60,11 +58,7 @@ def run_migrations_online() -> None:
)

with connectable.connect() as connection:
context.configure(
connection=connection,
target_metadata=target_metadata,
compare_server_default=True,
)
context.configure(connection=connection, target_metadata=target_metadata)

with context.begin_transaction():
context.run_migrations()
Expand Down
26 changes: 26 additions & 0 deletions alembic/load-values.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
-- Seed data for the lookup tables. This script is executed by
-- run_migrations.sh after every successful migration run, so each statement
-- must be safe to run repeatedly.

-- url types
INSERT INTO "url_types" ("name")
VALUES ('source'), ('homepage'), ('documentation'), ('repository')
ON CONFLICT (name) DO NOTHING;

-- dependency types
INSERT INTO "depends_on_types" ("name")
VALUES
('build'),
('development'),
('runtime'),
('test'),
('optional'),
('recommended'),
('uses_from_macos')
ON CONFLICT (name) DO NOTHING;

-- sources
INSERT INTO "sources" ("type")
VALUES ('crates'), ('npm'), ('pypi'), ('rubygems'), ('github'), ('homebrew')
ON CONFLICT (type) DO NOTHING;

-- package managers: one row per source listed below.
-- "package_managers"."source_id" carries a unique constraint
-- (uq_package_managers_source_id), so a re-run must skip rows that already
-- exist instead of failing with a unique violation.
INSERT INTO "package_managers" ("source_id")
SELECT id
FROM "sources"
WHERE "type" IN ('crates', 'npm', 'pypi', 'rubygems', 'github', 'homebrew')
ON CONFLICT (source_id) DO NOTHING;
3 changes: 3 additions & 0 deletions alembic/run_migrations.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,6 @@ else
echo "migrations failed"
exit 1
fi

# load values
psql -U postgres -h db -d chai -f load-values.sql -a
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""package managers should be unique
Revision ID: 38cc41599874
Revises: 2481138a729a
Create Date: 2024-10-21 08:03:43.647535
"""

from typing import Sequence, Union

from alembic import op

# revision identifiers, used by Alembic.
revision: str = "38cc41599874"
down_revision: Union[str, None] = "2481138a729a"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Add a unique constraint on ``package_managers.source_id``."""
    constraint_name = op.f("uq_package_managers_source_id")
    op.create_unique_constraint(constraint_name, "package_managers", ["source_id"])


def downgrade() -> None:
    """Drop the unique constraint on ``package_managers.source_id``."""
    constraint_name = op.f("uq_package_managers_source_id")
    op.drop_constraint(constraint_name, "package_managers", type_="unique")
127 changes: 127 additions & 0 deletions core/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
from dataclasses import dataclass
from os import getenv

from core.db import DB
from core.logger import Logger
from core.structs import (
DependencyTypes,
PackageManager,
PackageManagerIDs,
Sources,
URLTypes,
UserTypes,
)

# Module-level logger, constructed with the name "config".
logger = Logger("config")

def _env_flag(name: str, default: str) -> bool:
    """Return True when env var *name* (or *default*) equals "true", ignoring case."""
    return getenv(name, default).lower() == "true"


# TEST defaults to off, FETCH defaults to on.
TEST = _env_flag("TEST", "false")
FETCH = _env_flag("FETCH", "true")


@dataclass
class Config:
    """Settings bundle handed to a package-manager pipeline.

    Instances are built by ``initialize`` in this module.
    """

    # Location of the upstream data for the package manager (a URL in
    # ``load_sources``).
    file_location: str
    test: bool
    fetch: bool
    # ``id`` of the package manager's row, as returned by the DB lookup.
    package_manager_id: str
    # ID lookups for the URL / user / dependency type tables.
    url_types: URLTypes
    user_types: UserTypes
    dependency_types: DependencyTypes

    def __str__(self) -> str:
        """Return a single-line, human-readable dump of every field."""
        # Adjacent f-string literals are joined at compile time. Unlike a
        # backslash continuation inside one literal, source indentation can
        # never leak into the rendered text.
        return (
            f"Config(file_location={self.file_location}, test={self.test}, "
            f"fetch={self.fetch}, package_manager_id={self.package_manager_id}, "
            f"url_types={self.url_types}, user_types={self.user_types}, "
            f"dependency_types={self.dependency_types})"
        )


def load_url_types(db: DB) -> URLTypes:
    """Fetch the four URL type rows (creating missing ones) and collect their ids."""
    logger.debug("loading url types, and creating if not exists")
    # Lookup order is kept as homepage, repository, documentation, source.
    selectors = (
        ("homepage", db.select_url_types_homepage),
        ("repository", db.select_url_types_repository),
        ("documentation", db.select_url_types_documentation),
        ("source", db.select_url_types_source),
    )
    rows = [(field, select(create=True)) for field, select in selectors]
    return URLTypes(**{field: row.id for field, row in rows})


def load_user_types(db: DB) -> UserTypes:
    """Fetch the "crates" and "github" source rows (creating them if absent).

    Returns a ``UserTypes`` holding the ids of those two sources.
    """
    logger.debug("loading user types, and creating if not exists")
    crates_row, github_row = [
        db.select_source_by_name(source_name, create=True)
        for source_name in ("crates", "github")
    ]
    return UserTypes(crates=crates_row.id, github=github_row.id)


def load_package_manager_ids(db: DB) -> PackageManagerIDs:
    """Map each supported ``PackageManager`` member to the id of its DB row.

    Rows for "crates" and "homebrew" are looked up with ``create=True``.
    """
    logger.debug("loading package manager ids, and creating if not exists")
    crates_row = db.select_package_manager_by_name("crates", create=True)
    homebrew_row = db.select_package_manager_by_name("homebrew", create=True)

    ids = {}
    ids[PackageManager.CRATES] = crates_row.id
    ids[PackageManager.HOMEBREW] = homebrew_row.id
    return ids


def load_dependency_types(db: DB) -> DependencyTypes:
    """Fetch the dependency type rows (creating missing ones) and collect their ids."""
    logger.debug("loading dependency types, and creating if not exists")
    # Each name is both the value looked up in the DB and the matching
    # DependencyTypes field; lookups run in this order.
    type_names = (
        "build",
        "development",
        "runtime",
        "test",
        "optional",
        "recommended",
    )
    rows = {
        type_name: db.select_dependency_type_by_name(type_name, create=True)
        for type_name in type_names
    }
    return DependencyTypes(**{type_name: row.id for type_name, row in rows.items()})


def load_sources() -> Sources:
    """Return the upstream data location for each supported package manager."""
    crates_dump = "https://static.crates.io/db-dump.tar.gz"
    homebrew_formulae = "https://github.com/Homebrew/homebrew-core/tree/master/Formula"
    locations = {}
    locations[PackageManager.CRATES] = crates_dump
    locations[PackageManager.HOMEBREW] = homebrew_formulae
    return locations


def initialize(package_manager: PackageManager, db: DB) -> Config:
    """Build the ``Config`` for one package manager.

    Loads (and creates where missing) the URL types, user types, package
    manager ids and dependency types from ``db``, then combines them with the
    package manager's source location.

    Args:
        package_manager: The package manager to configure.
        db: Database handle used for the lookups.

    Returns:
        A populated ``Config``.

    Raises:
        ValueError: If ``package_manager`` has no configured source or no
            package manager id. Previously this case fell through and
            returned ``None`` despite the ``Config`` return annotation.
    """
    url_types = load_url_types(db)
    user_types = load_user_types(db)
    package_manager_ids = load_package_manager_ids(db)
    dependency_types = load_dependency_types(db)
    sources = load_sources()

    if package_manager not in sources or package_manager not in package_manager_ids:
        raise ValueError(f"unsupported package manager: {package_manager}")

    # NOTE(review): test/fetch are fixed here and do not read the module-level
    # TEST / FETCH flags - confirm whether that is intentional.
    return Config(
        file_location=sources[package_manager],
        test=False,
        fetch=True,
        package_manager_id=package_manager_ids[package_manager],
        url_types=url_types,
        user_types=user_types,
        dependency_types=dependency_types,
    )
38 changes: 33 additions & 5 deletions src/pipeline/utils/pg.py → core/db.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
import os
from typing import Any, Dict, Iterable, List, Type
from src.pipeline.utils.utils import build_query_params

from sqlalchemy import UUID, create_engine
from sqlalchemy.dialects import postgresql
from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm.decl_api import DeclarativeMeta
from src.pipeline.models import (

from core.logger import Logger
from core.models import (
URL,
DependsOn,
DependsOnType,
License,
LoadHistory,
Package,
Expand All @@ -16,12 +20,11 @@
Source,
URLType,
User,
URL,
UserPackage,
UserVersion,
Version,
)
from src.pipeline.utils.logger import Logger
from core.utils import build_query_params

CHAI_DATABASE_URL = os.getenv("CHAI_DATABASE_URL")
DEFAULT_BATCH_SIZE = 10000
Expand Down Expand Up @@ -371,8 +374,13 @@ def process_package_url(item: Dict[str, str]):
PackageURL, self._process_batch(batch, process_package_url)
)

def insert_source(self, name: str) -> UUID:
def insert_source(self, name: str) -> Source:
    """Return the ``Source`` row whose ``type`` is *name*, inserting it if absent.

    When the row already exists a warning is logged and the existing row is
    returned unchanged.
    """
    with self.session() as db_session:
        found = db_session.query(Source).filter_by(type=name).first()
        if not found:
            db_session.add(Source(type=name))
            db_session.commit()
            # Re-read the row after the commit and hand that back.
            return db_session.query(Source).filter_by(type=name).first()

        self.logger.warn(f"Source '{name}' already exists")
        return found
Expand Down Expand Up @@ -419,6 +427,9 @@ def select_url_types_repository(self, create: bool = False) -> URLType | None:
def select_url_types_documentation(self, create: bool = False) -> URLType | None:
    """Look up the "documentation" URL type, forwarding *create* to ``select_url_type``."""
    url_type_name = "documentation"
    return self.select_url_type(url_type_name, create)

def select_url_types_source(self, create: bool = False) -> URLType | None:
    """Look up the "source" URL type, forwarding *create* to ``select_url_type``."""
    url_type_name = "source"
    return self.select_url_type(url_type_name, create)

def select_package_manager_by_name(
self, package_manager: str, create: bool = False
) -> PackageManager | None:
Expand Down Expand Up @@ -514,3 +525,20 @@ def select_packages_by_import_ids(self, iids: Iterable[str]) -> List[Package]:
def select_licenses_by_name(self, names: Iterable[str]) -> List[License]:
    """Return every ``License`` row whose ``name`` is one of *names*."""
    with self.session() as db_session:
        name_matches = License.name.in_(names)
        matching = db_session.query(License).filter(name_matches)
        return matching.all()

def select_dependency_type_by_name(
    self, name: str, create: bool = False
) -> DependsOnType | None:
    """Look up a dependency type by name, optionally creating it.

    Args:
        name: Value of ``DependsOnType.name`` to search for.
        create: When True and no row matches, insert one via
            ``insert_dependency_type`` and return it.

    Returns:
        The matching (or newly inserted) row, or ``None`` when nothing
        matches and ``create`` is False.
    """
    with self.session() as session:
        result = session.query(DependsOnType).filter_by(name=name).first()

    # The lookup session is closed before inserting: insert_dependency_type
    # opens its own session, so there is no need to nest the two.
    if result:
        return result
    if create:
        return self.insert_dependency_type(name)
    return None

def insert_dependency_type(self, name: str) -> DependsOnType:
    """Insert a ``DependsOnType`` row named *name* and return it as re-read from the DB."""
    with self.session() as db_session:
        new_row = DependsOnType(name=name)
        db_session.add(new_row)
        db_session.commit()
        stored = db_session.query(DependsOnType).filter_by(name=name).first()
        return stored
2 changes: 1 addition & 1 deletion src/pipeline/utils/fetcher.py → core/fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from typing import Any

from requests import get
from src.pipeline.utils.logger import Logger
from core.logger import Logger


@dataclass
Expand Down
13 changes: 5 additions & 8 deletions src/pipeline/utils/logger.py → core/logger.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
from os import getenv
import time
import sys
import time
import traceback
from os import getenv

DEBUG = getenv("DEBUG", "false").lower() == "true"

# use inspect to print the line of code as well?
# caller = inspect.currentframe().f_back
# filename = caller.f_code.co_filename, lineno = caller.f_lineno
# Debug mode is on when the DEBUG env var is "true" or "1" (compared in lower case).
debug = getenv("DEBUG", "false").lower()
DEBUG = debug in ("true", "1")


def as_minutes(seconds: float) -> float:
Expand All @@ -25,7 +22,7 @@ def __init__(self, name: str, mode=NORMAL, start=time.time()) -> None:
self.mode = Logger.VERBOSE if DEBUG else mode

def print(self, msg: str):
print(f"{self.time_diff():.2f}: [{self.name}]: {msg}", flush=True)
print(f"{self.time_diff():.2f}: [{self.name}]: {msg}")

def error(self, message):
self.print(f"[ERROR]: {message}")
Expand Down
4 changes: 3 additions & 1 deletion src/pipeline/models/__init__.py → core/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,9 @@ class PackageManager(Base):
default=func.uuid_generate_v4(),
server_default=func.uuid_generate_v4(),
)
source_id = Column(UUID(as_uuid=True), ForeignKey("sources.id"), nullable=False)
source_id = Column(
UUID(as_uuid=True), ForeignKey("sources.id"), nullable=False, unique=True
)
created_at = Column(
DateTime, nullable=False, default=func.now(), server_default=func.now()
)
Expand Down
Loading

0 comments on commit cffd6fb

Please sign in to comment.