Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

homebrew #8

Merged
merged 41 commits into from
Oct 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
724ff74
big reorg
sanchitram1 Oct 14, 2024
94b5f30
docker builds db and alembic
sanchitram1 Oct 14, 2024
697b244
alembic and db docker actually builds now
sanchitram1 Oct 14, 2024
45deced
reloaded the crates stuff, and fixed imports
sanchitram1 Oct 14, 2024
e72e2c6
crates image is building and running
sanchitram1 Oct 15, 2024
7585d51
crates works
sanchitram1 Oct 15, 2024
cbca4df
wait on homebrew
sanchitram1 Oct 15, 2024
ca3d947
monitor service is building
sanchitram1 Oct 15, 2024
d3f3b89
env vars for crates
sanchitram1 Oct 15, 2024
efceae2
monitor service is running
sanchitram1 Oct 15, 2024
f0d283e
delete some old stuff
sanchitram1 Oct 16, 2024
1bc9a77
the scheduler class
sanchitram1 Oct 16, 2024
739eeeb
remove old scheduler
sanchitram1 Oct 16, 2024
eecee85
crates working with docker compose
sanchitram1 Oct 18, 2024
f8bd307
jq formulae for homebrew
sanchitram1 Oct 18, 2024
977adf3
fix packages script
sanchitram1 Oct 18, 2024
c26d617
start of orchestrator for homebrew...that works!
sanchitram1 Oct 18, 2024
b9dfbaf
jq transforms
sanchitram1 Oct 18, 2024
745dd22
volume mapping, docker compose
sanchitram1 Oct 18, 2024
55d9614
fix crates import
sanchitram1 Oct 18, 2024
2e1066f
lint on db.py
sanchitram1 Oct 18, 2024
7c103b6
config for crates
sanchitram1 Oct 18, 2024
c9d8fd7
Dockerfile?
sanchitram1 Oct 18, 2024
8b5fcdd
docker compose yml todo
sanchitram1 Oct 18, 2024
1ff756b
we've got csvs!
sanchitram1 Oct 18, 2024
51c5b06
everything but load
sanchitram1 Oct 18, 2024
705949a
cleanup
sanchitram1 Oct 18, 2024
45bb387
inserting packages works!
sanchitram1 Oct 18, 2024
e8630b2
all homebrew except dependencies
sanchitram1 Oct 18, 2024
de82b36
correctly create homebrew package manager row
sanchitram1 Oct 18, 2024
1f9c2cc
yep, much faster
sanchitram1 Oct 18, 2024
29e4853
crates fix to create source and package manager
sanchitram1 Oct 18, 2024
1d0791a
remove sed
sanchitram1 Oct 21, 2024
a5ab8b3
config changes to load dependency types
sanchitram1 Oct 21, 2024
7832599
create the data types upfront
sanchitram1 Oct 21, 2024
7ca5710
get homebrew env vars
sanchitram1 Oct 21, 2024
824aeab
remove python
sanchitram1 Oct 21, 2024
e349474
replace sed with jq
sanchitram1 Oct 21, 2024
5709607
jq corrections, pipeline.sh fixes, dockerfile
sanchitram1 Oct 21, 2024
4aee155
pipeline.sh improvements
jhheider Oct 21, 2024
a56b18b
some cleanups
sanchitram1 Oct 21, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
# directories
data/
db/
.venv/

# other files
.gitignore
docker-compose.yml
docker-compose.yml
.DS_Store
.git
README.md
4 changes: 2 additions & 2 deletions alembic/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ RUN apt -y install python3-psycopg2
RUN apt -y install python3-sqlalchemy python3-sqlalchemy-ext
COPY . .
WORKDIR /alembic
RUN chmod +x run_migrations.sh
ENTRYPOINT ["/alembic/run_migrations.sh"]
RUN chmod +x /alembic/run_migrations.sh
ENTRYPOINT ["/alembic/run_migrations.sh"]
12 changes: 3 additions & 9 deletions alembic/env.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
import os
from logging.config import fileConfig

from sqlalchemy import engine_from_config, pool

from alembic import context
from src.pipeline.models import Base
from sqlalchemy import engine_from_config, pool
from core.models import Base

# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
Expand Down Expand Up @@ -40,7 +39,6 @@ def run_migrations_offline() -> None:
target_metadata=target_metadata,
literal_binds=True,
dialect_opts={"paramstyle": "named"},
compare_server_default=True,
)

with context.begin_transaction():
Expand All @@ -60,11 +58,7 @@ def run_migrations_online() -> None:
)

with connectable.connect() as connection:
context.configure(
connection=connection,
target_metadata=target_metadata,
compare_server_default=True,
)
context.configure(connection=connection, target_metadata=target_metadata)

with context.begin_transaction():
context.run_migrations()
Expand Down
26 changes: 26 additions & 0 deletions alembic/load-values.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
-- Seed data loaded after migrations. Every statement is written so that the
-- script can be run repeatedly without failing or duplicating rows.

-- url types
INSERT INTO "url_types" ("name")
VALUES ('source'), ('homepage'), ('documentation'), ('repository')
ON CONFLICT (name) DO NOTHING;

-- dependency types
INSERT INTO "depends_on_types" ("name")
VALUES
('build'),
('development'),
('runtime'),
('test'),
('optional'),
('recommended'),
('uses_from_macos')
ON CONFLICT (name) DO NOTHING;

-- sources
INSERT INTO "sources" ("type")
VALUES ('crates'), ('npm'), ('pypi'), ('rubygems'), ('github'), ('homebrew')
ON CONFLICT (type) DO NOTHING;

-- package managers: one row per source.
-- "package_managers"."source_id" carries a unique constraint, so sources that
-- already have a package manager row are skipped instead of raising a
-- unique-violation error on the second and later runs.
INSERT INTO "package_managers" ("source_id")
SELECT id
FROM "sources"
WHERE "type" IN ('crates', 'npm', 'pypi', 'rubygems', 'github', 'homebrew')
ON CONFLICT (source_id) DO NOTHING;
3 changes: 3 additions & 0 deletions alembic/run_migrations.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,6 @@ else
echo "migrations failed"
exit 1
fi

# load values
psql -U postgres -h db -d chai -f load-values.sql -a
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""package managers should be unique

Revision ID: 38cc41599874
Revises: 2481138a729a
Create Date: 2024-10-21 08:03:43.647535

"""

from typing import Sequence, Union

from alembic import op

# revision identifiers, used by Alembic.
revision: str = "38cc41599874"
down_revision: Union[str, None] = "2481138a729a"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Add a unique constraint on ``package_managers.source_id``."""
    constraint_name = op.f("uq_package_managers_source_id")
    op.create_unique_constraint(constraint_name, "package_managers", ["source_id"])


def downgrade() -> None:
    """Drop the unique constraint on ``package_managers.source_id``."""
    constraint_name = op.f("uq_package_managers_source_id")
    op.drop_constraint(constraint_name, "package_managers", type_="unique")
127 changes: 127 additions & 0 deletions core/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
from dataclasses import dataclass
from os import getenv

from core.db import DB
from core.logger import Logger
from core.structs import (
DependencyTypes,
PackageManager,
PackageManagerIDs,
Sources,
URLTypes,
UserTypes,
)

logger = Logger("config")

TEST = getenv("TEST", "false").lower() == "true"
FETCH = getenv("FETCH", "true").lower() == "true"


@dataclass
class Config:
    """Settings bundle for one package manager, as built by ``initialize``."""

    # location of the package manager's upstream data (see load_sources)
    file_location: str
    test: bool
    fetch: bool
    # id of the package manager's row, as returned by the DB layer
    package_manager_id: str
    # ids of the url / user / dependency type rows
    url_types: URLTypes
    user_types: UserTypes
    dependency_types: DependencyTypes

    def __str__(self):
        """Return a single-line, human-readable rendering of every field."""
        # Adjacent f-strings are concatenated at compile time. A backslash
        # continuation inside one literal would instead pull the source
        # indentation of the following lines into the output.
        return (
            f"Config(file_location={self.file_location}, test={self.test}, "
            f"fetch={self.fetch}, package_manager_id={self.package_manager_id}, "
            f"url_types={self.url_types}, user_types={self.user_types}, "
            f"dependency_types={self.dependency_types})"
        )


def load_url_types(db: DB) -> URLTypes:
    """Collect the ids of the homepage, repository, documentation and source
    URL types, asking the DB layer to create any that do not exist yet.
    """
    logger.debug("loading url types, and creating if not exists")
    # Resolve every row first (in field order), then read the ids.
    rows = {
        "homepage": db.select_url_types_homepage(create=True),
        "repository": db.select_url_types_repository(create=True),
        "documentation": db.select_url_types_documentation(create=True),
        "source": db.select_url_types_source(create=True),
    }
    return URLTypes(**{field: row.id for field, row in rows.items()})


def load_user_types(db: DB) -> UserTypes:
    """Collect the ids of the "crates" and "github" sources, asking the DB
    layer to create either one if it does not exist yet.
    """
    logger.debug("loading user types, and creating if not exists")
    # Both lookups complete during unpacking, before any id is read.
    crates, github = (
        db.select_source_by_name(source_name, create=True)
        for source_name in ("crates", "github")
    )
    return UserTypes(crates=crates.id, github=github.id)


def load_package_manager_ids(db: DB) -> PackageManagerIDs:
    """Map each supported PackageManager member to the id of its package
    manager row, asking the DB layer to create missing rows.
    """
    logger.debug("loading package manager ids, and creating if not exists")
    managers = (
        (PackageManager.CRATES, "crates"),
        (PackageManager.HOMEBREW, "homebrew"),
    )
    # Look up every row before reading any id.
    rows = [
        (manager, db.select_package_manager_by_name(name, create=True))
        for manager, name in managers
    ]
    return {manager: row.id for manager, row in rows}


def load_dependency_types(db: DB) -> DependencyTypes:
    """Collect the ids of the build, development, runtime, test, optional and
    recommended dependency types, creating any that do not exist yet.
    """
    logger.debug("loading dependency types, and creating if not exists")
    # Each DependencyTypes field shares its name with the stored type name.
    names = ("build", "development", "runtime", "test", "optional", "recommended")
    rows = {
        name: db.select_dependency_type_by_name(name, create=True) for name in names
    }
    return DependencyTypes(**{name: row.id for name, row in rows.items()})


def load_sources() -> Sources:
    """Map each supported PackageManager member to the URL of its data."""
    crates_url = "https://static.crates.io/db-dump.tar.gz"
    homebrew_url = "https://github.com/Homebrew/homebrew-core/tree/master/Formula"
    return {
        PackageManager.CRATES: crates_url,
        PackageManager.HOMEBREW: homebrew_url,
    }


def initialize(package_manager: PackageManager, db: DB) -> Config:
    """Build the Config for one package manager.

    Args:
        package_manager: The package manager to configure.
        db: Database handle used to look up (and create, when missing) the
            url type, user type, package manager and dependency type rows.

    Returns:
        The populated Config. ``test`` and ``fetch`` are taken from the
        module-level TEST and FETCH flags, which are read from the
        environment and default to False and True respectively.

    Raises:
        ValueError: If ``package_manager`` has no entry in ``load_sources()``.
    """
    sources = load_sources()
    # Fail loudly rather than falling off the end and returning None, which
    # would contradict the declared return type.
    if package_manager not in sources:
        raise ValueError(f"unsupported package manager: {package_manager}")

    url_types = load_url_types(db)
    user_types = load_user_types(db)
    package_manager_ids = load_package_manager_ids(db)
    dependency_types = load_dependency_types(db)

    # Every supported manager is configured the same way; only the source
    # location and the package manager id depend on which one was requested.
    return Config(
        file_location=sources[package_manager],
        test=TEST,
        fetch=FETCH,
        package_manager_id=package_manager_ids[package_manager],
        url_types=url_types,
        user_types=user_types,
        dependency_types=dependency_types,
    )
38 changes: 33 additions & 5 deletions src/pipeline/utils/pg.py → core/db.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
import os
from typing import Any, Dict, Iterable, List, Type
from src.pipeline.utils.utils import build_query_params

from sqlalchemy import UUID, create_engine
from sqlalchemy.dialects import postgresql
from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm.decl_api import DeclarativeMeta
from src.pipeline.models import (

from core.logger import Logger
from core.models import (
URL,
DependsOn,
DependsOnType,
License,
LoadHistory,
Package,
Expand All @@ -16,12 +20,11 @@
Source,
URLType,
User,
URL,
UserPackage,
UserVersion,
Version,
)
from src.pipeline.utils.logger import Logger
from core.utils import build_query_params

CHAI_DATABASE_URL = os.getenv("CHAI_DATABASE_URL")
DEFAULT_BATCH_SIZE = 10000
Expand Down Expand Up @@ -371,8 +374,13 @@ def process_package_url(item: Dict[str, str]):
PackageURL, self._process_batch(batch, process_package_url)
)

def insert_source(self, name: str) -> UUID:
def insert_source(self, name: str) -> Source:
    """Return the Source whose type is *name*, inserting it first if absent.

    A warning is logged when the source already exists.
    """
    with self.session() as session:
        found = session.query(Source).filter_by(type=name).first()
        if not found:
            session.add(Source(type=name))
            session.commit()
            # Read the row back so the returned object reflects what was stored.
            return session.query(Source).filter_by(type=name).first()

        self.logger.warn(f"Source '{name}' already exists")
        return found
Expand Down Expand Up @@ -419,6 +427,9 @@ def select_url_types_repository(self, create: bool = False) -> URLType | None:
def select_url_types_documentation(self, create: bool = False) -> URLType | None:
return self.select_url_type("documentation", create)

def select_url_types_source(self, create: bool = False) -> URLType | None:
    """Look up the "source" URL type via select_url_type, forwarding *create*."""
    url_type = self.select_url_type("source", create)
    return url_type

def select_package_manager_by_name(
self, package_manager: str, create: bool = False
) -> PackageManager | None:
Expand Down Expand Up @@ -514,3 +525,20 @@ def select_packages_by_import_ids(self, iids: Iterable[str]) -> List[Package]:
def select_licenses_by_name(self, names: Iterable[str]) -> List[License]:
with self.session() as session:
return session.query(License).filter(License.name.in_(names)).all()

def select_dependency_type_by_name(
    self, name: str, create: bool = False
) -> DependsOnType | None:
    """Look up a dependency type by name.

    Args:
        name: Name of the dependency type to find.
        create: When True, insert the type if no row with that name exists.

    Returns:
        The matching DependsOnType, the newly inserted one when ``create`` is
        True, or None when nothing matches and ``create`` is False.
    """
    with self.session() as session:
        result = session.query(DependsOnType).filter_by(name=name).first()

    if result is not None:
        return result
    if not create:
        return None
    # The lookup session is closed by now; insert_dependency_type opens its own.
    return self.insert_dependency_type(name)

def insert_dependency_type(self, name: str) -> DependsOnType:
    """Insert a dependency type called *name* and return the row read back by name."""
    with self.session() as session:
        new_type = DependsOnType(name=name)
        session.add(new_type)
        session.commit()
        # Query again after the commit, as the original does, and return that row.
        stored = session.query(DependsOnType).filter_by(name=name).first()
    return stored
2 changes: 1 addition & 1 deletion src/pipeline/utils/fetcher.py → core/fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from typing import Any

from requests import get
from src.pipeline.utils.logger import Logger
from core.logger import Logger


@dataclass
Expand Down
13 changes: 5 additions & 8 deletions src/pipeline/utils/logger.py → core/logger.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
from os import getenv
import time
import sys
import time
import traceback
from os import getenv

DEBUG = getenv("DEBUG", "false").lower() == "true"

# use inspect to print the line of code as well?
# caller = inspect.currentframe().f_back
# filename = caller.f_code.co_filename, lineno = caller.f_lineno
debug = getenv("DEBUG", "false").lower()
DEBUG = debug == "true" or debug == "1"


def as_minutes(seconds: float) -> float:
Expand All @@ -25,7 +22,7 @@ def __init__(self, name: str, mode=NORMAL, start=time.time()) -> None:
self.mode = Logger.VERBOSE if DEBUG else mode

def print(self, msg: str):
print(f"{self.time_diff():.2f}: [{self.name}]: {msg}", flush=True)
print(f"{self.time_diff():.2f}: [{self.name}]: {msg}")

def error(self, message):
self.print(f"[ERROR]: {message}")
Expand Down
4 changes: 3 additions & 1 deletion src/pipeline/models/__init__.py → core/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,9 @@ class PackageManager(Base):
default=func.uuid_generate_v4(),
server_default=func.uuid_generate_v4(),
)
source_id = Column(UUID(as_uuid=True), ForeignKey("sources.id"), nullable=False)
source_id = Column(
UUID(as_uuid=True), ForeignKey("sources.id"), nullable=False, unique=True
)
created_at = Column(
DateTime, nullable=False, default=func.now(), server_default=func.now()
)
Expand Down
Loading
Loading