Skip to content

Commit

Permalink
[wip]
Browse files Browse the repository at this point in the history
  • Loading branch information
jonavellecuerdo committed Nov 5, 2024
1 parent fd3a0c8 commit d85185a
Show file tree
Hide file tree
Showing 9 changed files with 514 additions and 323 deletions.
8 changes: 8 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
SHELL=/bin/bash
DATETIME:=$(shell date -u +%Y%m%dT%H%M%SZ)
MINIO_COMPOSE_FILE=abdiff/helpers/minio/docker-compose.yaml

help: # Preview Makefile commands
@awk 'BEGIN { FS = ":.*#"; print "Usage: make <target>\n\nTargets:" } \
Expand Down Expand Up @@ -54,3 +55,10 @@ black-apply: # Apply changes with 'black'

ruff-apply: # Resolve 'fixable errors' with 'ruff'
pipenv run ruff check --fix .

# Development commands

start-minio-server: # Launch the local MinIO server container (detached)
	docker compose -f $(MINIO_COMPOSE_FILE) up -d

stop-minio-server: # Stop the local MinIO server container (data volume preserved)
	docker compose -f $(MINIO_COMPOSE_FILE) stop
1 change: 1 addition & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ boto3 = "*"

[dev-packages]
black = "*"
boto3-stubs = {version = "*", extras = ["s3"]}
coveralls = "*"
freezegun = "*"
ipython = "*"
Expand Down
704 changes: 381 additions & 323 deletions Pipfile.lock

Large diffs are not rendered by default.

18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,23 @@ Compare transformed TIMDEX records from two versions (A,B) of Transmogrifier.
- To lint the repo: `make lint`
- To run the app: `pipenv run abdiff --help`

### Storing Files in a Local MinIO Server

TIMDEX extract files from S3 (i.e., input files to use in transformations) can be downloaded to a local MinIO server hosted via Docker container. [MinIO is an object storage solution that provides an Amazon Web Services S3-compatible API and supports all core S3 features](https://min.io/docs/minio/kubernetes/upstream/). Downloading extract files improves the runtime of a diff by reducing the number of requests sent to S3 and avoids repeated downloads of extract files.

1. Create an AWS profile `minio`. When prompted for an "AWS Access Key ID" and "AWS Secret Access Key", pass the values set for the `MINIO_ROOT_USER` and `MINIO_ROOT_PASSWORD` environment variables in the Docker Compose YAML file.
```shell
aws configure --profile minio
```

2. Launch a local MinIO server via Docker container: `make start-minio-server`.
The API is accessible at: http://127.0.0.1:9000.
The WebUI is accessible at: http://127.0.0.1:9001.

3. In your browser, navigate to the WebUI and sign in to the local MinIO server using the credentials set in the Docker Compose YAML file.

4. Through the UI, create a bucket in the local MinIO server named after the S3 bucket containing the TIMDEX extract files that will be used in the A/B Diff.

## Concepts

A **Job** in `abdiff` represents the A/B test for comparing the results from two versions of Transmogrifier. When a job is first created, a working directory and a JSON file `job.json` with an initial set of configurations is created.
Expand Down Expand Up @@ -90,6 +107,7 @@ AWS_SESSION_TOKEN=# passed to Transmogrifier containers for use
### Optional

```text
AWS_ENDPOINT_URL=# endpoint for MinIO server API; default is "http://localhost:9000/"
WEBAPP_HOST=# host for flask webapp
WEBAPP_PORT=# port for flask webapp
TRANSMOGRIFIER_MAX_WORKERS=# max number of Transmogrifier containers to run in parallel; default is 6
Expand Down
14 changes: 14 additions & 0 deletions abdiff/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from abdiff.config import Config, configure_logger
from abdiff.core import (
build_ab_images,
download_input_files,
calc_ab_diffs,
calc_ab_metrics,
collate_ab_transforms,
Expand Down Expand Up @@ -180,6 +181,19 @@ def run_diff(job_directory: str, input_files: str, message: str) -> None:
)


@main.command()
@click.option(
    "-i",
    "--input-files",
    type=str,
    required=True,
    help="Comma-separated list of S3 URIs for input files to transform.",
)
def download_files(input_files: str) -> None:
    """Download TIMDEX extract files to the local MinIO server.

    Splits the comma-separated --input-files value into individual S3 URIs
    (stripping surrounding whitespace) and delegates to download_input_files.
    """
    input_files_list = [filepath.strip() for filepath in input_files.split(",")]
    download_input_files(input_files_list)


@main.command()
@click.option(
"-d",
Expand Down
5 changes: 5 additions & 0 deletions abdiff/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ class Config:
"WORKSPACE",
)
OPTIONAL_ENV_VARS = (
"AWS_ENDPOINT_URL",
"WEBAPP_HOST",
"WEBAPP_PORT",
"TRANSMOGRIFIER_MAX_WORKERS",
Expand All @@ -25,6 +26,10 @@ def __getattr__(self, name: str) -> Any: # noqa: ANN401
message = f"'{name}' not a valid configuration variable"
raise AttributeError(message)

@property
def aws_endpoint_url(self) -> str:
return self.AWS_ENDPOINT_URL or "http://localhost:9000/"

@property
def webapp_host(self) -> str:
return self.WEBAPP_HOST or "localhost"
Expand Down
2 changes: 2 additions & 0 deletions abdiff/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from abdiff.core.calc_ab_diffs import calc_ab_diffs
from abdiff.core.calc_ab_metrics import calc_ab_metrics
from abdiff.core.collate_ab_transforms import collate_ab_transforms
from abdiff.core.download_input_files import download_input_files
from abdiff.core.init_job import init_job
from abdiff.core.init_run import init_run
from abdiff.core.run_ab_transforms import run_ab_transforms
Expand All @@ -15,6 +16,7 @@
"init_job",
"init_run",
"build_ab_images",
"download_input_files",
"run_ab_transforms",
"collate_ab_transforms",
"calc_ab_diffs",
Expand Down
62 changes: 62 additions & 0 deletions abdiff/core/download_input_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import logging
import subprocess
from typing import TYPE_CHECKING

import boto3
from botocore.exceptions import ClientError

if TYPE_CHECKING:
from mypy_boto3_s3.client import S3Client

from abdiff.config import Config


logger = logging.getLogger(__name__)

CONFIG = Config()


def download_input_files(input_files: list[str]) -> None:
    """Mirror TIMDEX extract files from S3 into the local MinIO server.

    For each S3 URI in input_files, the file is skipped when an object with
    the same key already exists (per check_object_exists); otherwise the
    object is streamed from S3 to stdout via the AWS CLI and piped into an
    upload against the MinIO endpoint using the 'minio' AWS profile.

    A failure for one file is logged and does not stop the remaining files.

    Args:
        input_files: Full S3 URIs (e.g. "s3://<bucket>/<key>") of extract files.
    """
    s3_client = boto3.client("s3")

    for input_file in input_files:
        # NOTE(review): the existence check uses a default boto3 client against
        # the TIMDEX bucket name; confirm this client resolves to the local
        # MinIO endpoint (e.g. via AWS_ENDPOINT_URL) rather than AWS S3 itself,
        # otherwise already-downloaded files will not be skipped.
        if check_object_exists(CONFIG.TIMDEX_BUCKET, input_file, s3_client):
            continue

        logger.info(f"Downloading input file from {CONFIG.TIMDEX_BUCKET}: {input_file}")
        # "aws s3 cp <uri> -" streams the object to stdout...
        copy_command = ["aws", "s3", "cp", input_file, "-"]
        # ...and "aws s3 cp - <uri>" reads stdin, targeting MinIO via
        # --endpoint-url and the dedicated 'minio' credentials profile.
        upload_command = [
            "aws",
            "s3",
            "cp",
            "--endpoint-url",
            CONFIG.aws_endpoint_url,
            "--profile",
            "minio",
            "-",
            input_file,
        ]

        try:
            copy_process = subprocess.run(
                args=copy_command, check=True, capture_output=True
            )
            subprocess.run(
                args=upload_command,
                check=True,
                input=copy_process.stdout,
            )
        except subprocess.CalledProcessError:
            logger.exception(f"Failed to download input file: {input_file}")


def check_object_exists(bucket: str, input_file: str, s3_client: S3Client) -> bool:
    """Return True if the object for an S3 URI already exists in 'bucket'.

    Args:
        bucket: Name of the bucket to check.
        input_file: Full S3 URI; the "s3://<bucket>/" prefix is stripped
            to derive the object key.
        s3_client: Configured boto3 S3 client.

    Returns False both when the object is missing and when existence cannot
    be determined (the latter is logged), so callers proceed with download.
    """
    key = input_file.replace(f"s3://{bucket}/", "")
    try:
        s3_client.head_object(Bucket=bucket, Key=key)
    except ClientError as exception:
        # HeadObject responses have no body, so botocore reports a missing key
        # as error code "404" rather than "NoSuchKey"; accept both to avoid
        # logging a spurious exception for every not-yet-downloaded file.
        if exception.response["Error"]["Code"] in ("404", "NoSuchKey"):
            return False
        logger.exception(f"Cannot determine if object exists for key {key}.")
        return False
    return True
23 changes: 23 additions & 0 deletions abdiff/helpers/docker-compose.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Settings and configurations that are common for all containers
# NOTE(review): the Makefile's MINIO_COMPOSE_FILE points at
# abdiff/helpers/minio/docker-compose.yaml — confirm this file's location
# matches that path, or 'make start-minio-server' will fail.
x-minio-common: &minio-common
  image: quay.io/minio/minio:RELEASE.2024-10-29T16-01-48Z
  # Serve objects from /mnt/data; admin web console bound to :9001
  command: server --console-address ":9001" /mnt/data
  ports:
    - "9000:9000" # API port
    - "9001:9001" # Console port
  environment:
    # NOTE(review): default credentials for local development only — the README
    # instructs creating an AWS 'minio' profile with these same values.
    MINIO_ROOT_USER: minioadmin
    MINIO_ROOT_PASSWORD: minioadmin
  healthcheck:
    # 'mc ready local' reports when the server can accept requests
    test: ["CMD", "mc", "ready", "local"]
    interval: 5s
    timeout: 5s
    retries: 5

services:
  minio:
    <<: *minio-common
    volumes:
      # TODO: env var for absolute path
      - ./data:/mnt/data

0 comments on commit d85185a

Please sign in to comment.