Skip to content

Commit

Permalink
Upgrade schedule validator and save version as metadata (#1729)
Browse files Browse the repository at this point in the history
* update to v3 validator, fix dockerfile

* finally deploy the schedule validator image through github actions

* bring in latest calitp

* use new calitp, simplify metadata, add version to notice rows, couple qol improvements

* change flag per v3

* use poetry export install here too

* lock

* export install here too

* add verbose, just copy jar instead of download

* use environ directly
  • Loading branch information
atvaccaro authored Sep 2, 2022
1 parent 076f43a commit 592921d
Show file tree
Hide file tree
Showing 8 changed files with 128 additions and 90 deletions.
28 changes: 28 additions & 0 deletions .github/workflows/build-gtfs-schedule-validator.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Builds the gtfs-schedule-validator Docker image and pushes it to the
# GitHub Container Registry (ghcr.io).
name: Build and push gtfs-schedule-validator image

# Trigger only on pushes to main that touch this workflow file or the
# validator job's own sources.
on:
  push:
    branches:
      - 'main'
    paths:
      - '.github/workflows/build-gtfs-schedule-validator.yml'
      - 'jobs/gtfs-schedule-validator/**'

jobs:
  build_push:
    name: Package docker image
    runs-on: ubuntu-18.04
    steps:
      - uses: actions/checkout@v2
      # Authenticate against ghcr.io with the workflow's own token so the
      # build step below is allowed to push.
      - name: Login to GitHub Container Registry
        uses: docker/login-action@v1
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      # Build from the job directory and publish under the `latest` tag.
      - name: Build and push
        uses: docker/build-push-action@v2
        with:
          context: jobs/gtfs-schedule-validator
          push: true
          tags: ghcr.io/${{github.repository}}/gtfs-schedule-validator:latest
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ arguments:
- "gtfs_schedule_validator"
- "validate-day"
- "{{ ds }}"
- "--verbose"

is_delete_operator_pod: true
get_logs: true
Expand Down
4 changes: 2 additions & 2 deletions jobs/gtfs-rt-parser-v2/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ WORKDIR /app

COPY ./pyproject.toml /app/pyproject.toml
COPY ./poetry.lock /app/poetry.lock
RUN poetry config virtualenvs.create false
RUN poetry install
RUN poetry export -f requirements.txt --without-hashes --output requirements.txt \
&& pip install -r requirements.txt
RUN pip install memray

COPY . /app
Expand Down
19 changes: 9 additions & 10 deletions jobs/gtfs-schedule-validator/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,23 @@ FROM openjdk:11

LABEL org.opencontainers.image.source https://github.com/cal-itp/data-infra

ENV GTFS_SCHEDULE_VALIDATOR_JAR=/gtfs-validator-v2.0.0_cli
ENV GTFS_SCHEDULE_VALIDATOR_VERSION=v2.0.0

RUN apt-get update -y \
&& apt-get install -y python3 python3-pip
&& apt-get install -y python3 python3-pip python3-venv

RUN curl -sSL https://raw.githubusercontent.com/python-poetry/poetry/master/get-poetry.py | python3 -
ENV PATH="${PATH}:/root/.poetry/bin"
RUN curl -sSL https://raw.githubusercontent.com/python-poetry/poetry/master/install-poetry.py | python3 -
ENV PATH="/root/.local/bin:${PATH}"

# formerly the "1.0.0-SNAPSHOT" from S3
COPY ./gtfs-validator-v2.0.0_cli.jar ${GTFS_SCHEDULE_VALIDATOR_JAR}
ENV GTFS_SCHEDULE_VALIDATOR_JAR=/gtfs-validator-3.1.1-cli.jar
ENV GTFS_SCHEDULE_VALIDATOR_VERSION=v3.1.1
# from https://github.com/MobilityData/gtfs-validator/releases/download/v3.1.1/gtfs-validator-3.1.1-cli.jar
COPY ./gtfs-validator-3.1.1-cli.jar ${GTFS_SCHEDULE_VALIDATOR_JAR}

WORKDIR /app

COPY ./pyproject.toml /app/pyproject.toml
COPY ./poetry.lock /app/poetry.lock
RUN poetry config virtualenvs.create false
RUN poetry install
RUN poetry export -f requirements.txt --without-hashes --output requirements.txt \
&& pip install -r requirements.txt
RUN pip install memray

COPY . /app
Expand Down
Binary file not shown.
73 changes: 41 additions & 32 deletions jobs/gtfs-schedule-validator/gtfs_schedule_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import typer
from calitp.storage import (
fetch_all_in_partition,
GTFSFeedExtractInfo,
GTFSScheduleFeedExtract,
get_fs,
GTFSFeedType,
JSONL_GZIP_EXTENSION,
Expand All @@ -24,15 +24,16 @@
JSONL_EXTENSION,
SCHEDULE_RAW_BUCKET,
)
from pydantic import Field, validator
from pydantic import validator

JAVA_EXECUTABLE_PATH_KEY = "GTFS_SCHEDULE_VALIDATOR_JAVA_EXECUTABLE"
SCHEDULE_VALIDATOR_JAR_LOCATION_ENV_KEY = "GTFS_SCHEDULE_VALIDATOR_JAR"
JAR_DEFAULT = typer.Option(
default=os.environ.get(SCHEDULE_VALIDATOR_JAR_LOCATION_ENV_KEY),
help="Path to the GTFS Schedule Validator JAR",
)
SCHEDULE_VALIDATION_BUCKET = os.getenv("CALITP_BUCKET__GTFS_SCHEDULE_VALIDATION")
SCHEDULE_VALIDATION_BUCKET = os.environ["CALITP_BUCKET__GTFS_SCHEDULE_VALIDATION"]
GTFS_VALIDATOR_VERSION = os.environ["GTFS_SCHEDULE_VALIDATOR_VERSION"]

app = typer.Typer()
logging.basicConfig()
Expand All @@ -42,34 +43,25 @@
# similar to the extracts sharing some functionality
class GTFSScheduleFeedValidation(PartitionedGCSArtifact):
bucket: ClassVar[str] = SCHEDULE_VALIDATION_BUCKET
table: ClassVar[str] = "validation_reports"
extract: GTFSFeedExtractInfo = Field(..., exclude={"config"})
partition_names: ClassVar[List[str]] = GTFSScheduleFeedExtract.partition_names
table: ClassVar[str] = "validation_notices"
ts: pendulum.DateTime
base64_url: str
extract_path: str
system_errors: Dict

@validator("filename", allow_reuse=True)
def is_jsonl_gz(cls, v):
assert v.endswith(JSONL_GZIP_EXTENSION)
return v

@property
def partition_names(self) -> List[str]:
return self.extract.partition_names

@property
def dt(self) -> pendulum.Date:
return self.extract.ts.date()

@property
def base64_url(self) -> str:
return self.extract.config.base64_encoded_url

@property
def ts(self) -> pendulum.DateTime:
return self.extract.ts
return self.ts.date()


class GTFSScheduleFeedExtractValidationOutcome(ProcessingOutcome):
extract: GTFSFeedExtractInfo = Field(..., exclude={"config"})
extract_path: str
validation: Optional[GTFSScheduleFeedValidation]


Expand Down Expand Up @@ -106,11 +98,9 @@ def save(self, fs):


def execute_schedule_validator(
fs,
zip_path: Path,
output_dir: Path,
jar_path: Path = os.environ.get(SCHEDULE_VALIDATOR_JAR_LOCATION_ENV_KEY),
verbose=False,
) -> (Dict, Dict):
if not isinstance(zip_path, Path):
raise TypeError("must provide a path to the zip file")
Expand All @@ -123,7 +113,7 @@ def execute_schedule_validator(
str(zip_path),
"--output_base",
str(output_dir),
"--feed_name",
"--country_code",
"us-na",
]

Expand All @@ -134,7 +124,8 @@ def execute_schedule_validator(
subprocess.run(
args,
capture_output=True,
).check_returncode()
check=True,
)

with open(report_path) as f:
report = json.load(f)
Expand Down Expand Up @@ -169,18 +160,19 @@ def validate_day(
help="The date of data to validate.",
formats=["%Y-%m-%d"],
),
verbose: bool = False,
) -> None:
day = pendulum.instance(day).date()

extracts: List[GTFSFeedExtractInfo] = fetch_all_in_partition(
cls=GTFSFeedExtractInfo,
extracts: List[GTFSScheduleFeedExtract] = fetch_all_in_partition(
cls=GTFSScheduleFeedExtract,
bucket=SCHEDULE_RAW_BUCKET,
table=GTFSFeedType.schedule,
fs=get_fs(),
partitions={
"dt": day,
},
verbose=True,
verbose=verbose,
)

if not extracts:
Expand All @@ -197,7 +189,7 @@ def validate_day(
fs = get_fs()
outcomes = []

for i, extract in enumerate(extracts):
for i, extract in enumerate(extracts, start=1):
typer.secho(f"processing {i} of {len(extracts)}")
try:
with tempfile.TemporaryDirectory() as tmp_dir:
Expand All @@ -208,16 +200,28 @@ def validate_day(
)
fs.get_file(extract.path, zip_path)
report, system_errors = execute_schedule_validator(
fs=fs,
zip_path=Path(zip_path),
output_dir=tmp_dir,
)
validation = GTFSScheduleFeedValidation(
filename=f"validation_notices{JSONL_GZIP_EXTENSION}",
extract=extract,
ts=extract.ts,
base64_url=extract.base64_url,
extract_path=extract.path,
system_errors=system_errors,
)
notices = report["notices"]

notices = [
{
"metadata": {
"extract_path": extract.path,
"gtfs_validator_version": GTFS_VALIDATOR_VERSION,
},
**notice,
}
for notice in report["notices"]
]

typer.secho(
f"saving {len(notices)} validation notices to {validation.path}",
fg=typer.colors.GREEN,
Expand All @@ -231,7 +235,7 @@ def validate_day(
outcomes.append(
GTFSScheduleFeedExtractValidationOutcome(
success=True,
extract=extract,
extract_path=extract.path,
validation=validation,
)
)
Expand All @@ -240,10 +244,15 @@ def validate_day(
f"encountered exception on extract {extract.path}: {e}\n{traceback.format_exc()}",
fg=typer.colors.RED,
)
if verbose and isinstance(e, subprocess.CalledProcessError):
typer.secho(
e.stderr,
fg=typer.colors.RED,
)
outcomes.append(
GTFSScheduleFeedExtractValidationOutcome(
success=False,
extract=extract,
extract_path=extract.path,
exception=e,
)
)
Expand Down
Loading

0 comments on commit 592921d

Please sign in to comment.