Skip to content

Commit

Permalink
Merge pull request #141 from Synthetixio/feat/indexer-cleaning-script
Browse files Browse the repository at this point in the history
Indexed Data Cleaning Script
  • Loading branch information
Tburm authored Nov 21, 2024
2 parents db31b23 + cfa29dc commit 1376a99
Show file tree
Hide file tree
Showing 8 changed files with 1,464 additions and 39 deletions.
11 changes: 11 additions & 0 deletions indexers/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Paths excluded from the Docker build context.

# Version-control metadata
.git
# Python bytecode and interpreter caches
__pycache__
*.pyc
*.pyo
*.pyd
# macOS Finder metadata
.DS_Store
# Local environment file and virtual-environment directories
# (presumably developer-local; verify nothing in the image relies on them)
.env
.venv
venv
env
# Node dependencies; the image installs its own with `npm ci`
node_modules
18 changes: 10 additions & 8 deletions indexers/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,20 +1,22 @@
FROM node:16-alpine
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/

WORKDIR /app

COPY package*.json ./
COPY requirements.txt .
COPY patches/ ./patches/

RUN apk add --no-cache python3 py3-pip
RUN apk add --no-cache build-base && npm ci && apk del build-base
RUN apk add --no-cache python3 python3-dev py3-pip clang cmake build-base git

COPY . .
RUN npm ci

RUN chmod +x entrypoint.sh
COPY pyproject.toml uv.lock ./
RUN uv sync --frozen

RUN pip install --upgrade pip
RUN pip install --no-cache-dir -r requirements.txt
RUN apk del build-base

COPY . .

RUN chmod +x entrypoint.sh

ENTRYPOINT ["./entrypoint.sh"]
CMD ["./entrypoint.sh"]
24 changes: 0 additions & 24 deletions indexers/config.ts

This file was deleted.

5 changes: 1 addition & 4 deletions indexers/entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,11 @@
set -e

# Get contract data from SDK and generate squidgen.yaml and squid.yaml
python3 main.py --network_name $NETWORK_NAME --protocol_name $PROTOCOL_NAME "$@"
uv run main.py --network_name $NETWORK_NAME --protocol_name $PROTOCOL_NAME "$@"

# Generate squid processor
npm run generate:processor

# Move config.ts to src
mv config.ts src/config.ts

# Build squid processor
npm run build

Expand Down
13 changes: 13 additions & 0 deletions indexers/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Project metadata and Python dependencies for the indexers package.
[project]
name = "indexers"
version = "0.1.0"
description = "A blockchain indexer"
readme = "README.md"
# Minimum supported Python interpreter version.
requires-python = ">=3.11"
# Runtime dependencies; each entry specifies only a lower version bound.
dependencies = [
"fastparquet>=2024.11.0",
"pandas>=2.2.3",
"python-dotenv>=1.0.1",
"pyyaml>=6.0.2",
"synthetix>=0.1.21",
]
3 changes: 0 additions & 3 deletions indexers/requirements.txt

This file was deleted.

52 changes: 52 additions & 0 deletions indexers/scripts/clean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import argparse
from pathlib import Path
import pandas as pd
import os


def clean_parquet_files(
    network_name: str,
    protocol_name: str,
    *,
    base_dir: str = "/parquet-data/indexers",
) -> None:
    """Copy non-empty raw parquet files into a per-event "clean" layout.

    Reads every ``<block_range>/<event>.parquet`` file under
    ``{base_dir}/raw/{network_name}/{protocol_name}`` and writes each
    non-empty one to
    ``{base_dir}/clean/{network_name}/{protocol_name}/<event>/<event>_<block_range>.parquet``.
    Outputs that already exist are left untouched, so the function can be
    re-run to pick up only new block ranges.

    Args:
        network_name: Network directory name under ``raw`` / ``clean``.
        protocol_name: Protocol directory name under the network directory.
        base_dir: Root directory containing the ``raw`` and ``clean`` trees.

    Raises:
        ValueError: If the raw source directory does not exist.
    """
    source_base = f"{base_dir}/raw/{network_name}/{protocol_name}"
    target_base = f"{base_dir}/clean/{network_name}/{protocol_name}"

    protocol_path = Path(source_base)
    if not protocol_path.exists():
        raise ValueError(f"Source path {source_base} does not exist")
    target_path = Path(target_base)
    target_path.mkdir(parents=True, exist_ok=True)

    # Sorted so that block ranges are processed (and logged) in a stable
    # order; iterdir()/glob() make no ordering guarantee.
    for block_range_dir in sorted(protocol_path.iterdir()):
        if not block_range_dir.is_dir():
            continue
        block_range = block_range_dir.name

        for parquet_file in sorted(block_range_dir.glob("*.parquet")):
            event_name = parquet_file.stem
            event_dir = target_path / event_name
            output_file = event_dir / f"{event_name}_{block_range}.parquet"

            # Skip if file already exists
            if output_file.exists():
                continue

            df = pd.read_parquet(parquet_file)
            # Empty event files carry no rows worth keeping.
            if df.empty:
                continue
            event_dir.mkdir(parents=True, exist_ok=True)

            # Write to a temporary name and rename into place: an interrupted
            # write must not leave a truncated file at the final path, because
            # the exists() check above would then skip it on every later run.
            tmp_file = output_file.with_name(f"{output_file.name}.tmp")
            df.to_parquet(tmp_file, index=False)
            tmp_file.replace(output_file)
        print(f"Processed {protocol_name} {block_range}")


if __name__ == "__main__":
    # CLI entry point: resolve the network/protocol to clean, then run the
    # cleaning pass over that protocol's raw parquet files.
    parser = argparse.ArgumentParser()
    parser.add_argument("--network_name", type=str)
    parser.add_argument("--protocol_name", type=str)
    args = parser.parse_args()

    # A non-empty environment variable takes precedence over the CLI flag.
    network_name = os.getenv("NETWORK_NAME") or args.network_name
    protocol_name = os.getenv("PROTOCOL_NAME") or args.protocol_name

    # Validate before logging so a missing value never prints
    # "Cleaning None None"; reject empty strings too, since they would
    # otherwise produce a malformed source path.
    if not network_name or not protocol_name:
        raise ValueError("Network and protocol must be provided")

    print(f"Cleaning {network_name} {protocol_name}")

    clean_parquet_files(network_name, protocol_name)
1,377 changes: 1,377 additions & 0 deletions indexers/uv.lock

Large diffs are not rendered by default.

0 comments on commit 1376a99

Please sign in to comment.