-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #141 from Synthetixio/feat/indexer-cleaning-script
Indexed Data Cleaning Script
- Loading branch information
Showing
8 changed files
with
1,464 additions
and
39 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
.git | ||
__pycache__ | ||
*.pyc | ||
*.pyo | ||
*.pyd | ||
.DS_Store | ||
.env | ||
.venv | ||
venv | ||
env | ||
node_modules |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,20 +1,22 @@ | ||
FROM node:16-alpine | ||
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ | ||
|
||
WORKDIR /app | ||
|
||
COPY package*.json ./ | ||
COPY requirements.txt . | ||
COPY patches/ ./patches/ | ||
|
||
RUN apk add --no-cache python3 py3-pip | ||
RUN apk add --no-cache build-base && npm ci && apk del build-base | ||
RUN apk add --no-cache python3 python3-dev py3-pip clang cmake build-base git | ||
|
||
COPY . . | ||
RUN npm ci | ||
|
||
RUN chmod +x entrypoint.sh | ||
COPY pyproject.toml uv.lock ./ | ||
RUN uv sync --frozen | ||
|
||
RUN pip install --upgrade pip | ||
RUN pip install --no-cache-dir -r requirements.txt | ||
RUN apk del build-base | ||
|
||
COPY . . | ||
|
||
RUN chmod +x entrypoint.sh | ||
|
||
ENTRYPOINT ["./entrypoint.sh"] | ||
CMD ["./entrypoint.sh"] |
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
[project] | ||
name = "indexers" | ||
version = "0.1.0" | ||
description = "A blockchain indexer" | ||
readme = "README.md" | ||
requires-python = ">=3.11" | ||
dependencies = [ | ||
"fastparquet>=2024.11.0", | ||
"pandas>=2.2.3", | ||
"python-dotenv>=1.0.1", | ||
"pyyaml>=6.0.2", | ||
"synthetix>=0.1.21", | ||
] |
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
import argparse | ||
from pathlib import Path | ||
import pandas as pd | ||
import os | ||
|
||
|
||
def clean_parquet_files(
    network_name: str,
    protocol_name: str,
    *,
    raw_root: str = "/parquet-data/indexers/raw",
    clean_root: str = "/parquet-data/indexers/clean",
) -> None:
    """Copy non-empty raw parquet files into a per-event "clean" layout.

    The source tree is ``<raw_root>/<network>/<protocol>/<block_range>/<event>.parquet``.
    Each non-empty file is rewritten (without the DataFrame index) to
    ``<clean_root>/<network>/<protocol>/<event>/<event>_<block_range>.parquet``.
    Files whose output already exists are skipped, so the function can be
    re-run incrementally.

    Args:
        network_name: Directory name of the network under ``raw_root``.
        protocol_name: Directory name of the protocol under the network.
        raw_root: Base directory of the raw indexer output.
        clean_root: Base directory that receives the cleaned files.

    Raises:
        ValueError: If either name is empty, or the source directory does
            not exist.
    """
    # An empty name would collapse the path onto its parent directory and
    # make sibling networks/protocols look like block-range directories.
    if not network_name or not protocol_name:
        raise ValueError("Network and protocol must be provided")

    source_base = f"{raw_root}/{network_name}/{protocol_name}"
    target_base = f"{clean_root}/{network_name}/{protocol_name}"

    protocol_path = Path(source_base)
    if not protocol_path.exists():
        raise ValueError(f"Source path {source_base} does not exist")
    target_path = Path(target_base)
    target_path.mkdir(parents=True, exist_ok=True)

    # Sorted so that runs process (and log) directories in a stable order.
    for block_range_dir in sorted(protocol_path.iterdir()):
        if not block_range_dir.is_dir():
            continue
        block_range = block_range_dir.name

        for parquet_file in sorted(block_range_dir.glob("*.parquet")):
            event_name = parquet_file.stem
            event_dir = target_path / event_name
            output_file = event_dir / f"{event_name}_{block_range}.parquet"

            # Skip if file already exists
            if output_file.exists():
                continue

            df = pd.read_parquet(parquet_file)
            if df.empty:
                continue

            event_dir.mkdir(parents=True, exist_ok=True)
            # Write to a sibling temp file and rename it into place: because
            # existing outputs are skipped, a half-written file left behind
            # by an interrupted run would otherwise never be regenerated.
            tmp_file = output_file.with_name(f"{output_file.name}.tmp")
            try:
                df.to_parquet(tmp_file, index=False)
                tmp_file.replace(output_file)
            finally:
                # No-op after a successful rename; removes debris on failure.
                tmp_file.unlink(missing_ok=True)
        # NOTE(review): reported once per block-range directory — confirm
        # this matches the intended nesting of the progress message.
        print(f"Processed {protocol_name} {block_range}")
|
||
|
||
if __name__ == "__main__":
    # Command-line entry point: resolve the network and protocol to clean,
    # then run the cleaning pass over that protocol's raw parquet files.
    parser = argparse.ArgumentParser()
    parser.add_argument("--network_name", type=str)
    parser.add_argument("--protocol_name", type=str)
    args = parser.parse_args()

    # Environment variables take precedence; the CLI flags are the fallback.
    network_name = os.getenv("NETWORK_NAME") or args.network_name
    protocol_name = os.getenv("PROTOCOL_NAME") or args.protocol_name

    # Validate before logging so a misconfigured run does not announce
    # "Cleaning None None". Truthiness (not `is None`) also rejects empty
    # strings, which would otherwise collapse the source path onto its parent.
    if not network_name or not protocol_name:
        raise ValueError("Network and protocol must be provided")

    print(f"Cleaning {network_name} {protocol_name}")

    clean_parquet_files(network_name, protocol_name)
Large diffs are not rendered by default.
Oops, something went wrong.