Skip to content

Commit

Permalink
fix(DRAFT): Don't generate csv on refresh
Browse files Browse the repository at this point in the history
  • Loading branch information
dangotbanned committed Nov 20, 2024
1 parent 0817ff8 commit e01fdd7
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 8 deletions.
Binary file modified altair/datasets/_metadata/url.csv.gz
Binary file not shown.
21 changes: 13 additions & 8 deletions tools/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,9 @@ def github(self) -> GitHub:
def npm(self) -> Npm:
return self._npm

def refresh(self, *, include_typing: bool = False) -> pl.DataFrame:
def refresh(
self, *, include_typing: bool = False, include_csv: bool = False
) -> pl.DataFrame:
"""
Update and sync all dataset metadata files.
Expand All @@ -139,13 +141,16 @@ def refresh(self, *, include_typing: bool = False) -> pl.DataFrame:
gh_trees = self.github.refresh_trees(gh_tags)
self.write_parquet(gh_trees, self._paths["gh_trees"])

npm_urls_min = (
gh_trees.lazy()
.filter(col("tag") == col("tag").first(), col("suffix") != ".parquet")
.filter(col("size") == col("size").min().over("dataset_name"))
.select("dataset_name", "url_npm")
)
self.write_csv_gzip(npm_urls_min, self._fp_url)
if include_csv:
# BUG: Non-deterministic
# https://github.com/vega/altair/actions/runs/11942284568/job/33288974210?pr=3631
npm_urls_min = (
gh_trees.lazy()
.filter(col("tag") == col("tag").first(), col("suffix") != ".parquet")
.filter(col("size") == col("size").min().over("dataset_name"))
.select("dataset_name", "url_npm")
)
self.write_csv_gzip(npm_urls_min, self._fp_url)

if include_typing:
self.generate_typing(self._fp_typing)
Expand Down

0 comments on commit e01fdd7

Please sign in to comment.