From 0817ff8503f728a4bc0c8d160abaab311f829fd7 Mon Sep 17 00:00:00 2001
From: dangotbanned <125183946+dangotbanned@users.noreply.github.com>
Date: Wed, 20 Nov 2024 21:46:22 +0000
Subject: [PATCH] feat: Support `url(...)` without dependencies

https://github.com/vega/altair/pull/3631#discussion_r1846662053,
https://github.com/vega/altair/pull/3631#issuecomment-2488621316,
https://github.com/vega/altair/pull/3631#issuecomment-2481977891
---
 altair/datasets/__init__.py          | 13 ++++-
 altair/datasets/_loader.py           | 77 +++++++++++++++++++++++++--
 altair/datasets/_metadata/url.csv.gz | Bin 0 -> 855 bytes
 altair/datasets/_readers.py          |  5 +-
 tests/test_datasets.py               | 70 +++++++++++++++++++-----
 tools/datasets/__init__.py           | 23 ++++++++
 6 files changed, 168 insertions(+), 20 deletions(-)
 create mode 100644 altair/datasets/_metadata/url.csv.gz

diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py
index ac7ac9f06..e426ca467 100644
--- a/altair/datasets/__init__.py
+++ b/altair/datasets/__init__.py
@@ -78,9 +78,18 @@ def url(
     - https://github.com/vega/altair/discussions/3150#discussioncomment-11280516
     - https://github.com/vega/altair/pull/3631#discussion_r1846662053
     """
-    from altair.datasets._loader import load
+    from altair.datasets._readers import AltairDatasetsError
 
-    return load.url(name, suffix, tag=tag)
+    try:
+        from altair.datasets._loader import load
+
+        url = load.url(name, suffix, tag=tag)
+    except AltairDatasetsError:
+        from altair.datasets._loader import url_cache
+
+        url = url_cache[name]
+
+    return url
 
 
 def __getattr__(name):
diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py
index 3c2a0ee21..5d8c1ec8b 100644
--- a/altair/datasets/_loader.py
+++ b/altair/datasets/_loader.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Generic, final, overload
+from pathlib import Path
+from typing import TYPE_CHECKING, Generic, TypeVar, final, get_args, overload
 
 from narwhals.typing import IntoDataFrameT, IntoFrameT
 
@@ -8,8 +9,8 @@
 if TYPE_CHECKING:
     import sys
-    from pathlib import Path
-    from typing import Any, Literal
+    from collections.abc import MutableMapping
+    from typing import Any, Final, Literal
 
     import pandas as pd
     import polars as pl
 
@@ -23,8 +24,15 @@
     from altair.datasets._readers import _Backend
     from altair.datasets._typing import Dataset, Extension, Version
 
+
 __all__ = ["Loader", "load"]
 
+_KT = TypeVar("_KT")
+_VT = TypeVar("_VT")
+_T = TypeVar("_T")
+
+_URL: Final[Path] = Path(__file__).parent / "_metadata" / "url.csv.gz"
+
 
 class Loader(Generic[IntoDataFrameT, IntoFrameT]):
     """
@@ -377,6 +385,69 @@ def __call__(
         return self.from_backend(backend)(name, suffix, tag=tag, **kwds)
 
 
+class UrlCache(Generic[_KT, _VT]):
+    """
+    `csv`_, `gzip`_ -based, lazy url lookup.
+
+    Operates on a subset of available datasets:
+    - Only the latest version
+    - Excludes `.parquet`, which `cannot be read via url`_
+    - Name collisions are pre-resolved
+    - Only the smallest file per dataset is provided (e.g. ``weather.json`` instead of ``weather.csv``)
+
+    .. _csv:
+        https://docs.python.org/3/library/csv.html
+    .. _gzip:
+        https://docs.python.org/3/library/gzip.html
+    .. _cannot be read via url:
+        https://github.com/vega/vega/issues/3961
+    """
+
+    def __init__(
+        self,
+        fp: Path,
+        /,
+        *,
+        columns: tuple[str, str] = ("dataset_name", "url_npm"),
+        tp: type[MutableMapping[_KT, _VT]] = dict["_KT", "_VT"],
+    ) -> None:
+        self.fp: Path = fp
+        self.columns: tuple[str, str] = columns
+        self._mapping: MutableMapping[_KT, _VT] = tp()
+
+    def read(self) -> Any:
+        import csv
+        import gzip
+
+        with gzip.open(self.fp, mode="rb") as f:
+            b_lines = f.readlines()
+        reader = csv.reader((bs.decode() for bs in b_lines), dialect=csv.unix_dialect)
+        header = tuple(next(reader))
+        if header != self.columns:
+            msg = f"Expected header to match {self.columns!r},\n" f"but got: {header!r}"
+            raise ValueError(msg)
+        return dict(reader)
+
+    def __getitem__(self, key: _KT, /) -> _VT:
+        if url := self.get(key, None):
+            return url
+
+        from altair.datasets._typing import Dataset
+
+        if key in get_args(Dataset):
+            msg = f"{key!r} cannot be loaded via url."
+            raise TypeError(msg)
+        else:
+            msg = f"{key!r} does not refer to a known dataset."
+            raise TypeError(msg)
+
+    def get(self, key: _KT, default: _T) -> _VT | _T:
+        if not self._mapping:
+            self._mapping.update(self.read())
+        return self._mapping.get(key, default)
+
+
+url_cache: UrlCache[Dataset | LiteralString, str] = UrlCache(_URL)
 
 load: _Load[Any, Any]
 
diff --git a/altair/datasets/_metadata/url.csv.gz b/altair/datasets/_metadata/url.csv.gz
new file mode 100644
index 0000000000000000000000000000000000000000..3580606d7cca77cefee4c5bd2b48134f9fac22d9
GIT binary patch
literal 855
zcmV-d1E~BTiwFn-B0gsV|8;U~E@N|c0Ik?dQ`;~Q0N^{n1vw!zOlNxPwAUVc&&ZNu
zi^y7452x_!tCyXSP&%W{2?*?`_3GgjoO3H3_r`tQY#7(w
zi{nDc*>+m^P5g_^ECxz=iFM!RUHA0VZ8zzIO$zRe9v-N)2CR3@(gJkM%@0)TKov1o
zFhp|ilo$y*!j8ez3xrvK!u8ZD@!E`)@JdO`owxER+G}`W
zEtvIEBdio&C`N62QYpAHupLh2qjt
z=LMpUtB{|SJTRBTTv}+~4Bqyl#OjpT<8rlR6jb?__SlKys|TI+ysb(2Ji^3oO1m3l7I%_CtIcgP|{!TI~FZ
z5nzH6z(o`Ca%eSoP~I-#4E#o3^u+CDCVsCkDHGIC#d&IkW>6RrX~Y|
zRj;Hh`SzhdXFnSGUPBezJa4zDciy(MD{&TaSaCeCBciT3JWC;7FH^hF-VLupS%yK!
z7D>VD6yKbLG7HYdW|IepyP<#1-VS}2fjXZmq-8pJFh~EHsEMX~-qgJq{nRH8>u8dgNo+|G_y6nVA$Qi?D~SmD^hzYb99B`0-a+w4v7E$To`#?
hPGA)$PnhL%CQ6!b(lFqNy}1B6!M}FHK(GfF008u|rd9v|

literal 0
HcmV?d00001

diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py
index 953401bae..e93fb55e1 100644
--- a/altair/datasets/_readers.py
+++ b/altair/datasets/_readers.py
@@ -83,6 +83,9 @@
 _METADATA: Final[Path] = Path(__file__).parent / "_metadata" / "metadata.parquet"
 
 
+class AltairDatasetsError(Exception): ...
+
+
 class _Reader(Protocol[IntoDataFrameT, IntoFrameT]):
     """
     Describes basic IO for remote & local tabular resources.
@@ -502,7 +505,7 @@ def infer_backend(
     if reader := next(it, None):
         return reader
     msg = f"Found no supported backend, searched:\n" f"{priority!r}"
-    raise NotImplementedError(msg)
+    raise AltairDatasetsError(msg)
 
 
 @overload
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index 6de691ff2..e5d1f1d3f 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import contextlib
 import datetime as dt
 import re
 import sys
@@ -18,8 +19,8 @@
 )
 from narwhals.stable import v1 as nw
 
-from altair.datasets import Loader
-from altair.datasets._readers import _METADATA
+from altair.datasets import Loader, url
+from altair.datasets._readers import _METADATA, AltairDatasetsError
 from altair.datasets._typing import Dataset, Extension, Metadata, Version
 from tests import skip_requires_pyarrow, slow
 
@@ -115,6 +116,13 @@ def metadata_columns() -> frozenset[str]:
     )
 
 
+def match_url(name: Dataset, url: str) -> bool:
+    return (
+        re.match(rf".+jsdelivr\.net/npm/vega-datasets@.+/data/{name}\..+", url)
+        is not None
+    )
+
+
 @backends
 def test_loader_from_backend(backend: _Backend) -> None:
     data = Loader.from_backend(backend)
@@ -124,13 +132,8 @@ def test_loader_from_backend(backend: _Backend) -> None:
 @backends
 def test_loader_url(backend: _Backend) -> None:
     data = Loader.from_backend(backend)
-    dataset_name = "volcano"
-    pattern = re.compile(
-        rf".+jsdelivr\.net/npm/vega-datasets@.+/data/{dataset_name}\..+"
-    )
-    url = data.url(dataset_name)
-    assert isinstance(url, str)
-    assert pattern.match(url) is not None
+    dataset_name: Dataset = "volcano"
+    assert match_url(dataset_name, data.url(dataset_name))
 
 
 def test_load(monkeypatch: pytest.MonkeyPatch) -> None:
@@ -178,7 +181,7 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None:
     monkeypatch.delattr(altair.datasets._loader, "load")
     monkeypatch.setitem(sys.modules, "pyarrow", None)
 
-    with pytest.raises(NotImplementedError, match="no.+backend"):
+    with pytest.raises(AltairDatasetsError, match="no.+backend"):
         from altair.datasets import load
 
 
@@ -239,10 +242,49 @@ def test_load_call(monkeypatch: pytest.MonkeyPatch) -> None:
 def test_url(name: Dataset) -> None:
     from altair.datasets import url
 
-    pattern = re.compile(rf".+jsdelivr\.net/npm/vega-datasets@.+/data/{name}\..+")
-    result = url(name)
-    assert isinstance(result, str)
-    assert pattern.match(result) is not None
+    assert match_url(name, url(name))
+
+
+def test_url_no_backend(monkeypatch: pytest.MonkeyPatch) -> None:
+    import altair.datasets
+    from altair.datasets._loader import url_cache
+
+    monkeypatch.setitem(sys.modules, "polars", None)
+    monkeypatch.setitem(sys.modules, "pandas", None)
+    monkeypatch.setitem(sys.modules, "pyarrow", None)
+
+    assert url_cache._mapping == {}
+
+    with contextlib.suppress(AltairDatasetsError):
+        monkeypatch.delattr(altair.datasets._loader, "load", raising=False)
+    with pytest.raises(AltairDatasetsError):
+        from altair.datasets import load as load
+
+    assert match_url("jobs", url("jobs"))
+
+    assert url_cache._mapping != {}
+
+    assert match_url("cars", url("cars"))
+    assert match_url("stocks", url("stocks"))
+    assert match_url("countries", url("countries"))
+    assert match_url("crimea", url("crimea"))
+    assert match_url("disasters", url("disasters"))
+    assert match_url("driving", url("driving"))
+    assert match_url("earthquakes", url("earthquakes"))
+    assert match_url("flare", url("flare"))
+    assert match_url("flights-10k", url("flights-10k"))
+    assert match_url("flights-200k", url("flights-200k"))
+
+    with pytest.raises(TypeError, match="cannot be loaded via url"):
+        url("climate")
+
+    with pytest.raises(TypeError, match="cannot be loaded via url"):
+        url("flights-3m")
+
+    with pytest.raises(
+        TypeError, match="'fake data' does not refer to a known dataset"
+    ):
+        url("fake data")
 
 
 @backends
diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py
index 3702028ac..ae4d0b583 100644
--- a/tools/datasets/__init__.py
+++ b/tools/datasets/__init__.py
@@ -15,12 +15,15 @@
 from __future__ import annotations
 
+import gzip
 import json
 import types
+from io import BytesIO
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Literal
 
 import polars as pl
+from polars import col
 
 from tools.codemod import ruff
 from tools.datasets.github import GitHub
 
@@ -107,6 +110,7 @@ def __init__(
             }
         )
         self._fp_typing: Path = out_fp_typing
+        self._fp_url: Path = out_dir_altair / "url.csv.gz"
 
     @property
     def github(self) -> GitHub:
@@ -135,6 +139,14 @@ def refresh(self, *, include_typing: bool = False) -> pl.DataFrame:
         gh_trees = self.github.refresh_trees(gh_tags)
         self.write_parquet(gh_trees, self._paths["gh_trees"])
 
+        npm_urls_min = (
+            gh_trees.lazy()
+            .filter(col("tag") == col("tag").first(), col("suffix") != ".parquet")
+            .filter(col("size") == col("size").min().over("dataset_name"))
+            .select("dataset_name", "url_npm")
+        )
+        self.write_csv_gzip(npm_urls_min, self._fp_url)
+
         if include_typing:
             self.generate_typing(self._fp_typing)
         return gh_trees
@@ -159,6 +171,17 @@ def _from_alias(self, name: _PathAlias, /) -> Path:
         else:
             return self._paths[name]
 
+    def write_csv_gzip(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None:
+        if fp.suffix != ".gz":
+            fp = fp.with_suffix(".csv.gz")
+        if not fp.exists():
+            fp.touch()
+        df = frame.lazy().collect()
+        buf = BytesIO()
+        with gzip.open(fp, mode="wb") as f:
+            df.write_csv(buf)
+            f.write(buf.getbuffer())
+
     def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None:
         """Write ``frame`` to ``fp``, with some extra safety."""
         if not fp.exists():
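
Usage, once the patch is applied: a minimal sketch of the fallback path, assuming an environment with none of polars, pandas, or pyarrow installed (the dataset names mirror the tests above; the version segment of the printed url is elided):

    from altair.datasets import url

    # With a backend installed, url() simply delegates to load.url(...).
    # Without one, resolving `load` raises AltairDatasetsError and url()
    # falls back to the bundled, gzip-compressed csv mapping (url.csv.gz).
    print(url("cars"))
    # e.g. https://cdn.jsdelivr.net/npm/vega-datasets@<version>/data/cars.json

    # Datasets shipped only as .parquet are excluded from the fallback:
    url("flights-3m")  # TypeError: 'flights-3m' cannot be loaded via url.

    # Unknown names fail loudly in either path:
    url("fake data")  # TypeError: 'fake data' does not refer to a known dataset.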