feat: Support url(...) without dependencies
dangotbanned committed Nov 20, 2024
1 parent c835c13 commit 0817ff8
Showing 6 changed files with 168 additions and 20 deletions.
13 changes: 11 additions & 2 deletions altair/datasets/__init__.py
@@ -78,9 +78,18 @@ def url(
    - https://github.com/vega/altair/discussions/3150#discussioncomment-11280516
    - https://github.com/vega/altair/pull/3631#discussion_r1846662053
    """
-    from altair.datasets._loader import load
+    from altair.datasets._readers import AltairDatasetsError

-    return load.url(name, suffix, tag=tag)
+    try:
+        from altair.datasets._loader import load
+
+        url = load.url(name, suffix, tag=tag)
+    except AltairDatasetsError:
+        from altair.datasets._loader import url_cache
+
+        url = url_cache[name]
+
+    return url


def __getattr__(name):
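Taken together, the hunk above means `url()` now degrades gracefully instead of raising when no DataFrame backend is available. A rough sketch of the behaviour from the caller's side (the output shown in the comment is illustrative, not exact):

```python
from altair.datasets import url

# With polars, pandas, or pyarrow installed this resolves through the Loader;
# with none of them installed it falls back to the packaged url.csv.gz lookup.
print(url("cars"))  # e.g. "https://cdn.jsdelivr.net/npm/vega-datasets@.../data/cars.json"
```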
77 changes: 74 additions & 3 deletions altair/datasets/_loader.py
@@ -1,15 +1,16 @@
from __future__ import annotations

-from typing import TYPE_CHECKING, Generic, final, overload
+from pathlib import Path
+from typing import TYPE_CHECKING, Generic, TypeVar, final, get_args, overload

from narwhals.typing import IntoDataFrameT, IntoFrameT

from altair.datasets._readers import _Reader, backend

if TYPE_CHECKING:
    import sys
-    from pathlib import Path
-    from typing import Any, Literal
+    from collections.abc import MutableMapping
+    from typing import Any, Final, Literal

    import pandas as pd
    import polars as pl
@@ -23,8 +24,15 @@
    from altair.datasets._readers import _Backend
    from altair.datasets._typing import Dataset, Extension, Version


__all__ = ["Loader", "load"]

_KT = TypeVar("_KT")
_VT = TypeVar("_VT")
_T = TypeVar("_T")

_URL: Final[Path] = Path(__file__).parent / "_metadata" / "url.csv.gz"


class Loader(Generic[IntoDataFrameT, IntoFrameT]):
    """
@@ -377,6 +385,69 @@ def __call__(
        return self.from_backend(backend)(name, suffix, tag=tag, **kwds)


class UrlCache(Generic[_KT, _VT]):
    """
    `csv`_, `gzip`_ -based, lazy url lookup.

    Operates on a subset of available datasets:
    - Only the latest version
    - Excludes `.parquet`, which `cannot be read via url`_
    - Name collisions are pre-resolved
    - Only provides the smallest file (e.g. ``weather.json`` instead of ``weather.csv``)

    .. _csv:
        https://docs.python.org/3/library/csv.html
    .. _gzip:
        https://docs.python.org/3/library/gzip.html
    .. _cannot be read via url:
        https://github.com/vega/vega/issues/3961
    """

    def __init__(
        self,
        fp: Path,
        /,
        *,
        columns: tuple[str, str] = ("dataset_name", "url_npm"),
        tp: type[MutableMapping[_KT, _VT]] = dict["_KT", "_VT"],
    ) -> None:
        self.fp: Path = fp
        self.columns: tuple[str, str] = columns
        self._mapping: MutableMapping[_KT, _VT] = tp()

    def read(self) -> Any:
        import csv
        import gzip

        with gzip.open(self.fp, mode="rb") as f:
            b_lines = f.readlines()
        reader = csv.reader((bs.decode() for bs in b_lines), dialect=csv.unix_dialect)
        header = tuple(next(reader))
        if header != self.columns:
            msg = f"Expected header to match {self.columns!r},\n" f"but got: {header!r}"
            raise ValueError(msg)
        return dict(reader)

    def __getitem__(self, key: _KT, /) -> _VT:
        if url := self.get(key, None):
            return url

        from altair.datasets._typing import Dataset

        if key in get_args(Dataset):
            msg = f"{key!r} cannot be loaded via url."
            raise TypeError(msg)
        else:
            msg = f"{key!r} does not refer to a known dataset."
            raise TypeError(msg)

    def get(self, key: _KT, default: _T) -> _VT | _T:
        if not self._mapping:
            self._mapping.update(self.read())
        return self._mapping.get(key, default)


url_cache: UrlCache[Dataset | LiteralString, str] = UrlCache(_URL)
load: _Load[Any, Any]


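For intuition about the lazy part: `UrlCache` performs no IO at construction; the first lookup triggers a single `read()` of the gzipped csv, and every later lookup is a plain dict access. A small sketch (peeking at the private `_mapping` only for illustration, as the tests below also do):

```python
from altair.datasets._loader import url_cache

assert url_cache._mapping == {}  # nothing has been read from url.csv.gz yet
print(url_cache["cars"])         # first access parses the whole file once
assert url_cache._mapping != {}  # all entries are now cached in memory
```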
Binary file added altair/datasets/_metadata/url.csv.gz
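Since the new binary can't be shown inline: consuming it directly would look roughly like this sketch, mirroring `UrlCache.read` above (the path is the one bound to `_URL`; only the "dataset_name"/"url_npm" header is guaranteed by that code):

```python
import csv
import gzip
from pathlib import Path

fp = Path("altair/datasets/_metadata/url.csv.gz")
with gzip.open(fp, mode="rt", newline="") as f:
    reader = csv.reader(f, dialect=csv.unix_dialect)
    header = next(reader)   # expected: ["dataset_name", "url_npm"]
    mapping = dict(reader)  # dataset name -> npm CDN url
print(len(mapping), "datasets addressable by url")
```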
5 changes: 4 additions & 1 deletion altair/datasets/_readers.py
@@ -83,6 +83,9 @@
_METADATA: Final[Path] = Path(__file__).parent / "_metadata" / "metadata.parquet"


class AltairDatasetsError(Exception): ...


class _Reader(Protocol[IntoDataFrameT, IntoFrameT]):
    """
    Describes basic IO for remote & local tabular resources.
@@ -502,7 +505,7 @@ def infer_backend(
    if reader := next(it, None):
        return reader
    msg = f"Found no supported backend, searched:\n" f"{priority!r}"
-    raise NotImplementedError(msg)
+    raise AltairDatasetsError(msg)


@overload
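One consequence of the new exception type: callers can distinguish "no supported backend installed" from unrelated failures. A sketch, assuming `infer_backend`'s `priority` parameter has a default (its signature is truncated in this diff):

```python
from altair.datasets._readers import AltairDatasetsError, infer_backend

try:
    reader = infer_backend()
except AltairDatasetsError:
    # No DataFrame backend available; fall back to url-only access,
    # which is exactly what altair.datasets.url() now does.
    reader = None
```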
70 changes: 56 additions & 14 deletions tests/test_datasets.py
@@ -1,5 +1,6 @@
from __future__ import annotations

import contextlib
import datetime as dt
import re
import sys
@@ -18,8 +19,8 @@
)
from narwhals.stable import v1 as nw

-from altair.datasets import Loader
-from altair.datasets._readers import _METADATA
+from altair.datasets import Loader, url
+from altair.datasets._readers import _METADATA, AltairDatasetsError
from altair.datasets._typing import Dataset, Extension, Metadata, Version
from tests import skip_requires_pyarrow, slow

@@ -115,6 +116,13 @@ def metadata_columns() -> frozenset[str]:
    )


def match_url(name: Dataset, url: str) -> bool:
    return (
        re.match(rf".+jsdelivr\.net/npm/vega-datasets@.+/data/{name}\..+", url)
        is not None
    )


@backends
def test_loader_from_backend(backend: _Backend) -> None:
    data = Loader.from_backend(backend)
@@ -124,13 +132,8 @@ def test_loader_from_backend(backend: _Backend) -> None:
@backends
def test_loader_url(backend: _Backend) -> None:
    data = Loader.from_backend(backend)
-    dataset_name = "volcano"
-    pattern = re.compile(
-        rf".+jsdelivr\.net/npm/vega-datasets@.+/data/{dataset_name}\..+"
-    )
-    url = data.url(dataset_name)
-    assert isinstance(url, str)
-    assert pattern.match(url) is not None
+    dataset_name: Dataset = "volcano"
+    assert match_url(dataset_name, data.url(dataset_name))


def test_load(monkeypatch: pytest.MonkeyPatch) -> None:
@@ -178,7 +181,7 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None:
    monkeypatch.delattr(altair.datasets._loader, "load")
    monkeypatch.setitem(sys.modules, "pyarrow", None)

-    with pytest.raises(NotImplementedError, match="no.+backend"):
+    with pytest.raises(AltairDatasetsError, match="no.+backend"):
        from altair.datasets import load


@@ -239,10 +242,49 @@ def test_load_call(monkeypatch: pytest.MonkeyPatch) -> None:
def test_url(name: Dataset) -> None:
    from altair.datasets import url

-    pattern = re.compile(rf".+jsdelivr\.net/npm/vega-datasets@.+/data/{name}\..+")
-    result = url(name)
-    assert isinstance(result, str)
-    assert pattern.match(result) is not None
+    assert match_url(name, url(name))


def test_url_no_backend(monkeypatch: pytest.MonkeyPatch) -> None:
    import altair.datasets
    from altair.datasets._loader import url_cache

    monkeypatch.setitem(sys.modules, "polars", None)
    monkeypatch.setitem(sys.modules, "pandas", None)
    monkeypatch.setitem(sys.modules, "pyarrow", None)

    assert url_cache._mapping == {}

    with contextlib.suppress(AltairDatasetsError):
        monkeypatch.delattr(altair.datasets._loader, "load", raising=False)
    with pytest.raises(AltairDatasetsError):
        from altair.datasets import load as load

    assert match_url("jobs", url("jobs"))

    assert url_cache._mapping != {}

    assert match_url("cars", url("cars"))
    assert match_url("stocks", url("stocks"))
    assert match_url("countries", url("countries"))
    assert match_url("crimea", url("crimea"))
    assert match_url("disasters", url("disasters"))
    assert match_url("driving", url("driving"))
    assert match_url("earthquakes", url("earthquakes"))
    assert match_url("flare", url("flare"))
    assert match_url("flights-10k", url("flights-10k"))
    assert match_url("flights-200k", url("flights-200k"))

    with pytest.raises(TypeError, match="cannot be loaded via url"):
        url("climate")

    with pytest.raises(TypeError, match="cannot be loaded via url"):
        url("flights-3m")

    with pytest.raises(
        TypeError, match="'fake data' does not refer to a known dataset"
    ):
        url("fake data")


@backends
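A side note on the technique these tests lean on: `monkeypatch.setitem(sys.modules, name, None)` simulates an uninstalled dependency, because Python raises `ImportError` when it finds `None` in `sys.modules`. A standalone sketch of the same effect:

```python
import sys

sys.modules["polars"] = None  # what monkeypatch.setitem does, minus the automatic undo

try:
    import polars  # noqa: F401
except ImportError:
    print("polars now behaves as if it were not installed")
```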
23 changes: 23 additions & 0 deletions tools/datasets/__init__.py
@@ -15,12 +15,15 @@

from __future__ import annotations

import gzip
import json
import types
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Any, Literal

import polars as pl
from polars import col

from tools.codemod import ruff
from tools.datasets.github import GitHub
@@ -107,6 +110,7 @@ def __init__(
            }
        )
        self._fp_typing: Path = out_fp_typing
        self._fp_url: Path = out_dir_altair / "url.csv.gz"

    @property
    def github(self) -> GitHub:
@@ -135,6 +139,14 @@ def refresh(self, *, include_typing: bool = False) -> pl.DataFrame:
        gh_trees = self.github.refresh_trees(gh_tags)
        self.write_parquet(gh_trees, self._paths["gh_trees"])

        npm_urls_min = (
            gh_trees.lazy()
            .filter(col("tag") == col("tag").first(), col("suffix") != ".parquet")
            .filter(col("size") == col("size").min().over("dataset_name"))
            .select("dataset_name", "url_npm")
        )
        self.write_csv_gzip(npm_urls_min, self._fp_url)

        if include_typing:
            self.generate_typing(self._fp_typing)
        return gh_trees
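The interesting bit in `refresh` is the window filter: within the latest tag, keep one row per dataset, the smallest non-parquet file. A toy illustration of that `over`-based predicate (frame contents made up):

```python
import polars as pl
from polars import col

files = pl.DataFrame(
    {
        "dataset_name": ["weather", "weather", "cars"],
        "suffix": [".csv", ".json", ".json"],
        "size": [250, 100, 80],
    }
)
# Keep only the smallest file within each dataset_name group.
smallest = files.filter(col("size") == col("size").min().over("dataset_name"))
# -> weather/.json (100) and cars/.json (80)
```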
@@ -159,6 +171,17 @@ def _from_alias(self, name: _PathAlias, /) -> Path:
        else:
            return self._paths[name]

    def write_csv_gzip(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None:
        if fp.suffix != ".gz":
            fp = fp.with_suffix(".csv.gz")
        if not fp.exists():
            fp.touch()
        df = frame.lazy().collect()
        buf = BytesIO()
        with gzip.open(fp, mode="wb") as f:
            df.write_csv(buf)
            f.write(buf.getbuffer())

    def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None:
        """Write ``frame`` to ``fp``, with some extra safety."""
        if not fp.exists():
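And `write_csv_gzip` itself is a straightforward collect-serialize-compress: render the csv into an in-memory buffer, then gzip those bytes to disk. A minimal sketch of the same round trip (frame contents illustrative):

```python
import gzip
from io import BytesIO

import polars as pl

df = pl.DataFrame({"dataset_name": ["cars"], "url_npm": ["https://example.invalid/cars.json"]})

buf = BytesIO()
df.write_csv(buf)  # polars serializes the csv into the buffer
with gzip.open("url.csv.gz", mode="wb") as f:
    f.write(buf.getbuffer())  # compress the whole payload in one shot
```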
