ibis-project · jitingxu1 · Aug 1, 2024 · Aug 2, 2024 · Aug 2, 2024 · Aug 5, 2024
diff --git a/ibis/backends/__init__.py b/ibis/backends/__init__.py
@@ -3,10 +3,13 @@
 import abc
 import collections.abc
 import functools
+import glob
 import importlib.metadata
 import keyword
 import re
 import urllib.parse
+import urllib.request
+from io import BytesIO
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, ClassVar
 
@@ -1199,6 +1202,61 @@
             f"{cls.name} backend has not implemented `has_operation` API"
         )
 
+    def read_parquet(
+        self, path: str | Path, table_name: str | None = None, **kwargs: Any
+    ) -> ir.Table:
+        """Load parquet data into a new table in the current backend.
+
+        The data is first read into a pyarrow table by
+        `_get_pyarrow_table_from_path` and then handed to `create_table`.
+
+        Parameters
+        ----------
+        path
+            The data source. It is converted with `str()` and may be a URL,
+            an fsspec-style URL, or a local path or glob pattern matching
+            one or more parquet files.
+        table_name
+            An optional name to use for the created table. When omitted (or
+            empty), a name is generated with `util.gen_name("read_parquet")`.
+        **kwargs
+            Additional keyword arguments handed to
+            `_get_pyarrow_table_from_path`, which reads the data with the
+            pyarrow loading function.
+            See https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html
+            for more information.
+
+        Returns
+        -------
+        ir.Table
+            A table expression for the newly created table.
+
+        """
+        arrow_table = self._get_pyarrow_table_from_path(path, **kwargs)
+        if not table_name:
+            table_name = util.gen_name("read_parquet")
+        self.create_table(table_name, arrow_table)
+        return self.table(table_name)
+
+    def _get_pyarrow_table_from_path(self, path: str | Path, **kwargs) -> pa.Table:
+        """Read parquet data from a URL, an fsspec URL, or a local path/glob.
+
+        Parameters
+        ----------
+        path
+            The data source. It is converted with `str()` and then
+            dispatched in this order: a URL accepted by `util.is_url`, an
+            fsspec-style URL accepted by `util.is_fsspec_url`, otherwise a
+            local path or glob pattern.
+        **kwargs
+            Keyword arguments forwarded to `pyarrow.parquet.read_table` in
+            every branch. For sources accepted by `util.is_url`, an optional
+            `headers` mapping is removed from `kwargs` first and used as the
+            request headers.
+
+        Returns
+        -------
+        pa.Table
+            The table returned by `pyarrow.parquet.read_table`.
+
+        Raises
+        ------
+        ValueError
+            If `path` is treated as a local path or pattern and matches no
+            files.
+
+        """
+        pq = util.import_object("pyarrow.parquet")
+
+        path = str(path)
+        # handle url
+        if util.is_url(path):
+            # `headers` is for the request only; it must not reach pyarrow.
+            headers = kwargs.pop("headers", {})
+            req_info = urllib.request.Request(path, headers=headers)  # noqa: S310
+            with urllib.request.urlopen(req_info) as req:  # noqa: S310
+                # Buffer the whole response body in memory before handing
+                # it to pyarrow.
+                with BytesIO(req.read()) as reader:
+                    # Forward the remaining kwargs, as the other branches do.
+                    return pq.read_table(reader, **kwargs)
+
+        # handle fsspec compatible url
+        if util.is_fsspec_url(path):
+            return pq.read_table(path, **kwargs)
+
+        # Handle local file paths or patterns
+        paths = glob.glob(path)
+        if not paths:
+            raise ValueError(f"No files found matching pattern: {path!r}")
+        elif len(paths) == 1:
+            # A single match is passed as a plain string rather than a list.
+            paths = paths[0]
+
+        return pq.read_table(paths, **kwargs)
+
     def _cached(self, expr: ir.Table):
         """Cache the provided expression.
 

diff --git a/ibis/backends/tests/test_register.py b/ibis/backends/tests/test_register.py
@@ -18,8 +18,6 @@
 
     import pyarrow as pa
 
-pytestmark = pytest.mark.notimpl(["druid", "exasol", "oracle"])
-
 
 @contextlib.contextmanager
 def pushd(new_dir):
@@ -98,6 +96,7 @@ def gzip_csv(data_dir, tmp_path):
         "trino",
     ]
 )
+@pytest.mark.notimpl(["druid", "exasol", "oracle"])
 def test_register_csv(con, data_dir, fname, in_table_name, out_table_name):
     with pushd(data_dir / "csv"):
         with pytest.warns(FutureWarning, match="v9.1"):
@@ -109,7 +108,7 @@ def test_register_csv(con, data_dir, fname, in_table_name, out_table_name):
 
 
 # TODO: rewrite or delete test when register api is removed
-@pytest.mark.notimpl(["datafusion"])
+@pytest.mark.notimpl(["datafusion", "druid", "exasol", "oracle"])
 @pytest.mark.notyet(
     [
         "bigquery",
@@ -153,6 +152,7 @@ def test_register_csv_gz(con, data_dir, gzip_csv):
         "trino",
     ]
 )
+@pytest.mark.notimpl(["druid", "exasol", "oracle"])
 def test_register_with_dotted_name(con, data_dir, tmp_path):
     basename = "foo.bar.baz/diamonds.csv"
     f = tmp_path.joinpath(basename)
@@ -212,6 +212,7 @@ def read_table(path: Path) -> Iterator[tuple[str, pa.Table]]:
         "trino",
     ]
 )
+@pytest.mark.notimpl(["druid", "exasol", "oracle"])
 def test_register_parquet(
     con, tmp_path, data_dir, fname, in_table_name, out_table_name
 ):
@@ -252,6 +253,7 @@ def test_register_parquet(
         "trino",
     ]
 )
+@pytest.mark.notimpl(["druid", "exasol", "oracle"])
 def test_register_iterator_parquet(
     con,
     tmp_path,
@@ -280,7 +282,7 @@ def test_register_iterator_parquet(
 # TODO: remove entirely when `register` is removed
 # This same functionality is implemented across all backends
 # via `create_table` and tested in `test_client.py`
-@pytest.mark.notimpl(["datafusion"])
+@pytest.mark.notimpl(["datafusion", "druid", "exasol", "oracle"])
 @pytest.mark.notyet(
     [
         "bigquery",
@@ -316,7 +318,7 @@ def test_register_pandas(con):
 # TODO: remove entirely when `register` is removed
 # This same functionality is implemented across all backends
 # via `create_table` and tested in `test_client.py`
-@pytest.mark.notimpl(["datafusion", "polars"])
+@pytest.mark.notimpl(["datafusion", "polars", "druid", "exasol", "oracle"])
 @pytest.mark.notyet(
     [
         "bigquery",
@@ -361,6 +363,7 @@ def test_register_pyarrow_tables(con):
         "trino",
     ]
 )
+@pytest.mark.notimpl(["druid", "exasol", "oracle"])
 def test_csv_reregister_schema(con, tmp_path):
     foo = tmp_path.joinpath("foo.csv")
     with foo.open("w", newline="") as csvfile:
@@ -390,10 +393,13 @@ def test_csv_reregister_schema(con, tmp_path):
         "clickhouse",
         "dask",
         "datafusion",
+        "druid",
+        "exasol",
         "flink",
         "impala",
         "mysql",
         "mssql",
+        "oracle",
         "pandas",
         "polars",
         "postgres",
@@ -425,9 +431,8 @@ def test_register_garbage(con, monkeypatch):
         ("functional_alltypes.parquet", "funk_all"),
     ],
 )
-@pytest.mark.notyet(
-    ["flink", "impala", "mssql", "mysql", "postgres", "risingwave", "sqlite", "trino"]
-)
+@pytest.mark.notyet(["flink"])
+@pytest.mark.notimpl(["druid"])
 def test_read_parquet(con, tmp_path, data_dir, fname, in_table_name):
     pq = pytest.importorskip("pyarrow.parquet")
 
@@ -456,19 +461,8 @@ def ft_data(data_dir):
     return table.slice(0, nrows)
 
 
-@pytest.mark.notyet(
-    [
-        "flink",
-        "impala",
-        "mssql",
-        "mysql",
-        "pandas",
-        "postgres",
-        "risingwave",
-        "sqlite",
-        "trino",
-    ]
-)
+@pytest.mark.notyet(["flink", "pandas"])
+@pytest.mark.notimpl(["druid"])
 def test_read_parquet_glob(con, tmp_path, ft_data):
     pq = pytest.importorskip("pyarrow.parquet")
 
@@ -498,6 +492,7 @@ def test_read_parquet_glob(con, tmp_path, ft_data):
         "trino",
     ]
 )
+@pytest.mark.notimpl(["druid", "exasol", "oracle"])
 def test_read_csv_glob(con, tmp_path, ft_data):
     pc = pytest.importorskip("pyarrow.csv")
 
@@ -534,6 +529,7 @@ def test_read_csv_glob(con, tmp_path, ft_data):
     raises=ValueError,
     reason="read_json() missing required argument: 'schema'",
 )
+@pytest.mark.notimpl(["druid", "exasol", "oracle"])
 def test_read_json_glob(con, tmp_path, ft_data):
     nrows = len(ft_data)
     ntables = 2
@@ -580,6 +576,7 @@ def num_diamonds(data_dir):
 @pytest.mark.notyet(
     ["flink", "impala", "mssql", "mysql", "postgres", "risingwave", "sqlite", "trino"]
 )
+@pytest.mark.notimpl(["druid", "exasol", "oracle"])
 def test_read_csv(con, data_dir, in_table_name, num_diamonds):
     fname = "diamonds.csv"
     with pushd(data_dir / "csv"):

diff --git a/ibis/util.py b/ibis/util.py
@@ -18,6 +18,12 @@
 import warnings
 from types import ModuleType
 from typing import TYPE_CHECKING, Any, Generic, TypeVar
+from urllib.parse import (
+    urlparse,
+    uses_netloc,
+    uses_params,
+    uses_relative,
+)
 from uuid import uuid4
 
 import toolz
@@ -42,6 +48,10 @@
 # https://www.compart.com/en/unicode/U+2026
 HORIZONTAL_ELLIPSIS = "\u2026"
 
+# Every scheme listed in urllib.parse's `uses_relative`, `uses_netloc` and
+# `uses_params`, minus the empty string those lists may contain.
+_VALID_URLS = set(uses_relative + uses_netloc + uses_params) - {""}
+# Matches a leading `scheme://`: a letter followed by letters, digits, `+`,
+# `-` or `.`. NOTE: the second `+` inside the character class is redundant.
+_RFC_3986_PATTERN = re.compile(r"^[A-Za-z][A-Za-z0-9+\-+.]*://")
+
 
 def guid() -> str:
     """Return a uuid4 hexadecimal value."""
@@ -485,6 +495,42 @@ def import_object(qualname: str) -> Any:
         raise ImportError(f"cannot import name {name!r} from {mod_name!r}") from None
 
 
+def is_url(url: str) -> bool:
+    """Report whether a string's scheme is one of the known URL schemes.
+
+    The scheme is extracted with `urllib.parse.urlparse` and looked up in
+    the module-level `_VALID_URLS` set.
+
+    Parameters
+    ----------
+    url : str
+        The string to inspect.
+
+    Returns
+    -------
+    bool
+        True if the parsed scheme is in `_VALID_URLS`, False otherwise
+
+    """
+    scheme = urlparse(url).scheme
+    return scheme in _VALID_URLS
+
+
+def is_fsspec_url(url: str) -> bool:
+    """Report whether a string looks like a non-HTTP(S) `scheme://` URL.
+
+    The string must start with a scheme followed by `://` (as matched by
+    `_RFC_3986_PATTERN`) and must not start with `http://` or `https://`.
+
+    Parameters
+    ----------
+    url : str
+        The URL string to be checked.
+
+    Returns
+    -------
+    bool
+        True if the URL is likely compatible with fsspec, False otherwise.
+    """
+    # No leading `scheme://` at all: not an fsspec-style URL.
+    if _RFC_3986_PATTERN.match(url) is None:
+        return False
+    # Plain HTTP(S) URLs are excluded.
+    return not url.startswith(("http://", "https://"))
+
+
 def normalize_filename(source: str | Path) -> str:
     source = str(source)
     for prefix in (