Use magic bytes to identify file formats (#143)

* add more common file suffixes * determine format based on magic bytes * add release note * update docstring, keep netCDF4 alias * small contributing section, update env * add tests * ruff * disable tif,fits,hdf5 examples for now
zarr-developers · Jun 26, 2024 · 412c23c · 412c23c
1 parent 843197f
commit 412c23c
Show file tree

Hide file tree

Showing 7 changed files with 106 additions and 26 deletions.
diff --git a/docs/contributing.md b/docs/contributing.md
@@ -1,5 +1,16 @@
 # Contributing
 
+## Contributing code
+
+```bash
+mamba env create -f ci/environment.yml
+mamba activate virtualizarr-tests
+pre-commit install
+# git checkout -b new-feature
+python -m pip install -e . --no-deps
+python -m pytest ./virtualizarr --run-network-tests --cov=./ --cov-report=xml --verbose
+```
+
 ## Contributing documentation
 
 ### Build the documentation locally

diff --git a/docs/releases.rst b/docs/releases.rst
@@ -43,6 +43,8 @@ Internal Changes
   (:pull:`107`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
 - Mark tests which require network access so that they are only run when `--run-network-tests` is passed a command-line argument to pytest.
   (:pull:`144`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
+  - Determine file format from magic bytes rather than name suffix
+  (:pull:`143`) By `Scott Henderson <https://github.com/scottyhq>`_.
 
 .. _v0.1:
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -114,3 +114,8 @@ line-ending = "auto"
 
 [tool.ruff.lint.isort]
 known-first-party = ["virtualizarr"]
+
+[tool.pytest.ini_options]
+markers = [
+    "network: marks test requiring internet (select with '--run-network-tests')",
+]
diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py
@@ -33,7 +33,9 @@ def _generate_next_value_(name, start, count, last_values):
 
 class FileType(AutoName):
     netcdf3 = auto()
-    netcdf4 = auto()
+    netcdf4 = auto()  # NOTE: netCDF4 is a subset of hdf5
+    hdf4 = auto()
+    hdf5 = auto()
     grib = auto()
     tiff = auto()
     fits = auto()
@@ -86,7 +88,7 @@ def read_kerchunk_references_from_file(
 
         refs = NetCDF3ToZarr(filepath, inline_threshold=0, **reader_options).translate()
 
-    elif filetype.name.lower() == "netcdf4":
+    elif filetype.name.lower() == "hdf5" or filetype.name.lower() == "netcdf4":
         from kerchunk.hdf import SingleHdf5ToZarr
 
         refs = SingleHdf5ToZarr(
@@ -99,11 +101,11 @@ def read_kerchunk_references_from_file(
     elif filetype.name.lower() == "tiff":
         from kerchunk.tiff import tiff_to_zarr
 
-        refs = tiff_to_zarr(filepath, inline_threshold=0, **reader_options)
+        refs = tiff_to_zarr(filepath, **reader_options)
     elif filetype.name.lower() == "fits":
         from kerchunk.fits import process_file
 
-        refs = process_file(filepath, inline_threshold=0, **reader_options)
+        refs = process_file(filepath, **reader_options)
     else:
         raise NotImplementedError(f"Unsupported file type: {filetype.name}")
 
@@ -116,34 +118,34 @@ def _automatically_determine_filetype(
     filepath: str,
     reader_options: Optional[dict[str, Any]] = None,
 ) -> FileType:
-    file_extension = Path(filepath).suffix
+    if Path(filepath).suffix == ".zarr":
+        # TODO we could imagine opening an existing zarr store, concatenating it, and writing a new virtual one...
+        raise NotImplementedError()
+
+    # Read magic bytes from local or remote file
     fpath = _fsspec_openfile_from_filepath(
         filepath=filepath, reader_options=reader_options
     )
+    magic_bytes = fpath.read(8)
+    fpath.close()
 
-    if file_extension == ".nc":
-        # based off of: https://github.com/TomNicholas/VirtualiZarr/pull/43#discussion_r1543415167
-        magic = fpath.read()
-
-        if magic[0:3] == b"CDF":
-            filetype = FileType.netcdf3
-        elif magic[1:4] == b"HDF":
-            filetype = FileType.netcdf4
-        else:
-            raise ValueError(".nc file does not appear to be NETCDF3 OR NETCDF4")
-    elif file_extension == ".zarr":
-        # TODO we could imagine opening an existing zarr store, concatenating it, and writing a new virtual one...
-        raise NotImplementedError()
-    elif file_extension == ".grib":
+    if magic_bytes.startswith(b"CDF"):
+        filetype = FileType.netcdf3
+    elif magic_bytes.startswith(b"\x0e\x03\x13\x01"):
+        raise NotImplementedError("HDF4 formatted files not supported")
+    elif magic_bytes.startswith(b"\x89HDF"):
+        filetype = FileType.hdf5
+    elif magic_bytes.startswith(b"GRIB"):
         filetype = FileType.grib
-    elif file_extension == ".tiff":
+    elif magic_bytes.startswith(b"II*"):
         filetype = FileType.tiff
-    elif file_extension == ".fits":
+    elif magic_bytes.startswith(b"SIMPLE"):
         filetype = FileType.fits
     else:
-        raise NotImplementedError(f"Unrecognised file extension: {file_extension}")
+        raise NotImplementedError(
+            f"Unrecognised file based on header bytes: {magic_bytes}"
+        )
 
-    fpath.close()
     return filetype
 
 

diff --git a/virtualizarr/tests/test_kerchunk.py b/virtualizarr/tests/test_kerchunk.py
@@ -223,15 +223,43 @@ def test_automatically_determine_filetype_netcdf3_netcdf4():
     assert FileType("netcdf3") == _automatically_determine_filetype(
         filepath=netcdf3_file_path
     )
-    assert FileType("netcdf4") == _automatically_determine_filetype(
+    assert FileType("hdf5") == _automatically_determine_filetype(
         filepath=netcdf4_file_path
     )
 
 
+@pytest.mark.parametrize(
+    "filetype,headerbytes",
+    [
+        ("netcdf3", b"CDF"),
+        ("hdf5", b"\x89HDF"),
+        ("grib", b"GRIB"),
+        ("tiff", b"II*"),
+        ("fits", b"SIMPLE"),
+    ],
+)
+def test_valid_filetype_bytes(tmp_path, filetype, headerbytes):
+    filepath = tmp_path / "file.abc"
+    with open(filepath, "wb") as f:
+        f.write(headerbytes)
+    assert FileType(filetype) == _automatically_determine_filetype(filepath=filepath)
+
+
+def test_notimplemented_filetype(tmp_path):
+    for headerbytes in [b"JUNK", b"\x0e\x03\x13\x01"]:
+        filepath = tmp_path / "file.abc"
+        with open(filepath, "wb") as f:
+            f.write(headerbytes)
+        with pytest.raises(NotImplementedError):
+            _automatically_determine_filetype(filepath=filepath)
+
+
 def test_FileType():
     # tests if FileType converts user supplied strings to correct filetype
     assert "netcdf3" == FileType("netcdf3").name
     assert "netcdf4" == FileType("netcdf4").name
+    assert "hdf4" == FileType("hdf4").name
+    assert "hdf5" == FileType("hdf5").name
     assert "grib" == FileType("grib").name
     assert "tiff" == FileType("tiff").name
     assert "fits" == FileType("fits").name

diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py
@@ -308,6 +308,31 @@ def test_anon_read_s3(self, filetype, indexes):
             assert isinstance(vds[var].data, ManifestArray), var
 
 
+@network
+class TestReadFromURL:
+    def test_read_from_url(self):
+        examples = {
+            "grib": "https://github.com/pydata/xarray-data/raw/master/era5-2mt-2019-03-uk.grib",
+            "netcdf3": "https://github.com/pydata/xarray-data/raw/master/air_temperature.nc",
+            "netcdf4": "https://github.com/pydata/xarray-data/raw/master/ROMS_example.nc",
+            "hdf4": "https://github.com/corteva/rioxarray/raw/master/test/test_data/input/MOD09GA.A2008296.h14v17.006.2015181011753.hdf",
+            # https://github.com/zarr-developers/VirtualiZarr/issues/159
+            # "hdf5": "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/NEONDSTowerTemperatureData.hdf5",
+            # https://github.com/zarr-developers/VirtualiZarr/issues/160
+            # "tiff": "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/lcmap_tiny_cog_2020.tif",
+            # "fits": "https://fits.gsfc.nasa.gov/samples/WFPC2u5780205r_c0fx.fits",
+            "jpg": "https://github.com/rasterio/rasterio/raw/main/tests/data/389225main_sw_1965_1024.jpg",
+        }
+
+        for filetype, url in examples.items():
+            if filetype in ["grib", "jpg", "hdf4"]:
+                with pytest.raises(NotImplementedError):
+                    vds = open_virtual_dataset(url, reader_options={})
+            else:
+                vds = open_virtual_dataset(url, reader_options={})
+                assert isinstance(vds, xr.Dataset)
+
+
 class TestLoadVirtualDataset:
     def test_loadable_variables(self, netcdf4_file):
         vars_to_load = ["air", "time"]
@@ -325,6 +350,13 @@ def test_loadable_variables(self, netcdf4_file):
             if name in vars_to_load:
                 xrt.assert_identical(vds.variables[name], full_ds.variables[name])
 
+    def test_explicit_filetype(self, netcdf4_file):
+        with pytest.raises(ValueError):
+            open_virtual_dataset(netcdf4_file, filetype="unknown")
+
+        with pytest.raises(NotImplementedError):
+            open_virtual_dataset(netcdf4_file, filetype="grib")
+
     @patch("virtualizarr.kerchunk.read_kerchunk_references_from_file")
     def test_open_virtual_dataset_passes_expected_args(
         self, mock_read_kerchunk, netcdf4_file

diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py
@@ -55,8 +55,8 @@ def open_virtual_dataset(
         File path to open as a set of virtualized zarr arrays.
     filetype : FileType, default None
         Type of file to be opened. Used to determine which kerchunk file format backend to use.
-        Can be one of {'netCDF3', 'netCDF4', 'zarr_v3'}.
-        If not provided will attempt to automatically infer the correct filetype from the the filepath's extension.
+        Can be one of {'netCDF3', 'netCDF4', 'HDF', 'TIFF', 'GRIB', 'FITS', 'zarr_v3'}.
+        If not provided will attempt to automatically infer the correct filetype from header bytes.
     drop_variables: list[str], default is None
         Variables in the file to drop before returning.
     loadable_variables: list[str], default is None