Skip to content

Commit

Permalink
Use magic bytes to identify file formats (#143)
Browse files Browse the repository at this point in the history
* add more common file suffixes

* determine format based on magic bytes

* add release note

* update docstring, keep netCDF4 alias

* small contributing section, update env

* add tests

* ruff

* disable tif,fits,hdf5 examples for now
  • Loading branch information
scottyhq authored Jun 26, 2024
1 parent 843197f commit 412c23c
Show file tree
Hide file tree
Showing 7 changed files with 106 additions and 26 deletions.
11 changes: 11 additions & 0 deletions docs/contributing.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,16 @@
# Contributing

## Contributing code

```bash
mamba env create -f ci/environment.yml
mamba activate virtualizarr-tests
pre-commit install
# git checkout -b new-feature
python -m pip install -e . --no-deps
python -m pytest ./virtualizarr --run-network-tests --cov=./ --cov-report=xml --verbose
```

## Contributing documentation

### Build the documentation locally
Expand Down
2 changes: 2 additions & 0 deletions docs/releases.rst
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ Internal Changes
(:pull:`107`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
- Mark tests which require network access so that they are only run when `--run-network-tests` is passed a command-line argument to pytest.
(:pull:`144`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
- Determine file format from magic bytes rather than name suffix
(:pull:`143`) By `Scott Henderson <https://github.com/scottyhq>`_.

.. _v0.1:

Expand Down
5 changes: 5 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -114,3 +114,8 @@ line-ending = "auto"

[tool.ruff.lint.isort]
known-first-party = ["virtualizarr"]

[tool.pytest.ini_options]
markers = [
"network: marks test requiring internet (select with '--run-network-tests')",
]
48 changes: 25 additions & 23 deletions virtualizarr/kerchunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@ def _generate_next_value_(name, start, count, last_values):

class FileType(AutoName):
netcdf3 = auto()
netcdf4 = auto()
netcdf4 = auto() # NOTE: netCDF4 is a subset of hdf5
hdf4 = auto()
hdf5 = auto()
grib = auto()
tiff = auto()
fits = auto()
Expand Down Expand Up @@ -86,7 +88,7 @@ def read_kerchunk_references_from_file(

refs = NetCDF3ToZarr(filepath, inline_threshold=0, **reader_options).translate()

elif filetype.name.lower() == "netcdf4":
elif filetype.name.lower() == "hdf5" or filetype.name.lower() == "netcdf4":
from kerchunk.hdf import SingleHdf5ToZarr

refs = SingleHdf5ToZarr(
Expand All @@ -99,11 +101,11 @@ def read_kerchunk_references_from_file(
elif filetype.name.lower() == "tiff":
from kerchunk.tiff import tiff_to_zarr

refs = tiff_to_zarr(filepath, inline_threshold=0, **reader_options)
refs = tiff_to_zarr(filepath, **reader_options)
elif filetype.name.lower() == "fits":
from kerchunk.fits import process_file

refs = process_file(filepath, inline_threshold=0, **reader_options)
refs = process_file(filepath, **reader_options)
else:
raise NotImplementedError(f"Unsupported file type: {filetype.name}")

Expand All @@ -116,34 +118,34 @@ def _automatically_determine_filetype(
filepath: str,
reader_options: Optional[dict[str, Any]] = None,
) -> FileType:
file_extension = Path(filepath).suffix
if Path(filepath).suffix == ".zarr":
# TODO we could imagine opening an existing zarr store, concatenating it, and writing a new virtual one...
raise NotImplementedError()

# Read magic bytes from local or remote file
fpath = _fsspec_openfile_from_filepath(
filepath=filepath, reader_options=reader_options
)
magic_bytes = fpath.read(8)
fpath.close()

if file_extension == ".nc":
# based off of: https://github.com/TomNicholas/VirtualiZarr/pull/43#discussion_r1543415167
magic = fpath.read()

if magic[0:3] == b"CDF":
filetype = FileType.netcdf3
elif magic[1:4] == b"HDF":
filetype = FileType.netcdf4
else:
raise ValueError(".nc file does not appear to be NETCDF3 OR NETCDF4")
elif file_extension == ".zarr":
# TODO we could imagine opening an existing zarr store, concatenating it, and writing a new virtual one...
raise NotImplementedError()
elif file_extension == ".grib":
if magic_bytes.startswith(b"CDF"):
filetype = FileType.netcdf3
elif magic_bytes.startswith(b"\x0e\x03\x13\x01"):
raise NotImplementedError("HDF4 formatted files not supported")
elif magic_bytes.startswith(b"\x89HDF"):
filetype = FileType.hdf5
elif magic_bytes.startswith(b"GRIB"):
filetype = FileType.grib
elif file_extension == ".tiff":
elif magic_bytes.startswith(b"II*"):
filetype = FileType.tiff
elif file_extension == ".fits":
elif magic_bytes.startswith(b"SIMPLE"):
filetype = FileType.fits
else:
raise NotImplementedError(f"Unrecognised file extension: {file_extension}")
raise NotImplementedError(
f"Unrecognised file based on header bytes: {magic_bytes}"
)

fpath.close()
return filetype


Expand Down
30 changes: 29 additions & 1 deletion virtualizarr/tests/test_kerchunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,15 +223,43 @@ def test_automatically_determine_filetype_netcdf3_netcdf4():
assert FileType("netcdf3") == _automatically_determine_filetype(
filepath=netcdf3_file_path
)
assert FileType("netcdf4") == _automatically_determine_filetype(
assert FileType("hdf5") == _automatically_determine_filetype(
filepath=netcdf4_file_path
)


@pytest.mark.parametrize(
"filetype,headerbytes",
[
("netcdf3", b"CDF"),
("hdf5", b"\x89HDF"),
("grib", b"GRIB"),
("tiff", b"II*"),
("fits", b"SIMPLE"),
],
)
def test_valid_filetype_bytes(tmp_path, filetype, headerbytes):
filepath = tmp_path / "file.abc"
with open(filepath, "wb") as f:
f.write(headerbytes)
assert FileType(filetype) == _automatically_determine_filetype(filepath=filepath)


def test_notimplemented_filetype(tmp_path):
for headerbytes in [b"JUNK", b"\x0e\x03\x13\x01"]:
filepath = tmp_path / "file.abc"
with open(filepath, "wb") as f:
f.write(headerbytes)
with pytest.raises(NotImplementedError):
_automatically_determine_filetype(filepath=filepath)


def test_FileType():
# tests if FileType converts user supplied strings to correct filetype
assert "netcdf3" == FileType("netcdf3").name
assert "netcdf4" == FileType("netcdf4").name
assert "hdf4" == FileType("hdf4").name
assert "hdf5" == FileType("hdf5").name
assert "grib" == FileType("grib").name
assert "tiff" == FileType("tiff").name
assert "fits" == FileType("fits").name
Expand Down
32 changes: 32 additions & 0 deletions virtualizarr/tests/test_xarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,31 @@ def test_anon_read_s3(self, filetype, indexes):
assert isinstance(vds[var].data, ManifestArray), var


@network
class TestReadFromURL:
def test_read_from_url(self):
examples = {
"grib": "https://github.com/pydata/xarray-data/raw/master/era5-2mt-2019-03-uk.grib",
"netcdf3": "https://github.com/pydata/xarray-data/raw/master/air_temperature.nc",
"netcdf4": "https://github.com/pydata/xarray-data/raw/master/ROMS_example.nc",
"hdf4": "https://github.com/corteva/rioxarray/raw/master/test/test_data/input/MOD09GA.A2008296.h14v17.006.2015181011753.hdf",
# https://github.com/zarr-developers/VirtualiZarr/issues/159
# "hdf5": "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/NEONDSTowerTemperatureData.hdf5",
# https://github.com/zarr-developers/VirtualiZarr/issues/160
# "tiff": "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/lcmap_tiny_cog_2020.tif",
# "fits": "https://fits.gsfc.nasa.gov/samples/WFPC2u5780205r_c0fx.fits",
"jpg": "https://github.com/rasterio/rasterio/raw/main/tests/data/389225main_sw_1965_1024.jpg",
}

for filetype, url in examples.items():
if filetype in ["grib", "jpg", "hdf4"]:
with pytest.raises(NotImplementedError):
vds = open_virtual_dataset(url, reader_options={})
else:
vds = open_virtual_dataset(url, reader_options={})
assert isinstance(vds, xr.Dataset)


class TestLoadVirtualDataset:
def test_loadable_variables(self, netcdf4_file):
vars_to_load = ["air", "time"]
Expand All @@ -325,6 +350,13 @@ def test_loadable_variables(self, netcdf4_file):
if name in vars_to_load:
xrt.assert_identical(vds.variables[name], full_ds.variables[name])

def test_explicit_filetype(self, netcdf4_file):
with pytest.raises(ValueError):
open_virtual_dataset(netcdf4_file, filetype="unknown")

with pytest.raises(NotImplementedError):
open_virtual_dataset(netcdf4_file, filetype="grib")

@patch("virtualizarr.kerchunk.read_kerchunk_references_from_file")
def test_open_virtual_dataset_passes_expected_args(
self, mock_read_kerchunk, netcdf4_file
Expand Down
4 changes: 2 additions & 2 deletions virtualizarr/xarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,8 @@ def open_virtual_dataset(
File path to open as a set of virtualized zarr arrays.
filetype : FileType, default None
Type of file to be opened. Used to determine which kerchunk file format backend to use.
Can be one of {'netCDF3', 'netCDF4', 'zarr_v3'}.
If not provided will attempt to automatically infer the correct filetype from the the filepath's extension.
Can be one of {'netCDF3', 'netCDF4', 'HDF', 'TIFF', 'GRIB', 'FITS', 'zarr_v3'}.
If not provided will attempt to automatically infer the correct filetype from header bytes.
drop_variables: list[str], default is None
Variables in the file to drop before returning.
loadable_variables: list[str], default is None
Expand Down

0 comments on commit 412c23c

Please sign in to comment.