From fe7a6a51428b2fbea505fb05c39411e59774a594 Mon Sep 17 00:00:00 2001
From: Altay Sansal
Date: Fri, 18 Nov 2022 15:47:50 -0600
Subject: [PATCH] auto chunk logic for segy export

---
 src/mdio/converters/mdio.py |  9 ++----
 src/mdio/segy/utilities.py  | 63 +++++++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+), 6 deletions(-)

diff --git a/src/mdio/converters/mdio.py b/src/mdio/converters/mdio.py
index d614cc7a..ec4964e7 100644
--- a/src/mdio/converters/mdio.py
+++ b/src/mdio/converters/mdio.py
@@ -15,6 +15,7 @@
 from mdio.segy.byte_utils import Dtype
 from mdio.segy.creation import concat_files
 from mdio.segy.creation import mdio_spec_to_segy
+from mdio.segy.utilities import segy_export_rechunker
 
 
 try:
@@ -105,12 +106,8 @@ def mdio_to_segy(  # noqa: C901
         storage_options=storage_options,
     )
 
-    ndim = mdio.n_dim
-
-    # We flatten the z-axis (time or depth); so ieee2ibm, and byte-swaps etc
-    # can run on big chunks of data.
-    auto_chunk = (None,) + ("300M",) * (ndim - 2) + (-1,)
-    new_chunks = new_chunks if new_chunks is not None else auto_chunk
+    if new_chunks is None:
+        new_chunks = segy_export_rechunker(mdio.chunks, mdio.shape, mdio._traces.dtype)
 
     creation_args = [
         mdio_path_or_buffer,
diff --git a/src/mdio/segy/utilities.py b/src/mdio/segy/utilities.py
index e32a597b..ded521c6 100644
--- a/src/mdio/segy/utilities.py
+++ b/src/mdio/segy/utilities.py
@@ -8,6 +8,7 @@
 
 import numpy as np
 import numpy.typing as npt
+from dask.array.core import auto_chunks
 
 from mdio.core import Dimension
 from mdio.segy.parsers import parse_sample_axis
@@ -70,3 +71,65 @@ def get_grid_plan(
     dims.append(sample_dim)
 
     return dims, index_headers if return_headers else dims
+
+
+def segy_export_rechunker(
+    chunks: tuple[int], shape: tuple[int], dtype: npt.DTypeLike, limit: str = "300M"
+) -> tuple[int]:
+    """Determine chunk sizes for writing out SEG-Y given a size limit.
+
+    This function finds the desired chunk sizes for the given chunk size
+    `limit` in depth-first order.
+
+    On-disk chunks for MDIO are mainly optimized for visualization
+    and ML applications. When we export back to SEG-Y, it makes sense
+    to have larger virtual chunks for processing traces. We also
+    recursively merge multiple files to reduce the memory footprint.
+
+    We choose to adjust chunks to be approximately 300 MB. We also need
+    to do this in order from the fastest changing axis to the slowest
+    changing axis, because the traces are expected to be serialized in
+    the natural data order.
+
+    Args:
+        chunks: The chunk sizes on disk.
+        shape: Shape of the whole array.
+        dtype: Numpy `dtype` of the array.
+        limit: Chunk size limit, optional. Default is "300M".
+
+    Returns:
+        Adjusted chunk sizes for further processing.
+
+    Raises:
+        ValueError: If the resulting chunks would split the on-disk chunks.
+    """
+    ndim = len(shape)
+
+    # Set the sample (last) axis chunks to the full axis length.
+    prev_chunks = chunks[:-1] + (shape[-1],)
+
+    for idx in range(-2, -ndim, -1):
+        tmp_chunks = prev_chunks[:idx] + ("auto",) + prev_chunks[idx + 1 :]
+
+        new_chunks = auto_chunks(
+            chunks=tmp_chunks,
+            shape=shape,
+            limit=limit,
+            previous_chunks=prev_chunks,
+            dtype=dtype,
+        )
+
+        # Ensure the chunk sizes are integers.
+        new_chunks = tuple(map(int, new_chunks))
+        prev_chunks = new_chunks
+
+    if new_chunks < chunks:
+        msg = (
+            f"One of the chunk sizes in {new_chunks=} is smaller than the "
+            f"on-disk {chunks=} with the given {limit=}. This will cause very "
+            "poor performance due to redundant reads. Please increase the "
+            "limit to get larger chunks; however, this may require more memory."
+        )
+        raise ValueError(msg)
+
+    return new_chunks
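
Usage note (not part of the patch): a minimal sketch of how the new `segy_export_rechunker` helper could be exercised for a 3-D post-stack volume. The chunk sizes and array shape below are made-up illustration values, not taken from any real dataset:

    import numpy as np

    from mdio.segy.utilities import segy_export_rechunker

    # Hypothetical on-disk chunks (optimized for visualization/ML) and full array shape.
    disk_chunks = (64, 64, 64)
    shape = (512, 512, 1501)

    # The sample (last) axis is first expanded to the full 1501 samples, then the
    # remaining axes are grown toward the ~300 MB target, fastest-changing axis first.
    export_chunks = segy_export_rechunker(
        chunks=disk_chunks,
        shape=shape,
        dtype=np.dtype("float32"),
        limit="300M",
    )

    # Expect something like (64, 512, 1501), though the exact middle-axis size
    # depends on how dask's auto-chunking rounds against the previous chunks.
    print(export_chunks)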