auto chunk logic for segy export
Altay Sansal committed Nov 18, 2022
1 parent 9ee5998 commit fe7a6a5
Showing 2 changed files with 66 additions and 6 deletions.
9 changes: 3 additions & 6 deletions src/mdio/converters/mdio.py
@@ -15,6 +15,7 @@
from mdio.segy.byte_utils import Dtype
from mdio.segy.creation import concat_files
from mdio.segy.creation import mdio_spec_to_segy
from mdio.segy.utilities import segy_export_rechunker


try:
@@ -105,12 +106,8 @@ def mdio_to_segy( # noqa: C901
storage_options=storage_options,
)

ndim = mdio.n_dim

# We flatten the z-axis (time or depth); so ieee2ibm, and byte-swaps etc
# can run on big chunks of data.
auto_chunk = (None,) + ("300M",) * (ndim - 2) + (-1,)
new_chunks = new_chunks if new_chunks is not None else auto_chunk
if new_chunks is None:
    new_chunks = segy_export_rechunker(mdio.chunks, mdio.shape, mdio._traces.dtype)

creation_args = [
mdio_path_or_buffer,
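To illustrate the new fallback above (taken when a caller does not pass `new_chunks`), here is a minimal sketch that calls the helper directly; the shape, on-disk chunks, and dtype are hypothetical values chosen for illustration:

import numpy as np

from mdio.segy.utilities import segy_export_rechunker  # added by this commit

# Hypothetical 3-D post-stack layout: (inline, crossline, sample).
disk_chunks = (64, 64, 64)      # on-disk chunks, e.g. tuned for visualization
shape = (500, 400, 1500)        # full array shape
dtype = np.dtype("float32")

# Same call as the fallback in mdio_to_segy when `new_chunks is None`.
export_chunks = segy_export_rechunker(disk_chunks, shape, dtype)
print(export_chunks)  # larger virtual chunks; full extent along the sample axis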
63 changes: 63 additions & 0 deletions src/mdio/segy/utilities.py
@@ -8,6 +8,7 @@

import numpy as np
import numpy.typing as npt
from dask.array.core import auto_chunks

from mdio.core import Dimension
from mdio.segy.parsers import parse_sample_axis
@@ -70,3 +71,65 @@ def get_grid_plan(
dims.append(sample_dim)

return dims, index_headers if return_headers else dims


def segy_export_rechunker(
    chunks: tuple[int], shape: tuple[int], dtype: npt.DTypeLike, limit: str = "300M"
) -> tuple[int]:
    """Determine chunk sizes for writing out SEG-Y given a size limit.

    This function finds the desired chunk sizes for the given chunk size
    `limit` in depth-first order.

    On-disk chunks for MDIO are mainly optimized for visualization and ML
    applications. When we want to export back to SEG-Y, it makes sense to
    have larger virtual chunks for processing of traces. We also recursively
    merge multiple files to reduce the memory footprint.

    We choose to adjust chunks to be approximately 300 MB. We also need to do
    this in the order of the fastest changing axis to the slowest changing
    axis, because the traces are expected to be serialized in the natural
    data order.

    Args:
        chunks: The chunk sizes on disk.
        shape: Shape of the whole array.
        dtype: Numpy `dtype` of the array.
        limit: Chunk size limit, optional. Default is "300M".

    Returns:
        Adjusted chunk sizes for further processing.

    Raises:
        ValueError: If the resulting chunks would split a file on disk.
    """
    ndim = len(shape)

    # Set the sample-axis chunk to the full extent (single chunk).
    prev_chunks = chunks[:-1] + (shape[-1],)

    for idx in range(-2, -ndim, -1):
        tmp_chunks = prev_chunks[:idx] + ("auto",) + prev_chunks[idx + 1 :]

        new_chunks = auto_chunks(
            chunks=tmp_chunks,
            shape=shape,
            limit=limit,
            previous_chunks=prev_chunks,
            dtype=dtype,
        )

        # Ensure the chunk sizes are integers.
        new_chunks = tuple(map(int, new_chunks))
        prev_chunks = new_chunks

    if new_chunks < chunks:
        msg = (
            f"One of the chunk sizes in {new_chunks=} is smaller than the on-disk "
            f"{chunks=} with the given {limit=}. This will cause very poor "
            "performance due to redundant reads. Please increase the limit to "
            "get larger chunks. However, this may require more memory."
        )
        raise ValueError(msg)

    return new_chunks
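For reference, a small hedged demonstration of the dask `auto_chunks` building block used in the loop above, expanding a single "auto" dimension under a byte limit; the values are illustrative and the exact result depends on the installed dask version:

import numpy as np

from dask.array.core import auto_chunks

shape = (500, 400, 1500)
# Sample axis already forced to its full extent, mirroring `prev_chunks` above.
prev_chunks = (64, 64, 1500)

# Ask dask to pick the crossline (axis -2) chunk size under a ~300 MB limit.
tmp_chunks = (64, "auto", 1500)
new_chunks = auto_chunks(
    chunks=tmp_chunks,
    shape=shape,
    limit="300M",
    previous_chunks=prev_chunks,
    dtype=np.dtype("float32"),
)

# Cast to plain integers, as done in segy_export_rechunker.
print(tuple(map(int, new_chunks)))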
