auto chunk logic for segy export
Altay Sansal committed Nov 18, 2022
1 parent 9ee5998 commit fe7a6a5
Showing 2 changed files with 66 additions and 6 deletions.
9 changes: 3 additions & 6 deletions src/mdio/converters/mdio.py
@@ -15,6 +15,7 @@
from mdio.segy.byte_utils import Dtype
from mdio.segy.creation import concat_files
from mdio.segy.creation import mdio_spec_to_segy
from mdio.segy.utilities import segy_export_rechunker


try:
@@ -105,12 +106,8 @@ def mdio_to_segy( # noqa: C901
storage_options=storage_options,
)

ndim = mdio.n_dim

# We flatten the z-axis (time or depth); so ieee2ibm, and byte-swaps etc
# can run on big chunks of data.
auto_chunk = (None,) + ("300M",) * (ndim - 2) + (-1,)
new_chunks = new_chunks if new_chunks is not None else auto_chunk
if new_chunks is None:
    new_chunks = segy_export_rechunker(mdio.chunks, mdio.shape, mdio._traces.dtype)

creation_args = [
mdio_path_or_buffer,
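To illustrate the new fallback above (taken when a caller does not pass `new_chunks`), here is a minimal sketch that calls the helper directly; the shape, on-disk chunks, and dtype are hypothetical values chosen for illustration:

import numpy as np

from mdio.segy.utilities import segy_export_rechunker  # added by this commit

# Hypothetical 3-D post-stack layout: (inline, crossline, sample).
disk_chunks = (64, 64, 64)      # on-disk chunks, e.g. tuned for visualization
shape = (500, 400, 1500)        # full array shape
dtype = np.dtype("float32")

# Same call as the fallback in mdio_to_segy when `new_chunks is None`.
export_chunks = segy_export_rechunker(disk_chunks, shape, dtype)
print(export_chunks)  # larger virtual chunks; full extent along the sample axis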
63 changes: 63 additions & 0 deletions src/mdio/segy/utilities.py
@@ -8,6 +8,7 @@

import numpy as np
import numpy.typing as npt
from dask.array.core import auto_chunks

from mdio.core import Dimension
from mdio.segy.parsers import parse_sample_axis
@@ -70,3 +71,65 @@ def get_grid_plan(
dims.append(sample_dim)

return dims, index_headers if return_headers else dims


def segy_export_rechunker(
    chunks: tuple[int], shape: tuple[int], dtype: npt.DTypeLike, limit: str = "300M"
) -> tuple[int]:
    """Determine chunk sizes for writing out SEG-Y given a size limit.

    This function finds the desired chunk sizes for the given chunk size
    `limit` in depth-first order.

    On-disk chunks for MDIO are mainly optimized for visualization and ML
    applications. When we want to export back to SEG-Y, it makes sense to
    have larger virtual chunks for processing of traces. We also recursively
    merge multiple files to reduce the memory footprint.

    We choose to adjust chunks to be approximately 300 MB. We also need to do
    this in the order of the fastest changing axis to the slowest changing
    axis, because the traces are expected to be serialized in the natural
    data order.

    Args:
        chunks: The chunk sizes on disk.
        shape: Shape of the whole array.
        dtype: Numpy `dtype` of the array.
        limit: Chunk size limit, optional. Default is "300M".

    Returns:
        Adjusted chunk sizes for further processing.

    Raises:
        ValueError: If the resulting chunks would split a file on disk.
    """
    ndim = len(shape)

    # Set the sample-axis chunk to the full extent (single chunk).
    prev_chunks = chunks[:-1] + (shape[-1],)

    for idx in range(-2, -ndim, -1):
        tmp_chunks = prev_chunks[:idx] + ("auto",) + prev_chunks[idx + 1 :]

        new_chunks = auto_chunks(
            chunks=tmp_chunks,
            shape=shape,
            limit=limit,
            previous_chunks=prev_chunks,
            dtype=dtype,
        )

        # Ensure the chunk sizes are integers.
        new_chunks = tuple(map(int, new_chunks))
        prev_chunks = new_chunks

    if new_chunks < chunks:
        msg = (
            f"One of the chunk sizes in {new_chunks=} is smaller than the on-disk "
            f"{chunks=} with the given {limit=}. This will cause very poor "
            "performance due to redundant reads. Please increase the limit to "
            "get larger chunks. However, this may require more memory."
        )
        raise ValueError(msg)

    return new_chunks
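For reference, a small hedged demonstration of the dask `auto_chunks` building block used in the loop above, expanding a single "auto" dimension under a byte limit; the values are illustrative and the exact result depends on the installed dask version:

import numpy as np

from dask.array.core import auto_chunks

shape = (500, 400, 1500)
# Sample axis already forced to its full extent, mirroring `prev_chunks` above.
prev_chunks = (64, 64, 1500)

# Ask dask to pick the crossline (axis -2) chunk size under a ~300 MB limit.
tmp_chunks = (64, "auto", 1500)
new_chunks = auto_chunks(
    chunks=tmp_chunks,
    shape=shape,
    limit="300M",
    previous_chunks=prev_chunks,
    dtype=np.dtype("float32"),
)

# Cast to plain integers, as done in segy_export_rechunker.
print(tuple(map(int, new_chunks)))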
