Skip to content

Commit

Permalink
NaN filling between partitions with overlap (#165)
Browse files Browse the repository at this point in the history
* simplify NaN filling in FlashLoader and make it lazy

* add the map_overlap method to do intrafile filling

* add ffill as dfops with tests

* add optional df len computation

* add test for df len compute

* implement multipass ffill

* add config for ffill multipass iteration

* add pyarrow

* load metadata directly for parquet files using pyarrow for speedup

---------

Co-authored-by: Steinn Ymir Agustsson <[email protected]>
  • Loading branch information
zain-sohail and steinnymir authored Oct 10, 2023
1 parent 8f49f40 commit e3af145
Show file tree
Hide file tree
Showing 6 changed files with 215 additions and 66 deletions.
75 changes: 71 additions & 4 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ tifffile = ">=2022.2.9, <2023.0.0"
tqdm = "^4.62.3"
xarray = "^0.20.2"
joblib = "^1.2.0"
pyarrow = "^13.0.0"
jupyter = {version = "^1.0.0", extras = ["notebook"], optional = true}
ipykernel = {version = "^6.9.1", extras = ["notebook"], optional = true}
sphinx = {version = ">4.4.0", extras = ["docs"], optional = true}
Expand Down
3 changes: 2 additions & 1 deletion sed/config/flash_example_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ core:
dataframe:
# The offset correction to the pulseId
ubid_offset: 5

# The number of forward-fill passes applied to the pulseId column.
forward_fill_iterations: 2
# The name of the DAQ system to use. Necessary to resolve the filenames/paths.
daq: fl1user3

Expand Down
55 changes: 55 additions & 0 deletions sed/core/dfops.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from typing import Union

import dask.dataframe
from dask.diagnostics import ProgressBar
import numpy as np
import pandas as pd

Expand Down Expand Up @@ -138,3 +139,57 @@ def map_columns_2d(
)

return df


def forward_fill_lazy(
    df: dask.dataframe.DataFrame,
    channels: Sequence[str],
    before: Union[str, int] = 'max',
    compute_lengths: bool = False,
    iterations: int = 2,
) -> dask.dataframe.DataFrame:
    """Forward fill the specified columns multiple times in a dask dataframe.

    Allows forward filling between partitions. This is useful for dataframes
    that have sparse data, such as those with many NaNs.
    Running the forward filling multiple times can fix the issue of having
    entire partitions consisting of NaNs. By default we run this twice, which
    is enough to fix the issue for dataframes with no consecutive partitions of NaNs.

    Args:
        df (dask.dataframe.DataFrame): The dataframe to forward fill.
        channels (Sequence[str]): The columns to forward fill.
        before (int, str, optional): The number of rows to include before the current
            partition when overlapping. If 'max', uses the size of the smallest
            partition in the dataframe, so the overlap can always bridge one
            fully-NaN partition. Defaults to 'max'.
        compute_lengths (bool, optional): Whether to eagerly compute the partition
            lengths (with a progress bar) before overlapping. Defaults to False.
        iterations (int, optional): The number of forward-fill passes to run.
            Defaults to 2.

    Raises:
        TypeError: If ``before`` is neither an integer nor the string 'max'.

    Returns:
        dask.dataframe.DataFrame: The dataframe with the specified columns forward filled.
    """

    def forward_fill_partition(partition):
        # Fill NaNs within a single (already overlapped) partition.
        partition[channels] = partition[channels].ffill()
        return partition

    # Determine the overlap size: the length of the smallest partition,
    # which is the most rows we can safely borrow from a neighbor.
    if before == 'max':
        nrows = df.map_partitions(len)
        if compute_lengths:
            with ProgressBar():
                print("Computing dataframe shape...")
                nrows = nrows.compute()
        before = min(nrows)
    elif not isinstance(before, int):
        raise TypeError('before must be an integer or "max"')

    # Each map_overlap pass pulls values across one partition boundary,
    # so `iterations` passes can bridge runs of NaN-only partitions.
    for _ in range(iterations):
        df = df.map_overlap(
            forward_fill_partition,
            before=before,
            after=0,
        )
    return df
Loading

0 comments on commit e3af145

Please sign in to comment.