From db48d74a3843177317907fa075313bc75c6480e3 Mon Sep 17 00:00:00 2001
From: Dimitri Vlachos <dimitrios.vlachos@diamond.ac.uk>
Date: Tue, 5 Nov 2024 10:54:49 +0000
Subject: [PATCH] Implement extended dispersion spotfinding (#1)

Implement extended dispersion spotfinding

Implement a GPU-based version of the extended dispersion spotfinding
algorithm. This builds on regular dispersion by making two passes.
This allows for the detection of fainter spots by using the first pass
to detect candidate spots and exclude them from the background
calculation in the second pass.

Extended dispersion spotfinding is unavoidably slower than regular
dispersion by the fact that it requires two passes. However, the
performance gained through massively parallel processing on the GPU
should make this a viable option, when needed, even for fast feedback.

Create several CUDA kernels to perform the extended dispersion
spotfinding algorithm (`threshold.cu`, `erosion.cu`).

Refactor the dispersion kernel to share code with extended dispersion.

Move common code to `cuda_common.hpp`.

Create basic test script for extended dispersion spotfinding.

Add an `--algorithm` argument to `spotfinder.cc` along with the
necessary code to parse it, allowing for algorithm selection at runtime.

Add new files to the CMakeLists.txt file to include them in the build.

See also: #12, #13, #14
---
 include/common.hpp                  |   3 +
 include/cuda_common.hpp             |  38 ++-
 spotfinder/CMakeLists.txt           |   2 +
 spotfinder/kernels/erosion.cu       | 114 +++++++++
 spotfinder/kernels/erosion.cuh      |  11 +
 spotfinder/kernels/masking.cu       |  15 +-
 spotfinder/kernels/thresholding.cu  | 356 ++++++++++++++++++++++++++++
 spotfinder/kernels/thresholding.cuh |  51 ++++
 spotfinder/spotfinder.cc            | 105 ++++----
 spotfinder/spotfinder.cu            | 353 ++++++++++++++++++---------
 spotfinder/spotfinder.cuh           |  49 ++--
 tests/dispersion_extended.sh        |  41 ++++
 tests/res_comparison.sh             |   1 +
 13 files changed, 963 insertions(+), 176 deletions(-)
 create mode 100644 spotfinder/kernels/erosion.cu
 create mode 100644 spotfinder/kernels/erosion.cuh
 create mode 100644 spotfinder/kernels/thresholding.cu
 create mode 100644 spotfinder/kernels/thresholding.cuh
 create mode 100755 tests/dispersion_extended.sh
diff --git a/include/common.hpp b/include/common.hpp
index ca6f84b..b37ae02 100644
--- a/include/common.hpp
+++ b/include/common.hpp
@@ -12,6 +12,9 @@
 #include <type_traits>
 #include <vector>
 
+#define VALID_PIXEL 1
+#define MASKED_PIXEL 0
+
 template <typename T1, typename... TS>
 auto with_formatting(const std::string &code, const T1 &first, TS... args)
   -> std::string {
diff --git a/include/cuda_common.hpp b/include/cuda_common.hpp
index f51cf29..8e4f212 100644
--- a/include/cuda_common.hpp
+++ b/include/cuda_common.hpp
@@ -266,6 +266,42 @@ auto make_cuda_pitched_malloc(size_t width, size_t height) {
     return std::make_pair(std::shared_ptr<T[]>(obj, deleter), pitch / sizeof(T));
 }
 
+/**
+ * @brief Function to allocate a pitched memory buffer on the GPU.
+ * @param data The pointer to the allocated memory.
+ * @param width The width of the buffer.
+ * @param height The height of the buffer.
+ * @param pitch The pitch of the buffer.
+ */
+template <typename T>
+struct PitchedMalloc {
+  public:
+    using value_type = T;
+    PitchedMalloc(std::shared_ptr<T[]> data, size_t width, size_t height, size_t pitch)
+        : _data(data), width(width), height(height), pitch(pitch) {}
+
+    PitchedMalloc(size_t width, size_t height) : width(width), height(height) {
+        auto [alloc, alloc_pitch] = make_cuda_pitched_malloc<T>(width, height);
+        _data = alloc;
+        pitch = alloc_pitch;
+    }
+
+    auto get() {
+        return _data.get();
+    }
+    auto size_bytes() -> size_t const {
+        return pitch * height * sizeof(T);
+    }
+    auto pitch_bytes() -> size_t const {
+        return pitch * sizeof(T);
+    }
+
+    std::shared_ptr<T[]> _data;
+    size_t width;
+    size_t height;
+    size_t pitch;
+};
+
 class CudaStream {
     cudaStream_t _stream;
 
@@ -409,7 +445,7 @@ void save_device_data_to_txt(PixelType *device_ptr,
     for (int y = 0, k = 0; y < height; ++y) {
         for (int x = 0; x < width; ++x, ++k) {
             if (condition_func(host_data[k])) {
-                out.print("{}, {}\n", x, y);
+                out.print("{}, {}, {}\n", x, y, host_data[k]);
             }
         }
     }
diff --git a/spotfinder/CMakeLists.txt b/spotfinder/CMakeLists.txt
index a378ba4..87991a1 100644
--- a/spotfinder/CMakeLists.txt
+++ b/spotfinder/CMakeLists.txt
@@ -12,6 +12,8 @@ add_executable(spotfinder
     shmread.cc
     cbfread.cc
     kernels/masking.cu
+    kernels/thresholding.cu
+    kernels/erosion.cu
 )
 target_link_libraries(spotfinder
     PRIVATE
diff --git a/spotfinder/kernels/erosion.cu b/spotfinder/kernels/erosion.cu
new file mode 100644
index 0000000..2c29e08
--- /dev/null
+++ b/spotfinder/kernels/erosion.cu
@@ -0,0 +1,114 @@
+/**
+ * @file erosion.cu
+ * @brief Contains the CUDA kernel implementation for performing
+ *        morphological erosion on an image using a given kernel radius
+ *        and Chebyshev distance threshold.
+ *
+ * This kernel processes a dispersion mask containing potential signal
+ * spots and background pixels. The kernel processes each pixel in the
+ * mask and iterates over its local neighbourhood defined by a given
+ * kernel radius. The kernel then checks if each pixel is within a given
+ * Chebyshev distance threshold of a background pixel. If the pixel is
+ * within the threshold, it is considered part of the background and is
+ * marked as a background pixel in the erosion mask. Therefore eroding
+ * the edges of the signal spots.
+ * 
+ * @note The __restrict__ keyword is used to indicate to the compiler
+ *       that the two pointers are not aliased, allowing the compiler to
+ *       perform more aggressive optimizations.
+ */
+
+#include <cooperative_groups.h>
+#include <cooperative_groups/reduce.h>
+
+#include "cuda_common.hpp"
+#include "erosion.cuh"
+
+namespace cg = cooperative_groups;
+
+#pragma region Erosion kernel
+__global__ void erosion(uint8_t __restrict__ *dispersion_mask,
+                        uint8_t __restrict__ *erosion_mask,
+                        uint8_t __restrict__ *mask,
+                        size_t dispersion_mask_pitch,
+                        size_t erosion_mask_pitch,
+                        size_t mask_pitch,
+                        int width,
+                        int height,
+                        uint8_t radius) {
+    // Calculate the pixel coordinates
+    auto block = cg::this_thread_block();
+    int x = block.group_index().x * block.group_dim().x + block.thread_index().x;
+    int y = block.group_index().y * block.group_dim().y + block.thread_index().y;
+
+    // Guards
+    if (x >= width || y >= height) return;  // Out of bounds guard
+
+    bool is_background = dispersion_mask[y * dispersion_mask_pitch + x] == MASKED_PIXEL;
+    if (is_background) {
+        /*
+         * If the pixel is masked, we want to set it to VALID_PIXEL
+         * in order to invert the mask.
+        */
+        erosion_mask[y * erosion_mask_pitch + x] = VALID_PIXEL;
+        return;
+    }
+
+    // Calculate the bounds of the erosion kernel
+    int x_start = max(0, x - radius);
+    int x_end = min(x + radius + 1, width);
+    int y_start = max(0, y - radius);
+    int y_end = min(y + radius + 1, height);
+
+    bool should_erase = false;  // Flag to determine if the pixel should be erased
+    constexpr uint8_t chebyshev_distance_threshold = 2;
+
+    // Iterate over the kernel bounds
+    for (int kernel_x = x_start; kernel_x < x_end; ++kernel_x) {
+        for (int kernel_y = y_start; kernel_y < y_end; ++kernel_y) {
+            /*
+             * TODO: Investigate whether we should be doing this or not!
+             * Intuition says that we should be considering the mask,
+             * however DIALS does not do this. May be a bug, may be on
+             * purpose? Investigate!
+            */
+            // if (mask[kernel_y * mask_pitch + kernel_x] == 0) {
+            //     continue;
+            // }
+            if (dispersion_mask[kernel_y * dispersion_mask_pitch + kernel_x]
+                == MASKED_PIXEL) {
+                // If the current pixel is background, check the Chebyshev distance
+                uint8_t chebyshev_distance = max(abs(kernel_x - x), abs(kernel_y - y));
+
+                if (chebyshev_distance <= chebyshev_distance_threshold) {
+                    // If a background pixel is too close, the current pixel should be erased
+                    should_erase = true;
+                    // We can then break out of the loop, as no further checks are necessary
+                    goto termination;
+                }
+            }
+        }
+    }
+
+termination:
+    if (should_erase) {
+        /*
+         * Erase the pixel from the background mask. This is done by setting the pixel
+         * as valid (i.e. not masked) in the erosion_mask data. This allows the pixel to be
+         * considered as a background pixel in the background calculation as it is not
+         * considered part of the signal.
+        */
+        erosion_mask[y * erosion_mask_pitch + x] = VALID_PIXEL;
+    } else {
+        /*
+         * If the pixel should not be erased, this means that it is part of the signal.
+         * and needs to be marked as masked in the erosion_mask data. This prevents the pixel
+         * from being considered as part of the background in the background calculation.
+        */
+
+        // Invert 'valid' signal spot to 'masked' background spots
+        erosion_mask[y * erosion_mask_pitch + x] =
+          !dispersion_mask[y * dispersion_mask_pitch + x];
+    }
+}
+#pragma enregion Erosion kernel
\ No newline at end of file
diff --git a/spotfinder/kernels/erosion.cuh b/spotfinder/kernels/erosion.cuh
new file mode 100644
index 0000000..2fd8f79
--- /dev/null
+++ b/spotfinder/kernels/erosion.cuh
@@ -0,0 +1,11 @@
+#pragma once
+
+__global__ void erosion(uint8_t __restrict__ *dispersion_mask,
+                               uint8_t __restrict__ *erosion_mask,
+                               uint8_t __restrict__ *mask,
+                               size_t dispersion_mask_pitch,
+                               size_t erosion_mask_pitch,
+                               size_t mask_pitch,
+                               int width,
+                               int height,
+                               uint8_t radius);
\ No newline at end of file
diff --git a/spotfinder/kernels/masking.cu b/spotfinder/kernels/masking.cu
index cd9dfd8..ed13c5c 100644
--- a/spotfinder/kernels/masking.cu
+++ b/spotfinder/kernels/masking.cu
@@ -33,18 +33,21 @@
  * @param pixel_size_y The pixel size of the detector in the y-direction in m
  * @return The calculated distance from the beam center in m
 */
-__device__ float get_distance_from_centre(float x,
+__device__ float get_distance_from_center(float x,
                                           float y,
-                                          float centre_x,
-                                          float centre_y,
+                                          float center_x,
+                                          float center_y,
                                           float pixel_size_x,
                                           float pixel_size_y) {
     /*
      * Since this calculation is for a broad, general exclusion, we can
      * use basic Pythagoras to calculate the distance from the center.
+     * 
+     * We add 0.5 in order to move to the center of the pixel. Since
+     * starting at 0, 0, for instance, would infact be the corner.
     */
-    float dx = (x - centre_x) * pixel_size_x;
-    float dy = (y - centre_y) * pixel_size_y;
+    float dx = ((x + 0.5f) - center_x) * pixel_size_x;
+    float dy = ((y + 0.5f) - center_y) * pixel_size_y;
     return sqrtf(dx * dx + dy * dy);
 }
 
@@ -117,7 +120,7 @@ __global__ void apply_resolution_mask(uint8_t *mask,
         return;
     }
 
-    float distance_from_centre = get_distance_from_centre(
+    float distance_from_centre = get_distance_from_center(
       x, y, beam_center_x, beam_center_y, pixel_size_x, pixel_size_y);
     float resolution =
       get_resolution(wavelength, distance_to_detector, distance_from_centre);
diff --git a/spotfinder/kernels/thresholding.cu b/spotfinder/kernels/thresholding.cu
new file mode 100644
index 0000000..30cdc31
--- /dev/null
+++ b/spotfinder/kernels/thresholding.cu
@@ -0,0 +1,356 @@
+/**
+ * @file thresholding.cu
+ * @brief Contains CUDA kernel implementations for thresholding image 
+ *        data.
+ *
+ * This file contains CUDA kernels for thresholding image data based on
+ * the variance and mean of the local neighbourhood of each pixel. These
+ * thresholds are used to identify potential signal spots against a
+ * background in the input image based on local variance, mean, and
+ * user-defined significance levels.
+ * 
+ * @note The __restrict__ keyword is used to indicate to the compiler
+ *       that the two pointers are not aliased, allowing the compiler to
+ *       perform more aggressive optimizations.
+ */
+
+#include <cooperative_groups.h>
+#include <cooperative_groups/reduce.h>
+
+#include <cuda/std/tuple>
+
+#include "cuda_common.hpp"
+#include "thresholding.cuh"
+
+namespace cg = cooperative_groups;
+
+#pragma region Device Functions
+/**
+ * @brief Calculate the dispersion flags for a given pixel.
+ * @param image Pointer to the input image data.
+ * @param mask Pointer to the mask data indicating valid pixels.
+ * @param image_pitch The pitch of the image data.
+ * @param mask_pitch The pitch of the mask data.
+ * @param this_pixel The pixel value at the current position.
+ * @param x The x-coordinate of the pixel.
+ * @param y The y-coordinate of the pixel.
+ * @param width The width of the image.
+ * @param height The height of the image.
+ * @param kernel_width The radius of the kernel in the x-direction.
+ * @param kernel_height The radius of the kernel in the y-direction.
+ * @param min_count Minimum number of valid pixels required to be considered a valid spot.
+ * @param n_sig_b Background noise significance level.
+ * @param n_sig_s Signal significance level.
+ * @return A tuple <bool not_background, bool is_signal, uint8_t n> where:
+ *        - not_background: True if the pixel is not part of the background.
+ *        - is_signal: True if the pixel is a strong signal.
+ *        - n: The number of valid pixels in the local neighbourhood.
+ */
+__device__ cuda::std::tuple<bool, bool, uint8_t> calculate_dispersion_flags(
+  pixel_t *image,
+  uint8_t *mask,
+  size_t image_pitch,
+  size_t mask_pitch,
+  pixel_t this_pixel,
+  int x,
+  int y,
+  int width,
+  int height,
+  uint8_t kernel_width,
+  uint8_t kernel_height,
+  uint8_t min_count,
+  float n_sig_b,
+  float n_sig_s) {
+    // Initialize variables for computing the local sum and count
+    uint sum = 0;
+    size_t sumsq = 0;
+    uint8_t n = 0;
+
+    int row_start = max(0, y - kernel_height);
+    int row_end = min(y + kernel_height + 1, height);
+
+    for (int row = row_start; row < row_end; ++row) {
+        int row_offset = image_pitch * row;
+        int mask_offset = mask_pitch * row;
+
+        int col_start = max(0, x - kernel_width);
+        int col_end = min(x + kernel_width + 1, width);
+
+        for (int col = col_start; col < col_end; ++col) {
+            pixel_t pixel = image[row_offset + col];
+            uint8_t mask_pixel = mask[mask_offset + col];
+            bool include_pixel = mask_pixel != 0;  // If the pixel is valid
+            if (include_pixel) {
+                sum += pixel;
+                sumsq += pixel * pixel;
+                n += 1;
+            }
+        }
+    }
+
+    // Compute local mean and variance
+    float sum_f = static_cast<float>(sum);
+    float sumsq_f = static_cast<float>(sumsq);
+
+    float mean = sum_f / n;
+    float variance = (n * sumsq_f - (sum_f * sum_f)) / (n * (n - 1));
+    float dispersion = variance / mean;
+
+    // Compute the background threshold and signal threshold
+    float background_threshold = 1 + n_sig_b * sqrt(2.0f / (n - 1));
+    bool not_background = dispersion > background_threshold;
+    float signal_threshold = mean + n_sig_s * sqrt(mean);
+
+    // Check if the pixel is a strong pixel
+    bool is_signal = this_pixel > signal_threshold;
+
+    return {not_background, is_signal, n};
+}
+#pragma endregion
+
+#pragma region Thresholding kernels
+/**
+ * @brief Kernel for computing the basic threshold based on variance and mean.
+ * @param image Pointer to the input image data.
+ * @param mask Pointer to the mask data indicating valid pixels.
+ * @param result_mask Pointer to the output mask data where results will be stored.
+ * @param image_pitch The pitch of the image data.
+ * @param mask_pitch The pitch of the mask data.
+ * @param result_pitch The pitch of the result mask data.
+ * @param width The width of the image.
+ * @param height The height of the image.
+ * @param max_valid_pixel_value The maximum valid trusted pixel value.
+ * @param min_count Minimum number of valid pixels required to be considered a valid spot.
+ * @param threshold The global threshold for intensity.
+ * @param n_sig_b Background noise significance level.
+ * @param n_sig_s Signal significance level.
+ */
+__global__ void dispersion(pixel_t __restrict__ *image,
+                           uint8_t __restrict__ *mask,
+                           uint8_t __restrict__ *result_mask,
+                           size_t image_pitch,
+                           size_t mask_pitch,
+                           size_t result_pitch,
+                           int width,
+                           int height,
+                           pixel_t max_valid_pixel_value,
+                           uint8_t kernel_width,
+                           uint8_t kernel_height,
+                           uint8_t min_count,
+                           float n_sig_b,
+                           float n_sig_s) {
+    // Move pointers to the correct slice
+    image = image + (image_pitch * height * blockIdx.z);
+    result_mask = result_mask + (mask_pitch * height * blockIdx.z);
+
+    // Calculate the pixel coordinates
+    auto block = cg::this_thread_block();
+    int x = block.group_index().x * block.group_dim().x + block.thread_index().x;
+    int y = block.group_index().y * block.group_dim().y + block.thread_index().y;
+
+    if (x >= width || y >= height) return;  // Out of bounds guard
+
+    pixel_t this_pixel = image[y * image_pitch + x];
+
+    // Check if the pixel is masked and below the maximum valid pixel value
+    bool px_is_valid =
+      mask[y * mask_pitch + x] != 0 && this_pixel <= max_valid_pixel_value;
+
+    // Validity guard
+    if (!px_is_valid) {
+        result_mask[x + result_pitch * y] = 0;
+        return;
+    }
+
+    // Calculate the dispersion flags
+    auto [not_background, is_signal, n] = calculate_dispersion_flags(image,
+                                                                     mask,
+                                                                     image_pitch,
+                                                                     mask_pitch,
+                                                                     this_pixel,
+                                                                     x,
+                                                                     y,
+                                                                     width,
+                                                                     height,
+                                                                     kernel_width,
+                                                                     kernel_height,
+                                                                     min_count,
+                                                                     n_sig_b,
+                                                                     n_sig_s);
+
+    result_mask[x + result_pitch * y] = not_background && is_signal && n > 1;
+}
+
+/**
+ * @brief Kernel for computing the dispersion threshold.
+ * @param image Pointer to the input image data.
+ * @param mask Pointer to the mask data indicating valid pixels.
+ * @param result_mask Pointer to the output mask data where results will be stored.
+ * @param image_pitch The pitch of the image data.
+ * @param mask_pitch The pitch of the mask data.
+ * @param result_pitch The pitch of the result mask data.
+ * @param width The width of the image.
+ * @param height The height of the image.
+ * @param max_valid_pixel_value The maximum valid trusted pixel value.
+ * @param kernel_width The radius of the kernel in the x-direction.
+ * @param kernel_height The radius of the kernel in the y-direction.
+ * @param min_count Minimum number of valid pixels required to be considered a valid spot.
+ * @param n_sig_b Background noise significance level.
+ * @param n_sig_s Signal significance level.
+ */
+__global__ void dispersion_extended_first_pass(pixel_t __restrict__ *image,
+                                               uint8_t __restrict__ *mask,
+                                               uint8_t __restrict__ *result_mask,
+                                               size_t image_pitch,
+                                               size_t mask_pitch,
+                                               size_t result_pitch,
+                                               int width,
+                                               int height,
+                                               pixel_t max_valid_pixel_value,
+                                               uint8_t kernel_width,
+                                               uint8_t kernel_height,
+                                               uint8_t min_count,
+                                               float n_sig_b,
+                                               float n_sig_s) {
+    // Move pointers to the correct slice
+    image = image + (image_pitch * height * blockIdx.z);
+    result_mask = result_mask + (mask_pitch * height * blockIdx.z);
+
+    // Calculate the pixel coordinates
+    auto block = cg::this_thread_block();
+    int x = block.group_index().x * block.group_dim().x + block.thread_index().x;
+    int y = block.group_index().y * block.group_dim().y + block.thread_index().y;
+
+    if (x >= width || y >= height) return;  // Out of bounds guard
+
+    pixel_t this_pixel = image[y * image_pitch + x];
+
+    // Check if the pixel is masked and below the maximum valid pixel value
+    bool px_is_valid =
+      mask[y * mask_pitch + x] != 0 && this_pixel <= max_valid_pixel_value;
+
+    // Validity guard
+    if (!px_is_valid) {
+        result_mask[x + result_pitch * y] = 0;
+        return;
+    }
+
+    // Calculate the dispersion flags
+    auto [not_background, is_signal, n] = calculate_dispersion_flags(image,
+                                                                     mask,
+                                                                     image_pitch,
+                                                                     mask_pitch,
+                                                                     this_pixel,
+                                                                     x,
+                                                                     y,
+                                                                     width,
+                                                                     height,
+                                                                     kernel_width,
+                                                                     kernel_height,
+                                                                     min_count,
+                                                                     n_sig_b,
+                                                                     n_sig_s);
+
+    result_mask[x + result_pitch * y] = not_background && n > 1;
+}
+
+/**
+ * @brief Kernel for computing the final threshold after dispersion mask.
+ * @param image Pointer to the input image data.
+ * @param mask Pointer to the mask data indicating valid pixels.
+ * @param dispersion_mask Pointer to the dispersion mask used for extended algorithm.
+ * @param result_mask Pointer to the output mask data where results will be stored.
+ * @param image_pitch The pitch of the image data.
+ * @param mask_pitch The pitch of the mask data.
+ * @param dispersion_mask_pitch The pitch of the dispersion mask data.
+ * @param result_mask_pitch The pitch of the result mask data.
+ * @param width The width of the image.
+ * @param height The height of the image.
+ * @param max_valid_pixel_value The maximum valid trusted pixel value.
+ * @param n_sig_s Signal significance level.
+ * @param threshold Global threshold for the intensity.
+ */
+__global__ void dispersion_extended_second_pass(pixel_t __restrict__ *image,
+                                                uint8_t __restrict__ *mask,
+                                                uint8_t __restrict__ *dispersion_mask,
+                                                uint8_t __restrict__ *result_mask,
+                                                size_t image_pitch,
+                                                size_t mask_pitch,
+                                                size_t dispersion_mask_pitch,
+                                                size_t result_mask_pitch,
+                                                int width,
+                                                int height,
+                                                pixel_t max_valid_pixel_value,
+                                                uint8_t kernel_width,
+                                                uint8_t kernel_height,
+                                                float n_sig_s,
+                                                float threshold) {
+    // Move pointers to the correct slice
+    image = image + (image_pitch * height * blockIdx.z);
+    result_mask = result_mask + (result_mask_pitch * height * blockIdx.z);
+
+    // Calculate the pixel coordinates
+    auto block = cg::this_thread_block();
+    int x = block.group_index().x * block.group_dim().x + block.thread_index().x;
+    int y = block.group_index().y * block.group_dim().y + block.thread_index().y;
+
+    if (x >= width || y >= height) return;  // Out of bounds guard
+
+    pixel_t this_pixel = image[y * image_pitch + x];
+
+    // Check if the pixel is masked and below the maximum valid pixel value
+    bool px_is_valid =
+      mask[y * mask_pitch + x] != 0 && this_pixel <= max_valid_pixel_value;
+
+    // Initialize variables for computing the local sum and count
+    uint sum = 0;
+    uint8_t n = 0;
+
+    int row_start = max(0, y - kernel_height);
+    int row_end = min(y + kernel_height + 1, height);
+
+    for (int row = row_start; row < row_end; ++row) {
+        int row_offset = image_pitch * row;
+        int mask_offset = mask_pitch * row;
+
+        int col_start = max(0, x - kernel_width);
+        int col_end = min(x + kernel_width + 1, width);
+
+        for (int col = col_start; col < col_end; ++col) {
+            pixel_t pixel = image[row_offset + col];
+            uint8_t mask_pixel = mask[mask_offset + col];
+            uint8_t disp_mask_pixel =
+              dispersion_mask[row * dispersion_mask_pitch + col];
+            /*
+               * Check if the pixel is valid. That means that it is not
+               * masked and was not marked as potentially signal in the
+               * dispersion mask.
+              */
+            bool include_pixel =
+              mask_pixel != MASKED_PIXEL && disp_mask_pixel != MASKED_PIXEL;
+            if (include_pixel) {
+                sum += pixel;
+                n += 1;
+            }
+        }
+    }
+
+    // Calculate the thresholding
+    if (px_is_valid && n > 0) {
+        float sum_f = static_cast<float>(sum);
+
+        // The pixel must have been marked as potentially signal in the dispersion mask
+        bool disp_mask = dispersion_mask[y * dispersion_mask_pitch + x] == MASKED_PIXEL;
+        // The pixel must be above the global threshold
+        bool global_mask = image[y * image_pitch + x] > threshold;
+        // Calculate the local mean
+        float mean = (n > 1 ? sum_f / n : 0);  // If n is less than 1, set mean to 0
+        // The pixel must be above the local threshold
+        bool local_mask = image[y * image_pitch + x] >= (mean + n_sig_s * sqrtf(mean));
+
+        result_mask[y * result_mask_pitch + x] = disp_mask && global_mask && local_mask;
+    } else {
+        result_mask[y * result_mask_pitch + x] = 0;
+    }
+}
+#pragma endregion
\ No newline at end of file
diff --git a/spotfinder/kernels/thresholding.cuh b/spotfinder/kernels/thresholding.cuh
new file mode 100644
index 0000000..61fb4a1
--- /dev/null
+++ b/spotfinder/kernels/thresholding.cuh
@@ -0,0 +1,51 @@
+#pragma once
+
+#include "h5read.h"
+
+using pixel_t = H5Read::image_type;
+
+__global__ void dispersion(pixel_t __restrict__ *image,
+                                         uint8_t __restrict__ *mask,
+                                         uint8_t __restrict__ *result_mask,
+                                         size_t image_pitch,
+                                         size_t mask_pitch,
+                                         size_t result_pitch,
+                                         int width,
+                                         int height,
+                                         pixel_t max_valid_pixel_value,
+                                         uint8_t kernel_width,
+                                         uint8_t kernel_height,
+                                         uint8_t min_count,
+                                         float n_sig_b,
+                                         float n_sig_s);
+
+__global__ void dispersion_extended_first_pass(pixel_t __restrict__ *image,
+                                                    uint8_t __restrict__ *mask,
+                                                    uint8_t __restrict__ *result_mask,
+                                                    size_t image_pitch,
+                                                    size_t mask_pitch,
+                                                    size_t result_pitch,
+                                                    int width,
+                                                    int height,
+                                                    pixel_t max_valid_pixel_value,
+                                                    uint8_t kernel_width,
+                                                    uint8_t kernel_height,
+                                                    uint8_t min_count,
+                                                    float n_sig_b,
+                                                    float n_sig_s);
+
+__global__ void dispersion_extended_second_pass(pixel_t __restrict__ *image,
+                                               uint8_t __restrict__ *mask,
+                                               uint8_t __restrict__ *dispersion_mask,
+                                               uint8_t __restrict__ *result_mask,
+                                               size_t image_pitch,
+                                               size_t mask_pitch,
+                                               size_t dispersion_mask_pitch,
+                                               size_t result_mask_pitch,
+                                               int width,
+                                               int height,
+                                               pixel_t max_valid_pixel_value,
+                                               uint8_t kernel_width,
+                                               uint8_t kernel_height,
+                                               float n_sig_s,
+                                               float threshold);
diff --git a/spotfinder/spotfinder.cc b/spotfinder/spotfinder.cc
index 73f3775..f40924b 100644
--- a/spotfinder/spotfinder.cc
+++ b/spotfinder/spotfinder.cc
@@ -39,6 +39,7 @@ using json = nlohmann::json;
 std::stop_source global_stop;
 
 constexpr auto fmt_cyan = fmt::fg(fmt::terminal_color::cyan) | fmt::emphasis::bold;
+constexpr auto fmt_green = fmt::fg(fmt::terminal_color::green) | fmt::emphasis::bold;
 
 // Function for passing to std::signal to register the stop request
 extern "C" void stop_processing(int sig) {
@@ -65,35 +66,6 @@ struct Reflection {
     int num_pixels = 0;
 };
 
-template <typename T>
-struct PitchedMalloc {
-  public:
-    using value_type = T;
-    PitchedMalloc(std::shared_ptr<T[]> data, size_t width, size_t height, size_t pitch)
-        : _data(data), width(width), height(height), pitch(pitch) {}
-
-    PitchedMalloc(size_t width, size_t height) : width(width), height(height) {
-        auto [alloc, alloc_pitch] = make_cuda_pitched_malloc<T>(width, height);
-        _data = alloc;
-        pitch = alloc_pitch;
-    }
-
-    auto get() {
-        return _data.get();
-    }
-    auto size_bytes() -> size_t const {
-        return pitch * height * sizeof(T);
-    }
-    auto pitch_bytes() -> size_t const {
-        return pitch * sizeof(T);
-    }
-
-    std::shared_ptr<T[]> _data;
-    size_t width;
-    size_t height;
-    size_t pitch;
-};
-
 /// Copy the mask from a reader into a pitched GPU area
 template <typename T>
 auto upload_mask(T &reader) -> PitchedMalloc<uint8_t> {
@@ -211,6 +183,34 @@ void wait_for_ready_for_read(const std::string &path,
     }
 }
 
+/**
+ * @brief Struct to store the dispersion algorithm and its string representation.
+ */
+struct DispersionAlgorithm {
+    std::string algorithm_str;
+    enum class Algorithm { DISPERSION, DISPERSION_EXTENDED };
+    Algorithm algorithm;
+
+    /**
+     * @brief Constructor to initialize the DispersionAlgorithm object.
+     * @param input The string representation of the algorithm.
+     */
+    DispersionAlgorithm(std::string input) {
+        // Convert the input to lowercase for case-insensitive comparison
+        this->algorithm_str = input;
+        std::transform(input.begin(), input.end(), input.begin(), ::tolower);
+        if (input == "dispersion") {
+            this->algorithm_str = "Dispersion";
+            this->algorithm = Algorithm::DISPERSION;
+        } else if (input == "dispersion_extended") {
+            this->algorithm_str = "Dispersion Extended";  // ✨
+            this->algorithm = Algorithm::DISPERSION_EXTENDED;
+        } else {
+            throw std::invalid_argument("Invalid algorithm specified");
+        }
+    }
+};
+
 /**
  * @brief Class for handling a pipe and sending data through it in a thread-safe manner.
  */
@@ -305,6 +305,10 @@ int main(int argc, char **argv) {
       .metavar("FD")
       .default_value<int>(-1)
       .scan<'i', int>();
+    parser.add_argument("-a", "--algorithm")
+      .help("Dispersion algorithm to use")
+      .metavar("ALGO")
+      .default_value<std::string>("dispersion_extended");
     parser.add_argument("--dmin")
       .help("Minimum resolution (Å)")
       .metavar("MIN D")
@@ -330,6 +334,9 @@ int main(int argc, char **argv) {
     float dmin = parser.get<float>("dmin");
     float dmax = parser.get<float>("dmax");
 
+    DispersionAlgorithm dispersion_algorithm(parser.get<std::string>("algorithm"));
+    print("Algorithm: {}\n", styled(dispersion_algorithm.algorithm_str, fmt_green));
+
     uint32_t num_cpu_threads = parser.get<uint32_t>("threads");
     if (num_cpu_threads < 1) {
         print("Error: Thread count must be >= 1\n");
@@ -721,19 +728,33 @@ int main(int argc, char **argv) {
 
 #pragma region Spotfinding
                 // When done, launch the spotfind kernel
-                // do_spotfinding_naive<<<blocks_dims, gpu_thread_block_size, 0, stream>>>(
-                call_do_spotfinding_naive(blocks_dims,
-                                          gpu_thread_block_size,
-                                          0,
-                                          stream,
-                                          device_image.get(),
-                                          device_image.pitch,
-                                          mask.get(),
-                                          mask.pitch,
-                                          width,
-                                          height,
-                                          trusted_px_max,
-                                          device_results.get());
+                switch (dispersion_algorithm.algorithm) {
+                case DispersionAlgorithm::Algorithm::DISPERSION:
+                    call_do_spotfinding_dispersion(blocks_dims,
+                                                   gpu_thread_block_size,
+                                                   0,
+                                                   stream,
+                                                   device_image,
+                                                   mask,
+                                                   width,
+                                                   height,
+                                                   trusted_px_max,
+                                                   &device_results);
+                    break;
+                case DispersionAlgorithm::Algorithm::DISPERSION_EXTENDED:
+                    call_do_spotfinding_extended(blocks_dims,
+                                                 gpu_thread_block_size,
+                                                 0,
+                                                 stream,
+                                                 device_image,
+                                                 mask,
+                                                 width,
+                                                 height,
+                                                 trusted_px_max,
+                                                 &device_results,
+                                                 do_writeout);
+                    break;
+                }
                 post.record(stream);
 
                 // Copy the results buffer back to the CPU
diff --git a/spotfinder/spotfinder.cu b/spotfinder/spotfinder.cu
index cf0a449..47df6c4 100644
--- a/spotfinder/spotfinder.cu
+++ b/spotfinder/spotfinder.cu
@@ -9,118 +9,257 @@
 #include <cooperative_groups.h>
 #include <cooperative_groups/reduce.h>
 
+#include "kernels/erosion.cuh"
+#include "kernels/thresholding.cuh"
 #include "spotfinder.cuh"
 
-#define VALID_PIXEL 1
-#define MASKED_PIXEL 0
-
 namespace cg = cooperative_groups;
 
-#pragma region Spotfinding
-__global__ void do_spotfinding_naive(pixel_t *image,
-                                     size_t image_pitch,
-                                     uint8_t *mask,
-                                     size_t mask_pitch,
-                                     int width,
-                                     int height,
-                                     pixel_t max_valid_pixel_value,
-                                     //  int *result_sum,
-                                     //  size_t *result_sumsq,
-                                     //  uint8_t *result_n,
-                                     uint8_t *result_strong) {
-    image = image + (image_pitch * height * blockIdx.z);
-    // result_sum = result_sum + (image_pitch * height * blockIdx.z);
-    // result_sumsq = result_sumsq + (image_pitch * height * blockIdx.z);
-    // result_n = result_n + (mask_pitch * height * blockIdx.z);
-    result_strong = result_strong + (mask_pitch * height * blockIdx.z);
-
-    auto block = cg::this_thread_block();
-    // auto warp = cg::tiled_partition<32>(block);
-    // int warpId = warp.meta_group_rank();
-    // int lane = warp.thread_rank();
-
-    uint sum = 0;
-    size_t sumsq = 0;
-    uint8_t n = 0;
-
-    int x = block.group_index().x * block.group_dim().x + block.thread_index().x;
-    int y = block.group_index().y * block.group_dim().y + block.thread_index().y;
-
-    // Don't calculate for masked pixels
-    pixel_t this_pixel = image[y * image_pitch + x];
-    bool px_is_valid =
-      mask[y * mask_pitch + x] != 0 && this_pixel <= max_valid_pixel_value;
-
-    if (px_is_valid) {
-        for (int row = max(0, y - KERNEL_HEIGHT);
-             row < min(y + KERNEL_HEIGHT + 1, height);
-             ++row) {
-            int row_offset = image_pitch * row;
-            int mask_offset = mask_pitch * row;
-            for (int col = max(0, x - KERNEL_WIDTH);
-                 col < min(x + KERNEL_WIDTH + 1, width);
-                 ++col) {
-                pixel_t pixel = image[row_offset + col];
-                uint8_t mask_pixel = mask[mask_offset + col];
-                if (mask_pixel) {
-                    sum += pixel;
-                    sumsq += pixel * pixel;
-                    n += 1;
-                }
-            }
-        }
+/**
+ * @brief Device function for writing out debug information in PNG and TXT formats.
+ *
+ * This function writes the specified device data to both PNG and TXT files,
+ * applying a pixel conversion function for PNG output and a condition function
+ * for TXT output.
+ *
+ * @tparam PixelTransformFunc A callable object that takes a pixel value and returns a transformed value.
+ * @tparam PixelConditionFunc A callable object that checks a condition on a pixel.
+ * @param device_data Pointer to the device data.
+ * @param pitch_bytes The pitch of the data in bytes.
+ * @param width The width of the image.
+ * @param height The height of the image.
+ * @param stream The CUDA stream.
+ * @param filename The base filename for the output.
+ * @param pixel_transform The function to transform pixel values for PNG output.
+ * @param condition The function to check pixel conditions for TXT output.
+ */
+template <typename PixelTransformFunc, typename PixelConditionFunc>
+void debug_writeout(uint8_t *device_data,
+                    size_t pitch_bytes,
+                    int width,
+                    int height,
+                    cudaStream_t stream,
+                    const char *filename,
+                    PixelTransformFunc pixel_transform,
+                    PixelConditionFunc condition) {
+    // Write to PNG using the pixel transformation function
+    save_device_data_to_png(
+      device_data, pitch_bytes, width, height, stream, filename, pixel_transform);
+
+    // Write to TXT using the condition function
+    save_device_data_to_txt(
+      device_data, pitch_bytes, width, height, stream, filename, condition);
+}
+
+#pragma region Launch Wrappers
+/**
+ * @brief Wrapper function to call the dispersion-based spotfinding algorithm.
+ * This function launches the `dispersion` kernel to perform
+ * the spotfinding based on the basic dispersion threshold.
+ *
+ * @param blocks The dimensions of the grid of blocks.
+ * @param threads The dimensions of the grid of threads within each block.
+ * @param shared_memory The size of shared memory required per block (in bytes).
+ * @param stream The CUDA stream to execute the kernel.
+ * @param image PitchedMalloc object for the image data.
+ * @param mask PitchedMalloc object for the mask data indicating valid pixels.
+ * @param width The width of the image.
+ * @param height The height of the image.
+ * @param max_valid_pixel_value The maximum valid trusted pixel value.
+ * @param result_strong (Output) Device pointer for the strong pixel mask data to be written to.
+ * @param min_count The minimum number of valid pixels required in the local neighborhood. Default is 3.
+ * @param n_sig_b The background noise significance level. Default is 6.0.
+ * @param n_sig_s The signal significance level. Default is 3.0.
+ */
+void call_do_spotfinding_dispersion(dim3 blocks,
+                                    dim3 threads,
+                                    size_t shared_memory,
+                                    cudaStream_t stream,
+                                    PitchedMalloc<pixel_t> &image,
+                                    PitchedMalloc<uint8_t> &mask,
+                                    int width,
+                                    int height,
+                                    pixel_t max_valid_pixel_value,
+                                    PitchedMalloc<uint8_t> *result_strong,
+                                    uint8_t min_count,
+                                    float n_sig_b,
+                                    float n_sig_s) {
+    /// One-direction width of kernel. Total kernel span is (K * 2 + 1)
+    constexpr uint8_t basic_kernel_radius = 3;
+
+    // Launch the dispersion threshold kernel
+    dispersion<<<blocks, threads, shared_memory, stream>>>(
+      image.get(),            // Image data pointer
+      mask.get(),             // Mask data pointer
+      result_strong->get(),   // Output mask pointer
+      image.pitch,            // Image pitch
+      mask.pitch,             // Mask pitch
+      result_strong->pitch,   // Output mask pitch
+      width,                  // Image width
+      height,                 // Image height
+      max_valid_pixel_value,  // Maximum valid pixel value
+      basic_kernel_radius,    // Kernel width
+      basic_kernel_radius,    // Kernel height
+      min_count,              // Minimum count
+      n_sig_b,                // Background significance level
+      n_sig_s                 // Signal significance level
+    );
+
+    cudaStreamSynchronize(
+      stream);  // Synchronize the CUDA stream to ensure the kernel is complete
+}
+
+/**
+ * @brief Wrapper function to call the extended dispersion-based spotfinding algorithm.
+ * This function launches the `dispersion_extended_second_pass` for final thresholding
+ * after applying the dispersion mask and the `dispersion_extended_first_pass`
+ * for initial thresholding.
+ *
+ * @param blocks The dimensions of the grid of blocks.
+ * @param threads The dimensions of the grid of threads within each block.
+ * @param shared_memory The size of shared memory required per block (in bytes).
+ * @param stream The CUDA stream to execute the kernel.
+ * @param image PitchedMalloc object for the image data.
+ * @param mask PitchedMalloc object for the mask data indicating valid pixels.
+ * @param width The width of the image.
+ * @param height The height of the image.
+ * @param max_valid_pixel_value The maximum valid trusted pixel value.
+ * @param result_strong (Output) Device pointer for the strong pixel mask data to be written to.
+ * @param do_writeout Flag to indicate if the output should be written to file. Default is false.
+ * @param min_count The minimum number of valid pixels required in the local neighborhood. Default is 3.
+ * @param n_sig_b The background noise significance level. Default is 6.0.
+ * @param n_sig_s The signal significance level. Default is 3.0.
+ * @param threshold The global threshold for intensity values. Default is 0.
+ */
+void call_do_spotfinding_extended(dim3 blocks,
+                                  dim3 threads,
+                                  size_t shared_memory,
+                                  cudaStream_t stream,
+                                  PitchedMalloc<pixel_t> &image,
+                                  PitchedMalloc<uint8_t> &mask,
+                                  int width,
+                                  int height,
+                                  pixel_t max_valid_pixel_value,
+                                  PitchedMalloc<uint8_t> *result_strong,
+                                  bool do_writeout,
+                                  uint8_t min_count,
+                                  float n_sig_b,
+                                  float n_sig_s,
+                                  float threshold) {
+    // Allocate intermediate buffer for the dispersion mask on the device
+    PitchedMalloc<uint8_t> d_dispersion_mask(width, height);
+    PitchedMalloc<uint8_t> d_erosion_mask(width, height);
+
+    constexpr uint8_t first_pass_kernel_radius = 3;
+    constexpr uint8_t second_pass_kernel_radius = 5;
+
+    /*
+     * First pass 🔎
+     * Perform the initial dispersion thresholding only on the background
+     * threshold. The surviving pixels are then used as a mask later to
+     * exclude them from the background calculation in the second pass.
+    */
+    dispersion_extended_first_pass<<<blocks, threads, shared_memory, stream>>>(
+      image.get(),               // Image data pointer
+      mask.get(),                // Mask data pointer
+      d_dispersion_mask.get(),   // Output dispersion mask pointer
+      image.pitch,               // Image pitch
+      mask.pitch,                // Mask pitch
+      d_dispersion_mask.pitch,   // Output dispersion mask pitch
+      width,                     // Image width
+      height,                    // Image height
+      max_valid_pixel_value,     // Maximum valid pixel value
+      first_pass_kernel_radius,  // Kernel radius
+      first_pass_kernel_radius,  // Kernel radius
+      min_count,                 // Minimum count
+      n_sig_b,                   // Background significance level
+      n_sig_s                    // Signal significance level
+    );
+    cudaStreamSynchronize(
+      stream);  // Synchronize the CUDA stream to ensure the first pass is complete
+
+    if (do_writeout) {
+        printf("First pass complete\n");
+        debug_writeout(
+          d_dispersion_mask.get(),
+          d_dispersion_mask.pitch_bytes(),
+          width,
+          height,
+          stream,
+          "first_pass_dispersion_result",
+          [](uint8_t pixel) { return pixel == MASKED_PIXEL ? 0 : 255; },
+          [](uint8_t pixel) { return pixel != 0; });
+    }
+
+    /*
+     * Erosion pass ✂
+     * Erode the first pass results.
+     * The surviving pixels are then used as a mask to exclude them
+     * from the background calculation in the second pass.
+    */
+
+    // Perform erosion
+    erosion<<<blocks, threads, shared_memory, stream>>>(d_dispersion_mask.get(),
+                                                        d_erosion_mask.get(),
+                                                        mask.get(),
+                                                        d_dispersion_mask.pitch,
+                                                        d_erosion_mask.pitch,
+                                                        mask.pitch,
+                                                        width,
+                                                        height,
+                                                        first_pass_kernel_radius);
+    cudaStreamSynchronize(
+      stream);  // Synchronize the CUDA stream to ensure the erosion pass is complete
+
+    if (do_writeout) {
+        printf("Erosion pass complete\n");
+        debug_writeout(
+          d_erosion_mask.get(),
+          d_erosion_mask.pitch_bytes(),
+          width,
+          height,
+          stream,
+          "eroded_dispersion_result",
+          [](uint8_t pixel) { return pixel == MASKED_PIXEL ? 0 : 255; },
+          [](uint8_t pixel) { return pixel == MASKED_PIXEL; });
     }
 
-    if (x < width && y < height) {
-        // result_sum[x + image_pitch * y] = sum;
-        // result_sumsq[x + image_pitch * y] = sumsq;
-        // result_n[x + mask_pitch * y] = n;
-
-        // Calculate the thresholding
-        if (px_is_valid) {
-            constexpr float n_sig_s = 3.0f;
-            constexpr float n_sig_b = 6.0f;
-
-            float sum_f = static_cast<float>(sum);
-            float sumsq_f = static_cast<float>(sumsq);
-
-            float mean = sum_f / n;
-            float variance = (n * sumsq_f - (sum_f * sum_f)) / (n * (n - 1));
-            float dispersion = variance / mean;
-            float background_threshold = 1 + n_sig_b * sqrt(2.0f / (n - 1));
-            bool not_background = dispersion > background_threshold;
-            float signal_threshold = mean + n_sig_s * sqrt(mean);
-            bool is_signal = this_pixel > signal_threshold;
-            bool is_strong_pixel = not_background && is_signal;
-            result_strong[x + mask_pitch * y] = is_strong_pixel;
-        } else {
-            result_strong[x + mask_pitch * y] = 0;
-        }
+    /*
+     * Second pass 🎯
+     * Perform the final thresholding using the dispersion mask.
+    */
+    dispersion_extended_second_pass<<<blocks, threads, shared_memory, stream>>>(
+      image.get(),                // Image data pointer
+      mask.get(),                 // Mask data pointer
+      d_erosion_mask.get(),       // Dispersion mask pointer
+      result_strong->get(),       // Output result mask pointer
+      image.pitch,                // Image pitch
+      mask.pitch,                 // Mask pitch
+      d_erosion_mask.pitch,       // Dispersion mask pitch
+      result_strong->pitch,       // Output result mask pitch
+      width,                      // Image width
+      height,                     // Image height
+      max_valid_pixel_value,      // Maximum valid pixel value
+      second_pass_kernel_radius,  // Kernel radius
+      second_pass_kernel_radius,  // Kernel radius
+      n_sig_s,                    // Signal significance level
+      threshold                   // Global threshold
+    );
+    cudaStreamSynchronize(
+      stream);  // Synchronize the CUDA stream to ensure the second pass is complete
+
+    if (do_writeout) {
+        printf("Second pass complete\n");
+        debug_writeout(
+          result_strong->get(),
+          mask.pitch_bytes(),
+          width,
+          height,
+          stream,
+          "final_extended_threshold_result",
+          [](uint8_t pixel) { return pixel == VALID_PIXEL ? 255 : 0; },
+          [](uint8_t pixel) { return pixel != 0; });
     }
 }
-void call_do_spotfinding_naive(dim3 blocks,
-                               dim3 threads,
-                               size_t shared_memory,
-                               cudaStream_t stream,
-                               pixel_t *image,
-                               size_t image_pitch,
-                               uint8_t *mask,
-                               size_t mask_pitch,
-                               int width,
-                               int height,
-                               pixel_t max_valid_pixel_value,
-                               //  int *result_sum,
-                               //  size_t *result_sumsq,
-                               //  uint8_t *result_n,
-                               uint8_t *result_strong) {
-    do_spotfinding_naive<<<blocks, threads, shared_memory, stream>>>(
-      image,
-      image_pitch,
-      mask,
-      mask_pitch,
-      width,
-      height,
-      max_valid_pixel_value,
-      result_strong);
-}
-#pragma endregion Spotfinding
\ No newline at end of file
+
+#pragma endregion Launch Wrappers
diff --git a/spotfinder/spotfinder.cuh b/spotfinder/spotfinder.cuh
index 071edd3..e563947 100644
--- a/spotfinder/spotfinder.cuh
+++ b/spotfinder/spotfinder.cuh
@@ -3,30 +3,39 @@
 
 #include <builtin_types.h>
 
-#include "common.hpp"
+#include "cuda_common.hpp"
 #include "h5read.h"
 
 using pixel_t = H5Read::image_type;
 
-/// One-direction width of kernel. Total kernel span is (K_W * 2 + 1)
-constexpr int KERNEL_WIDTH = 3;
-/// One-direction height of kernel. Total kernel span is (K_H * 2 + 1)
-constexpr int KERNEL_HEIGHT = 3;
+void call_do_spotfinding_dispersion(dim3 blocks,
+                                    dim3 threads,
+                                    size_t shared_memory,
+                                    cudaStream_t stream,
+                                    PitchedMalloc<pixel_t> &image,
+                                    PitchedMalloc<uint8_t> &mask,
+                                    int width,
+                                    int height,
+                                    pixel_t max_valid_pixel_value,
+                                    PitchedMalloc<uint8_t> *result_strong,
+                                    uint8_t min_count = 3,
+                                    float nsig_b = 6.0f,
+                                    float nsig_s = 3.0f);
 
-void call_do_spotfinding_naive(dim3 blocks,
-                               dim3 threads,
-                               size_t shared_memory,
-                               cudaStream_t stream,
-                               pixel_t *image,
-                               size_t image_pitch,
-                               uint8_t *mask,
-                               size_t mask_pitch,
-                               int width,
-                               int height,
-                               pixel_t max_valid_pixel_value,
-                               //  int *result_sum,
-                               //  size_t *result_sumsq,
-                               //  uint8_t *result_n,
-                               uint8_t *result_strong);
+void call_do_spotfinding_extended(dim3 blocks,
+                                  dim3 threads,
+                                  size_t shared_memory,
+                                  cudaStream_t stream,
+                                  PitchedMalloc<pixel_t> &image,
+                                  PitchedMalloc<uint8_t> &mask,
+                                  int width,
+                                  int height,
+                                  pixel_t max_valid_pixel_value,
+                                  PitchedMalloc<uint8_t> *result_strong,
+                                  bool do_writeout = false,
+                                  uint8_t min_count = 3,
+                                  float nsig_b = 6.0f,
+                                  float nsig_s = 3.0f,
+                                  float threshold = 0.0f);
 
 #endif
\ No newline at end of file
diff --git a/tests/dispersion_extended.sh b/tests/dispersion_extended.sh
new file mode 100755
index 0000000..bf985eb
--- /dev/null
+++ b/tests/dispersion_extended.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+# This script tests the dispersion_extended algorithm on a test dataset
+# It is meant to be run from the build folder: working_directory/build
+
+# Ensure the output directory exists
+mkdir -p _debug_output
+
+# Change to the output directory
+cd _debug_output
+
+# Create a JSON file with the detector parameters
+# cat << EOF > detector.json
+# {
+#     "pixel_size_x": 0.075,
+#     "pixel_size_y": 0.075,
+#     "distance": 306.765,
+#     "beam_center_x": 119.78358,
+#     "beam_center_y": 126.83430
+# }
+# EOF
+
+# Open file descriptor 3 for writing to output_file.txt
+exec 3> output_file.txt
+
+# Extended dispersion test dataset
+# This is a good dataset to test the dispersion_extended algorithm. However this is 32bit data and requires truncation in order to be read
+# ../bin/spotfinder /dls/i24/data/2024/nr27313-319/gw/Test_Insulin/ins_big_15/ins_big_15_2_master.h5 \
+../bin/spotfinder /dls/i03/data/2024/cm37235-2/xraycentring/TestInsulin/ins_14/ins_14_24.nxs \
+  --min-spot-size 3 \
+  --pipe_fd 3 \
+  --dmin 4 \
+  --algorithm "dispersion_extended" \
+  --images 1 \
+  --writeout
+
+# Close file descriptor 3
+exec 3>&-
+
+# Delete the JSON file
+# rm detector.json
\ No newline at end of file
diff --git a/tests/res_comparison.sh b/tests/res_comparison.sh
index 63f040c..9efa37e 100755
--- a/tests/res_comparison.sh
+++ b/tests/res_comparison.sh
@@ -36,6 +36,7 @@ exec 3> output_file.txt
   --min-spot-size 3 \
   --pipe_fd 3 \
   --dmin 4 \
+  --algorithm "dispersion" \
   --threads 1 \
   --images 1