diff --git a/config/compilerEnv.cmake b/config/compilerEnv.cmake
index f0e1fc69e1..d66480d28b 100644
--- a/config/compilerEnv.cmake
+++ b/config/compilerEnv.cmake
@@ -344,6 +344,8 @@ macro(dbsSetupCxx)
     include(unix-g++)
   elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC" OR "${CMAKE_C_COMPILER_ID}" STREQUAL "MSVC")
     include(windows-cl)
+  elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "XLClang" OR "${CMAKE_C_COMPILER_ID}" STREQUAL "XLClang")
+    include(unix-xl) # IBM Clang-based XL compilers (CMake compiler id "XLClang", case-sensitive)
   else()
     # missing CMAKE_CXX_COMPILER_ID? - try to match the compiler path+name to a string.
     if("${my_cxx_compiler}" MATCHES "pgCC" OR "${my_cxx_compiler}" MATCHES "pgc[+][+]")
diff --git a/config/unix-clang.cmake b/config/unix-clang.cmake
index dc25cf4f66..048eedf37d 100644
--- a/config/unix-clang.cmake
+++ b/config/unix-clang.cmake
@@ -71,7 +71,12 @@ if(NOT CXX_FLAGS_INITIALIZED)
     #
     # ld.lld: error: corrupt input file: version definition index 0 for symbol mpiprivc_ is out of
     # bounds
-    string(APPEND CMAKE_EXE_LINKER_FLAGS " -fuse-ld=bfd")
+    #
+    # As of 2021-08-10, this is required on Capulin/Thunder when using cce@11, but must be omitted
+    # on rznevada when using cce@12.
+    if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 12.0.0)
+      string(APPEND CMAKE_EXE_LINKER_FLAGS " -fuse-ld=bfd")
+    endif()
   else()
     string(APPEND CMAKE_CXX_FLAGS " -stdlib=libc++")
   endif()
diff --git a/config/unix-xl.cmake b/config/unix-xl.cmake
index 557fee3a75..20eb5d4f3a 100644
--- a/config/unix-xl.cmake
+++ b/config/unix-xl.cmake
@@ -29,7 +29,7 @@ if(NOT CXX_FLAGS_INITIALIZED)
   if(EXISTS /usr/gapps)
     # ATS-2
     string(APPEND CMAKE_C_FLAGS " --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1")
-  elseif(EXISTS /projects/opt/ppc64le/ibm)
+  elseif(EXISTS /projects/opt/ppc64le/ibm AND NOT "$ENV{CXX}" MATCHES "-F")
     # Darwin power9 - extract version from module environment.
     string(REPLACE ":" ";" modules $ENV{LOADEDMODULES})
     foreach(module ${modules})
@@ -43,7 +43,7 @@ if(NOT CXX_FLAGS_INITIALIZED)
                              ${xlc_version})
       elseif(${module} MATCHES "^cuda")
         if(NOT DEFINED cuda_version)
-          string(REGEX REPLACE "[^0-9]*([0-9]+).([0-9]+)" "\\1.\\2" cuda_version ${module})
+          string(REGEX REPLACE "[^0-9]*([0-9]+).([0-9]+).*" "\\1.\\2" cuda_version ${module})
         endif()
       endif()
     endforeach()
diff --git a/environment/bashrc/.bashrc_ats4 b/environment/bashrc/.bashrc_ats4
index e5e3d99096..0a3c686d02 100644
--- a/environment/bashrc/.bashrc_ats4
+++ b/environment/bashrc/.bashrc_ats4
@@ -30,23 +30,27 @@ export JSM_JSRUN_NO_WARN_OVERSUBSCRIBE=1
 #
 
 # 1. Determine if the module command is available
-# modcmd=`declare -f module`
-# # If not found, look for it in /usr/share/Modules
-# if [[ ! ${modcmd} ]]; then
-#   source /usr/share/lmod/lmod/init/bash || die \
-#     "ERROR: The module command was not found. No modules will be loaded (ats-2 e01)."
-# fi
-# modcmd=`declare -f module`
+modcmd=`declare -f module`
+# If not found, try to initialize it from the system Lmod installation (/usr/share/lmod).
+if [[ ! ${modcmd} ]]; then
+  source /usr/share/lmod/lmod/init/bash || die \
+    "ERROR: The module command was not found. No modules will be loaded (ats-4 e01)."
+fi
+modcmd=`declare -f module`
 
 # 2. Use modules found in the draco directory
-# if [[ ! ${modcmd} ]]; then
-#   echo "ERROR: The module command was not found. No modules will be loaded (ats-2, e02)."
-# else
-#  module use --append /usr/gapps/jayenne/Modules
-#  module unuse /usr/share/lmod/lmod/modulefiles/Core
-#  module unuse /collab/usr/global/tools/modulefiles/blueos_3_ppc64le_ib_p9/Core
-#  module load draco/xl2021.03.11-cuda-11.0.2
-#fi
+if [[ ! ${modcmd} ]]; then
+  echo "ERROR: The module command was not found. No modules will be loaded (ats-4 e02)."
+else
+  module use --append /usr/gapps/jayenne/Modules/rznevada
+  module unuse /opt/cray/pe/lmod/modulefiles/compiler/crayclang/10.0
+  module unuse /opt/cray/pe/lmod/modulefiles/perftools/21.05.0
+  module unuse /opt/cray/pe/lmod/modulefiles/cpu/x86-rome/1.0
+  module unuse /usr/apps/modulefiles
+  module unuse /usr/share/lmod/lmod/modulefiles/Core
+# module unuse /collab/usr/global/tools/modulefiles/blueos_3_ppc64le_ib_p9/Core
+ module load draco/cce1201.lua
+fi
 
 # Do not escape $ for bash completion
 shopt -s direxpand
diff --git a/environment/git/install-hooks.sh b/environment/git/install-hooks.sh
index c1ef038c7f..4c5bfbaf35 100755
--- a/environment/git/install-hooks.sh
+++ b/environment/git/install-hooks.sh
@@ -14,7 +14,7 @@
 # CONFIGURATION:
 # select which pre-commit hooks are going to be installed
 HOOKS="pre-commit pre-commit-clang-format pre-commit-autopep8 pre-commit-flake8 pre-commit-fprettify"
-HOOKS="$HOOKS pre-commit-cmake-format pre-commit-cmake-lint"
+HOOKS="$HOOKS pre-commit-cmake-format pre-commit-cmake-lint pre-commit-copyright"
 TOOLS="common.sh"
 ###########################################################
 # There should be no need to change anything below this line.
diff --git a/environment/git/pre-commit b/environment/git/pre-commit
index 573435e47a..7dda71d2e7 100755
--- a/environment/git/pre-commit
+++ b/environment/git/pre-commit
@@ -14,7 +14,7 @@
 # pre-commit hooks to be executed. They should be in the same .git/hooks/ folder as this
 # script. Hooks should return 0 if successful and nonzero to cancel the commit. They are executed in
 # the order in which they are listed.
-HOOKS="pre-commit-clang-format"
+HOOKS="pre-commit-clang-format pre-commit-copyright"
 
 # only run autopep8 if the tool is available
 [[ $(which autopep8 2> /dev/null | wc -w) -gt 0 ]] && HOOKS+=" pre-commit-autopep8"
diff --git a/environment/git/pre-commit-copyright b/environment/git/pre-commit-copyright
new file mode 100755
index 0000000000..447b59aec7
--- /dev/null
+++ b/environment/git/pre-commit-copyright
@@ -0,0 +1,177 @@
+#!/bin/bash
+
+# git pre-commit hook that checks/updates the copyright block
+# Features:
+#  - Attempts to fix the copyright block in place
+#  - abort commit when commit does not comply with the style guidelines
+
+#--------------------------------------------------------------------------------------------------#
+# SETTINGS
+#
+# - none
+#--------------------------------------------------------------------------------------------------#
+
+# make tmp file readable only by owner
+umask 0077
+
+debug=off
+function debugprint() # echo the arguments prefixed with "==>" when debug=on, else do nothing
+{
+  [[ "$debug" != "on" ]] || echo "==>" "$@"
+}
+
+debugprint "running pre-commit-copyright"
+
+# remove any older patches from previous commits. Set to true or false.
+# DELETE_OLD_PATCHES=true
+
+# file types to parse.
+FILE_EXTS=".c .cc .cmake .h .hh .in .f90 .F90 .f .F .py .txt"
+#FILE_ENDINGS_INCLUDE="_f.h _f77.h _f90.h"
+FILE_ENDINGS_EXCLUDE="ChangeLog Release.cc"
+export FILE_EXTS FILE_ENDINGS_EXCLUDE
+
+##################################################################
+# There should be no need to change anything below this line.
+# shellcheck source=environment/git/canonicalize_filename.sh
+source "$(dirname -- "$0")/canonicalize_filename.sh"
+
+# shellcheck source=tools/common.sh
+source "$(dirname -- "$0")/common.sh"
+
+# necessary check for initial commit
+if git rev-parse --verify HEAD >/dev/null 2>&1 ; then
+  against=HEAD
+else
+  # Initial commit: diff against an empty tree object
+  against=4b825dc642cb6eb9a060e54bf8d69288fbee4904
+fi
+
+# Rewrite the copyright line of a file in place as "<first year>-<current year>".
+# Arguments: $1 = file to edit (normally a scratch copy), $2 = the same file as tracked by git.
+function rewrite_copyright_block()
+{
+  local filename=$1
+  local gitfile=$2
+  local today
+  today=$(date +%Y)
+
+  # This data was found in the header comments.  It might be a single year or a range.
+  local crl
+  crl=$(grep Copyright "${filename}")
+  local create_date
+  # shellcheck disable=SC2001
+  create_date=$(echo "${crl}" | sed -e 's/.* \([0-9][0-9]*\).*/\1/')
+
+  # These dates are reported by git.  Both are empty for a file with no history yet.
+  local git_last_mod_date
+  local git_create_date
+  git_last_mod_date=$(git log -1 "${gitfile}" | grep Date | \
+                        sed -e 's/.* \([0-9][0-9][0-9][0-9]\).*/\1/')
+  git_create_date=$(git log "${gitfile}" | grep Date | tail -n 1 | \
+                      sed -e 's/.* \([0-9][0-9][0-9][0-9]\).*/\1/')
+
+  debugprint "$crl"
+  debugprint "$create_date $git_last_mod_date $git_create_date"
+
+  # Sanity checks.  Report ${gitfile} (not the scratch copy) so the user knows what to fix.
+  [[ -z "${crl}" ]] && die "No copyright line found in ${gitfile}. Please add one manually."
+  [[ "${create_date}" =~ "Copyright" ]] && die "Failed to parse copyright line"
+  [[ "${git_last_mod_date}" =~ "Copyright" ]] && die "Failed to parse copyright line"
+  [[ "${git_create_date}" =~ "Copyright" ]] && die "Failed to parse copyright line"
+  # Every year must fall within [1990, today].  A file that is new in this commit has no git
+  # dates; substitute the current year for the range check only, so that new files are accepted.
+  local year
+  for year in "${create_date}" "${git_create_date:-${today}}" \
+              "${git_last_mod_date:-${today}}"; do
+    if [[ "${year}" -gt "${today}" ]] || [[ "${year}" -lt "1990" ]]; then
+      die "Existing copyright date range is corrupt. Please fix ${gitfile} manually."
+    fi
+  done
+
+  # We converted from CVS to svn in 2010, so git reports no create date older than that.  For such
+  # files, and for new files with no git date at all, use the year found in the file instead.
+  [[ "${git_create_date}" -lt "2011" ]] && git_create_date="${create_date}"
+
+  # Expected Copyright line:
+  local ecrl="Copyright (C) ${git_create_date}-${today} Triad National Security, LLC., "
+  ecrl+="All rights reserved."
+  debugprint "ecrl = $ecrl"
+
+  # If existing copyright spans two lines, reduce it to one line.
+  local twolines
+  twolines=$(grep -A 1 Copyright "${filename}" | tail -n 1 | grep -c reserved)
+  if [[ $twolines -gt 0 ]]; then
+    sed -i 's/All rights reserved[.]*//' "${filename}"
+  fi
+
+  # Keep a terminating comment marker ('*/') if the copyright line has one (fixed-string match).
+  local ecm=""
+  if echo "${crl}" | grep -qF '*/'; then ecm=" */"; fi
+
+  # Replace copyright with new one
+  debugprint "sed -i s%Copyright.*%${ecrl}${ecm}% ${filename}"
+  sed -i "s%Copyright.*%${ecrl}${ecm}%" "${filename}"
+}
+
+# create a random filename to store our generated patch
+prefix="pre-commit-copyright"
+suffix="$(date +%s)"
+
+# mktemp does not create missing parent directories, so make sure the per-user one exists.
+mkdir -p "/tmp/$USER" || die "Unable to create /tmp/$USER"
+patchfile=$(mktemp "/tmp/$USER/${prefix}-${suffix}.patch.XXXXXXXX") || die "mktemp failed"
+
+# Create one patch containing all changes to the files.  The loop below runs in a subshell
+# (right-hand side of a pipe), so its exit status is checked after 'done'.
+git diff-index --cached --diff-filter=ACMR --name-only "$against" -- | while IFS= read -r file;
+do
+  debugprint "should we process $file ?"
+
+  # only process files accepted by matches_extension (presumably driven by FILE_EXTS).
+  if ! matches_extension "$file"; then continue; fi
+
+  # If file is added to commit but still has local modifications, abort
+  if [[ $(git diff "${file}" | wc -l) != 0 ]]; then
+    echo -e "\nERROR: File ${file} has local edits that are not staged. Stash modifications or add "
+    echo -e "       changes to this commit.\n\n"
+    exit 1
+  fi
+
+  debugprint "Looking at $file"
+  file_nameonly=$(basename "${file}")
+  tmpfile1="/tmp/${prefix}-$file_nameonly"
+
+  # Copy the file and attempt to update it.
+  cp "${file}" "${tmpfile1}"
+  rewrite_copyright_block "$tmpfile1" "$file"
+  debugprint "  updating patchfile"
+  diff -u "${file}" "${tmpfile1}" | \
+    sed -e "1s|--- |--- a/|" -e "2s|+++ ${tmpfile1}|+++ b/${file}|" >> "$patchfile"
+  rm "${tmpfile1}"
+done || { rm -f "$patchfile"; exit 1; }
+
+# if no patch has been generated all is ok, clean up the file stub and exit
+if ! [[ -s "$patchfile" ]]; then
+  printf "Files in this commit comply with the expected copyright block rules.\n"
+  rm -f "$patchfile"
+  exit 0
+fi
+
+# If we get here, there are files that don't comply...
+
+# Apply the generated patch to the working tree, then reject this commit so that the user can
+# review the updated copyright lines, stage them, and retry.
+if [[ -s "$patchfile" ]]; then
+  git apply "$patchfile"
+  printf "\nFiles in this commit were updated to comply with the copyright block rules.\n"
+  printf "You must check and test these changes and then stage these updates to\n"
+  printf "be part of your current change set and retry the commit.\n\n"
+  git status
+  rm -f "$patchfile"
+  exit 1
+fi
+
+# ------------------------------------------------------------------------------------------------ #
+# End pre-commit-copyright
+# ------------------------------------------------------------------------------------------------ #
diff --git a/src/c4/opstream.cc b/src/c4/opstream.cc
index db108b9f85..710f5684d9 100644
--- a/src/c4/opstream.cc
+++ b/src/c4/opstream.cc
@@ -4,7 +4,7 @@
  * \author Kent G. Budge
  * \date   Mon Jun 25 12:12:31 MDT 2018
  * \brief  Define methods of class opstream
- * \note   Copyright (C) 2018-2020 Triad National Security, LLC., All rights reserved. */
+ * \note   Copyright (C) 2018-2021 Triad National Security, LLC., All rights reserved. */
 //------------------------------------------------------------------------------------------------//
 
 #include "opstream.hh"
@@ -17,12 +17,14 @@ namespace rtt_c4 {
  *
  * Causes all buffered data to be written to console in MPI rank order; that is, all data from rank
  * 0 is written first, then all data from rank 1, and so on.
+ *
+ * \param[in,out] myout ostream to write data into (opstream::send passes std::cout by default).
  */
-void opstream::mpibuf::send() {
+void opstream::mpibuf::send(std::ostream &myout) {
   unsigned const pid = rtt_c4::node();
   if (pid == 0) {
     buffer_.push_back('\0'); // guarantees that buffer_.size() > 0
-    std::cout << &buffer_[0];
+    myout << &buffer_[0];
     buffer_.clear();
 
     unsigned const pids = rtt_c4::nodes();
@@ -34,7 +36,7 @@ void opstream::mpibuf::send() {
         rtt_c4::receive(&buffer_[0], N, i);
       }
       buffer_.push_back('\0');
-      std::cout << &buffer_[0]; // guarantees that buffer_.size() > 0
+      myout << &buffer_[0]; // guarantees that buffer_.size() > 0
     }
   } else {
 
diff --git a/src/c4/opstream.hh b/src/c4/opstream.hh
index e441709ab7..c357909d7f 100644
--- a/src/c4/opstream.hh
+++ b/src/c4/opstream.hh
@@ -3,7 +3,7 @@
  * \file   c4/opstream.hh
  * \author Kent G. Budge
  * \brief  Define class opstream
- * \note   Copyright (C) 2018-2020 Triad National Security, LLC., All rights reserved. */
+ * \note   Copyright (C) 2018-2021 Triad National Security, LLC., All rights reserved. */
 //------------------------------------------------------------------------------------------------//
 
 #ifndef c4_opstream_hh
@@ -58,7 +58,7 @@ public:
   }
 
   //! Send all buffered data synchronously to the console.
-  void send() { sb_.send(); }
+  void send(std::ostream &myout = std::cout) { sb_.send(myout); }
 
   //! Shrink the internal buffer to fit the current buffered data.
   void shrink_to_fit() { sb_.shrink_to_fit(); }
@@ -66,7 +66,7 @@ public:
 private:
   struct mpibuf : public std::streambuf {
 
-    void send();
+    void send(std::ostream &myout);
     void shrink_to_fit();
 
     int_type overflow(int_type c) override;
diff --git a/src/ds++/dbc.hh b/src/ds++/dbc.hh
index 7c19618416..76a0708aab 100644
--- a/src/ds++/dbc.hh
+++ b/src/ds++/dbc.hh
@@ -19,15 +19,15 @@ namespace rtt_dsxx {
 
 //! Check whether a sequence is monotonically increasing.
 template <typename Forward_Iterator>
-bool is_monotonic_increasing(Forward_Iterator first, Forward_Iterator last);
+bool is_monotonic_increasing(Forward_Iterator first, Forward_Iterator const last);
 
 //! Check whether a sequence is strictly monotonically increasing.
 template <typename Forward_Iterator>
-bool is_strict_monotonic_increasing(Forward_Iterator first, Forward_Iterator last);
+bool is_strict_monotonic_increasing(Forward_Iterator first, Forward_Iterator const last);
 
 //! Check whether a sequence is strictly monotonically decreasing.
 template <typename Forward_Iterator>
-bool is_strict_monotonic_decreasing(Forward_Iterator first, Forward_Iterator last);
+bool is_strict_monotonic_decreasing(Forward_Iterator first, Forward_Iterator const last);
 
 //! Check whether a matrix is symmetric.
 template <typename Random_Access_Container>
diff --git a/src/ds++/dbc.i.hh b/src/ds++/dbc.i.hh
index 114601e997..1639036bd3 100644
--- a/src/ds++/dbc.i.hh
+++ b/src/ds++/dbc.i.hh
@@ -4,7 +4,7 @@
  * \author Kent G. Budge
  * \date   Wed Jan 22 15:18:23 MST 2003
  * \brief  Template implementation for dbc
- * \note   Copyright (C) 2016-2020 Triad National Security, LLC., All rights reserved.
+ * \note   Copyright (C) 2016-2021 Triad National Security, LLC., All rights reserved.
  *
  * This header defines several function templates that perform common numerical operations not
  * standardized in the STL algorithm header. It also defines some useful STL-style predicates. These
@@ -21,7 +21,7 @@
 
 namespace rtt_dsxx {
 
-//-------------------------------------------------------------------------//
+//------------------------------------------------------------------------------------------------//
 /*!
  * \brief Check whether a sequence is monotonically increasing.
  *
@@ -29,20 +29,15 @@ namespace rtt_dsxx {
  * sequence.  This is particularly useful for Design by Contract assertions that check that a
  * sequence is sorted.
  *
- * \arg \a Forward_Iterator
- * A forward iterator whose value type supports \c operator<.
- *
- * \param first
- * Points to the first element of the sequence.
- *
- * \param last
- * Points one element past the end of the sequence.
+ * \tparam Forward_Iterator A forward iterator whose value type supports \c operator<.
+ * \param[in]     first Points to the first element of the sequence.
+ * \param[in]     last Points one element past the end of the sequence.
  *
  * \return \c true if \f$a_i<=a_{i+1}\f$ for all \f$a_i\f$ in the sequence;
  * \c false otherwise.
  */
 template <typename Forward_Iterator>
-bool is_monotonic_increasing(Forward_Iterator first, Forward_Iterator last) {
+bool is_monotonic_increasing(Forward_Iterator first, Forward_Iterator const last) {
   Forward_Iterator prev = first;
   while (++first != last) {
     if (*first < *prev)
@@ -53,7 +48,7 @@ bool is_monotonic_increasing(Forward_Iterator first, Forward_Iterator last) {
   return true;
 }
 
-//-------------------------------------------------------------------------//
+//------------------------------------------------------------------------------------------------//
 /*!
  * \brief Check whether a sequence is strictly monotonically increasing.
  *
@@ -61,20 +56,14 @@ bool is_monotonic_increasing(Forward_Iterator first, Forward_Iterator last) {
  * is particularly useful for Design by Contract assertions that check the validity of a table of
  * data.
  *
- * \arg \a Forward_Iterator
- * A forward iterator whose value type supports \c operator<.
+ * \tparam Forward_Iterator A forward iterator whose value type supports \c operator<.
+ * \param[in]     first Points to the first element of the sequence.
+ * \param[in]     last Points one element past the end of the sequence.
  *
- * \param first
- * Points to the first element of the sequence.
- *
- * \param last
- * Points one element past the end of the sequence.
- *
- * \return \c true if \f$a_i<a_{i+1}\f$ for all \f$a_i\f$ in the sequence;
- * \c false otherwise.
+ * \return \c true if \f$a_i<a_{i+1}\f$ for all \f$a_i\f$ in the sequence; \c false otherwise.
  */
 template <typename Forward_Iterator>
-bool is_strict_monotonic_increasing(Forward_Iterator first, Forward_Iterator last) {
+bool is_strict_monotonic_increasing(Forward_Iterator first, Forward_Iterator const last) {
   Forward_Iterator prev = first;
   while (++first != last) {
     if (!(*prev < *first))
@@ -84,20 +73,15 @@ bool is_strict_monotonic_increasing(Forward_Iterator first, Forward_Iterator las
   return true;
 }
 
-//-------------------------------------------------------------------------//
+//------------------------------------------------------------------------------------------------//
 /*!
  * \brief Check whether a sequence is strictly monotonically decreasing.
  *
  * Checks whether every element in a sequence is greater than the next element of the sequence.
  *
- * \arg \a Forward_Iterator
- * A forward iterator whose value type supports \c operator<.
- *
- * \param first
- * Points to the first element of the sequence.
- *
- * \param last
- * Points one element past the end of the sequence.
+ * \tparam Forward_Iterator A forward iterator whose value type supports \c operator<.
+ * \param[in]     first Points to the first element of the sequence.
+ * \param[in]     last Points one element past the end of the sequence.
  *
  * \pre \c last>first
  *
@@ -105,7 +89,7 @@ bool is_strict_monotonic_increasing(Forward_Iterator first, Forward_Iterator las
  * \c false otherwise.
  */
 template <typename Forward_Iterator>
-bool is_strict_monotonic_decreasing(Forward_Iterator first, Forward_Iterator last) {
+bool is_strict_monotonic_decreasing(Forward_Iterator first, Forward_Iterator const last) {
   Require(first < last);
   Forward_Iterator prev = first;
   while (++first != last) {
@@ -116,24 +100,19 @@ bool is_strict_monotonic_decreasing(Forward_Iterator first, Forward_Iterator las
   return true;
 }
 
-//-------------------------------------------------------------------------//
+//------------------------------------------------------------------------------------------------//
 /*!
  * \brief Check whether a matrix is symmetric.
  *
- * \arg \a Random_Access_Container
- * A random access container type.
- *
- * \param A Matrix that is supposed to be symmetric.
- *
- * \param n Rank of the matrix.
- *
- * \param tolerance Tolerance for comparing matrix elements.
+ * \tparam Random_Access_Container A random access container type.
+ * \param[in] A Matrix that is supposed to be symmetric.
+ * \param[in] n Rank of the matrix.
+ * \param[in] tolerance Tolerance for comparing matrix elements.
  *
  * \pre \c A.size()==n*n
  * \pre \c tolerance>=0.0
  *
- * \return \c true if <code>A[i+n*j]==A[j+n*i]</code> for all \c i and \c j; \c false
- * otherwise.
+ * \return \c true if <code>A[i+n*j]==A[j+n*i]</code> for all \c i and \c j; \c false otherwise.
  */
 template <typename Random_Access_Container>
 bool is_symmetric_matrix(Random_Access_Container const &A, unsigned const n,
diff --git a/src/kde/kde.cc b/src/kde/kde.cc
index a635e1811f..1cd3b80ac5 100644
--- a/src/kde/kde.cc
+++ b/src/kde/kde.cc
@@ -3,22 +3,20 @@
  * \file   kde/kde.cc
  * \author Mathew Cleveland
  * \date   November 10th 2020
- * \brief  Explicitly defined KDE functions for various dimensions and coordinate
- *         KDE or Kernel Density Estimators are unbiased statical based
- *         reconstruction.  They can significantly increase the convergence
- *         rate of statical distributions. The KDE performs a reconstruction by
- *         evaluating a mean over some discrete kernel shape. In this DRACO
- *         implementation the mean is evaluated based on the sample locations
- *         that are bound by the kernel shape.  A renormalization is used to
- *         ensure the proper mean is returned given there is no guarantee the
- *         full kernel (which integrates exactly to 1) will be integrated fully
- *         in space. This renormalization also avoids the need for boundary
- *         fix-ups which are typically used in KDE applications to account for
- *         the kernel extending beyond the bounds of the spatial domain. Other
- *         approaches that could be considered are quadrature based approaches
- *         that fully sample the Kernel space reducing the need for the
- *         normalization.
- * \note   Copyright (C) 2018-2020 Triad National Security, LLC., All rights reserved. */
+ * \brief  Explicitly defined KDE functions for various dimensions and coordinate KDE or Kernel
+ *         Density Estimators are unbiased statistical based reconstruction.  They can significantly
+ *         increase the convergence rate of statistical distributions. The KDE performs a
+ *         reconstruction by evaluating a mean over some discrete kernel shape. In this DRACO
+ *         implementation the mean is evaluated based on the sample locations that are bound by the
+ *         kernel shape.  A renormalization is used to ensure the proper mean is returned given
+ *         there is no guarantee the full kernel (which integrates exactly to 1) will be integrated
+ *         fully in space. This renormalization also avoids the need for boundary fix-ups which are
+ *         typically used in KDE applications to account for the kernel extending beyond the bounds
+ *         of the spatial domain. Other approaches that could be considered are quadrature based
+ *         approaches that fully sample the Kernel space reducing the need for the normalization.
+ *
+ * \note   Copyright (C) 2020-2021 Triad National Security, LLC., All rights reserved. 
+ */
 //------------------------------------------------------------------------------------------------//
 
 #include "kde.hh"
@@ -30,124 +28,466 @@ namespace rtt_kde {
 
 //------------------------------------------------------------------------------------------------//
 /*!
- * \brief Cartesian geometry KDE reconstruction of a 1D distribution
+ * \brief Calculate Cartesian Weight
+ * 
+ * Calculate the effective weight in Cartesian geometry from a given location to the current
+ * kernel.
+ *
+ * \param[in] r0 current kernel center location
+ * \param[in] one_over_h0 inverse of the current kernel width
+ * \param[in] r data location
+ * \param[in] one_over_h inverse of the kernel width at this data location
+ * \param[in] qindex quick indexing class
+ * \param[in] discontinuity_cutoff largest relative difference between the two inverse kernel
+ * widths for which the data location still contributes
+ *
+ * \return weight contribution to the current kernel
+ *
+ * \post the returned weight is non-negative.
+ */
+double kde::calc_cartesian_weight(const std::array<double, 3> &r0,
+                                  const std::array<double, 3> &one_over_h0,
+                                  const std::array<double, 3> &r,
+                                  const std::array<double, 3> &one_over_h,
+                                  const quick_index &qindex,
+                                  const double &discontinuity_cutoff) const {
+  Require(one_over_h0[0] > 0.0);
+  Require(qindex.dim > 1 ? one_over_h0[1] > 0.0 : true); // only active dimensions are checked
+  Require(qindex.dim > 2 ? one_over_h0[2] > 0.0 : true);
+  Require(one_over_h[0] > 0.0);
+  Require(qindex.dim > 1 ? one_over_h[1] > 0.0 : true);
+  Require(qindex.dim > 2 ? one_over_h[2] > 0.0 : true);
+  double weight = 1.0; // product of the per-dimension weights accumulated below
+  for (size_t d = 0; d < qindex.dim; d++) {
+    const double u = (r0[d] - r[d]) * one_over_h0[d]; // offset scaled by the inverse kernel width
+    const double scale =
+        fabs(one_over_h0[d] - one_over_h[d]) / std::max(one_over_h0[d], one_over_h[d]) >
+                discontinuity_cutoff
+            ? 0.0
+            : 1.0; // 0 when the relative inverse-width mismatch exceeds the cutoff
+    // Boundary condition weighting; reflect_boundary holds (low, high) flags per dimension
+    double bc_weight = 1.0;
+    const bool low_reflect = reflect_boundary[d * 2];
+    const bool high_reflect = reflect_boundary[d * 2 + 1];
+    if (low_reflect) {
+      const double low_u =
+          ((r0[d] - qindex.bounding_box_min[d]) + (r[d] - qindex.bounding_box_min[d])) *
+          one_over_h0[d]; // scaled offset from r0 to r mirrored across the lower bound
+      bc_weight += epan_kernel(low_u);
+    }
+    if (high_reflect) {
+      const double high_u =
+          ((qindex.bounding_box_max[d] - r0[d]) + (qindex.bounding_box_max[d] - r[d])) *
+          one_over_h0[d]; // scaled offset from r0 to r mirrored across the upper bound
+      bc_weight += epan_kernel(high_u);
+    }
+    weight *= scale * bc_weight * epan_kernel(u) * one_over_h0[d];
+  }
+  Ensure(!(weight < 0.0));
+  return weight;
+}
+
+//------------------------------------------------------------------------------------------------//
+/*!
+ * \brief calculate spherical weight
  * 
- * \pre The local reconstruction data is passed into this function which
- * includes the original data distribution, its spatial position, and the
- * optimal bandwidth to be used at each point.
+ * Calculate the effective weight from a given location to the current kernel.
+ *
+ * \param[in] r0 current kernel center location
+ * \param[in] one_over_h0 inverse of the current kernel width
+ * \param[in] r data location
+ * \param[in] one_over_h inverse of the kernel width at this data location
+ * \param[in] qindex quick indexing class
+ * \param[in] discontinuity_cutoff largest relative difference between the two inverse kernel
+ * widths for which the data location still contributes
+ *
+ * \return weight contribution to the current kernel
+ *
+ * \post the returned weight is non-negative.
+ */
+double kde::calc_spherical_weight(const std::array<double, 3> &r0,
+                                  const std::array<double, 3> &one_over_h0,
+                                  const std::array<double, 3> &r,
+                                  const std::array<double, 3> &one_over_h,
+                                  const quick_index &qindex,
+                                  const double &discontinuity_cutoff) const {
+  Require(one_over_h0[0] > 0.0);
+  Require(qindex.dim > 1 ? one_over_h0[1] > 0.0 : true); // only active dimensions are checked
+  Require(qindex.dim > 2 ? one_over_h0[2] > 0.0 : true);
+  Require(one_over_h[0] > 0.0);
+  Require(qindex.dim > 1 ? one_over_h[1] > 0.0 : true);
+  Require(qindex.dim > 2 ? one_over_h[2] > 0.0 : true);
+
+  // kernel center transformed relative to sphere_center; component 0 is used as the radius
+  const auto r0_theta_phi = qindex.transform_r_theta(sphere_center, r0);
+  // outside [sphere_min_radius, sphere_max_radius] fall back to the Cartesian (xyz) weight
+  if (r0_theta_phi[0] < sphere_min_radius || r0_theta_phi[0] > sphere_max_radius)
+    return calc_cartesian_weight(r0, one_over_h0, r, one_over_h, qindex, discontinuity_cutoff);
+
+  const auto r_theta_phi = qindex.transform_r_theta(sphere_center, r);
+  const double radius = r0_theta_phi[0];
+  double weight = 1.0; // product of the per-dimension weights accumulated below
+  for (size_t d = 0; d < qindex.dim; d++) {
+    const double arch_scale = d > 0 ? radius : 1.0; // components d > 0 are scaled by the radius
+    const double u = (r0_theta_phi[d] - r_theta_phi[d]) * arch_scale * one_over_h0[d];
+    const double scale =
+        fabs(one_over_h0[d] - one_over_h[d]) / std::max(one_over_h0[d], one_over_h[d]) >
+                discontinuity_cutoff
+            ? 0.0
+            : 1.0; // 0 when the relative inverse-width mismatch exceeds the cutoff
+    // Boundary condition weighting is not implemented here yet; bc_weight stays at 1.0
+    double bc_weight = 1.0;
+    /* BC are a little tricky so I will implement them later
+    const bool low_reflect = reflect_boundary[d * 2];
+    const bool high_reflect = reflect_boundary[d * 2 + 1];
+    if (low_reflect) {
+      const double low_u =
+          ((r0_theta_phi[d] - qindex.bounding_box_min[d]) + (r[d] - qindex.bounding_box_min[d])) *
+          one_over_h0[d];
+      bc_weight += epan_kernel(low_u);
+    }
+    if (high_reflect) {
+      const double high_u =
+          ((qindex.bounding_box_max[d] - r0[d]) + (qindex.bounding_box_max[d] - r[d])) *
+          one_over_h0[d];
+      bc_weight += epan_kernel(high_u);
+    }
+    */
+    weight *= scale * bc_weight * epan_kernel(u) * one_over_h0[d];
+  }
+  Ensure(!(weight < 0.0));
+  return weight;
+}
+
+//------------------------------------------------------------------------------------------------//
+/*!
+ * \brief KDE reconstruction 
+ * 
+ * \pre The local reconstruction data is passed into this function which includes the original data
+ * distribution, its spatial position, and the optimal bandwidth to be used at each point.
  *
  * \param[in] distribution original data to be reconstructed
- * \param[in] position local of the original data 
- * \param[in] one_over_band_width inverse bandwidth size to be used at each data location
- * \param[in] domain_decomposed bool flag to switch between domain replicated and decomposed data
+ * \param[in] one_over_bandwidth inverse bandwidth size to be used at each data location
+ * \param[in] qindex quick_index class to be used for data access.
+ * \param[in] discontinuity_cutoff maximum size of value discrepancies to include in the
+ * reconstruction
  * \return final local KDE function distribution reconstruction
  *
  * \post the local reconstruction of the original data is returned.
  */
-template <>
-template <>
-std::vector<double> kde<kde_coordinates::CART>::reconstruction<1>(
-    const std::vector<double> &distribution, const std::vector<std::array<double, 3>> &position,
-    const std::vector<std::array<double, 3>> &one_over_band_width,
-    const bool domain_decomposed) const {
-  const int64_t local_size = distribution.size();
-  Require(static_cast<int64_t>(position.size()) == local_size);
-  Require(static_cast<int64_t>(one_over_band_width.size()) == local_size);
+std::vector<double>
+kde::reconstruction(const std::vector<double> &distribution,
+                    const std::vector<std::array<double, 3>> &one_over_bandwidth,
+                    const quick_index &qindex, const double discontinuity_cutoff) const {
+  Require(qindex.dim < 3 && qindex.dim > 0);
+  const size_t local_size = distribution.size();
+  // be sure that the quick_index matches this data size
+  Require(qindex.locations.size() == local_size);
+  Require(one_over_bandwidth.size() == local_size);
 
   // used for the zero accumulation conservation
-  int64_t size = local_size;
-  double global_conservation = std::accumulate(distribution.begin(), distribution.end(), 0.0);
   std::vector<double> result(local_size, 0.0);
   std::vector<double> normal(local_size, 0.0);
-  if (domain_decomposed) {
-    // minimize global values and only allocate them in DD problems
-    int64_t global_lower_bound = 0;
-    std::vector<double> global_distribution;
-    std::vector<double> global_x_position;
-
-    // calculate global off sets
-    int n_ranks = rtt_c4::nodes();
-    int64_t izero(0);
-    std::vector<int64_t> rank_size(rtt_c4::nodes(), izero);
-    rank_size[rtt_c4::node()] = local_size;
-    rtt_c4::global_sum(rank_size.data(), n_ranks);
-    size = std::accumulate(rank_size.begin(), rank_size.end(), izero);
-    std::vector<int64_t> accum_rank_size(rank_size);
-    std::partial_sum(rank_size.begin(), rank_size.end(), accum_rank_size.begin());
-
-    if (rtt_c4::node() > 0) {
-      global_lower_bound = accum_rank_size[rtt_c4::node() - 1];
-    }
+  if (qindex.domain_decomposed) {
 
-    // set up global arrays
-    global_distribution.resize(size, 0.0);
-    global_x_position.resize(size, 0.0);
+    std::vector<double> ghost_distribution(qindex.local_ghost_buffer_size);
+    qindex.collect_ghost_data(distribution, ghost_distribution);
+    std::vector<std::array<double, 3>> ghost_one_over_bandwidth(qindex.local_ghost_buffer_size,
+                                                                {0.0, 0.0, 0.0});
+    qindex.collect_ghost_data(one_over_bandwidth, ghost_one_over_bandwidth);
 
-    // build up global positions
-    for (int i = 0; i < local_size; i++) {
-      global_x_position[i + global_lower_bound] = position[i][0];
-      global_distribution[i + global_lower_bound] = distribution[i];
+    std::array<double, 3> win_min{0.0, 0.0, 0.0};
+    std::array<double, 3> win_max{0.0, 0.0, 0.0};
+    // now apply the kernel to the local ranks
+    for (size_t i = 0; i < local_size; i++) {
+      const std::array<double, 3> r0 = qindex.locations[i];
+      const std::array<double, 3> one_over_h0 = one_over_bandwidth[i];
+      calc_win_min_max(qindex, r0, one_over_h0, win_min, win_max);
+      const std::vector<size_t> coarse_bins = qindex.window_coarse_index_list(win_min, win_max);
+      // fetch local contribution
+      for (auto &cb : coarse_bins) {
+        // skip bins that aren't present in the map (for constness)
+        auto mapItr = qindex.coarse_index_map.find(cb);
+        if (mapItr != qindex.coarse_index_map.end()) {
+          // loop over local data
+          for (auto &l : mapItr->second) {
+            const double weight = calc_weight(r0, one_over_h0, qindex.locations[l],
+                                              one_over_bandwidth[l], qindex, discontinuity_cutoff);
+            result[i] += distribution[l] * weight;
+            normal[i] += weight;
+          }
+        }
+        auto gmapItr = qindex.local_ghost_index_map.find(cb);
+        if (gmapItr != qindex.local_ghost_index_map.end()) {
+          // loop over ghost data
+          for (auto &g : gmapItr->second) {
+            const double weight =
+                calc_weight(r0, one_over_h0, qindex.local_ghost_locations[g],
+                            ghost_one_over_bandwidth[g], qindex, discontinuity_cutoff);
+            result[i] += ghost_distribution[g] * weight;
+            normal[i] += weight;
+          }
+        }
+      }
     }
+  } else { // local reconstruction only
 
-    rtt_c4::global_sum(global_x_position.data(), size);
-    rtt_c4::global_sum(global_distribution.data(), size);
-    rtt_c4::global_sum(global_conservation);
+    std::array<double, 3> win_min{0.0, 0.0, 0.0};
+    std::array<double, 3> win_max{0.0, 0.0, 0.0};
+    // now apply the kernel to the local ranks
+    for (size_t i = 0; i < local_size; i++) {
+      const std::array<double, 3> r0 = qindex.locations[i];
+      const std::array<double, 3> one_over_h0 = one_over_bandwidth[i];
+      calc_win_min_max(qindex, r0, one_over_h0, win_min, win_max);
+      const std::vector<size_t> coarse_bins = qindex.window_coarse_index_list(win_min, win_max);
+      for (auto &cb : coarse_bins) {
+        // skip bins that aren't present in the map (can't use [] operator with constness)
+        auto mapItr = qindex.coarse_index_map.find(cb);
+        if (mapItr != qindex.coarse_index_map.end()) {
+          // loop over local data
+          for (auto &l : mapItr->second) {
+            const double weight = calc_weight(r0, one_over_h0, qindex.locations[l],
+                                              one_over_bandwidth[l], qindex, discontinuity_cutoff);
+            result[i] += distribution[l] * weight;
+            normal[i] += weight;
+          }
+        }
+      }
+    }
+  }
 
+  // normalize the integrated weight contributions
+  for (size_t i = 0; i < local_size; i++) {
+    Check(normal[i] > 0.0);
+    result[i] /= normal[i];
+  }
+
+  return result;
+}
+
+//------------------------------------------------------------------------------------------------//
+/*!
+ * \brief KDE reconstruction done in logarithmic data space
+ * 
+ * \pre The local reconstruction data is passed into this function which includes the original data
+ * distribution, its spatial position, and the optimal bandwidth to be used at each point. The
+ * original data distribution is transformed into log space prior and post reconstruction. This is
+ * helpful for strongly peaked data and should be exact for exponential distributions.
+ *
+ * \param[in] distribution original data to be reconstructed
+ * \param[in] one_over_bandwidth inverse bandwidth size to be used at each data location
+ * \param[in] qindex quick_index class to be used for data access.
+ * \param[in] discontinuity_cutoff maximum size of value discrepancies to include in the
+ * reconstruction
+ * \return final local KDE function distribution reconstruction
+ *
+ * \post the local reconstruction of the original data is returned.
+ */
+std::vector<double>
+kde::log_reconstruction(const std::vector<double> &distribution,
+                        const std::vector<std::array<double, 3>> &one_over_bandwidth,
+                        const quick_index &qindex, const double discontinuity_cutoff) const {
+  Require(qindex.dim < 3 && qindex.dim > 0);
+  const size_t local_size = distribution.size();
+  Require(qindex.locations.size() == local_size);
+  Require(one_over_bandwidth.size() == local_size);
+
+  // accumulators: kernel-weighted sum of the log-space data and the sum of the kernel weights
+  std::vector<double> result(local_size, 0.0);
+  std::vector<double> normal(local_size, 0.0);
+  double min_value = *std::min_element(distribution.begin(), distribution.end());
+  double max_value = *std::max_element(distribution.begin(), distribution.end());
+  double log_bias = fabs(min_value) + (max_value - min_value);
+  if (qindex.domain_decomposed) {
+    // reduce to the global extrema so the log bias computed below is the same on every rank
+    rtt_c4::global_min(min_value);
+    rtt_c4::global_max(max_value);
+
+    std::vector<double> ghost_distribution(qindex.local_ghost_buffer_size);
+    qindex.collect_ghost_data(distribution, ghost_distribution);
+    std::vector<std::array<double, 3>> ghost_one_over_bandwidth(qindex.local_ghost_buffer_size,
+                                                                {0.0, 0.0, 0.0});
+    qindex.collect_ghost_data(one_over_bandwidth, ghost_one_over_bandwidth);
+
+    log_bias = fabs(min_value) + (max_value - min_value);
+    // if the log bias is zero the answer must be zero everywhere
+    if (!(log_bias > 0.0))
+      return result;
     // now apply the kernel to the local ranks
-    for (int i = 0; i < local_size; i++) {
-      const double x0 = position[i][0];
-      const double one_over_h = one_over_band_width[i][0];
+    std::array<double, 3> win_min{0.0, 0.0, 0.0};
+    std::array<double, 3> win_max{0.0, 0.0, 0.0};
+    for (size_t i = 0; i < local_size; i++) {
+      const std::array<double, 3> r0 = qindex.locations[i];
+      const std::array<double, 3> one_over_h0 = one_over_bandwidth[i];
+      calc_win_min_max(qindex, r0, one_over_h0, win_min, win_max);
+      const std::vector<size_t> coarse_bins = qindex.window_coarse_index_list(win_min, win_max);
       // fetch local contribution
-      for (int j = 0; j < size; j++) {
-        const double x = global_x_position[j];
-        const double u = (x0 - x) * one_over_h;
-        const double weight = (epan_kernel(u)) * one_over_h;
-        result[i] += global_distribution[j] * weight;
-        normal[i] += weight;
+      for (auto &cb : coarse_bins) {
+        // skip bins that aren't present in the map (can't use [] operator with constness)
+        auto mapItr = qindex.coarse_index_map.find(cb);
+        if (mapItr != qindex.coarse_index_map.end()) {
+          // loop over local data
+          for (auto &l : mapItr->second) {
+            const double weight = calc_weight(r0, one_over_h0, qindex.locations[l],
+                                              one_over_bandwidth[l], qindex, discontinuity_cutoff);
+            result[i] += log_transform(distribution[l], log_bias) * weight;
+            normal[i] += weight;
+          }
+        }
+        auto gmapItr = qindex.local_ghost_index_map.find(cb);
+        if (gmapItr != qindex.local_ghost_index_map.end()) {
+          // loop over ghost data
+          for (auto &g : gmapItr->second) {
+            const double weight =
+                calc_weight(r0, one_over_h0, qindex.local_ghost_locations[g],
+                            ghost_one_over_bandwidth[g], qindex, discontinuity_cutoff);
+            result[i] += log_transform(ghost_distribution[g], log_bias) * weight;
+            normal[i] += weight;
+          }
+        }
       }
     }
   } else { // local reconstruction only
+
+    // if the log bias is zero the answer must be zero everywhere
+    if (!(log_bias > 0.0))
+      return result;
+
     // now apply the kernel to the local ranks
-    for (int i = 0; i < local_size; i++) {
-      const double x0 = position[i][0];
-      const double one_over_h = one_over_band_width[i][0];
+    std::array<double, 3> win_min{0.0, 0.0, 0.0};
+    std::array<double, 3> win_max{0.0, 0.0, 0.0};
+    for (size_t i = 0; i < local_size; i++) {
+      const std::array<double, 3> r0 = qindex.locations[i];
+      const std::array<double, 3> one_over_h0 = one_over_bandwidth[i];
+      calc_win_min_max(qindex, r0, one_over_h0, win_min, win_max);
+      const std::vector<size_t> coarse_bins = qindex.window_coarse_index_list(win_min, win_max);
       // fetch local contribution
-      for (int j = 0; j < local_size; j++) {
-        const double x = position[j][0];
-        const double u = (x0 - x) * one_over_h;
-        const double weight = (epan_kernel(u)) * one_over_h;
-        result[i] += distribution[j] * weight;
-        normal[i] += weight;
+      for (auto &cb : coarse_bins) {
+        // skip bins that aren't present in the map (can't use [] operator with constness)
+        auto mapItr = qindex.coarse_index_map.find(cb);
+        if (mapItr != qindex.coarse_index_map.end()) {
+          // loop over local data
+          for (auto &l : mapItr->second) {
+            const double weight = calc_weight(r0, one_over_h0, qindex.locations[l],
+                                              one_over_bandwidth[l], qindex, discontinuity_cutoff);
+            result[i] += log_transform(distribution[l], log_bias) * weight;
+            normal[i] += weight;
+          }
+        }
       }
     }
   }
 
   // normalize the integrated weight contributions
-  for (int i = 0; i < local_size; i++) {
+  for (size_t i = 0; i < local_size; i++) {
     Check(normal[i] > 0.0);
     result[i] /= normal[i];
+    result[i] = log_inv_transform(result[i], log_bias);
+    // ZERO IS ZERO AND THE LOG TRANSFORM CAN MAKE THE ZEROS NOT MATCH... SO FIX IT LIKE THIS
+    if (rtt_dsxx::soft_equiv(result[i], 0.0) && rtt_dsxx::soft_equiv(distribution[i], 0.0))
+      result[i] = distribution[i];
+  }
+
+  return result;
+}
+
+//------------------------------------------------------------------------------------------------//
+/*!
+ * \brief KDE apply conservation
+ * 
+ * \pre Apply conservation fix to the new distribution so sum(original_distribution) ==
+ * sum(new_distribution)
+ *
+ * \param[in] original_distribution original data to be reconstructed
+ * \param[in,out] new_distribution reconstructed data, corrected in place to conserve the total
+ * \param[in] domain_decomposed bool
+ *
+ */
+void kde::apply_conservation(const std::vector<double> &original_distribution,
+                             std::vector<double> &new_distribution,
+                             const bool domain_decomposed) const {
+
+  const size_t local_size = original_distribution.size();
+  Insist(new_distribution.size() == local_size,
+         "Original and new distributions must be the same size");
+
+  // compute absolute solution
+  std::vector<double> abs_distribution(local_size, 0.0);
+  for (size_t i = 0; i < local_size; i++) {
+    if (!rtt_dsxx::soft_equiv(new_distribution[i], original_distribution[i], 1e-12))
+      abs_distribution[i] = fabs(new_distribution[i]);
   }
 
-  double reconstruction_conservation = std::accumulate(result.begin(), result.end(), 0.0);
+  // compute totals to be used in residual calculation
+  double original_conservation =
+      std::accumulate(original_distribution.begin(), original_distribution.end(), 0.0);
+  double reconstruction_conservation =
+      std::accumulate(new_distribution.begin(), new_distribution.end(), 0.0);
+  double abs_distribution_conservation =
+      std::accumulate(abs_distribution.begin(), abs_distribution.end(), 0.0);
 
   if (domain_decomposed) {
     // accumulate global contribution
+    rtt_c4::global_sum(original_conservation);
     rtt_c4::global_sum(reconstruction_conservation);
+    rtt_c4::global_sum(abs_distribution_conservation);
   }
 
-  if (!rtt_dsxx::soft_equiv(reconstruction_conservation, 0.0) &&
-      !rtt_dsxx::soft_equiv(global_conservation, 0.0)) {
-    // Totals are non-zero so scale the result for conservation
-    for (int i = 0; i < local_size; i++)
-      result[i] *= global_conservation / reconstruction_conservation;
-  } else {
-    // a zero distribution is possible. If it occurs fall back to residual conservation;
-    const double res = global_conservation - reconstruction_conservation;
-    for (int i = 0; i < local_size; i++)
-      result[i] += res / double(size);
+  // Apply residual
+  if (abs_distribution_conservation > 0.0) {
+    const double res = original_conservation - reconstruction_conservation;
+    for (size_t i = 0; i < local_size; i++)
+      new_distribution[i] += res * abs_distribution[i] / abs_distribution_conservation;
   }
+}
 
-  return result;
+//------------------------------------------------------------------------------------------------//
+/*!
+ * \brief Calculate window min and max bounds.
+ *
+ *  Calculate the bounding window (via win_min (x_min,y_min,z_min) and win_max (x_max, y_max,
+ *  z_max)) given a central location and the bandwidth size in each dimension (dx,dy) for Cartesian
+ *  or (dr,arc_length) for spherical.
+ * 
+ * \param[in] qindex quick index class used to find the xy bounds of a wedge shape
+ * \param[in] position is the central location of the bounds
+ * \param[in] one_over_bandwidth inverse size of the reconstruction window in each dimension.
+ * This is (1/dx, 1/dy) for Cartesian and (1/dr, 1/arc_length) for spherical.
+ * \param[in,out] win_min is the minimum corner of the bounding box (x_min, y_min, z_min)
+ * \param[in,out] win_max is the maximum corner of the bounding box (x_max, y_max, z_max)
+ *
+ */
+void kde::calc_win_min_max(const quick_index &qindex, const std::array<double, 3> &position,
+                           const std::array<double, 3> &one_over_bandwidth,
+                           std::array<double, 3> &win_min, std::array<double, 3> &win_max) const {
+  const size_t dim = qindex.dim;
+  Require(dim > 0);
+  Require(one_over_bandwidth[0] > 0.0);
+  Require(dim > 1 ? one_over_bandwidth[1] > 0.0 : true);
+  Require(dim > 2 ? one_over_bandwidth[2] > 0.0 : true);
+  if (use_spherical_reconstruction) {
+    const double dr = 1.0 / one_over_bandwidth[0];
+    const double rmax = sqrt((sphere_center[0] - position[0]) * (sphere_center[0] - position[0]) +
+                             (sphere_center[1] - position[1]) * (sphere_center[1] - position[1])) +
+                        dr;
+    Check(rmax > 0.0);
+    const double dtheta =
+        std::min(1.0 / (one_over_bandwidth[1] * rmax), rtt_units::PI / 2.0 - 1e-12);
+    if (!(rmax < sphere_min_radius || rmax > sphere_max_radius)) {
+      // dtheta = arch_length_max/rmax
+      qindex.calc_wedge_xy_bounds(position, sphere_center, {dr, dtheta, 0.0}, win_min, win_max);
+    } else {
+      for (size_t d = 0; d < dim; d++) {
+        win_min[d] = position[d] - 1.0 / one_over_bandwidth[d];
+        win_max[d] = position[d] + 1.0 / one_over_bandwidth[d];
+      }
+    }
+  } else {
+    for (size_t d = 0; d < dim; d++) {
+      win_min[d] = position[d] - 1.0 / one_over_bandwidth[d];
+      win_max[d] = position[d] + 1.0 / one_over_bandwidth[d];
+    }
+  }
 }
 
 } // end namespace rtt_kde
diff --git a/src/kde/kde.hh b/src/kde/kde.hh
index c85e017343..24db717fd2 100644
--- a/src/kde/kde.hh
+++ b/src/kde/kde.hh
@@ -3,21 +3,20 @@
  * \file   kde/kde.hh
  * \author Mathew Cleveland
  * \brief  Define class kernel density estimator class
- * \note   Copyright (C) 2018-2020 Triad National Security, LLC.
- *         All rights reserved. */
+ * \note   Copyright (C) 2020-2021 Triad National Security, LLC., All rights reserved. 
+ */
 //------------------------------------------------------------------------------------------------//
 
 #ifndef kde_kde_hh
 #define kde_kde_hh
 
+#include "quick_index.hh"
 #include "c4/global.hh"
 #include <array>
 #include <vector>
 
 namespace rtt_kde {
 
-enum kde_coordinates { CART, CYL, SPH };
-
 //================================================================================================//
 /*!
  * \class kde
@@ -29,46 +28,97 @@ enum kde_coordinates { CART, CYL, SPH };
  * Returns a KDE reconstruction of a multidimensional distribution
  */
 //================================================================================================//
-template <int coord> class kde {
+class kde {
 public:
-  // NESTED CLASSES AND TYPEDEFS
-
-  // CREATORS
-
-  // ACCESSORS
-
-  // SERVICES
+  //! Constructor
+  kde(const std::array<bool, 6> reflect_boundary_ = {false, false, false, false, false, false})
+      : reflect_boundary(reflect_boundary_) {}
 
   //! Reconstruct distribution
-  template <int dim = 1>
   std::vector<double> reconstruction(const std::vector<double> &distribution,
-                                     const std::vector<std::array<double, 3>> &position,
                                      const std::vector<std::array<double, 3>> &one_over_band_width,
-                                     const bool domain_decomposed) const;
-
+                                     const quick_index &qindex,
+                                     const double discontinuity_cutoff = 1.0) const;
+
+  //! Reconstruct distribution in logarithmic space
+  std::vector<double>
+  log_reconstruction(const std::vector<double> &distribution,
+                     const std::vector<std::array<double, 3>> &one_over_band_width,
+                     const quick_index &qindex, const double discontinuity_cutoff = 1.0) const;
+
+  //! Apply conservation to reconstructed distribution
+  void apply_conservation(const std::vector<double> &original_distribution,
+                          std::vector<double> &new_distribution, const bool domain_decompsed) const;
   // STATICS
 
   //! Epanechikov Kernel
-  double epan_kernel(const double x) const;
+  inline double epan_kernel(const double x) const;
+
+  //! Transform the solution into log space
+  inline double log_transform(const double value, const double bias) const;
+
+  //! Move the solution back from log space
+  inline double log_inv_transform(const double log_value, const double bias) const;
+
+  //! Calculate radius
+  inline double calc_radius(const std::array<double, 3> &center,
+                            const std::array<double, 3> &location) const;
+
+  //! Calculate Arch Length between two locations at a specific radius
+  inline double calc_arch_length(const std::array<double, 3> &center, const double radius,
+                                 const std::array<double, 3> &location_1,
+                                 const std::array<double, 3> &location_2) const;
+  //! Set up a spherical reconstruction with the given sphere center and min/max radii
+  void set_sphere_center(const std::array<double, 3> &sph_center, const double min_radius,
+                         const double max_radius) {
+    Insist(max_radius > min_radius, "Spherical KDE max radius must be larger then min radius");
+
+    use_spherical_reconstruction = true;
+    sphere_center = sph_center;
+    sphere_min_radius = min_radius;
+    sphere_max_radius = max_radius;
+  }
 
 protected:
   // IMPLEMENTATION
 
 private:
-  // NESTED CLASSES AND TYPEDEFS
-
-  // IMPLEMENTATION
+  //! Private function to calculate kernel weight
+  double calc_weight(const std::array<double, 3> &r0, const std::array<double, 3> &one_over_h0,
+                     const std::array<double, 3> &r, const std::array<double, 3> &one_over_h,
+                     const quick_index &qindex, const double &discontinuity_cutoff) const {
+    return use_spherical_reconstruction
+               ? calc_spherical_weight(r0, one_over_h0, r, one_over_h, qindex, discontinuity_cutoff)
+               : calc_cartesian_weight(r0, one_over_h0, r, one_over_h, qindex,
+                                       discontinuity_cutoff);
+  }
+
+  void calc_win_min_max(const quick_index &qindex, const std::array<double, 3> &position,
+                        const std::array<double, 3> &one_over_bandwidth, std::array<double, 3> &min,
+                        std::array<double, 3> &max) const;
+
+  double calc_spherical_weight(const std::array<double, 3> &r0,
+                               const std::array<double, 3> &one_over_h0,
+                               const std::array<double, 3> &r,
+                               const std::array<double, 3> &one_over_h, const quick_index &qindex,
+                               const double &discontinuity_cutoff) const;
+
+  double calc_cartesian_weight(const std::array<double, 3> &r0,
+                               const std::array<double, 3> &one_over_h0,
+                               const std::array<double, 3> &r,
+                               const std::array<double, 3> &one_over_h, const quick_index &qindex,
+                               const double &discontinuity_cutoff) const;
 
   // DATA
+  //! reflecting boundary conditions [lower_x, upper_x, lower_y, upper_y, lower_z, upper_z]
+  const std::array<bool, 6> reflect_boundary;
+  //! Spherical Mesh Reconstruction Data
+  std::array<double, 3> sphere_center{0.0, 0.0, 0.0};
+  double sphere_min_radius{0.0};
+  double sphere_max_radius{0.0};
+  bool use_spherical_reconstruction{false};
 };
 
-//! Forward declaration of the reconstruction 1D Cartesian reconstruction.
-template <>
-template <>
-std::vector<double> kde<kde_coordinates::CART>::reconstruction<1>(
-    const std::vector<double> &distribution, const std::vector<std::array<double, 3>> &position,
-    const std::vector<std::array<double, 3>> &one_over_band_width, const bool dd) const;
-
 } // end namespace rtt_kde
 
 #include "kde.i.hh"
diff --git a/src/kde/kde.i.hh b/src/kde/kde.i.hh
index 820a02e495..c6f5620f6f 100644
--- a/src/kde/kde.i.hh
+++ b/src/kde/kde.i.hh
@@ -4,8 +4,8 @@
  * \author Mathew Cleveland
  * \date   Nov. 10th 2020
  * \brief  Member definitions of class kde
- * \note   Copyright (C) 2018-2020 Triad National Security, LLC.
- *         All rights reserved. */
+ * \note   Copyright (C) 2021 Triad National Security, LLC., All rights reserved. 
+ */
 //------------------------------------------------------------------------------------------------//
 
 #ifndef kde_kde_i_hh
@@ -17,49 +17,125 @@ namespace rtt_kde {
 
 //------------------------------------------------------------------------------------------------//
 /*!
- * \brief
- * DEFAULT reconstruction to return error if instantiation is not found
+ * \brief epan_kernel 
  *
- * \tparam coord enumerated value to specify KDE coordinate system
- * \tparam dim integer specifying the data dimensionality 
- * \param[in] distribution 
- * \param[in] position
- * \param[in] one_over_band_width
- * \param[in] domain_decomposed
- * \return final local function distribution
+ * Basis function used during reconstruction.
+ *
+ * Epanechnikov kernel to be used in reconstruction
+ *
+ * \param[in] x from kernel origin
+ * \return distribution weight based on distance from the kernel center 
  *
+ * Test of kde.
  */
+inline double kde::epan_kernel(const double x) const {
+  const double x_sq = x * x;
+  return !(x_sq > 1.0) ? 0.75 * (1.0 - x_sq) : 0.0;
+}
 
-template <int coord>
-template <int dim>
-std::vector<double>
-kde<coord>::reconstruction(const std::vector<double> & /*distribution*/,
-                           const std::vector<std::array<double, 3>> & /*position*/,
-                           const std::vector<std::array<double, 3>> & /*one_over_band_width*/,
-                           const bool /*domain_decomposed*/) const {
-
-  Insist(false, "kde::reconstruction has not been implemented for this coordinate system and or "
-                "dimension combination");
-  return std::vector<double>(1, 0.0);
+//------------------------------------------------------------------------------------------------//
+/*!
+ * \brief log_transform 
+ *
+ * Transforms data to log space given a bias.
+ *
+ *
+ * \param[in] value of original distribution
+ * \param[in] bias used to ensure positivity
+ *
+ * \return the logarithmic transform of the original value
+ *
+ * Test of kde.
+ */
+inline double kde::log_transform(const double value, const double bias) const {
+  Require(value + bias > 0.0);
+  return log(value + bias);
 }
 
+//------------------------------------------------------------------------------------------------//
+/*!
+ * \brief log_inv_transform 
+ *
+ * Inverse transform back from log space given the current bias.
+ *
+ *
+ * \param[in] log_value of original distribution
+ * \param[in] bias used to ensure positivity
+ *
+ * \return the value mapped back out of log space, i.e. exp(log_value) - bias
+ *
+ * Test of kde.
+ */
+
+inline double kde::log_inv_transform(const double log_value, const double bias) const {
+  return exp(log_value) - bias;
+}
+
+//! Lambda returning the vector v2 - v1 (const gives internal linkage, safe in a header)
+const auto calc_vec = [](const auto &v1, const auto &v2) {
+  Require(v1.size() == 3);
+  Require(v2.size() == 3);
+  return std::array<double, 3>{v2[0] - v1[0], v2[1] - v1[1], v2[2] - v1[2]};
+};
+
+//! Lambda returning the magnitude of a 3-vector (const gives internal linkage, safe in a header)
+const auto calc_mag = [](const auto &v) {
+  Require(v.size() == 3);
+  return sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
+};
+
+//! Lambda returning v scaled to unit length (const gives internal linkage, safe in a header)
+const auto calc_unit_vec = [](const auto &v) {
+  Require(v.size() == 3);
+  const double mag = calc_mag(v);
+  return std::array<double, 3>{v[0] / mag, v[1] / mag, v[2] / mag};
+};
+
 //------------------------------------------------------------------------------------------------//
 /*!
  * \brief
- * epan_kernel basis function used during reconstruction
+ * Calculate the radius given a sphere center and the current location.
  *
- * Epanechnikov kenrel to be used in reconstrtuction
  *
- * \param[in] x from kernel origin
- * \return distribution weight based on distance from the kernel center 
+ * \param[in] center the center location (x,y,z) or (r,z) of the sphere
+ * \param[in] location data location (x,y,z) or (r,z)
+ *
+ * \return distance from the sphere center to the given location
  *
  * Test of kde.
  */
-template <int coord> double kde<coord>::epan_kernel(const double x) const {
-  const double x2 = x * x;
-  return x2 > 1.0 ? 0.0 : 0.75 * (1.0 - x2);
+
+inline double kde::calc_radius(const std::array<double, 3> &center,
+                               const std::array<double, 3> &location) const {
+  return calc_mag(calc_vec(center, location));
 }
 
+//------------------------------------------------------------------------------------------------//
+/*!
+ * \brief Calculate the arch length between two points 
+ *
+ * Calculate the arch length between two points (infinitely extended from sphere center) at a
+ * specified radius.
+ *
+ *
+ * \param[in] center the center location (x,y,z) or (r,z) of the sphere
+ * \param[in] radius from sphere center to calculate the arch length
+ * \param[in] location_1 data location (x,y,z) or (r,z)
+ * \param[in] location_2 data location (x,y,z) or (r,z)
+ *
+ * \return arch length
+ *
+ * Test of kde.
+ */
+
+inline double kde::calc_arch_length(const std::array<double, 3> &center, const double radius,
+                                    const std::array<double, 3> &location_1,
+                                    const std::array<double, 3> &location_2) const {
+  const std::array<double, 3> v1{calc_unit_vec(calc_vec(center, location_1))};
+  const std::array<double, 3> v2{calc_unit_vec(calc_vec(center, location_2))};
+  const double dot = v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2];
+  return radius * acos(dot > 1.0 ? 1.0 : (dot < -1.0 ? -1.0 : dot)); // clamp round-off (no NaN)
+}
 } // end namespace  rtt_kde
 
 #endif // kde_kde_i_hh
diff --git a/src/kde/quick_index.cc b/src/kde/quick_index.cc
new file mode 100644
index 0000000000..2c627a5466
--- /dev/null
+++ b/src/kde/quick_index.cc
@@ -0,0 +1,1547 @@
+//--------------------------------------------*-C++-*---------------------------------------------//
+/*!
+ * \file   kde/quick_index.cc
+ * \author Mathew Cleveland
+ * \brief  Explicitly defined quick_index functions.
+ * \note   Copyright (C) 2021 Triad National Security, LLC., All rights reserved. 
+ */
+//------------------------------------------------------------------------------------------------//
+
+#include "quick_index.hh"
+#include "ds++/dbc.hh"
+#include <cmath>
+#include <numeric>
+#include <tuple>
+
+namespace rtt_kde {
+//------------------------------------------------------------------------------------------------//
+/*!
+ * \brief quick_index constructor.
+ *
+ * This function builds up a global indexing table to quickly access data that is spatially
+ * located near each other. It breaks up the data into equally spaced bins in each dimension. For
+ * domain decomposed data it builds a one sided communication map to place local data that is
+ * needed on other processors for ghost cells. The ghost cell extent is determined by the
+ * max_window_size spatial size such that any cell on the local domain will have access to all
+ * points that should fall into the spatial window centered on any given local point.
+ *
+ * \param[in] dim_ specifying the data dimensionality (must be greater than zero)
+ * \param[in] locations_ data locations.
+ * \param[in] max_window_size_ maximum supported window size
+ * \param[in] bins_per_dimension_ number of bins in each dimension (must be greater than zero)
+ * \param[in] domain_decomposed_ when true the bounding box is reduced across all ranks and the
+ * ghost index map, put map, and ghost locations are built
+ */
+quick_index::quick_index(const size_t dim_, const std::vector<std::array<double, 3>> &locations_,
+                         const double max_window_size_, const size_t bins_per_dimension_,
+                         const bool domain_decomposed_)
+    : dim(dim_), domain_decomposed(domain_decomposed_), coarse_bin_resolution(bins_per_dimension_),
+      max_window_size(max_window_size_), locations(locations_), n_locations(locations_.size()) {
+  Require(dim > 0);
+  Require(coarse_bin_resolution > 0);
+
+  // Build local bounding box
+  bounding_box_min = {0, 0, 0};
+  bounding_box_max = {0, 0, 0};
+  // only set initial values for working dimensions (unused dimensions keep a [0,0] extent)
+  for (size_t d = 0; d < dim; d++) {
+    bounding_box_min[d] = 1e20;
+    bounding_box_max[d] = -1e20;
+  }
+
+  // shrink-wrap the box around the local points
+  for (auto &loc : locations) {
+    for (size_t d = 0; d < dim; d++) {
+      if (loc[d] < bounding_box_min[d])
+        bounding_box_min[d] = loc[d];
+      if (loc[d] > bounding_box_max[d])
+        bounding_box_max[d] = loc[d];
+    }
+  }
+
+  if (domain_decomposed) {
+    // Store the local bounding box and extend to maximum non-local data size
+    local_bounding_box_min = bounding_box_min;
+    local_bounding_box_max = bounding_box_max;
+    for (size_t d = 0; d < dim; d++) {
+      local_bounding_box_min[d] -= max_window_size * 0.5;
+      local_bounding_box_max[d] += max_window_size * 0.5;
+    }
+    // Global reduce to get the global min and max
+    rtt_c4::global_min(&bounding_box_min[0], 3);
+    rtt_c4::global_max(&bounding_box_max[0], 3);
+    // trim the extended local box so it never reaches outside the global box
+    for (size_t d = 0; d < dim; d++) {
+      local_bounding_box_min[d] = std::max(local_bounding_box_min[d], bounding_box_min[d]);
+      local_bounding_box_max[d] = std::min(local_bounding_box_max[d], bounding_box_max[d]);
+    }
+  }
+
+  // temp cast coarse_bin_resolution to double for interpolation
+  const auto crd = static_cast<double>(coarse_bin_resolution);
+
+  // build up the local hash table into the global bins
+  size_t locIndex = 0;
+  for (auto &loc : locations) {
+    std::array<size_t, 3> index{0UL, 0UL, 0UL};
+    for (size_t d = 0; d < dim; d++) {
+      Check(bounding_box_min[d] < bounding_box_max[d]);
+      index[d] = static_cast<size_t>(std::floor(crd * (loc[d] - bounding_box_min[d]) /
+                                                (bounding_box_max[d] - bounding_box_min[d])));
+      // a point sitting exactly on the upper bound belongs to the last bin
+      index[d] = std::min(index[d], coarse_bin_resolution - 1);
+    }
+    // build up the local index hash (flattened index, first dimension varies fastest)
+    const size_t global_index = index[0] + index[1] * coarse_bin_resolution +
+                                index[2] * coarse_bin_resolution * coarse_bin_resolution;
+    coarse_index_map[global_index].push_back(locIndex);
+    locIndex++;
+  }
+
+  // Now we need to build up ghost location map data for domain decomposed mode
+  if (domain_decomposed) {
+    // temporary cast of the nodes to prevent conversion warnings
+    const auto nodes = static_cast<size_t>(rtt_c4::nodes());
+    const auto node = static_cast<size_t>(rtt_c4::node());
+
+    // build list of local bins based on the local bounds
+    local_bins = window_coarse_index_list(local_bounding_box_min, local_bounding_box_max);
+
+    // build a global map for number of entries into the global bins on each processor
+    // creates a (nbins**dim)*nranks sized array
+    // NOTE: If this gets too big we could stride over a subset of coarse bins
+    // and do multiple iterations of mpi communication to build up the map
+    size_t nbins = coarse_bin_resolution;
+    for (size_t d = 1; d < dim; d++)
+      nbins *= coarse_bin_resolution;
+
+    // each rank fills only its own slice; the global sum then gives every rank the full table
+    std::vector<int> global_index_per_bin_per_proc(nbins * nodes, 0UL);
+    for (auto &map : coarse_index_map) {
+      size_t gipbpp_index = map.first + nbins * node;
+      // must cast to an int to accommodate mpi int types.
+      global_index_per_bin_per_proc[gipbpp_index] = static_cast<int>(map.second.size());
+    }
+    rtt_c4::global_sum(&global_index_per_bin_per_proc[0], nbins * nodes);
+
+    // calculate local ghost buffer size
+    // (ghost entries are laid out by sending rank first and then by local bin)
+    local_ghost_buffer_size = 0;
+    for (size_t proc = 0; proc < nodes; proc++) {
+      for (auto &bin : local_bins) {
+        if (node != proc) {
+          size_t gipbpp_index = bin + nbins * proc;
+          // build up the local ghost index map
+          for (int i = 0; i < global_index_per_bin_per_proc[gipbpp_index]; i++)
+            local_ghost_index_map[bin].push_back(local_ghost_buffer_size + i);
+          // accumulate the total ghost points
+          local_ghost_buffer_size += global_index_per_bin_per_proc[gipbpp_index];
+        }
+      }
+    }
+
+    // flag (per rank) every bin that rank needs; summed so all ranks see all requests
+    std::vector<int> global_need_bins_per_proc(nbins * nodes, 0UL);
+    // global need bins
+    for (auto &bin : local_bins) {
+      global_need_bins_per_proc[bin + nbins * node] += 1;
+    }
+    rtt_c4::global_sum(&global_need_bins_per_proc[0], nbins * nodes);
+
+    // Build a global list of buffer sizes
+    std::vector<int> proc_ghost_buffer_size(nodes, 0);
+    proc_ghost_buffer_size[node] = static_cast<int>(local_ghost_buffer_size);
+    rtt_c4::global_sum(&proc_ghost_buffer_size[0], nodes);
+
+    // calculate the put map so each node knows which processor to send data
+    // and where to index that data
+    // PERFORMANCE NOTE: This would be more efficient to use a MPI_SCAN and
+    // std::partial_sum but I need to think how this would actually look.
+    max_put_buffer_size = 0;
+    for (int rec_proc = 0; rec_proc < rtt_c4::nodes(); rec_proc++) {
+      // calculating the offset SUCKS!!! If anyone can find a better way please help.
+      // The starting offset on rec_proc is the number of entries that all lower numbered sending
+      // ranks contribute to the bins rec_proc requested.
+      int offset = 0;
+      for (int send_proc = 0; send_proc < rtt_c4::node(); send_proc++) {
+        if (rec_proc == send_proc)
+          continue;
+        for (size_t bin = 0; bin < nbins; bin++) {
+          if (global_need_bins_per_proc[bin + nbins * rec_proc] > 0) {
+            offset += global_index_per_bin_per_proc[bin + nbins * send_proc];
+          }
+        }
+      }
+      for (auto &map : coarse_index_map) {
+        if (rtt_c4::node() != rec_proc) {
+          size_t gipbpp_index = map.first + nbins * rec_proc;
+          if (global_need_bins_per_proc[gipbpp_index] > 0) {
+            // capture the largest put buffer on this rank
+            if (map.second.size() > max_put_buffer_size)
+              max_put_buffer_size = map.second.size();
+
+            // build up map data: {receiving rank, receiving rank ghost buffer size, offset}
+            put_window_map[map.first].push_back(
+                std::array<int, 3>{rec_proc, proc_ghost_buffer_size[rec_proc], offset});
+            offset += static_cast<int>(map.second.size());
+          }
+        }
+      }
+    }
+
+    // allocate ghost locations
+    local_ghost_locations =
+        std::vector<std::array<double, 3>>(local_ghost_buffer_size, {0.0, 0.0, 0.0});
+    // collect the local ghost locations
+    collect_ghost_data(locations, local_ghost_locations);
+
+  } // End domain decomposed data construction
+}
+
+#ifdef C4_MPI
+//------------------------------------------------------------------------------------------------//
+/*!
+ * \brief Write a packed buffer into the RMA window of every rank that requested it.
+ *
+ * MPI_Put is issued in chunks to avoid errors seen with large local buffers. Each chunk is
+ * written at the target displacement (base offset + number of values already sent) so that
+ * consecutive chunks land behind each other instead of on top of each other, and the target count
+ * matches the chunk so the described target region stays inside the remote window.
+ *
+ * \param[in] put put_window_map entry; put.second holds one {rank, rank buffer size, offset} array
+ * per receiving rank
+ * \param[in] put_buffer packed values to send (the first put_size entries are used)
+ * \param[in] put_size number of values in put_buffer to send
+ * \param[in] win RMA window to write into
+ */
+auto put_lambda = [](auto &put, auto &put_buffer, auto &put_size, auto &win) {
+  // temporary work around until RMA is available in c4
+  // largest number of values handed to a single MPI_Put call
+  constexpr int max_chunk_size = 1000;
+  const auto n_values = static_cast<int>(put_size);
+  // loop over all ranks we need to send this buffer to.
+  for (auto &putv : put.second) {
+    const int put_rank = putv[0];
+    const int put_offset = putv[2];
+    int nput = 0;
+    while (nput < n_values) {
+      const int chunk_size = std::min(max_chunk_size, n_values - nput);
+      Check(chunk_size > 0);
+      // putv[1] is the receiving rank's ghost buffer size; the chunk must fit inside of it
+      Check(put_offset + nput + chunk_size <= putv[1]);
+      MPI_Put(&put_buffer[nput], chunk_size, MPI_DOUBLE, put_rank, put_offset + nput, chunk_size,
+              MPI_DOUBLE, win);
+      nput += chunk_size;
+    }
+  }
+};
+#endif
+
+//------------------------------------------------------------------------------------------------//
+/*!
+ * \brief Collect ghost data for a vector<std::array<double, 3>>
+ *
+ * Exchanges 3 dimensional point data one component at a time. For each of the first dim
+ * components, every rank packs the values of each bin listed in put_window_map and writes them
+ * (one sided MPI put, fenced on both sides) into a scratch window on the receiving ranks; the
+ * scratch window is then copied into the matching component of local_ghost_data. Without C4_MPI
+ * only the input checks are performed.
+ *
+ * \param[in] local_data the local 3 dimensional data that is required to be available as ghost cell
+ * data on other processors (one entry per local location).
+ * \param[in,out] local_ghost_data the resulting 3 dimensional ghost data; must already hold
+ * local_ghost_buffer_size entries.
+ */
+void quick_index::collect_ghost_data(const std::vector<std::array<double, 3>> &local_data,
+                                     std::vector<std::array<double, 3>> &local_ghost_data) const {
+  Require(local_data.size() == n_locations);
+  Insist(domain_decomposed, "Calling collect_ghost_data with a quick_index object that specified "
+                            "domain_decomposed=.false.");
+
+  Insist(local_ghost_data.size() == local_ghost_buffer_size,
+         "ghost_data input must be sized via quick_index.local_ghost_buffer_size");
+#ifdef C4_MPI // temporary work around until RMA is available in c4
+  // scratch window other ranks write into, and the staging area for outgoing values
+  std::vector<double> ghost_window(local_ghost_buffer_size, 0.0);
+  std::vector<double> staging(max_put_buffer_size, 0.0);
+  MPI_Win win;
+  MPI_Win_create(ghost_window.data(), local_ghost_buffer_size * sizeof(double), sizeof(double),
+                 MPI_INFO_NULL, MPI_COMM_WORLD, &win);
+
+  // one exchange per active component
+  for (size_t d = 0; d < dim; d++) {
+    Remember(int errorcode =) MPI_Win_fence(MPI_MODE_NOSTORE, win);
+    Check(errorcode == MPI_SUCCESS);
+    for (auto &put : put_window_map) {
+      // use map.at() to allow const access
+      Check((coarse_index_map.at(put.first)).size() <= max_put_buffer_size);
+      const auto &bin_points = coarse_index_map.at(put.first);
+      // pack component d of every point in this bin
+      int n_packed = 0;
+      for (const auto &l : bin_points)
+        staging[n_packed++] = local_data[l][d];
+      put_lambda(put, staging, n_packed, win);
+    }
+    Remember(errorcode =) MPI_Win_fence((MPI_MODE_NOSTORE | MPI_MODE_NOSUCCEED), win);
+    Check(errorcode == MPI_SUCCESS);
+
+    // scatter the received component into the caller's ghost array
+    for (size_t g = 0; g < ghost_window.size(); g++)
+      local_ghost_data[g][d] = ghost_window[g];
+  }
+  MPI_Win_free(&win);
+#endif
+}
+
+//------------------------------------------------------------------------------------------------//
+/*!
+ * \brief Collect ghost data for a vector<vector<double>>
+ *
+ * Exchanges multi-dimensional data stored as one vector per dimension. For every dimension each
+ * rank packs the values of each bin listed in put_window_map and writes them (one sided MPI put,
+ * fenced on both sides) into a scratch window on the receiving ranks; the scratch window is then
+ * copied into local_ghost_data for that dimension. Without C4_MPI only the input checks are
+ * performed.
+ *
+ * \param[in] local_data the local multi-dimensional data that is required to be available as ghost
+ * cell data on other processors (indexed [dimension][location]).
+ * \param[in,out] local_ghost_data the resulting multi-dimensional ghost data; it must have the
+ * same number of dimensions as local_data and local_ghost_buffer_size entries per dimension.
+ */
+void quick_index::collect_ghost_data(const std::vector<std::vector<double>> &local_data,
+                                     std::vector<std::vector<double>> &local_ghost_data) const {
+  Insist(domain_decomposed, "Calling collect_ghost_data with a quick_index object that specified "
+                            "domain_decomposed=.false.");
+  size_t data_dim = local_data.size();
+  size_t ghost_data_dim = local_ghost_data.size();
+  Insist(data_dim == ghost_data_dim,
+         "The local_data.size() and the local_ghost_data.size() vectors much match");
+  // every ghost dimension has to be pre-sized by the caller
+  for (size_t d = 0; d < ghost_data_dim; d++) {
+    Insist(local_ghost_data[d].size() == local_ghost_buffer_size,
+           "ghost_data[" + std::to_string(d) +
+               "] input must be sized via quick_index.local_ghost_buffer_size");
+  }
+#ifdef C4_MPI // temporary work around until RMA is available in c4
+  // scratch window other ranks write into, and the staging area for outgoing values
+  std::vector<double> ghost_window(local_ghost_buffer_size, 0.0);
+  std::vector<double> staging(max_put_buffer_size, 0.0);
+  MPI_Win win;
+  MPI_Win_create(ghost_window.data(), local_ghost_buffer_size * sizeof(double), sizeof(double),
+                 MPI_INFO_NULL, MPI_COMM_WORLD, &win);
+
+  // one exchange per data dimension
+  for (size_t d = 0; d < data_dim; d++) {
+    Check(local_data[d].size() == n_locations);
+    Remember(int errorcode =) MPI_Win_fence(MPI_MODE_NOSTORE, win);
+    Check(errorcode == MPI_SUCCESS);
+    for (auto &put : put_window_map) {
+      // use map.at() to allow const access
+      Check((coarse_index_map.at(put.first)).size() <= max_put_buffer_size);
+      const auto &bin_points = coarse_index_map.at(put.first);
+      // pack dimension d of every point in this bin
+      int n_packed = 0;
+      for (const auto &l : bin_points)
+        staging[n_packed++] = local_data[d][l];
+      put_lambda(put, staging, n_packed, win);
+    }
+    Remember(errorcode =) MPI_Win_fence((MPI_MODE_NOSTORE | MPI_MODE_NOSUCCEED), win);
+    Check(errorcode == MPI_SUCCESS);
+    // scatter the received values into the caller's ghost vector for this dimension
+    for (size_t g = 0; g < ghost_window.size(); g++)
+      local_ghost_data[d][g] = ghost_window[g];
+  }
+  MPI_Win_free(&win);
+#endif
+}
+
+//------------------------------------------------------------------------------------------------//
+/*!
+ * \brief Collect ghost data for a vector<double>
+ *
+ * Collect ghost data for a single vector. This function uses RMA and the local put_window_map to
+ * allow each rank to independently fill in its data to ghost cells of other ranks. The RMA window
+ * is created directly on local_ghost_data, so no intermediate receive buffer is required. Without
+ * C4_MPI only the input checks are performed.
+ *
+ * \param[in] local_data the local vector data that is required to be available as ghost cell data
+ * on other processors (one entry per local location).
+ * \param[in,out] local_ghost_data the resulting ghost data; must already hold
+ * local_ghost_buffer_size entries.
+ */
+void quick_index::collect_ghost_data(const std::vector<double> &local_data,
+                                     std::vector<double> &local_ghost_data) const {
+  Require(local_data.size() == n_locations);
+  Insist(domain_decomposed, "Calling collect_ghost_data with a quick_index object that specified "
+                            "domain_decomposed=.false.");
+  Insist(local_ghost_data.size() == local_ghost_buffer_size,
+         "ghost_data input must be sized via quick_index.local_ghost_buffer_size");
+#ifdef C4_MPI // temporary work around until RMA is available in c4
+  // staging area for the values of one bin before they are put on the receiving ranks
+  std::vector<double> put_buffer(max_put_buffer_size, 0.0);
+  MPI_Win win;
+  MPI_Win_create(local_ghost_data.data(), local_ghost_buffer_size * sizeof(double), sizeof(double),
+                 MPI_INFO_NULL, MPI_COMM_WORLD, &win);
+
+  // working from my local data put the ghost data on the other ranks
+  Remember(int errorcode =) MPI_Win_fence(MPI_MODE_NOSTORE, win);
+  Check(errorcode == MPI_SUCCESS);
+  // iterate by reference; copying the entry would duplicate its vector of put targets
+  for (auto &put : put_window_map) {
+    // use map.at() to allow const access
+    Check((coarse_index_map.at(put.first)).size() <= max_put_buffer_size);
+    // fill up the current ghost cell data for this bin
+    int putIndex = 0;
+    for (auto &l : coarse_index_map.at(put.first)) {
+      put_buffer[putIndex] = local_data[l];
+      putIndex++;
+    }
+    put_lambda(put, put_buffer, putIndex, win);
+  }
+  Remember(errorcode =) MPI_Win_fence((MPI_MODE_NOSTORE | MPI_MODE_NOSUCCEED), win);
+  Check(errorcode == MPI_SUCCESS);
+  MPI_Win_free(&win);
+#endif
+}
+
+//------------------------------------------------------------------------------------------------//
+/*!
+ * \brief Generate a coarse index list for a window.
+ *
+ * Provides a list of global (coarse) bin indices that are required by any given window range. Both
+ * window corners are mapped to bin indices that are limited to [0, coarse_bin_resolution-1] in
+ * every active dimension, so a window that reaches (or lies) outside of the global bounding box
+ * resolves to the nearest edge bins rather than to an invalid index range.
+ *
+ * \param[in] window_min the smallest corner point for every dimension
+ * \param[in] window_max the largest corner point for every dimension
+ * \return bin_list list of global bins requested for the current window (first dimension varies
+ * fastest).
+ */
+std::vector<size_t>
+quick_index::window_coarse_index_list(const std::array<double, 3> &window_min,
+                                      const std::array<double, 3> &window_max) const {
+  Require(window_min[0] <= window_max[0]);
+  Require(window_min[1] <= window_max[1]);
+  Require(window_min[2] <= window_max[2]);
+
+  // temp cast coarse_bin_resolution to double for interpolation
+  const auto crd = static_cast<double>(coarse_bin_resolution);
+  const size_t last_bin = coarse_bin_resolution - 1;
+
+  // Map a coordinate to its coarse bin in dimension d. The comparisons are done on the floating
+  // point value so that negative, overly large, or NaN values are never cast to size_t.
+  auto coarse_bin = [&](const double coordinate, const size_t d) -> size_t {
+    const double scaled =
+        crd * (coordinate - bounding_box_min[d]) / (bounding_box_max[d] - bounding_box_min[d]);
+    if (!(scaled > 0.0))
+      return 0;
+    if (scaled >= crd)
+      return last_bin;
+    return static_cast<size_t>(std::floor(scaled));
+  };
+
+  // calculate the global index range that each processor needs to
+  // accommodate the specified data window size
+  std::array<size_t, 3> index_min = {0UL, 0UL, 0UL};
+  std::array<size_t, 3> index_max = {0UL, 0UL, 0UL};
+  size_t nbins = 1;
+  for (size_t d = 0; d < dim; d++) {
+    index_min[d] = coarse_bin(window_min[d], d);
+    // never let the upper index fall below the lower one (would underflow the bin count)
+    index_max[d] = std::max(index_min[d], coarse_bin(window_max[d], d));
+    // Use multiplicity to accumulate total bins
+    nbins *= index_max[d] - index_min[d] + 1;
+  }
+
+  // Fill up bin list (inactive dimensions keep a [0,0] index range and contribute a single pass)
+  std::vector<size_t> bin_list;
+  bin_list.reserve(nbins);
+  for (size_t k = index_min[2]; k <= index_max[2]; k++) {
+    for (size_t j = index_min[1]; j <= index_max[1]; j++) {
+      for (size_t i = index_min[0]; i <= index_max[0]; i++) {
+        bin_list.push_back(i + j * coarse_bin_resolution +
+                           k * coarse_bin_resolution * coarse_bin_resolution);
+      }
+    }
+  }
+  Check(bin_list.size() == nbins);
+  return bin_list;
+}
+
+//------------------------------------------------------------------------------------------------//
+/*!
+ * \brief Lambda that locates the grid-window bin holding a point.
+ *
+ * For each of the first dim dimensions the location is converted to a fractional bin position
+ * inside [window_min, window_max]. A position outside of [0, grid_bins[d]] marks the point as
+ * invalid and stops the search; otherwise the bin id (limited to the last bin) and the bin center
+ * are recorded.
+ *
+ * \param[in] dim number of active dimensions
+ * \param[in] grid_bins number of bins in each dimension
+ * \param[in] location point to locate
+ * \param[in] window_min the smallest corner point of the window
+ * \param[in] window_max the largest corner point of the window
+ * \param[in] n_map_bins total number of window bins (only used for debug checking)
+ * \return tuple of {point is inside the window, flattened window bin index, bin center}
+ */
+auto get_window_bin = [](const auto dim, const auto &grid_bins, const auto &location,
+                         const auto &window_min, const auto &window_max,
+                         const auto &Remember(n_map_bins)) {
+  std::array<size_t, 3> bin_id{0, 0, 0};
+  std::array<double, 3> bin_center{0, 0, 0};
+  bool valid = true;
+  // stop at the first dimension in which the point falls outside of the window
+  for (size_t d = 0; valid && d < dim; d++) {
+    Check((window_max[d] - window_min[d]) > 0.0);
+    const double extent = window_max[d] - window_min[d];
+    const auto n_bins = static_cast<double>(grid_bins[d]);
+    const double bin_value = n_bins * (location[d] - window_min[d]) / extent;
+    valid = !(bin_value < 0.0 || bin_value > n_bins);
+    if (valid) {
+      // the min catches any values exactly on the edge of the top bin
+      bin_id[d] = std::min(grid_bins[d] - 1, static_cast<size_t>(bin_value));
+      bin_center[d] =
+          window_min[d] + (static_cast<double>(bin_id[d]) / n_bins + 0.5 / n_bins) * extent;
+    }
+  }
+  // flatten the bin ids (first dimension varies fastest)
+  const size_t local_window_bin =
+      bin_id[0] + bin_id[1] * grid_bins[0] + bin_id[2] * grid_bins[0] * grid_bins[1];
+
+  Check(valid ? local_window_bin < n_map_bins : true);
+
+  return std::tuple<bool, size_t, std::array<double, 3>>{valid, local_window_bin, bin_center};
+};
+
+//------------------------------------------------------------------------------------------------//
+/*!
+ * \brief Lambda that locates the window bin holding a point for a window with a periodic second
+ *        (theta) coordinate.
+ *
+ * The first coordinate is binned linearly between window_min[0] and window_max[0]. The second
+ * coordinate is treated as an angle: when window_max[1] is not larger than window_min[1] the
+ * window is taken to wrap through the zero theta location, so the upper bound (and any location
+ * below window_max[1]) is shifted by 2*pi before binning and the resulting bin center is shifted
+ * back below 2*pi.
+ *
+ * \param[in] grid_bins number of bins in each dimension
+ * \param[in] location point to locate
+ * \param[in] window_min the smallest corner point of the window
+ * \param[in] window_max the largest corner point of the window
+ * \param[in] n_map_bins total number of window bins (only used for debug checking)
+ * \param[in] pi value of pi
+ * \return tuple of {point is inside the window, flattened window bin index, bin center}
+ */
+auto get_sphere_window_bin = [](const auto &grid_bins, const auto &location, const auto &window_min,
+                                const auto &window_max, const auto &Remember(n_map_bins),
+                                const auto pi) {
+  std::array<size_t, 3> bin_id{0, 0, 0};
+  std::array<double, 3> bin_center{0, 0, 0};
+
+  // first coordinate: plain linear binning
+  Check((window_max[0] - window_min[0]) > 0.0);
+  const auto n_first_bins = static_cast<double>(grid_bins[0]);
+  const double first_extent = window_max[0] - window_min[0];
+  const double first_bin_value = n_first_bins * (location[0] - window_min[0]) / first_extent;
+  bool valid = !(first_bin_value < 0.0 || first_bin_value > n_first_bins);
+  if (valid) {
+    // the min catches any values exactly on the edge of the top bin
+    bin_id[0] = std::min(grid_bins[0] - 1, static_cast<size_t>(first_bin_value));
+    bin_center[0] = window_min[0] + (static_cast<double>(bin_id[0]) / n_first_bins +
+                                     0.5 / n_first_bins) *
+                                        first_extent;
+  }
+
+  // second coordinate: only examined when the first coordinate is inside the window
+  if (valid) {
+    // catch the window that wraps around the zero theta location
+    const bool wraps = !((window_max[1] - window_min[1]) > 0.0);
+    double theta_location = location[1];
+    double theta_max = window_max[1];
+    if (wraps) {
+      if (location[1] < window_max[1])
+        theta_location = location[1] + 2 * pi;
+      theta_max = 2 * pi + window_max[1];
+    }
+    Check(!((theta_max - window_min[1]) < 0.0));
+    const auto n_theta_bins = static_cast<double>(grid_bins[1]);
+    const double theta_extent = theta_max - window_min[1];
+    const double theta_bin_value = n_theta_bins * (theta_location - window_min[1]) / theta_extent;
+    valid = !(theta_bin_value < 0.0 || theta_bin_value > n_theta_bins);
+    if (valid) {
+      // the min catches any values exactly on the edge of the top bin
+      bin_id[1] = std::min(grid_bins[1] - 1, static_cast<size_t>(theta_bin_value));
+      bin_center[1] = window_min[1] + (static_cast<double>(bin_id[1]) / n_theta_bins +
+                                       0.5 / n_theta_bins) *
+                                          theta_extent;
+      // bring a center that ended up at or beyond 2*pi back into the base period
+      if (!(bin_center[1] < 2.0 * pi))
+        bin_center[1] = bin_center[1] - 2.0 * pi;
+    }
+  }
+
+  // flatten the bin ids (first dimension varies fastest)
+  const size_t local_window_bin =
+      bin_id[0] + bin_id[1] * grid_bins[0] + bin_id[2] * grid_bins[0] * grid_bins[1];
+
+  Check(valid ? local_window_bin < n_map_bins : true);
+
+  return std::tuple<bool, size_t, std::array<double, 3>>{valid, local_window_bin, bin_center};
+};
+
+//------------------------------------------------------------------------------------------------//
+/*!
+ * \brief Lambda for mapping a single data point into its window bin.
+ *
+ * The first value to enter a bin always initializes it (value, count, and distance to the bin
+ * center). Later values are combined according to map_type:
+ *  - "max" / "min": keep the largest / smallest value,
+ *  - "ave": accumulate the value and the count (the caller divides by the count),
+ *  - "nearest": keep the value closest to the bin center; values at a soft-equal distance are
+ *    accumulated together with the count.
+ *
+ * The map_type is validated on every call so an invalid type is reported even if no bin ever
+ * receives a second value.
+ *
+ * \param[in,out] bias_cell_count incremented by one for every bin that receives its first value
+ * \param[in,out] data_count number of accumulated values per window bin
+ * \param[in,out] grid_data mapped value (or running sum) per window bin
+ * \param[in,out] min_distance distance from the bin center to the retained point per window bin
+ * \param[in] dim number of active dimensions
+ * \param[in] map_type mapping type (max, min, ave, or nearest)
+ * \param[in] data data values indexed by data_bin
+ * \param[in] bin_center center of the window bin
+ * \param[in] location location of the data point
+ * \param[in] local_window_bin flattened index of the window bin
+ * \param[in] data_bin index of the data point in data
+ */
+auto map_data = [](auto &bias_cell_count, auto &data_count, auto &grid_data, auto &min_distance,
+                   const auto &dim, const auto &map_type, const auto &data, const auto &bin_center,
+                   const auto &location, const auto &local_window_bin, const auto &data_bin) {
+  const bool use_max = map_type == "max";
+  const bool use_min = map_type == "min";
+  const bool use_ave = map_type == "ave";
+  const bool use_nearest = map_type == "nearest";
+  Insist(use_max || use_min || use_ave || use_nearest,
+         "Error: map_type=" + map_type + " is invalid. Must be max, min, ave, or nearest.");
+
+  // distance between the data location and the center of its bin over the active dimensions
+  auto distance_to_bin_center = [&]() {
+    double distance = 0.0;
+    for (size_t d = 0; d < dim; d++) {
+      distance += (location[d] - bin_center[d]) * (location[d] - bin_center[d]);
+    }
+    return sqrt(distance);
+  };
+
+  if (data_count[local_window_bin] == 0) {
+    // regardless of map type if it is the first value to enter the bin it
+    // gets set to that value
+    bias_cell_count += 1.0;
+    data_count[local_window_bin]++;
+    min_distance[local_window_bin] = distance_to_bin_center();
+    grid_data[local_window_bin] = data[data_bin];
+  } else if (use_max) {
+    if (data[data_bin] > grid_data[local_window_bin])
+      grid_data[local_window_bin] = data[data_bin];
+  } else if (use_min) {
+    if (data[data_bin] < grid_data[local_window_bin])
+      grid_data[local_window_bin] = data[data_bin];
+  } else if (use_ave) {
+    data_count[local_window_bin] += 1;
+    grid_data[local_window_bin] += data[data_bin];
+  } else {
+    // nearest
+    const double distance = distance_to_bin_center();
+    if (rtt_dsxx::soft_equiv(distance, min_distance[local_window_bin])) {
+      // equally close points are averaged together by the caller
+      data_count[local_window_bin] += 1;
+      grid_data[local_window_bin] += data[data_bin];
+    } else if (distance < min_distance[local_window_bin]) {
+      min_distance[local_window_bin] = distance;
+      data_count[local_window_bin] = 1;
+      grid_data[local_window_bin] = data[data_bin];
+    } // else exclude the far points.
+  }
+};
+
+//------------------------------------------------------------------------------------------------//
+/*!
+ * \brief Map data to a grid window for vector<double> data
+ * 
+ * Maps local+ghost data to a fixed mesh grid based on a specified weighting type. This data can
+ * additionally be normalized and positively biased on the grid.
+ * 
+ *
+ * \param[in] local_data the local data on the processor to be mapped to the window
+ * \param[in] ghost_data the ghost data on the processor to be mapped to the window
+ * \param[in,out] grid_data the resulting data map
+ * \param[in] window_min the smallest corner point for every dimension
+ * \param[in] window_max the largest corner point for every dimension
+ * \param[in] grid_bins number of equally spaced bins in each dir
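+ * \param[in] map_type_in string indicating the mapping (max, min, ave, or nearest); appending
+ * "_fill" (e.g. max_fill) also fills empty bins from the preceding bin and requires a 1D grid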
+ * \param[in] normalize bool operator to specify if the data should be normalized to a pdf
+ * \param[in] bias bool operator to specify if the data should be moved to the
+ * positive domain space
+ */
+void quick_index::map_data_to_grid_window(
+    const std::vector<double> &local_data, const std::vector<double> &ghost_data,
+    std::vector<double> &grid_data, const std::array<double, 3> &window_min,
+    const std::array<double, 3> &window_max, const std::array<size_t, 3> &grid_bins,
+    const std::string &map_type_in, const bool normalize, const bool bias) const {
+  Require(local_data.size() == n_locations);
+  Require(!(window_max[0] < window_min[0]));
+  Require(!(window_max[1] < window_min[1]));
+  Require(!(window_max[2] < window_min[2]));
+  Require(domain_decomposed ? ghost_data.size() == local_ghost_buffer_size : true);
+  Require(domain_decomposed
+              ? (fabs(window_max[0] - window_min[0]) - max_window_size) / max_window_size < 1e-6
+              : true);
+  Require(domain_decomposed
+              ? (fabs(window_max[1] - window_min[1]) - max_window_size) / max_window_size < 1e-6
+              : true);
+  Require(domain_decomposed
+              ? (fabs(window_max[2] - window_min[2]) - max_window_size) / max_window_size < 1e-6
+              : true);
+
+  bool fill = false;
+  std::string map_type = map_type_in;
+  if (map_type_in == "max_fill") {
+    Insist((grid_bins[0] > 1 && grid_bins[1] <= 1 && grid_bins[2] <= 1) ||
+               (grid_bins[1] > 1 && grid_bins[0] <= 1 && grid_bins[2] <= 1) ||
+               (grid_bins[2] > 1 && grid_bins[0] <= 1 && grid_bins[1] <= 1),
+           "one of grid bins must be == 1, Grid must be 1D to use max_fill option");
+    fill = true;
+    map_type = "max";
+  } else if (map_type_in == "min_fill") {
+    Insist((grid_bins[0] > 1 && grid_bins[1] <= 1 && grid_bins[2] <= 1) ||
+               (grid_bins[1] > 1 && grid_bins[0] <= 1 && grid_bins[2] <= 1) ||
+               (grid_bins[2] > 1 && grid_bins[0] <= 1 && grid_bins[1] <= 1),
+           "one of grid bins must be == 1, Grid must be 1D to use min_fill option");
+    fill = true;
+    map_type = "min";
+  } else if (map_type_in == "ave_fill") {
+    Insist((grid_bins[0] > 1 && grid_bins[1] <= 1 && grid_bins[2] <= 1) ||
+               (grid_bins[1] > 1 && grid_bins[0] <= 1 && grid_bins[2] <= 1) ||
+               (grid_bins[2] > 1 && grid_bins[0] <= 1 && grid_bins[1] <= 1),
+           "one of grid bins must be == 1, Grid must be 1D to use ave_fill option");
+    fill = true;
+    map_type = "ave";
+  } else if (map_type_in == "nearest_fill") {
+    Insist((grid_bins[0] > 1 && grid_bins[1] <= 1 && grid_bins[2] <= 1) ||
+               (grid_bins[1] > 1 && grid_bins[0] <= 1 && grid_bins[2] <= 1) ||
+               (grid_bins[2] > 1 && grid_bins[0] <= 1 && grid_bins[1] <= 1),
+           "one of grid bins must be == 1, Grid must be 1D to use ave_fill option");
+    fill = true;
+    map_type = "nearest";
+  }
+
+  for (size_t d = 0; d < dim; d++)
+    Insist(grid_bins[d] > 0, "Bin size must be greater then zero for each active dimension");
+
+  size_t n_map_bins = 1;
+  for (size_t d = 0; d < dim; d++)
+    n_map_bins *= grid_bins[d];
+
+  Insist(grid_data.size() == n_map_bins,
+         "grid_data must match the flatten grid_bin size for the active dimensions (in 3d "
+         "grid_data.size()==grib_bins[0]*grid_bins[1]*grid_bins[2])");
+
+  std::fill(grid_data.begin(), grid_data.end(), 0.0);
+
+  // Grab the global bins that lie in this window
+  std::vector<size_t> global_bins = window_coarse_index_list(window_min, window_max);
+
+  std::vector<int> data_count(n_map_bins, 0);
+  std::vector<double> min_distance(n_map_bins, 0);
+  double bias_cell_count = 0.0;
+  // Loop over all possible bins
+  for (auto &cb : global_bins) {
+    // skip bins that aren't present in the map (can't use [] operator with constness)
+    // loop over the local data
+    auto mapItr = coarse_index_map.find(cb);
+    if (mapItr != coarse_index_map.end()) {
+      for (auto &l : mapItr->second) {
+        bool valid;
+        size_t local_window_bin;
+        std::array<double, 3> bin_center;
+        std::tie(valid, local_window_bin, bin_center) =
+            get_window_bin(dim, grid_bins, locations[l], window_min, window_max, n_map_bins);
+
+        // If the bin is outside the window continue to the next poin
+        if (!valid)
+          continue;
+
+        // lambda for mapping the data
+        map_data(bias_cell_count, data_count, grid_data, min_distance, dim, map_type, local_data,
+                 bin_center, locations[l], local_window_bin, l);
+
+      } // end local point loop
+    }   // if valid local bin loop
+    if (domain_decomposed) {
+      // loop over the ghost data
+      auto gmapItr = local_ghost_index_map.find(cb);
+      if (gmapItr != local_ghost_index_map.end()) {
+        // loop over ghost data
+        for (auto &g : gmapItr->second) {
+          bool valid;
+          size_t local_window_bin;
+          std::array<double, 3> bin_center;
+          std::tie(valid, local_window_bin, bin_center) = get_window_bin(
+              dim, grid_bins, local_ghost_locations[g], window_min, window_max, n_map_bins);
+
+          // If the bin is outside the window continue to the next poin
+          if (!valid)
+            continue;
+
+          // lambda for mapping the data
+          map_data(bias_cell_count, data_count, grid_data, min_distance, dim, map_type, ghost_data,
+                   bin_center, local_ghost_locations[g], local_window_bin, g);
+        } // end ghost point loop
+      }   // if valid ghost bin
+    }     // if dd
+  }       // end coarse bin loop
+
+  if (map_type == "ave" || map_type == "nearest") {
+    for (size_t i = 0; i < n_map_bins; i++) {
+      if (data_count[i] > 0) {
+        grid_data[i] /= data_count[i];
+      }
+    }
+  }
+  if (fill) {
+    double last_val = 0.0;
+    int last_data_count = 0;
+    for (size_t i = 0; i < n_map_bins; i++) {
+      if (data_count[i] > 0) {
+        last_val = grid_data[i];
+        last_data_count = data_count[i];
+      } else {
+        grid_data[i] = last_val;
+        data_count[i] = last_data_count;
+      }
+    }
+  }
+
+  if (bias && normalize) {
+    // return a positive normalized distribution
+    const double bias_value =
+        fabs(std::min(0.0, *std::min_element(grid_data.begin(), grid_data.end())));
+    const double sum =
+        std::accumulate(grid_data.begin(), grid_data.end(), 0.0) + bias_value * bias_cell_count;
+    // catch zero instance
+    const double scale = !rtt_dsxx::soft_equiv(sum, 0.0) ? 1.0 / sum : 1.0;
+    for (size_t i = 0; i < n_map_bins; i++)
+      grid_data[i] = (grid_data[i] + bias_value) * scale;
+  } else if (bias) {
+    // return a positive distribution
+    const double bias_value =
+        fabs(std::min(0.0, *std::min_element(grid_data.begin(), grid_data.end())));
+    for (size_t i = 0; i < n_map_bins; i++)
+      grid_data[i] += bias_value;
+  } else if (normalize) {
+    // return a normalized distribution
+    const double sum = std::accumulate(grid_data.begin(), grid_data.end(), 0.0);
+    // catch zero instance
+    const double scale = !rtt_dsxx::soft_equiv(sum, 0.0) ? 1.0 / sum : 1.0;
+    for (size_t i = 0; i < n_map_bins; i++)
+      grid_data[i] *= scale;
+  }
+}
+
+//------------------------------------------------------------------------------------------------//
+// Lambda for mapping the vector data
+//
+// Folds the point stored at data[v][data_bin] (for every v < vsize) into bin local_window_bin of
+// grid_data[v]. data_count, min_distance and bias_cell_count are shared by all vsize vectors.
+//  - The first point to reach a bin always sets the bin value, bumps data_count and
+//    bias_cell_count, and records its distance to bin_center in min_distance.
+//  - "max" / "min" afterwards keep the running extreme of each vector (data_count is unchanged).
+//  - "ave" accumulates a sum and a count; dividing by data_count is left to the caller.
+//  - "nearest" keeps only the point(s) closest to bin_center; points whose distance is soft_equiv
+//    to the current minimum are summed and counted, a strictly closer point restarts the bin.
+// Any other map_type is rejected with an Insist failure.
+auto map_vector_data = [](auto &bias_cell_count, auto &data_count, auto &grid_data,
+                          auto &min_distance, const auto &dim, const auto &map_type,
+                          const auto &data, const auto &bin_center, const auto &location,
+                          const auto &local_window_bin, const auto &data_bin, const auto &vsize) {
+  // distance between the point and the bin center over the active dimensions
+  const auto distance_to_center = [&]() {
+    double distance = 0.0;
+    for (size_t d = 0; d < dim; d++)
+      distance += (location[d] - bin_center[d]) * (location[d] - bin_center[d]);
+    return sqrt(distance);
+  };
+
+  if (data_count[local_window_bin] == 0) {
+    // regardless of map type if it is the first value to enter the bin it gets set to that value
+    bias_cell_count += 1.0;
+    data_count[local_window_bin]++;
+    min_distance[local_window_bin] = distance_to_center();
+    for (size_t v = 0; v < vsize; v++)
+      grid_data[v][local_window_bin] = data[v][data_bin];
+  } else if (map_type == "max") {
+    for (size_t v = 0; v < vsize; v++)
+      if (data[v][data_bin] > grid_data[v][local_window_bin])
+        grid_data[v][local_window_bin] = data[v][data_bin];
+  } else if (map_type == "min") {
+    for (size_t v = 0; v < vsize; v++)
+      if (data[v][data_bin] < grid_data[v][local_window_bin])
+        grid_data[v][local_window_bin] = data[v][data_bin];
+  } else if (map_type == "ave") {
+    data_count[local_window_bin] += 1;
+    for (size_t v = 0; v < vsize; v++)
+      grid_data[v][local_window_bin] += data[v][data_bin];
+  } else if (map_type == "nearest") {
+    const double distance = distance_to_center();
+    if (rtt_dsxx::soft_equiv(distance, min_distance[local_window_bin])) {
+      // equidistant point: accumulate so the caller can average the ties
+      data_count[local_window_bin] += 1;
+      for (size_t v = 0; v < vsize; v++)
+        grid_data[v][local_window_bin] += data[v][data_bin];
+    } else if (distance < min_distance[local_window_bin]) {
+      // strictly closer point: discard what was accumulated so far
+      min_distance[local_window_bin] = distance;
+      data_count[local_window_bin] = 1;
+      for (size_t v = 0; v < vsize; v++)
+        grid_data[v][local_window_bin] = data[v][data_bin];
+    } // else exclude the far points.
+  } else {
+    Insist(false,
+           "Error: map_type=" + map_type + " is invalid. Must be max, min, ave, or nearest.");
+  }
+};
+
+//------------------------------------------------------------------------------------------------//
+/*!
+ * \brief Map data to a grid window for vector<vector<double>>
+ *
+ * Maps multiple local+ghost data vectors to a fixed mesh grid based on a specified weighting type.
+ * This data can additionally be normalized and positively biased on the grid.
+ *
+ * Every grid_data[v] is zeroed before the mapping starts. Ghost data is only visited when the
+ * index is domain decomposed. A "_fill" map type first applies the base mapping and then copies
+ * the value of the last populated bin (in flattened bin order) into each empty bin that follows
+ * it; empty bins ahead of the first populated bin keep their zero value.
+ *
+ * \param[in] local_data the local data on the processor to be mapped to the window
+ * \param[in] ghost_data the ghost data on the processor to be mapped to the window
+ * \param[in,out] grid_data the resulting data map; needs at least one vector per entry of
+ * local_data, each sized to the product of grid_bins over the active dimensions
+ * \param[in] window_min the smallest corner point for every dimension
+ * \param[in] window_max the largest corner point for every dimension
+ * \param[in] grid_bins number of equally spaced bins in each dir
+ * \param[in] map_type_in string indicating the mapping (max, min, ave, nearest, or one of these
+ * followed by "_fill"; the fill variants require a grid with a single multi-bin direction)
+ * \param[in] normalize bool operator to specify if the data should be normalized to a pdf
+ * (independent of each data vector)
+ * \param[in] bias bool operator to specify if the data should be moved to the positive domain space
+ * (independent of each data vector)
+ */
+void quick_index::map_data_to_grid_window(const std::vector<std::vector<double>> &local_data,
+                                          const std::vector<std::vector<double>> &ghost_data,
+                                          std::vector<std::vector<double>> &grid_data,
+                                          const std::array<double, 3> &window_min,
+                                          const std::array<double, 3> &window_max,
+                                          const std::array<size_t, 3> &grid_bins,
+                                          const std::string &map_type_in, const bool normalize,
+                                          const bool bias) const {
+  Require(domain_decomposed ? local_data.size() == ghost_data.size() : true);
+  Require(!(window_max[0] < window_min[0]));
+  Require(!(window_max[1] < window_min[1]));
+  Require(!(window_max[2] < window_min[2]));
+  Require(domain_decomposed
+              ? (fabs(window_max[0] - window_min[0]) - max_window_size) / max_window_size < 1e-6
+              : true);
+  Require(domain_decomposed
+              ? (fabs(window_max[1] - window_min[1]) - max_window_size) / max_window_size < 1e-6
+              : true);
+  Require(domain_decomposed
+              ? (fabs(window_max[2] - window_min[2]) - max_window_size) / max_window_size < 1e-6
+              : true);
+
+  // A "<type>_fill" request runs the base mapping and then fills the empty bins; it is only
+  // accepted when exactly one direction has more than one bin.
+  const bool single_axis_grid = (grid_bins[0] > 1 && grid_bins[1] <= 1 && grid_bins[2] <= 1) ||
+                                (grid_bins[1] > 1 && grid_bins[0] <= 1 && grid_bins[2] <= 1) ||
+                                (grid_bins[2] > 1 && grid_bins[0] <= 1 && grid_bins[1] <= 1);
+  const std::string fill_suffix("_fill");
+  bool fill = false;
+  std::string map_type = map_type_in;
+  if (map_type_in == "max_fill" || map_type_in == "min_fill" || map_type_in == "ave_fill" ||
+      map_type_in == "nearest_fill") {
+    // the message names the option that was actually requested
+    Insist(single_axis_grid,
+           "one of grid bins must be == 1, Grid must be 1D to use " + map_type_in + " option");
+    fill = true;
+    // drop the suffix to recover the base mapping name
+    map_type = map_type_in.substr(0, map_type_in.size() - fill_suffix.size());
+  }
+
+  for (size_t d = 0; d < dim; d++)
+    Insist(grid_bins[d] > 0, "Bin size must be greater then zero for each active dimension");
+
+  const size_t vsize = local_data.size();
+  // grid_data[v] is indexed for every v < vsize below, so it must be at least that long
+  Insist(grid_data.size() >= vsize,
+         "grid_data must provide one grid vector for each local_data vector");
+
+  // Grab the global bins that lie in this window
+  std::vector<size_t> global_bins = window_coarse_index_list(window_min, window_max);
+  size_t n_map_bins = 1;
+  for (size_t d = 0; d < dim; d++)
+    n_map_bins *= grid_bins[d];
+
+  for (size_t v = 0; v < vsize; v++) {
+    Insist(grid_data[v].size() == n_map_bins,
+           "grid_data[" + std::to_string(v) +
+               "] must match the flatten grid_bin size for the active dimensions (in 3d "
+               "grid_data.size()==grib_bins[0]*grid_bins[1]*grid_bins[2])");
+    std::fill(grid_data[v].begin(), grid_data[v].end(), 0.0);
+  }
+
+  // per-bin bookkeeping shared by all data vectors
+  std::vector<int> data_count(n_map_bins, 0);
+  std::vector<double> min_distance(n_map_bins, 0);
+  double bias_cell_count = 0.0;
+  // Loop over all possible bins
+  for (const auto &cb : global_bins) {
+    // skip bins that aren't present in the map (can't use [] operator with constness)
+    // loop over the local data
+    auto mapItr = coarse_index_map.find(cb);
+    if (mapItr != coarse_index_map.end()) {
+      for (const auto &l : mapItr->second) {
+        bool valid;
+        size_t local_window_bin;
+        std::array<double, 3> bin_center;
+        std::tie(valid, local_window_bin, bin_center) =
+            get_window_bin(dim, grid_bins, locations[l], window_min, window_max, n_map_bins);
+        // If the bin is outside the window continue to the next point
+        if (!valid)
+          continue;
+        Check(local_window_bin < n_map_bins);
+        map_vector_data(bias_cell_count, data_count, grid_data, min_distance, dim, map_type,
+                        local_data, bin_center, locations[l], local_window_bin, l, vsize);
+      } // end local point loop
+    }   // if valid local bin loop
+    if (domain_decomposed) {
+      // loop over the ghost data
+      auto gmapItr = local_ghost_index_map.find(cb);
+      if (gmapItr != local_ghost_index_map.end()) {
+        for (const auto &g : gmapItr->second) {
+          bool valid;
+          size_t local_window_bin;
+          std::array<double, 3> bin_center;
+          std::tie(valid, local_window_bin, bin_center) = get_window_bin(
+              dim, grid_bins, local_ghost_locations[g], window_min, window_max, n_map_bins);
+          // If the bin is outside the window continue to the next point
+          if (!valid)
+            continue;
+          Check(local_window_bin < n_map_bins);
+          map_vector_data(bias_cell_count, data_count, grid_data, min_distance, dim, map_type,
+                          ghost_data, bin_center, local_ghost_locations[g], local_window_bin, g,
+                          vsize);
+        } // end ghost point loop
+      }   // if valid ghost bin
+    }     // if dd
+  }       // end coarse bin loop
+
+  // "ave" and "nearest" accumulate sums during the mapping; turn them into means
+  if (map_type == "ave" || map_type == "nearest") {
+    for (size_t i = 0; i < n_map_bins; i++) {
+      if (data_count[i] > 0) {
+        for (size_t v = 0; v < vsize; v++)
+          grid_data[v][i] /= data_count[i];
+      }
+    }
+  }
+
+  // forward-fill empty bins with the value (and count) of the last populated bin
+  if (fill) {
+    std::vector<double> last_val(vsize, 0.0);
+    int last_data_count = 0;
+    for (size_t i = 0; i < n_map_bins; i++) {
+      if (data_count[i] > 0) {
+        for (size_t v = 0; v < vsize; v++)
+          last_val[v] = grid_data[v][i];
+        last_data_count = data_count[i];
+      } else {
+        for (size_t v = 0; v < vsize; v++)
+          grid_data[v][i] = last_val[v];
+        data_count[i] = last_data_count;
+      }
+    }
+  }
+
+  // NOTE(review): bias_cell_count only counts bins that received data during the mapping, while
+  // the data_count > 0 tests below also admit bins populated by the fill step; with fill, bias and
+  // normalize combined the scaled values may therefore not sum to one -- confirm this is intended.
+  if (bias && normalize) {
+    // return a positive normalized distribution
+    for (size_t v = 0; v < vsize; v++) {
+      const double bias_value =
+          fabs(std::min(0.0, *std::min_element(grid_data[v].begin(), grid_data[v].end())));
+      const double sum = std::accumulate(grid_data[v].begin(), grid_data[v].end(), 0.0) +
+                         bias_value * bias_cell_count;
+      // catch zero instance
+      const double scale = !rtt_dsxx::soft_equiv(sum, 0.0) ? 1.0 / sum : 1.0;
+      for (size_t i = 0; i < n_map_bins; i++)
+        if (data_count[i] > 0)
+          grid_data[v][i] = (grid_data[v][i] + bias_value) * scale;
+    }
+  } else if (bias) {
+    // return a positive distribution
+    for (size_t v = 0; v < vsize; v++) {
+      const double bias_value =
+          fabs(std::min(0.0, *std::min_element(grid_data[v].begin(), grid_data[v].end())));
+      for (size_t i = 0; i < n_map_bins; i++)
+        if (data_count[i] > 0)
+          grid_data[v][i] += bias_value;
+    }
+  } else if (normalize) {
+    // return a normalized distribution
+    for (size_t v = 0; v < vsize; v++) {
+      const double sum = std::accumulate(grid_data[v].begin(), grid_data[v].end(), 0.0);
+      // catch zero instance
+      const double scale = !rtt_dsxx::soft_equiv(sum, 0.0) ? 1.0 / sum : 1.0;
+      for (size_t i = 0; i < n_map_bins; i++)
+        if (data_count[i] > 0)
+          grid_data[v][i] *= scale;
+    }
+  }
+}
+
+//------------------------------------------------------------------------------------------------//
+/*!
+ * \brief Convert a Cartesian position into (r, theta, 0) relative to a sphere center
+ *
+ * Only the first two components of both inputs are used. r is the in-plane distance between the
+ * two points. theta is measured from the positive second-coordinate axis: it is acos(dy / r) when
+ * location[0] >= sphere_center[0] and 2*pi - acos(dy / r) otherwise, so it lies in [0, 2*pi]. When
+ * the two points coincide (r == 0) the cosine is taken as zero, which yields theta = pi/2.
+ *
+ * \param[in] sphere_center center of sphere in (x,y,z) or (r,z) coordinates
+ * \param[in] location (x,y,z) or (r,z) location to transform to relative (r, theta, phi) space.
+ *
+ * \return {r, theta, 0.0} of location relative to sphere_center
+ */
+std::array<double, 3> quick_index::transform_r_theta(const std::array<double, 3> &sphere_center,
+                                                     const std::array<double, 3> &location) const {
+  Insist(dim == 2, "Transform_r_theta Only implemented in 2d");
+  // in-plane offset of the point from the center
+  const double dx = location[0] - sphere_center[0];
+  const double dy = location[1] - sphere_center[1];
+  const double radius = sqrt(dx * dx + dy * dy);
+
+  // clamp the ratio so round-off cannot push it outside the domain of acos
+  double cos_theta = 0.0;
+  if (radius > 0.0)
+    cos_theta = std::max(std::min(dy / radius, 1.0), -1.0);
+
+  // points left of the center are mirrored into the upper half of the angular range
+  const double half_plane_angle = acos(cos_theta);
+  const double theta =
+      location[0] < sphere_center[0] ? 2.0 * pi - half_plane_angle : half_plane_angle;
+  return std::array<double, 3>{radius, theta, 0.0};
+}
+
+//------------------------------------------------------------------------------------------------//
+/*!
+ * \brief Calculate the bounding box of a wedge
+ *
+ * Calculates the (x,y,0.0) min and max bounds for a pre-defined wedge [origin (x,y,z),
+ * wedge_xyz_center (x,y,z), wedge_dr_dtheta (dr, dtheta, 0.0)]. This computes the bounding box for
+ * the truncated wedge (bounded by rmin and rmax).
+ *
+ *                                  win_max
+ *               ---------------(xmax,ymax,0.0)
+ *               |       x      | __
+ *               |      /|\     |
+ *               |     / | \    | dr
+ *               |    /  *  \   | --
+ *               |   /   |   \  | dr
+ *     win_min   |  / dt | dt \ |__
+ * (xmin,ymin,0.0)---------------
+ *
+ * '*' is the geometric center (not the centroid)
+ * 'x' is the wedge_origin (or center of the spherical grid)
+ *
+ * Method:
+ *  - (r, theta) of the wedge center comes from transform_r_theta(wedge_origin, wedge_xyz_center).
+ *  - The radial range is [max(0, r - dr), r + dr] and the angular range is theta -/+ dtheta. The
+ *    lower angle is wrapped by adding 2*pi when dtheta >= theta; the upper angle is not wrapped.
+ *  - y bounds are formed as wedge_origin[1] + radius * cos(angle). x bounds are formed as
+ *    wedge_origin[0] +/- radius * sqrt(1 - cos(angle)^2), positive when the angle is below pi.
+ *  - When the angular range contains pi the lower y bound is wedge_origin[1] - rmax; when it
+ *    passes through 0/2*pi the upper y bound is wedge_origin[1] + rmax. Likewise the lower x bound
+ *    is wedge_origin[0] - rmax when the range contains 3*pi/2 and the upper x bound is
+ *    wedge_origin[0] + rmax when it contains pi/2.
+ *
+ * \param[in] wedge_xyz_center geometric center of the wedge in (x,y,z) or (r,z) coordinates
+ * \param[in] wedge_origin axisymmetric origin (x,y,z) or (r,z) of the wedge.
+ * \param[in] wedge_dr_dtheta differential size of the wedge in each dimension, applied on both
+ * sides of the center; requires dr > 0 and 0 < dtheta < pi/2.
+ * \param[in,out] win_min lower corner of the bounding box; only components [0] and [1] are
+ * written, component [2] is left as supplied by the caller.
+ * \param[in,out] win_max upper corner of the bounding box; only components [0] and [1] are
+ * written, component [2] is left as supplied by the caller.
+ *
+ */
+void quick_index::calc_wedge_xy_bounds(const std::array<double, 3> &wedge_xyz_center,
+                                       const std::array<double, 3> &wedge_origin,
+                                       const std::array<double, 3> &wedge_dr_dtheta,
+                                       std::array<double, 3> &win_min,
+                                       std::array<double, 3> &win_max) const {
+  Require(wedge_dr_dtheta[0] > 0.0);
+  Require(wedge_dr_dtheta[1] > 0.0);
+  // Some of the quadrant tests below might not hold for large theta angles, so limit dtheta
+  Require(wedge_dr_dtheta[1] < pi / 2.0);
+  const auto r_theta = transform_r_theta(wedge_origin, wedge_xyz_center);
+  const double rmin = std::max(0.0, r_theta[0] - wedge_dr_dtheta[0]); // inner radius, clipped at 0
+  const double rmax = r_theta[0] + wedge_dr_dtheta[0];
+  const double dtheta = wedge_dr_dtheta[1];
+  const double theta_min =
+      dtheta < r_theta[1] ? r_theta[1] - dtheta : 2.0 * pi + r_theta[1] - dtheta;
+  const double theta_max = dtheta + r_theta[1]; // not wrapped, may exceed 2*pi
+  const double cos_theta = cos(r_theta[1]);
+  const double cos_theta_y_min = r_theta[1] < pi ? cos(theta_max) : cos(theta_min);
+  const double cos_theta_y_max = r_theta[1] < pi ? cos(theta_min) : cos(theta_max);
+  const double ymin = theta_max > pi && theta_min < pi
+                          ? wedge_origin[1] - rmax
+                          : cos_theta_y_min < 0.0 ? wedge_origin[1] + rmax * cos_theta_y_min
+                                                  : wedge_origin[1] + rmin * cos_theta_y_min;
+  const double ymax = theta_max > 2.0 * pi || theta_min > theta_max
+                          ? wedge_origin[1] + rmax
+                          : cos_theta_y_max < 0.0 ? wedge_origin[1] + rmin * cos_theta_y_max
+                                                  : wedge_origin[1] + rmax * cos_theta_y_max;
+  const double xmin_theta = cos_theta < 0 ? theta_max : theta_min;
+  const double xmin_r = xmin_theta < pi ? rmin : rmax;
+  const double xmax_theta = cos_theta < 0 ? theta_min : theta_max;
+  const double xmax_r = xmax_theta < pi ? rmax : rmin;
+  const double sign_min = xmin_theta < pi ? 1.0 : -1.0;
+  const double sign_max = xmax_theta < pi ? 1.0 : -1.0;
+  const double xmin =
+      theta_max > 3. / 2. * pi && theta_min < 3. / 2. * pi
+          ? wedge_origin[0] - rmax
+          : wedge_origin[0] +
+                sign_min * sqrt(xmin_r * xmin_r * (1.0 - cos(xmin_theta) * cos(xmin_theta)));
+
+  const double xmax =
+      theta_max > pi / 2. && theta_min < pi / 2.0
+          ? wedge_origin[0] + rmax
+          : wedge_origin[0] +
+                sign_max * sqrt(xmax_r * xmax_r * (1.0 - cos(xmax_theta) * cos(xmax_theta)));
+  win_min[0] = xmin;
+  win_min[1] = ymin;
+  win_max[0] = xmax;
+  win_max[1] = ymax;
+  Ensure(!(win_min[0] > win_max[0]));
+  Ensure(!(win_min[1] > win_max[1]));
+  return;
+}
+
+//------------------------------------------------------------------------------------------------//
+/*!
+ * \brief Map data to sphere grid window for vector<double> data
+ *
+ * Maps local+ghost data to a fixed r-theta mesh grid based on a specified weighting type. This data
+ * can additionally be normalized and positively biased on the grid.
+ *
+ * The wedge spans wedge_dr_dtheta on both sides of wedge_window_center in r and in theta; its
+ * Cartesian bounding box (from calc_wedge_xy_bounds) selects the coarse bins that are searched.
+ * grid_data is zeroed before the mapping starts. Ghost data is only visited when the index is
+ * domain decomposed. A "_fill" map type first applies the base mapping and then copies the value
+ * of the last populated bin (in flattened bin order) into each empty bin that follows it; empty
+ * bins ahead of the first populated bin keep their zero value.
+ *
+ * \param[in] local_data the local data on the processor to be mapped to the window
+ * \param[in] ghost_data the ghost data on the processor to be mapped to the window
+ * \param[in,out] grid_data the resulting data map; its size must equal the product of grid_bins
+ * over the active dimensions
+ * \param[in] sphere_center the center location of the sphere mesh
+ * \param[in] wedge_window_center the geometric center (x,y,z) of the wedge window
+ * \param[in] wedge_dr_dtheta the differential size in each direction (dr, dtheta, 0.0) used to form
+ * the wedge
+ * \param[in] grid_bins number of equally spaced bins in each dir
+ * \param[in] map_type_in string indicating the mapping (max, min, ave, nearest, or one of these
+ * followed by "_fill"; the fill variants require a grid with a single multi-bin direction)
+ * \param[in] normalize bool operator to specify if the data should be normalized to a pdf
+ * \param[in] bias bool operator to specify if the data should be moved to the positive domain space
+ */
+void quick_index::map_data_to_sphere_grid_window(
+    const std::vector<double> &local_data, const std::vector<double> &ghost_data,
+    std::vector<double> &grid_data, const std::array<double, 3> &sphere_center,
+    const std::array<double, 3> &wedge_window_center, const std::array<double, 3> &wedge_dr_dtheta,
+    const std::array<size_t, 3> &grid_bins, const std::string &map_type_in, const bool normalize,
+    const bool bias) const {
+  Insist(dim > 1, "Sphere grid window is invalid in 1d geometry");
+  const auto r_theta = transform_r_theta(sphere_center, wedge_window_center);
+  // Store some r-theta values; the lower angle is wrapped by 2*pi when it would not stay positive
+  const std::array<double, 3> r_theta_phi_max{r_theta[0] + wedge_dr_dtheta[0],
+                                              r_theta[1] + wedge_dr_dtheta[1], 0.0};
+  const std::array<double, 3> r_theta_phi_min{std::max(r_theta[0] - wedge_dr_dtheta[0], 0.0),
+                                              wedge_dr_dtheta[1] < r_theta[1]
+                                                  ? r_theta[1] - wedge_dr_dtheta[1]
+                                                  : r_theta[1] - wedge_dr_dtheta[1] + 2. * pi,
+                                              0.0};
+  Check(!(r_theta_phi_min[1] > 2. * pi));
+  // setup the xy window_max_min (the third component stays zero)
+  std::array<double, 3> window_max{0.0, 0.0, 0.0};
+  std::array<double, 3> window_min{0.0, 0.0, 0.0};
+  calc_wedge_xy_bounds(wedge_window_center, sphere_center, wedge_dr_dtheta, window_min, window_max);
+
+  Require(local_data.size() == n_locations);
+  Require(!(window_max[0] < window_min[0]));
+  Require(!(window_max[1] < window_min[1]));
+  Require(!(window_max[2] < window_min[2]));
+  Require(domain_decomposed ? ghost_data.size() == local_ghost_buffer_size : true);
+  Require(domain_decomposed
+              ? (fabs(window_max[0] - window_min[0]) - max_window_size) / max_window_size < 1e-6
+              : true);
+  Require(domain_decomposed
+              ? (fabs(window_max[1] - window_min[1]) - max_window_size) / max_window_size < 1e-6
+              : true);
+  Require(domain_decomposed
+              ? (fabs(window_max[2] - window_min[2]) - max_window_size) / max_window_size < 1e-6
+              : true);
+
+  // A "<type>_fill" request runs the base mapping and then fills the empty bins; it is only
+  // accepted when exactly one direction has more than one bin.
+  const bool single_axis_grid = (grid_bins[0] > 1 && grid_bins[1] <= 1 && grid_bins[2] <= 1) ||
+                                (grid_bins[1] > 1 && grid_bins[0] <= 1 && grid_bins[2] <= 1) ||
+                                (grid_bins[2] > 1 && grid_bins[0] <= 1 && grid_bins[1] <= 1);
+  const std::string fill_suffix("_fill");
+  bool fill = false;
+  std::string map_type = map_type_in;
+  if (map_type_in == "max_fill" || map_type_in == "min_fill" || map_type_in == "ave_fill" ||
+      map_type_in == "nearest_fill") {
+    // the message names the option that was actually requested
+    Insist(single_axis_grid,
+           "one of grid bins must be == 1, Grid must be 1D to use " + map_type_in + " option");
+    fill = true;
+    // drop the suffix to recover the base mapping name
+    map_type = map_type_in.substr(0, map_type_in.size() - fill_suffix.size());
+  }
+
+  for (size_t d = 0; d < dim; d++)
+    Insist(grid_bins[d] > 0, "Bin size must be greater then zero for each active dimension");
+
+  size_t n_map_bins = 1;
+  for (size_t d = 0; d < dim; d++)
+    n_map_bins *= grid_bins[d];
+
+  Insist(grid_data.size() == n_map_bins,
+         "grid_data must match the flatten grid_bin size for the active dimensions (in 3d "
+         "grid_data.size()==grib_bins[0]*grid_bins[1]*grid_bins[2])");
+
+  std::fill(grid_data.begin(), grid_data.end(), 0.0);
+
+  // Grab the global bins that lie in this window
+  std::vector<size_t> global_bins = window_coarse_index_list(window_min, window_max);
+
+  // per-bin bookkeeping used by map_data
+  std::vector<int> data_count(n_map_bins, 0);
+  std::vector<double> min_distance(n_map_bins, 0);
+  double bias_cell_count = 0.0;
+  // NOTE(review): bin_center is produced by get_sphere_window_bin from the (r, theta) transform of
+  // each point, while map_data is handed the untransformed location; if bin_center is expressed in
+  // (r, theta) the "nearest" distance would mix coordinate systems -- confirm.
+  // Loop over all possible bins
+  for (const auto &cb : global_bins) {
+    // skip bins that aren't present in the map (can't use [] operator with constness)
+    // loop over the local data
+    auto mapItr = coarse_index_map.find(cb);
+    if (mapItr != coarse_index_map.end()) {
+      for (const auto &l : mapItr->second) {
+        bool valid;
+        size_t local_window_bin;
+        std::array<double, 3> bin_center;
+        std::tie(valid, local_window_bin, bin_center) =
+            get_sphere_window_bin(grid_bins, transform_r_theta(sphere_center, locations[l]),
+                                  r_theta_phi_min, r_theta_phi_max, n_map_bins, pi);
+
+        // If the bin is outside the window continue to the next point
+        if (!valid)
+          continue;
+
+        // lambda for mapping the data
+        map_data(bias_cell_count, data_count, grid_data, min_distance, dim, map_type, local_data,
+                 bin_center, locations[l], local_window_bin, l);
+
+      } // end local point loop
+    }   // if valid local bin loop
+    if (domain_decomposed) {
+      // loop over the ghost data
+      auto gmapItr = local_ghost_index_map.find(cb);
+      if (gmapItr != local_ghost_index_map.end()) {
+        for (const auto &g : gmapItr->second) {
+          bool valid;
+          size_t local_window_bin;
+          std::array<double, 3> bin_center;
+          std::tie(valid, local_window_bin, bin_center) = get_sphere_window_bin(
+              grid_bins, transform_r_theta(sphere_center, local_ghost_locations[g]),
+              r_theta_phi_min, r_theta_phi_max, n_map_bins, pi);
+
+          // If the bin is outside the window continue to the next point
+          if (!valid)
+            continue;
+
+          // lambda for mapping the data
+          map_data(bias_cell_count, data_count, grid_data, min_distance, dim, map_type, ghost_data,
+                   bin_center, local_ghost_locations[g], local_window_bin, g);
+        } // end ghost point loop
+      }   // if valid ghost bin
+    }     // if dd
+  }       // end coarse bin loop
+
+  // "ave" and "nearest" bins are divided by their point count after the mapping
+  if (map_type == "ave" || map_type == "nearest") {
+    for (size_t i = 0; i < n_map_bins; i++) {
+      if (data_count[i] > 0) {
+        grid_data[i] /= data_count[i];
+      }
+    }
+  }
+
+  // forward-fill empty bins with the value (and count) of the last populated bin
+  if (fill) {
+    double last_val = 0.0;
+    int last_data_count = 0;
+    for (size_t i = 0; i < n_map_bins; i++) {
+      if (data_count[i] > 0) {
+        last_val = grid_data[i];
+        last_data_count = data_count[i];
+      } else {
+        grid_data[i] = last_val;
+        data_count[i] = last_data_count;
+      }
+    }
+  }
+
+  // NOTE(review): the bias is added to every bin, but the normalization sum only accounts for
+  // bias_cell_count bins (those that received data during the mapping); with empty or filled bins
+  // the biased+normalized result may therefore not sum to one -- confirm this is intended.
+  if (bias && normalize) {
+    // return a positive normalized distribution
+    const double bias_value =
+        fabs(std::min(0.0, *std::min_element(grid_data.begin(), grid_data.end())));
+    const double sum =
+        std::accumulate(grid_data.begin(), grid_data.end(), 0.0) + bias_value * bias_cell_count;
+    // catch zero instance
+    const double scale = !rtt_dsxx::soft_equiv(sum, 0.0) ? 1.0 / sum : 1.0;
+    for (size_t i = 0; i < n_map_bins; i++)
+      grid_data[i] = (grid_data[i] + bias_value) * scale;
+  } else if (bias) {
+    // return a positive distribution
+    const double bias_value =
+        fabs(std::min(0.0, *std::min_element(grid_data.begin(), grid_data.end())));
+    for (size_t i = 0; i < n_map_bins; i++)
+      grid_data[i] += bias_value;
+  } else if (normalize) {
+    // return a normalized distribution
+    const double sum = std::accumulate(grid_data.begin(), grid_data.end(), 0.0);
+    // catch zero instance
+    const double scale = !rtt_dsxx::soft_equiv(sum, 0.0) ? 1.0 / sum : 1.0;
+    for (size_t i = 0; i < n_map_bins; i++)
+      grid_data[i] *= scale;
+  }
+}
+
+//------------------------------------------------------------------------------------------------//
+/*!
+ * \brief map_data_to_sphere_grid_window for vector<vector<double>> data
+ *
+ * Maps local+ghost data to a fixed r-theta mesh grid based on a specified weighting type. This data
+ * can additionally be normalized and positively biased on the grid.
+ * 
+ *
+ * \param[in] local_data the local data on the processor to be mapped to the window
+ * \param[in] ghost_data the ghost data on the processor to be mapped to the window
+ * \param[in,out] grid_data the resulting data map
+ * \param[in] sphere_center the center location of the sphere mesh
+ * \param[in] wedge_window_center the geometric center (x,y,x) of the wedge window
+ * \param[in] wedge_dr_dtheta the differential size in each direction (dr, dtheta, 0.0) used to form
+ * the wedge
+ * \param[in] grid_bins number of equally spaced bins in each dir
+ * \param[in] map_type_in string indicating the mapping (max, min, ave)
+ * \param[in] normalize bool operator to specify if the data should be normalized to a pdf
+ * (independent of each data vector)
+ * \param[in] bias bool operator to specify if the data should be moved to the positive domain space
+ * (independent of each data vector)
+ * \return bin_list list of global bins requested for the current window.
+ */
+void quick_index::map_data_to_sphere_grid_window(
+    const std::vector<std::vector<double>> &local_data,
+    const std::vector<std::vector<double>> &ghost_data, std::vector<std::vector<double>> &grid_data,
+    const std::array<double, 3> &sphere_center, const std::array<double, 3> &wedge_window_center,
+    const std::array<double, 3> &wedge_dr_dtheta, const std::array<size_t, 3> &grid_bins,
+    const std::string &map_type_in, const bool normalize, const bool bias) const {
+  Insist(dim > 1, "Sphere grid window is invalid in 1d geometry");
+  const auto r_theta = transform_r_theta(sphere_center, wedge_window_center);
+  // r-theta extents of the wedge; the lower theta bound is shifted by 2*pi when it is not positive
+  const std::array<double, 3> r_theta_phi_max{r_theta[0] + wedge_dr_dtheta[0],
+                                              r_theta[1] + wedge_dr_dtheta[1], 0.0};
+  const std::array<double, 3> r_theta_phi_min{std::max(r_theta[0] - wedge_dr_dtheta[0], 0.0),
+                                              wedge_dr_dtheta[1] < r_theta[1]
+                                                  ? r_theta[1] - wedge_dr_dtheta[1]
+                                                  : r_theta[1] - wedge_dr_dtheta[1] + 2. * pi,
+                                              0.0};
+  Check(!(r_theta_phi_min[1] > 2. * pi));
+  // setup the xy window_max_min
+  std::array<double, 3> window_max{0.0, 0.0, 0.0};
+  std::array<double, 3> window_min{0.0, 0.0, 0.0};
+  calc_wedge_xy_bounds(wedge_window_center, sphere_center, wedge_dr_dtheta, window_min, window_max);
+
+  Require(domain_decomposed ? local_data.size() == ghost_data.size() : true);
+  Require(!(window_max[0] < window_min[0]));
+  Require(!(window_max[1] < window_min[1]));
+  Require(!(window_max[2] < window_min[2]));
+  Require(domain_decomposed
+              ? (fabs(window_max[0] - window_min[0]) - max_window_size) / max_window_size < 1e-6
+              : true);
+  Require(domain_decomposed
+              ? (fabs(window_max[1] - window_min[1]) - max_window_size) / max_window_size < 1e-6
+              : true);
+  Require(domain_decomposed
+              ? (fabs(window_max[2] - window_min[2]) - max_window_size) / max_window_size < 1e-6
+              : true);
+
+  bool fill = false;
+  std::string map_type = map_type_in;
+  if (map_type_in == "max_fill") {
+    Insist((grid_bins[0] > 1 && grid_bins[1] <= 1 && grid_bins[2] <= 1) ||
+               (grid_bins[1] > 1 && grid_bins[0] <= 1 && grid_bins[2] <= 1) ||
+               (grid_bins[2] > 1 && grid_bins[0] <= 1 && grid_bins[1] <= 1),
+           "one of grid bins must be == 1, Grid must be 1D to use max_fill option");
+    fill = true;
+    map_type = "max";
+  } else if (map_type_in == "min_fill") {
+    Insist((grid_bins[0] > 1 && grid_bins[1] <= 1 && grid_bins[2] <= 1) ||
+               (grid_bins[1] > 1 && grid_bins[0] <= 1 && grid_bins[2] <= 1) ||
+               (grid_bins[2] > 1 && grid_bins[0] <= 1 && grid_bins[1] <= 1),
+           "one of grid bins must be == 1, Grid must be 1D to use min_fill option");
+    fill = true;
+    map_type = "min";
+  } else if (map_type_in == "ave_fill") {
+    Insist((grid_bins[0] > 1 && grid_bins[1] <= 1 && grid_bins[2] <= 1) ||
+               (grid_bins[1] > 1 && grid_bins[0] <= 1 && grid_bins[2] <= 1) ||
+               (grid_bins[2] > 1 && grid_bins[0] <= 1 && grid_bins[1] <= 1),
+           "one of grid bins must be == 1, Grid must be 1D to use ave_fill option");
+    fill = true;
+    map_type = "ave";
+  } else if (map_type_in == "nearest_fill") {
+    Insist((grid_bins[0] > 1 && grid_bins[1] <= 1 && grid_bins[2] <= 1) ||
+               (grid_bins[1] > 1 && grid_bins[0] <= 1 && grid_bins[2] <= 1) ||
+               (grid_bins[2] > 1 && grid_bins[0] <= 1 && grid_bins[1] <= 1),
+           "one of grid bins must be == 1, Grid must be 1D to use nearest_fill option");
+    fill = true;
+    map_type = "nearest";
+  }
+
+  for (size_t d = 0; d < dim; d++)
+    Insist(grid_bins[d] > 0, "Bin size must be greater then zero for each active dimension");
+
+  const size_t vsize = local_data.size();
+  // Grab the global bins that lie in this window
+  std::vector<size_t> global_bins = window_coarse_index_list(window_min, window_max);
+  size_t n_map_bins = 1;
+  for (size_t d = 0; d < dim; d++) {
+    n_map_bins *= grid_bins[d];
+  }
+
+  for (size_t v = 0; v < vsize; v++) {
+    Insist(grid_data[v].size() == n_map_bins,
+           "grid_data[" + std::to_string(v) +
+               "] must match the flatten grid_bin size for the active dimensions (in 3d "
+               "grid_data.size()==grid_bins[0]*grid_bins[1]*grid_bins[2])");
+    std::fill(grid_data[v].begin(), grid_data[v].end(), 0.0);
+  }
+
+  // initialize grid data
+  std::vector<int> data_count(n_map_bins, 0);
+  std::vector<double> min_distance(n_map_bins, 0);
+  double bias_cell_count = 0.0;
+  // Loop over all possible bins
+  for (auto &cb : global_bins) {
+    // skip bins that aren't present in the map (can't use [] operator with constness)
+    // loop over the local data
+    auto mapItr = coarse_index_map.find(cb);
+    if (mapItr != coarse_index_map.end()) {
+      for (auto &l : mapItr->second) {
+        bool valid;
+        size_t local_window_bin;
+        std::array<double, 3> bin_center;
+        std::tie(valid, local_window_bin, bin_center) =
+            get_sphere_window_bin(grid_bins, transform_r_theta(sphere_center, locations[l]),
+                                  r_theta_phi_min, r_theta_phi_max, n_map_bins, pi);
+        // If the bin is outside the window continue to the next point
+        if (!valid)
+          continue;
+        Check(local_window_bin < n_map_bins);
+        map_vector_data(bias_cell_count, data_count, grid_data, min_distance, dim, map_type,
+                        local_data, bin_center, locations[l], local_window_bin, l, vsize);
+      } // end local point loop
+    }   // if valid local bin loop
+    if (domain_decomposed) {
+      // loop over the ghost data
+      auto gmapItr = local_ghost_index_map.find(cb);
+      if (gmapItr != local_ghost_index_map.end()) {
+        // loop over ghost data
+        for (auto &g : gmapItr->second) {
+          bool valid;
+          size_t local_window_bin;
+          std::array<double, 3> bin_center;
+          std::tie(valid, local_window_bin, bin_center) = get_sphere_window_bin(
+              grid_bins, transform_r_theta(sphere_center, local_ghost_locations[g]),
+              r_theta_phi_min, r_theta_phi_max, n_map_bins, pi);
+          // If the bin is outside the window continue to the next point
+          if (!valid)
+            continue;
+          Check(local_window_bin < n_map_bins);
+          map_vector_data(bias_cell_count, data_count, grid_data, min_distance, dim, map_type,
+                          ghost_data, bin_center, local_ghost_locations[g], local_window_bin, g,
+                          vsize);
+        } // end ghost point loop
+      }   // if valid ghost bin
+    }     // if dd
+  }       // end coarse bin loop
+
+  if (map_type == "ave" || map_type == "nearest") {
+    for (size_t i = 0; i < n_map_bins; i++) {
+      for (size_t v = 0; v < vsize; v++) {
+        if (data_count[i] > 0) {
+          grid_data[v][i] /= data_count[i];
+        }
+      }
+    }
+  }
+  if (fill) {
+    std::vector<double> last_val(vsize, 0.0);
+    int last_data_count = 0;
+    for (size_t i = 0; i < n_map_bins; i++) {
+      for (size_t v = 0; v < vsize; v++) {
+        if (data_count[i] > 0) {
+          last_val[v] = grid_data[v][i];
+          last_data_count = data_count[i];
+        } else {
+          grid_data[v][i] = last_val[v];
+          if (v == vsize - 1)
+            data_count[i] = last_data_count;
+        }
+      }
+    }
+  }
+
+  if (bias && normalize) {
+    // return a positive normalized distribution
+    for (size_t v = 0; v < vsize; v++) {
+      const double bias_value =
+          fabs(std::min(0.0, *std::min_element(grid_data[v].begin(), grid_data[v].end())));
+      const double sum = std::accumulate(grid_data[v].begin(), grid_data[v].end(), 0.0) +
+                         bias_value * bias_cell_count;
+      // catch zero instance
+      const double scale = !rtt_dsxx::soft_equiv(sum, 0.0) ? 1.0 / sum : 1.0;
+      for (size_t i = 0; i < n_map_bins; i++)
+        if (data_count[i] > 0)
+          grid_data[v][i] = (grid_data[v][i] + bias_value) * scale;
+    }
+  } else if (bias) {
+    // return a positive distribution
+    for (size_t v = 0; v < vsize; v++) {
+      const double bias_value =
+          fabs(std::min(0.0, *std::min_element(grid_data[v].begin(), grid_data[v].end())));
+      for (size_t i = 0; i < n_map_bins; i++)
+        if (data_count[i] > 0)
+          grid_data[v][i] += bias_value;
+    }
+  } else if (normalize) {
+    // return a normalized distribution
+    for (size_t v = 0; v < vsize; v++) {
+      const double sum = std::accumulate(grid_data[v].begin(), grid_data[v].end(), 0.0);
+      // catch zero instance
+      const double scale = !rtt_dsxx::soft_equiv(sum, 0.0) ? 1.0 / sum : 1.0;
+      for (size_t i = 0; i < n_map_bins; i++)
+        if (data_count[i] > 0)
+          grid_data[v][i] *= scale;
+    }
+  }
+}
+
+} // namespace rtt_kde
+
+//------------------------------------------------------------------------------------------------//
+// end of quick_index.cc
+//------------------------------------------------------------------------------------------------//
diff --git a/src/kde/quick_index.hh b/src/kde/quick_index.hh
new file mode 100644
index 0000000000..dc87e281f8
--- /dev/null
+++ b/src/kde/quick_index.hh
@@ -0,0 +1,145 @@
+//--------------------------------------------*-C++-*---------------------------------------------//
+/*!
+ * \file   kde/quick_index.hh
+ * \author Mathew Cleveland
+ * \brief  This class generates coarse spatial indexing to quickly access near-neighbor data. This
+ *         additionally provides simple interpolation schemes to map data to simple structured
+ *         meshes. 
+ * \note   Copyright (C) 2021 Triad National Security, LLC., All rights reserved. 
+ */
+//------------------------------------------------------------------------------------------------//
+
+#ifndef rtt_kde_quick_index_hh
+#define rtt_kde_quick_index_hh
+
+#include "c4/global.hh"
+#include "units/MathConstants.hh"
+#include <array>
+#include <map>
+#include <vector>
+
+namespace rtt_kde {
+
+//================================================================================================//
+/*!
+ * \brief quick_index
+ *
+ * Provides a hash-like coarse index of spatially distributed data along with simple mapping
+ * functions. Most of the index data is exposed as public members (see PUBLIC DATA below).
+ */
+//================================================================================================//
+
+class quick_index {
+public:
+  //! Constructor (not a default constructor: every argument is required).
+  quick_index(const size_t dim, const std::vector<std::array<double, 3>> &locations,
+              const double max_window_size, const size_t bins_per_dimension,
+              const bool domain_decomposed);
+
+  //! Collect ghost data for vector<double> local data
+  void collect_ghost_data(const std::vector<double> &local_data,
+                          std::vector<double> &local_ghost_data) const;
+
+  //! Overload of collect_ghost_data for vector<array<double, 3>> data.
+  void collect_ghost_data(const std::vector<std::array<double, 3>> &local_data,
+                          std::vector<std::array<double, 3>> &local_ghost_data) const;
+
+  //! Overload of collect_ghost_data for vector<vector<double>> data.
+  void collect_ghost_data(const std::vector<std::vector<double>> &local_data,
+                          std::vector<std::vector<double>> &local_ghost_data) const;
+
+  //! Fetch list of coarse index values bounded by the window
+  std::vector<size_t> window_coarse_index_list(const std::array<double, 3> &window_min,
+                                               const std::array<double, 3> &window_max) const;
+
+  //! Map local+ghost data to grid window
+  void map_data_to_grid_window(const std::vector<double> &local_data,
+                               const std::vector<double> &ghost_data,
+                               std::vector<double> &grid_data,
+                               const std::array<double, 3> &window_min,
+                               const std::array<double, 3> &window_max,
+                               const std::array<size_t, 3> &grid_bins, const std::string &map_type,
+                               const bool normalize, const bool bias) const;
+
+  //! Map local+ghost data to grid window for multi-dimensional data
+  void map_data_to_grid_window(const std::vector<std::vector<double>> &local_data,
+                               const std::vector<std::vector<double>> &ghost_data,
+                               std::vector<std::vector<double>> &grid_data,
+                               const std::array<double, 3> &window_min,
+                               const std::array<double, 3> &window_max,
+                               const std::array<size_t, 3> &grid_bins, const std::string &map_type,
+                               const bool normalize, const bool bias) const;
+
+  //! Map local+ghost data to sphere grid window (TODO confirm: window_min/max vs wedge_* names)
+  void map_data_to_sphere_grid_window(
+      const std::vector<double> &local_data, const std::vector<double> &ghost_data,
+      std::vector<double> &grid_data, const std::array<double, 3> &sphere_center,
+      const std::array<double, 3> &window_min, const std::array<double, 3> &window_max,
+      const std::array<size_t, 3> &grid_bins, const std::string &map_type, const bool normalize,
+      const bool bias) const;
+
+  //! Map local+ghost data to sphere grid window for multi-dimensional data
+  void map_data_to_sphere_grid_window(const std::vector<std::vector<double>> &local_data,
+                                      const std::vector<std::vector<double>> &ghost_data,
+                                      std::vector<std::vector<double>> &grid_data,
+                                      const std::array<double, 3> &sphere_center,
+                                      const std::array<double, 3> &wedge_window_center,
+                                      const std::array<double, 3> &wedge_dr_dtheta,
+                                      const std::array<size_t, 3> &grid_bins,
+                                      const std::string &map_type, const bool normalize,
+                                      const bool bias) const;
+  //! Presumably returns location as (r, theta, ...) relative to sphere_center (confirm in .cc).
+  std::array<double, 3> transform_r_theta(const std::array<double, 3> &sphere_center,
+                                          const std::array<double, 3> &location) const;
+  //! Computes bounds of a wedge, written to win_min/win_max (inferred from its caller; confirm).
+  void calc_wedge_xy_bounds(const std::array<double, 3> &wedge_xyz_center,
+                            const std::array<double, 3> &wedge_origin,
+                            const std::array<double, 3> &wedge_dr_dtheta,
+                            std::array<double, 3> &win_min, std::array<double, 3> &win_max) const;
+  // PUBLIC DATA
+  // Quick index initialization data
+  const size_t dim;
+  const bool domain_decomposed;
+  const size_t coarse_bin_resolution;
+  const double max_window_size;
+  // keep a copy of the locations
+  const std::vector<std::array<double, 3>> locations;
+  const size_t n_locations;
+
+  // Global bounds
+  std::array<double, 3> bounding_box_min;
+  std::array<double, 3> bounding_box_max;
+  // Local data map (coarse bin index -> list of indices into locations)
+  std::map<size_t, std::vector<size_t>> coarse_index_map;
+
+  // DOMAIN DECOMPOSED DATA
+  // Local bounds
+  std::array<double, 3> local_bounding_box_min;
+  std::array<double, 3> local_bounding_box_max;
+  // Ordered list of local bins (index values are based on the global bin structure)
+  std::vector<size_t> local_bins;
+  // Size of ghost data buffer
+  size_t local_ghost_buffer_size;
+  // Map used to index into a local ghost buffer
+  std::map<size_t, std::vector<size_t>> local_ghost_index_map;
+  // Local ghost locations (built at construction time)
+  std::vector<std::array<double, 3>> local_ghost_locations;
+
+private:
+  // PRIVATE DATA
+  // Map used to write local data to other processor ghost cells
+  // put_window_map[global_id] = [put_rank, ghost_proc_buffer_size, ghost_proc_put_offset]
+  // array is integers to accommodate mpi data types
+  std::map<size_t, std::vector<std::array<int, 3>>> put_window_map;
+  // max put buffer size;
+  size_t max_put_buffer_size;
+  const double pi = rtt_units::PI;
+};
+
+} // end namespace  rtt_kde
+
+#endif // rtt_kde_quick_index_hh
+
+//------------------------------------------------------------------------------------------------//
+// end of kde/quick_index.hh
+//------------------------------------------------------------------------------------------------//
diff --git a/src/kde/test/CMakeLists.txt b/src/kde/test/CMakeLists.txt
index 569eaf7a86..96f4fc0850 100644
--- a/src/kde/test/CMakeLists.txt
+++ b/src/kde/test/CMakeLists.txt
@@ -19,11 +19,16 @@ file( GLOB test_sources *.cc )
 #--------------------------------------------------------------------------------------------------#
 # Build Unit tests
 #--------------------------------------------------------------------------------------------------#
-
+if(MSVC)
+    # MSMPI fails for MPI_Win_fence, so only the single-rank tests are registered there.
+    set(pe_list "1")
+else()
+    set(pe_list "1;3")
+endif()
 add_parallel_tests(
    SOURCES   "${test_sources}"
    DEPS      Lib_kde
-   PE_LIST   "1;3")
+   PE_LIST   "${pe_list}")
 #--------------------------------------------------------------------------------------------------#
 # end kde/test/CMakeLists.txt
 #--------------------------------------------------------------------------------------------------#
diff --git a/src/kde/test/tstkde.cc b/src/kde/test/tstkde.cc
index cc73c58c51..4484895c47 100644
--- a/src/kde/test/tstkde.cc
+++ b/src/kde/test/tstkde.cc
@@ -4,11 +4,12 @@
  * \author Mathew Cleveland
  * \date   Nov. 10th 2020
  * \brief  KDE function tests
- * \note   Copyright (C) 2018-2020 Triad National Security, LLC.
- *         All rights reserved. */
+ * \note   Copyright (C) 2020-2021 Triad National Security, LLC., All rights reserved. 
+ */
 //------------------------------------------------------------------------------------------------//
 
 #include "kde/kde.hh"
+#include "kde/quick_index.hh"
 #include "c4/ParallelUnitTest.hh"
 #include "ds++/Release.hh"
 #include <numeric>
@@ -22,13 +23,268 @@ using namespace rtt_kde;
 //------------------------------------------------------------------------------------------------//
 //
 void test_replication(ParallelUnitTest &ut) {
-  kde<kde_coordinates::CART> test_kde;
+  kde test_kde;
 
   // test the epan kernel
   double value = test_kde.epan_kernel(0.0);
   if (!rtt_dsxx::soft_equiv(value, 0.75))
     ITFAILS;
 
+  // test some public sphere calculations
+  {
+    const std::array<double, 3> sphere_center{0.0, 0.0, 0.0};
+    const std::array<double, 3> location{sqrt(2), sqrt(2), 0.0};
+    const std::array<double, 3> location2{0.0, 2.0, 0.0};
+    const double radius = 2.0;
+    const double small_radius = 1.0;
+    const double pi_over_4 = 0.78539816;
+    FAIL_IF_NOT(rtt_dsxx::soft_equiv(test_kde.calc_radius(sphere_center, location), 2.0));
+    FAIL_IF_NOT(
+        rtt_dsxx::soft_equiv(test_kde.calc_arch_length(sphere_center, radius, location, location2),
+                             2.0 * pi_over_4, 1e-6));
+    FAIL_IF_NOT(rtt_dsxx::soft_equiv(
+        test_kde.calc_arch_length(sphere_center, small_radius, location, location2), pi_over_4,
+        1e-6));
+  }
+
+  // spherical reconstruction
+  {
+    const std::array<double, 3> sphere_center{0.0, -1.0, 0.0};
+    const double max_radius = 1.0;
+    const double min_radius = 0.0;
+    kde sphere_kde;
+    sphere_kde.set_sphere_center(sphere_center, min_radius, max_radius);
+    const std::array<double, 8> radial_edges{0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1.0};
+    const std::array<double, 9> cosine_edges{-1.0, -0.75, -0.5, -0.25, 0.0, 0.25, 0.5, 0.75, 1.0};
+    const size_t data_size = radial_edges.size() * cosine_edges.size();
+    std::vector<std::array<double, 3>> position_array(data_size,
+                                                      std::array<double, 3>{0.0, 0.0, 0.0});
+
+    std::vector<double> shell_data(data_size, 0.0);
+    std::vector<double> spoke_data(data_size, 0.0);
+    std::vector<double> const_data(data_size, 1.0);
+    size_t point_i = 0;
+    size_t ri = 0;
+    for (auto &r : radial_edges) {
+      size_t mui = 0;
+      for (auto &mu : cosine_edges) {
+        spoke_data[point_i] = static_cast<double>(mui) + 1.0;
+        shell_data[point_i] = static_cast<double>(ri) + 1.0;
+        double rel_y = r * mu;
+        position_array[point_i][0] =
+            rtt_dsxx::soft_equiv(r * r, rel_y * rel_y, 1e-6) ? 0.0 : sqrt(r * r - rel_y * rel_y);
+        position_array[point_i][1] = sphere_center[1] + rel_y;
+        point_i++;
+        mui++;
+      }
+      ri++;
+    }
+
+    // zero reconstruction array
+    {
+      std::vector<double> zero_data(data_size, 0.0);
+      std::vector<std::array<double, 3>> one_over_bandwidth_array(
+          data_size, std::array<double, 3>{1.0, 1.0e12, 0.0});
+      const bool dd = false;
+      // two bins per point
+      const size_t n_coarse_bins = 5;
+      const double max_window_size = 1.0;
+      const size_t dim = 2;
+      quick_index qindex(dim, position_array, max_window_size, n_coarse_bins, dd);
+
+      std::vector<double> smooth_result =
+          sphere_kde.reconstruction(zero_data, one_over_bandwidth_array, qindex);
+      std::vector<double> log_smooth_result =
+          sphere_kde.log_reconstruction(zero_data, one_over_bandwidth_array, qindex);
+      // Apply Conservation
+      sphere_kde.apply_conservation(zero_data, smooth_result, qindex.domain_decomposed);
+      sphere_kde.apply_conservation(spoke_data, log_smooth_result, qindex.domain_decomposed);
+
+      // Check smooth result
+      for (size_t i = 0; i < data_size; i++) {
+        if (!rtt_dsxx::soft_equiv(smooth_result[i], zero_data[i]))
+          ITFAILS;
+        if (!rtt_dsxx::soft_equiv(log_smooth_result[i], zero_data[i]))
+          ITFAILS;
+      }
+
+      // Energy conservation
+      if (!rtt_dsxx::soft_equiv(std::accumulate(zero_data.begin(), zero_data.end(), 0.0),
+                                std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0)))
+        ITFAILS;
+      if (!rtt_dsxx::soft_equiv(
+              std::accumulate(zero_data.begin(), zero_data.end(), 0.0),
+              std::accumulate(log_smooth_result.begin(), log_smooth_result.end(), 0.0)))
+        ITFAILS;
+    }
+    // spoke reconstruction array
+    {
+      std::vector<std::array<double, 3>> one_over_bandwidth_array(
+          data_size, std::array<double, 3>{1.0, 1.0e12, 0.0});
+      const bool dd = false;
+      // two bins per point
+      const size_t n_coarse_bins = 5;
+      const double max_window_size = 1.0;
+      const size_t dim = 2;
+      quick_index qindex(dim, position_array, max_window_size, n_coarse_bins, dd);
+
+      std::vector<double> smooth_result =
+          sphere_kde.reconstruction(spoke_data, one_over_bandwidth_array, qindex);
+      std::vector<double> log_smooth_result =
+          sphere_kde.log_reconstruction(spoke_data, one_over_bandwidth_array, qindex);
+      // Apply Conservation
+      sphere_kde.apply_conservation(spoke_data, smooth_result, qindex.domain_decomposed);
+      sphere_kde.apply_conservation(spoke_data, log_smooth_result, qindex.domain_decomposed);
+
+      // Check smooth result
+      for (size_t i = 0; i < data_size; i++) {
+        if (!rtt_dsxx::soft_equiv(smooth_result[i], spoke_data[i]))
+          ITFAILS;
+        if (!rtt_dsxx::soft_equiv(log_smooth_result[i], spoke_data[i]))
+          ITFAILS;
+      }
+
+      // Energy conservation
+      if (!rtt_dsxx::soft_equiv(std::accumulate(spoke_data.begin(), spoke_data.end(), 0.0),
+                                std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0)))
+        ITFAILS;
+      if (!rtt_dsxx::soft_equiv(
+              std::accumulate(spoke_data.begin(), spoke_data.end(), 0.0),
+              std::accumulate(log_smooth_result.begin(), log_smooth_result.end(), 0.0)))
+        ITFAILS;
+    }
+
+    // shell reconstruction array
+    {
+      std::vector<std::array<double, 3>> one_over_bandwidth_array(
+          data_size, std::array<double, 3>{1.0e12, 1.0, 0.0});
+      const bool dd = false;
+      // two bins per point
+      const size_t n_coarse_bins = 5;
+      const double max_window_size = 1.0;
+      const size_t dim = 2;
+      quick_index qindex(dim, position_array, max_window_size, n_coarse_bins, dd);
+
+      std::vector<double> smooth_result =
+          sphere_kde.reconstruction(shell_data, one_over_bandwidth_array, qindex);
+      std::vector<double> log_smooth_result =
+          sphere_kde.log_reconstruction(shell_data, one_over_bandwidth_array, qindex);
+      // Apply Conservation
+      sphere_kde.apply_conservation(shell_data, smooth_result, qindex.domain_decomposed);
+      sphere_kde.apply_conservation(shell_data, log_smooth_result, qindex.domain_decomposed);
+
+      // Check smooth result
+      for (size_t i = 0; i < data_size; i++) {
+        if (!rtt_dsxx::soft_equiv(smooth_result[i], shell_data[i]))
+          ITFAILS;
+        if (!rtt_dsxx::soft_equiv(log_smooth_result[i], shell_data[i]))
+          ITFAILS;
+      }
+
+      // Energy conservation
+      if (!rtt_dsxx::soft_equiv(std::accumulate(shell_data.begin(), shell_data.end(), 0.0),
+                                std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0)))
+        ITFAILS;
+      if (!rtt_dsxx::soft_equiv(
+              std::accumulate(shell_data.begin(), shell_data.end(), 0.0),
+              std::accumulate(log_smooth_result.begin(), log_smooth_result.end(), 0.0)))
+        ITFAILS;
+    }
+
+    // spoke smoothing on shell array
+    {
+
+      std::vector<double> spoke_smoothed_shells{
+          2.51488, 2.99904, 2.99904, 3.69002, 3.72457, 3.69002, 2.99904, 2.99904, 2.51488,
+          2.51645, 3.00418, 3.00418, 3.72015, 3.7866,  3.72015, 3.00418, 3.00418, 2.51645,
+          2.51803, 3.00928, 3.00928, 3.74919, 3.84522, 3.74919, 3.00928, 3.00928, 2.51803,
+          2.51961, 3.01436, 3.01436, 3.77729, 3.90089, 3.77729, 3.01436, 3.01436, 2.51961,
+          5.52169, 3.04531, 3.04531, 3.93334, 4.19165, 3.93334, 3.04531, 3.04531, 5.52169,
+          5.55417, 6.52859, 6.95461, 4.19454, 4.61685, 4.19454, 6.95461, 6.52859, 5.55417,
+          7.53548, 6.56107, 7,       4.58158, 5.14978, 4.58158, 7,       6.56107, 7.53548,
+          7.56796, 8,       7.14194, 8,       6.33455, 8,       7.14194, 8,       7.56796};
+      std::vector<std::array<double, 3>> one_over_bandwidth_array(
+          data_size, std::array<double, 3>{1.0, 1.0e12, 0.0});
+      const bool dd = false;
+      // two bins per point
+      const size_t n_coarse_bins = 5;
+      const double max_window_size = 1.0;
+      const size_t dim = 2;
+      quick_index qindex(dim, position_array, max_window_size, n_coarse_bins, dd);
+
+      std::vector<double> smooth_result =
+          sphere_kde.reconstruction(shell_data, one_over_bandwidth_array, qindex);
+      std::vector<double> log_smooth_result =
+          sphere_kde.reconstruction(shell_data, one_over_bandwidth_array, qindex);
+      // Apply Conservation
+      sphere_kde.apply_conservation(shell_data, smooth_result, qindex.domain_decomposed);
+      sphere_kde.apply_conservation(shell_data, log_smooth_result, qindex.domain_decomposed);
+
+      // Check smooth result
+      for (size_t i = 0; i < data_size; i++) {
+        if (!rtt_dsxx::soft_equiv(smooth_result[i], spoke_smoothed_shells[i], 1e-3))
+          ITFAILS;
+        if (!rtt_dsxx::soft_equiv(log_smooth_result[i], spoke_smoothed_shells[i], 1e-3))
+          ITFAILS;
+      }
+
+      // Energy conservation
+      if (!rtt_dsxx::soft_equiv(std::accumulate(shell_data.begin(), shell_data.end(), 0.0),
+                                std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0)))
+        ITFAILS;
+      if (!rtt_dsxx::soft_equiv(
+              std::accumulate(shell_data.begin(), shell_data.end(), 0.0),
+              std::accumulate(log_smooth_result.begin(), log_smooth_result.end(), 0.0)))
+        ITFAILS;
+    }
+
+    // shell smoothing on spoke array
+    {
+      std::vector<double> shell_smoothed_spoke{
+          4.82322, 4.82519, 4.82608, 4.82681, 4.8275, 4.82819, 4.82892, 4.8298,  4.83177,
+          4.81029, 4.81825, 4.8218,  4.82475, 4.8275, 4.83025, 4.8332,  4.83675, 4.8447,
+          4.78839, 4.80659, 4.81462, 4.82129, 4.8275, 4.83371, 4.84037, 4.84841, 4.86661,
+          4.75694, 4.79008, 4.8045,  4.81642, 4.8275, 4.83857, 4.8505,  4.86492, 4.89805,
+          4.04388, 4.22611, 4.67522, 4.75503, 5,      4.97765, 5.14326, 6.35454, 5.61112,
+          2.62832, 3.75795, 4.12091, 4.47922, 5,      6.9148,  6.66199, 7.35789, 7.02668,
+          1.66976, 3.01482, 3.72878, 4.43378, 5,      8.6895,  8.00342, 6.64018, 7.98524,
+          1,       2,       4.02682, 4.51075, 5,      5.14424, 5.62818, 8,       9};
+      std::vector<std::array<double, 3>> one_over_bandwidth_array(
+          data_size, std::array<double, 3>{1.0e12, 1.0, 0.0});
+      const bool dd = false;
+      // two bins per point
+      const size_t n_coarse_bins = 5;
+      const double max_window_size = 1.0;
+      const size_t dim = 2;
+      quick_index qindex(dim, position_array, max_window_size, n_coarse_bins, dd);
+
+      std::vector<double> smooth_result =
+          sphere_kde.reconstruction(spoke_data, one_over_bandwidth_array, qindex);
+      std::vector<double> log_smooth_result =
+          sphere_kde.reconstruction(spoke_data, one_over_bandwidth_array, qindex);
+      // Apply Conservation
+      sphere_kde.apply_conservation(spoke_data, smooth_result, qindex.domain_decomposed);
+      sphere_kde.apply_conservation(spoke_data, log_smooth_result, qindex.domain_decomposed);
+
+      // Check smooth result
+      for (size_t i = 0; i < data_size; i++) {
+        if (!rtt_dsxx::soft_equiv(smooth_result[i], shell_smoothed_spoke[i], 1e-3))
+          ITFAILS;
+        if (!rtt_dsxx::soft_equiv(log_smooth_result[i], shell_smoothed_spoke[i], 1e-3))
+          ITFAILS;
+      }
+
+      // Energy conservation
+      if (!rtt_dsxx::soft_equiv(std::accumulate(spoke_data.begin(), spoke_data.end(), 0.0),
+                                std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0)))
+        ITFAILS;
+      if (!rtt_dsxx::soft_equiv(
+              std::accumulate(spoke_data.begin(), spoke_data.end(), 0.0),
+              std::accumulate(log_smooth_result.begin(), log_smooth_result.end(), 0.0)))
+        ITFAILS;
+    }
+  }
+
   // No mean reconstruction because of small basis functions
   {
     std::vector<double> data{0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1};
@@ -38,21 +294,81 @@ void test_replication(ParallelUnitTest &ut) {
       position_array[i][1] = i < 5 ? 0.5 : -0.5;
     }
     std::vector<std::array<double, 3>> one_over_bandwidth_array(
-        10, std::array<double, 3>{1.0 / 0.1, 0., 0.0});
+        10, std::array<double, 3>{1.0 / 0.1, 0.0, 0.0});
+    const bool dd = false;
+    // two bins per point
+    const size_t n_coarse_bins = 5;
+    const double max_window_size = 0.1;
+    const size_t dim = 1;
+    quick_index qindex(dim, position_array, max_window_size, n_coarse_bins, dd);
+
+    std::vector<double> smooth_result =
+        test_kde.reconstruction(data, one_over_bandwidth_array, qindex);
+    std::vector<double> log_smooth_result =
+        test_kde.log_reconstruction(data, one_over_bandwidth_array, qindex);
+    // Apply Conservation
+    test_kde.apply_conservation(data, smooth_result, qindex.domain_decomposed);
+    test_kde.apply_conservation(data, log_smooth_result, qindex.domain_decomposed);
+
+    // Check smooth result
+    for (int i = 0; i < 10; i++) {
+      if (!rtt_dsxx::soft_equiv(smooth_result[i], 0.1))
+        ITFAILS;
+      if (!rtt_dsxx::soft_equiv(log_smooth_result[i], 0.1))
+        ITFAILS;
+    }
+
+    // Energy conservation
+    if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0),
+                              std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0)))
+      ITFAILS;
+    if (!rtt_dsxx::soft_equiv(
+            std::accumulate(data.begin(), data.end(), 0.0),
+            std::accumulate(log_smooth_result.begin(), log_smooth_result.end(), 0.0)))
+      ITFAILS;
+  }
+
+  // 2D No mean reconstruction because of small basis functions
+  {
+    std::vector<double> data{0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1};
+    std::vector<std::array<double, 3>> position_array(10, std::array<double, 3>{0.0, 0.0, 0.0});
+    for (int i = 0; i < 10; i++) {
+      position_array[i][0] = i < 5 ? i % 5 : i % 5 + 0.5;
+      position_array[i][1] = i < 5 ? 0.5 : -0.5;
+    }
+    std::vector<std::array<double, 3>> one_over_bandwidth_array(
+        10, std::array<double, 3>{1.0 / 0.1, 1.0 / 0.1, 0.0});
     const bool dd = false;
+    // two bins per point
+    const size_t n_coarse_bins = 5;
+    const double max_window_size = 0.1;
+    const size_t dim = 2;
+    quick_index qindex(dim, position_array, max_window_size, n_coarse_bins, dd);
+
     std::vector<double> smooth_result =
-        test_kde.reconstruction<1>(data, position_array, one_over_bandwidth_array, dd);
+        test_kde.reconstruction(data, one_over_bandwidth_array, qindex);
+    std::vector<double> log_smooth_result =
+        test_kde.log_reconstruction(data, one_over_bandwidth_array, qindex);
+    // Apply Conservation
+    test_kde.apply_conservation(data, smooth_result, qindex.domain_decomposed);
+    test_kde.apply_conservation(data, log_smooth_result, qindex.domain_decomposed);
 
     // Check smooth result
     for (int i = 0; i < 10; i++) {
       if (!rtt_dsxx::soft_equiv(smooth_result[i], 0.1))
         ITFAILS;
+      if (!rtt_dsxx::soft_equiv(log_smooth_result[i], 0.1))
+        ITFAILS;
     }
 
     // Energy conservation
     if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0),
                               std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0)))
       ITFAILS;
+    if (!rtt_dsxx::soft_equiv(
+            std::accumulate(data.begin(), data.end(), 0.0),
+            std::accumulate(log_smooth_result.begin(), log_smooth_result.end(), 0.0)))
+      ITFAILS;
   }
 
   // "Smoothed" reconstruction.
@@ -64,21 +380,79 @@ void test_replication(ParallelUnitTest &ut) {
       position_array[i][1] = i < 5 ? 0.5 : -0.5;
     }
     std::vector<std::array<double, 3>> one_over_bandwidth_array(
-        10, std::array<double, 3>{1.0 / 4.0, 0., 0.0});
+        10, std::array<double, 3>{1.0 / 4.0, 0.0, 0.0});
+    const bool dd = false;
+    // one bin per point
+    const size_t n_coarse_bins = 10;
+    const double max_window_size = 4.0;
+    const size_t dim = 1;
+    quick_index qindex(dim, position_array, max_window_size, n_coarse_bins, dd);
+    std::vector<double> smooth_result =
+        test_kde.reconstruction(data, one_over_bandwidth_array, qindex);
+    std::vector<double> log_smooth_result =
+        test_kde.log_reconstruction(data, one_over_bandwidth_array, qindex);
+    // Apply Conservation
+    test_kde.apply_conservation(data, smooth_result, qindex.domain_decomposed);
+    test_kde.apply_conservation(data, log_smooth_result, qindex.domain_decomposed);
+
+    // Check smooth result
+    for (int i = 0; i < 10; i++) {
+      if (!rtt_dsxx::soft_equiv(smooth_result[i], 0.1, 1e-1))
+        ITFAILS;
+      if (!rtt_dsxx::soft_equiv(log_smooth_result[i], 0.1, 1e-1))
+        ITFAILS;
+    }
+
+    // Energy conservation
+    if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0),
+                              std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0)))
+      ITFAILS;
+    if (!rtt_dsxx::soft_equiv(
+            std::accumulate(data.begin(), data.end(), 0.0),
+            std::accumulate(log_smooth_result.begin(), log_smooth_result.end(), 0.0)))
+      ITFAILS;
+  }
+
+  // 2D "Smoothed" reconstruction.
+  {
+    std::vector<double> data{0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1};
+    std::vector<std::array<double, 3>> position_array(10, std::array<double, 3>{0.0, 0.0, 0.0});
+    for (int i = 0; i < 10; i++) {
+      position_array[i][0] = i < 5 ? i % 5 : i % 5 + 0.5;
+      position_array[i][1] = i < 5 ? 0.5 : -0.5;
+    }
+    std::vector<std::array<double, 3>> one_over_bandwidth_array(
+        10, std::array<double, 3>{1.0 / 4.0, 1.0 / 4.0, 0.0});
     const bool dd = false;
+    // one bin per point
+    const size_t n_coarse_bins = 10;
+    const double max_window_size = 4.0;
+    const size_t dim = 2;
+    quick_index qindex(dim, position_array, max_window_size, n_coarse_bins, dd);
     std::vector<double> smooth_result =
-        test_kde.reconstruction<1>(data, position_array, one_over_bandwidth_array, dd);
+        test_kde.reconstruction(data, one_over_bandwidth_array, qindex);
+    std::vector<double> log_smooth_result =
+        test_kde.log_reconstruction(data, one_over_bandwidth_array, qindex);
+    // Apply Conservation
+    test_kde.apply_conservation(data, smooth_result, qindex.domain_decomposed);
+    test_kde.apply_conservation(data, log_smooth_result, qindex.domain_decomposed);
 
     // Check smooth result
     for (int i = 0; i < 10; i++) {
       if (!rtt_dsxx::soft_equiv(smooth_result[i], 0.1, 1e-1))
         ITFAILS;
+      if (!rtt_dsxx::soft_equiv(log_smooth_result[i], 0.1, 1e-1))
+        ITFAILS;
     }
 
     // Energy conservation
     if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0),
                               std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0)))
       ITFAILS;
+    if (!rtt_dsxx::soft_equiv(
+            std::accumulate(data.begin(), data.end(), 0.0),
+            std::accumulate(log_smooth_result.begin(), log_smooth_result.end(), 0.0)))
+      ITFAILS;
   }
 
   // No reconstruction because of small basis functions
@@ -90,24 +464,1134 @@ void test_replication(ParallelUnitTest &ut) {
       position_array[i][1] = i < 5 ? 0.5 : -0.5;
     }
     std::vector<std::array<double, 3>> one_over_bandwidth_array(
-        10, std::array<double, 3>{1.0 / 0.1, 0., 0.0});
+        10, std::array<double, 3>{1.0 / 0.1, 0.0, 0.0});
+    const bool dd = false;
+    // 2X bin per point
+    const size_t n_coarse_bins = 20;
+    const double max_window_size = 0.1;
+    const size_t dim = 1;
+    quick_index qindex(dim, position_array, max_window_size, n_coarse_bins, dd);
+    std::vector<double> smooth_result =
+        test_kde.reconstruction(data, one_over_bandwidth_array, qindex);
+    std::vector<double> log_smooth_result =
+        test_kde.log_reconstruction(data, one_over_bandwidth_array, qindex);
+    // Apply Conservation
+    test_kde.apply_conservation(data, smooth_result, qindex.domain_decomposed);
+    test_kde.apply_conservation(data, log_smooth_result, qindex.domain_decomposed);
+
+    // Check smooth result
+    for (int i = 0; i < 10; i++) {
+      if (!rtt_dsxx::soft_equiv(data[i], smooth_result[i]))
+        ITFAILS;
+      if (!rtt_dsxx::soft_equiv(data[i], log_smooth_result[i]))
+        ITFAILS;
+    }
+
+    // Energy conservation
+    if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0),
+                              std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0)))
+      ITFAILS;
+    if (!rtt_dsxx::soft_equiv(
+            std::accumulate(data.begin(), data.end(), 0.0),
+            std::accumulate(log_smooth_result.begin(), log_smooth_result.end(), 0.0)))
+      ITFAILS;
+  }
+
+  // 2D No reconstruction because of small basis in both directions
+  {
+    std::vector<double> data{0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2};
+    std::vector<std::array<double, 3>> position_array(10, std::array<double, 3>{0.0, 0.0, 0.0});
+    for (int i = 0; i < 10; i++) {
+      position_array[i][0] = i < 5 ? i % 5 : i % 5 + 0.5;
+      position_array[i][1] = i < 5 ? 0.5 : -0.5;
+    }
+    std::vector<std::array<double, 3>> one_over_bandwidth_array(
+        10, std::array<double, 3>{1.0 / 0.1, 1.0 / 0.1, 0.0});
+    const bool dd = false;
+    // 2X bin per point
+    const size_t n_coarse_bins = 20;
+    const double max_window_size = 0.1;
+    const size_t dim = 2;
+    quick_index qindex(dim, position_array, max_window_size, n_coarse_bins, dd);
+    std::vector<double> smooth_result =
+        test_kde.reconstruction(data, one_over_bandwidth_array, qindex);
+    std::vector<double> log_smooth_result =
+        test_kde.log_reconstruction(data, one_over_bandwidth_array, qindex);
+    // Apply Conservation
+    test_kde.apply_conservation(data, smooth_result, qindex.domain_decomposed);
+    test_kde.apply_conservation(data, log_smooth_result, qindex.domain_decomposed);
+
+    // Check smooth result
+    for (int i = 0; i < 10; i++) {
+      if (!rtt_dsxx::soft_equiv(data[i], smooth_result[i]))
+        ITFAILS;
+      if (!rtt_dsxx::soft_equiv(data[i], log_smooth_result[i]))
+        ITFAILS;
+    }
+
+    // Energy conservation
+    if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0),
+                              std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0)))
+      ITFAILS;
+    if (!rtt_dsxx::soft_equiv(
+            std::accumulate(data.begin(), data.end(), 0.0),
+            std::accumulate(log_smooth_result.begin(), log_smooth_result.end(), 0.0)))
+      ITFAILS;
+  }
+
+  // 2D No reconstruction because the small basis in dim=1 keeps dim=2 from
+  // accumulating offset data. This test can't be achieved in the opposite
+  // direction without a small bandwidth in both dirs because the rows are
+  // exactly in line with one another, while the columns are offset.
+  {
+    std::vector<double> data{0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2};
+    std::vector<std::array<double, 3>> position_array(10, std::array<double, 3>{0.0, 0.0, 0.0});
+    for (int i = 0; i < 10; i++) {
+      position_array[i][0] = i < 5 ? i % 5 : i % 5 + 0.5;
+      position_array[i][1] = i < 5 ? 0.5 : -0.5;
+    }
+    std::vector<std::array<double, 3>> one_over_bandwidth_array(
+        10, std::array<double, 3>{1.0 / 0.1, 1.0 / 4.0, 0.0});
+    const bool dd = false;
+    // 2X bin per point
+    const size_t n_coarse_bins = 20;
+    const double max_window_size = 4.0;
+    const size_t dim = 2;
+    quick_index qindex(dim, position_array, max_window_size, n_coarse_bins, dd);
+    std::vector<double> smooth_result =
+        test_kde.reconstruction(data, one_over_bandwidth_array, qindex);
+    std::vector<double> log_smooth_result =
+        test_kde.log_reconstruction(data, one_over_bandwidth_array, qindex);
+    // Apply Conservation
+    test_kde.apply_conservation(data, smooth_result, qindex.domain_decomposed);
+    test_kde.apply_conservation(data, log_smooth_result, qindex.domain_decomposed);
+
+    // Check smooth result
+    for (int i = 0; i < 10; i++) {
+      if (!rtt_dsxx::soft_equiv(data[i], smooth_result[i]))
+        ITFAILS;
+      if (!rtt_dsxx::soft_equiv(data[i], log_smooth_result[i]))
+        ITFAILS;
+    }
+
+    // Energy conservation
+    if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0),
+                              std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0)))
+      ITFAILS;
+    if (!rtt_dsxx::soft_equiv(
+            std::accumulate(data.begin(), data.end(), 0.0),
+            std::accumulate(log_smooth_result.begin(), log_smooth_result.end(), 0.0)))
+      ITFAILS;
+  }
+
+  // 2D reconstruct only along dim=1 for each row in dim=2
+  {
+    std::vector<double> data{0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2};
+    std::vector<std::array<double, 3>> position_array(10, std::array<double, 3>{0.0, 0.0, 0.0});
+    for (int i = 0; i < 10; i++) {
+      position_array[i][0] = i < 5 ? i % 5 : i % 5 + 0.5;
+      position_array[i][1] = i < 5 ? 0.5 : -0.5;
+    }
+    std::vector<std::array<double, 3>> one_over_bandwidth_array(
+        10, std::array<double, 3>{1.0 / 4.0, 1.0 / 0.1, 0.0});
+    const bool dd = false;
+    // 2X bin per point
+    const size_t n_coarse_bins = 20;
+    const double max_window_size = 4.0;
+    const size_t dim = 2;
+    quick_index qindex(dim, position_array, max_window_size, n_coarse_bins, dd);
+    std::vector<double> smooth_result =
+        test_kde.reconstruction(data, one_over_bandwidth_array, qindex);
+    std::vector<double> log_smooth_result =
+        test_kde.log_reconstruction(data, one_over_bandwidth_array, qindex);
+    // Apply Conservation
+    test_kde.apply_conservation(data, smooth_result, qindex.domain_decomposed);
+    test_kde.apply_conservation(data, log_smooth_result, qindex.domain_decomposed);
+
+    // Check smooth result
+    for (int i = 0; i < 10; i++) {
+      if (i < 5) {
+        // 0.14 = (0.1*3+0.2*2)/5
+        if (!rtt_dsxx::soft_equiv(smooth_result[i], 0.14, 3e-2))
+          ITFAILS;
+        // 0.14 = (0.1*3+0.2*2)/5
+        if (!rtt_dsxx::soft_equiv(log_smooth_result[i], 0.14, 3e-2))
+          ITFAILS;
+      } else {
+        // 0.16 = (0.1*2+0.2*3)/5
+        if (!rtt_dsxx::soft_equiv(smooth_result[i], 0.16, 3e-2))
+          ITFAILS;
+        if (!rtt_dsxx::soft_equiv(log_smooth_result[i], 0.16, 3e-2))
+          ITFAILS;
+      }
+    }
+
+    // Energy conservation
+    if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0),
+                              std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0)))
+      ITFAILS;
+    if (!rtt_dsxx::soft_equiv(
+            std::accumulate(data.begin(), data.end(), 0.0),
+            std::accumulate(log_smooth_result.begin(), log_smooth_result.end(), 0.0)))
+      ITFAILS;
+  }
+
+  // 2D reconstruct mainly along dim=2 (rows are offset by 0.5 so we have to
+  // have a larger bandwidth in dim=1 to get any smoothing in dim=2) for each
+  // column in dim=1
+  {
+    std::vector<double> data{0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2};
+    std::vector<std::array<double, 3>> position_array(10, std::array<double, 3>{0.0, 0.0, 0.0});
+    for (int i = 0; i < 10; i++) {
+      position_array[i][0] = i < 5 ? i % 5 : i % 5 + 0.5;
+      position_array[i][1] = i < 5 ? 0.5 : -0.5;
+    }
+    std::vector<std::array<double, 3>> one_over_bandwidth_array(
+        10, std::array<double, 3>{1.0 / 0.6, 1.0 / 4.0, 0.0});
+    const bool dd = false;
+    // 2X bin per point
+    const size_t n_coarse_bins = 20;
+    const double max_window_size = 4.0;
+    const size_t dim = 2;
+    quick_index qindex(dim, position_array, max_window_size, n_coarse_bins, dd);
+    std::vector<double> smooth_result =
+        test_kde.reconstruction(data, one_over_bandwidth_array, qindex);
+    std::vector<double> log_smooth_result =
+        test_kde.log_reconstruction(data, one_over_bandwidth_array, qindex);
+    // Apply Conservation
+    test_kde.apply_conservation(data, smooth_result, qindex.domain_decomposed);
+    test_kde.apply_conservation(data, log_smooth_result, qindex.domain_decomposed);
+
+    std::vector<double> bench{0.122267, 0.181788, 0.118212, 0.181788, 0.118212,
+                              0.181788, 0.118212, 0.181788, 0.118212, 0.177733};
+
+    std::vector<double> log_bench{0.121638, 0.182268, 0.117873, 0.182268, 0.117873,
+                                  0.182268, 0.117873, 0.182268, 0.117873, 0.177799};
+
+    // Check smooth result
+    for (int i = 0; i < 10; i++) {
+      if (!rtt_dsxx::soft_equiv(smooth_result[i], bench[i], 1e-4))
+        ITFAILS;
+      if (!rtt_dsxx::soft_equiv(log_smooth_result[i], log_bench[i], 1e-4))
+        ITFAILS;
+    }
+
+    // Energy conservation
+    if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0),
+                              std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0)))
+      ITFAILS;
+    if (!rtt_dsxx::soft_equiv(
+            std::accumulate(data.begin(), data.end(), 0.0),
+            std::accumulate(log_smooth_result.begin(), log_smooth_result.end(), 0.0)))
+      ITFAILS;
+  }
+
+  // 2D Smoothed reconstruction should be close to the problem mean of 0.15
+  {
+    std::vector<double> data{0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2};
+    std::vector<std::array<double, 3>> position_array(10, std::array<double, 3>{0.0, 0.0, 0.0});
+    for (int i = 0; i < 10; i++) {
+      position_array[i][0] = i < 5 ? i % 5 : i % 5 + 0.5;
+      position_array[i][1] = i < 5 ? 0.5 : -0.5;
+    }
+    std::vector<std::array<double, 3>> one_over_bandwidth_array(
+        10, std::array<double, 3>{1.0 / 4.0, 1.0 / 4.0, 0.0});
+    const bool dd = false;
+    // 1 bin per point
+    const size_t n_coarse_bins = 10;
+    const double max_window_size = 4.0;
+    const size_t dim = 2;
+    quick_index qindex(dim, position_array, max_window_size, n_coarse_bins, dd);
+    std::vector<double> smooth_result =
+        test_kde.reconstruction(data, one_over_bandwidth_array, qindex);
+    std::vector<double> log_smooth_result =
+        test_kde.log_reconstruction(data, one_over_bandwidth_array, qindex);
+    // Apply Conservation
+    test_kde.apply_conservation(data, smooth_result, qindex.domain_decomposed);
+    test_kde.apply_conservation(data, log_smooth_result, qindex.domain_decomposed);
+
+    // Check smooth result
+    for (int i = 0; i < 10; i++) {
+      if (!rtt_dsxx::soft_equiv(smooth_result[i], 0.15, 1e-1))
+        ITFAILS;
+      if (!rtt_dsxx::soft_equiv(log_smooth_result[i], 0.15, 1e-1))
+        ITFAILS;
+    }
+
+    // Energy conservation
+    if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0),
+                              std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0)))
+      ITFAILS;
+    // Energy conservation (log reconstruction)
+    if (!rtt_dsxx::soft_equiv(
+            std::accumulate(data.begin(), data.end(), 0.0),
+            std::accumulate(log_smooth_result.begin(), log_smooth_result.end(), 0.0)))
+      ITFAILS;
+  }
+
+  // Smoothed reconstruction should be close to the problem mean of 0.15
+  {
+    std::vector<double> data{0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2};
+    std::vector<std::array<double, 3>> position_array(10, std::array<double, 3>{0.0, 0.0, 0.0});
+    for (int i = 0; i < 10; i++) {
+      position_array[i][0] = i < 5 ? i % 5 : i % 5 + 0.5;
+      position_array[i][1] = i < 5 ? 0.5 : -0.5;
+    }
+    std::vector<std::array<double, 3>> one_over_bandwidth_array(
+        10, std::array<double, 3>{1.0 / 4.0, 0.0, 0.0});
     const bool dd = false;
+    // 1 bin per point
+    const size_t n_coarse_bins = 10;
+    const double max_window_size = 4.0;
+    const size_t dim = 1;
+    quick_index qindex(dim, position_array, max_window_size, n_coarse_bins, dd);
+    std::vector<double> smooth_result =
+        test_kde.reconstruction(data, one_over_bandwidth_array, qindex);
+    std::vector<double> log_smooth_result =
+        test_kde.log_reconstruction(data, one_over_bandwidth_array, qindex);
+    // Apply Conservation
+    test_kde.apply_conservation(data, smooth_result, qindex.domain_decomposed);
+    test_kde.apply_conservation(data, log_smooth_result, qindex.domain_decomposed);
+
+    // Check smooth result
+    for (int i = 0; i < 10; i++) {
+      if (!rtt_dsxx::soft_equiv(smooth_result[i], 0.15, 1e-1))
+        ITFAILS;
+      if (!rtt_dsxx::soft_equiv(log_smooth_result[i], 0.15, 1e-1))
+        ITFAILS;
+    }
+
+    // Energy conservation
+    if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0),
+                              std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0)))
+      ITFAILS;
+    if (!rtt_dsxx::soft_equiv(
+            std::accumulate(data.begin(), data.end(), 0.0),
+            std::accumulate(log_smooth_result.begin(), log_smooth_result.end(), 0.0)))
+      ITFAILS;
+  }
+
+  // Fixed (non-variable) bandwidth test.
+  {
+    std::vector<double> data{0.01, 0.02, 0.1, 0.2, 0.1, 0.02, 0.01, 0.2, 0.1, 0.2};
+    std::vector<std::array<double, 3>> position_array(10, std::array<double, 3>{0.0, 0.0, 0.0});
+    for (int i = 0; i < 10; i++) {
+      position_array[i][0] = i < 5 ? i % 5 : i % 5 + 0.5;
+      position_array[i][1] = i < 5 ? 0.5 : -0.5;
+    }
+    std::vector<std::array<double, 3>> one_over_bandwidth_array(
+        10, std::array<double, 3>{1.0, 0.0, 0.0});
+
+    const bool dd = false;
+    // 1 bin per point
+    const size_t n_coarse_bins = 10;
+    const double max_window_size = 1.0;
+    const size_t dim = 1;
+    quick_index qindex(dim, position_array, max_window_size, n_coarse_bins, dd);
+
+    std::vector<double> smooth_result =
+        test_kde.reconstruction(data, one_over_bandwidth_array, qindex);
+    // Apply Conservation
+    test_kde.apply_conservation(data, smooth_result, qindex.domain_decomposed);
+
+    std::vector<double> bench{0.01446,   0.0172074, 0.10425,  0.172074, 0.131586,
+                              0.0172074, 0.040488,  0.172074, 0.131586, 0.15906};
+
+    // Check smooth result
+    for (int i = 0; i < 10; i++) {
+      if (!rtt_dsxx::soft_equiv(bench[i], smooth_result[i], 1e-4))
+        ITFAILS;
+    }
+
+    // Energy conservation
+    if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0),
+                              std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0)))
+      ITFAILS;
+  }
+
+  // 2D fixed (non-variable) bandwidth test.
+  {
+    std::vector<double> data{0.01, 0.02, 0.1, 0.2, 0.1, 0.02, 0.01, 0.2, 0.1, 0.2};
+    std::vector<std::array<double, 3>> position_array(10, std::array<double, 3>{0.0, 0.0, 0.0});
+    for (int i = 0; i < 10; i++) {
+      position_array[i][0] = i < 5 ? i % 5 : i % 5 + 0.5;
+      position_array[i][1] = i < 5 ? 0.5 : -0.5;
+    }
+    std::vector<std::array<double, 3>> one_over_bandwidth_array(
+        10, std::array<double, 3>{1.0, 1.0 / 4.0, 0.0});
+
+    const bool dd = false;
+    // 1 bin per point
+    const size_t n_coarse_bins = 10;
+    const double max_window_size = 4.0;
+    const size_t dim = 2;
+    quick_index qindex(dim, position_array, max_window_size, n_coarse_bins, dd);
+
+    std::vector<double> smooth_result =
+        test_kde.reconstruction(data, one_over_bandwidth_array, qindex);
+    // Apply Conservation
+    test_kde.apply_conservation(data, smooth_result, qindex.domain_decomposed);
+
+    std::vector<double> bench{0.0142901, 0.0172733, 0.104099, 0.172733, 0.130699,
+                              0.0172733, 0.0396694, 0.172733, 0.130699, 0.160531};
+
+    // Check smooth result
+    for (int i = 0; i < 10; i++) {
+      if (!rtt_dsxx::soft_equiv(bench[i], smooth_result[i], 1e-4))
+        ITFAILS;
+    }
+
+    // Energy conservation
+    if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0),
+                              std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0)))
+      ITFAILS;
+  }
+
+  // 2D variable bandwidth test.
+  {
+    std::vector<double> data{0.01, 0.02, 0.1, 0.2, 0.1, 0.02, 0.01, 0.2, 0.1, 0.2};
+    std::vector<std::array<double, 3>> position_array(10, std::array<double, 3>{0.0, 0.0, 0.0});
+    for (int i = 0; i < 10; i++) {
+      position_array[i][0] = i < 5 ? i % 5 : i % 5 + 0.5;
+      position_array[i][1] = i < 5 ? 0.5 : -0.5;
+    }
+    std::vector<std::array<double, 3>> one_over_bandwidth_array(
+        10, std::array<double, 3>{1.0, 1.0 / 4.0, 0.0});
+
+    // let's make the array a little bit more complicated
+    one_over_bandwidth_array[9] = {1.0 / 0.5, 1.0 / 4.0, 0.0};
+    one_over_bandwidth_array[3] = {1.0 / 1.0, 1.0 / 0.1, 0.0};
+    one_over_bandwidth_array[4] = {1.0 / 0.5, 1.0 / 4.0, 0.0};
+    one_over_bandwidth_array[2] = {1.0 / 0.1, 1.0 / 4.0, 0.0};
+    const bool dd = false;
+    // 1 bin per point
+    const size_t n_coarse_bins = 10;
+    const double max_window_size = 4.0;
+    const size_t dim = 2;
+    quick_index qindex(dim, position_array, max_window_size, n_coarse_bins, dd);
+
+    std::vector<double> smooth_result =
+        test_kde.reconstruction(data, one_over_bandwidth_array, qindex);
+    // Apply Conservation
+    test_kde.apply_conservation(data, smooth_result, qindex.domain_decomposed);
+
+    std::vector<double> bench{0.0131256, 0.0158657, 0.1,      0.2,      0.1,
+                              0.0158657, 0.0364369, 0.158657, 0.120049, 0.2};
+
+    // Check smooth result
+    for (int i = 0; i < 10; i++) {
+      if (!rtt_dsxx::soft_equiv(bench[i], smooth_result[i], 1e-4))
+        ITFAILS;
+    }
+
+    // Energy conservation
+    if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0),
+                              std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0)))
+      ITFAILS;
+  }
+
+  // Step bandwidth test.
+  {
+    std::vector<double> data{0.01, 0.02, 0.1, 0.2, 0.1, 0.02, 0.01, 0.2, 0.1, 0.2};
+    std::vector<std::array<double, 3>> position_array(10, std::array<double, 3>{0.0, 0.0, 0.0});
+    for (int i = 0; i < 10; i++) {
+      position_array[i][0] = i < 5 ? i % 5 : i % 5 + 0.5;
+      position_array[i][1] = i < 5 ? 0.5 : -0.5;
+    }
+    std::vector<std::array<double, 3>> one_over_bandwidth_array(
+        10, std::array<double, 3>{1.0, 0., 0.0});
+
+    // transition at 1.75
+    // let's make the array a little bit more complicated
+    one_over_bandwidth_array[0] = {1.0 / 1.75, 0.0, 0.0};
+    one_over_bandwidth_array[1] = {1.0 / 0.75, 0.0, 0.0};
+    one_over_bandwidth_array[2] = {1.0 / 0.25, 0.0, 0.0};
+    one_over_bandwidth_array[3] = {1.0 / 1.25, 0.0, 0.0};
+    one_over_bandwidth_array[4] = {1.0 / 2.25, 0.0, 0.0};
+    one_over_bandwidth_array[5] = {1.0 / 1.25, 0.0, 0.0};
+    one_over_bandwidth_array[6] = {1.0 / 0.25, 0.0, 0.0};
+    one_over_bandwidth_array[7] = {1.0 / 0.75, 0.0, 0.0};
+    one_over_bandwidth_array[8] = {1.0 / 1.75, 0.0, 0.0};
+    one_over_bandwidth_array[9] = {1.0 / 2.75, 0.0, 0.0};
+    const bool dd = false;
+    // 1 bin per point
+    const size_t n_coarse_bins = 10;
+    const double max_window_size = 3.0;
+    const size_t dim = 1;
+    quick_index qindex(dim, position_array, max_window_size, n_coarse_bins, dd);
+
+    std::vector<double> smooth_result =
+        test_kde.reconstruction(data, one_over_bandwidth_array, qindex);
+    // Apply Conservation
+    test_kde.apply_conservation(data, smooth_result, qindex.domain_decomposed);
+
+    std::vector<double> bench{0.0159208, 0.0177581, 0.1,      0.157576, 0.15506,
+                              0.0164128, 0.01,      0.177581, 0.154304, 0.155386};
+
+    // Check smooth result
+    for (int i = 0; i < 10; i++) {
+      if (!rtt_dsxx::soft_equiv(bench[i], smooth_result[i], 1e-4))
+        ITFAILS;
+    }
+
+    // Energy conservation
+    if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0),
+                              std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0)))
+      ITFAILS;
+  }
+
+  // what if half of it is negative and the mean is zero
+  {
+    std::vector<double> data{-0.2, 0.2, -0.2, 0.2, -0.2, 0.2, -0.2, 0.2, -0.2, 0.2};
+    std::vector<std::array<double, 3>> position_array(10, std::array<double, 3>{0.0, 0.0, 0.0});
+    for (int i = 0; i < 10; i++) {
+      position_array[i][0] = i < 5 ? i % 5 : i % 5 + 0.5;
+      position_array[i][1] = i < 5 ? 0.5 : -0.5;
+    }
+    std::vector<std::array<double, 3>> one_over_bandwidth_array(
+        10, std::array<double, 3>{1.0 / 4.0, 0.0, 0.0});
+
+    // the bandwidth array is left uniform for every point in this test
+    const bool dd = false;
+    // 1 bin per point
+    const size_t n_coarse_bins = 10;
+    const double max_window_size = 4.0;
+    const size_t dim = 1;
+    quick_index qindex(dim, position_array, max_window_size, n_coarse_bins, dd);
+
+    std::vector<double> smooth_result =
+        test_kde.reconstruction(data, one_over_bandwidth_array, qindex);
+    std::vector<double> log_smooth_result =
+        test_kde.log_reconstruction(data, one_over_bandwidth_array, qindex);
+    // Apply Conservation
+    test_kde.apply_conservation(data, smooth_result, qindex.domain_decomposed);
+    test_kde.apply_conservation(data, log_smooth_result, qindex.domain_decomposed);
+
+    for (int i = 0; i < 10; i++) {
+      if (!rtt_dsxx::soft_equiv(smooth_result[i], 0.0, 1e-2))
+        ITFAILS;
+      if (!rtt_dsxx::soft_equiv(log_smooth_result[i], 0.0, 1e-2))
+        ITFAILS;
+    }
+
+    // Energy conservation
+    if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0),
+                              std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0)))
+      ITFAILS;
+    // Energy conservation (log reconstruction)
+    if (!rtt_dsxx::soft_equiv(
+            std::accumulate(data.begin(), data.end(), 0.0),
+            std::accumulate(log_smooth_result.begin(), log_smooth_result.end(), 0.0)))
+      ITFAILS;
+  }
+
+  if (ut.numFails == 0) {
+    PASSMSG("KDE checks pass");
+  } else {
+    FAILMSG("KDE checks failed");
+  }
+}
+
+void test_decomposition(ParallelUnitTest &ut) {
+  kde test_kde;
+
+  // test the epan kernel
+  double value = test_kde.epan_kernel(0.0);
+  if (!rtt_dsxx::soft_equiv(value, 0.75))
+    ITFAILS;
+
+  if (rtt_c4::nodes() != 3)
+    ITFAILS;
+
+  // spherical reconstruction
+  {
+    const size_t local_size = 24;
+    const std::array<double, 3> sphere_center{0.0, -1.0, 0.0};
+    const double max_radius = 1.0;
+    const double min_radius = 0.0;
+    const double shell_min_radius = 0.5;
+    kde sphere_kde;
+    sphere_kde.set_sphere_center(sphere_center, min_radius, max_radius);
+    kde shell_kde;
+    shell_kde.set_sphere_center(sphere_center, shell_min_radius, max_radius);
+    const std::array<double, 8> radial_edges{0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1.0};
+    const std::array<double, 9> cosine_edges{-1.0, -0.75, -0.5, -0.25, 0.0, 0.25, 0.5, 0.75, 1.0};
+    const size_t data_size = radial_edges.size() * cosine_edges.size();
+    std::vector<std::array<double, 3>> position_array(data_size,
+                                                      std::array<double, 3>{0.0, 0.0, 0.0});
+
+    std::vector<double> shell_data(data_size, 0.0);
+    std::vector<double> spoke_data(data_size, 0.0);
+    size_t point_i = 0;
+    size_t ri = 0;
+    for (auto &r : radial_edges) {
+      size_t mui = 0;
+      for (auto &mu : cosine_edges) {
+        spoke_data[point_i] = static_cast<double>(mui) + 1.0;
+        shell_data[point_i] = static_cast<double>(ri) + 1.0;
+        double rel_y = r * mu;
+        position_array[point_i][0] =
+            rtt_dsxx::soft_equiv(r * r, rel_y * rel_y, 1e-6) ? 0.0 : sqrt(r * r - rel_y * rel_y);
+        position_array[point_i][1] = sphere_center[1] + rel_y;
+        point_i++;
+        mui++;
+      }
+      ri++;
+    }
+    std::vector<double> spoke_smoothed_shells{
+        2.51488, 2.99904, 2.99904, 3.69002, 3.72457, 3.69002, 2.99904, 2.99904, 2.51488,
+        2.51645, 3.00418, 3.00418, 3.72015, 3.7866,  3.72015, 3.00418, 3.00418, 2.51645,
+        2.51803, 3.00928, 3.00928, 3.74919, 3.84522, 3.74919, 3.00928, 3.00928, 2.51803,
+        2.51961, 3.01436, 3.01436, 3.77729, 3.90089, 3.77729, 3.01436, 3.01436, 2.51961,
+        5.52169, 3.04531, 3.04531, 3.93334, 4.19165, 3.93334, 3.04531, 3.04531, 5.52169,
+        5.55417, 6.52859, 6.95461, 4.19454, 4.61685, 4.19454, 6.95461, 6.52859, 5.55417,
+        7.53548, 6.56107, 7,       4.58158, 5.14978, 4.58158, 7,       6.56107, 7.53548,
+        7.56796, 8,       7.14194, 8,       6.33455, 8,       7.14194, 8,       7.56796};
+    std::vector<double> shell_smoothed_spoke{
+        4.82322, 4.82519, 4.82608, 4.82681, 4.8275, 4.82819, 4.82892, 4.8298,  4.83177,
+        4.81029, 4.81825, 4.8218,  4.82475, 4.8275, 4.83025, 4.8332,  4.83675, 4.8447,
+        4.78839, 4.80659, 4.81462, 4.82129, 4.8275, 4.83371, 4.84037, 4.84841, 4.86661,
+        4.75694, 4.79008, 4.8045,  4.81642, 4.8275, 4.83857, 4.8505,  4.86492, 4.89805,
+        4.04388, 4.22611, 4.67522, 4.75503, 5,      4.97765, 5.14326, 6.35454, 5.61112,
+        2.62832, 3.75795, 4.12091, 4.47922, 5,      6.9148,  6.66199, 7.35789, 7.02668,
+        1.66976, 3.01482, 3.72878, 4.43378, 5,      8.6895,  8.00342, 6.64018, 7.98524,
+        1,       2,       4.02682, 4.51075, 5,      5.14424, 5.62818, 8,       9};
+
+    std::vector<double> dd_const_data(local_size, 1.0);
+    std::vector<double> dd_spoke_data(local_size);
+    std::vector<double> dd_shell_data(local_size);
+    std::vector<double> dd_spoke_smoothed_shells(local_size);
+    std::vector<double> dd_shell_smoothed_spoke(local_size);
+    std::vector<std::array<double, 3>> dd_position_array(local_size);
+    for (size_t i = 0; i < local_size; i++) {
+      dd_spoke_data[i] = spoke_data[i + rtt_c4::node() * local_size];
+      dd_shell_data[i] = shell_data[i + rtt_c4::node() * local_size];
+      dd_spoke_smoothed_shells[i] = spoke_smoothed_shells[i + rtt_c4::node() * local_size];
+      dd_shell_smoothed_spoke[i] = shell_smoothed_spoke[i + rtt_c4::node() * local_size];
+      dd_position_array[i] = position_array[i + rtt_c4::node() * local_size];
+    }
+
+    // zero reconstruction array on a shell mesh
+    {
+      std::vector<double> zero_data(local_size, 0.0);
+      std::vector<std::array<double, 3>> one_over_bandwidth_array(
+          local_size, std::array<double, 3>{1.0, 1.0e12, 0.0});
+      const bool dd = true;
+      // 5 coarse bins (far fewer than one bin per point)
+      const size_t n_coarse_bins = 5;
+      const double max_window_size = 2.0;
+      const size_t dim = 2;
+      quick_index qindex(dim, dd_position_array, max_window_size, n_coarse_bins, dd);
+
+      std::vector<double> smooth_result =
+          shell_kde.reconstruction(zero_data, one_over_bandwidth_array, qindex);
+      std::vector<double> log_smooth_result =
+          shell_kde.log_reconstruction(zero_data, one_over_bandwidth_array, qindex);
+      // Apply Conservation
+      shell_kde.apply_conservation(zero_data, smooth_result, qindex.domain_decomposed);
+      shell_kde.apply_conservation(zero_data, log_smooth_result, qindex.domain_decomposed);
+
+      // Check smooth result
+      for (size_t i = 0; i < local_size; i++) {
+        if (!rtt_dsxx::soft_equiv(smooth_result[i], zero_data[i]))
+          ITFAILS;
+        if (!rtt_dsxx::soft_equiv(log_smooth_result[i], zero_data[i]))
+          ITFAILS;
+      }
+
+      double smooth_conservation = std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0);
+      rtt_c4::global_sum(smooth_conservation);
+      double log_smooth_conservation =
+          std::accumulate(log_smooth_result.begin(), log_smooth_result.end(), 0.0);
+      rtt_c4::global_sum(log_smooth_conservation);
+
+      // Energy conservation
+      if (!rtt_dsxx::soft_equiv(0.0, smooth_conservation))
+        ITFAILS;
+      if (!rtt_dsxx::soft_equiv(0.0, log_smooth_conservation))
+        ITFAILS;
+    }
+
+    // spoke reconstruction array
+    {
+      std::vector<std::array<double, 3>> one_over_bandwidth_array(
+          local_size, std::array<double, 3>{1.0, 1.0e12, 0.0});
+      const bool dd = true;
+      // two bins per point
+      const size_t n_coarse_bins = 5;
+      const double max_window_size = 2.0;
+      const size_t dim = 2;
+      quick_index qindex(dim, dd_position_array, max_window_size, n_coarse_bins, dd);
+
+      std::vector<double> smooth_result =
+          sphere_kde.reconstruction(dd_spoke_data, one_over_bandwidth_array, qindex);
+      std::vector<double> log_smooth_result =
+          sphere_kde.log_reconstruction(dd_spoke_data, one_over_bandwidth_array, qindex);
+      // Apply Conservation
+      sphere_kde.apply_conservation(dd_spoke_data, smooth_result, qindex.domain_decomposed);
+      sphere_kde.apply_conservation(dd_spoke_data, log_smooth_result, qindex.domain_decomposed);
+
+      // Check smooth result
+      for (size_t i = 0; i < local_size; i++) {
+        if (!rtt_dsxx::soft_equiv(smooth_result[i], dd_spoke_data[i]))
+          ITFAILS;
+        if (!rtt_dsxx::soft_equiv(log_smooth_result[i], dd_spoke_data[i]))
+          ITFAILS;
+      }
+
+      double smooth_conservation = std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0);
+      rtt_c4::global_sum(smooth_conservation);
+      double log_smooth_conservation =
+          std::accumulate(log_smooth_result.begin(), log_smooth_result.end(), 0.0);
+      rtt_c4::global_sum(log_smooth_conservation);
+
+      // Energy conservation
+      if (!rtt_dsxx::soft_equiv(std::accumulate(spoke_data.begin(), spoke_data.end(), 0.0),
+                                smooth_conservation))
+        ITFAILS;
+      if (!rtt_dsxx::soft_equiv(std::accumulate(spoke_data.begin(), spoke_data.end(), 0.0),
+                                log_smooth_conservation))
+        ITFAILS;
+    }
+
+    // shell reconstruction array
+    {
+      std::vector<std::array<double, 3>> one_over_bandwidth_array(
+          local_size, std::array<double, 3>{1.0e12, 1.0, 0.0});
+      const bool dd = true;
+      // two bins per point
+      const size_t n_coarse_bins = 5;
+      const double max_window_size = 2.0;
+      const size_t dim = 2;
+      quick_index qindex(dim, dd_position_array, max_window_size, n_coarse_bins, dd);
+
+      std::vector<double> smooth_result =
+          sphere_kde.reconstruction(dd_shell_data, one_over_bandwidth_array, qindex);
+      std::vector<double> log_smooth_result =
+          sphere_kde.log_reconstruction(dd_shell_data, one_over_bandwidth_array, qindex);
+      // Apply Conservation
+      sphere_kde.apply_conservation(dd_shell_data, smooth_result, qindex.domain_decomposed);
+      sphere_kde.apply_conservation(dd_shell_data, log_smooth_result, qindex.domain_decomposed);
+
+      // Check smooth result
+      for (size_t i = 0; i < local_size; i++) {
+        if (!rtt_dsxx::soft_equiv(smooth_result[i], dd_shell_data[i]))
+          ITFAILS;
+        if (!rtt_dsxx::soft_equiv(log_smooth_result[i], dd_shell_data[i]))
+          ITFAILS;
+      }
+
+      double smooth_conservation = std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0);
+      rtt_c4::global_sum(smooth_conservation);
+      double log_smooth_conservation =
+          std::accumulate(log_smooth_result.begin(), log_smooth_result.end(), 0.0);
+      rtt_c4::global_sum(log_smooth_conservation);
+
+      // Energy conservation
+      if (!rtt_dsxx::soft_equiv(std::accumulate(shell_data.begin(), shell_data.end(), 0.0),
+                                smooth_conservation))
+        ITFAILS;
+      if (!rtt_dsxx::soft_equiv(std::accumulate(shell_data.begin(), shell_data.end(), 0.0),
+                                log_smooth_conservation))
+        ITFAILS;
+    }
+
+    // Spoke smoothing on the shell array: each rank compares against its dd_spoke_smoothed_shells slice
+    {
+      std::vector<std::array<double, 3>> one_over_bandwidth_array(
+          local_size, std::array<double, 3>{1.0, 1.0e12, 0.0});
+      const bool dd = true;
+      // two bins per point
+      const size_t n_coarse_bins = 5;
+      const double max_window_size = 2.0;
+      const size_t dim = 2;
+      quick_index qindex(dim, dd_position_array, max_window_size, n_coarse_bins, dd);
+
+      std::vector<double> smooth_result =
+          sphere_kde.reconstruction(dd_shell_data, one_over_bandwidth_array, qindex);
+      std::vector<double> log_smooth_result =
+          sphere_kde.reconstruction(dd_shell_data, one_over_bandwidth_array, qindex);
+      // Apply conservation. NOTE(review): log_smooth_result is built with reconstruction(), not log_reconstruction() - confirm intended
+      sphere_kde.apply_conservation(dd_shell_data, smooth_result, qindex.domain_decomposed);
+      sphere_kde.apply_conservation(dd_shell_data, log_smooth_result, qindex.domain_decomposed);
+
+      // Check the local smooth results against the benchmark slice (tolerance argument 1e-3)
+      for (size_t i = 0; i < local_size; i++) {
+        if (!rtt_dsxx::soft_equiv(smooth_result[i], dd_spoke_smoothed_shells[i], 1e-3))
+          ITFAILS;
+        if (!rtt_dsxx::soft_equiv(log_smooth_result[i], dd_spoke_smoothed_shells[i], 1e-3))
+          ITFAILS;
+      }
+
+      double smooth_conservation = std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0);
+      rtt_c4::global_sum(smooth_conservation);
+      double log_smooth_conservation =
+          std::accumulate(log_smooth_result.begin(), log_smooth_result.end(), 0.0);
+      rtt_c4::global_sum(log_smooth_conservation);
+
+      // Energy conservation: the globally summed results must match the sum of the full shell_data array
+      if (!rtt_dsxx::soft_equiv(std::accumulate(shell_data.begin(), shell_data.end(), 0.0),
+                                smooth_conservation))
+        ITFAILS;
+      if (!rtt_dsxx::soft_equiv(std::accumulate(shell_data.begin(), shell_data.end(), 0.0),
+                                log_smooth_conservation))
+        ITFAILS;
+    }
+
+    // Shell smoothing on the spoke array: each rank compares against its dd_shell_smoothed_spoke slice
+    {
+      std::vector<std::array<double, 3>> one_over_bandwidth_array(
+          local_size, std::array<double, 3>{1.0e12, 1.0, 0.0});
+      const bool dd = true;
+      // two bins per point
+      const size_t n_coarse_bins = 5;
+      const double max_window_size = 1.0;
+      const size_t dim = 2;
+      quick_index qindex(dim, dd_position_array, max_window_size, n_coarse_bins, dd);
+
+      std::vector<double> smooth_result =
+          sphere_kde.reconstruction(dd_spoke_data, one_over_bandwidth_array, qindex);
+      std::vector<double> log_smooth_result =
+          sphere_kde.reconstruction(dd_spoke_data, one_over_bandwidth_array, qindex);
+      // Apply conservation. NOTE(review): log_smooth_result is built with reconstruction(), not log_reconstruction() - confirm intended
+      sphere_kde.apply_conservation(dd_spoke_data, smooth_result, qindex.domain_decomposed);
+      sphere_kde.apply_conservation(dd_spoke_data, log_smooth_result, qindex.domain_decomposed);
+
+      // Check the local smooth results against the benchmark slice (tolerance argument 1e-3)
+      for (size_t i = 0; i < local_size; i++) {
+        if (!rtt_dsxx::soft_equiv(smooth_result[i], dd_shell_smoothed_spoke[i], 1e-3))
+          ITFAILS;
+        if (!rtt_dsxx::soft_equiv(log_smooth_result[i], dd_shell_smoothed_spoke[i], 1e-3))
+          ITFAILS;
+      }
+
+      double smooth_conservation = std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0);
+      rtt_c4::global_sum(smooth_conservation);
+      double log_smooth_conservation =
+          std::accumulate(log_smooth_result.begin(), log_smooth_result.end(), 0.0);
+      rtt_c4::global_sum(log_smooth_conservation);
+
+      // Energy conservation: the globally summed results must match the sum of the full spoke_data array
+      if (!rtt_dsxx::soft_equiv(std::accumulate(spoke_data.begin(), spoke_data.end(), 0.0),
+                                smooth_conservation))
+        ITFAILS;
+      if (!rtt_dsxx::soft_equiv(std::accumulate(spoke_data.begin(), spoke_data.end(), 0.0),
+                                log_smooth_conservation))
+        ITFAILS;
+    }
+  }
+
+  int local_size = 3;
+  // give the odd size to the final rank to make striding easy
+  if (rtt_c4::node() == 2)
+    local_size = 4;
+
+  // No mean reconstruction because of small basis functions
+  {
+    std::vector<double> data{0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1};
+    std::vector<std::array<double, 3>> position_array(10, std::array<double, 3>{0.0, 0.0, 0.0});
+    for (int i = 0; i < 10; i++) {
+      position_array[i][0] = i < 5 ? i % 5 : i % 5 + 0.5;
+      position_array[i][1] = i < 5 ? 0.5 : -0.5;
+    }
+    std::vector<std::array<double, 3>> one_over_bandwidth_array(
+        10, std::array<double, 3>{1.0 / 0.1, 0.0, 0.0});
+
+    // map to dd arrays with simple stride
+    std::vector<double> dd_data(local_size, 0.0);
+    std::vector<std::array<double, 3>> dd_position_array(local_size,
+                                                         std::array<double, 3>{0.0, 0.0, 0.0});
+    std::vector<std::array<double, 3>> dd_one_over_bandwidth_array(
+        local_size, std::array<double, 3>{0.0, 0., 0.0});
+
+    for (int i = 0; i < local_size; i++) {
+      dd_data[i] = data[i + rtt_c4::node() * 3];
+      dd_position_array[i] = position_array[i + rtt_c4::node() * 3];
+      dd_one_over_bandwidth_array[i] = one_over_bandwidth_array[i + rtt_c4::node() * 3];
+    }
+
+    const bool dd = true;
+    // 1 bin per point
+    const size_t n_coarse_bins = 10;
+    const double max_window_size = 0.1;
+    const size_t dim = 1;
+    quick_index qindex(dim, dd_position_array, max_window_size, n_coarse_bins, dd);
+
+    std::vector<double> smooth_result =
+        test_kde.reconstruction(dd_data, dd_one_over_bandwidth_array, qindex);
+    std::vector<double> log_smooth_result =
+        test_kde.log_reconstruction(dd_data, dd_one_over_bandwidth_array, qindex);
+    // Apply Conservation
+    test_kde.apply_conservation(dd_data, smooth_result, qindex.domain_decomposed);
+    test_kde.apply_conservation(dd_data, log_smooth_result, qindex.domain_decomposed);
+
+    // Check smooth result
+    for (int i = 0; i < local_size; i++) {
+      if (!rtt_dsxx::soft_equiv(smooth_result[i], 0.1))
+        ITFAILS;
+      if (!rtt_dsxx::soft_equiv(log_smooth_result[i], 0.1))
+        ITFAILS;
+    }
+
+    double smooth_conservation = std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0);
+    rtt_c4::global_sum(smooth_conservation);
+    double log_smooth_conservation =
+        std::accumulate(log_smooth_result.begin(), log_smooth_result.end(), 0.0);
+    rtt_c4::global_sum(log_smooth_conservation);
+
+    // Energy conservation
+    if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0), smooth_conservation))
+      ITFAILS;
+    if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0),
+                              log_smooth_conservation))
+      ITFAILS;
+  }
+
+  // 2D No mean reconstruction because of small basis functions
+  {
+    std::vector<double> data{0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1};
+    std::vector<std::array<double, 3>> position_array(10, std::array<double, 3>{0.0, 0.0, 0.0});
+    for (int i = 0; i < 10; i++) {
+      position_array[i][0] = i < 5 ? i % 5 : i % 5 + 0.5;
+      position_array[i][1] = i < 5 ? 0.5 : -0.5;
+    }
+    std::vector<std::array<double, 3>> one_over_bandwidth_array(
+        10, std::array<double, 3>{1.0 / 0.1, 1.0 / 0.1, 0.0});
+
+    // map to dd arrays with simple stride
+    std::vector<double> dd_data(local_size, 0.0);
+    std::vector<std::array<double, 3>> dd_position_array(local_size,
+                                                         std::array<double, 3>{0.0, 0.0, 0.0});
+    std::vector<std::array<double, 3>> dd_one_over_bandwidth_array(
+        local_size, std::array<double, 3>{0.0, 0., 0.0});
+
+    for (int i = 0; i < local_size; i++) {
+      dd_data[i] = data[i + rtt_c4::node() * 3];
+      dd_position_array[i] = position_array[i + rtt_c4::node() * 3];
+      dd_one_over_bandwidth_array[i] = one_over_bandwidth_array[i + rtt_c4::node() * 3];
+    }
+
+    const bool dd = true;
+    // 1 bin per point
+    const size_t n_coarse_bins = 10;
+    const double max_window_size = 0.1;
+    const size_t dim = 2;
+    quick_index qindex(dim, dd_position_array, max_window_size, n_coarse_bins, dd);
+
+    std::vector<double> smooth_result =
+        test_kde.reconstruction(dd_data, dd_one_over_bandwidth_array, qindex);
+    std::vector<double> log_smooth_result =
+        test_kde.log_reconstruction(dd_data, dd_one_over_bandwidth_array, qindex);
+    // Apply Conservation
+    test_kde.apply_conservation(dd_data, smooth_result, qindex.domain_decomposed);
+    test_kde.apply_conservation(dd_data, log_smooth_result, qindex.domain_decomposed);
+
+    // Check smooth result
+    for (int i = 0; i < local_size; i++) {
+      if (!rtt_dsxx::soft_equiv(smooth_result[i], 0.1))
+        ITFAILS;
+      if (!rtt_dsxx::soft_equiv(log_smooth_result[i], 0.1))
+        ITFAILS;
+    }
+
+    double smooth_conservation = std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0);
+    rtt_c4::global_sum(smooth_conservation);
+    double log_smooth_conservation =
+        std::accumulate(log_smooth_result.begin(), log_smooth_result.end(), 0.0);
+    rtt_c4::global_sum(log_smooth_conservation);
+
+    // Energy conservation
+    if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0), smooth_conservation))
+      ITFAILS;
+    if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0),
+                              log_smooth_conservation))
+      ITFAILS;
+  }
+
+  // "Smoothed" reconstruction.
+  {
+    std::vector<double> data{0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1};
+    std::vector<std::array<double, 3>> position_array(10, std::array<double, 3>{0.0, 0.0, 0.0});
+    for (int i = 0; i < 10; i++) {
+      position_array[i][0] = i < 5 ? i % 5 : i % 5 + 0.5;
+      position_array[i][1] = i < 5 ? 0.5 : -0.5;
+    }
+    std::vector<std::array<double, 3>> one_over_bandwidth_array(
+        10, std::array<double, 3>{1.0 / 4.0, 0.0, 0.0});
+
+    // map to dd arrays with simple stride
+    std::vector<double> dd_data(local_size, 0.0);
+    std::vector<std::array<double, 3>> dd_position_array(local_size,
+                                                         std::array<double, 3>{0.0, 0.0, 0.0});
+    std::vector<std::array<double, 3>> dd_one_over_bandwidth_array(
+        local_size, std::array<double, 3>{0.0, 0., 0.0});
+
+    for (int i = 0; i < local_size; i++) {
+      dd_data[i] = data[i + rtt_c4::node() * 3];
+      dd_position_array[i] = position_array[i + rtt_c4::node() * 3];
+      dd_one_over_bandwidth_array[i] = one_over_bandwidth_array[i + rtt_c4::node() * 3];
+    }
+
+    const bool dd = true;
+    // 1/2 bin per point
+    const size_t n_coarse_bins = 5;
+    const double max_window_size = 4.0;
+    const size_t dim = 1;
+    quick_index qindex(dim, dd_position_array, max_window_size, n_coarse_bins, dd);
+
+    std::vector<double> smooth_result =
+        test_kde.reconstruction(dd_data, dd_one_over_bandwidth_array, qindex);
+    std::vector<double> log_smooth_result =
+        test_kde.log_reconstruction(dd_data, dd_one_over_bandwidth_array, qindex);
+    // Apply Conservation
+    test_kde.apply_conservation(dd_data, smooth_result, qindex.domain_decomposed);
+    test_kde.apply_conservation(dd_data, log_smooth_result, qindex.domain_decomposed);
+
+    // Check smooth result
+    for (int i = 0; i < local_size; i++) {
+      if (!rtt_dsxx::soft_equiv(smooth_result[i], 0.1))
+        ITFAILS;
+      if (!rtt_dsxx::soft_equiv(log_smooth_result[i], 0.1))
+        ITFAILS;
+    }
+
+    double smooth_conservation = std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0);
+    rtt_c4::global_sum(smooth_conservation);
+    double log_smooth_conservation =
+        std::accumulate(log_smooth_result.begin(), log_smooth_result.end(), 0.0);
+    rtt_c4::global_sum(log_smooth_conservation);
+
+    // Energy conservation
+    if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0), smooth_conservation))
+      ITFAILS;
+    if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0),
+                              log_smooth_conservation))
+      ITFAILS;
+  }
+
+  // 2D "Smoothed" reconstruction of a uniform field; every local value should remain 0.1.
+  {
+    std::vector<double> data{0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1};
+    std::vector<std::array<double, 3>> position_array(10, std::array<double, 3>{0.0, 0.0, 0.0});
+    for (int i = 0; i < 10; i++) {
+      position_array[i][0] = i < 5 ? i % 5 : i % 5 + 0.5;
+      position_array[i][1] = i < 5 ? 0.5 : -0.5;
+    }
+    std::vector<std::array<double, 3>> one_over_bandwidth_array(
+        10, std::array<double, 3>{1.0 / 4.0, 1.0 / 4.0, 0.0});
+
+    // map to dd arrays with simple stride (each rank starts at offset node * 3)
+    std::vector<double> dd_data(local_size, 0.0);
+    std::vector<std::array<double, 3>> dd_position_array(local_size,
+                                                         std::array<double, 3>{0.0, 0.0, 0.0});
+    std::vector<std::array<double, 3>> dd_one_over_bandwidth_array(
+        local_size, std::array<double, 3>{0.0, 0., 0.0});
+
+    for (int i = 0; i < local_size; i++) {
+      dd_data[i] = data[i + rtt_c4::node() * 3];
+      dd_position_array[i] = position_array[i + rtt_c4::node() * 3];
+      dd_one_over_bandwidth_array[i] = one_over_bandwidth_array[i + rtt_c4::node() * 3];
+    }
+
+    const bool dd = true;
+    // 1/2 bin per point
+    const size_t n_coarse_bins = 5;
+    const double max_window_size = 4.0;
+    const size_t dim = 2; // 2D index to match the 2D bandwidth array above (was 1)
+    quick_index qindex(dim, dd_position_array, max_window_size, n_coarse_bins, dd);
+
+    std::vector<double> smooth_result =
+        test_kde.reconstruction(dd_data, dd_one_over_bandwidth_array, qindex);
+    std::vector<double> log_smooth_result =
+        test_kde.log_reconstruction(dd_data, dd_one_over_bandwidth_array, qindex);
+    // Apply Conservation
+    test_kde.apply_conservation(dd_data, smooth_result, qindex.domain_decomposed);
+    test_kde.apply_conservation(dd_data, log_smooth_result, qindex.domain_decomposed);
+
+    // Check smooth result: a uniform 0.1 field must be reproduced on every local point
+    for (int i = 0; i < local_size; i++) {
+      if (!rtt_dsxx::soft_equiv(smooth_result[i], 0.1))
+        ITFAILS;
+      if (!rtt_dsxx::soft_equiv(log_smooth_result[i], 0.1))
+        ITFAILS;
+    }
+
+    double smooth_conservation = std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0);
+    rtt_c4::global_sum(smooth_conservation);
+    double log_smooth_conservation =
+        std::accumulate(log_smooth_result.begin(), log_smooth_result.end(), 0.0);
+    rtt_c4::global_sum(log_smooth_conservation);
+
+    // Energy conservation: the globally summed results must match the sum of the undecomposed data
+    if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0), smooth_conservation))
+      ITFAILS;
+    if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0),
+                              log_smooth_conservation))
+      ITFAILS;
+  }
+
+  // No reconstruction because of small basis functions
+  {
+    std::vector<double> data{0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2};
+    std::vector<std::array<double, 3>> position_array(10, std::array<double, 3>{0.0, 0.0, 0.0});
+    for (int i = 0; i < 10; i++) {
+      position_array[i][0] = i < 5 ? i % 5 : i % 5 + 0.5;
+      position_array[i][1] = i < 5 ? 0.5 : -0.5;
+    }
+    std::vector<std::array<double, 3>> one_over_bandwidth_array(
+        10, std::array<double, 3>{1.0 / 0.1, 0.0, 0.0});
+
+    // map to dd arrays with simple stride
+    std::vector<double> dd_data(local_size, 0.0);
+    std::vector<std::array<double, 3>> dd_position_array(local_size,
+                                                         std::array<double, 3>{0.0, 0.0, 0.0});
+    std::vector<std::array<double, 3>> dd_one_over_bandwidth_array(
+        local_size, std::array<double, 3>{0.0, 0., 0.0});
+
+    for (int i = 0; i < local_size; i++) {
+      dd_data[i] = data[i + rtt_c4::node() * 3];
+      dd_position_array[i] = position_array[i + rtt_c4::node() * 3];
+      dd_one_over_bandwidth_array[i] = one_over_bandwidth_array[i + rtt_c4::node() * 3];
+    }
+
+    const bool dd = true;
+    // 2x bin per point
+    const size_t n_coarse_bins = 20;
+    const double max_window_size = 0.1;
+    const size_t dim = 1;
+    quick_index qindex(dim, dd_position_array, max_window_size, n_coarse_bins, dd);
+
     std::vector<double> smooth_result =
-        test_kde.reconstruction<1>(data, position_array, one_over_bandwidth_array, dd);
+        test_kde.reconstruction(dd_data, dd_one_over_bandwidth_array, qindex);
+    std::vector<double> log_smooth_result =
+        test_kde.log_reconstruction(dd_data, dd_one_over_bandwidth_array, qindex);
+    // Apply Conservation
+    test_kde.apply_conservation(dd_data, smooth_result, qindex.domain_decomposed);
+    test_kde.apply_conservation(dd_data, log_smooth_result, qindex.domain_decomposed);
 
     // Check smooth result
-    for (int i = 0; i < 10; i++) {
-      if (!rtt_dsxx::soft_equiv(data[i], smooth_result[i]))
+    for (int i = 0; i < local_size; i++) {
+      if (!rtt_dsxx::soft_equiv(data[i + rtt_c4::node() * 3], smooth_result[i]))
+        ITFAILS;
+      if (!rtt_dsxx::soft_equiv(data[i + rtt_c4::node() * 3], log_smooth_result[i]))
         ITFAILS;
     }
 
+    double smooth_conservation = std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0);
+    rtt_c4::global_sum(smooth_conservation);
+    double log_smooth_conservation =
+        std::accumulate(log_smooth_result.begin(), log_smooth_result.end(), 0.0);
+    rtt_c4::global_sum(log_smooth_conservation);
+
     // Energy conservation
+    if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0), smooth_conservation))
+      ITFAILS;
     if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0),
-                              std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0)))
+                              log_smooth_conservation))
       ITFAILS;
   }
 
-  // Smoothed reconstruction should be close to the problem mean of 0.15
+  // 2D No reconstruction because of small basis functions
   {
     std::vector<double> data{0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2};
     std::vector<std::array<double, 3>> position_array(10, std::array<double, 3>{0.0, 0.0, 0.0});
@@ -116,190 +1600,337 @@ void test_replication(ParallelUnitTest &ut) {
       position_array[i][1] = i < 5 ? 0.5 : -0.5;
     }
     std::vector<std::array<double, 3>> one_over_bandwidth_array(
-        10, std::array<double, 3>{1.0 / 4.0, 0., 0.0});
-    const bool dd = false;
+        10, std::array<double, 3>{1.0 / 0.1, 1.0, 0.0});
+
+    // map to dd arrays with simple stride
+    std::vector<double> dd_data(local_size, 0.0);
+    std::vector<std::array<double, 3>> dd_position_array(local_size,
+                                                         std::array<double, 3>{0.0, 0.0, 0.0});
+    std::vector<std::array<double, 3>> dd_one_over_bandwidth_array(
+        local_size, std::array<double, 3>{0.0, 0., 0.0});
+
+    for (int i = 0; i < local_size; i++) {
+      dd_data[i] = data[i + rtt_c4::node() * 3];
+      dd_position_array[i] = position_array[i + rtt_c4::node() * 3];
+      dd_one_over_bandwidth_array[i] = one_over_bandwidth_array[i + rtt_c4::node() * 3];
+    }
+
+    const bool dd = true;
+    // 2x bin per point
+    const size_t n_coarse_bins = 20;
+    const double max_window_size = 1.0;
+    const size_t dim = 2;
+    quick_index qindex(dim, dd_position_array, max_window_size, n_coarse_bins, dd);
+
     std::vector<double> smooth_result =
-        test_kde.reconstruction<1>(data, position_array, one_over_bandwidth_array, dd);
+        test_kde.reconstruction(dd_data, dd_one_over_bandwidth_array, qindex);
+    std::vector<double> log_smooth_result =
+        test_kde.log_reconstruction(dd_data, dd_one_over_bandwidth_array, qindex);
+    // Apply Conservation
+    test_kde.apply_conservation(dd_data, smooth_result, qindex.domain_decomposed);
+    test_kde.apply_conservation(dd_data, log_smooth_result, qindex.domain_decomposed);
 
     // Check smooth result
-    for (int i = 0; i < 10; i++) {
-      if (!rtt_dsxx::soft_equiv(smooth_result[i], 0.15, 1e-1))
+    for (int i = 0; i < local_size; i++) {
+      if (!rtt_dsxx::soft_equiv(data[i + rtt_c4::node() * 3], smooth_result[i]))
+        ITFAILS;
+      if (!rtt_dsxx::soft_equiv(data[i + rtt_c4::node() * 3], log_smooth_result[i]))
         ITFAILS;
     }
 
+    double smooth_conservation = std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0);
+    rtt_c4::global_sum(smooth_conservation);
+    double log_smooth_conservation =
+        std::accumulate(log_smooth_result.begin(), log_smooth_result.end(), 0.0);
+    rtt_c4::global_sum(log_smooth_conservation);
+
     // Energy conservation
+    if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0), smooth_conservation))
+      ITFAILS;
     if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0),
-                              std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0)))
+                              log_smooth_conservation))
       ITFAILS;
   }
 
-  //  variable band width test.
+  // 2D No reconstruction because the small bandwidth in dim=1 keeps dim=2 from
+  // accumulating offset data. This test can't be achieved in the opposite
+  // direction without a small bandwidth in both dirs because the rows are
+  // exactly in line with one another, while the columns are offset.
   {
-    std::vector<double> data{0.01, 0.02, 0.1, 0.2, 0.1, 0.02, 0.01, 0.2, 0.1, 0.2};
+    std::vector<double> data{0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2};
     std::vector<std::array<double, 3>> position_array(10, std::array<double, 3>{0.0, 0.0, 0.0});
     for (int i = 0; i < 10; i++) {
       position_array[i][0] = i < 5 ? i % 5 : i % 5 + 0.5;
       position_array[i][1] = i < 5 ? 0.5 : -0.5;
     }
     std::vector<std::array<double, 3>> one_over_bandwidth_array(
-        10, std::array<double, 3>{1.0, 0., 0.0});
+        10, std::array<double, 3>{1.0 / 0.1, 1.0 / 4.0, 0.0});
 
-    const bool dd = false;
-    std::vector<double> smooth_result =
-        test_kde.reconstruction<1>(data, position_array, one_over_bandwidth_array, dd);
+    // map to dd arrays with simple stride
+    std::vector<double> dd_data(local_size, 0.0);
+    std::vector<std::array<double, 3>> dd_position_array(local_size,
+                                                         std::array<double, 3>{0.0, 0.0, 0.0});
+    std::vector<std::array<double, 3>> dd_one_over_bandwidth_array(
+        local_size, std::array<double, 3>{0.0, 0., 0.0});
 
-    std::vector<double> bench{0.01446,   0.0172074, 0.10425,  0.172074, 0.131586,
-                              0.0172074, 0.040488,  0.172074, 0.131586, 0.15906};
+    for (int i = 0; i < local_size; i++) {
+      dd_data[i] = data[i + rtt_c4::node() * 3];
+      dd_position_array[i] = position_array[i + rtt_c4::node() * 3];
+      dd_one_over_bandwidth_array[i] = one_over_bandwidth_array[i + rtt_c4::node() * 3];
+    }
+
+    const bool dd = true;
+    // 2x bin per point
+    const size_t n_coarse_bins = 20;
+    const double max_window_size = 4.0;
+    const size_t dim = 2;
+    quick_index qindex(dim, dd_position_array, max_window_size, n_coarse_bins, dd);
+
+    std::vector<double> smooth_result =
+        test_kde.reconstruction(dd_data, dd_one_over_bandwidth_array, qindex);
+    std::vector<double> log_smooth_result =
+        test_kde.log_reconstruction(dd_data, dd_one_over_bandwidth_array, qindex);
+    // Apply Conservation
+    test_kde.apply_conservation(dd_data, smooth_result, qindex.domain_decomposed);
+    test_kde.apply_conservation(dd_data, log_smooth_result, qindex.domain_decomposed);
 
     // Check smooth result
-    for (int i = 0; i < 10; i++) {
-      if (!rtt_dsxx::soft_equiv(bench[i], smooth_result[i], 1e-4))
+    for (int i = 0; i < local_size; i++) {
+      if (!rtt_dsxx::soft_equiv(data[i + rtt_c4::node() * 3], smooth_result[i]))
+        ITFAILS;
+      if (!rtt_dsxx::soft_equiv(data[i + rtt_c4::node() * 3], log_smooth_result[i]))
         ITFAILS;
     }
 
+    double smooth_conservation = std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0);
+    rtt_c4::global_sum(smooth_conservation);
+    double log_smooth_conservation =
+        std::accumulate(log_smooth_result.begin(), log_smooth_result.end(), 0.0);
+    rtt_c4::global_sum(log_smooth_conservation);
+
     // Energy conservation
+    if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0), smooth_conservation))
+      ITFAILS;
     if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0),
-                              std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0)))
+                              log_smooth_conservation))
       ITFAILS;
   }
 
-  //  variable band width test.
+  // 2D reconstruct only along dim=1 for each row in dim=2
   {
-    std::vector<double> data{0.01, 0.02, 0.1, 0.2, 0.1, 0.02, 0.01, 0.2, 0.1, 0.2};
+    std::vector<double> data{0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2};
     std::vector<std::array<double, 3>> position_array(10, std::array<double, 3>{0.0, 0.0, 0.0});
     for (int i = 0; i < 10; i++) {
       position_array[i][0] = i < 5 ? i % 5 : i % 5 + 0.5;
       position_array[i][1] = i < 5 ? 0.5 : -0.5;
     }
     std::vector<std::array<double, 3>> one_over_bandwidth_array(
-        10, std::array<double, 3>{1.0, 0., 0.0});
+        10, std::array<double, 3>{1.0 / 4.0, 1.0 / 0.1, 0.0});
 
-    // lets make the array a little bit more complicated
-    one_over_bandwidth_array[9] = {1.0 / 0.5, 0., 0.};
-    one_over_bandwidth_array[3] = {1.0 / 0.1, 0., 0.};
-    one_over_bandwidth_array[4] = {1.0 / 0.5, 0., 0.};
-    one_over_bandwidth_array[2] = {1.0 / 2.0, 0., 0.};
-    const bool dd = false;
-    std::vector<double> smooth_result =
-        test_kde.reconstruction<1>(data, position_array, one_over_bandwidth_array, dd);
+    // map to dd arrays with simple stride
+    std::vector<double> dd_data(local_size, 0.0);
+    std::vector<std::array<double, 3>> dd_position_array(local_size,
+                                                         std::array<double, 3>{0.0, 0.0, 0.0});
+    std::vector<std::array<double, 3>> dd_one_over_bandwidth_array(
+        local_size, std::array<double, 3>{0.0, 0., 0.0});
+
+    for (int i = 0; i < local_size; i++) {
+      dd_data[i] = data[i + rtt_c4::node() * 3];
+      dd_position_array[i] = position_array[i + rtt_c4::node() * 3];
+      dd_one_over_bandwidth_array[i] = one_over_bandwidth_array[i + rtt_c4::node() * 3];
+    }
+
+    const bool dd = true;
+    // 2x bin per point
+    const size_t n_coarse_bins = 20;
+    const double max_window_size = 4.0;
+    const size_t dim = 2;
+    quick_index qindex(dim, dd_position_array, max_window_size, n_coarse_bins, dd);
 
-    std::vector<double> bench{0.0139053, 0.0165473, 0.0953673, 0.194674, 0.0973372,
-                              0.0165473, 0.0389349, 0.165473,  0.126538, 0.194674};
+    std::vector<double> smooth_result =
+        test_kde.reconstruction(dd_data, dd_one_over_bandwidth_array, qindex);
+    std::vector<double> log_smooth_result =
+        test_kde.log_reconstruction(dd_data, dd_one_over_bandwidth_array, qindex);
+    // Apply Conservation
+    test_kde.apply_conservation(dd_data, smooth_result, qindex.domain_decomposed);
+    test_kde.apply_conservation(dd_data, log_smooth_result, qindex.domain_decomposed);
 
     // Check smooth result
-    for (int i = 0; i < 10; i++) {
-      if (!rtt_dsxx::soft_equiv(bench[i], smooth_result[i], 1e-4))
-        ITFAILS;
+    // (the expected value depends on the sign of the point's second coordinate)
+    for (int i = 0; i < local_size; i++) {
+      if (dd_position_array[i][1] > 0.0) {
+        // 0.14 = (0.1*3+0.2*2)/5
+        if (!rtt_dsxx::soft_equiv(smooth_result[i], 0.14, 3e-2))
+          ITFAILS;
+        if (!rtt_dsxx::soft_equiv(log_smooth_result[i], 0.14, 3e-2))
+          ITFAILS;
+      } else {
+        // 0.16 = (0.1*2+0.2*3)/5
+        if (!rtt_dsxx::soft_equiv(smooth_result[i], 0.16, 3e-2))
+          ITFAILS;
+        if (!rtt_dsxx::soft_equiv(log_smooth_result[i], 0.16, 3e-2))
+          ITFAILS;
+      }
     }
 
+    double smooth_conservation = std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0);
+    rtt_c4::global_sum(smooth_conservation);
+    double log_smooth_conservation =
+        std::accumulate(log_smooth_result.begin(), log_smooth_result.end(), 0.0);
+    rtt_c4::global_sum(log_smooth_conservation);
+
     // Energy conservation
+    if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0), smooth_conservation))
+      ITFAILS;
     if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0),
-                              std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0)))
+                              log_smooth_conservation))
       ITFAILS;
   }
 
-  //  step band width test.
+  // 2D reconstruct mainly along dim=2 (rows are offset by 0.5 so we have to
+  // have a larger bandwidth in dim=1 to get any smoothing in dim=2) for each
+  // column in dim=1
   {
-    std::vector<double> data{0.01, 0.02, 0.1, 0.2, 0.1, 0.02, 0.01, 0.2, 0.1, 0.2};
+    std::vector<double> data{0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2};
     std::vector<std::array<double, 3>> position_array(10, std::array<double, 3>{0.0, 0.0, 0.0});
     for (int i = 0; i < 10; i++) {
       position_array[i][0] = i < 5 ? i % 5 : i % 5 + 0.5;
       position_array[i][1] = i < 5 ? 0.5 : -0.5;
     }
     std::vector<std::array<double, 3>> one_over_bandwidth_array(
-        10, std::array<double, 3>{1.0, 0., 0.0});
+        10, std::array<double, 3>{1.0 / 0.6, 1.0 / 4.0, 0.0});
 
-    // transition at 1.75
-    // lets make the array a little bit more complicated
-    one_over_bandwidth_array[0] = {1.0 / 1.75, 0., 0.};
-    one_over_bandwidth_array[1] = {1.0 / 0.75, 0., 0.};
-    one_over_bandwidth_array[2] = {1.0 / 0.25, 0., 0.};
-    one_over_bandwidth_array[3] = {1.0 / 1.25, 0., 0.};
-    one_over_bandwidth_array[4] = {1.0 / 2.25, 0., 0.};
-    one_over_bandwidth_array[5] = {1.0 / 1.25, 0., 0.};
-    one_over_bandwidth_array[6] = {1.0 / 0.25, 0., 0.};
-    one_over_bandwidth_array[7] = {1.0 / 0.75, 0., 0.};
-    one_over_bandwidth_array[8] = {1.0 / 1.75, 0., 0.};
-    one_over_bandwidth_array[9] = {1.0 / 2.75, 0., 0.};
-    const bool dd = false;
-    std::vector<double> smooth_result =
-        test_kde.reconstruction<1>(data, position_array, one_over_bandwidth_array, dd);
+    std::vector<double> bench{0.122267, 0.181788, 0.118212, 0.181788, 0.118212,
+                              0.181788, 0.118212, 0.181788, 0.118212, 0.177733};
+    std::vector<double> log_bench{0.121638, 0.182268, 0.117873, 0.182268, 0.117873,
+                                  0.182268, 0.117873, 0.182268, 0.117873, 0.177799};
+
+    // map to dd arrays with simple stride
+    std::vector<double> dd_data(local_size, 0.0);
+    std::vector<std::array<double, 3>> dd_position_array(local_size,
+                                                         std::array<double, 3>{0.0, 0.0, 0.0});
+    std::vector<std::array<double, 3>> dd_one_over_bandwidth_array(
+        local_size, std::array<double, 3>{0.0, 0., 0.0});
+    std::vector<double> dd_bench(local_size, 0.0);
+    std::vector<double> log_dd_bench(local_size, 0.0);
+
+    for (int i = 0; i < local_size; i++) {
+      dd_data[i] = data[i + rtt_c4::node() * 3];
+      dd_position_array[i] = position_array[i + rtt_c4::node() * 3];
+      dd_one_over_bandwidth_array[i] = one_over_bandwidth_array[i + rtt_c4::node() * 3];
+      dd_bench[i] = bench[i + rtt_c4::node() * 3];
+      log_dd_bench[i] = log_bench[i + rtt_c4::node() * 3];
+    }
 
-    std::vector<double> bench{0.01588,   0.0177126, 0.101982, 0.157172, 0.154663,
-                              0.0163707, 0.010198,  0.177126, 0.153908, 0.154988};
+    const bool dd = true;
+    // 2x bin per point
+    const size_t n_coarse_bins = 20;
+    const double max_window_size = 4.0;
+    const size_t dim = 2;
+    quick_index qindex(dim, dd_position_array, max_window_size, n_coarse_bins, dd);
+
+    std::vector<double> smooth_result =
+        test_kde.reconstruction(dd_data, dd_one_over_bandwidth_array, qindex);
+    std::vector<double> log_smooth_result =
+        test_kde.log_reconstruction(dd_data, dd_one_over_bandwidth_array, qindex);
+    // Apply Conservation
+    test_kde.apply_conservation(dd_data, smooth_result, qindex.domain_decomposed);
+    test_kde.apply_conservation(dd_data, log_smooth_result, qindex.domain_decomposed);
 
     // Check smooth result
-    for (int i = 0; i < 10; i++) {
-      if (!rtt_dsxx::soft_equiv(bench[i], smooth_result[i], 1e-4))
+    for (int i = 0; i < local_size; i++) {
+      if (!rtt_dsxx::soft_equiv(smooth_result[i], dd_bench[i], 1e-4))
+        ITFAILS;
+      if (!rtt_dsxx::soft_equiv(log_smooth_result[i], log_dd_bench[i], 1e-4))
         ITFAILS;
     }
 
+    double smooth_conservation = std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0);
+    rtt_c4::global_sum(smooth_conservation);
+    double log_smooth_conservation =
+        std::accumulate(log_smooth_result.begin(), log_smooth_result.end(), 0.0);
+    rtt_c4::global_sum(log_smooth_conservation);
+
     // Energy conservation
+    if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0), smooth_conservation))
+      ITFAILS;
     if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0),
-                              std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0)))
+                              log_smooth_conservation))
       ITFAILS;
   }
 
-  // what if half of it is negative and the mean is zero
+  // Smoothed reconstruction should be close to the problem mean of 0.15
   {
-    std::vector<double> data{-0.2, 0.2, -0.2, 0.2, -0.2, 0.2, -0.2, 0.2, -0.2, 0.2};
+    std::vector<double> data{0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2};
     std::vector<std::array<double, 3>> position_array(10, std::array<double, 3>{0.0, 0.0, 0.0});
     for (int i = 0; i < 10; i++) {
       position_array[i][0] = i < 5 ? i % 5 : i % 5 + 0.5;
       position_array[i][1] = i < 5 ? 0.5 : -0.5;
     }
     std::vector<std::array<double, 3>> one_over_bandwidth_array(
-        10, std::array<double, 3>{1.0 / 4.0, 0., 0.0});
+        10, std::array<double, 3>{1.0 / 4.0, 0.0, 0.0});
+
+    // map to dd arrays with simple stride
+    std::vector<double> dd_data(local_size, 0.0);
+    std::vector<std::array<double, 3>> dd_position_array(local_size,
+                                                         std::array<double, 3>{0.0, 0.0, 0.0});
+    std::vector<std::array<double, 3>> dd_one_over_bandwidth_array(
+        local_size, std::array<double, 3>{0.0, 0., 0.0});
+
+    for (int i = 0; i < local_size; i++) {
+      dd_data[i] = data[i + rtt_c4::node() * 3];
+      dd_position_array[i] = position_array[i + rtt_c4::node() * 3];
+      dd_one_over_bandwidth_array[i] = one_over_bandwidth_array[i + rtt_c4::node() * 3];
+    }
+
+    const bool dd = true;
+    // 1x bin per point
+    const size_t n_coarse_bins = 10;
+    // window size must be 2X bigger than the biggest bandwidth
+    const double max_window_size = 9.0;
+    const size_t dim = 1;
+    quick_index qindex(dim, dd_position_array, max_window_size, n_coarse_bins, dd);
 
-    // lets make the array a little bit more complicated
-    const bool dd = false;
     std::vector<double> smooth_result =
-        test_kde.reconstruction<1>(data, position_array, one_over_bandwidth_array, dd);
+        test_kde.reconstruction(dd_data, dd_one_over_bandwidth_array, qindex);
+    std::vector<double> log_smooth_result =
+        test_kde.log_reconstruction(dd_data, dd_one_over_bandwidth_array, qindex);
+    // Apply Conservation
+    test_kde.apply_conservation(dd_data, smooth_result, qindex.domain_decomposed);
+    test_kde.apply_conservation(dd_data, log_smooth_result, qindex.domain_decomposed);
 
-    for (int i = 0; i < 10; i++) {
-      if (!rtt_dsxx::soft_equiv(smooth_result[i], 0.0, 1e-2))
+    // Check smooth result
+    for (int i = 0; i < local_size; i++) {
+      if (!rtt_dsxx::soft_equiv(smooth_result[i], 0.15, 1e-2))
+        ITFAILS;
+      if (!rtt_dsxx::soft_equiv(log_smooth_result[i], 0.15, 1e-2))
         ITFAILS;
     }
 
+    double smooth_conservation = std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0);
+    rtt_c4::global_sum(smooth_conservation);
+    double log_smooth_conservation =
+        std::accumulate(log_smooth_result.begin(), log_smooth_result.end(), 0.0);
+    rtt_c4::global_sum(log_smooth_conservation);
+
     // Energy conservation
+    if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0), smooth_conservation))
+      ITFAILS;
     if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0),
-                              std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0)))
+                              log_smooth_conservation))
       ITFAILS;
   }
 
-  if (ut.numFails == 0) {
-    PASSMSG("KDE checks pass");
-  } else {
-    FAILMSG("KDE checks failed");
-  }
-}
-
-void test_decomposition(ParallelUnitTest &ut) {
-  kde<kde_coordinates::CART> test_kde;
-
-  // test the epan kernel
-  double value = test_kde.epan_kernel(0.0);
-  if (!rtt_dsxx::soft_equiv(value, 0.75))
-    ITFAILS;
-
-  if (rtt_c4::nodes() != 3)
-    ITFAILS;
-
-  int local_size = 3;
-  // give the odd size to the final rank to make striding easy
-  if (rtt_c4::node() == 2)
-    local_size = 4;
-
-  // No mean reconstruction because of small basis functions
+  // 2D Smoothed reconstruction should be close to the problem mean of 0.15
   {
-    std::vector<double> data{0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1};
+    std::vector<double> data{0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2};
     std::vector<std::array<double, 3>> position_array(10, std::array<double, 3>{0.0, 0.0, 0.0});
     for (int i = 0; i < 10; i++) {
       position_array[i][0] = i < 5 ? i % 5 : i % 5 + 0.5;
       position_array[i][1] = i < 5 ? 0.5 : -0.5;
     }
     std::vector<std::array<double, 3>> one_over_bandwidth_array(
-        10, std::array<double, 3>{1.0 / 0.1, 0., 0.0});
+        10, std::array<double, 3>{1.0 / 4.0, 1.0 / 4.0, 0.0});
 
     // map to dd arrays with simple stride
     std::vector<double> dd_data(local_size, 0.0);
@@ -315,33 +1946,55 @@ void test_decomposition(ParallelUnitTest &ut) {
     }
 
     const bool dd = true;
+    // 1x bin per point
+    const size_t n_coarse_bins = 10;
+    // window size must be 2X bigger than the biggest bandwidth
+    const double max_window_size = 9.0;
+    const size_t dim = 2;
+    quick_index qindex(dim, dd_position_array, max_window_size, n_coarse_bins, dd);
+
     std::vector<double> smooth_result =
-        test_kde.reconstruction<1>(dd_data, dd_position_array, dd_one_over_bandwidth_array, dd);
+        test_kde.reconstruction(dd_data, dd_one_over_bandwidth_array, qindex);
+    std::vector<double> log_smooth_result =
+        test_kde.log_reconstruction(dd_data, dd_one_over_bandwidth_array, qindex);
+    // Apply Conservation
+    test_kde.apply_conservation(dd_data, smooth_result, qindex.domain_decomposed);
+    test_kde.apply_conservation(dd_data, log_smooth_result, qindex.domain_decomposed);
 
     // Check smooth result
     for (int i = 0; i < local_size; i++) {
-      if (!rtt_dsxx::soft_equiv(smooth_result[i], 0.1))
+      if (!rtt_dsxx::soft_equiv(smooth_result[i], 0.15, 1e-2))
+        ITFAILS;
+      if (!rtt_dsxx::soft_equiv(log_smooth_result[i], 0.15, 1e-2))
         ITFAILS;
     }
 
     double smooth_conservation = std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0);
     rtt_c4::global_sum(smooth_conservation);
+    double log_smooth_conservation =
+        std::accumulate(log_smooth_result.begin(), log_smooth_result.end(), 0.0);
+    rtt_c4::global_sum(log_smooth_conservation);
 
     // Energy conservation
     if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0), smooth_conservation))
       ITFAILS;
+    if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0),
+                              log_smooth_conservation))
+      ITFAILS;
   }
 
-  // "Smoothed" reconstruction.
+  // No variable bandwidth test.
   {
-    std::vector<double> data{0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1};
+    std::vector<double> data{0.01, 0.02, 0.1, 0.2, 0.1, 0.02, 0.01, 0.2, 0.1, 0.2};
     std::vector<std::array<double, 3>> position_array(10, std::array<double, 3>{0.0, 0.0, 0.0});
     for (int i = 0; i < 10; i++) {
       position_array[i][0] = i < 5 ? i % 5 : i % 5 + 0.5;
       position_array[i][1] = i < 5 ? 0.5 : -0.5;
     }
     std::vector<std::array<double, 3>> one_over_bandwidth_array(
-        10, std::array<double, 3>{1.0 / 4.0, 0., 0.0});
+        10, std::array<double, 3>{1.0, 0.0, 0.0});
+    std::vector<double> bench{0.01446,   0.0172074, 0.10425,  0.172074, 0.131586,
+                              0.0172074, 0.040488,  0.172074, 0.131586, 0.15906};
 
     // map to dd arrays with simple stride
     std::vector<double> dd_data(local_size, 0.0);
@@ -357,12 +2010,20 @@ void test_decomposition(ParallelUnitTest &ut) {
     }
 
     const bool dd = true;
+    // 1x bin per point
+    const size_t n_coarse_bins = 10;
+    const double max_window_size = 1.0;
+    const size_t dim = 1;
+    quick_index qindex(dim, dd_position_array, max_window_size, n_coarse_bins, dd);
+
     std::vector<double> smooth_result =
-        test_kde.reconstruction<1>(dd_data, dd_position_array, dd_one_over_bandwidth_array, dd);
+        test_kde.reconstruction(dd_data, dd_one_over_bandwidth_array, qindex);
+    // Apply Conservation
+    test_kde.apply_conservation(dd_data, smooth_result, qindex.domain_decomposed);
 
     // Check smooth result
     for (int i = 0; i < local_size; i++) {
-      if (!rtt_dsxx::soft_equiv(smooth_result[i], 0.1))
+      if (!rtt_dsxx::soft_equiv(bench[i + rtt_c4::node() * 3], smooth_result[i], 1e-4))
         ITFAILS;
     }
 
@@ -374,16 +2035,18 @@ void test_decomposition(ParallelUnitTest &ut) {
       ITFAILS;
   }
 
-  // No reconstruction because of small basis functions
+  // 2D no variable bandwidth test.
   {
-    std::vector<double> data{0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2};
+    std::vector<double> data{0.01, 0.02, 0.1, 0.2, 0.1, 0.02, 0.01, 0.2, 0.1, 0.2};
     std::vector<std::array<double, 3>> position_array(10, std::array<double, 3>{0.0, 0.0, 0.0});
     for (int i = 0; i < 10; i++) {
       position_array[i][0] = i < 5 ? i % 5 : i % 5 + 0.5;
       position_array[i][1] = i < 5 ? 0.5 : -0.5;
     }
     std::vector<std::array<double, 3>> one_over_bandwidth_array(
-        10, std::array<double, 3>{1.0 / 0.1, 0., 0.0});
+        10, std::array<double, 3>{1.0, 1.0 / 4.0, 0.0});
+    std::vector<double> bench{0.0142901, 0.0172733, 0.104099, 0.172733, 0.130699,
+                              0.0172733, 0.0396694, 0.172733, 0.130699, 0.160531};
 
     // map to dd arrays with simple stride
     std::vector<double> dd_data(local_size, 0.0);
@@ -399,12 +2062,20 @@ void test_decomposition(ParallelUnitTest &ut) {
     }
 
     const bool dd = true;
+    // 1x bin per point
+    const size_t n_coarse_bins = 10;
+    const double max_window_size = 4.0;
+    const size_t dim = 2;
+    quick_index qindex(dim, dd_position_array, max_window_size, n_coarse_bins, dd);
+
     std::vector<double> smooth_result =
-        test_kde.reconstruction<1>(dd_data, dd_position_array, dd_one_over_bandwidth_array, dd);
+        test_kde.reconstruction(dd_data, dd_one_over_bandwidth_array, qindex);
+    // Apply Conservation
+    test_kde.apply_conservation(dd_data, smooth_result, qindex.domain_decomposed);
 
     // Check smooth result
     for (int i = 0; i < local_size; i++) {
-      if (!rtt_dsxx::soft_equiv(data[i + rtt_c4::node() * 3], smooth_result[i]))
+      if (!rtt_dsxx::soft_equiv(bench[i + rtt_c4::node() * 3], smooth_result[i], 1e-4))
         ITFAILS;
     }
 
@@ -416,16 +2087,25 @@ void test_decomposition(ParallelUnitTest &ut) {
       ITFAILS;
   }
 
-  // Smoothed reconstruction should be close to the problem mean of 0.15
+  // Variable bandwidth test.
   {
-    std::vector<double> data{0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2};
+    std::vector<double> data{0.01, 0.02, 0.1, 0.2, 0.1, 0.02, 0.01, 0.2, 0.1, 0.2};
     std::vector<std::array<double, 3>> position_array(10, std::array<double, 3>{0.0, 0.0, 0.0});
     for (int i = 0; i < 10; i++) {
       position_array[i][0] = i < 5 ? i % 5 : i % 5 + 0.5;
       position_array[i][1] = i < 5 ? 0.5 : -0.5;
     }
     std::vector<std::array<double, 3>> one_over_bandwidth_array(
-        10, std::array<double, 3>{1.0 / 4.0, 0., 0.0});
+        10, std::array<double, 3>{1.0, 0.0, 0.0});
+
+    // let's make the array a little bit more complicated
+    one_over_bandwidth_array[9] = {1.0 / 0.5, 0.0, 0.0};
+    one_over_bandwidth_array[3] = {1.0 / 0.1, 0.0, 0.0};
+    one_over_bandwidth_array[4] = {1.0 / 0.5, 0.0, 0.0};
+    one_over_bandwidth_array[2] = {1.0 / 2.0, 0.0, 0.0};
+
+    std::vector<double> bench{0.0135142, 0.0160819, 0.0926847, 0.2,      0.1,
+                              0.0160819, 0.0378397, 0.160819,  0.122979, 0.2};
 
     // map to dd arrays with simple stride
     std::vector<double> dd_data(local_size, 0.0);
@@ -441,12 +2121,21 @@ void test_decomposition(ParallelUnitTest &ut) {
     }
 
     const bool dd = true;
+    // 1x bin per point
+    const size_t n_coarse_bins = 10;
+    // max window size must be 2x the max bandwidth size
+    const double max_window_size = 4.0;
+    const size_t dim = 1;
+    quick_index qindex(dim, dd_position_array, max_window_size, n_coarse_bins, dd);
+
     std::vector<double> smooth_result =
-        test_kde.reconstruction<1>(dd_data, dd_position_array, dd_one_over_bandwidth_array, dd);
+        test_kde.reconstruction(dd_data, dd_one_over_bandwidth_array, qindex);
+    // Apply Conservation
+    test_kde.apply_conservation(dd_data, smooth_result, qindex.domain_decomposed);
 
     // Check smooth result
     for (int i = 0; i < local_size; i++) {
-      if (!rtt_dsxx::soft_equiv(smooth_result[i], 0.15, 1e-2))
+      if (!rtt_dsxx::soft_equiv(bench[i + rtt_c4::node() * 3], smooth_result[i], 1e-4))
         ITFAILS;
     }
 
@@ -458,7 +2147,7 @@ void test_decomposition(ParallelUnitTest &ut) {
       ITFAILS;
   }
 
-  //  variable band width test.
+  // 2D variable band width test.
   {
     std::vector<double> data{0.01, 0.02, 0.1, 0.2, 0.1, 0.02, 0.01, 0.2, 0.1, 0.2};
     std::vector<std::array<double, 3>> position_array(10, std::array<double, 3>{0.0, 0.0, 0.0});
@@ -467,9 +2156,16 @@ void test_decomposition(ParallelUnitTest &ut) {
       position_array[i][1] = i < 5 ? 0.5 : -0.5;
     }
     std::vector<std::array<double, 3>> one_over_bandwidth_array(
-        10, std::array<double, 3>{1.0, 0., 0.0});
-    std::vector<double> bench{0.01446,   0.0172074, 0.10425,  0.172074, 0.131586,
-                              0.0172074, 0.040488,  0.172074, 0.131586, 0.15906};
+        10, std::array<double, 3>{1.0, 1.0 / 4.0, 0.0});
+
+    // let's make the array a little bit more complicated
+    one_over_bandwidth_array[9] = {1.0 / 0.5, 1.0 / 4.0, 0.0};
+    one_over_bandwidth_array[3] = {1.0 / 1.0, 1.0 / 0.1, 0.0};
+    one_over_bandwidth_array[4] = {1.0 / 0.5, 1.0 / 4.0, 0.0};
+    one_over_bandwidth_array[2] = {1.0 / 0.1, 1.0 / 4.0, 0.0};
+
+    std::vector<double> bench{0.0131256, 0.0158657, 0.1,      0.2,      0.1,
+                              0.0158657, 0.0364369, 0.158657, 0.120049, 0.2};
 
     // map to dd arrays with simple stride
     std::vector<double> dd_data(local_size, 0.0);
@@ -485,8 +2181,17 @@ void test_decomposition(ParallelUnitTest &ut) {
     }
 
     const bool dd = true;
+    // 1x bin per point
+    const size_t n_coarse_bins = 10;
+    // max window size must be 2x the max bandwidth size
+    const double max_window_size = 4.0;
+    const size_t dim = 2;
+    quick_index qindex(dim, dd_position_array, max_window_size, n_coarse_bins, dd);
+
     std::vector<double> smooth_result =
-        test_kde.reconstruction<1>(dd_data, dd_position_array, dd_one_over_bandwidth_array, dd);
+        test_kde.reconstruction(dd_data, dd_one_over_bandwidth_array, qindex);
+    // Apply Conservation
+    test_kde.apply_conservation(dd_data, smooth_result, qindex.domain_decomposed);
 
     // Check smooth result
     for (int i = 0; i < local_size; i++) {
@@ -502,25 +2207,16 @@ void test_decomposition(ParallelUnitTest &ut) {
       ITFAILS;
   }
 
-  //  variable band width test.
+  // what if half of it is negative and the mean is zero for a reconstruction
   {
-    std::vector<double> data{0.01, 0.02, 0.1, 0.2, 0.1, 0.02, 0.01, 0.2, 0.1, 0.2};
+    std::vector<double> data{-0.2, 0.2, -0.2, 0.2, -0.2, 0.2, -0.2, 0.2, -0.2, 0.2};
     std::vector<std::array<double, 3>> position_array(10, std::array<double, 3>{0.0, 0.0, 0.0});
     for (int i = 0; i < 10; i++) {
       position_array[i][0] = i < 5 ? i % 5 : i % 5 + 0.5;
       position_array[i][1] = i < 5 ? 0.5 : -0.5;
     }
     std::vector<std::array<double, 3>> one_over_bandwidth_array(
-        10, std::array<double, 3>{1.0, 0., 0.0});
-
-    // lets make the array a little bit more complicated
-    one_over_bandwidth_array[9] = {1.0 / 0.5, 0., 0.};
-    one_over_bandwidth_array[3] = {1.0 / 0.1, 0., 0.};
-    one_over_bandwidth_array[4] = {1.0 / 0.5, 0., 0.};
-    one_over_bandwidth_array[2] = {1.0 / 2.0, 0., 0.};
-
-    std::vector<double> bench{0.0139053, 0.0165473, 0.0953673, 0.194674, 0.0973372,
-                              0.0165473, 0.0389349, 0.165473,  0.126538, 0.194674};
+        10, std::array<double, 3>{1.0 / 4.0, 0.0, 0.0});
 
     // map to dd arrays with simple stride
     std::vector<double> dd_data(local_size, 0.0);
@@ -536,25 +2232,45 @@ void test_decomposition(ParallelUnitTest &ut) {
     }
 
     const bool dd = true;
+    // 1x bin per point
+    const size_t n_coarse_bins = 10;
+    const double max_window_size = 4.0;
+    const size_t dim = 1;
+    quick_index qindex(dim, dd_position_array, max_window_size, n_coarse_bins, dd);
+
     std::vector<double> smooth_result =
-        test_kde.reconstruction<1>(dd_data, dd_position_array, dd_one_over_bandwidth_array, dd);
+        test_kde.reconstruction(dd_data, dd_one_over_bandwidth_array, qindex);
+    std::vector<double> log_smooth_result =
+        test_kde.log_reconstruction(dd_data, dd_one_over_bandwidth_array, qindex);
+    // Apply Conservation
+    test_kde.apply_conservation(dd_data, smooth_result, qindex.domain_decomposed);
+    test_kde.apply_conservation(dd_data, log_smooth_result, qindex.domain_decomposed);
 
-    // Check smooth result
     for (int i = 0; i < local_size; i++) {
-      if (!rtt_dsxx::soft_equiv(bench[i + rtt_c4::node() * 3], smooth_result[i], 1e-4))
+      if (!rtt_dsxx::soft_equiv(smooth_result[i], 0.0, 1e-2))
+        ITFAILS;
+      if (!rtt_dsxx::soft_equiv(log_smooth_result[i], 0.0, 1e-2))
         ITFAILS;
     }
 
     double smooth_conservation = std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0);
     rtt_c4::global_sum(smooth_conservation);
+    double log_smooth_conservation =
+        std::accumulate(log_smooth_result.begin(), log_smooth_result.end(), 0.0);
+    rtt_c4::global_sum(log_smooth_conservation);
 
     // Energy conservation
     if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0), smooth_conservation))
       ITFAILS;
+    if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0),
+                              log_smooth_conservation))
+      ITFAILS;
   }
 
   // what if half of it is negative and the mean is zero for a reconstruction
+  // what if we also reflect the bc
   {
+    kde refl_kde({true, true, true, true, true, true});
     std::vector<double> data{-0.2, 0.2, -0.2, 0.2, -0.2, 0.2, -0.2, 0.2, -0.2, 0.2};
     std::vector<std::array<double, 3>> position_array(10, std::array<double, 3>{0.0, 0.0, 0.0});
     for (int i = 0; i < 10; i++) {
@@ -562,7 +2278,7 @@ void test_decomposition(ParallelUnitTest &ut) {
       position_array[i][1] = i < 5 ? 0.5 : -0.5;
     }
     std::vector<std::array<double, 3>> one_over_bandwidth_array(
-        10, std::array<double, 3>{1.0 / 4.0, 0., 0.0});
+        10, std::array<double, 3>{1.0 / 4.0, 1.0 / 4.0, 0.0});
 
     // map to dd arrays with simple stride
     std::vector<double> dd_data(local_size, 0.0);
@@ -578,20 +2294,39 @@ void test_decomposition(ParallelUnitTest &ut) {
     }
 
     const bool dd = true;
+    // 1x bin per point
+    const size_t n_coarse_bins = 10;
+    const double max_window_size = 4.0;
+    const size_t dim = 2;
+    quick_index qindex(dim, dd_position_array, max_window_size, n_coarse_bins, dd);
+
     std::vector<double> smooth_result =
-        test_kde.reconstruction<1>(dd_data, dd_position_array, dd_one_over_bandwidth_array, dd);
+        refl_kde.reconstruction(dd_data, dd_one_over_bandwidth_array, qindex);
+    std::vector<double> log_smooth_result =
+        refl_kde.log_reconstruction(dd_data, dd_one_over_bandwidth_array, qindex);
+    // Apply Conservation
+    refl_kde.apply_conservation(dd_data, smooth_result, qindex.domain_decomposed);
+    refl_kde.apply_conservation(dd_data, log_smooth_result, qindex.domain_decomposed);
 
     for (int i = 0; i < local_size; i++) {
       if (!rtt_dsxx::soft_equiv(smooth_result[i], 0.0, 1e-2))
         ITFAILS;
+      if (!rtt_dsxx::soft_equiv(log_smooth_result[i], 0.0, 1e-2))
+        ITFAILS;
     }
 
     double smooth_conservation = std::accumulate(smooth_result.begin(), smooth_result.end(), 0.0);
     rtt_c4::global_sum(smooth_conservation);
+    double log_smooth_conservation =
+        std::accumulate(log_smooth_result.begin(), log_smooth_result.end(), 0.0);
+    rtt_c4::global_sum(log_smooth_conservation);
 
     // Energy conservation
     if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0), smooth_conservation))
       ITFAILS;
+    if (!rtt_dsxx::soft_equiv(std::accumulate(data.begin(), data.end(), 0.0),
+                              log_smooth_conservation))
+      ITFAILS;
   }
 
   if (ut.numFails == 0) {
diff --git a/src/kde/test/tstquick_index.cc b/src/kde/test/tstquick_index.cc
new file mode 100644
index 0000000000..cc6cb13929
--- /dev/null
+++ b/src/kde/test/tstquick_index.cc
@@ -0,0 +1,1731 @@
+//--------------------------------------------*-C++-*---------------------------------------------//
+/*!
+ * \file   kde/test/tstquick_index.cc
+ * \author Mathew Cleveland
+ * \date   Aug. 10th 2021
+ * \brief  quick_index testing function
+ * \note   Copyright (C) 2021 Triad National Security, LLC., All rights reserved.
+ */
+//------------------------------------------------------------------------------------------------//
+
+#include "kde/quick_index.hh"
+#include "c4/ParallelUnitTest.hh"
+#include "ds++/Release.hh"
+#include "ds++/dbc.hh"
+#include <numeric>
+
+using namespace rtt_dsxx;
+using namespace rtt_c4;
+using namespace rtt_kde;
+
+//------------------------------------------------------------------------------------------------//
+// TESTS
+//------------------------------------------------------------------------------------------------//
+//
+// Builds a quick_index over ten replicated points (dd = false, dim = 1) and checks its public
+// state, the global bounding box and the coarse-bin -> point-index map.
+void test_replication(ParallelUnitTest &ut) {
+  {
+    // x = 0, 1, 2, 3, 4 at y = 0.5 followed by x = 0.5, 1.5, 2.5, 3.5, 4.5 at y = -0.5
+    std::vector<std::array<double, 3>> position_array(10, std::array<double, 3>{0.0, 0.0, 0.0});
+    for (int i = 0; i < 10; i++) {
+      position_array[i][0] = i < 5 ? i % 5 : i % 5 + 0.5;
+      position_array[i][1] = i < 5 ? 0.5 : -0.5;
+    }
+    // in rep mode the max window size does nothing so set it large
+    const double max_window_size = 100.0;
+    const size_t bins_per_dim = 10UL;
+    const bool dd = false;
+    const size_t dim = 1;
+    quick_index qindex(dim, position_array, max_window_size, bins_per_dim, dd);
+    // Check public data
+    //------------------------
+    if (qindex.domain_decomposed)
+      ITFAILS;
+    if (qindex.coarse_bin_resolution != bins_per_dim)
+      ITFAILS;
+    if (!soft_equiv(qindex.max_window_size, max_window_size))
+      ITFAILS;
+    // Check global bounding box (with dim = 1 the y and z extents are expected to stay zero)
+    if (!soft_equiv(qindex.bounding_box_min[0], 0.0))
+      ITFAILS;
+    if (!soft_equiv(qindex.bounding_box_min[1], 0.0))
+      ITFAILS;
+    if (!soft_equiv(qindex.bounding_box_min[2], 0.0))
+      ITFAILS;
+    if (!soft_equiv(qindex.bounding_box_max[0], 4.5))
+      ITFAILS;
+    if (!soft_equiv(qindex.bounding_box_max[1], 0.0))
+      ITFAILS;
+    if (!soft_equiv(qindex.bounding_box_max[2], 0.0))
+      ITFAILS;
+    // Check local coarse_index map against a gold map (coarse bin -> point indices in that bin)
+    const std::map<size_t, std::vector<size_t>> gold_map{
+        {0, {0}}, {1, {5}}, {2, {1}}, {3, {6}}, {4, {2}},
+        {5, {7}}, {6, {3}}, {7, {8}}, {8, {4}}, {9, {9}}};
+    if (gold_map.size() != qindex.coarse_index_map.size())
+      ITFAILS;
+    for (auto &map : qindex.coarse_index_map) {
+      const auto gold = gold_map.find(map.first);
+      // unexpected bin or length mismatch: fail and skip so the loop never reads past gold
+      if (gold == gold_map.end() || gold->second.size() != map.second.size()) {
+        ITFAILS;
+        continue;
+      }
+      for (size_t i = 0; i < map.second.size(); i++)
+        if (gold->second[i] != map.second[i])
+          ITFAILS;
+    }
+  }
+
+  if (ut.numFails == 0) {
+    PASSMSG("quick_index checks pass");
+  } else {
+    FAILMSG("quick_index checks failed");
+  }
+}
+
+void test_decomposition(ParallelUnitTest &ut) {
+  if (rtt_c4::nodes() != 3)
+    ITFAILS;
+
+  int local_size = 3;
+  // give the odd size to the final rank to make striding easy
+  if (rtt_c4::node() == 2)
+    local_size = 4;
+
+  {
+    std::vector<double> data{3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0};
+    std::vector<std::array<double, 3>> position_array(10, std::array<double, 3>{0.0, 0.0, 0.0});
+    // This cell spatial ordering is difficult for this setup in that every
+    // rank requires a sub set of information from every other rank
+    for (int i = 0; i < 10; i++) {
+      position_array[i][0] = i < 5 ? i % 5 : i % 5 + 0.5;
+      position_array[i][1] = i < 5 ? 0.5 : -0.5;
+    }
+
+    // map to dd arrays with simple stride
+    std::vector<double> dd_data(local_size, 0.0);
+    std::vector<std::vector<double>> dd_3x_data(3, std::vector<double>(local_size, 0.0));
+    std::vector<std::array<double, 3>> dd_position_array(local_size,
+                                                         std::array<double, 3>{0.0, 0.0, 0.0});
+
+    for (int i = 0; i < local_size; i++) {
+      dd_data[i] = data[i + rtt_c4::node() * 3];
+      dd_3x_data[0][i] = data[i + rtt_c4::node() * 3];
+      dd_3x_data[1][i] = data[i + rtt_c4::node() * 3] + 1;
+      dd_3x_data[2][i] = -data[i + rtt_c4::node() * 3];
+      dd_position_array[i] = position_array[i + rtt_c4::node() * 3];
+    }
+
+    // in dd mode the max window size determines the number of ghost cells
+    const double max_window_size = 1.0;
+    const size_t bins_per_dim = 10UL;
+    const bool dd = true;
+    const size_t dim = 1;
+    quick_index qindex(dim, dd_position_array, max_window_size, bins_per_dim, dd);
+
+    // Check the local state data
+    if (!qindex.domain_decomposed)
+      ITFAILS;
+    if (qindex.coarse_bin_resolution != bins_per_dim)
+      ITFAILS;
+    if (!soft_equiv(qindex.max_window_size, max_window_size))
+      ITFAILS;
+    if (!soft_equiv(qindex.bounding_box_min[0], 0.0))
+      ITFAILS;
+    if (!soft_equiv(qindex.bounding_box_min[1], 0.0))
+      ITFAILS;
+    if (!soft_equiv(qindex.bounding_box_min[2], 0.0))
+      ITFAILS;
+    if (!soft_equiv(qindex.bounding_box_max[0], 4.5))
+      ITFAILS;
+    if (!soft_equiv(qindex.bounding_box_max[1], 0.0))
+      ITFAILS;
+    if (!soft_equiv(qindex.bounding_box_max[2], 0.0))
+      ITFAILS;
+    // Check local coarse_index map
+    // local indexing will not match the domain replicated case (different
+    // number of points per rank so different local indexing)
+    std::map<size_t, std::vector<size_t>> gold_map;
+    if (rtt_c4::node() == 0) {
+      gold_map[0] = {0}; // 0.0
+      gold_map[2] = {1}; // 1.0
+      gold_map[4] = {2}; // 2.0
+    } else if (rtt_c4::node() == 1) {
+      gold_map[6] = {0}; // 3.0
+      gold_map[8] = {1}; // 4.0
+      gold_map[1] = {2}; // 0.5
+    } else {
+      gold_map[3] = {0}; // 1.5
+      gold_map[5] = {1}; // 2.5
+      gold_map[7] = {2}; // 3.5
+      gold_map[9] = {3}; // 4.5
+    }
+    if (gold_map.size() != qindex.coarse_index_map.size())
+      ITFAILS;
+    for (auto &map : qindex.coarse_index_map)
+      for (size_t i = 0; i < map.second.size(); i++)
+        if (gold_map[map.first][i] != map.second[i])
+          ITFAILS;
+
+    // Check Domain Decomposed Data
+    // local bounding box extends beyond local data based on the window size
+    if (rtt_c4::node() == 0) {
+      if (!soft_equiv(qindex.local_bounding_box_min[0], 0.0))
+        ITFAILS;
+      if (!soft_equiv(qindex.local_bounding_box_min[1], 0.0))
+        ITFAILS;
+      if (!soft_equiv(qindex.local_bounding_box_min[2], 0.0))
+        ITFAILS;
+      if (!soft_equiv(qindex.local_bounding_box_max[0], 2.5))
+        ITFAILS;
+      if (!soft_equiv(qindex.local_bounding_box_max[1], 0.0))
+        ITFAILS;
+      if (!soft_equiv(qindex.local_bounding_box_max[2], 0.0))
+        ITFAILS;
+    } else if (rtt_c4::node() == 1) {
+      if (!soft_equiv(qindex.local_bounding_box_min[0], 0.0))
+        ITFAILS;
+      if (!soft_equiv(qindex.local_bounding_box_min[1], 0.0))
+        ITFAILS;
+      if (!soft_equiv(qindex.local_bounding_box_min[2], 0.0))
+        ITFAILS;
+      if (!soft_equiv(qindex.local_bounding_box_max[0], 4.5))
+        ITFAILS;
+      if (!soft_equiv(qindex.local_bounding_box_max[1], 0.0))
+        ITFAILS;
+      if (!soft_equiv(qindex.local_bounding_box_max[2], 0.0))
+        ITFAILS;
+    } else {
+      if (!soft_equiv(qindex.local_bounding_box_min[0], 1.0))
+        ITFAILS;
+      if (!soft_equiv(qindex.local_bounding_box_min[1], 0.0))
+        ITFAILS;
+      if (!soft_equiv(qindex.local_bounding_box_min[2], 0.0))
+        ITFAILS;
+      if (!soft_equiv(qindex.local_bounding_box_max[0], 4.5))
+        ITFAILS;
+      if (!soft_equiv(qindex.local_bounding_box_max[1], 0.0))
+        ITFAILS;
+      if (!soft_equiv(qindex.local_bounding_box_max[2], 0.0))
+        ITFAILS;
+    }
+    // global bins that span the local domains
+    std::vector<size_t> gold_bins;
+    if (rtt_c4::node() == 0) {
+      gold_bins = {0, 1, 2, 3, 4, 5};
+    } else if (rtt_c4::node() == 1) {
+      gold_bins = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+    } else {
+      gold_bins = {2, 3, 4, 5, 6, 7, 8, 9};
+    }
+
+    if (gold_bins.size() != qindex.local_bins.size())
+      ITFAILS;
+    for (size_t i = 0; i < qindex.local_bins.size(); i++)
+      if (gold_bins[i] != qindex.local_bins[i])
+        ITFAILS;
+
+    // local ghost index map (how to find general location of the ghost data)
+    std::map<size_t, std::vector<size_t>> gold_ghost_index_map;
+    if (rtt_c4::node() == 0) {
+      gold_ghost_index_map[1] = {0}; // 0.5 from rank 1
+      gold_ghost_index_map[3] = {1}; // 1.5 from rank 2
+      gold_ghost_index_map[5] = {2}; // 2.5 from rank 2
+    } else if (rtt_c4::node() == 1) {
+      gold_ghost_index_map[0] = {0}; // 0.0 from rank 0
+      gold_ghost_index_map[2] = {1}; // 1.0 from rank 0
+      gold_ghost_index_map[4] = {2}; // 2.0 from rank 0
+      gold_ghost_index_map[3] = {3}; // 1.5 from rank 2
+      gold_ghost_index_map[5] = {4}; // 2.5 from rank 2
+      gold_ghost_index_map[7] = {5}; // 3.5 from rank 2
+      gold_ghost_index_map[9] = {6}; // 4.5 from rank 2
+    } else {
+      gold_ghost_index_map[2] = {0}; // 1.0 from rank 0
+      gold_ghost_index_map[4] = {1}; // 2.0 from rank 0
+      gold_ghost_index_map[6] = {2}; // 3.0 from rank 1
+      gold_ghost_index_map[8] = {3}; // 4.0 from rank 1
+    }
+    if (gold_ghost_index_map.size() != qindex.local_ghost_index_map.size())
+      ITFAILS;
+    for (auto &map : qindex.local_ghost_index_map) {
+      if (gold_ghost_index_map[map.first].size() != map.second.size())
+        ITFAILS;
+      for (size_t i = 0; i < map.second.size(); i++) {
+        if (map.second[i] != gold_ghost_index_map[map.first][i])
+          ITFAILS;
+      }
+    }
+
+    // Check the local ghost locations (this tangentially checks the private
+    // put_window_map which is used to build this local data).
+    std::vector<std::array<double, 3>> gold_ghost_locations;
+    if (rtt_c4::node() == 0) {
+      gold_ghost_locations = {{0.5, 0.0, 0.0}, {1.5, 0.0, 0.0}, {2.5, 0.0, 0.0}};
+    } else if (rtt_c4::node() == 1) {
+      gold_ghost_locations = {{0.0, 0.0, 0.0}, {1.0, 0.0, 0.0}, {2.0, 0.0, 0.0}, {1.5, 0.0, 0.0},
+                              {2.5, 0.0, 0.0}, {3.5, 0.0, 0.0}, {4.5, 0.0, 0.0}};
+    } else {
+      gold_ghost_locations = {{1.0, 0.0, 0.0}, {2.0, 0.0, 0.0}, {3.0, 0.0, 0.0}, {4.0, 0.0, 0.0}};
+    }
+    if (gold_ghost_locations.size() != qindex.local_ghost_locations.size())
+      ITFAILS;
+    for (size_t i = 0; i < qindex.local_ghost_locations.size(); i++) {
+      for (size_t d = 0; d < 3; d++)
+        if (!rtt_dsxx::soft_equiv(gold_ghost_locations[i][d], qindex.local_ghost_locations[i][d]))
+          ITFAILS;
+    }
+
+    // Check collect_ghost_data vector call
+    std::vector<double> ghost_data(qindex.local_ghost_buffer_size, 0.0);
+    qindex.collect_ghost_data(dd_data, ghost_data);
+    std::vector<std::vector<double>> ghost_3x_data(
+        3, std::vector<double>(qindex.local_ghost_buffer_size, 0.0));
+    qindex.collect_ghost_data(dd_3x_data, ghost_3x_data);
+
+    std::vector<double> gold_ghost_data;
+    std::vector<std::vector<double>> gold_3x_ghost_data(3);
+    if (rtt_c4::node() == 0) {
+      gold_ghost_data = {8.0, 9.0, 10.0};
+      gold_3x_ghost_data[0] = {8.0, 9.0, 10.0};
+      gold_3x_ghost_data[1] = {9.0, 10.0, 11.0};
+      gold_3x_ghost_data[2] = {-8.0, -9.0, -10.0};
+    } else if (rtt_c4::node() == 1) {
+      gold_ghost_data = {3.0, 4.0, 5.0, 9.0, 10.0, 11.0, 12.0};
+      gold_3x_ghost_data[0] = {3.0, 4.0, 5.0, 9.0, 10.0, 11.0, 12.0};
+      gold_3x_ghost_data[1] = {4.0, 5.0, 6.0, 10.0, 11.0, 12.0, 13.0};
+      gold_3x_ghost_data[2] = {-3.0, -4.0, -5.0, -9.0, -10.0, -11.0, -12.0};
+    } else {
+      gold_ghost_data = {4.0, 5.0, 6.0, 7.0};
+      gold_3x_ghost_data[0] = {4.0, 5.0, 6.0, 7.0};
+      gold_3x_ghost_data[1] = {5.0, 6.0, 7.0, 8.0};
+      gold_3x_ghost_data[2] = {-4.0, -5.0, -6.0, -7.0};
+    }
+    for (size_t i = 0; i < ghost_data.size(); i++)
+      if (!rtt_dsxx::soft_equiv(ghost_data[i], gold_ghost_data[i]))
+        ITFAILS;
+    for (size_t i = 0; i < ghost_3x_data[0].size(); i++)
+      if (!rtt_dsxx::soft_equiv(ghost_3x_data[0][i], gold_3x_ghost_data[0][i]))
+        ITFAILS;
+    for (size_t i = 0; i < ghost_3x_data[1].size(); i++)
+      if (!rtt_dsxx::soft_equiv(ghost_3x_data[1][i], gold_3x_ghost_data[1][i]))
+        ITFAILS;
+    for (size_t i = 0; i < ghost_3x_data[2].size(); i++)
+      if (!rtt_dsxx::soft_equiv(ghost_3x_data[2][i], gold_3x_ghost_data[2][i]))
+        ITFAILS;
+
+    // check max window mapping (more bins then data) functions
+    {
+      // build a length=1.0 window around the first point on each node
+      const std::array<double, 3> min{dd_position_array[0][0] - 0.5, 0.0, 0.0};
+      const std::array<double, 3> max{dd_position_array[0][0] + 0.5, 0.0, 0.0};
+      const std::array<size_t, 3> bin_sizes{5, 0, 0};
+      const bool normalize = false;
+      const bool bias = false;
+      const std::string map_type = "max";
+      std::vector<double> window_data(5, 0.0);
+      qindex.map_data_to_grid_window(dd_data, ghost_data, window_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+      std::vector<std::vector<double>> window_3x_data(3, std::vector<double>(5, 0.0));
+      qindex.map_data_to_grid_window(dd_3x_data, ghost_3x_data, window_3x_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+
+      std::vector<double> gold_window_data;
+      std::vector<std::vector<double>> gold_window_3x_data(3);
+      if (rtt_c4::node() == 0) {
+        gold_window_data = {0.0, 0.0, 3.0, 0.0, 8.0};
+        gold_window_3x_data[0] = {0.0, 0.0, 3.0, 0.0, 8.0};
+        gold_window_3x_data[1] = {0.0, 0.0, 4.0, 0.0, 9.0};
+        gold_window_3x_data[2] = {0.0, 0.0, -3.0, 0.0, -8.0};
+      } else if (rtt_c4::node() == 1) {
+        gold_window_data = {10.0, 0.0, 6.0, 0.0, 11.0};
+        gold_window_3x_data[0] = {10.0, 0.0, 6.0, 0.0, 11.0};
+        gold_window_3x_data[1] = {11.0, 0.0, 7.0, 0.0, 12.0};
+        gold_window_3x_data[2] = {-10.0, 0.0, -6.0, 0.0, -11.0};
+      } else {
+        gold_window_data = {4.0, 0.0, 9.0, 0.0, 5.0};
+        gold_window_3x_data[0] = {4.0, 0.0, 9.0, 0.0, 5.0};
+        gold_window_3x_data[1] = {5.0, 0.0, 10.0, 0.0, 6.0};
+        gold_window_3x_data[2] = {-4.0, 0.0, -9.0, 0.0, -5.0};
+      }
+
+      for (size_t i = 0; i < bin_sizes[0]; i++)
+        if (!rtt_dsxx::soft_equiv(window_data[i], gold_window_data[i]))
+          ITFAILS;
+      for (size_t v = 0; v < 3; v++)
+        for (size_t i = 0; i < bin_sizes[0]; i++)
+          if (!rtt_dsxx::soft_equiv(window_3x_data[v][i], gold_window_3x_data[v][i]))
+            ITFAILS;
+    }
+
+    // check min window mapping (more bins then data) functions
+    {
+      // build a length=1.0 window around the first point on each node
+      const std::array<double, 3> min{dd_position_array[0][0] - 0.5, 0.0, 0.0};
+      const std::array<double, 3> max{dd_position_array[0][0] + 0.5, 0.0, 0.0};
+      const std::array<size_t, 3> bin_sizes{5, 0, 0};
+      const bool normalize = false;
+      const bool bias = false;
+      const std::string map_type = "min";
+      std::vector<double> window_data(5, 0.0);
+      qindex.map_data_to_grid_window(dd_data, ghost_data, window_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+      std::vector<std::vector<double>> window_3x_data(3, std::vector<double>(5, 0.0));
+      qindex.map_data_to_grid_window(dd_3x_data, ghost_3x_data, window_3x_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+      std::vector<double> gold_window_data;
+      std::vector<std::vector<double>> gold_window_3x_data(3);
+      if (rtt_c4::node() == 0) {
+        gold_window_data = {0.0, 0.0, 3.0, 0.0, 8.0};
+        gold_window_3x_data[0] = {0.0, 0.0, 3.0, 0.0, 8.0};
+        gold_window_3x_data[1] = {0.0, 0.0, 4.0, 0.0, 9.0};
+        gold_window_3x_data[2] = {0.0, 0.0, -3.0, 0.0, -8.0};
+      } else if (rtt_c4::node() == 1) {
+        gold_window_data = {10.0, 0.0, 6.0, 0.0, 11.0};
+        gold_window_3x_data[0] = {10.0, 0.0, 6.0, 0.0, 11.0};
+        gold_window_3x_data[1] = {11.0, 0.0, 7.0, 0.0, 12.0};
+        gold_window_3x_data[2] = {-10.0, 0.0, -6.0, 0.0, -11.0};
+      } else {
+        gold_window_data = {4.0, 0.0, 9.0, 0.0, 5.0};
+        gold_window_3x_data[0] = {4.0, 0.0, 9.0, 0.0, 5.0};
+        gold_window_3x_data[1] = {5.0, 0.0, 10.0, 0.0, 6.0};
+        gold_window_3x_data[2] = {-4.0, 0.0, -9.0, 0.0, -5.0};
+      }
+
+      for (size_t i = 0; i < bin_sizes[0]; i++)
+        if (!rtt_dsxx::soft_equiv(window_data[i], gold_window_data[i]))
+          ITFAILS;
+      for (size_t v = 0; v < 3; v++)
+        for (size_t i = 0; i < bin_sizes[0]; i++)
+          if (!rtt_dsxx::soft_equiv(window_3x_data[v][i], gold_window_3x_data[v][i]))
+            ITFAILS;
+    }
+
+    // check min_fill window mapping (more bins then data) functions
+    {
+      // build a length=1.0 window around the first point on each node
+      const std::array<double, 3> min{dd_position_array[0][0] - 0.5, 0.0, 0.0};
+      const std::array<double, 3> max{dd_position_array[0][0] + 0.5, 0.0, 0.0};
+      const std::array<size_t, 3> bin_sizes{5, 0, 0};
+      const bool normalize = false;
+      const bool bias = false;
+      const std::string map_type = "min_fill";
+      std::vector<double> window_data(5, 0.0);
+      qindex.map_data_to_grid_window(dd_data, ghost_data, window_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+      std::vector<std::vector<double>> window_3x_data(3, std::vector<double>(5, 0.0));
+      qindex.map_data_to_grid_window(dd_3x_data, ghost_3x_data, window_3x_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+      std::vector<double> gold_window_data;
+      std::vector<std::vector<double>> gold_window_3x_data(3);
+      if (rtt_c4::node() == 0) {
+        gold_window_data = {0.0, 0.0, 3.0, 3.0, 8.0};
+        gold_window_3x_data[0] = {0.0, 0.0, 3.0, 3.0, 8.0};
+        gold_window_3x_data[1] = {0.0, 0.0, 4.0, 4.0, 9.0};
+        gold_window_3x_data[2] = {0.0, 0.0, -3.0, -3.0, -8.0};
+      } else if (rtt_c4::node() == 1) {
+        gold_window_data = {10.0, 10.0, 6.0, 6.0, 11.0};
+        gold_window_3x_data[0] = {10.0, 10.0, 6.0, 6.0, 11.0};
+        gold_window_3x_data[1] = {11.0, 11.0, 7.0, 7.0, 12.0};
+        gold_window_3x_data[2] = {-10.0, -10.0, -6.0, -6.0, -11.0};
+      } else {
+        gold_window_data = {4.0, 4.0, 9.0, 9.0, 5.0};
+        gold_window_3x_data[0] = {4.0, 4.0, 9.0, 9.0, 5.0};
+        gold_window_3x_data[1] = {5.0, 5.0, 10.0, 10.0, 6.0};
+        gold_window_3x_data[2] = {-4.0, -4.0, -9.0, -9.0, -5.0};
+      }
+
+      for (size_t i = 0; i < bin_sizes[0]; i++)
+        if (!rtt_dsxx::soft_equiv(window_data[i], gold_window_data[i]))
+          ITFAILS;
+      for (size_t v = 0; v < 3; v++)
+        for (size_t i = 0; i < bin_sizes[0]; i++)
+          if (!rtt_dsxx::soft_equiv(window_3x_data[v][i], gold_window_3x_data[v][i]))
+            ITFAILS;
+    }
+
+    // check ave window mapping (more bins then data) functions
+    {
+      // build a length=1.0 window around the first point on each node
+      const std::array<double, 3> min{dd_position_array[0][0] - 0.5, 0.0, 0.0};
+      const std::array<double, 3> max{dd_position_array[0][0] + 0.5, 0.0, 0.0};
+      const std::array<size_t, 3> bin_sizes{5, 0, 0};
+      const bool normalize = false;
+      const bool bias = false;
+      const std::string map_type = "ave";
+      std::vector<double> window_data(5, 0.0);
+      qindex.map_data_to_grid_window(dd_data, ghost_data, window_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+      std::vector<std::vector<double>> window_3x_data(3, std::vector<double>(5, 0.0));
+      qindex.map_data_to_grid_window(dd_3x_data, ghost_3x_data, window_3x_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+      std::vector<double> gold_window_data;
+      std::vector<std::vector<double>> gold_window_3x_data(3);
+      if (rtt_c4::node() == 0) {
+        gold_window_data = {0.0, 0.0, 3.0, 0.0, 8.0};
+        gold_window_3x_data[0] = {0.0, 0.0, 3.0, 0.0, 8.0};
+        gold_window_3x_data[1] = {0.0, 0.0, 4.0, 0.0, 9.0};
+        gold_window_3x_data[2] = {0.0, 0.0, -3.0, 0.0, -8.0};
+      } else if (rtt_c4::node() == 1) {
+        gold_window_data = {10.0, 0.0, 6.0, 0.0, 11.0};
+        gold_window_3x_data[0] = {10.0, 0.0, 6.0, 0.0, 11.0};
+        gold_window_3x_data[1] = {11.0, 0.0, 7.0, 0.0, 12.0};
+        gold_window_3x_data[2] = {-10.0, 0.0, -6.0, 0.0, -11.0};
+      } else {
+        gold_window_data = {4.0, 0.0, 9.0, 0.0, 5.0};
+        gold_window_3x_data[0] = {4.0, 0.0, 9.0, 0.0, 5.0};
+        gold_window_3x_data[1] = {5.0, 0.0, 10.0, 0.0, 6.0};
+        gold_window_3x_data[2] = {-4.0, 0.0, -9.0, 0.0, -5.0};
+      }
+
+      for (size_t i = 0; i < bin_sizes[0]; i++)
+        if (!rtt_dsxx::soft_equiv(window_data[i], gold_window_data[i]))
+          ITFAILS;
+      for (size_t v = 0; v < 3; v++)
+        for (size_t i = 0; i < bin_sizes[0]; i++)
+          if (!rtt_dsxx::soft_equiv(window_3x_data[v][i], gold_window_3x_data[v][i]))
+            ITFAILS;
+    }
+
+    // check ave_fill window mapping (more bins then data) functions
+    {
+      // build a length=1.0 window around the first point on each node
+      const std::array<double, 3> min{dd_position_array[0][0] - 0.5, 0.0, 0.0};
+      const std::array<double, 3> max{dd_position_array[0][0] + 0.5, 0.0, 0.0};
+      const std::array<size_t, 3> bin_sizes{5, 0, 0};
+      const bool normalize = false;
+      const bool bias = false;
+      const std::string map_type = "ave_fill";
+      std::vector<double> window_data(5, 0.0);
+      qindex.map_data_to_grid_window(dd_data, ghost_data, window_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+      std::vector<std::vector<double>> window_3x_data(3, std::vector<double>(5, 0.0));
+      qindex.map_data_to_grid_window(dd_3x_data, ghost_3x_data, window_3x_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+      std::vector<double> gold_window_data;
+      std::vector<std::vector<double>> gold_window_3x_data(3);
+      if (rtt_c4::node() == 0) {
+        gold_window_data = {0.0, 0.0, 3.0, 3.0, 8.0};
+        gold_window_3x_data[0] = {0.0, 0.0, 3.0, 3.0, 8.0};
+        gold_window_3x_data[1] = {0.0, 0.0, 4.0, 4.0, 9.0};
+        gold_window_3x_data[2] = {0.0, 0.0, -3.0, -3.0, -8.0};
+      } else if (rtt_c4::node() == 1) {
+        gold_window_data = {10.0, 10.0, 6.0, 6.0, 11.0};
+        gold_window_3x_data[0] = {10.0, 10.0, 6.0, 6.0, 11.0};
+        gold_window_3x_data[1] = {11.0, 11.0, 7.0, 7.0, 12.0};
+        gold_window_3x_data[2] = {-10.0, -10.0, -6.0, -6.0, -11.0};
+      } else {
+        gold_window_data = {4.0, 4.0, 9.0, 9.0, 5.0};
+        gold_window_3x_data[0] = {4.0, 4.0, 9.0, 9.0, 5.0};
+        gold_window_3x_data[1] = {5.0, 5.0, 10.0, 10.0, 6.0};
+        gold_window_3x_data[2] = {-4.0, -4.0, -9.0, -9.0, -5.0};
+      }
+
+      for (size_t i = 0; i < bin_sizes[0]; i++)
+        if (!rtt_dsxx::soft_equiv(window_data[i], gold_window_data[i]))
+          ITFAILS;
+      for (size_t v = 0; v < 3; v++)
+        for (size_t i = 0; i < bin_sizes[0]; i++)
+          if (!rtt_dsxx::soft_equiv(window_3x_data[v][i], gold_window_3x_data[v][i]))
+            ITFAILS;
+    }
+
+    // check normalize window mapping (more bins then data) functions
+    {
+      // build a length=1.0 window around the first point on each node
+      const std::array<double, 3> min{dd_position_array[0][0] - 0.5, 0.0, 0.0};
+      const std::array<double, 3> max{dd_position_array[0][0] + 0.5, 0.0, 0.0};
+      const std::array<size_t, 3> bin_sizes{5, 0, 0};
+      const bool normalize = true;
+      const bool bias = false;
+      const std::string map_type = "ave";
+      std::vector<double> window_data(5, 0.0);
+      qindex.map_data_to_grid_window(dd_data, ghost_data, window_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+      std::vector<std::vector<double>> window_3x_data(3, std::vector<double>(5, 0.0));
+      qindex.map_data_to_grid_window(dd_3x_data, ghost_3x_data, window_3x_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+      std::vector<double> gold_window_data;
+      std::vector<std::vector<double>> gold_window_3x_data(3);
+      if (rtt_c4::node() == 0) {
+        gold_window_data = {0.0, 0.0, 3.0 / 11.0, 0.0, 8.0 / 11.0};
+        gold_window_3x_data[0] = {0.0, 0.0, 3.0 / 11.0, 0.0, 8.0 / 11.0};
+        gold_window_3x_data[1] = {0.0, 0.0, 4.0 / 13.0, 0.0, 9.0 / 13.0};
+        gold_window_3x_data[2] = {0.0, 0.0, 3.0 / 11.0, 0.0, 8.0 / 11.0};
+      } else if (rtt_c4::node() == 1) {
+        gold_window_data = {10.0 / 27.0, 0.0, 6.0 / 27.0, 0.0, 11.0 / 27.0};
+        gold_window_3x_data[0] = {10.0 / 27.0, 0.0, 6.0 / 27.0, 0.0, 11.0 / 27.0};
+        gold_window_3x_data[1] = {11.0 / 30.0, 0.0, 7.0 / 30.0, 0.0, 12.0 / 30.0};
+        gold_window_3x_data[2] = {10.0 / 27.0, 0.0, 6.0 / 27.0, 0.0, 11.0 / 27.0};
+      } else {
+        gold_window_data = {4.0 / 18.0, 0.0, 9.0 / 18.0, 0.0, 5.0 / 18.0};
+        gold_window_3x_data[0] = {4.0 / 18.0, 0.0, 9.0 / 18.0, 0.0, 5.0 / 18.0};
+        gold_window_3x_data[1] = {5.0 / 21.0, 0.0, 10.0 / 21.0, 0.0, 6.0 / 21.0};
+        gold_window_3x_data[2] = {4.0 / 18.0, 0.0, 9.0 / 18.0, 0.0, 5.0 / 18.0};
+      }
+
+      for (size_t v = 0; v < 3; v++)
+        for (size_t i = 0; i < bin_sizes[0]; i++)
+          if (!rtt_dsxx::soft_equiv(window_3x_data[v][i], gold_window_3x_data[v][i]))
+            ITFAILS;
+      for (size_t i = 0; i < bin_sizes[0]; i++)
+        if (!rtt_dsxx::soft_equiv(window_data[i], gold_window_data[i]))
+          ITFAILS;
+    }
+
+    // check ave_fill + normalize window mapping (more bins then data) functions
+    {
+      // build a length=1.0 window around the first point on each node
+      const std::array<double, 3> min{dd_position_array[0][0] - 0.5, 0.0, 0.0};
+      const std::array<double, 3> max{dd_position_array[0][0] + 0.5, 0.0, 0.0};
+      const std::array<size_t, 3> bin_sizes{5, 0, 0};
+      const bool normalize = true;
+      const bool bias = false;
+      const std::string map_type = "ave_fill";
+      std::vector<double> window_data(5, 0.0);
+      qindex.map_data_to_grid_window(dd_data, ghost_data, window_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+      std::vector<std::vector<double>> window_3x_data(3, std::vector<double>(5, 0.0));
+      qindex.map_data_to_grid_window(dd_3x_data, ghost_3x_data, window_3x_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+      std::vector<double> gold_window_data;
+      std::vector<std::vector<double>> gold_window_3x_data(3);
+      if (rtt_c4::node() == 0) {
+        gold_window_data = {0.0, 0.0, 3.0 / 14.0, 3.0 / 14.0, 8.0 / 14.0};
+        gold_window_3x_data[0] = {0.0, 0.0, 3.0 / 14.0, 3.0 / 14.0, 8.0 / 14.0};
+        gold_window_3x_data[1] = {0.0, 0.0, 4.0 / 17.0, 4.0 / 17, 9.0 / 17.0};
+        gold_window_3x_data[2] = {0.0, 0.0, 3.0 / 14.0, 3.0 / 14.0, 8.0 / 14.0};
+      } else if (rtt_c4::node() == 1) {
+        gold_window_data = {10.0 / 43.0, 10.0 / 43.0, 6.0 / 43.0, 6.0 / 43.0, 11.0 / 43.0};
+        gold_window_3x_data[0] = {10.0 / 43.0, 10.0 / 43.0, 6.0 / 43.0, 6.0 / 43.0, 11.0 / 43.0};
+        gold_window_3x_data[1] = {11.0 / 48.0, 11.0 / 48.0, 7.0 / 48.0, 7.0 / 48.0, 12.0 / 48.0};
+        gold_window_3x_data[2] = {10.0 / 43.0, 10.0 / 43.0, 6.0 / 43.0, 6.0 / 43.0, 11.0 / 43.0};
+      } else {
+        gold_window_data = {4.0 / 31.0, 4.0 / 31.0, 9.0 / 31.0, 9.0 / 31.0, 5.0 / 31.0};
+        gold_window_3x_data[0] = {4.0 / 31.0, 4.0 / 31.0, 9.0 / 31.0, 9.0 / 31.0, 5.0 / 31.0};
+        gold_window_3x_data[1] = {5.0 / 36.0, 5.0 / 36.0, 10.0 / 36.0, 10.0 / 36.0, 6.0 / 36.0};
+        gold_window_3x_data[2] = {4.0 / 31.0, 4.0 / 31.0, 9.0 / 31.0, 9.0 / 31.0, 5.0 / 31.0};
+      }
+
+      for (size_t v = 0; v < 3; v++)
+        for (size_t i = 0; i < bin_sizes[0]; i++)
+          if (!rtt_dsxx::soft_equiv(window_3x_data[v][i], gold_window_3x_data[v][i]))
+            ITFAILS;
+      for (size_t i = 0; i < bin_sizes[0]; i++)
+        if (!rtt_dsxx::soft_equiv(window_data[i], gold_window_data[i]))
+          ITFAILS;
+    }
+
+    // check bias window mapping (more bins then data) functions
+    {
+      // build a length=1.0 window around the first point on each node
+      const std::array<double, 3> min{dd_position_array[0][0] - 0.5, 0.0, 0.0};
+      const std::array<double, 3> max{dd_position_array[0][0] + 0.5, 0.0, 0.0};
+      const std::array<size_t, 3> bin_sizes{5, 0, 0};
+      const bool normalize = false;
+      const bool bias = true;
+      const std::string map_type = "ave";
+      std::vector<double> window_data(5, 0.0);
+      qindex.map_data_to_grid_window(dd_data, ghost_data, window_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+      std::vector<std::vector<double>> window_3x_data(3, std::vector<double>(5, 0.0));
+      qindex.map_data_to_grid_window(dd_3x_data, ghost_3x_data, window_3x_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+      std::vector<double> gold_window_data;
+      std::vector<std::vector<double>> gold_window_3x_data(3);
+      if (rtt_c4::node() == 0) {
+        gold_window_data = {0.0, 0.0, 3.0, 0.0, 8.0};
+        gold_window_3x_data[0] = {0.0, 0.0, 3.0, 0.0, 8.0};
+        gold_window_3x_data[1] = {0.0, 0.0, 4.0, 0.0, 9.0};
+        gold_window_3x_data[2] = {0.0, 0.0, 8.0 - 3.0, 0.0, 8.0 - 8.0};
+      } else if (rtt_c4::node() == 1) {
+        gold_window_data = {10.0, 0.0, 6.0, 0.0, 11.0};
+        gold_window_3x_data[0] = {10.0, 0.0, 6.0, 0.0, 11.0};
+        gold_window_3x_data[1] = {11.0, 0.0, 7.0, 0.0, 12.0};
+        gold_window_3x_data[2] = {11.0 - 10.0, 0.0, 11.0 - 6.0, 0.0, 11.0 - 11.0};
+      } else {
+        gold_window_data = {4.0, 0.0, 9.0, 0.0, 5.0};
+        gold_window_3x_data[0] = {4.0, 0.0, 9.0, 0.0, 5.0};
+        gold_window_3x_data[1] = {5.0, 0.0, 10.0, 0.0, 6.0};
+        gold_window_3x_data[2] = {9.0 - 4.0, 0.0, 9.0 - 9.0, 0.0, 9.0 - 5.0};
+      }
+
+      for (size_t v = 0; v < 3; v++) {
+        for (size_t i = 0; i < bin_sizes[0]; i++) {
+          if (!rtt_dsxx::soft_equiv(window_3x_data[v][i], gold_window_3x_data[v][i]))
+            ITFAILS;
+        }
+      }
+      for (size_t i = 0; i < bin_sizes[0]; i++)
+        if (!rtt_dsxx::soft_equiv(window_data[i], gold_window_data[i]))
+          ITFAILS;
+    }
+
+    // check bias and normal window mapping (more bins then data) functions
+    {
+      // build a length=1.0 window around the first point on each node
+      const std::array<double, 3> min{dd_position_array[0][0] - 0.5, 0.0, 0.0};
+      const std::array<double, 3> max{dd_position_array[0][0] + 0.5, 0.0, 0.0};
+      const std::array<size_t, 3> bin_sizes{5, 0, 0};
+      const bool normalize = true;
+      const bool bias = true;
+      const std::string map_type = "ave";
+      std::vector<double> window_data(5, 0.0);
+      qindex.map_data_to_grid_window(dd_data, ghost_data, window_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+      std::vector<std::vector<double>> window_3x_data(3, std::vector<double>(5, 0.0));
+      qindex.map_data_to_grid_window(dd_3x_data, ghost_3x_data, window_3x_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+      std::vector<double> gold_window_data;
+      std::vector<std::vector<double>> gold_window_3x_data(3);
+      if (rtt_c4::node() == 0) {
+        gold_window_data = {0.0, 0.0, 3.0 / 11.0, 0.0, 8.0 / 11.0};
+        gold_window_3x_data[0] = {0.0, 0.0, 3.0 / 11.0, 0.0, 8.0 / 11.0};
+        gold_window_3x_data[1] = {0.0, 0.0, 4.0 / 13.0, 0.0, 9.0 / 13.0};
+        gold_window_3x_data[2] = {0.0, 0.0, 1.0, 0.0, 0.0};
+      } else if (rtt_c4::node() == 1) {
+        gold_window_data = {10.0 / 27.0, 0.0, 6.0 / 27.0, 0.0, 11.0 / 27.0};
+        gold_window_3x_data[0] = {10.0 / 27.0, 0.0, 6.0 / 27.0, 0.0, 11.0 / 27.0};
+        gold_window_3x_data[1] = {11.0 / 30.0, 0.0, 7.0 / 30.0, 0.0, 12.0 / 30.0};
+        gold_window_3x_data[2] = {1.0 / 6.0, 0.0, 5.0 / 6.0, 0.0, 0.0};
+      } else {
+        gold_window_data = {4.0 / 18.0, 0.0, 9.0 / 18.0, 0.0, 5.0 / 18.0};
+        gold_window_3x_data[0] = {4.0 / 18.0, 0.0, 9.0 / 18.0, 0.0, 5.0 / 18.0};
+        gold_window_3x_data[1] = {5.0 / 21.0, 0.0, 10.0 / 21.0, 0.0, 6.0 / 21.0};
+        gold_window_3x_data[2] = {5.0 / 9.0, 0.0, 0.0, 0.0, 4.0 / 9.0};
+      }
+
+      for (size_t v = 0; v < 3; v++) {
+        for (size_t i = 0; i < bin_sizes[0]; i++) {
+          if (!rtt_dsxx::soft_equiv(window_3x_data[v][i], gold_window_3x_data[v][i]))
+            ITFAILS;
+        }
+      }
+      for (size_t i = 0; i < bin_sizes[0]; i++)
+        if (!rtt_dsxx::soft_equiv(window_data[i], gold_window_data[i]))
+          ITFAILS;
+    }
+
+    // check max mapping (fewer bins than data) functions
+    {
+      // put two particles in the topmost bin
+      // build a length=1.0 window around the first point on each node
+      const std::array<double, 3> min{dd_position_array[0][0] - 0.5, 0.0, 0.0};
+      const std::array<double, 3> max{dd_position_array[0][0] + 0.5, 0.0, 0.0};
+      const std::array<size_t, 3> bin_sizes{2, 0, 0};
+      const bool normalize = false;
+      const bool bias = false;
+      const std::string map_type = "max";
+      std::vector<double> window_data(2, 0.0);
+      qindex.map_data_to_grid_window(dd_data, ghost_data, window_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+      std::vector<std::vector<double>> window_3x_data(3, std::vector<double>(2, 0.0));
+      qindex.map_data_to_grid_window(dd_3x_data, ghost_3x_data, window_3x_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+      std::vector<double> gold_window_data;
+      std::vector<std::vector<double>> gold_window_3x_data(3);
+      if (rtt_c4::node() == 0) {
+        gold_window_data = {0.0, 8.0};
+        gold_window_3x_data[0] = {0.0, 8.0};
+        gold_window_3x_data[1] = {0.0, 9.0};
+        gold_window_3x_data[2] = {0.0, -3.0};
+      } else if (rtt_c4::node() == 1) {
+        gold_window_data = {10.0, 11.0};
+        gold_window_3x_data[0] = {10.0, 11.0};
+        gold_window_3x_data[1] = {11.0, 12.0};
+        gold_window_3x_data[2] = {-10.0, -6.0};
+      } else {
+        gold_window_data = {4.0, 9.0};
+        gold_window_3x_data[0] = {4.0, 9.0};
+        gold_window_3x_data[1] = {5.0, 10.0};
+        gold_window_3x_data[2] = {-4.0, -5.0};
+      }
+
+      for (size_t v = 0; v < 3; v++) {
+        for (size_t i = 0; i < bin_sizes[0]; i++) {
+          if (!rtt_dsxx::soft_equiv(window_3x_data[v][i], gold_window_3x_data[v][i]))
+            ITFAILS;
+        }
+      }
+      for (size_t i = 0; i < bin_sizes[0]; i++)
+        if (!rtt_dsxx::soft_equiv(window_data[i], gold_window_data[i]))
+          ITFAILS;
+    }
+
+    // check min mapping (fewer bins than data) functions
+    {
+      // put two particles in the topmost bin
+      // build a length=1.0 window around the first point on each node
+      const std::array<double, 3> min{dd_position_array[0][0] - 0.5, 0.0, 0.0};
+      const std::array<double, 3> max{dd_position_array[0][0] + 0.5, 0.0, 0.0};
+      const std::array<size_t, 3> bin_sizes{2, 0, 0};
+      const bool normalize = false;
+      const bool bias = false;
+      const std::string map_type = "min";
+      std::vector<double> window_data(2, 0.0);
+      qindex.map_data_to_grid_window(dd_data, ghost_data, window_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+      std::vector<std::vector<double>> window_3x_data(3, std::vector<double>(2, 0.0));
+      qindex.map_data_to_grid_window(dd_3x_data, ghost_3x_data, window_3x_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+      std::vector<double> gold_window_data;
+      std::vector<std::vector<double>> gold_window_3x_data(3);
+      if (rtt_c4::node() == 0) {
+        gold_window_data = {0.0, 3.0};
+        gold_window_3x_data[0] = {0.0, 3.0};
+        gold_window_3x_data[1] = {0.0, 4.0};
+        gold_window_3x_data[2] = {0.0, -8.0};
+      } else if (rtt_c4::node() == 1) {
+        gold_window_data = {10.0, 6.0};
+        gold_window_3x_data[0] = {10.0, 6.0};
+        gold_window_3x_data[1] = {11.0, 7.0};
+        gold_window_3x_data[2] = {-10.0, -11.0};
+      } else {
+        gold_window_data = {4.0, 5.0};
+        gold_window_3x_data[0] = {4.0, 5.0};
+        gold_window_3x_data[1] = {5.0, 6.0};
+        gold_window_3x_data[2] = {-4.0, -9.0};
+      }
+
+      for (size_t v = 0; v < 3; v++) {
+        for (size_t i = 0; i < bin_sizes[0]; i++) {
+          if (!rtt_dsxx::soft_equiv(window_3x_data[v][i], gold_window_3x_data[v][i]))
+            ITFAILS;
+        }
+      }
+      for (size_t i = 0; i < bin_sizes[0]; i++)
+        if (!rtt_dsxx::soft_equiv(window_data[i], gold_window_data[i]))
+          ITFAILS;
+    }
+
+    // check ave mapping (fewer bins than data) functions
+    {
+      // put two particles in the topmost bin
+      // build a length=1.0 window around the first point on each node
+      const std::array<double, 3> min{dd_position_array[0][0] - 0.5, 0.0, 0.0};
+      const std::array<double, 3> max{dd_position_array[0][0] + 0.5, 0.0, 0.0};
+      const std::array<size_t, 3> bin_sizes{2, 0, 0};
+      const bool normalize = false;
+      const bool bias = false;
+      const std::string map_type = "ave";
+      // use the negative data for the single array operations this time
+      std::vector<double> window_data(2, 0.0);
+      qindex.map_data_to_grid_window(dd_3x_data[2], ghost_3x_data[2], window_data, min, max,
+                                     bin_sizes, map_type, normalize, bias);
+      std::vector<std::vector<double>> window_3x_data(3, std::vector<double>(2, 0.0));
+      qindex.map_data_to_grid_window(dd_3x_data, ghost_3x_data, window_3x_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+
+      std::vector<double> gold_window_data;
+      std::vector<std::vector<double>> gold_window_3x_data(3);
+      if (rtt_c4::node() == 0) {
+        gold_window_data = {0.0, -5.5};
+        gold_window_3x_data[0] = {0.0, 5.5};
+        gold_window_3x_data[1] = {0.0, 6.5};
+        gold_window_3x_data[2] = {0.0, -5.5};
+      } else if (rtt_c4::node() == 1) {
+        gold_window_data = {-10.0, -8.5};
+        gold_window_3x_data[0] = {10.0, 8.5};
+        gold_window_3x_data[1] = {11.0, 9.5};
+        gold_window_3x_data[2] = {-10.0, -8.5};
+      } else {
+        gold_window_data = {-4.0, -7.0};
+        gold_window_3x_data[0] = {4.0, 7.0};
+        gold_window_3x_data[1] = {5.0, 8.0};
+        gold_window_3x_data[2] = {-4.0, -7.0};
+      }
+
+      for (size_t v = 0; v < 3; v++) {
+        for (size_t i = 0; i < bin_sizes[0]; i++) {
+          if (!rtt_dsxx::soft_equiv(window_3x_data[v][i], gold_window_3x_data[v][i]))
+            ITFAILS;
+        }
+      }
+      for (size_t i = 0; i < bin_sizes[0]; i++)
+        if (!rtt_dsxx::soft_equiv(window_data[i], gold_window_data[i]))
+          ITFAILS;
+    }
+
+    // check nearest mapping (fewer bins than data) functions
+    // this is the same as average because of the simple spacing
+    {
+      // put two particles in the topmost bin
+      // build a length=1.0 window around the first point on each node
+      const std::array<double, 3> min{dd_position_array[0][0] - 0.5, 0.0, 0.0};
+      const std::array<double, 3> max{dd_position_array[0][0] + 0.5, 0.0, 0.0};
+      const std::array<size_t, 3> bin_sizes{2, 0, 0};
+      const bool normalize = false;
+      const bool bias = false;
+      const std::string map_type = "nearest";
+      // use the negative data for the single array operations this time
+      std::vector<double> window_data(2, 0.0);
+      qindex.map_data_to_grid_window(dd_3x_data[2], ghost_3x_data[2], window_data, min, max,
+                                     bin_sizes, map_type, normalize, bias);
+      std::vector<std::vector<double>> window_3x_data(3, std::vector<double>(2, 0.0));
+      qindex.map_data_to_grid_window(dd_3x_data, ghost_3x_data, window_3x_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+
+      std::vector<double> gold_window_data;
+      std::vector<std::vector<double>> gold_window_3x_data(3);
+      if (rtt_c4::node() == 0) {
+        gold_window_data = {0.0, -5.5};
+        gold_window_3x_data[0] = {0.0, 5.5};
+        gold_window_3x_data[1] = {0.0, 6.5};
+        gold_window_3x_data[2] = {0.0, -5.5};
+      } else if (rtt_c4::node() == 1) {
+        gold_window_data = {-10.0, -8.5};
+        gold_window_3x_data[0] = {10.0, 8.5};
+        gold_window_3x_data[1] = {11.0, 9.5};
+        gold_window_3x_data[2] = {-10.0, -8.5};
+      } else {
+        gold_window_data = {-4.0, -7.0};
+        gold_window_3x_data[0] = {4.0, 7.0};
+        gold_window_3x_data[1] = {5.0, 8.0};
+        gold_window_3x_data[2] = {-4.0, -7.0};
+      }
+
+      for (size_t v = 0; v < 3; v++) {
+        for (size_t i = 0; i < bin_sizes[0]; i++) {
+          if (!rtt_dsxx::soft_equiv(window_3x_data[v][i], gold_window_3x_data[v][i]))
+            ITFAILS;
+        }
+      }
+      for (size_t i = 0; i < bin_sizes[0]; i++)
+        if (!rtt_dsxx::soft_equiv(window_data[i], gold_window_data[i]))
+          ITFAILS;
+    }
+
+    // check nearest mapping (fewer bins than data) functions
+    // not the same as average because the window is on the center point
+    {
+      // put two particles in the topmost bin
+      // build a length=1.0 window around the first point on each node
+      const std::array<double, 3> min{dd_position_array[0][0] - 0.5, 0.0, 0.0};
+      const std::array<double, 3> max{dd_position_array[0][0] + 0.5, 0.0, 0.0};
+      const std::array<size_t, 3> bin_sizes{1, 0, 0};
+      const bool normalize = false;
+      const bool bias = false;
+      const std::string map_type = "nearest";
+      // use the negative data for the single array operations this time
+      std::vector<double> window_data(1, 0.0);
+      qindex.map_data_to_grid_window(dd_3x_data[2], ghost_3x_data[2], window_data, min, max,
+                                     bin_sizes, map_type, normalize, bias);
+      std::vector<std::vector<double>> window_3x_data(3, std::vector<double>(1, 0.0));
+      qindex.map_data_to_grid_window(dd_3x_data, ghost_3x_data, window_3x_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+      std::vector<double> gold_window_data;
+      std::vector<std::vector<double>> gold_window_3x_data(3);
+      if (rtt_c4::node() == 0) {
+        gold_window_data = {-3.0};
+        gold_window_3x_data[0] = {3.0};
+        gold_window_3x_data[1] = {4.0};
+        gold_window_3x_data[2] = {-3.0};
+      } else if (rtt_c4::node() == 1) {
+        gold_window_data = {-6};
+        gold_window_3x_data[0] = {6};
+        gold_window_3x_data[1] = {7};
+        gold_window_3x_data[2] = {-6};
+      } else {
+        gold_window_data = {-9.0};
+        gold_window_3x_data[0] = {9.0};
+        gold_window_3x_data[1] = {10.0};
+        gold_window_3x_data[2] = {-9.0};
+      }
+
+      for (size_t v = 0; v < 3; v++) {
+        for (size_t i = 0; i < bin_sizes[0]; i++) {
+          if (!rtt_dsxx::soft_equiv(window_3x_data[v][i], gold_window_3x_data[v][i]))
+            ITFAILS;
+        }
+      }
+      for (size_t i = 0; i < bin_sizes[0]; i++)
+        if (!rtt_dsxx::soft_equiv(window_data[i], gold_window_data[i]))
+          ITFAILS;
+    }
+
+    // check biased and normalized window mapping (fewer bins than data) functions
+    {
+      // build a length=1.0 window around the first point on each node
+      const std::array<double, 3> min{dd_position_array[0][0] - 0.5, 0.0, 0.0};
+      const std::array<double, 3> max{dd_position_array[0][0] + 0.5, 0.0, 0.0};
+      const std::array<size_t, 3> bin_sizes{2, 0, 0};
+      const bool normalize = true;
+      const bool bias = true;
+      const std::string map_type = "max";
+      std::vector<double> window_data(2, 0.0);
+      qindex.map_data_to_grid_window(dd_data, ghost_data, window_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+      std::vector<std::vector<double>> window_3x_data(3, std::vector<double>(2, 0.0));
+      qindex.map_data_to_grid_window(dd_3x_data, ghost_3x_data, window_3x_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+
+      std::vector<double> gold_window_data;
+      std::vector<std::vector<double>> gold_window_3x_data(3);
+      if (rtt_c4::node() == 0) {
+        gold_window_data = {0.0, 1.0};
+        gold_window_3x_data[0] = {0.0, 1.0};
+        gold_window_3x_data[1] = {0.0, 1.0};
+        // this one is a bit tricky: the bias causes the negative value to go to
+        // zero, so there is nothing to rescale to one.
+        gold_window_3x_data[2] = {0.0, 0.0};
+      } else if (rtt_c4::node() == 1) {
+        gold_window_data = {10.0 / 21.0, 11.0 / 21.0};
+        gold_window_3x_data[0] = {10.0 / 21.0, 11.0 / 21.0};
+        gold_window_3x_data[1] = {11.0 / 23.0, 12.0 / 23.0};
+        gold_window_3x_data[2] = {0.0, 1.0};
+      } else {
+        gold_window_data = {4.0 / 13.0, 9.0 / 13.0};
+        gold_window_3x_data[0] = {4.0 / 13.0, 9.0 / 13.0};
+        gold_window_3x_data[1] = {5.0 / 15.0, 10.0 / 15.0};
+        gold_window_3x_data[2] = {1.0, 0.0};
+      }
+
+      for (size_t v = 0; v < 3; v++) {
+        for (size_t i = 0; i < bin_sizes[0]; i++) {
+          if (!rtt_dsxx::soft_equiv(window_3x_data[v][i], gold_window_3x_data[v][i]))
+            ITFAILS;
+        }
+      }
+      for (size_t i = 0; i < bin_sizes[0]; i++)
+        if (!rtt_dsxx::soft_equiv(window_data[i], gold_window_data[i]))
+          ITFAILS;
+    }
+  }
+
+  //2d
+  {
+    std::vector<double> data{3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0};
+    std::vector<std::array<double, 3>> position_array(10, std::array<double, 3>{0.0, 0.0, 0.0});
+    // This cell spatial ordering is difficult for this setup in that every
+    // rank requires a subset of information from every other rank
+    for (int i = 0; i < 10; i++) {
+      position_array[i][0] = i < 5 ? i % 5 : i % 5 + 0.5;
+      position_array[i][1] = i < 5 ? 0.5 : -0.5;
+    }
+
+    // map to dd arrays with simple stride
+    std::vector<double> dd_data(local_size, 0.0);
+    std::vector<std::vector<double>> dd_3x_data(3, std::vector<double>(local_size, 0.0));
+    std::vector<std::array<double, 3>> dd_position_array(local_size,
+                                                         std::array<double, 3>{0.0, 0.0, 0.0});
+
+    for (int i = 0; i < local_size; i++) {
+      dd_data[i] = data[i + rtt_c4::node() * 3];
+      dd_3x_data[0][i] = data[i + rtt_c4::node() * 3];
+      dd_3x_data[1][i] = data[i + rtt_c4::node() * 3] + 1;
+      dd_3x_data[2][i] = -data[i + rtt_c4::node() * 3];
+      dd_position_array[i] = position_array[i + rtt_c4::node() * 3];
+    }
+
+    // in dd mode the max window size determines the number of ghost cells
+    const double max_window_size = 1.0;
+    const size_t bins_per_dim = 10UL;
+    const bool dd = true;
+    const size_t dim = 2;
+    quick_index qindex(dim, dd_position_array, max_window_size, bins_per_dim, dd);
+
+    // Check the local state data
+    if (!qindex.domain_decomposed)
+      ITFAILS;
+    if (qindex.coarse_bin_resolution != bins_per_dim)
+      ITFAILS;
+    if (!soft_equiv(qindex.max_window_size, max_window_size))
+      ITFAILS;
+    if (!soft_equiv(qindex.bounding_box_min[0], 0.0))
+      ITFAILS;
+    if (!soft_equiv(qindex.bounding_box_min[1], -0.5))
+      ITFAILS;
+    if (!soft_equiv(qindex.bounding_box_min[2], 0.0))
+      ITFAILS;
+    if (!soft_equiv(qindex.bounding_box_max[0], 4.5))
+      ITFAILS;
+    if (!soft_equiv(qindex.bounding_box_max[1], 0.5))
+      ITFAILS;
+    if (!soft_equiv(qindex.bounding_box_max[2], 0.0))
+      ITFAILS;
+    // Check local coarse_index map
+    // local indexing will not match the domain replicated case (different
+    // number of points per rank so different local indexing)
+    std::map<size_t, std::vector<size_t>> gold_map;
+    if (rtt_c4::node() == 0) {
+      gold_map[90] = {0}; // 0.0 0.5
+      gold_map[92] = {1}; // 1.0 0.5
+      gold_map[94] = {2}; // 2.0 0.5
+    } else if (rtt_c4::node() == 1) {
+      gold_map[96] = {0}; // 3.0 0.5
+      gold_map[98] = {1}; // 4.0 0.5
+      gold_map[1] = {2};  // 0.5 -0.5
+    } else {
+      gold_map[3] = {0}; // 1.5 -0.5
+      gold_map[5] = {1}; // 2.5 -0.5
+      gold_map[7] = {2}; // 3.5 -0.5
+      gold_map[9] = {3}; // 4.5 -0.5
+    }
+    if (gold_map.size() != qindex.coarse_index_map.size())
+      ITFAILS;
+    // whole-vector compare so a missing key or a length mismatch fails instead of reading OOB
+    for (auto &map : qindex.coarse_index_map)
+      if (gold_map[map.first] != map.second)
+        ITFAILS;
+
+    // Check Domain Decomposed Data
+    // local bounding box extends beyond local data based on the window size
+    if (rtt_c4::node() == 0) {
+      if (!soft_equiv(qindex.local_bounding_box_min[0], 0.0))
+        ITFAILS;
+      if (!soft_equiv(qindex.local_bounding_box_min[1], 0.0))
+        ITFAILS;
+      if (!soft_equiv(qindex.local_bounding_box_min[2], 0.0))
+        ITFAILS;
+      if (!soft_equiv(qindex.local_bounding_box_max[0], 2.5))
+        ITFAILS;
+      if (!soft_equiv(qindex.local_bounding_box_max[1], 0.5))
+        ITFAILS;
+      if (!soft_equiv(qindex.local_bounding_box_max[2], 0.0))
+        ITFAILS;
+    } else if (rtt_c4::node() == 1) {
+      if (!soft_equiv(qindex.local_bounding_box_min[0], 0.0))
+        ITFAILS;
+      if (!soft_equiv(qindex.local_bounding_box_min[1], -0.5))
+        ITFAILS;
+      if (!soft_equiv(qindex.local_bounding_box_min[2], 0.0))
+        ITFAILS;
+      if (!soft_equiv(qindex.local_bounding_box_max[0], 4.5))
+        ITFAILS;
+      if (!soft_equiv(qindex.local_bounding_box_max[1], 0.5))
+        ITFAILS;
+      if (!soft_equiv(qindex.local_bounding_box_max[2], 0.0))
+        ITFAILS;
+    } else {
+      if (!soft_equiv(qindex.local_bounding_box_min[0], 1.0))
+        ITFAILS;
+      if (!soft_equiv(qindex.local_bounding_box_min[1], -0.5))
+        ITFAILS;
+      if (!soft_equiv(qindex.local_bounding_box_min[2], 0.0))
+        ITFAILS;
+      if (!soft_equiv(qindex.local_bounding_box_max[0], 4.5))
+        ITFAILS;
+      if (!soft_equiv(qindex.local_bounding_box_max[1], 0.0))
+        ITFAILS;
+      if (!soft_equiv(qindex.local_bounding_box_max[2], 0.0))
+        ITFAILS;
+    }
+    // global bins that span the local domains
+    std::vector<size_t> gold_bins;
+    if (rtt_c4::node() == 0) {
+      gold_bins = {50, 51, 52, 53, 54, 55, 60, 61, 62, 63, 64, 65, 70, 71, 72,
+                   73, 74, 75, 80, 81, 82, 83, 84, 85, 90, 91, 92, 93, 94, 95};
+    } else if (rtt_c4::node() == 1) {
+      gold_bins = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+                   20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+                   40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+                   60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+                   80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99};
+    } else {
+      gold_bins = {2,  3,  4,  5,  6,  7,  8,  9,  12, 13, 14, 15, 16, 17, 18, 19,
+                   22, 23, 24, 25, 26, 27, 28, 29, 32, 33, 34, 35, 36, 37, 38, 39,
+                   42, 43, 44, 45, 46, 47, 48, 49, 52, 53, 54, 55, 56, 57, 58, 59};
+    }
+
+    if (gold_bins.size() != qindex.local_bins.size())
+      ITFAILS;
+    for (size_t i = 0; i < qindex.local_bins.size(); i++)
+      if (gold_bins[i] != qindex.local_bins[i])
+        ITFAILS;
+
+    // local ghost index map (how to find general location of the ghost data)
+    std::map<size_t, std::vector<size_t>> gold_ghost_index_map;
+    if (rtt_c4::node() == 1) {
+      gold_ghost_index_map[90] = {0}; // 0.0, 0.5 from rank 0
+      gold_ghost_index_map[92] = {1}; // 1.0, 0.5 from rank 0
+      gold_ghost_index_map[94] = {2}; // 2.0, 0.5 from rank 0
+      gold_ghost_index_map[3] = {3};  // 1.5, -0.5 from rank 2
+      gold_ghost_index_map[5] = {4};  // 2.5, -0.5 from rank 2
+      gold_ghost_index_map[7] = {5};  // 3.5, -0.5 from rank 2
+      gold_ghost_index_map[9] = {6};  // 4.5, -0.5 from rank 2
+    }
+
+    if (gold_ghost_index_map.size() != qindex.local_ghost_index_map.size())
+      ITFAILS;
+    for (auto &map : qindex.local_ghost_index_map) {
+      if (gold_ghost_index_map[map.first].size() != map.second.size())
+        ITFAILS;
+      for (size_t i = 0; i < map.second.size(); i++) {
+        if (map.second[i] != gold_ghost_index_map[map.first][i])
+          ITFAILS;
+      }
+    }
+
+    // Check the local ghost locations (this tangentially checks the private
+    // put_window_map which is used to build this local data).
+    std::vector<std::array<double, 3>> gold_ghost_locations;
+    if (rtt_c4::node() == 1) {
+      gold_ghost_locations = {{0.0, 0.5, 0.0},  {1.0, 0.5, 0.0},  {2.0, 0.5, 0.0}, {1.5, -0.5, 0.0},
+                              {2.5, -0.5, 0.0}, {3.5, -0.5, 0.0}, {4.5, -0.5, 0.0}};
+    }
+    if (gold_ghost_locations.size() != qindex.local_ghost_locations.size())
+      ITFAILS;
+    for (size_t i = 0; i < qindex.local_ghost_locations.size(); i++) {
+      for (size_t d = 0; d < 3; d++)
+        if (!rtt_dsxx::soft_equiv(gold_ghost_locations[i][d], qindex.local_ghost_locations[i][d]))
+          ITFAILS;
+    }
+
+    // Check collect_ghost_data vector call
+    std::vector<double> ghost_data(qindex.local_ghost_buffer_size, 0.0);
+    qindex.collect_ghost_data(dd_data, ghost_data);
+    std::vector<std::vector<double>> ghost_3x_data(
+        3, std::vector<double>(qindex.local_ghost_buffer_size, 0.0));
+    qindex.collect_ghost_data(dd_3x_data, ghost_3x_data);
+
+    std::vector<double> gold_ghost_data;
+    std::vector<std::vector<double>> gold_3x_ghost_data(3);
+    if (rtt_c4::node() == 1) {
+      gold_ghost_data = {3.0, 4.0, 5.0, 9.0, 10.0, 11.0, 12.0};
+      gold_3x_ghost_data[0] = {3.0, 4.0, 5.0, 9.0, 10.0, 11.0, 12.0};
+      gold_3x_ghost_data[1] = {4.0, 5.0, 6.0, 10.0, 11.0, 12.0, 13.0};
+      gold_3x_ghost_data[2] = {-3.0, -4.0, -5.0, -9.0, -10.0, -11.0, -12.0};
+    }
+
+    for (size_t i = 0; i < ghost_data.size(); i++)
+      if (!rtt_dsxx::soft_equiv(ghost_data[i], gold_ghost_data[i]))
+        ITFAILS;
+    for (size_t i = 0; i < ghost_3x_data[0].size(); i++)
+      if (!rtt_dsxx::soft_equiv(ghost_3x_data[0][i], gold_3x_ghost_data[0][i]))
+        ITFAILS;
+    for (size_t i = 0; i < ghost_3x_data[1].size(); i++)
+      if (!rtt_dsxx::soft_equiv(ghost_3x_data[1][i], gold_3x_ghost_data[1][i]))
+        ITFAILS;
+    for (size_t i = 0; i < ghost_3x_data[2].size(); i++)
+      if (!rtt_dsxx::soft_equiv(ghost_3x_data[2][i], gold_3x_ghost_data[2][i]))
+        ITFAILS;
+
+    // check max window mapping (more bins than data) functions
+    {
+      // build a length=1.0 window around the first point on each node
+      const std::array<double, 3> min{dd_position_array[0][0] - 0.5, dd_position_array[0][1] - 0.5,
+                                      0.0};
+      const std::array<double, 3> max{dd_position_array[0][0] + 0.5, dd_position_array[0][1] + 0.5,
+                                      0.0};
+      const std::array<size_t, 3> bin_sizes{5, 1, 0};
+      const bool normalize = false;
+      const bool bias = false;
+      const std::string map_type = "max";
+      std::vector<double> window_data(5, 0.0);
+      qindex.map_data_to_grid_window(dd_data, ghost_data, window_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+      std::vector<std::vector<double>> window_3x_data(3, std::vector<double>(5, 0.0));
+      qindex.map_data_to_grid_window(dd_3x_data, ghost_3x_data, window_3x_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+      std::vector<double> gold_window_data;
+      std::vector<std::vector<double>> gold_window_3x_data(3);
+      // different result than 1D because of the 1.0 y offset of the data
+      if (rtt_c4::node() == 0) {
+        gold_window_data = {0.0, 0.0, 3.0, 0.0, 0.0};
+        gold_window_3x_data[0] = {0.0, 0.0, 3.0, 0.0, 0.0};
+        gold_window_3x_data[1] = {0.0, 0.0, 4.0, 0.0, 0.0};
+        gold_window_3x_data[2] = {0.0, 0.0, -3.0, 0.0, 0.0};
+      } else if (rtt_c4::node() == 1) {
+        gold_window_data = {0.0, 0.0, 6.0, 0.0, 0.0};
+        gold_window_3x_data[0] = {0.0, 0.0, 6.0, 0.0, 0.0};
+        gold_window_3x_data[1] = {0.0, 0.0, 7.0, 0.0, 0.0};
+        gold_window_3x_data[2] = {0.0, 0.0, -6.0, 0.0, 0.0};
+      } else {
+        gold_window_data = {0.0, 0.0, 9.0, 0.0, 0.0};
+        gold_window_3x_data[0] = {0.0, 0.0, 9.0, 0.0, 0.0};
+        gold_window_3x_data[1] = {0.0, 0.0, 10.0, 0.0, 0.0};
+        gold_window_3x_data[2] = {0.0, 0.0, -9.0, 0.0, 0.0};
+      }
+
+      for (size_t i = 0; i < bin_sizes[0]; i++)
+        if (!rtt_dsxx::soft_equiv(window_data[i], gold_window_data[i]))
+          ITFAILS;
+      for (size_t v = 0; v < 3; v++)
+        for (size_t i = 0; i < bin_sizes[0]; i++)
+          if (!rtt_dsxx::soft_equiv(window_3x_data[v][i], gold_window_3x_data[v][i]))
+            ITFAILS;
+    }
+
+    // check max sphere r window mapping (more bins than data) functions
+    {
+      // build a length=1.0 window around the first point on each node
+      const std::array<double, 3> center{dd_position_array[0][0], dd_position_array[0][1], 0.0};
+      // wedge size from dr_dtheta: dr = 0.4 and dtheta = 0.0174533 rad (about 1 degree)
+      const std::array<double, 3> dr_dtheta{0.4, 0.0174533, 0.0};
+      const std::array<size_t, 3> bin_sizes{5, 1, 0};
+      const bool normalize = false;
+      const bool bias = false;
+      const std::string map_type = "max";
+      const std::array<double, 3> sphere_center{-1.0, 0.0, 0.0};
+      std::vector<double> sphere_window_data(5, 0.0);
+      qindex.map_data_to_sphere_grid_window(dd_data, ghost_data, sphere_window_data, sphere_center,
+                                            center, dr_dtheta, bin_sizes, map_type, normalize,
+                                            bias);
+      std::vector<std::vector<double>> sphere_window_3x_data(3, std::vector<double>(5, 0.0));
+      qindex.map_data_to_sphere_grid_window(dd_3x_data, ghost_3x_data, sphere_window_3x_data,
+                                            sphere_center, center, dr_dtheta, bin_sizes, map_type,
+                                            normalize, bias);
+      std::vector<double> gold_window_data;
+      std::vector<std::vector<double>> gold_window_3x_data(3);
+      // different result than 1D because of the 1.0 y offset of the data
+      if (rtt_c4::node() == 0) {
+        gold_window_data = {0.0, 0.0, 3.0, 0.0, 0.0};
+        gold_window_3x_data[0] = {0.0, 0.0, 3.0, 0.0, 0.0};
+        gold_window_3x_data[1] = {0.0, 0.0, 4.0, 0.0, 0.0};
+        gold_window_3x_data[2] = {0.0, 0.0, -3.0, 0.0, 0.0};
+      } else if (rtt_c4::node() == 1) {
+        gold_window_data = {0.0, 0.0, 6.0, 0.0, 0.0};
+        gold_window_3x_data[0] = {0.0, 0.0, 6.0, 0.0, 0.0};
+        gold_window_3x_data[1] = {0.0, 0.0, 7.0, 0.0, 0.0};
+        gold_window_3x_data[2] = {0.0, 0.0, -6.0, 0.0, 0.0};
+      } else {
+        gold_window_data = {0.0, 0.0, 9.0, 0.0, 0.0};
+        gold_window_3x_data[0] = {0.0, 0.0, 9.0, 0.0, 0.0};
+        gold_window_3x_data[1] = {0.0, 0.0, 10.0, 0.0, 0.0};
+        gold_window_3x_data[2] = {0.0, 0.0, -9.0, 0.0, 0.0};
+      }
+
+      for (size_t i = 0; i < bin_sizes[0]; i++)
+        if (!rtt_dsxx::soft_equiv(sphere_window_data[i], gold_window_data[i]))
+          ITFAILS;
+      for (size_t v = 0; v < 3; v++)
+        for (size_t i = 0; i < bin_sizes[0]; i++)
+          if (!rtt_dsxx::soft_equiv(sphere_window_3x_data[v][i], gold_window_3x_data[v][i]))
+            ITFAILS;
+    }
+
+    // check max sphere r window mapping (more bins than data) functions with bias
+    {
+      // build a length=1.0 window around the first point on each node
+      const std::array<double, 3> center{dd_position_array[0][0], dd_position_array[0][1], 0.0};
+      // wedge size from dr_dtheta: dr = 0.4 and dtheta = 0.0174533 rad (about 1 degree)
+      const std::array<double, 3> dr_dtheta{0.4, 0.0174533, 0.0};
+      const std::array<size_t, 3> bin_sizes{5, 1, 0};
+      const bool normalize = false;
+      const bool bias = true;
+      const std::string map_type = "max";
+      const std::array<double, 3> sphere_center{-1.0, 0.0, 0.0};
+      std::vector<double> sphere_window_data(5, 0.0);
+      qindex.map_data_to_sphere_grid_window(dd_data, ghost_data, sphere_window_data, sphere_center,
+                                            center, dr_dtheta, bin_sizes, map_type, normalize,
+                                            bias);
+      std::vector<std::vector<double>> sphere_window_3x_data(3, std::vector<double>(5, 0.0));
+      qindex.map_data_to_sphere_grid_window(dd_3x_data, ghost_3x_data, sphere_window_3x_data,
+                                            sphere_center, center, dr_dtheta, bin_sizes, map_type,
+                                            normalize, bias);
+      std::vector<double> gold_window_data;
+      std::vector<std::vector<double>> gold_window_3x_data(3);
+      // different result than 1D because of the 1.0 y offset of the data
+      if (rtt_c4::node() == 0) {
+        gold_window_data = {0.0, 0.0, 3.0, 0.0, 0.0};
+        gold_window_3x_data[0] = {0.0, 0.0, 3.0, 0.0, 0.0};
+        gold_window_3x_data[1] = {0.0, 0.0, 4.0, 0.0, 0.0};
+        gold_window_3x_data[2] = {0.0, 0.0, 0.0, 0.0, 0.0};
+      } else if (rtt_c4::node() == 1) {
+        gold_window_data = {0.0, 0.0, 6.0, 0.0, 0.0};
+        gold_window_3x_data[0] = {0.0, 0.0, 6.0, 0.0, 0.0};
+        gold_window_3x_data[1] = {0.0, 0.0, 7.0, 0.0, 0.0};
+        gold_window_3x_data[2] = {0.0, 0.0, 0.0, 0.0, 0.0};
+      } else {
+        gold_window_data = {0.0, 0.0, 9.0, 0.0, 0.0};
+        gold_window_3x_data[0] = {0.0, 0.0, 9.0, 0.0, 0.0};
+        gold_window_3x_data[1] = {0.0, 0.0, 10.0, 0.0, 0.0};
+        gold_window_3x_data[2] = {0.0, 0.0, 0.0, 0.0, 0.0};
+      }
+
+      for (size_t i = 0; i < bin_sizes[0]; i++)
+        if (!rtt_dsxx::soft_equiv(sphere_window_data[i], gold_window_data[i]))
+          ITFAILS;
+      for (size_t v = 0; v < 3; v++)
+        for (size_t i = 0; i < bin_sizes[0]; i++)
+          if (!rtt_dsxx::soft_equiv(sphere_window_3x_data[v][i], gold_window_3x_data[v][i]))
+            ITFAILS;
+    }
+
+    // check max sphere r window mapping (more bins than data) normalized
+    {
+      // build a length=1.0 window around the first point on each node
+      const std::array<double, 3> center{dd_position_array[0][0], dd_position_array[0][1], 0.0};
+      // wedge location +- 1 degree theta and 0.5 radius
+      const std::array<double, 3> dr_dtheta{0.4, 0.0174533, 0.0};
+      const std::array<size_t, 3> bin_sizes{5, 1, 0};
+      const bool normalize = true;
+      const bool bias = false;
+      const std::string map_type = "max";
+      const std::array<double, 3> sphere_center{-1.0, 0.0, 0.0};
+      std::vector<double> sphere_window_data(5, 0.0);
+      qindex.map_data_to_sphere_grid_window(dd_data, ghost_data, sphere_window_data, sphere_center,
+                                            center, dr_dtheta, bin_sizes, map_type, normalize,
+                                            bias);
+      std::vector<std::vector<double>> sphere_window_3x_data(3, std::vector<double>(5, 0.0));
+      qindex.map_data_to_sphere_grid_window(dd_3x_data, ghost_3x_data, sphere_window_3x_data,
+                                            sphere_center, center, dr_dtheta, bin_sizes, map_type,
+                                            normalize, bias);
+      std::vector<double> gold_window_data;
+      std::vector<std::vector<double>> gold_window_3x_data(3);
+      // different result than 1D because of the 1.0 y offset of the data
+      if (rtt_c4::node() == 0) {
+        gold_window_data = {0.0, 0.0, 1.0, 0.0, 0.0};
+        gold_window_3x_data[0] = {0.0, 0.0, 1.0, 0.0, 0.0};
+        gold_window_3x_data[1] = {0.0, 0.0, 1.0, 0.0, 0.0};
+        gold_window_3x_data[2] = {0.0, 0.0, 1.0, 0.0, 0.0};
+      } else if (rtt_c4::node() == 1) {
+        gold_window_data = {0.0, 0.0, 1.0, 0.0, 0.0};
+        gold_window_3x_data[0] = {0.0, 0.0, 1.0, 0.0, 0.0};
+        gold_window_3x_data[1] = {0.0, 0.0, 1.0, 0.0, 0.0};
+        gold_window_3x_data[2] = {0.0, 0.0, 1.0, 0.0, 0.0};
+      } else {
+        gold_window_data = {0.0, 0.0, 1.0, 0.0, 0.0};
+        gold_window_3x_data[0] = {0.0, 0.0, 1.0, 0.0, 0.0};
+        gold_window_3x_data[1] = {0.0, 0.0, 1.0, 0.0, 0.0};
+        gold_window_3x_data[2] = {0.0, 0.0, 1.0, 0.0, 0.0};
+      }
+
+      for (size_t i = 0; i < bin_sizes[0]; i++)
+        if (!rtt_dsxx::soft_equiv(sphere_window_data[i], gold_window_data[i]))
+          ITFAILS;
+      for (size_t v = 0; v < 3; v++)
+        for (size_t i = 0; i < bin_sizes[0]; i++)
+          if (!rtt_dsxx::soft_equiv(sphere_window_3x_data[v][i], gold_window_3x_data[v][i]))
+            ITFAILS;
+    }
+
+    // check min_fill sphere r window mapping (more bins than data) normalized bias fill
+    {
+      // build a length=1.0 window around the first point on each node
+      const std::array<double, 3> center{dd_position_array[0][0], dd_position_array[0][1], 0.0};
+      // wedge location +- 1 degree theta and 0.5 radius
+      const std::array<double, 3> dr_dtheta{0.4, 0.0174533, 0.0};
+      const std::array<size_t, 3> bin_sizes{5, 1, 0};
+      const bool normalize = true;
+      const bool bias = true;
+      const std::string map_type = "min_fill";
+      const std::array<double, 3> sphere_center{-1.0, 0.0, 0.0};
+      std::vector<double> sphere_window_data(5, 0.0);
+      qindex.map_data_to_sphere_grid_window(dd_data, ghost_data, sphere_window_data, sphere_center,
+                                            center, dr_dtheta, bin_sizes, map_type, normalize,
+                                            bias);
+      std::vector<std::vector<double>> sphere_window_3x_data(3, std::vector<double>(5, 0.0));
+      qindex.map_data_to_sphere_grid_window(dd_3x_data, ghost_3x_data, sphere_window_3x_data,
+                                            sphere_center, center, dr_dtheta, bin_sizes, map_type,
+                                            normalize, bias);
+      std::vector<double> gold_window_data;
+      std::vector<std::vector<double>> gold_window_3x_data(3);
+      // different result than 1D because of the 1.0 y offset of the data
+      if (rtt_c4::node() == 0) {
+        gold_window_data = {0.0, 0.0, 1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0};
+        gold_window_3x_data[0] = {0.0, 0.0, 1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0};
+        gold_window_3x_data[1] = {0.0, 0.0, 1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0};
+        gold_window_3x_data[2] = {0.0, 0.0, 0.0, 0.0, 0.0};
+      } else if (rtt_c4::node() == 1) {
+        gold_window_data = {0.0, 0.0, 1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0};
+        gold_window_3x_data[0] = {0.0, 0.0, 1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0};
+        gold_window_3x_data[1] = {0.0, 0.0, 1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0};
+        gold_window_3x_data[2] = {0.0, 0.0, 0.0, 0.0, 0.0};
+      } else {
+        gold_window_data = {0.0, 0.0, 1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0};
+        gold_window_3x_data[0] = {0.0, 0.0, 1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0};
+        gold_window_3x_data[1] = {0.0, 0.0, 1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0};
+        gold_window_3x_data[2] = {0.0, 0.0, 0.0, 0.0, 0.0};
+      }
+
+      for (size_t i = 0; i < bin_sizes[0]; i++)
+        if (!rtt_dsxx::soft_equiv(sphere_window_data[i], gold_window_data[i]))
+          ITFAILS;
+      for (size_t v = 0; v < 3; v++)
+        for (size_t i = 0; i < bin_sizes[0]; i++)
+          if (!rtt_dsxx::soft_equiv(sphere_window_3x_data[v][i], gold_window_3x_data[v][i]))
+            ITFAILS;
+    }
+
+    // check ave sphere window mapping (1x5 bins, more bins than data) functions
+    {
+      // build a length=1.0 window around the first point on each node
+      const std::array<double, 3> center{dd_position_array[0][0], dd_position_array[0][1], 0.0};
+      // wedge location +- 1 degree theta and 0.5 radius
+      const std::array<double, 3> dr_dtheta{0.4, 0.0174533, 0.0};
+      const std::array<size_t, 3> bin_sizes{1, 5, 0};
+      const bool normalize = false;
+      const bool bias = false;
+      const std::string map_type = "ave";
+      const std::array<double, 3> sphere_center{-1.0, 0.0, 0.0};
+      std::vector<double> sphere_window_data(5, 0.0);
+      qindex.map_data_to_sphere_grid_window(dd_data, ghost_data, sphere_window_data, sphere_center,
+                                            center, dr_dtheta, bin_sizes, map_type, normalize,
+                                            bias);
+      std::vector<std::vector<double>> sphere_window_3x_data(3, std::vector<double>(5, 0.0));
+      qindex.map_data_to_sphere_grid_window(dd_3x_data, ghost_3x_data, sphere_window_3x_data,
+                                            sphere_center, center, dr_dtheta, bin_sizes, map_type,
+                                            normalize, bias);
+      std::vector<double> gold_window_data;
+      std::vector<std::vector<double>> gold_window_3x_data(3);
+      // different result than 1D because of the 1.0 y offset of the data
+      if (rtt_c4::node() == 0) {
+        gold_window_data = {0.0, 0.0, 3.0, 0.0, 0.0};
+        gold_window_3x_data[0] = {0.0, 0.0, 3.0, 0.0, 0.0};
+        gold_window_3x_data[1] = {0.0, 0.0, 4.0, 0.0, 0.0};
+        gold_window_3x_data[2] = {0.0, 0.0, -3.0, 0.0, 0.0};
+      } else if (rtt_c4::node() == 1) {
+        gold_window_data = {0.0, 0.0, 6.0, 0.0, 0.0};
+        gold_window_3x_data[0] = {0.0, 0.0, 6.0, 0.0, 0.0};
+        gold_window_3x_data[1] = {0.0, 0.0, 7.0, 0.0, 0.0};
+        gold_window_3x_data[2] = {0.0, 0.0, -6.0, 0.0, 0.0};
+      } else {
+        gold_window_data = {0.0, 0.0, 9.0, 0.0, 0.0};
+        gold_window_3x_data[0] = {0.0, 0.0, 9.0, 0.0, 0.0};
+        gold_window_3x_data[1] = {0.0, 0.0, 10.0, 0.0, 0.0};
+        gold_window_3x_data[2] = {0.0, 0.0, -9.0, 0.0, 0.0};
+      }
+
+      for (size_t i = 0; i < bin_sizes[0]; i++)
+        if (!rtt_dsxx::soft_equiv(sphere_window_data[i], gold_window_data[i]))
+          ITFAILS;
+      for (size_t v = 0; v < 3; v++)
+        for (size_t i = 0; i < bin_sizes[0]; i++)
+          if (!rtt_dsxx::soft_equiv(sphere_window_3x_data[v][i], gold_window_3x_data[v][i]))
+            ITFAILS;
+    }
+
+    // check nearest sphere window mapping (1x5 bins, more bins than data) functions
+    {
+      // build a length=1.0 window around the first point on each node
+      const std::array<double, 3> center{dd_position_array[0][0], dd_position_array[0][1], 0.0};
+      // wedge location +- 1 degree theta and 0.5 radius
+      const std::array<double, 3> dr_dtheta{0.4, 0.0174533, 0.0};
+      const std::array<size_t, 3> bin_sizes{1, 5, 0};
+      const bool normalize = false;
+      const bool bias = false;
+      const std::string map_type = "nearest";
+      const std::array<double, 3> sphere_center{-1.0, 0.0, 0.0};
+      std::vector<double> sphere_window_data(5, 0.0);
+      qindex.map_data_to_sphere_grid_window(dd_data, ghost_data, sphere_window_data, sphere_center,
+                                            center, dr_dtheta, bin_sizes, map_type, normalize,
+                                            bias);
+      std::vector<std::vector<double>> sphere_window_3x_data(3, std::vector<double>(5, 0.0));
+      qindex.map_data_to_sphere_grid_window(dd_3x_data, ghost_3x_data, sphere_window_3x_data,
+                                            sphere_center, center, dr_dtheta, bin_sizes, map_type,
+                                            normalize, bias);
+      std::vector<double> gold_window_data;
+      std::vector<std::vector<double>> gold_window_3x_data(3);
+      // different result than 1D because of the 1.0 y offset of the data
+      if (rtt_c4::node() == 0) {
+        gold_window_data = {0.0, 0.0, 3.0, 0.0, 0.0};
+        gold_window_3x_data[0] = {0.0, 0.0, 3.0, 0.0, 0.0};
+        gold_window_3x_data[1] = {0.0, 0.0, 4.0, 0.0, 0.0};
+        gold_window_3x_data[2] = {0.0, 0.0, -3.0, 0.0, 0.0};
+      } else if (rtt_c4::node() == 1) {
+        gold_window_data = {0.0, 0.0, 6.0, 0.0, 0.0};
+        gold_window_3x_data[0] = {0.0, 0.0, 6.0, 0.0, 0.0};
+        gold_window_3x_data[1] = {0.0, 0.0, 7.0, 0.0, 0.0};
+        gold_window_3x_data[2] = {0.0, 0.0, -6.0, 0.0, 0.0};
+      } else {
+        gold_window_data = {0.0, 0.0, 9.0, 0.0, 0.0};
+        gold_window_3x_data[0] = {0.0, 0.0, 9.0, 0.0, 0.0};
+        gold_window_3x_data[1] = {0.0, 0.0, 10.0, 0.0, 0.0};
+        gold_window_3x_data[2] = {0.0, 0.0, -9.0, 0.0, 0.0};
+      }
+
+      for (size_t i = 0; i < bin_sizes[0]; i++)
+        if (!rtt_dsxx::soft_equiv(sphere_window_data[i], gold_window_data[i]))
+          ITFAILS;
+      for (size_t v = 0; v < 3; v++)
+        for (size_t i = 0; i < bin_sizes[0]; i++)
+          if (!rtt_dsxx::soft_equiv(sphere_window_3x_data[v][i], gold_window_3x_data[v][i]))
+            ITFAILS;
+    }
+
+    // check max_fill window mapping (more bins than data) functions
+    {
+      // build a length=1.0 window around the first point on each node
+      const std::array<double, 3> min{dd_position_array[0][0] - 0.5, dd_position_array[0][1] - 0.5,
+                                      0.0};
+      const std::array<double, 3> max{dd_position_array[0][0] + 0.5, dd_position_array[0][1] + 0.5,
+                                      0.0};
+      const std::array<size_t, 3> bin_sizes{5, 1, 0};
+      const bool normalize = false;
+      const bool bias = false;
+      const std::string map_type = "max_fill";
+      std::vector<double> window_data(5, 0.0);
+      qindex.map_data_to_grid_window(dd_data, ghost_data, window_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+      std::vector<std::vector<double>> window_3x_data(3, std::vector<double>(5, 0.0));
+      qindex.map_data_to_grid_window(dd_3x_data, ghost_3x_data, window_3x_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+      std::vector<double> gold_window_data;
+      std::vector<std::vector<double>> gold_window_3x_data(3);
+      // different result than 1D because of the 1.0 y offset of the data
+      if (rtt_c4::node() == 0) {
+        gold_window_data = {0.0, 0.0, 3.0, 3.0, 3.0};
+        gold_window_3x_data[0] = {0.0, 0.0, 3.0, 3.0, 3.0};
+        gold_window_3x_data[1] = {0.0, 0.0, 4.0, 4.0, 4.0};
+        gold_window_3x_data[2] = {0.0, 0.0, -3.0, -3.0, -3.0};
+      } else if (rtt_c4::node() == 1) {
+        gold_window_data = {0.0, 0.0, 6.0, 6.0, 6.0};
+        gold_window_3x_data[0] = {0.0, 0.0, 6.0, 6.0, 6.0};
+        gold_window_3x_data[1] = {0.0, 0.0, 7.0, 7.0, 7.0};
+        gold_window_3x_data[2] = {0.0, 0.0, -6.0, -6.0, -6.0};
+      } else {
+        gold_window_data = {0.0, 0.0, 9.0, 9.0, 9.0};
+        gold_window_3x_data[0] = {0.0, 0.0, 9.0, 9.0, 9.0};
+        gold_window_3x_data[1] = {0.0, 0.0, 10.0, 10.0, 10.0};
+        gold_window_3x_data[2] = {0.0, 0.0, -9.0, -9.0, -9.0};
+      }
+
+      for (size_t i = 0; i < bin_sizes[0]; i++)
+        if (!rtt_dsxx::soft_equiv(window_data[i], gold_window_data[i]))
+          ITFAILS;
+      for (size_t v = 0; v < 3; v++)
+        for (size_t i = 0; i < bin_sizes[0]; i++)
+          if (!rtt_dsxx::soft_equiv(window_3x_data[v][i], gold_window_3x_data[v][i]))
+            ITFAILS;
+    }
+
+    // check y dim mapping only on rank 1 because the ghost data extends far enough to touch all domain space
+    if (rtt_c4::node() == 1) {
+      // build a length=1.0 window around the first point on each node
+      const std::array<double, 3> min{0.5, -0.5, 0.0};
+      const std::array<double, 3> max{1.5, 0.5, 0.0};
+      const std::array<size_t, 3> bin_sizes{1, 2, 0};
+      const bool normalize = false;
+      const bool bias = false;
+      const std::string map_type = "min";
+      std::vector<double> window_data(2, 0.0);
+      qindex.map_data_to_grid_window(dd_data, ghost_data, window_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+      std::vector<std::vector<double>> window_3x_data(3, std::vector<double>(2, 0.0));
+      qindex.map_data_to_grid_window(dd_3x_data, ghost_3x_data, window_3x_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+      std::vector<double> gold_window_data;
+      std::vector<std::vector<double>> gold_window_3x_data(3);
+      gold_window_data = {8.0, 4.0};
+      gold_window_3x_data[0] = {8.0, 4.0};
+      gold_window_3x_data[1] = {9.0, 5.0};
+      gold_window_3x_data[2] = {-9.0, -4.0};
+
+      for (size_t i = 0; i < bin_sizes[0]; i++)
+        if (!rtt_dsxx::soft_equiv(window_data[i], gold_window_data[i]))
+          ITFAILS;
+      for (size_t v = 0; v < 3; v++)
+        for (size_t i = 0; i < bin_sizes[0]; i++)
+          if (!rtt_dsxx::soft_equiv(window_3x_data[v][i], gold_window_3x_data[v][i]))
+            ITFAILS;
+    }
+
+    // check y dim mapping only on rank 1 because the ghost data extends far enough to touch all domain space
+    if (rtt_c4::node() == 1) {
+      // build a length=1.0 window around the first point on each node
+      const std::array<double, 3> min{0.5, -0.5, 0.0};
+      const std::array<double, 3> max{1.5, 0.5, 0.0};
+      const std::array<size_t, 3> bin_sizes{1, 2, 0};
+      const bool normalize = false;
+      const bool bias = false;
+      const std::string map_type = "ave";
+      std::vector<double> window_data(2, 0.0);
+      qindex.map_data_to_grid_window(dd_data, ghost_data, window_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+      std::vector<std::vector<double>> window_3x_data(3, std::vector<double>(2, 0.0));
+      qindex.map_data_to_grid_window(dd_3x_data, ghost_3x_data, window_3x_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+      std::vector<double> gold_window_data;
+      std::vector<std::vector<double>> gold_window_3x_data(3);
+      gold_window_data = {8.5, 4.0};
+      gold_window_3x_data[0] = {8.5, 4.0};
+      gold_window_3x_data[1] = {9.5, 5.0};
+      gold_window_3x_data[2] = {-8.5, -4.0};
+
+      for (size_t i = 0; i < bin_sizes[0]; i++)
+        if (!rtt_dsxx::soft_equiv(window_data[i], gold_window_data[i]))
+          ITFAILS;
+      for (size_t v = 0; v < 3; v++)
+        for (size_t i = 0; i < bin_sizes[0]; i++)
+          if (!rtt_dsxx::soft_equiv(window_3x_data[v][i], gold_window_3x_data[v][i]))
+            ITFAILS;
+    }
+
+    // check y dim mapping only on rank 1 because the ghost data extends far enough to touch all domain space
+    if (rtt_c4::node() == 1) {
+      // build a length=1.0 window around the first point on each node
+      const std::array<double, 3> min{0.45, -0.5, 0.0};
+      const std::array<double, 3> max{0.55, 0.5, 0.0};
+      const std::array<size_t, 3> bin_sizes{1, 2, 0};
+      const bool normalize = false;
+      const bool bias = false;
+      const std::string map_type = "ave";
+      std::vector<double> window_data(2, 0.0);
+      qindex.map_data_to_grid_window(dd_data, ghost_data, window_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+      std::vector<std::vector<double>> window_3x_data(3, std::vector<double>(2, 0.0));
+      qindex.map_data_to_grid_window(dd_3x_data, ghost_3x_data, window_3x_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+
+      std::vector<double> gold_window_data;
+      std::vector<std::vector<double>> gold_window_3x_data(3);
+      gold_window_data = {8.0, 0.0};
+      gold_window_3x_data[0] = {8.0, 0.0};
+      gold_window_3x_data[1] = {9.0, 0.0};
+      gold_window_3x_data[2] = {-8.0, 0.0};
+
+      for (size_t i = 0; i < bin_sizes[0]; i++)
+        if (!rtt_dsxx::soft_equiv(window_data[i], gold_window_data[i]))
+          ITFAILS;
+      for (size_t v = 0; v < 3; v++)
+        for (size_t i = 0; i < bin_sizes[0]; i++)
+          if (!rtt_dsxx::soft_equiv(window_3x_data[v][i], gold_window_3x_data[v][i]))
+            ITFAILS;
+    }
+
+    // check y dim mapping only on rank 1 because the ghost data extends far enough to touch all domain space
+    if (rtt_c4::node() == 1) {
+      // build a length=1.0 window around the first point on each node
+      const std::array<double, 3> min{0.5, -0.5, 0.0};
+      const std::array<double, 3> max{1.5, 0.5, 0.0};
+      const std::array<size_t, 3> bin_sizes{1, 2, 0};
+      const bool normalize = false;
+      const bool bias = false;
+      const std::string map_type = "max";
+      std::vector<double> window_data(2, 0.0);
+      qindex.map_data_to_grid_window(dd_data, ghost_data, window_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+      std::vector<std::vector<double>> window_3x_data(3, std::vector<double>(2, 0.0));
+      qindex.map_data_to_grid_window(dd_3x_data, ghost_3x_data, window_3x_data, min, max, bin_sizes,
+                                     map_type, normalize, bias);
+      std::vector<double> gold_window_data;
+      std::vector<std::vector<double>> gold_window_3x_data(3);
+      gold_window_data = {9.0, 4.0};
+      gold_window_3x_data[0] = {9.0, 4.0};
+      gold_window_3x_data[1] = {10.0, 5.0};
+      gold_window_3x_data[2] = {-8.0, -4.0};
+
+      for (size_t i = 0; i < bin_sizes[0]; i++)
+        if (!rtt_dsxx::soft_equiv(window_data[i], gold_window_data[i]))
+          ITFAILS;
+      for (size_t v = 0; v < 3; v++)
+        for (size_t i = 0; i < bin_sizes[0]; i++)
+          if (!rtt_dsxx::soft_equiv(window_3x_data[v][i], gold_window_3x_data[v][i]))
+            ITFAILS;
+    }
+  }
+
+  if (ut.numFails == 0) {
+    PASSMSG("quick_index DD checks pass");
+  } else {
+    FAILMSG("quick_index DD checks failed");
+  }
+}
+
+//------------------------------------------------------------------------------------------------//
+int main(int argc, char *argv[]) {
+  ParallelUnitTest ut(argc, argv, release);
+  try {
+    // >>> UNIT TESTS: replication always runs; decomposition only runs on exactly 3 ranks.
+    test_replication(ut);
+    if (nodes() == 3)
+      test_decomposition(ut);
+  }
+  UT_EPILOG(ut);
+}
+
+//------------------------------------------------------------------------------------------------//
+// end of tstquick_index.cc
+//------------------------------------------------------------------------------------------------//
diff --git a/src/mesh/python/README.md b/src/mesh/python/README.md
index 582480f581..9da5d2ec24 100644
--- a/src/mesh/python/README.md
+++ b/src/mesh/python/README.md
@@ -9,9 +9,17 @@ class has its own `__init__` method for calculating data needed to create X3D
 files (some of which should be usable for RTT file creation).
 For instance, one could add a mesh class that stochastically samples points and
 triangulates them to form a mesh of triangles.
+Currently supported meshes are:
+
+- orth_2d_mesh (default)
+- orth_3d_mesh
+- vor_2d_mesh
+
 Currently, there is a script called `x3d_generator.py`, which takes command-line
 input to instantiate a mesh object from one of the mesh_type classes and outputs
 a set of X3D files (main mesh file and boundary node files) of the mesh object.
+Another script, `x3d_plotter.py`, quickly plots the faces, nodes, and boundaries
+for a mesh in X3D format.
 
 ## Example Usage
 
@@ -21,3 +29,9 @@ in [0,1]x[0,2]x[0,4]:
 ```bash
 ./x3d_generator.py --mesh_type orth_3d_mesh --num_per_dim 4 4 4 --bnd_per_dim 0 1 0 2 0 4
 ```
+
+To display a plot of the generated mesh file:
+
+```bash
+./x3d_plotter.py -fn x3d.mesh.in
+```
diff --git a/src/mesh/python/mesh_types.py b/src/mesh/python/mesh_types.py
index bc7f7cdde8..5753ce9428 100644
--- a/src/mesh/python/mesh_types.py
+++ b/src/mesh/python/mesh_types.py
@@ -534,14 +534,9 @@ def norm(v):
             else:
                 ridge_vertices.append(verts)
 
-        # -- assign ridge vertices to regions and boundaries
+        # -- assign ridge vertices to regions
         cells = []
         cell_nodes = []
-        boundary_edges = {}
-        boundary_edges['xl'] = []
-        boundary_edges['xr'] = []
-        boundary_edges['yl'] = []
-        boundary_edges['yr'] = []
         for n, point in enumerate(points):
             cell_nodes.append(n)
             cells.append([])
@@ -561,46 +556,75 @@ def norm(v):
             else:
                 # -- a boundary
                 cells[indices[0]].append(ridge_idx)
-                if (soft_equiv(vertices[v_indices[0]][0], xmin) and
-                    soft_equiv(vertices[v_indices[1]][0], xmin)):
-                    boundary_edges['xl'].append(ridge_idx)
-                elif (soft_equiv(vertices[v_indices[0]][0], xmax) and
-                      soft_equiv(vertices[v_indices[1]][0], xmax)):
-                    boundary_edges['xr'].append(ridge_idx)
-                elif (soft_equiv(vertices[v_indices[0]][1], ymin) and
-                      soft_equiv(vertices[v_indices[1]][1], ymin)):
-                    boundary_edges['yl'].append(ridge_idx)
-                elif (soft_equiv(vertices[v_indices[0]][1], ymax) and
-                      soft_equiv(vertices[v_indices[1]][1], ymax)):
-                    boundary_edges['yr'].append(ridge_idx)
-                else:
-                    assert (False), 'Boundary edge not identified'
 
-        # -- update remaining base values
+        # -- update remaining base class values
         self.num_nodes = len(vertices)
         self.coordinates_per_node = np.zeros([self.num_nodes, 2])
         for n, vertex in enumerate(vertices):
             self.coordinates_per_node[n, 0] = vertex[0]
             self.coordinates_per_node[n, 1] = vertex[1]
-        self.num_faces = len(ridge_vertices)
         self.num_faces_per_cell = np.zeros(self.num_cells, dtype=int)
         for n in range(self.num_cells):
             self.num_faces_per_cell[n] = len(cells[n])
+        self.num_faces = sum(self.num_faces_per_cell)
         self.num_nodes_per_face = np.zeros(self.num_faces, dtype=int)
         for n in range(self.num_faces):
             self.num_nodes_per_face[n] = 2
         self.faces_per_cell = cells
         self.nodes_per_face = ridge_vertices
+
+        # -- rewrite cells and faces to not have duplicate faces
+        new_cells = []
+        new_faces = []
+        for cell in cells:
+            new_cell = []
+            for face in cell:
+                new_faces.append(ridge_vertices[face])
+                new_cell.append(len(new_faces) - 1)
+            new_cells.append(new_cell)
+        self.faces_per_cell = new_cells
+        self.nodes_per_face = new_faces
+
+        # -- write boundaries for new faces
+        boundary_edges = {}
+        boundary_edges['xl'] = []
+        boundary_edges['xr'] = []
+        boundary_edges['yl'] = []
+        boundary_edges['yr'] = []
+        for face_idx, v_indices in enumerate(self.nodes_per_face):
+            midpoint = [(vertices[v_indices[0]][0] + vertices[v_indices[1]][0]) / 2,
+                        (vertices[v_indices[0]][1] + vertices[v_indices[1]][1]) / 2]
+            distances = np.zeros(len(cell_nodes))
+            for n, node in enumerate(cell_nodes):
+                distances[n] = np.sqrt((points[n][0] - midpoint[0])**2 +
+                                       (points[n][1] - midpoint[1])**2)
+            indices = np.argsort(distances)
+            distances = np.sort(distances)
+            if not soft_equiv(distances[0], distances[1]):
+                # -- a boundary
+                if (soft_equiv(vertices[v_indices[0]][0], xmin) and
+                    soft_equiv(vertices[v_indices[1]][0], xmin)):
+                    boundary_edges['xl'].append(face_idx)
+                elif (soft_equiv(vertices[v_indices[0]][0], xmax) and
+                      soft_equiv(vertices[v_indices[1]][0], xmax)):
+                    boundary_edges['xr'].append(face_idx)
+                elif (soft_equiv(vertices[v_indices[0]][1], ymin) and
+                      soft_equiv(vertices[v_indices[1]][1], ymin)):
+                    boundary_edges['yl'].append(face_idx)
+                elif (soft_equiv(vertices[v_indices[0]][1], ymax) and
+                      soft_equiv(vertices[v_indices[1]][1], ymax)):
+                    boundary_edges['yr'].append(face_idx)
+                else:
+                    assert (False), 'Boundary edge not identified'
         self.nodes_per_side = []
         for n in range(4):
             bdy_key = list(boundary_edges.keys())[n]
             bdy_nodes = []
             for bdy in boundary_edges[bdy_key]:
-                nodes = ridge_vertices[bdy]
+                nodes = self.nodes_per_face[bdy]
                 for node in nodes:
-                    if node not in bdy_nodes:
-                        bdy_nodes.append(node)
-            self.nodes_per_side.append(bdy_nodes)
+                    bdy_nodes.append(node)
+            self.nodes_per_side.append(np.unique(bdy_nodes))
 
 
 # ------------------------------------------------------------------------------------------------ #
diff --git a/src/mesh/python/x3d_plotter.py b/src/mesh/python/x3d_plotter.py
new file mode 100755
index 0000000000..88c10fb333
--- /dev/null
+++ b/src/mesh/python/x3d_plotter.py
@@ -0,0 +1,160 @@
+#!/usr/bin/env python
+# -------------------------------------------*-python-*------------------------------------------- #
+# file  src/mesh/python/x3d_plotter.py
+# date  Monday, Aug 9, 2021
+# brief This script plots X3D mesh files.
+# note  Copyright (C) 2021, Triad National Security, LLC.,  All rights reserved.
+# ------------------------------------------------------------------------------------------------ #
+import matplotlib.pyplot as plt
+import argparse
+import os
+
+# ------------------------------------------------------------------------------------------------ #
+# -- create argument parser
+
+parser = argparse.ArgumentParser(description='Plot X3D mesh file.')
+parser.add_argument('-fn', '--file_name', type=str, default=None, required=True,
+                    help='Provide mesh file to plot.')
+
+# -- parse arguments from command line
+args = parser.parse_args()
+
+# ------------------------------------------------------------------------------------------------ #
+# -- Read and parse x3d file
+
+assert (os.path.exists(args.file_name)), f"Mesh file \"{args.file_name}\" does not exist!"
+with open(args.file_name) as f:
+    lines = [line.strip() for line in f]
+
+# Data to read in
+numdim = None
+numnodes = None
+numfaces = None
+numcells = None
+nodes = []
+face_indices = []
+faces = []
+cells = []
+
+# Mesh data is grouped into "<block>" ... "end_<block>" sections; only these blocks are parsed
+blocks = ['header', 'nodes', 'faces', 'cells']
+current_block = None
+for line in lines:
+    words = line.split()
+    # If no current block, check if starting new block
+    if current_block is None:
+        if line in blocks:
+            current_block = line
+    # If current block, check if ending current block
+    elif line == "end_" + current_block:
+        current_block = None
+
+    # Nothing to process on blank lines or outside of a known block
+    if current_block is None or not words:
+        continue
+
+    # Process data if currently on a block
+    if current_block == 'header':
+        # Header entries are read as "<keyword> <value>" pairs; skip lines without a value
+        if len(words) < 2:
+            continue
+        if words[0] == 'numdim':
+            numdim = int(words[1])
+        elif words[0] == 'nodes':
+            numnodes = int(words[1])
+        elif words[0] == 'faces':
+            numfaces = int(words[1])
+        elif words[0] == 'elements':
+            numcells = int(words[1])
+    elif current_block == 'nodes':
+        # A node line has four fields; fields 2-4 are read as the node coordinates
+        if len(words) == 4:
+            nodes.append([float(words[1]), float(words[2]), float(words[3])])
+    elif current_block == 'faces':
+        if len(words) >= 3:
+            # words[0] is the face ID and words[1] the number of node IDs that follow
+            # Convert from file node ID to code node index
+            faces.append([int(words[nnode + 2]) - 1 for nnode in range(int(words[1]))])
+            face_indices.append(int(words[0]))
+    elif current_block == 'cells':
+        if len(words) >= 3:
+            # words[1] is the number of face IDs that follow
+            # Convert from file face ID to code face index
+            cells.append([int(words[nface + 2]) - 1 for nface in range(int(words[1]))])
+
+# Sort faces in case they are out of order
+faces = [x for _, x in sorted(zip(face_indices, faces))]
+
+# Read boundaries
+boundary_files = []
+boundary_nodes = []
+boundary_faces = []
+if numdim == 2:
+    # Boundary file names are derived from the mesh file name: <name>.in -> <name>.bdy<1-4>.in
+    assert (args.file_name[-3:] == '.in'), "Filename does not end in \".in\""
+    for n in range(4):
+        boundary_files.append(args.file_name[:-3] + f".bdy{n+1}.in")
+
+for boundary_file in boundary_files:
+    with open(boundary_file) as f:
+        lines = [line.strip() for line in f]
+    # -- read in boundary nodes (one file node ID per line; blank lines are skipped)
+    boundary = [int(line) - 1 for line in lines if line]
+    boundary_nodes.append(boundary)
+
+    # -- calculate boundary faces: a face is kept when both of its nodes are boundary nodes
+    boundary_set = set(boundary)
+    boundary_face_tmp = []
+    for face_idx, face in enumerate(faces):
+        if face[0] in boundary_set and face[1] in boundary_set:
+            boundary_face_tmp.append(face_idx)
+    boundary_faces.append(boundary_face_tmp)
+
+
+# -- sanity checks
+assert (numdim is not None), "numdim not found!"
+assert (numnodes is not None), "numnodes not found!"
+assert (numfaces is not None), "numfaces not found!"
+assert (numcells is not None), "numcells not found!"
+assert (len(nodes) == numnodes), "numnodes does not match number of nodes!"
+assert (len(faces) == numfaces), "numfaces does not match number of faces!"
+assert (len(cells) == numcells), "numcells does not match number of cells!"
+
+# ------------------------------------------------------------------------------------------------ #
+# -- Plot mesh
+
+if numdim == 1:
+    assert (False), "1D plotting not supported!"
+elif numdim == 2:
+
+    plt.figure()
+    ax = plt.gca()
+
+    # -- plot faces
+    plotted_faces = set()
+    for cell in cells:
+        for face in cell:
+            node0, node1 = faces[face][0], faces[face][1]
+            # Don't re-plot the same face; a face is the same regardless of its node order
+            edge = (min(node0, node1), max(node0, node1))
+            if edge not in plotted_faces:
+                plotted_faces.add(edge)
+                pt1 = nodes[node0]
+                pt2 = nodes[node1]
+                ax.plot([pt1[0], pt2[0]], [pt1[1], pt2[1]], color='k')
+
+    # -- plot boundary faces
+    colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red']
+    for n, bound in enumerate(boundary_faces):
+        for face in bound:
+            pt1 = nodes[faces[face][0]]
+            pt2 = nodes[faces[face][1]]
+            ax.plot([pt1[0], pt2[0]], [pt1[1], pt2[1]], color=colors[n], linewidth=4)
+
+    # -- plot nodes
+    for node in nodes:
+        ax.plot([node[0]], [node[1]], marker='.', color='b')
+    plt.show()
+
+elif numdim == 3:
+    assert (False), "3D plotting not supported!"
diff --git a/src/viz/Ensight_Translator.cc b/src/viz/Ensight_Translator.cc
index 4f58a5ff39..71a878f134 100644
--- a/src/viz/Ensight_Translator.cc
+++ b/src/viz/Ensight_Translator.cc
@@ -366,12 +366,12 @@ void Ensight_Translator::write_case() {
   // write the pointer to the node variables
   for (auto &data_name : d_vdata_names)
     caseout << "scalar per node:    1  " << setw(19) << setiosflags(ios::left) << data_name
-            << setw(4) << " ./" << data_name << "/data.****\n";
+            << setw(3) << " ./" << data_name << "/data.****\n";
 
   // write the pointer to the cell variables
   for (auto &cdata_name : d_cdata_names)
     caseout << "scalar per element: 1  " << setw(19) << setiosflags(ios::left) << cdata_name
-            << setw(4) << " ./" << cdata_name << "/data.****\n";
+            << setw(3) << " ./" << cdata_name << "/data.****\n";
 
   // write out the time block
   caseout << "\nTIME\n"
diff --git a/tools/check_style.sh b/tools/check_style.sh
index fdf58d0a75..f412e415ba 100755
--- a/tools/check_style.sh
+++ b/tools/check_style.sh
@@ -387,6 +387,119 @@ if [[ -x "$FPY" ]]; then
 
 fi
 
+# ------------------------------------------------------------------------------------------------ #
+# Check copyright block
+# ------------------------------------------------------------------------------------------------ #
+
+echo -ne "\n--------------------------------------------------------------------------------\n"
+echo -e "Checking modified code for copyright block conformance.\n"
+# Scratch file that accumulates one unified diff per file whose Copyright line needs updating.
+patchfile_cb=$(mktemp /tmp/copyright_block.patch.XXXXXXXX)
+
+# File types to parse.  Presumably read by matches_extension (defined elsewhere) - TODO confirm.
+FILE_EXTS=".c .cc .cmake .h .hh .in .f90 .F90 .f .F .py .txt"
+#FILE_ENDINGS_INCLUDE="_f.h _f77.h _f90.h"
+FILE_ENDINGS_EXCLUDE="ChangeLog Release.cc"
+export FILE_EXTS FILE_ENDINGS_EXCLUDE
+
+# Loop over all modified files.  Create one patch containing all changes to these files
+for file in $modifiedfiles; do
+
+  # ignore file if we do check for file extensions and the file does not match any of the
+  # extensions specified in $FILE_EXTS
+  if ! matches_extension "$file"; then continue; fi
+
+  file_nameonly=$(basename "${file}")
+  tmpfile1="/tmp/copyright-${file_nameonly}"
+
+  # Copy the file and attempt to update it.
+  cp "${file}" "${tmpfile1}"
+
+  today=$(date +%Y)
+
+  # This data was found in the header comments.  It might be a single year or a range.
+  crl=$(grep Copyright "${tmpfile1}")
+  # shellcheck disable=SC2001
+  create_date=$(echo "${crl}" | sed -e 's/.* \([0-9][0-9]*\).*/\1/')
+
+  # These dates are reported by git
+  git_last_mod_date=$(git log -1 "${file}" | grep Date | \
+                              sed -e 's/.* \([0-9][0-9][0-9][0-9]\).*/\1/')
+  git_create_date=$(git log "${file}" | grep Date | tail -n 1 | \
+                            sed -e 's/.* \([0-9][0-9][0-9][0-9]\).*/\1/')
+
+  # Sanity Checks.  When sed finds no year it passes its input through unchanged, so a leftover
+  # "Copyright" in a parsed value means the extraction failed.
+  [[ "${create_date}" =~ "Copyright" ]] && die "Failed to parse copyright line"
+  [[ "${git_last_mod_date}" =~ "Copyright" ]] && die "Failed to parse copyright line"
+  [[ "${git_create_date}" =~ "Copyright" ]] && die "Failed to parse copyright line"
+  if [[ "${create_date}" -gt "${today}" ]] || [[ "${create_date}" -lt "1990" ]]; then
+    die "Existing copyright date range is corrupt. Please fix $file manually."
+  fi
+  if [[ "${git_create_date}" -gt "${today}" ]] || [[ "${git_create_date}" -lt "1990" ]]; then
+    die "Existing copyright date range is corrupt. Please fix $file manually."
+  fi
+  if [[ "${git_last_mod_date}" -gt "${today}" ]] || [[ "${git_last_mod_date}" -lt "1990" ]]; then
+    die "Last modification date reported by git is corrupt. Please check $file manually."
+  fi
+
+  # We converted from CVS to svn in 2010. This is the oldest create date that git will report.  In
+  # this case older data is lost, so just use whatever is in the file as the create date.
+  [[ "${git_create_date}" -lt "2011" ]] && git_create_date="${create_date}"
+
+  # Expected Copyright line:
+  ecrl="Copyright (C) ${git_create_date}-${today} Triad National Security, LLC., "
+  ecrl+=" All rights reserved."
+
+  # If existing copyright spans two lines, reduce it to one line.
+  twolines=$(grep -A 1 Copyright "${tmpfile1}" | tail -n 1 | grep -c reserved)
+  if [[ $twolines -gt 0 ]]; then
+    sed -i 's/All rights reserved[.]*//' "${tmpfile1}"
+  fi
+
+  # Does the 'copyright' line carry a terminating comment marker (a literal '*/')?  If so, keep it.
+  ecm=""
+  if [[ $(echo "${crl}" | grep -cF '*/') -gt 0 ]]; then
+    ecm=" */"
+  fi
+
+  # Replace copyright with new one
+  sed -i "s%Copyright.*%${ecrl}${ecm}%" "${tmpfile1}"
+  diff -u "${file}" "${tmpfile1}" | \
+    sed -e "1s|--- |--- a/|" -e "2s|+++ ${tmpfile1}|+++ b/${file}|" >> "$patchfile_cb"
+  rm "${tmpfile1}"
+
+  unset today
+  unset crl
+  unset create_date
+  unset git_last_mod_date
+  unset git_create_date
+  unset ecrl
+  unset twolines
+  unset ecm
+
+done
+
+# An empty (size 0) patch file means no changes are needed.
+if [[ ! -s "$patchfile_cb" ]]; then
+  echo -n "PASS: Changes to sources conform to this project's Copyright block requirements."
+else
+  # At least one checked file needs a new Copyright line: record that issues were found.
+  foundissues=1
+  echo -ne "FAIL: some files do not conform to this project's Copyright block requirements:\n"
+  if [[ "${fix_mode}" == 1 ]]; then
+    # Fix mode: hand the generated patch to 'git apply' through the script's run helper.
+    run "git apply $patchfile_cb"
+    echo -ne "\n      Changes have been made to your files to meet Copyright block guidelines."
+    echo -ne "\n      Please check the updated files and add them to your commit.\n"
+  else
+    # Report-only mode: tell the user how to apply the patch, then print it.
+    echo -ne "      run ${0##*/} with option -f to automatically apply this patch.\n"
+    cat "$patchfile_cb"
+  fi
+fi
+rm -f "${patchfile_cb}"
+
 #--------------------------------------------------------------------------------------------------#
 # Done
 #--------------------------------------------------------------------------------------------------#