Changed DStandardScaler to use population instead of sample variance and fixed a bug with grabbing the correct shape for channels-first arrays
NCAR · Jul 26, 2024 · 94db93d · 94db93d
1 parent 2fd99bf
commit 94db93d
Showing 1 changed file with 23 additions and 13 deletions.
diff --git a/bridgescaler/distributed.py b/bridgescaler/distributed.py
@@ -11,6 +11,7 @@
 from numba import guvectorize, float32, float64, void
 CENTROID_DTYPE = np.dtype([('mean', np.float64), ('weight', np.float64)])
 
+
 class DBaseScaler(object):
     """
     Base distributed scaler class. Used only to store attributes and methods shared across all distributed
@@ -184,23 +185,26 @@ def fit(self, x, weight=None):
         else:
             assert x.shape[channel_dim] == self.x_columns_.shape[0], "New data has a different number of columns"
             if is_array:
-                x_col_order = np.arange(x.shape[-1])
+                if self.channels_last:
+                    x_col_order = np.arange(x.shape[-1])
+                else:
+                    x_col_order = np.arange(x.shape[1])
             else:
                 x_col_order = self.get_column_order(x_columns)
             # update derived from
             # https://math.stackexchange.com/questions/2971315/how-do-i-combine-standard-deviations-of-two-groups
+            new_n = xv.shape[0]
             for i, o in enumerate(x_col_order):
                 if self.channels_last:
                     new_mean = np.nanmean(xv[..., i])
-                    new_var = np.nanvar(xv[..., i], ddof=1)
+                    new_var = np.nanvar(xv[..., i])
                 else:
                     new_mean = np.nanmean(xv[:, i])
-                    new_var = np.nanvar(xv[:, i], ddof=1)
-                new_n = xv.shape[0]
-                combined_mean = (self.n_ * self.mean_x_[o] + x.shape[0] * new_mean) / (self.n_ + x.shape[0])
-                weighted_var = ((self.n_ - 1) * self.var_x_[o] + (new_n - 1) * new_var) / (self.n_ + new_n - 1)
-                var_correction = self.n_ * new_n * (self.mean_x_[o] - new_mean) ** 2 / (
-                        (self.n_ + new_n) * (self.n_ + new_n - 1))
+                    new_var = np.nanvar(xv[:, i])
+                combined_mean = (self.n_ * self.mean_x_[o] + new_n * new_mean) / (self.n_ + new_n)
+                weighted_var = (self.n_ * self.var_x_[o] + new_n * new_var) / (self.n_ + new_n)
+                var_correction = (self.n_ * new_n * (self.mean_x_[o] - new_mean) ** 2) / (
+                        (self.n_ + new_n) ** 2)
                 self.mean_x_[o] = combined_mean
                 self.var_x_[o] = weighted_var + var_correction
             self.n_ += new_n
@@ -250,9 +254,9 @@ def __add__(self, other):
         assert np.all(other.x_columns_ == self.x_columns_), "Scaler columns do not match."
         current = deepcopy(self)
         current.mean_x_ = (self.n_ * self.mean_x_ + other.n_ * other.mean_x_) / (self.n_ + other.n_)
-        combined_var = ((self.n_ - 1) * self.var_x_ + (other.n_ - 1) * other.var_x_) / (self.n_ + other.n_ - 1)
-        combined_var_corr = self.n_ * other.n_ * (self.mean_x_ - other.mean_x_) ** 2 / (
-                (self.n_ + other.n_) * (self.n_ + other.n_ - 1))
+        combined_var = (self.n_ * self.var_x_ + other.n_ * other.var_x_) / (self.n_ + other.n_)
+        combined_var_corr = (self.n_ * other.n_ * (self.mean_x_ - other.mean_x_) ** 2) / (
+            (self.n_ + other.n_) ** 2)
         current.var_x_ = combined_var + combined_var_corr
         current.n_ = self.n_ + other.n_
         return current
@@ -291,7 +295,10 @@ def fit(self, x, weight=None):
         else:
             assert x.shape[channel_dim] == self.x_columns_.shape[0], "New data has a different number of columns"
             if is_array:
-                x_col_order = np.arange(x.shape[-1])
+                if self.channels_last:
+                    x_col_order = np.arange(x.shape[-1])
+                else:
+                    x_col_order = np.arange(x.shape[1])
             else:
                 x_col_order = self.get_column_order(x_columns)
             if self.channels_last:
@@ -539,7 +546,10 @@ def fit(self, x, weight=None):
         else:
             assert x.shape[channel_dim] == self.x_columns_.shape[0], "New data has a different number of columns"
             if is_array:
-                x_col_order = np.arange(x.shape[-1])
+                if self.channels_last:
+                    x_col_order = np.arange(x.shape[-1])
+                else:
+                    x_col_order = np.arange(x.shape[1])
             else:
                 x_col_order = self.get_column_order(x_columns)
             td_objs = self.attributes_to_td_objs()