Skip to content

Commit

Permalink
Changed DStandardScaler to use population variance instead of sample variance …
Browse files Browse the repository at this point in the history
…and fixed a bug in grabbing the correct shape for channels-first arrays
  • Loading branch information
djgagne committed Jul 26, 2024
1 parent 2fd99bf commit 94db93d
Showing 1 changed file with 23 additions and 13 deletions.
36 changes: 23 additions & 13 deletions bridgescaler/distributed.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from numba import guvectorize, float32, float64, void
CENTROID_DTYPE = np.dtype([('mean', np.float64), ('weight', np.float64)])


class DBaseScaler(object):
"""
Base distributed scaler class. Used only to store attributes and methods shared across all distributed
Expand Down Expand Up @@ -184,23 +185,26 @@ def fit(self, x, weight=None):
else:
assert x.shape[channel_dim] == self.x_columns_.shape[0], "New data has a different number of columns"
if is_array:
x_col_order = np.arange(x.shape[-1])
if self.channels_last:
x_col_order = np.arange(x.shape[-1])
else:
x_col_order = np.arange(x.shape[1])
else:
x_col_order = self.get_column_order(x_columns)
# update derived from
# https://math.stackexchange.com/questions/2971315/how-do-i-combine-standard-deviations-of-two-groups
new_n = xv.shape[0]
for i, o in enumerate(x_col_order):
if self.channels_last:
new_mean = np.nanmean(xv[..., i])
new_var = np.nanvar(xv[..., i], ddof=1)
new_var = np.nanvar(xv[..., i])
else:
new_mean = np.nanmean(xv[:, i])
new_var = np.nanvar(xv[:, i], ddof=1)
new_n = xv.shape[0]
combined_mean = (self.n_ * self.mean_x_[o] + x.shape[0] * new_mean) / (self.n_ + x.shape[0])
weighted_var = ((self.n_ - 1) * self.var_x_[o] + (new_n - 1) * new_var) / (self.n_ + new_n - 1)
var_correction = self.n_ * new_n * (self.mean_x_[o] - new_mean) ** 2 / (
(self.n_ + new_n) * (self.n_ + new_n - 1))
new_var = np.nanvar(xv[:, i])
combined_mean = (self.n_ * self.mean_x_[o] + new_n * new_mean) / (self.n_ + new_n)
weighted_var = (self.n_ * self.var_x_[o] + new_n * new_var) / (self.n_ + new_n)
var_correction = (self.n_ * new_n * (self.mean_x_[o] - new_mean) ** 2) / (
(self.n_ + new_n) ** 2)
self.mean_x_[o] = combined_mean
self.var_x_[o] = weighted_var + var_correction
self.n_ += new_n
Expand Down Expand Up @@ -250,9 +254,9 @@ def __add__(self, other):
assert np.all(other.x_columns_ == self.x_columns_), "Scaler columns do not match."
current = deepcopy(self)
current.mean_x_ = (self.n_ * self.mean_x_ + other.n_ * other.mean_x_) / (self.n_ + other.n_)
combined_var = ((self.n_ - 1) * self.var_x_ + (other.n_ - 1) * other.var_x_) / (self.n_ + other.n_ - 1)
combined_var_corr = self.n_ * other.n_ * (self.mean_x_ - other.mean_x_) ** 2 / (
(self.n_ + other.n_) * (self.n_ + other.n_ - 1))
combined_var = (self.n_ * self.var_x_ + other.n_ * other.var_x_) / (self.n_ + other.n_)
combined_var_corr = (self.n_ * other.n_ * (self.mean_x_ - other.mean_x_) ** 2) / (
(self.n_ + other.n_) ** 2)
current.var_x_ = combined_var + combined_var_corr
current.n_ = self.n_ + other.n_
return current
Expand Down Expand Up @@ -291,7 +295,10 @@ def fit(self, x, weight=None):
else:
assert x.shape[channel_dim] == self.x_columns_.shape[0], "New data has a different number of columns"
if is_array:
x_col_order = np.arange(x.shape[-1])
if self.channels_last:
x_col_order = np.arange(x.shape[-1])
else:
x_col_order = np.arange(x.shape[1])
else:
x_col_order = self.get_column_order(x_columns)
if self.channels_last:
Expand Down Expand Up @@ -539,7 +546,10 @@ def fit(self, x, weight=None):
else:
assert x.shape[channel_dim] == self.x_columns_.shape[0], "New data has a different number of columns"
if is_array:
x_col_order = np.arange(x.shape[-1])
if self.channels_last:
x_col_order = np.arange(x.shape[-1])
else:
x_col_order = np.arange(x.shape[1])
else:
x_col_order = self.get_column_order(x_columns)
td_objs = self.attributes_to_td_objs()
Expand Down

0 comments on commit 94db93d

Please sign in to comment.