diff --git a/skgstat/DirectionalVariogram.py b/skgstat/DirectionalVariogram.py index 171d6ec..9533b61 100644 --- a/skgstat/DirectionalVariogram.py +++ b/skgstat/DirectionalVariogram.py @@ -6,6 +6,7 @@ from .Variogram import Variogram from skgstat import plotting +from .MetricSpace import MetricSpace, MetricSpacePair class DirectionalVariogram(Variogram): @@ -227,8 +228,14 @@ def __init__(self, self._direction_mask_cache = None + if not isinstance(coordinates, MetricSpace): + coordinates = np.asarray(coordinates) + coordinates = MetricSpace(coordinates.copy(), dist_func, maxlag if maxlag and not isinstance(maxlag, str) and maxlag >= 1 else None) + else: + assert self.dist_func == coordinates.dist_metric, "Distance metric of variogram differs from distance metric of coordinates" + # Set coordinates - self._X = np.asarray(coordinates) + self._X = coordinates # pairwise difference self._diff = None @@ -310,7 +317,6 @@ def __init__(self, self.fit(force=True) def preprocessing(self, force=False): - self._calc_distances(force=force) self._calc_direction_mask_data(force) self._calc_diff(force=force) self._calc_groups(force=force) @@ -344,24 +350,28 @@ def _calc_direction_mask_data(self, force=False): `azimuth `_ """ + + # FIXME: This should be optimized for the sparse case (range << bbox(coordinates)), + # i.e. use the MetricSpace in self._X + # check if already calculated if self._angles is not None and not force: return - # if self._X is of just one dimension, concat zeros. - if self._X.ndim == 1: - _x = np.vstack(zip(self._X, np.zeros(len(self._X)))) - elif self._X.ndim == 2: - _x = self._X + # if self.coordinates is of just one dimension, concat zeros. + if self.coordinates.ndim == 1: + _x = np.vstack(zip(self.coordinates, np.zeros(len(self.coordinates)))) + elif self.coordinates.ndim == 2: + _x = self.coordinates else: raise NotImplementedError('N-dimensional coordinates cannot be handled') # for angles, we need Euklidean distance, # no matter which distance function is used - if self._dist_func_name == "euclidean": - self._euclidean_dist = self._dist - else: - self._euclidean_dist = pdist(_x, "euclidean") + #if self._dist_func_name == "euclidean": + # self._euclidean_dist = scipy.spatial.distance.squareform(self.distance_matrix) + #else: + self._euclidean_dist = pdist(_x, "euclidean") # Calculate the angles # (a - b).[1,0] = ||a - b|| * ||[1,0]|| * cos(v) diff --git a/skgstat/Kriging.py b/skgstat/Kriging.py index d691627..3e4523b 100644 --- a/skgstat/Kriging.py +++ b/skgstat/Kriging.py @@ -13,7 +13,7 @@ import scipy.spatial.distance from .Variogram import Variogram - +from .MetricSpace import MetricSpace, MetricSpacePair class LessPointsError(RuntimeError): pass @@ -30,7 +30,6 @@ class IllMatrixError(RuntimeWarning): def inv_solve(a, b): return inv(a).dot(b) - class OrdinaryKriging: def __init__( self, @@ -41,7 +40,11 @@ def __init__( precision=100, solver='inv', n_jobs=1, - perf=False + perf=False, + sparse=False, + + coordinates=None, + values=None ): """Ordinary Kriging routine @@ -88,14 +91,25 @@ def __init__( If True, the different parts of the algorithm will record their processing time. This is meant to be used for optimization and will be removed in a future version. Do not rely on this argument. + sparse : bool + + coordinates: numpy.ndarray, MetricSpace + values: numpy.ndarray """ # store arguments to the instance - if not isinstance(variogram, Variogram): - raise TypeError('variogram has to be of type skgstat.Variogram.') + if isinstance(variogram, Variogram): + if coordinates is None: coordinates = variogram.coordinates + if values is None: values = variogram.values + variogram_descr = variogram.describe() + if variogram_descr["model"] == "harmonize": + variogram_descr["model"] = variogram._build_harmonized_model() + variogram = variogram_descr + + self.sparse = sparse + # general attributes - self.V = variogram self._minp = min_points self._maxp = max_points self.min_points = min_points @@ -105,14 +119,21 @@ def __init__( self.n_jobs = n_jobs self.perf = perf - params = self.V.describe() - self.range = params['effective_range'] - self.nugget = params['nugget'] - self.sill = params['sill'] - + self.range = variogram['effective_range'] + self.nugget = variogram['nugget'] + self.sill = variogram['sill'] + self.dist_metric = variogram["dist_func"] + # coordinates and semivariance function - self.coords, self.values = self._get_coordinates_and_values() - self.gamma_model = self.V.fitted_model + if not isinstance(coordinates, MetricSpace): + coordinates, values = self._remove_duplicated_coordinates(coordinates, values) + coordinates = MetricSpace(coordinates.copy(), self.dist_metric, self.range if self.sparse else None) + else: + assert self.dist_metric == coordinates.dist_metric, "Distance metric of variogram differs from distance metric of coordinates" + assert coordinates.max_dist is None or coordinates.max_dist == self.range, "Sparse coordinates must have max_dist == variogram.effective_range" + self.values = values.copy() + self.coords = coordinates + self.gamma_model = Variogram.fitted_model_function(**variogram) self.z = None # calculation mode; self.range has to be initialized @@ -132,40 +153,28 @@ def __init__( self.singular_error = 0 self.no_points_error = 0 self.ill_matrix = 0 - + # performance counter if self.perf: self.perf_dist = list() self.perf_mat = list() self.perf_solv = list() - @property - def dist(self): - return self.V._dist_func_wrapper - - @property - def dist_metric(self): - return self.V._dist_func_name - - def _get_coordinates_and_values(self): + def dist(self, x): + return Variogram.wrapped_distance_function(self.dist_metric, x) + + @classmethod + def _remove_duplicated_coordinates(cls, coords, values): """Extract the coordinates and values - The coordinates and values array is extracted from the Variogram - instance. Additionally, the coordinates array is checked for - duplicates and only the first instance of a duplicate is used. - Duplicated coordinates would result in duplicated rows in the - variogram matrix and make it singular. - - Returns - ------- - coords : numpy.array - copy of Variogram.coordines without duplicates - values : numpy.array - copy of Variogram.values without duplicates + The coordinates array is checked for duplicates and only the + first instance of a duplicate is used. Duplicated coordinates + would result in duplicated rows in the variogram matrix and + make it singular. """ - c = self.V.coordinates.copy() - v = self.V.values.copy() + c = coords + v = values _, idx = np.unique(c, axis=0, return_index=True) @@ -260,7 +269,7 @@ def transform(self, *x): Parameters ---------- - x : numpy.array + x : numpy.array, MetricSpace One 1D array for each coordinate dimension. Typically two or three array, x, y, (z) are passed for 2D and 3D Kriging @@ -279,20 +288,23 @@ def transform(self, *x): if self.perf: self.perf_dist, self.perf_mat, self.perf_solv = [], [], [] - self.transform_coordinates = np.column_stack(x) - self.transform_dists = scipy.spatial.distance.cdist(self.transform_coordinates, self.coords, metric=self.dist_metric) + if len(x) != 1 or not isinstance(x[0], MetricSpace): + self.transform_coords = MetricSpace(np.column_stack(x).copy(), self.dist_metric, self.range if self.sparse else None) + else: + self.transform_coords = x[0] + self.transform_coords_pair = MetricSpacePair(self.transform_coords, self.coords) # DEV: this is dirty, not sure how to do it better at the moment self.sigma = np.empty(len(x[0])) self.__sigma_index = 0 # if multi-core, than here if self.n_jobs is None or self.n_jobs == 1: - z = np.fromiter(map(self._estimator, range(len(self.transform_coordinates))), dtype=float) + z = np.fromiter(map(self._estimator, range(len(self.transform_coords))), dtype=float) else: def f(idxs): return self._estimator(idxs) with Pool(self.n_jobs) as p: - z = p.starmap(f, range(len(self.transform_coordinates))) + z = p.starmap(f, range(len(self.transform_coords))) # print warnings if self.singular_error > 0: @@ -389,25 +401,19 @@ def _krige(self, idx): if self.perf: t0 = time.time() - p = self.transform_coordinates[idx,:] - dists = self.transform_dists[idx,:] - - # find all points within the search distance - idx = np.where(dists <= self.range)[0] + p = self.transform_coords.coords[idx,:] + idx = self.transform_coords_pair.find_closest(idx, self.range, self._maxp) + # raise an error if not enough points are found if idx.size < self._minp: raise LessPointsError - - if idx.size > self._maxp: - sorted_idx = np.argsort(dists) - idx = sorted_idx[np.isin(sorted_idx, idx)][:self._maxp] - + # finally find the points and values - in_range = self.coords[idx] + in_range = self.coords.coords[idx] values = self.values[idx] - dist_mat = self.dist(in_range) - + dist_mat = self.coords.diagonal(idx) + # if performance is tracked, time this step if self.perf: t1 = time.time() diff --git a/skgstat/MetricSpace.py b/skgstat/MetricSpace.py new file mode 100644 index 0000000..dcc1020 --- /dev/null +++ b/skgstat/MetricSpace.py @@ -0,0 +1,120 @@ +import scipy.spatial +import scipy.spatial.distance +import scipy.sparse +import numpy as np + +def sparse_dok_get(m, fill_value=np.NaN): + mm = np.full(m.shape, fill_value) + for (x, y), value in m.items(): + mm[x,y] = value + return mm + +class DistanceMethods(object): + def find_closest(self, idx, max_dist = None, N = None): + """Find the (N) closest points (in the right set) to the point with + index idx (in the left set). + """ + assert self.max_dist is None or max_dist is None or max_dist == self.max_dist, "max_dist specified and max_dist != self.max_dist" + if max_dist is None: max_dist = self.max_dist + if isinstance(self.dists, scipy.sparse.spmatrix): + dists = self.dists.getrow(idx) + else: + dists = self.dists[idx,:] + if isinstance(dists, scipy.sparse.spmatrix): + ridx = np.array([k[1] for k in dists.todok().keys()]) + else: + ridx = np.where(dists <= max_dist)[0] + if ridx.size > N: + if isinstance(dists, scipy.sparse.spmatrix): + selected_dists = dists[0, ridx].toarray()[0,:] + else: + selected_dists = dists[ridx] + sorted_ridx = np.argsort(selected_dists, kind="stable") + ridx = ridx[sorted_ridx][:N] + return ridx + +class MetricSpace(DistanceMethods): + """A MetricSpace represents a point cloud together with a distance + metric and possibly a maximum distance. It efficiently provides + the distances between each point pair (when shorter than the + maximum distance). + + Note: If a max_dist is specified a sparse matrix representation is + used for the distances, which saves space and calculation time for + large datasets, especially where max_dist << the size of the point + cloud in space. However, it slows things down for small datasets. + """ + + def __init__(self, coords, dist_metric="euclidean", max_dist=None): + self.coords = coords.copy() + self.dist_metric = dist_metric + self.max_dist = max_dist + self._tree = None + self._dists = None + # Do a very quick check to see throw exceptions if self.dist_metric is invalid... + scipy.spatial.distance.pdist(self.coords[:1,:], metric=self.dist_metric) + + + @property + def tree(self): + assert self.dist_metric == "euclidean", "A coordinate tree can only be constructed for an euclidean space" + if self._tree is None: + self._tree = scipy.spatial.cKDTree(self.coords) + return self._tree + + @property + def dists(self): + if self._dists is None: + if self.max_dist is not None and self.dist_metric == "euclidean": + self._dists = self.tree.sparse_distance_matrix(self.tree, self.max_dist, output_type="coo_matrix").tocsr() + else: + self._dists = scipy.spatial.distance.squareform( + scipy.spatial.distance.pdist(self.coords, metric=self.dist_metric)) + return self._dists + + def diagonal(self, idx = None): + """Return a diagonal matrix (as per + scipy.spatial.distance.squareform), optionally for a subset of + the points + """ + dist_mat = self.dists + if idx is not None: + dist_mat = dist_mat[idx,:][:,idx] + if isinstance(self.dists, scipy.sparse.spmatrix): + dist_mat = sparse_dok_get(dist_mat.todok(), np.inf) + np.fill_diagonal(dist_mat, 0) # Normally set to inf + return scipy.spatial.distance.squareform(dist_mat) + + def __len__(self): + return len(self.coords) + +class MetricSpacePair(DistanceMethods): + """A MetricSpacePair represents a set of point clouds (MetricSpaces). + It efficiently provides the distances between each point in one + point cloud and each point in the other point cloud (when shorter + than the maximum distance). The two point clouds are required to + have the same distance metric as well as maximum distance. + """ + def __init__(self, ms1, ms2): + assert ms1.dist_metric == ms2.dist_metric, "Both MetricSpaces need to have the same distance metric" + assert ms1.max_dist == ms2.max_dist, "Both MetricSpaces need to have the same max_dist" + self.ms1 = ms1 + self.ms2 = ms2 + self._dists = None + + @property + def dist_metric(self): + return self.ms1.dist_metric + + @property + def max_dist(self): + return self.ms1.max_dist + + @property + def dists(self): + if self._dists is None: + if self.max_dist is not None and self.dist_metric == "euclidean": + self._dists = self.ms1.tree.sparse_distance_matrix(self.ms2.tree, self.max_dist, output_type="coo_matrix").tocsr() + else: + self._dists = scipy.spatial.distance.cdist(self.ms1.coords, self.ms2.coords, metric=self.ms1.dist_metric) + return self._dists diff --git a/skgstat/SpaceTimeVariogram.py b/skgstat/SpaceTimeVariogram.py index 02e546d..fbf64cf 100644 --- a/skgstat/SpaceTimeVariogram.py +++ b/skgstat/SpaceTimeVariogram.py @@ -203,6 +203,7 @@ def set_xdist_func(self, func_name): """ if isinstance(func_name, str): + self._xdist_func_name = func_name self._xdist_func = lambda x: pdist(x, metric=func_name) else: raise ValueError('For now only str arguments are supported.') @@ -241,6 +242,7 @@ def set_tdist_func(self, func_name): """ if isinstance(func_name, str): + self._tdist_func_name = func_name self._tdist_func = lambda t: pdist(t, metric=func_name) else: raise ValueError('For now only str arguments are supported.') @@ -686,7 +688,8 @@ def _set_xmarg_params(self): return # distance - self.XMarginal.dist_function = self.xdist_func + # FIXME: Handle xdist_func_name vs xdist_func better (like in Variogra.py) + self.XMarginal.dist_function = self._xdist_func_name self.XMarginal.n_lags = self.x_lags # binning @@ -710,7 +713,7 @@ def _set_tmarg_params(self): return # distance - self.TMarginal.dist_function = self.tdist_func + self.TMarginal.dist_function = self._tdist_func_name self.TMarginal.n_lags = self.t_lags # binning diff --git a/skgstat/Variogram.py b/skgstat/Variogram.py index 84d5ad1..46d49d0 100644 --- a/skgstat/Variogram.py +++ b/skgstat/Variogram.py @@ -14,7 +14,8 @@ from skgstat import estimators, models, binning from skgstat import plotting from skgstat.util import shannon_entropy - +from .MetricSpace import MetricSpace, MetricSpacePair +import scipy.sparse class Variogram(object): """Variogram Class @@ -211,9 +212,19 @@ def __init__(self, # Before we do anything else, make kwargs available self._kwargs = self._validate_kwargs(**kwargs) - # Set coordinates - self._X = np.asarray(coordinates) + self._1d = False + if not isinstance(coordinates, MetricSpace): + coordinates = np.asarray(coordinates) + if len(coordinates.shape) < 2: + coordinates = np.column_stack((coordinates, np.zeros(len(coordinates)))) + self._1d = True + coordinates = MetricSpace(coordinates.copy(), dist_func, maxlag if maxlag and not isinstance(maxlag, str) and maxlag >= 1 else None) + else: + assert self.dist_func == coordinates.dist_metric, "Distance metric of variogram differs from distance metric of coordinates" + # Set coordinates + self._X = coordinates + # pairwise differences self._diff = None @@ -225,13 +236,6 @@ def __init__(self, # calc_diff = False here, because it will be calculated by fit() later self.set_values(values=values, calc_diff=False) - # distance matrix - self._dist = None - - # set distance calculation function - self._dist_func_name = None - self.set_dist_function(func=dist_func) - # lags and max lag self._n_lags_passed_value = n_lags self._n_lags = None @@ -246,10 +250,6 @@ def __init__(self, self._estimator = None self.set_estimator(estimator_name=estimator) - # model can be a function or a string - self._model = None - self.set_model(model_name=model) - # the binning settings self._bin_func_name = None self._bin_func = None @@ -257,6 +257,13 @@ def __init__(self, self._bins = None self.set_bin_func(bin_func=bin_func) + # Needed for harmonized models + self.preprocessing(force=True) + + # model can be a function or a string + self._model = None + self.set_model(model_name=model) + # specify if the lag should be given absolute or relative to the maxlag self._normalized = normalize @@ -293,16 +300,16 @@ def coordinates(self): coordinates : numpy.array """ - return self._X + return self._X.coords @property def dim(self): """ Input coordinates dimensionality. """ - if len(self._X.shape) == 1: + if self._1d: return 1 - return self._X.shape[1] + return self.coordinates.shape[1] @property def values(self): @@ -378,7 +385,7 @@ class does only accept one dimensional arrays. """ # check dimensions - if not len(values) == len(self._X): + if not len(values) == len(self.coordinates): raise ValueError('The length of the values array has to match' + 'the length of coordinates') @@ -757,21 +764,11 @@ def set_model(self, model_name): if isinstance(model_name, str): # at first reset harmonize self._harmonize = False - if model_name.lower() == 'spherical': - self._model = models.spherical - elif model_name.lower() == 'exponential': - self._model = models.exponential - elif model_name.lower() == 'gaussian': - self._model = models.gaussian - elif model_name.lower() == 'cubic': - self._model = models.cubic - elif model_name.lower() == 'stable': - self._model = models.stable - elif model_name.lower() == 'matern': - self._model = models.matern - elif model_name.lower() == 'harmonize': + if model_name.lower() == 'harmonize': self._harmonize = True self._model = self._build_harmonized_model() + elif hasattr(models, model_name.lower()): + self._model = getattr(models, model_name.lower()) else: raise ValueError( ( @@ -840,13 +837,14 @@ def use_nugget(self, nugget): @property def dist_function(self): - return self._dist_func_name + return self._X.dist_metric - def _dist_func_wrapper(self, x): - if callable(self._dist_func_name): - return self._dist_func_name(x) + @classmethod + def wrapped_distance_function(cls, dist_func, x): + if callable(dist_func): + return dist_func(x) else: - return pdist(X=x, metric=self._dist_func_name) + return pdist(X=x, metric=dist_func) @dist_function.setter def dist_function(self, func): @@ -871,38 +869,40 @@ def set_dist_function(self, func): numpy.array """ - # reset the distances and fitting - self._dist = None + # reset the fitting self.cof, self.cov = None, None if isinstance(func, str): if func.lower() == 'rank': raise NotImplementedError - else: - # if not ranks, it has to be a scipy metric - self._dist_func_name = func - - elif callable(func): - self._dist_func_name = func - else: + elif not callable(func): raise ValueError('Input not supported. Pass a string or callable.') # re-calculate distances - self._calc_distances() + self._X = MetricSpace(self._X.coords, func, self._X.max_dist) @property def distance(self): - if self._dist is None: - self._calc_distances() - return self._dist - - @distance.setter - def distance(self, dist_array): - self._dist = dist_array + if isinstance(self.distance_matrix, scipy.sparse.spmatrix): + return self.triangular_distance_matrix.data + # Turn it back to triangular form not to have duplicates + return scipy.spatial.distance.squareform(self.distance_matrix) + @property + def triangular_distance_matrix(self): + # Like distance_matrix but with zeros below the diagonal... + # Only defined if distance_matrix is a sparse matrix + assert isinstance(self.distance_matrix, scipy.sparse.spmatrix) + m = self.distance_matrix + c = m.tocsc() + c.data = c.indices + rows = c.tocsr() + filt = scipy.sparse.csr.csr_matrix((m.indices < rows.data, m.indices, m.indptr)) + return m.multiply(filt) + @property def distance_matrix(self): - return squareform(self.distance) + return self._X.dists @property def maxlag(self): @@ -916,7 +916,7 @@ def maxlag(self, value): # remove bins self._bins = None self._groups = None - + # set new maxlag if value is None: self._maxlag = None @@ -1136,7 +1136,6 @@ def preprocessing(self, force=False): """ # call the _calc functions - self._calc_distances(force=force) self._calc_diff(force=force) self._calc_groups(force=force) @@ -1383,33 +1382,39 @@ def fitted_model(self): # get the pars cof = self.cof - # get the function - func = self._model - - if self._harmonize: - code = """model = lambda x: func(x)""" + return self.fitted_model_function(self._model, self.cof) + + @classmethod + def fitted_model_function(cls, model, cof=None, **kw): + if cof is None: + # Make sure to keep this synchronized with the output + # of describe()! + cof = [kw["effective_range"], kw["sill"]] + if "smoothness" in kw: + cof.append(kw["smoothness"]) + if "shape" in kw: + cof.append(kw["shape"]) + if kw["nugget"] != 0: + cof.append(kw["nugget"]) + + harmonize = False + if not callable(model): + if model == "harmonize": + raise ValueError("Please supply the actual harmonized model directly") + else: + model = model.lower() + model = getattr(models, model) + + if model.__name__ == "harmonize": + code = """fitted_model = lambda x: model(x)""" else: - code = """model = lambda x: func(x, %s)""" % \ + code = """fitted_model = lambda x: model(x, %s)""" % \ (', '.join([str(_) for _ in cof])) # run the code - loc = dict(func=func) + loc = dict(model=model) exec(code, loc, loc) - model = loc['model'] - - return model - - def _calc_distances(self, force=False): - if self._dist is not None and not force: - return - - # if self._X is of just one dimension, concat zeros. - if self._X.ndim == 1: - _x = np.column_stack((self._X, np.zeros(self._X.size))) - else: - _x = self._X - # else calculate the distances - self._dist = self._dist_func_wrapper(_x) + return loc['fitted_model'] def _calc_diff(self, force=False): """Calculates the pairwise differences @@ -1425,9 +1430,26 @@ def _calc_diff(self, force=False): v = self.values - # Append a column of zeros to make pdist happy - # euclidean: sqrt((a-b)**2 + (0-0)**2) == sqrt((a-b)**2) == abs(a-b) - self._diff = pdist(np.column_stack((v, np.zeros(len(v)))), metric="euclidean") + if isinstance(self.distance_matrix, scipy.sparse.spmatrix): + c = r = self.triangular_distance_matrix + if not isinstance(c, scipy.sparse.csr.csr_matrix): + c = c.tocsr() + if not isinstance(r, scipy.sparse.csc.csc_matrix): + r = r.tocsc() + Vcol = scipy.sparse.csr_matrix( + (self.values[c.indices], c.indices, c.indptr)) + Vrow = scipy.sparse.csc_matrix( + (self.values[r.indices], r.indices, r.indptr)).tocsr() + # self._diff will have same shape as self.distances, even + # when that's not in diagonal format... + # Note: it might be compelling to do np.abs(Vrow - + # Vcol).data instead here, but that might optimize away + # some zeros, leaving _diff in a different shape + self._diff = np.abs(Vrow.data - Vcol.data) + else: + # Append a column of zeros to make pdist happy + # euclidean: sqrt((a-b)**2 + (0-0)**2) == sqrt((a-b)**2) == abs(a-b) + self._diff = pdist(np.column_stack((v, np.zeros(len(v)))), metric="euclidean") def _calc_groups(self, force=False): """Calculate the lag class mask array @@ -1677,7 +1699,7 @@ def residuals(self): # calculate the residuals return np.fromiter( map(lambda x, y: x - y, model, experimental), - np.float + float ) @property @@ -1691,7 +1713,7 @@ def mean_residual(self): ------- float """ - return np.nanmean(np.fromiter(map(np.abs, self.residuals), np.float)) + return np.nanmean(np.fromiter(map(np.abs, self.residuals), float)) @property def rmse(self): @@ -1723,7 +1745,7 @@ def rmse(self): # get the sum of squares rsum = np.nansum(np.fromiter( map(lambda x, y: (x - y)**2, experimental, model), - np.float + float )) return np.sqrt(rsum / len(model)) @@ -1802,10 +1824,10 @@ def r(self): my = np.nanmean(model) # claculate the single pearson correlation terms - term1 = np.nansum(np.fromiter(map(lambda x, y: (x-mx) * (y-my), experimental, model), np.float)) + term1 = np.nansum(np.fromiter(map(lambda x, y: (x-mx) * (y-my), experimental, model), float)) - t2x = np.nansum(np.fromiter(map(lambda x: (x-mx)**2, experimental), np.float)) - t2y = np.nansum(np.fromiter(map(lambda y: (y-my)**2, model), np.float)) + t2x = np.nansum(np.fromiter(map(lambda x: (x-mx)**2, experimental), float)) + t2y = np.nansum(np.fromiter(map(lambda y: (y-my)**2, model), float)) return term1 / (np.sqrt(t2x * t2y)) @@ -1820,8 +1842,8 @@ def NS(self): mx = np.nanmean(experimental) # calculate the single nash-sutcliffe terms - term1 = np.nansum(np.fromiter(map(lambda x, y: (x - y)**2, experimental, model), np.float)) - term2 = np.nansum(np.fromiter(map(lambda x: (x - mx)**2, experimental), np.float)) + term1 = np.nansum(np.fromiter(map(lambda x, y: (x - y)**2, experimental, model), float)) + term2 = np.nansum(np.fromiter(map(lambda x: (x - mx)**2, experimental), float)) return 1 - (term1 / term2) @@ -1889,23 +1911,35 @@ def describe(self, short=False, flat=False): self.fit(force=True) # scale sill and range - if self.normalized: - maxlag = np.nanmax(self.bins) - maxvar = np.nanmax(self.experimental) - else: - maxlag = 1. - maxvar = 1. + maxlag = np.nanmax(self.bins) + maxvar = np.nanmax(self.experimental) # get the fitting coefficents cof = self.cof # build the dict + + def fnname(fn): + if callable(fn): + name = "%s.%s" % (fn.__module__, fn.__name__) + else: + name = fn + if name.startswith("skgstat."): + return name.split(".")[-1] + return name + rdict = dict( - name=self._model.__name__, - estimator=self._estimator.__name__, - effective_range=cof[0] * maxlag, - sill=cof[1] * maxvar, - nugget=cof[-1] * maxvar if self.use_nugget else 0 + model=fnname(self._model) if not self._harmonize else "harmonize", + estimator=fnname(self._estimator), + dist_func=fnname(self.dist_function), + + normalized_effective_range=cof[0] * maxlag, + normalized_sill=cof[1] * maxvar, + normalized_nugget=cof[-1] * maxvar if self.use_nugget else 0, + + effective_range=cof[0], + sill=cof[1], + nugget=cof[-1] if self.use_nugget else 0, ) # handle s parameters for matern and stable model @@ -1920,7 +1954,7 @@ def describe(self, short=False, flat=False): params = dict( estimator=self._estimator.__name__, model=self._model.__name__, - dist_func=str(self._dist_func_name), + dist_func=str(self.dist_function), bin_func=self._bin_func_name, normalize=self.normalized, fit_method=self.fit_method, @@ -2220,7 +2254,7 @@ def __str__(self): # pragma: no cover _range = np.NaN if 'error' in par else par['effective_range'] _nugget = np.NaN if 'error' in par else par['nugget'] - s = "{0} Variogram\n".format(par['name']) + s = "{0} Variogram\n".format(par['model']) s+= "-" * (len(s) - 1) + "\n" s+="""Estimator: %s \rEffective Range: %.2f diff --git a/skgstat/VariogramResult.py b/skgstat/VariogramResult.py new file mode 100644 index 0000000..aa70d6b --- /dev/null +++ b/skgstat/VariogramResult.py @@ -0,0 +1,62 @@ +import numpy as np +import pandas as pd +from .Variogram import Variogram + +class LagClass(object): + def __init__(self, size): + self.size = size + +class VariogramResult(Variogram): + """This class represents a calculated variogram without the original + coordinates and values. This is then a much smaller object and can + easily be stored in a database etc. + + Usage: + + v = skgstat.Variogram(coordinates, values) + vr = VariogramResult(v) # Throw away coordinates and values (and intermediate large arrays) + + ... + + ok = skgstat.OrdinaryKriging(vr.with_values(coordinates, values), **kriging_args) + """ + def __init__(self, variogram): + info = variogram.describe() + Variogram.__init__(self, model=info["name"], estimator=info["estimator"]) + self.cof = variogram.cof.copy() + self._bins = variogram.bins.copy() + self.__experimental = variogram.experimental.copy() + self._lag_classes = [LagClass(cls.size) for cls in variogram.lag_classes()] + @classmethod + def new_from_params(cls, cof, bins=None, experimental=None, lag_classes=None, **kw): + self = object.__new__(cls) + Variogram.__init__(self, **kw) + self.cof = np.array(cof) + if bins is not None: + self._bins = bins + if experimental is not None: + self.__experimental = experimental + if lag_classes is not None: + self._lag_classes = lag_classes + return self + def fit(self, *arg, **kw): + pass + @property + def _experimental(self): + return self.__experimental + def set_values(self, values, **kw): + if values is not None: + Variogram.set_values(self, values, **kw) + def _calc_distances(self, force=False): + pass + def _calc_diff(self, force=False): + pass + def _calc_groups(self, force=False): + pass + def lag_classes(self): + return self._lag_classes + def with_values(self, coordinates, values): + v = VariogramResult(self) + v._X = coordinates + v._values = values + return v diff --git a/skgstat/__init__.py b/skgstat/__init__.py index 892ac3f..b557e49 100644 --- a/skgstat/__init__.py +++ b/skgstat/__init__.py @@ -1,4 +1,5 @@ from .Variogram import Variogram +from .VariogramResult import VariogramResult from .DirectionalVariogram import DirectionalVariogram from .SpaceTimeVariogram import SpaceTimeVariogram from .Kriging import OrdinaryKriging diff --git a/skgstat/plotting/directtional_variogram.py b/skgstat/plotting/directtional_variogram.py index fdd92f3..4e2a6c6 100644 --- a/skgstat/plotting/directtional_variogram.py +++ b/skgstat/plotting/directtional_variogram.py @@ -14,7 +14,7 @@ def __calculate_plot_data(variogram, points): direction_mask = squareform(variogram._direction_mask()) # build a coordinate meshgrid - n = len(variogram._X) + n = len(variogram.coordinates) r = np.arange(n) x1, x2 = np.meshgrid(r, r) @@ -28,8 +28,8 @@ def __calculate_plot_data(variogram, points): # use all points point_mask = np.ones((n, n), dtype=bool) - start = variogram._X[x1[direction_mask & point_mask]] - end = variogram._X[x2[direction_mask & point_mask]] + start = variogram.coordinates[x1[direction_mask & point_mask]] + end = variogram.coordinates[x2[direction_mask & point_mask]] # extract all lines lines = np.column_stack(( @@ -68,11 +68,11 @@ def matplotlib_pair_field( # add coordinates if add_points: - ax.scatter(variogram._X[:, 0], variogram._X[:, 1], 15, c='k') + ax.scatter(variogram.coordinates[:, 0], variogram.coordinates[:, 1], 15, c='k') if isinstance(points, list): ax.scatter( - variogram._X[:, 0][points], - variogram._X[:, 1][points], + variogram.coordinates[:, 0][points], + variogram.coordinates[:, 1][points], 25, c='r' ) @@ -106,8 +106,8 @@ def plotly_pair_field( # add the coordinates as well if add_points: - x = variogram._X[:, 0] - y = variogram._X[:, 1] + x = variogram.coordinates[:, 0] + y = variogram.coordinates[:, 1] fig.add_trace( go.Scatter( x=x, y=y, mode='markers', diff --git a/skgstat/plotting/variogram_location_trend.py b/skgstat/plotting/variogram_location_trend.py index 9549c03..4f1f0c2 100644 --- a/skgstat/plotting/variogram_location_trend.py +++ b/skgstat/plotting/variogram_location_trend.py @@ -63,7 +63,7 @@ def model(x, m, b): def matplotlib_location_trend(variogram, axes=None, show=True, **kwargs): - N = len(variogram._X[0]) + N = len(variogram.coordinates[0]) # create the figure if axes is None: @@ -81,7 +81,7 @@ def matplotlib_location_trend(variogram, axes=None, show=True, **kwargs): # plot for i in range(N): - axes.flatten()[i].plot([_[i] for _ in variogram._X], variogram.values, '.r') + axes.flatten()[i].plot([_[i] for _ in variogram.coordinates], variogram.values, '.r') axes.flatten()[i].set_xlabel('%d-dimension' % (i + 1)) axes.flatten()[i].set_ylabel('value') @@ -96,7 +96,7 @@ def matplotlib_location_trend(variogram, axes=None, show=True, **kwargs): def plotly_location_trend(variogram, fig=None, show=True, **kwargs): - N = len(variogram._X[0]) + N = len(variogram.coordinates[0]) if N <= 3: names = ['X', 'Y', 'Z'][:N] else: @@ -115,7 +115,7 @@ def plotly_location_trend(variogram, fig=None, show=True, **kwargs): # plot for i in range(N): - y = variogram._X[:, i] + y = variogram.coordinates[:, i] fig.add_trace( GoCls(x=x, y=y, mode='markers', name=names[i]) ) diff --git a/skgstat/stmodels.py b/skgstat/stmodels.py index ebad2c5..cae9f88 100644 --- a/skgstat/stmodels.py +++ b/skgstat/stmodels.py @@ -10,7 +10,7 @@ def wrapper(*args, **kwargs): if st.ndim == 2: new_args = args[1:] mapping = map(lambda lags: func(lags, *new_args, **kwargs), st) - return np.fromiter(mapping, dtype=np.float) + return np.fromiter(mapping, dtype=float) else: return func(*args, **kwargs) return wrapper @@ -176,8 +176,8 @@ def product_sum(lags, Vx, Vt, k1, k2, k3, Cx, Ct): De Cesare et. al [15]_, [16]_: .. math:: - \gamma_{ST}(h_s, h_t) = [k_1C_T(0) + k_2]*\gamma_S(h_s) + - [k_1C_s(0) + k_3]\gamma_T(h_t) - k_1\gamma_s(h_s) x \gamma_T(h_t) + \\gamma_{ST}(h_s, h_t) = [k_1C_T(0) + k_2]*\\gamma_S(h_s) + + [k_1C_s(0) + k_3]\\gamma_T(h_t) - k_1\\gamma_s(h_s) x \\gamma_T(h_t) References ---------- diff --git a/skgstat/tests/test_directionalvariogram.py b/skgstat/tests/test_directionalvariogram.py index f4bc54d..b0fb481 100644 --- a/skgstat/tests/test_directionalvariogram.py +++ b/skgstat/tests/test_directionalvariogram.py @@ -16,18 +16,17 @@ def setUp(self): def test_standard_settings(self): DV = DirectionalVariogram(self.c, self.v, normalize=True) - - assert_array_almost_equal( - DV.parameters, [436., 2706., 0], - decimal=0 - ) + + assert_array_almost_equal(DV.describe()["normalized_effective_range"], 436., decimal=0) + assert_array_almost_equal(DV.describe()["normalized_sill"], 2706., decimal=0) + assert_array_almost_equal(DV.describe()["normalized_nugget"], 0., decimal=0) def test_azimuth(self): DV = DirectionalVariogram(self.c, self.v, azimuth=-45, normalize=True) - assert_array_almost_equal( - DV.parameters, [23.438, 219.406, 0], decimal=3 - ) + assert_array_almost_equal(DV.describe()["normalized_effective_range"], 23.438, decimal=3) + assert_array_almost_equal(DV.describe()["normalized_sill"], 219.406, decimal=3) + assert_array_almost_equal(DV.describe()["normalized_nugget"], 0., decimal=3) def test_invalid_azimuth(self): with self.assertRaises(ValueError) as e: @@ -41,10 +40,9 @@ def test_invalid_azimuth(self): def test_tolerance(self): DV = DirectionalVariogram(self.c, self.v, tolerance=15, normalize=True) - assert_array_almost_equal( - DV.parameters, [435.7, 2722.1, 0], - decimal=1 - ) + assert_array_almost_equal(DV.describe()["normalized_effective_range"], 435.7, decimal=1) + assert_array_almost_equal(DV.describe()["normalized_sill"], 2722.1, decimal=1) + assert_array_almost_equal(DV.describe()["normalized_nugget"], 0., decimal=1) def test_invalid_tolerance(self): with self.assertRaises(ValueError) as e: @@ -58,7 +56,8 @@ def test_invalid_tolerance(self): def test_bandwidth(self): DV = DirectionalVariogram(self.c, self.v, bandwidth=12, normalize=True) - for x, y in zip(DV.parameters, [435.733, 2715.865, 0]): + for x, y in zip([DV.describe()[name] for name in ("normalized_effective_range", "normalized_sill", "normalized_nugget")], + [435.733, 2715.865, 0]): self.assertAlmostEqual(x, y, places=3) def test_invalid_model(self): diff --git a/skgstat/tests/test_kriging.py b/skgstat/tests/test_kriging.py index 385ead8..3b039e5 100644 --- a/skgstat/tests/test_kriging.py +++ b/skgstat/tests/test_kriging.py @@ -15,7 +15,7 @@ def setUp(self): def test_coordinates_and_values(self): ok = OrdinaryKriging(self.V) - assert_array_almost_equal(self.c, ok.coords) + assert_array_almost_equal(self.c, ok.coords.coords) def test_coordinates_with_duplicates(self): c = self.c.copy() diff --git a/skgstat/tests/test_variogram.py b/skgstat/tests/test_variogram.py index 92724f3..78e9039 100644 --- a/skgstat/tests/test_variogram.py +++ b/skgstat/tests/test_variogram.py @@ -15,10 +15,50 @@ PLOTLY_FOUND = False from skgstat import Variogram +from skgstat import OrdinaryKriging from skgstat import estimators from skgstat import plotting +class TestSpatiallyCorrelatedData(unittest.TestCase): + def setUp(self): + # Generate some random but spatially correlated data + # with a range of ~20 + + np.random.seed(42) + c = np.random.sample((50, 2)) * 60 + np.random.seed(42) + v = np.random.normal(10, 4, 50) + + V = Variogram(c, v).describe() + V["effective_range"] = 20 + OK = OrdinaryKriging(V, coordinates=c, values=v) + + self.c = np.random.sample((500, 2)) * 60 + self.v = OK.transform(self.c) + + self.c = self.c[~np.isnan(self.v),:] + self.v = self.v[~np.isnan(self.v)] + + def test_dense_maxlag_inf(self): + Vdense = Variogram(self.c, self.v) + Vsparse = Variogram(self.c, self.v, maxlag=10000000) + + for x, y in zip(Vdense.parameters, Vsparse.parameters): + self.assertAlmostEqual(x, y, places=3) + + def test_sparse_maxlag_50(self): + V = Variogram(self.c, self.v, maxlag=50) + + for x, y in zip(V.parameters, [20.264, 6.478, 0]): + self.assertAlmostEqual(x, y, places=3) + + def test_sparse_maxlag_30(self): + V = Variogram(self.c, self.v, maxlag=30) + + for x, y in zip(V.parameters, [17.128, 6.068, 0]): + self.assertAlmostEqual(x, y, places=3) + class TestVariogramInstatiation(unittest.TestCase): def setUp(self): # set up default values, whenever c and v are not important @@ -32,6 +72,12 @@ def test_standard_settings(self): for x, y in zip(V.parameters, [7.122, 13.966, 0]): self.assertAlmostEqual(x, y, places=3) + + def test_sparse_standard_settings(self): + V = Variogram(self.c, self.v, maxlag=10000) + + for x, y in zip(V.parameters, [7.122, 13.966, 0]): + self.assertAlmostEqual(x, y, places=3) def test_input_dimensionality(self): c1d = np.random.normal(0, 1, 100) @@ -302,18 +348,22 @@ def test_wrong_dist_func_input(self): def test_callable_dist_function(self): V = Variogram([(0, 0), (4, 1), (1, 1)], [1, 2, 3], n_lags=2) - def dfunc(x): - return np.ones((len(x), len(x))) + def dfunc(u, v): + return 1 V.set_dist_function(dfunc) # test self.assertEqual(V.dist_function, dfunc) self.assertTrue((V.distance==1).all()) - self.assertEqual(V.distance.shape, (3, 3)) + self.assertEqual(V.distance_matrix.shape, (3, 3)) @staticmethod - def test_direct_dist_setting(): + def disabled_test_direct_dist_setting(): + # Distance can no longer be explicitly set + # it would require setting the whole MetricSpace, with a + # non-sparse diagonal matrix + V = Variogram([(0, 0), (4, 1), (1, 1)], [1, 2, 3], n_lags=2) V.distance = np.array([0, 0, 100]) @@ -345,7 +395,7 @@ def test_use_nugget_setting(self): V.use_nugget = True self.assertEqual(V.use_nugget, True) self.assertEqual(V._use_nugget, True) - self.assertAlmostEqual(V.describe()['nugget'], 291.28, places=2) + self.assertAlmostEqual(V.describe()['normalized_nugget'], 291.28, places=2) def test_use_nugget_exception(self): with self.assertRaises(ValueError) as e: @@ -871,8 +921,12 @@ def test_data_no_force(self): [0., 11.82, 13.97, 13.97, 13.97, 13.97, 13.97, 13.97, 13.97, 13.97], decimal=2 ) - - def test_data_with_force(self): + + def disabled_test_data_with_force(self): + # Distance can no longer be explicitly set + # it would require setting the whole MetricSpace, with a + # non-sparse diagonal matrix + # should work if _dist is corccupted self.V._dist = self.V._dist * 5. self.V.cof = None