From 5bc6b74d66fabbaa61356f85cac70a6fc32f69f0 Mon Sep 17 00:00:00 2001 From: rhugonne Date: Wed, 14 Jul 2021 16:50:09 +0200 Subject: [PATCH 01/17] add RasterMetricSpace --- skgstat/MetricSpace.py | 227 +++++++++++++++++++++++++++++++++++++++++ skgstat/__init__.py | 2 +- 2 files changed, 228 insertions(+), 1 deletion(-) diff --git a/skgstat/MetricSpace.py b/skgstat/MetricSpace.py index 2370a18..677d188 100644 --- a/skgstat/MetricSpace.py +++ b/skgstat/MetricSpace.py @@ -1,3 +1,5 @@ +from typing import Union, Optional, Sequence + from scipy.spatial.distance import pdist, cdist, squareform from scipy.spatial import cKDTree from scipy import sparse @@ -377,3 +379,228 @@ def dists(self): dists = dists.tocsr() self._dists = dists return self._dists + +class RasterEquidistantMetricSpace(MetricSpace): + """Like ProbabilisticMetricSpace but only applies to Raster data (2D gridded data) and + samples iteratively an `equidistant` subset within distances to a 'center' subset. + Subsets can either be a fraction of the total number of pairs (float < 1), or an integer count. + """ + + def __init__( + self, + coords, + shape, + extent, + runs=None, + dist_metric="euclidean", + max_dist=None, + samples=0.5, + rnd=None + ): + """RasterEquidistantMetricSpace class + + Parameters + ---------- + coords : numpy.ndarray + Coordinate array of shape (Npoints, 2) + shape: tuple[int, int] + Shape of raster (X, Y) + extent: tuple[float, float, float, float] + Extent of raster (Xmin, Xmax, Ymin, Ymax) + runs: int + Number of subsamples to concatenate + dist_metric : str + Distance metric names as used by scipy.spatial.distance.pdist + max_dist : float + Maximum distance between points after which the distance + is considered infinite and not calculated. + samples : float, int + Number of samples (int) or fraction of coords to sample (float < 1). + rnd : numpy.random.RandomState, int + Random state to use for the sampling. + """ + self.coords = coords.copy() + self.dist_metric = dist_metric + self.shape = shape + self.extent = extent + self.res = np.sqrt(((extent[1] - extent[0])/shape[0])**2 + ((extent[3] - extent[2])/shape[1])**2) + + # TODO: if the number of runs is not specified, divide the grid in N center samples + if runs is None: + runs = 10 + + self.runs = runs + + # if the maximum distance is not specified, find the maximum possible distance from the grid dimensions + if max_dist is None: + max_dist = np.max(self.shape) * self.res + self.max_dist = max_dist + + self.samples = samples + if rnd is None: + self.rnd = np.random + elif isinstance(rnd, np.random.RandomState): + self.rnd = rnd + else: + self.rnd = np.random.RandomState(np.random.MT19937(np.random.SeedSequence( ))) + + # Index and KDTree of center sample + self._cidx = None + self._ctree = None + + # Index and KDTree of equidistant sample + self._eqidx = None + self._eqtree = None + + self._center = None + self._center_radius = None + self._dists = None + # Do a very quick check to see throw exceptions + # if self.dist_metric is invalid... + pdist(self.coords[:1, :], metric=self.dist_metric) + + @property + def sample_count(self): + if isinstance(self.samples, int): + return self.samples + return int(self.samples * len(self.coords)) + + @property + def cidx(self): + """The sampled indices into `self.coords` for the center sample.""" + + if self._cidx is None: + + # First index: preselect samples in a disk of radius large enough to contain twice the sample_count samples + dist_center = np.sqrt((self.coords[:, 0] - self._center[0]) ** 2 + ( + self.coords[:, 1] - self._center[1]) ** 2) + idx1 = dist_center < self._center_radius + coords_center = self.coords[idx1, :] + + indices1 = np.argwhere(idx1) + + # Second index: randomly select half of the valid pixels, so that the other half can be used by the equidist + # sample for low distances + indices2 = self.rnd.choice(len(coords_center), size=int(len(coords_center) / 2), replace=False) + + self._cidx = indices1[indices2].squeeze() + + return self._cidx + + @property + def ctree(self): + """If `self.dist_metric` is `euclidean`, a `scipy.spatial.cKDTree` + instance of the center sample of `self.coords`. Undefined otherwise.""" + + # only Euclidean supported + if self.dist_metric != "euclidean": + raise ValueError(( + "A coordinate tree can only be constructed " + "for an euclidean space" + )) + + if self._ctree is None: + self._ctree = cKDTree(self.coords[self.cidx, :]) + return self._ctree + + + @property + def eqidx(self): + """The sampled indices into `self.coords` for the equidistant sample.""" + + # Hardcode exponential bins for now, see about providing more options later + list_inout_radius = [0.] + rad = self._center_radius + increasing_rad = rad + while increasing_rad < self.max_dist: + list_inout_radius.append(increasing_rad) + increasing_rad *= 1.5 + list_inout_radius.append(self.max_dist) + + dist_center = np.sqrt((self.coords[:, 0] - self._center[0]) ** 2 + ( + self.coords[:, 1] - self._center[1]) ** 2) + + if self._eqidx is None: + + # Loop over an iterative sampling in rings + list_idx = [] + for i in range(len(list_inout_radius)-1): + # First index: preselect samples in a ring of inside radius and outside radius + idx1 = np.logical_and(dist_center < list_inout_radius[i+1], dist_center >= list_inout_radius[i]) + coords_equi = self.coords[idx1, :] + + indices1 = np.argwhere(idx1) + + # Second index: randomly select half of the valid pixels, so that the other half can be used by the equidist + # sample for low distances + indices2 = ~self.rnd.choice(len(coords_equi), size=min(len(coords_equi),self.sample_count), replace=False) + list_idx.append(indices1[indices2].squeeze()) + + self._eqidx = np.concatenate(list_idx) + + return self._eqidx + + @property + def eqtree(self): + """If `self.dist_metric` is `euclidean`, a `scipy.spatial.cKDTree` + instance of the equidistant sample of `self.coords`. Undefined otherwise.""" + + # only Euclidean supported + if self.dist_metric != "euclidean": + raise ValueError(( + "A coordinate tree can only be constructed " + "for an euclidean space" + )) + + if self._eqtree is None: + self._eqtree = cKDTree(self.coords[self.eqidx, :]) + return self._eqtree + + @property + def dists(self): + """A distance matrix of the sampled point pairs as a + `scipy.sparse.csr_matrix` sparse matrix. """ + + if self._dists is None: + + list_dists, list_cidx, list_eqidx = ([] for i in range(3)) + + idx_center = self.rnd.choice(len(self.coords), size=(2, self.runs), replace=False) + + for i in range(self.runs): + + # Each run has a different center + self._center = (self.coords[idx_center[0, i], 0], self.coords[idx_center[1, i], 1]) + # Radius of center based on sample count + self._center_radius = np.sqrt(self.sample_count / np.pi) * self.res + + dists = self.ctree.sparse_distance_matrix( + self.eqtree, + self.max_dist, + output_type="coo_matrix" + ) + + list_dists.append(dists.data) + list_cidx.append(self.cidx[dists.row]) + list_eqidx.append(self.eqidx[dists.col]) + + self._cidx = None + self._ctree = None + self._eqidx = None + self._eqtree = None + + # concatenate the coo matrixes + d = np.concatenate(list_dists) + c = np.concatenate(list_cidx) + eq = np.concatenate(list_eqidx) + + # remove possible duplicates (that would be summed by default) + # from https://stackoverflow.com/questions/28677162/ignoring-duplicate-entries-in-sparse-matrix + c, eq, d = zip(*set(zip(c, eq, d))) + + # put everything into a csr matrix + dists = sparse.csr_matrix((d, (c, eq)), shape=(len(self.coords), len(self.coords))) + + self._dists = dists + + return self._dists \ No newline at end of file diff --git a/skgstat/__init__.py b/skgstat/__init__.py index d4b9fae..1a2a1a9 100644 --- a/skgstat/__init__.py +++ b/skgstat/__init__.py @@ -2,7 +2,7 @@ from .DirectionalVariogram import DirectionalVariogram from .SpaceTimeVariogram import SpaceTimeVariogram from .Kriging import OrdinaryKriging -from .MetricSpace import MetricSpace, MetricSpacePair +from .MetricSpace import MetricSpace, MetricSpacePair, ProbabalisticMetricSpace, RasterEquidistantMetricSpace from . import interfaces from . import data from . import util From fee0f3f198dd56ecf124cdcad381078d3ce7ba19 Mon Sep 17 00:00:00 2001 From: rhugonne Date: Wed, 14 Jul 2021 17:19:46 +0200 Subject: [PATCH 02/17] remove unused imports --- skgstat/MetricSpace.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/skgstat/MetricSpace.py b/skgstat/MetricSpace.py index 677d188..bbe92e5 100644 --- a/skgstat/MetricSpace.py +++ b/skgstat/MetricSpace.py @@ -1,5 +1,3 @@ -from typing import Union, Optional, Sequence - from scipy.spatial.distance import pdist, cdist, squareform from scipy.spatial import cKDTree from scipy import sparse From 4ae5838cf35de64d083151f8900e2d6286b59471 Mon Sep 17 00:00:00 2001 From: rhugonne Date: Thu, 15 Jul 2021 15:17:54 +0200 Subject: [PATCH 03/17] speed up duplicate removal, fix a couple issues --- skgstat/MetricSpace.py | 66 +++++++++++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 24 deletions(-) diff --git a/skgstat/MetricSpace.py b/skgstat/MetricSpace.py index bbe92e5..8da26f7 100644 --- a/skgstat/MetricSpace.py +++ b/skgstat/MetricSpace.py @@ -2,7 +2,7 @@ from scipy.spatial import cKDTree from scipy import sparse import numpy as np - +import multiprocessing as mp def _sparse_dok_get(m, fill_value=np.NaN): """Like m.toarray(), but setting empty values to `fill_value`, by @@ -393,6 +393,7 @@ def __init__( dist_metric="euclidean", max_dist=None, samples=0.5, + ncores=1, rnd=None ): """RasterEquidistantMetricSpace class @@ -423,11 +424,6 @@ def __init__( self.extent = extent self.res = np.sqrt(((extent[1] - extent[0])/shape[0])**2 + ((extent[3] - extent[2])/shape[1])**2) - # TODO: if the number of runs is not specified, divide the grid in N center samples - if runs is None: - runs = 10 - - self.runs = runs # if the maximum distance is not specified, find the maximum possible distance from the grid dimensions if max_dist is None: @@ -435,6 +431,14 @@ def __init__( self.max_dist = max_dist self.samples = samples + + if runs is None: + # By default + runs = (self.shape[0] * self.shape[1]) / (100 * self.samples) + + self.runs = runs + + if rnd is None: self.rnd = np.random elif isinstance(rnd, np.random.RandomState): @@ -473,13 +477,13 @@ def cidx(self): dist_center = np.sqrt((self.coords[:, 0] - self._center[0]) ** 2 + ( self.coords[:, 1] - self._center[1]) ** 2) idx1 = dist_center < self._center_radius - coords_center = self.coords[idx1, :] + count = np.count_nonzero(idx1) indices1 = np.argwhere(idx1) # Second index: randomly select half of the valid pixels, so that the other half can be used by the equidist # sample for low distances - indices2 = self.rnd.choice(len(coords_center), size=int(len(coords_center) / 2), replace=False) + indices2 = self.rnd.choice(count, size=count, replace=False) self._cidx = indices1[indices2].squeeze() @@ -507,32 +511,38 @@ def eqidx(self): """The sampled indices into `self.coords` for the equidistant sample.""" # Hardcode exponential bins for now, see about providing more options later - list_inout_radius = [0.] + list_in_radius = [0.] rad = self._center_radius increasing_rad = rad + list_out_radius = [rad] while increasing_rad < self.max_dist: - list_inout_radius.append(increasing_rad) + list_in_radius.append(increasing_rad) increasing_rad *= 1.5 - list_inout_radius.append(self.max_dist) + list_out_radius.append(increasing_rad) dist_center = np.sqrt((self.coords[:, 0] - self._center[0]) ** 2 + ( self.coords[:, 1] - self._center[1]) ** 2) + idx = np.logical_and(dist_center[None, :] >= np.array(list_in_radius)[:, None], + dist_center[None, :] < np.array(list_out_radius)[:, None]) + if self._eqidx is None: # Loop over an iterative sampling in rings list_idx = [] - for i in range(len(list_inout_radius)-1): + for i in range(len(list_in_radius)): # First index: preselect samples in a ring of inside radius and outside radius - idx1 = np.logical_and(dist_center < list_inout_radius[i+1], dist_center >= list_inout_radius[i]) - coords_equi = self.coords[idx1, :] + idx1 = idx[i, :] + count = np.count_nonzero(idx1) indices1 = np.argwhere(idx1) # Second index: randomly select half of the valid pixels, so that the other half can be used by the equidist # sample for low distances - indices2 = ~self.rnd.choice(len(coords_equi), size=min(len(coords_equi),self.sample_count), replace=False) - list_idx.append(indices1[indices2].squeeze()) + indices2 = self.rnd.choice(count, size=min(count,self.sample_count), replace=False) + sub_idx = indices1[indices2] + if len(sub_idx)>1: + list_idx.append(sub_idx.squeeze()) self._eqidx = np.concatenate(list_idx) @@ -563,14 +573,16 @@ def dists(self): list_dists, list_cidx, list_eqidx = ([] for i in range(3)) - idx_center = self.rnd.choice(len(self.coords), size=(2, self.runs), replace=False) + idx_center = self.rnd.choice(len(self.coords), size=self.runs, replace=False) + + # Each run has a different center + centers = self.coords[idx_center] + # Radius of center based on sample count + self._center_radius = np.sqrt(self.sample_count / np.pi) * self.res for i in range(self.runs): - # Each run has a different center - self._center = (self.coords[idx_center[0, i], 0], self.coords[idx_center[1, i], 1]) - # Radius of center based on sample count - self._center_radius = np.sqrt(self.sample_count / np.pi) * self.res + self._center = centers[i] dists = self.ctree.sparse_distance_matrix( self.eqtree, @@ -594,10 +606,16 @@ def dists(self): # remove possible duplicates (that would be summed by default) # from https://stackoverflow.com/questions/28677162/ignoring-duplicate-entries-in-sparse-matrix - c, eq, d = zip(*set(zip(c, eq, d))) - # put everything into a csr matrix - dists = sparse.csr_matrix((d, (c, eq)), shape=(len(self.coords), len(self.coords))) + # Stable solution but a bit slow + # c, eq, d = zip(*set(zip(c, eq, d))) + # dists = sparse.csr_matrix((d, (c, eq)), shape=(len(self.coords), len(self.coords))) + + # Solution 5+ times faster than the preceding, but relies on _update() which might change in scipy (which + # only has an implemented method for summing duplicates, and not ignoring them yet) + dok = sparse.dok_matrix((len(self.coords), len(self.coords))) + dok._update(zip(zip(c, eq), d)) + dists = dok.tocsr() self._dists = dists From aa7fc559fc9258d2f43aa00126e586d536e9c59f Mon Sep 17 00:00:00 2001 From: rhugonne Date: Thu, 15 Jul 2021 16:04:40 +0200 Subject: [PATCH 04/17] remove unusued mp draft --- skgstat/MetricSpace.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/skgstat/MetricSpace.py b/skgstat/MetricSpace.py index 8da26f7..8de16e1 100644 --- a/skgstat/MetricSpace.py +++ b/skgstat/MetricSpace.py @@ -2,7 +2,6 @@ from scipy.spatial import cKDTree from scipy import sparse import numpy as np -import multiprocessing as mp def _sparse_dok_get(m, fill_value=np.NaN): """Like m.toarray(), but setting empty values to `fill_value`, by @@ -393,7 +392,6 @@ def __init__( dist_metric="euclidean", max_dist=None, samples=0.5, - ncores=1, rnd=None ): """RasterEquidistantMetricSpace class @@ -473,7 +471,7 @@ def cidx(self): if self._cidx is None: - # First index: preselect samples in a disk of radius large enough to contain twice the sample_count samples + # First index: preselect samples in a disk of radius large enough to contain the sample_count samples dist_center = np.sqrt((self.coords[:, 0] - self._center[0]) ** 2 + ( self.coords[:, 1] - self._center[1]) ** 2) idx1 = dist_center < self._center_radius From 7dde22a1ea184ec274a4c47429003b0ddb122192 Mon Sep 17 00:00:00 2001 From: rhugonne Date: Thu, 15 Jul 2021 17:04:07 +0200 Subject: [PATCH 05/17] add test and fix random state --- skgstat/MetricSpace.py | 2 +- skgstat/tests/test_metric_space.py | 40 ++++++++++++++++++++++++++++-- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/skgstat/MetricSpace.py b/skgstat/MetricSpace.py index 8de16e1..3004330 100644 --- a/skgstat/MetricSpace.py +++ b/skgstat/MetricSpace.py @@ -442,7 +442,7 @@ def __init__( elif isinstance(rnd, np.random.RandomState): self.rnd = rnd else: - self.rnd = np.random.RandomState(np.random.MT19937(np.random.SeedSequence( ))) + self.rnd = np.random.RandomState(np.random.MT19937(np.random.SeedSequence(rnd))) # Index and KDTree of center sample self._cidx = None diff --git a/skgstat/tests/test_metric_space.py b/skgstat/tests/test_metric_space.py index 5980b58..836a996 100644 --- a/skgstat/tests/test_metric_space.py +++ b/skgstat/tests/test_metric_space.py @@ -1,6 +1,7 @@ import pytest import numpy as np import skgstat as skg +import scipy # produce a random dataset np.random.seed(42) @@ -8,7 +9,6 @@ np.random.seed(42) rvals = np.random.normal(10, 4, 500) - def test_invalid_dist_func(): # instantiate metrix space ms = skg.MetricSpace(rcoords, dist_metric='euclidean') @@ -39,7 +39,7 @@ def test_dense_matrix_warning(): assert 'Only available' in w.value -def test_unkonwn_metric(): +def test_unknown_metric(): with pytest.raises(ValueError) as e: skg.MetricSpace(rcoords, dist_metric='foobar') @@ -76,3 +76,39 @@ def test_metric_pair_max_dist(): skg.MetricSpacePair(ms1, ms2) assert 'same max_dist' in e.value + +def test_raster_metric(): + + # Generate a gridded dataset + shape = (100, 100) + np.random.seed(42) + vals = np.random.normal(0, 1, size=shape) + + # Coordinates + x = np.arange(0, shape[0]) + y = np.arange(0, shape[1]) + xx, yy = np.meshgrid(x, y) + + # Flatten everything because we don't care about the 2D at this point + coords = np.dstack((xx.flatten(), yy.flatten())).squeeze() + vals = vals.flatten() + + # Run the computation + rems = skg.RasterEquidistantMetricSpace(coords, shape=shape, extent=(x[0],x[-1],y[0],y[-1]), samples=10, runs=10, + rnd=42) + + # Minimal check of the output + assert rems.max_dist == pytest.approx(140,rel=0.01) + assert rems.res == pytest.approx(1.4, rel=0.0001) + assert isinstance(rems.dists, scipy.sparse.csr.csr_matrix) + assert rems.dists.shape == (10000, 10000) + + # Check the random state provides the same final center + assert all(rems._center == np.array([62, 52])) + + # Check the interface with a Variogram object works + V = skg.Variogram(rems, vals) + + assert V.bin_count is not None + # Check the variogram is always the same with the random state given + assert V.experimental[0] == pytest.approx(1.06,0.01) From 20420fb36909ead8eb04411ba6d33abd9e0c9656 Mon Sep 17 00:00:00 2001 From: rhugonne Date: Thu, 15 Jul 2021 18:54:42 +0200 Subject: [PATCH 06/17] furnish doc, add optional subsampling input parameters --- skgstat/MetricSpace.py | 108 ++++++++++++++++++++--------- skgstat/tests/test_metric_space.py | 2 +- 2 files changed, 77 insertions(+), 33 deletions(-) diff --git a/skgstat/MetricSpace.py b/skgstat/MetricSpace.py index 3004330..a968a5c 100644 --- a/skgstat/MetricSpace.py +++ b/skgstat/MetricSpace.py @@ -381,6 +381,31 @@ class RasterEquidistantMetricSpace(MetricSpace): """Like ProbabilisticMetricSpace but only applies to Raster data (2D gridded data) and samples iteratively an `equidistant` subset within distances to a 'center' subset. Subsets can either be a fraction of the total number of pairs (float < 1), or an integer count. + The 'center' subset corresponds to a disk centered on a point of the grid for which the location + randomly varies and can be redrawn and aggregate for several runs. The corresponding 'equidistant' + subset consists of the concatenation samples drawn from rings with radius increasing gradually + until the maximum extent of the grid is reached. + + To define the subsampling, several parameters are available: + - The raw number of samples correspond to the samples that will be drawn in each central disk. + Along with the ratio of samples drawn (see below), it will automatically defines the radius + of the disk and rings for subsampling. + Note that this amount of samples drawn will be repeatedly drawn for each equidistant rings + at a given radius, resulting in a several-fold amount of samples for the equidistant subset. + - The ratio of subsample drawn defines the density of point sampled within each subset. + It defines the radius of the central disk and equidistant rings with the raw sample size. + It default to 20%. + - The number of runs correspond to the number of random center points repeated during the + subsampling. It default to a sampling of 1% of the grid with center subsets. + + Alternatively, one can supply: + - The multiplicative factor to derive increasing rings radii, set as squareroot of 2 by + default in order to conserve a similar area for each ring and verify the sampling ratio. + Or directly: + - The radius of the central disk subset. + - A list of radii for the equidistant ring subsets. + When providing those spatial parameters, all other sampling parameters will be ignored + except for the raw number of samples to draw in each subset. """ def __init__( @@ -388,10 +413,14 @@ def __init__( coords, shape, extent, + samples=100, + ratio_subsamp=0.2, runs=None, - dist_metric="euclidean", + exp_increase_fac=np.sqrt(2), + center_radius=None, + equidistant_radii=None, max_dist=None, - samples=0.5, + dist_metric="euclidean", rnd=None ): """RasterEquidistantMetricSpace class @@ -400,43 +429,50 @@ def __init__( ---------- coords : numpy.ndarray Coordinate array of shape (Npoints, 2) - shape: tuple[int, int] + shape : tuple[int, int] Shape of raster (X, Y) - extent: tuple[float, float, float, float] + extent : tuple[float, float, float, float] Extent of raster (Xmin, Xmax, Ymin, Ymax) - runs: int - Number of subsamples to concatenate + samples : float, int + Number of samples (int) or fraction of coords to sample (float < 1). + ratio_subsamp: + Ratio of samples drawn within each subsample. + runs : int + Number of subsamplings based on a random center point + exp_increase_fac : float + Multiplicative factor of increasing radius for ring subsets + center_radius: float + Radius of center subset, overrides other sampling parameters. + equidistant_radii: list + List of radii of ring subset, overrides other sampling parameters. dist_metric : str Distance metric names as used by scipy.spatial.distance.pdist max_dist : float Maximum distance between points after which the distance is considered infinite and not calculated. - samples : float, int - Number of samples (int) or fraction of coords to sample (float < 1). + rnd : numpy.random.RandomState, int Random state to use for the sampling. """ + self.coords = coords.copy() self.dist_metric = dist_metric self.shape = shape self.extent = extent - self.res = np.sqrt(((extent[1] - extent[0])/shape[0])**2 + ((extent[3] - extent[2])/shape[1])**2) + self.res = np.mean([(extent[1] - extent[0])/(shape[0]-1),(extent[3] - extent[2])/(shape[1]-1)]) - - # if the maximum distance is not specified, find the maximum possible distance from the grid dimensions + # if the maximum distance is not specified, find the maximum possible distance from the extent if max_dist is None: - max_dist = np.max(self.shape) * self.res + max_dist = np.sqrt((extent[1] - extent[0])**2 + (extent[3] - extent[2])**2) self.max_dist = max_dist self.samples = samples if runs is None: - # By default - runs = (self.shape[0] * self.shape[1]) / (100 * self.samples) - + # If None is provided, try to sample center samples for about one percent of the area + runs = int((self.shape[0] * self.shape[1]) / self.samples * 1/100.) self.runs = runs - if rnd is None: self.rnd = np.random elif isinstance(rnd, np.random.RandomState): @@ -444,6 +480,27 @@ def __init__( else: self.rnd = np.random.RandomState(np.random.MT19937(np.random.SeedSequence(rnd))) + # Radius of center subsample, based on sample count + # If None is provided, the disk is defined with the exact size to hold the number of percentage of samples + # defined by the user + if center_radius is None: + center_radius = np.sqrt(1. / ratio_subsamp * self.sample_count / np.pi) * self.res + self._center_radius = center_radius + + # Radii of equidistant ring subsamples + # If None is provided, the rings are defined with exponentially increasing radii with a factor sqrt(2), which + # means each ring will have just enough area to sample at least the number of samples desired, and same + # for each of the following, due to: + # (sqrt(2)R)**2 - R**2 = R**2 + if equidistant_radii is None: + equidistant_radii = [0.] + increasing_rad = self._center_radius + while increasing_rad < self.max_dist: + equidistant_radii.append(increasing_rad) + increasing_rad *= exp_increase_fac + equidistant_radii.append(self.max_dist) + self._equidistant_radii = equidistant_radii + # Index and KDTree of center sample self._cidx = None self._ctree = None @@ -453,7 +510,6 @@ def __init__( self._eqtree = None self._center = None - self._center_radius = None self._dists = None # Do a very quick check to see throw exceptions # if self.dist_metric is invalid... @@ -508,27 +564,17 @@ def ctree(self): def eqidx(self): """The sampled indices into `self.coords` for the equidistant sample.""" - # Hardcode exponential bins for now, see about providing more options later - list_in_radius = [0.] - rad = self._center_radius - increasing_rad = rad - list_out_radius = [rad] - while increasing_rad < self.max_dist: - list_in_radius.append(increasing_rad) - increasing_rad *= 1.5 - list_out_radius.append(increasing_rad) - dist_center = np.sqrt((self.coords[:, 0] - self._center[0]) ** 2 + ( self.coords[:, 1] - self._center[1]) ** 2) - idx = np.logical_and(dist_center[None, :] >= np.array(list_in_radius)[:, None], - dist_center[None, :] < np.array(list_out_radius)[:, None]) + idx = np.logical_and(dist_center[None, :] >= np.array(self._equidistant_radii[:-1])[:, None], + dist_center[None, :] < np.array(self._equidistant_radii[1:])[:, None]) if self._eqidx is None: # Loop over an iterative sampling in rings list_idx = [] - for i in range(len(list_in_radius)): + for i in range(len(self._equidistant_radii) - 1): # First index: preselect samples in a ring of inside radius and outside radius idx1 = idx[i, :] @@ -575,8 +621,6 @@ def dists(self): # Each run has a different center centers = self.coords[idx_center] - # Radius of center based on sample count - self._center_radius = np.sqrt(self.sample_count / np.pi) * self.res for i in range(self.runs): diff --git a/skgstat/tests/test_metric_space.py b/skgstat/tests/test_metric_space.py index 836a996..a984694 100644 --- a/skgstat/tests/test_metric_space.py +++ b/skgstat/tests/test_metric_space.py @@ -99,7 +99,7 @@ def test_raster_metric(): # Minimal check of the output assert rems.max_dist == pytest.approx(140,rel=0.01) - assert rems.res == pytest.approx(1.4, rel=0.0001) + assert rems.res == pytest.approx(1, rel=0.0001) assert isinstance(rems.dists, scipy.sparse.csr.csr_matrix) assert rems.dists.shape == (10000, 10000) From dd89a1b911a1379339cc8f0efccf950d816121db Mon Sep 17 00:00:00 2001 From: rhugonne Date: Fri, 16 Jul 2021 10:42:49 +0200 Subject: [PATCH 07/17] correct typos --- skgstat/MetricSpace.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/skgstat/MetricSpace.py b/skgstat/MetricSpace.py index a968a5c..274cb4f 100644 --- a/skgstat/MetricSpace.py +++ b/skgstat/MetricSpace.py @@ -382,21 +382,21 @@ class RasterEquidistantMetricSpace(MetricSpace): samples iteratively an `equidistant` subset within distances to a 'center' subset. Subsets can either be a fraction of the total number of pairs (float < 1), or an integer count. The 'center' subset corresponds to a disk centered on a point of the grid for which the location - randomly varies and can be redrawn and aggregate for several runs. The corresponding 'equidistant' - subset consists of the concatenation samples drawn from rings with radius increasing gradually + randomly varies and can be redrawn and aggregated for several runs. The corresponding 'equidistant' + subset consists of a concatenation of subsets drawn from rings with radius gradually increasing until the maximum extent of the grid is reached. To define the subsampling, several parameters are available: - - The raw number of samples correspond to the samples that will be drawn in each central disk. - Along with the ratio of samples drawn (see below), it will automatically defines the radius + - The raw number of samples corresponds to the samples that will be drawn in each central disk. + Along with the ratio of samples drawn (see below), it will automatically define the radius of the disk and rings for subsampling. - Note that this amount of samples drawn will be repeatedly drawn for each equidistant rings - at a given radius, resulting in a several-fold amount of samples for the equidistant subset. - - The ratio of subsample drawn defines the density of point sampled within each subset. - It defines the radius of the central disk and equidistant rings with the raw sample size. - It default to 20%. - - The number of runs correspond to the number of random center points repeated during the - subsampling. It default to a sampling of 1% of the grid with center subsets. + Note that the number of samples drawn will be repeatedly drawn for each equidistant rings + at a given radius, resulting in a several-fold amount of total samples for the equidistant + subset. + - The ratio of subsample defines the density of point sampled within each subset. It + defaults to 20%. + - The number of runs corresponds to the number of random center points repeated during the + subsampling. It defaults to a sampling of 1% of the grid with center subsets. Alternatively, one can supply: - The multiplicative factor to derive increasing rings radii, set as squareroot of 2 by From 48d0f50d43c034fb938e9da73c46bdbb11b37f5b Mon Sep 17 00:00:00 2001 From: rhugonne Date: Mon, 19 Jul 2021 19:35:35 +0200 Subject: [PATCH 08/17] add multiprocessing --- skgstat/MetricSpace.py | 211 +++++++++++++++++++---------- skgstat/tests/test_metric_space.py | 13 +- 2 files changed, 153 insertions(+), 71 deletions(-) diff --git a/skgstat/MetricSpace.py b/skgstat/MetricSpace.py index 274cb4f..8fb221f 100644 --- a/skgstat/MetricSpace.py +++ b/skgstat/MetricSpace.py @@ -2,6 +2,8 @@ from scipy.spatial import cKDTree from scipy import sparse import numpy as np +import multiprocessing as mp +import time as time def _sparse_dok_get(m, fill_value=np.NaN): """Like m.toarray(), but setting empty values to `fill_value`, by @@ -377,6 +379,80 @@ def dists(self): self._dists = dists return self._dists +# Subfunctions of RasterEquidistantMetricSpace + +def get_disk_sample(coords, center, center_radius, rnd_func, sample_count): + + # First index: preselect samples in a disk of certain radius + dist_center = np.sqrt((coords[:, 0] - center[0]) ** 2 + ( + coords[:, 1] - center[1]) ** 2) + idx1 = dist_center < center_radius + + count = np.count_nonzero(idx1) + indices1 = np.argwhere(idx1) + + # Second index: randomly select half of the valid pixels, so that the other half can be used by the equidist + # sample for low distances + indices2 = rnd_func.choice(count, size=sample_count, replace=False) + + return indices1[indices2].squeeze() + +def get_successive_ring_samples(coords, center, equidistant_radii, rnd_func, sample_count): + + dist_center = np.sqrt((coords[:, 0] - center[0]) ** 2 + ( + coords[:, 1] - center[1]) ** 2) + + idx = np.logical_and(dist_center[None, :] >= np.array(equidistant_radii[:-1])[:, None], + dist_center[None, :] < np.array(equidistant_radii[1:])[:, None]) + + # Loop over an iterative sampling in rings + list_idx = [] + for i in range(len(equidistant_radii) - 1): + # First index: preselect samples in a ring of inside radius and outside radius + idx1 = idx[i, :] + + count = np.count_nonzero(idx1) + indices1 = np.argwhere(idx1) + + # Second index: randomly select half of the valid pixels, so that the other half can be used by the equidist + # sample for low distances + indices2 = rnd_func.choice(count, size=min(count, sample_count), replace=False) + sub_idx = indices1[indices2] + if len(sub_idx) > 1: + list_idx.append(sub_idx.squeeze()) + + return np.concatenate(list_idx) + +def get_idx_dists(coords, center, center_radius, equidistant_radii, rnd_func, + sample_count, max_dist, i, imax, verbose): + + if verbose: + print('Working on subsample ' + str(i+1) + ' out of ' + str(imax)) + + cidx = get_disk_sample(coords=coords, center=center, + center_radius=center_radius, rnd_func=rnd_func, + sample_count=sample_count) + + eqidx = get_successive_ring_samples(coords=coords, center=center, + equidistant_radii=equidistant_radii, rnd_func=rnd_func, + sample_count=sample_count) + + ctree = cKDTree(coords[cidx, :]) + eqtree = cKDTree(coords[eqidx, :]) + + dists = ctree.sparse_distance_matrix( + eqtree, + max_dist, + output_type="coo_matrix" + ) + + return dists.data, cidx[dists.row], eqidx[dists.col] + +def mp_wrapper_get_idx_dists(argdict: dict): + + return get_idx_dists(**argdict) + + class RasterEquidistantMetricSpace(MetricSpace): """Like ProbabilisticMetricSpace but only applies to Raster data (2D gridded data) and samples iteratively an `equidistant` subset within distances to a 'center' subset. @@ -416,12 +492,14 @@ def __init__( samples=100, ratio_subsamp=0.2, runs=None, + ncores=1, exp_increase_fac=np.sqrt(2), center_radius=None, equidistant_radii=None, max_dist=None, dist_metric="euclidean", - rnd=None + rnd=None, + verbose=False ): """RasterEquidistantMetricSpace class @@ -439,6 +517,8 @@ def __init__( Ratio of samples drawn within each subsample. runs : int Number of subsamplings based on a random center point + ncores : int + Number of cores to use in multiprocessing for the subsamplings. exp_increase_fac : float Multiplicative factor of increasing radius for ring subsets center_radius: float @@ -450,11 +530,19 @@ def __init__( max_dist : float Maximum distance between points after which the distance is considered infinite and not calculated. + verbose : bool + Whether to print statements in the console rnd : numpy.random.RandomState, int Random state to use for the sampling. """ + if dist_metric != "euclidean": + raise ValueError(( + "A RasterEquidistantMetricSpace class can only be constructed " + "for an euclidean space" + )) + self.coords = coords.copy() self.dist_metric = dist_metric self.shape = shape @@ -473,8 +561,10 @@ def __init__( runs = int((self.shape[0] * self.shape[1]) / self.samples * 1/100.) self.runs = runs + self.ncores = ncores + if rnd is None: - self.rnd = np.random + self.rnd = np.random.default_rng() elif isinstance(rnd, np.random.RandomState): self.rnd = rnd else: @@ -485,6 +575,9 @@ def __init__( # defined by the user if center_radius is None: center_radius = np.sqrt(1. / ratio_subsamp * self.sample_count / np.pi) * self.res + if verbose: + print('Radius of center disk sample for sample count of '+str(self.sample_count)+ ' and subsampling ratio' + ' of '+str(ratio_subsamp)+': '+str(center_radius)) self._center_radius = center_radius # Radii of equidistant ring subsamples @@ -499,8 +592,13 @@ def __init__( equidistant_radii.append(increasing_rad) increasing_rad *= exp_increase_fac equidistant_radii.append(self.max_dist) + if verbose: + print('Radii of equidistant ring samples for increasing factor of ' + str(exp_increase_fac) + ': ') + print(equidistant_radii) self._equidistant_radii = equidistant_radii + self.verbose = verbose + # Index and KDTree of center sample self._cidx = None self._ctree = None @@ -509,7 +607,7 @@ def __init__( self._eqidx = None self._eqtree = None - self._center = None + self._centers = None self._dists = None # Do a very quick check to see throw exceptions # if self.dist_metric is invalid... @@ -525,22 +623,6 @@ def sample_count(self): def cidx(self): """The sampled indices into `self.coords` for the center sample.""" - if self._cidx is None: - - # First index: preselect samples in a disk of radius large enough to contain the sample_count samples - dist_center = np.sqrt((self.coords[:, 0] - self._center[0]) ** 2 + ( - self.coords[:, 1] - self._center[1]) ** 2) - idx1 = dist_center < self._center_radius - - count = np.count_nonzero(idx1) - indices1 = np.argwhere(idx1) - - # Second index: randomly select half of the valid pixels, so that the other half can be used by the equidist - # sample for low distances - indices2 = self.rnd.choice(count, size=count, replace=False) - - self._cidx = indices1[indices2].squeeze() - return self._cidx @property @@ -556,7 +638,7 @@ def ctree(self): )) if self._ctree is None: - self._ctree = cKDTree(self.coords[self.cidx, :]) + self._ctree = [cKDTree(self.coords[self.cidx[i], :]) for i in range(len(self.cidx))] return self._ctree @@ -564,32 +646,6 @@ def ctree(self): def eqidx(self): """The sampled indices into `self.coords` for the equidistant sample.""" - dist_center = np.sqrt((self.coords[:, 0] - self._center[0]) ** 2 + ( - self.coords[:, 1] - self._center[1]) ** 2) - - idx = np.logical_and(dist_center[None, :] >= np.array(self._equidistant_radii[:-1])[:, None], - dist_center[None, :] < np.array(self._equidistant_radii[1:])[:, None]) - - if self._eqidx is None: - - # Loop over an iterative sampling in rings - list_idx = [] - for i in range(len(self._equidistant_radii) - 1): - # First index: preselect samples in a ring of inside radius and outside radius - idx1 = idx[i, :] - - count = np.count_nonzero(idx1) - indices1 = np.argwhere(idx1) - - # Second index: randomly select half of the valid pixels, so that the other half can be used by the equidist - # sample for low distances - indices2 = self.rnd.choice(count, size=min(count,self.sample_count), replace=False) - sub_idx = indices1[indices2] - if len(sub_idx)>1: - list_idx.append(sub_idx.squeeze()) - - self._eqidx = np.concatenate(list_idx) - return self._eqidx @property @@ -598,14 +654,9 @@ def eqtree(self): instance of the equidistant sample of `self.coords`. Undefined otherwise.""" # only Euclidean supported - if self.dist_metric != "euclidean": - raise ValueError(( - "A coordinate tree can only be constructed " - "for an euclidean space" - )) if self._eqtree is None: - self._eqtree = cKDTree(self.coords[self.eqidx, :]) + self._eqtree = [cKDTree(self.coords[self.eqidx[i], :]) for i in range(len(self.eqidx))] return self._eqtree @property @@ -613,33 +664,57 @@ def dists(self): """A distance matrix of the sampled point pairs as a `scipy.sparse.csr_matrix` sparse matrix. """ - if self._dists is None: - list_dists, list_cidx, list_eqidx = ([] for i in range(3)) + # Derive distances + if self._dists is None: idx_center = self.rnd.choice(len(self.coords), size=self.runs, replace=False) # Each run has a different center centers = self.coords[idx_center] - for i in range(self.runs): + t0 = time.time() + # Running on a single core: for loop + if self.ncores == 1: - self._center = centers[i] + list_dists, list_cidx, list_eqidx = ([] for i in range(3)) - dists = self.ctree.sparse_distance_matrix( - self.eqtree, - self.max_dist, - output_type="coo_matrix" - ) + for i in range(self.runs): - list_dists.append(dists.data) - list_cidx.append(self.cidx[dists.row]) - list_eqidx.append(self.eqidx[dists.col]) + center = centers[i] + dists, cidx, eqidx = get_idx_dists(self.coords, center=center, center_radius=self._center_radius, + equidistant_radii=self._equidistant_radii, rnd_func=self.rnd, + sample_count=self.sample_count, max_dist=self.max_dist, i=i, + imax=self.runs, verbose=self.verbose) + list_dists.append(dists) + list_cidx.append(cidx) + list_eqidx.append(eqidx) - self._cidx = None - self._ctree = None - self._eqidx = None - self._eqtree = None + # Running on several cores: multiprocessing + else: + print(self.ncores) + # Arguments to pass: only centers and loop index for verbose are changing + argsin = [{'center': centers[i], 'coords': self.coords, 'center_radius': self._center_radius, + 'equidistant_radii': self._equidistant_radii, 'rnd_func': self.rnd, + 'sample_count': self.sample_count, 'max_dist': self.max_dist, 'i': i, 'imax': self.runs, + 'verbose': self.verbose} for i in range(self.runs)] + + # Process in parallel + pool = mp.Pool(self.ncores, maxtasksperchild=1) + outputs = pool.map(mp_wrapper_get_idx_dists, argsin) + pool.close() + pool.join() + + # Get lists of outputs + list_dists, list_cidx, list_eqidx = list(zip(*outputs)) + + + t1 = time.time() + print(str(t1-t0)) + # Define class objects + self._centers = centers + self._cidx = list_cidx + self._eqidx = list_eqidx # concatenate the coo matrixes d = np.concatenate(list_dists) diff --git a/skgstat/tests/test_metric_space.py b/skgstat/tests/test_metric_space.py index a984694..e2ff103 100644 --- a/skgstat/tests/test_metric_space.py +++ b/skgstat/tests/test_metric_space.py @@ -95,7 +95,7 @@ def test_raster_metric(): # Run the computation rems = skg.RasterEquidistantMetricSpace(coords, shape=shape, extent=(x[0],x[-1],y[0],y[-1]), samples=10, runs=10, - rnd=42) + rnd=42, verbose=True) # Minimal check of the output assert rems.max_dist == pytest.approx(140,rel=0.01) @@ -104,11 +104,18 @@ def test_raster_metric(): assert rems.dists.shape == (10000, 10000) # Check the random state provides the same final center - assert all(rems._center == np.array([62, 52])) + assert all(rems._centers[-1] == np.array([62, 52])) # Check the interface with a Variogram object works V = skg.Variogram(rems, vals) assert V.bin_count is not None # Check the variogram is always the same with the random state given - assert V.experimental[0] == pytest.approx(1.06,0.01) + assert V.experimental[0] == pytest.approx(0.89,0.01) + + # Try to run in parallel + rems_mp = skg.RasterEquidistantMetricSpace(coords, shape=shape, extent=(x[0],x[-1],y[0],y[-1]), samples=100, runs=10, + rnd=42, ncores=5, verbose=True) + + V_mp = skg.Variogram(rems_mp, vals) + From 3125aabd87a255112f2475876c94e2dfd4b9dddc Mon Sep 17 00:00:00 2001 From: rhugonne Date: Mon, 19 Jul 2021 19:49:48 +0200 Subject: [PATCH 09/17] add outside funcs docstrings --- skgstat/MetricSpace.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/skgstat/MetricSpace.py b/skgstat/MetricSpace.py index 8fb221f..4f94e62 100644 --- a/skgstat/MetricSpace.py +++ b/skgstat/MetricSpace.py @@ -379,10 +379,12 @@ def dists(self): self._dists = dists return self._dists -# Subfunctions of RasterEquidistantMetricSpace - def get_disk_sample(coords, center, center_radius, rnd_func, sample_count): - + """ + Subfunction for RasterEquidistantMetricSpace. + Calculates the indexes of a subsample in a disk "center sample". + Same parameters as in the class. + """ # First index: preselect samples in a disk of certain radius dist_center = np.sqrt((coords[:, 0] - center[0]) ** 2 + ( coords[:, 1] - center[1]) ** 2) @@ -398,7 +400,11 @@ def get_disk_sample(coords, center, center_radius, rnd_func, sample_count): return indices1[indices2].squeeze() def get_successive_ring_samples(coords, center, equidistant_radii, rnd_func, sample_count): - + """ + Subfunction for RasterEquidistantMetricSpace. + Calculates the indexes of several subsamples within disks, "equidistant sample". + Same parameters as in the class. + """ dist_center = np.sqrt((coords[:, 0] - center[0]) ** 2 + ( coords[:, 1] - center[1]) ** 2) @@ -425,6 +431,11 @@ def get_successive_ring_samples(coords, center, equidistant_radii, rnd_func, sam def get_idx_dists(coords, center, center_radius, equidistant_radii, rnd_func, sample_count, max_dist, i, imax, verbose): + """ + Subfunction for RasterEquidistantMetricSpace. + Calculates the pairwise distances between a list of pairs of "center" and "equidistant" ensembles. + Same parameters as in the class. + """ if verbose: print('Working on subsample ' + str(i+1) + ' out of ' + str(imax)) @@ -449,7 +460,9 @@ def get_idx_dists(coords, center, center_radius, equidistant_radii, rnd_func, return dists.data, cidx[dists.row], eqidx[dists.col] def mp_wrapper_get_idx_dists(argdict: dict): - + """ + Multiprocessing wrapper for get_idx_dists. + """ return get_idx_dists(**argdict) From fccd0c6c20b53c15ad4eed5a55734f8f83ccb8d7 Mon Sep 17 00:00:00 2001 From: rhugonne Date: Mon, 19 Jul 2021 19:56:07 +0200 Subject: [PATCH 10/17] remove elapsed time prints --- skgstat/MetricSpace.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/skgstat/MetricSpace.py b/skgstat/MetricSpace.py index 4f94e62..7fd4ca0 100644 --- a/skgstat/MetricSpace.py +++ b/skgstat/MetricSpace.py @@ -3,7 +3,6 @@ from scipy import sparse import numpy as np import multiprocessing as mp -import time as time def _sparse_dok_get(m, fill_value=np.NaN): """Like m.toarray(), but setting empty values to `fill_value`, by @@ -686,7 +685,6 @@ def dists(self): # Each run has a different center centers = self.coords[idx_center] - t0 = time.time() # Running on a single core: for loop if self.ncores == 1: @@ -705,7 +703,6 @@ def dists(self): # Running on several cores: multiprocessing else: - print(self.ncores) # Arguments to pass: only centers and loop index for verbose are changing argsin = [{'center': centers[i], 'coords': self.coords, 'center_radius': self._center_radius, 'equidistant_radii': self._equidistant_radii, 'rnd_func': self.rnd, @@ -721,9 +718,6 @@ def dists(self): # Get lists of outputs list_dists, list_cidx, list_eqidx = list(zip(*outputs)) - - t1 = time.time() - print(str(t1-t0)) # Define class objects self._centers = centers self._cidx = list_cidx From a5702c985ed2fc867e2f9094b03ef5e0a308c618 Mon Sep 17 00:00:00 2001 From: rhugonne Date: Thu, 22 Jul 2021 15:01:18 +0200 Subject: [PATCH 11/17] formatting and typing annotations for REMS subfunctions --- skgstat/MetricSpace.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/skgstat/MetricSpace.py b/skgstat/MetricSpace.py index 7fd4ca0..360da2d 100644 --- a/skgstat/MetricSpace.py +++ b/skgstat/MetricSpace.py @@ -378,7 +378,9 @@ def dists(self): self._dists = dists return self._dists -def get_disk_sample(coords, center, center_radius, rnd_func, sample_count): +# Subfunctions used in RasterEquidistantMetricSpace (outside class so that they can be pickled by multiprocessing) +def _get_disk_sample(coords: np.ndarray, center: tuple[float, float], center_radius: float, rnd_func: np.random.RandomState, + sample_count: int): """ Subfunction for RasterEquidistantMetricSpace. Calculates the indexes of a subsample in a disk "center sample". @@ -398,7 +400,8 @@ def get_disk_sample(coords, center, center_radius, rnd_func, sample_count): return indices1[indices2].squeeze() -def get_successive_ring_samples(coords, center, equidistant_radii, rnd_func, sample_count): +def _get_successive_ring_samples(coords: np.ndarray, center: tuple[float, float], equidistant_radii: list[float], + rnd_func: np.random.RandomState, sample_count: int): """ Subfunction for RasterEquidistantMetricSpace. Calculates the indexes of several subsamples within disks, "equidistant sample". @@ -428,8 +431,8 @@ def get_successive_ring_samples(coords, center, equidistant_radii, rnd_func, sam return np.concatenate(list_idx) -def get_idx_dists(coords, center, center_radius, equidistant_radii, rnd_func, - sample_count, max_dist, i, imax, verbose): +def _get_idx_dists(coords: np.ndarray, center: tuple[float, float], center_radius: float, equidistant_radii: list[float], + rnd_func: np.random.RandomState, sample_count: int, max_dist: float, i: int, imax: int, verbose: bool): """ Subfunction for RasterEquidistantMetricSpace. Calculates the pairwise distances between a list of pairs of "center" and "equidistant" ensembles. @@ -439,11 +442,11 @@ def get_idx_dists(coords, center, center_radius, equidistant_radii, rnd_func, if verbose: print('Working on subsample ' + str(i+1) + ' out of ' + str(imax)) - cidx = get_disk_sample(coords=coords, center=center, + cidx = _get_disk_sample(coords=coords, center=center, center_radius=center_radius, rnd_func=rnd_func, sample_count=sample_count) - eqidx = get_successive_ring_samples(coords=coords, center=center, + eqidx = _get_successive_ring_samples(coords=coords, center=center, equidistant_radii=equidistant_radii, rnd_func=rnd_func, sample_count=sample_count) @@ -458,11 +461,11 @@ def get_idx_dists(coords, center, center_radius, equidistant_radii, rnd_func, return dists.data, cidx[dists.row], eqidx[dists.col] -def mp_wrapper_get_idx_dists(argdict: dict): +def _mp_wrapper_get_idx_dists(argdict: dict): """ Multiprocessing wrapper for get_idx_dists. """ - return get_idx_dists(**argdict) + return _get_idx_dists(**argdict) class RasterEquidistantMetricSpace(MetricSpace): @@ -634,7 +637,6 @@ def sample_count(self): @property def cidx(self): """The sampled indices into `self.coords` for the center sample.""" - return self._cidx @property @@ -657,7 +659,6 @@ def ctree(self): @property def eqidx(self): """The sampled indices into `self.coords` for the equidistant sample.""" - return self._eqidx @property @@ -676,7 +677,6 @@ def dists(self): """A distance matrix of the sampled point pairs as a `scipy.sparse.csr_matrix` sparse matrix. """ - # Derive distances if self._dists is None: @@ -693,7 +693,7 @@ def dists(self): for i in range(self.runs): center = centers[i] - dists, cidx, eqidx = get_idx_dists(self.coords, center=center, center_radius=self._center_radius, + dists, cidx, eqidx = _get_idx_dists(self.coords, center=center, center_radius=self._center_radius, equidistant_radii=self._equidistant_radii, rnd_func=self.rnd, sample_count=self.sample_count, max_dist=self.max_dist, i=i, imax=self.runs, verbose=self.verbose) @@ -711,7 +711,7 @@ def dists(self): # Process in parallel pool = mp.Pool(self.ncores, maxtasksperchild=1) - outputs = pool.map(mp_wrapper_get_idx_dists, argsin) + outputs = pool.map(_mp_wrapper_get_idx_dists, argsin) pool.close() pool.join() From dbf8aad3459d8ea2c26ccef23d667d3e045f5058 Mon Sep 17 00:00:00 2001 From: rhugonne Date: Thu, 22 Jul 2021 15:04:03 +0200 Subject: [PATCH 12/17] fix comment for ring sampling --- skgstat/MetricSpace.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skgstat/MetricSpace.py b/skgstat/MetricSpace.py index 360da2d..8c479f3 100644 --- a/skgstat/MetricSpace.py +++ b/skgstat/MetricSpace.py @@ -407,6 +407,7 @@ def _get_successive_ring_samples(coords: np.ndarray, center: tuple[float, float] Calculates the indexes of several subsamples within disks, "equidistant sample". Same parameters as in the class. """ + # First index: preselect samples in a ring of certain inside radius and outside radius dist_center = np.sqrt((coords[:, 0] - center[0]) ** 2 + ( coords[:, 1] - center[1]) ** 2) @@ -416,7 +417,6 @@ def _get_successive_ring_samples(coords: np.ndarray, center: tuple[float, float] # Loop over an iterative sampling in rings list_idx = [] for i in range(len(equidistant_radii) - 1): - # First index: preselect samples in a ring of inside radius and outside radius idx1 = idx[i, :] count = np.count_nonzero(idx1) From 8ac7a1c03c3917421107bd31e0686df3046ef1b0 Mon Sep 17 00:00:00 2001 From: rhugonne Date: Thu, 22 Jul 2021 15:05:37 +0200 Subject: [PATCH 13/17] clearer rename for ratio_subsample argument --- skgstat/MetricSpace.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/skgstat/MetricSpace.py b/skgstat/MetricSpace.py index 8c479f3..5b6389a 100644 --- a/skgstat/MetricSpace.py +++ b/skgstat/MetricSpace.py @@ -378,6 +378,7 @@ def dists(self): self._dists = dists return self._dists + # Subfunctions used in RasterEquidistantMetricSpace (outside class so that they can be pickled by multiprocessing) def _get_disk_sample(coords: np.ndarray, center: tuple[float, float], center_radius: float, rnd_func: np.random.RandomState, sample_count: int): @@ -505,7 +506,7 @@ def __init__( shape, extent, samples=100, - ratio_subsamp=0.2, + ratio_subsample=0.2, runs=None, ncores=1, exp_increase_fac=np.sqrt(2), @@ -528,7 +529,7 @@ def __init__( Extent of raster (Xmin, Xmax, Ymin, Ymax) samples : float, int Number of samples (int) or fraction of coords to sample (float < 1). - ratio_subsamp: + ratio_subsample: Ratio of samples drawn within each subsample. runs : int Number of subsamplings based on a random center point @@ -589,10 +590,10 @@ def __init__( # If None is provided, the disk is defined with the exact size to hold the number of percentage of samples # defined by the user if center_radius is None: - center_radius = np.sqrt(1. / ratio_subsamp * self.sample_count / np.pi) * self.res + center_radius = np.sqrt(1. / ratio_subsample * self.sample_count / np.pi) * self.res if verbose: print('Radius of center disk sample for sample count of '+str(self.sample_count)+ ' and subsampling ratio' - ' of '+str(ratio_subsamp)+': '+str(center_radius)) + ' of '+str(ratio_subsample)+': '+str(center_radius)) self._center_radius = center_radius # Radii of equidistant ring subsamples From db879e2e37daf335c1d98297da5cdc03ff78ade5 Mon Sep 17 00:00:00 2001 From: rhugonne Date: Thu, 22 Jul 2021 15:07:59 +0200 Subject: [PATCH 14/17] rename ncores in n_jobs --- skgstat/MetricSpace.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/skgstat/MetricSpace.py b/skgstat/MetricSpace.py index 5b6389a..786e103 100644 --- a/skgstat/MetricSpace.py +++ b/skgstat/MetricSpace.py @@ -508,7 +508,7 @@ def __init__( samples=100, ratio_subsample=0.2, runs=None, - ncores=1, + n_jobs=1, exp_increase_fac=np.sqrt(2), center_radius=None, equidistant_radii=None, @@ -533,8 +533,8 @@ def __init__( Ratio of samples drawn within each subsample. runs : int Number of subsamplings based on a random center point - ncores : int - Number of cores to use in multiprocessing for the subsamplings. + n_jobs : int + Number of jobs to use in multiprocessing for the subsamplings. exp_increase_fac : float Multiplicative factor of increasing radius for ring subsets center_radius: float @@ -577,7 +577,7 @@ def __init__( runs = int((self.shape[0] * self.shape[1]) / self.samples * 1/100.) self.runs = runs - self.ncores = ncores + self.n_jobs = n_jobs if rnd is None: self.rnd = np.random.default_rng() @@ -687,7 +687,7 @@ def dists(self): centers = self.coords[idx_center] # Running on a single core: for loop - if self.ncores == 1: + if self.n_jobs == 1: list_dists, list_cidx, list_eqidx = ([] for i in range(3)) @@ -711,8 +711,8 @@ def dists(self): 'verbose': self.verbose} for i in range(self.runs)] # Process in parallel - pool = mp.Pool(self.ncores, maxtasksperchild=1) - outputs = pool.map(_mp_wrapper_get_idx_dists, argsin) + pool = mp.Pool(self.n_jobs, maxtasksperchild=1) + outputs = pool.map(_mp_wrapper_get_idx_dists, argsin, chunksize=1) pool.close() pool.join() From e30722a380683d31ade142b3bb82705ea2e395f1 Mon Sep 17 00:00:00 2001 From: rhugonne Date: Thu, 22 Jul 2021 19:08:41 +0200 Subject: [PATCH 15/17] fix type annotation support --- skgstat/MetricSpace.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/skgstat/MetricSpace.py b/skgstat/MetricSpace.py index 786e103..28d07c7 100644 --- a/skgstat/MetricSpace.py +++ b/skgstat/MetricSpace.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from scipy.spatial.distance import pdist, cdist, squareform from scipy.spatial import cKDTree from scipy import sparse From af35fe9872ba50806ceeed1f2d7784c478723a5d Mon Sep 17 00:00:00 2001 From: rhugonne Date: Thu, 22 Jul 2021 19:10:45 +0200 Subject: [PATCH 16/17] remove multi-core test (not good for ci) --- skgstat/tests/test_metric_space.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/skgstat/tests/test_metric_space.py b/skgstat/tests/test_metric_space.py index e2ff103..3725469 100644 --- a/skgstat/tests/test_metric_space.py +++ b/skgstat/tests/test_metric_space.py @@ -113,9 +113,3 @@ def test_raster_metric(): # Check the variogram is always the same with the random state given assert V.experimental[0] == pytest.approx(0.89,0.01) - # Try to run in parallel - rems_mp = skg.RasterEquidistantMetricSpace(coords, shape=shape, extent=(x[0],x[-1],y[0],y[-1]), samples=100, runs=10, - rnd=42, ncores=5, verbose=True) - - V_mp = skg.Variogram(rems_mp, vals) - From 63c128bc87c8def0163e46a7f27b7ec88b97f578 Mon Sep 17 00:00:00 2001 From: rhugonne Date: Thu, 22 Jul 2021 19:31:00 +0200 Subject: [PATCH 17/17] fix minor issue with bin_func input as Iterable: need to load with asarray() in fit --- skgstat/Variogram.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skgstat/Variogram.py b/skgstat/Variogram.py index 732c32b..2577397 100644 --- a/skgstat/Variogram.py +++ b/skgstat/Variogram.py @@ -1436,7 +1436,7 @@ def fit(self, force=False, method=None, sigma=None, **kwargs): self.preprocessing(force=force) # load the data - x = self.bins + x = np.array(self.bins) y = self.experimental # overwrite fit setting if new params are given