Skip to content

Commit

Permalink
Merge pull request #114 from rhugonnet/rastermetricspace
Browse files Browse the repository at this point in the history
Add RasterMetricSpace for improved pairwise sampling of large 2D grids
  • Loading branch information
mmaelicke authored Jul 23, 2021
2 parents 43cf393 + 63c128b commit 258b687
Show file tree
Hide file tree
Showing 4 changed files with 412 additions and 5 deletions.
372 changes: 371 additions & 1 deletion skgstat/MetricSpace.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from __future__ import annotations

from scipy.spatial.distance import pdist, cdist, squareform
from scipy.spatial import cKDTree
from scipy import sparse
import numpy as np

import multiprocessing as mp

def _sparse_dok_get(m, fill_value=np.NaN):
"""Like m.toarray(), but setting empty values to `fill_value`, by
Expand Down Expand Up @@ -377,3 +379,371 @@ def dists(self):
dists = dists.tocsr()
self._dists = dists
return self._dists


# Subfunctions used in RasterEquidistantMetricSpace (outside class so that they can be pickled by multiprocessing)
def _get_disk_sample(coords: np.ndarray, center: tuple[float, float], center_radius: float, rnd_func: np.random.RandomState,
sample_count: int):
"""
Subfunction for RasterEquidistantMetricSpace.
Calculates the indexes of a subsample in a disk "center sample".
Same parameters as in the class.
"""
# First index: preselect samples in a disk of certain radius
dist_center = np.sqrt((coords[:, 0] - center[0]) ** 2 + (
coords[:, 1] - center[1]) ** 2)
idx1 = dist_center < center_radius

count = np.count_nonzero(idx1)
indices1 = np.argwhere(idx1)

# Second index: randomly select half of the valid pixels, so that the other half can be used by the equidist
# sample for low distances
indices2 = rnd_func.choice(count, size=sample_count, replace=False)

return indices1[indices2].squeeze()

def _get_successive_ring_samples(coords: np.ndarray, center: tuple[float, float], equidistant_radii: list[float],
rnd_func: np.random.RandomState, sample_count: int):
"""
Subfunction for RasterEquidistantMetricSpace.
Calculates the indexes of several subsamples within disks, "equidistant sample".
Same parameters as in the class.
"""
# First index: preselect samples in a ring of certain inside radius and outside radius
dist_center = np.sqrt((coords[:, 0] - center[0]) ** 2 + (
coords[:, 1] - center[1]) ** 2)

idx = np.logical_and(dist_center[None, :] >= np.array(equidistant_radii[:-1])[:, None],
dist_center[None, :] < np.array(equidistant_radii[1:])[:, None])

# Loop over an iterative sampling in rings
list_idx = []
for i in range(len(equidistant_radii) - 1):
idx1 = idx[i, :]

count = np.count_nonzero(idx1)
indices1 = np.argwhere(idx1)

# Second index: randomly select half of the valid pixels, so that the other half can be used by the equidist
# sample for low distances
indices2 = rnd_func.choice(count, size=min(count, sample_count), replace=False)
sub_idx = indices1[indices2]
if len(sub_idx) > 1:
list_idx.append(sub_idx.squeeze())

return np.concatenate(list_idx)

def _get_idx_dists(coords: np.ndarray, center: tuple[float, float], center_radius: float, equidistant_radii: list[float],
rnd_func: np.random.RandomState, sample_count: int, max_dist: float, i: int, imax: int, verbose: bool):
"""
Subfunction for RasterEquidistantMetricSpace.
Calculates the pairwise distances between a list of pairs of "center" and "equidistant" ensembles.
Same parameters as in the class.
"""

if verbose:
print('Working on subsample ' + str(i+1) + ' out of ' + str(imax))

cidx = _get_disk_sample(coords=coords, center=center,
center_radius=center_radius, rnd_func=rnd_func,
sample_count=sample_count)

eqidx = _get_successive_ring_samples(coords=coords, center=center,
equidistant_radii=equidistant_radii, rnd_func=rnd_func,
sample_count=sample_count)

ctree = cKDTree(coords[cidx, :])
eqtree = cKDTree(coords[eqidx, :])

dists = ctree.sparse_distance_matrix(
eqtree,
max_dist,
output_type="coo_matrix"
)

return dists.data, cidx[dists.row], eqidx[dists.col]

def _mp_wrapper_get_idx_dists(argdict: dict):
"""
Multiprocessing wrapper for get_idx_dists.
"""
return _get_idx_dists(**argdict)


class RasterEquidistantMetricSpace(MetricSpace):
"""Like ProbabilisticMetricSpace but only applies to Raster data (2D gridded data) and
samples iteratively an `equidistant` subset within distances to a 'center' subset.
Subsets can either be a fraction of the total number of pairs (float < 1), or an integer count.
The 'center' subset corresponds to a disk centered on a point of the grid for which the location
randomly varies and can be redrawn and aggregated for several runs. The corresponding 'equidistant'
subset consists of a concatenation of subsets drawn from rings with radius gradually increasing
until the maximum extent of the grid is reached.
To define the subsampling, several parameters are available:
- The raw number of samples corresponds to the samples that will be drawn in each central disk.
Along with the ratio of samples drawn (see below), it will automatically define the radius
of the disk and rings for subsampling.
Note that the number of samples drawn will be repeatedly drawn for each equidistant rings
at a given radius, resulting in a several-fold amount of total samples for the equidistant
subset.
- The ratio of subsample defines the density of point sampled within each subset. It
defaults to 20%.
- The number of runs corresponds to the number of random center points repeated during the
subsampling. It defaults to a sampling of 1% of the grid with center subsets.
Alternatively, one can supply:
- The multiplicative factor to derive increasing rings radii, set as squareroot of 2 by
default in order to conserve a similar area for each ring and verify the sampling ratio.
Or directly:
- The radius of the central disk subset.
- A list of radii for the equidistant ring subsets.
When providing those spatial parameters, all other sampling parameters will be ignored
except for the raw number of samples to draw in each subset.
"""

def __init__(
self,
coords,
shape,
extent,
samples=100,
ratio_subsample=0.2,
runs=None,
n_jobs=1,
exp_increase_fac=np.sqrt(2),
center_radius=None,
equidistant_radii=None,
max_dist=None,
dist_metric="euclidean",
rnd=None,
verbose=False
):
"""RasterEquidistantMetricSpace class
Parameters
----------
coords : numpy.ndarray
Coordinate array of shape (Npoints, 2)
shape : tuple[int, int]
Shape of raster (X, Y)
extent : tuple[float, float, float, float]
Extent of raster (Xmin, Xmax, Ymin, Ymax)
samples : float, int
Number of samples (int) or fraction of coords to sample (float < 1).
ratio_subsample:
Ratio of samples drawn within each subsample.
runs : int
Number of subsamplings based on a random center point
n_jobs : int
Number of jobs to use in multiprocessing for the subsamplings.
exp_increase_fac : float
Multiplicative factor of increasing radius for ring subsets
center_radius: float
Radius of center subset, overrides other sampling parameters.
equidistant_radii: list
List of radii of ring subset, overrides other sampling parameters.
dist_metric : str
Distance metric names as used by scipy.spatial.distance.pdist
max_dist : float
Maximum distance between points after which the distance
is considered infinite and not calculated.
verbose : bool
Whether to print statements in the console
rnd : numpy.random.RandomState, int
Random state to use for the sampling.
"""

if dist_metric != "euclidean":
raise ValueError((
"A RasterEquidistantMetricSpace class can only be constructed "
"for an euclidean space"
))

self.coords = coords.copy()
self.dist_metric = dist_metric
self.shape = shape
self.extent = extent
self.res = np.mean([(extent[1] - extent[0])/(shape[0]-1),(extent[3] - extent[2])/(shape[1]-1)])

# if the maximum distance is not specified, find the maximum possible distance from the extent
if max_dist is None:
max_dist = np.sqrt((extent[1] - extent[0])**2 + (extent[3] - extent[2])**2)
self.max_dist = max_dist

self.samples = samples

if runs is None:
# If None is provided, try to sample center samples for about one percent of the area
runs = int((self.shape[0] * self.shape[1]) / self.samples * 1/100.)
self.runs = runs

self.n_jobs = n_jobs

if rnd is None:
self.rnd = np.random.default_rng()
elif isinstance(rnd, np.random.RandomState):
self.rnd = rnd
else:
self.rnd = np.random.RandomState(np.random.MT19937(np.random.SeedSequence(rnd)))

# Radius of center subsample, based on sample count
# If None is provided, the disk is defined with the exact size to hold the number of percentage of samples
# defined by the user
if center_radius is None:
center_radius = np.sqrt(1. / ratio_subsample * self.sample_count / np.pi) * self.res
if verbose:
print('Radius of center disk sample for sample count of '+str(self.sample_count)+ ' and subsampling ratio'
' of '+str(ratio_subsample)+': '+str(center_radius))
self._center_radius = center_radius

# Radii of equidistant ring subsamples
# If None is provided, the rings are defined with exponentially increasing radii with a factor sqrt(2), which
# means each ring will have just enough area to sample at least the number of samples desired, and same
# for each of the following, due to:
# (sqrt(2)R)**2 - R**2 = R**2
if equidistant_radii is None:
equidistant_radii = [0.]
increasing_rad = self._center_radius
while increasing_rad < self.max_dist:
equidistant_radii.append(increasing_rad)
increasing_rad *= exp_increase_fac
equidistant_radii.append(self.max_dist)
if verbose:
print('Radii of equidistant ring samples for increasing factor of ' + str(exp_increase_fac) + ': ')
print(equidistant_radii)
self._equidistant_radii = equidistant_radii

self.verbose = verbose

# Index and KDTree of center sample
self._cidx = None
self._ctree = None

# Index and KDTree of equidistant sample
self._eqidx = None
self._eqtree = None

self._centers = None
self._dists = None
# Do a very quick check to see throw exceptions
# if self.dist_metric is invalid...
pdist(self.coords[:1, :], metric=self.dist_metric)

@property
def sample_count(self):
if isinstance(self.samples, int):
return self.samples
return int(self.samples * len(self.coords))

@property
def cidx(self):
"""The sampled indices into `self.coords` for the center sample."""
return self._cidx

@property
def ctree(self):
"""If `self.dist_metric` is `euclidean`, a `scipy.spatial.cKDTree`
instance of the center sample of `self.coords`. Undefined otherwise."""

# only Euclidean supported
if self.dist_metric != "euclidean":
raise ValueError((
"A coordinate tree can only be constructed "
"for an euclidean space"
))

if self._ctree is None:
self._ctree = [cKDTree(self.coords[self.cidx[i], :]) for i in range(len(self.cidx))]
return self._ctree


@property
def eqidx(self):
"""The sampled indices into `self.coords` for the equidistant sample."""
return self._eqidx

@property
def eqtree(self):
"""If `self.dist_metric` is `euclidean`, a `scipy.spatial.cKDTree`
instance of the equidistant sample of `self.coords`. Undefined otherwise."""

# only Euclidean supported

if self._eqtree is None:
self._eqtree = [cKDTree(self.coords[self.eqidx[i], :]) for i in range(len(self.eqidx))]
return self._eqtree

@property
def dists(self):
"""A distance matrix of the sampled point pairs as a
`scipy.sparse.csr_matrix` sparse matrix. """

# Derive distances
if self._dists is None:

idx_center = self.rnd.choice(len(self.coords), size=self.runs, replace=False)

# Each run has a different center
centers = self.coords[idx_center]

# Running on a single core: for loop
if self.n_jobs == 1:

list_dists, list_cidx, list_eqidx = ([] for i in range(3))

for i in range(self.runs):

center = centers[i]
dists, cidx, eqidx = _get_idx_dists(self.coords, center=center, center_radius=self._center_radius,
equidistant_radii=self._equidistant_radii, rnd_func=self.rnd,
sample_count=self.sample_count, max_dist=self.max_dist, i=i,
imax=self.runs, verbose=self.verbose)
list_dists.append(dists)
list_cidx.append(cidx)
list_eqidx.append(eqidx)

# Running on several cores: multiprocessing
else:
# Arguments to pass: only centers and loop index for verbose are changing
argsin = [{'center': centers[i], 'coords': self.coords, 'center_radius': self._center_radius,
'equidistant_radii': self._equidistant_radii, 'rnd_func': self.rnd,
'sample_count': self.sample_count, 'max_dist': self.max_dist, 'i': i, 'imax': self.runs,
'verbose': self.verbose} for i in range(self.runs)]

# Process in parallel
pool = mp.Pool(self.n_jobs, maxtasksperchild=1)
outputs = pool.map(_mp_wrapper_get_idx_dists, argsin, chunksize=1)
pool.close()
pool.join()

# Get lists of outputs
list_dists, list_cidx, list_eqidx = list(zip(*outputs))

# Define class objects
self._centers = centers
self._cidx = list_cidx
self._eqidx = list_eqidx

# concatenate the coo matrixes
d = np.concatenate(list_dists)
c = np.concatenate(list_cidx)
eq = np.concatenate(list_eqidx)

# remove possible duplicates (that would be summed by default)
# from https://stackoverflow.com/questions/28677162/ignoring-duplicate-entries-in-sparse-matrix

# Stable solution but a bit slow
# c, eq, d = zip(*set(zip(c, eq, d)))
# dists = sparse.csr_matrix((d, (c, eq)), shape=(len(self.coords), len(self.coords)))

# Solution 5+ times faster than the preceding, but relies on _update() which might change in scipy (which
# only has an implemented method for summing duplicates, and not ignoring them yet)
dok = sparse.dok_matrix((len(self.coords), len(self.coords)))
dok._update(zip(zip(c, eq), d))
dists = dok.tocsr()

self._dists = dists

return self._dists
2 changes: 1 addition & 1 deletion skgstat/Variogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -1436,7 +1436,7 @@ def fit(self, force=False, method=None, sigma=None, **kwargs):
self.preprocessing(force=force)

# load the data
x = self.bins
x = np.array(self.bins)
y = self.experimental

# overwrite fit setting if new params are given
Expand Down
Loading

0 comments on commit 258b687

Please sign in to comment.