Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add RasterMetricSpace for improved pairwise sampling of large 2D grids #114

Merged
merged 18 commits into from
Jul 23, 2021
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
369 changes: 368 additions & 1 deletion skgstat/MetricSpace.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from scipy.spatial import cKDTree
from scipy import sparse
import numpy as np

import multiprocessing as mp

def _sparse_dok_get(m, fill_value=np.NaN):
"""Like m.toarray(), but setting empty values to `fill_value`, by
Expand Down Expand Up @@ -377,3 +377,370 @@ def dists(self):
dists = dists.tocsr()
self._dists = dists
return self._dists

def get_disk_sample(coords, center, center_radius, rnd_func, sample_count):
"""
Subfunction for RasterEquidistantMetricSpace.
Calculates the indexes of a subsample in a disk "center sample".
Same parameters as in the class.
"""
# First index: preselect samples in a disk of certain radius
dist_center = np.sqrt((coords[:, 0] - center[0]) ** 2 + (
coords[:, 1] - center[1]) ** 2)
idx1 = dist_center < center_radius

count = np.count_nonzero(idx1)
indices1 = np.argwhere(idx1)

# Second index: randomly select half of the valid pixels, so that the other half can be used by the equidist
# sample for low distances
indices2 = rnd_func.choice(count, size=sample_count, replace=False)

return indices1[indices2].squeeze()

def get_successive_ring_samples(coords, center, equidistant_radii, rnd_func, sample_count):
rhugonnet marked this conversation as resolved.
Show resolved Hide resolved
"""
Subfunction for RasterEquidistantMetricSpace.
Calculates the indexes of several subsamples within disks, "equidistant sample".
Same parameters as in the class.
"""
dist_center = np.sqrt((coords[:, 0] - center[0]) ** 2 + (
coords[:, 1] - center[1]) ** 2)

idx = np.logical_and(dist_center[None, :] >= np.array(equidistant_radii[:-1])[:, None],
rhugonnet marked this conversation as resolved.
Show resolved Hide resolved
dist_center[None, :] < np.array(equidistant_radii[1:])[:, None])

# Loop over an iterative sampling in rings
list_idx = []
for i in range(len(equidistant_radii) - 1):
# First index: preselect samples in a ring of inside radius and outside radius
idx1 = idx[i, :]

count = np.count_nonzero(idx1)
indices1 = np.argwhere(idx1)

# Second index: randomly select half of the valid pixels, so that the other half can be used by the equidist
# sample for low distances
indices2 = rnd_func.choice(count, size=min(count, sample_count), replace=False)
sub_idx = indices1[indices2]
if len(sub_idx) > 1:
list_idx.append(sub_idx.squeeze())

return np.concatenate(list_idx)

def get_idx_dists(coords, center, center_radius, equidistant_radii, rnd_func,
sample_count, max_dist, i, imax, verbose):
"""
Subfunction for RasterEquidistantMetricSpace.
Calculates the pairwise distances between a list of pairs of "center" and "equidistant" ensembles.
Same parameters as in the class.
"""

if verbose:
print('Working on subsample ' + str(i+1) + ' out of ' + str(imax))

cidx = get_disk_sample(coords=coords, center=center,
center_radius=center_radius, rnd_func=rnd_func,
sample_count=sample_count)

eqidx = get_successive_ring_samples(coords=coords, center=center,
equidistant_radii=equidistant_radii, rnd_func=rnd_func,
sample_count=sample_count)

ctree = cKDTree(coords[cidx, :])
eqtree = cKDTree(coords[eqidx, :])

dists = ctree.sparse_distance_matrix(
eqtree,
max_dist,
output_type="coo_matrix"
)

return dists.data, cidx[dists.row], eqidx[dists.col]

def mp_wrapper_get_idx_dists(argdict: dict):
"""
Multiprocessing wrapper for get_idx_dists.
"""
return get_idx_dists(**argdict)


class RasterEquidistantMetricSpace(MetricSpace):
"""Like ProbabilisticMetricSpace but only applies to Raster data (2D gridded data) and
samples iteratively an `equidistant` subset within distances to a 'center' subset.
Subsets can either be a fraction of the total number of pairs (float < 1), or an integer count.
The 'center' subset corresponds to a disk centered on a point of the grid for which the location
randomly varies and can be redrawn and aggregated for several runs. The corresponding 'equidistant'
subset consists of a concatenation of subsets drawn from rings with radius gradually increasing
until the maximum extent of the grid is reached.

To define the subsampling, several parameters are available:
- The raw number of samples corresponds to the samples that will be drawn in each central disk.
Along with the ratio of samples drawn (see below), it will automatically define the radius
of the disk and rings for subsampling.
Note that the number of samples drawn will be repeatedly drawn for each equidistant rings
at a given radius, resulting in a several-fold amount of total samples for the equidistant
subset.
- The ratio of subsample defines the density of point sampled within each subset. It
defaults to 20%.
- The number of runs corresponds to the number of random center points repeated during the
subsampling. It defaults to a sampling of 1% of the grid with center subsets.

Alternatively, one can supply:
- The multiplicative factor to derive increasing rings radii, set as squareroot of 2 by
default in order to conserve a similar area for each ring and verify the sampling ratio.
Or directly:
- The radius of the central disk subset.
- A list of radii for the equidistant ring subsets.
When providing those spatial parameters, all other sampling parameters will be ignored
except for the raw number of samples to draw in each subset.
"""

def __init__(
self,
coords,
shape,
extent,
samples=100,
ratio_subsamp=0.2,
rhugonnet marked this conversation as resolved.
Show resolved Hide resolved
runs=None,
ncores=1,
rhugonnet marked this conversation as resolved.
Show resolved Hide resolved
exp_increase_fac=np.sqrt(2),
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why sqrt(2)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because the surface area of successive rings is going to be preserved when increasing the radius by a factor of exactly sqrt(2). The equation for the area of the ring (disk minus disk) looks like this (once the pis that even out are removed): (sqrt(2)R)**2 - R**2 = R**2

It's briefly described in the class description + the __init__ function at the line it uses the factor

center_radius=None,
equidistant_radii=None,
max_dist=None,
dist_metric="euclidean",
rnd=None,
verbose=False
):
"""RasterEquidistantMetricSpace class

Parameters
----------
coords : numpy.ndarray
Coordinate array of shape (Npoints, 2)
shape : tuple[int, int]
Shape of raster (X, Y)
extent : tuple[float, float, float, float]
Extent of raster (Xmin, Xmax, Ymin, Ymax)
samples : float, int
Number of samples (int) or fraction of coords to sample (float < 1).
ratio_subsamp:
Ratio of samples drawn within each subsample.
runs : int
Number of subsamplings based on a random center point
ncores : int
Number of cores to use in multiprocessing for the subsamplings.
exp_increase_fac : float
Multiplicative factor of increasing radius for ring subsets
center_radius: float
Radius of center subset, overrides other sampling parameters.
equidistant_radii: list
List of radii of ring subset, overrides other sampling parameters.
dist_metric : str
Distance metric names as used by scipy.spatial.distance.pdist
max_dist : float
Maximum distance between points after which the distance
is considered infinite and not calculated.
verbose : bool
Whether to print statements in the console

rnd : numpy.random.RandomState, int
Random state to use for the sampling.
"""

if dist_metric != "euclidean":
raise ValueError((
"A RasterEquidistantMetricSpace class can only be constructed "
"for an euclidean space"
))

self.coords = coords.copy()
self.dist_metric = dist_metric
self.shape = shape
self.extent = extent
self.res = np.mean([(extent[1] - extent[0])/(shape[0]-1),(extent[3] - extent[2])/(shape[1]-1)])

# if the maximum distance is not specified, find the maximum possible distance from the extent
if max_dist is None:
max_dist = np.sqrt((extent[1] - extent[0])**2 + (extent[3] - extent[2])**2)
self.max_dist = max_dist

self.samples = samples

if runs is None:
# If None is provided, try to sample center samples for about one percent of the area
runs = int((self.shape[0] * self.shape[1]) / self.samples * 1/100.)
self.runs = runs

self.ncores = ncores

if rnd is None:
self.rnd = np.random.default_rng()
elif isinstance(rnd, np.random.RandomState):
self.rnd = rnd
else:
self.rnd = np.random.RandomState(np.random.MT19937(np.random.SeedSequence(rnd)))

# Radius of center subsample, based on sample count
# If None is provided, the disk is defined with the exact size to hold the number of percentage of samples
# defined by the user
if center_radius is None:
center_radius = np.sqrt(1. / ratio_subsamp * self.sample_count / np.pi) * self.res
if verbose:
print('Radius of center disk sample for sample count of '+str(self.sample_count)+ ' and subsampling ratio'
' of '+str(ratio_subsamp)+': '+str(center_radius))
self._center_radius = center_radius

# Radii of equidistant ring subsamples
# If None is provided, the rings are defined with exponentially increasing radii with a factor sqrt(2), which
# means each ring will have just enough area to sample at least the number of samples desired, and same
# for each of the following, due to:
# (sqrt(2)R)**2 - R**2 = R**2
if equidistant_radii is None:
equidistant_radii = [0.]
increasing_rad = self._center_radius
while increasing_rad < self.max_dist:
equidistant_radii.append(increasing_rad)
increasing_rad *= exp_increase_fac
equidistant_radii.append(self.max_dist)
if verbose:
print('Radii of equidistant ring samples for increasing factor of ' + str(exp_increase_fac) + ': ')
print(equidistant_radii)
self._equidistant_radii = equidistant_radii

self.verbose = verbose

# Index and KDTree of center sample
self._cidx = None
self._ctree = None

# Index and KDTree of equidistant sample
self._eqidx = None
self._eqtree = None

self._centers = None
self._dists = None
# Do a very quick check to see throw exceptions
# if self.dist_metric is invalid...
pdist(self.coords[:1, :], metric=self.dist_metric)

@property
def sample_count(self):
if isinstance(self.samples, int):
return self.samples
return int(self.samples * len(self.coords))

@property
def cidx(self):
"""The sampled indices into `self.coords` for the center sample."""

return self._cidx

@property
def ctree(self):
"""If `self.dist_metric` is `euclidean`, a `scipy.spatial.cKDTree`
instance of the center sample of `self.coords`. Undefined otherwise."""

# only Euclidean supported
if self.dist_metric != "euclidean":
raise ValueError((
"A coordinate tree can only be constructed "
"for an euclidean space"
))

if self._ctree is None:
self._ctree = [cKDTree(self.coords[self.cidx[i], :]) for i in range(len(self.cidx))]
return self._ctree


@property
def eqidx(self):
"""The sampled indices into `self.coords` for the equidistant sample."""

return self._eqidx

@property
def eqtree(self):
"""If `self.dist_metric` is `euclidean`, a `scipy.spatial.cKDTree`
instance of the equidistant sample of `self.coords`. Undefined otherwise."""

# only Euclidean supported

if self._eqtree is None:
self._eqtree = [cKDTree(self.coords[self.eqidx[i], :]) for i in range(len(self.eqidx))]
return self._eqtree

@property
def dists(self):
"""A distance matrix of the sampled point pairs as a
`scipy.sparse.csr_matrix` sparse matrix. """


# Derive distances
if self._dists is None:

idx_center = self.rnd.choice(len(self.coords), size=self.runs, replace=False)

# Each run has a different center
centers = self.coords[idx_center]

# Running on a single core: for loop
if self.ncores == 1:

list_dists, list_cidx, list_eqidx = ([] for i in range(3))

for i in range(self.runs):

center = centers[i]
dists, cidx, eqidx = get_idx_dists(self.coords, center=center, center_radius=self._center_radius,
equidistant_radii=self._equidistant_radii, rnd_func=self.rnd,
sample_count=self.sample_count, max_dist=self.max_dist, i=i,
imax=self.runs, verbose=self.verbose)
list_dists.append(dists)
list_cidx.append(cidx)
list_eqidx.append(eqidx)

# Running on several cores: multiprocessing
else:
# Arguments to pass: only centers and loop index for verbose are changing
argsin = [{'center': centers[i], 'coords': self.coords, 'center_radius': self._center_radius,
'equidistant_radii': self._equidistant_radii, 'rnd_func': self.rnd,
'sample_count': self.sample_count, 'max_dist': self.max_dist, 'i': i, 'imax': self.runs,
'verbose': self.verbose} for i in range(self.runs)]

# Process in parallel
pool = mp.Pool(self.ncores, maxtasksperchild=1)
outputs = pool.map(mp_wrapper_get_idx_dists, argsin)
pool.close()
pool.join()

# Get lists of outputs
list_dists, list_cidx, list_eqidx = list(zip(*outputs))

# Define class objects
self._centers = centers
self._cidx = list_cidx
self._eqidx = list_eqidx

# concatenate the coo matrixes
d = np.concatenate(list_dists)
c = np.concatenate(list_cidx)
eq = np.concatenate(list_eqidx)

# remove possible duplicates (that would be summed by default)
# from https://stackoverflow.com/questions/28677162/ignoring-duplicate-entries-in-sparse-matrix

# Stable solution but a bit slow
# c, eq, d = zip(*set(zip(c, eq, d)))
# dists = sparse.csr_matrix((d, (c, eq)), shape=(len(self.coords), len(self.coords)))

# Solution 5+ times faster than the preceding, but relies on _update() which might change in scipy (which
# only has an implemented method for summing duplicates, and not ignoring them yet)
dok = sparse.dok_matrix((len(self.coords), len(self.coords)))
dok._update(zip(zip(c, eq), d))
dists = dok.tocsr()

self._dists = dists

return self._dists
2 changes: 1 addition & 1 deletion skgstat/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from .DirectionalVariogram import DirectionalVariogram
from .SpaceTimeVariogram import SpaceTimeVariogram
from .Kriging import OrdinaryKriging
from .MetricSpace import MetricSpace, MetricSpacePair
from .MetricSpace import MetricSpace, MetricSpacePair, ProbabalisticMetricSpace, RasterEquidistantMetricSpace
from . import interfaces
from . import data
from . import util
Expand Down
Loading