From 7668fdc2d97a1647d8569c3945ce13b54fdbd13c Mon Sep 17 00:00:00 2001 From: "H. Thomson Comer" Date: Tue, 4 Apr 2023 09:14:10 -0500 Subject: [PATCH] Updated binpred architecture (#1009) The last couple of PRs were very brittle w.r.t updating their functionality. This PR eliminates some of that brittleness by adding object inheritance for every combination of features for every binary predicate. Class choice is handled by type dispatching on the input columns. The operation stack (`_preprocess`, `_op`, and `_postprocess`) is now chained to make debugging easier. Instead of calling the operations in sequence, they call one another. Refactoring the existing modules to use the new architecture was trivial. I'm still interested in modifying the architecture so that the set operations that determine the final result are identifiable, and possibly composable. Authors: - H. Thomson Comer (https://github.com/thomcom) Approvers: - Michael Wang (https://github.com/isVoid) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cuspatial/pull/1009 --- .../cuspatial/core/_column/geocolumn.py | 13 +- .../core/binpreds/binpred_dispatch.py | 31 + .../core/binpreds/binpred_interface.py | 443 ++++++++++ .../cuspatial/core/binpreds/binpreds.py | 782 ------------------ .../core/binpreds/feature_contains.py | 367 ++++++++ .../cuspatial/core/binpreds/feature_covers.py | 52 ++ .../core/binpreds/feature_crosses.py | 46 ++ .../cuspatial/core/binpreds/feature_equals.py | 341 ++++++++ .../core/binpreds/feature_intersects.py | 54 ++ .../core/binpreds/feature_overlaps.py | 75 ++ .../cuspatial/core/binpreds/feature_within.py | 82 ++ python/cuspatial/cuspatial/core/geoseries.py | 99 ++- .../tests/binpreds/test_binpred_internals.py | 25 +- .../binpreds/test_equals_only_binpreds.py | 48 +- .../cuspatial/utils/binpred_utils.py | 56 ++ 15 files changed, 1642 insertions(+), 872 deletions(-) create mode 100644 python/cuspatial/cuspatial/core/binpreds/binpred_dispatch.py create mode 100644 python/cuspatial/cuspatial/core/binpreds/binpred_interface.py delete mode 100644 python/cuspatial/cuspatial/core/binpreds/binpreds.py create mode 100644 python/cuspatial/cuspatial/core/binpreds/feature_contains.py create mode 100644 python/cuspatial/cuspatial/core/binpreds/feature_covers.py create mode 100644 python/cuspatial/cuspatial/core/binpreds/feature_crosses.py create mode 100644 python/cuspatial/cuspatial/core/binpreds/feature_equals.py create mode 100644 python/cuspatial/cuspatial/core/binpreds/feature_intersects.py create mode 100644 python/cuspatial/cuspatial/core/binpreds/feature_overlaps.py create mode 100644 python/cuspatial/cuspatial/core/binpreds/feature_within.py create mode 100644 python/cuspatial/cuspatial/utils/binpred_utils.py diff --git a/python/cuspatial/cuspatial/core/_column/geocolumn.py b/python/cuspatial/cuspatial/core/_column/geocolumn.py index 67b896959..01f51f03b 100644 --- a/python/cuspatial/cuspatial/core/_column/geocolumn.py +++ b/python/cuspatial/cuspatial/core/_column/geocolumn.py @@ -1,4 +1,6 @@ -# Copyright (c) 2021-2022 NVIDIA CORPORATION +# Copyright (c) 2021-2023 NVIDIA CORPORATION + +from enum import Enum from functools import cached_property from typing import Tuple, TypeVar @@ -11,6 +13,15 @@ from cuspatial.core._column.geometa import Feature_Enum, GeoMeta from cuspatial.utils.column_utils import empty_geometry_column + +class ColumnType(Enum): + MIXED = 0 + POINT = 1 + MULTIPOINT = 2 + LINESTRING = 3 + POLYGON = 4 + + T = TypeVar("T", bound="GeoColumn") diff --git a/python/cuspatial/cuspatial/core/binpreds/binpred_dispatch.py b/python/cuspatial/cuspatial/core/binpreds/binpred_dispatch.py new file mode 100644 index 000000000..5a53abaca --- /dev/null +++ b/python/cuspatial/cuspatial/core/binpreds/binpred_dispatch.py @@ -0,0 +1,31 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +"""`binpred_dispatch.py` contains a collection of dictionaries that +are used to dispatch binary predicate functions to the correct +implementation. + +The dictionaries are collected here to make using the dispatch +functionality easier. +""" + +from cuspatial.core.binpreds.feature_contains import ( # NOQA F401 + DispatchDict as CONTAINS_DISPATCH, +) +from cuspatial.core.binpreds.feature_covers import ( # NOQA F401 + DispatchDict as COVERS_DISPATCH, +) +from cuspatial.core.binpreds.feature_crosses import ( # NOQA F401 + DispatchDict as CROSSES_DISPATCH, +) +from cuspatial.core.binpreds.feature_equals import ( # NOQA F401 + DispatchDict as EQUALS_DISPATCH, +) +from cuspatial.core.binpreds.feature_intersects import ( # NOQA F401 + DispatchDict as INTERSECTS_DISPATCH, +) +from cuspatial.core.binpreds.feature_overlaps import ( # NOQA F401 + DispatchDict as OVERLAPS_DISPATCH, +) +from cuspatial.core.binpreds.feature_within import ( # NOQA F401 + DispatchDict as WITHIN_DISPATCH, +) diff --git a/python/cuspatial/cuspatial/core/binpreds/binpred_interface.py b/python/cuspatial/cuspatial/core/binpreds/binpred_interface.py new file mode 100644 index 000000000..35db8c827 --- /dev/null +++ b/python/cuspatial/cuspatial/core/binpreds/binpred_interface.py @@ -0,0 +1,443 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. + +from typing import TYPE_CHECKING + +from cudf import Series + +if TYPE_CHECKING: + from cuspatial.core.geoseries import GeoSeries + + +class BinPredConfig: + """Configuration for a binary predicate. + + Parameters + ---------- + align : bool + Whether to align the left-hand and right-hand GeoSeries before + computing the binary predicate. Defaults to True. + allpairs : bool + Whether to compute the binary predicate between all pairs of + features in the left-hand and right-hand GeoSeries. Defaults to + False. Only available with the contains predicate. + """ + + def __init__(self, **kwargs): + self.align = kwargs.get("align", True) + + def __repr__(self): + return f"BinpredConfig(align={self.align}, allpairs={self.allpairs})" + + def __str__(self): + return self.__repr__() + + +class PreprocessorResult: + """Result of a binary predicate preprocessor. The following classes + are used to give all implementors of `BinaryItf` a common interface + for preprocessor results. + + Parameters + ---------- + lhs : GeoSeries + The left-hand GeoSeries. + rhs : GeoSeries + The right-hand GeoSeries. + final_rhs : GeoSeries + The rhs GeoSeries, if modified by the preprocessor. For example + the contains preprocessor converts any complex feature type into + a collection of points. + point_indices : cudf.Series + A cudf.Series of indices that map each point in `points` to its + corresponding feature in the right-hand GeoSeries. + """ + + def __init__( + self, + lhs: "GeoSeries", + rhs: "GeoSeries", + final_rhs: "GeoSeries" = None, + point_indices: Series = None, + ): + self.lhs = lhs + self.rhs = rhs + self.final_rhs = final_rhs + self.point_indices = point_indices + + def __repr__(self): + return f"PreprocessorResult(lhs={self.lhs}, rhs={self.rhs}, \ + points={self.points}, point_indices={self.point_indices})" + + def __str__(self): + return self.__repr__() + + +class OpResult: + """Result of a binary predicate operation.""" + + pass + + +class ContainsOpResult(OpResult): + """Result of a Contains binary predicate operation. + + Parameters + ---------- + result : cudf.DataFrame + A cudf.DataFrame containing two columns: "polygon_index" and + Point_index". The "polygon_index" column contains the index of + the polygon that contains each point. The "point_index" column + contains the index of each point that is contained by a polygon. + points : GeoSeries + A GeoSeries of points. + point_indices : cudf.Series + A cudf.Series of indices that map each point in `points` to its + corresponding feature in the right-hand GeoSeries. + """ + + def __init__( + self, + result: Series, + points: "GeoSeries" = None, + point_indices: Series = None, + ): + self.result = result + self.points = points + self.point_indices = point_indices + + def __repr__(self): + return f"OpResult(result={self.result}, points={self.points}, \ + point_indices={self.point_indices})" + + def __str__(self): + return self.__repr__() + + +class EqualsOpResult(OpResult): + """Result of an Equals binary predicate operation. + + Parameters + ---------- + result : cudf.Series + A cudf.Series of boolean values indicating whether each feature in + the right-hand GeoSeries is equal to the point in the left-hand + GeoSeries. + point_indices: cudf.Series + A cudf.Series of indices that map each point in `points` to its + corresponding feature in the right-hand GeoSeries. + """ + + def __init__(self, result: Series, point_indices: Series): + self.result = result + self.point_indices = point_indices + + def __repr__(self): + return f"OpResult(result={self.result}) \ + point_indices={self.point_indices}" + + def __str__(self): + return self.__repr__() + + +class BinPred: + """Base class for binary predicates. This class is an abstract base class + and can not be instantiated directly. `BinPred` is the base class that + implements this interface in the most general case. Child classes exist + for each combination of left-hand and right-hand GeoSeries types and binary + predicates. For example, a `PointPointContains` predicate is a child class + of `BinPred` that implements the `contains_properly` predicate for two + `Point` GeoSeries. These classes are found in the `feature_` + files found in this directory. + + Notes + ----- + BinPred classes are selected using the appropriate dispatch function. For + example, the `contains_properly` predicate is selected using the + `CONTAINS_DISPATCH` dispatch function from `binpred_dispatch.py`. The + dispatch function selects the appropriate BinPred class based on the + left-hand and right-hand GeoSeries types. + + This enables customized behavior for each combination of left-hand and + right-hand GeoSeries types and binary predicates. For example, the + `contains_properly` predicate for two `Point` GeoSeries is implemented + using a `PointPointContains` BinPred class. The `contains_properly` + predicate for a `Point` GeoSeries and a `Polygon` GeoSeries is implemented + using a `PointPolygonContains` BinPred class. Most subclasses will be able + to use all or 2/3rds of the methods defined in the `RootContains(BinPred) + class`. + + The `RootContains` class implements the `contains_properly` predicate for + the most common combination of left-hand and right-hand GeoSeries types. + The `RootContains` class can be used as a template for implementing the + `contains_properly` predicate for other combinations of left-hand and + right-hand GeoSeries types. + + Examples + -------- + >>> from cuspatial.core.binpreds.binpred_dispatch import CONTAINS_DISPATCH + >>> from cuspatial.core.geoseries import GeoSeries + >>> from shapely.geometry import Point, Polygon + >>> predicate = CONTAINS_DISPATCH[( + ... lhs.column_type, rhs.column_type + ... )](align=True, allpairs=False) + >>> lhs = GeoSeries([Polygon([(0, 0), (1, 1), (1, 0)])]) + >>> rhs = GeoSeries([Point(0, 0), Point(1, 1)]) + >>> print(predicate(lhs, rhs)) + 0 False + dtype: bool + """ + + def __init__(self, **kwargs): + """Initialize a binary predicate. Collects any arguments passed + to the binary predicate to be used at runtime. + + This class stores the config object that can be passed to the binary + predicate at runtime. The lhs and rhs are set at runtime using the + __call__ method so that the same binary predicate can be used for + multiple left-hand and right-hand GeoSeries. + + Parameters + ---------- + **kwargs + Any additional arguments to be used at runtime. + + Attributes + ---------- + kwargs : dict + Any additional arguments to be used at runtime. + + Methods + ------- + __call__(self, lhs, rhs) + System call for the binary predicate. Calls the _call method, which + is implemented by the subclass. + _call(self, lhs, rhs) + Call the binary predicate. This method is implemented by the + subclass. + _preprocess(self, lhs, rhs) + Preprocess the left-hand and right-hand GeoSeries. This method is + implemented by the subclass. + _compute_predicate(self, lhs, rhs) + Compute the binary predicate between two GeoSeries. This method is + implemented by the subclass. + _postprocess(self, lhs, rhs, point_indices, op_result) + Postprocess the output GeoSeries to ensure that they are of the + correct type for the predicate. This method is implemented by the + subclass. + + Examples + -------- + >>> from cuspatial.core.binpreds.binpred_dispatch import ( + ... CONTAINS_DISPATCH + ... ) + >>> from cuspatial.core.geoseries import GeoSeries + >>> from shapely.geometry import Point, Polygon + >>> predicate = CONTAINS_DISPATCH[( + ... lhs.column_type, rhs.column_type + ... )](align=True, allpairs=False) + >>> lhs = GeoSeries([Polygon([(0, 0), (1, 1), (1, 0)])]) + >>> rhs = GeoSeries([Point(0, 0), Point(1, 1)]) + >>> print(predicate(lhs, rhs)) + 0 False + dtype: bool + """ + self.kwargs = kwargs + + def __call__(self, lhs: "GeoSeries", rhs: "GeoSeries") -> Series: + """System call for the binary predicate. Calls the _call method, + which is implemented by the subclass. Executing the binary predicate + returns the results of the binary predicate as a GeoSeries. + + Parameters + ---------- + lhs : GeoSeries + The left-hand GeoSeries. + rhs : GeoSeries + The right-hand GeoSeries. + + Returns + ------- + result : Series + The results of the binary predicate. + + Examples + -------- + >>> from cuspatial.core.binpreds.binpred_dispatch import ( + ... CONTAINS_DISPATCH + ... ) + >>> from cuspatial.core.geoseries import GeoSeries + >>> from shapely.geometry import Point, Polygon + >>> predicate = CONTAINS_DISPATCH[( + ... lhs.column_type, rhs.column_type + ... )](align=True, allpairs=False) + >>> lhs = GeoSeries([Polygon([(0, 0), (1, 1), (1, 0)])]) + >>> rhs = GeoSeries([Point(0, 0), Point(1, 1)]) + >>> print(predicate(lhs, rhs)) + 0 False + dtype: bool + """ + return self._call(lhs, rhs) + + def _call(self, lhs: "GeoSeries", rhs: "GeoSeries") -> Series: + """Call the binary predicate. This method is implemented by the + subclass. + + Parameters + ---------- + lhs : GeoSeries + The left-hand GeoSeries. + rhs : GeoSeries + The right-hand GeoSeries. + + Returns + ------- + result : Series + A cudf.Series of boolean values indicating whether each feature in + the right-hand GeoSeries satisfies the requirements of a binary + predicate with its corresponding feature in the left-hand + GeoSeries. + """ + return self._preprocess(lhs, rhs) + + def _preprocess(self, lhs: "GeoSeries", rhs: "GeoSeries") -> Series: + """Preprocess the left-hand and right-hand GeoSeries. This method + is implemented by the subclass. + + Preprocessing is used to ensure that the left-hand and right-hand + GeoSeries are of the correct type for each of the three basic + predicates: 'contains', 'intersects', and 'equals'. For example, + `contains` requires that the left-hand GeoSeries be polygons or + multipolygons and the right-hand GeoSeries be points or multipoints. + `intersects` requires that the left-hand GeoSeries be linestrings or + points and the right-hand GeoSeries be linestrings or points. + `equals` requires that the left-hand and right-hand GeoSeries be + points. + + Subclasses that implement `_preprocess` are responsible for calling + `_compute_predicate` to continue the execution of the binary predicate. + The last line of `_preprocess` as implemented by any subclass should be + + return self._compute_predicate(lhs, rhs, points, point_indices) + + Parameters + ---------- + lhs : GeoSeries + The original left-hand GeoSeries. + rhs : GeoSeries + The original right-hand GeoSeries. + + Returns + ------- + result : Series + A cudf.Series of boolean values indicating whether each feature in + the right-hand GeoSeries satisfies the requirements of a binary + predicate with its corresponding feature in the left-hand + GeoSeries. + """ + raise NotImplementedError + + def _compute_predicate( + self, + lhs: "GeoSeries", + rhs: "GeoSeries", + preprocessor_result: PreprocessorResult, + ) -> Series: + """Compute the binary predicate between two GeoSeries. This method + is implemented by the subclass. This method is called by `_preprocess` + to continue the execution of the binary predicate. `_compute_predicate` + is responsible for calling `_postprocess` to complete the execution of + the binary predicate. + + `compute_predicate` is used to compute the binary predicate, or + composition of binary predicates, between two GeoSeries. The left-hand + GeoSeries is considered the "base" GeoSeries and the right-hand + GeoSeries is considered the "other" GeoSeries. The binary predicate is + computed between each feature in the base GeoSeries and the other + GeoSeries. The result is a GeoSeries of boolean values indicating + whether each feature in the other GeoSeries satisfies the requirements + of a binary predicate with its corresponding feature in the base + GeoSeries. + + Subclasses that implement `_compute_predicate` are responsible for + calling `_postprocess` to complete the execution of the binary + predicate. The last line of `_compute_predicate` should be + + return self._postprocess( + lhs, + rhs, + OpResult(modified_rhs, point_indices) + ) + + where `modified_rhs` is a GeoSeries of points and `point_indices` is a + cudf.Series of indices that map each point in `points` to its + corresponding feature in the right-hand GeoSeries. + + Parameters + ---------- + lhs : GeoSeries + The left-hand GeoSeries. + rhs : GeoSeries + The right-hand GeoSeries. + preprocessor_result : PreprocessorResult + The result of the preprocessing step. + """ + raise NotImplementedError + + def _postprocess( + self, + lhs: "GeoSeries", + rhs: "GeoSeries", + op_result: OpResult, + ) -> Series: + """Postprocess the output GeoSeries to ensure that they are of the + correct return type for the predicate. This method is implemented by + the subclass. + + Postprocessing is used to convert the results of the + `_compute_predicate` call into countable values. This step converts the + results of one of the three binary predicates `contains`, `intersects`, + or `equals` into a `Series` of boolean values. When the `rhs` is a + non-point type, `_postprocess` is responsible for aggregating the + results of the `_compute_predicate` call into a single boolean value + for each feature in the `lhs`. + + Parameters + ---------- + lhs : GeoSeries + The left-hand GeoSeries. + rhs : GeoSeries + The right-hand GeoSeries. + op_result : cudf.Series + The result of the `_compute_predicate` call. + + Returns + ------- + result : Series + A Series of boolean values indicating whether each feature in + the right-hand GeoSeries satisfies the requirements of a binary + predicate with its corresponding feature in the left-hand + GeoSeries. + + Notes + ----- + Arithmetic rules incorporated into `_postprocess` classes: + + (a, b) -> a contains b iff for all points p in b, p is in a + (a, b) -> a intersects b iff for any point p in b, p is in a + + I'm currently looking into refactoring these arithmetics into a + syntax that more closely resembles it. + """ + raise NotImplementedError + + +class NotImplementedPredicate(BinPred): + """A class that is used to raise an error when a binary predicate is + not implemented for a given combination of left-hand and right-hand + GeoSeries types. This is useful for delineating which binary predicates + are implemented for which GeoSeries types in their appropriate + `DispatchDict`. + """ + + def __init__(self, *args, **kwargs): + raise NotImplementedError diff --git a/python/cuspatial/cuspatial/core/binpreds/binpreds.py b/python/cuspatial/cuspatial/core/binpreds/binpreds.py deleted file mode 100644 index 3e6c1654e..000000000 --- a/python/cuspatial/cuspatial/core/binpreds/binpreds.py +++ /dev/null @@ -1,782 +0,0 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. - -from abc import ABC, abstractmethod - -import cupy as cp - -import cudf - -import cuspatial -from cuspatial.core._column.geocolumn import GeoColumn -from cuspatial.core.binpreds.contains import contains_properly -from cuspatial.utils.column_utils import ( - contains_only_linestrings, - contains_only_multipoints, - contains_only_points, - contains_only_polygons, - has_multipolygons, - has_same_geometry, -) - - -class PreprocessorOutput: - """The output of the preprocess method of a binary predicate. - - This makes it possible to create a class that matches the necessary - signature of a geoseries.GeoColumnAccessor object. In some cases the - preprocessor may need to reorder the input data, in which case the - preprocessor will return a PreprocessorOutput object instead of a - GeoColumnAccessor.""" - - def __init__(self, coords, indices) -> None: - self.vertices = coords - self.indices = indices - - @property - def xy(self): - return self.vertices - - def point_indices(self): - return self.indices - - -class BinaryPredicate(ABC): - """BinaryPredicate is an abstract class that implements the binary - predicate algorithm. The binary predicate algorithm is used to compute - the relationship between two GeoSeries. - - The algorithm is implemented in three steps: `preprocess`, `_op`, and - `postprocess`. The `preprocess` step is used to ensure that the input - GeoSeries are of the correct type for the binary predicate. The - `_op` step is used to compute the relationship between the points - in the input GeoSeries. The `postprocess` step is used to compute - the relationship between the input GeoSeries. - """ - - @abstractmethod - def preprocess(self, lhs, rhs): - """Preprocess the input data for the binary predicate. This method - should be implemented by subclasses. Preprocess and postprocess are - used to implement the discrete math rules of the binary predicates. - - Notes - ----- - Should only be called once. - - Parameters - ---------- - op : str - The binary predicate to perform. - lhs : GeoSeries - The left-hand-side of the GeoSeries-level binary predicate. - rhs : GeoSeries - The right-hand-side of the GeoSeries-level binary predicate. - - Returns - ------- - GeoSeries - The left-hand-side of the internal binary predicate, may be - reordered. - GeoSeries - The right-hand-side of the internal binary predicate, may be - reordered. - """ - return (lhs, rhs, cudf.RangeIndex(len(rhs))) - - @abstractmethod - def postprocess( - self, point_indices: cudf.Series, point_result: cudf.Series - ) -> cudf.Series: - """Postprocess the output data for the binary predicate. This method - should be implemented by subclasses. - - Postprocess converts the raw results of the binary predicate into - the final result. This is where the discrete math rules are applied. - The binary predicate operation does not compute any relationships - between features in the input GeoSeries', it only computes the - relationship between the points in the input geometries. The - postprocess method uses the discrete math rules to compute the - relationship between the input geometries. - - Parameters - ---------- - op : str - The binary predicate to post process. Determines for example the - set predicate to use for computing the result. - point_indices : cudf.Series - The indices of the points in the original GeoSeries. - point_result : cudf.Series - The raw result of the binary predicate. - - Returns - ------- - cudf.Series - The output of the post processing, True/False results for - the specified binary op. - """ - pass - - def __init__(self, lhs, rhs, align=True): - """Compute the binary predicate `op` on `lhs` and `rhs`. - - There are ten binary predicates supported by cuspatial: - - `.equals` - - `.disjoint` - - `.touches` - - `.contains` - - `.contains_properly` - - `.covers` - - `.intersects` - - `.within` - - `.crosses` - - `.overlaps` - - There are thirty-six ordering combinations of `lhs` and `rhs`, the - unordered pairs of each `point`, `multipoint`, `linestring`, - `multilinestring`, `polygon`, and `multipolygon`. The ordering of - `lhs` and `rhs` is important because the result of the binary - predicate is not symmetric. For example, `A.contains(B)` is not - the same as `B.contains(A)`. - - Parameters - ---------- - op : str - The binary predicate to perform. - lhs : GeoSeries - The left-hand-side of the binary predicate. - rhs : GeoSeries - The right-hand-side of the binary predicate. - align : bool - If True, align the indices of `lhs` and `rhs` before performing - the binary predicate. If False, `lhs` and `rhs` must have the - same index. - - Returns - ------- - GeoSeries - A GeoSeries containing the result of the binary predicate. - """ - (self.lhs, self.rhs) = lhs.align(rhs) if align else (lhs, rhs) - self.align = align - - def _cancel_op(self, lhs, rhs, result): - """Used to disable computation of the binary predicate. - - This occurs when the binary predicate is not supported for the - input types, and a final result can be computed only using - `preprocess` and `postprocess`.""" - self._op = lambda x, y: result - return (lhs, rhs, result) - - def __call__(self) -> cudf.Series: - """Return the result of the binary predicate.""" - # Type disambiguation - # Type disambiguation has a large effect on the decisions of the - # algorithm. - (lhs, rhs, indices) = self.preprocess(self.lhs, self.rhs) - - # Binpred call - point_result = self._op(lhs, rhs) - - # Postprocess: Apply discrete math rules to identify relationships. - return self.postprocess(indices, point_result) - - -class ContainsProperlyBinpred(BinaryPredicate): - def __init__(self, lhs, rhs, align=True, allpairs=False): - super().__init__(lhs, rhs, align=align) - self.allpairs = allpairs - if allpairs: - self.align = False - - def preprocess(self, lhs, rhs): - """Preprocess the input GeoSeries to ensure that they are of the - correct type for the predicate.""" - # RHS conditioning: - point_indices = None - # point in polygon - if contains_only_linestrings(rhs): - # condition for linestrings - geom = rhs.lines - elif contains_only_polygons(rhs) is True: - # polygon in polygon - geom = rhs.polygons - elif contains_only_multipoints(rhs) is True: - # mpoint in polygon - geom = rhs.multipoints - else: - # no conditioning is required - geom = rhs.points - xy_points = geom.xy - - # Arrange into shape for calling point-in-polygon, intersection, or - # equals - point_indices = geom.point_indices() - final_rhs = cuspatial.core.geoseries.GeoSeries( - GeoColumn._from_points_xy(xy_points._column) - ) - return (lhs, final_rhs, cudf.Series(point_indices)) - - def _should_use_quadtree(self): - """Determine if the quadtree should be used for the binary predicate. - - Returns - ------- - bool - True if the quadtree should be used, False otherwise. - - Notes - ----- - 1. Quadtree is always used if user requests `allpairs=True`. - 2. If the number of polygons in the lhs is less than 32, we use the - byte-limited algorithm because it is faster and has less memory - overhead. - 3. If the lhs contains more than 32 polygons, we use the quadtree - because it does not have a polygon-count limit. - 4. If the lhs contains multipolygons, we use quadtree because the - performance between quadtree and byte-limited is similar, but - code complexity would be higher if we did multipolygon - reconstruction on both code paths. - """ - return ( - len(self.lhs) >= 32 or has_multipolygons(self.lhs) or self.allpairs - ) - - def _op(self, lhs, points): - """Compute the contains_properly relationship between two GeoSeries. - A feature A contains another feature B if no points of B lie in the - exterior of A, and at least one point of the interior of B lies in the - interior of A. This is the inverse of `within`.""" - if not contains_only_polygons(lhs): - raise TypeError( - "`.contains` can only be called with polygon series." - ) - if self._should_use_quadtree(): - return contains_properly(lhs, points, how="quadtree") - else: - return contains_properly(lhs, points, how="byte-limited") - - def _convert_quadtree_result_from_part_to_polygon_indices( - self, point_result - ): - """Convert the result of a quadtree contains_properly call from - part indices to polygon indices. - - Parameters - ---------- - point_result : cudf.Series - The result of a quadtree contains_properly call. This result - contains the `part_index` of the polygon that contains the - point, not the polygon index. - - Returns - ------- - cudf.Series - The result of a quadtree contains_properly call. This result - contains the `polygon_index` of the polygon that contains the - point, not the part index. - """ - # Get the length of each part, map it to indices, and store - # the result in a dataframe. - if not contains_only_polygons(self.lhs): - raise TypeError( - "`.contains` can only be called with polygon series." - ) - rings_to_parts = cp.array(self.lhs.polygons.part_offset) - part_sizes = rings_to_parts[1:] - rings_to_parts[:-1] - parts_map = cudf.Series( - cp.arange(len(part_sizes)), name="part_index" - ).repeat(part_sizes) - parts_index_mapping_df = parts_map.reset_index(drop=True).reset_index() - # Map the length of each polygon in a similar fashion, then - # join them below. - parts_to_geoms = cp.array(self.lhs.polygons.geometry_offset) - geometry_sizes = parts_to_geoms[1:] - parts_to_geoms[:-1] - geometry_map = cudf.Series( - cp.arange(len(geometry_sizes)), name="polygon_index" - ).repeat(geometry_sizes) - geom_index_mapping_df = geometry_map.reset_index(drop=True) - geom_index_mapping_df.index.name = "part_index" - geom_index_mapping_df = geom_index_mapping_df.reset_index() - # Replace the part index with the polygon index by join - part_result = parts_index_mapping_df.merge( - point_result, on="part_index" - ) - # Replace the polygon index with the row index by join - return geom_index_mapping_df.merge(part_result, on="part_index")[ - ["polygon_index", "point_index"] - ] - - def _count_results_in_multipoint_geometries( - self, point_indices, point_result - ): - """Count the number of points in each multipoint geometry. - - Parameters - ---------- - point_indices : cudf.Series - The indices of the points in the original (rhs) GeoSeries. - point_result : cudf.DataFrame - The result of a contains_properly call. - - Returns - ------- - cudf.Series - The number of points that fell within a particular polygon id. - cudf.Series - The number of points in each multipoint geometry. - """ - point_indices_df = cudf.Series( - point_indices, - name="rhs_index", - index=cudf.RangeIndex(len(point_indices), name="point_index"), - ).reset_index() - with_rhs_indices = point_result.merge( - point_indices_df, on="point_index" - ) - points_grouped_by_original_polygon = with_rhs_indices[ - ["point_index", "rhs_index"] - ].drop_duplicates() - hits = ( - points_grouped_by_original_polygon.groupby("rhs_index") - .count() - .sort_index() - ) - expected_count = ( - point_indices_df.groupby("rhs_index").count().sort_index() - ) - return hits, expected_count - - def _postprocess_quadtree_result(self, point_indices, point_result): - if len(point_result) == 0: - return cudf.Series([False] * len(self.lhs)) - - # Convert the quadtree part indices df into a polygon indices df - polygon_indices = ( - self._convert_quadtree_result_from_part_to_polygon_indices( - point_result - ) - ) - # Because the quadtree contains_properly call returns a list of - # points that are contained in each part, parts can be duplicated - # once their index is converted to a polygon index. - allpairs_result = polygon_indices.drop_duplicates() - - # Replace the polygon index with the original index - allpairs_result["polygon_index"] = allpairs_result[ - "polygon_index" - ].replace( - cudf.Series(self.lhs.index, index=cp.arange(len(self.lhs.index))) - ) - - # If the user wants all pairs, return the result. Otherwise, - # return a boolean series indicating whether each point is - # contained in the corresponding polygon. - if self.allpairs: - return allpairs_result - else: - # for each input pair i: result[i] =  true iff point[i] is - # contained in at least one polygon of multipolygon[i]. - if ( - contains_only_linestrings(self.rhs) - or contains_only_polygons(self.rhs) - or contains_only_multipoints(self.rhs) - ): - ( - hits, - expected_count, - ) = self._count_results_in_multipoint_geometries( - point_indices, allpairs_result - ) - result_df = hits.reset_index().merge( - expected_count.reset_index(), on="rhs_index" - ) - result_df["feature_in_polygon"] = ( - result_df["point_index_x"] >= result_df["point_index_y"] - ) - final_result = cudf.Series( - [False] * (point_indices.max().item() + 1) - ) # point_indices is zero index - final_result.loc[ - result_df["rhs_index"][result_df["feature_in_polygon"]] - ] = True - return final_result - else: - # pairwise - if len(self.lhs) == len(self.rhs): - matches = ( - allpairs_result["polygon_index"] - == allpairs_result["point_index"] - ) - final_result = cudf.Series([False] * len(point_indices)) - final_result.loc[ - allpairs_result["polygon_index"][matches] - ] = True - return final_result - else: - final_result = cudf.Series([False] * len(point_indices)) - final_result.loc[allpairs_result["polygon_index"]] = True - return final_result - - def _postprocess_brute_force_result(self, point_indices, point_result): - # If there are 31 or fewer polygons in the input, the result - # is a dataframe with one row per point and one column per - # polygon. - - # Result can be: - # A Dataframe of booleans with n_points rows and up to 31 columns. - if ( - contains_only_linestrings(self.rhs) - or contains_only_polygons(self.rhs) - or contains_only_multipoints(self.rhs) - ): - # Compute the set of results for each point-in-polygon predicate. - # Group them by the original index, and sum the results. If the - # sum of points in the rhs feature is equal to the number of - # points found in the polygon, then the polygon contains the - # feature. - point_result["idx"] = point_indices - group_result = ( - point_result.groupby("idx").sum().sort_index() - == point_result.groupby("idx").count().sort_index() - ) - else: - group_result = point_result - - # If there is only one column, the result is a series with - # one row per point. If it is a dataframe, the result needs - # to be converted from each matching row/column value to a - # series using `cp.diag`. - boolean_series_output = cudf.Series([False] * len(self.lhs)) - boolean_series_output.name = None - if len(point_result.columns) > 1: - boolean_series_output[group_result.index] = cp.diag( - group_result.values - ) - else: - boolean_series_output[group_result.index] = group_result[ - group_result.columns[0] - ] - return boolean_series_output - - def postprocess(self, point_indices, point_result): - """Postprocess the output GeoSeries to ensure that they are of the - correct type for the predicate. - - Postprocess for contains_properly has to handle multiple input and - output configurations. - - The input can be a single polygon, a single multipolygon, or a - GeoSeries containing a mix of polygons and multipolygons. - - The input to postprocess is `point_indices`, which can be either a - cudf.DataFrame with one row per point and one column per polygon or - a cudf.DataFrame containing the point index and the part index for - each point in the polygon. - """ - return self._postprocess_quadtree_result(point_indices, point_result) - - -class OverlapsBinpred(ContainsProperlyBinpred): - def postprocess(self, point_indices, point_result): - # Same as contains_properly, but we need to check that the - # dimensions are the same. - # TODO: Maybe change this to intersection - if not has_same_geometry(self.lhs, self.rhs): - return cudf.Series([False] * len(self.lhs)) - if len(point_result) == 0: - return cudf.Series([False] * len(self.lhs)) - polygon_indices = ( - self._convert_quadtree_result_from_part_to_polygon_indices( - point_result - ) - ) - group_counts = polygon_indices.groupby("polygon_index").count() - point_counts = ( - cudf.DataFrame( - {"point_indices": point_indices, "input_size": True} - ) - .groupby("point_indices") - .count() - ) - result = (group_counts["point_index"] > 0) & ( - group_counts["point_index"] < point_counts["input_size"] - ) - return result - - -class IntersectsBinpred(ContainsProperlyBinpred): - def preprocess(self, lhs, rhs): - if contains_only_polygons(rhs): - (self.lhs, self.rhs) = (rhs, lhs) - return super().preprocess(rhs, lhs) - - def postprocess(self, point_indices, point_result): - # Same as contains_properly, but we need to check that the - # dimensions are the same. - contains_result = super().postprocess(point_indices, point_result) - return contains_result - - -class WithinBinpred(ContainsProperlyBinpred): - def preprocess(self, lhs, rhs): - if contains_only_polygons(rhs): - (self.lhs, self.rhs) = (rhs, lhs) - return super().preprocess(rhs, lhs) - - def postprocess(self, point_indices, point_result): - """Postprocess the output GeoSeries to ensure that they are of the - correct type for the predicate.""" - (hits, expected_count,) = self._count_results_in_multipoint_geometries( - point_indices, point_result - ) - result_df = hits.reset_index().merge( - expected_count.reset_index(), on="rhs_index" - ) - result_df["feature_in_polygon"] = ( - result_df["point_index_x"] >= result_df["point_index_y"] - ) - final_result = cudf.Series( - [False] * (point_indices.max().item() + 1) - ) # point_indices is zero index - final_result.loc[ - result_df["rhs_index"][result_df["feature_in_polygon"]] - ] = True - return final_result - - -class EqualsBinpred(BinaryPredicate): - def _offset_equals(self, lhs, rhs): - """Compute the pairwise length equality of two offset arrays""" - lhs_lengths = lhs[:-1] - lhs[1:] - rhs_lengths = rhs[:-1] - rhs[1:] - return lhs_lengths == rhs_lengths - - def _sort_interleaved_points_by_offset(self, coords, offsets, sort_order): - """Sort xy according to bins defined by offset. Sort order is a list - of column names to sort by. - - `_sort_interleaved_points_by_offset` creates a dataframe with the - following columns: - "sizes": an index for each object represented in `coords`. - "points": an index for each point in `coords`. - "xy_key": an index that maintains x/y ordering. - "xy": the x/y coordinates in `coords`. - - The dataframe is sorted according to keys passed in by the caller. - For sorting multipoints, the keys in order are "object_key", "xy", - "xy_key". This sorts the points in each multipoint into the same - bin defined by "object_key", then sorts the points in each bin by - x/y coordinates, and finally sorts the points in each bin by the - `xy_key` which maintains that the x coordinate precedes the y - coordinate. - - For sorting linestrings, the keys in order are "object_key", - "point_key", "xy_key". This sorts the points in each linestring - into the same bin defined by "object_key", then sorts the points - in each bin by point ordering, and finally sorts the points in - each bin by x/y ordering. - - Parameters - ---------- - coords : cudf.Series - interleaved x,y coordinates - offsets : cudf.Series - offsets into coords - sort_order : list - list of column names to sort by. One of "object_key", "point_key", - "xy_key", and "xy". - - Returns - ------- - cudf.Series - sorted interleaved x,y coordinates - """ - sizes = offsets[1:] - offsets[:-1] - object_key = ( - cudf.Series(cp.arange(len(sizes))) - .repeat(sizes * 2) - .reset_index(drop=True) - ) - point_key = cp.arange(len(coords) // 2).repeat(2)[::-1] - xy_key = cp.tile([0, 1], len(coords) // 2) - sorting_df = cudf.DataFrame( - { - "object_key": object_key, - "point_key": point_key, - "xy_key": xy_key, - "xy": coords, - } - ) - sorted_df = sorting_df.sort_values(by=sort_order).reset_index( - drop=True - ) - return sorted_df["xy"] - - def _sort_multipoint_series(self, coords, offsets): - """Sort xy according to bins defined by offset""" - result = self._sort_interleaved_points_by_offset( - coords, offsets, ["object_key", "xy", "xy_key"] - ) - result.name = None - return result - - def _sort_multipoints(self, lhs, rhs, initial): - lhs_sorted = self._sort_multipoint_series( - lhs.multipoints.xy, lhs.multipoints.geometry_offset - ) - rhs_sorted = self._sort_multipoint_series( - rhs.multipoints.xy, rhs.multipoints.geometry_offset - ) - lhs_result = cuspatial.core.geoseries.GeoSeries.from_multipoints_xy( - lhs_sorted, lhs.multipoints.geometry_offset - ) - rhs_result = cuspatial.core.geoseries.GeoSeries.from_multipoints_xy( - rhs_sorted, rhs.multipoints.geometry_offset - ) - lhs_result.index = lhs.index - rhs_result.index = rhs.index - return ( - lhs_result, - rhs_result, - initial, - ) - - def _reverse_linestrings(self, coords, offsets): - """Reverse the order of coordinates in a Arrow buffer of coordinates - and offsets.""" - result = self._sort_interleaved_points_by_offset( - coords, offsets, ["object_key", "point_key", "xy_key"] - ) - result.name = None - return result - - def _compare_linestrings_and_reversed(self, lhs, rhs, initial): - """Compare linestrings with their reversed counterparts.""" - lhs_xy = self._reverse_linestrings(lhs.xy, lhs.part_offset) - rhs_xy = self._reverse_linestrings(rhs.xy, rhs.part_offset) - return ( - PreprocessorOutput(lhs_xy, lhs.point_indices()), - PreprocessorOutput(rhs_xy, rhs.point_indices()), - initial, - ) - - def preprocess(self, lhs, rhs): - # Compare types - type_compare = lhs.feature_types == rhs.feature_types - # Any unmatched type is not equal - if (type_compare == False).all(): # noqa: E712 - # Override _op so that it will not be run. - return self._cancel_op(lhs, rhs, type_compare) - # Get indices of matching types - if contains_only_multipoints(lhs): - lengths_equal = self._offset_equals( - lhs.multipoints.geometry_offset, - rhs.multipoints.geometry_offset, - ) - if lengths_equal.any(): - # Multipoints are equal if they contains the - # same unordered points. - return self._sort_multipoints( - lhs[lengths_equal], - rhs[lengths_equal], - lengths_equal, - ) - else: - # No lengths are equal, so none can be equal. - return self._cancel_op(lhs, rhs, lengths_equal) - elif contains_only_linestrings(lhs): - lengths_equal = self._offset_equals( - lhs.lines.part_offset, rhs.lines.part_offset - ) - if lengths_equal.any(): - return ( - lhs[lengths_equal], - rhs[lengths_equal], - lengths_equal, - ) - else: - return self._cancel_op(lhs, rhs, lengths_equal) - elif contains_only_polygons(lhs): - raise NotImplementedError - elif contains_only_points(lhs): - return (lhs, rhs, type_compare) - - def postprocess(self, lengths_equal, point_result): - # if point_result is not a Series, preprocessing terminated - # the results early. - if isinstance(point_result, cudf.Series): - point_result = point_result.sort_index() - lengths_equal[point_result.index] = point_result - return cudf.Series(lengths_equal) - - def _vertices_equals(self, lhs, rhs): - """Compute the equals relationship between interleaved xy - coordinate buffers.""" - length = min(len(lhs), len(rhs)) - a = lhs[:length:2]._column == rhs[:length:2]._column - b = rhs[1:length:2]._column == lhs[1:length:2]._column - return a & b - - def _op(self, lhs, rhs): - if contains_only_linestrings(lhs): - # Linestrings can be compared either forward or - # reversed. We need to compare both directions. - lhs_reversed = self._reverse_linestrings( - lhs.lines.xy, lhs.lines.part_offset - ) - forward_result = self._vertices_equals(lhs.lines.xy, rhs.lines.xy) - reverse_result = self._vertices_equals(lhs_reversed, rhs.lines.xy) - result = forward_result | reverse_result - elif contains_only_multipoints(lhs): - result = self._vertices_equals( - lhs.multipoints.xy, rhs.multipoints.xy - ) - elif contains_only_points(lhs): - result = self._vertices_equals(lhs.points.xy, rhs.points.xy) - elif contains_only_polygons(lhs): - raise NotImplementedError - indices = lhs.point_indices - result_df = cudf.DataFrame( - {"idx": indices[: len(result)], "equals": result} - ) - gb_idx = result_df.groupby("idx") - result = (gb_idx.sum().sort_index() == gb_idx.count().sort_index())[ - "equals" - ] - result.index = lhs.index - result.index.name = None - result.name = None - return result - - -class CrossesBinpred(EqualsBinpred): - """An object is said to cross other if its interior intersects the - interior of the other but does not contain it, and the dimension of - the intersection is less than the dimension of the one or the other. - - This is only implemented for `points.crosses(points)` at this time. - """ - - def postprocess(self, point_indices, point_result): - if has_same_geometry(self.lhs, self.rhs) and contains_only_points( - self.lhs - ): - return cudf.Series([False] * len(self.lhs)) - df_result = cudf.DataFrame({"idx": point_indices, "pip": point_result}) - point_result = cudf.Series( - df_result["pip"], index=cudf.RangeIndex(0, len(df_result)) - ) - point_result.name = None - return point_result - - -class CoversBinpred(EqualsBinpred): - """An object A covers another B if no points of B lie in the exterior - of A, and at least one point of the interior of B lies in the interior - of A. - - This is only implemented for `points.covers(points)` at this time.""" - - def postprocess(self, point_indices, point_result): - return cudf.Series(point_result, index=point_indices) diff --git a/python/cuspatial/cuspatial/core/binpreds/feature_contains.py b/python/cuspatial/cuspatial/core/binpreds/feature_contains.py new file mode 100644 index 000000000..28e783c7d --- /dev/null +++ b/python/cuspatial/cuspatial/core/binpreds/feature_contains.py @@ -0,0 +1,367 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from typing import Generic, TypeVar, Union + +import cupy as cp + +import cudf +from cudf.core.dataframe import DataFrame +from cudf.core.series import Series + +from cuspatial.core._column.geocolumn import GeoColumn +from cuspatial.core.binpreds.binpred_interface import ( + BinPred, + BinPredConfig, + ContainsOpResult, + NotImplementedPredicate, + PreprocessorResult, +) +from cuspatial.core.binpreds.contains import contains_properly +from cuspatial.core.binpreds.feature_equals import ( + DispatchDict as EQUALS_DISPATCH_DICT, +) +from cuspatial.utils.binpred_utils import ( + LineString, + MultiPoint, + Point, + Polygon, + _count_results_in_multipoint_geometries, + _false_series, +) +from cuspatial.utils.column_utils import ( + contains_only_linestrings, + contains_only_multipoints, + contains_only_polygons, + has_multipolygons, +) + +GeoSeries = TypeVar("GeoSeries") + + +class ContainsPredicateBase(BinPred, Generic[GeoSeries]): + """Base class for binary predicates that are defined in terms of a + `contains` basic predicate. This class implements the logic that underlies + `polygon.contains` primarily, and is implemented for many cases. + + Subclasses are selected using the `DispatchDict` located at the end + of this file. + """ + + def __init__(self, **kwargs): + """`ContainsPredicateBase` constructor. + + Parameters + ---------- + allpairs: bool + Whether to compute all pairs of features in the left-hand and + right-hand GeoSeries. If False, the feature will be compared in a + 1:1 fashion with the corresponding feature in the other GeoSeries. + """ + self.config = BinPredConfig(**kwargs) + self.config.allpairs = kwargs.get("allpairs", False) + + def _preprocess(self, lhs, rhs): + """Flatten any rhs into only its points xy array. This is necessary + because the basic predicate for contains, point-in-polygon, + only accepts points. + + Parameters + ---------- + lhs : GeoSeries + The left-hand GeoSeries. + rhs : GeoSeries + The right-hand GeoSeries. + + Returns + ------- + result : GeoSeries + A GeoSeries of boolean values indicating whether each feature in + the right-hand GeoSeries satisfies the requirements of the point- + in-polygon basic predicate with its corresponding feature in the + left-hand GeoSeries. + """ + # RHS conditioning: + point_indices = None + # point in polygon + if contains_only_linestrings(rhs): + # condition for linestrings + geom = rhs.lines + elif contains_only_polygons(rhs) is True: + # polygon in polygon + geom = rhs.polygons + elif contains_only_multipoints(rhs) is True: + # mpoint in polygon + geom = rhs.multipoints + else: + # no conditioning is required + geom = rhs.points + xy_points = geom.xy + + # Arrange into shape for calling point-in-polygon, intersection, or + # equals + point_indices = geom.point_indices() + from cuspatial.core.geoseries import GeoSeries + + final_rhs = GeoSeries(GeoColumn._from_points_xy(xy_points._column)) + preprocess_result = PreprocessorResult( + lhs, rhs, final_rhs, point_indices + ) + return self._compute_predicate(lhs, rhs, preprocess_result) + + def _should_use_quadtree(self, lhs): + """Determine if the quadtree should be used for the binary predicate. + + Returns + ------- + bool + True if the quadtree should be used, False otherwise. + + Notes + ----- + 1. Quadtree is always used if user requests `allpairs=True`. + 2. If the number of polygons in the lhs is less than 32, we use the + byte-limited algorithm because it is faster and has less memory + overhead. + 3. If the lhs contains more than 32 polygons, we use the quadtree + because it does not have a polygon-count limit. + 4. If the lhs contains multipolygons, we use quadtree because the + performance between quadtree and byte-limited is similar, but + code complexity would be higher if we did multipolygon + reconstruction on both code paths. + """ + return len(lhs) >= 32 or has_multipolygons(lhs) or self.config.allpairs + + def _compute_predicate( + self, + lhs: "GeoSeries", + rhs: "GeoSeries", + preprocessor_result: PreprocessorResult, + ): + """Compute the contains_properly relationship between two GeoSeries. + A feature A contains another feature B if no points of B lie in the + exterior of A, and at least one point of the interior of B lies in the + interior of A. This is the inverse of `within`.""" + if not contains_only_polygons(lhs): + raise TypeError( + "`.contains` can only be called with polygon series." + ) + points = preprocessor_result.final_rhs + point_indices = preprocessor_result.point_indices + if self._should_use_quadtree(lhs): + result = contains_properly(lhs, points, how="quadtree") + else: + result = contains_properly(lhs, points, how="byte-limited") + op_result = ContainsOpResult(result, points, point_indices) + return self._postprocess(lhs, rhs, op_result) + + def _convert_quadtree_result_from_part_to_polygon_indices( + self, lhs, point_result + ): + """Convert the result of a quadtree contains_properly call from + part indices to polygon indices. + + Parameters + ---------- + point_result : cudf.Series + The result of a quadtree contains_properly call. This result + contains the `part_index` of the polygon that contains the + point, not the polygon index. + + Returns + ------- + cudf.Series + The result of a quadtree contains_properly call. This result + contains the `polygon_index` of the polygon that contains the + point, not the part index. + """ + # Get the length of each part, map it to indices, and store + # the result in a dataframe. + rings_to_parts = cp.array(lhs.polygons.part_offset) + part_sizes = rings_to_parts[1:] - rings_to_parts[:-1] + parts_map = cudf.Series( + cp.arange(len(part_sizes)), name="part_index" + ).repeat(part_sizes) + parts_index_mapping_df = parts_map.reset_index(drop=True).reset_index() + # Map the length of each polygon in a similar fashion, then + # join them below. + parts_to_geoms = cp.array(lhs.polygons.geometry_offset) + geometry_sizes = parts_to_geoms[1:] - parts_to_geoms[:-1] + geometry_map = cudf.Series( + cp.arange(len(geometry_sizes)), name="polygon_index" + ).repeat(geometry_sizes) + geom_index_mapping_df = geometry_map.reset_index(drop=True) + geom_index_mapping_df.index.name = "part_index" + geom_index_mapping_df = geom_index_mapping_df.reset_index() + # Replace the part index with the polygon index by join + part_result = parts_index_mapping_df.merge( + point_result, on="part_index" + ) + # Replace the polygon index with the row index by join + return geom_index_mapping_df.merge(part_result, on="part_index")[ + ["polygon_index", "point_index"] + ] + + def _reindex_allpairs(self, lhs, op_result) -> Union[Series, DataFrame]: + """Prepare the allpairs result of a contains_properly call as + the first step of postprocessing. + + Parameters + ---------- + lhs : GeoSeries + The left-hand side of the binary predicate. + op_result : ContainsOpResult + The result of the contains_properly call. + + Returns + ------- + cudf.DataFrame + + """ + # Convert the quadtree part indices df into a polygon indices df + polygon_indices = ( + self._convert_quadtree_result_from_part_to_polygon_indices( + lhs, op_result.result + ) + ) + # Because the quadtree contains_properly call returns a list of + # points that are contained in each part, parts can be duplicated + # once their index is converted to a polygon index. + allpairs_result = polygon_indices.drop_duplicates() + + # Replace the polygon index with the original index + allpairs_result["polygon_index"] = allpairs_result[ + "polygon_index" + ].replace(Series(lhs.index, index=cp.arange(len(lhs.index)))) + + return allpairs_result + + def _postprocess(self, lhs, rhs, op_result): + """Postprocess the output GeoSeries to ensure that they are of the + correct type for the predicate. + + Postprocess for contains_properly has to handle multiple input and + output configurations. + + The input can be a single polygon, a single multipolygon, or a + GeoSeries containing a mix of polygons and multipolygons. + + The input to postprocess is `point_indices`, which can be either a + cudf.DataFrame with one row per point and one column per polygon or + a cudf.DataFrame containing the point index and the part index for + each point in the polygon. + + Parameters + ---------- + lhs : GeoSeries + The left-hand side of the binary predicate. + rhs : GeoSeries + The right-hand side of the binary predicate. + preprocessor_output : ContainsOpResult + The result of the contains_properly call. + + Returns + ------- + cudf.Series or cudf.DataFrame + A Series of boolean values indicating whether each feature in + the rhs GeoSeries is contained in the lhs GeoSeries in the + case of allpairs=False. Otherwise, a DataFrame containing the + point index and the polygon index for each point in the + polygon. + """ + if len(op_result.result) == 0: + return _false_series(len(lhs)) + + # Convert the quadtree part indices df into a polygon indices df. + # Helps with handling multipolygons. + allpairs_result = self._reindex_allpairs(lhs, op_result) + + # If the user wants all pairs, return the result. Otherwise, + # return a boolean series indicating whether each point is + # contained in the corresponding polygon. + if self.config.allpairs: + return allpairs_result + else: + # for each input pair i: result[i] =  true iff point[i] is + # contained in at least one polygon of multipolygon[i]. + # pairwise + final_result = _false_series(len(rhs)) + if len(lhs) == len(rhs): + matches = ( + allpairs_result["polygon_index"] + == allpairs_result["point_index"] + ) + polygon_indexes = allpairs_result["polygon_index"][matches] + final_result.loc[ + op_result.point_indices[polygon_indexes] + ] = True + return final_result + else: + final_result.loc[allpairs_result["polygon_index"]] = True + return final_result + + +class PolygonComplexContains(ContainsPredicateBase): + """Base class for contains operations that use a complex object on + the right hand side. + + This class is shared by the Polygon*Contains classes that use + a non-points object on the right hand side: MultiPoint, LineString, + MultiLineString, Polygon, and MultiPolygon. + """ + + def _postprocess(self, lhs, rhs, preprocessor_output): + # for each input pair i: result[i] =  true iff point[i] is + # contained in at least one polygon of multipolygon[i]. + # pairwise + point_indices = preprocessor_output.point_indices + allpairs_result = self._reindex_allpairs(lhs, preprocessor_output) + if isinstance(allpairs_result, Series): + return allpairs_result + + (hits, expected_count,) = _count_results_in_multipoint_geometries( + point_indices, allpairs_result + ) + result_df = hits.reset_index().merge( + expected_count.reset_index(), on="rhs_index" + ) + result_df["feature_in_polygon"] = ( + result_df["point_index_x"] >= result_df["point_index_y"] + ) + final_result = _false_series(len(rhs)) + final_result.loc[ + result_df["rhs_index"][result_df["feature_in_polygon"]] + ] = True + return final_result + + +class PointPointContains(ContainsPredicateBase): + def _preprocess(self, lhs, rhs): + """PointPointContains that simply calls the equals predicate on the + points.""" + + predicate = EQUALS_DISPATCH_DICT[(lhs.column_type, rhs.column_type)]( + align=self.config.align + ) + return predicate(lhs, rhs) + + +"""DispatchDict listing the classes to use for each combination of + left and right hand side types. """ +DispatchDict = { + (Point, Point): PointPointContains, + (Point, MultiPoint): NotImplementedPredicate, + (Point, LineString): NotImplementedPredicate, + (Point, Polygon): NotImplementedPredicate, + (MultiPoint, Point): NotImplementedPredicate, + (MultiPoint, MultiPoint): NotImplementedPredicate, + (MultiPoint, LineString): NotImplementedPredicate, + (MultiPoint, Polygon): NotImplementedPredicate, + (LineString, Point): NotImplementedPredicate, + (LineString, MultiPoint): NotImplementedPredicate, + (LineString, LineString): NotImplementedPredicate, + (LineString, Polygon): NotImplementedPredicate, + (Polygon, Point): ContainsPredicateBase, + (Polygon, MultiPoint): PolygonComplexContains, + (Polygon, LineString): PolygonComplexContains, + (Polygon, Polygon): PolygonComplexContains, +} diff --git a/python/cuspatial/cuspatial/core/binpreds/feature_covers.py b/python/cuspatial/cuspatial/core/binpreds/feature_covers.py new file mode 100644 index 000000000..02a0dcf3d --- /dev/null +++ b/python/cuspatial/cuspatial/core/binpreds/feature_covers.py @@ -0,0 +1,52 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from cuspatial.core.binpreds.binpred_interface import NotImplementedPredicate +from cuspatial.core.binpreds.feature_equals import EqualsPredicateBase +from cuspatial.utils.binpred_utils import ( + LineString, + MultiPoint, + Point, + Polygon, +) + + +class CoversPredicateBase(EqualsPredicateBase): + """Implements the covers predicate across different combinations of + geometry types. For example, a Point-Polygon covers predicate is + defined in terms of a Point-Point equals predicate. The initial release + implements covers predicates that depend only on the equals predicate, or + depend on no predicate, such as impossible cases like + `LineString.covers(Polygon)`. + + For this initial release, cover is supported for the following types: + + Point.covers(Point) + Point.covers(Polygon) + LineString.covers(Polygon) + Polygon.covers(Point) + Polygon.covers(MultiPoint) + Polygon.covers(LineString) + Polygon.covers(Polygon) + """ + + pass + + +DispatchDict = { + (Point, Point): CoversPredicateBase, + (Point, MultiPoint): NotImplementedPredicate, + (Point, LineString): NotImplementedPredicate, + (Point, Polygon): CoversPredicateBase, + (MultiPoint, Point): NotImplementedPredicate, + (MultiPoint, MultiPoint): NotImplementedPredicate, + (MultiPoint, LineString): NotImplementedPredicate, + (MultiPoint, Polygon): NotImplementedPredicate, + (LineString, Point): NotImplementedPredicate, + (LineString, MultiPoint): NotImplementedPredicate, + (LineString, LineString): NotImplementedPredicate, + (LineString, Polygon): CoversPredicateBase, + (Polygon, Point): CoversPredicateBase, + (Polygon, MultiPoint): CoversPredicateBase, + (Polygon, LineString): CoversPredicateBase, + (Polygon, Polygon): CoversPredicateBase, +} diff --git a/python/cuspatial/cuspatial/core/binpreds/feature_crosses.py b/python/cuspatial/cuspatial/core/binpreds/feature_crosses.py new file mode 100644 index 000000000..0d3c0fe08 --- /dev/null +++ b/python/cuspatial/cuspatial/core/binpreds/feature_crosses.py @@ -0,0 +1,46 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from cuspatial.core.binpreds.binpred_interface import NotImplementedPredicate +from cuspatial.core.binpreds.feature_equals import EqualsPredicateBase +from cuspatial.utils.binpred_utils import ( + LineString, + MultiPoint, + Point, + Polygon, + _false_series, +) + + +class CrossesPredicateBase(EqualsPredicateBase): + """Base class for binary predicates that are defined in terms of a + the equals binary predicate. For example, a Point-Point Crosses + predicate is defined in terms of a Point-Point Equals predicate. + """ + + pass + + +class PointPointCrosses(CrossesPredicateBase): + def _preprocess(self, lhs, rhs): + """Points can't cross other points, so we return False.""" + return _false_series(len(lhs)) + + +DispatchDict = { + (Point, Point): PointPointCrosses, + (Point, MultiPoint): NotImplementedPredicate, + (Point, LineString): NotImplementedPredicate, + (Point, Polygon): CrossesPredicateBase, + (MultiPoint, Point): NotImplementedPredicate, + (MultiPoint, MultiPoint): NotImplementedPredicate, + (MultiPoint, LineString): NotImplementedPredicate, + (MultiPoint, Polygon): NotImplementedPredicate, + (LineString, Point): NotImplementedPredicate, + (LineString, MultiPoint): NotImplementedPredicate, + (LineString, LineString): NotImplementedPredicate, + (LineString, Polygon): NotImplementedPredicate, + (Polygon, Point): CrossesPredicateBase, + (Polygon, MultiPoint): CrossesPredicateBase, + (Polygon, LineString): CrossesPredicateBase, + (Polygon, Polygon): CrossesPredicateBase, +} diff --git a/python/cuspatial/cuspatial/core/binpreds/feature_equals.py b/python/cuspatial/cuspatial/core/binpreds/feature_equals.py new file mode 100644 index 000000000..8b1480b97 --- /dev/null +++ b/python/cuspatial/cuspatial/core/binpreds/feature_equals.py @@ -0,0 +1,341 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from __future__ import annotations + +from typing import Generic, TypeVar + +import cupy as cp + +import cudf +from cudf import Series + +import cuspatial +from cuspatial.core.binpreds.binpred_interface import ( + BinPred, + EqualsOpResult, + NotImplementedPredicate, + PreprocessorResult, +) +from cuspatial.utils.binpred_utils import ( + LineString, + MultiPoint, + Point, + Polygon, + _false_series, +) + +GeoSeries = TypeVar("GeoSeries") + + +class EqualsPredicateBase(BinPred, Generic[GeoSeries]): + """Base class for binary predicates that are defined in terms of the equals + basic predicate. `EqualsPredicateBase` implements utility functions that + are used within many equals-related binary predicates. + """ + + def _offset_equals(self, lhs, rhs): + """Compute the pairwise length equality of two offset arrays. Consider + the following example: + + lhs = [0, 3, 5, 7] + rhs = [0, 2, 4, 6] + + _offset_equals(lhs, rhs) returns [False, True, True, True]. The first + element is False because the first object in lhs has 3 points, while + the first object in rhs has 2 points. The remaining elements are True + because the remaining objects in lhs and rhs have the same number of + points. + + Parameters + ---------- + lhs : cudf.Series + left-hand-side offset array + rhs : cudf.Series + right-hand-side offset array + + Returns + ------- + cudf.Series + pairwise length equality + """ + lhs_lengths = lhs[:-1] - lhs[1:] + rhs_lengths = rhs[:-1] - rhs[1:] + return lhs_lengths == rhs_lengths + + def _sort_interleaved_points_by_offset(self, coords, offsets, sort_order): + """Sort xy according to bins defined by offset. Sort order is a list + of column names to sort by. + + `_sort_interleaved_points_by_offset` creates a dataframe with the + following columns: + "sizes": an index for each object represented in `coords`. + "points": an index for each point in `coords`. + "xy_key": an index that maintains x/y ordering. + "xy": the x/y coordinates in `coords`. + + The dataframe is sorted according to keys passed in by the caller. + For sorting multipoints, the keys in order are "object_key", "xy", + "xy_key". This sorts the points in each multipoint into the same + bin defined by "object_key", then sorts the points in each bin by + x/y coordinates, and finally sorts the points in each bin by the + `xy_key` which maintains that the x coordinate precedes the y + coordinate. + + For sorting linestrings, the keys in order are "object_key", + "point_key", "xy_key". This sorts the points in each linestring + into the same bin defined by "object_key", then sorts the points + in each bin by point ordering, and finally sorts the points in + each bin by x/y ordering. + + Parameters + ---------- + coords : cudf.Series + interleaved x,y coordinates + offsets : cudf.Series + offsets into coords + sort_order : list + list of column names to sort by. One of "object_key", "point_key", + "xy_key", and "xy". + + Returns + ------- + cudf.Series + sorted interleaved x,y coordinates + """ + sizes = offsets[1:] - offsets[:-1] + object_key = ( + cudf.Series(cp.arange(len(sizes))) + .repeat(sizes * 2) + .reset_index(drop=True) + ) + point_key = cp.arange(len(coords) // 2).repeat(2)[::-1] + xy_key = cp.tile([0, 1], len(coords) // 2) + sorting_df = cudf.DataFrame( + { + "object_key": object_key, + "point_key": point_key, + "xy_key": xy_key, + "xy": coords, + } + ) + sorted_df = sorting_df.sort_values(by=sort_order).reset_index( + drop=True + ) + return sorted_df["xy"] + + def _sort_multipoint_series(self, coords, offsets): + """Sort xy according to bins defined by offset. Consider an xy buffer + of 20 values and an offset buffer [0, 5]. This means that the first + multipoint has 5 points and the second multipoint has 5 points. The + first multipoint is sorted by x/y coordinates and the second + multipoint is sorted by x/y coordinates. The resultant sorted values + are stored in the same offset region, or bin, as the original + unsorted values. + + Parameters + ---------- + coords : cudf.Series + interleaved x,y coordinates + offsets : cudf.Series + offsets into coords + + Returns + ------- + cudf.Series + Coordinates sorted according to the bins defined by offsets. + """ + result = self._sort_interleaved_points_by_offset( + coords, offsets, ["object_key", "xy", "xy_key"] + ) + result.name = None + return result + + def _sort_multipoints(self, lhs, rhs): + """Sort the coordinates of the multipoints in the left-hand and + right-hand GeoSeries. + + Parameters + ---------- + lhs : GeoSeries + The left-hand GeoSeries. + rhs : GeoSeries + The right-hand GeoSeries. + + Returns + ------- + lhs_result : Tuple + A tuple containing the sorted left-hand GeoSeries and the + sorted right-hand GeoSeries. + """ + lhs_sorted = self._sort_multipoint_series( + lhs.multipoints.xy, lhs.multipoints.geometry_offset + ) + rhs_sorted = self._sort_multipoint_series( + rhs.multipoints.xy, rhs.multipoints.geometry_offset + ) + lhs_result = cuspatial.core.geoseries.GeoSeries.from_multipoints_xy( + lhs_sorted, lhs.multipoints.geometry_offset + ) + rhs_result = cuspatial.core.geoseries.GeoSeries.from_multipoints_xy( + rhs_sorted, rhs.multipoints.geometry_offset + ) + lhs_result.index = lhs.index + rhs_result.index = rhs.index + return ( + lhs_result, + rhs_result, + ) + + def _reverse_linestrings(self, coords, offsets): + """Reverse the order of coordinates in a Arrow buffer of coordinates + and offsets.""" + result = self._sort_interleaved_points_by_offset( + coords, offsets, ["object_key", "point_key", "xy_key"] + ) + result.name = None + return result + + def _preprocess(self, lhs: "GeoSeries", rhs: "GeoSeries"): + """Convert the input geometry types into buffers of points that can + then be compared with the equals basic predicate. + + The equals basic predicate is simply the value-equality operation + on the coordinates of the points in the geometries. This means that + we can convert any geometry type into a buffer of points and then + compare the buffers with the equals basic predicate. + + Parameters + ---------- + lhs : GeoSeries + The left-hand GeoSeries. + rhs : GeoSeries + The right-hand GeoSeries. + + Returns + ------- + result : GeoSeries + A GeoSeries of boolean values indicating whether each feature in + the right-hand GeoSeries satisfies the requirements of a binary + predicate with its corresponding feature in the left-hand + GeoSeries. + """ + # Any unmatched type is not equal + if (lhs.feature_types != rhs.feature_types).any(): + return _false_series(len(lhs)) + return self._compute_predicate( + lhs, rhs, PreprocessorResult(None, rhs.point_indices) + ) + + def _vertices_equals(self, lhs: Series, rhs: Series): + """Compute the equals relationship between interleaved xy + coordinate buffers.""" + if not isinstance(lhs, Series): + raise TypeError("lhs must be a cudf.Series") + if not isinstance(rhs, Series): + raise TypeError("rhs must be a cudf.Series") + length = min(len(lhs), len(rhs)) + a = lhs[:length:2]._column == rhs[:length:2]._column + b = rhs[1:length:2]._column == lhs[1:length:2]._column + return a & b + + def _compute_predicate(self, lhs, rhs, preprocessor_result): + """Perform the binary predicate operation on the input GeoSeries. + The lhs and rhs are `GeoSeries` of points, and the point_indices + are the indices of the points in the rhs GeoSeries that correspond + to each feature in the rhs GeoSeries. + """ + result = self._vertices_equals(lhs.points.xy, rhs.points.xy) + return self._postprocess( + lhs, rhs, EqualsOpResult(result, preprocessor_result.point_indices) + ) + + def _postprocess(self, lhs, rhs, op_result): + """Postprocess the output GeoSeries to combine the resulting + comparisons into a single boolean value for each feature in the + rhs GeoSeries. + """ + return cudf.Series(op_result.result, dtype="bool") + + +class PolygonComplexEquals(EqualsPredicateBase): + def _postprocess(self, lhs, rhs, op_result): + """Postprocess the output GeoSeries to combine the resulting + comparisons into a single boolean value for each feature in the + rhs GeoSeries. + """ + if len(op_result.result) == 0: + return _false_series(len(lhs)) + result_df = cudf.DataFrame( + {"idx": op_result.point_indices, "equals": op_result.result} + ) + gb_idx = result_df.groupby("idx") + feature_equals_linestring = ( + gb_idx.sum().sort_index() == gb_idx.count().sort_index() + )["equals"] + result = _false_series(len(lhs)) + result[ + feature_equals_linestring.index + ] = feature_equals_linestring.values + return result + + +class MultiPointMultiPointEquals(PolygonComplexEquals): + def _preprocess(self, lhs, rhs): + """Sort the multipoints by their coordinates. This is necessary + because the order of the points in a multipoint is not significant + for the equals predicate.""" + (lhs_result, rhs_result) = self._sort_multipoints(lhs, rhs) + return self._compute_predicate( + lhs_result, rhs_result, rhs_result.point_indices + ) + + def _compute_predicate(self, lhs, rhs, point_indices): + result = self._vertices_equals(lhs.multipoints.xy, rhs.multipoints.xy) + return self._postprocess( + lhs, rhs, EqualsOpResult(result, point_indices) + ) + + +class LineStringLineStringEquals(PolygonComplexEquals): + def _compute_predicate(self, lhs, rhs, preprocessor_result): + """Linestrings can be compared either forward or reversed. We need + to compare both directions.""" + lengths_equal = self._offset_equals( + lhs.lines.part_offset, rhs.lines.part_offset + ) + lhs_lengths_equal = lhs[lengths_equal] + rhs_lengths_equal = rhs[lengths_equal] + lhs_reversed = self._reverse_linestrings( + lhs_lengths_equal.lines.xy, lhs_lengths_equal.lines.part_offset + ) + forward_result = self._vertices_equals( + lhs_lengths_equal.lines.xy, rhs_lengths_equal.lines.xy + ) + reverse_result = self._vertices_equals( + lhs_reversed, rhs_lengths_equal.lines.xy + ) + result = forward_result | reverse_result + return self._postprocess( + lhs, rhs, EqualsOpResult(result, lhs_lengths_equal.point_indices) + ) + + +"""DispatchDict for Equals operations.""" +DispatchDict = { + (Point, Point): EqualsPredicateBase, + (Point, MultiPoint): NotImplementedPredicate, + (Point, LineString): NotImplementedPredicate, + (Point, Polygon): EqualsPredicateBase, + (MultiPoint, Point): NotImplementedPredicate, + (MultiPoint, MultiPoint): MultiPointMultiPointEquals, + (MultiPoint, LineString): NotImplementedPredicate, + (MultiPoint, Polygon): NotImplementedPredicate, + (LineString, Point): NotImplementedPredicate, + (LineString, MultiPoint): NotImplementedPredicate, + (LineString, LineString): LineStringLineStringEquals, + (LineString, Polygon): EqualsPredicateBase, + (Polygon, Point): EqualsPredicateBase, + (Polygon, MultiPoint): EqualsPredicateBase, + (Polygon, LineString): EqualsPredicateBase, + (Polygon, Polygon): EqualsPredicateBase, +} diff --git a/python/cuspatial/cuspatial/core/binpreds/feature_intersects.py b/python/cuspatial/cuspatial/core/binpreds/feature_intersects.py new file mode 100644 index 000000000..ca339b1dd --- /dev/null +++ b/python/cuspatial/cuspatial/core/binpreds/feature_intersects.py @@ -0,0 +1,54 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from cuspatial.core.binpreds.binpred_interface import NotImplementedPredicate +from cuspatial.core.binpreds.feature_contains import ContainsPredicateBase +from cuspatial.core.binpreds.feature_equals import EqualsPredicateBase +from cuspatial.utils.binpred_utils import ( + LineString, + MultiPoint, + Point, + Polygon, +) + + +class IntersectsPredicateBase(EqualsPredicateBase): + """Base class for binary predicates that are defined in terms of + the intersects basic predicate. These predicates are defined in terms + of the equals basic predicate. The type dispatches here that depend + on `IntersectsPredicateBase` use the `PredicateEquals` class for their + complete implementation, unmodified. + + point.intersects(polygon) is equivalent to polygon.contains(point) + with the left and right hand sides reversed. + """ + + pass + + +class PointPolygonIntersects(ContainsPredicateBase): + def _preprocess(self, lhs, rhs): + """Swap LHS and RHS and call the normal contains processing.""" + self.lhs = rhs + self.rhs = lhs + return super()._preprocess(rhs, lhs) + + +""" Type dispatch dictionary for intersects binary predicates. """ +DispatchDict = { + (Point, Point): IntersectsPredicateBase, + (Point, MultiPoint): NotImplementedPredicate, + (Point, LineString): NotImplementedPredicate, + (Point, Polygon): PointPolygonIntersects, + (MultiPoint, Point): NotImplementedPredicate, + (MultiPoint, MultiPoint): NotImplementedPredicate, + (MultiPoint, LineString): NotImplementedPredicate, + (MultiPoint, Polygon): NotImplementedPredicate, + (LineString, Point): NotImplementedPredicate, + (LineString, MultiPoint): NotImplementedPredicate, + (LineString, LineString): NotImplementedPredicate, + (LineString, Polygon): NotImplementedPredicate, + (Polygon, Point): IntersectsPredicateBase, + (Polygon, MultiPoint): IntersectsPredicateBase, + (Polygon, LineString): IntersectsPredicateBase, + (Polygon, Polygon): IntersectsPredicateBase, +} diff --git a/python/cuspatial/cuspatial/core/binpreds/feature_overlaps.py b/python/cuspatial/cuspatial/core/binpreds/feature_overlaps.py new file mode 100644 index 000000000..6956c9cbf --- /dev/null +++ b/python/cuspatial/cuspatial/core/binpreds/feature_overlaps.py @@ -0,0 +1,75 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +import cudf + +from cuspatial.core.binpreds.binpred_interface import NotImplementedPredicate +from cuspatial.core.binpreds.feature_contains import ContainsPredicateBase +from cuspatial.core.binpreds.feature_equals import EqualsPredicateBase +from cuspatial.utils.binpred_utils import ( + LineString, + MultiPoint, + Point, + Polygon, + _false_series, +) +from cuspatial.utils.column_utils import has_same_geometry + + +class OverlapsPredicateBase(EqualsPredicateBase): + """Base class for overlaps binary predicate. Depends on the + equals predicate for all implementations up to this point. + For example, a Point-Point Crosses predicate is defined in terms + of a Point-Point Equals predicate. + """ + + pass + + +class PointPointOverlaps(OverlapsPredicateBase): + def _preprocess(self, lhs, rhs): + """Points can't overlap other points, so we return False.""" + return _false_series(len(lhs)) + + +class PolygonPointOverlaps(ContainsPredicateBase): + def _postprocess(self, lhs, rhs, op_result): + if not has_same_geometry(lhs, rhs) or len(op_result.point_result) == 0: + return _false_series(len(lhs)) + polygon_indices = ( + self._convert_quadtree_result_from_part_to_polygon_indices( + op_result.point_result + ) + ) + group_counts = polygon_indices.groupby("polygon_index").count() + point_counts = ( + cudf.DataFrame( + {"point_indices": op_result.point_indices, "input_size": True} + ) + .groupby("point_indices") + .count() + ) + result = (group_counts["point_index"] > 0) & ( + group_counts["point_index"] < point_counts["input_size"] + ) + return result + + +"""Dispatch table for overlaps binary predicate.""" +DispatchDict = { + (Point, Point): PointPointOverlaps, + (Point, MultiPoint): NotImplementedPredicate, + (Point, LineString): NotImplementedPredicate, + (Point, Polygon): OverlapsPredicateBase, + (MultiPoint, Point): NotImplementedPredicate, + (MultiPoint, MultiPoint): NotImplementedPredicate, + (MultiPoint, LineString): NotImplementedPredicate, + (MultiPoint, Polygon): NotImplementedPredicate, + (LineString, Point): NotImplementedPredicate, + (LineString, MultiPoint): NotImplementedPredicate, + (LineString, LineString): NotImplementedPredicate, + (LineString, Polygon): NotImplementedPredicate, + (Polygon, Point): OverlapsPredicateBase, + (Polygon, MultiPoint): OverlapsPredicateBase, + (Polygon, LineString): OverlapsPredicateBase, + (Polygon, Polygon): OverlapsPredicateBase, +} diff --git a/python/cuspatial/cuspatial/core/binpreds/feature_within.py b/python/cuspatial/cuspatial/core/binpreds/feature_within.py new file mode 100644 index 000000000..e438e5cea --- /dev/null +++ b/python/cuspatial/cuspatial/core/binpreds/feature_within.py @@ -0,0 +1,82 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +import cudf + +from cuspatial.core.binpreds.binpred_interface import NotImplementedPredicate +from cuspatial.core.binpreds.feature_contains import ContainsPredicateBase +from cuspatial.core.binpreds.feature_equals import EqualsPredicateBase +from cuspatial.utils import binpred_utils +from cuspatial.utils.binpred_utils import ( + LineString, + MultiPoint, + Point, + Polygon, + _false_series, +) + + +class RootWithin(EqualsPredicateBase): + """Base class for binary predicates that are defined in terms of a + root-level binary predicate. For example, a Point-Point Within + predicate is defined in terms of a Point-Point Contains predicate. + """ + + pass + + +class PointPointWithin(RootWithin): + def _postprocess(self, lhs, rhs, op_result): + return cudf.Series(op_result.result) + + +class PointPolygonWithin(ContainsPredicateBase): + def _preprocess(self, lhs, rhs): + # Note the order of arguments is reversed. + return super()._preprocess(rhs, lhs) + + +class ComplexPolygonWithin(ContainsPredicateBase): + def _preprocess(self, lhs, rhs): + # Note the order of arguments is reversed. + return super()._preprocess(rhs, lhs) + + def _postprocess(self, lhs, rhs, op_result): + """Postprocess the output GeoSeries to ensure that they are of the + correct type for the predicate.""" + ( + hits, + expected_count, + ) = binpred_utils._count_results_in_multipoint_geometries( + op_result.point_indices, op_result.result + ) + result_df = hits.reset_index().merge( + expected_count.reset_index(), on="rhs_index" + ) + result_df["feature_in_polygon"] = ( + result_df["point_index_x"] >= result_df["point_index_y"] + ) + final_result = _false_series(len(lhs)) + final_result.loc[ + result_df["rhs_index"][result_df["feature_in_polygon"]] + ] = True + return final_result + + +DispatchDict = { + (Point, Point): PointPointWithin, + (Point, MultiPoint): NotImplementedPredicate, + (Point, LineString): NotImplementedPredicate, + (Point, Polygon): PointPolygonWithin, + (MultiPoint, Point): NotImplementedPredicate, + (MultiPoint, MultiPoint): NotImplementedPredicate, + (MultiPoint, LineString): NotImplementedPredicate, + (MultiPoint, Polygon): ComplexPolygonWithin, + (LineString, Point): NotImplementedPredicate, + (LineString, MultiPoint): NotImplementedPredicate, + (LineString, LineString): NotImplementedPredicate, + (LineString, Polygon): ComplexPolygonWithin, + (Polygon, Point): RootWithin, + (Polygon, MultiPoint): RootWithin, + (Polygon, LineString): RootWithin, + (Polygon, Polygon): ComplexPolygonWithin, +} diff --git a/python/cuspatial/cuspatial/core/geoseries.py b/python/cuspatial/cuspatial/core/geoseries.py index 41d5f7eb8..944a12486 100644 --- a/python/cuspatial/cuspatial/core/geoseries.py +++ b/python/cuspatial/cuspatial/core/geoseries.py @@ -24,16 +24,28 @@ from cudf.core.column.column import as_column import cuspatial.io.pygeoarrow as pygeoarrow -from cuspatial.core._column.geocolumn import GeoColumn +from cuspatial.core._column.geocolumn import ColumnType, GeoColumn from cuspatial.core._column.geometa import Feature_Enum, GeoMeta -from cuspatial.core.binpreds.binpreds import ( - ContainsProperlyBinpred, - CoversBinpred, - CrossesBinpred, - EqualsBinpred, - IntersectsBinpred, - OverlapsBinpred, - WithinBinpred, +from cuspatial.core.binpreds.feature_contains import ( + DispatchDict as CONTAINS_DISPATCH, +) +from cuspatial.core.binpreds.feature_covers import ( + DispatchDict as COVERS_DISPATCH, +) +from cuspatial.core.binpreds.feature_crosses import ( + DispatchDict as CROSSES_DISPATCH, +) +from cuspatial.core.binpreds.feature_equals import ( + DispatchDict as EQUALS_DISPATCH, +) +from cuspatial.core.binpreds.feature_intersects import ( + DispatchDict as INTERSECTS_DISPATCH, +) +from cuspatial.core.binpreds.feature_overlaps import ( + DispatchDict as OVERLAPS_DISPATCH, +) +from cuspatial.core.binpreds.feature_within import ( + DispatchDict as WITHIN_DISPATCH, ) from cuspatial.utils.column_utils import ( contains_only_linestrings, @@ -121,6 +133,23 @@ def type(self): ) return result + @property + def column_type(self): + """This is used to determine the type of the GeoColumn. + It is a value returning method that produces the same result as + the various `contains_only_*` methods, except as an Enum instead + of many booleans.""" + if contains_only_polygons(self): + return ColumnType.POLYGON + elif contains_only_linestrings(self): + return ColumnType.LINESTRING + elif contains_only_multipoints(self): + return ColumnType.MULTIPOINT + elif contains_only_points(self): + return ColumnType.POINT + else: + return ColumnType.MIXED + @property def point_indices(self): if contains_only_polygons(self): @@ -132,6 +161,8 @@ def point_indices(self): elif contains_only_points(self): return self.points.point_indices() else: + if len(self) == 0: + return cudf.Series([0], dtype="int32") raise TypeError( "GeoSeries must contain only Points, MultiPoints, Lines, or " "Polygons to return point indices." @@ -1017,9 +1048,10 @@ def contains_properly(self, other, align=False, allpairs=False): `point_indices` and `polygon_indices`, each of which is a `Series` of `dtype('int32')` in the case of `allpairs=True`. """ - if contains_only_points(self) and contains_only_points(self): - return EqualsBinpred(self, other, align)() - return ContainsProperlyBinpred(self, other, align, allpairs)() + predicate = CONTAINS_DISPATCH[(self.column_type, other.column_type)]( + align=align, allpairs=allpairs + ) + return predicate(self, other) def geom_equals(self, other, align=True): """Compute if a GeoSeries of features A is equal to a GeoSeries of @@ -1064,7 +1096,10 @@ def geom_equals(self, other, align=True): A Series of boolean values indicating whether each feature in A is equal to the corresponding feature in B. """ - return EqualsBinpred(self, other, align)() + predicate = EQUALS_DISPATCH[(self.column_type, other.column_type)]( + align=align + ) + return predicate(self, other) def covers(self, other, align=True): """Compute if a GeoSeries of features A covers a second GeoSeries of @@ -1084,7 +1119,10 @@ def covers(self, other, align=True): input GeoSeries covers the corresponding feature in the other GeoSeries. """ - return CoversBinpred(self, other, align)() + predicate = COVERS_DISPATCH[(self.column_type, other.column_type)]( + align=align + ) + return predicate(self, other) def intersects(self, other, align=True): """Returns a `Series` of `dtype('bool')` with value `True` for each @@ -1106,10 +1144,10 @@ def intersects(self, other, align=True): A Series of boolean values indicating whether the geometries of each row intersect. """ - if contains_only_points(self) and contains_only_points(other): - return cudf.Series(EqualsBinpred(self, other, align)()) - else: - return IntersectsBinpred(self, other, align)() + predicate = INTERSECTS_DISPATCH[(self.column_type, other.column_type)]( + align=align + ) + return predicate(self, other) def within(self, other, align=True): """Returns a `Series` of `dtype('bool')` with value `True` for each @@ -1132,10 +1170,10 @@ def within(self, other, align=True): A Series of boolean values indicating whether each feature falls within the corresponding polygon in the input. """ - if contains_only_points(self) and contains_only_points(other): - return cudf.Series(EqualsBinpred(self, other, align)()) - else: - return WithinBinpred(self, other, align)() + predicate = WITHIN_DISPATCH[(self.column_type, other.column_type)]( + align=align + ) + return predicate(self, other) def overlaps(self, other, align=True): """Returns True for all aligned geometries that overlap other, else @@ -1158,11 +1196,10 @@ def overlaps(self, other, align=True): result : cudf.Series A Series of boolean values indicating whether each geometry overlaps the corresponding geometry in the input.""" - # Overlaps has the same requirement as crosses. - if contains_only_points(self) and contains_only_points(other): - return cudf.Series([False] * len(self)) - else: - return OverlapsBinpred(self, other, align=align)() + predicate = OVERLAPS_DISPATCH[(self.column_type, other.column_type)]( + align=align + ) + return predicate(self, other) def crosses(self, other, align=True): """Returns True for all aligned geometries that cross other, else @@ -1185,7 +1222,7 @@ def crosses(self, other, align=True): result : cudf.Series A Series of boolean values indicating whether each geometry crosses the corresponding geometry in the input.""" - if contains_only_points(self) and contains_only_points(other): - return cudf.Series([False] * len(self)) - else: - return CrossesBinpred(self, other, align=align)() + predicate = CROSSES_DISPATCH[(self.column_type, other.column_type)]( + align=align + ) + return predicate(self, other) diff --git a/python/cuspatial/cuspatial/tests/binpreds/test_binpred_internals.py b/python/cuspatial/cuspatial/tests/binpreds/test_binpred_internals.py index 1c59e2882..7d18530ac 100644 --- a/python/cuspatial/cuspatial/tests/binpreds/test_binpred_internals.py +++ b/python/cuspatial/cuspatial/tests/binpreds/test_binpred_internals.py @@ -4,6 +4,7 @@ from shapely.geometry import LineString import cuspatial +from cuspatial.core.binpreds.binpred_dispatch import EQUALS_DISPATCH def test_internal_reversed_linestrings(): @@ -17,10 +18,10 @@ def test_internal_reversed_linestrings(): LineString([(0, 0), (1, 0), (1, 1), (0, 0)]), ] ) - from cuspatial.core.binpreds.binpreds import EqualsBinpred - - bp = EqualsBinpred(linestring1, linestring2) - got = bp._reverse_linestrings( + predicate = EQUALS_DISPATCH[ + (linestring1.column_type, linestring2.column_type) + ]() + got = predicate._reverse_linestrings( linestring1.lines.xy, linestring1.lines.part_offset ).to_pandas() expected = linestring2.lines.xy.to_pandas() @@ -40,10 +41,10 @@ def test_internal_reversed_linestrings_pair(): LineString([(1, 0), (1, 1), (0, 0)]), ] ) - from cuspatial.core.binpreds.binpreds import EqualsBinpred - - bp = EqualsBinpred(linestring1, linestring2) - got = bp._reverse_linestrings( + predicate = EQUALS_DISPATCH[ + (linestring1.column_type, linestring2.column_type) + ]() + got = predicate._reverse_linestrings( linestring1.lines.xy, linestring1.lines.part_offset ).to_pandas() expected = linestring2.lines.xy.to_pandas() @@ -65,10 +66,10 @@ def test_internal_reversed_linestrings_triple(): LineString([(1, 1), (0, 0), (1, 0), (1, 1), (0, 0)]), ] ) - from cuspatial.core.binpreds.binpreds import EqualsBinpred - - bp = EqualsBinpred(linestring1, linestring2) - got = bp._reverse_linestrings( + predicate = EQUALS_DISPATCH[ + (linestring1.column_type, linestring2.column_type) + ]() + got = predicate._reverse_linestrings( linestring1.lines.xy, linestring1.lines.part_offset ).to_pandas() expected = linestring2.lines.xy.to_pandas() diff --git a/python/cuspatial/cuspatial/tests/binpreds/test_equals_only_binpreds.py b/python/cuspatial/cuspatial/tests/binpreds/test_equals_only_binpreds.py index 84c4e0f25..3dfdbf2a8 100644 --- a/python/cuspatial/cuspatial/tests/binpreds/test_equals_only_binpreds.py +++ b/python/cuspatial/cuspatial/tests/binpreds/test_equals_only_binpreds.py @@ -106,8 +106,8 @@ def test_3_linestrings_equals_3_linestrings_one_equal(lhs): def test_10_linestrings_geom_equals_10_linestrings(linestring_generator): - gpdlines1 = gpd.GeoSeries([*linestring_generator(11, 5)]) - gpdlines2 = gpd.GeoSeries([*linestring_generator(11, 5)]) + gpdlines1 = gpd.GeoSeries([*linestring_generator(10, 5)]) + gpdlines2 = gpd.GeoSeries([*linestring_generator(10, 5)]) lines1 = cuspatial.from_geopandas(gpdlines1) lines2 = cuspatial.from_geopandas(gpdlines2) got = lines1.geom_equals(lines2) @@ -732,50 +732,6 @@ def test_linestring_orders(): pd.testing.assert_series_equal(expected, got.to_pandas()) -def test_internal_reversed_linestrings(): - linestring1 = cuspatial.GeoSeries( - [ - LineString([(0, 0), (1, 1), (1, 0), (0, 0)]), - ] - ) - linestring2 = cuspatial.GeoSeries( - [ - LineString([(0, 0), (1, 0), (1, 1), (0, 0)]), - ] - ) - from cuspatial.core.binpreds.binpreds import EqualsBinpred - - bp = EqualsBinpred(linestring1, linestring2) - got = bp._reverse_linestrings( - linestring1.lines.xy, linestring1.lines.part_offset - ).to_pandas() - expected = linestring2.lines.xy.to_pandas() - pd.testing.assert_series_equal(got, expected) - - -def test_internal_reversed_linestrings_pair(): - linestring1 = cuspatial.GeoSeries( - [ - LineString([(0, 0), (1, 1), (1, 0), (0, 0)]), - LineString([(0, 0), (1, 1), (1, 0)]), - ] - ) - linestring2 = cuspatial.GeoSeries( - [ - LineString([(0, 0), (1, 0), (1, 1), (0, 0)]), - LineString([(1, 0), (1, 1), (0, 0)]), - ] - ) - from cuspatial.core.binpreds.binpreds import EqualsBinpred - - bp = EqualsBinpred(linestring1, linestring2) - got = bp._reverse_linestrings( - linestring1.lines.xy, linestring1.lines.part_offset - ).to_pandas() - expected = linestring2.lines.xy.to_pandas() - pd.testing.assert_series_equal(got, expected) - - def test_from_points_xy_large(): points = cuspatial.GeoSeries( cuspatial.core._column.geocolumn.GeoColumn._from_points_xy( diff --git a/python/cuspatial/cuspatial/utils/binpred_utils.py b/python/cuspatial/cuspatial/utils/binpred_utils.py new file mode 100644 index 000000000..8cbddbda2 --- /dev/null +++ b/python/cuspatial/cuspatial/utils/binpred_utils.py @@ -0,0 +1,56 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +import cupy as cp + +import cudf + +from cuspatial.core._column.geocolumn import ColumnType + +"""Column-Type objects to use for simple syntax in the `DispatchDict` contained +in each `feature_.py` file. For example, instead of writing out +`ColumnType.POINT`, we can just write `Point`. +""" +Point = ColumnType.POINT +MultiPoint = ColumnType.MULTIPOINT +LineString = ColumnType.LINESTRING +Polygon = ColumnType.POLYGON + + +def _false_series(size): + """Return a Series of False values""" + return cudf.Series(cp.zeros(size, dtype=cp.bool_)) + + +def _count_results_in_multipoint_geometries(point_indices, point_result): + """Count the number of points in each multipoint geometry. + + Parameters + ---------- + point_indices : cudf.Series + The indices of the points in the original (rhs) GeoSeries. + point_result : cudf.DataFrame + The result of a contains_properly call. + + Returns + ------- + cudf.Series + The number of points that fell within a particular polygon id. + cudf.Series + The number of points in each multipoint geometry. + """ + point_indices_df = cudf.Series( + point_indices, + name="rhs_index", + index=cudf.RangeIndex(len(point_indices), name="point_index"), + ).reset_index() + with_rhs_indices = point_result.merge(point_indices_df, on="point_index") + points_grouped_by_original_polygon = with_rhs_indices[ + ["point_index", "rhs_index"] + ].drop_duplicates() + hits = ( + points_grouped_by_original_polygon.groupby("rhs_index") + .count() + .sort_index() + ) + expected_count = point_indices_df.groupby("rhs_index").count().sort_index() + return hits, expected_count