-
Hi! I love using DuckDB but I am still a SQL newbie. I have a question about efficient spatial join. My code works but I feel it's quite inefficient (partly because of a Basically, I want to check if points are in polygons. Here are four points in a geojson file named {
"type": "FeatureCollection",
"features": [
{
"type": "Feature",
"properties": {
"name": "pointA"
},
"geometry": {
"coordinates": [-76.34553248992202, 48.241182892559266],
"type": "Point"
}
},
{
"type": "Feature",
"properties": { "name": "pointB" },
"geometry": {
"coordinates": [-73.18043031919933, 50.15023361660323],
"type": "Point"
}
},
{
"type": "Feature",
"properties": { "name": "pointC" },
"geometry": {
"coordinates": [-72.78960434234926, 48.47150751404138],
"type": "Point"
}
},
{
"type": "Feature",
"properties": { "name": "pointD" },
"geometry": {
"coordinates": [-72.2926406368759, 47.43075362784262],
"type": "Point"
}
}
]
} And here are two polygons in a geojson file named {
"type": "FeatureCollection",
"features": [
{
"type": "Feature",
"properties": {
"name": "container1"
},
"geometry": {
"coordinates": [
[
[-73.86384311805543, 49.44467601022404],
[-73.86384311805543, 46.84089921028934],
[-71.26679234199449, 46.84089921028934],
[-71.26679234199449, 49.44467601022404],
[-73.86384311805543, 49.44467601022404]
]
],
"type": "Polygon"
}
},
{
"type": "Feature",
"properties": {
"name": "container2"
},
"geometry": {
"coordinates": [
[
[-75.35234508889035, 50.05982906185096],
[-75.35234508889035, 48.261633398786614],
[-72.62185476279677, 48.261633398786614],
[-72.62185476279677, 50.05982906185096],
[-75.35234508889035, 50.05982906185096]
]
],
"type": "Polygon"
}
}
]
} And here's the query I run with DuckDB (NodeJS). const duckdb = require("duckdb");
const db = new duckdb.Database(":memory:");
const query = `
INSTALL spatial;
LOAD spatial;
-- Create points table
CREATE TABLE points AS SELECT * FROM ST_Read('points.json');
ALTER TABLE points RENAME COLUMN "name" TO "pointName";
ALTER TABLE points RENAME COLUMN "geom" TO "pointGeom";
-- Create polygons table
CREATE TABLE polygons AS SELECT * FROM ST_Read('polygons.json');
ALTER TABLE polygons RENAME COLUMN "name" TO "polyName";
ALTER TABLE polygons RENAME COLUMN "geom" TO "polyGeom";
-- Join points and polygons
CREATE OR REPLACE TABLE points AS
SELECT points.*, polygons.*
FROM points
CROSS JOIN polygons;
-- Check if points are inside polygons
ALTER TABLE points ADD COLUMN "isInside" BOOLEAN;
UPDATE points SET "isInside" = ST_Covers("polyGeom", "pointGeom");
--- Keep only points inside a polygon
CREATE OR REPLACE TABLE points
AS SELECT * FROM points
WHERE "isInside" = TRUE;
-- Output the result
SELECT * FROM points;
`;
db.all(query, function (err, res) {
if (err) {
console.warn(err);
}
console.log(res);
}); And here's the output. [
{
pointName: 'pointC',
pointGeom: <Buffer 00 00 00 00 00 00 00 00 00 00 00 00 01 00 00 00 dc ca a6 e0 88 32 52 c0 20 50 b4 5b 5a 3c 48 40>,
polyName: 'container1',
polyGeom: <Buffer 02 04 00 00 00 00 00 00 4a ba 93 c2 14 5d 3b 42 99 88 8e c2 5a c7 45 42 02 00 00 00 01 00 00 00 05 00 00 00 00 00 00 00 0e 3b a5 34 49 77 52 c0 30 9d ... 70 more bytes>,
isInside: true
},
{
pointName: 'pointD',
pointGeom: <Buffer 00 00 00 00 00 00 00 00 00 00 00 00 01 00 00 00 35 37 cb 9f ba 12 52 c0 d4 1b 54 ef 22 b7 47 40>,
polyName: 'container1',
polyGeom: <Buffer 02 04 00 00 00 00 00 00 4a ba 93 c2 14 5d 3b 42 99 88 8e c2 5a c7 45 42 02 00 00 00 01 00 00 00 05 00 00 00 00 00 00 00 0e 3b a5 34 49 77 52 c0 30 9d ... 70 more bytes>,
isInside: true
},
{
pointName: 'pointC',
pointGeom: <Buffer 00 00 00 00 00 00 00 00 00 00 00 00 01 00 00 00 dc ca a6 e0 88 32 52 c0 20 50 b4 5b 5a 3c 48 40>,
polyName: 'container2',
polyGeom: <Buffer 02 04 00 00 00 00 00 00 67 b4 96 c2 e9 0b 41 42 63 3e 91 c2 44 3d 48 42 02 00 00 00 01 00 00 00 05 00 00 00 00 00 00 00 2d 6c 6a d2 8c d6 52 c0 08 00 ... 70 more bytes>,
isInside: true
}
] It works! But when I use this strategy with geojson files with a lot of features, it's quite slow. Any SQL expert around here could help? 🤓 Thank you very much! |
Beta Was this translation helpful? Give feedback.
Replies: 1 comment 4 replies
-
Hi! I think the better way to do a spatial join is to use a proper inner join with a spatial predicate as join condition, instead of doing a cross product and then filtering (which is going to have to create an exponential amount of geometries just to filter most of them out). e.g. instead of doing this SELECT * FROM t1 CROSS JOIN t2 WHERE st_intersects(t1.geom, t2.geom); Do this: SELECT * FROM t1 JOIN t2 ON st_intersects(t1.geom, t2.geom) Inner joins like this can also be optimized further by DuckDB spatial as it will attempt to rewrite the query to first join on only the geometries who's cached bounding boxes intersect, which avoids doing the actual costly intersection comparison for geometries that can't intersect at all. |
Beta Was this translation helpful? Give feedback.
Hi! I think the better way to do a spatial join is to use a proper inner join with a spatial predicate as join condition, instead of doing a cross product and then filtering (which is going to have to create an exponential amount of geometries just to filter most of them out).
e.g. instead of doing this
Do this:
Inner joins like this can also be optimized further by DuckDB spatial as it will attempt to rewrite the query to first join on only the geometries who's cached bounding boxes intersect, which avoids doing the actual costly intersection comparison for ge…