From a2e8a7100876f09436c847dd20ea5750e6cafa04 Mon Sep 17 00:00:00 2001 From: Muhammad Taha Naveed Date: Wed, 11 Sep 2024 21:45:58 +0500 Subject: [PATCH] Add support for external extensions (#2088) - This commit allows the functions from external extensions to be called from the Cypher queries, provided that typecast is available for args and return type of that function. The extension should be installed and the function should be in the search path. - Added cypher typecast for pgvector datatypes, it's not a direct cast. It casts agtype to text and then text to vector. - Added regression tests for pg_trgm, fuzzystrmatch and pgvector extensions. pg_trgm is another extension that is used for fuzzy string matching. These regression tests are extra tests that need to be explicitly added to the regression suite. The following command can be used to do so: make installcheck EXTRA_TESTS="pg_trgm pgvector fuzzystrmatch" - Updated CI to run the extra tests for the extensions. --- .github/workflows/installcheck.yaml | 18 +- Makefile | 9 +- regress/expected/expr.out | 29 -- regress/expected/fuzzystrmatch.out | 177 ++++++++ regress/expected/pg_trgm.out | 120 ++++++ regress/expected/pgvector.out | 285 +++++++++++++ regress/sql/expr.sql | 9 - regress/sql/fuzzystrmatch.sql | 61 +++ regress/sql/pg_trgm.sql | 54 +++ regress/sql/pgvector.sql | 101 +++++ src/backend/parser/cypher_expr.c | 600 +++++++++++++++++----------- 11 files changed, 1191 insertions(+), 272 deletions(-) create mode 100644 regress/expected/fuzzystrmatch.out create mode 100644 regress/expected/pg_trgm.out create mode 100644 regress/expected/pgvector.out create mode 100644 regress/sql/fuzzystrmatch.sql create mode 100644 regress/sql/pg_trgm.sql create mode 100644 regress/sql/pgvector.sql diff --git a/.github/workflows/installcheck.yaml b/.github/workflows/installcheck.yaml index 0b2e5ce7b..fcd1a4fd1 100644 --- a/.github/workflows/installcheck.yaml +++ b/.github/workflows/installcheck.yaml @@ -22,25 +22,37 @@ jobs: path: 
~/pg15 key: ${{ runner.os }}-v1-pg15-${{ env.PG_COMMIT_HASH }} - - name: Install PostgreSQL 15 + - name: Install PostgreSQL 15 and some extensions if: steps.pg15cache.outputs.cache-hit != 'true' run: | git clone --depth 1 --branch REL_15_STABLE git://git.postgresql.org/git/postgresql.git ~/pg15source cd ~/pg15source ./configure --prefix=$HOME/pg15 CFLAGS="-std=gnu99 -ggdb -O0" --enable-cassert make install -j$(nproc) > /dev/null + cd contrib + cd fuzzystrmatch + make PG_CONFIG=$HOME/pg15/bin/pg_config install -j$(nproc) > /dev/null + cd ../pg_trgm + make PG_CONFIG=$HOME/pg15/bin/pg_config install -j$(nproc) > /dev/null - uses: actions/checkout@v3 - - name: Build + - name: Build AGE id: build run: | make PG_CONFIG=$HOME/pg15/bin/pg_config install -j$(nproc) + + - name: Pull and build pgvector + id: pgvector + run: | + git clone https://github.com/pgvector/pgvector.git + cd pgvector + make PG_CONFIG=$HOME/pg15/bin/pg_config install -j$(nproc) > /dev/null - name: Regression tests id: regression_tests run: | - make PG_CONFIG=$HOME/pg15/bin/pg_config installcheck + make PG_CONFIG=$HOME/pg15/bin/pg_config installcheck EXTRA_TESTS="pgvector fuzzystrmatch pg_trgm" continue-on-error: true - name: Dump regression test errors diff --git a/Makefile b/Makefile index 400d5a7a1..c0a847830 100644 --- a/Makefile +++ b/Makefile @@ -112,8 +112,13 @@ REGRESS = scan \ name_validation \ jsonb_operators \ list_comprehension \ - map_projection \ - drop + map_projection + +ifneq ($(EXTRA_TESTS),) + REGRESS += $(EXTRA_TESTS) +endif + +REGRESS += drop srcdir=`pwd` diff --git a/regress/expected/expr.out b/regress/expected/expr.out index 35a104ff6..106ef341a 100644 --- a/regress/expected/expr.out +++ b/regress/expected/expr.out @@ -8769,25 +8769,6 @@ SELECT * FROM cypher('issue_1988', $$ {"id": 844424930131969, "label": "Part", "properties": {"set": "set", "match": "match", "merge": "merge", "create": "create", "delete": "delete", "part_num": 123}}::vertex (4 rows) --- --- Test external 
extension function logic for fuzzystrmatch --- -SELECT * FROM create_graph('fuzzystrmatch'); -NOTICE: graph "fuzzystrmatch" has been created - create_graph --------------- - -(1 row) - --- These should fail with extension not installed -SELECT * FROM cypher('fuzzystrmatch', $$ RETURN soundex("hello world!") $$) AS (result agtype); -ERROR: extension fuzzystrmatch is not installed for function soundex -LINE 1: SELECT * FROM cypher('fuzzystrmatch', $$ RETURN soundex("hel... - ^ -SELECT * FROM cypher('fuzzystrmatch', $$ RETURN difference("hello world!", "hello world!") $$) AS (result agtype); -ERROR: extension fuzzystrmatch is not installed for function difference -LINE 1: SELECT * FROM cypher('fuzzystrmatch', $$ RETURN difference("... - ^ -- -- Issue 2093: Server crashes when executing SELECT agtype_hash_cmp(agtype_in('[null, null, null, null, null]')); -- @@ -8806,16 +8787,6 @@ SELECT agtype_hash_cmp(agtype_in('[null, null, null, null, null]')); -- -- Cleanup -- -SELECT * FROM drop_graph('fuzzystrmatch', true); -NOTICE: drop cascades to 2 other objects -DETAIL: drop cascades to table fuzzystrmatch._ag_label_vertex -drop cascades to table fuzzystrmatch._ag_label_edge -NOTICE: graph "fuzzystrmatch" has been dropped - drop_graph ------------- - -(1 row) - SELECT * FROM drop_graph('issue_1988', true); NOTICE: drop cascades to 4 other objects DETAIL: drop cascades to table issue_1988._ag_label_vertex diff --git a/regress/expected/fuzzystrmatch.out b/regress/expected/fuzzystrmatch.out new file mode 100644 index 000000000..1d4613ee3 --- /dev/null +++ b/regress/expected/fuzzystrmatch.out @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +LOAD 'age'; +SET search_path=ag_catalog; +SELECT create_graph('graph'); +NOTICE: graph "graph" has been created + create_graph +-------------- + +(1 row) + +-- Should error out +SELECT * FROM cypher('graph', $$ RETURN soundex("hello") $$) AS (n agtype); +ERROR: function soundex does not exist +LINE 1: SELECT * FROM cypher('graph', $$ RETURN soundex("hello") $$)... + ^ +HINT: If the function is from an external extension, make sure the extension is installed and the function is in the search path. +-- Create the extension in the public schema +CREATE EXTENSION fuzzystrmatch SCHEMA public; +-- Should error out +SELECT * FROM cypher('graph', $$ RETURN soundex("hello") $$) AS (n agtype); +ERROR: function soundex does not exist +LINE 1: SELECT * FROM cypher('graph', $$ RETURN soundex("hello") $$)... + ^ +HINT: If the function is from an external extension, make sure the extension is installed and the function is in the search path. 
+-- Should work +SET search_path=ag_catalog, public; +SELECT * FROM cypher('graph', $$ CREATE (:Person {name: "Jane"}), + (:Person {name: "John"}), + (:Person {name: "Jone"}), + (:Person {name: "Jack"}), + (:Person {name: "Jax"}), + (:Person {name: "Jake"}), + (:Person {name: "Julie"}), + (:Person {name: "Julius"}), + (:Person {name: "Jill"}), + (:Person {name: "Jillie"}), + (:Person {name: "Julian"}) +$$) AS (n agtype); + n +--- +(0 rows) + +SELECT * FROM cypher('graph', $$ MATCH (p) return soundex(p.name) $$) AS (n agtype); + n +-------- + "J500" + "J500" + "J500" + "J200" + "J200" + "J200" + "J400" + "J420" + "J400" + "J400" + "J450" +(11 rows) + +SELECT * FROM cypher('graph', $$ MATCH (p) return levenshtein(p.name, "John") $$) AS (n agtype); + n +--- + 3 + 0 + 2 + 3 + 3 + 3 + 4 + 5 + 3 + 5 + 4 +(11 rows) + +SELECT * FROM cypher('graph', $$ MATCH (p) return difference(p.name, "John") $$) AS (n agtype); + n +--- + 4 + 4 + 4 + 3 + 3 + 3 + 3 + 2 + 3 + 3 + 2 +(11 rows) + +SELECT * FROM cypher('graph', $$ MATCH (p) return metaphone(p.name, 4) $$) AS (n agtype); + n +------- + "JN" + "JN" + "JN" + "JK" + "JKS" + "JK" + "JL" + "JLS" + "JL" + "JL" + "JLN" +(11 rows) + +SELECT * FROM cypher('graph', $$ MATCH (p) return dmetaphone(p.name) $$) AS (n agtype); + n +------- + "JN" + "JN" + "JN" + "JK" + "JKS" + "JK" + "JL" + "JLS" + "JL" + "JL" + "JLN" +(11 rows) + +-- Difference is basically similarity using soundex, https://www.postgresql.org/docs/current/fuzzystrmatch.html +SELECT * FROM cypher('graph', $$ MATCH (p) return p ORDER BY difference(p.name, "Jon") DESC LIMIT 3$$) AS (n agtype); + n +------------------------------------------------------------------------------------ + {"id": 844424930131970, "label": "Person", "properties": {"name": "John"}}::vertex + {"id": 844424930131971, "label": "Person", "properties": {"name": "Jone"}}::vertex + {"id": 844424930131969, "label": "Person", "properties": {"name": "Jane"}}::vertex +(3 rows) + +SELECT * FROM cypher('graph', $$ 
MATCH (p) return p ORDER BY difference(p.name, "Jak") DESC LIMIT 3$$) AS (n agtype); + n +------------------------------------------------------------------------------------ + {"id": 844424930131972, "label": "Person", "properties": {"name": "Jack"}}::vertex + {"id": 844424930131973, "label": "Person", "properties": {"name": "Jax"}}::vertex + {"id": 844424930131974, "label": "Person", "properties": {"name": "Jake"}}::vertex +(3 rows) + +SELECT * FROM cypher('graph', $$ MATCH (p) return p ORDER BY difference(p.name, "Jil") DESC LIMIT 3$$) AS (n agtype); + n +-------------------------------------------------------------------------------------- + {"id": 844424930131975, "label": "Person", "properties": {"name": "Julie"}}::vertex + {"id": 844424930131977, "label": "Person", "properties": {"name": "Jill"}}::vertex + {"id": 844424930131978, "label": "Person", "properties": {"name": "Jillie"}}::vertex +(3 rows) + +-- Clean up +SELECT drop_graph('graph', true); +NOTICE: drop cascades to 3 other objects +DETAIL: drop cascades to table graph._ag_label_vertex +drop cascades to table graph._ag_label_edge +drop cascades to table graph."Person" +NOTICE: graph "graph" has been dropped + drop_graph +------------ + +(1 row) + +DROP EXTENSION fuzzystrmatch CASCADE; diff --git a/regress/expected/pg_trgm.out b/regress/expected/pg_trgm.out new file mode 100644 index 000000000..5b2a2b9c8 --- /dev/null +++ b/regress/expected/pg_trgm.out @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +LOAD 'age'; +SET search_path=ag_catalog; +SELECT create_graph('graph'); +NOTICE: graph "graph" has been created + create_graph +-------------- + +(1 row) + +-- Should error out +SELECT * FROM cypher('graph', $$ RETURN show_trgm("hello") $$) AS (n agtype); +ERROR: function show_trgm does not exist +LINE 1: SELECT * FROM cypher('graph', $$ RETURN show_trgm("hello") $... + ^ +HINT: If the function is from an external extension, make sure the extension is installed and the function is in the search path. +-- Create the extension in the public schema +CREATE EXTENSION pg_trgm SCHEMA public; +-- Should error out +SELECT * FROM cypher('graph', $$ RETURN show_trgm("hello") $$) AS (n agtype); +ERROR: function show_trgm does not exist +LINE 1: SELECT * FROM cypher('graph', $$ RETURN show_trgm("hello") $... + ^ +HINT: If the function is from an external extension, make sure the extension is installed and the function is in the search path. 
+-- Should work +SET search_path=ag_catalog, public; +SELECT * FROM cypher('graph', $$ CREATE (:Person {name: "Jane"}), + (:Person {name: "John"}), + (:Person {name: "Jone"}), + (:Person {name: "Jack"}), + (:Person {name: "Jax"}), + (:Person {name: "Jake"}), + (:Person {name: "Julie"}), + (:Person {name: "Julius"}), + (:Person {name: "Jill"}), + (:Person {name: "Jillie"}), + (:Person {name: "Julian"}) +$$) AS (n agtype); + n +--- +(0 rows) + +SELECT * FROM cypher('graph', $$ MATCH (p) return show_trgm(p.name) $$) AS (n text[]); + n +------------------------------------- + {" j"," ja",ane,jan,"ne "} + {" j"," jo","hn ",joh,ohn} + {" j"," jo",jon,"ne ",one} + {" j"," ja",ack,"ck ",jac} + {" j"," ja","ax ",jax} + {" j"," ja",ake,jak,"ke "} + {" j"," ju","ie ",jul,lie,uli} + {" j"," ju",ius,jul,liu,uli,"us "} + {" j"," ji",ill,jil,"ll "} + {" j"," ji","ie ",ill,jil,lie,lli} + {" j"," ju","an ",ian,jul,lia,uli} +(11 rows) + +SELECT * FROM cypher('graph', $$ MATCH (p) with p, similarity(p.name, "Jon") as sim return p.name, sim ORDER BY sim DESC $$) AS (n agtype, s real); + n | s +----------+------------ + "Jone" | 0.5 + "John" | 0.2857143 + "Jax" | 0.14285715 + "Jane" | 0.125 + "Jack" | 0.125 + "Jake" | 0.125 + "Jill" | 0.125 + "Julie" | 0.11111111 + "Julius" | 0.1 + "Julian" | 0.1 + "Jillie" | 0.1 +(11 rows) + +SELECT * FROM cypher('graph', $$ MATCH (p) with p, word_similarity(p.name, "Jon") as sim return p.name, sim ORDER BY sim DESC $$) AS (n agtype, s real); + n | s +----------+------------ + "Jone" | 0.6 + "John" | 0.4 + "Jax" | 0.25 + "Jane" | 0.2 + "Jack" | 0.2 + "Jake" | 0.2 + "Jill" | 0.2 + "Julie" | 0.16666667 + "Julius" | 0.14285715 + "Julian" | 0.14285715 + "Jillie" | 0.14285715 +(11 rows) + +-- Clean up +SELECT drop_graph('graph', true); +NOTICE: drop cascades to 3 other objects +DETAIL: drop cascades to table graph._ag_label_vertex +drop cascades to table graph._ag_label_edge +drop cascades to table graph."Person" +NOTICE: graph "graph" has been dropped + 
drop_graph +------------ + +(1 row) + +DROP EXTENSION pg_trgm CASCADE; diff --git a/regress/expected/pgvector.out b/regress/expected/pgvector.out new file mode 100644 index 000000000..f1bd53ed4 --- /dev/null +++ b/regress/expected/pgvector.out @@ -0,0 +1,285 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +LOAD 'age'; +SET search_path=ag_catalog; +SELECT create_graph('graph'); +NOTICE: graph "graph" has been created + create_graph +-------------- + +(1 row) + +-- Should error out +SELECT * FROM cypher('graph', $$ RETURN cosine_distance("[1,2,3]", "[1,2,3]") $$) AS (n agtype); +ERROR: function cosine_distance does not exist +LINE 1: SELECT * FROM cypher('graph', $$ RETURN cosine_distance("[1,... + ^ +HINT: If the function is from an external extension, make sure the extension is installed and the function is in the search path. +-- Create the extension in the public schema +CREATE EXTENSION vector SCHEMA public; +-- Should error out +SELECT * FROM cypher('graph', $$ RETURN cosine_distance("[1,2,3]", "[1,2,3]") $$) AS (n agtype); +ERROR: function cosine_distance does not exist +LINE 1: SELECT * FROM cypher('graph', $$ RETURN cosine_distance("[1,... 
+ ^ +HINT: If the function is from an external extension, make sure the extension is installed and the function is in the search path. +-- Should work +SET search_path=ag_catalog, public; +SELECT create_graph('graph'); +ERROR: graph "graph" already exists +SELECT * FROM cypher('graph', $$ RETURN "[1.22,2.22,3.33]"::vector $$) AS (n vector); + n +------------------ + [1.22,2.22,3.33] +(1 row) + +SELECT * FROM cypher('graph', $$ RETURN "[1.22,2.22,3.33]"::vector $$) AS (n halfvec); + n +--------------------------------- + [1.2197266,2.2207031,3.3300781] +(1 row) + +SELECT * FROM cypher('graph', $$ RETURN "[1.22,2.22,3.33]"::vector $$) AS (n sparsevec); + n +-------------------------- + {1:1.22,2:2.22,3:3.33}/3 +(1 row) + +SELECT * FROM cypher('graph', $$ RETURN l2_distance("[1,2,3]", "[1,2,4]") $$) AS (n agtype); + n +----- + 1.0 +(1 row) + +SELECT * FROM cypher('graph', $$ RETURN inner_product("[1,2,3]", "[1,2,4]") $$) AS (n agtype); + n +------ + 17.0 +(1 row) + +SELECT * FROM cypher('graph', $$ RETURN cosine_distance("[1,2,3]", "[1,2,4]") $$) AS (n agtype); + n +--------------------- + 0.00853986601633272 +(1 row) + +SELECT * FROM cypher('graph', $$ RETURN l1_distance("[1,2,3]", "[1,2,4]") $$) AS (n agtype); + n +----- + 1.0 +(1 row) + +SELECT * FROM cypher('graph', $$ RETURN vector_dims("[1,2,3]") $$) AS (n agtype); + n +--- + 3 +(1 row) + +SELECT * FROM cypher('graph', $$ RETURN vector_norm("[1,2,3]") $$) AS (n agtype); + n +-------------------- + 3.7416573867739413 +(1 row) + +SELECT * FROM cypher('graph', $$ RETURN l2_normalize("[1,2,3]") $$) AS (n vector); + n +----------------------------------- + [0.26726124,0.5345225,0.80178374] +(1 row) + +SELECT * FROM cypher('graph', $$ RETURN l2_normalize("[1,2,3]")::text $$) AS (n agtype); + n +------------------------------------- + [0.26726124, 0.5345225, 0.80178374] +(1 row) + +SELECT * FROM cypher('graph', $$ RETURN subvector("[1,2,3,4,5,6]", 2, 4) $$) AS (n vector); + n +----------- + [2,3,4,5] +(1 row) + +SELECT 
* FROM cypher('graph', $$ RETURN subvector("[1,2,3,4,5,6]", 2, 4)::text $$) AS (n agtype); + n +-------------- + [2, 3, 4, 5] +(1 row) + +SELECT * FROM cypher('graph', $$ RETURN binary_quantize("[1,2,4]") $$) AS (n bit); + n +----- + 111 +(1 row) + +-- An example usage +SELECT * FROM cypher('graph', $$ + CREATE (:Movie {title: "The Matrix", year: 1999, genre: "Action", plot: "A computer hacker learns about the true nature of reality and joins a rebellion to free humanity from a simulated world controlled by machines.", embedding: "[-0.07594558, 0.04081754, 0.29592122, -0.11921061]"}), + (:Movie {title: "The Matrix Reloaded", year: 2003, genre: "Action", plot: "The rebels continue their fight against the machines, uncovering deeper truths about the Matrix and the nature of their mission.", embedding: "[0.30228977, -0.22839354, 0.35070436, 0.01262819]"}), + (:Movie {title: "The Matrix Revolutions", year: 2003, genre: "Action", plot: "The final battle between humans and machines reaches its climax as the fate of both worlds hangs in the balance.", embedding: "[ 0.12240622, -0.29752459, 0.22620453, 0.24454723]"}), + (:Movie {title: "The Matrix Resurrections", year: 2021, genre: "Action", plot: "Neo returns to a new version of the Matrix and must once again fight to save the people from the control of the machines.", embedding: "[ 0.34717246, -0.13820869, 0.29214213, 0.08090488]"}), + (:Movie {title: "Inception", year: 2010, genre: "Sci-Fi", plot: "A skilled thief is given a chance at redemption if he can successfully perform an inception: planting an idea into someone’s subconscious.", embedding: "[ 0.03923657, 0.39284106, -0.20927092, -0.17770818]"}), + (:Movie {title: "Interstellar", year: 2014, genre: "Sci-Fi", plot: "A group of explorers travel through a wormhole in space in an attempt to ensure humanity’s survival.", embedding: "[-0.29302418, -0.39615033, -0.23393948, -0.09601383]"}), + (:Movie {title: "Avatar", year: 2009, genre: "Sci-Fi", plot: "A paraplegic 
Marine is sent to the moon Pandora, where he becomes torn between following orders and protecting the world he feels is his home.", embedding: "[-0.13663386, 0.00635589, -0.03038832, -0.08252723]"}), + (:Movie {title: "Blade Runner", year: 1982, genre: "Sci-Fi", plot: "A blade runner must pursue and terminate four replicants who have stolen a ship in space and returned to Earth.", embedding: "[ 0.27215557, -0.1479577, -0.09972772, -0.08234394]"}), + (:Movie {title: "Blade Runner 2049", year: 2017, genre: "Sci-Fi", plot: "A new blade runner unearths a long-buried secret that has the potential to plunge what’s left of society into chaos.", embedding: "[ 0.21560573, -0.07505179, -0.01331814, 0.13403069]"}), + (:Movie {title: "Minority Report", year: 2002, genre: "Sci-Fi", plot: "In a future where a special police unit can arrest murderers before they commit their crimes, a top officer is accused of a future murder.", embedding: "[ 0.24008012, 0.44954908, -0.30905488, 0.15195407]"}), + (:Movie {title: "Total Recall", year: 1990, genre: "Sci-Fi", plot: "A construction worker discovers that his memories have been implanted and becomes embroiled in a conspiracy on Mars.", embedding: "[-0.17471036, 0.14695261, -0.06272433, -0.21795064]"}), + (:Movie {title: "Elysium", year: 2013, genre: "Sci-Fi", plot: "In a future where the rich live on a luxurious space station while the rest of humanity lives in squalor, a man fights to bring equality.", embedding: "[-0.33280967, 0.07733926, 0.11015328, 0.53382836]"}), + (:Movie {title: "Gattaca", year: 1997, genre: "Sci-Fi", plot: "In a future where genetic engineering determines social class, a man defies his fate to achieve his dreams.", embedding: "[-0.21629286, 0.31114665, 0.08303899, 0.46199759]"}), + (:Movie {title: "The Fifth Element", year: 1997, genre: "Sci-Fi", plot: "In a futuristic world, a cab driver becomes the key to saving humanity from an impending cosmic threat.", embedding: "[-0.11528205, -0.0208782, -0.0735215, 
0.14327449]"}), + (:Movie {title: "The Terminator", year: 1984, genre: "Action", plot: "A cyborg assassin is sent back in time to kill the mother of the future resistance leader.", embedding: "[ 0.33666933, 0.18040994, -0.01075103, -0.11117851]"}), + (:Movie {title: "Terminator 2: Judgment Day", year: 1991, genre: "Action", plot: "A reprogrammed Terminator is sent to protect the future leader of the human resistance from a more advanced Terminator.", embedding: "[ 0.34698868, 0.06439331, 0.06232323, -0.19534876]"}), + (:Movie {title: "Jurassic Park", year: 1993, genre: "Adventure", plot: "Scientists clone dinosaurs to create a theme park, but things go awry when the creatures escape.", embedding: "[ 0.01794725, -0.11434246, -0.46831815, -0.01049593]"}), + (:Movie {title: "The Avengers", year: 2012, genre: "Action", plot: "Superheroes assemble to face a global threat from an alien invasion led by Loki.", embedding: "[ 0.00546514, -0.37005171, -0.42612838, 0.07968612]"}) +$$) AS (result agtype); + result +-------- +(0 rows) + +SELECT * FROM cypher('graph', $$ MATCH (m:Movie) RETURN m.title, (m.embedding)::vector $$) AS (title agtype, embedding vector); + title | embedding +------------------------------+--------------------------------------------------- + "The Matrix" | [-0.07594558,0.04081754,0.2959212,-0.11921061] + "The Matrix Reloaded" | [0.30228978,-0.22839354,0.35070437,0.01262819] + "The Matrix Revolutions" | [0.12240622,-0.2975246,0.22620453,0.24454723] + "The Matrix Resurrections" | [0.34717247,-0.13820869,0.29214212,0.08090488] + "Inception" | [0.03923657,0.39284107,-0.20927092,-0.17770818] + "Interstellar" | [-0.29302418,-0.39615032,-0.23393948,-0.09601383] + "Avatar" | [-0.13663386,0.00635589,-0.03038832,-0.08252723] + "Blade Runner" | [0.27215558,-0.1479577,-0.09972772,-0.08234394] + "Blade Runner 2049" | [0.21560574,-0.07505179,-0.01331814,0.13403068] + "Minority Report" | [0.24008012,0.44954908,-0.30905488,0.15195407] + "Total Recall" | 
[-0.17471036,0.14695261,-0.06272433,-0.21795064] + "Elysium" | [-0.33280966,0.07733926,0.11015328,0.5338284] + "Gattaca" | [-0.21629286,0.31114665,0.08303899,0.4619976] + "The Fifth Element" | [-0.11528205,-0.0208782,-0.0735215,0.14327449] + "The Terminator" | [0.33666933,0.18040994,-0.01075103,-0.11117851] + "Terminator 2: Judgment Day" | [0.34698868,0.06439331,0.06232323,-0.19534875] + "Jurassic Park" | [0.01794725,-0.11434246,-0.46831816,-0.01049593] + "The Avengers" | [0.00546514,-0.3700517,-0.4261284,0.07968612] +(18 rows) + +-- Check the dimension of the embedding +SELECT * FROM cypher('graph', $$ MATCH (m:Movie) RETURN m.title, vector_dims(m.embedding) $$) AS (title agtype, dimension int); + title | dimension +------------------------------+----------- + "The Matrix" | 4 + "The Matrix Reloaded" | 4 + "The Matrix Revolutions" | 4 + "The Matrix Resurrections" | 4 + "Inception" | 4 + "Interstellar" | 4 + "Avatar" | 4 + "Blade Runner" | 4 + "Blade Runner 2049" | 4 + "Minority Report" | 4 + "Total Recall" | 4 + "Elysium" | 4 + "Gattaca" | 4 + "The Fifth Element" | 4 + "The Terminator" | 4 + "Terminator 2: Judgment Day" | 4 + "Jurassic Park" | 4 + "The Avengers" | 4 +(18 rows) + +-- Get top 4 most similar movies to The Terminator using cosine distance +SELECT * FROM cypher('graph', $$ MATCH (m:Movie), (search:Movie {title: "The Terminator"}) + RETURN m.title ORDER BY cosine_distance(m.embedding, search.embedding) ASC LIMIT 4 +$$) AS (title agtype); + title +------------------------------ + "The Terminator" + "Terminator 2: Judgment Day" + "Minority Report" + "Blade Runner" +(4 rows) + +-- Get top 4 most similar movies to The Matrix using cosine distance +SELECT * FROM cypher('graph', $$ MATCH (m:Movie), (search:Movie {title: "The Matrix"}) + RETURN m.title ORDER BY cosine_distance(m.embedding, search.embedding) ASC LIMIT 4 +$$) AS (title agtype); + title +---------------------------- + "The Matrix" + "The Matrix Reloaded" + "The Matrix Resurrections" + "Total 
Recall" +(4 rows) + +-- l2 norm of the embedding +SELECT * FROM cypher('graph', $$ MATCH (m:Movie) set m.embedding=(l2_normalize(m.embedding))::text return m.title, m.embedding $$) AS (title agtype, embedding agtype); + title | embedding +------------------------------+---------------------------------------------------- + "The Matrix" | "[-0.22980669,0.12351139,0.89543957,-0.36072403]" + "The Matrix Reloaded" | "[0.58534974,-0.44225806,0.6790991,0.024453051]" + "The Matrix Revolutions" | "[0.26431033,-0.6424414,0.4884408,0.528048]" + "The Matrix Resurrections" | "[0.72151977,-0.28723562,0.60715157,0.16814256]" + "Inception" | "[0.08159459,0.81693435,-0.43519026,-0.3695538]" + "Interstellar" | "[-0.5290723,-0.71527255,-0.4223914,-0.17335857]" + "Avatar" | "[-0.84023285,0.039085682,-0.18687363,-0.507503]" + "Blade Runner" | "[0.81074023,-0.44075987,-0.29708475,-0.2452992]" + "Blade Runner 2049" | "[0.8134027,-0.28314334,-0.05024454,0.50564945]" + "Minority Report" | "[0.39031598,0.7308651,-0.5024533,0.24704295]" + "Total Recall" | "[-0.54291505,0.4566574,-0.19491677,-0.67728484]" + "Elysium" | "[-0.517338,0.12022049,0.17122844,0.82981277]" + "Gattaca" | "[-0.35853538,0.51576865,0.13764863,0.765825]" + "The Fifth Element" | "[-0.5788842,-0.10483904,-0.36918527,0.7194471]" + "The Terminator" | "[0.84599304,0.45333964,-0.02701552,-0.27937278]" + "Terminator 2: Judgment Day" | "[0.8501332,0.15776564,0.15269388,-0.4786106]" + "Jurassic Park" | "[0.037194606,-0.23696794,-0.9705615,-0.02175219]" + "The Avengers" | "[0.009587915,-0.6492101,-0.7475897,0.13979948]" +(18 rows) + +-- Get top 4 most similar movies to The Terminator using l2 distance +SELECT * FROM cypher('graph', $$ MATCH (m:Movie), (search:Movie {title: "The Terminator"}) + RETURN m.title ORDER BY l2_distance(m.embedding, search.embedding) ASC LIMIT 4 +$$) AS (title agtype); + title +------------------------------ + "The Terminator" + "Terminator 2: Judgment Day" + "Minority Report" + "Blade Runner" +(4 rows) + 
+-- Get top 4 most similar movies to The Matrix using l2 distance +SELECT * FROM cypher('graph', $$ MATCH (m:Movie), (search:Movie {title: "The Matrix"}) + RETURN m.title ORDER BY l2_distance(m.embedding, search.embedding) ASC LIMIT 4 +$$) AS (title agtype); + title +---------------------------- + "The Matrix" + "The Matrix Reloaded" + "The Matrix Resurrections" + "Total Recall" +(4 rows) + +SELECT drop_graph('graph', true); +NOTICE: drop cascades to 3 other objects +DETAIL: drop cascades to table graph._ag_label_vertex +drop cascades to table graph._ag_label_edge +drop cascades to table graph."Movie" +NOTICE: graph "graph" has been dropped + drop_graph +------------ + +(1 row) + +DROP EXTENSION vector CASCADE; diff --git a/regress/sql/expr.sql b/regress/sql/expr.sql index 7bcec8595..1152dc3aa 100644 --- a/regress/sql/expr.sql +++ b/regress/sql/expr.sql @@ -3537,14 +3537,6 @@ SELECT * FROM cypher('issue_1988', $$ SELECT * FROM cypher('issue_1988', $$ MATCH (p) RETURN p $$) as (p agtype); --- --- Test external extension function logic for fuzzystrmatch --- -SELECT * FROM create_graph('fuzzystrmatch'); --- These should fail with extension not installed -SELECT * FROM cypher('fuzzystrmatch', $$ RETURN soundex("hello world!") $$) AS (result agtype); -SELECT * FROM cypher('fuzzystrmatch', $$ RETURN difference("hello world!", "hello world!") $$) AS (result agtype); - -- -- Issue 2093: Server crashes when executing SELECT agtype_hash_cmp(agtype_in('[null, null, null, null, null]')); -- @@ -3554,7 +3546,6 @@ SELECT agtype_hash_cmp(agtype_in('[null, null, null, null, null]')); -- -- Cleanup -- -SELECT * FROM drop_graph('fuzzystrmatch', true); SELECT * FROM drop_graph('issue_1988', true); SELECT * FROM drop_graph('issue_1953', true); SELECT * FROM drop_graph('expanded_map', true); diff --git a/regress/sql/fuzzystrmatch.sql b/regress/sql/fuzzystrmatch.sql new file mode 100644 index 000000000..b850f46ed --- /dev/null +++ b/regress/sql/fuzzystrmatch.sql @@ -0,0 +1,61 @@ +/* + * 
Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +LOAD 'age'; +SET search_path=ag_catalog; + +SELECT create_graph('graph'); + +-- Should error out +SELECT * FROM cypher('graph', $$ RETURN soundex("hello") $$) AS (n agtype); + +-- Create the extension in the public schema +CREATE EXTENSION fuzzystrmatch SCHEMA public; + +-- Should error out +SELECT * FROM cypher('graph', $$ RETURN soundex("hello") $$) AS (n agtype); + +-- Should work +SET search_path=ag_catalog, public; +SELECT * FROM cypher('graph', $$ CREATE (:Person {name: "Jane"}), + (:Person {name: "John"}), + (:Person {name: "Jone"}), + (:Person {name: "Jack"}), + (:Person {name: "Jax"}), + (:Person {name: "Jake"}), + (:Person {name: "Julie"}), + (:Person {name: "Julius"}), + (:Person {name: "Jill"}), + (:Person {name: "Jillie"}), + (:Person {name: "Julian"}) +$$) AS (n agtype); +SELECT * FROM cypher('graph', $$ MATCH (p) return soundex(p.name) $$) AS (n agtype); +SELECT * FROM cypher('graph', $$ MATCH (p) return levenshtein(p.name, "John") $$) AS (n agtype); +SELECT * FROM cypher('graph', $$ MATCH (p) return difference(p.name, "John") $$) AS (n agtype); +SELECT * FROM cypher('graph', $$ MATCH (p) return metaphone(p.name, 4) $$) AS (n agtype); +SELECT * FROM 
cypher('graph', $$ MATCH (p) return dmetaphone(p.name) $$) AS (n agtype); + +-- Difference is basically similarity using soundex, https://www.postgresql.org/docs/current/fuzzystrmatch.html +SELECT * FROM cypher('graph', $$ MATCH (p) return p ORDER BY difference(p.name, "Jon") DESC LIMIT 3$$) AS (n agtype); +SELECT * FROM cypher('graph', $$ MATCH (p) return p ORDER BY difference(p.name, "Jak") DESC LIMIT 3$$) AS (n agtype); +SELECT * FROM cypher('graph', $$ MATCH (p) return p ORDER BY difference(p.name, "Jil") DESC LIMIT 3$$) AS (n agtype); + +-- Clean up +SELECT drop_graph('graph', true); +DROP EXTENSION fuzzystrmatch CASCADE; \ No newline at end of file diff --git a/regress/sql/pg_trgm.sql b/regress/sql/pg_trgm.sql new file mode 100644 index 000000000..a276e5913 --- /dev/null +++ b/regress/sql/pg_trgm.sql @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +LOAD 'age'; +SET search_path=ag_catalog; + +SELECT create_graph('graph'); + +-- Should error out +SELECT * FROM cypher('graph', $$ RETURN show_trgm("hello") $$) AS (n agtype); + +-- Create the extension in the public schema +CREATE EXTENSION pg_trgm SCHEMA public; + +-- Should error out +SELECT * FROM cypher('graph', $$ RETURN show_trgm("hello") $$) AS (n agtype); + +-- Should work +SET search_path=ag_catalog, public; +SELECT * FROM cypher('graph', $$ CREATE (:Person {name: "Jane"}), + (:Person {name: "John"}), + (:Person {name: "Jone"}), + (:Person {name: "Jack"}), + (:Person {name: "Jax"}), + (:Person {name: "Jake"}), + (:Person {name: "Julie"}), + (:Person {name: "Julius"}), + (:Person {name: "Jill"}), + (:Person {name: "Jillie"}), + (:Person {name: "Julian"}) +$$) AS (n agtype); +SELECT * FROM cypher('graph', $$ MATCH (p) return show_trgm(p.name) $$) AS (n text[]); +SELECT * FROM cypher('graph', $$ MATCH (p) with p, similarity(p.name, "Jon") as sim return p.name, sim ORDER BY sim DESC $$) AS (n agtype, s real); +SELECT * FROM cypher('graph', $$ MATCH (p) with p, word_similarity(p.name, "Jon") as sim return p.name, sim ORDER BY sim DESC $$) AS (n agtype, s real); + +-- Clean up +SELECT drop_graph('graph', true); +DROP EXTENSION pg_trgm CASCADE; \ No newline at end of file diff --git a/regress/sql/pgvector.sql b/regress/sql/pgvector.sql new file mode 100644 index 000000000..816d6eb9f --- /dev/null +++ b/regress/sql/pgvector.sql @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +LOAD 'age'; +SET search_path=ag_catalog; + +SELECT create_graph('graph'); + +-- Should error out +SELECT * FROM cypher('graph', $$ RETURN cosine_distance("[1,2,3]", "[1,2,3]") $$) AS (n agtype); + +-- Create the extension in the public schema +CREATE EXTENSION vector SCHEMA public; + +-- Should error out +SELECT * FROM cypher('graph', $$ RETURN cosine_distance("[1,2,3]", "[1,2,3]") $$) AS (n agtype); + +-- Should work +SET search_path=ag_catalog, public; + +SELECT create_graph('graph'); +SELECT * FROM cypher('graph', $$ RETURN "[1.22,2.22,3.33]"::vector $$) AS (n vector); +SELECT * FROM cypher('graph', $$ RETURN "[1.22,2.22,3.33]"::vector $$) AS (n halfvec); +SELECT * FROM cypher('graph', $$ RETURN "[1.22,2.22,3.33]"::vector $$) AS (n sparsevec); + +SELECT * FROM cypher('graph', $$ RETURN l2_distance("[1,2,3]", "[1,2,4]") $$) AS (n agtype); +SELECT * FROM cypher('graph', $$ RETURN inner_product("[1,2,3]", "[1,2,4]") $$) AS (n agtype); +SELECT * FROM cypher('graph', $$ RETURN cosine_distance("[1,2,3]", "[1,2,4]") $$) AS (n agtype); +SELECT * FROM cypher('graph', $$ RETURN l1_distance("[1,2,3]", "[1,2,4]") $$) AS (n agtype); +SELECT * FROM cypher('graph', $$ RETURN vector_dims("[1,2,3]") $$) AS (n agtype); +SELECT * FROM cypher('graph', $$ RETURN vector_norm("[1,2,3]") $$) AS (n agtype); +SELECT * FROM cypher('graph', $$ RETURN l2_normalize("[1,2,3]") $$) AS (n vector); +SELECT * FROM cypher('graph', $$ RETURN l2_normalize("[1,2,3]")::text $$) AS (n agtype); +SELECT * FROM cypher('graph', $$ RETURN subvector("[1,2,3,4,5,6]", 2, 4) $$) AS (n 
vector); +SELECT * FROM cypher('graph', $$ RETURN subvector("[1,2,3,4,5,6]", 2, 4)::text $$) AS (n agtype); +SELECT * FROM cypher('graph', $$ RETURN binary_quantize("[1,2,4]") $$) AS (n bit); + +-- An example usage +SELECT * FROM cypher('graph', $$ + CREATE (:Movie {title: "The Matrix", year: 1999, genre: "Action", plot: "A computer hacker learns about the true nature of reality and joins a rebellion to free humanity from a simulated world controlled by machines.", embedding: "[-0.07594558, 0.04081754, 0.29592122, -0.11921061]"}), + (:Movie {title: "The Matrix Reloaded", year: 2003, genre: "Action", plot: "The rebels continue their fight against the machines, uncovering deeper truths about the Matrix and the nature of their mission.", embedding: "[0.30228977, -0.22839354, 0.35070436, 0.01262819]"}), + (:Movie {title: "The Matrix Revolutions", year: 2003, genre: "Action", plot: "The final battle between humans and machines reaches its climax as the fate of both worlds hangs in the balance.", embedding: "[ 0.12240622, -0.29752459, 0.22620453, 0.24454723]"}), + (:Movie {title: "The Matrix Resurrections", year: 2021, genre: "Action", plot: "Neo returns to a new version of the Matrix and must once again fight to save the people from the control of the machines.", embedding: "[ 0.34717246, -0.13820869, 0.29214213, 0.08090488]"}), + (:Movie {title: "Inception", year: 2010, genre: "Sci-Fi", plot: "A skilled thief is given a chance at redemption if he can successfully perform an inception: planting an idea into someone’s subconscious.", embedding: "[ 0.03923657, 0.39284106, -0.20927092, -0.17770818]"}), + (:Movie {title: "Interstellar", year: 2014, genre: "Sci-Fi", plot: "A group of explorers travel through a wormhole in space in an attempt to ensure humanity’s survival.", embedding: "[-0.29302418, -0.39615033, -0.23393948, -0.09601383]"}), + (:Movie {title: "Avatar", year: 2009, genre: "Sci-Fi", plot: "A paraplegic Marine is sent to the moon Pandora, where he becomes torn 
between following orders and protecting the world he feels is his home.", embedding: "[-0.13663386, 0.00635589, -0.03038832, -0.08252723]"}), + (:Movie {title: "Blade Runner", year: 1982, genre: "Sci-Fi", plot: "A blade runner must pursue and terminate four replicants who have stolen a ship in space and returned to Earth.", embedding: "[ 0.27215557, -0.1479577, -0.09972772, -0.08234394]"}), + (:Movie {title: "Blade Runner 2049", year: 2017, genre: "Sci-Fi", plot: "A new blade runner unearths a long-buried secret that has the potential to plunge what’s left of society into chaos.", embedding: "[ 0.21560573, -0.07505179, -0.01331814, 0.13403069]"}), + (:Movie {title: "Minority Report", year: 2002, genre: "Sci-Fi", plot: "In a future where a special police unit can arrest murderers before they commit their crimes, a top officer is accused of a future murder.", embedding: "[ 0.24008012, 0.44954908, -0.30905488, 0.15195407]"}), + (:Movie {title: "Total Recall", year: 1990, genre: "Sci-Fi", plot: "A construction worker discovers that his memories have been implanted and becomes embroiled in a conspiracy on Mars.", embedding: "[-0.17471036, 0.14695261, -0.06272433, -0.21795064]"}), + (:Movie {title: "Elysium", year: 2013, genre: "Sci-Fi", plot: "In a future where the rich live on a luxurious space station while the rest of humanity lives in squalor, a man fights to bring equality.", embedding: "[-0.33280967, 0.07733926, 0.11015328, 0.53382836]"}), + (:Movie {title: "Gattaca", year: 1997, genre: "Sci-Fi", plot: "In a future where genetic engineering determines social class, a man defies his fate to achieve his dreams.", embedding: "[-0.21629286, 0.31114665, 0.08303899, 0.46199759]"}), + (:Movie {title: "The Fifth Element", year: 1997, genre: "Sci-Fi", plot: "In a futuristic world, a cab driver becomes the key to saving humanity from an impending cosmic threat.", embedding: "[-0.11528205, -0.0208782, -0.0735215, 0.14327449]"}), + (:Movie {title: "The Terminator", year: 
1984, genre: "Action", plot: "A cyborg assassin is sent back in time to kill the mother of the future resistance leader.", embedding: "[ 0.33666933, 0.18040994, -0.01075103, -0.11117851]"}), + (:Movie {title: "Terminator 2: Judgment Day", year: 1991, genre: "Action", plot: "A reprogrammed Terminator is sent to protect the future leader of the human resistance from a more advanced Terminator.", embedding: "[ 0.34698868, 0.06439331, 0.06232323, -0.19534876]"}), + (:Movie {title: "Jurassic Park", year: 1993, genre: "Adventure", plot: "Scientists clone dinosaurs to create a theme park, but things go awry when the creatures escape.", embedding: "[ 0.01794725, -0.11434246, -0.46831815, -0.01049593]"}), + (:Movie {title: "The Avengers", year: 2012, genre: "Action", plot: "Superheroes assemble to face a global threat from an alien invasion led by Loki.", embedding: "[ 0.00546514, -0.37005171, -0.42612838, 0.07968612]"}) +$$) AS (result agtype); +SELECT * FROM cypher('graph', $$ MATCH (m:Movie) RETURN m.title, (m.embedding)::vector $$) AS (title agtype, embedding vector); + +-- Check the dimension of the embedding +SELECT * FROM cypher('graph', $$ MATCH (m:Movie) RETURN m.title, vector_dims(m.embedding) $$) AS (title agtype, dimension int); + +-- Get top 4 most similar movies to The Terminator using cosine distance +SELECT * FROM cypher('graph', $$ MATCH (m:Movie), (search:Movie {title: "The Terminator"}) + RETURN m.title ORDER BY cosine_distance(m.embedding, search.embedding) ASC LIMIT 4 +$$) AS (title agtype); +-- Get top 4 most similar movies to The Matrix using cosine distance +SELECT * FROM cypher('graph', $$ MATCH (m:Movie), (search:Movie {title: "The Matrix"}) + RETURN m.title ORDER BY cosine_distance(m.embedding, search.embedding) ASC LIMIT 4 +$$) AS (title agtype); +-- l2 norm of the embedding +SELECT * FROM cypher('graph', $$ MATCH (m:Movie) set m.embedding=(l2_normalize(m.embedding))::text return m.title, m.embedding $$) AS (title agtype, embedding agtype); + +-- 
Get top 4 most similar movies to The Terminator using l2 distance +SELECT * FROM cypher('graph', $$ MATCH (m:Movie), (search:Movie {title: "The Terminator"}) + RETURN m.title ORDER BY l2_distance(m.embedding, search.embedding) ASC LIMIT 4 +$$) AS (title agtype); +-- Get top 4 most similar movies to The Matrix using l2 distance +SELECT * FROM cypher('graph', $$ MATCH (m:Movie), (search:Movie {title: "The Matrix"}) + RETURN m.title ORDER BY l2_distance(m.embedding, search.embedding) ASC LIMIT 4 +$$) AS (title agtype); + +SELECT drop_graph('graph', true); +DROP EXTENSION vector CASCADE; \ No newline at end of file diff --git a/src/backend/parser/cypher_expr.c b/src/backend/parser/cypher_expr.c index 45cfbbfbc..8eaf3bd2f 100644 --- a/src/backend/parser/cypher_expr.c +++ b/src/backend/parser/cypher_expr.c @@ -25,6 +25,8 @@ #include "postgres.h" #include "catalog/pg_proc.h" +#include "catalog/dependency.h" +#include "commands/extension.h" #include "miscadmin.h" #include "nodes/nodeFuncs.h" #include "optimizer/optimizer.h" @@ -34,6 +36,7 @@ #include "parser/cypher_clause.h" #include "parser/parse_oper.h" #include "parser/parse_relation.h" +#include "parser/parse_type.h" #include "utils/builtins.h" #include "utils/catcache.h" #include "utils/float.h" @@ -97,14 +100,24 @@ static Node *transform_column_ref_for_indirection(cypher_parsestate *cpstate, ColumnRef *cr); static Node *transform_cypher_list_comprehension(cypher_parsestate *cpstate, cypher_unwind *expr); -static bool is_fuzzystrmatch_function(FuncCall *fn); -static void check_for_extension_functions(char *extension, FuncCall *fn); -static List *cast_agtype_input_to_other_type(cypher_parsestate *cpstate, - FuncCall *fn, List *targs); -static Node *cast_input_to_output_type(cypher_parsestate *cpstate, Node *expr, - Oid source_oid, Oid target_oid); +static Node *transform_external_ext_FuncCall(cypher_parsestate *cpstate, + FuncCall *fn, List *targs, + Form_pg_proc procform, + char *extension); +static List 
*cast_agtype_args_to_target_type(cypher_parsestate *cpstate, + Form_pg_proc procform, + List *fargs, + Oid *target_types); +static Node *cast_to_target_type(cypher_parsestate *cpstate, Node *expr, + Oid source_oid, Oid target_oid); static Node *wrap_text_output_to_agtype(cypher_parsestate *cpstate, FuncExpr *fexpr); +static Form_pg_proc get_procform(FuncCall *fn, bool err_not_found); +static char *get_mapped_extension(Oid func_oid); +static bool is_extension_external(char *extension); +static bool is_pgvector_datatype(char *typename); +static char *construct_age_function_name(char *funcname); +static bool function_exists(char *funcname, char *extension); /* transform a cypher expression */ Node *transform_cypher_expr(cypher_parsestate *cpstate, Node *expr, @@ -1548,6 +1561,7 @@ static Node *transform_cypher_typecast(cypher_parsestate *cpstate, { List *fname; FuncCall *fnode; + ParseState *pstate; /* verify input parameter */ Assert (cpstate != NULL); @@ -1555,6 +1569,7 @@ static Node *transform_cypher_typecast(cypher_parsestate *cpstate, /* create the qualified function name, schema first */ fname = list_make1(makeString("ag_catalog")); + pstate = &cpstate->pstate; /* append the name of the requested typecast function */ if (pg_strcasecmp(ctypecast->typecast, "edge") == 0) @@ -1599,7 +1614,40 @@ static Node *transform_cypher_typecast(cypher_parsestate *cpstate, { fname = lappend(fname, makeString(FUNC_AGTYPE_TYPECAST_PG_TEXT)); } + else if (is_pgvector_datatype(ctypecast->typecast)) + { + TypeName *target_typname; + Oid source_oid; + Oid target_oid; + Node *expr; + + /* transform the expr before casting */ + expr = transform_cypher_expr_recurse(cpstate, + ctypecast->expr); + /* get the source and target oids */ + target_typname = makeTypeNameFromNameList(list_make1( + makeString(ctypecast->typecast))); + target_oid = typenameTypeId(pstate, target_typname); + source_oid = exprType(expr); + + if (source_oid == AGTYPEOID) + { + /* + * Cast to text and then to target 
type, since we cant + * directly cast agtype to pgvector datatypes. + */ + expr = cast_to_target_type(cpstate, expr, source_oid, TEXTOID); + expr = cast_to_target_type(cpstate, expr, TEXTOID, target_oid); + } + else + { + /* try a direct cast, it will error out if not possible */ + expr = cast_to_target_type(cpstate, expr, source_oid, target_oid); + } + + return expr; + } /* if none was found, error out */ else { @@ -1616,164 +1664,120 @@ static Node *transform_cypher_typecast(cypher_parsestate *cpstate, return transform_FuncCall(cpstate, fnode); } -/* is the function part of the fuzzystrmatch extension */ -static bool is_fuzzystrmatch_function(FuncCall *fn) +static Node *transform_external_ext_FuncCall(cypher_parsestate *cpstate, /* builds the call to a function owned by another extension */ + FuncCall *fn, List *targs, + Form_pg_proc procform, + char *extension) /* NOTE(review): extension is not referenced in the body added by this patch */ { - char *funcname = (((String*)linitial(fn->funcname))->sval); + ParseState *pstate = &cpstate->pstate; + FuncExpr *fexpr = NULL; + Node *retval = NULL; + Node *last_srf = pstate->p_last_srf; + Oid *proargtypes; - if (pg_strcasecmp(funcname, "soundex") == 0 || - pg_strcasecmp(funcname, "difference") == 0 || - pg_strcasecmp(funcname, "daitch_mokotoff") == 0 || - pg_strcasecmp(funcname, "soundex_tsvector") == 0 || - pg_strcasecmp(funcname, "levenshtein") == 0 || - pg_strcasecmp(funcname, "levenshtein_less_equal") == 0 || - pg_strcasecmp(funcname, "metaphone") == 0 || - pg_strcasecmp(funcname, "dmetaphone") == 0) - { - return true; - } - return false; -} + /* make sure procform is not NULL */ + Assert(procform != NULL); + proargtypes = procform->proargtypes.values; /* argument type oids taken from the pg_proc entry; used as the cast targets */ -/* - * Cast a function's input parameter list from agtype to that function's input - * type. This is used for functions that don't take agtype as input and where - * there isn't an implicit cast to do this for us. 
- */ -static List *cast_agtype_input_to_other_type(cypher_parsestate *cpstate, - FuncCall *fn, List *targs) -{ - char *funcname = (((String*)linitial(fn->funcname))->sval); - int nargs = fn->args->length; - CatCList *catlist = NULL; - List *new_targs = NIL; - ListCell *lc = NULL; - int i = 0; + /* cast the agtype arguments to the types accepted by function */ + targs = cast_agtype_args_to_target_type(cpstate, procform, targs, proargtypes); - /* get a list of matching functions from the sys cache */ - catlist = SearchSysCacheList1(PROCNAMEARGSNSP, CStringGetDatum(funcname)); + /* now get the function node for the external function */ + fexpr = (FuncExpr *)ParseFuncOrColumn(pstate, fn->funcname, targs, + last_srf, fn, false, + fn->location); - /* iterate through the list of functions for ones that match */ - for (i = 0; i < catlist->n_members; i++) + /* + * This will cast TEXT output to AGTYPE. It will error out if this is + * not possible to do. For TEXT to AGTYPE we need to wrap the output + * due to issues with creating a cast from TEXT to AGTYPE. + */ + if (fexpr->funcresulttype == TEXTOID) { - HeapTuple proctup = &catlist->members[i]->tuple; - Form_pg_proc procform = (Form_pg_proc) GETSTRUCT(proctup); - - /* check that the names, number of args, and variadic match */ - if (pg_strcasecmp(funcname, procform->proname.data) == 0 && - nargs == procform->pronargs && - fn->func_variadic == procform->provariadic) - { - Oid *proargtypes = procform->proargtypes.values; - int j = 0; - - /* - * Rebuild targs with castings to the function's input types from - * targ's output type. - */ - foreach(lc, targs) - { - Oid poid = proargtypes[j]; - Node *targ = lfirst(lc); - Oid toid = exprType(targ); - - /* cast the arg. this will error out if it can't be done. 
*/ - targ = cast_input_to_output_type(cpstate, targ, toid, poid); + retval = wrap_text_output_to_agtype(cpstate, fexpr); + } + else + { + retval = (Node *)fexpr; + } - /* add it to the new argument list */ - new_targs = lappend(new_targs, targ); - j++; - } + /* additional casts or wraps can be done here for other types */ - /* free the old args and replace them with the new ones */ - pfree_if_not_null(targs); - targs = new_targs; - break; - } + /* flag that an aggregate was found during a transform */ + if (retval != NULL && retval->type == T_Aggref) + { + cpstate->exprHasAgg = true; } - /* we need to release the cache list */ - ReleaseSysCacheList(catlist); - return targs; + + /* we can just return it here */ + return retval; } /* - * Verify that a called function, that is mapped to a specific - * function in some other extension, is loaded. Otherwise, bail - * out with an error, stating the issue. - * - * Note: some code borrowed from FuncnameGetCandidates + * Cast a function's input parameter list from agtype to that function's input + * type. This is used for functions that don't take agtype as input and where + * there isn't an implicit cast to do this for us. 
*/ -static void check_for_extension_functions(char *extension, FuncCall *fn) +static List *cast_agtype_args_to_target_type(cypher_parsestate *cpstate, + Form_pg_proc procform, + List *fargs, + Oid *target_types) { - char *funcname = (((String*)linitial(fn->funcname))->sval); - CatCList *catlist = NULL; - bool found = false; + char *funcname = NameStr(procform->proname); + int nargs = procform->pronargs; + ListCell *lc = NULL; int i = 0; - /* get a list of matching functions */ - catlist = SearchSysCacheList1(PROCNAMEARGSNSP, CStringGetDatum(funcname)); - - /* if the catalog list is empty, the extension isn't loaded */ - if (catlist->n_members == 0) + /* verify the length of args are same */ + if (list_length(fargs) != nargs) { ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("extension %s is not installed for function %s", - extension, funcname))); + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("function %s requires %d arguments, %d given", + funcname, nargs, list_length(fargs)))); } - /* iterate through them and verify that they are in the search path */ - for (i = 0; i < catlist->n_members; i++) + /* iterate through the function's args */ + foreach (lc, fargs) { - HeapTuple proctup = &catlist->members[i]->tuple; - Form_pg_proc procform = (Form_pg_proc) GETSTRUCT(proctup); - List *asp = fetch_search_path(false); - ListCell *nsp; + char *target_typname; + Node *expr = lfirst(lc); + Oid source_oid = exprType(expr); + Oid target_oid = target_types[i]; - /* - * Consider only procs that are in the search path and are not in - * the temp namespace. 
- */ - foreach(nsp, asp) - { - Oid oid = lfirst_oid(nsp); + /* get the typename from target_oid */ + target_typname = format_type_be(target_oid); - if (procform->pronamespace == oid && - isTempNamespace(procform->pronamespace) == false) - { - pfree_if_not_null(asp); - found = true; - break; - } + /* cast the agtype to the target type */ + if (source_oid == AGTYPEOID && is_pgvector_datatype(target_typname)) + { + /* + * There is no cast from agtype to vector, so we first + * cast agtype to text and then text to vector. + */ + expr = cast_to_target_type(cpstate, expr, source_oid, TEXTOID); + expr = cast_to_target_type(cpstate, expr, TEXTOID, target_oid); } - - if (found) + /* additional casts can be added here for other types */ + else { - break; + /* try a direct cast, it will error out if not possible */ + expr = cast_to_target_type(cpstate, expr, source_oid, target_oid); } - pfree_if_not_null(asp); - } - - /* if we didn't find it, it isn't in the search path */ - if (!found) - { - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("extension %s is not in search path for function %s", - extension, funcname))); + lfirst(lc) = expr; + i++; } - /* release the system cache list */ - ReleaseSysCacheList(catlist); + return fargs; } /* * Cast an input type to an output type, error out if not possible. * Thanks to Taha for this idea. */ -static Node *cast_input_to_output_type(cypher_parsestate *cpstate, Node *expr, - Oid source_oid, Oid target_oid) +static Node *cast_to_target_type(cypher_parsestate *cpstate, Node *expr, + Oid source_oid, Oid target_oid) { ParseState *pstate = &cpstate->pstate; @@ -1831,6 +1835,181 @@ static Node *wrap_text_output_to_agtype(cypher_parsestate *cpstate, return retval; } +/* + * Returns Form_pg_proc struct for given function, if the function + * is not in search path, it is not considered. 
+ */ +static Form_pg_proc get_procform(FuncCall *fn, bool err_not_found) +{ + CatCList *catlist = NULL; + Form_pg_proc procform = NULL; + int nargs; + int i = 0; + List *asp; + bool found = false; + char *funcname = (((String*)linitial(fn->funcname))->sval); /* only the first element of the name list is used */ + + /* get a list of matching functions */ + catlist = SearchSysCacheList1(PROCNAMEARGSNSP, CStringGetDatum(funcname)); + + if (catlist->n_members == 0) + { + ReleaseSysCacheList(catlist); + return NULL; /* NOTE(review): returns NULL without raising even when err_not_found is true */ + } + + asp = fetch_search_path(false); + nargs = list_length(fn->args); + + /* iterate through them and verify that they are in the search path */ + for (i = 0; i < catlist->n_members; i++) + { + ListCell *nsp; + HeapTuple proctup = &catlist->members[i]->tuple; + procform = (Form_pg_proc) GETSTRUCT(proctup); + + /* + * Check if the function name, number of arguments, and + * variadic match before checking if it is in the search + * path. + */ + if (pg_strcasecmp(funcname, procform->proname.data) == 0 && + nargs == procform->pronargs && + fn->func_variadic == procform->provariadic) /* NOTE(review): pg_proc documents provariadic as a type oid - confirm comparing it with func_variadic is intended */ + { + foreach(nsp, asp) + { + Oid oid = lfirst_oid(nsp); + + if (procform->pronamespace == oid && + isTempNamespace(procform->pronamespace) == false) + { + found = true; + break; + } + } + } + + if (found) + { + break; + } + + /* reset procform so a candidate that did not match is never returned */ + procform = NULL; + } + + /* Error out if function not found */ + if (err_not_found && (procform == NULL)) + { + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_FUNCTION), + errmsg("function %s does not exist", funcname), + errhint("If the function is from an external extension, " + "make sure the extension is installed and the " + "function is in the search path."))); + } + + /* we need to release the cache list */ + ReleaseSysCacheList(catlist); /* NOTE(review): procform points into a member tuple of this list - confirm it remains valid for the caller after the release */ + pfree_if_not_null(asp); + + return procform; +} + +static char *get_mapped_extension(Oid func_oid) /* returns whatever get_extension_name yields for the extension owning func_oid */ +{ + Oid extension_oid; + char *extension = NULL; + + extension_oid = getExtensionOfObject(ProcedureRelationId, func_oid); + extension = 
get_extension_name(extension_oid); + + return extension; +} + +static bool is_extension_external(char *extension) +{ + return ((extension != NULL) && + (pg_strcasecmp(extension, "age") != 0)); +} + +static bool is_pgvector_datatype(char *typename) +{ + return (pg_strcasecmp(typename, "vector") || + pg_strcasecmp(typename, "halfvec") || + pg_strcasecmp(typename, "sparsevec")); +} + +/* Returns age_ prefiexed lower case function name */ +static char *construct_age_function_name(char *funcname) +{ + int pnlen = strlen(funcname); + char *ag_name = palloc(pnlen + 5); + int i; + + /* copy in the prefix - all AGE functions are prefixed with age_ */ + strncpy(ag_name, "age_", 4); + + /* + * All AGE function names are in lower case. So, copy in the funcname + * in lower case. + */ + for (i = 0; i < pnlen; i++) + { + ag_name[i + 4] = tolower(funcname[i]); + } + + /* terminate it with 0 */ + ag_name[i + 4] = 0; + + return ag_name; +} + + +/* + * Checks if a function exists. If the extension name is given, + * then it checks if the function exists in that extension. 
+ */ +static bool function_exists(char *funcname, char *extension) +{ + CatCList *catlist = NULL; + bool found = false; + int i = 0; + + /* get every pg_proc entry with this name; no search path check is done in this function */ + catlist = SearchSysCacheList1(PROCNAMEARGSNSP, CStringGetDatum(funcname)); + + if (catlist->n_members == 0) + { + ReleaseSysCacheList(catlist); + return false; + } + else if (extension == NULL) /* no extension filter requested: any match by name is enough */ + { + ReleaseSysCacheList(catlist); + return true; + } + + for (i = 0; i < catlist->n_members; i++) /* otherwise require one candidate owned by the named extension */ + { + HeapTuple proctup = &catlist->members[i]->tuple; + Form_pg_proc procform = (Form_pg_proc) GETSTRUCT(proctup); + char *ext = get_mapped_extension(procform->oid); /* may be NULL; checked before use below */ + + if (ext != NULL && pg_strcasecmp(ext, extension) == 0) + { + found = true; + break; + } + } + + /* we need to release the cache list */ + ReleaseSysCacheList(catlist); + + return found; +} + /* * Code borrowed from PG's transformFuncCall and updated for AGE */ @@ -1842,7 +2021,6 @@ static Node *transform_FuncCall(cypher_parsestate *cpstate, FuncCall *fn) List *fname = NIL; ListCell *arg; Node *retval = NULL; - bool found = false; /* Transform the list of arguments ... */ foreach(arg, fn->args) @@ -1856,120 +2034,84 @@ static Node *transform_FuncCall(cypher_parsestate *cpstate, FuncCall *fn) /* within group should not happen */ Assert(!fn->agg_within_group); - /* - * Check for cypher functions that map to the fuzzystrmatch extension and - * verify that the external functions exist. - */ - if (is_fuzzystrmatch_function(fn)) + /* If it is a qualified function call, let it through. */ + if (list_length(fn->funcname) > 1) { - /* abort if the extension isn't loaded or in the path */ - check_for_extension_functions("fuzzystrmatch", fn); - - /* everything looks good so mark found as true */ - found = true; + fname = fn->funcname; } - /* - * If we found a function that is part of an extension, which is in the - * search_path, then cast the agtype inputs to that function's type inputs. 
+ * Else We need to check if the function call is for + * age or for some external extension. */ - if (found) + else { - FuncExpr *fexpr = NULL; - - /* - * Coerce agtype inputs to function's inputs. this will error out if - * this is not possible to do. - */ - targs = cast_agtype_input_to_other_type(cpstate, fn, targs); - - /* now get the function node for the external function */ - fexpr = (FuncExpr *)ParseFuncOrColumn(pstate, fn->funcname, targs, - last_srf, fn, false, - fn->location); + char *name = strVal(linitial(fn->funcname)); + char *ag_name = construct_age_function_name(name); /* "age_" followed by the lower-cased name */ - /* - * This will cast TEXT outputs to AGTYPE. It will error out if this is - * not possible to do. For TEXT to AGTYPE we need to wrap the output - * due to issues with creating a cast from TEXT to AGTYPE. - */ - if (fexpr->funcresulttype == TEXTOID) + if (function_exists(ag_name, "age")) { - retval = wrap_text_output_to_agtype(cpstate, fexpr); - } - else - { - retval = (Node *)fexpr; - } + /* qualify the name with our schema name */ + fname = list_make2(makeString("ag_catalog"), makeString(ag_name)); - /* additional casts or wraps can be done here for other types */ + /* + * Currently 4 functions need the graph name passed in as the first + * argument - in addition to the other arguments: startNode, endNode, + * vle, and vertex_stats. So, check for those 4 functions here and that + * the arg list is not empty. Then prepend the graph name if necessary. 
+ */ + if ((list_length(targs) != 0) && + (strcmp("startNode", name) == 0 || + strcmp("endNode", name) == 0 || + strcmp("vle", name) == 0 || + strcmp("vertex_stats", name) == 0)) + { + char *graph_name = cpstate->graph_name; + Datum d = string_to_agtype(graph_name); + Const *c = makeConst(AGTYPEOID, -1, InvalidOid, -1, d, false, + false); - /* flag that an aggregate was found during a transform */ - if (retval != NULL && retval->type == T_Aggref) - { - cpstate->exprHasAgg = true; + targs = lcons(c, targs); + } } - - /* we can just return it here */ - return retval; - } - - /* - * If the function name is not qualified and not from an extension, then it - * is one of ours. We need to construct its name, and qualify it, so that PG - * can find it. - */ - if (list_length(fn->funcname) == 1) - { - /* get the name, size, and the ag name allocated */ - char *name = ((String*)linitial(fn->funcname))->sval; - int pnlen = strlen(name); - char *ag_name = palloc(pnlen + 5); - int i; - - /* copy in the prefix - all AGE functions are prefixed with age_ */ - strncpy(ag_name, "age_", 4); - - /* - * All AGE function names are in lower case. So, copy in the name - * in lower case. + /* + * If it's not in age, check if it's a potential call to some function + * in another installed extension. */ - for (i = 0; i < pnlen; i++) + else if(function_exists(name, NULL)) { - ag_name[i + 4] = tolower(name[i]); - } - - /* terminate it with 0 */ - ag_name[i + 4] = 0; + Form_pg_proc procform = get_procform(fn, true); + char *extension = get_mapped_extension(procform->oid); - /* qualify the name with our schema name */ - fname = list_make2(makeString("ag_catalog"), makeString(ag_name)); - - /* - * Currently 3 functions need the graph name passed in as the first - * argument - in addition to the other arguments: startNode, endNode, - * and vle. So, check for those 3 functions here and that the arg list - * is not empty. Then prepend the graph name if necessary. 
- */ - if ((list_length(targs) != 0) && - (strcmp("startNode", name) == 0 || - strcmp("endNode", name) == 0 || - strcmp("vle", name) == 0 || - strcmp("vertex_stats", name) == 0)) + /* + * If the function is from another extension, transform + * it if possible and return the function expr. + */ + if (is_extension_external(extension)) + { + retval = transform_external_ext_FuncCall(cpstate, fn, targs, + procform, extension); + return retval; + } + else + { + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_FUNCTION), + errmsg("function %s does not exist", name), + errhint("If the function is from an external extension, " + "make sure the extension is installed and the " + "function is in the search path."))); + } + } + /* no function found */ + else { - char *graph_name = cpstate->graph_name; - Datum d = string_to_agtype(graph_name); - Const *c = makeConst(AGTYPEOID, -1, InvalidOid, -1, d, false, - false); - - targs = lcons(c, targs); + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_FUNCTION), + errmsg("function %s does not exist", name), + errhint("If the function is from an external extension, " + "make sure the extension is installed and the " + "function is in the search path."))); } - - } - /* If it is not one of our functions, pass the name list through */ - else - { - fname = fn->funcname; } /* ... and hand off to ParseFuncOrColumn */