From d4eae9fe29c658506e539d9c66ca157efccf6e62 Mon Sep 17 00:00:00 2001
From: Nan Jiang
Date: Thu, 19 Jul 2018 14:25:04 -0400
Subject: [PATCH] fixes #864 - Adds text similarity/distance methods and double metaphone text encoder

Added Apache commons-text dependency and used the LevenshteinDistance,
HammingDistance and JaroWinklerDistance objects from the dependency to create
the similarity/distance methods in Strings.
Replaced deprecated StringUtils.getLevenshteinDistance with LevenshteinDistance.
Added Double Metaphone encoding method in Phonetic.
---
Review notes (not part of the commit message):
* doubleMetaphone: empty tokens produced by split() are dropped, so whitespace-only
  text or text starting with a separator no longer puts the literal "null" into the
  result; a regression test is added.
* Strings: the similarity objects are named in constant style, matching
  DOUBLE_METAPHONE in Phonetic; levenshteinSimilarity calls levenshteinDistance directly.
* PhoneticTest: "shoudl" typo fixed in three test names.
* The body of the docs/overview.adoc hunk and the header/first hunk of Phonetic.java
  are missing from this copy of the patch and are left as found. Hunk line counts and
  the diffstat are updated; blob ids on the index lines were not recomputed.

 build.gradle                              |  1 +
 docs/overview.adoc                        |  7 +++-
 src/main/java/apoc/text/Phonetic.java     | 21 ++++++++++
 src/main/java/apoc/text/Strings.java      | 51 ++++++++++++++++++++++-
 src/test/java/apoc/text/PhoneticTest.java | 35 +++++++++++++++++
 src/test/java/apoc/text/StringsTest.java  | 34 ++++++++++++++++-
 6 files changed, 144 insertions(+), 5 deletions(-)

diff --git a/build.gradle b/build.gradle
index 0275eb0a66..ebcaf1b559 100644
--- a/build.gradle
+++ b/build.gradle
@@ -127,6 +127,7 @@ dependencies {
 
     compile group: 'com.github.javafaker', name: 'javafaker', version: '0.10'
     compile group: 'org.apache.commons', name: 'commons-math3', version: '3.6.1'
+    compile group: 'org.apache.commons', name: 'commons-text', version: '1.2'
 
     jmh group: 'org.neo4j', name: 'neo4j-lucene-index', version: neo4jVersionEffective
     jmh group: 'org.neo4j', name: 'neo4j-kernel', version: neo4jVersionEffective, classifier: "tests"
diff --git a/docs/overview.adoc b/docs/overview.adoc
index afa94e29fe..58a71c50af 100644
--- a/docs/overview.adoc
+++ b/docs/overview.adoc
@@ -879,7 +879,11 @@ Example: `'FRIEND|MENTORS>|| phonetic(final @Name("value") Object value) {
@@ -38,6 +41,24 @@ public Stream phoneticDelta(final @Name("text1") String text1, f
         }
     }
 
+    /**
+     * Encodes every word of each input element with Double Metaphone and concatenates the codes;
+     * a null or empty element yields {@code StringResult.EMPTY}.
+     */
+    @Procedure
+    @Description("apoc.text.doubleMetaphone(value) yield value - Compute the Double Metaphone phonetic encoding of all words of the text value which can be a single string or a list of strings")
+    public Stream<StringResult> doubleMetaphone(final @Name("value") Object value)
+    {
+        Stream<?> stream = value instanceof Iterable ? StreamSupport.stream(((Iterable<?>) value).spliterator(), false) : Stream.of(value);
+
+        return stream.map(str -> (str == null || str.toString().isEmpty()) ? StringResult.EMPTY :
+                new StringResult(Stream.of(str.toString().trim().split("\\W+"))
+                        // split() yields an empty token for whitespace-only text or a leading separator;
+                        // the encoder turns "" into null, which would be concatenated as "null"
+                        .filter(word -> !word.isEmpty())
+                        .map(DOUBLE_METAPHONE::doubleMetaphone).reduce("", (a, s) -> a + s)));
+    }
+
     public static class PhoneticResult {
         public final String phonetic1, phonetic2;
         public final long delta;
diff --git a/src/main/java/apoc/text/Strings.java b/src/main/java/apoc/text/Strings.java
index d7e82470ed..36640554eb 100644
--- a/src/main/java/apoc/text/Strings.java
+++ b/src/main/java/apoc/text/Strings.java
@@ -1,6 +1,9 @@
 package apoc.text;
 
 import apoc.util.Util;
+import org.apache.commons.text.similarity.HammingDistance;
+import org.apache.commons.text.similarity.JaroWinklerDistance;
+import org.apache.commons.text.similarity.LevenshteinDistance;
 import org.neo4j.graphdb.Node;
 import org.neo4j.graphdb.Relationship;
 import org.neo4j.helpers.collection.Pair;
@@ -36,6 +39,10 @@
  */
 public class Strings {
 
+    private static final HammingDistance HAMMING_DISTANCE = new HammingDistance();
+    private static final JaroWinklerDistance JARO_WINKLER_DISTANCE = new JaroWinklerDistance();
+    private static final LevenshteinDistance LEVENSHTEIN_DISTANCE = new LevenshteinDistance();
+
     @UserFunction
     @Description("apoc.text.replace(text, regex, replacement) - replace each substring of the given string that matches the given regular expression with the given replacement.")
     public String replace(final @Name("text") String text, final @Name("regex") String regex, final @Name("replacement") String replacement) {
@@ -126,12 +133,52 @@ public boolean compareCleaned(final @Name("text1") String text1, final @Name("te
     }
 
     @UserFunction
-    @Description("apoc.text.distance(text1, text2) - compare the given strings with the StringUtils.distance(text1, text2) method")
+    @Description("apoc.text.distance(text1, text2) - compare the given strings with the Levenshtein distance algorithm.")
     public Long distance(final @Name("text1") String text1, @Name("text2")final String text2) {
+        return levenshteinDistance(text1, text2);
+    }
+
+    @UserFunction
+    @Description("apoc.text.levenshteinDistance(text1, text2) - compare the given strings with the Levenshtein distance algorithm.")
+    public Long levenshteinDistance(final @Name("text1") String text1, @Name("text2")final String text2) {
+        if (text1 == null || text2 == null) {
+            return null;
+        }
+        return (long) LEVENSHTEIN_DISTANCE.apply(text1, text2);
+    }
+
+    @UserFunction
+    @Description( "apoc.text.levenshteinSimilarity(text1, text2) - calculate the similarity (a value within 0 and 1) between two texts." )
+    public Double levenshteinSimilarity(final @Name("text1") String text1, @Name("text2")final String text2) {
+        if (text1 == null || text2 == null) {
+            return null;
+        }
+
+        int longerLength = Math.max(text1.length(), text2.length());
+        if (longerLength == 0) {
+            return 1.0;
+        }
+        long editDistance = levenshteinDistance(text1, text2);
+        return (longerLength - editDistance) / (double) longerLength;
+    }
+
+    @UserFunction
+    @Description( "apoc.text.hammingDistance(text1, text2) - compare the given strings with the Hamming distance algorithm." )
+    public Long hammingDistance(final @Name("text1") String text1, @Name("text2")final String text2) {
+        if (text1 == null || text2 == null) {
+            return null;
+        }
+        // HammingDistance.apply throws IllegalArgumentException when the texts differ in length
+        return (long) HAMMING_DISTANCE.apply(text1, text2);
+    }
+
+    @UserFunction
+    @Description( "apoc.text.jaroWinklerDistance(text1, text2) - compare the given strings with the Jaro-Winkler distance algorithm." )
+    public Double jaroWinklerDistance(final @Name("text1") String text1, @Name("text2")final String text2) {
         if (text1 == null || text2 == null) {
             return null;
         }
-        return (long) StringUtils.getLevenshteinDistance(text1, text2);
+        return JARO_WINKLER_DISTANCE.apply(text1, text2);
     }
 
     @UserFunction
diff --git a/src/test/java/apoc/text/PhoneticTest.java b/src/test/java/apoc/text/PhoneticTest.java
index a0ef9215d9..a0288c4168 100644
--- a/src/test/java/apoc/text/PhoneticTest.java
+++ b/src/test/java/apoc/text/PhoneticTest.java
@@ -75,4 +75,39 @@ public void shouldComputeSoundexDifference() {
                 assertThat(row.get("delta"), equalTo(4L))
         );
     }
+
+    @Test
+    public void shouldComputeDoubleMetaphone() {
+        testCall(db, "CALL apoc.text.doubleMetaphone('Apoc')", (row) ->
+            assertThat(row.get("value"), equalTo("APK"))
+        );
+    }
+
+    @Test
+    public void shouldComputeDoubleMetaphoneOfNull() {
+        testCall(db, "CALL apoc.text.doubleMetaphone(NULL)", (row) ->
+            assertThat(row.get("value"), equalTo(null))
+        );
+    }
+
+    @Test
+    public void shouldComputeDoubleMetaphoneForTheEmptyString() {
+        testCall(db, "CALL apoc.text.doubleMetaphone('')", (row) ->
+            assertThat(row.get("value"), equalTo(null))
+        );
+    }
+
+    @Test
+    public void shouldComputeDoubleMetaphoneOfManyWords() {
+        testCall(db, "CALL apoc.text.doubleMetaphone('Hello, dear User!')", (row) ->
+            assertThat(row.get("value"), equalTo("HLTRASR"))
+        );
+    }
+
+    @Test
+    public void shouldIgnoreLeadingSeparatorForDoubleMetaphone() {
+        testCall(db, "CALL apoc.text.doubleMetaphone('!Hello')", (row) ->
+            assertThat(row.get("value"), equalTo("HL"))
+        );
+    }
 }
diff --git a/src/test/java/apoc/text/StringsTest.java b/src/test/java/apoc/text/StringsTest.java
index 67a0862e5d..228a913790 100644
--- a/src/test/java/apoc/text/StringsTest.java
+++ b/src/test/java/apoc/text/StringsTest.java
@@ -190,15 +190,45 @@ public void testCompareCleanedInQuery() throws Exception {
     }
 
     @Test
-    public void testGetLevenshteinDistance() {
+    public void testLevenshteinDistance() {
         String text1 = "Levenshtein";
         String text2 = "Levenstein";
 
-        testCall(db, "RETURN apoc.text.distance({a}, {b}) as distance",
+        testCall(db, "RETURN apoc.text.distance({a}, {b}) AS distance",
                 map("a", text1, "b", text2),
                 row -> assertEquals(1L, row.get("distance")));
     }
 
+    @Test
+    public void testLevenshteinSimilarity() {
+        String text1 = "Levenshtein";
+        String text2 = "Levenstein";
+
+        testCall(db, "RETURN apoc.text.levenshteinSimilarity({a}, {b}) AS similarity",
+                map("a", text1, "b", text2),
+                row -> assertEquals(0.9, (double)row.get("similarity"), 0.01));
+    }
+
+    @Test
+    public void testHammingDistance() {
+        String text1 = "Neo";
+        String text2 = "Leo";
+
+        testCall(db, "RETURN apoc.text.hammingDistance({a}, {b}) AS distance",
+                map("a", text1, "b", text2),
+                row -> assertEquals(1L, row.get("distance")));
+    }
+
+    @Test
+    public void testJaroWinklerDistance() {
+        String text1 = "Neo";
+        String text2 = "Leo";
+
+        testCall(db, "RETURN apoc.text.jaroWinklerDistance({a}, {b}) AS distance",
+                map("a", text1, "b", text2),
+                row -> assertEquals(0.77, (double)row.get("distance"), 0.01));
+    }
+
     @Test
     public void testFuzzyMatch() {
         Strings strings = new Strings();