From c1167c84b4c4d58b3011680008b0d82f77a7fa16 Mon Sep 17 00:00:00 2001
From: Nan Jiang
Date: Thu, 19 Jul 2018 11:57:36 -0400
Subject: [PATCH] Fixes #864: Adds text similarity/distance methods and double metaphone text encoding.

- Added Apache commons-text dependency
- Added Levenshtein Similarity code and test
- Added Hamming Distance code and test
- Added Jaro-Winkler Distance code and test
- Added Double Metaphone text encoding and test
---
Review notes (ignored by git am, not part of the commit message):
- apoc.text.doubleMetaphone: empty tokens (text starting with punctuation,
  whitespace-only text) are skipped and null encodings are dropped, so the
  literal "null" is never concatenated into the result. Whitespace-only input
  now yields the empty result, like the empty string.
- The @Description of Strings.similarity() now names the function as it is
  registered and as StringsTest calls it: apoc.text.similarity.
- The shared algorithm instances in Strings are declared as
  "private static final" UPPER_SNAKE constants, matching
  Phonetic.DOUBLE_METAPHONE and no longer sharing a name with the
  hammingDistance/jaroWinklerDistance methods.
- apoc.text.hammingDistance propagates the IllegalArgumentException that
  commons-text raises for inputs of different length.
- Test method names: "shoudl" -> "should".
- Hunk line counts are unchanged; the index lines still carry the blob ids of
  the original submission.

 build.gradle                              |  1 +
 src/main/java/apoc/text/Phonetic.java     | 14 ++++++++
 src/main/java/apoc/text/Strings.java      | 44 +++++++++++++++++++++--
 src/test/java/apoc/text/PhoneticTest.java | 28 +++++++++++++++
 src/test/java/apoc/text/StringsTest.java  | 34 ++++++++++++++++--
 5 files changed, 117 insertions(+), 4 deletions(-)

diff --git a/build.gradle b/build.gradle
index 0275eb0a66..ebcaf1b559 100644
--- a/build.gradle
+++ b/build.gradle
@@ -127,6 +127,7 @@ dependencies {
     compile group: 'com.github.javafaker', name: 'javafaker', version: '0.10'
 
     compile group: 'org.apache.commons', name: 'commons-math3', version: '3.6.1'
+    compile group: 'org.apache.commons', name: 'commons-text', version: '1.2'
 
     jmh group: 'org.neo4j', name: 'neo4j-lucene-index', version: neo4jVersionEffective
     jmh group: 'org.neo4j', name: 'neo4j-kernel', version: neo4jVersionEffective, classifier: "tests"
diff --git a/src/main/java/apoc/text/Phonetic.java b/src/main/java/apoc/text/Phonetic.java
index 00e7be7d89..62d0683558 100644
--- a/src/main/java/apoc/text/Phonetic.java
+++ b/src/main/java/apoc/text/Phonetic.java
@@ -1,5 +1,6 @@
 package apoc.text;
 
+import org.apache.commons.codec.language.DoubleMetaphone;
 import org.neo4j.procedure.Description;
 import apoc.result.LongResult;
 import apoc.result.StringResult;
@@ -18,6 +19,8 @@
 
 public class Phonetic {
 
+    private static final DoubleMetaphone DOUBLE_METAPHONE = new DoubleMetaphone();
+
     @Procedure
     @Description("apoc.text.phonetic(value) yield value - Compute the US_ENGLISH phonetic soundex encoding of all words of the text value which can be a single string or a list of strings")
     public Stream<StringResult> phonetic(final @Name("value") Object value) {
@@ -38,6 +41,17 @@ public Stream<PhoneticResult> phoneticDelta(final @Name("text1") String text1, f
         }
     }
 
+    @Procedure
+    @Description("apoc.text.doubleMetaphone(value) yield value - Compute the Double Metaphone phonetic encoding of all words of the text value which can be a single string or a list of strings")
+    public Stream<StringResult> doubleMetaphone(final @Name("value") Object value)
+    {
+        Stream<Object> stream = value instanceof Iterable ? StreamSupport.stream(((Iterable) value).spliterator(), false) : Stream.of(value);
+        // Empty tokens (e.g. text starting with punctuation) encode to null; skip them so "null" is never concatenated.
+        return stream.map(str -> (str == null || str.toString().trim().isEmpty()) ? StringResult.EMPTY :
+                new StringResult(Stream.of(str.toString().trim().split("\\W+")).filter(word -> !word.isEmpty())
+                        .map(DOUBLE_METAPHONE::doubleMetaphone).filter(code -> code != null).reduce("", (a, s) -> a + s)));
+    }
+
     public static class PhoneticResult {
         public final String phonetic1, phonetic2;
         public final long delta;
diff --git a/src/main/java/apoc/text/Strings.java b/src/main/java/apoc/text/Strings.java
index d7e82470ed..27a2b17ff2 100644
--- a/src/main/java/apoc/text/Strings.java
+++ b/src/main/java/apoc/text/Strings.java
@@ -1,6 +1,9 @@
 package apoc.text;
 
 import apoc.util.Util;
+import org.apache.commons.text.similarity.HammingDistance;
+import org.apache.commons.text.similarity.JaroWinklerDistance;
+import org.apache.commons.text.similarity.LevenshteinDistance;
 import org.neo4j.graphdb.Node;
 import org.neo4j.graphdb.Relationship;
 import org.neo4j.helpers.collection.Pair;
@@ -36,6 +39,10 @@
  */
 public class Strings {
 
+    private static final HammingDistance HAMMING_DISTANCE = new HammingDistance();
+    private static final JaroWinklerDistance JARO_WINKLER_DISTANCE = new JaroWinklerDistance();
+    private static final LevenshteinDistance LEVENSHTEIN_DISTANCE = new LevenshteinDistance();
+
     @UserFunction
     @Description("apoc.text.replace(text, regex, replacement) - replace each substring of the given string that matches the given regular expression with the given replacement.")
     public String replace(final @Name("text") String text, final @Name("regex") String regex, final @Name("replacement") String replacement) {
@@ -126,12 +133,45 @@ public boolean compareCleaned(final @Name("text1") String text1, final @Name("te
     }
 
     @UserFunction
-    @Description("apoc.text.distance(text1, text2) - compare the given strings with the StringUtils.distance(text1, text2) method")
+    @Description("apoc.text.distance(text1, text2) - compare the given strings with the Levenshtein distance algorithm.")
     public Long distance(final @Name("text1") String text1, @Name("text2")final String text2) {
         if (text1 == null || text2 == null) {
             return null;
         }
-        return (long) StringUtils.getLevenshteinDistance(text1, text2);
+        return (long)LEVENSHTEIN_DISTANCE.apply(text1, text2);
+    }
+
+    @UserFunction
+    @Description( "apoc.text.similarity(text1, text2) - calculate the Levenshtein similarity (a value within 0 and 1) between two texts." )
+    public Double similarity(final @Name("text1") String text1, @Name("text2")final String text2) {
+        if ( text1 == null || text2 == null ) {
+            return null;
+        }
+        // Two empty strings are identical; returning early also avoids dividing by zero below.
+        int longerLength = Math.max(text1.length(), text2.length());
+        if (longerLength == 0) {
+            return 1.0;
+        }
+        long editDistance = distance( text1, text2 );
+        return (longerLength - editDistance) / (double)longerLength;
+    }
+
+    @UserFunction
+    @Description( "apoc.text.hammingDistance(text1, text2) - compare the given strings with the Hamming distance algorithm." )
+    public Long hammingDistance(final @Name("text1") String text1, @Name("text2")final String text2) {
+        if (text1 == null || text2 == null) {
+            return null;
+        }
+        return (long)HAMMING_DISTANCE.apply(text1, text2); // commons-text throws IllegalArgumentException when the lengths differ
+    }
+
+    @UserFunction
+    @Description( "apoc.text.jaroWinklerDistance(text1, text2) - compare the given strings with the Jaro-Winkler distance algorithm." )
+    public Double jaroWinklerDistance(final @Name("text1") String text1, @Name("text2")final String text2) {
+        if (text1 == null || text2 == null) {
+            return null;
+        }
+        return JARO_WINKLER_DISTANCE.apply(text1, text2); // NOTE(review): StringsTest expects 0.7777 for "Neo"/"Leo", i.e. a similarity-style score despite the name
     }
 
     @UserFunction
diff --git a/src/test/java/apoc/text/PhoneticTest.java b/src/test/java/apoc/text/PhoneticTest.java
index a0ef9215d9..a0288c4168 100644
--- a/src/test/java/apoc/text/PhoneticTest.java
+++ b/src/test/java/apoc/text/PhoneticTest.java
@@ -75,4 +75,32 @@ public void shouldComputeSoundexDifference() {
                 assertThat(row.get("delta"), equalTo(4L))
         );
     }
+
+    @Test
+    public void shouldComputeDoubleMetaphone() {
+        testCall(db, "CALL apoc.text.doubleMetaphone('Apoc')", (row) ->
+                assertThat(row.get("value"), equalTo("APK"))
+        );
+    }
+
+    @Test
+    public void shouldComputeDoubleMetaphoneOfNull() {
+        testCall(db, "CALL apoc.text.doubleMetaphone(NULL)", (row) ->
+                assertThat(row.get("value"), equalTo(null))
+        );
+    }
+
+    @Test
+    public void shouldComputeDoubleMetaphoneForTheEmptyString() {
+        testCall(db, "CALL apoc.text.doubleMetaphone('')", (row) ->
+                assertThat(row.get("value"), equalTo(null))
+        );
+    }
+
+    @Test
+    public void shouldComputeDoubleMetaphoneOfManyWords() {
+        testCall(db, "CALL apoc.text.doubleMetaphone('Hello, dear User!')", (row) ->
+                assertThat(row.get("value"), equalTo("HLTRASR"))
+        );
+    }
 }
diff --git a/src/test/java/apoc/text/StringsTest.java b/src/test/java/apoc/text/StringsTest.java
index 67a0862e5d..9c89b18d08 100644
--- a/src/test/java/apoc/text/StringsTest.java
+++ b/src/test/java/apoc/text/StringsTest.java
@@ -190,15 +190,45 @@ public void testCompareCleanedInQuery() throws Exception {
     }
 
     @Test
-    public void testGetLevenshteinDistance() {
+    public void testLevenshteinDistance() {
         String text1 = "Levenshtein";
         String text2 = "Levenstein";
 
-        testCall(db, "RETURN apoc.text.distance({a}, {b}) as distance",
+        testCall(db, "RETURN apoc.text.distance({a}, {b}) AS distance",
                 map("a", text1, "b", text2),
                 row -> assertEquals(1L, row.get("distance")));
     }
 
+    @Test
+    public void testLevenshteinSimilarity() {
+        String text1 = "Levenshtein";
+        String text2 = "Levenstein";
+
+        testCall(db, "RETURN apoc.text.similarity({a}, {b}) AS similarity",
+                map("a", text1, "b", text2),
+                row -> assertEquals(0.9, (double)row.get("similarity"), 0.01));
+    }
+
+    @Test
+    public void testHammingDistance() {
+        String text1 = "Neo";
+        String text2 = "Leo";
+
+        testCall(db, "RETURN apoc.text.hammingDistance({a}, {b}) AS distance",
+                map("a", text1, "b", text2),
+                row -> assertEquals(1L, row.get("distance")));
+    }
+
+    @Test
+    public void testJaroWinklerDistance() {
+        String text1 = "Neo";
+        String text2 = "Leo";
+
+        testCall(db, "RETURN apoc.text.jaroWinklerDistance({a}, {b}) AS distance",
+                map("a", text1, "b", text2),
+                row -> assertEquals(0.7777, (double)row.get("distance"), 0.0001));
+    }
+
     @Test
     public void testFuzzyMatch() {
         Strings strings = new Strings();