From ffb5c65cfba068da720be6f9dbd92de90de7b8eb Mon Sep 17 00:00:00 2001 From: Nan Jiang Date: Fri, 5 Jan 2018 10:28:46 -0500 Subject: [PATCH] fixes #864: Adds text similarity/distance methods and double metaphone text encoding. Added Levenshtein Distance and Similarity code and test Added Hamming Distance code and test Added Jaro-Winkler Distance code and test Added Double Metaphone text encoding and test Added Apache commons-text dependency --- build.gradle | 1 + src/main/java/apoc/text/Phonetic.java | 14 +++++ src/main/java/apoc/text/Strings.java | 51 +++++++++++++++- src/test/java/apoc/text/PhoneticTest.java | 29 +++++++++ src/test/java/apoc/text/StringsTest.java | 71 +++++++++++++++++++++++ 5 files changed, 165 insertions(+), 1 deletion(-) diff --git a/build.gradle b/build.gradle index 0275eb0a66..ebcaf1b559 100644 --- a/build.gradle +++ b/build.gradle @@ -127,6 +127,7 @@ dependencies { compile group: 'com.github.javafaker', name: 'javafaker', version: '0.10' compile group: 'org.apache.commons', name: 'commons-math3', version: '3.6.1' + compile group: 'org.apache.commons', name: 'commons-text', version: '1.2' jmh group: 'org.neo4j', name: 'neo4j-lucene-index', version: neo4jVersionEffective jmh group: 'org.neo4j', name: 'neo4j-kernel', version: neo4jVersionEffective, classifier: "tests" diff --git a/src/main/java/apoc/text/Phonetic.java b/src/main/java/apoc/text/Phonetic.java index 00e7be7d89..62d0683558 100644 --- a/src/main/java/apoc/text/Phonetic.java +++ b/src/main/java/apoc/text/Phonetic.java @@ -1,5 +1,6 @@ package apoc.text; +import org.apache.commons.codec.language.DoubleMetaphone; import org.neo4j.procedure.Description; import apoc.result.LongResult; import apoc.result.StringResult; @@ -18,6 +19,8 @@ public class Phonetic { + private static final DoubleMetaphone DOUBLE_METAPHONE = new DoubleMetaphone(); + @Procedure @Description("apoc.text.phonetic(value) yield value - Compute the US_ENGLISH phonetic soundex encoding of all words of the text value which can be a single string or a list of strings") public Stream phonetic(final @Name("value") Object value) { @@ -38,6 +41,17 @@ public Stream phoneticDelta(final @Name("text1") String text1, f } } + @Procedure + @Description("apoc.text.doubleMetaphone(value) yield value - Compute the Double Metaphone phonetic encoding of all words of the text value which can be a single string or a list of strings") + public Stream doubleMetaphone(final @Name("value") Object value) + { + Stream stream = value instanceof Iterable ? StreamSupport.stream(((Iterable) value).spliterator(), false) : Stream.of(value); + + return stream.map(str -> (str == null || str.toString().isEmpty()) ? StringResult.EMPTY : + new StringResult(Stream.of(str.toString().trim().split("\\W+")) + .map(DOUBLE_METAPHONE::doubleMetaphone).reduce("", (a, s) -> a + s))); + } + public static class PhoneticResult { public final String phonetic1, phonetic2; public final long delta; diff --git a/src/main/java/apoc/text/Strings.java b/src/main/java/apoc/text/Strings.java index d7e82470ed..e2a8c0172a 100644 --- a/src/main/java/apoc/text/Strings.java +++ b/src/main/java/apoc/text/Strings.java @@ -25,6 +25,9 @@ import java.util.stream.StreamSupport; import org.apache.commons.lang3.StringUtils; +import org.apache.commons.text.similarity.HammingDistance; +import org.apache.commons.text.similarity.JaroWinklerDistance; +import org.apache.commons.text.similarity.LevenshteinDistance; import static apoc.util.Util.quote; import static java.lang.Math.toIntExact; @@ -36,6 +39,10 @@ */ public class Strings { + private final static JaroWinklerDistance jaroWinklerDistance = new JaroWinklerDistance(); + private final static HammingDistance hammingDistance = new HammingDistance(); + private final static LevenshteinDistance levenshteinDistance = new LevenshteinDistance(); + @UserFunction @Description("apoc.text.replace(text, regex, replacement) - replace each substring of the given string that matches the given regular expression with the given replacement.") public String replace(final @Name("text") String text, final @Name("regex") String regex, final @Name("replacement") String replacement) { @@ -131,7 +138,7 @@ public Long distance(final @Name("text1") String text1, @Name("text2")final Stri if (text1 == null || text2 == null) { return null; } - return (long) StringUtils.getLevenshteinDistance(text1, text2); + return Integer.toUnsignedLong( levenshteinDistance.apply( text1, text2 ) ); } @UserFunction @@ -485,4 +492,46 @@ public String toCypher(@Name("value") Object value, @Name(value = "config",defau } return null; } + + @UserFunction + @Description( "apoc.text.levenshteinDistance(lhs, rhs) return the Levenshtein distance of two texts" ) + public Long levenshteinDistance( @Name( "lhs" ) String lhs, @Name( "rhs" ) String rhs ) + { + return distance( lhs, rhs ); + } + + @UserFunction + @Description( "apoc.text.levenshteinSimilarity(lhs, rhs) return the similarity of two texts based on Levenshtein distance" ) + public Double levenshteinSimilarity( @Name( "lhs" ) String lhs, @Name( "rhs" ) String rhs ) + { + if ( lhs == null || rhs == null ) + { + return null; + } + + double editDistance = levenshteinDistance( lhs, rhs ); + return 1 - (editDistance / Math.max( lhs.length(), rhs.length() )); + } + + @UserFunction + @Description( "apoc.text.hammingDistance(lhs, rhs) return the Hamming distance of two texts" ) + public Long hammingDistance( @Name( "lhs" ) String lhs, @Name( "rhs" ) String rhs ) + { + if ( lhs == null || rhs == null ) + { + return null; + } + return Integer.toUnsignedLong( hammingDistance.apply( lhs, rhs ) ); + } + + @UserFunction + @Description( "apoc.text.jaroWinklerDistance(lhs, rhs) return the Jaro-Winkler distance of two texts" ) + public Double jaroWinklerDistance( @Name( "lhs" ) String lhs, @Name( "rhs" ) String rhs ) + { + if ( lhs == null || rhs == null ) + { + return null; + } + return jaroWinklerDistance.apply( lhs, rhs ); + } } diff --git a/src/test/java/apoc/text/PhoneticTest.java b/src/test/java/apoc/text/PhoneticTest.java index a0ef9215d9..48ce52ed6b 100644 --- a/src/test/java/apoc/text/PhoneticTest.java +++ b/src/test/java/apoc/text/PhoneticTest.java @@ -9,6 +9,7 @@ import static apoc.util.TestUtil.testCall; import static org.hamcrest.CoreMatchers.equalTo; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertThat; public class PhoneticTest { @@ -75,4 +76,32 @@ public void shouldComputeSoundexDifference() { assertThat(row.get("delta"), equalTo(4L)) ); } + + @Test + public void shouldComputeDoubleMetaphone() { + testCall(db, "CALL apoc.text.doubleMetaphone('Apoc') YIELD value RETURN value", + (row) -> assertEquals("APK", row.get("value")) + ); + } + + @Test + public void shouldComputeDoubleMetaphoneOfNull() { + testCall(db, "CALL apoc.text.doubleMetaphone(NULL) YIELD value RETURN value", + (row) -> assertEquals(null, row.get("value")) + ); + } + + @Test + public void shouldComputeDoubleMetaphoneForTheEmptyString() { + testCall(db, "CALL apoc.text.doubleMetaphone('') YIELD value RETURN value", + (row) -> assertThat(row.get("value"), equalTo(null)) + ); + } + + @Test + public void shouldComputeDoubleMetaphoneOfManyWords() { + testCall(db, "CALL apoc.text.doubleMetaphone('Hello, dear User!') YIELD value RETURN value", (row) -> + assertThat(row.get("value"), equalTo("HLTRASR")) + ); + } } diff --git a/src/test/java/apoc/text/StringsTest.java b/src/test/java/apoc/text/StringsTest.java index 67a0862e5d..e0b49087ac 100644 --- a/src/test/java/apoc/text/StringsTest.java +++ b/src/test/java/apoc/text/StringsTest.java @@ -580,4 +580,75 @@ public void testToCypher() throws Exception { testCall(db, "RETURN apoc.text.toCypher($v) AS value", map("v", false), (row) -> assertEquals("false", row.get("value"))); } } + + + @Test + public void testLevenshteinDistance() { + testCall(db, "RETURN apoc.text.levenshteinDistance('cat','cut') AS value", + row -> assertEquals(1L, row.get("value")) + ); + } + + @Test + public void testLevenshteinDistanceOfNull() { + testCall(db, "RETURN apoc.text.levenshteinDistance(null,'cut') AS value", + row -> assertEquals(null, row.get("value")) + ); + } + + @Test + public void testLevenshteinDistanceForTheEmptyString() { + testCall(db, "RETURN apoc.text.levenshteinDistance('','cut') AS value", + row -> assertEquals(3L, row.get("value")) + ); + } + + @Test + public void testLevenshteinSimilarity() { + testCall(db, "RETURN apoc.text.levenshteinSimilarity('cold','cool') AS value", + row -> assertEquals(0.5, row.get("value")) + ); + } + + @Test + public void testLevenshteinSimilarityOfNull() { + testCall(db, "RETURN apoc.text.levenshteinSimilarity(null,'cool') AS value", + row -> assertEquals(null, row.get("value")) + ); + } + + @Test + public void testHammingDistance() { + testCall(db, "RETURN apoc.text.hammingDistance('cat','cut') AS value", + row -> assertEquals(1L, row.get("value")) + ); + } + + @Test + public void testHammingDistanceOfNull() { + testCall(db, "RETURN apoc.text.hammingDistance(null,'cut') AS value", + row -> assertEquals(null, row.get("value")) + ); + } + + @Test + public void testHighJaroWinklerDistanceForSimilarNames() { + testCall(db, "RETURN apoc.text.jaroWinklerDistance('Sherry','Shelly') AS value", + row -> assertEquals("0.84444", String.format( "%.5f", (double)row.get("value"))) + ); + } + + @Test + public void testLowJaroWinklerDistanceForNotSimilarNames() { + testCall(db, "RETURN apoc.text.jaroWinklerDistance('Douglas','Shelly') AS value", + row -> assertEquals("0.43651", String.format( "%.5f", (double)row.get("value"))) + ); + } + + @Test + public void testJaroWinklerDistanceOfNull() { + testCall(db, "RETURN apoc.text.jaroWinklerDistance(null,'Shelly') AS value", + row -> assertEquals(null, row.get("value")) + ); + } }