Skip to content

Commit

Permalink
fixes neo4j-contrib#864: Adds text similarity/distance methods and double metaphone text encoding.
Browse files Browse the repository at this point in the history

Added Levenshtein Distance and Similarity code and test

Added Hamming Distance code and test

Added Jaro-Winkler Distance code and test

Added Double Metaphone text encoding and test

Added Apache commons-text dependency
  • Loading branch information
Nan Jiang authored and alexiudice committed Jul 19, 2018
1 parent aba1611 commit ffb5c65
Show file tree
Hide file tree
Showing 5 changed files with 165 additions and 1 deletion.
1 change: 1 addition & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ dependencies {
compile group: 'com.github.javafaker', name: 'javafaker', version: '0.10'

compile group: 'org.apache.commons', name: 'commons-math3', version: '3.6.1'
compile group: 'org.apache.commons', name: 'commons-text', version: '1.2'
jmh group: 'org.neo4j', name: 'neo4j-lucene-index', version: neo4jVersionEffective
jmh group: 'org.neo4j', name: 'neo4j-kernel', version: neo4jVersionEffective, classifier: "tests"

Expand Down
14 changes: 14 additions & 0 deletions src/main/java/apoc/text/Phonetic.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package apoc.text;

import org.apache.commons.codec.language.DoubleMetaphone;
import org.neo4j.procedure.Description;
import apoc.result.LongResult;
import apoc.result.StringResult;
Expand All @@ -18,6 +19,8 @@

public class Phonetic {

/** Single shared Double Metaphone encoder, used by {@code doubleMetaphone} to encode each word. */
private static final DoubleMetaphone DOUBLE_METAPHONE = new DoubleMetaphone();

@Procedure
@Description("apoc.text.phonetic(value) yield value - Compute the US_ENGLISH phonetic soundex encoding of all words of the text value which can be a single string or a list of strings")
public Stream<StringResult> phonetic(final @Name("value") Object value) {
Expand All @@ -38,6 +41,17 @@ public Stream<PhoneticResult> phoneticDelta(final @Name("text1") String text1, f
}
}

/**
 * Computes the Double Metaphone encoding of every word of the given text(s).
 *
 * <p>{@code value} may be a single value or an {@link Iterable} of values; each
 * element produces one result row. A {@code null} or empty element yields
 * {@code StringResult.EMPTY}. Otherwise the element's string form is trimmed,
 * split on runs of non-word characters, each word is encoded, and the codes are
 * concatenated without a separator.
 *
 * <p>Empty tokens and {@code null} codes are skipped. A text that starts with
 * punctuation (e.g. {@code "!Hello"}) makes {@code split} return an empty first
 * token, and a whitespace-only text trims to {@code ""}; the encoder returns
 * {@code null} for such input (per commons-codec), which plain string
 * concatenation would otherwise render as the literal text {@code "null"}.
 *
 * @param value a single text or an iterable of texts
 * @return one {@code StringResult} per input element
 */
@Procedure
@Description("apoc.text.doubleMetaphone(value) yield value - Compute the Double Metaphone phonetic encoding of all words of the text value which can be a single string or a list of strings")
public Stream<StringResult> doubleMetaphone(final @Name("value") Object value)
{
    Stream<?> texts = value instanceof Iterable
            ? StreamSupport.stream(((Iterable<?>) value).spliterator(), false)
            : Stream.of(value);

    return texts.map(text -> (text == null || text.toString().isEmpty())
            ? StringResult.EMPTY
            : new StringResult(Stream.of(text.toString().trim().split("\\W+"))
                    // Drop the empty token produced by leading separators or blank text.
                    .filter(word -> !word.isEmpty())
                    .map(DOUBLE_METAPHONE::doubleMetaphone)
                    // Defensive: never let a null code be concatenated as "null".
                    .filter(code -> code != null)
                    .reduce("", (joined, code) -> joined + code)));
}

public static class PhoneticResult {
public final String phonetic1, phonetic2;
public final long delta;
Expand Down
51 changes: 50 additions & 1 deletion src/main/java/apoc/text/Strings.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@
import java.util.stream.StreamSupport;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.similarity.HammingDistance;
import org.apache.commons.text.similarity.JaroWinklerDistance;
import org.apache.commons.text.similarity.LevenshteinDistance;

import static apoc.util.Util.quote;
import static java.lang.Math.toIntExact;
Expand All @@ -36,6 +39,10 @@
*/
public class Strings {

// Shared commons-text similarity/distance calculators, created once and reused by the functions below.
/** Backs {@code jaroWinklerDistance(lhs, rhs)}. */
private final static JaroWinklerDistance jaroWinklerDistance = new JaroWinklerDistance();
/** Backs {@code hammingDistance(lhs, rhs)}. */
private final static HammingDistance hammingDistance = new HammingDistance();
/** Backs {@code distance(text1, text2)}, and through it the Levenshtein functions. */
private final static LevenshteinDistance levenshteinDistance = new LevenshteinDistance();

@UserFunction
@Description("apoc.text.replace(text, regex, replacement) - replace each substring of the given string that matches the given regular expression with the given replacement.")
public String replace(final @Name("text") String text, final @Name("regex") String regex, final @Name("replacement") String replacement) {
Expand Down Expand Up @@ -131,7 +138,7 @@ public Long distance(final @Name("text1") String text1, @Name("text2")final Stri
if (text1 == null || text2 == null) {
return null;
}
return (long) StringUtils.getLevenshteinDistance(text1, text2);
return Integer.toUnsignedLong( levenshteinDistance.apply( text1, text2 ) );
}

@UserFunction
Expand Down Expand Up @@ -485,4 +492,46 @@ public String toCypher(@Name("value") Object value, @Name(value = "config",defau
}
return null;
}

/**
 * Returns the Levenshtein (edit) distance between two texts.
 *
 * <p>This is an alias that forwards to {@code distance(lhs, rhs)} and returns
 * whatever that function returns.
 *
 * @param lhs first text
 * @param rhs second text
 * @return the result of {@code distance(lhs, rhs)}
 */
@UserFunction
@Description( "apoc.text.levenshteinDistance(lhs, rhs) return the Levenshtein distance of two texts" )
public Long levenshteinDistance( @Name( "lhs" ) String lhs, @Name( "rhs" ) String rhs )
{
    final Long editDistance = distance( lhs, rhs );
    return editDistance;
}

/**
 * Returns a similarity score derived from the Levenshtein distance:
 * {@code 1 - distance / max(length(lhs), length(rhs))}.
 *
 * <p>Two empty texts are identical and score {@code 1.0}; without this special
 * case the formula would evaluate {@code 0.0 / 0} and return {@code NaN}.
 *
 * @param lhs first text
 * @param rhs second text
 * @return the similarity score, or {@code null} if either argument is {@code null}
 */
@UserFunction
@Description( "apoc.text.levenshteinSimilarity(lhs, rhs) return the similarity of two texts based on Levenshtein distance" )
public Double levenshteinSimilarity( @Name( "lhs" ) String lhs, @Name( "rhs" ) String rhs )
{
    if ( lhs == null || rhs == null )
    {
        return null;
    }

    int longestLength = Math.max( lhs.length(), rhs.length() );
    if ( longestLength == 0 )
    {
        // Both texts are empty: nothing to edit, so they are fully similar.
        return 1.0;
    }

    double editDistance = levenshteinDistance( lhs, rhs );
    return 1 - (editDistance / longestLength);
}

/**
 * Returns the Hamming distance between two texts, as computed by the shared
 * commons-text {@code HammingDistance} instance.
 *
 * <p>NOTE(review): commons-text documents that {@code HammingDistance.apply}
 * rejects inputs of different length with an exception — confirm whether
 * callers should be shielded from that.
 *
 * @param lhs first text
 * @param rhs second text
 * @return the distance, or {@code null} if either argument is {@code null}
 */
@UserFunction
@Description( "apoc.text.hammingDistance(lhs, rhs) return the Hamming distance of two texts" )
public Long hammingDistance( @Name( "lhs" ) String lhs, @Name( "rhs" ) String rhs )
{
    if ( lhs != null && rhs != null )
    {
        int differingPositions = hammingDistance.apply( lhs, rhs );
        return Integer.toUnsignedLong( differingPositions );
    }
    return null;
}

/**
 * Returns the Jaro-Winkler value for two texts, as computed by the shared
 * commons-text {@code JaroWinklerDistance} instance.
 *
 * @param lhs first text
 * @param rhs second text
 * @return the computed value, or {@code null} if either argument is {@code null}
 */
@UserFunction
@Description( "apoc.text.jaroWinklerDistance(lhs, rhs) return the Jaro-Winkler distance of two texts" )
public Double jaroWinklerDistance( @Name( "lhs" ) String lhs, @Name( "rhs" ) String rhs )
{
    final boolean missingInput = lhs == null || rhs == null;
    if ( missingInput )
    {
        return null;
    }
    final Double score = jaroWinklerDistance.apply( lhs, rhs );
    return score;
}
}
29 changes: 29 additions & 0 deletions src/test/java/apoc/text/PhoneticTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import static apoc.util.TestUtil.testCall;
import static org.hamcrest.CoreMatchers.equalTo;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertThat;

public class PhoneticTest {
Expand Down Expand Up @@ -75,4 +76,32 @@ public void shouldComputeSoundexDifference() {
assertThat(row.get("delta"), equalTo(4L))
);
}

/** A single word is encoded to its Double Metaphone code. */
@Test
public void shouldComputeDoubleMetaphone() {
    final String query = "CALL apoc.text.doubleMetaphone('Apoc') YIELD value RETURN value";
    testCall(db, query, (result) -> {
        final Object encoded = result.get("value");
        assertEquals("APK", encoded);
    });
}

/** A NULL input yields a row whose value is null. */
@Test
public void shouldComputeDoubleMetaphoneOfNull() {
    final String query = "CALL apoc.text.doubleMetaphone(NULL) YIELD value RETURN value";
    testCall(db, query, (result) -> {
        final Object encoded = result.get("value");
        assertEquals(null, encoded);
    });
}

/** The empty string yields a row whose value is null. */
@Test
public void shouldComputeDoubleMetaphoneForTheEmptyString() {
    final String query = "CALL apoc.text.doubleMetaphone('') YIELD value RETURN value";
    testCall(db, query, (result) -> {
        final Object encoded = result.get("value");
        assertThat(encoded, equalTo(null));
    });
}

/** Several words separated by punctuation are encoded and concatenated. */
@Test
public void shouldComputeDoubleMetaphoneOfManyWords() {
    final String query = "CALL apoc.text.doubleMetaphone('Hello, dear User!') YIELD value RETURN value";
    testCall(db, query, (result) -> {
        final Object encoded = result.get("value");
        assertThat(encoded, equalTo("HLTRASR"));
    });
}
}
71 changes: 71 additions & 0 deletions src/test/java/apoc/text/StringsTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -580,4 +580,75 @@ public void testToCypher() throws Exception {
testCall(db, "RETURN apoc.text.toCypher($v) AS value", map("v", false), (row) -> assertEquals("false", row.get("value")));
}
}


/** One substituted character gives a Levenshtein distance of 1. */
@Test
public void testLevenshteinDistance() {
    final String query = "RETURN apoc.text.levenshteinDistance('cat','cut') AS value";
    testCall(db, query, result -> {
        final Object distance = result.get("value");
        assertEquals(1L, distance);
    });
}

/** A null argument gives a null Levenshtein distance. */
@Test
public void testLevenshteinDistanceOfNull() {
    final String query = "RETURN apoc.text.levenshteinDistance(null,'cut') AS value";
    testCall(db, query, result -> {
        final Object distance = result.get("value");
        assertEquals(null, distance);
    });
}

/** Against the empty string, the distance equals the other text's length. */
@Test
public void testLevenshteinDistanceForTheEmptyString() {
    final String query = "RETURN apoc.text.levenshteinDistance('','cut') AS value";
    testCall(db, query, result -> {
        final Object distance = result.get("value");
        assertEquals(3L, distance);
    });
}

/** 'cold' vs 'cool' is expected to score a similarity of 0.5. */
@Test
public void testLevenshteinSimilarity() {
    final String query = "RETURN apoc.text.levenshteinSimilarity('cold','cool') AS value";
    testCall(db, query, result -> {
        final Object similarity = result.get("value");
        assertEquals(0.5, similarity);
    });
}

/** A null argument gives a null similarity. */
@Test
public void testLevenshteinSimilarityOfNull() {
    final String query = "RETURN apoc.text.levenshteinSimilarity(null,'cool') AS value";
    testCall(db, query, result -> {
        final Object similarity = result.get("value");
        assertEquals(null, similarity);
    });
}

/** Equal-length texts differing in one position have Hamming distance 1. */
@Test
public void testHammingDistance() {
    final String query = "RETURN apoc.text.hammingDistance('cat','cut') AS value";
    testCall(db, query, result -> {
        final Object distance = result.get("value");
        assertEquals(1L, distance);
    });
}

/** A null argument gives a null Hamming distance. */
@Test
public void testHammingDistanceOfNull() {
    final String query = "RETURN apoc.text.hammingDistance(null,'cut') AS value";
    testCall(db, query, result -> {
        final Object distance = result.get("value");
        assertEquals(null, distance);
    });
}

/**
 * Similar names are expected to score about 0.84444.
 *
 * <p>The value is compared numerically with a tolerance rather than through
 * {@code String.format("%.5f", ...)}: that call formats with the default
 * locale, so under a locale whose decimal separator is a comma it produces
 * "0,84444" and the string comparison fails.
 */
@Test
public void testHighJaroWinklerDistanceForSimilarNames() {
    testCall(db, "RETURN apoc.text.jaroWinklerDistance('Sherry','Shelly') AS value",
            row -> assertEquals(0.84444, (double) row.get("value"), 0.00001)
    );
}

/**
 * Dissimilar names are expected to score about 0.43651.
 *
 * <p>The value is compared numerically with a tolerance rather than through
 * {@code String.format("%.5f", ...)}: that call formats with the default
 * locale, so under a locale whose decimal separator is a comma it produces
 * "0,43651" and the string comparison fails.
 */
@Test
public void testLowJaroWinklerDistanceForNotSimilarNames() {
    testCall(db, "RETURN apoc.text.jaroWinklerDistance('Douglas','Shelly') AS value",
            row -> assertEquals(0.43651, (double) row.get("value"), 0.00001)
    );
}

/** A null argument gives a null Jaro-Winkler value. */
@Test
public void testJaroWinklerDistanceOfNull() {
    final String query = "RETURN apoc.text.jaroWinklerDistance(null,'Shelly') AS value";
    testCall(db, query, result -> {
        final Object score = result.get("value");
        assertEquals(null, score);
    });
}
}

0 comments on commit ffb5c65

Please sign in to comment.