Skip to content

Commit

Permalink
fixes neo4j-contrib#864: Adds text similarity/distance methods and double metaphone text encoding.
Browse files Browse the repository at this point in the history

Added Levenshtein Distance and Similarity code and test

Added Hamming Distance code and test

Added Jaro-Winkler Distance code and test

Added Double Metaphone text encoding and test

Added Apache commons-text dependency
  • Loading branch information
Nan Jiang authored and alexiudice committed Jul 19, 2018
1 parent 2e1b263 commit 9ac9de3
Show file tree
Hide file tree
Showing 4 changed files with 126 additions and 56 deletions.
51 changes: 0 additions & 51 deletions src/main/java/apoc/algo/Similarity.java
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
package apoc.algo;

import org.apache.commons.text.similarity.HammingDistance;
import org.apache.commons.text.similarity.JaroWinklerDistance;
import org.apache.commons.text.similarity.LevenshteinDistance;
import org.neo4j.graphdb.GraphDatabaseService;
import org.neo4j.procedure.Context;
import org.neo4j.procedure.Description;
Expand Down Expand Up @@ -67,52 +64,4 @@ public double euclideanDistance(@Name("vector1") List<Number> vector1, @Name("ve
/**
 * Converts the Euclidean distance between two vectors into a similarity score
 * computed as {@code 1 / (1 + distance)}.
 *
 * @param vector1 first vector, passed unchanged to {@code euclideanDistance}
 * @param vector2 second vector, passed unchanged to {@code euclideanDistance}
 * @return {@code 1.0 / (1 + euclideanDistance(vector1, vector2))}
 */
public double euclideanSimilarity(@Name("vector1") List<Number> vector1, @Name("vector2") List<Number> vector2) {
    final double distance = euclideanDistance(vector1, vector2);
    return 1.0d / (1 + distance);
}

/**
 * Computes the Levenshtein (edit) distance between two texts.
 *
 * @param lhs first text, may be {@code null}
 * @param rhs second text, may be {@code null}
 * @return the edit distance as a double, or {@code 0.0} when either argument is {@code null}
 */
@UserFunction
@Description("apoc.algo.levenshteinDistance(lhs, rhs) return the Levenshtein distance of two texts")
public double levenshteinDistance( @Name("lhs") String lhs, @Name("rhs") String rhs ) {
    if ( lhs != null && rhs != null )
    {
        // The Integer result is unboxed and widened to double by the return.
        return new LevenshteinDistance().apply( lhs, rhs );
    }

    // A missing operand is reported as distance 0.0 rather than as an error.
    return 0.0;
}

/**
 * Computes a similarity score from the Levenshtein distance of two texts:
 * {@code 1 - distance / max(lhs.length(), rhs.length())}.
 *
 * @param lhs first text, may be {@code null}
 * @param rhs second text, may be {@code null}
 * @return {@code 0.0} when either argument is {@code null}, {@code 1.0} when both
 *         texts are empty, otherwise the normalized similarity
 */
@UserFunction
@Description("apoc.algo.levenshteinSimilarity(lhs, rhs) return the similarity of two texts based on Levenshtein distance")
public double levenshteinSimilarity( @Name("lhs") String lhs, @Name("rhs") String rhs ) {
    if ( lhs == null || rhs == null )
    {
        return 0.0;
    }

    int longestLength = Math.max( lhs.length(), rhs.length() );
    if ( longestLength == 0 )
    {
        // Both texts are empty, hence identical. Without this guard the
        // normalization below would evaluate 0.0 / 0 and return NaN.
        return 1.0;
    }

    double editDistance = levenshteinDistance( lhs, rhs );
    return 1 - (editDistance / longestLength);
}

/**
 * Computes the Hamming distance between two texts using Commons Text's
 * {@code HammingDistance}.
 *
 * NOTE(review): Commons Text documents HammingDistance as requiring inputs of
 * equal length; confirm how a length mismatch should surface to callers.
 *
 * @param lhs first text, may be {@code null}
 * @param rhs second text, may be {@code null}
 * @return the distance as a double, or {@code 0.0} when either argument is {@code null}
 */
@UserFunction
@Description("apoc.algo.hammingDistance(lhs, rhs) return the Hamming distance of two texts")
public double hammingDistance( @Name("lhs") String lhs, @Name("rhs") String rhs ) {
    final boolean bothPresent = lhs != null && rhs != null;
    if ( !bothPresent )
    {
        return 0.0;
    }

    // The Integer result is unboxed and widened to double by the return.
    return new HammingDistance().apply( lhs, rhs );
}

/**
 * Computes the Jaro-Winkler score of two texts using Commons Text's
 * {@code JaroWinklerDistance}.
 *
 * @param lhs first text, may be {@code null}
 * @param rhs second text, may be {@code null}
 * @return the score returned by {@code JaroWinklerDistance.apply}, or {@code 0.0}
 *         when either argument is {@code null}
 */
@UserFunction
@Description("apoc.algo.jaroWinklerDistance(lhs, rhs) return the Jaro-Winkler distance of two texts")
public double jaroWinklerDistance( @Name("lhs") String lhs, @Name("rhs") String rhs ) {
    if ( lhs != null && rhs != null )
    {
        return new JaroWinklerDistance().apply( lhs, rhs );
    }

    return 0.0;
}
}
10 changes: 6 additions & 4 deletions src/main/java/apoc/text/Phonetic.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
package apoc.text;

import org.apache.commons.codec.language.DoubleMetaphone;
import org.neo4j.procedure.Description;
import apoc.result.LongResult;
import apoc.result.StringResult;
import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.language.DoubleMetaphone;
Expand Down Expand Up @@ -49,12 +52,11 @@ public Stream<StringResult> doubleMetaphone(final @Name("value") Object value)
Stream<Object> stream = value instanceof Iterable ? StreamSupport.stream(((Iterable) value).spliterator(), false) : Stream.of(value);

return stream.map(str -> (str == null || str.toString().isEmpty()) ? StringResult.EMPTY :
new StringResult(Stream.of(str.toString().trim().split("\\W+"))
.map(DOUBLE_METAPHONE::doubleMetaphone).reduce("", (a, s) -> a + s)));
new StringResult(Stream.of(str.toString().trim().split("\\W+"))
.map(DOUBLE_METAPHONE::doubleMetaphone).reduce("", (a, s) -> a + s)));
}

public static class PhoneticResult
{
public static class PhoneticResult {
public final String phonetic1, phonetic2;
public final long delta;

Expand Down
51 changes: 50 additions & 1 deletion src/main/java/apoc/text/Strings.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@
import java.util.stream.Stream;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.similarity.HammingDistance;
import org.apache.commons.text.similarity.JaroWinklerDistance;
import org.apache.commons.text.similarity.LevenshteinDistance;

import static java.lang.Math.toIntExact;
import static java.util.Arrays.asList;
Expand All @@ -27,6 +30,10 @@
*/
public class Strings {

// Shared Apache Commons Text scorer instances, created once when the class is loaded and
// reused by the text-distance functions of this class instead of allocating one per call.
// NOTE(review): sharing across calls assumes these scorers are stateless/thread-safe; confirm
// against the Commons Text documentation.
private final static JaroWinklerDistance jaroWinklerDistance = new JaroWinklerDistance();
private final static HammingDistance hammingDistance = new HammingDistance();
private final static LevenshteinDistance levenshteinDistance = new LevenshteinDistance();

@UserFunction
@Description("apoc.text.replace(text, regex, replacement) - replace each substring of the given string that matches the given regular expression with the given replacement.")
public String replace(final @Name("text") String text, final @Name("regex") String regex, final @Name("replacement") String replacement) {
Expand Down Expand Up @@ -105,7 +112,7 @@ public Long distance(final @Name("text1") String text1, @Name("text2")final Stri
if (text1 == null || text2 == null) {
return null;
}
return (long) StringUtils.getLevenshteinDistance(text1, text2);
return Integer.toUnsignedLong( levenshteinDistance.apply( text1, text2 ) );
}

@UserFunction
Expand Down Expand Up @@ -372,4 +379,46 @@ public String base64Decode(@Name("text") String text) {
byte[] decoded = Base64.getDecoder().decode(text.getBytes());
return new String(decoded);
}

/**
 * Alias for {@code distance(lhs, rhs)} under the explicit name
 * {@code apoc.text.levenshteinDistance}.
 *
 * @param lhs first text
 * @param rhs second text
 * @return whatever {@code distance} returns for the same arguments
 */
@UserFunction
@Description( "apoc.text.levenshteinDistance(lhs, rhs) return the Levenshtein distance of two texts" )
public Long levenshteinDistance( @Name( "lhs" ) String lhs, @Name( "rhs" ) String rhs )
{
    // Pure delegation: null handling and the computation itself live in distance().
    final Long editDistance = distance( lhs, rhs );
    return editDistance;
}

/**
 * Computes a similarity score from the Levenshtein distance of two texts:
 * {@code 1 - distance / max(lhs.length(), rhs.length())}.
 *
 * @param lhs first text, may be {@code null}
 * @param rhs second text, may be {@code null}
 * @return {@code null} when either argument is {@code null}, {@code 1.0} when both
 *         texts are empty, otherwise the normalized similarity
 */
@UserFunction
@Description( "apoc.text.levenshteinSimilarity(lhs, rhs) return the similarity of two texts based on Levenshtein distance" )
public Double levenshteinSimilarity( @Name( "lhs" ) String lhs, @Name( "rhs" ) String rhs )
{
    if ( lhs == null || rhs == null )
    {
        return null;
    }

    int longestLength = Math.max( lhs.length(), rhs.length() );
    if ( longestLength == 0 )
    {
        // Both texts are empty, hence identical. Without this guard the
        // normalization below would evaluate 0.0 / 0 and return NaN.
        return 1.0;
    }

    // levenshteinDistance returns a Long; it is unboxed and widened to double here.
    double editDistance = levenshteinDistance( lhs, rhs );
    return 1 - (editDistance / longestLength);
}

/**
 * Computes the Hamming distance between two texts using the shared
 * {@code HammingDistance} scorer.
 *
 * NOTE(review): Commons Text documents HammingDistance as requiring inputs of
 * equal length; confirm how a length mismatch should surface to callers.
 *
 * @param lhs first text, may be {@code null}
 * @param rhs second text, may be {@code null}
 * @return the distance, or {@code null} when either argument is {@code null}
 */
@UserFunction
@Description( "apoc.text.hammingDistance(lhs, rhs) return the Hamming distance of two texts" )
public Long hammingDistance( @Name( "lhs" ) String lhs, @Name( "rhs" ) String rhs )
{
    if ( lhs != null && rhs != null )
    {
        final int mismatches = hammingDistance.apply( lhs, rhs );
        return Integer.toUnsignedLong( mismatches );
    }
    return null;
}

/**
 * Computes the Jaro-Winkler score of two texts using the shared
 * {@code JaroWinklerDistance} scorer.
 *
 * @param lhs first text, may be {@code null}
 * @param rhs second text, may be {@code null}
 * @return the score returned by the scorer, or {@code null} when either
 *         argument is {@code null}
 */
@UserFunction
@Description( "apoc.text.jaroWinklerDistance(lhs, rhs) return the Jaro-Winkler distance of two texts" )
public Double jaroWinklerDistance( @Name( "lhs" ) String lhs, @Name( "rhs" ) String rhs )
{
    if ( lhs != null && rhs != null )
    {
        return jaroWinklerDistance.apply( lhs, rhs );
    }
    return null;
}
}
70 changes: 70 additions & 0 deletions src/test/java/apoc/text/StringsTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -489,4 +489,74 @@ public void testSorensenDiceSimilarityWithTurkishLocale() {
map("text1",text1,"text2", text2, "languageTag", languageTag),
row -> assertEquals(0.5, row.get("value")));
}

@Test
public void testLevenshteinDistance() {
    // 'cat' and 'cut' differ by a single substitution.
    final String query = "RETURN apoc.text.levenshteinDistance('cat','cut') AS value";
    testCall(db, query, row -> assertEquals(1L, row.get("value")));
}

@Test
public void testLevenshteinDistanceOfNull() {
    // A null operand must yield null rather than a number.
    final String query = "RETURN apoc.text.levenshteinDistance(null,'cut') AS value";
    testCall(db, query, row -> assertEquals(null, row.get("value")));
}

@Test
public void testLevenshteinDistanceForTheEmptyString() {
    // From the empty string, every character of 'cut' is an insertion.
    final String query = "RETURN apoc.text.levenshteinDistance('','cut') AS value";
    testCall(db, query, row -> assertEquals(3L, row.get("value")));
}

@Test
public void testLevenshteinSimilarity() {
    // Expected value: 1 - 2 / 4, i.e. distance 2 over the longer length 4.
    final String query = "RETURN apoc.text.levenshteinSimilarity('cold','cool') AS value";
    testCall(db, query, row -> assertEquals(0.5, row.get("value")));
}

@Test
public void testLevenshteinSimilarityOfNull() {
    // A null operand must yield null rather than a number.
    final String query = "RETURN apoc.text.levenshteinSimilarity(null,'cool') AS value";
    testCall(db, query, row -> assertEquals(null, row.get("value")));
}

@Test
public void testHammingDistance() {
    // Equal-length inputs that differ in exactly one position.
    final String query = "RETURN apoc.text.hammingDistance('cat','cut') AS value";
    testCall(db, query, row -> assertEquals(1L, row.get("value")));
}

@Test
public void testHammingDistanceOfNull() {
    // A null operand must yield null rather than a number.
    final String query = "RETURN apoc.text.hammingDistance(null,'cut') AS value";
    testCall(db, query, row -> assertEquals(null, row.get("value")));
}

@Test
public void testHighJaroWinklerDistanceForSimilarNames() {
    // Format with a fixed locale: the default-locale overload prints "0,84444" on JVMs whose
    // default locale uses a decimal comma, which would fail this comparison for reasons
    // unrelated to the function under test.
    testCall(db, "RETURN apoc.text.jaroWinklerDistance('Sherry','Shelly') AS value",
            row -> assertEquals("0.84444", String.format( java.util.Locale.ROOT, "%.5f", (double)row.get("value")))
    );
}

@Test
public void testLowJaroWinklerDistanceForNotSimilarNames() {
    // Format with a fixed locale: the default-locale overload prints "0,43651" on JVMs whose
    // default locale uses a decimal comma, which would fail this comparison for reasons
    // unrelated to the function under test.
    testCall(db, "RETURN apoc.text.jaroWinklerDistance('Douglas','Shelly') AS value",
            row -> assertEquals("0.43651", String.format( java.util.Locale.ROOT, "%.5f", (double)row.get("value")))
    );
}

@Test
public void testJaroWinklerDistanceOfNull() {
    // A null operand must yield null rather than a number.
    final String query = "RETURN apoc.text.jaroWinklerDistance(null,'Shelly') AS value";
    testCall(db, query, row -> assertEquals(null, row.get("value")));
}
}

0 comments on commit 9ac9de3

Please sign in to comment.