Skip to content

Commit

Permalink
Fixes neo4j-contrib#864: Adds text similarity/distance methods and do…
Browse files Browse the repository at this point in the history
…uble metaphone text encoding.

- Added Apache commons-text dependency

- Added Levenshtein Similarity code and test

- Added Hamming Distance code and test

- Added Jaro-Winkler Distance code and test

- Added Double Metaphone text encoding and test
  • Loading branch information
nammmm committed Jul 19, 2018
1 parent aba1611 commit e0cea6e
Show file tree
Hide file tree
Showing 6 changed files with 158 additions and 26 deletions.
1 change: 1 addition & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ dependencies {
compile group: 'com.github.javafaker', name: 'javafaker', version: '0.10'

compile group: 'org.apache.commons', name: 'commons-math3', version: '3.6.1'
compile group: 'org.apache.commons', name: 'commons-text', version: '1.2'
jmh group: 'org.neo4j', name: 'neo4j-lucene-index', version: neo4jVersionEffective
jmh group: 'org.neo4j', name: 'neo4j-kernel', version: neo4jVersionEffective, classifier: "tests"

Expand Down
13 changes: 10 additions & 3 deletions src/main/java/apoc/index/FulltextIndex.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,22 @@
import apoc.meta.Meta;
import apoc.result.WeightedNodeResult;
import apoc.result.WeightedRelationshipResult;
import apoc.util.Util;
import org.neo4j.graphdb.*;
import org.neo4j.graphdb.GraphDatabaseService;
import org.neo4j.graphdb.Label;
import org.neo4j.graphdb.Node;
import org.neo4j.graphdb.PropertyContainer;
import org.neo4j.graphdb.Relationship;
import org.neo4j.graphdb.index.Index;
import org.neo4j.graphdb.index.IndexHits;
import org.neo4j.graphdb.index.IndexManager;
import org.neo4j.graphdb.index.RelationshipIndex;
import org.neo4j.index.impl.lucene.explicit.LuceneIndexImplementation;
import org.neo4j.logging.Log;
import org.neo4j.procedure.*;
import org.neo4j.procedure.Context;
import org.neo4j.procedure.Description;
import org.neo4j.procedure.Mode;
import org.neo4j.procedure.Name;
import org.neo4j.procedure.Procedure;

import java.util.ArrayList;
import java.util.Collection;
Expand Down
21 changes: 15 additions & 6 deletions src/main/java/apoc/text/Phonetic.java
Original file line number Diff line number Diff line change
@@ -1,23 +1,21 @@
package apoc.text;

import org.neo4j.procedure.Description;
import apoc.result.LongResult;
import apoc.result.StringResult;
import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.language.Soundex;
import org.apache.commons.codec.language.DoubleMetaphone;
import org.neo4j.procedure.Description;
import org.neo4j.procedure.Name;
import org.neo4j.procedure.Procedure;

import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

import static org.apache.commons.codec.language.Soundex.US_ENGLISH;

public class Phonetic {

// Shared encoder instance; doubleMetaphone(...) uses it to encode each word of its input.
private static final DoubleMetaphone DOUBLE_METAPHONE = new DoubleMetaphone();

@Procedure
@Description("apoc.text.phonetic(value) yield value - Compute the US_ENGLISH phonetic soundex encoding of all words of the text value which can be a single string or a list of strings")
public Stream<StringResult> phonetic(final @Name("value") Object value) {
Expand All @@ -38,6 +36,17 @@ public Stream<PhoneticResult> phoneticDelta(final @Name("text1") String text1, f
}
}

/**
 * Encodes every word of {@code value} with Double Metaphone and concatenates the codes.
 * <p>
 * {@code value} may be a single object or an {@link Iterable}; one result row is produced per
 * element. A {@code null} or empty element yields {@link StringResult#EMPTY}. Otherwise the
 * element's string form is trimmed, split on runs of non-word characters, and each word is
 * encoded in order.
 * <p>
 * Empty tokens (which {@code split} produces when the text starts with a non-word character or
 * consists of whitespace only) and words for which the encoder returns no code are skipped, so
 * the literal text "null" can never be concatenated into the result.
 *
 * @param value a single value or an iterable of values to encode
 * @return a stream with one {@link StringResult} per input element
 */
@Procedure
@Description("apoc.text.doubleMetaphone(value) yield value - Compute the Double Metaphone phonetic encoding of all words of the text value which can be a single string or a list of strings")
public Stream<StringResult> doubleMetaphone(final @Name("value") Object value)
{
    final Stream<?> elements = value instanceof Iterable
            ? StreamSupport.stream(((Iterable<?>) value).spliterator(), false)
            : Stream.of(value);

    return elements.map(element -> {
        if (element == null) {
            return StringResult.EMPTY;
        }
        final String text = element.toString();
        if (text.isEmpty()) {
            return StringResult.EMPTY;
        }
        final String encoded = Stream.of(text.trim().split("\\W+"))
                // split() keeps an empty leading token, e.g. for "!Hello" or "   "
                .filter(word -> !word.isEmpty())
                .map(DOUBLE_METAPHONE::doubleMetaphone)
                // the encoder answers null when it has nothing to encode
                .filter(code -> code != null)
                .reduce("", (acc, code) -> acc + code);
        return new StringResult(encoded);
    });
}

public static class PhoneticResult {
public final String phonetic1, phonetic2;
public final long delta;
Expand Down
63 changes: 54 additions & 9 deletions src/main/java/apoc/text/Strings.java
Original file line number Diff line number Diff line change
@@ -1,31 +1,39 @@
package apoc.text;

import apoc.util.Util;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.similarity.HammingDistance;
import org.apache.commons.text.similarity.JaroWinklerDistance;
import org.apache.commons.text.similarity.LevenshteinDistance;
import org.neo4j.graphdb.Node;
import org.neo4j.graphdb.Relationship;
import org.neo4j.helpers.collection.Pair;
import org.neo4j.procedure.Description;
import apoc.result.StringResult;
import org.neo4j.procedure.Name;
import org.neo4j.procedure.Procedure;
import org.neo4j.procedure.UserFunction;

import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.text.Normalizer;
import java.util.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Base64;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.ThreadLocalRandom;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

import org.apache.commons.lang3.StringUtils;

import static apoc.util.Util.quote;
import static java.lang.Math.toIntExact;
import static java.util.Arrays.asList;
Expand All @@ -36,6 +44,10 @@
*/
public class Strings {

// Shared commons-text metric instances, reused by the distance/similarity user functions of this class.
// Used by hammingDistance(text1, text2).
private final static HammingDistance hammingDistance = new HammingDistance();
// Used by jaroWinklerDistance(text1, text2).
private final static JaroWinklerDistance jaroWinklerDistance = new JaroWinklerDistance();
// Used by distance(text1, text2).
private final static LevenshteinDistance levenshteinDistance = new LevenshteinDistance();

@UserFunction
@Description("apoc.text.replace(text, regex, replacement) - replace each substring of the given string that matches the given regular expression with the given replacement.")
public String replace(final @Name("text") String text, final @Name("regex") String regex, final @Name("replacement") String replacement) {
Expand Down Expand Up @@ -126,12 +138,45 @@ public boolean compareCleaned(final @Name("text1") String text1, final @Name("te
}

@UserFunction
@Description("apoc.text.distance(text1, text2) - compare the given strings with the StringUtils.distance(text1, text2) method")
@Description("apoc.text.distance(text1, text2) - compare the given strings with the Levenshtein distance algorithm.")
public Long distance(final @Name("text1") String text1, @Name("text2")final String text2) {
if (text1 == null || text2 == null) {
return null;
}
return (long) StringUtils.getLevenshteinDistance(text1, text2);
return (long)levenshteinDistance.apply(text1, text2);
}

/**
 * Computes a normalised Levenshtein similarity of two texts:
 * {@code (longerLength - editDistance) / longerLength}.
 * <p>
 * The function is exposed under the method name, i.e. {@code apoc.text.similarity}; the
 * description below is kept in sync with that name.
 *
 * @param text1 first text, may be {@code null}
 * @param text2 second text, may be {@code null}
 * @return {@code null} if either text is {@code null}; {@code 1.0} if both texts are empty;
 *         otherwise the share of the longer text that does not need to be edited
 */
@UserFunction
@Description( "apoc.text.similarity(text1, text2) - calculate the Levenshtein similarity (a value within 0 and 1) between two texts." )
public Double similarity(final @Name("text1") String text1, @Name("text2")final String text2) {
    if ( text1 == null || text2 == null ) {
        return null;
    }

    int longerLength = Math.max(text1.length(), text2.length());
    if (longerLength == 0) {
        // Two empty strings are identical; returning here also avoids dividing by zero.
        return 1.0;
    }
    // distance(...) only returns null for null inputs, which were ruled out above.
    long editDistance = distance( text1, text2 );
    return (longerLength - editDistance) / (double)longerLength;
}

/**
 * Hamming distance of two texts, delegated to the shared {@code hammingDistance} metric.
 * <p>
 * NOTE(review): commons-text's HammingDistance is documented to reject inputs of different
 * length with an IllegalArgumentException; that exception is not handled here - confirm that
 * callers expect it.
 *
 * @param text1 first text, may be {@code null}
 * @param text2 second text, may be {@code null}
 * @return {@code null} if either text is {@code null}, otherwise the distance as a {@code Long}
 */
@UserFunction
@Description( "apoc.text.hammingDistance(text1, text2) - compare the given strings with the Hamming distance algorithm." )
public Long hammingDistance(final @Name("text1") String text1, @Name("text2")final String text2) {
    if (text1 != null && text2 != null) {
        final int mismatches = hammingDistance.apply(text1, text2);
        return (long) mismatches;
    }
    return null;
}

/**
 * Jaro-Winkler score of two texts, delegated to the shared {@code jaroWinklerDistance} metric.
 *
 * @param text1 first text, may be {@code null}
 * @param text2 second text, may be {@code null}
 * @return {@code null} if either text is {@code null}, otherwise the value computed by the metric
 */
@UserFunction
@Description( "apoc.text.jaroWinklerDistance(text1, text2) - compare the given strings with the Jaro-Winkler distance algorithm." )
public Double jaroWinklerDistance(final @Name("text1") String text1, @Name("text2")final String text2) {
    final boolean bothPresent = text1 != null && text2 != null;
    if (!bothPresent) {
        return null;
    }
    final Double score = jaroWinklerDistance.apply(text1, text2);
    return score;
}

@UserFunction
Expand All @@ -153,7 +198,7 @@ public Boolean fuzzyMatch(final @Name("text1") String text1, @Name("text2")final
int termLength = text1.length();
int maxDistanceAllowed = termLength < 3 ? 0 : termLength < 5 ? 1 : 2;

Long distance = distance(text1, text2);
long distance = distance(text1, text2);

return distance <= maxDistanceAllowed;
}
Expand Down
29 changes: 29 additions & 0 deletions src/test/java/apoc/text/PhoneticTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import static apoc.util.TestUtil.testCall;
import static org.hamcrest.CoreMatchers.equalTo;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertThat;

public class PhoneticTest {
Expand Down Expand Up @@ -75,4 +76,32 @@ public void shouldComputeSoundexDifference() {
assertThat(row.get("delta"), equalTo(4L))
);
}

/** A single word is encoded with Double Metaphone. */
@Test
public void shouldComputeDoubleMetaphone() {
    final String query = "CALL apoc.text.doubleMetaphone('Apoc') YIELD value RETURN value";
    testCall(db, query, result -> assertEquals("APK", result.get("value")));
}

/** A NULL argument produces a row whose value is null. */
@Test
public void shouldComputeDoubleMetaphoneOfNull() {
    final String query = "CALL apoc.text.doubleMetaphone(NULL) YIELD value RETURN value";
    testCall(db, query, result -> assertEquals(null, result.get("value")));
}

/** The empty string produces a row whose value is null. */
@Test
public void shouldComputeDoubleMetaphoneForTheEmptyString() {
    final String query = "CALL apoc.text.doubleMetaphone('') YIELD value RETURN value";
    testCall(db, query, result -> assertThat(result.get("value"), equalTo(null)));
}

/** The codes of several words, separated by punctuation and spaces, are concatenated. */
@Test
public void shouldComputeDoubleMetaphoneOfManyWords() {
    final String query = "CALL apoc.text.doubleMetaphone('Hello, dear User!') YIELD value RETURN value";
    testCall(db, query, result -> assertThat(result.get("value"), equalTo("HLTRASR")));
}
}
57 changes: 49 additions & 8 deletions src/test/java/apoc/text/StringsTest.java
Original file line number Diff line number Diff line change
@@ -1,23 +1,34 @@
package apoc.text;

import apoc.util.TestUtil;
import org.junit.*;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.ExpectedException;
import org.neo4j.graphdb.*;
import org.neo4j.graphdb.GraphDatabaseService;
import org.neo4j.graphdb.PropertyContainer;
import org.neo4j.graphdb.QueryExecutionException;
import org.neo4j.graphdb.Transaction;
import org.neo4j.helpers.collection.Iterators;
import org.neo4j.test.TestGraphDatabaseFactory;

import static java.lang.Math.toIntExact;

import java.util.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static apoc.util.MapUtil.map;
import static apoc.util.TestUtil.testCall;
import static apoc.util.TestUtil.testResult;
import static java.lang.Math.toIntExact;
import static java.util.Arrays.asList;
import static org.junit.Assert.*;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;


/**
Expand Down Expand Up @@ -190,15 +201,45 @@ public void testCompareCleanedInQuery() throws Exception {
}

@Test
public void testGetLevenshteinDistance() {
public void testLevenshteinDistance() {
String text1 = "Levenshtein";
String text2 = "Levenstein";

testCall(db, "RETURN apoc.text.distance({a}, {b}) as distance",
testCall(db, "RETURN apoc.text.distance({a}, {b}) AS distance",
map("a", text1, "b", text2),
row -> assertEquals(1L, row.get("distance")));
}

/** One edit on an eleven-character word gives a similarity of roughly 0.9. */
@Test
public void testLevenshteinSimilarity() {
    final String query = "RETURN apoc.text.similarity({a}, {b}) AS similarity";
    final String first = "Levenshtein";
    final String second = "Levenstein";

    testCall(db, query, map("a", first, "b", second),
            result -> assertEquals(0.9, (double)result.get("similarity"), 0.01));
}

/** Two equally long words that differ in one position have a Hamming distance of 1. */
@Test
public void testHammingDistance() {
    final String query = "RETURN apoc.text.hammingDistance({a}, {b}) AS distance";
    final String first = "Neo";
    final String second = "Leo";

    testCall(db, query, map("a", first, "b", second),
            result -> assertEquals(1L, result.get("distance")));
}

/** "Neo" against "Leo" is expected to score 0.7777 (within 0.0001). */
@Test
public void testJaroWinklerDistance() {
    final String query = "RETURN apoc.text.jaroWinklerDistance({a}, {b}) AS distance";
    final String first = "Neo";
    final String second = "Leo";

    testCall(db, query, map("a", first, "b", second),
            result -> assertEquals(0.7777, (double)result.get("distance"), 0.0001));
}

@Test
public void testFuzzyMatch() {
Strings strings = new Strings();
Expand Down

0 comments on commit e0cea6e

Please sign in to comment.