Skip to content


fixes #864 - Adds text similarity/distance methods and double metapho…
Browse files Browse the repository at this point in the history
…ne text encoder (#865)

Added Apache commons-text dependency and used the LevenshteinDistance, HammingDistance and JaroWinklerDistance objects from the dependency to create the similarity/distance methods in Strings. 
Replaced deprecated StringUtils.getLevenshteinDistance with LevenshteinDistance.

Added Double Metaphone encoding method in Phonetic.
  • Loading branch information
alexiudice authored and jexp committed Jul 20, 2018
1 parent aba1611 commit bbc36ef
Show file tree
Hide file tree
Showing 6 changed files with 129 additions and 5 deletions.
1 change: 1 addition & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ dependencies {
compile group: 'com.github.javafaker', name: 'javafaker', version: '0.10'

compile group: 'org.apache.commons', name: 'commons-math3', version: '3.6.1'
compile group: 'org.apache.commons', name: 'commons-text', version: '1.2'
jmh group: 'org.neo4j', name: 'neo4j-lucene-index', version: neo4jVersionEffective
jmh group: 'org.neo4j', name: 'neo4j-kernel', version: neo4jVersionEffective, classifier: "tests"

Expand Down
7 changes: 6 additions & 1 deletion docs/overview.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -879,7 +879,11 @@ Example: `'FRIEND|MENTORS>|<REPORTS_TO'` will match to :FRIEND relationships in

| apoc.text.distance(text1, text2) | compare the given strings with the StringUtils.distance(text1, text2) method
| apoc.text.distance(text1, text2) | compare the given strings with the Levenshtein distance algorithm
| apoc.text.levenshteinDistance(text1, text2) | compare the given strings with the Levenshtein distance algorithm
| apoc.text.levenshteinSimilarity(text1, text2) | calculate the similarity (a value between 0 and 1) between two texts based on Levenshtein distance.
| apoc.text.hammingDistance(text1, text2) | compare the given strings with the Hamming distance algorithm
| apoc.text.jaroWinklerDistance(text1, text2) | compare the given strings with the Jaro-Winkler distance algorithm
| apoc.text.sorensenDiceSimilarity(text1, text2) | compare the given strings with the Sørensen–Dice coefficient formula, assuming an English locale
| apoc.text.sorensenDiceSimilarityWithLanguage(text1, text2, languageTag) | compare the given strings with the Sørensen–Dice coefficient formula, with the provided IETF language tag
| apoc.text.fuzzyMatch(text1, text2) | check if 2 words can be matched in a fuzzy way. Depending on the length of the String, it will allow more characters that need to be edited to match the second String.
Expand All @@ -890,6 +894,7 @@ Example: `'FRIEND|MENTORS>|<REPORTS_TO'` will match to :FRIEND relationships in
| apoc.text.phonetic(value) | Compute the US_ENGLISH phonetic soundex encoding of all words of the text value which can be a single string or a list of strings
| apoc.text.doubleMetaphone(value) | Compute the Double Metaphone phonetic encoding of all words of the text value which can be a single string or a list of strings
| apoc.text.clean(text) | strip the given string of everything except alpha numeric characters and convert it to lower case.
| apoc.text.compareCleaned(text1, text2) | compare the given strings stripped of everything except alpha numeric characters converted to lower case.
Expand Down
14 changes: 14 additions & 0 deletions src/main/java/apoc/text/
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package apoc.text;

import org.apache.commons.codec.language.DoubleMetaphone;
import org.neo4j.procedure.Description;
import apoc.result.LongResult;
import apoc.result.StringResult;
Expand All @@ -18,6 +19,8 @@

public class Phonetic {

private static final DoubleMetaphone DOUBLE_METAPHONE = new DoubleMetaphone();

@Description("apoc.text.phonetic(value) yield value - Compute the US_ENGLISH phonetic soundex encoding of all words of the text value which can be a single string or a list of strings")
public Stream<StringResult> phonetic(final @Name("value") Object value) {
Expand All @@ -38,6 +41,17 @@ public Stream<PhoneticResult> phoneticDelta(final @Name("text1") String text1, f

/**
 * Computes the Double Metaphone phonetic encoding of the words in {@code value}.
 *
 * A value that is an {@code Iterable} is treated as a sequence of elements; any other
 * value is wrapped as a single-element stream. For each non-null, non-empty element the
 * text is trimmed, split on runs of non-word characters, each word is encoded with
 * {@code DOUBLE_METAPHONE} and the encodings are concatenated without a separator.
 * Null or empty elements yield {@code StringResult.EMPTY}.
 *
 * NOTE(review): in this listing the {@code Stream<Object> stream = ...} line and the
 * {@code return} line appear to be missing tokens (the Iterable-to-stream conversion and
 * the mapping call on {@code stream}); confirm against the real source file.
 *
 * @param value a single value or an Iterable of values to encode
 * @return a stream of {@code StringResult} holding the encoded text
 */
@Description("apoc.text.doubleMetaphone(value) yield value - Compute the Double Metaphone phonetic encoding of all words of the text value which can be a single string or a list of strings")
public Stream<StringResult> doubleMetaphone(final @Name("value") Object value)
// Accept either one value or an Iterable of values.
Stream<Object> stream = value instanceof Iterable ? value).spliterator(), false) : Stream.of(value);

// Null/empty input short-circuits to the shared EMPTY result; otherwise encode word by word.
return -> (str == null || str.toString().isEmpty()) ? StringResult.EMPTY :
new StringResult(Stream.of(str.toString().trim().split("\\W+"))
.map(DOUBLE_METAPHONE::doubleMetaphone).reduce("", (a, s) -> a + s)));

public static class PhoneticResult {
public final String phonetic1, phonetic2;
public final long delta;
Expand Down
50 changes: 48 additions & 2 deletions src/main/java/apoc/text/
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
package apoc.text;

import apoc.util.Util;
import org.apache.commons.text.similarity.HammingDistance;
import org.apache.commons.text.similarity.JaroWinklerDistance;
import org.apache.commons.text.similarity.LevenshteinDistance;
import org.neo4j.graphdb.Node;
import org.neo4j.graphdb.Relationship;
import org.neo4j.helpers.collection.Pair;
Expand Down Expand Up @@ -36,6 +39,10 @@
public class Strings {

private final static HammingDistance hammingDistance = new HammingDistance();
private final static JaroWinklerDistance jaroWinklerDistance = new JaroWinklerDistance();
private final static LevenshteinDistance levenshteinDistance = new LevenshteinDistance();

@Description("apoc.text.replace(text, regex, replacement) - replace each substring of the given string that matches the given regular expression with the given replacement.")
public String replace(final @Name("text") String text, final @Name("regex") String regex, final @Name("replacement") String replacement) {
Expand Down Expand Up @@ -126,12 +133,51 @@ public boolean compareCleaned(final @Name("text1") String text1, final @Name("te

/**
 * Returns the Levenshtein distance between two strings by delegating to
 * {@code levenshteinDistance(text1, text2)}.
 *
 * NOTE(review): two {@code @Description} annotations are shown because this listing is a
 * diff with its +/- markers stripped; presumably the first (StringUtils wording) is the
 * removed line and the second is its replacement. Confirm against the real source file.
 *
 * @param text1 first string
 * @param text2 second string
 * @return whatever {@code levenshteinDistance} returns for the same arguments
 */
@Description("apoc.text.distance(text1, text2) - compare the given strings with the StringUtils.distance(text1, text2) method")
@Description("apoc.text.distance(text1, text2) - compare the given strings with the Levenshtein distance algorithm.")
public Long distance(final @Name("text1") String text1, @Name("text2")final String text2) {
return levenshteinDistance(text1, text2);

/**
 * Computes the Levenshtein distance between two strings using the shared
 * {@code levenshteinDistance} instance.
 *
 * @param text1 first string, may be null
 * @param text2 second string, may be null
 * @return the distance widened to {@code long}, or {@code null} if either argument is null
 */
@Description("apoc.text.levenshteinDistance(text1, text2) - compare the given strings with the Levenshtein distance algorithm.")
public Long levenshteinDistance(final @Name("text1") String text1, @Name("text2")final String text2) {
// Null in, null out: no distance is computed for a missing operand.
if (text1 == null || text2 == null) {
return null;
return (long)levenshteinDistance.apply(text1, text2);

/**
 * Computes a similarity score derived from the Levenshtein distance:
 * {@code (longerLength - editDistance) / longerLength}, where {@code longerLength} is the
 * length of the longer of the two strings.
 *
 * @param text1 first string, may be null
 * @param text2 second string, may be null
 * @return {@code null} if either argument is null; {@code 1.0} if both strings are empty;
 *         otherwise the ratio described above
 */
@Description( "apoc.text.levenshteinSimilarity(text1, text2) - calculate the similarity (a value within 0 and 1) between two texts." )
public Double levenshteinSimilarity(final @Name("text1") String text1, @Name("text2")final String text2) {
if ( text1 == null || text2 == null ) {
return null;

int longerLength = Math.max(text1.length(), text2.length());
// Two empty strings are identical; returning here also avoids dividing by zero below.
if (longerLength == 0) {
return 1.0;
// distance(...) delegates to levenshteinDistance(...); inputs are non-null at this point.
long editDistance = distance( text1, text2 );
// Cast forces floating-point division.
return (longerLength - editDistance) / (double)longerLength;

/**
 * Computes the Hamming distance between two strings using the shared
 * {@code hammingDistance} instance.
 *
 * NOTE(review): no length check is performed here; the behaviour for strings of different
 * lengths is whatever the underlying HammingDistance implementation does — confirm
 * against the commons-text documentation.
 *
 * @param text1 first string, may be null
 * @param text2 second string, may be null
 * @return the distance widened to {@code long}, or {@code null} if either argument is null
 */
@Description( "apoc.text.hammingDistance(text1, text2) - compare the given strings with the Hamming distance algorithm." )
public Long hammingDistance(final @Name("text1") String text1, @Name("text2")final String text2) {
// Null in, null out.
if (text1 == null || text2 == null) {
return null;
return (long)hammingDistance.apply(text1, text2) ;

/**
 * Computes the Jaro-Winkler value for two strings using the shared
 * {@code jaroWinklerDistance} instance.
 *
 * NOTE(review): the {@code StringUtils.getLevenshteinDistance} return shown below does not
 * fit this method (wrong algorithm, {@code long} in a {@code Double} method); presumably it
 * is a removed diff line from the old {@code distance} implementation whose +/- marker was
 * stripped. Confirm against the real source file.
 *
 * @param text1 first string, may be null
 * @param text2 second string, may be null
 * @return the value returned by {@code jaroWinklerDistance.apply}, or {@code null} if either
 *         argument is null
 */
@Description( "apoc.text.jaroWinklerDistance(text1, text2) - compare the given strings with the Jaro-Winkler distance algorithm." )
public Double jaroWinklerDistance(final @Name("text1") String text1, @Name("text2")final String text2) {
// Null in, null out.
if (text1 == null || text2 == null) {
return null;
return (long) StringUtils.getLevenshteinDistance(text1, text2);
return jaroWinklerDistance.apply(text1, text2);

Expand Down
28 changes: 28 additions & 0 deletions src/test/java/apoc/text/
Original file line number Diff line number Diff line change
Expand Up @@ -75,4 +75,32 @@ public void shouldComputeSoundexDifference() {
assertThat(row.get("delta"), equalTo(4L))

// Single word: calling apoc.text.doubleMetaphone('Apoc') is expected to yield "APK".
public void shoudlComputeDoubleMetaphone() {
testCall(db, "CALL apoc.text.doubleMetaphone('Apoc')", (row) ->
assertThat(row.get("value"), equalTo("APK"))

// NULL input: the returned row's "value" column is expected to be null.
public void shoudlComputeDoubleMetaphoneOfNull() {
testCall(db, "CALL apoc.text.doubleMetaphone(NULL)", (row) ->
assertThat(row.get("value"), equalTo(null))

// Empty-string input is treated like NULL: the "value" column is expected to be null.
public void shoudlComputeDoubleMetaphoneForTheEmptyString() {
testCall(db, "CALL apoc.text.doubleMetaphone('')", (row) ->
assertThat(row.get("value"), equalTo(null))

// Multi-word input with punctuation: the expected result "HLTRASR" is a single string
// with no separator between the per-word encodings.
public void shouldComputeDoubleMetaphoneOfManyWords() {
testCall(db, "CALL apoc.text.doubleMetaphone('Hello, dear User!')", (row) ->
assertThat(row.get("value"), equalTo("HLTRASR"))
34 changes: 32 additions & 2 deletions src/test/java/apoc/text/
Original file line number Diff line number Diff line change
Expand Up @@ -190,15 +190,45 @@ public void testCompareCleanedInQuery() throws Exception {

// Checks that apoc.text.distance reports a distance of 1 between "Levenshtein" and
// "Levenstein" (the two inputs differ by the single letter 'h').
// NOTE(review): two method headers and two testCall lines are shown because this listing
// is a diff with its +/- markers stripped; presumably the first of each pair is the
// removed line (old name, lower-case "as"). Confirm against the real source file.
public void testGetLevenshteinDistance() {
public void testLevenshteinDistance() {
String text1 = "Levenshtein";
String text2 = "Levenstein";

testCall(db, "RETURN apoc.text.distance({a}, {b}) as distance",
testCall(db, "RETURN apoc.text.distance({a}, {b}) AS distance",
map("a", text1, "b", text2),
row -> assertEquals(1L, row.get("distance")));

// Checks that apoc.text.levenshteinSimilarity returns 0.9 (tolerance 0.01) for
// "Levenshtein" vs "Levenstein".
public void testLevenshteinSimilarity() {
String text1 = "Levenshtein";
String text2 = "Levenstein";

testCall(db, "RETURN apoc.text.levenshteinSimilarity({a}, {b}) AS similarity",
map("a", text1, "b", text2),
row -> assertEquals(0.9, (double)row.get("similarity"), 0.01));

// Checks that apoc.text.hammingDistance returns 1 for the equal-length strings
// "Neo" and "Leo", which differ only in their first character.
public void testHammingDistance() {
String text1 = "Neo";
String text2 = "Leo";

testCall(db, "RETURN apoc.text.hammingDistance({a}, {b}) AS distance",
map("a", text1, "b", text2),
row -> assertEquals(1L, row.get("distance")));

// Checks that apoc.text.jaroWinklerDistance returns 0.77 (tolerance 0.01) for
// "Neo" vs "Leo".
public void testJaroWinklerDistance() {
String text1 = "Neo";
String text2 = "Leo";

testCall(db, "RETURN apoc.text.jaroWinklerDistance({a}, {b}) AS distance",
map("a", text1, "b", text2),
row -> assertEquals(0.77, (double)row.get("distance"), 0.01));

public void testFuzzyMatch() {
Strings strings = new Strings();
Expand Down

0 comments on commit bbc36ef

Please sign in to comment.