-
-
Notifications
You must be signed in to change notification settings - Fork 2.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add cleanup operation for replacing ligatures (#3718)
- Loading branch information
1 parent
7bd2ca0
commit 3e6a65d
Showing
6 changed files
with
166 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
53 changes: 53 additions & 0 deletions
53
src/main/java/org/jabref/logic/layout/format/ReplaceUnicodeLigaturesFormatter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
package org.jabref.logic.layout.format; | ||
|
||
import java.util.HashMap; | ||
import java.util.Map; | ||
import java.util.regex.Pattern; | ||
|
||
import org.jabref.logic.l10n.Localization; | ||
import org.jabref.logic.layout.LayoutFormatter; | ||
import org.jabref.logic.util.strings.UnicodeLigaturesMap; | ||
import org.jabref.model.cleanup.Formatter; | ||
|
||
public class ReplaceUnicodeLigaturesFormatter implements LayoutFormatter, Formatter { | ||
|
||
private Map<Pattern, String> ligaturesMap; | ||
|
||
public ReplaceUnicodeLigaturesFormatter() { | ||
ligaturesMap = new HashMap<>(); | ||
UnicodeLigaturesMap stringMap = new UnicodeLigaturesMap(); | ||
for (String key : stringMap.keySet()) { | ||
ligaturesMap.put(Pattern.compile(key), stringMap.get(key)); | ||
} | ||
} | ||
|
||
@Override | ||
public String getName() { | ||
return Localization.lang("Replace Unicode ligatures"); | ||
} | ||
|
||
@Override | ||
public String getKey() { | ||
return "remove_unicode_ligatures"; | ||
} | ||
|
||
@Override | ||
public String format(String fieldText) { | ||
String result = fieldText; | ||
|
||
for (Pattern key : ligaturesMap.keySet()) { | ||
result = key.matcher(result).replaceAll(ligaturesMap.get(key)); | ||
} | ||
return result; | ||
} | ||
|
||
@Override | ||
public String getDescription() { | ||
return Localization.lang("Replaces Unicode ligatures with their expanded form"); | ||
} | ||
|
||
@Override | ||
public String getExampleInput() { | ||
return "Æneas"; | ||
} | ||
} |
68 changes: 68 additions & 0 deletions
68
src/main/java/org/jabref/logic/util/strings/UnicodeLigaturesMap.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
package org.jabref.logic.util.strings; | ||
|
||
import java.util.HashMap; | ||
|
||
/**
 * Maps Unicode ligature characters (keys) to their expanded letter sequences (values).
 *
 * <p>Ligature mapping taken from
 * https://en.wikipedia.org/wiki/Typographic_ligature#Ligatures_in_Unicode_(Latin_alphabets)
 *
 * <p>The mapping is NOT injective: the plain and the barred AV ligatures (U+A738 / U+A73A and
 * U+A739 / U+A73B) expand to the same text ("AV" / "av"). Reversing the map to turn expanded
 * text back into ligatures would therefore lose one of each pair and needs an explicit choice.
 */
public class UnicodeLigaturesMap extends HashMap<String, String> {

    /**
     * Creates the map pre-filled with all supported ligatures.
     */
    public UnicodeLigaturesMap() {
        put("\uA732", "AA");
        put("\uA733", "aa");
        put("\u00C6", "AE");
        put("\u00E6", "ae");
        put("\uA734", "AO");
        put("\uA735", "ao");
        put("\uA736", "AU");
        put("\uA737", "au");
        put("\uA738", "AV");
        put("\uA739", "av");
        // AV, av with bar: same expansion as the plain AV/av ligatures above
        put("\uA73A", "AV");
        put("\uA73B", "av");
        put("\uA73C", "AY");
        put("\uA73D", "ay");
        // U+1F670 lies outside the BMP, hence the surrogate pair
        put("\uD83D\uDE70", "et");
        put("\uFB00", "ff");
        put("\uFB01", "fi");
        put("\uFB02", "fl");
        put("\uFB03", "ffi");
        put("\uFB04", "ffl");
        put("\uFB05", "ſt");
        put("\uFB06", "st");
        put("\u0152", "OE");
        put("\u0153", "oe");
        put("\uA74E", "OO");
        put("\uA74F", "oo");
        // We explicitly decided to exclude the conversion of ß (U+00DF) and ẞ (U+1E9E),
        // so there is intentionally no entry for them.
        put("\uA728", "TZ");
        put("\uA729", "tz");
        put("\u1D6B", "ue");
        put("\uA760", "VY");
        put("\uA761", "vy");

        // ligatures for phonetic transcription
        put("\u0238", "db");
        put("\u02A3", "dz");
        put("\u02A5", "dʑ");
        put("\u02A4", "dʒ");
        put("\u02A9", "fŋ");
        put("\u0132", "IJ");
        put("\u0133", "ij");
        put("\u02AA", "ls");
        put("\u02AB", "lz");
        put("\u026E", "lʒ");
        put("\u0239", "qp");
        put("\u02A6", "ts");
        put("\u02A7", "tʃ");
        put("\u02A8", "tɕ");
        put("\uAB50", "ui");
        put("\uAB51", "turned ui");
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
38 changes: 38 additions & 0 deletions
38
src/test/java/org/jabref/logic/layout/format/ReplaceUnicodeLigaturesFormatterTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
package org.jabref.logic.layout.format; | ||
|
||
import org.junit.jupiter.api.BeforeEach; | ||
import org.junit.jupiter.api.Test; | ||
|
||
import static org.junit.jupiter.api.Assertions.assertEquals; | ||
|
||
public class ReplaceUnicodeLigaturesFormatterTest { | ||
|
||
private ReplaceUnicodeLigaturesFormatter formatter; | ||
|
||
@BeforeEach | ||
public void setUp() { | ||
formatter = new ReplaceUnicodeLigaturesFormatter(); | ||
} | ||
|
||
@Test | ||
public void testPlainFormat() { | ||
assertEquals("lorem ipsum", formatter.format("lorem ipsum")); | ||
} | ||
|
||
@Test | ||
public void testSingleLigatures() { | ||
assertEquals("AA", formatter.format("\uA732")); | ||
assertEquals("fi", formatter.format("fi")); | ||
assertEquals("et", formatter.format("\uD83D\uDE70")); | ||
} | ||
|
||
@Test | ||
public void testLigatureSequence() { | ||
assertEquals("aefffflstue", formatter.format("æfffflstᵫ")); | ||
} | ||
|
||
@Test | ||
public void testSampleInput() { | ||
assertEquals("AEneas", formatter.format("Æneas")); | ||
} | ||
} |