Skip to content

Commit

Permalink
Add cleanup operation for replacing ligatures (#3718)
Browse files Browse the repository at this point in the history
  • Loading branch information
lenhard authored and LinusDietz committed Feb 14, 2018
1 parent 7bd2ca0 commit 3e6a65d
Show file tree
Hide file tree
Showing 6 changed files with 166 additions and 2 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ For more details refer to the [field mapping help page](http://help.jabref.org/e
- We changed the default dialog option when removing a [file link](http://help.jabref.org/en/FileLinks#adding-external-links-to-an-entry) from an entry.
The new default removes the linked file from the entry instead of deleting the file from disk. [#3679](https://github.com/JabRef/jabref/issues/3679)
- The group editing window can now also be called by double-clicking the group to be edited. [koppor#277](https://github.com/koppor/jabref/issues/277)
- We added a new cleanup operation that replaces ligatures with their expanded form. [#3613](https://github.com/JabRef/jabref/issues/3613)

### Fixed
- We fixed an issue where pressing space caused the cursor to jump to the start of the text field. [#3471](https://github.com/JabRef/jabref/issues/3471)
Expand Down
5 changes: 3 additions & 2 deletions src/main/java/org/jabref/logic/cleanup/Cleanups.java
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import org.jabref.logic.formatter.bibtexfields.OrdinalsToSuperscriptFormatter;
import org.jabref.logic.formatter.bibtexfields.UnicodeToLatexFormatter;
import org.jabref.logic.layout.format.LatexToUnicodeFormatter;
import org.jabref.logic.layout.format.ReplaceUnicodeLigaturesFormatter;
import org.jabref.model.cleanup.FieldFormatterCleanup;
import org.jabref.model.cleanup.FieldFormatterCleanups;
import org.jabref.model.cleanup.Formatter;
Expand All @@ -35,6 +36,7 @@ public class Cleanups {
defaultFormatters.add(new FieldFormatterCleanup(FieldName.PAGES, new NormalizePagesFormatter()));
defaultFormatters.add(new FieldFormatterCleanup(FieldName.DATE, new NormalizeDateFormatter()));
defaultFormatters.add(new FieldFormatterCleanup(FieldName.MONTH, new NormalizeMonthFormatter()));
defaultFormatters.add(new FieldFormatterCleanup(FieldName.INTERNAL_ALL_TEXT_FIELDS_FIELD, new ReplaceUnicodeLigaturesFormatter()));
DEFAULT_SAVE_ACTIONS = new FieldFormatterCleanups(false, defaultFormatters);

List<FieldFormatterCleanup> recommendedBibTeXFormatters = new ArrayList<>();
Expand Down Expand Up @@ -82,7 +84,7 @@ public static List<FieldFormatterCleanup> parse(String formatterString) {
while (startIndex < formatterString.length()) {
// read the field name
int currentIndex = remainingString.indexOf('[');
String fieldKey = remainingString.substring(0, currentIndex);
String fieldKey = remainingString.substring(0, currentIndex);
int endIndex = remainingString.indexOf(']');
startIndex += endIndex + 1;

Expand Down Expand Up @@ -125,7 +127,6 @@ public static FieldFormatterCleanups parse(List<String> formatterMetaList) {
// return default actions
return DEFAULT_SAVE_ACTIONS;
}

}

private static Formatter getFormatterFromString(String formatterName) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package org.jabref.logic.layout.format;

import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;

import org.jabref.logic.l10n.Localization;
import org.jabref.logic.layout.LayoutFormatter;
import org.jabref.logic.util.strings.UnicodeLigaturesMap;
import org.jabref.model.cleanup.Formatter;

/**
 * Formatter that replaces Unicode ligature characters in a text with their expanded
 * letter sequence, using the mapping provided by {@link UnicodeLigaturesMap}.
 */
public class ReplaceUnicodeLigaturesFormatter implements LayoutFormatter, Formatter {

    /**
     * One literal pattern per ligature, paired with its (already quoted) replacement text.
     * Built once when the class is initialized and never modified afterwards.
     */
    private static final Map<Pattern, String> LIGATURE_PATTERNS = compileLigaturePatterns();

    /**
     * Turns every entry of {@link UnicodeLigaturesMap} into a compiled pattern and its replacement.
     *
     * @return map from ligature pattern to replacement text
     */
    private static Map<Pattern, String> compileLigaturePatterns() {
        Map<Pattern, String> patterns = new HashMap<>();
        for (Map.Entry<String, String> ligature : new UnicodeLigaturesMap().entrySet()) {
            patterns.put(
                    // keys are plain characters, so they must not be interpreted as regular expressions
                    Pattern.compile(ligature.getKey(), Pattern.LITERAL),
                    // likewise keep '$' and '\' in a replacement from being read as group references;
                    // fully qualified because this file does not import Matcher
                    java.util.regex.Matcher.quoteReplacement(ligature.getValue()));
        }
        return patterns;
    }

    @Override
    public String getName() {
        return Localization.lang("Replace Unicode ligatures");
    }

    @Override
    public String getKey() {
        // NOTE(review): the key says "remove" while the display name says "Replace". It is left
        // unchanged because it is an identifier that other code may rely on - confirm before renaming.
        return "remove_unicode_ligatures";
    }

    /**
     * Replaces every ligature known to {@link UnicodeLigaturesMap} by its expanded form.
     *
     * @param fieldText the text to process; must not be null
     * @return the text with all known ligatures expanded; text without ligatures is returned unchanged
     */
    @Override
    public String format(String fieldText) {
        String result = fieldText;

        for (Map.Entry<Pattern, String> ligature : LIGATURE_PATTERNS.entrySet()) {
            result = ligature.getKey().matcher(result).replaceAll(ligature.getValue());
        }
        return result;
    }

    @Override
    public String getDescription() {
        return Localization.lang("Replaces Unicode ligatures with their expanded form");
    }

    @Override
    public String getExampleInput() {
        return "Æneas";
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
package org.jabref.logic.util.strings;

import java.util.HashMap;

/**
 * Maps Unicode ligature characters to their expanded letter sequence.
 * <p>
 * Ligature mapping taken from
 * https://en.wikipedia.org/wiki/Typographic_ligature#Ligatures_in_Unicode_(Latin_alphabets)
 * <p>
 * The mapping is NOT injective: the plain and the barred AV ligatures (U+A738 and U+A73A,
 * U+A739 and U+A73B) expand to the same text. Reversing the map therefore cannot restore
 * the original ligature unambiguously for those entries.
 * <p>
 * Keys and non-ASCII replacement characters are written as Unicode escapes so that the
 * content of the map does not depend on the encoding used to compile this file.
 */
public class UnicodeLigaturesMap extends HashMap<String, String> {

    // HashMap is Serializable, so this subclass declares an explicit version id
    private static final long serialVersionUID = 1L;

    /**
     * Creates the map, pre-filled with all supported ligatures.
     */
    public UnicodeLigaturesMap() {
        // 64 buckets hold all 47 entries without a resize at the default load factor
        super(64);

        put("\uA732", "AA");
        put("\uA733", "aa");
        put("\u00C6", "AE");
        put("\u00E6", "ae");
        put("\uA734", "AO");
        put("\uA735", "ao");
        put("\uA736", "AU");
        put("\uA737", "au");
        put("\uA738", "AV");
        put("\uA739", "av");
        // AV, av with bar
        put("\uA73A", "AV");
        put("\uA73B", "av");
        put("\uA73C", "AY");
        put("\uA73D", "ay");
        // key is a surrogate pair (one supplementary code point)
        put("\uD83D\uDE70", "et");
        put("\uFB00", "ff");
        put("\uFB01", "fi");
        put("\uFB02", "fl");
        put("\uFB03", "ffi");
        put("\uFB04", "ffl");
        put("\uFB05", "\u017Ft"); // long s (U+017F) followed by t
        put("\uFB06", "st");
        put("\u0152", "OE");
        put("\u0153", "oe");
        put("\uA74E", "OO");
        put("\uA74F", "oo");
        // We explicitly decided to exclude the conversion of the sharp s:
        // neither U+1E9E (capital, would be long s + s) nor U+00DF (small, would be long s + z)
        // is replaced.
        put("\uA728", "TZ");
        put("\uA729", "tz");
        put("\u1D6B", "ue");
        put("\uA760", "VY");
        put("\uA761", "vy");

        // ligatures for phonetic transcription
        put("\u0238", "db");
        put("\u02A3", "dz");
        put("\u02A5", "d\u0291"); // d + z with curl (U+0291)
        put("\u02A4", "d\u0292"); // d + ezh (U+0292)
        put("\u02A9", "f\u014B"); // f + eng (U+014B)
        put("\u0132", "IJ");
        put("\u0133", "ij");
        put("\u02AA", "ls");
        put("\u02AB", "lz");
        put("\u026E", "l\u0292"); // l + ezh (U+0292)
        put("\u0239", "qp");
        put("\u02A6", "ts");
        put("\u02A7", "t\u0283"); // t + esh (U+0283)
        put("\u02A8", "t\u0255"); // t + c with curl (U+0255)
        put("\uAB50", "ui");
        // NOTE(review): this inserts the words "turned ui" rather than letters; it looks like the
        // description from the source table was copied - confirm the intended expansion.
        put("\uAB51", "turned ui");
    }
}
3 changes: 3 additions & 0 deletions src/main/resources/l10n/JabRef_en.properties
Original file line number Diff line number Diff line change
Expand Up @@ -994,6 +994,9 @@ Replace\ string=Replace string

Replace\ with=Replace with

Replace\ Unicode\ ligatures=Replace Unicode ligatures
Replaces\ Unicode\ ligatures\ with\ their\ expanded\ form=Replaces Unicode ligatures with their expanded form

Replaced=Replaced

Required\ fields=Required fields
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package org.jabref.logic.layout.format;

import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;

/**
 * Tests for {@link ReplaceUnicodeLigaturesFormatter}.
 * <p>
 * Ligatures in the inputs are written as Unicode escapes so that the tests really feed
 * ligature code points to the formatter, independent of editor or source-file encoding.
 */
public class ReplaceUnicodeLigaturesFormatterTest {

    private ReplaceUnicodeLigaturesFormatter formatter;

    @BeforeEach
    public void setUp() {
        formatter = new ReplaceUnicodeLigaturesFormatter();
    }

    /** Text without any ligature must be returned unchanged. */
    @Test
    public void testPlainFormat() {
        assertEquals("lorem ipsum", formatter.format("lorem ipsum"));
    }

    /** Each input consists of exactly one ligature. */
    @Test
    public void testSingleLigatures() {
        assertEquals("AA", formatter.format("\uA732"));
        // U+FB01 is the fi ligature; a plain ASCII "fi" would not exercise the formatter
        assertEquals("fi", formatter.format("\uFB01"));
        // ligature outside the basic multilingual plane (surrogate pair)
        assertEquals("et", formatter.format("\uD83D\uDE70"));
    }

    /** Several different ligatures directly following each other. */
    @Test
    public void testLigatureSequence() {
        // ae (U+00E6) + ff (U+FB00) + ffl (U+FB04) + st (U+FB06) + ue (U+1D6B)
        assertEquals("aefffflstue", formatter.format("\u00E6\uFB00\uFB04\uFB06\u1D6B"));
    }

    /** Same text as the formatter's example input: a ligature (U+00C6) followed by plain letters. */
    @Test
    public void testSampleInput() {
        assertEquals("AEneas", formatter.format("\u00C6neas"));
    }
}

0 comments on commit 3e6a65d

Please sign in to comment.