From 89abea352957327a8bf6181c4f353ad29c8be6fe Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sat, 9 Nov 2019 21:14:14 +0100 Subject: [PATCH] Clean URLs in DBLPFetcher - Style RemoveLatexCommandsFormatter - Add tests for RemoveLatexCommandsFormatter - Add support for removing single and multiple whitespaces after a command - Split tests in RemoveBracketsTest - Fix casing in CleanupUrlFormatter (to match style and current name of test class CleanUpFormatterTest) - Fix casing in some ...Url.. method names - Format CleanupUrlFormatter --- .../jabref/gui/fieldeditors/UrlEditor.java | 9 ++- .../fieldeditors/contextmenu/EditorMenus.java | 6 +- .../jabref/logic/formatter/Formatters.java | 4 +- ...ormatter.java => CleanupUrlFormatter.java} | 18 +++--- .../bibtexfields/LatexCleanupFormatter.java | 3 + .../logic/importer/fetcher/DBLPFetcher.java | 11 ++-- .../format/RemoveLatexCommandsFormatter.java | 53 +++++++++------- .../bibtexfields/CleanupUrlFormatterTest.java | 9 ++- .../layout/format/RemoveBracketsTest.java | 14 ++++- .../RemoveLatexCommandsFormatterTest.java | 63 +++++++++++++++++++ 10 files changed, 141 insertions(+), 49 deletions(-) rename src/main/java/org/jabref/logic/formatter/bibtexfields/{CleanupURLFormatter.java => CleanupUrlFormatter.java} (84%) create mode 100644 src/test/java/org/jabref/logic/layout/format/RemoveLatexCommandsFormatterTest.java diff --git a/src/main/java/org/jabref/gui/fieldeditors/UrlEditor.java b/src/main/java/org/jabref/gui/fieldeditors/UrlEditor.java index 08d22698a8c2..270770a2ea19 100644 --- a/src/main/java/org/jabref/gui/fieldeditors/UrlEditor.java +++ b/src/main/java/org/jabref/gui/fieldeditors/UrlEditor.java @@ -12,7 +12,7 @@ import org.jabref.gui.DialogService; import org.jabref.gui.autocompleter.AutoCompleteSuggestionProvider; import org.jabref.gui.fieldeditors.contextmenu.EditorMenus; -import org.jabref.logic.formatter.bibtexfields.CleanupURLFormatter; +import org.jabref.logic.formatter.bibtexfields.CleanupUrlFormatter; import 
org.jabref.logic.formatter.bibtexfields.TrimWhitespaceFormatter; import org.jabref.logic.integrity.FieldCheckers; import org.jabref.model.entry.BibEntry; @@ -34,12 +34,11 @@ public UrlEditor(Field field, DialogService dialogService, AutoCompleteSuggestio .load(); textArea.textProperty().bindBidirectional(viewModel.textProperty()); - Supplier> contextMenuSupplier = EditorMenus.getCleanupURLMenu(textArea); + Supplier> contextMenuSupplier = EditorMenus.getCleanupUrlMenu(textArea); textArea.addToContextMenu(contextMenuSupplier); - // init paste handler for URLEditor to format pasted url link in textArea - textArea.setPasteActionHandler(() -> textArea.setText(new CleanupURLFormatter().format(new TrimWhitespaceFormatter().format(textArea.getText())))); - + // init paste handler for UrlEditor to format pasted url link in textArea + textArea.setPasteActionHandler(() -> textArea.setText(new CleanupUrlFormatter().format(new TrimWhitespaceFormatter().format(textArea.getText())))); new EditorValidator(preferences).configureValidation(viewModel.getFieldValidator().getValidationStatus(), textArea); } diff --git a/src/main/java/org/jabref/gui/fieldeditors/contextmenu/EditorMenus.java b/src/main/java/org/jabref/gui/fieldeditors/contextmenu/EditorMenus.java index abdb97009092..b6b8df62bc30 100644 --- a/src/main/java/org/jabref/gui/fieldeditors/contextmenu/EditorMenus.java +++ b/src/main/java/org/jabref/gui/fieldeditors/contextmenu/EditorMenus.java @@ -16,7 +16,7 @@ import org.jabref.gui.actions.ActionFactory; import org.jabref.gui.actions.StandardActions; import org.jabref.gui.edit.CopyDoiUrlAction; -import org.jabref.logic.formatter.bibtexfields.CleanupURLFormatter; +import org.jabref.logic.formatter.bibtexfields.CleanupUrlFormatter; import org.jabref.logic.formatter.bibtexfields.NormalizeNamesFormatter; import org.jabref.logic.l10n.Localization; @@ -91,11 +91,11 @@ public static Supplier> getDOIMenu(TextArea textArea) { * @param textArea text-area that this menu will be connected 
to * @return menu containing items of the default menu and an item to cleanup a URL */ - public static Supplier> getCleanupURLMenu(TextArea textArea) { + public static Supplier> getCleanupUrlMenu(TextArea textArea) { return () -> { CustomMenuItem cleanupURL = new CustomMenuItem(new Label(Localization.lang("Cleanup URL link"))); cleanupURL.setDisable(textArea.textProperty().isEmpty().get()); - cleanupURL.setOnAction(event -> textArea.setText(new CleanupURLFormatter().format(textArea.getText()))); + cleanupURL.setOnAction(event -> textArea.setText(new CleanupUrlFormatter().format(textArea.getText()))); List menuItems = new ArrayList<>(); menuItems.add(cleanupURL); diff --git a/src/main/java/org/jabref/logic/formatter/Formatters.java b/src/main/java/org/jabref/logic/formatter/Formatters.java index 7aea0a024a6b..55a91f7f86cf 100644 --- a/src/main/java/org/jabref/logic/formatter/Formatters.java +++ b/src/main/java/org/jabref/logic/formatter/Formatters.java @@ -6,7 +6,7 @@ import java.util.Objects; import java.util.Optional; -import org.jabref.logic.formatter.bibtexfields.CleanupURLFormatter; +import org.jabref.logic.formatter.bibtexfields.CleanupUrlFormatter; import org.jabref.logic.formatter.bibtexfields.ClearFormatter; import org.jabref.logic.formatter.bibtexfields.EscapeUnderscoresFormatter; import org.jabref.logic.formatter.bibtexfields.HtmlToLatexFormatter; @@ -58,7 +58,7 @@ public static List getCaseChangers() { public static List getOthers() { return Arrays.asList( new ClearFormatter(), - new CleanupURLFormatter(), + new CleanupUrlFormatter(), new LatexCleanupFormatter(), new MinifyNameListFormatter(), new NormalizeDateFormatter(), diff --git a/src/main/java/org/jabref/logic/formatter/bibtexfields/CleanupURLFormatter.java b/src/main/java/org/jabref/logic/formatter/bibtexfields/CleanupUrlFormatter.java similarity index 84% rename from src/main/java/org/jabref/logic/formatter/bibtexfields/CleanupURLFormatter.java rename to 
src/main/java/org/jabref/logic/formatter/bibtexfields/CleanupUrlFormatter.java index e14b717dd647..62932ac31c2e 100644 --- a/src/main/java/org/jabref/logic/formatter/bibtexfields/CleanupURLFormatter.java +++ b/src/main/java/org/jabref/logic/formatter/bibtexfields/CleanupUrlFormatter.java @@ -7,6 +7,7 @@ import java.util.regex.Pattern; import org.jabref.logic.l10n.Localization; +import org.jabref.logic.layout.format.RemoveLatexCommandsFormatter; import org.jabref.model.cleanup.Formatter; import org.apache.commons.logging.Log; @@ -15,9 +16,9 @@ /** * Cleanup URL link */ -public class CleanupURLFormatter extends Formatter { +public class CleanupUrlFormatter extends Formatter { - private static final Log LOGGER = LogFactory.getLog(CleanupURLFormatter.class); + private static final Log LOGGER = LogFactory.getLog(CleanupUrlFormatter.class); // This regexp find "url=" or "to=" parameter in full link and get text after them private static final Pattern PATTERN_URL = Pattern.compile("(?:url|to)=([^&]*)"); @@ -38,16 +39,16 @@ public String format(String value) { Matcher matcher = PATTERN_URL.matcher(value); if (matcher.find()) { - toDecode = matcher.group(1); - + toDecode = matcher.group(1); } try { decodedLink = URLDecoder.decode(toDecode, StandardCharsets.UTF_8.name()); - } - catch (UnsupportedEncodingException e) { + } catch (UnsupportedEncodingException e) { LOGGER.warn("Used unsupported character encoding", e); } - return decodedLink; + + String result = new RemoveLatexCommandsFormatter().format(decodedLink); + return result; } @Override @@ -61,6 +62,5 @@ public String getExampleInput() { "rja&uact=8&ved=0ahUKEwjg3ZrB_ZPXAhVGuhoKHYdOBOg4ChAWCCYwAA&url=" + "http%3A%2F%2Fwww.focus.de%2Fgesundheit%2Fratgeber%2Fherz%2Ftest%2" + "Flebenserwartung-werden-sie-100-jahre-alt_aid_363828.html" + "&usg=AOvVaw1G6m2jf-pTHYkXceii4hXU"; - } - + } } diff --git a/src/main/java/org/jabref/logic/formatter/bibtexfields/LatexCleanupFormatter.java 
b/src/main/java/org/jabref/logic/formatter/bibtexfields/LatexCleanupFormatter.java index 032914146b67..c9bf179d2013 100644 --- a/src/main/java/org/jabref/logic/formatter/bibtexfields/LatexCleanupFormatter.java +++ b/src/main/java/org/jabref/logic/formatter/bibtexfields/LatexCleanupFormatter.java @@ -5,6 +5,9 @@ import org.jabref.logic.l10n.Localization; import org.jabref.model.cleanup.Formatter; +/** + * Simplifies LaTeX syntax. See {@link org.jabref.logic.layout.format.RemoveLatexCommandsFormatter} for a formatter removing LaTeX commands completely. + */ public class LatexCleanupFormatter extends Formatter { private static final Pattern REMOVE_REDUNDANT = Pattern diff --git a/src/main/java/org/jabref/logic/importer/fetcher/DBLPFetcher.java b/src/main/java/org/jabref/logic/importer/fetcher/DBLPFetcher.java index d7515a638c4f..76942fea1b7f 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/DBLPFetcher.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/DBLPFetcher.java @@ -7,6 +7,7 @@ import java.util.Optional; import org.jabref.logic.cleanup.DoiCleanup; +import org.jabref.logic.formatter.bibtexfields.CleanupUrlFormatter; import org.jabref.logic.formatter.bibtexfields.ClearFormatter; import org.jabref.logic.help.HelpFile; import org.jabref.logic.importer.FetcherException; @@ -17,6 +18,7 @@ import org.jabref.model.cleanup.FieldFormatterCleanup; import org.jabref.model.entry.BibEntry; import org.jabref.model.entry.field.InternalField; +import org.jabref.model.entry.field.StandardField; import org.jabref.model.util.DummyFileUpdateMonitor; import org.apache.http.client.utils.URIBuilder; @@ -57,13 +59,13 @@ public Parser getParser() { @Override public void doPostCleanup(BibEntry entry) { DoiCleanup doiCleaner = new DoiCleanup(); - - FieldFormatterCleanup clearTimestampFormatter = new FieldFormatterCleanup(InternalField.TIMESTAMP, - new ClearFormatter()); - doiCleaner.cleanup(entry); + + FieldFormatterCleanup clearTimestampFormatter = new 
FieldFormatterCleanup(InternalField.TIMESTAMP, new ClearFormatter()); clearTimestampFormatter.cleanup(entry); + FieldFormatterCleanup cleanUpUrlFormatter = new FieldFormatterCleanup(StandardField.URL, new CleanupUrlFormatter()); + cleanUpUrlFormatter.cleanup(entry); } @Override @@ -75,5 +77,4 @@ public String getName() { public Optional getHelpPage() { return Optional.of(HelpFile.FETCHER_DBLP); } - } diff --git a/src/main/java/org/jabref/logic/layout/format/RemoveLatexCommandsFormatter.java b/src/main/java/org/jabref/logic/layout/format/RemoveLatexCommandsFormatter.java index a538bdaf13ef..e9681c4d698d 100644 --- a/src/main/java/org/jabref/logic/layout/format/RemoveLatexCommandsFormatter.java +++ b/src/main/java/org/jabref/logic/layout/format/RemoveLatexCommandsFormatter.java @@ -7,51 +7,61 @@ public class RemoveLatexCommandsFormatter implements LayoutFormatter { @Override public String format(String field) { - StringBuilder sb = new StringBuilder(""); + StringBuilder cleanedField = new StringBuilder(); StringBuilder currentCommand = null; - char c; + char currentCharacter; boolean escaped = false; boolean incommand = false; - int i; - for (i = 0; i < field.length(); i++) { - c = field.charAt(i); - if (escaped && (c == '\\')) { - sb.append('\\'); + int currentFieldPosition; + for (currentFieldPosition = 0; currentFieldPosition < field.length(); currentFieldPosition++) { + currentCharacter = field.charAt(currentFieldPosition); + if (escaped && (currentCharacter == '\\')) { + cleanedField.append('\\'); escaped = false; - } else if (c == '\\') { + // \\ --> first \ begins the command, second \ ends the command + // \latexcommand\\ -> \latexcommand is the command, terminated by \, which begins a new command + incommand = false; + } else if (currentCharacter == '\\') { escaped = true; incommand = true; currentCommand = new StringBuilder(); - } else if (!incommand && ((c == '{') || (c == '}'))) { + } else if (!incommand && ((currentCharacter == '{') || (currentCharacter 
== '}'))) { // Swallow the brace. - } else if (Character.isLetter(c) || StringUtil.SPECIAL_COMMAND_CHARS.contains(String.valueOf(c))) { + } else if (Character.isLetter(currentCharacter) || StringUtil.SPECIAL_COMMAND_CHARS.contains(String.valueOf(currentCharacter))) { escaped = false; if (incommand) { - currentCommand.append(c); + currentCommand.append(currentCharacter); if ((currentCommand.length() == 1) && StringUtil.SPECIAL_COMMAND_CHARS.contains(currentCommand.toString())) { // This indicates that we are in a command of the type \^o or \~{n} incommand = false; escaped = false; - } } else { - sb.append(c); + cleanedField.append(currentCharacter); } - } else if (Character.isLetter(c)) { + } else if (Character.isLetter(currentCharacter)) { escaped = false; if (incommand) { // We are in a command, and should not keep the letter. - currentCommand.append(c); + currentCommand.append(currentCharacter); } else { - sb.append(c); + cleanedField.append(currentCharacter); } } else { - if (!incommand || (!Character.isWhitespace(c) && (c != '{'))) { - sb.append(c); + if (!incommand || (!Character.isWhitespace(currentCharacter) && (currentCharacter != '{'))) { + cleanedField.append(currentCharacter); } else { - if (c != '{') { - sb.append(c); + if (!Character.isWhitespace(currentCharacter) && (currentCharacter != '{')) { + // do not append the opening brace of a command parameter + // do not append the whitespace character + cleanedField.append(currentCharacter); + } + if (incommand) { + // eat up all whitespace characters + while ((currentFieldPosition + 1 < field.length() && Character.isWhitespace(field.charAt(currentFieldPosition + 1)))) { + currentFieldPosition++; + } } } incommand = false; @@ -59,7 +69,6 @@ public String format(String field) { } } - return sb.toString(); + return cleanedField.toString(); } - } diff --git a/src/test/java/org/jabref/logic/formatter/bibtexfields/CleanupUrlFormatterTest.java 
b/src/test/java/org/jabref/logic/formatter/bibtexfields/CleanupUrlFormatterTest.java index 46e4bea35080..c6a838aabae3 100644 --- a/src/test/java/org/jabref/logic/formatter/bibtexfields/CleanupUrlFormatterTest.java +++ b/src/test/java/org/jabref/logic/formatter/bibtexfields/CleanupUrlFormatterTest.java @@ -10,11 +10,11 @@ */ class CleanupUrlFormatterTest { - private CleanupURLFormatter formatter; + private CleanupUrlFormatter formatter; @BeforeEach void setUp() { - formatter = new CleanupURLFormatter(); + formatter = new CleanupUrlFormatter(); } @Test @@ -29,6 +29,11 @@ void extractURLFormLink() { formatter.format("away.php?to=http%3A%2F%2Fwikipedia.org&a=snippet")); } + @Test + void latexCommandsRemoved() { + assertEquals("http://pi.informatik.uni-siegen.de/stt/36_2/./03_Technische_Beitraege/ZEUS2016/beitrag_2.pdf", formatter.format("http://pi.informatik.uni-siegen.de/stt/36\\_2/./03\\_Technische\\_Beitraege/ZEUS2016/beitrag\\_2.pdf")); + } + @Test void formatExample() { assertEquals("http://www.focus.de/" + diff --git a/src/test/java/org/jabref/logic/layout/format/RemoveBracketsTest.java b/src/test/java/org/jabref/logic/layout/format/RemoveBracketsTest.java index 48c904c40bc5..018d53547865 100644 --- a/src/test/java/org/jabref/logic/layout/format/RemoveBracketsTest.java +++ b/src/test/java/org/jabref/logic/layout/format/RemoveBracketsTest.java @@ -16,10 +16,22 @@ public void setUp() { } @Test - public void testFormat() throws Exception { + public void bracePairCorrectlyRemoved() throws Exception { assertEquals("some text", formatter.format("{some text}")); + } + + @Test + public void singleOpeningBraceCorrectlyRemoved() throws Exception { assertEquals("some text", formatter.format("{some text")); + } + + @Test + public void singleClosingBraceCorrectlyRemoved() throws Exception { assertEquals("some text", formatter.format("some text}")); + } + + @Test + public void bracePairWithEscapedBackslashCorrectlyRemoved() throws Exception { assertEquals("\\some text\\", 
formatter.format("\\{some text\\}")); } } diff --git a/src/test/java/org/jabref/logic/layout/format/RemoveLatexCommandsFormatterTest.java b/src/test/java/org/jabref/logic/layout/format/RemoveLatexCommandsFormatterTest.java new file mode 100644 index 000000000000..cf13831200d3 --- /dev/null +++ b/src/test/java/org/jabref/logic/layout/format/RemoveLatexCommandsFormatterTest.java @@ -0,0 +1,63 @@ +package org.jabref.logic.layout.format; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +class RemoveLatexCommandsFormatterTest { + + private RemoveLatexCommandsFormatter formatter; + + @BeforeEach + public void setUp() { + formatter = new RemoveLatexCommandsFormatter(); + } + + @Test + public void withoutLatexCommandsUnmodified() { + assertEquals("some text", formatter.format("some text")); + } + + @Test + public void singleCommandWiped() { + assertEquals("", formatter.format("\\sometext")); + } + + @Test + public void singleSpaceAfterCommandRemoved() { + assertEquals("text", formatter.format("\\some text")); + } + + @Test + public void multipleSpacesAfterCommandRemoved() { + assertEquals("text", formatter.format("\\some text")); + } + + @Test + public void escapedBackslashBecomesBackslash() { + assertEquals("\\", formatter.format("\\\\")); + } + + @Test + public void escapedBackslashFollowedByTextBecomesBackslashFollowedByText() { + assertEquals("\\some text", formatter.format("\\\\some text")); + } + + @Test + public void escapedBackslashKept() { + assertEquals("\\some text\\", formatter.format("\\\\some text\\\\")); + } + + @Test + public void escapedUnderscoreReplaces() { + assertEquals("some_text", formatter.format("some\\_text")); + } + + @Test + public void exampleUrlCorrectlyCleaned() { + assertEquals("http://pi.informatik.uni-siegen.de/stt/36_2/./03_Technische_Beitraege/ZEUS2016/beitrag_2.pdf", 
formatter.format("http://pi.informatik.uni-siegen.de/stt/36\\_2/./03\\_Technische\\_Beitraege/ZEUS2016/beitrag\\_2.pdf")); + } + + +}