Skip to content

Commit

Permalink
Clean URLs in DBLPFetcher
Browse files Browse the repository at this point in the history
- Style RemoveLatexCommandsFormatter
- Add tests for RemoveLatexCommandsFormatter
- Add support for removing single and multiple whitespaces after a command
- Split tests in RemoveBracketsTest
- Fix casing in CleanupUrlFormatter (to match style and current name of test class CleanUpFormatterTest)
- Fix casing in some ...Url.. method names
- Format CleanupUrlFormatter
  • Loading branch information
koppor committed Nov 9, 2019
1 parent 42053a8 commit 89abea3
Show file tree
Hide file tree
Showing 10 changed files with 141 additions and 49 deletions.
9 changes: 4 additions & 5 deletions src/main/java/org/jabref/gui/fieldeditors/UrlEditor.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import org.jabref.gui.DialogService;
import org.jabref.gui.autocompleter.AutoCompleteSuggestionProvider;
import org.jabref.gui.fieldeditors.contextmenu.EditorMenus;
import org.jabref.logic.formatter.bibtexfields.CleanupURLFormatter;
import org.jabref.logic.formatter.bibtexfields.CleanupUrlFormatter;
import org.jabref.logic.formatter.bibtexfields.TrimWhitespaceFormatter;
import org.jabref.logic.integrity.FieldCheckers;
import org.jabref.model.entry.BibEntry;
Expand All @@ -34,12 +34,11 @@ public UrlEditor(Field field, DialogService dialogService, AutoCompleteSuggestio
.load();

textArea.textProperty().bindBidirectional(viewModel.textProperty());
Supplier<List<MenuItem>> contextMenuSupplier = EditorMenus.getCleanupURLMenu(textArea);
Supplier<List<MenuItem>> contextMenuSupplier = EditorMenus.getCleanupUrlMenu(textArea);
textArea.addToContextMenu(contextMenuSupplier);

// init paste handler for URLEditor to format pasted url link in textArea
textArea.setPasteActionHandler(() -> textArea.setText(new CleanupURLFormatter().format(new TrimWhitespaceFormatter().format(textArea.getText()))));

// init paste handler for UrlEditor to format pasted url link in textArea
textArea.setPasteActionHandler(() -> textArea.setText(new CleanupUrlFormatter().format(new TrimWhitespaceFormatter().format(textArea.getText()))));

new EditorValidator(preferences).configureValidation(viewModel.getFieldValidator().getValidationStatus(), textArea);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import org.jabref.gui.actions.ActionFactory;
import org.jabref.gui.actions.StandardActions;
import org.jabref.gui.edit.CopyDoiUrlAction;
import org.jabref.logic.formatter.bibtexfields.CleanupURLFormatter;
import org.jabref.logic.formatter.bibtexfields.CleanupUrlFormatter;
import org.jabref.logic.formatter.bibtexfields.NormalizeNamesFormatter;
import org.jabref.logic.l10n.Localization;

Expand Down Expand Up @@ -91,11 +91,11 @@ public static Supplier<List<MenuItem>> getDOIMenu(TextArea textArea) {
* @param textArea text-area that this menu will be connected to
* @return menu containing items of the default menu and an item to cleanup a URL
*/
public static Supplier<List<MenuItem>> getCleanupURLMenu(TextArea textArea) {
public static Supplier<List<MenuItem>> getCleanupUrlMenu(TextArea textArea) {
return () -> {
CustomMenuItem cleanupURL = new CustomMenuItem(new Label(Localization.lang("Cleanup URL link")));
cleanupURL.setDisable(textArea.textProperty().isEmpty().get());
cleanupURL.setOnAction(event -> textArea.setText(new CleanupURLFormatter().format(textArea.getText())));
cleanupURL.setOnAction(event -> textArea.setText(new CleanupUrlFormatter().format(textArea.getText())));

List<MenuItem> menuItems = new ArrayList<>();
menuItems.add(cleanupURL);
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/org/jabref/logic/formatter/Formatters.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import java.util.Objects;
import java.util.Optional;

import org.jabref.logic.formatter.bibtexfields.CleanupURLFormatter;
import org.jabref.logic.formatter.bibtexfields.CleanupUrlFormatter;
import org.jabref.logic.formatter.bibtexfields.ClearFormatter;
import org.jabref.logic.formatter.bibtexfields.EscapeUnderscoresFormatter;
import org.jabref.logic.formatter.bibtexfields.HtmlToLatexFormatter;
Expand Down Expand Up @@ -58,7 +58,7 @@ public static List<Formatter> getCaseChangers() {
public static List<Formatter> getOthers() {
return Arrays.asList(
new ClearFormatter(),
new CleanupURLFormatter(),
new CleanupUrlFormatter(),
new LatexCleanupFormatter(),
new MinifyNameListFormatter(),
new NormalizeDateFormatter(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import java.util.regex.Pattern;

import org.jabref.logic.l10n.Localization;
import org.jabref.logic.layout.format.RemoveLatexCommandsFormatter;
import org.jabref.model.cleanup.Formatter;

import org.apache.commons.logging.Log;
Expand All @@ -15,9 +16,9 @@
/**
* Cleanup URL link
*/
public class CleanupURLFormatter extends Formatter {
public class CleanupUrlFormatter extends Formatter {

private static final Log LOGGER = LogFactory.getLog(CleanupURLFormatter.class);
private static final Log LOGGER = LogFactory.getLog(CleanupUrlFormatter.class);
// This regexp find "url=" or "to=" parameter in full link and get text after them
private static final Pattern PATTERN_URL = Pattern.compile("(?:url|to)=([^&]*)");

Expand All @@ -38,16 +39,16 @@ public String format(String value) {

Matcher matcher = PATTERN_URL.matcher(value);
if (matcher.find()) {
toDecode = matcher.group(1);

toDecode = matcher.group(1);
}
try {
decodedLink = URLDecoder.decode(toDecode, StandardCharsets.UTF_8.name());
}
catch (UnsupportedEncodingException e) {
} catch (UnsupportedEncodingException e) {
LOGGER.warn("Used unsupported character encoding", e);
}
return decodedLink;

String result = new RemoveLatexCommandsFormatter().format(decodedLink);
return result;
}

@Override
Expand All @@ -61,6 +62,5 @@ public String getExampleInput() {
"rja&uact=8&ved=0ahUKEwjg3ZrB_ZPXAhVGuhoKHYdOBOg4ChAWCCYwAA&url=" +
"http%3A%2F%2Fwww.focus.de%2Fgesundheit%2Fratgeber%2Fherz%2Ftest%2" +
"Flebenserwartung-werden-sie-100-jahre-alt_aid_363828.html" + "&usg=AOvVaw1G6m2jf-pTHYkXceii4hXU";
}

}
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
import org.jabref.logic.l10n.Localization;
import org.jabref.model.cleanup.Formatter;

/**
* Simplifies LaTeX syntax. {@see org.jabref.logic.layout.format.RemoveLatexCommandsFormatter} for a formatter removing LaTeX commands completely.
*/
public class LatexCleanupFormatter extends Formatter {

private static final Pattern REMOVE_REDUNDANT = Pattern
Expand Down
11 changes: 6 additions & 5 deletions src/main/java/org/jabref/logic/importer/fetcher/DBLPFetcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import java.util.Optional;

import org.jabref.logic.cleanup.DoiCleanup;
import org.jabref.logic.formatter.bibtexfields.CleanupUrlFormatter;
import org.jabref.logic.formatter.bibtexfields.ClearFormatter;
import org.jabref.logic.help.HelpFile;
import org.jabref.logic.importer.FetcherException;
Expand All @@ -17,6 +18,7 @@
import org.jabref.model.cleanup.FieldFormatterCleanup;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.InternalField;
import org.jabref.model.entry.field.StandardField;
import org.jabref.model.util.DummyFileUpdateMonitor;

import org.apache.http.client.utils.URIBuilder;
Expand Down Expand Up @@ -57,13 +59,13 @@ public Parser getParser() {
@Override
public void doPostCleanup(BibEntry entry) {
DoiCleanup doiCleaner = new DoiCleanup();

FieldFormatterCleanup clearTimestampFormatter = new FieldFormatterCleanup(InternalField.TIMESTAMP,
new ClearFormatter());

doiCleaner.cleanup(entry);

FieldFormatterCleanup clearTimestampFormatter = new FieldFormatterCleanup(InternalField.TIMESTAMP, new ClearFormatter());
clearTimestampFormatter.cleanup(entry);

FieldFormatterCleanup cleanUpUrlFormatter = new FieldFormatterCleanup(StandardField.URL, new CleanupUrlFormatter());
cleanUpUrlFormatter.cleanup(entry);
}

@Override
Expand All @@ -75,5 +77,4 @@ public String getName() {
public Optional<HelpFile> getHelpPage() {
return Optional.of(HelpFile.FETCHER_DBLP);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -7,59 +7,68 @@ public class RemoveLatexCommandsFormatter implements LayoutFormatter {

@Override
public String format(String field) {
StringBuilder sb = new StringBuilder("");
StringBuilder cleanedField = new StringBuilder();
StringBuilder currentCommand = null;
char c;
char currentCharacter;
boolean escaped = false;
boolean incommand = false;
int i;
for (i = 0; i < field.length(); i++) {
c = field.charAt(i);
if (escaped && (c == '\\')) {
sb.append('\\');
int currentFieldPosition;
for (currentFieldPosition = 0; currentFieldPosition < field.length(); currentFieldPosition++) {
currentCharacter = field.charAt(currentFieldPosition);
if (escaped && (currentCharacter == '\\')) {
cleanedField.append('\\');
escaped = false;
} else if (c == '\\') {
// \\ --> first \ begins the command, second \ ends the command
// \latexommand\\ -> \latexcommand is the command, terminated by \, which begins a new command
incommand = false;
} else if (currentCharacter == '\\') {
escaped = true;
incommand = true;
currentCommand = new StringBuilder();
} else if (!incommand && ((c == '{') || (c == '}'))) {
} else if (!incommand && ((currentCharacter == '{') || (currentCharacter == '}'))) {
// Swallow the brace.
} else if (Character.isLetter(c) || StringUtil.SPECIAL_COMMAND_CHARS.contains(String.valueOf(c))) {
} else if (Character.isLetter(currentCharacter) || StringUtil.SPECIAL_COMMAND_CHARS.contains(String.valueOf(currentCharacter))) {
escaped = false;
if (incommand) {
currentCommand.append(c);
currentCommand.append(currentCharacter);
if ((currentCommand.length() == 1)
&& StringUtil.SPECIAL_COMMAND_CHARS.contains(currentCommand.toString())) {
// This indicates that we are in a command of the type \^o or \~{n}
incommand = false;
escaped = false;

}
} else {
sb.append(c);
cleanedField.append(currentCharacter);
}
} else if (Character.isLetter(c)) {
} else if (Character.isLetter(currentCharacter)) {
escaped = false;
if (incommand) {
// We are in a command, and should not keep the letter.
currentCommand.append(c);
currentCommand.append(currentCharacter);
} else {
sb.append(c);
cleanedField.append(currentCharacter);
}
} else {
if (!incommand || (!Character.isWhitespace(c) && (c != '{'))) {
sb.append(c);
if (!incommand || (!Character.isWhitespace(currentCharacter) && (currentCharacter != '{'))) {
cleanedField.append(currentCharacter);
} else {
if (c != '{') {
sb.append(c);
if (!Character.isWhitespace(currentCharacter) && (currentCharacter != '{')) {
// do not append the opening brace of a command parameter
// do not append the whitespace character
cleanedField.append(currentCharacter);
}
if (incommand) {
// eat up all whitespace characters
while ((currentFieldPosition + 1 < field.length() && Character.isWhitespace(field.charAt(currentFieldPosition + 1)))) {
currentFieldPosition++;
}
}
}
incommand = false;
escaped = false;
}
}

return sb.toString();
return cleanedField.toString();
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@
*/
class CleanupUrlFormatterTest {

private CleanupURLFormatter formatter;
private CleanupUrlFormatter formatter;

@BeforeEach
void setUp() {
formatter = new CleanupURLFormatter();
formatter = new CleanupUrlFormatter();
}

@Test
Expand All @@ -29,6 +29,11 @@ void extractURLFormLink() {
formatter.format("away.php?to=http%3A%2F%2Fwikipedia.org&a=snippet"));
}

@Test
void latexCommandsRemoved() {
assertEquals("http://pi.informatik.uni-siegen.de/stt/36_2/./03_Technische_Beitraege/ZEUS2016/beitrag_2.pdf", formatter.format("http://pi.informatik.uni-siegen.de/stt/36\\_2/./03\\_Technische\\_Beitraege/ZEUS2016/beitrag\\_2.pdf"));
}

@Test
void formatExample() {
assertEquals("http://www.focus.de/" +
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,22 @@ public void setUp() {
}

@Test
public void testFormat() throws Exception {
public void bracePairCorrectlyRemoved() throws Exception {
assertEquals("some text", formatter.format("{some text}"));
}

@Test
public void singleOpeningBraceCorrectlyRemoved() throws Exception {
assertEquals("some text", formatter.format("{some text"));
}

@Test
public void singleClosingBraceCorrectlyRemoved() throws Exception {
assertEquals("some text", formatter.format("some text}"));
}

@Test
public void bracePairWithEscapedBackslashCorrectlyRemoved() throws Exception {
assertEquals("\\some text\\", formatter.format("\\{some text\\}"));
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
package org.jabref.logic.layout.format;

import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;

class RemoveLatexCommandsFormatterTest {

private RemoveLatexCommandsFormatter formatter;

@BeforeEach
public void setUp() {
formatter = new RemoveLatexCommandsFormatter();
}

@Test
public void withoutLatexCommandsUnmodified() {
assertEquals("some text", formatter.format("some text"));
}

@Test
public void singleCommandWiped() {
assertEquals("", formatter.format("\\sometext"));
}

@Test
public void singleSpaceAfterCommandRemoved() {
assertEquals("text", formatter.format("\\some text"));
}

@Test
public void multipleSpacesAfterCommandRemoved() {
assertEquals("text", formatter.format("\\some text"));
}

@Test
public void escapedBackslashBecomesBackslash() {
assertEquals("\\", formatter.format("\\\\"));
}

@Test
public void escapedBackslashFollowedByTextBecomesBackslashFollowedByText() {
assertEquals("\\some text", formatter.format("\\\\some text"));
}

@Test
public void escapedBackslashKept() {
assertEquals("\\some text\\", formatter.format("\\\\some text\\\\"));
}

@Test
public void escapedUnderscoreReplaces() {
assertEquals("some_text", formatter.format("some\\_text"));
}

@Test
public void exampleUrlCorrectlyCleaned() {
assertEquals("http://pi.informatik.uni-siegen.de/stt/36_2/./03_Technische_Beitraege/ZEUS2016/beitrag_2.pdf", formatter.format("http://pi.informatik.uni-siegen.de/stt/36\\_2/./03\\_Technische\\_Beitraege/ZEUS2016/beitrag\\_2.pdf"));
}


}

0 comments on commit 89abea3

Please sign in to comment.