Skip to content

Commit

Permalink
Add cleanup operation in reference to the Urldate field (#9999)
Browse files Browse the repository at this point in the history
  • Loading branch information
dkokkotas authored Jun 11, 2023
1 parent b2b4aba commit dd656c5
Show file tree
Hide file tree
Showing 4 changed files with 124 additions and 17 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve
- We added drag and drop events for field 'Groups' in entry editor panel. [#569](https://github.com/koppor/jabref/issues/569)
- We added support for parsing MathML in the Medline importer. [#4273](https://github.com/JabRef/jabref/issues/4273)
- We added the ability to search for a DOI directly from 'Web Search'. [#9674](https://github.com/JabRef/jabref/issues/9674)
- We added a cleanup activity that identifies a URL in the `note` field and moves it to the `url` field. [koppor#216](https://github.com/koppor/jabref/issues/216)
- We added a cleanup activity that identifies a URL or a last-visited date in the `note` field and moves them to the `url` and `urldate` fields, respectively. [koppor#216](https://github.com/koppor/jabref/issues/216)
- We enabled the user to change the name of a field in a custom entry type by double-clicking on it. [#9840](https://github.com/JabRef/jabref/issues/9840)


Expand Down
66 changes: 51 additions & 15 deletions src/main/java/org/jabref/logic/cleanup/URLCleanup.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jabref.logic.formatter.bibtexfields.NormalizeDateFormatter;
import org.jabref.model.FieldChange;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.Date;
import org.jabref.model.entry.field.Field;
import org.jabref.model.entry.field.StandardField;

Expand All @@ -15,31 +17,41 @@
*/
public class URLCleanup implements CleanupJob {

/*
* The urlRegex was originally fetched from a suggested solution in
* https://stackoverflow.com/questions/28185064/python-infinite-loop-in-regex-to-match-url.
* To make the regex work in Java, we made the necessary adjustments
* (mainly doubling the backslashes).
*/
public static final String URL_REGEX = "(?i)\\b((?:https?://|www\\d{0,3}[.]|[a-z0-9.\\-]+[.]"
+ "[a-z]{2,4}/)(?:[^\\s()<>\\\\]+|\\(([^\\s()<>\\\\]+|(\\([^\\s()"
+ "<>\\\\]+\\)))*\\))+(?:\\(([^\\s()<>\\\\]+|(\\([^\\s()<>\\\\]+\\"
+ ")))*\\)|[^\\s`!()\\[\\]{};:'\".,<>?«»“”‘’]))";

public static final String DATE_TERMS_REGEX = "accessed on|visited on|retrieved on|viewed on";

private static final Field NOTE_FIELD = StandardField.NOTE;
private static final Field URL_FIELD = StandardField.URL;
private static final Field URLDATE_FIELD = StandardField.URLDATE;

final Pattern urlPattern = Pattern.compile(URL_REGEX, Pattern.CASE_INSENSITIVE);
final Pattern dateTermsPattern = Pattern.compile(DATE_TERMS_REGEX, Pattern.CASE_INSENSITIVE);
final Pattern datePattern = Pattern.compile(Date.DATE_REGEX, Pattern.CASE_INSENSITIVE);

private NormalizeDateFormatter formatter = new NormalizeDateFormatter();

@Override
public List<FieldChange> cleanup(BibEntry entry) {
List<FieldChange> changes = new ArrayList<>();

String noteFieldValue = entry.getField(NOTE_FIELD).orElse(null);

/*
* The urlRegex was originally fetched from a suggested solution in
* https://stackoverflow.com/questions/28185064/python-infinite-loop-in-regex-to-match-url.
* In order to be functional, we made the necessary adjustments regarding Java
* features (mainly doubled backslashes).
*/
String urlRegex = "(?i)\\b((?:https?://|www\\d{0,3}[.]|[a-z0-9.\\-]+[.]"
+ "[a-z]{2,4}/)(?:[^\\s()<>\\\\]+|\\(([^\\s()<>\\\\]+|(\\([^\\s()"
+ "<>\\\\]+\\)))*\\))+(?:\\(([^\\s()<>\\\\]+|(\\([^\\s()<>\\\\]+\\"
+ ")))*\\)|[^\\s`!()\\[\\]{};:'\".,<>?«»“”‘’]))";

final Pattern pattern = Pattern.compile(urlRegex, Pattern.CASE_INSENSITIVE);
final Matcher matcher = pattern.matcher(noteFieldValue);
final Matcher urlMatcher = urlPattern.matcher(noteFieldValue);
final Matcher dateTermsMatcher = dateTermsPattern.matcher(noteFieldValue);
final Matcher dateMatcher = datePattern.matcher(noteFieldValue);

if (matcher.find()) {
String url = matcher.group();
if (urlMatcher.find()) {
String url = urlMatcher.group();

// Remove the URL from the NoteFieldValue
String newNoteFieldValue = noteFieldValue
Expand Down Expand Up @@ -70,6 +82,30 @@ public List<FieldChange> cleanup(BibEntry entry) {
entry.setField(NOTE_FIELD, newNoteFieldValue).ifPresent(changes::add);
entry.setField(URL_FIELD, url).ifPresent(changes::add);
}

if (dateTermsMatcher.find()) {
String term = dateTermsMatcher.group();
newNoteFieldValue = newNoteFieldValue
.replace(term, "");
if (dateMatcher.find()) {
String date = dateMatcher.group();
String formattedDate = formatter.format(date);
newNoteFieldValue = newNoteFieldValue
.replace(date, "").trim()
.replaceAll("^,|,$", "").trim(); // either starts or ends with a comma

// Same approach as in the URL cleanup.
if (entry.hasField(URLDATE_FIELD)) {
String urlDateFieldValue = entry.getField(URLDATE_FIELD).orElse(null);
if (urlDateFieldValue.equals(formattedDate)) {
entry.setField(NOTE_FIELD, newNoteFieldValue).ifPresent(changes::add);
}
} else {
entry.setField(NOTE_FIELD, newNoteFieldValue).ifPresent(changes::add);
entry.setField(URLDATE_FIELD, formattedDate).ifPresent(changes::add);
}
}
}
}
return changes;
}
Expand Down
13 changes: 13 additions & 0 deletions src/main/java/org/jabref/model/entry/Date.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

public class Date {

public static final String DATE_REGEX;
private static final DateTimeFormatter NORMALIZED_DATE_FORMATTER = DateTimeFormatter.ofPattern("uuuu[-MM][-dd]");
private static final DateTimeFormatter SIMPLE_DATE_FORMATS;
private static final Logger LOGGER = LoggerFactory.getLogger(Date.class);
Expand Down Expand Up @@ -65,6 +66,18 @@ public class Date {
DateTimeFormatterBuilder::appendOptional,
(builder, formatterBuilder) -> builder.append(formatterBuilder.toFormatter()))
.toFormatter(Locale.US);

/*
* There is also {@link org.jabref.model.entry.Date#parse(java.lang.String)}.
* The regex of that method cannot be used as we parse single dates here and that method parses:
* i) date ranges
* ii) two dates separated by '/'
* Additionally, the parse method requires the input String to contain nothing but a date.
*/
DATE_REGEX = "\\d{4}-\\d{1,2}-\\d{1,2}" + // covers YYYY-MM-DD, YYYY-M-DD, YYYY-MM-D, YYYY-M-D
"|\\d{4}\\.\\d{1,2}\\.\\d{1,2}|" + // covers YYYY.MM.DD, YYYY.M.DD, YYYY.MM.D, YYYY.M.D
"(January|February|March|April|May|June|July|August|September|" +
"October|November|December) \\d{1,2}, \\d{4}"; // covers Month DD, YYYY & Month D, YYYY
}

private final TemporalAccessor date;
Expand Down
60 changes: 59 additions & 1 deletion src/test/java/org/jabref/logic/cleanup/URLCleanupTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,15 @@ private static Stream<Arguments> provideURL() {
"\\url{https://hdl.handle.net/10442/hedi/6089}, "
+ "\\url{http://142.42.1.1:8080}")),

// Input entry holds the same URL both in Note and Url field.
Arguments.of(
new BibEntry().withField(StandardField.URL,
"https://hdl.handle.net/10442/hedi/6089"),
new BibEntry().withField(StandardField.NOTE,
"\\url{https://hdl.handle.net/10442/hedi/6089}")
.withField(StandardField.URL,
"https://hdl.handle.net/10442/hedi/6089")),

// Input Note field has several values stored.
Arguments.of(
new BibEntry().withField(StandardField.URL,
Expand Down Expand Up @@ -112,7 +121,56 @@ private static Stream<Arguments> provideURL() {
new BibEntry().withField(StandardField.URL,
"https://www.example.com/foo/?bar=baz&inga=42&quux"),
new BibEntry().withField(StandardField.NOTE,
"https://www.example.com/foo/?bar=baz&inga=42&quux"))
"https://www.example.com/foo/?bar=baz&inga=42&quux")),

// Expected entry holds the url-date, in normalized format, in the Urldate field.
Arguments.of(
new BibEntry().withField(StandardField.URL,
"http://142.42.1.1:8080")
.withField(StandardField.URLDATE,
"2021-01-15"),
new BibEntry().withField(StandardField.NOTE,
"\\url{http://142.42.1.1:8080}, accessed on January 15, 2021")),

// Input entry doesn't hold any URL in the Note field.
Arguments.of(
new BibEntry().withField(StandardField.NOTE,
"Accessed on 2015-01-15"),
new BibEntry().withField(StandardField.NOTE,
"Accessed on 2015-01-15")),

// Input entry has multiple url-dates stored in the Note field.
Arguments.of(
new BibEntry().withField(StandardField.URL,
"http://142.42.1.1:8080")
.withField(StandardField.URLDATE,
"2021-01-15")
.withField(StandardField.NOTE,
"visited on February 12, 2017"),
new BibEntry().withField(StandardField.NOTE,
"\\url{http://142.42.1.1:8080}, accessed on January 15, 2021, visited on February 12, 2017")),

// Input entry holds the same url-date in both Note and Urldate field.
Arguments.of(
new BibEntry().withField(StandardField.URL,
"http://142.42.1.1:8080")
.withField(StandardField.URLDATE,
"2015-01-15"),
new BibEntry().withField(StandardField.NOTE,
"\\url{http://142.42.1.1:8080}, visited on 2015.01.15")
.withField(StandardField.URLDATE,
"2015-01-15")),

// Input Note field has several values stored.
Arguments.of(
new BibEntry().withField(StandardField.URL,
"https://example.org")
.withField(StandardField.URLDATE,
"2023-04-11")
.withField(StandardField.NOTE,
"cited by Kramer"),
new BibEntry().withField(StandardField.NOTE,
"\\url{https://example.org}, cited by Kramer, accessed on 2023-04-11"))
);
}
}

0 comments on commit dd656c5

Please sign in to comment.