Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add cleanup operation in reference to the Urldate field #9999

Merged
merged 14 commits into from
Jun 11, 2023
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve
- We added drag and drop events for field 'Groups' in entry editor panel. [#569](https://github.com/koppor/jabref/issues/569)
- We added support for parsing MathML in the Medline importer. [#4273](https://github.com/JabRef/jabref/issues/4273)
- We added the ability to search for a DOI directly from 'Web Search'. [#9674](https://github.com/JabRef/jabref/issues/9674)
- We added a cleanup activity that identifies a URL in the `note` field and moves it to the `url` field. [koppor#216](https://github.com/koppor/jabref/issues/216)
- We added a cleanup activity that identifies a URL or a last-visited date in the `note` field and moves it to the `url` or `urldate` field, respectively. [koppor#216](https://github.com/koppor/jabref/issues/216)
- We enabled the user to change the name of a field in a custom entry type by double-clicking on it. [#9840](https://github.com/JabRef/jabref/issues/9840)


Expand Down
66 changes: 51 additions & 15 deletions src/main/java/org/jabref/logic/cleanup/URLCleanup.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jabref.logic.formatter.bibtexfields.NormalizeDateFormatter;
import org.jabref.model.FieldChange;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.Date;
import org.jabref.model.entry.field.Field;
import org.jabref.model.entry.field.StandardField;

Expand All @@ -15,31 +17,41 @@
*/
public class URLCleanup implements CleanupJob {

/*
* URL_REGEX was adapted from a solution suggested in
* https://stackoverflow.com/questions/28185064/python-infinite-loop-in-regex-to-match-url.
* To make it work in Java, the pattern was adjusted to Java string syntax
* (mainly by doubling the backslashes).
*/
public static final String URL_REGEX = "(?i)\\b((?:https?://|www\\d{0,3}[.]|[a-z0-9.\\-]+[.]"
+ "[a-z]{2,4}/)(?:[^\\s()<>\\\\]+|\\(([^\\s()<>\\\\]+|(\\([^\\s()"
+ "<>\\\\]+\\)))*\\))+(?:\\(([^\\s()<>\\\\]+|(\\([^\\s()<>\\\\]+\\"
+ ")))*\\)|[^\\s`!()\\[\\]{};:'\".,<>?«»“”‘’]))";

public static final String DATE_TERMS_REGEX = "accessed on|visited on|retrieved on|viewed on";

private static final Field NOTE_FIELD = StandardField.NOTE;
private static final Field URL_FIELD = StandardField.URL;
private static final Field URLDATE_FIELD = StandardField.URLDATE;

final Pattern urlPattern = Pattern.compile(URL_REGEX, Pattern.CASE_INSENSITIVE);
final Pattern dateTermsPattern = Pattern.compile(DATE_TERMS_REGEX, Pattern.CASE_INSENSITIVE);
final Pattern datePattern = Pattern.compile(Date.DATE_REGEX, Pattern.CASE_INSENSITIVE);

private NormalizeDateFormatter formatter = new NormalizeDateFormatter();

@Override
public List<FieldChange> cleanup(BibEntry entry) {
List<FieldChange> changes = new ArrayList<>();

String noteFieldValue = entry.getField(NOTE_FIELD).orElse(null);

/*
* The urlRegex was originally fetched from a suggested solution in
* https://stackoverflow.com/questions/28185064/python-infinite-loop-in-regex-to-match-url.
* In order to be functional, we made the necessary adjustments regarding Java
* features (mainly doubled backslashes).
*/
String urlRegex = "(?i)\\b((?:https?://|www\\d{0,3}[.]|[a-z0-9.\\-]+[.]"
+ "[a-z]{2,4}/)(?:[^\\s()<>\\\\]+|\\(([^\\s()<>\\\\]+|(\\([^\\s()"
+ "<>\\\\]+\\)))*\\))+(?:\\(([^\\s()<>\\\\]+|(\\([^\\s()<>\\\\]+\\"
+ ")))*\\)|[^\\s`!()\\[\\]{};:'\".,<>?«»“”‘’]))";

final Pattern pattern = Pattern.compile(urlRegex, Pattern.CASE_INSENSITIVE);
final Matcher matcher = pattern.matcher(noteFieldValue);
final Matcher urlMatcher = urlPattern.matcher(noteFieldValue);
final Matcher dateTermsMatcher = dateTermsPattern.matcher(noteFieldValue);
final Matcher dateMatcher = datePattern.matcher(noteFieldValue);
dkokkotas marked this conversation as resolved.
Show resolved Hide resolved

if (matcher.find()) {
String url = matcher.group();
if (urlMatcher.find()) {
String url = urlMatcher.group();

// Remove the URL from the NoteFieldValue
String newNoteFieldValue = noteFieldValue
Expand Down Expand Up @@ -70,6 +82,30 @@ public List<FieldChange> cleanup(BibEntry entry) {
entry.setField(NOTE_FIELD, newNoteFieldValue).ifPresent(changes::add);
entry.setField(URL_FIELD, url).ifPresent(changes::add);
}

if (dateTermsMatcher.find()) {
String term = dateTermsMatcher.group();
newNoteFieldValue = newNoteFieldValue
.replace(term, "");
if (dateMatcher.find()) {
String date = dateMatcher.group();
String formattedDate = formatter.format(date);
newNoteFieldValue = newNoteFieldValue
.replace(date, "").trim()
.replaceAll("^,|,$", "").trim(); // either starts or ends with a comma

// Same approach with the URL cleanup.
if (entry.hasField(URLDATE_FIELD)) {
String urlDateFieldValue = entry.getField(URLDATE_FIELD).orElse(null);
if (urlDateFieldValue.equals(formattedDate)) {
entry.setField(NOTE_FIELD, newNoteFieldValue).ifPresent(changes::add);
}
} else {
entry.setField(NOTE_FIELD, newNoteFieldValue).ifPresent(changes::add);
entry.setField(URLDATE_FIELD, formattedDate).ifPresent(changes::add);
}
}
}
}
return changes;
}
Expand Down
6 changes: 6 additions & 0 deletions src/main/java/org/jabref/model/entry/Date.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

public class Date {

public static final String DATE_REGEX;
private static final DateTimeFormatter NORMALIZED_DATE_FORMATTER = DateTimeFormatter.ofPattern("uuuu[-MM][-dd]");
private static final DateTimeFormatter SIMPLE_DATE_FORMATS;
private static final Logger LOGGER = LoggerFactory.getLogger(Date.class);
Expand Down Expand Up @@ -65,6 +66,11 @@ public class Date {
DateTimeFormatterBuilder::appendOptional,
(builder, formatterBuilder) -> builder.append(formatterBuilder.toFormatter()))
.toFormatter(Locale.US);

DATE_REGEX = "\\d{4}-\\d{1,2}-\\d{1,2}" + // covers YYYY-MM-DD, YYYY-M-DD, YYYY-MM-D, YYYY-M-D
dkokkotas marked this conversation as resolved.
Show resolved Hide resolved
"|\\d{4}\\.\\d{1,2}\\.\\d{1,2}|" + // covers YYYY.MM.DD, YYYY.M.DD, YYYY.MM.D, YYYY.M.D
"(January|February|March|April|May|June|July|August|September|" +
"October|November|December) \\d{1,2}, \\d{4}"; // covers Month DD, YYYY & Month D, YYYY
dkokkotas marked this conversation as resolved.
Show resolved Hide resolved
}

private final TemporalAccessor date;
Expand Down
60 changes: 59 additions & 1 deletion src/test/java/org/jabref/logic/cleanup/URLCleanupTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,15 @@ private static Stream<Arguments> provideURL() {
"\\url{https://hdl.handle.net/10442/hedi/6089}, "
+ "\\url{http://142.42.1.1:8080}")),

// Input entry holds the same URL both in Note and Url field.
Arguments.of(
new BibEntry().withField(StandardField.URL,
"https://hdl.handle.net/10442/hedi/6089"),
new BibEntry().withField(StandardField.NOTE,
"\\url{https://hdl.handle.net/10442/hedi/6089}")
.withField(StandardField.URL,
"https://hdl.handle.net/10442/hedi/6089")),

// Input Note field has several values stored.
Arguments.of(
new BibEntry().withField(StandardField.URL,
Expand Down Expand Up @@ -112,7 +121,56 @@ private static Stream<Arguments> provideURL() {
new BibEntry().withField(StandardField.URL,
"https://www.example.com/foo/?bar=baz&inga=42&quux"),
new BibEntry().withField(StandardField.NOTE,
"https://www.example.com/foo/?bar=baz&inga=42&quux"))
"https://www.example.com/foo/?bar=baz&inga=42&quux")),

// Expected entry holds the formatted url-date in the Urldate field.
Arguments.of(
new BibEntry().withField(StandardField.URL,
"http://142.42.1.1:8080")
.withField(StandardField.URLDATE,
"2021-01-15"),
new BibEntry().withField(StandardField.NOTE,
"\\url{http://142.42.1.1:8080}, Accessed on January 15, 2021")),

// Input entry doesn't hold any URL in the Note field.
Arguments.of(
new BibEntry().withField(StandardField.NOTE,
"Accessed on 2015-01-15"),
new BibEntry().withField(StandardField.NOTE,
"Accessed on 2015-01-15")),

// Input entry has multiple url-dates stored in the Note field.
Arguments.of(
new BibEntry().withField(StandardField.URL,
"http://142.42.1.1:8080")
.withField(StandardField.URLDATE,
"2021-01-15")
.withField(StandardField.NOTE,
"Visited on February 12, 2017"),
new BibEntry().withField(StandardField.NOTE,
"\\url{http://142.42.1.1:8080}, Accessed on January 15, 2021, Visited on February 12, 2017")),

// Input entry holds the same url-date in both Note and Urldate field.
Arguments.of(
new BibEntry().withField(StandardField.URL,
"http://142.42.1.1:8080")
.withField(StandardField.URLDATE,
"2015-01-15"),
new BibEntry().withField(StandardField.NOTE,
"\\url{http://142.42.1.1:8080}, Visited on 2015.01.15")
.withField(StandardField.URLDATE,
"2015-01-15")),

// Input Note field has several values stored.
Arguments.of(
new BibEntry().withField(StandardField.URL,
"https://example.org")
.withField(StandardField.URLDATE,
"2023-04-11")
.withField(StandardField.NOTE,
"cited by Kramer"),
new BibEntry().withField(StandardField.NOTE,
"\\url{https://example.org}, cited by Kramer, Accessed on 2023-04-11"))
);
}
}