diff --git a/CHANGELOG.md b/CHANGELOG.md index 11aa3680d04..9c7077e1d3a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,7 +31,7 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve - We added drag and drop events for field 'Groups' in entry editor panel. [#569](https://github.com/koppor/jabref/issues/569) - We added support for parsing MathML in the Medline importer. [#4273](https://github.com/JabRef/jabref/issues/4273) - We added the ability to search for a DOI directly from 'Web Search'. [#9674](https://github.com/JabRef/jabref/issues/9674) -- We added a cleanup activity that identifies a URL in the `note` field and moves it to the `url` field. [koppor#216](https://github.com/koppor/jabref/issues/216) +- We added a cleanup activity that identifies a URL or a last-visited-date in the `note` field and moves it to the `url` and `urldate` field respectively. [koppor#216](https://github.com/koppor/jabref/issues/216) - We enabled the user to change the name of a field in a custom entry type by double-clicking on it. [#9840](https://github.com/JabRef/jabref/issues/9840) diff --git a/src/main/java/org/jabref/logic/cleanup/URLCleanup.java b/src/main/java/org/jabref/logic/cleanup/URLCleanup.java index d7569a120f2..924a3e5af70 100644 --- a/src/main/java/org/jabref/logic/cleanup/URLCleanup.java +++ b/src/main/java/org/jabref/logic/cleanup/URLCleanup.java @@ -5,8 +5,10 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.jabref.logic.formatter.bibtexfields.NormalizeDateFormatter; import org.jabref.model.FieldChange; import org.jabref.model.entry.BibEntry; +import org.jabref.model.entry.Date; import org.jabref.model.entry.field.Field; import org.jabref.model.entry.field.StandardField; @@ -15,8 +17,28 @@ */ public class URLCleanup implements CleanupJob { + /* + * The urlRegex was originally fetched from a suggested solution in + * https://stackoverflow.com/questions/28185064/python-infinite-loop-in-regex-to-match-url. + * In order to be functional, we made the necessary adjustments regarding Java + * features (mainly doubled backslashes). + */ + public static final String URL_REGEX = "(?i)\\b((?:https?://|www\\d{0,3}[.]|[a-z0-9.\\-]+[.]" + + "[a-z]{2,4}/)(?:[^\\s()<>\\\\]+|\\(([^\\s()<>\\\\]+|(\\([^\\s()" + + "<>\\\\]+\\)))*\\))+(?:\\(([^\\s()<>\\\\]+|(\\([^\\s()<>\\\\]+\\" + + ")))*\\)|[^\\s`!()\\[\\]{};:'\".,<>?«»“”‘’]))"; + + public static final String DATE_TERMS_REGEX = "accessed on|visited on|retrieved on|viewed on"; + private static final Field NOTE_FIELD = StandardField.NOTE; private static final Field URL_FIELD = StandardField.URL; + private static final Field URLDATE_FIELD = StandardField.URLDATE; + + final Pattern urlPattern = Pattern.compile(URL_REGEX, Pattern.CASE_INSENSITIVE); + final Pattern dateTermsPattern = Pattern.compile(DATE_TERMS_REGEX, Pattern.CASE_INSENSITIVE); + final Pattern datePattern = Pattern.compile(Date.DATE_REGEX, Pattern.CASE_INSENSITIVE); + + private NormalizeDateFormatter formatter = new NormalizeDateFormatter(); @Override public List cleanup(BibEntry entry) { @@ -24,22 +46,12 @@ public List cleanup(BibEntry entry) { String noteFieldValue = entry.getField(NOTE_FIELD).orElse(null); - /* - * The urlRegex was originally fetched from a suggested solution in - * https://stackoverflow.com/questions/28185064/python-infinite-loop-in-regex-to-match-url. - * In order to be functional, we made the necessary adjustments regarding Java - * features (mainly doubled backslashes). - */ - String urlRegex = "(?i)\\b((?:https?://|www\\d{0,3}[.]|[a-z0-9.\\-]+[.]" - + "[a-z]{2,4}/)(?:[^\\s()<>\\\\]+|\\(([^\\s()<>\\\\]+|(\\([^\\s()" - + "<>\\\\]+\\)))*\\))+(?:\\(([^\\s()<>\\\\]+|(\\([^\\s()<>\\\\]+\\" - + ")))*\\)|[^\\s`!()\\[\\]{};:'\".,<>?«»“”‘’]))"; - - final Pattern pattern = Pattern.compile(urlRegex, Pattern.CASE_INSENSITIVE); - final Matcher matcher = pattern.matcher(noteFieldValue); + final Matcher urlMatcher = urlPattern.matcher(noteFieldValue); + final Matcher dateTermsMatcher = dateTermsPattern.matcher(noteFieldValue); + final Matcher dateMatcher = datePattern.matcher(noteFieldValue); - if (matcher.find()) { - String url = matcher.group(); + if (urlMatcher.find()) { + String url = urlMatcher.group(); // Remove the URL from the NoteFieldValue String newNoteFieldValue = noteFieldValue @@ -70,6 +82,30 @@ public List cleanup(BibEntry entry) { entry.setField(NOTE_FIELD, newNoteFieldValue).ifPresent(changes::add); entry.setField(URL_FIELD, url).ifPresent(changes::add); } + + if (dateTermsMatcher.find()) { + String term = dateTermsMatcher.group(); + newNoteFieldValue = newNoteFieldValue + .replace(term, ""); + if (dateMatcher.find()) { + String date = dateMatcher.group(); + String formattedDate = formatter.format(date); + newNoteFieldValue = newNoteFieldValue + .replace(date, "").trim() + .replaceAll("^,|,$", "").trim(); // either starts or ends with a comma + + // Same approach with the URL cleanup. + if (entry.hasField(URLDATE_FIELD)) { + String urlDateFieldValue = entry.getField(URLDATE_FIELD).orElse(null); + if (urlDateFieldValue.equals(formattedDate)) { + entry.setField(NOTE_FIELD, newNoteFieldValue).ifPresent(changes::add); + } + } else { + entry.setField(NOTE_FIELD, newNoteFieldValue).ifPresent(changes::add); + entry.setField(URLDATE_FIELD, formattedDate).ifPresent(changes::add); + } + } + } } return changes; } diff --git a/src/main/java/org/jabref/model/entry/Date.java b/src/main/java/org/jabref/model/entry/Date.java index 46ea51971aa..219a7b294e5 100644 --- a/src/main/java/org/jabref/model/entry/Date.java +++ b/src/main/java/org/jabref/model/entry/Date.java @@ -19,6 +19,7 @@ public class Date { + public static final String DATE_REGEX; private static final DateTimeFormatter NORMALIZED_DATE_FORMATTER = DateTimeFormatter.ofPattern("uuuu[-MM][-dd]"); private static final DateTimeFormatter SIMPLE_DATE_FORMATS; private static final Logger LOGGER = LoggerFactory.getLogger(Date.class); @@ -65,6 +66,18 @@ public class Date { DateTimeFormatterBuilder::appendOptional, (builder, formatterBuilder) -> builder.append(formatterBuilder.toFormatter())) .toFormatter(Locale.US); + + /* + * There is also {@link org.jabref.model.entry.Date#parse(java.lang.String)}. + * The regex of that method cannot be used as we parse single dates here and that method parses: + * i) date ranges + * ii) two dates separated by '/' + * Additionally, parse method requires the reviewed String to hold only a date. + */ + DATE_REGEX = "\\d{4}-\\d{1,2}-\\d{1,2}" + // covers YYYY-MM-DD, YYYY-M-DD, YYYY-MM-D, YYYY-M-D + "|\\d{4}\\.\\d{1,2}\\.\\d{1,2}|" + // covers YYYY.MM.DD, YYYY.M.DD, YYYY.MM.D, YYYY.M.D + "(January|February|March|April|May|June|July|August|September|" + + "October|November|December) \\d{1,2}, \\d{4}"; // covers Month DD, YYYY & Month D, YYYY } private final TemporalAccessor date; diff --git a/src/test/java/org/jabref/logic/cleanup/URLCleanupTest.java b/src/test/java/org/jabref/logic/cleanup/URLCleanupTest.java index 7bfd2fd990b..1c967a1a781 100644 --- a/src/test/java/org/jabref/logic/cleanup/URLCleanupTest.java +++ b/src/test/java/org/jabref/logic/cleanup/URLCleanupTest.java @@ -53,6 +53,15 @@ private static Stream provideURL() { "\\url{https://hdl.handle.net/10442/hedi/6089}, " + "\\url{http://142.42.1.1:8080}")), + // Input entry holds the same URL both in Note and Url field. + Arguments.of( + new BibEntry().withField(StandardField.URL, + "https://hdl.handle.net/10442/hedi/6089"), + new BibEntry().withField(StandardField.NOTE, + "\\url{https://hdl.handle.net/10442/hedi/6089}") + .withField(StandardField.URL, + "https://hdl.handle.net/10442/hedi/6089")), + // Input Note field has several values stored. Arguments.of( new BibEntry().withField(StandardField.URL, @@ -112,7 +121,56 @@ private static Stream provideURL() { new BibEntry().withField(StandardField.URL, "https://www.example.com/foo/?bar=baz&inga=42&quux"), new BibEntry().withField(StandardField.NOTE, - "https://www.example.com/foo/?bar=baz&inga=42&quux")) + "https://www.example.com/foo/?bar=baz&inga=42&quux")), + + // Expected entry returns formatted the url-date in the Urldate field. + Arguments.of( + new BibEntry().withField(StandardField.URL, + "http://142.42.1.1:8080") + .withField(StandardField.URLDATE, + "2021-01-15"), + new BibEntry().withField(StandardField.NOTE, + "\\url{http://142.42.1.1:8080}, accessed on January 15, 2021")), + + // Input entry doesn't hold any URL in the Note field. + Arguments.of( + new BibEntry().withField(StandardField.NOTE, + "Accessed on 2015-01-15"), + new BibEntry().withField(StandardField.NOTE, + "Accessed on 2015-01-15")), + + // Input entry has multiple url-dates stored in the Note field. + Arguments.of( + new BibEntry().withField(StandardField.URL, + "http://142.42.1.1:8080") + .withField(StandardField.URLDATE, + "2021-01-15") + .withField(StandardField.NOTE, + "visited on February 12, 2017"), + new BibEntry().withField(StandardField.NOTE, + "\\url{http://142.42.1.1:8080}, accessed on January 15, 2021, visited on February 12, 2017")), + + // Input entry holds the same url-date in both Note and Urldate field. + Arguments.of( + new BibEntry().withField(StandardField.URL, + "http://142.42.1.1:8080") + .withField(StandardField.URLDATE, + "2015-01-15"), + new BibEntry().withField(StandardField.NOTE, + "\\url{http://142.42.1.1:8080}, visited on 2015.01.15") + .withField(StandardField.URLDATE, + "2015-01-15")), + + // Input Note field has several values stored. + Arguments.of( + new BibEntry().withField(StandardField.URL, + "https://example.org") + .withField(StandardField.URLDATE, + "2023-04-11") + .withField(StandardField.NOTE, + "cited by Kramer"), + new BibEntry().withField(StandardField.NOTE, + "\\url{https://example.org}, cited by Kramer, accessed on 2023-04-11")) ); } }