From 10d8821e96aa2b0b9baa2f3b7d8fa2e7f76cc334 Mon Sep 17 00:00:00 2001 From: Dimitrios Kokkotas Date: Sat, 10 Jun 2023 20:28:12 +0300 Subject: [PATCH 01/14] Add urldate cleanup operation --- .../org/jabref/logic/cleanup/URLCleanup.java | 59 +++++++++++++++++-- 1 file changed, 55 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/jabref/logic/cleanup/URLCleanup.java b/src/main/java/org/jabref/logic/cleanup/URLCleanup.java index d7569a120f2..c2902b99400 100644 --- a/src/main/java/org/jabref/logic/cleanup/URLCleanup.java +++ b/src/main/java/org/jabref/logic/cleanup/URLCleanup.java @@ -5,6 +5,7 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.jabref.logic.formatter.bibtexfields.NormalizeDateFormatter; import org.jabref.model.FieldChange; import org.jabref.model.entry.BibEntry; import org.jabref.model.entry.field.Field; @@ -17,6 +18,9 @@ public class URLCleanup implements CleanupJob { private static final Field NOTE_FIELD = StandardField.NOTE; private static final Field URL_FIELD = StandardField.URL; + private static final Field URLDATE_FIELD = StandardField.URLDATE; + + private NormalizeDateFormatter formatter = new NormalizeDateFormatter(); @Override public List cleanup(BibEntry entry) { @@ -35,11 +39,34 @@ public List cleanup(BibEntry entry) { + "<>\\\\]+\\)))*\\))+(?:\\(([^\\s()<>\\\\]+|(\\([^\\s()<>\\\\]+\\" + ")))*\\)|[^\\s`!()\\[\\]{};:'\".,<>?«»“”‘’]))"; - final Pattern pattern = Pattern.compile(urlRegex, Pattern.CASE_INSENSITIVE); - final Matcher matcher = pattern.matcher(noteFieldValue); + String dateTermsRegex = "Accessed on|Visited on|Retrieved on|Viewed on"; + + /* + * dateRegex matches several date formats. Explanation: + *
    + *
  • "\d{4}": Matches exactly four digits (YYYY)
  • + *
  • "\d{1,2}": Matches one or two digits (M or MM)
  • + *
  • "\d{1,2}": Matches one or two digits (D or DD)
  • + *
+ * Indicative formats identified: + * YYYY-MM-DD, YYYY-MM-DD, YYYY-M-DD, YYYY-MM-D, YYYY-M-D + * YYYY.MM.DD, YYYY.MM.DD, YYYY.M.DD, YYYY.MM.D, YYYY.M.D + * Month DD, YYYY & Month D, YYYY + */ + String dateRegex = ("\\d{4}-\\d{1,2}-\\d{1,2}|\\d{4}\\.\\d{1,2}\\.\\d{1,2}|" + + "(January|February|March|April|May|June|July|August|September|" + + "October|November|December) \\d{1,2}, \\d{4}"); + + final Pattern urlPattern = Pattern.compile(urlRegex, Pattern.CASE_INSENSITIVE); + final Pattern termsPattern = Pattern.compile(dateTermsRegex, Pattern.CASE_INSENSITIVE); + final Pattern datePattern = Pattern.compile(dateRegex, Pattern.CASE_INSENSITIVE); - if (matcher.find()) { - String url = matcher.group(); + final Matcher urlMatcher = urlPattern.matcher(noteFieldValue); + final Matcher termsMatcher = termsPattern.matcher(noteFieldValue); + final Matcher dateMatcher = datePattern.matcher(noteFieldValue); + + if (urlMatcher.find()) { + String url = urlMatcher.group(); // Remove the URL from the NoteFieldValue String newNoteFieldValue = noteFieldValue @@ -70,6 +97,30 @@ public List cleanup(BibEntry entry) { entry.setField(NOTE_FIELD, newNoteFieldValue).ifPresent(changes::add); entry.setField(URL_FIELD, url).ifPresent(changes::add); } + + if (termsMatcher.find()) { + String term = termsMatcher.group(); + newNoteFieldValue = newNoteFieldValue + .replace(term, ""); + if (dateMatcher.find()) { + String date = dateMatcher.group(); + String formattedDate = formatter.format(date); + newNoteFieldValue = newNoteFieldValue + .replace(date, "").trim() + .replaceAll("^,|,$", "").trim(); // either starts or ends with comma + + // same behaviour with URL cleanup + if (entry.hasField(URLDATE_FIELD)) { + String urlDateFieldValue = entry.getField(URLDATE_FIELD).orElse(null); + if (urlDateFieldValue.equals(formattedDate)) { + entry.setField(NOTE_FIELD, newNoteFieldValue).ifPresent(changes::add); + } + } else { + entry.setField(NOTE_FIELD, newNoteFieldValue).ifPresent(changes::add); + 
entry.setField(URLDATE_FIELD, formattedDate).ifPresent(changes::add); + } + } + } } return changes; } From 3353bb6191ee0ca44b6e026f65483eae991d263c Mon Sep 17 00:00:00 2001 From: Dimitrios Kokkotas Date: Sat, 10 Jun 2023 20:29:37 +0300 Subject: [PATCH 02/14] Add parameterized tests regarding urldate cleanup --- .../jabref/logic/cleanup/URLCleanupTest.java | 60 ++++++++++++++++++- 1 file changed, 59 insertions(+), 1 deletion(-) diff --git a/src/test/java/org/jabref/logic/cleanup/URLCleanupTest.java b/src/test/java/org/jabref/logic/cleanup/URLCleanupTest.java index 7bfd2fd990b..8c5b8cad48f 100644 --- a/src/test/java/org/jabref/logic/cleanup/URLCleanupTest.java +++ b/src/test/java/org/jabref/logic/cleanup/URLCleanupTest.java @@ -53,6 +53,15 @@ private static Stream provideURL() { "\\url{https://hdl.handle.net/10442/hedi/6089}, " + "\\url{http://142.42.1.1:8080}")), + // Input entry holds the same URL both in Note and Url field. + Arguments.of( + new BibEntry().withField(StandardField.URL, + "https://hdl.handle.net/10442/hedi/6089"), + new BibEntry().withField(StandardField.NOTE, + "\\url{https://hdl.handle.net/10442/hedi/6089}") + .withField(StandardField.URL, + "https://hdl.handle.net/10442/hedi/6089")), + // Input Note field has several values stored. Arguments.of( new BibEntry().withField(StandardField.URL, @@ -112,7 +121,56 @@ private static Stream provideURL() { new BibEntry().withField(StandardField.URL, "https://www.example.com/foo/?bar=baz&inga=42&quux"), new BibEntry().withField(StandardField.NOTE, - "https://www.example.com/foo/?bar=baz&inga=42&quux")) + "https://www.example.com/foo/?bar=baz&inga=42&quux")), + + // Expected entry returns formatted the url-date in Urldate field. 
+ Arguments.of( + new BibEntry().withField(StandardField.URL, + "http://142.42.1.1:8080") + .withField(StandardField.URLDATE, + "2021-01-15"), + new BibEntry().withField(StandardField.NOTE, + "\\url{http://142.42.1.1:8080}, Accessed on January 15, 2021")), + + // Input entry doesn't hold any URL both in Note field. + Arguments.of( + new BibEntry().withField(StandardField.NOTE, + "Accessed on 2015-01-15"), + new BibEntry().withField(StandardField.NOTE, + "Accessed on 2015-01-15")), + + // Input entry has multiple url-dates stored in Note field. + Arguments.of( + new BibEntry().withField(StandardField.URL, + "http://142.42.1.1:8080") + .withField(StandardField.URLDATE, + "2021-01-15") + .withField(StandardField.NOTE, + "Visited on February 12, 2017"), + new BibEntry().withField(StandardField.NOTE, + "\\url{http://142.42.1.1:8080}, Accessed on January 15, 2021, Visited on February 12, 2017")), + + // Input entry holds the same url-date in both Note and Urldate field. + Arguments.of( + new BibEntry().withField(StandardField.URL, + "http://142.42.1.1:8080") + .withField(StandardField.URLDATE, + "2015-01-15"), + new BibEntry().withField(StandardField.NOTE, + "\\url{http://142.42.1.1:8080}, Visited on 2015.01.15") + .withField(StandardField.URLDATE, + "2015-01-15")), + + // Input Note field has several values stored. 
+ Arguments.of( + new BibEntry().withField(StandardField.URL, + "https://example.org") + .withField(StandardField.URLDATE, + "2023-04-11") + .withField(StandardField.NOTE, + "cited by Kramer"), + new BibEntry().withField(StandardField.NOTE, + "\\url{https://example.org}, cited by Kramer, Accessed on 2023-04-11")) ); } } From 9a0031da1764f18ac9a0ff58e9da068d7bec4601 Mon Sep 17 00:00:00 2001 From: Dimitrios Kokkotas Date: Sat, 10 Jun 2023 20:32:45 +0300 Subject: [PATCH 03/14] Update the existing cleanup operation description in CHANGELOG --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 11aa3680d04..f10dd6f3c03 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,7 +31,7 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve - We added drag and drop events for field 'Groups' in entry editor panel. [#569](https://github.com/koppor/jabref/issues/569) - We added support for parsing MathML in the Medline importer. [#4273](https://github.com/JabRef/jabref/issues/4273) - We added the ability to search for a DOI directly from 'Web Search'. [#9674](https://github.com/JabRef/jabref/issues/9674) -- We added a cleanup activity that identifies a URL in the `note` field and moves it to the `url` field. [koppor#216](https://github.com/koppor/jabref/issues/216) +- We added a cleanup activity that identifies a URL and/or a URL-DATE in the `note` field and moves it/them to the `url` and `urldate` field correspondingly. [koppor#216](https://github.com/koppor/jabref/issues/216) - We enabled the user to change the name of a field in a custom entry type by double-clicking on it. 
[#9840](https://github.com/JabRef/jabref/issues/9840) From f014a326b850c657ca13a0c7ed7c3a2b0c7a76bb Mon Sep 17 00:00:00 2001 From: Dimitrios Kokkotas Date: Sat, 10 Jun 2023 21:53:31 +0300 Subject: [PATCH 04/14] Remove duplicate date formats from javadoc comment --- src/main/java/org/jabref/logic/cleanup/URLCleanup.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/jabref/logic/cleanup/URLCleanup.java b/src/main/java/org/jabref/logic/cleanup/URLCleanup.java index c2902b99400..799585de531 100644 --- a/src/main/java/org/jabref/logic/cleanup/URLCleanup.java +++ b/src/main/java/org/jabref/logic/cleanup/URLCleanup.java @@ -49,8 +49,8 @@ public List cleanup(BibEntry entry) { *
  • "\d{1,2}": Matches one or two digits (D or DD)
  • * * Indicative formats identified: - * YYYY-MM-DD, YYYY-MM-DD, YYYY-M-DD, YYYY-MM-D, YYYY-M-D - * YYYY.MM.DD, YYYY.MM.DD, YYYY.M.DD, YYYY.MM.D, YYYY.M.D + * YYYY-MM-DD, YYYY-M-DD, YYYY-MM-D, YYYY-M-D + * YYYY.MM.DD, YYYY.M.DD, YYYY.MM.D, YYYY.M.D * Month DD, YYYY & Month D, YYYY */ String dateRegex = ("\\d{4}-\\d{1,2}-\\d{1,2}|\\d{4}\\.\\d{1,2}\\.\\d{1,2}|" + From 96c1e25e5e6079752215a7b619f6811d4b58f344 Mon Sep 17 00:00:00 2001 From: Dimitrios Kokkotas Date: Sat, 10 Jun 2023 22:00:10 +0300 Subject: [PATCH 05/14] Rephrase javadoc comments --- src/test/java/org/jabref/logic/cleanup/URLCleanupTest.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/test/java/org/jabref/logic/cleanup/URLCleanupTest.java b/src/test/java/org/jabref/logic/cleanup/URLCleanupTest.java index 8c5b8cad48f..640f6cc7712 100644 --- a/src/test/java/org/jabref/logic/cleanup/URLCleanupTest.java +++ b/src/test/java/org/jabref/logic/cleanup/URLCleanupTest.java @@ -123,7 +123,7 @@ private static Stream provideURL() { new BibEntry().withField(StandardField.NOTE, "https://www.example.com/foo/?bar=baz&inga=42&quux")), - // Expected entry returns formatted the url-date in Urldate field. + // Expected entry returns formatted the url-date in the Urldate field. Arguments.of( new BibEntry().withField(StandardField.URL, "http://142.42.1.1:8080") @@ -132,14 +132,14 @@ private static Stream provideURL() { new BibEntry().withField(StandardField.NOTE, "\\url{http://142.42.1.1:8080}, Accessed on January 15, 2021")), - // Input entry doesn't hold any URL both in Note field. + // Input entry doesn't hold any URL in the Note field. Arguments.of( new BibEntry().withField(StandardField.NOTE, "Accessed on 2015-01-15"), new BibEntry().withField(StandardField.NOTE, "Accessed on 2015-01-15")), - // Input entry has multiple url-dates stored in Note field. + // Input entry has multiple url-dates stored in the Note field. 
Arguments.of( new BibEntry().withField(StandardField.URL, "http://142.42.1.1:8080") From 384bb42b3b7d447d8008a2eb7aaa03e49f9be911 Mon Sep 17 00:00:00 2001 From: Dimitrios Kokkotas Date: Sat, 10 Jun 2023 22:03:37 +0300 Subject: [PATCH 06/14] Resolve code style check issues --- src/main/java/org/jabref/logic/cleanup/URLCleanup.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/jabref/logic/cleanup/URLCleanup.java b/src/main/java/org/jabref/logic/cleanup/URLCleanup.java index 799585de531..06c64e0fdca 100644 --- a/src/main/java/org/jabref/logic/cleanup/URLCleanup.java +++ b/src/main/java/org/jabref/logic/cleanup/URLCleanup.java @@ -53,9 +53,9 @@ public List cleanup(BibEntry entry) { * YYYY.MM.DD, YYYY.M.DD, YYYY.MM.D, YYYY.M.D * Month DD, YYYY & Month D, YYYY */ - String dateRegex = ("\\d{4}-\\d{1,2}-\\d{1,2}|\\d{4}\\.\\d{1,2}\\.\\d{1,2}|" + + String dateRegex = "\\d{4}-\\d{1,2}-\\d{1,2}|\\d{4}\\.\\d{1,2}\\.\\d{1,2}|" + "(January|February|March|April|May|June|July|August|September|" + - "October|November|December) \\d{1,2}, \\d{4}"); + "October|November|December) \\d{1,2}, \\d{4}"; final Pattern urlPattern = Pattern.compile(urlRegex, Pattern.CASE_INSENSITIVE); final Pattern termsPattern = Pattern.compile(dateTermsRegex, Pattern.CASE_INSENSITIVE); From 8002e6f237de5c35781ae733dafa1fe121c91ecb Mon Sep 17 00:00:00 2001 From: Dimitrios Kokkotas Date: Sat, 10 Jun 2023 22:17:44 +0300 Subject: [PATCH 07/14] Improve javadoc comments' syntax --- src/main/java/org/jabref/logic/cleanup/URLCleanup.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/jabref/logic/cleanup/URLCleanup.java b/src/main/java/org/jabref/logic/cleanup/URLCleanup.java index 06c64e0fdca..02a3b6e8e8b 100644 --- a/src/main/java/org/jabref/logic/cleanup/URLCleanup.java +++ b/src/main/java/org/jabref/logic/cleanup/URLCleanup.java @@ -107,9 +107,9 @@ public List cleanup(BibEntry entry) { String formattedDate = 
formatter.format(date); newNoteFieldValue = newNoteFieldValue .replace(date, "").trim() - .replaceAll("^,|,$", "").trim(); // either starts or ends with comma + .replaceAll("^,|,$", "").trim(); // either starts or ends with a comma - // same behaviour with URL cleanup + // Same approach with the URL cleanup. if (entry.hasField(URLDATE_FIELD)) { String urlDateFieldValue = entry.getField(URLDATE_FIELD).orElse(null); if (urlDateFieldValue.equals(formattedDate)) { From 9a9cc8df8b03d984cd6341aa069ca520410d00a7 Mon Sep 17 00:00:00 2001 From: Dimitrios Kokkotas Date: Sun, 11 Jun 2023 03:42:31 +0300 Subject: [PATCH 08/14] Rephrase operation's description in CHANGELOG --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f10dd6f3c03..9c7077e1d3a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,7 +31,7 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve - We added drag and drop events for field 'Groups' in entry editor panel. [#569](https://github.com/koppor/jabref/issues/569) - We added support for parsing MathML in the Medline importer. [#4273](https://github.com/JabRef/jabref/issues/4273) - We added the ability to search for a DOI directly from 'Web Search'. [#9674](https://github.com/JabRef/jabref/issues/9674) -- We added a cleanup activity that identifies a URL and/or a URL-DATE in the `note` field and moves it/them to the `url` and `urldate` field correspondingly. [koppor#216](https://github.com/koppor/jabref/issues/216) +- We added a cleanup activity that identifies a URL or a last-visited-date in the `note` field and moves it to the `url` and `urldate` fields respectively. [koppor#216](https://github.com/koppor/jabref/issues/216) - We enabled the user to change the name of a field in a custom entry type by double-clicking on it. 
[#9840](https://github.com/JabRef/jabref/issues/9840) From a73fe5a923de13e2248416df67463e3b7782b5c2 Mon Sep 17 00:00:00 2001 From: Dimitrios Kokkotas Date: Sun, 11 Jun 2023 04:43:34 +0300 Subject: [PATCH 09/14] Enrich javadoc comment and rename attributes properly --- .../org/jabref/logic/cleanup/URLCleanup.java | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/jabref/logic/cleanup/URLCleanup.java b/src/main/java/org/jabref/logic/cleanup/URLCleanup.java index 02a3b6e8e8b..152f5ec0b87 100644 --- a/src/main/java/org/jabref/logic/cleanup/URLCleanup.java +++ b/src/main/java/org/jabref/logic/cleanup/URLCleanup.java @@ -39,9 +39,21 @@ public List cleanup(BibEntry entry) { + "<>\\\\]+\\)))*\\))+(?:\\(([^\\s()<>\\\\]+|(\\([^\\s()<>\\\\]+\\" + ")))*\\)|[^\\s`!()\\[\\]{};:'\".,<>?«»“”‘’]))"; - String dateTermsRegex = "Accessed on|Visited on|Retrieved on|Viewed on"; + String dateTermsRegex = "accessed on|visited on|retrieved on|viewed on"; /* + * Several date patterns are available in: + * jabref/src/main/java/org/jabref/model/entry/Date.java class + * + * However, these cannot be used for the needs of the operation + * as Date.parse method requires the whole String (i.e. newNoteFieldValue) + * to be a date, in order to be matched. Besides that, it is not possible to + * extract a certain regex introduced in the parse method and call it in + * the current class. Reasoning: + * Defining a public static final variable (containing desired regex) + * within the parse static method, it is not allowed, as public static constants + * must be declared at class-level and not inside the method. + * * dateRegex matches several date formats. Explanation: *
      *
    • "\d{4}": Matches exactly four digits (YYYY)
    • @@ -58,11 +70,11 @@ public List cleanup(BibEntry entry) { "October|November|December) \\d{1,2}, \\d{4}"; final Pattern urlPattern = Pattern.compile(urlRegex, Pattern.CASE_INSENSITIVE); - final Pattern termsPattern = Pattern.compile(dateTermsRegex, Pattern.CASE_INSENSITIVE); + final Pattern dateTermsPattern = Pattern.compile(dateTermsRegex, Pattern.CASE_INSENSITIVE); final Pattern datePattern = Pattern.compile(dateRegex, Pattern.CASE_INSENSITIVE); final Matcher urlMatcher = urlPattern.matcher(noteFieldValue); - final Matcher termsMatcher = termsPattern.matcher(noteFieldValue); + final Matcher dateTermsMatcher = dateTermsPattern.matcher(noteFieldValue); final Matcher dateMatcher = datePattern.matcher(noteFieldValue); if (urlMatcher.find()) { @@ -98,8 +110,8 @@ public List cleanup(BibEntry entry) { entry.setField(URL_FIELD, url).ifPresent(changes::add); } - if (termsMatcher.find()) { - String term = termsMatcher.group(); + if (dateTermsMatcher.find()) { + String term = dateTermsMatcher.group(); newNoteFieldValue = newNoteFieldValue .replace(term, ""); if (dateMatcher.find()) { From 2661dde53a46580f12ce945e0ef1945f7e83fb7e Mon Sep 17 00:00:00 2001 From: Dimitrios Kokkotas Date: Sun, 11 Jun 2023 04:52:10 +0300 Subject: [PATCH 10/14] Rephrase javadoc comment --- .../java/org/jabref/logic/cleanup/URLCleanup.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/jabref/logic/cleanup/URLCleanup.java b/src/main/java/org/jabref/logic/cleanup/URLCleanup.java index 152f5ec0b87..0bfde498b47 100644 --- a/src/main/java/org/jabref/logic/cleanup/URLCleanup.java +++ b/src/main/java/org/jabref/logic/cleanup/URLCleanup.java @@ -42,17 +42,17 @@ public List cleanup(BibEntry entry) { String dateTermsRegex = "accessed on|visited on|retrieved on|viewed on"; /* - * Several date patterns are available in: + * Several date patterns are available at: * jabref/src/main/java/org/jabref/model/entry/Date.java class * * However, these cannot be 
used for the needs of the operation - * as Date.parse method requires the whole String (i.e. newNoteFieldValue) - * to be a date, in order to be matched. Besides that, it is not possible to + * as Date.parse static method requires the newNoteFieldValue String to + * hold only a date to match correctly. Besides that, it is not possible to * extract a certain regex introduced in the parse method and call it in * the current class. Reasoning: - * Defining a public static final variable (containing desired regex) - * within the parse static method, it is not allowed, as public static constants - * must be declared at class-level and not inside the method. + * Defining a public static final attribute (containing desired regex) + * within the parse static method, it is not allowed, as public static + * constants must be declared at class-level and not inside the method. * * dateRegex matches several date formats. Explanation: *
        From 4b9045e617fdadf4048ed8c8d80c0121f5066f85 Mon Sep 17 00:00:00 2001 From: Dimitrios Kokkotas Date: Sun, 11 Jun 2023 14:21:13 +0300 Subject: [PATCH 11/14] Replace dateRegex under Date class --- .../org/jabref/logic/cleanup/URLCleanup.java | 31 ++----------------- .../java/org/jabref/model/entry/Date.java | 6 ++++ 2 files changed, 8 insertions(+), 29 deletions(-) diff --git a/src/main/java/org/jabref/logic/cleanup/URLCleanup.java b/src/main/java/org/jabref/logic/cleanup/URLCleanup.java index 0bfde498b47..8c7b302359f 100644 --- a/src/main/java/org/jabref/logic/cleanup/URLCleanup.java +++ b/src/main/java/org/jabref/logic/cleanup/URLCleanup.java @@ -8,6 +8,7 @@ import org.jabref.logic.formatter.bibtexfields.NormalizeDateFormatter; import org.jabref.model.FieldChange; import org.jabref.model.entry.BibEntry; +import org.jabref.model.entry.Date; import org.jabref.model.entry.field.Field; import org.jabref.model.entry.field.StandardField; @@ -41,37 +42,9 @@ public List cleanup(BibEntry entry) { String dateTermsRegex = "accessed on|visited on|retrieved on|viewed on"; - /* - * Several date patterns are available at: - * jabref/src/main/java/org/jabref/model/entry/Date.java class - * - * However, these cannot be used for the needs of the operation - * as Date.parse static method requires the newNoteFieldValue String to - * hold only a date to match correctly. Besides that, it is not possible to - * extract a certain regex introduced in the parse method and call it in - * the current class. Reasoning: - * Defining a public static final attribute (containing desired regex) - * within the parse static method, it is not allowed, as public static - * constants must be declared at class-level and not inside the method. - * - * dateRegex matches several date formats. Explanation: - *
          - *
        • "\d{4}": Matches exactly four digits (YYYY)
        • - *
        • "\d{1,2}": Matches one or two digits (M or MM)
        • - *
        • "\d{1,2}": Matches one or two digits (D or DD)
        • - *
        - * Indicative formats identified: - * YYYY-MM-DD, YYYY-M-DD, YYYY-MM-D, YYYY-M-D - * YYYY.MM.DD, YYYY.M.DD, YYYY.MM.D, YYYY.M.D - * Month DD, YYYY & Month D, YYYY - */ - String dateRegex = "\\d{4}-\\d{1,2}-\\d{1,2}|\\d{4}\\.\\d{1,2}\\.\\d{1,2}|" + - "(January|February|March|April|May|June|July|August|September|" + - "October|November|December) \\d{1,2}, \\d{4}"; - final Pattern urlPattern = Pattern.compile(urlRegex, Pattern.CASE_INSENSITIVE); final Pattern dateTermsPattern = Pattern.compile(dateTermsRegex, Pattern.CASE_INSENSITIVE); - final Pattern datePattern = Pattern.compile(dateRegex, Pattern.CASE_INSENSITIVE); + final Pattern datePattern = Pattern.compile(Date.DATE_REGEX, Pattern.CASE_INSENSITIVE); final Matcher urlMatcher = urlPattern.matcher(noteFieldValue); final Matcher dateTermsMatcher = dateTermsPattern.matcher(noteFieldValue); diff --git a/src/main/java/org/jabref/model/entry/Date.java b/src/main/java/org/jabref/model/entry/Date.java index 46ea51971aa..cf18bb1b993 100644 --- a/src/main/java/org/jabref/model/entry/Date.java +++ b/src/main/java/org/jabref/model/entry/Date.java @@ -19,6 +19,7 @@ public class Date { + public static final String DATE_REGEX; private static final DateTimeFormatter NORMALIZED_DATE_FORMATTER = DateTimeFormatter.ofPattern("uuuu[-MM][-dd]"); private static final DateTimeFormatter SIMPLE_DATE_FORMATS; private static final Logger LOGGER = LoggerFactory.getLogger(Date.class); @@ -65,6 +66,11 @@ public class Date { DateTimeFormatterBuilder::appendOptional, (builder, formatterBuilder) -> builder.append(formatterBuilder.toFormatter())) .toFormatter(Locale.US); + + DATE_REGEX = "\\d{4}-\\d{1,2}-\\d{1,2}" + // covers YYYY-MM-DD, YYYY-M-DD, YYYY-MM-D, YYYY-M-D + "|\\d{4}\\.\\d{1,2}\\.\\d{1,2}|" + // covers YYYY.MM.DD, YYYY.M.DD, YYYY.MM.D, YYYY.M.D + "(January|February|March|April|May|June|July|August|September|" + + "October|November|December) \\d{1,2}, \\d{4}"; // covers Month DD, YYYY & Month D, YYYY } private final 
TemporalAccessor date; From 09abb85874c369c51d9f3de71c494b7169d7c716 Mon Sep 17 00:00:00 2001 From: Dimitrios Kokkotas Date: Sun, 11 Jun 2023 15:54:16 +0300 Subject: [PATCH 12/14] Refactor URLCleanup class --- .../org/jabref/logic/cleanup/URLCleanup.java | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/main/java/org/jabref/logic/cleanup/URLCleanup.java b/src/main/java/org/jabref/logic/cleanup/URLCleanup.java index 8c7b302359f..924a3e5af70 100644 --- a/src/main/java/org/jabref/logic/cleanup/URLCleanup.java +++ b/src/main/java/org/jabref/logic/cleanup/URLCleanup.java @@ -17,10 +17,27 @@ */ public class URLCleanup implements CleanupJob { + /* + * The urlRegex was originally fetched from a suggested solution in + * https://stackoverflow.com/questions/28185064/python-infinite-loop-in-regex-to-match-url. + * In order to be functional, we made the necessary adjustments regarding Java + * features (mainly doubled backslashes). + */ + public static final String URL_REGEX = "(?i)\\b((?:https?://|www\\d{0,3}[.]|[a-z0-9.\\-]+[.]" + + "[a-z]{2,4}/)(?:[^\\s()<>\\\\]+|\\(([^\\s()<>\\\\]+|(\\([^\\s()" + + "<>\\\\]+\\)))*\\))+(?:\\(([^\\s()<>\\\\]+|(\\([^\\s()<>\\\\]+\\" + + ")))*\\)|[^\\s`!()\\[\\]{};:'\".,<>?«»“”‘’]))"; + + public static final String DATE_TERMS_REGEX = "accessed on|visited on|retrieved on|viewed on"; + private static final Field NOTE_FIELD = StandardField.NOTE; private static final Field URL_FIELD = StandardField.URL; private static final Field URLDATE_FIELD = StandardField.URLDATE; + final Pattern urlPattern = Pattern.compile(URL_REGEX, Pattern.CASE_INSENSITIVE); + final Pattern dateTermsPattern = Pattern.compile(DATE_TERMS_REGEX, Pattern.CASE_INSENSITIVE); + final Pattern datePattern = Pattern.compile(Date.DATE_REGEX, Pattern.CASE_INSENSITIVE); + private NormalizeDateFormatter formatter = new NormalizeDateFormatter(); @Override @@ -29,23 +46,6 @@ public List cleanup(BibEntry entry) { String noteFieldValue = 
entry.getField(NOTE_FIELD).orElse(null); - /* - * The urlRegex was originally fetched from a suggested solution in - * https://stackoverflow.com/questions/28185064/python-infinite-loop-in-regex-to-match-url. - * In order to be functional, we made the necessary adjustments regarding Java - * features (mainly doubled backslashes). - */ - String urlRegex = "(?i)\\b((?:https?://|www\\d{0,3}[.]|[a-z0-9.\\-]+[.]" - + "[a-z]{2,4}/)(?:[^\\s()<>\\\\]+|\\(([^\\s()<>\\\\]+|(\\([^\\s()" - + "<>\\\\]+\\)))*\\))+(?:\\(([^\\s()<>\\\\]+|(\\([^\\s()<>\\\\]+\\" - + ")))*\\)|[^\\s`!()\\[\\]{};:'\".,<>?«»“”‘’]))"; - - String dateTermsRegex = "accessed on|visited on|retrieved on|viewed on"; - - final Pattern urlPattern = Pattern.compile(urlRegex, Pattern.CASE_INSENSITIVE); - final Pattern dateTermsPattern = Pattern.compile(dateTermsRegex, Pattern.CASE_INSENSITIVE); - final Pattern datePattern = Pattern.compile(Date.DATE_REGEX, Pattern.CASE_INSENSITIVE); - final Matcher urlMatcher = urlPattern.matcher(noteFieldValue); final Matcher dateTermsMatcher = dateTermsPattern.matcher(noteFieldValue); final Matcher dateMatcher = datePattern.matcher(noteFieldValue); From 757a024195b17a70c2041276a3521d09d9b39c90 Mon Sep 17 00:00:00 2001 From: Dimitrios Kokkotas Date: Sun, 11 Jun 2023 16:27:06 +0300 Subject: [PATCH 13/14] Add reference to Date.parse method and enrich javadoc comment --- src/main/java/org/jabref/model/entry/Date.java | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/main/java/org/jabref/model/entry/Date.java b/src/main/java/org/jabref/model/entry/Date.java index cf18bb1b993..219a7b294e5 100644 --- a/src/main/java/org/jabref/model/entry/Date.java +++ b/src/main/java/org/jabref/model/entry/Date.java @@ -67,6 +67,13 @@ public class Date { (builder, formatterBuilder) -> builder.append(formatterBuilder.toFormatter())) .toFormatter(Locale.US); + /* + * There is also {@link org.jabref.model.entry.Date#parse(java.lang.String)}. 
+ * The regex of that method cannot be used as we parse single dates here and that method parses: + * i) date ranges + * ii) two dates separated by '/' + * Additionally, parse method requires the reviewed String to hold only a date. + */ DATE_REGEX = "\\d{4}-\\d{1,2}-\\d{1,2}" + // covers YYYY-MM-DD, YYYY-M-DD, YYYY-MM-D, YYYY-M-D "|\\d{4}\\.\\d{1,2}\\.\\d{1,2}|" + // covers YYYY.MM.DD, YYYY.M.DD, YYYY.MM.D, YYYY.M.D "(January|February|March|April|May|June|July|August|September|" + From e5fb3d885fb2b6ccc5ce0b26a4ed3463c038149b Mon Sep 17 00:00:00 2001 From: Dimitrios Kokkotas Date: Sun, 11 Jun 2023 18:27:36 +0300 Subject: [PATCH 14/14] Rewrite user test cases properly --- .../java/org/jabref/logic/cleanup/URLCleanupTest.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/test/java/org/jabref/logic/cleanup/URLCleanupTest.java b/src/test/java/org/jabref/logic/cleanup/URLCleanupTest.java index 640f6cc7712..1c967a1a781 100644 --- a/src/test/java/org/jabref/logic/cleanup/URLCleanupTest.java +++ b/src/test/java/org/jabref/logic/cleanup/URLCleanupTest.java @@ -130,7 +130,7 @@ private static Stream provideURL() { .withField(StandardField.URLDATE, "2021-01-15"), new BibEntry().withField(StandardField.NOTE, - "\\url{http://142.42.1.1:8080}, Accessed on January 15, 2021")), + "\\url{http://142.42.1.1:8080}, accessed on January 15, 2021")), // Input entry doesn't hold any URL in the Note field. Arguments.of( @@ -146,9 +146,9 @@ private static Stream provideURL() { .withField(StandardField.URLDATE, "2021-01-15") .withField(StandardField.NOTE, - "Visited on February 12, 2017"), + "visited on February 12, 2017"), new BibEntry().withField(StandardField.NOTE, - "\\url{http://142.42.1.1:8080}, Accessed on January 15, 2021, Visited on February 12, 2017")), + "\\url{http://142.42.1.1:8080}, accessed on January 15, 2021, visited on February 12, 2017")), // Input entry holds the same url-date in both Note and Urldate field. 
Arguments.of( @@ -157,7 +157,7 @@ private static Stream provideURL() { .withField(StandardField.URLDATE, "2015-01-15"), new BibEntry().withField(StandardField.NOTE, - "\\url{http://142.42.1.1:8080}, Visited on 2015.01.15") + "\\url{http://142.42.1.1:8080}, visited on 2015.01.15") .withField(StandardField.URLDATE, "2015-01-15")), @@ -170,7 +170,7 @@ private static Stream provideURL() { .withField(StandardField.NOTE, "cited by Kramer"), new BibEntry().withField(StandardField.NOTE, - "\\url{https://example.org}, cited by Kramer, Accessed on 2023-04-11")) + "\\url{https://example.org}, cited by Kramer, accessed on 2023-04-11")) ); } }