Merge pull request #99 from JabRef/doi-parser

Rewrite DOI parsing
JabRef · Aug 14, 2015 · a394b59 · a394b59
2 parents 4ea1887 + 5f8321d
commit a394b59
Show file tree

Hide file tree

Showing 12 changed files with 281 additions and 268 deletions.
diff --git a/src/main/java/net/sf/jabref/export/layout/format/DOICheck.java b/src/main/java/net/sf/jabref/export/layout/format/DOICheck.java
@@ -15,34 +15,21 @@
 */
 package net.sf.jabref.export.layout.format;
 
-import net.sf.jabref.util.Doi;
+import net.sf.jabref.util.DOI;
 import net.sf.jabref.export.layout.LayoutFormatter;
 
 /**
- * Used to fix [ 1588028 ] export HTML table doi url.
+ * Used to fix [ 1588028 ] export HTML table DOI URL.
  * 
- * Will prepend "http://dx.doi.org/" if only doi number and not a URL is given.
- *
- * @author mark-schenk
- * @author olly98
+ * Will prepend "http://doi.org/" if only a DOI and not a URL is given.
  */
 public class DOICheck implements LayoutFormatter {
-
     @Override
     public String format(String fieldText) {
-
         if (fieldText == null) {
             return null;
         }
 
-        if (fieldText.trim().isEmpty()) {
-            return "";
-        }
-
-        if (Doi.containsHttpDoi(fieldText)) {
-            return fieldText;
-        } else {
-            return new Doi(fieldText).getUri();
-        }
+        return DOI.build(fieldText).map(doi -> doi.getURL()).orElse(fieldText);
     }
 }
diff --git a/src/main/java/net/sf/jabref/export/layout/format/DOIStrip.java b/src/main/java/net/sf/jabref/export/layout/format/DOIStrip.java
@@ -15,24 +15,19 @@
 */
 package net.sf.jabref.export.layout.format;
 
-import net.sf.jabref.util.Doi;
+import net.sf.jabref.util.DOI;
 import net.sf.jabref.export.layout.LayoutFormatter;
 
 /**
  * Will strip any prefixes from the Doi field, in order to output only the Doi number
- * 
- * @author mark-schenk 
- * @author olly98
- *
  */
 public class DOIStrip implements LayoutFormatter {
-
     @Override
     public String format(String fieldText) {
         if (fieldText == null) {
             return null;
-        } else {
-            return new Doi(fieldText).getDoi();
         }
+
+        return DOI.build(fieldText).map(doi -> doi.getDOI()).orElse(fieldText);
     }
 }
diff --git a/src/main/java/net/sf/jabref/external/FindFullText.java b/src/main/java/net/sf/jabref/external/FindFullText.java
@@ -28,7 +28,7 @@
 import java.util.List;
 
 import net.sf.jabref.BibtexEntry;
-import net.sf.jabref.util.Doi;
+import net.sf.jabref.util.DOI;
 import net.sf.jabref.logic.net.URLDownload;
 
 /**
@@ -58,7 +58,7 @@ public FindResult findFullText(BibtexEntry entry) {
         String doiText = entry.getField("doi");
         // First try the Doi link, if defined:
         if (doiText != null && !doiText.trim().isEmpty()) {
-            FindResult resDoi = lookForFullTextAtURL(new Doi(doiText).getUri());
+            FindResult resDoi = lookForFullTextAtURL(new DOI(doiText).getURL());
             if (resDoi.status == FindFullText.FOUND_PDF) {
                 return resDoi;
             } else if (urlText != null && !urlText.trim().isEmpty()) {

diff --git a/src/main/java/net/sf/jabref/gui/CleanUpAction.java b/src/main/java/net/sf/jabref/gui/CleanUpAction.java
@@ -19,6 +19,7 @@
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Map;
+import java.util.Optional;
 
 import javax.swing.JCheckBox;
 import javax.swing.JLabel;
@@ -42,7 +43,7 @@
 import com.jgoodies.forms.layout.CellConstraints;
 import com.jgoodies.forms.layout.FormLayout;
 import net.sf.jabref.logic.l10n.Localization;
-import net.sf.jabref.util.Doi;
+import net.sf.jabref.util.DOI;
 import net.sf.jabref.util.FileUtil;
 import net.sf.jabref.logic.util.MonthUtil;
 import net.sf.jabref.util.Util;
@@ -116,6 +117,12 @@ public CleanUpAction(BasePanel panel) {
         initOptionsPanel();
     }
 
+    private void removeFieldValue(BibtexEntry bes, String fieldName, NamedCompound ce) {
+        String origValue = bes.getField(fieldName);
+        ce.addEdit(new UndoableFieldChange(bes, fieldName, origValue, ""));
+        bes.setField(fieldName, "");
+    }
+
     private void initOptionsPanel() {
         cleanUpSuperscrips = new JCheckBox(Localization.lang("Convert 1st, 2nd, ... to real superscripts"));
         cleanUpDOI = new JCheckBox(Localization.lang("Move DOIs from note and URL field to DOI field and remove http prefix"));
@@ -382,40 +389,41 @@ private void doCleanUpSuperscripts(BibtexEntry entry, NamedCompound ce) {
      * @param ce 
      */
     private void doCleanUpDOI(BibtexEntry bes, NamedCompound ce) {
-
         // fields to check
         String[] fields = {"note", "url", "ee"};
 
         // First check if the Doi Field is empty
         if (bes.getField("doi") != null) {
             String doiFieldValue = bes.getField("doi");
-            if (Doi.containsHttpDoi(doiFieldValue)) {
-                String newValue = new Doi(doiFieldValue).getDoi();
-                ce.addEdit(new UndoableFieldChange(bes, "doi", doiFieldValue, newValue));
-                bes.setField("doi", newValue);
-            }
-            if (Doi.containsDoi(doiFieldValue)) {
+
+            Optional<DOI> doi = DOI.build(doiFieldValue);
+
+            if(doi.isPresent()) {
+                String newValue = doi.get().getDOI();
+                if (!doiFieldValue.equals(newValue)) {
+                    ce.addEdit(new UndoableFieldChange(bes, "doi", doiFieldValue, newValue));
+                    bes.setField("doi", newValue);
+                }
+
                 // Doi field seems to contain Doi
-                // cleanup note, url, ee field
-                // we do NOT copy values to the Doi field as the Doi field contains a Doi!
+                // -> cleanup note, url, ee field
                 for (String field : fields) {
-                    if (Doi.containsDoi(bes.getField(field))) {
-                        Doi.removeDOIfromBibtexEntryField(bes, field, ce);
-                    }
+                    DOI.build(bes.getField((field))).ifPresent( unused -> removeFieldValue(bes, field, ce));
                 }
             }
         } else {
             // As the Doi field is empty we now check if note, url, or ee field contains a Doi
-
             for (String field : fields) {
-                if (Doi.containsDoi(bes.getField(field))) {
+                Optional<DOI> doi = DOI.build(bes.getField(field));
+
+                if (doi.isPresent()) {
                     // update Doi
                     String oldValue = bes.getField("doi");
-                    String newValue = new Doi(bes.getField(field)).getDoi();
+                    String newValue = doi.get().getDOI();
                     ce.addEdit(new UndoableFieldChange(bes, "doi", oldValue, newValue));
                     bes.setField("doi", newValue);
 
-                    Doi.removeDOIfromBibtexEntryField(bes, field, ce);
+                    removeFieldValue(bes, field, ce);
                 }
             }
         }

diff --git a/src/main/java/net/sf/jabref/imports/PdfContentImporter.java b/src/main/java/net/sf/jabref/imports/PdfContentImporter.java
@@ -30,7 +30,7 @@ of the License, or (at your option) any later version.
 
 import net.sf.jabref.*;
 import net.sf.jabref.logic.l10n.Localization;
-import net.sf.jabref.util.Doi;
+import net.sf.jabref.util.DOI;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.pdfbox.pdmodel.PDDocument;
@@ -237,7 +237,7 @@ public List<BibtexEntry> importEntries(InputStream in, OutputPrinter status) thr
             stripper.writeText(document, writer);
             String textResult = writer.toString();
 
-            String doi = new Doi(textResult).getDoi();
+            String doi = new DOI(textResult).getDOI();
             if (doi.length() < textResult.length()) {
                 // A Doi was found in the text
                 // We do NO parsing of the text, but use the Doi fetcher

diff --git a/src/main/java/net/sf/jabref/util/DOI.java b/src/main/java/net/sf/jabref/util/DOI.java
@@ -0,0 +1,114 @@
+package net.sf.jabref.util;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class DOI {
+    private static final Log LOGGER = LogFactory.getLog(DOI.class);
+
+    // DOI resolver
+    public static final URI RESOLVER = URI.create("http://doi.org");
+
+    // Regex
+    // (see http://www.doi.org/doi_handbook/2_Numbering.html)
+    private static final String DOI_EXP = ""
+            + "(?:urn:)?"                       // optional urn
+            + "(?:doi:)?"                       // optional doi
+            + "("                               // begin group \1
+            + "10"                              // directory indicator
+            + "(?:\\.[0-9]+)+"                  // registrant codes
+            + "[/:]"                            // divider
+            + "(?:.+)"                          // suffix alphanumeric string
+            + ")";                              // end group \1
+
+    private static final String HTTP_EXP = "https?://[^\\s]+?" + DOI_EXP;
+    // Pattern
+    private static final Pattern DOI_PATT = Pattern.compile("^(?:https?://[^\\s]+?)?" + DOI_EXP + "$", Pattern.CASE_INSENSITIVE);
+
+    /**
+     * Creates an {@code Optional<DOI>} from various schemes including URL, URN, and plain DOIs.
+     *
+     * Useful for suppressing the {@code NullPointerException} or {@code IllegalArgumentException}
+     * of the constructor and checking {@code Optional.isPresent()} instead.
+     *
+     * @param doi the DOI string
+     * @return an Optional containing the DOI or an empty Optional
+     */
+    public static Optional<DOI> build(String doi) {
+        try {
+            return Optional.of(new DOI(doi));
+        } catch(NullPointerException | IllegalArgumentException e) {
+            return Optional.empty();
+        }
+    }
+
+    // DOI
+    private final String doi;
+
+    /**
+     * Creates a DOI from various schemes including URL, URN, and plain DOIs.
+     *
+     * @param doi the DOI string
+     * @throws NullPointerException if doi is null
+     * @throws IllegalArgumentException if doi does not include a valid DOI
+     * @return an instance of the DOI class
+     */
+    public DOI(String doi) {
+        Objects.requireNonNull(doi);
+
+        // Remove whitespace
+        doi = doi.trim();
+
+        // HTTP URL decoding
+        if(doi.matches(HTTP_EXP)) {
+            try {
+                // decodes path segment
+                URI url = new URI(doi);
+                doi = url.getScheme() + "://" + url.getHost() + url.getPath();
+            } catch(URISyntaxException e) {
+                throw new IllegalArgumentException(doi + " is not a valid HTTP DOI.");
+            }
+        }
+
+        // Extract DOI
+        Matcher matcher = DOI_PATT.matcher(doi);
+        if (matcher.find()) {
+            // match only group \1
+            this.doi = matcher.group(1);
+        } else {
+            throw new IllegalArgumentException(doi + " is not a valid DOI.");
+        }
+    }
+
+    /**
+     * Return the plain DOI
+     *
+     * @return the plain DOI value.
+     */
+    public String getDOI() {
+        return doi;
+    }
+
+    /**
+     * Return a URL representation of the DOI
+     *
+     * @return an encoded URL representation of the DOI
+     */
+    public String getURL() {
+        try {
+            URI uri = new URI(RESOLVER.getScheme(), RESOLVER.getHost(), "/" + doi, null);
+            return uri.toASCIIString();
+        } catch(URISyntaxException e) {
+            // should never happen
+            LOGGER.error(doi + " could not be encoded as URL.");
+            return "";
+        }
+    }
+}