Skip to content

Commit

Permalink
Merge pull request #99 from JabRef/doi-parser
Browse files Browse the repository at this point in the history
Rewrite DOI parsing
  • Loading branch information
stefan-kolb committed Aug 14, 2015
2 parents 4ea1887 + 5f8321d commit a394b59
Show file tree
Hide file tree
Showing 12 changed files with 281 additions and 268 deletions.
21 changes: 4 additions & 17 deletions src/main/java/net/sf/jabref/export/layout/format/DOICheck.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,34 +15,21 @@
*/
package net.sf.jabref.export.layout.format;

import net.sf.jabref.util.Doi;
import net.sf.jabref.util.DOI;
import net.sf.jabref.export.layout.LayoutFormatter;

/**
* Used to fix [ 1588028 ] export HTML table doi url.
* Used to fix [ 1588028 ] export HTML table DOI URL.
*
* Will prepend "http://dx.doi.org/" if only doi number and not a URL is given.
*
* @author mark-schenk
* @author olly98
* Will prepend "http://doi.org/" if only a DOI and not a URL is given.
*/
public class DOICheck implements LayoutFormatter {

@Override
public String format(String fieldText) {

if (fieldText == null) {
return null;
}

if (fieldText.trim().isEmpty()) {
return "";
}

if (Doi.containsHttpDoi(fieldText)) {
return fieldText;
} else {
return new Doi(fieldText).getUri();
}
return DOI.build(fieldText).map(doi -> doi.getURL()).orElse(fieldText);
}
}
11 changes: 3 additions & 8 deletions src/main/java/net/sf/jabref/export/layout/format/DOIStrip.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,24 +15,19 @@
*/
package net.sf.jabref.export.layout.format;

import net.sf.jabref.util.Doi;
import net.sf.jabref.util.DOI;
import net.sf.jabref.export.layout.LayoutFormatter;

/**
* Will strip any prefixes from the DOI field, in order to output only the DOI number
*
* @author mark-schenk
* @author olly98
*
*/
public class DOIStrip implements LayoutFormatter {

@Override
public String format(String fieldText) {
if (fieldText == null) {
return null;
} else {
return new Doi(fieldText).getDoi();
}

return DOI.build(fieldText).map(doi -> doi.getDOI()).orElse(fieldText);
}
}
4 changes: 2 additions & 2 deletions src/main/java/net/sf/jabref/external/FindFullText.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
import java.util.List;

import net.sf.jabref.BibtexEntry;
import net.sf.jabref.util.Doi;
import net.sf.jabref.util.DOI;
import net.sf.jabref.logic.net.URLDownload;

/**
Expand Down Expand Up @@ -58,7 +58,7 @@ public FindResult findFullText(BibtexEntry entry) {
String doiText = entry.getField("doi");
// First try the Doi link, if defined:
if (doiText != null && !doiText.trim().isEmpty()) {
FindResult resDoi = lookForFullTextAtURL(new Doi(doiText).getUri());
FindResult resDoi = lookForFullTextAtURL(new DOI(doiText).getURL());
if (resDoi.status == FindFullText.FOUND_PDF) {
return resDoi;
} else if (urlText != null && !urlText.trim().isEmpty()) {
Expand Down
42 changes: 25 additions & 17 deletions src/main/java/net/sf/jabref/gui/CleanUpAction.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;

import javax.swing.JCheckBox;
import javax.swing.JLabel;
Expand All @@ -42,7 +43,7 @@
import com.jgoodies.forms.layout.CellConstraints;
import com.jgoodies.forms.layout.FormLayout;
import net.sf.jabref.logic.l10n.Localization;
import net.sf.jabref.util.Doi;
import net.sf.jabref.util.DOI;
import net.sf.jabref.util.FileUtil;
import net.sf.jabref.logic.util.MonthUtil;
import net.sf.jabref.util.Util;
Expand Down Expand Up @@ -116,6 +117,12 @@ public CleanUpAction(BasePanel panel) {
initOptionsPanel();
}

/**
 * Blanks a field of the given entry and registers the change for undo.
 *
 * The current value of the field is read first so that the recorded
 * {@link UndoableFieldChange} carries it as the old value; the new value is
 * always the empty string.
 *
 * @param bes       the entry whose field is cleared
 * @param fieldName the name of the field to clear
 * @param ce        the compound edit the undoable change is added to
 */
private void removeFieldValue(BibtexEntry bes, String fieldName, NamedCompound ce) {
    final String previousValue = bes.getField(fieldName);
    final UndoableFieldChange clearing = new UndoableFieldChange(bes, fieldName, previousValue, "");

    // Record the edit before mutating the entry, as the original code did.
    ce.addEdit(clearing);
    bes.setField(fieldName, "");
}

private void initOptionsPanel() {
cleanUpSuperscrips = new JCheckBox(Localization.lang("Convert 1st, 2nd, ... to real superscripts"));
cleanUpDOI = new JCheckBox(Localization.lang("Move DOIs from note and URL field to DOI field and remove http prefix"));
Expand Down Expand Up @@ -382,40 +389,41 @@ private void doCleanUpSuperscripts(BibtexEntry entry, NamedCompound ce) {
* @param ce
*/
private void doCleanUpDOI(BibtexEntry bes, NamedCompound ce) {

// fields to check
String[] fields = {"note", "url", "ee"};

// First check if the Doi Field is empty
if (bes.getField("doi") != null) {
String doiFieldValue = bes.getField("doi");
if (Doi.containsHttpDoi(doiFieldValue)) {
String newValue = new Doi(doiFieldValue).getDoi();
ce.addEdit(new UndoableFieldChange(bes, "doi", doiFieldValue, newValue));
bes.setField("doi", newValue);
}
if (Doi.containsDoi(doiFieldValue)) {

Optional<DOI> doi = DOI.build(doiFieldValue);

if(doi.isPresent()) {
String newValue = doi.get().getDOI();
if (!doiFieldValue.equals(newValue)) {
ce.addEdit(new UndoableFieldChange(bes, "doi", doiFieldValue, newValue));
bes.setField("doi", newValue);
}

// Doi field seems to contain Doi
// cleanup note, url, ee field
// we do NOT copy values to the Doi field as the Doi field contains a Doi!
// -> cleanup note, url, ee field
for (String field : fields) {
if (Doi.containsDoi(bes.getField(field))) {
Doi.removeDOIfromBibtexEntryField(bes, field, ce);
}
DOI.build(bes.getField((field))).ifPresent( unused -> removeFieldValue(bes, field, ce));
}
}
} else {
// As the Doi field is empty we now check if note, url, or ee field contains a Doi

for (String field : fields) {
if (Doi.containsDoi(bes.getField(field))) {
Optional<DOI> doi = DOI.build(bes.getField(field));

if (doi.isPresent()) {
// update Doi
String oldValue = bes.getField("doi");
String newValue = new Doi(bes.getField(field)).getDoi();
String newValue = doi.get().getDOI();
ce.addEdit(new UndoableFieldChange(bes, "doi", oldValue, newValue));
bes.setField("doi", newValue);

Doi.removeDOIfromBibtexEntryField(bes, field, ce);
removeFieldValue(bes, field, ce);
}
}
}
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/net/sf/jabref/imports/PdfContentImporter.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ of the License, or (at your option) any later version.

import net.sf.jabref.*;
import net.sf.jabref.logic.l10n.Localization;
import net.sf.jabref.util.Doi;
import net.sf.jabref.util.DOI;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.pdmodel.PDDocument;
Expand Down Expand Up @@ -237,7 +237,7 @@ public List<BibtexEntry> importEntries(InputStream in, OutputPrinter status) thr
stripper.writeText(document, writer);
String textResult = writer.toString();

String doi = new Doi(textResult).getDoi();
String doi = new DOI(textResult).getDOI();
if (doi.length() < textResult.length()) {
// A Doi was found in the text
// We do NO parsing of the text, but use the Doi fetcher
Expand Down
114 changes: 114 additions & 0 deletions src/main/java/net/sf/jabref/util/DOI.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
package net.sf.jabref.util;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import java.net.URI;
import java.net.URISyntaxException;
import java.util.Objects;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Value holder for a Digital Object Identifier (DOI).
 *
 * An instance is created from a plain DOI, a {@code doi:} / {@code urn:doi:}
 * prefixed DOI, or an HTTP(S) URL ending in a DOI. Only the plain DOI
 * (for example {@code 10.1000/182}) is stored.
 */
public class DOI {
    private static final Log LOGGER = LogFactory.getLog(DOI.class);

    // DOI resolver
    public static final URI RESOLVER = URI.create("http://doi.org");

    // Regex
    // (see http://www.doi.org/doi_handbook/2_Numbering.html)
    private static final String DOI_EXP = ""
            + "(?:urn:)?"                       // optional urn
            + "(?:doi:)?"                       // optional doi
            + "("                               // begin group \1
            + "10"                              // directory indicator
            + "(?:\\.[0-9]+)+"                  // registrant codes
            + "[/:]"                            // divider
            + "(?:.+)"                          // suffix alphanumeric string
            + ")";                              // end group \1

    private static final String HTTP_EXP = "https?://[^\\s]+?" + DOI_EXP;

    // Patterns (compiled once; both case-insensitive so that URL detection and
    // DOI extraction agree on inputs such as "HTTP://doi.org/10.1000/182")
    private static final Pattern HTTP_PATT = Pattern.compile(HTTP_EXP, Pattern.CASE_INSENSITIVE);
    private static final Pattern DOI_PATT = Pattern.compile("^(?:https?://[^\\s]+?)?" + DOI_EXP + "$", Pattern.CASE_INSENSITIVE);

    /**
     * Creates an Optional&lt;DOI&gt; from various schemes including URL, URN, and plain DOIs.
     *
     * Useful for avoiding the {@code NullPointerException} and
     * {@code IllegalArgumentException} of the constructor and checking for
     * {@code Optional.isPresent()} instead.
     *
     * @param doi the DOI string, may be {@code null}
     * @return an Optional containing the DOI, or an empty Optional if {@code doi}
     *         is {@code null} or does not contain a valid DOI
     */
    public static Optional<DOI> build(String doi) {
        // Explicit null check instead of catching the constructor's NullPointerException.
        if (doi == null) {
            return Optional.empty();
        }
        try {
            return Optional.of(new DOI(doi));
        } catch (IllegalArgumentException ignored) {
            // Not a DOI: absence is reported through the empty Optional.
            return Optional.empty();
        }
    }

    // DOI
    private final String doi;

    /**
     * Creates a DOI from various schemes including URL, URN, and plain DOIs.
     *
     * Surrounding whitespace is trimmed. If the input is an HTTP(S) URL, it is
     * parsed as a {@link URI} and rebuilt from scheme, host and decoded path
     * before the DOI is extracted, so percent-encoded characters in the path
     * are decoded and any query or fragment is dropped.
     *
     * @param doi the DOI string
     * @throws NullPointerException if {@code doi} is null
     * @throws IllegalArgumentException if {@code doi} does not include a valid DOI
     */
    public DOI(String doi) {
        Objects.requireNonNull(doi);

        // Remove whitespace
        String candidate = doi.trim();

        // HTTP URL decoding
        if (HTTP_PATT.matcher(candidate).matches()) {
            try {
                // getPath() returns the decoded path segment
                URI url = new URI(candidate);
                candidate = url.getScheme() + "://" + url.getHost() + url.getPath();
            } catch (URISyntaxException e) {
                // keep the parser's diagnosis available to callers
                throw new IllegalArgumentException(candidate + " is not a valid HTTP DOI.", e);
            }
        }

        // Extract DOI
        Matcher matcher = DOI_PATT.matcher(candidate);
        if (matcher.find()) {
            // match only group \1
            this.doi = matcher.group(1);
        } else {
            throw new IllegalArgumentException(candidate + " is not a valid DOI.");
        }
    }

    /**
     * Return the plain DOI
     *
     * @return the plain DOI value.
     */
    public String getDOI() {
        return doi;
    }

    /**
     * Return a URL presentation for the DOI
     *
     * The URL is built from the scheme and host of {@link #RESOLVER} with the
     * DOI as path.
     *
     * @return an encoded URL representation of the DOI, or an empty string if
     *         the DOI could not be encoded
     */
    public String getURL() {
        try {
            URI uri = new URI(RESOLVER.getScheme(), RESOLVER.getHost(), "/" + doi, null);
            return uri.toASCIIString();
        } catch (URISyntaxException e) {
            // should never happen
            LOGGER.error(doi + " could not be encoded as URL.", e);
            return "";
        }
    }
}
Loading

0 comments on commit a394b59

Please sign in to comment.