Add EndNote XML importer (#3713)

* Add EndNote XML importer * add some optionals of Nullable to fix parsing errors * Implement feedback * Fix a few other NPE * Improve importer according to feedback from user
JabRef · Feb 16, 2018 · bb2b078 · bb2b078
1 parent b3c6f0a
commit bb2b078
Show file tree

Hide file tree

Showing 20 changed files with 935 additions and 23 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -19,6 +19,7 @@ We refer to [GitHub issues](https://github.com/JabRef/jabref/issues) by using `#
 - We added [oaDOI](https://oadoi.org/) as a fulltext provider, so that JabRef is now able to provide fulltexts for more than 90 million open-access articles.
 - We changed one default of the [Cleanup entries dialog](http://help.jabref.org/en/CleanupEntries): By default, PDFs are no longer moved to the default file directory. [#3619](https://github.com/JabRef/jabref/issues/3619)
 - We added a new type of group that shows all items referenced in a given LaTeX file (actually the generated AUX file). [#1664](https://github.com/JabRef/jabref/issues/1664)
+- We added an importer for the EndNote XML format. [Feature request in the forum](http://discourse.jabref.org/t/import-from-bookends-or-endnote/1048)
 - We added the export of the `translator` field to the corresponding MS-Office XML field. [#1750, comment](https://github.com/JabRef/jabref/issues/1750#issuecomment-357350986)
 - We changed the import of the MS-Office XML fields `bookauthor` and `translator`. Both are now imported to their corresponding bibtex/biblatex fields.
 - We improved the export of the `address` and `location` field to the MS-Office XML fields. If the address field does not contain a comma, it is treated as single value and exported to the field `city`. [#1750, comment](https://github.com/JabRef/jabref/issues/1750#issuecomment-357539167)

diff --git a/src/main/java/org/jabref/logic/importer/ImportFormatReader.java b/src/main/java/org/jabref/logic/importer/ImportFormatReader.java
@@ -14,6 +14,7 @@
 import org.jabref.logic.importer.fileformat.CopacImporter;
 import org.jabref.logic.importer.fileformat.CustomImporter;
 import org.jabref.logic.importer.fileformat.EndnoteImporter;
+import org.jabref.logic.importer.fileformat.EndnoteXmlImporter;
 import org.jabref.logic.importer.fileformat.FreeCiteImporter;
 import org.jabref.logic.importer.fileformat.InspecImporter;
 import org.jabref.logic.importer.fileformat.IsiImporter;
@@ -56,6 +57,7 @@ public void resetImportFormats(ImportFormatPreferences newImportFormatPreference
         formats.add(new BibTeXMLImporter());
         formats.add(new CopacImporter());
         formats.add(new EndnoteImporter(importFormatPreferences));
+        formats.add(new EndnoteXmlImporter(importFormatPreferences));
         formats.add(new FreeCiteImporter(importFormatPreferences));
         formats.add(new InspecImporter());
         formats.add(new IsiImporter());

diff --git a/src/main/java/org/jabref/logic/importer/fileformat/EndnoteXmlImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/EndnoteXmlImporter.java
@@ -0,0 +1,306 @@
+package org.jabref.logic.importer.fileformat;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.util.Collections;
+import java.util.List;
+import java.util.Locale;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.stream.Collectors;
+
+import javax.xml.bind.JAXBContext;
+import javax.xml.bind.JAXBException;
+import javax.xml.bind.Unmarshaller;
+import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLStreamException;
+import javax.xml.stream.XMLStreamReader;
+
+import org.jabref.logic.importer.ImportFormatPreferences;
+import org.jabref.logic.importer.Importer;
+import org.jabref.logic.importer.ParseException;
+import org.jabref.logic.importer.Parser;
+import org.jabref.logic.importer.ParserResult;
+import org.jabref.logic.importer.fileformat.endnote.Abstract;
+import org.jabref.logic.importer.fileformat.endnote.Authors;
+import org.jabref.logic.importer.fileformat.endnote.Contributors;
+import org.jabref.logic.importer.fileformat.endnote.Dates;
+import org.jabref.logic.importer.fileformat.endnote.ElectronicResourceNum;
+import org.jabref.logic.importer.fileformat.endnote.Isbn;
+import org.jabref.logic.importer.fileformat.endnote.Keywords;
+import org.jabref.logic.importer.fileformat.endnote.Notes;
+import org.jabref.logic.importer.fileformat.endnote.Number;
+import org.jabref.logic.importer.fileformat.endnote.Pages;
+import org.jabref.logic.importer.fileformat.endnote.PdfUrls;
+import org.jabref.logic.importer.fileformat.endnote.Record;
+import org.jabref.logic.importer.fileformat.endnote.RefType;
+import org.jabref.logic.importer.fileformat.endnote.RelatedUrls;
+import org.jabref.logic.importer.fileformat.endnote.SecondaryTitle;
+import org.jabref.logic.importer.fileformat.endnote.Style;
+import org.jabref.logic.importer.fileformat.endnote.Title;
+import org.jabref.logic.importer.fileformat.endnote.Titles;
+import org.jabref.logic.importer.fileformat.endnote.Url;
+import org.jabref.logic.importer.fileformat.endnote.Urls;
+import org.jabref.logic.importer.fileformat.endnote.Volume;
+import org.jabref.logic.importer.fileformat.endnote.Xml;
+import org.jabref.logic.importer.fileformat.endnote.Year;
+import org.jabref.logic.util.FileType;
+import org.jabref.model.entry.BibEntry;
+import org.jabref.model.entry.BiblatexEntryType;
+import org.jabref.model.entry.BiblatexEntryTypes;
+import org.jabref.model.entry.FieldName;
+import org.jabref.model.entry.LinkedFile;
+import org.jabref.model.strings.StringUtil;
+import org.jabref.model.util.OptionalUtil;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Importer for the Endnote XML format.
+ *
+ * Based on the DTD schema downloaded from Article #122577 at http://kbportal.thomson.com.
+ */
+public class EndnoteXmlImporter extends Importer implements Parser {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(EndnoteXmlImporter.class);
+    private final ImportFormatPreferences preferences;
+    private Unmarshaller unmarshaller;
+
+    /**
+     * Creates an importer that uses the given import preferences.
+     *
+     * @param preferences import preferences; the keyword separator is read from them when records are parsed
+     */
+    public EndnoteXmlImporter(ImportFormatPreferences preferences) {
+        this.preferences = preferences;
+    }
+
+    /**
+     * Returns the name of this importer.
+     *
+     * @return the constant {@code "EndNote XML"}
+     */
+    @Override
+    public String getName() {
+        return "EndNote XML";
+    }
+
+    /**
+     * Returns the file type handled by this importer.
+     *
+     * @return {@link FileType#ENDNOTE_XML}
+     */
+    @Override
+    public FileType getFileType() {
+        return FileType.ENDNOTE_XML;
+    }
+
+    /**
+     * Returns the identifier of this importer.
+     *
+     * @return the constant {@code "endnote"}
+     */
+    @Override
+    public String getId() {
+        return "endnote";
+    }
+
+    /**
+     * Returns a short description of this importer.
+     *
+     * @return the constant description text
+     */
+    @Override
+    public String getDescription() {
+        return "Importer for the EndNote XML format.";
+    }
+
+    /**
+     * Checks whether the input looks like EndNote XML by searching its leading lines for a
+     * {@code <records>} tag. The comparison ignores case.
+     *
+     * @param reader the input to inspect
+     * @return {@code true} if one of the first 50 lines contains {@code <records>}, {@code false} otherwise
+     * @throws IOException if reading from {@code reader} fails
+     */
+    @Override
+    public boolean isRecognizedFormat(BufferedReader reader) throws IOException {
+        int linesChecked = 0;
+        String line = reader.readLine();
+        while ((line != null) && (linesChecked < 50)) {
+            String normalized = line.toLowerCase(Locale.ENGLISH);
+            if (normalized.contains("<records>")) {
+                return true;
+            }
+
+            linesChecked++;
+            line = reader.readLine();
+        }
+        return false;
+    }
+
+    /**
+     * Parses EndNote XML from the given reader and converts every record into a {@link BibEntry}.
+     *
+     * @param reader the source of the XML document; must not be {@code null}
+     * @return a result holding the converted entries, or an error result if the document cannot be
+     *         unmarshalled or its root element is not the EndNote {@code xml} element
+     * @throws IOException declared by the overridden method
+     */
+    @Override
+    public ParserResult importDatabase(BufferedReader reader) throws IOException {
+        Objects.requireNonNull(reader);
+
+        try {
+            Object unmarshalledObject = unmarshallRoot(reader);
+
+            if (unmarshalledObject instanceof Xml) {
+                Xml root = (Xml) unmarshalledObject;
+                // A root element without a records child presumably yields a null records object;
+                // treat that as an empty library instead of failing with a NullPointerException
+                List<BibEntry> bibEntries = Optional.ofNullable(root.getRecords())
+                        .map(records -> records.getRecord())
+                        .orElse(Collections.emptyList())
+                        .stream()
+                        .map(this::parseRecord)
+                        .collect(Collectors.toList());
+
+                return new ParserResult(bibEntries);
+            } else {
+                return ParserResult.fromErrorMessage("File does not start with xml tag.");
+            }
+        } catch (JAXBException | XMLStreamException e) {
+            LOGGER.debug("could not parse document", e);
+            return ParserResult.fromError(e);
+        }
+    }
+
+    /**
+     * Unmarshals the root element of the XML document supplied by the reader.
+     *
+     * @param reader the source of the XML document
+     * @return the object unmarshalled from the first start element of the document
+     * @throws XMLStreamException if the XML stream cannot be read
+     * @throws JAXBException if the unmarshaller cannot be created or unmarshalling fails
+     */
+    private Object unmarshallRoot(BufferedReader reader) throws XMLStreamException, JAXBException {
+        initUnmarshaller();
+
+        XMLInputFactory xmlInputFactory = XMLInputFactory.newFactory();
+        // Imported files are untrusted input: do not resolve external entities (XXE hardening)
+        xmlInputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false);
+        XMLStreamReader xmlStreamReader = xmlInputFactory.createXMLStreamReader(reader);
+
+        // Go to the root element
+        while (!xmlStreamReader.isStartElement()) {
+            xmlStreamReader.next();
+        }
+
+        return unmarshaller.unmarshal(xmlStreamReader);
+    }
+
+    /**
+     * Creates the unmarshaller for the EndNote JAXB package on first use and keeps it for later calls.
+     *
+     * @throws JAXBException if the JAXB context or the unmarshaller cannot be created
+     */
+    private void initUnmarshaller() throws JAXBException {
+        if (unmarshaller != null) {
+            return;
+        }
+
+        // Created lazily and only once because this is expensive
+        unmarshaller = JAXBContext.newInstance("org.jabref.logic.importer.fileformat.endnote")
+                                  .createUnmarshaller();
+    }
+
+    /**
+     * Maps an EndNote reference type name to the matching biblatex entry type.
+     * The name is trimmed and compared case-insensitively; unknown names map to
+     * {@link BiblatexEntryTypes#ARTICLE}.
+     *
+     * @param refName the EndNote reference type name; must not be {@code null}
+     * @return the biblatex entry type for the given name
+     */
+    private static BiblatexEntryType convertRefNameToType(String refName) {
+        // Locale.ENGLISH keeps the lower-casing independent of the default locale
+        switch (refName.trim().toLowerCase(Locale.ENGLISH)) {
+            case "artwork":
+            case "generic":
+                return BiblatexEntryTypes.MISC;
+            case "electronic article":
+                return BiblatexEntryTypes.ELECTRONIC;
+            case "book section":
+                return BiblatexEntryTypes.INBOOK;
+            case "book":
+                return BiblatexEntryTypes.BOOK;
+            case "journal article":
+            default:
+                return BiblatexEntryTypes.ARTICLE;
+        }
+    }
+
+    /**
+     * Converts a single EndNote record into a {@link BibEntry}.
+     * Elements missing from the record are skipped; text values are stripped of surrounding whitespace.
+     *
+     * @param record the unmarshalled EndNote record
+     * @return the entry built from the record
+     */
+    private BibEntry parseRecord(Record record) {
+        BibEntry entry = new BibEntry();
+
+        entry.setType(getType(record));
+        // getAuthors joins the author names, so it yields an empty string when there are none;
+        // only set the field if there is something to store
+        Optional.ofNullable(getAuthors(record))
+                .filter(value -> !value.isEmpty())
+                .ifPresent(value -> entry.setField(FieldName.AUTHOR, value));
+        Optional.ofNullable(record.getTitles())
+                .map(Titles::getTitle)
+                .map(Title::getStyle)
+                .map(Style::getvalue)
+                .ifPresent(value -> entry.setField(FieldName.TITLE, clean(value)));
+        Optional.ofNullable(record.getTitles())
+                .map(Titles::getSecondaryTitle)
+                .map(SecondaryTitle::getStyle)
+                .map(Style::getvalue)
+                .ifPresent(value -> entry.setField(FieldName.JOURNAL, clean(value)));
+        Optional.ofNullable(record.getPages())
+                .map(Pages::getStyle)
+                .map(Style::getvalue)
+                .ifPresent(value -> entry.setField(FieldName.PAGES, value.trim()));
+        Optional.ofNullable(record.getNumber())
+                .map(Number::getStyle)
+                .map(Style::getvalue)
+                .ifPresent(value -> entry.setField(FieldName.NUMBER, value.trim()));
+        Optional.ofNullable(record.getVolume())
+                .map(Volume::getStyle)
+                .map(Style::getvalue)
+                .ifPresent(value -> entry.setField(FieldName.VOLUME, value.trim()));
+        Optional.ofNullable(record.getDates())
+                .map(Dates::getYear)
+                .map(Year::getStyle)
+                .map(Style::getvalue)
+                .ifPresent(value -> entry.setField(FieldName.YEAR, value.trim()));
+        Optional.ofNullable(record.getNotes())
+                .map(Notes::getStyle)
+                .map(Style::getvalue)
+                .ifPresent(value -> entry.setField(FieldName.NOTE, value.trim()));
+        getUrl(record)
+                .ifPresent(value -> entry.setField(FieldName.URL, value));
+        entry.putKeywords(getKeywords(record), preferences.getKeywordSeparator());
+        Optional.ofNullable(record.getAbstract())
+                .map(Abstract::getStyle)
+                .map(Style::getvalue)
+                .ifPresent(value -> entry.setField(FieldName.ABSTRACT, value.trim()));
+        entry.setFiles(getLinkedFiles(record));
+        Optional.ofNullable(record.getIsbn())
+                .map(Isbn::getStyle)
+                .map(Style::getvalue)
+                .ifPresent(value -> entry.setField(FieldName.ISBN, clean(value)));
+        Optional.ofNullable(record.getElectronicResourceNum())
+                .map(ElectronicResourceNum::getStyle)
+                .map(Style::getvalue)
+                .ifPresent(doi -> entry.setField(FieldName.DOI, doi.trim()));
+
+        return entry;
+    }
+
+    /**
+     * Determines the entry type of a record from the name of its reference type.
+     *
+     * @param record the unmarshalled EndNote record
+     * @return the mapped entry type, or {@link BiblatexEntryTypes#ARTICLE} if the record has no
+     *         reference type or the reference type has no name
+     */
+    private BiblatexEntryType getType(Record record) {
+        RefType refType = record.getRefType();
+        if (refType == null) {
+            return BiblatexEntryTypes.ARTICLE;
+        }
+
+        String refName = refType.getName();
+        if (refName == null) {
+            return BiblatexEntryTypes.ARTICLE;
+        }
+
+        BiblatexEntryType converted = convertRefNameToType(refName);
+        return (converted == null) ? BiblatexEntryTypes.ARTICLE : converted;
+    }
+
+    /**
+     * Collects the PDF URLs of a record as linked files of type {@code "PDF"} with an empty description.
+     *
+     * @param record the unmarshalled EndNote record
+     * @return one linked file per PDF URL that has a value; empty if the record has none
+     */
+    private List<LinkedFile> getLinkedFiles(Record record) {
+        Urls allUrls = record.getUrls();
+        Optional<PdfUrls> pdfUrlContainer = Optional.ofNullable(allUrls).map(Urls::getPdfUrls);
+
+        return OptionalUtil.toStream(pdfUrlContainer)
+                           .flatMap(container -> container.getUrl().stream())
+                           .map(this::getUrlValue)
+                           .flatMap(OptionalUtil::toStream)
+                           .map(link -> new LinkedFile("", link, "PDF"))
+                           .collect(Collectors.toList());
+    }
+
+    /**
+     * Finds the first related URL of a record that has a value.
+     *
+     * @param record the unmarshalled EndNote record
+     * @return the cleaned value of the first related URL, or an empty optional if there is none
+     */
+    private Optional<String> getUrl(Record record) {
+        Urls allUrls = record.getUrls();
+        Optional<RelatedUrls> relatedUrlContainer = Optional.ofNullable(allUrls).map(Urls::getRelatedUrls);
+
+        return OptionalUtil.toStream(relatedUrlContainer)
+                           .flatMap(container -> container.getUrl().stream())
+                           .map(this::getUrlValue)
+                           .flatMap(OptionalUtil::toStream)
+                           .findFirst();
+    }
+
+    /**
+     * Extracts the cleaned text of a URL element.
+     *
+     * @param url the URL element; may be {@code null}
+     * @return the cleaned text, or an empty optional if the element, its style or the style value is missing
+     */
+    private Optional<String> getUrlValue(Url url) {
+        if (url == null) {
+            return Optional.empty();
+        }
+
+        Style style = url.getStyle();
+        if (style == null) {
+            return Optional.empty();
+        }
+
+        String rawValue = style.getvalue();
+        if (rawValue == null) {
+            return Optional.empty();
+        }
+
+        return Optional.ofNullable(clean(rawValue));
+    }
+
+    /**
+     * Collects the keyword texts of a record.
+     * Keywords without a style element or without a value are skipped.
+     *
+     * @param record the unmarshalled EndNote record
+     * @return the keyword texts; empty if the record has no keywords element
+     */
+    private List<String> getKeywords(Record record) {
+        Keywords keywords = record.getKeywords();
+        if (keywords != null) {
+            return keywords.getKeyword()
+                           .stream()
+                           .map(keyword -> keyword.getStyle())
+                           // Skip incomplete keywords instead of failing with a NullPointerException
+                           .filter(Objects::nonNull)
+                           .map(Style::getvalue)
+                           .filter(Objects::nonNull)
+                           .collect(Collectors.toList());
+        } else {
+            return Collections.emptyList();
+        }
+    }
+
+    /**
+     * Joins the author names of a record with {@code " and "}.
+     * Authors without a style element, without a value or with a blank name are skipped.
+     *
+     * @param record the unmarshalled EndNote record
+     * @return the joined author names; an empty string if the record has no usable authors
+     */
+    private String getAuthors(Record record) {
+        Optional<Authors> authors = Optional.ofNullable(record.getContributors())
+                                            .map(Contributors::getAuthors);
+        return OptionalUtil.toStream(authors)
+                           .flatMap(value -> value.getAuthor().stream())
+                           .map(author -> author.getStyle())
+                           // A missing style would throw, and a null value would be joined as the text "null"
+                           .filter(Objects::nonNull)
+                           .map(Style::getvalue)
+                           .filter(Objects::nonNull)
+                           .map(String::trim)
+                           .filter(name -> !name.isEmpty())
+                           .collect(Collectors.joining(" and "));
+    }
+
+    /**
+     * Normalizes a text value: line breaks are replaced via {@code StringUtil.unifyLineBreaks} with a
+     * single space, surrounding whitespace is removed and runs of spaces are collapsed to one space.
+     *
+     * @param input the raw text
+     * @return the normalized text
+     */
+    private String clean(String input) {
+        String singleLine = StringUtil.unifyLineBreaks(input, " ");
+        String trimmed = singleLine.trim();
+        return trimmed.replaceAll(" +", " ");
+    }
+
+    @Override
+    public List<BibEntry> parseEntries(InputStream inputStream) throws ParseException {
+        try {
+            return importDatabase(
+                    new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8))).getDatabase().getEntries();
+        } catch (IOException e) {
+            LOGGER.error(e.getLocalizedMessage(), e);
+        }
+        return Collections.emptyList();
+    }
+}
diff --git a/src/main/java/org/jabref/logic/importer/fileformat/MedlineImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/MedlineImporter.java
@@ -93,6 +93,7 @@ public class MedlineImporter extends Importer implements Parser {
     private static final String KEYWORD_SEPARATOR = "; ";
 
     private static final Locale ENGLISH = Locale.ENGLISH;
+    private Unmarshaller unmarshaller;
 
     private static String join(List<String> list, String string) {
         return Joiner.on(string).join(list);
@@ -141,17 +142,7 @@ public ParserResult importDatabase(BufferedReader reader) throws IOException {
         List<BibEntry> bibItems = new ArrayList<>();
 
         try {
-            JAXBContext context = JAXBContext.newInstance("org.jabref.logic.importer.fileformat.medline");
-            XMLInputFactory xmlInputFactory = XMLInputFactory.newFactory();
-            XMLStreamReader xmlStreamReader = xmlInputFactory.createXMLStreamReader(reader);
-
-            //go to the root element
-            while (!xmlStreamReader.isStartElement()) {
-                xmlStreamReader.next();
-            }
-
-            Unmarshaller unmarshaller = context.createUnmarshaller();
-            Object unmarshalledObject = unmarshaller.unmarshal(xmlStreamReader);
+            Object unmarshalledObject = unmarshallRoot(reader);
 
             //check whether we have an article set, an article, a book article or a book article set
             if (unmarshalledObject instanceof PubmedArticleSet) {
@@ -185,6 +176,28 @@ public ParserResult importDatabase(BufferedReader reader) throws IOException {
         return new ParserResult(bibItems);
     }
 
+    /**
+     * Unmarshals the root element of the XML document supplied by the reader.
+     *
+     * @param reader the source of the XML document
+     * @return the object unmarshalled from the first start element of the document
+     * @throws JAXBException if the unmarshaller cannot be created or unmarshalling fails
+     * @throws XMLStreamException if the XML stream cannot be read
+     */
+    private Object unmarshallRoot(BufferedReader reader) throws JAXBException, XMLStreamException {
+        initUnmarshaller();
+
+        XMLInputFactory xmlInputFactory = XMLInputFactory.newFactory();
+        // Imported files are untrusted input: do not resolve external entities (XXE hardening)
+        xmlInputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false);
+        XMLStreamReader xmlStreamReader = xmlInputFactory.createXMLStreamReader(reader);
+
+        //go to the root element
+        while (!xmlStreamReader.isStartElement()) {
+            xmlStreamReader.next();
+        }
+
+        return unmarshaller.unmarshal(xmlStreamReader);
+    }
+
+    /**
+     * Creates the unmarshaller for the Medline JAXB package on first use and keeps it for later calls.
+     *
+     * @throws JAXBException if the JAXB context or the unmarshaller cannot be created
+     */
+    private void initUnmarshaller() throws JAXBException {
+        if (unmarshaller == null) {
+            // Lazy init because this is expensive
+            JAXBContext context = JAXBContext.newInstance("org.jabref.logic.importer.fileformat.medline");
+            unmarshaller = context.createUnmarshaller();
+        }
+    }
+
     private void parseBookArticle(PubmedBookArticle currentArticle, List<BibEntry> bibItems) {
         Map<String, String> fields = new HashMap<>();
         if (currentArticle.getBookDocument() != null) {

diff --git a/src/main/java/org/jabref/logic/util/FileType.java b/src/main/java/org/jabref/logic/util/FileType.java
@@ -24,8 +24,9 @@ public enum FileType {
     CITATION_STYLE(Localization.lang("%0 file", "CSL"), "csl"),
     DOCBOOK(Localization.lang("%0 file", "Docbook 4.4"), "xml"),
     DIN_1505(Localization.lang("%0 file", "DIN 1505"), "rtf"),
-    ENDNOTE(Localization.lang("%0 file", "Endnote/Refer"), "ref", "enw"),
-    ENDNOTE_TXT(Localization.lang("%0 file", "Endnote"), "txt"), //for export
+    ENDNOTE(Localization.lang("%0 file", "EndNote/Refer"), "ref", "enw"),
+    ENDNOTE_XML(Localization.lang("%0 file", "EndNote XML"), "xml"),
+    ENDNOTE_TXT(Localization.lang("%0 file", "EndNote"), "txt"), //for export
     FREECITE(Localization.lang("%0 file", "FreeCite"), "txt", "xml"),
     HARVARD_RTF(Localization.lang("%0 file", "Harvard"), "rtf"),
     HTML_LIST(Localization.lang("%0 file", Localization.lang("HTML list")), "html"),