Skip to content

Commit

Permalink
Add EndNote XML importer (#3713)
Browse files Browse the repository at this point in the history
* Add EndNote XML importer

* add some optionals of Nullable to fix parsing errors

* Implement feedback

* Fix a few other NPE

* Improve importer according to feedback from user
  • Loading branch information
tobiasdiez authored and Siedlerchr committed Feb 16, 2018
1 parent b3c6f0a commit bb2b078
Show file tree
Hide file tree
Showing 20 changed files with 935 additions and 23 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ We refer to [GitHub issues](https://github.com/JabRef/jabref/issues) by using `#
- We added [oaDOI](https://oadoi.org/) as a fulltext provider, so that JabRef is now able to provide fulltexts for more than 90 million open-access articles.
- We changed one default of the [Cleanup entries dialog](http://help.jabref.org/en/CleanupEntries): By default, PDFs are no longer moved to the default file directory. [#3619](https://github.com/JabRef/jabref/issues/3619)
- We added a new type of group that shows all items referenced in a given LaTeX file (actually the generated AUX file). [#1664](https://github.com/JabRef/jabref/issues/1664)
- We added an importer for the EndNote XML format. [Feature request in the forum](http://discourse.jabref.org/t/import-from-bookends-or-endnote/1048)
- We added the export of the `translator` field to the corresponding MS-Office XML field. [#1750, comment](https://github.com/JabRef/jabref/issues/1750#issuecomment-357350986)
- We changed the import of the MS-Office XML fields `bookauthor` and `translator`. Both are now imported to their corresponding bibtex/biblatex fields.
- We improved the export of the `address` and `location` field to the MS-Office XML fields. If the address field does not contain a comma, it is treated as single value and exported to the field `city`. [#1750, comment](https://github.com/JabRef/jabref/issues/1750#issuecomment-357539167)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import org.jabref.logic.importer.fileformat.CopacImporter;
import org.jabref.logic.importer.fileformat.CustomImporter;
import org.jabref.logic.importer.fileformat.EndnoteImporter;
import org.jabref.logic.importer.fileformat.EndnoteXmlImporter;
import org.jabref.logic.importer.fileformat.FreeCiteImporter;
import org.jabref.logic.importer.fileformat.InspecImporter;
import org.jabref.logic.importer.fileformat.IsiImporter;
Expand Down Expand Up @@ -56,6 +57,7 @@ public void resetImportFormats(ImportFormatPreferences newImportFormatPreference
formats.add(new BibTeXMLImporter());
formats.add(new CopacImporter());
formats.add(new EndnoteImporter(importFormatPreferences));
formats.add(new EndnoteXmlImporter(importFormatPreferences));
formats.add(new FreeCiteImporter(importFormatPreferences));
formats.add(new InspecImporter());
formats.add(new IsiImporter());
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,306 @@
package org.jabref.logic.importer.fileformat;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;

import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBException;
import javax.xml.bind.Unmarshaller;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

import org.jabref.logic.importer.ImportFormatPreferences;
import org.jabref.logic.importer.Importer;
import org.jabref.logic.importer.ParseException;
import org.jabref.logic.importer.Parser;
import org.jabref.logic.importer.ParserResult;
import org.jabref.logic.importer.fileformat.endnote.Abstract;
import org.jabref.logic.importer.fileformat.endnote.Authors;
import org.jabref.logic.importer.fileformat.endnote.Contributors;
import org.jabref.logic.importer.fileformat.endnote.Dates;
import org.jabref.logic.importer.fileformat.endnote.ElectronicResourceNum;
import org.jabref.logic.importer.fileformat.endnote.Isbn;
import org.jabref.logic.importer.fileformat.endnote.Keywords;
import org.jabref.logic.importer.fileformat.endnote.Notes;
import org.jabref.logic.importer.fileformat.endnote.Number;
import org.jabref.logic.importer.fileformat.endnote.Pages;
import org.jabref.logic.importer.fileformat.endnote.PdfUrls;
import org.jabref.logic.importer.fileformat.endnote.Record;
import org.jabref.logic.importer.fileformat.endnote.RefType;
import org.jabref.logic.importer.fileformat.endnote.RelatedUrls;
import org.jabref.logic.importer.fileformat.endnote.SecondaryTitle;
import org.jabref.logic.importer.fileformat.endnote.Style;
import org.jabref.logic.importer.fileformat.endnote.Title;
import org.jabref.logic.importer.fileformat.endnote.Titles;
import org.jabref.logic.importer.fileformat.endnote.Url;
import org.jabref.logic.importer.fileformat.endnote.Urls;
import org.jabref.logic.importer.fileformat.endnote.Volume;
import org.jabref.logic.importer.fileformat.endnote.Xml;
import org.jabref.logic.importer.fileformat.endnote.Year;
import org.jabref.logic.util.FileType;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.BiblatexEntryType;
import org.jabref.model.entry.BiblatexEntryTypes;
import org.jabref.model.entry.FieldName;
import org.jabref.model.entry.LinkedFile;
import org.jabref.model.strings.StringUtil;
import org.jabref.model.util.OptionalUtil;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Importer for the EndNote XML format.
 *
 * Based on dtd scheme downloaded from Article #122577 in http://kbportal.thomson.com.
 *
 * The XML is unmarshalled with JAXB into the classes of the
 * {@code org.jabref.logic.importer.fileformat.endnote} package and every {@code Record}
 * is then converted into one {@link BibEntry}.
 */
public class EndnoteXmlImporter extends Importer implements Parser {

    private static final Logger LOGGER = LoggerFactory.getLogger(EndnoteXmlImporter.class);
    private final ImportFormatPreferences preferences;
    // Created lazily by initUnmarshaller() and reused for subsequent imports
    private Unmarshaller unmarshaller;

    /**
     * @param preferences import preferences; the keyword separator is read from them
     */
    public EndnoteXmlImporter(ImportFormatPreferences preferences) {
        this.preferences = preferences;
    }

    @Override
    public String getName() {
        return "EndNote XML";
    }

    @Override
    public FileType getFileType() {
        return FileType.ENDNOTE_XML;
    }

    @Override
    public String getId() {
        return "endnote";
    }

    @Override
    public String getDescription() {
        return "Importer for the EndNote XML format.";
    }

    /**
     * Checks whether the input looks like an EndNote XML file.
     *
     * @param reader the input to inspect
     * @return {@code true} if one of the first 50 lines contains the tag {@code <records>}
     *         (compared case-insensitively), {@code false} otherwise
     * @throws IOException if reading from {@code reader} fails
     */
    @Override
    public boolean isRecognizedFormat(BufferedReader reader) throws IOException {
        String str;
        int i = 0;
        while (((str = reader.readLine()) != null) && (i < 50)) {
            if (str.toLowerCase(Locale.ENGLISH).contains("<records>")) {
                return true;
            }

            i++;
        }
        return false;
    }

    /**
     * Parses the given EndNote XML document.
     *
     * @param reader the XML input; must not be {@code null}
     * @return a result holding one entry per record, or an error result if the document could not
     *         be parsed or its root element is not {@code xml}
     * @throws IOException declared by the overridden method
     */
    @Override
    public ParserResult importDatabase(BufferedReader reader) throws IOException {
        Objects.requireNonNull(reader);

        try {
            Object unmarshalledObject = unmarshallRoot(reader);

            if (unmarshalledObject instanceof Xml) {
                Xml root = (Xml) unmarshalledObject;
                // An <xml> root without a <records> child yields an empty result instead of an NPE
                List<BibEntry> bibEntries = OptionalUtil.toStream(Optional.ofNullable(root.getRecords()))
                                                        .flatMap(records -> records.getRecord().stream())
                                                        .map(this::parseRecord)
                                                        .collect(Collectors.toList());

                return new ParserResult(bibEntries);
            } else {
                return ParserResult.fromErrorMessage("File does not start with xml tag.");
            }
        } catch (JAXBException | XMLStreamException e) {
            LOGGER.debug("could not parse document", e);
            return ParserResult.fromError(e);
        }
    }

    /**
     * Advances to the root element of the document and unmarshals it.
     *
     * @param reader the XML input
     * @return the unmarshalled root object
     * @throws XMLStreamException if the XML cannot be read
     * @throws JAXBException      if the unmarshaller cannot be created or unmarshalling fails
     */
    private Object unmarshallRoot(BufferedReader reader) throws XMLStreamException, JAXBException {
        initUnmarshaller();

        XMLInputFactory xmlInputFactory = XMLInputFactory.newFactory();
        // Imported files are untrusted input: do not resolve external entities (XXE)
        xmlInputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
        XMLStreamReader xmlStreamReader = xmlInputFactory.createXMLStreamReader(reader);

        // Go to the root element
        while (!xmlStreamReader.isStartElement()) {
            xmlStreamReader.next();
        }

        return unmarshaller.unmarshal(xmlStreamReader);
    }

    /**
     * Creates the JAXB unmarshaller on first use.
     *
     * @throws JAXBException if the JAXB context or the unmarshaller cannot be created
     */
    private void initUnmarshaller() throws JAXBException {
        if (unmarshaller == null) {
            // Lazy init because this is expensive
            JAXBContext context = JAXBContext.newInstance("org.jabref.logic.importer.fileformat.endnote");
            unmarshaller = context.createUnmarshaller();
        }
    }

    /**
     * Maps the name of an EndNote reference type to an entry type.
     *
     * @param refName the reference type name; compared case-insensitively and ignoring surrounding whitespace
     * @return the matching entry type, or {@link BiblatexEntryTypes#ARTICLE} for unknown names
     */
    private static BiblatexEntryType convertRefNameToType(String refName) {
        // Fixed locale so that the comparison does not depend on the default locale (e.g. Turkish "I")
        switch (refName.trim().toLowerCase(Locale.ENGLISH)) {
            case "artwork":
            case "generic":
                return BiblatexEntryTypes.MISC;
            case "electronic article":
                return BiblatexEntryTypes.ELECTRONIC;
            case "book section":
                return BiblatexEntryTypes.INBOOK;
            case "book":
                return BiblatexEntryTypes.BOOK;
            case "journal article":
            default:
                return BiblatexEntryTypes.ARTICLE;
        }
    }

    /**
     * Converts a single record into an entry. Elements that are missing in the record are skipped.
     *
     * @param record the unmarshalled record
     * @return the entry built from the record
     */
    private BibEntry parseRecord(Record record) {
        BibEntry entry = new BibEntry();

        entry.setType(getType(record));
        getAuthors(record)
                .ifPresent(value -> entry.setField(FieldName.AUTHOR, value));
        Optional.ofNullable(record.getTitles())
                .map(Titles::getTitle)
                .map(Title::getStyle)
                .map(Style::getvalue)
                .ifPresent(value -> entry.setField(FieldName.TITLE, clean(value)));
        Optional.ofNullable(record.getTitles())
                .map(Titles::getSecondaryTitle)
                .map(SecondaryTitle::getStyle)
                .map(Style::getvalue)
                .ifPresent(value -> entry.setField(FieldName.JOURNAL, clean(value)));
        Optional.ofNullable(record.getPages())
                .map(Pages::getStyle)
                .map(Style::getvalue)
                .ifPresent(value -> entry.setField(FieldName.PAGES, value));
        Optional.ofNullable(record.getNumber())
                .map(Number::getStyle)
                .map(Style::getvalue)
                .ifPresent(value -> entry.setField(FieldName.NUMBER, value));
        Optional.ofNullable(record.getVolume())
                .map(Volume::getStyle)
                .map(Style::getvalue)
                .ifPresent(value -> entry.setField(FieldName.VOLUME, value));
        Optional.ofNullable(record.getDates())
                .map(Dates::getYear)
                .map(Year::getStyle)
                .map(Style::getvalue)
                .ifPresent(value -> entry.setField(FieldName.YEAR, value));
        Optional.ofNullable(record.getNotes())
                .map(Notes::getStyle)
                .map(Style::getvalue)
                .ifPresent(value -> entry.setField(FieldName.NOTE, value.trim()));
        getUrl(record)
                .ifPresent(value -> entry.setField(FieldName.URL, value));
        entry.putKeywords(getKeywords(record), preferences.getKeywordSeparator());
        Optional.ofNullable(record.getAbstract())
                .map(Abstract::getStyle)
                .map(Style::getvalue)
                .ifPresent(value -> entry.setField(FieldName.ABSTRACT, value.trim()));
        entry.setFiles(getLinkedFiles(record));
        Optional.ofNullable(record.getIsbn())
                .map(Isbn::getStyle)
                .map(Style::getvalue)
                .ifPresent(value -> entry.setField(FieldName.ISBN, clean(value)));
        Optional.ofNullable(record.getElectronicResourceNum())
                .map(ElectronicResourceNum::getStyle)
                .map(Style::getvalue)
                .ifPresent(doi -> entry.setField(FieldName.DOI, doi.trim()));

        return entry;
    }

    /**
     * @return the entry type for the record's reference type name, or
     *         {@link BiblatexEntryTypes#ARTICLE} if the record has no reference type or name
     */
    private BiblatexEntryType getType(Record record) {
        return Optional.ofNullable(record.getRefType())
                       .map(RefType::getName)
                       .map(EndnoteXmlImporter::convertRefNameToType)
                       .orElse(BiblatexEntryTypes.ARTICLE);
    }

    /**
     * @return one linked file of type "PDF" per PDF url of the record that has a value
     */
    private List<LinkedFile> getLinkedFiles(Record record) {
        Optional<PdfUrls> urls = Optional.ofNullable(record.getUrls())
                                         .map(Urls::getPdfUrls);
        return OptionalUtil.toStream(urls)
                           .flatMap(pdfUrls -> pdfUrls.getUrl().stream())
                           .flatMap(url -> OptionalUtil.toStream(getUrlValue(url)))
                           .map(url -> new LinkedFile("", url, "PDF"))
                           .collect(Collectors.toList());
    }

    /**
     * @return the first related url of the record that has a value, if any
     */
    private Optional<String> getUrl(Record record) {
        Optional<RelatedUrls> urls = Optional.ofNullable(record.getUrls())
                                             .map(Urls::getRelatedUrls);
        return OptionalUtil.toStream(urls)
                           .flatMap(url -> url.getUrl().stream())
                           .flatMap(url -> OptionalUtil.toStream(getUrlValue(url)))
                           .findFirst();
    }

    /**
     * @return the cleaned text of the url element, or empty if the element, its style or its value is missing
     */
    private Optional<String> getUrlValue(Url url) {
        return Optional.ofNullable(url)
                       .map(Url::getStyle)
                       .map(Style::getvalue)
                       .map(this::clean);
    }

    /**
     * @return the keyword values of the record; keywords without a style or value are skipped
     */
    private List<String> getKeywords(Record record) {
        Keywords keywords = record.getKeywords();
        if (keywords != null) {
            return keywords.getKeyword()
                           .stream()
                           .map(keyword -> keyword.getStyle())
                           .filter(Objects::nonNull)
                           .map(style -> style.getvalue())
                           .filter(Objects::nonNull)
                           .collect(Collectors.toList());
        } else {
            return Collections.emptyList();
        }
    }

    /**
     * @return the author names of the record joined by " and ", or empty if the record has no
     *         author with a value; authors without a style or value are skipped
     */
    private Optional<String> getAuthors(Record record) {
        Optional<Authors> authors = Optional.ofNullable(record.getContributors())
                                            .map(Contributors::getAuthors);
        String joinedAuthors = OptionalUtil.toStream(authors)
                                           .flatMap(value -> value.getAuthor().stream())
                                           .map(author -> author.getStyle())
                                           .filter(Objects::nonNull)
                                           .map(style -> style.getvalue())
                                           .filter(Objects::nonNull)
                                           .collect(Collectors.joining(" and "));
        // Do not report an author field at all when nothing was found
        return joinedAuthors.isEmpty() ? Optional.empty() : Optional.of(joinedAuthors);
    }

    /**
     * Replaces line breaks by a space, trims the text and collapses runs of spaces into a single space.
     */
    private String clean(String input) {
        return StringUtil.unifyLineBreaks(input, " ")
                         .trim()
                         .replaceAll(" +", " ");
    }

    /**
     * Parses the entries from a stream that is decoded as UTF-8.
     *
     * @param inputStream the XML input
     * @return the parsed entries; an empty list if reading fails with an {@link IOException}
     * @throws ParseException declared by the {@link Parser} interface
     */
    @Override
    public List<BibEntry> parseEntries(InputStream inputStream) throws ParseException {
        try {
            return importDatabase(
                    new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8))).getDatabase().getEntries();
        } catch (IOException e) {
            LOGGER.error(e.getLocalizedMessage(), e);
        }
        return Collections.emptyList();
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ public class MedlineImporter extends Importer implements Parser {
private static final String KEYWORD_SEPARATOR = "; ";

private static final Locale ENGLISH = Locale.ENGLISH;
private Unmarshaller unmarshaller;

private static String join(List<String> list, String string) {
return Joiner.on(string).join(list);
Expand Down Expand Up @@ -141,17 +142,7 @@ public ParserResult importDatabase(BufferedReader reader) throws IOException {
List<BibEntry> bibItems = new ArrayList<>();

try {
JAXBContext context = JAXBContext.newInstance("org.jabref.logic.importer.fileformat.medline");
XMLInputFactory xmlInputFactory = XMLInputFactory.newFactory();
XMLStreamReader xmlStreamReader = xmlInputFactory.createXMLStreamReader(reader);

//go to the root element
while (!xmlStreamReader.isStartElement()) {
xmlStreamReader.next();
}

Unmarshaller unmarshaller = context.createUnmarshaller();
Object unmarshalledObject = unmarshaller.unmarshal(xmlStreamReader);
Object unmarshalledObject = unmarshallRoot(reader);

//check whether we have an article set, an article, a book article or a book article set
if (unmarshalledObject instanceof PubmedArticleSet) {
Expand Down Expand Up @@ -185,6 +176,28 @@ public ParserResult importDatabase(BufferedReader reader) throws IOException {
return new ParserResult(bibItems);
}

/**
 * Positions an XML stream reader on the first start element of the input and
 * hands it to the unmarshaller.
 *
 * @param reader the XML input
 * @return the object unmarshalled from the root element
 * @throws JAXBException      if the unmarshaller cannot be created or unmarshalling fails
 * @throws XMLStreamException if the XML cannot be read
 */
private Object unmarshallRoot(BufferedReader reader) throws JAXBException, XMLStreamException {
    initUmarshaller();

    XMLStreamReader streamReader = XMLInputFactory.newFactory().createXMLStreamReader(reader);

    // Skip everything that precedes the root element
    boolean atRootElement = streamReader.isStartElement();
    while (!atRootElement) {
        streamReader.next();
        atRootElement = streamReader.isStartElement();
    }

    return unmarshaller.unmarshal(streamReader);
}

/**
 * Creates the JAXB unmarshaller for the medline package on first use and
 * keeps it in the {@code unmarshaller} field; later calls do nothing.
 *
 * @throws JAXBException if the JAXB context or the unmarshaller cannot be created
 */
private void initUmarshaller() throws JAXBException {
    if (unmarshaller != null) {
        return;
    }
    // Built only on demand because creating the context is expensive
    unmarshaller = JAXBContext.newInstance("org.jabref.logic.importer.fileformat.medline")
                              .createUnmarshaller();
}

private void parseBookArticle(PubmedBookArticle currentArticle, List<BibEntry> bibItems) {
Map<String, String> fields = new HashMap<>();
if (currentArticle.getBookDocument() != null) {
Expand Down
5 changes: 3 additions & 2 deletions src/main/java/org/jabref/logic/util/FileType.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,9 @@ public enum FileType {
CITATION_STYLE(Localization.lang("%0 file", "CSL"), "csl"),
DOCBOOK(Localization.lang("%0 file", "Docbook 4.4"), "xml"),
DIN_1505(Localization.lang("%0 file", "DIN 1505"), "rtf"),
ENDNOTE(Localization.lang("%0 file", "Endnote/Refer"), "ref", "enw"),
ENDNOTE_TXT(Localization.lang("%0 file", "Endnote"), "txt"), //for export
ENDNOTE(Localization.lang("%0 file", "EndNote/Refer"), "ref", "enw"),
ENDNOTE_XML(Localization.lang("%0 file", "EndNote XML"), "xml"),
ENDNOTE_TXT(Localization.lang("%0 file", "EndNote"), "txt"), //for export
FREECITE(Localization.lang("%0 file", "FreeCite"), "txt", "xml"),
HARVARD_RTF(Localization.lang("%0 file", "Harvard"), "rtf"),
HTML_LIST(Localization.lang("%0 file", Localization.lang("HTML list")), "html"),
Expand Down
Loading

0 comments on commit bb2b078

Please sign in to comment.