Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add EndNote XML importer #3713

Merged
merged 7 commits into from
Feb 16, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ We refer to [GitHub issues](https://github.com/JabRef/jabref/issues) by using `#
- We added [oaDOI](https://oadoi.org/) as a fulltext provider, so that JabRef is now able to provide fulltexts for more than 90 million open-access articles.
- We changed one default of the [Cleanup entries dialog](http://help.jabref.org/en/CleanupEntries): By default, PDFs are no longer moved to the default file directory. [#3619](https://github.com/JabRef/jabref/issues/3619)
- We added a new type of group that shows all items referenced in a given LaTeX file (actually the generated AUX file). [#1664](https://github.com/JabRef/jabref/issues/1664)
- We added an importer for the EndNote XML format. [Feature request in the forum](http://discourse.jabref.org/t/import-from-bookends-or-endnote/1048)
- We added the export of the `translator` field to the corresponding MS-Office XML field. [#1750, comment](https://github.com/JabRef/jabref/issues/1750#issuecomment-357350986)
- We changed the import of the MS-Office XML fields `bookauthor` and `translator`. Both are now imported to their corresponding bibtex/biblatex fields.
- We improved the export of the `address` and `location` fields to the MS-Office XML fields. If the address field does not contain a comma, it is treated as a single value and exported to the field `city`. [#1750, comment](https://github.com/JabRef/jabref/issues/1750#issuecomment-357539167)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import org.jabref.logic.importer.fileformat.CopacImporter;
import org.jabref.logic.importer.fileformat.CustomImporter;
import org.jabref.logic.importer.fileformat.EndnoteImporter;
import org.jabref.logic.importer.fileformat.EndnoteXmlImporter;
import org.jabref.logic.importer.fileformat.FreeCiteImporter;
import org.jabref.logic.importer.fileformat.InspecImporter;
import org.jabref.logic.importer.fileformat.IsiImporter;
Expand Down Expand Up @@ -56,6 +57,7 @@ public void resetImportFormats(ImportFormatPreferences newImportFormatPreference
formats.add(new BibTeXMLImporter());
formats.add(new CopacImporter());
formats.add(new EndnoteImporter(importFormatPreferences));
formats.add(new EndnoteXmlImporter(importFormatPreferences));
formats.add(new FreeCiteImporter(importFormatPreferences));
formats.add(new InspecImporter());
formats.add(new IsiImporter());
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,306 @@
package org.jabref.logic.importer.fileformat;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;

import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBException;
import javax.xml.bind.Unmarshaller;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

import org.jabref.logic.importer.ImportFormatPreferences;
import org.jabref.logic.importer.Importer;
import org.jabref.logic.importer.ParseException;
import org.jabref.logic.importer.Parser;
import org.jabref.logic.importer.ParserResult;
import org.jabref.logic.importer.fileformat.endnote.Abstract;
import org.jabref.logic.importer.fileformat.endnote.Authors;
import org.jabref.logic.importer.fileformat.endnote.Contributors;
import org.jabref.logic.importer.fileformat.endnote.Dates;
import org.jabref.logic.importer.fileformat.endnote.ElectronicResourceNum;
import org.jabref.logic.importer.fileformat.endnote.Isbn;
import org.jabref.logic.importer.fileformat.endnote.Keywords;
import org.jabref.logic.importer.fileformat.endnote.Notes;
import org.jabref.logic.importer.fileformat.endnote.Number;
import org.jabref.logic.importer.fileformat.endnote.Pages;
import org.jabref.logic.importer.fileformat.endnote.PdfUrls;
import org.jabref.logic.importer.fileformat.endnote.Record;
import org.jabref.logic.importer.fileformat.endnote.RefType;
import org.jabref.logic.importer.fileformat.endnote.RelatedUrls;
import org.jabref.logic.importer.fileformat.endnote.SecondaryTitle;
import org.jabref.logic.importer.fileformat.endnote.Style;
import org.jabref.logic.importer.fileformat.endnote.Title;
import org.jabref.logic.importer.fileformat.endnote.Titles;
import org.jabref.logic.importer.fileformat.endnote.Url;
import org.jabref.logic.importer.fileformat.endnote.Urls;
import org.jabref.logic.importer.fileformat.endnote.Volume;
import org.jabref.logic.importer.fileformat.endnote.Xml;
import org.jabref.logic.importer.fileformat.endnote.Year;
import org.jabref.logic.util.FileType;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.BiblatexEntryType;
import org.jabref.model.entry.BiblatexEntryTypes;
import org.jabref.model.entry.FieldName;
import org.jabref.model.entry.LinkedFile;
import org.jabref.model.strings.StringUtil;
import org.jabref.model.util.OptionalUtil;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* Importer for the Endnote XML format.
*
* Based on the DTD schema downloaded from Article #122577 at http://kbportal.thomson.com.
*/
public class EndnoteXmlImporter extends Importer implements Parser {

private static final Logger LOGGER = LoggerFactory.getLogger(EndnoteXmlImporter.class);
private final ImportFormatPreferences preferences;
private Unmarshaller unmarshaller;

/**
 * Creates an EndNote XML importer.
 *
 * @param preferences import preferences kept for later use while converting records
 */
public EndnoteXmlImporter(ImportFormatPreferences preferences) {
    this.preferences = preferences;
}

/**
 * Returns the name of this importer.
 *
 * @return the constant string {@code "EndNote XML"}
 */
@Override
public String getName() {
    return "EndNote XML";
}

/**
 * Returns the file type handled by this importer.
 *
 * @return {@link FileType#ENDNOTE_XML}
 */
@Override
public FileType getFileType() {
    return FileType.ENDNOTE_XML;
}

/**
 * Returns the identifier of this importer.
 *
 * @return the constant string {@code "endnote"}
 */
@Override
public String getId() {
    return "endnote";
}

/**
 * Returns a one-sentence description of this importer.
 *
 * @return the description text
 */
@Override
public String getDescription() {
    return "Importer for the EndNote XML format.";
}

@Override
public boolean isRecognizedFormat(BufferedReader reader) throws IOException {
String str;
int i = 0;
while (((str = reader.readLine()) != null) && (i < 50)) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a funny way of determining the format :-) Somewhere in the first 50 lines a records tag has to appear? Why up to 50? What if some random garbage comes before the records tag? Would JAXB still be able to parse that?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's another thing I just copied from the medline importer. Can jaxb test the file without actually parsing everything? If not I find the "record appears near the beginning" a relative good heuristic.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nope, JAXB will try to parse everything. So you could go with such a heuristic, but could you just improve the code a bit? E.g. no side effect (line reading) in the condition of the while, promoting the magic number 50 to a private final static and so on.

if (str.toLowerCase(Locale.ENGLISH).contains("<records>")) {
return true;
}

i++;
}
return false;
}

/**
 * Parses an EndNote XML document and converts every record into a {@link BibEntry}.
 *
 * @param reader source of the XML document; must not be {@code null}
 * @return a result holding the converted entries, or an error result if the root element is not
 *         the expected {@code xml} element or the document cannot be parsed
 * @throws IOException declared by the overridden method; not thrown by this implementation directly
 */
@Override
public ParserResult importDatabase(BufferedReader reader) throws IOException {
    Objects.requireNonNull(reader);

    try {
        Object unmarshalledObject = unmarshallRoot(reader);

        if (!(unmarshalledObject instanceof Xml)) {
            return ParserResult.fromErrorMessage("File does not start with xml tag.");
        }

        Xml root = (Xml) unmarshalledObject;
        // A root element without a <records> child is treated as "nothing to import"
        // instead of failing with a NullPointerException.
        List<BibEntry> bibEntries = Optional.ofNullable(root.getRecords())
                                            .map(records -> records.getRecord())
                                            .orElse(Collections.emptyList())
                                            .stream()
                                            .map(this::parseRecord)
                                            .collect(Collectors.toList());

        return new ParserResult(bibEntries);
    } catch (JAXBException | XMLStreamException e) {
        LOGGER.debug("could not parse document", e);
        return ParserResult.fromError(e);
    }
}

/**
 * Advances a StAX reader to the first start element of the document and unmarshals it with JAXB.
 *
 * @param reader source of the XML document
 * @return the object JAXB created for the root element
 * @throws XMLStreamException if the XML stream cannot be created or read
 * @throws JAXBException if the unmarshaller cannot be created or unmarshalling fails
 */
private Object unmarshallRoot(BufferedReader reader) throws XMLStreamException, JAXBException {
    initUnmarshaller();

    XMLInputFactory xmlInputFactory = XMLInputFactory.newFactory();
    // Imported files are untrusted input: do not resolve external entities (XXE protection)
    xmlInputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
    XMLStreamReader xmlStreamReader = xmlInputFactory.createXMLStreamReader(reader);

    // Go to the root element
    while (!xmlStreamReader.isStartElement()) {
        xmlStreamReader.next();
    }

    return unmarshaller.unmarshal(xmlStreamReader);
}

/**
 * Creates the JAXB unmarshaller for the EndNote package on first use and keeps it for later calls.
 *
 * @throws JAXBException if the JAXB context or the unmarshaller cannot be created
 */
private void initUnmarshaller() throws JAXBException {
    if (unmarshaller != null) {
        // Already created by an earlier call
        return;
    }

    // Created lazily because this is expensive
    JAXBContext endnoteContext = JAXBContext.newInstance("org.jabref.logic.importer.fileformat.endnote");
    unmarshaller = endnoteContext.createUnmarshaller();
}

/**
 * Maps an EndNote reference type name to the matching biblatex entry type.
 * <p>
 * The name is trimmed and compared case-insensitively. Lower-casing uses a fixed locale so the
 * result does not depend on the platform default (e.g. the Turkish dotless i).
 *
 * @param refName reference type name as found in the EndNote record; must not be {@code null}
 * @return the matching entry type, or {@link BiblatexEntryTypes#ARTICLE} for unknown names
 */
private static BiblatexEntryType convertRefNameToType(String refName) {
    switch (refName.trim().toLowerCase(Locale.ENGLISH)) {
        case "artwork":
        case "generic":
            return BiblatexEntryTypes.MISC;
        case "electronic article":
            return BiblatexEntryTypes.ELECTRONIC;
        case "book section":
            return BiblatexEntryTypes.INBOOK;
        case "book":
            return BiblatexEntryTypes.BOOK;
        case "journal article":
            return BiblatexEntryTypes.ARTICLE;
        default:
            return BiblatexEntryTypes.ARTICLE;
    }
}

/**
 * Converts one EndNote record into a {@link BibEntry}.
 * <p>
 * Each field is only set when the corresponding element, its style child and the style's value
 * are all present in the record. Title, journal and ISBN are whitespace-normalized via
 * {@code clean}; note, abstract and DOI are trimmed; the remaining values are copied as they are.
 *
 * @param record the unmarshalled EndNote record
 * @return a new entry holding the data that could be extracted from {@code record}
 */
private BibEntry parseRecord(Record record) {
    BibEntry entry = new BibEntry();

    entry.setType(getType(record));
    // getAuthors joins the author names and thus yields an empty string (not null) when there
    // are none; do not write an empty author field in that case
    Optional.ofNullable(getAuthors(record))
            .filter(authors -> !authors.isEmpty())
            .ifPresent(value -> entry.setField(FieldName.AUTHOR, value));
    Optional.ofNullable(record.getTitles())
            .map(Titles::getTitle)
            .map(Title::getStyle)
            .map(Style::getvalue)
            .ifPresent(value -> entry.setField(FieldName.TITLE, clean(value)));
    // The secondary title is stored in the journal field
    Optional.ofNullable(record.getTitles())
            .map(Titles::getSecondaryTitle)
            .map(SecondaryTitle::getStyle)
            .map(Style::getvalue)
            .ifPresent(value -> entry.setField(FieldName.JOURNAL, clean(value)));
    Optional.ofNullable(record.getPages())
            .map(Pages::getStyle)
            .map(Style::getvalue)
            .ifPresent(value -> entry.setField(FieldName.PAGES, value));
    Optional.ofNullable(record.getNumber())
            .map(Number::getStyle)
            .map(Style::getvalue)
            .ifPresent(value -> entry.setField(FieldName.NUMBER, value));
    Optional.ofNullable(record.getVolume())
            .map(Volume::getStyle)
            .map(Style::getvalue)
            .ifPresent(value -> entry.setField(FieldName.VOLUME, value));
    Optional.ofNullable(record.getDates())
            .map(Dates::getYear)
            .map(Year::getStyle)
            .map(Style::getvalue)
            .ifPresent(value -> entry.setField(FieldName.YEAR, value));
    Optional.ofNullable(record.getNotes())
            .map(Notes::getStyle)
            .map(Style::getvalue)
            .ifPresent(value -> entry.setField(FieldName.NOTE, value.trim()));
    getUrl(record)
            .ifPresent(value -> entry.setField(FieldName.URL, value));
    entry.putKeywords(getKeywords(record), preferences.getKeywordSeparator());
    Optional.ofNullable(record.getAbstract())
            .map(Abstract::getStyle)
            .map(Style::getvalue)
            .ifPresent(value -> entry.setField(FieldName.ABSTRACT, value.trim()));
    entry.setFiles(getLinkedFiles(record));
    Optional.ofNullable(record.getIsbn())
            .map(Isbn::getStyle)
            .map(Style::getvalue)
            .ifPresent(value -> entry.setField(FieldName.ISBN, clean(value)));
    // The electronic resource number is stored in the DOI field
    Optional.ofNullable(record.getElectronicResourceNum())
            .map(ElectronicResourceNum::getStyle)
            .map(Style::getvalue)
            .ifPresent(doi -> entry.setField(FieldName.DOI, doi.trim()));

    return entry;
}

/**
 * Determines the entry type of a record from the name of its reference type.
 *
 * @param record the unmarshalled EndNote record
 * @return the converted type, or {@link BiblatexEntryTypes#ARTICLE} if the record has no
 *         reference type or the reference type has no name
 */
private BiblatexEntryType getType(Record record) {
    RefType refType = record.getRefType();
    if (refType == null) {
        return BiblatexEntryTypes.ARTICLE;
    }

    String refName = refType.getName();
    if (refName == null) {
        return BiblatexEntryTypes.ARTICLE;
    }

    return convertRefNameToType(refName);
}

/**
 * Collects the PDF URLs of a record as linked files.
 *
 * @param record the unmarshalled EndNote record
 * @return one linked file of type {@code "PDF"} with an empty description per PDF URL that
 *         has a value; empty if the record has no PDF URLs
 */
private List<LinkedFile> getLinkedFiles(Record record) {
    PdfUrls pdfUrls = Optional.ofNullable(record.getUrls())
                              .map(Urls::getPdfUrls)
                              .orElse(null);
    List<Url> candidates = (pdfUrls == null) ? Collections.emptyList() : pdfUrls.getUrl();

    return candidates.stream()
                     .map(this::getUrlValue)
                     // keep only URLs that actually carry a value
                     .filter(Optional::isPresent)
                     .map(Optional::get)
                     .map(location -> new LinkedFile("", location, "PDF"))
                     .collect(Collectors.toList());
}

/**
 * Finds the first related URL of a record that has a value.
 *
 * @param record the unmarshalled EndNote record
 * @return the cleaned value of the first related URL carrying one, or empty if there is none
 */
private Optional<String> getUrl(Record record) {
    RelatedUrls relatedUrls = Optional.ofNullable(record.getUrls())
                                      .map(Urls::getRelatedUrls)
                                      .orElse(null);
    if (relatedUrls == null) {
        return Optional.empty();
    }

    for (Url candidate : relatedUrls.getUrl()) {
        Optional<String> location = getUrlValue(candidate);
        if (location.isPresent()) {
            return location;
        }
    }
    return Optional.empty();
}

/**
 * Extracts the whitespace-normalized text of a URL element.
 *
 * @param url the URL element, may be {@code null}
 * @return the cleaned value, or empty if the element, its style or the style's value is missing
 */
private Optional<String> getUrlValue(Url url) {
    if (url == null) {
        return Optional.empty();
    }

    Style style = url.getStyle();
    if (style == null) {
        return Optional.empty();
    }

    String rawValue = style.getvalue();
    if (rawValue == null) {
        return Optional.empty();
    }

    return Optional.ofNullable(clean(rawValue));
}

/**
 * Collects the keyword texts of a record.
 * <p>
 * Keywords without a style child or without a value are skipped, so an incomplete keyword
 * does not abort the import with a {@link NullPointerException}.
 *
 * @param record the unmarshalled EndNote record
 * @return the keyword values in document order; empty if the record has no keywords element
 */
private List<String> getKeywords(Record record) {
    Keywords keywords = record.getKeywords();
    if (keywords == null) {
        return Collections.emptyList();
    }

    return keywords.getKeyword()
                   .stream()
                   .map(keyword -> keyword.getStyle())
                   .filter(Objects::nonNull)
                   .map(style -> style.getvalue())
                   .filter(Objects::nonNull)
                   .collect(Collectors.toList());
}

/**
 * Joins the author names of a record with {@code " and "}.
 * <p>
 * Authors without a style child or without a value are skipped, so an incomplete author
 * neither raises a {@link NullPointerException} nor ends up as the literal text "null".
 *
 * @param record the unmarshalled EndNote record
 * @return the joined author names; an empty string (never {@code null}) if there are none
 */
private String getAuthors(Record record) {
    Optional<Authors> authors = Optional.ofNullable(record.getContributors())
                                        .map(Contributors::getAuthors);
    return OptionalUtil.toStream(authors)
                       .flatMap(value -> value.getAuthor().stream())
                       .map(author -> author.getStyle())
                       .filter(Objects::nonNull)
                       .map(style -> style.getvalue())
                       .filter(Objects::nonNull)
                       .collect(Collectors.joining(" and "));
}

/**
 * Normalizes whitespace in a value: line breaks are replaced by a space via
 * {@code StringUtil.unifyLineBreaks}, surrounding whitespace is trimmed and runs of spaces
 * are collapsed into a single space.
 *
 * @param input the raw value
 * @return the normalized value
 */
private String clean(String input) {
    String singleLine = StringUtil.unifyLineBreaks(input, " ");
    String trimmed = singleLine.trim();
    return trimmed.replaceAll(" +", " ");
}

/**
 * Parses the entries of an EndNote XML document read from a stream as UTF-8.
 *
 * @param inputStream source of the XML document
 * @return the entries of the imported database; an empty list if an {@link IOException}
 *         occurred, in which case the error is logged
 * @throws ParseException declared by the overridden method
 */
@Override
public List<BibEntry> parseEntries(InputStream inputStream) throws ParseException {
    BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8));
    try {
        ParserResult result = importDatabase(reader);
        return result.getDatabase().getEntries();
    } catch (IOException e) {
        LOGGER.error(e.getLocalizedMessage(), e);
        return Collections.emptyList();
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ public class MedlineImporter extends Importer implements Parser {
private static final String KEYWORD_SEPARATOR = "; ";

private static final Locale ENGLISH = Locale.ENGLISH;
private Unmarshaller unmarshaller;

private static String join(List<String> list, String string) {
return Joiner.on(string).join(list);
Expand Down Expand Up @@ -141,17 +142,7 @@ public ParserResult importDatabase(BufferedReader reader) throws IOException {
List<BibEntry> bibItems = new ArrayList<>();

try {
JAXBContext context = JAXBContext.newInstance("org.jabref.logic.importer.fileformat.medline");
XMLInputFactory xmlInputFactory = XMLInputFactory.newFactory();
XMLStreamReader xmlStreamReader = xmlInputFactory.createXMLStreamReader(reader);

//go to the root element
while (!xmlStreamReader.isStartElement()) {
xmlStreamReader.next();
}

Unmarshaller unmarshaller = context.createUnmarshaller();
Object unmarshalledObject = unmarshaller.unmarshal(xmlStreamReader);
Object unmarshalledObject = unmarshallRoot(reader);

//check whether we have an article set, an article, a book article or a book article set
if (unmarshalledObject instanceof PubmedArticleSet) {
Expand Down Expand Up @@ -185,6 +176,28 @@ public ParserResult importDatabase(BufferedReader reader) throws IOException {
return new ParserResult(bibItems);
}

/**
 * Advances a StAX reader to the first start element of the document and unmarshals it with JAXB.
 *
 * @param reader source of the XML document
 * @return the object JAXB created for the root element
 * @throws JAXBException if the unmarshaller cannot be created or unmarshalling fails
 * @throws XMLStreamException if the XML stream cannot be created or read
 */
private Object unmarshallRoot(BufferedReader reader) throws JAXBException, XMLStreamException {
    initUmarshaller();

    XMLInputFactory xmlInputFactory = XMLInputFactory.newFactory();
    // Imported files are untrusted input: do not resolve external entities (XXE protection)
    xmlInputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
    XMLStreamReader xmlStreamReader = xmlInputFactory.createXMLStreamReader(reader);

    //go to the root element
    while (!xmlStreamReader.isStartElement()) {
        xmlStreamReader.next();
    }

    return unmarshaller.unmarshal(xmlStreamReader);
}

/**
 * Creates the JAXB unmarshaller for the Medline package on first use and keeps it for later calls.
 *
 * @throws JAXBException if the JAXB context or the unmarshaller cannot be created
 */
private void initUmarshaller() throws JAXBException {
    if (unmarshaller != null) {
        // Already created by an earlier call
        return;
    }

    // Created lazily because this is expensive
    JAXBContext medlineContext = JAXBContext.newInstance("org.jabref.logic.importer.fileformat.medline");
    unmarshaller = medlineContext.createUnmarshaller();
}

private void parseBookArticle(PubmedBookArticle currentArticle, List<BibEntry> bibItems) {
Map<String, String> fields = new HashMap<>();
if (currentArticle.getBookDocument() != null) {
Expand Down
5 changes: 3 additions & 2 deletions src/main/java/org/jabref/logic/util/FileType.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,9 @@ public enum FileType {
CITATION_STYLE(Localization.lang("%0 file", "CSL"), "csl"),
DOCBOOK(Localization.lang("%0 file", "Docbook 4.4"), "xml"),
DIN_1505(Localization.lang("%0 file", "DIN 1505"), "rtf"),
ENDNOTE(Localization.lang("%0 file", "Endnote/Refer"), "ref", "enw"),
ENDNOTE_TXT(Localization.lang("%0 file", "Endnote"), "txt"), //for export
ENDNOTE(Localization.lang("%0 file", "EndNote/Refer"), "ref", "enw"),
ENDNOTE_XML(Localization.lang("%0 file", "EndNote XML"), "xml"),
ENDNOTE_TXT(Localization.lang("%0 file", "EndNote"), "txt"), //for export
FREECITE(Localization.lang("%0 file", "FreeCite"), "txt", "xml"),
HARVARD_RTF(Localization.lang("%0 file", "Harvard"), "rtf"),
HTML_LIST(Localization.lang("%0 file", Localization.lang("HTML list")), "html"),
Expand Down
Loading