Skip to content

Commit

Permalink
Add MathML support when importing PubMed (#9963)
Browse files Browse the repository at this point in the history
* add mathml support for medline importer

* add changelog entry

* fix checkstyle issues

* remove remaining tabs

* update resource loading method
  • Loading branch information
aqurilla authored Jun 1, 2023
1 parent 8d1154e commit c7ada34
Show file tree
Hide file tree
Showing 16 changed files with 3,419 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve
- We added the possibility to automatically fetch entries when an ISBN is pasted on the main table. [#9864](https://github.com/JabRef/jabref/issues/9864)
- We added the option to disable the automatic linking of files in the entry editor [#5105](https://github.com/JabRef/jabref/issues/5105)
- We added the link icon for ISBNs in linked identifiers column. [#9819](https://github.com/JabRef/jabref/issues/9819)
- We added support for parsing MathML in the Medline importer. [#4273](https://github.com/JabRef/jabref/issues/4273)

### Changed

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import org.jabref.logic.importer.fileformat.medline.MeshHeading;
import org.jabref.logic.importer.fileformat.medline.OtherId;
import org.jabref.logic.importer.fileformat.medline.PersonalNameSubject;
import org.jabref.logic.importer.util.MathMLParser;
import org.jabref.logic.util.StandardFileType;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.Date;
Expand Down Expand Up @@ -1082,6 +1083,9 @@ private void handleTextElement(XMLStreamReader reader, List<String> textList, St
if (isStartXMLEvent(reader)) {
String elementName = reader.getName().getLocalPart();
switch (elementName) {
case "math" -> {
result.append(MathMLParser.parse(reader));
}
case "sup", "sub" -> {
reader.next();
if (isCharacterXMLEvent(reader)) {
Expand Down
69 changes: 69 additions & 0 deletions src/main/java/org/jabref/logic/importer/util/MathMLParser.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
package org.jabref.logic.importer.util;

import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Objects;

import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import javax.xml.transform.Result;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Converts MathML markup read from a StAX stream into LaTeX by applying the
 * XSLT stylesheet located at {@link #XSLT_FILE_PATH} on the classpath.
 */
public class MathMLParser {
    private static final Logger LOGGER = LoggerFactory.getLogger(MathMLParser.class);
    private static final String XSLT_FILE_PATH = "/xslt/mathml_latex/mmltex.xsl";

    /**
     * Parses the MathML element into its corresponding
     * LaTeX representation, using an XSLT transformation file.
     * <p>
     * The element is first serialized via {@link StaxParser#getXMLContent(XMLStreamReader)}
     * and then transformed. Checked failures (XML extraction, I/O, URI conversion,
     * transformation) are logged at debug level and the placeholder
     * {@code <Unsupported MathML expression>} is returned instead.
     *
     * @param reader the stream reader
     * @return Returns the LaTeX representation, or the placeholder text if the conversion failed
     * @throws NullPointerException if the stylesheet cannot be found on the classpath
     */
    public static String parse(XMLStreamReader reader) {
        String latexResult = "<Unsupported MathML expression>";

        try {
            // extract XML content
            String xmlContent = StaxParser.getXMLContent(reader);
            Source xmlSource = new StreamSource(new StringReader(xmlContent));

            URL xsltResource = Objects.requireNonNull(
                    MathMLParser.class.getResource(XSLT_FILE_PATH),
                    "XSLT stylesheet not found on classpath: " + XSLT_FILE_PATH);

            // convert to LaTeX using XSLT file; the stream is closed once the transformation is done
            try (var xsltStream = xsltResource.openStream()) {
                // the system id is passed along with the stream so that relative references
                // inside the stylesheet have a base URI to resolve against
                Source xsltSource = new StreamSource(xsltStream, xsltResource.toURI().toASCIIString());

                TransformerFactory transformerFactory = TransformerFactory.newInstance();
                Transformer transformer = transformerFactory.newTransformer(xsltSource);

                StringWriter writer = new StringWriter();
                Result result = new StreamResult(writer);
                transformer.transform(xmlSource, result);

                latexResult = writer.toString();
            }
        } catch (XMLStreamException e) {
            LOGGER.debug("An exception occurred when getting XML content", e);
        } catch (IOException e) {
            LOGGER.debug("An I/O exception occurred", e);
        } catch (URISyntaxException e) {
            LOGGER.debug("XSLT Source URI invalid", e);
        } catch (TransformerException e) {
            LOGGER.debug("An exception occurred during transformation", e);
        }

        return latexResult;
    }
}

128 changes: 128 additions & 0 deletions src/main/java/org/jabref/logic/importer/util/StaxParser.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
package org.jabref.logic.importer.util;

import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

/**
 * Helper for turning a run of StAX events back into an XML string.
 */
public class StaxParser {

    /**
     * Extracts the XML content inside the first
     * encountered parent tag, including tag elements,
     * attributes, namespace, prefix and contained text.
     * <p>
     * The reader is advanced to the first {@code START_ELEMENT} (if it is not already
     * positioned on one) and is left on the matching {@code END_ELEMENT}. Text and
     * attribute values are re-escaped so that the returned string is well-formed even
     * when the source contained references such as {@code &lt;} or {@code &amp;}.
     * Each text event is trimmed individually.
     *
     * @param reader the stream reader
     * @return Returns the serialized element, or an empty string if no element is found
     * @throws XMLStreamException if the underlying reader fails while advancing
     */
    public static String getXMLContent(XMLStreamReader reader) throws XMLStreamException {
        // skip over START DOCUMENT and anything else preceding the first element
        while (reader.getEventType() != XMLStreamConstants.START_ELEMENT) {
            if (!reader.hasNext()) {
                return "";
            }
            reader.next();
        }

        StringBuilder content = new StringBuilder();
        content.append(getXMLStartTag(reader, true));

        // number of currently open elements, counting the parent itself
        int depth = 1;

        while (depth > 0 && reader.hasNext()) {
            switch (reader.next()) {
                case XMLStreamConstants.START_ELEMENT -> {
                    depth++;
                    content.append(getXMLStartTag(reader, false));
                }
                case XMLStreamConstants.END_ELEMENT -> {
                    // depth reaching zero means this is the closing tag of the parent
                    depth--;
                    content.append(getXMLEndTag(reader));
                }
                case XMLStreamConstants.CHARACTERS,
                     XMLStreamConstants.SPACE,
                     XMLStreamConstants.ENTITY_REFERENCE -> content.append(getXMLText(reader));
                case XMLStreamConstants.CDATA -> content.append(getXMLCData(reader));
                case XMLStreamConstants.COMMENT -> content.append(getXMLComment(reader));
                case XMLStreamConstants.PROCESSING_INSTRUCTION -> content.append(getXMLProcessingInstruction(reader));
                default -> {
                    // other event types carry nothing that belongs in the element content
                }
            }
        }

        return content.toString().trim();
    }

    /**
     * Builds the start tag for the element the reader is positioned on.
     *
     * @param reader          the stream reader, positioned on a START_ELEMENT
     * @param addNamespaceURI whether to declare the element's own namespace on the tag
     */
    private static String getXMLStartTag(XMLStreamReader reader, boolean addNamespaceURI) {
        StringBuilder startTag = new StringBuilder();

        String prefix = reader.getPrefix();
        boolean hasPrefix = prefix != null && !prefix.isBlank();

        startTag.append("<")
                .append(hasPrefix ? prefix + ":" : "")
                .append(reader.getLocalName());

        String namespaceURI = reader.getNamespaceURI();
        if (addNamespaceURI && namespaceURI != null) {
            startTag.append(" xmlns")
                    .append(hasPrefix ? ":" + prefix : "")
                    .append("=\"")
                    .append(escapeAttribute(namespaceURI))
                    .append("\"");
        }

        // attributes are written with their local name only
        for (int i = 0; i < reader.getAttributeCount(); i++) {
            startTag.append(" ")
                    .append(reader.getAttributeLocalName(i))
                    .append("=\"")
                    .append(escapeAttribute(reader.getAttributeValue(i)))
                    .append("\"");
        }

        startTag.append(">");
        return startTag.toString();
    }

    /**
     * Builds the end tag for the element the reader is positioned on.
     */
    private static String getXMLEndTag(XMLStreamReader reader) {
        String prefix = reader.getPrefix();
        boolean hasPrefix = prefix != null && !prefix.isBlank();

        return "</" + (hasPrefix ? prefix + ":" : "") + reader.getLocalName() + ">";
    }

    private static String getXMLCData(XMLStreamReader reader) {
        return "<![CDATA[" + reader.getText() + "]]>";
    }

    private static String getXMLComment(XMLStreamReader reader) {
        return "<!--" + reader.getText() + "-->";
    }

    private static String getXMLProcessingInstruction(XMLStreamReader reader) {
        String data = reader.getPIData();
        // avoid emitting the literal text "null" when the instruction carries no data
        return "<?" + reader.getPITarget() + (data == null ? "" : " " + data) + "?>";
    }

    /**
     * Returns the trimmed, escaped text of the current event, or an empty string
     * when the reader reports no text for it.
     */
    private static String getXMLText(XMLStreamReader reader) {
        String text = reader.getText();
        if (text == null) {
            return "";
        }
        return escapeText(text.trim());
    }

    /**
     * Escapes character data so it can be placed between tags.
     * The ampersand has to be replaced first so already produced references are not escaped twice.
     */
    private static String escapeText(String raw) {
        return raw.replace("&", "&amp;")
                  .replace("<", "&lt;")
                  .replace("]]>", "]]&gt;");
    }

    /**
     * Escapes a value so it can be placed inside a double-quoted attribute.
     */
    private static String escapeAttribute(String raw) {
        if (raw == null) {
            return "";
        }
        return raw.replace("&", "&amp;")
                  .replace("<", "&lt;")
                  .replace("\"", "&quot;");
    }
}
97 changes: 97 additions & 0 deletions src/main/resources/xslt/mathml_latex/README
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
README for the XSLT MathML Library

XSLT MathML Library is a set of XSLT stylesheets to transform
MathML 2.0 to LaTeX.

For more information, see
http://www.raleigh.ru/MathML/mmltex/index.php?lang=en

Manifest
--------

README this file
mmltex.xsl
tokens.xsl
glayout.xsl
scripts.xsl
tables.xsl
entities.xsl
cmarkup.xsl

Use
---

There are two ways of using the library:

* Use a local copy of the library.

1. Download the distribution (see below).

2. Unpack the distribution, using unzip.

3. In your stylesheet import or include either the main
stylesheet, mmltex.xsl, or the stylesheet module you
wish to use, such as tokens.xsl. This example assumes
that the distribution has been extracted into the same
directory as your own stylesheet:

<xsl:import href="mmltex.xsl"/>

* Import or include either the main stylesheet, or the
stylesheet module you wish to use, directly from the library
website; http://www.raleigh.ru/MathML/mmltex/. For example:

<xsl:import href="http://www.raleigh.ru/MathML/mmltex/mmltex.xsl"/>

Obtaining The Library
---------------------

The XSLT MathML Library is available for download as:

* Zip file: http://www.raleigh.ru/MathML/mmltex/mmltex.zip

Copyright
---------

Copyright (C) 2001, 2002 Vasil Yaroshevich

Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the ``Software''), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

Except as contained in this notice, the names of individuals
credited with contribution to this software shall not be used in
advertising or otherwise to promote the sale, use or other
dealings in this Software without prior written authorization
from the individuals in question.

Any stylesheet derived from this Software that is publically
distributed will be identified with a different name and the
version strings in any derived Software will be changed so that
no possibility of confusion between the derived package and this
Software will exist.

Warranty
--------

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL NORMAN WALSH OR ANY OTHER
CONTRIBUTOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

Contacting the Author
---------------------

These stylesheets are maintained by Vasil Yaroshevich, <[email protected]>.
Loading

0 comments on commit c7ada34

Please sign in to comment.