-
-
Notifications
You must be signed in to change notification settings - Fork 2.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add MathML support when importing PubMed (#9963)
* add mathml support for medline importer * add changelog entry * fix checkstyle issues * remove remaining tabs * update resource loading method
- Loading branch information
Showing
16 changed files
with
3,419 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
69 changes: 69 additions & 0 deletions
69
src/main/java/org/jabref/logic/importer/util/MathMLParser.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
package org.jabref.logic.importer.util; | ||
|
||
import java.io.IOException; | ||
import java.io.StringReader; | ||
import java.io.StringWriter; | ||
import java.net.URISyntaxException; | ||
import java.net.URL; | ||
import java.util.Objects; | ||
|
||
import javax.xml.stream.XMLStreamException; | ||
import javax.xml.stream.XMLStreamReader; | ||
import javax.xml.transform.Result; | ||
import javax.xml.transform.Source; | ||
import javax.xml.transform.Transformer; | ||
import javax.xml.transform.TransformerException; | ||
import javax.xml.transform.TransformerFactory; | ||
import javax.xml.transform.stream.StreamResult; | ||
import javax.xml.transform.stream.StreamSource; | ||
|
||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
public class MathMLParser { | ||
private static final Logger LOGGER = LoggerFactory.getLogger(MathMLParser.class); | ||
private static final String XSLT_FILE_PATH = "/xslt/mathml_latex/mmltex.xsl"; | ||
|
||
/** | ||
* Parses the MathML element into its corresponding | ||
* LaTeX representation, using an XSLT transformation file | ||
* | ||
* @param reader the stream reader | ||
* @return Returns the LaTeX representation | ||
*/ | ||
public static String parse(XMLStreamReader reader) { | ||
String xmlContent = ""; | ||
String latexResult = "<Unsupported MathML expression>"; | ||
|
||
try { | ||
// extract XML content | ||
xmlContent = StaxParser.getXMLContent(reader); | ||
|
||
// convert to LaTeX using XSLT file | ||
Source xmlSource = new StreamSource(new StringReader(xmlContent)); | ||
|
||
URL xsltResource = MathMLParser.class.getResource(XSLT_FILE_PATH); | ||
Source xsltSource = new StreamSource(Objects.requireNonNull(xsltResource).openStream(), xsltResource.toURI().toASCIIString()); | ||
|
||
TransformerFactory transformerFactory = TransformerFactory.newInstance(); | ||
Transformer transformer = transformerFactory.newTransformer(xsltSource); | ||
|
||
StringWriter writer = new StringWriter(); | ||
Result result = new StreamResult(writer); | ||
transformer.transform(xmlSource, result); | ||
|
||
latexResult = writer.getBuffer().toString(); | ||
} catch (XMLStreamException e) { | ||
LOGGER.debug("An exception occurred when getting XML content", e); | ||
} catch (IOException e) { | ||
LOGGER.debug("An I/O exception occurred", e); | ||
} catch (URISyntaxException e) { | ||
LOGGER.debug("XSLT Source URI invalid", e); | ||
} catch (TransformerException e) { | ||
LOGGER.debug("An exception occurred during transformation", e); | ||
} | ||
|
||
return latexResult; | ||
} | ||
} | ||
|
128 changes: 128 additions & 0 deletions
128
src/main/java/org/jabref/logic/importer/util/StaxParser.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
package org.jabref.logic.importer.util; | ||
|
||
import javax.xml.stream.XMLStreamConstants; | ||
import javax.xml.stream.XMLStreamException; | ||
import javax.xml.stream.XMLStreamReader; | ||
|
||
public class StaxParser {

    /**
     * Extracts the XML content inside the first
     * encountered parent tag, including tag elements,
     * attributes, namespace, prefix and contained text.
     * <p>
     * Events preceding the first start element (start of document, comments, processing
     * instructions, ...) are skipped. Text and attribute values are re-escaped so that the
     * returned string is well-formed XML again. Each text event is trimmed individually, as is
     * the overall result.
     *
     * @param reader the stream reader
     * @return Returns the XML of the first encountered element, from its start tag up to and
     *         including its matching end tag
     * @throws XMLStreamException if the underlying stream fails or contains no start element
     */
    public static String getXMLContent(XMLStreamReader reader) throws XMLStreamException {
        // skip over START DOCUMENT and any other event in front of the first start tag
        while (reader.getEventType() != XMLStreamConstants.START_ELEMENT && reader.hasNext()) {
            reader.next();
        }
        if (reader.getEventType() != XMLStreamConstants.START_ELEMENT) {
            throw new XMLStreamException("No start element found to extract XML content from");
        }

        StringBuilder content = new StringBuilder();

        String parentTag = reader.getLocalName();
        // number of currently open elements that share the parent's local name
        int depth = 1;
        content.append(getXMLStartTag(reader, true));

        while (reader.hasNext()) {
            int event = reader.next();
            if (event == XMLStreamConstants.START_ELEMENT) {
                if (reader.getLocalName().equals(parentTag)) {
                    // nested tag of same type
                    depth++;
                }

                // append the start tag
                content.append(getXMLStartTag(reader, false));
            } else if (event == XMLStreamConstants.END_ELEMENT) {
                // append the end tag
                content.append(getXMLEndTag(reader));

                if (reader.getLocalName().equals(parentTag)) {
                    depth--;

                    if (depth == 0) {
                        // reached the closing tag of the first parent tag
                        break;
                    }
                }
            } else if (event == XMLStreamConstants.CHARACTERS) {
                content.append(getXMLText(reader));
            } else if (event == XMLStreamConstants.CDATA) {
                content.append(getXMLCData(reader));
            } else if (event == XMLStreamConstants.COMMENT) {
                content.append(getXMLComment(reader));
            } else if (event == XMLStreamConstants.PROCESSING_INSTRUCTION) {
                content.append(getXMLProcessingInstruction(reader));
            } else if (event == XMLStreamConstants.SPACE || event == XMLStreamConstants.ENTITY_REFERENCE) {
                content.append(getXMLText(reader));
            }
        }

        return content.toString().trim();
    }

    /**
     * Builds the start tag for the element the reader is currently positioned on.
     *
     * @param reader          the stream reader, positioned on a start element
     * @param addNamespaceURI whether to declare the element's own namespace on the tag
     * @return the serialized start tag
     */
    private static String getXMLStartTag(XMLStreamReader reader, boolean addNamespaceURI) {
        StringBuilder startTag = new StringBuilder();

        String prefix = reader.getPrefix();
        boolean hasPrefix = prefix != null && !prefix.isBlank();

        startTag.append("<")
                .append(hasPrefix ? prefix + ":" : "")
                .append(reader.getName().getLocalPart());

        String namespaceURI = reader.getNamespaceURI();
        if (addNamespaceURI && namespaceURI != null) {
            startTag.append(" xmlns")
                    .append(hasPrefix ? ":" + prefix : "")
                    .append("=\"")
                    .append(escapeAttributeValue(namespaceURI))
                    .append("\"");
        }

        for (int i = 0; i < reader.getAttributeCount(); i++) {
            // the reader hands out decoded values, so they have to be escaped again
            startTag.append(" ")
                    .append(reader.getAttributeLocalName(i))
                    .append("=\"")
                    .append(escapeAttributeValue(reader.getAttributeValue(i)))
                    .append("\"");
        }

        startTag.append(">");
        return startTag.toString();
    }

    /**
     * Builds the end tag for the element the reader is currently positioned on.
     */
    private static String getXMLEndTag(XMLStreamReader reader) {
        StringBuilder endTag = new StringBuilder();
        String prefix = reader.getPrefix();

        endTag.append("</")
              .append(prefix != null && !prefix.isBlank() ? prefix + ":" : "")
              .append(reader.getName().getLocalPart())
              .append(">");

        return endTag.toString();
    }

    /**
     * Wraps the current text in a CDATA section.
     */
    private static String getXMLCData(XMLStreamReader reader) {
        return "<![CDATA[" + reader.getText() + "]]>";
    }

    /**
     * Wraps the current text in comment delimiters.
     */
    private static String getXMLComment(XMLStreamReader reader) {
        return "<!--" + reader.getText() + "-->";
    }

    /**
     * Serializes the current processing instruction from its target and data.
     */
    private static String getXMLProcessingInstruction(XMLStreamReader reader) {
        return "<?" + reader.getPITarget() + " " + reader.getPIData() + "?>";
    }

    /**
     * Returns the trimmed text of the current event with markup characters escaped.
     */
    private static String getXMLText(XMLStreamReader reader) {
        return escapeText(reader.getText().trim());
    }

    /**
     * Escapes the characters that may not appear literally in XML character data.
     * The ampersand is replaced first so that the entities produced here are not escaped twice.
     */
    private static String escapeText(String text) {
        return text.replace("&", "&amp;")
                   .replace("<", "&lt;")
                   .replace(">", "&gt;");
    }

    /**
     * Escapes the characters that may not appear literally in a double-quoted attribute value.
     */
    private static String escapeAttributeValue(String value) {
        return value.replace("&", "&amp;")
                    .replace("<", "&lt;")
                    .replace("\"", "&quot;");
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
README for the XSLT MathML Library | ||
|
||
XSLT MathML Library is a set of XSLT stylesheets to transform | ||
MathML 2.0 to LaTeX. | ||
|
||
For more information, see | ||
http://www.raleigh.ru/MathML/mmltex/index.php?lang=en | ||
|
||
Manifest | ||
-------- | ||
|
||
README this file | ||
mmltex.xsl | ||
tokens.xsl | ||
glayout.xsl | ||
scripts.xsl | ||
tables.xsl | ||
entities.xsl | ||
cmarkup.xsl | ||
|
||
Use | ||
--- | ||
|
||
There are two ways of using the library: | ||
|
||
* Use a local copy of the library. | ||
|
||
1. Download the distribution (see below). | ||
|
||
2. Unpack the distribution, using unzip. | ||
|
||
3. In your stylesheet import or include either the main | ||
stylesheet, mmltex.xsl, or the stylesheet module you | ||
wish to use, such as tokens.xsl. This example assumes | ||
that the distribution has been extracted into the same | ||
directory as your own stylesheet: | ||
|
||
<xsl:import href="mmltex.xsl"/> | ||
|
||
* Import or include either the main stylesheet, or the | ||
stylesheet module you wish to use, directly from the library | ||
website; http://www.raleigh.ru/MathML/mmltex/. For example: | ||
|
||
<xsl:import href="http://www.raleigh.ru/MathML/mmltex/mmltex.xsl"/> | ||
|
||
Obtaining The Library | ||
--------------------- | ||
|
||
The XSLT MathML Library is available for download as: | ||
|
||
* Zip file: http://www.raleigh.ru/MathML/mmltex/mmltex.zip | ||
|
||
Copyright | ||
--------- | ||
|
||
Copyright (C) 2001, 2002 Vasil Yaroshevich | ||
|
||
Permission is hereby granted, free of charge, to any person | ||
obtaining a copy of this software and associated documentation | ||
files (the ``Software''), to deal in the Software without | ||
restriction, including without limitation the rights to use, | ||
copy, modify, merge, publish, distribute, sublicense, and/or | ||
sell copies of the Software, and to permit persons to whom the | ||
Software is furnished to do so, subject to the following | ||
conditions: | ||
|
||
The above copyright notice and this permission notice shall be | ||
included in all copies or substantial portions of the Software. | ||
|
||
Except as contained in this notice, the names of individuals | ||
credited with contribution to this software shall not be used in | ||
advertising or otherwise to promote the sale, use or other | ||
dealings in this Software without prior written authorization | ||
from the individuals in question. | ||
|
||
Any stylesheet derived from this Software that is publically | ||
distributed will be identified with a different name and the | ||
version strings in any derived Software will be changed so that | ||
no possibility of confusion between the derived package and this | ||
Software will exist. | ||
|
||
Warranty | ||
-------- | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES | ||
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
NONINFRINGEMENT. IN NO EVENT SHALL NORMAN WALSH OR ANY OTHER | ||
CONTRIBUTOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, | ||
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | ||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | ||
OTHER DEALINGS IN THE SOFTWARE. | ||
|
||
Contacting the Author | ||
--------------------- | ||
|
||
These stylesheets are maintained by Vasil Yaroshevich, <[email protected]>. |
Oops, something went wrong.