Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support to optionally allow surrogate pair entities (#165) #174

Merged
merged 13 commits into from
Jan 16, 2024
Merged
24 changes: 24 additions & 0 deletions src/main/java/com/ctc/wstx/api/ReaderConfig.java
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,8 @@ public final class ReaderConfig
final static int PROP_MAX_ENTITY_DEPTH = 68;

final static int PROP_MAX_DTD_DEPTH = 69;

final static int PROP_RESOLVE_ENTITY_SURROGATE_PAIRS = 70;

/*
////////////////////////////////////////////////
Expand Down Expand Up @@ -361,6 +363,8 @@ public final class ReaderConfig
PROP_UNDECLARED_ENTITY_RESOLVER);
sProperties.put(WstxInputProperties.P_BASE_URL,
PROP_BASE_URL);
sProperties.put(WstxInputProperties.P_RESOLVE_ENTITY_SURROGATE_PAIRS,
PROP_RESOLVE_ENTITY_SURROGATE_PAIRS);
sProperties.put(WstxInputProperties.P_INPUT_PARSING_MODE,
PROP_INPUT_PARSING_MODE);
}
Expand Down Expand Up @@ -418,6 +422,11 @@ public final class ReaderConfig
* references
*/
protected URL mBaseURL;

/**
* Resolve surrogate pairs of entities (2 code-points as one target character)
*/
protected boolean mResolveEntitySurrogatePairs = Boolean.FALSE;
cowtowncoder marked this conversation as resolved.
Show resolved Hide resolved

/**
* Parsing mode can be changed from the default xml compliant
Expand Down Expand Up @@ -583,6 +592,7 @@ public ReaderConfig createNonShared(SymbolTable sym)
rc.mMaxEntityDepth = mMaxEntityDepth;
rc.mMaxEntityCount = mMaxEntityCount;
rc.mMaxDtdDepth = mMaxDtdDepth;
rc.mResolveEntitySurrogatePairs = mResolveEntitySurrogatePairs;
if (mSpecialProperties != null) {
int len = mSpecialProperties.length;
Object[] specProps = new Object[len];
Expand Down Expand Up @@ -791,6 +801,10 @@ public XMLResolver getUndeclaredEntityResolver() {
}

public URL getBaseURL() { return mBaseURL; }

public boolean isResolvingEntitySurrogatePairs() {
return mResolveEntitySurrogatePairs;
}

public WstxInputProperties.ParsingMode getInputParsingMode() {
return mParsingMode;
Expand Down Expand Up @@ -1074,6 +1088,10 @@ public void setUndeclaredEntityResolver(XMLResolver r) {
}

public void setBaseURL(URL baseURL) { mBaseURL = baseURL; }

public void setResolveEntitySurrogatePairs(boolean resolveEntitySurrogatePairs) {
mResolveEntitySurrogatePairs = resolveEntitySurrogatePairs;
}

public void setInputParsingMode(WstxInputProperties.ParsingMode mode) {
mParsingMode = mode;
Expand Down Expand Up @@ -1533,6 +1551,8 @@ public Object getProperty(int id)
return getUndeclaredEntityResolver();
case PROP_BASE_URL:
return getBaseURL();
case PROP_RESOLVE_ENTITY_SURROGATE_PAIRS:
return isResolvingEntitySurrogatePairs();
case PROP_INPUT_PARSING_MODE:
return getInputParsingMode();

Expand Down Expand Up @@ -1757,6 +1777,10 @@ public boolean setProperty(String propName, int id, Object value)
setBaseURL(u);
}
break;

case PROP_RESOLVE_ENTITY_SURROGATE_PAIRS:
setResolveEntitySurrogatePairs(ArgUtil.convertToBoolean(propName, value));
break;

case PROP_INPUT_PARSING_MODE:
setInputParsingMode((WstxInputProperties.ParsingMode) value);
Expand Down
7 changes: 7 additions & 0 deletions src/main/java/com/ctc/wstx/api/WstxInputProperties.java
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,13 @@ public final class WstxInputProperties
* DTD subset).
*/
public final static String P_BASE_URL = "com.ctc.wstx.baseURL";

/**
* Property of type {@link java.lang.Boolean}, that will allow parsing
* high unicode characters written by surrogate pairs (2 code points)
* Default set as Boolean.FALSE, because it is not a standard behavior
*/
cowtowncoder marked this conversation as resolved.
Show resolved Hide resolved
public final static String P_RESOLVE_ENTITY_SURROGATE_PAIRS = "com.ctc.wstx.resolveEntitySurrogatePairs";
cowtowncoder marked this conversation as resolved.
Show resolved Hide resolved

// // // Alternate parsing modes

Expand Down
142 changes: 107 additions & 35 deletions src/main/java/com/ctc/wstx/sr/StreamScanner.java
Original file line number Diff line number Diff line change
Expand Up @@ -1188,55 +1188,121 @@ protected int resolveSimpleEntity(boolean checkStd)
if (c == '#') {
c = buf[ptr++];
int value = 0;
// Turns to 0 when first entity is read (must be a high surrogate char)
int pairValue = -1;
int inputLen = mInputEnd;
if (c == 'x') { // hex
while (ptr < inputLen) {
c = buf[ptr++];
if (c == ';') {
break;
}
value = value << 4;
if (c <= '9' && c >= '0') {
value += (c - '0');
} else if (c >= 'a' && c <= 'f') {
value += (10 + (c - 'a'));
} else if (c >= 'A' && c <= 'F') {
value += (10 + (c - 'A'));
} else {
mInputPtr = ptr; // so error points to correct char
throwUnexpectedChar(c, "; expected a hex digit (0-9a-fA-F).");
boolean isValueHighSurrogate = false;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, so, I assume this works, but is quite hard to follow. What I'd suggest is to split code in such a way that the original decoding for the first entity is within initial method; and the rest of processing in separate method, only called if:

  1. Entity surrogate pair decoding is enabled
  2. First entity is valid first surrogate

This probably leads to slight duplication of code (decoding loop) but allows removing the outer loop since each method only decodes one entity.
And if duplicate looks too ugly decoding loops can be extracted into separate method too.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll try to refactor this part in my free time (maybe on the weekend), but my first assumptions was to write functional code with minimal changes in the class architecture, to not conflict your rules and coding style. I haven't seen before any contributor guide, so I wanted to be careful and do this most safety way. Please give me a time, because I don't have much free time after hours

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Magmaruss understood! I could try my hand at change, but was worried I might get some parts wrong easily.
And my suggestion was not meant as critique; I much appreciate the contribution and it follows all reasonable expectations (esp. in absence of much documentation on what is expected of contributions).


/*
* Loop used only to continue iteration to search surrogate pair.
* If simple entity, it does only one iteration
*/
while (value == 0
|| (mConfig.isResolvingEntitySurrogatePairs()
&& value >= 0xD800 && value <= 0xDBFF && pairValue <= 0)) {

int tmpValue = pairValue >= 0 ? pairValue : value;

if (c == 'x') { // hex
while (ptr < inputLen) {
c = buf[ptr++];
if (c == ';') {
break;
}

tmpValue = tmpValue << 4;
if (c <= '9' && c >= '0') {
tmpValue += (c - '0');
} else if (c >= 'a' && c <= 'f') {
tmpValue += (10 + (c - 'a'));
} else if (c >= 'A' && c <= 'F') {
tmpValue += (10 + (c - 'A'));
} else {
mInputPtr = ptr; // so error points to correct char
throwUnexpectedChar(c, "; expected a hex digit (0-9a-fA-F).");
}
/* Need to check for overflow; easiest to do right as
* it happens...
*/
if (tmpValue > MAX_UNICODE_CHAR) {
reportUnicodeOverflow();
}
}
/* Need to check for overflow; easiest to do right as
* it happens...
*/
if (value > MAX_UNICODE_CHAR) {
reportUnicodeOverflow();
} else { // numeric (decimal)
while (c != ';') {
if (c <= '9' && c >= '0') {
tmpValue = (tmpValue * 10) + (c - '0');
// Overflow?
if (tmpValue > MAX_UNICODE_CHAR) {
reportUnicodeOverflow();
}

} else {
mInputPtr = ptr; // so error points to correct char
throwUnexpectedChar(c, "; expected a decimal number.");
}
if (ptr >= inputLen) {
break;
}
c = buf[ptr++];
}
}
} else { // numeric (decimal)
while (c != ';') {
if (c <= '9' && c >= '0') {
value = (value * 10) + (c - '0');
// Overflow?
if (value > MAX_UNICODE_CHAR) {
reportUnicodeOverflow();

if (pairValue >= 0) {
pairValue = tmpValue;
} else {
value = tmpValue;
}

isValueHighSurrogate = value >= 0xD800 && value <= 0xDBFF;

/*
* If resolving entity surrogate pairs enabled and if current entity
* is in range of high surrogate value, try to find surrogate pair
*/
if (isValueHighSurrogate && mConfig.isResolvingEntitySurrogatePairs()
&& c == ';' && pairValue < 0 && ptr + 1 < inputLen) {
c = buf[ptr++];

if (c == '&' && ptr + 1 < inputLen) {
c = buf[ptr++];

if (c == '#' && ptr + 1 < inputLen) {
c = buf[ptr++];
pairValue = 0;
continue;
} else {
reportNoSurrogatePair(value);
}
} else {
mInputPtr = ptr; // so error points to correct char
throwUnexpectedChar(c, "; expected a decimal number.");
reportNoSurrogatePair(value);
}
if (ptr >= inputLen) {
break;
}
c = buf[ptr++];
} else if (isValueHighSurrogate
&& mConfig.isResolvingEntitySurrogatePairs()
&& ptr + 1 >= inputLen) {
reportNoSurrogatePair(value);
}
}

/* We get here either if we got it all, OR if we ran out of
* input in current buffer.
*/
if (c == ';') { // got the full thing
mInputPtr = ptr;
validateChar(value);

if (mConfig.isResolvingEntitySurrogatePairs() && pairValue > 0) {
/*
cowtowncoder marked this conversation as resolved.
Show resolved Hide resolved
* If pair value is not in range of low surrogate values, then throw an error
*/
if (pairValue < 0xDC00 || pairValue > 0xDFFF) {
reportNoSurrogatePair(value);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would be better if we reported both leading surrogate and second non-surrogate entity value?

}

value = 0x10000 + (value - 0xD800) * 0x400 + (pairValue - 0xDC00);
} else {
validateChar(value);
}

return value;
}

Expand Down Expand Up @@ -2464,6 +2530,12 @@ private void reportIllegalChar(int value)
{
throwParseError("Illegal character entity: expansion character (code 0x{0}", Integer.toHexString(value), null);
}

private void reportNoSurrogatePair(int highSurrogate)
throws XMLStreamException
{
throwParseError("Cannot find surrogate pair: high surrogate character (code 0x{0})", Integer.toHexString(highSurrogate), null);
}

protected void verifyLimit(String type, long maxValue, long currentValue)
throws XMLStreamException
Expand Down
10 changes: 10 additions & 0 deletions src/test/java/org/codehaus/stax/test/BaseStaxTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import javax.xml.stream.*;
import javax.xml.stream.events.XMLEvent;

import com.ctc.wstx.api.WstxInputProperties;

/* Latest updates:
*
* - 07-Sep-2007, TSa: Updating based on latest understanding of
Expand Down Expand Up @@ -275,6 +277,14 @@ protected static boolean setSupportExternalEntities(XMLInputFactory f, boolean s
return false;
}
}

protected static void setResolveEntitySurrogatePairs(XMLInputFactory f, boolean state)
throws XMLStreamException
{
Boolean b = state ? Boolean.TRUE : Boolean.FALSE;
f.setProperty(WstxInputProperties.P_RESOLVE_ENTITY_SURROGATE_PAIRS, b);
assertEquals(b, f.getProperty(WstxInputProperties.P_RESOLVE_ENTITY_SURROGATE_PAIRS));
}

protected static void setResolver(XMLInputFactory f, XMLResolver resolver)
throws XMLStreamException
Expand Down
Loading