Skip to content

Commit

Permalink
Use JSON11 for handling long numerals (opensearch-project#6915)
Browse files Browse the repository at this point in the history
Signed-off-by: Miki <[email protected]>

(cherry picked from commit 6a20742)
Signed-off-by: Miki <[email protected]>
  • Loading branch information
AMoo-Miki committed Jun 5, 2024
1 parent eeeef3a commit c964dcb
Show file tree
Hide file tree
Showing 5 changed files with 886 additions and 843 deletions.
2 changes: 2 additions & 0 deletions changelogs/fragments/6915.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
feat:
- Use JSON11 for handling long numerals ([#6915](https://github.com/opensearch-project/OpenSearch-Dashboards/pull/6915))
5 changes: 3 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@
"**/jest-config": "npm:@amoo-miki/[email protected]",
"**/jest-jasmine2": "npm:@amoo-miki/[email protected]",
"**/joi/hoek": "npm:@amoo-miki/[email protected]",
"**/json11": "^1.1.2",
"**/json-schema": "^0.4.0",
"**/kind-of": ">=6.0.3",
"**/loader-utils": "^2.0.4",
Expand Down Expand Up @@ -153,7 +154,7 @@
"@hapi/vision": "^6.1.0",
"@hapi/wreck": "^17.1.0",
"@opensearch-project/opensearch": "^1.1.0",
"@opensearch-project/opensearch-next": "npm:@opensearch-project/opensearch@^2.6.0",
"@opensearch-project/opensearch-next": "npm:@opensearch-project/opensearch@^2.9.0",
"@opensearch/datemath": "5.0.3",
"@osd/ace": "1.0.0",
"@osd/analytics": "1.0.0",
Expand Down Expand Up @@ -239,7 +240,7 @@
"uuid": "3.3.2",
"whatwg-fetch": "^3.0.0",
"yauzl": "^2.10.0"

},
"devDependencies": {
"@babel/core": "^7.22.9",
Expand Down
1 change: 1 addition & 0 deletions packages/osd-std/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"private": true,
"sideEffects": false,
"dependencies": {
"json11": "^1.1.2",
"lodash": "^4.17.21"
},
"devDependencies": {
Expand Down
267 changes: 11 additions & 256 deletions packages/osd-std/src/json.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,260 +3,7 @@
* SPDX-License-Identifier: Apache-2.0
*/

/* In JavaScript, a `Number` is a 64-bit floating-point value which can store 16 digits. However, the
* serializer and deserializer will need to cater to numeric values generated by other languages which
* can have up to 19 digits. Native JSON parser and stringifier, incapable of handling the extra
* digits, corrupt the values, making them unusable.
*
* To work around this limitation, the deserializer converts long sequences of digits into strings and
* marks them before applying the parser. During the parsing, string values that begin with the mark
* are converted to `BigInt` values.
* Similarly, during stringification, the serializer converts `BigInt` values to marked strings and
* when done, it replaces them with plain numerals.
*
* `Number.MAX_SAFE_INTEGER`, 9,007,199,254,740,991, is the largest number that the native methods can
* parse and stringify, and any numeral greater than that would need to be translated using the
* workaround; all 17-digits or longer and only tail-end of the 16-digits need translation. It would
* be unfair to all the 16-digit numbers if the translation applied to `\d{16,}` only to cover the
* less than 10%. Hence, a RegExp is created to only match numerals too long to be a number.
*
* To make the explanation simpler, let's assume that MAX_SAFE_INTEGER is 8921 which has 4 digits.
* Starting from the right, we take each digit onwards, `[<start>-9]`:
* 1) 7922 - 7929: 792[2-9]\d{0}
* 2) 7930 - 7999: 79[3-9]\d{1}
* 9) 9 + 1 = 10 which results in a rollover; no need to do anything.
* 8) 9000 - 9999: [9-9]\d{3}
* Finally we add anything 5 digits or longer: `\d{5,}
*
* Note: A better solution would use AST but considering its performance penalty, RegExp is the next
* best thing.
*/
const maxIntAsString = String(Number.MAX_SAFE_INTEGER);
const maxIntLength = maxIntAsString.length;
// Sub-patterns for each digit
const longNumeralMatcherTokens = [`\\d{${maxIntAsString.length + 1},}`];
for (let i = 0; i < maxIntLength; i++) {
if (maxIntAsString[i] !== '9') {
longNumeralMatcherTokens.push(
maxIntAsString.substring(0, i) +
`[${parseInt(maxIntAsString[i], 10) + 1}-9]` +
`\\d{${maxIntLength - i - 1}}`
);
}
}

/* The matcher that looks for `": <numerals>, ...}` and `[..., <numeral>, ...]`
*
* The pattern starts by looking for `":` not immediately preceded by a `\`. That should be
* followed by any of the numeric sub-patterns. A comma, end of an array, end of an object, or
* the end of the input are the only acceptable elements after it.
*
* Note: This RegExp can result in false-positive hits on the likes of `{"key": "[ <numeral> ]"}` and
* those are cleaned out during parsing.
*/
const longNumeralMatcher = new RegExp(
`((?:\\[|,|(?<!\\\\)"\\s*:)\\s*)(-?(?:${longNumeralMatcherTokens.join('|')}))(\\s*)(?=,|}|]|$)`,
'g'
);

/* The characters with a highly unlikely chance of occurrence in strings, alone or in combination.
* These will be combined in various orders and lengths, to find a specific "marker" that is not
* present in the JSON string.
*/
const markerChars = ['෴', '߷', '֍'];

/* Generates an array of all combinations of `markerChars` with the requested length. */
const getMarkerChoices = (length: number): string[] => {
// coverage:ignore-line
if (!length || length < 0) return [];
const choices = [];
const arr = markerChars;
const arrLength = arr.length;
const temp = Array(length);

(function fill(pos, start) {
if (pos === length) return choices.push(temp.join(''));

for (let i = start; i < arrLength; i++) {
temp[pos] = arr[i];
fill(pos + 1, i);
}
})(0, 0);

return choices;
};

/* Experiments with different combinations of various lengths, until one is found to not be in
* the input string.
*/
const getMarker = (text: string): { marker: string; length: number } => {
let marker;
let length = 0;
do {
length++;
getMarkerChoices(length).some((markerChoice) => {
if (text.indexOf(markerChoice) === -1) {
marker = markerChoice;
return true;
}
});
} while (!marker);

return {
marker,
length,
};
};

const parseStringWithLongNumerals = (
text: string,
reviver?: ((this: any, key: string, value: any) => any) | null
): any => {
const { marker, length } = getMarker(text);

let hadException;
let obj;
let markedJSON = text.replace(longNumeralMatcher, `$1"${marker}$2"$3`);
const markedValueMatcher = new RegExp(`^${marker}-?\\d+$`);

/* Convert marked values to BigInt values.
* The `startsWith` is purely for performance, to avoid running `test` if not needed.
*/
const convertMarkedValues = (val: any) =>
typeof val === 'string' && val.startsWith(marker) && markedValueMatcher.test(val)
? BigInt(val.substring(length))
: val;

/* For better performance, instead of testing for existence of `reviver` on each value, two almost
* identical functions are used.
*/
const parseMarkedText = reviver
? (markedText: string) =>
JSON.parse(markedText, function (key, val) {
return reviver.call(this, key, convertMarkedValues(val));
})
: (markedText: string) => JSON.parse(markedText, (key, val) => convertMarkedValues(val));

/* RegExp cannot replace AST and the process of marking adds quotes. So, any false-positive hit
* will make the JSON string unparseable.
*
* To find those instances, we try to parse and watch for the location of any errors. If an error
* is caused by the marking, we remove that single marking and try again.
*/
try {
do {
try {
hadException = false;
obj = parseMarkedText(markedJSON);
} catch (e) {
hadException = true;
/* There are two types of exception objects that can be raised:
* 1) a textual message with the position that we need to parse
* i. Unexpected [token|string ...] at position ...
* ii. Expected ',' or ... after ... in JSON at position ...
* iii. expected ',' or ... after ... in object at line ... column ...
* 2) a proper object with lineNumber and columnNumber which we can use
* Note: this might refer to the part of the code that threw the exception but
* we will try it anyway; the regex is specific enough to not produce
* false-positives.
*/
let { lineNumber, columnNumber } = e;

if (typeof e?.message === 'string') {
/* Check for 1-i and 1-ii
* Finding "..."෴1111"..." inside a string value, the extra quotes throw a syntax error
* and the position points to " that is assumed to be the begining of a value.
*/
let match = e.message.match(/^(?:Un)?expected .*at position (\d+)(\D|$)/i);
if (match) {
lineNumber = 1;
// Add 1 to reach the marker
columnNumber = parseInt(match[1], 10) + 1;
} else {
/* Check for 1-iii
* Finding "...,"෴1111"..." inside a string value, the extra quotes throw a syntax error
* and the column number points to the marker after the " that is assumed to be terminating the
* value.
* PS: There are different versions of this error across browsers and platforms.
*/
// ToDo: Add functional tests for this path
match = e.message.match(/expected .*line (\d+) column (\d+)(\D|$)/i);
if (match) {
lineNumber = parseInt(match[1], 10);
columnNumber = parseInt(match[2], 10);
}
}
}

if (lineNumber < 1 || columnNumber < 2) {
/* The problem is not with this replacement.
* Note: This will never happen because the outer parse would have already thrown.
*/
// coverage:ignore-line
throw e;
}

/* We need to skip e.lineNumber - 1 number of `\n` occurrences.
* Then, we need to go to e.columnNumber - 2 to look for `"<mark>\d+"`; we need to `-1` to
* account for the quote but an additional `-1` is needed because columnNumber starts from 1.
*/
const re = new RegExp(
`^((?:.*\\n){${lineNumber - 1}}[^\\n]{${columnNumber - 2}})"${marker}(-?\\d+)"`
);
if (!re.test(markedJSON)) {
/* The exception is not caused by adding the marker.
* Note: This will never happen because the outer parse would have already thrown.
*/
// coverage:ignore-line
throw e;
}

// We have found a bad replacement; let's remove it.
markedJSON = markedJSON.replace(re, '$1$2');
}
} while (hadException);
} catch (ex) {
// If parsing of marked `text` fails, fallback to parsing the original `text`
obj = JSON.parse(text, reviver || undefined);
}

return obj;
};

const stringifyObjectWithBigInts = (
obj: any,
candidate: string,
replacer?: ((this: any, key: string, value: any) => any) | null,
space?: string | number
): string => {
const { marker } = getMarker(candidate);

/* The matcher that looks for "<marker><numerals>"
* Because we have made sure that `marker` was never present in the original object, we can
* carelessly assume every "<marker><numerals>" is due to our marking.
*/
const markedBigIntMatcher = new RegExp(`"${marker}(-?\\d+)"`, 'g');

/* Convert BigInt values to a string and mark them.
* Can't be bothered with Number values outside the safe range because they are already corrupted.
*
* For better performance, instead of testing for existence of `replacer` on each value, two almost
* identical functions are used.
*/
const addMarkerToBigInts = replacer
? function (this: any, key: string, val: any) {
// replacer is called before marking because marking changes the type
const newVal = replacer.call(this, key, val);
return typeof newVal === 'bigint' ? `${marker}${newVal.toString()}` : newVal;
}
: (key: string, val: any) => (typeof val === 'bigint' ? `${marker}${val.toString()}` : val);

return (
JSON.stringify(obj, addMarkerToBigInts, space)
// Replace marked substrings with just the numerals
.replace(markedBigIntMatcher, '$1')
);
};
import JSON11 from 'json11';

export const stringify = (
obj: any,
Expand Down Expand Up @@ -298,7 +45,14 @@ export const stringify = (
text = JSON.stringify(obj, checkForBigInts, space);

if (!numeralsAreNumbers) {
text = stringifyObjectWithBigInts(obj, text, replacer, space);
const temp = JSON11.stringify(obj, {
replacer,
space,
withBigInt: false,
quote: '"',
quoteNames: true,
});
if (temp) text = temp;
}

return text;
Expand Down Expand Up @@ -344,7 +98,8 @@ export const parse = (
obj = JSON.parse(text, checkForLargeNumerals);

if (!numeralsAreNumbers) {
obj = parseStringWithLongNumerals(text, reviver);
const temp = JSON11.parse(text, reviver, { withLongNumerals: true });
if (temp) obj = temp;
}

return obj;
Expand Down
Loading

0 comments on commit c964dcb

Please sign in to comment.