Skip to content

Commit

Permalink
feat(NODE-5861): optimize parsing basic latin strings (#642)
Browse files Browse the repository at this point in the history
  • Loading branch information
nbbeeken authored Jan 31, 2024
1 parent 44bec19 commit cdb779b
Show file tree
Hide file tree
Showing 12 changed files with 275 additions and 45 deletions.
2 changes: 1 addition & 1 deletion rollup.config.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ const tsConfig = {
module: 'esnext',
moduleResolution: 'node',
removeComments: true,
lib: ['es2021'],
lib: ['es2021', 'ES2022.Error'],
importHelpers: false,
noEmitHelpers: false,
noEmitOnError: true,
Expand Down
4 changes: 2 additions & 2 deletions src/binary.ts
Original file line number Diff line number Diff line change
Expand Up @@ -191,8 +191,8 @@ export class Binary extends BSONValue {
if (encoding === 'hex') return ByteUtils.toHex(this.buffer);
if (encoding === 'base64') return ByteUtils.toBase64(this.buffer);
if (encoding === 'utf8' || encoding === 'utf-8')
return ByteUtils.toUTF8(this.buffer, 0, this.buffer.byteLength);
return ByteUtils.toUTF8(this.buffer, 0, this.buffer.byteLength);
return ByteUtils.toUTF8(this.buffer, 0, this.buffer.byteLength, false);
return ByteUtils.toUTF8(this.buffer, 0, this.buffer.byteLength, false);
}

/** @internal */
Expand Down
6 changes: 3 additions & 3 deletions src/error.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import { BSON_MAJOR_VERSION } from './constants';
* @public
* @category Error
*
* `BSONError` objects are thrown when BSON ecounters an error.
* `BSONError` objects are thrown when BSON encounters an error.
*
* This is the parent class for all the other errors thrown by this library.
*/
Expand All @@ -23,8 +23,8 @@ export class BSONError extends Error {
return 'BSONError';
}

constructor(message: string) {
super(message);
/**
 * @param message - Description of the failure, forwarded unchanged to the `Error` constructor
 * @param options - Optional settings forwarded unchanged to the `Error` constructor;
 * `cause` may carry the underlying error that triggered this one
 */
constructor(message: string, options?: { cause?: unknown }) {
super(message, options);
}

/**
Expand Down
41 changes: 10 additions & 31 deletions src/parser/deserializer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ function deserializeObject(
if (i >= buffer.byteLength) throw new BSONError('Bad BSON Document: illegal CString');

// Represents the key
const name = isArray ? arrayIndex++ : ByteUtils.toUTF8(buffer, index, i);
const name = isArray ? arrayIndex++ : ByteUtils.toUTF8(buffer, index, i, false);

// shouldValidateKey is true if the key should be validated, false otherwise
let shouldValidateKey = true;
Expand Down Expand Up @@ -266,7 +266,7 @@ function deserializeObject(
) {
throw new BSONError('bad string length in bson');
}
value = getValidatedString(buffer, index, index + stringSize - 1, shouldValidateKey);
value = ByteUtils.toUTF8(buffer, index, index + stringSize - 1, shouldValidateKey);
index = index + stringSize;
} else if (elementType === constants.BSON_DATA_OID) {
const oid = ByteUtils.allocate(12);
Expand Down Expand Up @@ -476,7 +476,7 @@ function deserializeObject(
// If we are at the end of the buffer there is a problem with the document
if (i >= buffer.length) throw new BSONError('Bad BSON Document: illegal CString');
// Return the C string
const source = ByteUtils.toUTF8(buffer, index, i);
const source = ByteUtils.toUTF8(buffer, index, i, false);
// Create the regexp
index = i + 1;

Expand All @@ -489,7 +489,7 @@ function deserializeObject(
// If we are at the end of the buffer there is a problem with the document
if (i >= buffer.length) throw new BSONError('Bad BSON Document: illegal CString');
// Return the C string
const regExpOptions = ByteUtils.toUTF8(buffer, index, i);
const regExpOptions = ByteUtils.toUTF8(buffer, index, i, false);
index = i + 1;

// For each option add the corresponding one for javascript
Expand Down Expand Up @@ -521,7 +521,7 @@ function deserializeObject(
// If we are at the end of the buffer there is a problem with the document
if (i >= buffer.length) throw new BSONError('Bad BSON Document: illegal CString');
// Return the C string
const source = ByteUtils.toUTF8(buffer, index, i);
const source = ByteUtils.toUTF8(buffer, index, i, false);
index = i + 1;

// Get the start search index
Expand All @@ -533,7 +533,7 @@ function deserializeObject(
// If we are at the end of the buffer there is a problem with the document
if (i >= buffer.length) throw new BSONError('Bad BSON Document: illegal CString');
// Return the C string
const regExpOptions = ByteUtils.toUTF8(buffer, index, i);
const regExpOptions = ByteUtils.toUTF8(buffer, index, i, false);
index = i + 1;

// Set the object
Expand All @@ -551,7 +551,7 @@ function deserializeObject(
) {
throw new BSONError('bad string length in bson');
}
const symbol = getValidatedString(buffer, index, index + stringSize - 1, shouldValidateKey);
const symbol = ByteUtils.toUTF8(buffer, index, index + stringSize - 1, shouldValidateKey);
value = promoteValues ? symbol : new BSONSymbol(symbol);
index = index + stringSize;
} else if (elementType === constants.BSON_DATA_TIMESTAMP) {
Expand Down Expand Up @@ -587,7 +587,7 @@ function deserializeObject(
) {
throw new BSONError('bad string length in bson');
}
const functionString = getValidatedString(
const functionString = ByteUtils.toUTF8(
buffer,
index,
index + stringSize - 1,
Expand Down Expand Up @@ -626,7 +626,7 @@ function deserializeObject(
}

// Javascript function
const functionString = getValidatedString(
const functionString = ByteUtils.toUTF8(
buffer,
index,
index + stringSize - 1,
Expand Down Expand Up @@ -678,7 +678,7 @@ function deserializeObject(
throw new BSONError('Invalid UTF-8 string in BSON document');
}
}
const namespace = ByteUtils.toUTF8(buffer, index, index + stringSize - 1);
const namespace = ByteUtils.toUTF8(buffer, index, index + stringSize - 1, false);
// Update parse index position
index = index + stringSize;

Expand Down Expand Up @@ -728,24 +728,3 @@ function deserializeObject(

return object;
}

/**
 * Decodes the bytes in `[start, end)` of `buffer` as UTF-8 and, when asked to,
 * rejects input that is not well-formed.
 *
 * Full validation only runs when the decoded text contains U+FFFD: that character
 * is either the decoder's substitute for a malformed sequence or a legitimately
 * encoded replacement character, and `validateUtf8` tells the two apart.
 *
 * @param buffer - Bytes holding the encoded string
 * @param start - Index of the first byte to decode
 * @param end - Index one past the last byte to decode
 * @param shouldValidateUtf8 - When true, malformed UTF-8 causes a throw
 * @returns The decoded string
 * @throws BSONError when validation is requested and `validateUtf8` rejects the bytes
 */
function getValidatedString(
  buffer: Uint8Array,
  start: number,
  end: number,
  shouldValidateUtf8: boolean
) {
  const decoded = ByteUtils.toUTF8(buffer, start, end);
  if (!shouldValidateUtf8) {
    return decoded;
  }

  const replacementCharacter = '\ufffd';
  if (decoded.includes(replacementCharacter) && !validateUtf8(buffer, start, end)) {
    throw new BSONError('Invalid UTF-8 string in BSON document');
  }
  return decoded;
}
4 changes: 2 additions & 2 deletions src/utils/byte_utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ export type ByteUtils = {
toHex: (buffer: Uint8Array) => string;
/** Create a Uint8Array containing utf8 code units from a string */
fromUTF8: (text: string) => Uint8Array;
/** Create a string from utf8 code units */
toUTF8: (buffer: Uint8Array, start: number, end: number) => string;
/** Create a string from utf8 code units. When fatal=true an error is thrown if the UTF-8 bytes are invalid; when fatal=false replacement characters are inserted instead. */
toUTF8: (buffer: Uint8Array, start: number, end: number, fatal: boolean) => string;
/** Get the utf8 code unit count from a string if it were to be transformed to utf8 */
utf8ByteLength: (input: string) => number;
/** Encode UTF8 bytes generated from `source` string into `destination` at byteOffset. Returns the number of bytes encoded. */
Expand Down
61 changes: 61 additions & 0 deletions src/utils/latin.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
/**
 * This function is an optimization for small basic latin strings.
 * @internal
 * @remarks
 * ### Important characteristics:
 * - If the uint8array is empty, or the requested range is empty, this function returns an empty string
 * - `end` is clamped to the length of the uint8array (as `Buffer.toString` and `Uint8Array.slice` do),
 *   so indexes past the end of the buffer are never turned into characters
 * - If `start` is negative this function returns null and leaves the interpretation to the caller's decoder
 * - If the byteLength of the string is 1, 2, or 3 we invoke String.fromCharCode and manually offset into the buffer
 * - If the byteLength of the string is less than or equal to 20 an array of bytes is built and `String.fromCharCode` is called with the result
 * - If the byteLength of the string is greater than 20 this function returns null
 * - If any byte is greater than 127 this function returns null
 *
 * @param uint8array - A sequence of bytes that may contain basic latin characters
 * @param start - The start index from which to search the uint8array
 * @param end - The index to stop searching the uint8array (exclusive)
 * @returns string if all bytes are within the basic latin range, otherwise null
 */
export function tryLatin(uint8array: Uint8Array, start: number, end: number): string | null {
  if (uint8array.length === 0) {
    return '';
  }

  // Negative offsets mean different things to Buffer.toString and Uint8Array.slice,
  // so do not guess here: let the caller fall back to its platform decoder.
  if (start < 0) {
    return null;
  }

  // Reading past the end of a typed array yields `undefined`, which would slip through
  // the range checks below and be emitted as U+0000. Clamp so only real bytes are read.
  const clampedEnd = Math.min(end, uint8array.length);

  const stringByteLength = clampedEnd - start;
  if (stringByteLength <= 0) {
    return '';
  }

  if (stringByteLength > 20) {
    return null;
  }

  if (stringByteLength === 1 && uint8array[start] < 128) {
    return String.fromCharCode(uint8array[start]);
  }

  if (stringByteLength === 2 && uint8array[start] < 128 && uint8array[start + 1] < 128) {
    return String.fromCharCode(uint8array[start]) + String.fromCharCode(uint8array[start + 1]);
  }

  if (
    stringByteLength === 3 &&
    uint8array[start] < 128 &&
    uint8array[start + 1] < 128 &&
    uint8array[start + 2] < 128
  ) {
    return (
      String.fromCharCode(uint8array[start]) +
      String.fromCharCode(uint8array[start + 1]) +
      String.fromCharCode(uint8array[start + 2])
    );
  }

  // General path; it also catches the 1-3 byte cases that contained a non-latin byte.
  const latinBytes: number[] = [];
  for (let i = start; i < clampedEnd; i++) {
    const byte = uint8array[i];
    if (byte > 127) {
      return null;
    }
    latinBytes.push(byte);
  }

  return String.fromCharCode(...latinBytes);
}
23 changes: 21 additions & 2 deletions src/utils/node_byte_utils.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import { BSONError } from '../error';
import { validateUtf8 } from '../validate_utf8';
import { tryLatin } from './latin';

type NodeJsEncoding = 'base64' | 'hex' | 'utf8' | 'binary';
type NodeJsBuffer = ArrayBufferView &
Expand Down Expand Up @@ -125,8 +127,25 @@ export const nodeJsByteUtils = {
return Buffer.from(text, 'utf8');
},

toUTF8(buffer: Uint8Array, start: number, end: number): string {
return nodeJsByteUtils.toLocalBufferType(buffer).toString('utf8', start, end);
/**
 * Decode the bytes in `[start, end)` as UTF-8.
 *
 * Ranges of at most 20 bytes are first offered to `tryLatin`; anything it declines
 * is decoded with `Buffer.toString`. With `fatal` set, a U+FFFD in the decoded text
 * triggers `validateUtf8`, and a failed validation throws.
 */
toUTF8(buffer: Uint8Array, start: number, end: number, fatal: boolean): string {
  if (end - start <= 20) {
    const latin = tryLatin(buffer, start, end);
    if (latin != null) {
      return latin;
    }
  }

  const decoded = nodeJsByteUtils.toLocalBufferType(buffer).toString('utf8', start, end);

  // TODO(NODE-4930): Insufficiently strict BSON UTF8 validation
  const containsReplacement = fatal && decoded.includes('\ufffd');
  if (containsReplacement && !validateUtf8(buffer, start, end)) {
    throw new BSONError('Invalid UTF-8 string in BSON document');
  }

  return decoded;
},

utf8ByteLength(input: string): number {
Expand Down
17 changes: 15 additions & 2 deletions src/utils/web_byte_utils.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { BSONError } from '../error';
import { tryLatin } from './latin';

type TextDecoder = {
readonly encoding: string;
Expand Down Expand Up @@ -172,8 +173,20 @@ export const webByteUtils = {
return new TextEncoder().encode(text);
},

toUTF8(uint8array: Uint8Array, start: number, end: number): string {
return new TextDecoder('utf8', { fatal: false }).decode(uint8array.slice(start, end));
/**
 * Decode the bytes in `[start, end)` as UTF-8.
 *
 * Ranges of at most 20 bytes are first offered to `tryLatin`; anything it declines
 * is decoded with `TextDecoder`. With `fatal` set, any error raised while decoding
 * is rethrown as a `BSONError` carrying the original error as its `cause`.
 */
toUTF8(uint8array: Uint8Array, start: number, end: number, fatal: boolean): string {
  if (end - start <= 20) {
    const latin = tryLatin(uint8array, start, end);
    if (latin != null) {
      return latin;
    }
  }

  if (!fatal) {
    return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end));
  }

  try {
    return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end));
  } catch (cause) {
    throw new BSONError('Invalid UTF-8 string in BSON document', { cause });
  }
},

utf8ByteLength(input: string): number {
Expand Down
42 changes: 40 additions & 2 deletions test/node/byte_utils.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -400,19 +400,34 @@ const fromUTF8Tests: ByteUtilTest<'fromUTF8'>[] = [
const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [
{
name: 'should create utf8 string from buffer input',
inputs: [Buffer.from('abc\u{1f913}', 'utf8')],
inputs: [Buffer.from('abc\u{1f913}', 'utf8'), 0, 7, false],
expectation({ output, error }) {
expect(error).to.be.null;
expect(output).to.deep.equal(Buffer.from('abc\u{1f913}', 'utf8').toString('utf8'));
}
},
{
name: 'should return empty string for empty buffer input',
inputs: [Buffer.alloc(0)],
inputs: [Buffer.alloc(0), 0, 1, false],
expectation({ output, error }) {
expect(error).to.be.null;
expect(output).to.be.a('string').with.lengthOf(0);
}
},
{
name: 'should throw an error if fatal is set and string is invalid',
inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, true],
expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{
name: 'should insert replacement character fatal is false and string is invalid',
inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, false],
expectation({ error, output }) {
expect(error).to.not.exist;
expect(output).to.equal('abc\uFFFD');
}
}
];
const utf8ByteLengthTests: ByteUtilTest<'utf8ByteLength'>[] = [
Expand Down Expand Up @@ -596,6 +611,29 @@ describe('ByteUtils', () => {
});
});

// Verifies that short ASCII-only input is decoded without reaching the platform decoders
// (Buffer.toString on Node.js, TextDecoder.decode on the web implementation).
describe('toUTF8 basic latin optimization', () => {
afterEach(() => {
// Undo the spies installed by each test so they do not leak into other tests
sinon.restore();
});

context('Given a basic latin string', () => {
it('should not invoke Buffer.toString', () => {
// 'abcdef' is 6 bytes, all below 128
const buffer = Buffer.from('abcdef', 'utf8');
const spy = sinon.spy(buffer, 'toString');
nodeJsByteUtils.toUTF8(buffer, 0, 6, false);
expect(spy).to.not.have.been.called;
});

it('should not invoke TextDecoder.decode', () => {
const utf8Bytes = Buffer.from('abcdef', 'utf8');
// Wrap the same bytes in a plain Uint8Array view for the web implementation
const buffer = new Uint8Array(utf8Bytes.buffer, utf8Bytes.byteOffset, utf8Bytes.byteLength);
// Spy on the prototype so a decode call on any TextDecoder instance is observed
const spy = sinon.spy(TextDecoder.prototype, 'decode');
webByteUtils.toUTF8(buffer, 0, 6, false);
expect(spy).to.not.have.been.called;
});
});
});

describe('randomBytes fallback case when crypto is not present', () => {
describe('web', function () {
let bsonWithNoCryptoCtx;
Expand Down
1 change: 1 addition & 0 deletions test/node/release.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ const REQUIRED_FILES = [
'src/utils/byte_utils.ts',
'src/utils/node_byte_utils.ts',
'src/utils/web_byte_utils.ts',
'src/utils/latin.ts',
'src/validate_utf8.ts',
'vendor/base64/base64.js',
'vendor/base64/package.json',
Expand Down
Loading

0 comments on commit cdb779b

Please sign in to comment.