feat(NODE-5861): optimize parsing basic latin strings (#642)

mongodb · Jan 31, 2024 · cdb779b · cdb779b
1 parent 44bec19
commit cdb779b
Show file tree

Hide file tree

Showing 12 changed files with 275 additions and 45 deletions.
diff --git a/rollup.config.mjs b/rollup.config.mjs
@@ -13,7 +13,7 @@ const tsConfig = {
   module: 'esnext',
   moduleResolution: 'node',
   removeComments: true,
-  lib: ['es2021'],
+  lib: ['es2021', 'ES2022.Error'],
   importHelpers: false,
   noEmitHelpers: false,
   noEmitOnError: true,

diff --git a/src/binary.ts b/src/binary.ts
@@ -191,8 +191,8 @@ export class Binary extends BSONValue {
     if (encoding === 'hex') return ByteUtils.toHex(this.buffer);
     if (encoding === 'base64') return ByteUtils.toBase64(this.buffer);
     if (encoding === 'utf8' || encoding === 'utf-8')
-      return ByteUtils.toUTF8(this.buffer, 0, this.buffer.byteLength);
-    return ByteUtils.toUTF8(this.buffer, 0, this.buffer.byteLength);
+      return ByteUtils.toUTF8(this.buffer, 0, this.buffer.byteLength, false);
+    return ByteUtils.toUTF8(this.buffer, 0, this.buffer.byteLength, false);
   }
 
   /** @internal */

diff --git a/src/error.ts b/src/error.ts
@@ -4,7 +4,7 @@ import { BSON_MAJOR_VERSION } from './constants';
  * @public
  * @category Error
  *
- * `BSONError` objects are thrown when BSON ecounters an error.
+ * `BSONError` objects are thrown when BSON encounters an error.
  *
  * This is the parent class for all the other errors thrown by this library.
  */
@@ -23,8 +23,8 @@ export class BSONError extends Error {
     return 'BSONError';
   }
 
-  constructor(message: string) {
-    super(message);
+  constructor(message: string, options?: { cause?: unknown }) {
+    super(message, options);
   }
 
   /**

diff --git a/src/parser/deserializer.ts b/src/parser/deserializer.ts
@@ -236,7 +236,7 @@ function deserializeObject(
     if (i >= buffer.byteLength) throw new BSONError('Bad BSON Document: illegal CString');
 
     // Represents the key
-    const name = isArray ? arrayIndex++ : ByteUtils.toUTF8(buffer, index, i);
+    const name = isArray ? arrayIndex++ : ByteUtils.toUTF8(buffer, index, i, false);
 
     // shouldValidateKey is true if the key should be validated, false otherwise
     let shouldValidateKey = true;
@@ -266,7 +266,7 @@ function deserializeObject(
       ) {
         throw new BSONError('bad string length in bson');
       }
-      value = getValidatedString(buffer, index, index + stringSize - 1, shouldValidateKey);
+      value = ByteUtils.toUTF8(buffer, index, index + stringSize - 1, shouldValidateKey);
       index = index + stringSize;
     } else if (elementType === constants.BSON_DATA_OID) {
       const oid = ByteUtils.allocate(12);
@@ -476,7 +476,7 @@ function deserializeObject(
       // If are at the end of the buffer there is a problem with the document
       if (i >= buffer.length) throw new BSONError('Bad BSON Document: illegal CString');
       // Return the C string
-      const source = ByteUtils.toUTF8(buffer, index, i);
+      const source = ByteUtils.toUTF8(buffer, index, i, false);
       // Create the regexp
       index = i + 1;
 
@@ -489,7 +489,7 @@ function deserializeObject(
       // If are at the end of the buffer there is a problem with the document
       if (i >= buffer.length) throw new BSONError('Bad BSON Document: illegal CString');
       // Return the C string
-      const regExpOptions = ByteUtils.toUTF8(buffer, index, i);
+      const regExpOptions = ByteUtils.toUTF8(buffer, index, i, false);
       index = i + 1;
 
       // For each option add the corresponding one for javascript
@@ -521,7 +521,7 @@ function deserializeObject(
       // If are at the end of the buffer there is a problem with the document
       if (i >= buffer.length) throw new BSONError('Bad BSON Document: illegal CString');
       // Return the C string
-      const source = ByteUtils.toUTF8(buffer, index, i);
+      const source = ByteUtils.toUTF8(buffer, index, i, false);
       index = i + 1;
 
       // Get the start search index
@@ -533,7 +533,7 @@ function deserializeObject(
       // If are at the end of the buffer there is a problem with the document
       if (i >= buffer.length) throw new BSONError('Bad BSON Document: illegal CString');
       // Return the C string
-      const regExpOptions = ByteUtils.toUTF8(buffer, index, i);
+      const regExpOptions = ByteUtils.toUTF8(buffer, index, i, false);
       index = i + 1;
 
       // Set the object
@@ -551,7 +551,7 @@ function deserializeObject(
       ) {
         throw new BSONError('bad string length in bson');
       }
-      const symbol = getValidatedString(buffer, index, index + stringSize - 1, shouldValidateKey);
+      const symbol = ByteUtils.toUTF8(buffer, index, index + stringSize - 1, shouldValidateKey);
       value = promoteValues ? symbol : new BSONSymbol(symbol);
       index = index + stringSize;
     } else if (elementType === constants.BSON_DATA_TIMESTAMP) {
@@ -587,7 +587,7 @@ function deserializeObject(
       ) {
         throw new BSONError('bad string length in bson');
       }
-      const functionString = getValidatedString(
+      const functionString = ByteUtils.toUTF8(
         buffer,
         index,
         index + stringSize - 1,
@@ -626,7 +626,7 @@ function deserializeObject(
       }
 
       // Javascript function
-      const functionString = getValidatedString(
+      const functionString = ByteUtils.toUTF8(
         buffer,
         index,
         index + stringSize - 1,
@@ -678,7 +678,7 @@ function deserializeObject(
           throw new BSONError('Invalid UTF-8 string in BSON document');
         }
       }
-      const namespace = ByteUtils.toUTF8(buffer, index, index + stringSize - 1);
+      const namespace = ByteUtils.toUTF8(buffer, index, index + stringSize - 1, false);
       // Update parse index position
       index = index + stringSize;
 
@@ -728,24 +728,3 @@ function deserializeObject(
 
   return object;
 }
-
-function getValidatedString(
-  buffer: Uint8Array,
-  start: number,
-  end: number,
-  shouldValidateUtf8: boolean
-) {
-  const value = ByteUtils.toUTF8(buffer, start, end);
-  // if utf8 validation is on, do the check
-  if (shouldValidateUtf8) {
-    for (let i = 0; i < value.length; i++) {
-      if (value.charCodeAt(i) === 0xfffd) {
-        if (!validateUtf8(buffer, start, end)) {
-          throw new BSONError('Invalid UTF-8 string in BSON document');
-        }
-        break;
-      }
-    }
-  }
-  return value;
-}
diff --git a/src/utils/byte_utils.ts b/src/utils/byte_utils.ts
@@ -25,8 +25,8 @@ export type ByteUtils = {
   toHex: (buffer: Uint8Array) => string;
   /** Create a Uint8Array containing utf8 code units from a string */
   fromUTF8: (text: string) => Uint8Array;
-  /** Create a string from utf8 code units */
-  toUTF8: (buffer: Uint8Array, start: number, end: number) => string;
+  /** Create a string from utf8 code units, fatal=true will throw an error if UTF-8 bytes are invalid, fatal=false will insert replacement characters */
+  toUTF8: (buffer: Uint8Array, start: number, end: number, fatal: boolean) => string;
   /** Get the utf8 code unit count from a string if it were to be transformed to utf8 */
   utf8ByteLength: (input: string) => number;
   /** Encode UTF8 bytes generated from `source` string into `destination` at byteOffset. Returns the number of bytes encoded. */

diff --git a/src/utils/latin.ts b/src/utils/latin.ts
@@ -0,0 +1,61 @@
+/**
+ * This function is an optimization for small basic latin strings.
+ * @internal
+ * @remarks
+ * ### Important characteristics:
+ * - If the uint8array is empty or the distance between start and end is 0 this function returns an empty string
+ * - If the byteLength of the string is 1, 2, or 3 we invoke String.fromCharCode and manually offset into the buffer
+ * - If the byteLength of the string is less than or equal to 20 an array of bytes is built and `String.fromCharCode` is called with those bytes spread as arguments
+ * - If any byte is greater than 127, or the byteLength of the string is greater than 20, this function returns null
+ *
+ * @param uint8array - A sequence of bytes that may contain basic latin characters
+ * @param start - The start index from which to search the uint8array
+ * @param end - The index (exclusive) at which to stop searching the uint8array
+ * @returns string if all bytes are within the basic latin range, otherwise null
+ */
+export function tryLatin(uint8array: Uint8Array, start: number, end: number): string | null {
+  if (uint8array.length === 0) {
+    return '';
+  }
+
+  const stringByteLength = end - start;
+  if (stringByteLength === 0) {
+    return '';
+  }
+
+  if (stringByteLength > 20) {
+    return null;
+  }
+
+  if (stringByteLength === 1 && uint8array[start] < 128) {
+    return String.fromCharCode(uint8array[start]);
+  }
+
+  if (stringByteLength === 2 && uint8array[start] < 128 && uint8array[start + 1] < 128) {
+    return String.fromCharCode(uint8array[start]) + String.fromCharCode(uint8array[start + 1]);
+  }
+
+  if (
+    stringByteLength === 3 &&
+    uint8array[start] < 128 &&
+    uint8array[start + 1] < 128 &&
+    uint8array[start + 2] < 128
+  ) {
+    return (
+      String.fromCharCode(uint8array[start]) +
+      String.fromCharCode(uint8array[start + 1]) +
+      String.fromCharCode(uint8array[start + 2])
+    );
+  }
+
+  const latinBytes = [];
+  for (let i = start; i < end; i++) {
+    const byte = uint8array[i];
+    if (byte > 127) {
+      return null;
+    }
+    latinBytes.push(byte);
+  }
+
+  return String.fromCharCode(...latinBytes);
+}
diff --git a/src/utils/node_byte_utils.ts b/src/utils/node_byte_utils.ts
@@ -1,4 +1,6 @@
 import { BSONError } from '../error';
+import { validateUtf8 } from '../validate_utf8';
+import { tryLatin } from './latin';
 
 type NodeJsEncoding = 'base64' | 'hex' | 'utf8' | 'binary';
 type NodeJsBuffer = ArrayBufferView &
@@ -125,8 +127,25 @@ export const nodeJsByteUtils = {
     return Buffer.from(text, 'utf8');
   },
 
-  toUTF8(buffer: Uint8Array, start: number, end: number): string {
-    return nodeJsByteUtils.toLocalBufferType(buffer).toString('utf8', start, end);
+  toUTF8(buffer: Uint8Array, start: number, end: number, fatal: boolean): string {
+    const basicLatin = end - start <= 20 ? tryLatin(buffer, start, end) : null;
+    if (basicLatin != null) {
+      return basicLatin;
+    }
+
+    const string = nodeJsByteUtils.toLocalBufferType(buffer).toString('utf8', start, end);
+    if (fatal) {
+      // TODO(NODE-4930): Insufficiently strict BSON UTF8 validation
+      for (let i = 0; i < string.length; i++) {
+        if (string.charCodeAt(i) === 0xfffd) {
+          if (!validateUtf8(buffer, start, end)) {
+            throw new BSONError('Invalid UTF-8 string in BSON document');
+          }
+          break;
+        }
+      }
+    }
+    return string;
   },
 
   utf8ByteLength(input: string): number {

diff --git a/src/utils/web_byte_utils.ts b/src/utils/web_byte_utils.ts
@@ -1,4 +1,5 @@
 import { BSONError } from '../error';
+import { tryLatin } from './latin';
 
 type TextDecoder = {
   readonly encoding: string;
@@ -172,8 +173,20 @@ export const webByteUtils = {
     return new TextEncoder().encode(text);
   },
 
-  toUTF8(uint8array: Uint8Array, start: number, end: number): string {
-    return new TextDecoder('utf8', { fatal: false }).decode(uint8array.slice(start, end));
+  toUTF8(uint8array: Uint8Array, start: number, end: number, fatal: boolean): string {
+    const basicLatin = end - start <= 20 ? tryLatin(uint8array, start, end) : null;
+    if (basicLatin != null) {
+      return basicLatin;
+    }
+
+    if (fatal) {
+      try {
+        return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end));
+      } catch (cause) {
+        throw new BSONError('Invalid UTF-8 string in BSON document', { cause });
+      }
+    }
+    return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end));
   },
 
   utf8ByteLength(input: string): number {

diff --git a/test/node/byte_utils.test.ts b/test/node/byte_utils.test.ts
@@ -400,19 +400,34 @@ const fromUTF8Tests: ByteUtilTest<'fromUTF8'>[] = [
 const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [
   {
     name: 'should create utf8 string from buffer input',
-    inputs: [Buffer.from('abc\u{1f913}', 'utf8')],
+    inputs: [Buffer.from('abc\u{1f913}', 'utf8'), 0, 7, false],
     expectation({ output, error }) {
       expect(error).to.be.null;
       expect(output).to.deep.equal(Buffer.from('abc\u{1f913}', 'utf8').toString('utf8'));
     }
   },
   {
     name: 'should return empty string for empty buffer input',
-    inputs: [Buffer.alloc(0)],
+    inputs: [Buffer.alloc(0), 0, 1, false],
     expectation({ output, error }) {
       expect(error).to.be.null;
       expect(output).to.be.a('string').with.lengthOf(0);
     }
+  },
+  {
+    name: 'should throw an error if fatal is set and string is invalid',
+    inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, true],
+    expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+  {
+    name: 'should insert replacement character fatal is false and string is invalid',
+    inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, false],
+    expectation({ error, output }) {
+      expect(error).to.not.exist;
+      expect(output).to.equal('abc\uFFFD');
+    }
   }
 ];
 const utf8ByteLengthTests: ByteUtilTest<'utf8ByteLength'>[] = [
@@ -596,6 +611,29 @@ describe('ByteUtils', () => {
     });
   });
 
+  describe('toUTF8 basic latin optimization', () => {
+    afterEach(() => {
+      sinon.restore();
+    });
+
+    context('Given a basic latin string', () => {
+      it('should not invoke Buffer.toString', () => {
+        const buffer = Buffer.from('abcdef', 'utf8');
+        const spy = sinon.spy(buffer, 'toString');
+        nodeJsByteUtils.toUTF8(buffer, 0, 6, false);
+        expect(spy).to.not.have.been.called;
+      });
+
+      it('should not invoke TextDecoder.decode', () => {
+        const utf8Bytes = Buffer.from('abcdef', 'utf8');
+        const buffer = new Uint8Array(utf8Bytes.buffer, utf8Bytes.byteOffset, utf8Bytes.byteLength);
+        const spy = sinon.spy(TextDecoder.prototype, 'decode');
+        webByteUtils.toUTF8(buffer, 0, 6, false);
+        expect(spy).to.not.have.been.called;
+      });
+    });
+  });
+
   describe('randomBytes fallback case when crypto is not present', () => {
     describe('web', function () {
       let bsonWithNoCryptoCtx;

diff --git a/test/node/release.test.ts b/test/node/release.test.ts
@@ -46,6 +46,7 @@ const REQUIRED_FILES = [
   'src/utils/byte_utils.ts',
   'src/utils/node_byte_utils.ts',
   'src/utils/web_byte_utils.ts',
+  'src/utils/latin.ts',
   'src/validate_utf8.ts',
   'vendor/base64/base64.js',
   'vendor/base64/package.json',