feat(jmespath): add lexer component (#2214)

* feat(jmespath): add lexer component * refactor: reduce cognitive complexity
aws-powertools · Mar 19, 2024 · 006ebcf · 006ebcf
1 parent 76c4cfd
commit 006ebcf
Showing 1 changed file with 368 additions and 0 deletions.
diff --git a/packages/jmespath/src/Lexer.ts b/packages/jmespath/src/Lexer.ts
@@ -0,0 +1,368 @@
+import {
+  SIMPLE_TOKENS,
+  START_IDENTIFIER,
+  VALID_IDENTIFIER,
+  VALID_NUMBER,
+  WHITESPACE,
+} from './constants.js';
+import { EmptyExpressionError, LexerError } from './errors.js';
+import type { Token } from './types.js';
+
+/**
+ * A lexer for JMESPath expressions.
+ *
+ * This lexer tokenizes a JMESPath expression into a sequence of tokens.
+ *
+ * Token offsets (`start`, `end`) are indexes into the expression split by
+ * Unicode code point (the result of `Array.from(expression)`); `start` is
+ * inclusive and `end` is exclusive.
+ *
+ * The scanning state is stored on the instance and reset at the beginning of
+ * each tokenization. An instance can therefore be reused, but two generators
+ * created from the same instance should not be consumed in an interleaved way.
+ */
+class Lexer {
+  /**
+   * Characters that introduce a comparator or logical operator token.
+   */
+  static readonly #operatorStarts: ReadonlySet<string> = new Set([
+    '<',
+    '>',
+    '!',
+    '=',
+    '|',
+    '&',
+  ]);
+
+  /**
+   * Index of the character under the cursor. It is equal to `#length` once
+   * the whole expression has been consumed.
+   */
+  #position = 0;
+  /**
+   * The expression being tokenized, split by Unicode code point.
+   */
+  #chars: string[] = [];
+  /**
+   * The character under the cursor, or an empty string at the end of input.
+   */
+  #current = '';
+  /**
+   * Number of entries in `#chars`.
+   */
+  #length = 0;
+
+  /**
+   * Tokenize a JMESPath expression.
+   *
+   * This method is a generator that yields tokens for the given expression.
+   * The last token yielded is always of type `eof`.
+   *
+   * Because this is a generator, nothing is validated or scanned until the
+   * first token is requested.
+   *
+   * @param expression The JMESPath expression to tokenize.
+   * @throws EmptyExpressionError when `expression` is not a string.
+   * @throws LexerError when a character or sequence cannot be tokenized.
+   */
+  public *tokenize(expression: string): Generator<Token> {
+    this.#initializeForExpression(expression);
+    while (this.#current !== '') {
+      const simpleTokenType = SIMPLE_TOKENS.get(this.#current);
+      if (simpleTokenType !== undefined) {
+        yield {
+          type: simpleTokenType,
+          value: this.#current,
+          start: this.#position,
+          end: this.#position + 1,
+        };
+
+        this.#next();
+      } else if (START_IDENTIFIER.has(this.#current)) {
+        yield this.#consumeIdentifier();
+      } else if (WHITESPACE.has(this.#current)) {
+        this.#next();
+      } else if (this.#current === '[') {
+        yield this.#consumeSquareBracket();
+      } else if (this.#current === `'`) {
+        yield this.#consumeRawStringLiteral();
+      } else if (this.#current === '`') {
+        yield this.#consumeLiteral();
+      } else if (VALID_NUMBER.has(this.#current)) {
+        const start = this.#position;
+        const digits = this.#consumeNumber();
+        yield {
+          type: 'number',
+          value: parseInt(digits, 10),
+          start,
+          end: start + digits.length,
+        };
+      } else if (this.#current === '-') {
+        yield this.#consumeNegativeNumber();
+      } else if (this.#current === '"') {
+        yield this.#consumeQuotedIdentifier();
+      } else if (Lexer.#operatorStarts.has(this.#current)) {
+        yield this.#consumeComparatorSigns(
+          this.#current as '<' | '>' | '!' | '=' | '|' | '&'
+        );
+      } else {
+        throw new LexerError(this.#position, this.#current);
+      }
+    }
+    yield { type: 'eof', value: '', start: this.#length, end: this.#length };
+  }
+
+  /**
+   * Consume a comparator sign.
+   *
+   * This method is called when the lexer encounters a comparator sign.
+   * Each sign is either the first half of a two-character operator or a
+   * one-character operator on its own, except `=` which is only valid as `==`.
+   *
+   * @param current The current character
+   */
+  #consumeComparatorSigns(current: '<' | '>' | '!' | '=' | '|' | '&'): Token {
+    switch (current) {
+      case '<':
+        return this.#matchOrElse('=', 'lte', 'lt');
+      case '>':
+        return this.#matchOrElse('=', 'gte', 'gt');
+      case '!':
+        return this.#matchOrElse('=', 'ne', 'not');
+      case '|':
+        return this.#matchOrElse('|', 'or', 'pipe');
+      case '&':
+        return this.#matchOrElse('&', 'and', 'expref');
+      default:
+        // The only remaining member of the union is '='.
+        return this.#consumeEqualSign();
+    }
+  }
+
+  /**
+   * Consume an equal sign.
+   *
+   * This method is called when the lexer encounters an equal sign.
+   * It checks if the next character is also an equal sign and returns
+   * the corresponding token.
+   *
+   * @throws LexerError when the equal sign is not followed by another one;
+   * the reported position is the one of the lone equal sign.
+   */
+  #consumeEqualSign(): Token {
+    // Remember where the token starts before the cursor moves, so that the
+    // offsets do not depend on how far the cursor was able to advance.
+    const start = this.#position;
+    if (this.#next() !== '=') {
+      throw new LexerError(start, '=');
+    }
+    this.#next();
+
+    return {
+      type: 'eq',
+      value: '==',
+      start,
+      end: start + 2,
+    };
+  }
+
+  /**
+   * Consume an unquoted identifier.
+   *
+   * This method is called when the lexer encounters a character that is a valid
+   * identifier. It advances the lexer until it finds a character that is not a
+   * valid identifier and returns the corresponding token.
+   */
+  #consumeIdentifier(): Token {
+    const start = this.#position;
+    let buff = this.#current;
+    while (VALID_IDENTIFIER.has(this.#next())) {
+      buff += this.#current;
+    }
+
+    return {
+      type: 'unquoted_identifier',
+      value: buff,
+      start,
+      end: start + buff.length,
+    };
+  }
+
+  /**
+   * Consume a negative number.
+   *
+   * This method is called when the lexer encounters a negative sign.
+   * It checks if the next character is a number and returns the corresponding token.
+   *
+   * @throws LexerError when the negative sign is not followed by a digit.
+   */
+  #consumeNegativeNumber(): Token {
+    const start = this.#position;
+    // The buffer starts with the '-' itself, so a length of 1 means that
+    // no digit followed the sign.
+    const buff = this.#consumeNumber();
+    if (buff.length > 1) {
+      return {
+        type: 'number',
+        value: parseInt(buff, 10),
+        start,
+        end: start + buff.length,
+      };
+    } else {
+      // If the negative sign is not followed by a number, it is an error.
+      throw new LexerError(start, 'Unknown token after "-"');
+    }
+  }
+
+  /**
+   * Consume a raw string that is a number.
+   *
+   * It takes the current character as-is and advances
+   * the lexer until it finds a character that
+   * is not a number.
+   */
+  #consumeNumber(): string {
+    let buff = this.#current;
+    while (VALID_NUMBER.has(this.#next())) {
+      buff += this.#current;
+    }
+
+    return buff;
+  }
+
+  /**
+   * Consume a square bracket.
+   *
+   * This method is called when the lexer encounters a square bracket.
+   * It checks if the next character is a question mark or a closing
+   * square bracket and returns the corresponding token.
+   */
+  #consumeSquareBracket(): Token {
+    const start = this.#position;
+    const nextChar = this.#next();
+    if (nextChar === ']') {
+      this.#next();
+
+      return { type: 'flatten', value: '[]', start, end: start + 2 };
+    } else if (nextChar === '?') {
+      this.#next();
+
+      return { type: 'filter', value: '[?', start, end: start + 2 };
+    } else {
+      // The character after the bracket is left for the main loop to handle.
+      return { type: 'lbracket', value: '[', start, end: start + 1 };
+    }
+  }
+
+  /**
+   * Initializes the lexer for the given expression.
+   *
+   * We use a separate method for this instead of the constructor
+   * because we want to be able to reuse the same lexer instance
+   * and also because we want to be able to expose a public API
+   * for tokenizing expressions like `new Lexer().tokenize(expression)`.
+   *
+   * @param expression The JMESPath expression to tokenize.
+   * @throws EmptyExpressionError when `expression` is not a string.
+   */
+  #initializeForExpression(expression: string): void {
+    if (typeof expression !== 'string') {
+      throw new EmptyExpressionError();
+    }
+
+    this.#chars = Array.from(expression);
+    // The length must be counted in the same unit used to index `#chars`,
+    // otherwise the cursor could run past the end of the array.
+    this.#length = this.#chars.length;
+    this.#position = 0;
+    this.#current = this.#chars[0] ?? '';
+  }
+
+  /**
+   * Advance the lexer to the next character in the expression.
+   *
+   * Once the end of the expression is reached the cursor stays at `#length`
+   * and the current character is an empty string.
+   */
+  #next(): string {
+    if (this.#position < this.#length) {
+      this.#position += 1;
+    }
+    this.#current = this.#chars[this.#position] ?? '';
+
+    return this.#current;
+  }
+
+  /**
+   * Consume until the given delimiter is reached allowing
+   * for escaping of the delimiter with a backslash (`\`).
+   *
+   * The returned text does not include the delimiters, but it still contains
+   * the backslashes found along the way. When the method returns, the cursor
+   * is on the character that follows the closing delimiter.
+   *
+   * @param delimiter The delimiter to consume until.
+   * @throws LexerError when the end of the expression is reached before
+   * the closing delimiter.
+   */
+  #consumeUntil(delimiter: string): string {
+    const start = this.#position;
+    let buff = '';
+    // Skip the opening delimiter
+    this.#next();
+    while (this.#current !== delimiter) {
+      if (this.#current === '\\') {
+        // Keep the backslash and move to the escaped character, so that an
+        // escaped delimiter does not terminate the loop.
+        buff += '\\';
+        this.#next();
+      }
+      if (this.#current === '') {
+        // We've reached the end of the expression (EOF) before
+        // we found the delimiter. This is an error.
+        throw new LexerError(start, this.#chars.slice(start).join(''));
+      }
+      buff += this.#current;
+      this.#next();
+    }
+    // Skip the closing delimiter
+    this.#next();
+
+    return buff;
+  }
+
+  /**
+   * Process a literal.
+   *
+   * A literal is a JSON string that is enclosed in backticks.
+   *
+   * @throws LexerError when the content of the literal is not valid JSON.
+   */
+  #consumeLiteral(): Token {
+    const start = this.#position;
+    // Unescape every escaped backtick, not only the first one.
+    const lexeme = this.#consumeUntil('`').replace(/\\`/g, '`');
+    try {
+      const parsedJson = JSON.parse(lexeme);
+
+      return {
+        type: 'literal',
+        value: parsedJson,
+        start,
+        // The cursor is right after the closing backtick.
+        end: this.#position,
+      };
+    } catch {
+      throw new LexerError(start, lexeme);
+    }
+  }
+
+  /**
+   * Process a quoted identifier.
+   *
+   * A quoted identifier is a string that is enclosed in double quotes.
+   *
+   * @throws LexerError when the quoted text is not a valid JSON string.
+   */
+  #consumeQuotedIdentifier(): Token {
+    const start = this.#position;
+    const lexeme = '"' + this.#consumeUntil('"') + '"';
+    try {
+      return {
+        type: 'quoted_identifier',
+        value: JSON.parse(lexeme),
+        start,
+        // The cursor is right after the closing double quote.
+        end: this.#position,
+      };
+    } catch {
+      // Report invalid escape sequences the same way as other lexing errors
+      // instead of leaking the SyntaxError thrown by JSON.parse.
+      throw new LexerError(start, lexeme);
+    }
+  }
+
+  /**
+   * Process a raw string literal.
+   *
+   * A raw string literal is a string that is enclosed in single quotes.
+   */
+  #consumeRawStringLiteral(): Token {
+    const start = this.#position;
+    // Unescape every escaped single quote, not only the first one.
+    const lexeme = this.#consumeUntil(`'`).replace(/\\'/g, `'`);
+
+    return {
+      type: 'literal',
+      value: lexeme,
+      start,
+      // The cursor is right after the closing single quote.
+      end: this.#position,
+    };
+  }
+
+  /**
+   * Match the expected character and return the corresponding token type.
+   *
+   * When the character after the current one is the expected one, both are
+   * consumed and a two-character token is returned. Otherwise only the current
+   * character is consumed and a one-character token is returned.
+   *
+   * @param expected The expected character
+   * @param matchType The token type to return if the expected character is found
+   * @param elseType  The token type to return if the expected character is not found
+   */
+  #matchOrElse(
+    expected: string,
+    matchType: Token['type'],
+    elseType: Token['type']
+  ): Token {
+    const start = this.#position;
+    const current = this.#current;
+    const nextChar = this.#next();
+    if (nextChar === expected) {
+      this.#next();
+
+      return {
+        type: matchType,
+        value: current + nextChar,
+        start,
+        end: start + 2,
+      };
+    }
+
+    return {
+      type: elseType,
+      value: current,
+      start,
+      end: start + 1,
+    };
+  }
+}
+
+export { Lexer };