Introduce BadNode and MultiError (#230)

* Introduce `BadNode` and `ErrorList` * Fix ErrorList.String * Fix test to expect parsing errors for testdata/*/bad_*.sql * Use direct returns instead #189 (comment) * Make the `bad_` prefix more evil * Store tokens in `BadNode` instead of a raw string * Rename noError to noPanic * Add Bad{Statement,QueryExpr,Expr,Type,DDL,DML} ASTs #230 (comment) Co-Authored-By: apstndb <[email protected]> * Add the doc about error recovering * Rename `ErrorList` to `MultiError` and improve error messages * Shorten error message more --------- Co-authored-by: apstndb <[email protected]>
cloudspannerecosystem · Dec 21, 2024 · 6f345ac · 6f345ac
1 parent c07fef2
commit 6f345ac
Show file tree

Hide file tree

Showing 46 changed files with 2,090 additions and 255 deletions.
diff --git a/ast/ast.go b/ast/ast.go
@@ -14,6 +14,7 @@
 //   - sqlIdentQuote x: Quotes the given identifier string if needed.
 //   - sqlStringQuote s: Returns the SQL quoted string of s.
 //   - sqlBytesQuote bs: Returns the SQL quotes bytes of bs.
+//   - tokenJoin toks: Concatenates the string representations of tokens.
 //   - isnil v: Checks whether v is nil or others.
 //
 // Each Node's documentation has pos and end information using the following EBNF.
@@ -59,6 +60,9 @@ type Statement interface {
 // - https://cloud.google.com/spanner/docs/reference/standard-sql/data-definition-language
 // - https://cloud.google.com/spanner/docs/reference/standard-sql/dml-syntax
 
+func (BadStatement) isStatement()       {}
+func (BadDDL) isStatement()             {}
+func (BadDML) isStatement()             {}
 func (QueryStatement) isStatement()     {}
 func (CreateSchema) isStatement()       {}
 func (DropSchema) isStatement()         {}
@@ -109,6 +113,7 @@ type QueryExpr interface {
 	isQueryExpr()
 }
 
+func (BadQueryExpr) isQueryExpr()  {}
 func (Select) isQueryExpr()        {}
 func (Query) isQueryExpr()         {}
 func (FromQuery) isQueryExpr()     {}
@@ -174,6 +179,7 @@ type Expr interface {
 	isExpr()
 }
 
+func (BadExpr) isExpr()               {}
 func (BinaryExpr) isExpr()            {}
 func (UnaryExpr) isExpr()             {}
 func (InExpr) isExpr()                {}
@@ -296,6 +302,7 @@ type Type interface {
 	isType()
 }
 
+func (BadType) isType()    {}
 func (SimpleType) isType() {}
 func (ArrayType) isType()  {}
 func (StructType) isType() {}
@@ -343,6 +350,7 @@ type DDL interface {
 //
 // - https://cloud.google.com/spanner/docs/reference/standard-sql/data-definition-language
 
+func (BadDDL) isDDL()             {}
 func (CreateSchema) isDDL()       {}
 func (DropSchema) isDDL()         {}
 func (CreateDatabase) isDDL()     {}
@@ -501,6 +509,7 @@ type DML interface {
 	isDML()
 }
 
+func (BadDML) isDML() {}
 func (Insert) isDML() {}
 func (Delete) isDML() {}
 func (Update) isDML() {}
@@ -533,6 +542,84 @@ func (ChangeStreamSetFor) isChangeStreamAlteration()     {}
 func (ChangeStreamDropForAll) isChangeStreamAlteration() {}
 func (ChangeStreamSetOptions) isChangeStreamAlteration() {}
 
+// ================================================================================
+//
+// Bad Node
+//
+// ================================================================================
+
+// BadNode is a placeholder node for source code containing syntax errors.
+//
+//	{{.Tokens | tokenJoin}}
+type BadNode struct {
+	// pos = NodePos
+	// end = NodeEnd
+
+	NodePos, NodeEnd token.Pos
+
+	Tokens []*token.Token
+}
+
+// BadStatement is a BadNode for Statement.
+//
+// {{.BadNode | sql}}
+type BadStatement struct {
+	// pos = BadNode.pos
+	// end = BadNode.end
+
+	BadNode *BadNode
+}
+
+// BadQueryExpr is a BadNode for QueryExpr.
+//
+// {{.BadNode | sql}}
+type BadQueryExpr struct {
+	// pos = BadNode.pos
+	// end = BadNode.end
+
+	BadNode *BadNode
+}
+
+// BadExpr is a BadNode for Expr.
+//
+// {{.BadNode | sql}}
+type BadExpr struct {
+	// pos = BadNode.pos
+	// end = BadNode.end
+
+	BadNode *BadNode
+}
+
+// BadType is a BadNode for Type.
+//
+// {{.BadNode | sql}}
+type BadType struct {
+	// pos = BadNode.pos
+	// end = BadNode.end
+
+	BadNode *BadNode
+}
+
+// BadDDL is a BadNode for DDL.
+//
+// {{.BadNode | sql}}
+type BadDDL struct {
+	// pos = BadNode.pos
+	// end = BadNode.end
+
+	BadNode *BadNode
+}
+
+// BadDML is a BadNode for DML.
+//
+// {{.BadNode | sql}}
+type BadDML struct {
+	// pos = BadNode.pos
+	// end = BadNode.end
+
+	BadNode *BadNode
+}
+
 // ================================================================================
 //
 // SELECT
@@ -1265,9 +1352,10 @@ type SelectorExpr struct {
 
 // IndexExpr is a subscript operator expression node.
 // This node can be:
-//	- array subscript operator
-//	- struct subscript operator
-//	- JSON subscript operator
+//   - array subscript operator
+//   - struct subscript operator
+//   - JSON subscript operator
+//
 // Note: The name IndexExpr is a historical reason, maybe better to rename to SubscriptExpr.
 //
 //	{{.Expr | sql}}[{{.Index | sql}}]
@@ -2259,8 +2347,6 @@ type DropProtoBundle struct {
 	Bundle token.Pos // position of "BUNDLE" pseudo keyword
 }
 
-// end of PROTO BUNDLE statements
-
 // CreateTable is CREATE TABLE statement node.
 //
 //	CREATE TABLE {{if .IfNotExists}}IF NOT EXISTS{{end}} {{.Name | sql}} (

diff --git a/ast/pos.go b/ast/pos.go
diff --git a/ast/sql.go b/ast/sql.go
@@ -155,6 +155,30 @@ func paren(p prec, e Expr) string {
 	}
 }
 
+// ================================================================================
+//
+// Bad Node
+//
+// ================================================================================
+
+func (b *BadNode) SQL() string {
+	var sql string
+	for _, tok := range b.Tokens {
+		if sql != "" && len(tok.Space) > 0 {
+			sql += " "
+		}
+		sql += tok.Raw
+	}
+	return sql
+}
+
+func (b *BadStatement) SQL() string { return b.BadNode.SQL() }
+func (b *BadQueryExpr) SQL() string { return b.BadNode.SQL() }
+func (b *BadExpr) SQL() string      { return b.BadNode.SQL() }
+func (b *BadType) SQL() string      { return b.BadNode.SQL() }
+func (b *BadDDL) SQL() string       { return b.BadNode.SQL() }
+func (b *BadDML) SQL() string       { return b.BadNode.SQL() }
+
 // ================================================================================
 //
 // SELECT

diff --git a/docs/content/error-recover/_index.md b/docs/content/error-recover/_index.md
@@ -0,0 +1,137 @@
+---
+date: 2024-12-20 00:00:00 +0900
+title: "Error recovering"
+weight: 2
+---
+
+Since v0.1.0, `memefish.ParseXXX` methods return AST node(s) even if an error is reported.
+That is, if we try to parse incomplete SQL such as:
+
+```sql
+SELECT (1 +) + (* 2)
+```
+
+Then, the following two errors are reported:
+
+```sql
+syntax error: :1:12: unexpected token: )
+  1|  SELECT (1 +) + (* 2)
+   |             ^
+syntax error: :1:17: unexpected token: *
+  1|  SELECT (1 +) + (* 2)
+   |                  ^
+```
+
+However, the AST is also returned:
+
+```go {hl_lines=["10-31","36-57"]}
+&ast.QueryStatement{
+  Query: &ast.Select{
+    Results: []ast.SelectItem{
+      &ast.ExprSelectItem{
+        Expr: &ast.BinaryExpr{
+          Op:   "+",
+          Left: &ast.ParenExpr{
+            Lparen: 7,
+            Rparen: 11,
+            Expr:   &ast.BadExpr{
+              BadNode: &ast.BadNode{
+                NodePos: 8,
+                NodeEnd: 11,
+                Tokens:  []*token.Token{
+                  &token.Token{
+                    Kind: "<int>",
+                    Raw:  "1",
+                    Base: 10,
+                    Pos:  8,
+                    End:  9,
+                  },
+                  &token.Token{
+                    Kind:  "+",
+                    Space: " ",
+                    Raw:   "+",
+                    Pos:   10,
+                    End:   11,
+                  },
+                },
+              },
+            },
+          },
+          Right: &ast.ParenExpr{
+            Lparen: 15,
+            Rparen: 19,
+            Expr:   &ast.BadExpr{
+              BadNode: &ast.BadNode{
+                NodePos: 16,
+                NodeEnd: 19,
+                Tokens:  []*token.Token{
+                  &token.Token{
+                    Kind: "*",
+                    Raw:  "*",
+                    Pos:  16,
+                    End:  17,
+                  },
+                  &token.Token{
+                    Kind:  "<int>",
+                    Space: " ",
+                    Raw:   "2",
+                    Base:  10,
+                    Pos:   18,
+                    End:   19,
+                  },
+                },
+              },
+            },
+          },
+        },
+      },
+    },
+  },
+}
+```
+
+Thus, the places where the errors occurred are filled with `ast.BadXXX` nodes (`ast.BadExpr` in this example).
+
+## How méméfish performs error recovery
+
+This section explains how méméfish performs error recovery.
+
+In méméfish, a *recovery point* is set when parsing a syntactic construct whose result can be one of several types of AST nodes.
+For example, when parsing a parenthesized expression, the recovery point is set after the open parenthesis `(`.
+If an error occurs in the parenthesized expression, the parser backtracks to the recovery point and skips the tokens until the parenthesized expression ends.
+The skipped tokens are then collected into an `ast.BadNode`, and this node is wrapped in a specific `ast.BadXXX` node (e.g., `ast.BadExpr`).
+
+```sql
+SELECT (1 + 2 *)
+               ^--- error point
+        ^---------- recovery point
+        |~~~~~| --- skipped tokens
+```
+
+Recovery points are set at:
+
+- the beginning of statements, queries, DDLs, DMLs,
+- the beginning of expressions (e.g., after an open parenthesis `(`, `SELECT`, `WHERE` etc.), and
+- the beginning of types.
+
+Token skipping is performed as follows.
+
+- For `ast.Statement`, `ast.DDL`, and `ast.DML`,
+  * skip tokens until a semicolon `;` appears.
+- For `ast.QueryExpr`,
+  * skip tokens until a semicolon `;` appears, or
+  * skip tokens while counting the nesting of parentheses `(`
+    + until the closing symbol (`)`) appears outside any nesting, or
+    + until a symbol that is supposed to be the end of the expression (`UNION`, `INTERSECT`, `EXCEPT`) appears outside any nesting.
+- For `ast.Expr`,
+  * skip tokens until a semicolon `;` appears, or
+  * skip tokens while counting the nesting of parentheses `(`, brackets `[`, `CASE` and `WHEN`
+    + until the closing symbol (`)`, `]`, `END`, `THEN`) appears outside any nesting, or
+    + until a symbol that is supposed to be the end of the expression (`,`, `AS`, `FROM`, `GROUP`, `HAVING`, `ORDER`, `LIMIT`, `OFFSET`, `AT`, `UNION`, `INTERSECT`, `EXCEPT`) appears outside any nesting.
+- For `ast.Type`,
+  * skip tokens until the semicolon `;` or the closing parenthesis `)` appears, or
+  * skip tokens while counting the nesting of angle brackets `<`
+    + until the closing symbol (`>`) appears outside any nesting.
+
+Note that these skipping rules are just heuristics and may not be perfect.
+In some cases, there is a possibility of skipping too many tokens.