fix parsing of document

goccy · Nov 8, 2024 · 26c4215 · 26c4215
1 parent e3a88b0
commit 26c4215
Show file tree

Hide file tree

Showing 3 changed files with 234 additions and 17 deletions.
diff --git a/lexer/lexer_test.go b/lexer/lexer_test.go
@@ -1877,6 +1877,216 @@ a: !!binary |
 		},
 		{
 			YAML: `
+a:
+ b
+
+ c
+`,
+			Tokens: token.Tokens{
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "a",
+					Origin:        "\na",
+				},
+				{
+					Type:          token.MappingValueType,
+					CharacterType: token.CharacterTypeIndicator,
+					Indicator:     token.BlockStructureIndicator,
+					Value:         ":",
+					Origin:        ":",
+				},
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "b\nc",
+					Origin:        "\n b\n\n c",
+				},
+			},
+		},
+		{
+			YAML: `
+a:   
+ b   
+
+  
+ c
+ d 
+e: f
+`,
+			Tokens: token.Tokens{
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "a",
+					Origin:        "\na",
+				},
+				{
+					Type:          token.MappingValueType,
+					CharacterType: token.CharacterTypeIndicator,
+					Indicator:     token.BlockStructureIndicator,
+					Value:         ":",
+					Origin:        ":",
+				},
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "b\nc d",
+					Origin:        "\n b\n\n\n c\n d\n",
+				},
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "e",
+					Origin:        "e",
+				},
+				{
+					Type:          token.MappingValueType,
+					CharacterType: token.CharacterTypeIndicator,
+					Indicator:     token.BlockStructureIndicator,
+					Value:         ":",
+					Origin:        ":",
+				},
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "f",
+					Origin:        " f",
+				},
+			},
+		},
+		{
+			YAML: `
+a: |
+ b   
+
+  
+ c
+ d 
+e: f
+`,
+			Tokens: token.Tokens{
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "a",
+					Origin:        "\na",
+				},
+				{
+					Type:          token.MappingValueType,
+					CharacterType: token.CharacterTypeIndicator,
+					Indicator:     token.BlockStructureIndicator,
+					Value:         ":",
+					Origin:        ":",
+				},
+				{
+					Type:          token.LiteralType,
+					CharacterType: token.CharacterTypeIndicator,
+					Indicator:     token.BlockScalarIndicator,
+					Value:         "|",
+					Origin:        " |\n",
+				},
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "b   \n\n \nc\nd \n",
+					Origin:        " b   \n\n  \n c\n d \n",
+				},
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "e",
+					Origin:        "e",
+				},
+				{
+					Type:          token.MappingValueType,
+					CharacterType: token.CharacterTypeIndicator,
+					Indicator:     token.BlockStructureIndicator,
+					Value:         ":",
+					Origin:        ":",
+				},
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "f",
+					Origin:        " f",
+				},
+			},
+		},
+		{
+			YAML: `
+a: >
+ b   
+
+  
+ c
+ d 
+e: f
+`,
+			Tokens: token.Tokens{
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "a",
+					Origin:        "\na",
+				},
+				{
+					Type:          token.MappingValueType,
+					CharacterType: token.CharacterTypeIndicator,
+					Indicator:     token.BlockStructureIndicator,
+					Value:         ":",
+					Origin:        ":",
+				},
+				{
+					Type:          token.FoldedType,
+					CharacterType: token.CharacterTypeIndicator,
+					Indicator:     token.BlockScalarIndicator,
+					Value:         ">",
+					Origin:        " >\n",
+				},
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "b   \n\n \nc d \n",
+					Origin:        " b   \n\n  \n c\n d \n",
+				},
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "e",
+					Origin:        "e",
+				},
+				{
+					Type:          token.MappingValueType,
+					CharacterType: token.CharacterTypeIndicator,
+					Indicator:     token.BlockStructureIndicator,
+					Value:         ":",
+					Origin:        ":",
+				},
+				{
+					Type:          token.StringType,
+					CharacterType: token.CharacterTypeMiscellaneous,
+					Indicator:     token.NotIndicator,
+					Value:         "f",
+					Origin:        " f",
+				},
+			},
+		},
+		{
+			YAML: `
 a: >
   Text`,
 			Tokens: token.Tokens{

diff --git a/scanner/context.go b/scanner/context.go
@@ -150,8 +150,7 @@ func (c *Context) addDocumentIndent(column int) {
 		// new-line-char is used as is instead of space.
 		// Therefore, it is necessary to replace the space already added to buf.
 		// `c.docFoldedNewLine` is a variable that is set to true for every newline.
-		if c.isFolded && c.docFoldedNewLine {
-			c.buf[len(c.buf)-1] = '\n'
+		if (c.isFolded || c.isRawFolded) && c.docFoldedNewLine {
 			c.docFoldedNewLine = false
 		}
 		// Since addBuf ignore space character, add to the buffer directly.
@@ -160,18 +159,22 @@ func (c *Context) addDocumentIndent(column int) {
 }
 
 func (c *Context) addDocumentNewLineInFolded(column int) {
-	if !c.isFolded {
+	if c.isLiteral {
 		return
 	}
 	if !c.docFoldedNewLine {
 		return
 	}
+	if c.docLineIndentColumn == c.docPrevLineIndentColumn {
+		if c.buf[len(c.buf)-1] == '\n' {
+			c.buf[len(c.buf)-1] = ' '
+		}
+	}
 	if c.docFirstLineIndentColumn == c.docLineIndentColumn &&
 		c.docLineIndentColumn == c.docPrevLineIndentColumn {
 		// use space as a new line delimiter.
 		return
 	}
-	c.buf[len(c.buf)-1] = '\n'
 	c.docFoldedNewLine = false
 }
 
@@ -276,7 +279,7 @@ func (c *Context) bufferedSrc() []rune {
 	if c.isDocument() {
 		// remove end '\n' character and trailing empty lines.
 		// https://yaml.org/spec/1.2.2/#8112-block-chomping-indicator
-		if c.hasTrimAllEndNewlineOpt() {
+		if c.hasTrimAllEndNewlineOpt() || c.isRawFolded {
 			// If the '-' flag is specified, all trailing newline characters will be removed.
 			src = []rune(strings.TrimRight(string(src), "\n"))
 		} else {
@@ -298,6 +301,9 @@ func (c *Context) bufferedSrc() []rune {
 
 		// If the text ends with a space character, remove all of them.
 		src = []rune(strings.TrimRight(string(src), " "))
+		if string(src) == "\n" {
+			src = []rune{}
+		}
 	}
 	return src
 }

diff --git a/scanner/scanner.go b/scanner/scanner.go
@@ -176,7 +176,7 @@ func (s *Scanner) indentStateFromIndentNumDifference() IndentState {
 }
 
 func (s *Scanner) updateIndent(ctx *Context, c rune) {
-	if s.isFirstCharAtLine && s.isNewLineChar(c) && ctx.isDocument() {
+	if s.isFirstCharAtLine && s.isNewLineChar(c) {
 		return
 	}
 	if s.isFirstCharAtLine && c == ' ' {
@@ -557,21 +557,13 @@ func (s *Scanner) scanDocument(ctx *Context, c rune) error {
 			s.progressColumn(ctx, 1)
 			return ErrInvalidToken(err.Error(), invalidTk)
 		}
-		if ctx.isLiteral {
-			ctx.addBuf(c)
-		} else if ctx.isFolded {
-			ctx.addBuf(c)
-		}
+		ctx.addBuf(c)
 		value := ctx.bufferedSrc()
 		ctx.addToken(token.String(string(value), string(ctx.obuf), s.pos()))
 		ctx.resetBuffer()
 		s.progressColumn(ctx, 1)
 	} else if s.isNewLineChar(c) {
-		if ctx.isLiteral {
-			ctx.addBuf(c)
-		} else {
-			ctx.addBuf(' ')
-		}
+		ctx.addBuf(c)
 		ctx.updateDocumentNewLineState()
 		s.progressLine(ctx)
 	} else if s.isFirstCharAtLine && c == ' ' {
@@ -626,7 +618,15 @@ func (s *Scanner) scanNewLine(ctx *Context, c rune) {
 	} else if s.isAnchor {
 		s.addBufferedTokenIfExists(ctx)
 	}
-	ctx.addBuf(' ')
+	if ctx.existsBuffer() && s.isFirstCharAtLine {
+		if ctx.buf[len(ctx.buf)-1] == ' ' {
+			ctx.buf[len(ctx.buf)-1] = '\n'
+		} else {
+			ctx.buf = append(ctx.buf, '\n')
+		}
+	} else {
+		ctx.addBuf(' ')
+	}
 	ctx.addOriginBuf(c)
 	s.progressLine(ctx)
 }
@@ -789,6 +789,7 @@ func (s *Scanner) scanRawFoldedChar(ctx *Context) bool {
 		return false
 	}
 
+	ctx.updateDocumentLineIndentColumn(s.column)
 	ctx.isRawFolded = true
 	ctx.addBuf('-')
 	ctx.addOriginBuf('-')