From b473123843a3e93572628a131b89237cce344e5f Mon Sep 17 00:00:00 2001
From: Ian Johnson
Date: Mon, 9 Sep 2024 22:23:45 -0400
Subject: [PATCH] std.zig.tokenizer: simplify line-based tokens

Closes #21358
Closes #21360

This commit modifies the `multiline_string_literal_line`, `doc_comment`,
and `container_doc_comment` tokens to no longer include the line ending as
part of the token. This makes it easier to handle line endings (which may
be LF, CRLF, or in edge cases possibly nonexistent) consistently.

In the two issues linked above, Autodoc was already assuming this for doc
comments, and yielding incorrect results when handling files with CRLF
line endings (both in Markdown parsing and source rendering).

Applying the same simplification for multiline string literals also brings
`zig fmt` into conformance with
https://github.com/ziglang/zig-spec/issues/38 regarding formatting of
multiline strings with CRLF line endings: the spec says that `zig fmt`
should remove the CR from such line endings, but this was not previously
the case.
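
As a rough sketch of the new token boundaries (written for this description
against the public `std.zig.Tokenizer` and `std.zig.Token` API; the
assertions are an assumption derived from the change described above, not
tests taken from the patch below):

    const std = @import("std");
    const Tag = std.zig.Token.Tag;

    // After this change, line-based tokens end before the line ending,
    // so the token slice never carries a trailing "\r\n" (or "\n").
    test "line-based tokens exclude the line ending" {
        // Doc comment followed by a CRLF line ending.
        const doc_src: [:0]const u8 = "/// Regular doc comment\r\nconst x = 1;";
        var doc_tokenizer = std.zig.Tokenizer.init(doc_src);
        const doc = doc_tokenizer.next();
        try std.testing.expectEqual(Tag.doc_comment, doc.tag);
        try std.testing.expectEqualStrings(
            "/// Regular doc comment",
            doc_src[doc.loc.start..doc.loc.end],
        );

        // The same holds for a multiline string literal line.
        const str_src: [:0]const u8 = "\\\\one\r\n;";
        var str_tokenizer = std.zig.Tokenizer.init(str_src);
        const line = str_tokenizer.next();
        try std.testing.expectEqual(Tag.multiline_string_literal_line, line.tag);
        try std.testing.expectEqualStrings(
            "\\\\one",
            str_src[line.loc.start..line.loc.end],
        );
    }

Consumers that previously had to special-case a trailing CR (as
`strLitNodeAsString` and `tokenSliceForRender` do before this patch) can
now take the token bytes as-is and insert their own line separators.
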
---
 lib/std/zig/AstGen.zig      |  6 ++----
 lib/std/zig/parser_test.zig | 38 +++++++++++++++++++++++++++++++++++++
 lib/std/zig/render.zig      |  3 ---
 lib/std/zig/tokenizer.zig   |  4 ----
 4 files changed, 40 insertions(+), 11 deletions(-)

diff --git a/lib/std/zig/AstGen.zig b/lib/std/zig/AstGen.zig
index c15a995e6010..675fe095a26e 100644
--- a/lib/std/zig/AstGen.zig
+++ b/lib/std/zig/AstGen.zig
@@ -11721,16 +11721,14 @@ fn strLitNodeAsString(astgen: *AstGen, node: Ast.Node.Index) !IndexSlice {
     var tok_i = start;
     {
         const slice = tree.tokenSlice(tok_i);
-        const carriage_return_ending: usize = if (slice[slice.len - 2] == '\r') 2 else 1;
-        const line_bytes = slice[2 .. slice.len - carriage_return_ending];
+        const line_bytes = slice[2..];
         try string_bytes.appendSlice(gpa, line_bytes);
         tok_i += 1;
     }
     // Following lines: each line prepends a newline.
     while (tok_i <= end) : (tok_i += 1) {
         const slice = tree.tokenSlice(tok_i);
-        const carriage_return_ending: usize = if (slice[slice.len - 2] == '\r') 2 else 1;
-        const line_bytes = slice[2 .. slice.len - carriage_return_ending];
+        const line_bytes = slice[2..];
         try string_bytes.ensureUnusedCapacity(gpa, line_bytes.len + 1);
         string_bytes.appendAssumeCapacity('\n');
         string_bytes.appendSliceAssumeCapacity(line_bytes);
diff --git a/lib/std/zig/parser_test.zig b/lib/std/zig/parser_test.zig
index d399c58c9cd6..e2c9b034ed7e 100644
--- a/lib/std/zig/parser_test.zig
+++ b/lib/std/zig/parser_test.zig
@@ -3087,6 +3087,22 @@ test "zig fmt: multiline string" {
     );
 }
 
+test "zig fmt: multiline string with CRLF line endings" {
+    try testTransform("" ++
+        "const s =\r\n" ++
+        "    \\\\one\r\n" ++
+        "    \\\\two)\r\n" ++
+        "    \\\\three\r\n" ++
+        ";\r\n",
+        \\const s =
+        \\    \\one
+        \\    \\two)
+        \\    \\three
+        \\;
+        \\
+    );
+}
+
 test "zig fmt: values" {
     try testCanonical(
         \\test "values" {
@@ -4404,6 +4420,28 @@ test "zig fmt: invalid doc comments on comptime and test blocks" {
     });
 }
 
+test "zig fmt: comments with CRLF line endings" {
+    try testTransform("" ++
+        "//! Top-level doc comment\r\n" ++
+        "//! Continuing to another line\r\n" ++
+        "\r\n" ++
+        "/// Regular doc comment\r\n" ++
+        "const S = struct {\r\n" ++
+        "    // Regular comment\r\n" ++
+        "    // More content\r\n" ++
+        "};\r\n",
+        \\//! Top-level doc comment
+        \\//! Continuing to another line
+        \\
+        \\/// Regular doc comment
+        \\const S = struct {
+        \\    // Regular comment
+        \\    // More content
+        \\};
+        \\
+    );
+}
+
 test "zig fmt: else comptime expr" {
     try testCanonical(
         \\comptime {
diff --git a/lib/std/zig/render.zig b/lib/std/zig/render.zig
index cd17b6963956..c0391b4faff4 100644
--- a/lib/std/zig/render.zig
+++ b/lib/std/zig/render.zig
@@ -3170,9 +3170,6 @@ fn discardAllParams(r: *Render, fn_proto_node: Ast.Node.Index) Error!void {
 fn tokenSliceForRender(tree: Ast, token_index: Ast.TokenIndex) []const u8 {
     var ret = tree.tokenSlice(token_index);
     switch (tree.tokens.items(.tag)[token_index]) {
-        .multiline_string_literal_line => {
-            if (ret[ret.len - 1] == '\n') ret.len -= 1;
-        },
         .container_doc_comment, .doc_comment => {
             ret = mem.trimRight(u8, ret, &std.ascii.whitespace);
         },
diff --git a/lib/std/zig/tokenizer.zig b/lib/std/zig/tokenizer.zig
index b63bde563385..05c0f8ed89cd 100644
--- a/lib/std/zig/tokenizer.zig
+++ b/lib/std/zig/tokenizer.zig
@@ -847,12 +847,10 @@ pub const Tokenizer = struct {
                     break;
                 },
                 '\n' => {
-                    self.index += 1;
                     break;
                 },
                 '\r' => {
                     if (self.buffer[self.index + 1] == '\n') {
-                        self.index += 2;
                         break;
                     } else {
                         state = .invalid;
@@ -1117,7 +1115,6 @@
                 },
                 '\r' => {
                     if (self.buffer[self.index + 1] == '\n') {
-                        self.index += 1;
                         result.tag = .doc_comment;
                         break;
                     } else {
@@ -1167,7 +1164,6 @@
                 },
                 '\r' => {
                     if (self.buffer[self.index + 1] == '\n') {
-                        self.index += 1;
                         break;
                     } else {
                         state = .invalid;