From b473123843a3e93572628a131b89237cce344e5f Mon Sep 17 00:00:00 2001
From: Ian Johnson
Date: Mon, 9 Sep 2024 22:23:45 -0400
Subject: [PATCH] std.zig.tokenizer: simplify line-based tokens

Closes #21358
Closes #21360

This commit modifies the `multiline_string_literal_line`, `doc_comment`,
and `container_doc_comment` tokens to no longer include the line ending as
part of the token. This makes it easier to handle line endings (which may
be LF, CRLF, or in edge cases possibly nonexistent) consistently.

In the two issues linked above, Autodoc was already assuming this for doc
comments, and yielding incorrect results when handling files with CRLF
line endings (both in Markdown parsing and source rendering).

Applying the same simplification for multiline string literals also brings
`zig fmt` into conformance with
https://github.com/ziglang/zig-spec/issues/38 regarding formatting of
multiline strings with CRLF line endings: the spec says that `zig fmt`
should remove the CR from such line endings, but this was not previously
the case.
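
As a rough sketch of the new token boundaries (written for this description
against the public `std.zig.Tokenizer` and `std.zig.Token` API; the
assertions are an assumption derived from the change described above, not
tests taken from the patch below):

    const std = @import("std");
    const Tag = std.zig.Token.Tag;

    // After this change, line-based tokens end before the line ending,
    // so the token slice never carries a trailing "\r\n" (or "\n").
    test "line-based tokens exclude the line ending" {
        // Doc comment followed by a CRLF line ending.
        const doc_src: [:0]const u8 = "/// Regular doc comment\r\nconst x = 1;";
        var doc_tokenizer = std.zig.Tokenizer.init(doc_src);
        const doc = doc_tokenizer.next();
        try std.testing.expectEqual(Tag.doc_comment, doc.tag);
        try std.testing.expectEqualStrings(
            "/// Regular doc comment",
            doc_src[doc.loc.start..doc.loc.end],
        );

        // The same holds for a multiline string literal line.
        const str_src: [:0]const u8 = "\\\\one\r\n;";
        var str_tokenizer = std.zig.Tokenizer.init(str_src);
        const line = str_tokenizer.next();
        try std.testing.expectEqual(Tag.multiline_string_literal_line, line.tag);
        try std.testing.expectEqualStrings(
            "\\\\one",
            str_src[line.loc.start..line.loc.end],
        );
    }

Consumers that previously had to special-case a trailing CR (as
`strLitNodeAsString` and `tokenSliceForRender` do before this patch) can
now take the token bytes as-is and insert their own line separators.
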
---
 lib/std/zig/AstGen.zig      |  6 ++----
 lib/std/zig/parser_test.zig | 38 +++++++++++++++++++++++++++++++++++++
 lib/std/zig/render.zig      |  3 ---
 lib/std/zig/tokenizer.zig   |  4 ----
 4 files changed, 40 insertions(+), 11 deletions(-)

diff --git a/lib/std/zig/AstGen.zig b/lib/std/zig/AstGen.zig
index c15a995e6010..675fe095a26e 100644
--- a/lib/std/zig/AstGen.zig
+++ b/lib/std/zig/AstGen.zig
@@ -11721,16 +11721,14 @@ fn strLitNodeAsString(astgen: *AstGen, node: Ast.Node.Index) !IndexSlice {
     var tok_i = start;
     {
         const slice = tree.tokenSlice(tok_i);
-        const carriage_return_ending: usize = if (slice[slice.len - 2] == '\r') 2 else 1;
-        const line_bytes = slice[2 .. slice.len - carriage_return_ending];
+        const line_bytes = slice[2..];
         try string_bytes.appendSlice(gpa, line_bytes);
         tok_i += 1;
     }
     // Following lines: each line prepends a newline.
     while (tok_i <= end) : (tok_i += 1) {
         const slice = tree.tokenSlice(tok_i);
-        const carriage_return_ending: usize = if (slice[slice.len - 2] == '\r') 2 else 1;
-        const line_bytes = slice[2 .. slice.len - carriage_return_ending];
+        const line_bytes = slice[2..];
         try string_bytes.ensureUnusedCapacity(gpa, line_bytes.len + 1);
         string_bytes.appendAssumeCapacity('\n');
         string_bytes.appendSliceAssumeCapacity(line_bytes);
diff --git a/lib/std/zig/parser_test.zig b/lib/std/zig/parser_test.zig
index d399c58c9cd6..e2c9b034ed7e 100644
--- a/lib/std/zig/parser_test.zig
+++ b/lib/std/zig/parser_test.zig
@@ -3087,6 +3087,22 @@ test "zig fmt: multiline string" {
     );
 }
 
+test "zig fmt: multiline string with CRLF line endings" {
+    try testTransform("" ++
+        "const s =\r\n" ++
+        "    \\\\one\r\n" ++
+        "    \\\\two)\r\n" ++
+        "    \\\\three\r\n" ++
+        ";\r\n",
+        \\const s =
+        \\    \\one
+        \\    \\two)
+        \\    \\three
+        \\;
+        \\
+    );
+}
+
 test "zig fmt: values" {
     try testCanonical(
         \\test "values" {
@@ -4404,6 +4420,28 @@ test "zig fmt: invalid doc comments on comptime and test blocks" {
     });
 }
 
+test "zig fmt: comments with CRLF line endings" {
+    try testTransform("" ++
+        "//! Top-level doc comment\r\n" ++
+        "//! Continuing to another line\r\n" ++
+        "\r\n" ++
+        "/// Regular doc comment\r\n" ++
+        "const S = struct {\r\n" ++
+        "    // Regular comment\r\n" ++
+        "    // More content\r\n" ++
+        "};\r\n",
+        \\//! Top-level doc comment
+        \\//! Continuing to another line
+        \\
+        \\/// Regular doc comment
+        \\const S = struct {
+        \\    // Regular comment
+        \\    // More content
+        \\};
+        \\
+    );
+}
+
 test "zig fmt: else comptime expr" {
     try testCanonical(
         \\comptime {
diff --git a/lib/std/zig/render.zig b/lib/std/zig/render.zig
index cd17b6963956..c0391b4faff4 100644
--- a/lib/std/zig/render.zig
+++ b/lib/std/zig/render.zig
@@ -3170,9 +3170,6 @@ fn discardAllParams(r: *Render, fn_proto_node: Ast.Node.Index) Error!void {
 fn tokenSliceForRender(tree: Ast, token_index: Ast.TokenIndex) []const u8 {
     var ret = tree.tokenSlice(token_index);
     switch (tree.tokens.items(.tag)[token_index]) {
-        .multiline_string_literal_line => {
-            if (ret[ret.len - 1] == '\n') ret.len -= 1;
-        },
         .container_doc_comment, .doc_comment => {
             ret = mem.trimRight(u8, ret, &std.ascii.whitespace);
         },
diff --git a/lib/std/zig/tokenizer.zig b/lib/std/zig/tokenizer.zig
index b63bde563385..05c0f8ed89cd 100644
--- a/lib/std/zig/tokenizer.zig
+++ b/lib/std/zig/tokenizer.zig
@@ -847,12 +847,10 @@ pub const Tokenizer = struct {
                     break;
                 },
                 '\n' => {
-                    self.index += 1;
                     break;
                 },
                 '\r' => {
                     if (self.buffer[self.index + 1] == '\n') {
-                        self.index += 2;
                         break;
                     } else {
                         state = .invalid;
@@ -1117,7 +1115,6 @@
                 },
                 '\r' => {
                     if (self.buffer[self.index + 1] == '\n') {
-                        self.index += 1;
                         result.tag = .doc_comment;
                         break;
                     } else {
@@ -1167,7 +1164,6 @@
                 },
                 '\r' => {
                     if (self.buffer[self.index + 1] == '\n') {
-                        self.index += 1;
                         break;
                     } else {
                         state = .invalid;