Implement some token-based formatting structure. (#4386)

Here I'm adding some simple formatting based on token kind, aiming mainly to keep the implementation short for now. This approach won't generalize to arbitrary structures (e.g., it can't distinguish the braces of a function body from the braces of a struct literal). I think we'll probably want to build a parse tree and associate parse kinds with tokens in order to format properly, and also to process the input in a less linear way. But my essential goal at present is just to get a proof of concept showing that the basics can yield output that looks okay.
carbon-language · Oct 10, 2024 · df55b89 · df55b89
1 parent c721a02
commit df55b89
Show file tree

Hide file tree

Showing 8 changed files with 260 additions and 51 deletions.
diff --git a/toolchain/format/BUILD b/toolchain/format/BUILD
@@ -13,11 +13,16 @@ filegroup(
 
 cc_library(
     name = "format",
-    srcs = ["format.cpp"],
+    srcs = [
+        "format.cpp",
+        "formatter.cpp",
+        "formatter.h",
+    ],
     hdrs = ["format.h"],
     deps = [
         "//common:ostream",
         "//toolchain/lex:token_index",
+        "//toolchain/lex:token_kind",
         "//toolchain/lex:tokenized_buffer",
     ],
 )
diff --git a/toolchain/format/format.cpp b/toolchain/format/format.cpp
@@ -4,45 +4,13 @@
 
 #include "toolchain/format/format.h"
 
-#include "toolchain/lex/token_index.h"
-#include "toolchain/lex/tokenized_buffer.h"
+#include "toolchain/format/formatter.h"
 
 namespace Carbon::Format {
 
-// TODO: Add support for formatting line ranges (will need flags too).
 auto Format(const Lex::TokenizedBuffer& tokens, llvm::raw_ostream& out)
     -> bool {
-  if (tokens.has_errors()) {
-    // TODO: Error recovery.
-    return false;
-  }
-
-  auto comments = tokens.comments();
-  auto comment_it = comments.begin();
-
-  llvm::ListSeparator sep(" ");
-
-  for (auto token : tokens.tokens()) {
-    while (comment_it != comments.end() &&
-           tokens.IsAfterComment(token, *comment_it)) {
-      // TODO: Fix newlines and indent.
-      out << "\n" << tokens.GetCommentText(*comment_it) << "\n";
-      ++comment_it;
-    }
-
-    switch (tokens.GetKind(token)) {
-      case Lex::TokenKind::FileStart:
-        break;
-      case Lex::TokenKind::FileEnd:
-        out << "\n";
-        break;
-      default:
-        // TODO: More dependent formatting.
-        out << sep << tokens.GetTokenText(token);
-        break;
-    }
-  }
-  return true;
+  return Formatter(&tokens, &out).Run();
 }
 
 }  // namespace Carbon::Format
diff --git a/toolchain/format/formatter.cpp b/toolchain/format/formatter.cpp
@@ -0,0 +1,110 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "toolchain/format/formatter.h"
+
+namespace Carbon::Format {
+
+auto Formatter::Run() -> bool {
+  if (tokens_->has_errors()) {
+    // TODO: Error recovery.
+    return false;
+  }
+
+  auto comments = tokens_->comments();
+  auto comment_it = comments.begin();
+
+  // If there are no tokens or comments, format as empty.
+  if (tokens_->size() == 0 && comment_it == comments.end()) {
+    *out_ << "\n";
+    return true;
+  }
+
+  for (auto token : tokens_->tokens()) {
+    auto token_kind = tokens_->GetKind(token);
+
+    while (comment_it != comments.end() &&
+           tokens_->IsAfterComment(token, *comment_it)) {
+      RequireEmptyLine();
+      PrepareForSpacedContent();
+      // TODO: We do need to adjust the indent of multi-line comments.
+      *out_ << tokens_->GetCommentText(*comment_it);
+      // Comment text includes a terminating newline, so just update the state.
+      line_state_ = LineState::Empty;
+      ++comment_it;
+    }
+
+    switch (token_kind) {
+      case Lex::TokenKind::FileStart:
+        break;
+
+      case Lex::TokenKind::FileEnd:
+        RequireEmptyLine();
+        break;
+
+      case Lex::TokenKind::OpenCurlyBrace:
+        PrepareForSpacedContent();
+        *out_ << "{";
+        // Check for `{}`.
+        if (NextToken(token) != tokens_->GetMatchedClosingToken(token)) {
+          RequireEmptyLine();
+        }
+        indent_ += 2;
+        break;
+
+      case Lex::TokenKind::CloseCurlyBrace:
+        indent_ -= 2;
+        PrepareForPackedContent();
+        *out_ << "}";
+        RequireEmptyLine();
+        break;
+
+      case Lex::TokenKind::Semi:
+        PrepareForPackedContent();
+        *out_ << ";";
+        RequireEmptyLine();
+        break;
+
+      default:
+        if (token_kind.IsOneOf(
+                {Lex::TokenKind::CloseParen, Lex::TokenKind::Colon,
+                 Lex::TokenKind::ColonExclaim, Lex::TokenKind::Comma})) {
+          PrepareForPackedContent();
+        } else {
+          PrepareForSpacedContent();
+        }
+        *out_ << tokens_->GetTokenText(token);
+        line_state_ = token_kind.is_opening_symbol()
+                          ? LineState::HasSeparator
+                          : LineState::NeedsSeparator;
+        break;
+    }
+  }
+  return true;
+}
+
+auto Formatter::PrepareForPackedContent() -> void {
+  if (line_state_ == LineState::Empty) {
+    out_->indent(indent_);
+    line_state_ = LineState::HasSeparator;
+  }
+}
+
+auto Formatter::RequireEmptyLine() -> void {
+  if (line_state_ != LineState::Empty) {
+    *out_ << "\n";
+    line_state_ = LineState::Empty;
+  }
+}
+
+auto Formatter::PrepareForSpacedContent() -> void {
+  if (line_state_ == LineState::NeedsSeparator) {
+    *out_ << " ";
+    line_state_ = LineState::HasSeparator;
+  } else {
+    PrepareForPackedContent();
+  }
+}
+
+}  // namespace Carbon::Format
diff --git a/toolchain/format/formatter.h b/toolchain/format/formatter.h
@@ -0,0 +1,83 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef CARBON_TOOLCHAIN_FORMAT_FORMATTER_H_
+#define CARBON_TOOLCHAIN_FORMAT_FORMATTER_H_
+
+#include <cstdint>
+
+#include "common/ostream.h"
+#include "toolchain/lex/tokenized_buffer.h"
+
+namespace Carbon::Format {
+
+// Implements Format(); see format.h. It's intended to be constructed and
+// `Run()` once, then destructed.
+//
+// TODO: This will probably need to work less linearly in the future, for
+// example to handle smart wrapping of arguments. This is a simple
+// implementation that only handles simple code. Before adding too much more
+// complexity, it should be rewritten.
+//
+// TODO: Add retention of blank lines between original code.
+//
+// TODO: Add support for formatting line ranges (will need flags too).
+class Formatter {
+ public:
+  explicit Formatter(const Lex::TokenizedBuffer* tokens, llvm::raw_ostream* out)
+      : tokens_(tokens), out_(out) {}
+
+  // See class comments.
+  auto Run() -> bool;
+
+ private:
+  // Tracks the status of the current line of output.
+  enum class LineState : uint8_t {
+    // There is no output for the current line.
+    Empty,
+    // The current line has content (possibly just an indent), and does not need
+    // a separator added.
+    HasSeparator,
+    // The current line has content, and will need a separator, typically a
+    // single space or newline.
+    NeedsSeparator,
+  };
+
+  // Ensure output is on an empty line, setting line_state_ to Empty. May output
+  // a newline, dependent on line state. Does not indent, allowing blank lines.
+  auto RequireEmptyLine() -> void;
+
+  // Ensures there is a separator before adding new content. May do
+  // `PrepareForPackedContent` or output a separator space, dependent on line
+  // state. Always results in line_state_ being HasSeparator; the caller is
+  // responsible for adjusting state if needed.
+  auto PrepareForSpacedContent() -> void;
+
+  // Requires that the current line is indented, but not necessarily a separator
+  // space. May output spaces for `indent_`, dependent on line state. Only
+  // guarantees the line_state_ is not Empty; the caller is responsible for
+  // adjusting state if needed.
+  auto PrepareForPackedContent() -> void;
+
+  // Returns the next token index.
+  static auto NextToken(Lex::TokenIndex token) -> Lex::TokenIndex {
+    return *(Lex::TokenIterator(token) + 1);
+  }
+
+  // The tokens being formatted.
+  const Lex::TokenizedBuffer* tokens_;
+
+  // The output stream for formatted content.
+  llvm::raw_ostream* out_;
+
+  // The state of the line currently written to output.
+  LineState line_state_ = LineState::Empty;
+
+  // The current code indent level, to be added to new lines.
+  int indent_ = 0;
+};
+
+}  // namespace Carbon::Format
+
+#endif  // CARBON_TOOLCHAIN_FORMAT_FORMATTER_H_
diff --git a/toolchain/format/testdata/basics/braces.carbon b/toolchain/format/testdata/basics/braces.carbon
@@ -0,0 +1,51 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// AUTOUPDATE
+// TIP: To test this file alone, run:
+// TIP:   bazel test //toolchain/testing:file_test --test_arg=--file_tests=toolchain/format/testdata/basics/braces.carbon
+// TIP: To dump output, run:
+// TIP:   bazel run //toolchain/testing:file_test -- --dump_output --file_tests=toolchain/format/testdata/basics/braces.carbon
+
+// --- test.carbon
+
+fn F() {
+
+}
+
+fn G() -> i32 {
+
+return 3;
+
+}
+
+
+fn H(x: i32, y: i32) -> i32 {
+
+var z: i32 = x + y;
+return z;
+
+}
+
+class C {
+    class D {
+        class E {}
+    }
+}
+
+// --- AUTOUPDATE-SPLIT
+
+// CHECK:STDOUT: fn F () {}
+// CHECK:STDOUT: fn G () -> i32 {
+// CHECK:STDOUT:   return 3;
+// CHECK:STDOUT: }
+// CHECK:STDOUT: fn H (x: i32, y: i32) -> i32 {
+// CHECK:STDOUT:   var z: i32 = x + y;
+// CHECK:STDOUT:   return z;
+// CHECK:STDOUT: }
+// CHECK:STDOUT: class C {
+// CHECK:STDOUT:   class D {
+// CHECK:STDOUT:     class E {}
+// CHECK:STDOUT:   }
+// CHECK:STDOUT: }
diff --git a/toolchain/format/testdata/basics/comments.carbon b/toolchain/format/testdata/basics/comments.carbon
@@ -31,23 +31,15 @@ class C {
 
 // --- AUTOUPDATE-SPLIT
 
-// CHECK:STDOUT:
 // CHECK:STDOUT: // A comment
-// CHECK:STDOUT:
-// CHECK:STDOUT: fn F ( ) { }
+// CHECK:STDOUT: fn F () {}
 // CHECK:STDOUT: // Another comment
-// CHECK:STDOUT:
-// CHECK:STDOUT:
 // CHECK:STDOUT: // Block
 // CHECK:STDOUT:   // comment
-// CHECK:STDOUT:
-// CHECK:STDOUT:  class C {
-// CHECK:STDOUT: // Internal comment
-// CHECK:STDOUT:
-// CHECK:STDOUT:  }
+// CHECK:STDOUT: class C {
+// CHECK:STDOUT:   // Internal comment
+// CHECK:STDOUT: }
 // CHECK:STDOUT: // Another
 // CHECK:STDOUT:   // Block
 // CHECK:STDOUT:   //
 // CHECK:STDOUT:   // Comment
-// CHECK:STDOUT:
-// CHECK:STDOUT:
diff --git a/toolchain/format/testdata/basics/empty.carbon b/toolchain/format/testdata/basics/empty.carbon
@@ -10,5 +10,3 @@
 
 // --- test.carbon
 // --- AUTOUPDATE-SPLIT
-
-// CHECK:STDOUT:
diff --git a/toolchain/format/testdata/basics/simple.carbon b/toolchain/format/testdata/basics/simple.carbon
@@ -10,8 +10,10 @@
 
 // --- test.carbon
 
-fn F(x: i32) -> i32 { return x; }
+fn  F  (  x  :  i32  )  ->  i32  {  return  x  ;  }
 
 // --- AUTOUPDATE-SPLIT
 
-// CHECK:STDOUT: fn F ( x : i32 ) -> i32 { return x ; }
+// CHECK:STDOUT: fn F (x: i32) -> i32 {
+// CHECK:STDOUT:   return x;
+// CHECK:STDOUT: }
Original file line number	Diff line number	Diff line change
Expand Up		@@ -10,5 +10,3 @@

		// --- test.carbon
		// --- AUTOUPDATE-SPLIT

		// CHECK:STDOUT: