fix #419: handle multiple c-style comments

tconbeer · May 31, 2023 · 80b3563 · 80b3563
1 parent 6485747
commit 80b3563
Show file tree

Hide file tree

Showing 14 changed files with 131 additions and 30 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
 
 ## [Unreleased]
 
+-   fixes a bug where multiple c-style comments (e.g., `/* comment */`) on a single line would cause sqlfmt
+    to not include all comments in formatted output ([#419](https://github.com/tconbeer/sqlfmt/issues/419) - thank you [@aersam](https://github.com/aersam)!)
+-   adds a safety check to ensure comments are preserved in formatted output
+
 ## [0.18.2] - 2023-05-31
 
 -   fixes a bug where specifying both relative and absolute paths would cause sqlfmt to crash ([#426](https://github.com/tconbeer/sqlfmt/issues/426) - thank you for the issue and fix, [@smcgivern](https://github.com/smcgivern)!)

diff --git a/src/sqlfmt/actions.py b/src/sqlfmt/actions.py
@@ -80,7 +80,11 @@ def add_comment_to_buffer(
     """
     token = Token.from_match(source_string, match, TokenType.COMMENT)
     is_standalone = (not bool(analyzer.node_buffer)) or "\n" in token.token
-    comment = Comment(token=token, is_standalone=is_standalone)
+    comment = Comment(
+        token=token,
+        is_standalone=is_standalone,
+        previous_node=analyzer.previous_node,
+    )
     analyzer.comment_buffer.append(comment)
     analyzer.pos = token.epos
 

diff --git a/src/sqlfmt/api.py b/src/sqlfmt/api.py
@@ -361,3 +361,40 @@ def _perform_safety_check(analyzer: Analyzer, raw_query: Query, result: str) ->
             f"token at position {mismatch_pos}: raw: {mismatch_raw}; "
             f"result: {mismatch_res}."
         )
+
+    raw_comments = [
+        comment.token.token for line in raw_query.lines for comment in line.comments
+    ]
+    result_comments = [
+        comment.token.token for line in result_query.lines for comment in line.comments
+    ]
+    stripped_raw = "".join(
+        ["".join(c.replace("--", "").replace("#", "").split()) for c in raw_comments]
+    )
+    stripped_res = "".join(
+        ["".join(c.replace("--", "").replace("#", "").split()) for c in result_comments]
+    )
+    try:
+        assert stripped_raw == stripped_res
+    except AssertionError:
+        raw_len = len(stripped_raw)
+        result_len = len(stripped_res)
+        mismatch_pos = 0
+        mismatch_raw = ""
+        mismatch_res = ""
+
+        for i, (raw, res) in enumerate(zip_longest(stripped_raw, stripped_res)):
+            if raw is not res:
+                mismatch_pos = i
+                mismatch_raw = stripped_raw[i : i + 25]
+                mismatch_res = stripped_res[i : i + 25]
+                break
+
+        raise SqlfmtEquivalenceError(
+            "There was a problem formatting your query that "
+            "caused the safety check to fail. Please open an "
+            f"issue. Raw query had {raw_len} comment characters; formatted "
+            f"query had {result_len} comment characters. First mismatching "
+            f"character at position {mismatch_pos}: raw: {mismatch_raw}; "
+            f"result: {mismatch_res}."
+        )
diff --git a/src/sqlfmt/comment.py b/src/sqlfmt/comment.py
@@ -1,7 +1,8 @@
 import re
 from dataclasses import dataclass
-from typing import ClassVar, Iterator, Tuple
+from typing import ClassVar, Iterator, Optional, Tuple
 
+from sqlfmt.node import Node
 from sqlfmt.token import Token
 
 
@@ -14,6 +15,7 @@ class Comment:
 
     token: Token
     is_standalone: bool
+    previous_node: Optional[Node]
     comment_marker: ClassVar[re.Pattern] = re.compile(r"(--|#|/\*|\{#-?)([^\S\n]*)")
 
     def __str__(self) -> str:
@@ -67,16 +69,19 @@ def is_multiline(self) -> bool:
         """
         return "\n" in self.token.token
 
+    @property
+    def is_c_style(self) -> bool:
+        return self.token.token.startswith("/*")
+
     @property
     def is_inline(self) -> bool:
-        return not self.is_standalone and not self.is_multiline
+        return not self.is_standalone and not self.is_multiline and not self.is_c_style
 
     def render_inline(self) -> str:
         """
         Renders a comment as an inline comment, assuming it'll fit.
         """
-        inline_prefix = " " * 2
-        return f"{inline_prefix}{self}"
+        return f"{self}"
 
     def render_standalone(self, max_length: int, prefix: str) -> str:
         """

diff --git a/src/sqlfmt/line.py b/src/sqlfmt/line.py
@@ -105,18 +105,17 @@ def render_with_comments(self, max_length: int) -> str:
         """
         content = str(self).rstrip()
         rendered_lines: List[str] = []
+        inline_comments: List[str] = []
         for comment in self.comments:
             if comment.is_multiline or comment.is_standalone:
                 rendered_lines.append(
                     comment.render_standalone(max_length=max_length, prefix=self.prefix)
                 )
             else:
-                # comment is potentially an inline comment, and we'll handle
-                # that below. Inline comments must be the last comment
-                pass
+                inline_comments.append(comment.render_inline())
 
-        if self.has_inline_comment:
-            rendered_lines.append(f"{content}{self.comments[-1].render_inline()}")
+        if inline_comments:
+            rendered_lines.append(f"{content}  {' '.join(inline_comments)}")
         else:
             rendered_lines.append(f"{self}")
 

diff --git a/src/sqlfmt/merger.py b/src/sqlfmt/merger.py
@@ -77,7 +77,14 @@ def _extract_components(
                     raise CannotMergeException(
                         "Can't merge lines with inline comments and other comments"
                     )
-                elif len(line.comments) == 1 and line.has_inline_comment:
+                elif (
+                    len(line.comments) == 1
+                    and len(line.nodes) > 1
+                    and (
+                        line.comments[0].is_inline
+                        or line.comments[0].previous_node == line.nodes[-2]
+                    )
+                ):
                     # this is a comment that must be rendered inline,
                     # so it'll probably block merging unless the
                     # next line is just a comma

diff --git a/src/sqlfmt/splitter.py b/src/sqlfmt/splitter.py
@@ -157,10 +157,21 @@ def split_at_index(
                 # the only thing after the split is an operator + \n, so keep the
                 # comments with the stuff before the operator
                 head_comments, tail_comments = comments, []
-            elif comments[-1].is_inline:
-                head_comments, tail_comments = comments[:-1], [comments[-1]]
             else:
-                head_comments, tail_comments = comments, []
+                head_comments, tail_comments = [], []
+                for comment in comments:
+                    if (
+                        comment.is_standalone
+                        or comment.is_multiline
+                        or comment.previous_node is None
+                    ):
+                        head_comments.append(comment)
+                    elif comment.is_inline:
+                        tail_comments.append(comment)
+                    elif comment.is_c_style and comment.previous_node in new_nodes:
+                        head_comments.append(comment)
+                    else:
+                        tail_comments.append(comment)
 
         else:  # no comments
             head_comments, tail_comments = [], []

diff --git a/tests/data/unformatted/101_multiline.sql b/tests/data/unformatted/101_multiline.sql
@@ -67,5 +67,5 @@ with
  # that we will also handle.
 #}
 select *
-from renamed
-where true  /* what!?! */
+from renamed  /* what!?! */
+where true
diff --git a/tests/data/unformatted/218_multiple_c_comments.sql b/tests/data/unformatted/218_multiple_c_comments.sql
@@ -0,0 +1,15 @@
+SELECT t.customer_sk AS `Customer_IdDWH`, t.customer_id AS `Customer_IdOrig`, t.__identity AS `CustomerCreditLimit_Agg_IdDWH`, t.kreditlimit_intern /* not existing, proposal: credit_limit_intern (92%) */ AS `KreditlimitIntern`, t.kreditlimit_versichert /* not existing */ AS `KreditlimitVersichert`, t.valid_from /* not existing */ AS `ValidFrom`, t.valid_to /* not existing */ AS `ValidTo` FROM `gold`.`dim_customer_credit_limit_agg` AS t
+)))))__SQLFMT_OUTPUT__(((((
+select
+    t.customer_sk as `Customer_IdDWH`,
+    t.customer_id as `Customer_IdOrig`,
+    t.__identity as `CustomerCreditLimit_Agg_IdDWH`,
+    t.kreditlimit_intern  /* not existing, proposal: credit_limit_intern (92%) */
+    as `KreditlimitIntern`,
+    t.kreditlimit_versichert  /* not existing */
+    as `KreditlimitVersichert`,
+    t.valid_from  /* not existing */
+    as `ValidFrom`,
+    t.valid_to  /* not existing */
+    as `ValidTo`
+from `gold`.`dim_customer_credit_limit_agg` as t
diff --git a/tests/functional_tests/test_general_formatting.py b/tests/functional_tests/test_general_formatting.py
@@ -60,6 +60,7 @@
         "unformatted/215_gitlab_get_backup_table_command.sql",
         "unformatted/216_gitlab_zuora_revenue_revenue_contract_line_source.sql",
         "unformatted/217_dbt_unit_testing_csv.sql",
+        "unformatted/218_multiple_c_comments.sql",
         "unformatted/300_jinjafmt.sql",
         "unformatted/400_create_fn_and_select.sql",
         "unformatted/401_explain_select.sql",

diff --git a/tests/unit_tests/test_analyzer.py b/tests/unit_tests/test_analyzer.py
@@ -185,6 +185,7 @@ def test_multiline_parsing(default_analyzer: Analyzer) -> None:
                 epos=310,
             ),
             is_standalone=True,
+            previous_node=q.nodes[2],
         ),
         Comment(
             token=Token(
@@ -198,6 +199,7 @@ def test_multiline_parsing(default_analyzer: Analyzer) -> None:
                 epos=499,
             ),
             is_standalone=True,
+            previous_node=q.nodes[17],
         ),
         Comment(
             token=Token(
@@ -211,6 +213,7 @@ def test_multiline_parsing(default_analyzer: Analyzer) -> None:
                 epos=837,
             ),
             is_standalone=True,
+            previous_node=q.nodes[38],
         ),
         Comment(
             token=Token(
@@ -221,6 +224,7 @@ def test_multiline_parsing(default_analyzer: Analyzer) -> None:
                 epos=874,
             ),
             is_standalone=False,
+            previous_node=q.nodes[42],
         ),
     ]
 

diff --git a/tests/unit_tests/test_api.py b/tests/unit_tests/test_api.py
@@ -352,9 +352,19 @@ def test_perform_safety_check(default_mode: Mode) -> None:
         "result: TokenType.NAME." in str(excinfo.value)
     )
 
+    with pytest.raises(SqlfmtEquivalenceError) as excinfo:
+        # adds a comment
+        _perform_safety_check(
+            analyzer, raw_query, "select\n-- new comment\n    1, 2, 3\n"
+        )
+
+    assert (
+        "Raw query had 0 comment characters; formatted query had 10 comment characters"
+        in str(excinfo.value)
+    )
+
     # does not raise
     _perform_safety_check(analyzer, raw_query, "select\n    1, 2, 3\n")
-    _perform_safety_check(analyzer, raw_query, "select\n-- new comment\n    1, 2, 3\n")
 
 
 @pytest.mark.parametrize(

diff --git a/tests/unit_tests/test_comment.py b/tests/unit_tests/test_comment.py
@@ -11,7 +11,7 @@ def short_comment() -> Comment:
     t = Token(
         type=TokenType.COMMENT, prefix=" ", token="-- short comment", spos=0, epos=16
     )
-    comment = Comment(t, is_standalone=False)
+    comment = Comment(t, is_standalone=False, previous_node=None)
     return comment
 
 
@@ -20,7 +20,7 @@ def short_mysql_comment() -> Comment:
     t = Token(
         type=TokenType.COMMENT, prefix=" ", token="# short comment", spos=0, epos=15
     )
-    comment = Comment(t, is_standalone=False)
+    comment = Comment(t, is_standalone=False, previous_node=None)
     return comment
 
 
@@ -29,7 +29,7 @@ def nospace_comment() -> Comment:
     t = Token(
         type=TokenType.COMMENT, prefix=" ", token="--short comment", spos=0, epos=15
     )
-    comment = Comment(t, is_standalone=False)
+    comment = Comment(t, is_standalone=False, previous_node=None)
     return comment
 
 
@@ -38,7 +38,7 @@ def standalone_comment() -> Comment:
     t = Token(
         type=TokenType.COMMENT, prefix=" ", token="-- short comment", spos=0, epos=16
     )
-    comment = Comment(t, is_standalone=True)
+    comment = Comment(t, is_standalone=True, previous_node=None)
     return comment
 
 
@@ -47,7 +47,7 @@ def multiline_comment() -> Comment:
     t = Token(
         type=TokenType.COMMENT, prefix=" ", token="/*\ncomment\n*/", spos=0, epos=15
     )
-    comment = Comment(t, is_standalone=True)
+    comment = Comment(t, is_standalone=True, previous_node=None)
     return comment
 
 
@@ -82,7 +82,7 @@ def test_str_len(
 def test_render_inline(
     short_comment: Comment, nospace_comment: Comment, standalone_comment: Comment
 ) -> None:
-    expected = "  -- short comment\n"
+    expected = "-- short comment\n"
     assert short_comment.render_inline() == expected
     assert nospace_comment.render_inline() == expected
 
@@ -107,7 +107,7 @@ def test_render_standalone(short_comment: Comment, prefix: str) -> None:
 def test_render_standalone_wrap_strip_whitespace() -> None:
     txt = "-- foo" + " " * 100 + "bar"
     t = Token(type=TokenType.COMMENT, prefix="", token=txt, spos=0, epos=len(txt))
-    comment = Comment(t, is_standalone=True)
+    comment = Comment(t, is_standalone=True, previous_node=None)
     assert comment.render_standalone(max_length=88, prefix="") == "-- foo\n-- bar\n"
 
 
@@ -135,7 +135,7 @@ def test_split_before(text: str, expected_splits: List[str]) -> None:
 
 def test_empty_comment() -> None:
     t = Token(type=TokenType.COMMENT, prefix=" ", token="-- ", spos=0, epos=3)
-    comment = Comment(t, is_standalone=True)
+    comment = Comment(t, is_standalone=True, previous_node=None)
     assert str(comment) == "--\n"
 
 
@@ -156,7 +156,7 @@ def test_no_wrap_long_jinja_comments() -> None:
         spos=0,
         epos=len(comment_str),
     )
-    comment = Comment(t, is_standalone=True)
+    comment = Comment(t, is_standalone=True, previous_node=None)
     rendered = comment.render_standalone(88, "")
 
     assert rendered == comment_str + "\n"
diff --git a/tests/unit_tests/test_line.py b/tests/unit_tests/test_line.py
@@ -210,14 +210,18 @@ def test_comment_rendering(
         epos=last_node.token.epos + 13,
     )
 
-    inline_comment = Comment(token=comment_token, is_standalone=False)
+    inline_comment = Comment(
+        token=comment_token, is_standalone=False, previous_node=last_node
+    )
     simple_line.comments = [inline_comment]
     expected_inline_render = (
         str(simple_line).rstrip() + "  " + normalized_comment + "\n"
     )
     assert simple_line.render_with_comments(88) == expected_inline_render
 
-    standalone_comment = Comment(token=comment_token, is_standalone=True)
+    standalone_comment = Comment(
+        token=comment_token, is_standalone=True, previous_node=None
+    )
     simple_line.comments = [standalone_comment]
     expected_standalone_render = (
         simple_line.prefix + normalized_comment + "\n" + str(simple_line)
@@ -234,7 +238,7 @@ def test_long_comment_wrapping(simple_line: Line) -> None:
         spos=last_node.token.epos,
         epos=last_node.token.epos + 13,
     )
-    comment = Comment(token=comment_token, is_standalone=True)
+    comment = Comment(token=comment_token, is_standalone=True, previous_node=None)
     simple_line.comments = [comment]
     expected_render = (
         "-- asdf asdf asdf asdf asdf asdf asdf asdf asdf asdf asdf asdf asdf asdf "
@@ -264,7 +268,7 @@ def test_long_comments_that_are_not_wrapped(
         spos=last_node.token.epos,
         epos=last_node.token.epos + 13,
     )
-    comment = Comment(token=comment_token, is_standalone=True)
+    comment = Comment(token=comment_token, is_standalone=True, previous_node=None)
     simple_line.comments = [comment]
     expected_render = raw_comment + "\nwith abc as (select * from my_table)\n"
     assert simple_line.render_with_comments(88) == expected_render