From dc833c3d755ca22b26d142b6a5b1b05d500c8956 Mon Sep 17 00:00:00 2001 From: Mike Patrick Date: Fri, 29 Nov 2024 16:12:26 +0000 Subject: [PATCH 1/5] Break up long string into a heredoc I know heredocs are weird, but this is *probably* more readable. I always need to remind myself how the various flavours of heredoc actually behave. Squiggly heredocs (<<~) remove indentation from their content. It'll remove the same amount of indentation across all lines, equal to the indentation of its least-indented line. In this case, all of our lines start out indented equally, so all indentation ends up removed from the resulting string. Also worth noting: the replacement of newline chars in the old string with actual line breaks in the heredoc. The tests pass, so I hope that that means the output is identical in all cases that we care about. --- lib/govspeak.rb | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/lib/govspeak.rb b/lib/govspeak.rb index 16890ed..626dd63 100644 --- a/lib/govspeak.rb +++ b/lib/govspeak.rb @@ -302,7 +302,12 @@ def render_image(image) end extension("address", surrounded_by("$A")) do |body| - %(\n

\n#{body.sub("\n", '').gsub("\n", '
')}\n

\n) + <<~BODY + +

+ #{body.sub("\n", '').gsub("\n", '
')} +

+ BODY end extension("legislative list", /#{NEW_PARAGRAPH_LOOKBEHIND}\$LegislativeList\s*$(.*?)\$EndLegislativeList/m) do |body| From fd30721010cd5df0e37f395ac284939303c85f8b Mon Sep 17 00:00:00 2001 From: Mike Patrick Date: Fri, 29 Nov 2024 16:29:41 +0000 Subject: [PATCH 2/5] Strip trailing slashes and match \r\n line breaks Unfortunately this commit introduces two changes because I haven't figured out how to make them independently of each other: * Strip trailing backslashes from the lines of an address * When replacing line breaks with "
"s in the parsing of address blocks, replace the whole "\r\n" line break and not just the "\n" character _Strip trailing backslashes from the lines of an address_ Kramdown has a feature where it will replace two trailing backslashes from the input Govspeak with a line break in the output HTML. I'm in the process of making some changes that will stop that feature from working inside address blocks. It doesn't appear to be used in the wild in any address blocks and we don't officially support the feature in address blocks, so I think that intentionally disabling it is reasonable. To be clear, while Kramdown only treats pairs of backslashes this way, we're actually removing all trailing backslashes because even an odd number of them interferes with our code that handles address blocks. _When replacing line breaks with "
"s in the parsing of address blocks, replace the whole "\r\n" line break and not just the "\n" character_ The line breaks in the Govspeak in Whitehall's database and in the strings that Govspeak Preview hands to the parser are composed of both a return char and a newline char. In the resulting HTML, address blocks contain the return chars still intact because the address block's search-and-replace code was only matching and removing the newline chars. The intact return chars might actually be serving the purpose of producing line breaks in our source HTML (i.e. not affecting the way the HTML is rendered in the browser, but admittedly producing more readable plain HTML). But if this is the case and if this turns out to have been desirable behaviour I'd prefer that we do that more deliberately. Since the example Govspeak in the test suite sometimes uses standalone newlines and sometimes uses combined returns-and-newlines, it seems safest to assume that both options are possible and so I've opted for optionally matching either. --- CHANGELOG.md | 6 ++++++ lib/govspeak.rb | 2 +- test/govspeak_test.rb | 19 +++++++++++++++++++ 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 677739d..38a3c78 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## Unreleased + +* Strip trailing backslashes from the lines of an address +* When replacing line breaks with "<br>"s in the parsing of address blocks, +replace the whole "\r\n" line break and not just the "\n" character + ## 8.7.0 * Allow data attributes in spans ([#364](https://github.com/alphagov/govspeak/pull/364)) diff --git a/lib/govspeak.rb b/lib/govspeak.rb index 626dd63..62f94c3 100644 --- a/lib/govspeak.rb +++ b/lib/govspeak.rb @@ -305,7 +305,7 @@ def render_image(image) <<~BODY

- #{body.sub("\n", '').gsub("\n", '
')} + #{body.sub(/\r?\n/, '').gsub(/\\*\r?\n/, '
')}

BODY end diff --git a/test/govspeak_test.rb b/test/govspeak_test.rb index 823b4c9..f3c8839 100644 --- a/test/govspeak_test.rb +++ b/test/govspeak_test.rb @@ -109,6 +109,25 @@ class GovspeakTest < Minitest::Test assert_equal %(\n

\n123 Test Street
Testcase Cliffs
Teston
0123 456 7890 \n

\n), doc.to_html end + test "newlines in address block content are replaced with
s" do + input = %($A\n123 Test Street\nTestcase Cliffs\nTeston\n0123 456 7890\n$A) + doc = Govspeak::Document.new(input) + assert_equal %(\n

\n123 Test Street
Testcase Cliffs
Teston
0123 456 7890
\n

\n), doc.to_html + end + + test "combined return-and-newlines in address block content are replaced with
s" do + input = %($A\r\n123 Test Street\r\nTestcase Cliffs\r\nTeston\r\n0123 456 7890\r\n$A) + doc = Govspeak::Document.new(input) + assert_equal %(\n

\n123 Test Street
Testcase Cliffs
Teston
0123 456 7890
\n

\n), doc.to_html + end + + test "trailing backslashes are stripped from address block content" do + # two trailing backslashes would normally be converted into a line break by Kramdown + input = %($A\r\n123 Test Street\\\r\nTestcase Cliffs\\\nTeston\\\\\r\n0123 456 7890\\\\\n$A) + doc = Govspeak::Document.new(input) + assert_equal %(\n

\n123 Test Street
Testcase Cliffs
Teston
0123 456 7890
\n

\n), doc.to_html + end + test "should convert barchart" do input = <<~GOVSPEAK |col| From 659de00ebcd653fe51cdbbe0084532588877b91e Mon Sep 17 00:00:00 2001 From: Mike Patrick Date: Mon, 2 Dec 2024 16:18:06 +0000 Subject: [PATCH 3/5] Make line break removal more targetted This `sub` was present in the very first commit that introduced the address block extension and there's no explanation of what it's specifically intended to achieve. I've interpreted it as "if there's a leading line break, remove it", as opposed to "remove the first occurrence of a line break regardless of where it appears in the input string". I could have missed something. This change makes the intention that I've assumed was there clearer to readers and prevents any other line break being removed unintentionally if something did ever change about the expected format of the input. --- CHANGELOG.md | 2 ++ lib/govspeak.rb | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 38a3c78..49a277a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ * Strip trailing backslashes from the lines of an address * When replacing line breaks with "<br>"s in the parsing of address blocks, replace the whole "\r\n" line break and not just the "\n" character +* Only strip a leading line break from an address block, not just any old first +occurrence of a line break ## 8.7.0 diff --git a/lib/govspeak.rb b/lib/govspeak.rb index 62f94c3..b203c05 100644 --- a/lib/govspeak.rb +++ b/lib/govspeak.rb @@ -305,7 +305,7 @@ def render_image(image) <<~BODY

- #{body.sub(/\r?\n/, '').gsub(/\\*\r?\n/, '
')} + #{body.lstrip.gsub(/\\*\r?\n/, '
')}

BODY end From 210ac6dff57fa0f959434580a2dac98c99e3aa7f Mon Sep 17 00:00:00 2001 From: Mike Patrick Date: Fri, 6 Dec 2024 16:59:51 +0000 Subject: [PATCH 4/5] Strip trailing space from the lines of an address There's a feature in the Kramdown parser (i.e. the library that the Govspeak parser is built on top of) where two or more trailing spaces at the end of a line of Govspeak will be converted into an additional line break in the output HTML. Until recently, the contents of address blocks weren't "reparsed" in the Govspeak parser, meaning that they weren't subject to this Kramdown feature. But the Govspeak parser code was recently updated and the contents of address blocks are now reparsed. This has resulted in unintentional additional line spacing being made visible in some addresses that have been rendered by this new version Govspeak. A typical example of the issue looks like: | Name | | Street | | Town | | Postcode | where it would've previously looked like: | Name | Street | Town | Postcode By stripping the trailing space from Govspeak lines within address blocks, we effectively disable the Kramdown feature and prevent unintentional line breaks in addresses going forward. BTW, it's still straightforward to introduce additional line breaks deliberately. _A note on the move to handling the last line of the address separately from the rest:_ In Govspeak, address blocks don't necessarily have a line break between the last line of the address and its closing tag. Until now, where the input Govspeak did have a line break, the output HTML replaced it with a
. When there wasn't a line break, the output had no
. This difference didn't seem to me to have any impact on the final rendered page in the browser, so I've gone ahead and stopped those
s from appearing (as can be seen in the changes we've made here to existing tests). So now, while trailing spaces and backslashes are being stripped from all address lines, in addition to that, for the very last line of the address, trailing line break characters are also being stripped. --- CHANGELOG.md | 1 + lib/govspeak.rb | 2 +- test/govspeak_test.rb | 28 +++++++++++++++++++++------- 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 49a277a..d66ad4b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ replace the whole "\r\n" line break and not just the "\n" character * Only strip a leading line break from an address block, not just any old first occurrence of a line break +* Strip trailing whitespace from the lines of an address ## 8.7.0 diff --git a/lib/govspeak.rb b/lib/govspeak.rb index b203c05..8d36123 100644 --- a/lib/govspeak.rb +++ b/lib/govspeak.rb @@ -305,7 +305,7 @@ def render_image(image) <<~BODY

- #{body.lstrip.gsub(/\\*\r?\n/, '
')} + #{body.lstrip.sub(/[\s\\]*\z/, '').gsub(/[ \\]*\r?\n/, '
')}

BODY end diff --git a/test/govspeak_test.rb b/test/govspeak_test.rb index f3c8839..ddc344c 100644 --- a/test/govspeak_test.rb +++ b/test/govspeak_test.rb @@ -106,26 +106,40 @@ class GovspeakTest < Minitest::Test Teston 0123 456 7890 $A ) doc = Govspeak::Document.new(input) - assert_equal %(\n

\n123 Test Street
Testcase Cliffs
Teston
0123 456 7890 \n

\n), doc.to_html + assert_equal %(\n

\n123 Test Street
Testcase Cliffs
Teston
0123 456 7890\n

\n), doc.to_html end test "newlines in address block content are replaced with
s" do input = %($A\n123 Test Street\nTestcase Cliffs\nTeston\n0123 456 7890\n$A) doc = Govspeak::Document.new(input) - assert_equal %(\n

\n123 Test Street
Testcase Cliffs
Teston
0123 456 7890
\n

\n), doc.to_html + assert_equal %(\n

\n123 Test Street
Testcase Cliffs
Teston
0123 456 7890\n

\n), doc.to_html end test "combined return-and-newlines in address block content are replaced with
s" do input = %($A\r\n123 Test Street\r\nTestcase Cliffs\r\nTeston\r\n0123 456 7890\r\n$A) doc = Govspeak::Document.new(input) - assert_equal %(\n

\n123 Test Street
Testcase Cliffs
Teston
0123 456 7890
\n

\n), doc.to_html + assert_equal %(\n

\n123 Test Street
Testcase Cliffs
Teston
0123 456 7890\n

\n), doc.to_html end test "trailing backslashes are stripped from address block content" do # two trailing backslashes would normally be converted into a line break by Kramdown input = %($A\r\n123 Test Street\\\r\nTestcase Cliffs\\\nTeston\\\\\r\n0123 456 7890\\\\\n$A) doc = Govspeak::Document.new(input) - assert_equal %(\n

\n123 Test Street
Testcase Cliffs
Teston
0123 456 7890
\n

\n), doc.to_html + assert_equal %(\n

\n123 Test Street
Testcase Cliffs
Teston
0123 456 7890\n

\n), doc.to_html + end + + test "trailing spaces are stripped from address block content" do + # two trailing spaces would normally be converted into a line break by Kramdown + input = %($A\r\n123 Test Street \r\nTestcase Cliffs \nTeston \r\n0123 456 7890 \n$A) + doc = Govspeak::Document.new(input) + assert_equal %(\n

\n123 Test Street
Testcase Cliffs
Teston
0123 456 7890\n

\n), doc.to_html + end + + test "trailing backslashes and spaces are stripped from address block content" do + # two trailing backslashes or two trailing spaces would normally be converted into a line break by Kramdown + input = %($A\r\n123 Test Street \\\r\nTestcase Cliffs\\ \nTeston\\\\ \r\n0123 456 7890 \\\\\n$A) + doc = Govspeak::Document.new(input) + assert_equal %(\n

\n123 Test Street
Testcase Cliffs
Teston
0123 456 7890\n

\n), doc.to_html end test "should convert barchart" do @@ -160,7 +174,7 @@ class GovspeakTest < Minitest::Test Teston 0123 456 7890 $A) doc = Govspeak::Document.new(input) - assert_equal %(

Paragraph1

\n\n

\n123 Test Street
Testcase Cliffs
Teston
0123 456 7890 \n

\n), doc.to_html + assert_equal %(

Paragraph1

\n\n

\n123 Test Street
Testcase Cliffs
Teston
0123 456 7890\n

\n), doc.to_html end test_given_govspeak("^ I am very informational ^") do @@ -407,7 +421,7 @@ class GovspeakTest < Minitest::Test $A" do assert_html_output %(

- street
road
+ street
road

) assert_text_output "street road" end @@ -421,7 +435,7 @@ class GovspeakTest < Minitest::Test *[ACRONYM]: This is the acronym explanation" do assert_html_output %(

- street with ACRONYM
road
+ street with ACRONYM
road

) end From 9c17377dc6c014937dcba39bc4cda17a83fa4b96 Mon Sep 17 00:00:00 2001 From: Mike Patrick Date: Mon, 2 Dec 2024 16:16:57 +0000 Subject: [PATCH 5/5] Bump minor version to 8.8.0 Mostly stripping trailing whitespace from lines within an address block, to prevent Kramdown from converting them into unintended additional line breaks. --- CHANGELOG.md | 10 +++++----- lib/govspeak/version.rb | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d66ad4b..47b1249 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,13 +1,13 @@ # Changelog -## Unreleased +## 8.8.0 -* Strip trailing backslashes from the lines of an address +* Strip trailing backslashes from the lines of an address ([#371](https://github.com/alphagov/govspeak/pull/371)) * When replacing line breaks with "<br>"s in the parsing of address blocks, -replace the whole "\r\n" line break and not just the "\n" character +replace the whole "\r\n" line break and not just the "\n" character ([#371](https://github.com/alphagov/govspeak/pull/371)) * Only strip a leading line break from an address block, not just any old first -occurrence of a line break -* Strip trailing whitespace from the lines of an address +occurrence of a line break ([#371](https://github.com/alphagov/govspeak/pull/371)) +* Strip trailing whitespace from the lines of an address ([#371](https://github.com/alphagov/govspeak/pull/371)) ## 8.7.0 diff --git a/lib/govspeak/version.rb b/lib/govspeak/version.rb index 41b5149..f962661 100644 --- a/lib/govspeak/version.rb +++ b/lib/govspeak/version.rb @@ -1,3 +1,3 @@ module Govspeak - VERSION = "8.7.0".freeze + VERSION = "8.8.0".freeze end