From e77bcff6d7d2e6dab0440881d03f6c7db6e47372 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?V=C3=ADt=20Star=C3=BD=20Novotn=C3=BD?= <witiko@mail.muni.cz>
Date: Wed, 3 Apr 2024 21:30:00 +0200
Subject: [PATCH] Define `parsers.punctuation` in a streaming fashion

Since #416, we have first read all Unicode punctuation characters to a
table `punctuation`, then defined `parsers.punctuation` using the table
`punctuation`, and then we deleted the table `punctuation`. Since #416,
we have also been experiencing steady out-of-memory issues with our
capybara runner, as discussed with @TeXhackse earlier today.

I have disabled capybara, since it's been having intermittent
out-of-memory issues ever since Markdown 3.0.0 and its speed has also
lately been an issue. Nevertheless, this indicates a potential cost of
the current approach, which may eventually impact our users as well.

This PR removes the table `punctuation` and defines `parsers.punctuation`
directly from the file `UnicodeData.txt` without any intermediate data
structure. This should alleviate any memory issues caused by #416.
---
 markdown.dtx | 84 +++++++++++++++++++---------------------------------
 1 file changed, 31 insertions(+), 53 deletions(-)

diff --git a/markdown.dtx b/markdown.dtx
index 6ea0a2db..29bac53f 100644
--- a/markdown.dtx
+++ b/markdown.dtx
@@ -24510,38 +24510,6 @@ end
 % \par
 % \begin{markdown}
 %
-%### Unicode punctuation
-% This section documents [the Unicode punctuation][unicode-punctuation]
-% recognized by the markdown reader. The punctuation is organized in the
-% \luamdef{punctuation} table according to the number of bytes occupied after
-% conversion to \acro{utf}8.
-%
-% [unicode-punctuation]: https://spec.commonmark.org/0.31.2/#unicode-punctuation-character
-%                        (CommonMark Spec, Version 0.31.2 (2024-01-28))
-%
-% \end{markdown}
-%  \begin{macrocode}
-local punctuation = {}
-(function()
-  local pathname = kpse.lookup("UnicodeData.txt")
-  local file = assert(io.open(pathname, "r"),
-    [[Could not open file "UnicodeData.txt"]])
-  for line in file:lines() do
-    local codepoint, major_category = line:match("^(%x+);[^;]*;(%a)")
-    if major_category == "P" or major_category == "S" then
-      local code = unicode.utf8.char(tonumber(codepoint, 16))
-      if punctuation[#code] == nil then
-        punctuation[#code] = {}
-      end
-      table.insert(punctuation[#code], code)
-    end
-  end
-  assert(file:close())
-end)()
-%    \end{macrocode}
-% \par
-% \begin{markdown}
-%
 %### Plain \TeX{} Writer {#tex-writer}
 %
 % This section documents the \luamref{writer} object, which implements the
@@ -25809,36 +25777,46 @@ parsers.fail                   = P(false)
 
 parsers.internal_punctuation   = S(":;,.?")
 parsers.ascii_punctuation      = S("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~")
+%    \end{macrocode}
+% \par
+% \begin{markdown}
+%
+%### Unicode punctuation
+% This section documents [the Unicode punctuation][unicode-punctuation]
+% recognized by the markdown reader. For each length in bytes after conversion
+% to \acro{utf}8, the \luamdef{parsers.punctuation} table contains a parser
+% that recognizes any Unicode punctuation character of that length.
+%
+% [unicode-punctuation]: https://spec.commonmark.org/0.31.2/#unicode-punctuation-character
+%                        (CommonMark Spec, Version 0.31.2 (2024-01-28))
+%
+% \end{markdown}
+%  \begin{macrocode}
 parsers.punctuation            = {}
 (function()
-  for size = 1, 4 do
-    local codepoint_parser = parsers.fail
-    if size == 1 then
-      codepoint_parser = codepoint_parser + parsers.ascii_punctuation
-    end
-    for _, code in ipairs(punctuation[size] or {}) do
+  local pathname = kpse.lookup("UnicodeData.txt")  -- NOTE(review): may be nil if the file is missing; confirm
+  local file = assert(io.open(pathname, "r"),  -- raise if the file cannot be opened
+    [[Could not open file "UnicodeData.txt"]])
+  for line in file:lines() do  -- stream the file one record at a time
+    local codepoint, major_category = line:match("^(%x+);[^;]*;(%a)")  -- 1st field (hex), 1st letter of 3rd field
+    if major_category == "P" or major_category == "S" then  -- keep only categories starting with P or S
+      local code = unicode.utf8.char(tonumber(codepoint, 16))  -- hex code point to its encoded string
+      if parsers.punctuation[#code] == nil then  -- first character seen with this byte length
+        parsers.punctuation[#code] = parsers.fail  -- start from a parser that never matches
+      end
       local code_parser = parsers.succeed
-      assert(#code == size)
-      for i = 1, size do
+      for i = 1, #code do  -- one single-byte parser per byte of the character
         local byte = code:sub(i, i)
         local byte_parser = S(byte)
-        code_parser = code_parser * byte_parser
+        code_parser = code_parser  -- sequence: all bytes must match in order
+                    * byte_parser
       end
-      codepoint_parser = codepoint_parser + code_parser
+      parsers.punctuation[#code] = parsers.punctuation[#code]  -- add this character as another alternative
+                                 + code_parser
     end
-    parsers.punctuation[size] = codepoint_parser
   end
+  assert(file:close())  -- raise if closing the file fails
 end)()
-%    \end{macrocode}
-% \par
-% \begin{markdown}
-%
-% Here, we garbage-collect the \luamref{punctuation} table, since we won't need it anymore.
-%
-% \end{markdown}
-%  \begin{macrocode}
-punctuation = nil
-collectgarbage("collect")
 
 parsers.escapable              = parsers.ascii_punctuation
 parsers.anyescaped             = parsers.backslash / "" * parsers.escapable