From e77bcff6d7d2e6dab0440881d03f6c7db6e47372 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Star=C3=BD=20Novotn=C3=BD?= Date: Wed, 3 Apr 2024 21:30:00 +0200 Subject: [PATCH] Define `parsers.punctuation` in a streaming fashion Since #416, we have first read all Unicode punctuation characters to a table `punctuation`, then defined `parsers.punctuation` using the table `punctuation`, and then we deleted the table `punctuation`. Since #416, we have also been experiencing steady out-of-memory issues with our capybara runner, as discussed with @TeXhackse earlier today. I have disabled capybara, since it's been having intermittent out-of-memory issues ever since Markdown 3.0.0 and its speed has also lately been an issue. Nevertheless, this indicates a potential cost of the current approach, which may eventually impact our users as well. This PR removes the table `punctuation` and defines `parsers.punctuation` directly from the file `UnicodeData.txt` without any intermediate data structure. This should alleviate any memory issues caused by #416. --- markdown.dtx | 84 +++++++++++++++++++--------------------------------- 1 file changed, 31 insertions(+), 53 deletions(-) diff --git a/markdown.dtx b/markdown.dtx index 6ea0a2db..29bac53f 100644 --- a/markdown.dtx +++ b/markdown.dtx @@ -24510,38 +24510,6 @@ end % \par % \begin{markdown} % -%### Unicode punctuation -% This section documents [the Unicode punctuation][unicode-punctuation] -% recognized by the markdown reader. The punctuation is organized in the -% \luamdef{punctuation} table according to the number of bytes occupied after -% conversion to \acro{utf}8. 
-% -% [unicode-punctuation]: https://spec.commonmark.org/0.31.2/#unicode-punctuation-character -% (CommonMark Spec, Version 0.31.2 (2024-01-28)) -% -% \end{markdown} -% \begin{macrocode} -local punctuation = {} -(function() - local pathname = kpse.lookup("UnicodeData.txt") - local file = assert(io.open(pathname, "r"), - [[Could not open file "UnicodeData.txt"]]) - for line in file:lines() do - local codepoint, major_category = line:match("^(%x+);[^;]*;(%a)") - if major_category == "P" or major_category == "S" then - local code = unicode.utf8.char(tonumber(codepoint, 16)) - if punctuation[#code] == nil then - punctuation[#code] = {} - end - table.insert(punctuation[#code], code) - end - end - assert(file:close()) -end)() -% \end{macrocode} -% \par -% \begin{markdown} -% %### Plain \TeX{} Writer {#tex-writer} % % This section documents the \luamref{writer} object, which implements the @@ -25809,36 +25777,46 @@ parsers.fail = P(false) parsers.internal_punctuation = S(":;,.?") parsers.ascii_punctuation = S("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~") +% \end{macrocode} +% \par +% \begin{markdown} +% +%### Unicode punctuation +% This section documents [the Unicode punctuation][unicode-punctuation] +% recognized by the markdown reader. The punctuation is organized in the +% \luamdef{parsers.punctuation} table according to the number of bytes occupied +% after conversion to \acro{utf}8. 
+% +% [unicode-punctuation]: https://spec.commonmark.org/0.31.2/#unicode-punctuation-character +% (CommonMark Spec, Version 0.31.2 (2024-01-28)) +% +% \end{markdown} +% \begin{macrocode} parsers.punctuation = {} (function() - for size = 1, 4 do - local codepoint_parser = parsers.fail - if size == 1 then - codepoint_parser = codepoint_parser + parsers.ascii_punctuation - end - for _, code in ipairs(punctuation[size] or {}) do + local pathname = kpse.lookup("UnicodeData.txt") + local file = assert(io.open(pathname, "r"), + [[Could not open file "UnicodeData.txt"]]) + for line in file:lines() do + local codepoint, major_category = line:match("^(%x+);[^;]*;(%a)") + if major_category == "P" or major_category == "S" then + local code = unicode.utf8.char(tonumber(codepoint, 16)) + if parsers.punctuation[#code] == nil then + parsers.punctuation[#code] = parsers.fail + end local code_parser = parsers.succeed - assert(#code == size) - for i = 1, size do + for i = 1, #code do local byte = code:sub(i, i) local byte_parser = S(byte) - code_parser = code_parser * byte_parser + code_parser = code_parser + * byte_parser end - codepoint_parser = codepoint_parser + code_parser + parsers.punctuation[#code] = parsers.punctuation[#code] + + code_parser end - parsers.punctuation[size] = codepoint_parser end + assert(file:close()) end)() -% \end{macrocode} -% \par -% \begin{markdown} -% -% Here, we garbage-collect the \luamref{punctuation} table, since we won't need it anymore. -% -% \end{markdown} -% \begin{macrocode} -punctuation = nil -collectgarbage("collect") parsers.escapable = parsers.ascii_punctuation parsers.anyescaped = parsers.backslash / "" * parsers.escapable