From 19c6fac8dc81b5f8a16894dec562edd28f318e6d Mon Sep 17 00:00:00 2001 From: Timon Van Overveldt Date: Tue, 6 Feb 2024 08:54:25 -0800 Subject: [PATCH] Update pulldown_cmark dep to v0.10, and add pulldown_cmark_escape dep. The pulldown_cmark escaping functionality is now shipped in a separate pulldown_cmark_escape crate (https://crates.io/crates/pulldown-cmark-escape), starting with v0.10.0. The markdown.rs module has to be adapted to a few API changes in pulldown_cmark, and we have to introduce explicit handling of alt text to ensure it continues to be properly escaped. There are also a few other behavior changes that are caught by the tests, but these actually seem to be desired, so I've updated the insta snapshot files for those tests to incorporate those changes. Specifically, one footnote-parsing case seems to be handled better now, and pulldown-cmark's `push_html` now doesn't escape quotes in text nodes anymore (see https://github.com/pulldown-cmark/pulldown-cmark/pull/836). --- Cargo.lock | 23 ++++- components/libs/Cargo.toml | 3 +- components/libs/src/lib.rs | 1 + components/markdown/src/markdown.rs | 88 +++++++++++++------ ...wn__all_markdown_features_integration.snap | 6 +- .../markdown__can_handle_heading_ids-2.snap | 8 +- .../markdown__can_handle_heading_ids.snap | 6 +- ...des__doesnt_render_ignored_shortcodes.snap | 6 +- 8 files changed, 93 insertions(+), 48 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 26050e06e7..0d90eea7c0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1736,7 +1736,8 @@ dependencies = [ "num-format", "once_cell", "percent-encoding", - "pulldown-cmark", + "pulldown-cmark 0.10.0", + "pulldown-cmark-escape", "quickxml_to_serde", "rayon", "regex", @@ -2775,6 +2776,24 @@ dependencies = [ "unicase", ] +[[package]] +name = "pulldown-cmark" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dce76ce678ffc8e5675b22aa1405de0b7037e2fdf8913fea40d1926c6fe1e6e7" +dependencies = [ + "bitflags 2.4.1", + "memchr", + "pulldown-cmark-escape", + "unicase", +] + +[[package]] +name = "pulldown-cmark-escape" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5d8f9aa0e3cbcfaf8bf00300004ee3b72f74770f9cbac93f6928771f613276b" + [[package]] name = "pure-rust-locales" version = "0.7.0" @@ -3341,7 +3360,7 @@ dependencies = [ "cargo_metadata", "error-chain", "glob", - "pulldown-cmark", + "pulldown-cmark 0.9.3", "tempfile", "walkdir", ] diff --git a/components/libs/Cargo.toml b/components/libs/Cargo.toml index 5f551ea663..9f6abf1d58 100644 --- a/components/libs/Cargo.toml +++ b/components/libs/Cargo.toml @@ -21,7 +21,8 @@ nom-bibtex = "0.5" num-format = "0.4" once_cell = "1" percent-encoding = "2" -pulldown-cmark = { version = "0.9", default-features = false, features = ["simd"] } +pulldown-cmark = { version = "0.10", default-features = false, features = ["html", "simd"] } +pulldown-cmark-escape = { version = "0.10", default-features = false } quickxml_to_serde = "0.5" rayon = "1" regex = "1" diff --git a/components/libs/src/lib.rs b/components/libs/src/lib.rs index d0772dc0a4..3992c2ed29 100644 --- a/components/libs/src/lib.rs +++ b/components/libs/src/lib.rs @@ -23,6 +23,7 @@ pub use num_format; pub use once_cell; pub use percent_encoding; pub use pulldown_cmark; +pub use pulldown_cmark_escape; pub use quickxml_to_serde; pub use rayon; pub use regex; diff --git a/components/markdown/src/markdown.rs b/components/markdown/src/markdown.rs index 4d30b2ac92..40c94afac5 100644 --- a/components/markdown/src/markdown.rs +++ b/components/markdown/src/markdown.rs @@ -4,19 +4,20 @@ use errors::bail; use libs::gh_emoji::Replacer as EmojiReplacer; use libs::once_cell::sync::Lazy; use libs::pulldown_cmark as cmark; +use libs::pulldown_cmark_escape as cmark_escape; use libs::tera; use utils::net::is_external_link; use crate::context::RenderContext; use errors::{Context, Error, Result}; -use libs::pulldown_cmark::escape::escape_html; +use libs::pulldown_cmark_escape::escape_html; use libs::regex::{Regex, RegexBuilder}; use utils::site::resolve_internal_link; use utils::slugs::slugify_anchors; use utils::table_of_contents::{make_table_of_contents, Heading}; use utils::types::InsertAnchor; -use self::cmark::{Event, LinkType, Options, Parser, Tag}; +use self::cmark::{Event, LinkType, Options, Parser, Tag, TagEnd}; use crate::codeblock::{CodeBlock, FenceSettings}; use crate::shortcode::{Shortcode, SHORTCODE_PLACEHOLDER}; @@ -220,15 +221,15 @@ fn get_heading_refs(events: &[Event]) -> Vec { for (i, event) in events.iter().enumerate() { match event { - Event::Start(Tag::Heading(level, anchor, classes)) => { + Event::Start(Tag::Heading { level, id, classes, .. }) => { heading_refs.push(HeadingRef::new( i, *level as u32, - anchor.map(|a| a.to_owned()), + id.clone().map(|a| a.to_string()), &classes.iter().map(|x| x.to_string()).collect::>(), )); } - Event::End(Tag::Heading(_, _, _)) => { + Event::End(TagEnd::Heading { .. }) => { heading_refs.last_mut().expect("Heading end before start?").end_idx = i; } _ => (), @@ -254,6 +255,10 @@ pub fn markdown_to_html( let mut error = None; let mut code_block: Option = None; + // Indicates whether we're in the middle of parsing a text node which will be placed in an HTML + // attribute, and which hence has to be escaped using escape_html rather than push_html's + // default HTML body escaping for text nodes. + let mut inside_attribute = false; let mut headings: Vec = vec![]; let mut internal_links = Vec::new(); @@ -294,12 +299,19 @@ pub fn markdown_to_html( // we have some text before the shortcode, push that first if $range.start != sc_span.start { - let content = $text[($range.start - orig_range_start) - ..(sc_span.start - orig_range_start)] - .to_string() - .into(); + let content: cmark::CowStr<'_> = + $text[($range.start - orig_range_start) + ..(sc_span.start - orig_range_start)] + .to_string() + .into(); events.push(if $is_text { - Event::Text(content) + if inside_attribute { + let mut buffer = "".to_string(); + escape_html(&mut buffer, content.as_ref()).unwrap(); + Event::Html(buffer.into()) + } else { + Event::Text(content) + } } else { Event::Html(content) }); @@ -370,7 +382,13 @@ pub fn markdown_to_html( }; if !contains_shortcode(text.as_ref()) { - events.push(Event::Text(text)); + if inside_attribute { + let mut buffer = "".to_string(); + escape_html(&mut buffer, text.as_ref()).unwrap(); + events.push(Event::Html(buffer.into())); + } else { + events.push(Event::Text(text)); + } continue; } @@ -386,7 +404,7 @@ pub fn markdown_to_html( code_block = Some(block); events.push(Event::Html(begin.into())); } - Event::End(Tag::CodeBlock(_)) => { + Event::End(TagEnd::CodeBlock { .. }) => { if let Some(ref mut code_block) = code_block { let html = code_block.highlight(&accumulated_block); events.push(Event::Html(html.into())); @@ -397,44 +415,53 @@ pub fn markdown_to_html( code_block = None; events.push(Event::Html("\n".into())); } - Event::Start(Tag::Image(link_type, src, title)) => { - let link = if is_colocated_asset_link(&src) { - let link = format!("{}{}", context.current_page_permalink, &*src); + Event::Start(Tag::Image { link_type, dest_url, title, id }) => { + let link = if is_colocated_asset_link(&dest_url) { + let link = format!("{}{}", context.current_page_permalink, &*dest_url); link.into() } else { - src + dest_url }; events.push(if lazy_async_image { let mut img_before_alt: String = "\"").expect("Could events.push(if lazy_async_image { + Event::End(TagEnd::Image) => events.push(if lazy_async_image { Event::Html("\" loading=\"lazy\" decoding=\"async\" />".into()) } else { event }), - Event::Start(Tag::Link(link_type, link, title)) if link.is_empty() => { + Event::Start(Tag::Link { link_type, dest_url, title, id }) + if dest_url.is_empty() => + { error = Some(Error::msg("There is a link that is missing a URL")); - events.push(Event::Start(Tag::Link(link_type, "#".into(), title))); + events.push(Event::Start(Tag::Link { + link_type, + dest_url: "#".into(), + title, + id, + })); } - Event::Start(Tag::Link(link_type, link, title)) => { + Event::Start(Tag::Link { link_type, dest_url, title, id }) => { let fixed_link = match fix_link( link_type, - &link, + &dest_url, context, &mut internal_links, &mut external_links, @@ -448,12 +475,12 @@ pub fn markdown_to_html( }; events.push( - if is_external_link(&link) + if is_external_link(&dest_url) && context.config.markdown.has_external_link_tweaks() { let mut escaped = String::new(); // write_str can fail but here there are no reasons it should (afaik?) - cmark::escape::escape_href(&mut escaped, &link) + cmark_escape::escape_href(&mut escaped, &dest_url) .expect("Could not write to buffer"); Event::Html( context @@ -463,7 +490,12 @@ pub fn markdown_to_html( .into(), ) } else { - Event::Start(Tag::Link(link_type, fixed_link.into(), title)) + Event::Start(Tag::Link { + link_type, + dest_url: fixed_link.into(), + title, + id, + }) }, ) } @@ -485,7 +517,7 @@ pub fn markdown_to_html( events.push(event); } - Event::End(Tag::Paragraph) => { + Event::End(TagEnd::Paragraph) => { events.push(if stop_next_end_p { stop_next_end_p = false; Event::Html("".into()) diff --git a/components/markdown/tests/snapshots/markdown__all_markdown_features_integration.snap b/components/markdown/tests/snapshots/markdown__all_markdown_features_integration.snap index 72aedc4c21..9e95d41805 100644 --- a/components/markdown/tests/snapshots/markdown__all_markdown_features_integration.snap +++ b/components/markdown/tests/snapshots/markdown__all_markdown_features_integration.snap @@ -1,8 +1,6 @@ --- -source: components/rendering/tests/markdown.rs -assertion_line: 358 +source: components/markdown/tests/markdown.rs expression: body - ---

h1 Heading

@@ -83,7 +81,7 @@ line 1 of code line 2 of code line 3 of code -

Block code "fences"

+

Block code "fences"

Sample text here...
 

Syntax highlighting

diff --git a/components/markdown/tests/snapshots/markdown__can_handle_heading_ids-2.snap b/components/markdown/tests/snapshots/markdown__can_handle_heading_ids-2.snap index d2847a8e94..e9fabcc4fa 100644 --- a/components/markdown/tests/snapshots/markdown__can_handle_heading_ids-2.snap +++ b/components/markdown/tests/snapshots/markdown__can_handle_heading_ids-2.snap @@ -1,12 +1,10 @@ --- -source: components/rendering/tests/markdown.rs -assertion_line: 84 +source: components/markdown/tests/markdown.rs expression: body - ---

Hello

Hello

-

L'écologie et vous

+

L'écologie et vous

Hello

Hello

Hello

@@ -22,6 +20,6 @@ expression: body

text 1 there

1

footnote

-

Classes

+

Classes

diff --git a/components/markdown/tests/snapshots/markdown__can_handle_heading_ids.snap b/components/markdown/tests/snapshots/markdown__can_handle_heading_ids.snap index 87cce483ae..663be5947f 100644 --- a/components/markdown/tests/snapshots/markdown__can_handle_heading_ids.snap +++ b/components/markdown/tests/snapshots/markdown__can_handle_heading_ids.snap @@ -1,8 +1,6 @@ --- -source: components/rendering/tests/markdown.rs -assertion_line: 79 +source: components/markdown/tests/markdown.rs expression: body - ---

Hello

Hello

@@ -22,6 +20,6 @@ expression: body

text 1 there

1

footnote

-

Classes

+

Classes

diff --git a/components/markdown/tests/snapshots/shortcodes__doesnt_render_ignored_shortcodes.snap b/components/markdown/tests/snapshots/shortcodes__doesnt_render_ignored_shortcodes.snap index 64d9bfbabe..15c94e4678 100644 --- a/components/markdown/tests/snapshots/shortcodes__doesnt_render_ignored_shortcodes.snap +++ b/components/markdown/tests/snapshots/shortcodes__doesnt_render_ignored_shortcodes.snap @@ -1,8 +1,6 @@ --- -source: components/rendering/tests/shortcodes.rs -assertion_line: 104 +source: components/markdown/tests/shortcodes.rs expression: body - --- -

{{ youtube(id="w7Ft2ymGmfc") }}

+

{{ youtube(id="w7Ft2ymGmfc") }}