Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update pulldown_cmark dep to v0.10, and add pulldown_cmark_escape dep. #2432

Merged
merged 1 commit into from
Feb 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

## 0.19.0 (unreleased)

- Updates the pulldown-cmark dependency to v0.10.0. This improves footnote handling, and may also introduce some minor behavior changes such as reducing the amount of unnecessary HTML-escaping of text content.

## 0.18.0 (2023-12-18)

Expand Down
23 changes: 21 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion components/libs/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ nom-bibtex = "0.5"
num-format = "0.4"
once_cell = "1"
percent-encoding = "2"
pulldown-cmark = { version = "0.9", default-features = false, features = ["simd"] }
pulldown-cmark = { version = "0.10", default-features = false, features = ["html", "simd"] }
pulldown-cmark-escape = { version = "0.10", default-features = false }
quickxml_to_serde = "0.5"
rayon = "1"
regex = "1"
Expand Down
1 change: 1 addition & 0 deletions components/libs/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ pub use num_format;
pub use once_cell;
pub use percent_encoding;
pub use pulldown_cmark;
pub use pulldown_cmark_escape;
pub use quickxml_to_serde;
pub use rayon;
pub use regex;
Expand Down
88 changes: 60 additions & 28 deletions components/markdown/src/markdown.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,20 @@ use errors::bail;
use libs::gh_emoji::Replacer as EmojiReplacer;
use libs::once_cell::sync::Lazy;
use libs::pulldown_cmark as cmark;
use libs::pulldown_cmark_escape as cmark_escape;
use libs::tera;
use utils::net::is_external_link;

use crate::context::RenderContext;
use errors::{Context, Error, Result};
use libs::pulldown_cmark::escape::escape_html;
use libs::pulldown_cmark_escape::escape_html;
use libs::regex::{Regex, RegexBuilder};
use utils::site::resolve_internal_link;
use utils::slugs::slugify_anchors;
use utils::table_of_contents::{make_table_of_contents, Heading};
use utils::types::InsertAnchor;

use self::cmark::{Event, LinkType, Options, Parser, Tag};
use self::cmark::{Event, LinkType, Options, Parser, Tag, TagEnd};
use crate::codeblock::{CodeBlock, FenceSettings};
use crate::shortcode::{Shortcode, SHORTCODE_PLACEHOLDER};

Expand Down Expand Up @@ -220,15 +221,15 @@ fn get_heading_refs(events: &[Event]) -> Vec<HeadingRef> {

for (i, event) in events.iter().enumerate() {
match event {
Event::Start(Tag::Heading(level, anchor, classes)) => {
Event::Start(Tag::Heading { level, id, classes, .. }) => {
heading_refs.push(HeadingRef::new(
i,
*level as u32,
anchor.map(|a| a.to_owned()),
id.clone().map(|a| a.to_string()),
&classes.iter().map(|x| x.to_string()).collect::<Vec<_>>(),
));
}
Event::End(Tag::Heading(_, _, _)) => {
Event::End(TagEnd::Heading { .. }) => {
heading_refs.last_mut().expect("Heading end before start?").end_idx = i;
}
_ => (),
Expand All @@ -254,6 +255,10 @@ pub fn markdown_to_html(
let mut error = None;

let mut code_block: Option<CodeBlock> = None;
// Indicates whether we're in the middle of parsing a text node which will be placed in an HTML
// attribute, and which hence has to be escaped using escape_html rather than push_html's
// default HTML body escaping for text nodes.
let mut inside_attribute = false;

let mut headings: Vec<Heading> = vec![];
let mut internal_links = Vec::new();
Expand Down Expand Up @@ -294,12 +299,19 @@ pub fn markdown_to_html(

// we have some text before the shortcode, push that first
if $range.start != sc_span.start {
let content = $text[($range.start - orig_range_start)
..(sc_span.start - orig_range_start)]
.to_string()
.into();
let content: cmark::CowStr<'_> =
$text[($range.start - orig_range_start)
..(sc_span.start - orig_range_start)]
.to_string()
.into();
events.push(if $is_text {
Event::Text(content)
if inside_attribute {
let mut buffer = "".to_string();
escape_html(&mut buffer, content.as_ref()).unwrap();
Event::Html(buffer.into())
} else {
Event::Text(content)
}
} else {
Event::Html(content)
});
Expand Down Expand Up @@ -370,7 +382,13 @@ pub fn markdown_to_html(
};

if !contains_shortcode(text.as_ref()) {
events.push(Event::Text(text));
if inside_attribute {
let mut buffer = "".to_string();
escape_html(&mut buffer, text.as_ref()).unwrap();
events.push(Event::Html(buffer.into()));
} else {
events.push(Event::Text(text));
}
continue;
}

Expand All @@ -386,7 +404,7 @@ pub fn markdown_to_html(
code_block = Some(block);
events.push(Event::Html(begin.into()));
}
Event::End(Tag::CodeBlock(_)) => {
Event::End(TagEnd::CodeBlock { .. }) => {
if let Some(ref mut code_block) = code_block {
let html = code_block.highlight(&accumulated_block);
events.push(Event::Html(html.into()));
Expand All @@ -397,44 +415,53 @@ pub fn markdown_to_html(
code_block = None;
events.push(Event::Html("</code></pre>\n".into()));
}
Event::Start(Tag::Image(link_type, src, title)) => {
let link = if is_colocated_asset_link(&src) {
let link = format!("{}{}", context.current_page_permalink, &*src);
Event::Start(Tag::Image { link_type, dest_url, title, id }) => {
let link = if is_colocated_asset_link(&dest_url) {
let link = format!("{}{}", context.current_page_permalink, &*dest_url);
link.into()
} else {
src
dest_url
};

events.push(if lazy_async_image {
let mut img_before_alt: String = "<img src=\"".to_string();
cmark::escape::escape_href(&mut img_before_alt, &link)
cmark_escape::escape_href(&mut img_before_alt, &link)
.expect("Could not write to buffer");
if !title.is_empty() {
img_before_alt
.write_str("\" title=\"")
.expect("Could not write to buffer");
cmark::escape::escape_href(&mut img_before_alt, &title)
cmark_escape::escape_href(&mut img_before_alt, &title)
.expect("Could not write to buffer");
}
img_before_alt.write_str("\" alt=\"").expect("Could not write to buffer");
inside_attribute = true;
Event::Html(img_before_alt.into())
} else {
Event::Start(Tag::Image(link_type, link, title))
inside_attribute = false;
Event::Start(Tag::Image { link_type, dest_url: link, title, id })
});
}
Event::End(Tag::Image(..)) => events.push(if lazy_async_image {
Event::End(TagEnd::Image) => events.push(if lazy_async_image {
Event::Html("\" loading=\"lazy\" decoding=\"async\" />".into())
} else {
event
}),
Event::Start(Tag::Link(link_type, link, title)) if link.is_empty() => {
Event::Start(Tag::Link { link_type, dest_url, title, id })
if dest_url.is_empty() =>
{
error = Some(Error::msg("There is a link that is missing a URL"));
events.push(Event::Start(Tag::Link(link_type, "#".into(), title)));
events.push(Event::Start(Tag::Link {
link_type,
dest_url: "#".into(),
title,
id,
}));
}
Event::Start(Tag::Link(link_type, link, title)) => {
Event::Start(Tag::Link { link_type, dest_url, title, id }) => {
let fixed_link = match fix_link(
link_type,
&link,
&dest_url,
context,
&mut internal_links,
&mut external_links,
Expand All @@ -448,12 +475,12 @@ pub fn markdown_to_html(
};

events.push(
if is_external_link(&link)
if is_external_link(&dest_url)
&& context.config.markdown.has_external_link_tweaks()
{
let mut escaped = String::new();
// write_str can fail but here there are no reasons it should (afaik?)
cmark::escape::escape_href(&mut escaped, &link)
cmark_escape::escape_href(&mut escaped, &dest_url)
.expect("Could not write to buffer");
Event::Html(
context
Expand All @@ -463,7 +490,12 @@ pub fn markdown_to_html(
.into(),
)
} else {
Event::Start(Tag::Link(link_type, fixed_link.into(), title))
Event::Start(Tag::Link {
link_type,
dest_url: fixed_link.into(),
title,
id,
})
},
)
}
Expand All @@ -485,7 +517,7 @@ pub fn markdown_to_html(

events.push(event);
}
Event::End(Tag::Paragraph) => {
Event::End(TagEnd::Paragraph) => {
events.push(if stop_next_end_p {
stop_next_end_p = false;
Event::Html("".into())
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
---
source: components/rendering/tests/markdown.rs
assertion_line: 358
source: components/markdown/tests/markdown.rs
expression: body

---
<!-- Adapted from https://markdown-it.github.io/ -->
<h1 id="h1-heading">h1 Heading</h1>
Expand Down Expand Up @@ -83,7 +81,7 @@ line 1 of code
line 2 of code
line 3 of code
</code></pre>
<p>Block code &quot;fences&quot;</p>
<p>Block code "fences"</p>
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

how come it's not escaping anymore here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe this is due to a behavior change in pulldown-cmark, rather than the changes I made in this pull request. I think it might be pulldown-cmark/pulldown-cmark#830, specifically. It introduces a new escape_html_body_text which should be used for escaping text nodes and which only escapes '<', '>', and '&', as opposed to escape_html which should be used for escaping HTML attributes and which escapes single and double quotes as well.

I believe that pulldown-cmark's push_html will now convert Event::Text nodes to HTML using escape_html_body_text (see src/html.rs in that pull request), which is why these quotes are no longer escaped. This is also why we need to handle the alt attribute's text manually now, to ensure it gets attribute-escaped rather than body-escaped (which is now the default).

And FWIW, all other existing uses of escape_html that I could find in Zola were for escaping HTML attributes, so I think none of them need to be switched to escape_html_body_text.

<pre><code>Sample text here...
</code></pre>
<p>Syntax highlighting</p>
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
---
source: components/rendering/tests/markdown.rs
assertion_line: 84
source: components/markdown/tests/markdown.rs
expression: body

---
<h1 id="Hello">Hello</h1>
<h1 id="Hello-1">Hello</h1>
<h1 id="L'écologie_et_vous">L'écologie et vous</h1>
<h1 id="L&#39;écologie_et_vous">L'écologie et vous</h1>
<h1 id="hello">Hello</h1>
<h1 id="hello">Hello</h1>
<h1 id="Something_else">Hello</h1>
Expand All @@ -22,6 +20,6 @@ expression: body
<h1 id="text__there">text <sup class="footnote-reference"><a href="#1">1</a></sup> there</h1>
<div class="footnote-definition" id="1"><sup class="footnote-definition-label">1</sup>
<p>footnote</p>
<h1 id="classes" class="bold another">Classes</h1>
</div>
<h1 id="classes" class="bold another">Classes</h1>

Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
---
source: components/rendering/tests/markdown.rs
assertion_line: 79
source: components/markdown/tests/markdown.rs
expression: body

---
<h1 id="hello-1">Hello</h1>
<h1 id="hello-2">Hello</h1>
Expand All @@ -22,6 +20,6 @@ expression: body
<h1 id="text-there">text <sup class="footnote-reference"><a href="#1">1</a></sup> there</h1>
<div class="footnote-definition" id="1"><sup class="footnote-definition-label">1</sup>
<p>footnote</p>
<h1 id="classes" class="bold another">Classes</h1>
</div>
<h1 id="classes" class="bold another">Classes</h1>
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so it was inserting the header in the footnote?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, seems like it. pulldown-cmark 0.10 contains a bunch of footnotes-related fixes, which I'm guessing are responsible for this change. Since the new version seems to be correct and the old version seemed wrong, I didn't investigate further to figure out exactly which commit fixed this.


Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
---
source: components/rendering/tests/shortcodes.rs
assertion_line: 104
source: components/markdown/tests/shortcodes.rs
expression: body

---
<p>{{ youtube(id=&quot;w7Ft2ymGmfc&quot;) }}</p>
<p>{{ youtube(id="w7Ft2ymGmfc") }}</p>