From 1573d10325f286ffcbba290d27984ded9538a5c3 Mon Sep 17 00:00:00 2001 From: Philipp Hansch Date: Tue, 6 Apr 2021 06:59:10 +0200 Subject: [PATCH 1/5] tabs_in_doc_comments: Fix ICE due to char indexing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a quick-fix for an ICE in `tabs_in_doc_comments`. The problem was that we we're indexing into possibly multi-byte characters, such as '位'. More specifically `get_chunks_of_tabs` was returning indices into multi-byte characters. Those were passed on to a `Span` creation that then caused the ICE. This fix makes sure that we don't return indices that point inside a multi-byte character. *However*, we are still iterating over unicode codepoints, not grapheme clusters. So a seemingly single character like y̆ , which actually consists of two codepoints, will probably still cause incorrect spans in the output. --- clippy_lints/src/tabs_in_doc_comments.rs | 28 ++++++++++++++---------- tests/ui/crashes/ice-5835.rs | 8 +++++++ tests/ui/crashes/ice-5835.stderr | 20 +++++++++++++++++ 3 files changed, 45 insertions(+), 11 deletions(-) create mode 100644 tests/ui/crashes/ice-5835.rs create mode 100644 tests/ui/crashes/ice-5835.stderr diff --git a/clippy_lints/src/tabs_in_doc_comments.rs b/clippy_lints/src/tabs_in_doc_comments.rs index 88bd2feaadda..3f9692540f70 100644 --- a/clippy_lints/src/tabs_in_doc_comments.rs +++ b/clippy_lints/src/tabs_in_doc_comments.rs @@ -104,30 +104,29 @@ fn get_chunks_of_tabs(the_str: &str) -> Vec<(u32, u32)> { // tracker to decide if the last group of tabs is not closed by a non-tab character let mut is_active = false; - let chars_array: Vec<_> = the_str.chars().collect(); + let char_indices: Vec<_> = the_str.char_indices().collect(); - if chars_array == vec!['\t'] { + if char_indices.len() == 1 && char_indices.first().unwrap().1 == '\t' { return vec![(0, 1)]; } - for (index, arr) in chars_array.windows(2).enumerate() { - let index = u32::try_from(index).expect(line_length_way_to_long); - match arr { - ['\t', '\t'] => { + for entry in char_indices.windows(2) { + match entry { + [(_, '\t'), (_, '\t')] => { // either string starts with double tab, then we have to set it active, // otherwise is_active is true anyway is_active = true; }, - [_, '\t'] => { + [(_, _), (index_b, '\t')] => { // as ['\t', '\t'] is excluded, this has to be a start of a tab group, // set indices accordingly is_active = true; - current_start = index + 1; + current_start = *index_b as u32; }, - ['\t', _] => { + [(_, '\t'), (index_b, _)] => { // this now has to be an end of the group, hence we have to push a new tuple is_active = false; - spans.push((current_start, index + 1)); + spans.push((current_start, *index_b as u32)); }, _ => {}, } @@ -137,7 +136,7 @@ fn get_chunks_of_tabs(the_str: &str) -> Vec<(u32, u32)> { if is_active { spans.push(( current_start, - u32::try_from(the_str.chars().count()).expect(line_length_way_to_long), + u32::try_from(char_indices.last().unwrap().0 + 1).expect(line_length_way_to_long), )); } @@ -148,6 +147,13 @@ fn get_chunks_of_tabs(the_str: &str) -> Vec<(u32, u32)> { mod tests_for_get_chunks_of_tabs { use super::get_chunks_of_tabs; + #[test] + fn test_unicode_han_string() { + let res = get_chunks_of_tabs(" 位\t"); + + assert_eq!(res, vec![(4, 5)]); + } + #[test] fn test_empty_string() { let res = get_chunks_of_tabs(""); diff --git a/tests/ui/crashes/ice-5835.rs b/tests/ui/crashes/ice-5835.rs new file mode 100644 index 000000000000..209a5b1eb095 --- /dev/null +++ b/tests/ui/crashes/ice-5835.rs @@ -0,0 +1,8 @@ +#![rustfmt::skip] + +pub struct Foo { + /// 位 + pub bar: u8, +} + +fn main() {} diff --git a/tests/ui/crashes/ice-5835.stderr b/tests/ui/crashes/ice-5835.stderr new file mode 100644 index 000000000000..e286bc580adb --- /dev/null +++ b/tests/ui/crashes/ice-5835.stderr @@ -0,0 +1,20 @@ +error[E0658]: custom inner attributes are unstable + --> $DIR/ice-5835.rs:1:4 + | +LL | #![rustfmt::skip] + | ^^^^^^^^^^^^^ + | + = note: see issue #54726 for more information + = help: add `#![feature(custom_inner_attributes)]` to the crate attributes to enable + +error: using tabs in doc comments is not recommended + --> $DIR/ice-5835.rs:4:10 + | +LL | /// 位 + | ^^^^ help: consider using four spaces per tab + | + = note: `-D clippy::tabs-in-doc-comments` implied by `-D warnings` + +error: aborting due to 2 previous errors + +For more information about this error, try `rustc --explain E0658`. From dde46c9340994ad396a8677702be58122384a5ed Mon Sep 17 00:00:00 2001 From: Philipp Hansch Date: Sat, 10 Apr 2021 14:42:33 +0200 Subject: [PATCH 2/5] Replace complex conditional with pattern matching --- clippy_lints/src/tabs_in_doc_comments.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clippy_lints/src/tabs_in_doc_comments.rs b/clippy_lints/src/tabs_in_doc_comments.rs index 3f9692540f70..d68227545a68 100644 --- a/clippy_lints/src/tabs_in_doc_comments.rs +++ b/clippy_lints/src/tabs_in_doc_comments.rs @@ -106,7 +106,7 @@ fn get_chunks_of_tabs(the_str: &str) -> Vec<(u32, u32)> { let char_indices: Vec<_> = the_str.char_indices().collect(); - if char_indices.len() == 1 && char_indices.first().unwrap().1 == '\t' { + if let &[(_, '\t')] = &char_indices.as_slice() { return vec![(0, 1)]; } From 8b9331b49dd86cbc1ec655f6a6f5f9d74ca8d901 Mon Sep 17 00:00:00 2001 From: Philipp Hansch Date: Sat, 10 Apr 2021 14:43:52 +0200 Subject: [PATCH 3/5] Fix rustfmt error / Add comment for tab character --- tests/ui/crashes/ice-5835.rs | 5 +++-- tests/ui/crashes/ice-5835.stderr | 14 ++------------ 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/tests/ui/crashes/ice-5835.rs b/tests/ui/crashes/ice-5835.rs index 209a5b1eb095..5e99cb432b6e 100644 --- a/tests/ui/crashes/ice-5835.rs +++ b/tests/ui/crashes/ice-5835.rs @@ -1,7 +1,8 @@ -#![rustfmt::skip] - +#[rustfmt::skip] pub struct Foo { /// 位 + /// ^ Do not remove this tab character. + /// It was required to trigger the ICE. pub bar: u8, } diff --git a/tests/ui/crashes/ice-5835.stderr b/tests/ui/crashes/ice-5835.stderr index e286bc580adb..c972bcb60a0c 100644 --- a/tests/ui/crashes/ice-5835.stderr +++ b/tests/ui/crashes/ice-5835.stderr @@ -1,20 +1,10 @@ -error[E0658]: custom inner attributes are unstable - --> $DIR/ice-5835.rs:1:4 - | -LL | #![rustfmt::skip] - | ^^^^^^^^^^^^^ - | - = note: see issue #54726 for more information - = help: add `#![feature(custom_inner_attributes)]` to the crate attributes to enable - error: using tabs in doc comments is not recommended - --> $DIR/ice-5835.rs:4:10 + --> $DIR/ice-5835.rs:3:10 | LL | /// 位 | ^^^^ help: consider using four spaces per tab | = note: `-D clippy::tabs-in-doc-comments` implied by `-D warnings` -error: aborting due to 2 previous errors +error: aborting due to previous error -For more information about this error, try `rustc --explain E0658`. From 47a4865406ec748729a6e7ec23e5a4d5f9b88230 Mon Sep 17 00:00:00 2001 From: Philipp Hansch Date: Sat, 10 Apr 2021 15:30:51 +0200 Subject: [PATCH 4/5] Fix dogfood --- clippy_lints/src/tabs_in_doc_comments.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clippy_lints/src/tabs_in_doc_comments.rs b/clippy_lints/src/tabs_in_doc_comments.rs index d68227545a68..51581b75ce3f 100644 --- a/clippy_lints/src/tabs_in_doc_comments.rs +++ b/clippy_lints/src/tabs_in_doc_comments.rs @@ -106,7 +106,7 @@ fn get_chunks_of_tabs(the_str: &str) -> Vec<(u32, u32)> { let char_indices: Vec<_> = the_str.char_indices().collect(); - if let &[(_, '\t')] = &char_indices.as_slice() { + if let [(_, '\t')] = char_indices.as_slice() { return vec![(0, 1)]; } @@ -121,12 +121,12 @@ fn get_chunks_of_tabs(the_str: &str) -> Vec<(u32, u32)> { // as ['\t', '\t'] is excluded, this has to be a start of a tab group, // set indices accordingly is_active = true; - current_start = *index_b as u32; + current_start = u32::try_from(*index_b).unwrap(); }, [(_, '\t'), (index_b, _)] => { // this now has to be an end of the group, hence we have to push a new tuple is_active = false; - spans.push((current_start, *index_b as u32)); + spans.push((current_start, u32::try_from(*index_b).unwrap())); }, _ => {}, } @@ -149,7 +149,7 @@ mod tests_for_get_chunks_of_tabs { #[test] fn test_unicode_han_string() { - let res = get_chunks_of_tabs(" 位\t"); + let res = get_chunks_of_tabs(" \u{4f4d}\t"); assert_eq!(res, vec![(4, 5)]); } From cbdebd97ec4846391dc0f9a1288a3ab1fc053f99 Mon Sep 17 00:00:00 2001 From: Philipp Hansch Date: Wed, 14 Apr 2021 06:30:51 +0200 Subject: [PATCH 5/5] Explain why we use `char_indices()` instead of `chars()` --- clippy_lints/src/tabs_in_doc_comments.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/clippy_lints/src/tabs_in_doc_comments.rs b/clippy_lints/src/tabs_in_doc_comments.rs index 51581b75ce3f..1995981b905d 100644 --- a/clippy_lints/src/tabs_in_doc_comments.rs +++ b/clippy_lints/src/tabs_in_doc_comments.rs @@ -104,6 +104,9 @@ fn get_chunks_of_tabs(the_str: &str) -> Vec<(u32, u32)> { // tracker to decide if the last group of tabs is not closed by a non-tab character let mut is_active = false; + // Note that we specifically need the char _byte_ indices here, not the positional indexes + // within the char array to deal with multi-byte characters properly. `char_indices` does + // exactly that. It provides an iterator over tuples of the form `(byte position, char)`. let char_indices: Vec<_> = the_str.char_indices().collect(); if let [(_, '\t')] = char_indices.as_slice() {