From 37b085d4234c37d68dd4c449fa8ac1c3767b1549 Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Thu, 19 Sep 2024 08:51:37 -0500 Subject: [PATCH 1/7] paste: permit the delimiter list to be empty Also: refactored the delimiter processing logic --- src/uu/paste/src/paste.rs | 171 ++++++++++++++++++++++++++++++------ tests/by-util/test_paste.rs | 21 +++++ 2 files changed, 167 insertions(+), 25 deletions(-) diff --git a/src/uu/paste/src/paste.rs b/src/uu/paste/src/paste.rs index 3d4cf733ca8..63df02b7bf7 100644 --- a/src/uu/paste/src/paste.rs +++ b/src/uu/paste/src/paste.rs @@ -9,6 +9,7 @@ use clap::{crate_version, Arg, ArgAction, Command}; use std::fs::File; use std::io::{stdin, stdout, BufRead, BufReader, Read, Write}; use std::path::Path; +use std::slice::Iter; use uucore::error::{FromIo, UResult, USimpleError}; use uucore::line_ending::LineEnding; use uucore::{format_usage, help_about, help_usage}; @@ -115,39 +116,119 @@ fn paste( )); } - let delimiters: Vec = unescape(delimiters).chars().collect(); - let mut delim_count = 0; - let mut delim_length = 1; + struct DelimiterData<'a> { + current_delimiter_length: usize, + delimiters_encoded: &'a [Box<[u8]>], + delimiters_encoded_iter: Iter<'a, Box<[u8]>>, + } + + // Precompute instead of doing this inside the loops + let mut delimiters_encoded_option = { + let delimiters_unescaped = unescape(delimiters).chars().collect::>(); + + let number_of_delimiters = delimiters_unescaped.len(); + + if number_of_delimiters > 0 { + let mut vec = Vec::>::with_capacity(number_of_delimiters); + + { + // a buffer of length four is large enough to encode any char + let mut buffer = [0_u8; 4_usize]; + + for ch in delimiters_unescaped { + let delimiter_encoded = ch.encode_utf8(&mut buffer); + + vec.push(Box::from(delimiter_encoded.as_bytes())); + } + } + + Some(vec.into_boxed_slice()) + } else { + None + } + }; + + let mut delimiter_data_option = match &mut delimiters_encoded_option { + &mut Some(ref mut delimiters_encoded) => { + // TODO + // Is this initial value correct? + let current_delimiter_length = delimiters_encoded.first().unwrap().len(); + + Some(DelimiterData { + delimiters_encoded, + delimiters_encoded_iter: delimiters_encoded.iter(), + current_delimiter_length, + }) + } + None => None, + }; + let stdout = stdout(); let mut stdout = stdout.lock(); let mut output = Vec::new(); + if serial { for file in &mut files { output.clear(); + loop { + let current_delimiter_option = match &mut delimiter_data_option { + Some(DelimiterData { + current_delimiter_length, + delimiters_encoded, + delimiters_encoded_iter, + }) => { + let current_delimiter = if let Some(delimiter_from_current_iter) = + delimiters_encoded_iter.next() + { + delimiter_from_current_iter + } else { + // Reset iter after hitting the end + *delimiters_encoded_iter = delimiters_encoded.iter(); + + // Unwrapping because: + // 1) `delimiters_encoded` is non-empty + // 2) `delimiters_encoded_iter` is a newly constructed Iter + // So `next` should always return an element + delimiters_encoded_iter.next().unwrap() + }; + + *current_delimiter_length = current_delimiter.len(); + + Some(current_delimiter) + } + None => None, + }; + match read_until(file.as_mut(), line_ending as u8, &mut output) { Ok(0) => break, Ok(_) => { if output.ends_with(&[line_ending as u8]) { output.pop(); } - // a buffer of length four is large enough to encode any char - let mut buffer = [0; 4]; - let ch = - delimiters[delim_count % delimiters.len()].encode_utf8(&mut buffer); - delim_length = ch.len(); - - for byte in buffer.iter().take(delim_length) { - output.push(*byte); + + // Write delimiter, if one exists, to output + if let Some(current_delimiter) = current_delimiter_option { + output.extend_from_slice(current_delimiter); } } Err(e) => return Err(e.map_err_context(String::new)), } - delim_count += 1; } - // remove final delimiter - output.truncate(output.len() - delim_length); + + if let Some(DelimiterData { + current_delimiter_length, + .. + }) = &mut delimiter_data_option + { + // Remove trailing delimiter, if there is a delimiter + // It's safe to truncate to zero (or to a length greater than the length of the Vec),so as long as + // the subtraction didn't panic, this should be fine + if let Some(us) = output.len().checked_sub(*current_delimiter_length) { + output.truncate(us); + } + } write!( stdout, @@ -158,10 +239,39 @@ fn paste( } } else { let mut eof = vec![false; files.len()]; + loop { output.clear(); + let mut eof_count = 0; + for (i, file) in files.iter_mut().enumerate() { + let current_delimiter_option = if let Some(DelimiterData { + current_delimiter_length, + delimiters_encoded, + delimiters_encoded_iter, + }) = &mut delimiter_data_option + { + let current_delimiter = if let Some(bo) = delimiters_encoded_iter.next() { + bo + } else { + // Reset iter after hitting the end + *delimiters_encoded_iter = delimiters_encoded.iter(); + + // Unwrapping because: + // 1) `delimiters_encoded` is non-empty + // 2) `delimiters_encoded_iter` is a newly constructed Iter + // So `next` should always return an element + delimiters_encoded_iter.next().unwrap() + }; + + *current_delimiter_length = current_delimiter.len(); + + Some(current_delimiter) + } else { + None + }; + if eof[i] { eof_count += 1; } else { @@ -178,22 +288,33 @@ fn paste( Err(e) => return Err(e.map_err_context(String::new)), } } - // a buffer of length four is large enough to encode any char - let mut buffer = [0; 4]; - let ch = delimiters[delim_count % delimiters.len()].encode_utf8(&mut buffer); - delim_length = ch.len(); - for byte in buffer.iter().take(delim_length) { - output.push(*byte); + // Write delimiter, if one exists, to output + if let Some(current_delimiter) = current_delimiter_option { + output.extend_from_slice(current_delimiter); } - - delim_count += 1; } + if files.len() == eof_count { break; } - // Remove final delimiter - output.truncate(output.len() - delim_length); + + if let Some(DelimiterData { + current_delimiter_length, + delimiters_encoded, + delimiters_encoded_iter, + }) = &mut delimiter_data_option + { + // Reset iter after file is processed + *delimiters_encoded_iter = delimiters_encoded.iter(); + + // Remove trailing delimiter, if there is a delimiter + // It's safe to truncate to zero (or to a length greater than the length of the Vec),so as long as + // the subtraction didn't panic, this should be fine + if let Some(us) = output.len().checked_sub(*current_delimiter_length) { + output.truncate(us); + } + } write!( stdout, @@ -201,9 +322,9 @@ fn paste( String::from_utf8_lossy(&output), line_ending )?; - delim_count = 0; } } + Ok(()) } diff --git a/tests/by-util/test_paste.rs b/tests/by-util/test_paste.rs index e770262c2a2..674e2c74e53 100644 --- a/tests/by-util/test_paste.rs +++ b/tests/by-util/test_paste.rs @@ -193,6 +193,27 @@ fn test_delimiter_list_ending_with_unescaped_backslash() { } } +#[test] +fn test_delimiter_list_empty() { + for st in ["-d", "--delimiters"] { + new_ucmd!() + .args(&[st, "", "-s", "--", "-"]) + .pipe_in( + "\ +A ALPHA 1 _ +B BRAVO 2 _ +C CHARLIE 3 _ +", + ) + .succeeds() + .stdout_only( + "\ +A ALPHA 1 _B BRAVO 2 _C CHARLIE 3 _ +", + ); + } +} + #[test] fn test_data() { for example in EXAMPLE_DATA { From 7a848830b2dcd5198424d1ca21eb5fdbedf53be7 Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Fri, 20 Sep 2024 17:52:52 -0500 Subject: [PATCH 2/7] Extract duplicated code into function --- src/uu/paste/src/paste.rs | 172 +++++++++++++++++--------------------- 1 file changed, 76 insertions(+), 96 deletions(-) diff --git a/src/uu/paste/src/paste.rs b/src/uu/paste/src/paste.rs index 63df02b7bf7..a4bceecfcc5 100644 --- a/src/uu/paste/src/paste.rs +++ b/src/uu/paste/src/paste.rs @@ -90,6 +90,53 @@ pub fn uu_app() -> Command { ) } +struct DelimiterData<'a> { + current_delimiter_length: usize, + delimiters_encoded: &'a [Box<[u8]>], + delimiters_encoded_iter: Iter<'a, Box<[u8]>>, +} + +/// - If there are no delimiters, returns `None` +/// - If there are delimiters, tries to return the next delimiter +/// - If the end of the delimiter list was reached, resets the iter to point to the beginning of the delimiter list +/// - (Technically this is done by creating a new iter) +/// - Then returns the next delimiter (which will be the first delimiter in the delimiter list) +fn get_delimiter_to_use_option<'a>( + delimiter_data_option: &'a mut Option, +) -> Option<&'a [u8]> { + match *delimiter_data_option { + Some(ref mut de) => { + let &mut DelimiterData { + ref mut current_delimiter_length, + delimiters_encoded, + ref mut delimiters_encoded_iter, + } = de; + + let current_delimiter = if let Some(bo) = delimiters_encoded_iter.next() { + bo + } else { + let mut new_delimiters_encoded_iter = delimiters_encoded.iter(); + + // Unwrapping because: + // 1) `delimiters_encoded` is non-empty + // 2) `new_delimiters_encoded_iter` is a newly constructed Iter + // So: `next` should always return an element + let bo = new_delimiters_encoded_iter.next().unwrap(); + + // The old iter hit the end, so assign the new iter + *delimiters_encoded_iter = new_delimiters_encoded_iter; + + bo + }; + + *current_delimiter_length = current_delimiter.len(); + + Some(current_delimiter) + } + None => None, + } +} + #[allow(clippy::cognitive_complexity)] fn paste( filenames: Vec, @@ -116,19 +163,13 @@ fn paste( )); } - struct DelimiterData<'a> { - current_delimiter_length: usize, - delimiters_encoded: &'a [Box<[u8]>], - delimiters_encoded_iter: Iter<'a, Box<[u8]>>, - } - // Precompute instead of doing this inside the loops let mut delimiters_encoded_option = { let delimiters_unescaped = unescape(delimiters).chars().collect::>(); let number_of_delimiters = delimiters_unescaped.len(); - if number_of_delimiters > 0 { + if number_of_delimiters > 0_usize { let mut vec = Vec::>::with_capacity(number_of_delimiters); { @@ -148,23 +189,13 @@ fn paste( } }; - let mut delimiter_data_option = match &mut delimiters_encoded_option { - &mut Some(ref mut delimiters_encoded) => { - // TODO - // Is this initial value correct? - let current_delimiter_length = delimiters_encoded.first().unwrap().len(); + let mut delimiter_data_option = delimiters_encoded_option.as_mut().map(|bo| DelimiterData { + delimiters_encoded: bo, + delimiters_encoded_iter: bo.iter(), + current_delimiter_length: 0_usize, + }); - Some(DelimiterData { - delimiters_encoded, - delimiters_encoded_iter: delimiters_encoded.iter(), - current_delimiter_length, - }) - } - None => None, - }; - - let stdout = stdout(); - let mut stdout = stdout.lock(); + let mut stdout = stdout().lock(); let mut output = Vec::new(); @@ -173,43 +204,18 @@ fn paste( output.clear(); loop { - let current_delimiter_option = match &mut delimiter_data_option { - Some(DelimiterData { - current_delimiter_length, - delimiters_encoded, - delimiters_encoded_iter, - }) => { - let current_delimiter = if let Some(delimiter_from_current_iter) = - delimiters_encoded_iter.next() - { - delimiter_from_current_iter - } else { - // Reset iter after hitting the end - *delimiters_encoded_iter = delimiters_encoded.iter(); - - // Unwrapping because: - // 1) `delimiters_encoded` is non-empty - // 2) `delimiters_encoded_iter` is a newly constructed Iter - // So `next` should always return an element - delimiters_encoded_iter.next().unwrap() - }; - - *current_delimiter_length = current_delimiter.len(); - - Some(current_delimiter) - } - None => None, - }; + let delimiter_to_use_option = + get_delimiter_to_use_option(&mut delimiter_data_option); match read_until(file.as_mut(), line_ending as u8, &mut output) { - Ok(0) => break, + Ok(0_usize) => break, Ok(_) => { if output.ends_with(&[line_ending as u8]) { output.pop(); } // Write delimiter, if one exists, to output - if let Some(current_delimiter) = current_delimiter_option { + if let Some(current_delimiter) = delimiter_to_use_option { output.extend_from_slice(current_delimiter); } } @@ -217,16 +223,12 @@ fn paste( } } - if let Some(DelimiterData { - current_delimiter_length, - .. - }) = &mut delimiter_data_option - { + if let Some(ref de) = delimiter_data_option { // Remove trailing delimiter, if there is a delimiter - // It's safe to truncate to zero (or to a length greater than the length of the Vec),so as long as - // the subtraction didn't panic, this should be fine - if let Some(us) = output.len().checked_sub(*current_delimiter_length) { + if let Some(us) = output.len().checked_sub(de.current_delimiter_length) { output.truncate(us); + } else { + // Subtraction would have resulted in a negative number. This should never happen. } } @@ -246,37 +248,14 @@ fn paste( let mut eof_count = 0; for (i, file) in files.iter_mut().enumerate() { - let current_delimiter_option = if let Some(DelimiterData { - current_delimiter_length, - delimiters_encoded, - delimiters_encoded_iter, - }) = &mut delimiter_data_option - { - let current_delimiter = if let Some(bo) = delimiters_encoded_iter.next() { - bo - } else { - // Reset iter after hitting the end - *delimiters_encoded_iter = delimiters_encoded.iter(); - - // Unwrapping because: - // 1) `delimiters_encoded` is non-empty - // 2) `delimiters_encoded_iter` is a newly constructed Iter - // So `next` should always return an element - delimiters_encoded_iter.next().unwrap() - }; - - *current_delimiter_length = current_delimiter.len(); - - Some(current_delimiter) - } else { - None - }; + let delimiter_to_use_option = + get_delimiter_to_use_option(&mut delimiter_data_option); if eof[i] { eof_count += 1; } else { match read_until(file.as_mut(), line_ending as u8, &mut output) { - Ok(0) => { + Ok(0_usize) => { eof[i] = true; eof_count += 1; } @@ -290,7 +269,7 @@ fn paste( } // Write delimiter, if one exists, to output - if let Some(current_delimiter) = current_delimiter_option { + if let Some(current_delimiter) = delimiter_to_use_option { output.extend_from_slice(current_delimiter); } } @@ -299,20 +278,21 @@ fn paste( break; } - if let Some(DelimiterData { - current_delimiter_length, - delimiters_encoded, - delimiters_encoded_iter, - }) = &mut delimiter_data_option - { + if let &mut Some(ref mut de) = &mut delimiter_data_option { + let &mut DelimiterData { + current_delimiter_length, + delimiters_encoded, + ref mut delimiters_encoded_iter, + } = de; + // Reset iter after file is processed *delimiters_encoded_iter = delimiters_encoded.iter(); // Remove trailing delimiter, if there is a delimiter - // It's safe to truncate to zero (or to a length greater than the length of the Vec),so as long as - // the subtraction didn't panic, this should be fine - if let Some(us) = output.len().checked_sub(*current_delimiter_length) { + if let Some(us) = output.len().checked_sub(current_delimiter_length) { output.truncate(us); + } else { + // Subtraction would have resulted in a negative number. This should never happen. } } From 884b7ef628d86db42f94cfe8e08660d6725650c0 Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Sun, 22 Sep 2024 12:42:28 -0500 Subject: [PATCH 3/7] Address PR comments. Improve code structure. --- src/uu/paste/src/paste.rs | 263 ++++++++++++++++++++---------------- tests/by-util/test_paste.rs | 31 ++++- 2 files changed, 174 insertions(+), 120 deletions(-) diff --git a/src/uu/paste/src/paste.rs b/src/uu/paste/src/paste.rs index a4bceecfcc5..6249b6e5ca1 100644 --- a/src/uu/paste/src/paste.rs +++ b/src/uu/paste/src/paste.rs @@ -8,6 +8,7 @@ use clap::{crate_version, Arg, ArgAction, Command}; use std::fs::File; use std::io::{stdin, stdout, BufRead, BufReader, Read, Write}; +use std::iter::Cycle; use std::path::Path; use std::slice::Iter; use uucore::error::{FromIo, UResult, USimpleError}; @@ -90,53 +91,6 @@ pub fn uu_app() -> Command { ) } -struct DelimiterData<'a> { - current_delimiter_length: usize, - delimiters_encoded: &'a [Box<[u8]>], - delimiters_encoded_iter: Iter<'a, Box<[u8]>>, -} - -/// - If there are no delimiters, returns `None` -/// - If there are delimiters, tries to return the next delimiter -/// - If the end of the delimiter list was reached, resets the iter to point to the beginning of the delimiter list -/// - (Technically this is done by creating a new iter) -/// - Then returns the next delimiter (which will be the first delimiter in the delimiter list) -fn get_delimiter_to_use_option<'a>( - delimiter_data_option: &'a mut Option, -) -> Option<&'a [u8]> { - match *delimiter_data_option { - Some(ref mut de) => { - let &mut DelimiterData { - ref mut current_delimiter_length, - delimiters_encoded, - ref mut delimiters_encoded_iter, - } = de; - - let current_delimiter = if let Some(bo) = delimiters_encoded_iter.next() { - bo - } else { - let mut new_delimiters_encoded_iter = delimiters_encoded.iter(); - - // Unwrapping because: - // 1) `delimiters_encoded` is non-empty - // 2) `new_delimiters_encoded_iter` is a newly constructed Iter - // So: `next` should always return an element - let bo = new_delimiters_encoded_iter.next().unwrap(); - - // The old iter hit the end, so assign the new iter - *delimiters_encoded_iter = new_delimiters_encoded_iter; - - bo - }; - - *current_delimiter_length = current_delimiter.len(); - - Some(current_delimiter) - } - None => None, - } -} - #[allow(clippy::cognitive_complexity)] fn paste( filenames: Vec, @@ -150,8 +104,10 @@ fn paste( None } else { let path = Path::new(&name); - let r = File::open(path).map_err_context(String::new)?; - Some(BufReader::new(r)) + // TODO + // Is `map_err_context` correct here? + let file = File::open(path).map_err_context(String::new)?; + Some(BufReader::new(file)) }; files.push(file); } @@ -163,38 +119,20 @@ fn paste( )); } - // Precompute instead of doing this inside the loops - let mut delimiters_encoded_option = { - let delimiters_unescaped = unescape(delimiters).chars().collect::>(); - - let number_of_delimiters = delimiters_unescaped.len(); - - if number_of_delimiters > 0_usize { - let mut vec = Vec::>::with_capacity(number_of_delimiters); - - { - // a buffer of length four is large enough to encode any char - let mut buffer = [0_u8; 4_usize]; - - for ch in delimiters_unescaped { - let delimiter_encoded = ch.encode_utf8(&mut buffer); - - vec.push(Box::from(delimiter_encoded.as_bytes())); - } - } - - Some(vec.into_boxed_slice()) - } else { - None - } + let unescaped_and_encoded_delimiters = parse_delimiters(delimiters); + + let mut delimiter_state = match unescaped_and_encoded_delimiters.as_ref() { + [] => DelimiterState::NoDelimiters, + [only_delimiter] => DelimiterState::OneDelimiter { + delimiter: only_delimiter, + }, + [first_delimiter, ..] => DelimiterState::MultipleDelimiters { + current_delimiter: first_delimiter, + delimiters: &unescaped_and_encoded_delimiters, + delimiters_iter: unescaped_and_encoded_delimiters.iter().cycle(), + }, }; - let mut delimiter_data_option = delimiters_encoded_option.as_mut().map(|bo| DelimiterData { - delimiters_encoded: bo, - delimiters_encoded_iter: bo.iter(), - current_delimiter_length: 0_usize, - }); - let mut stdout = stdout().lock(); let mut output = Vec::new(); @@ -204,34 +142,27 @@ fn paste( output.clear(); loop { - let delimiter_to_use_option = - get_delimiter_to_use_option(&mut delimiter_data_option); + delimiter_state.advance_to_next_delimiter(); match read_until(file.as_mut(), line_ending as u8, &mut output) { - Ok(0_usize) => break, + Ok(0) => break, Ok(_) => { if output.ends_with(&[line_ending as u8]) { output.pop(); } - // Write delimiter, if one exists, to output - if let Some(current_delimiter) = delimiter_to_use_option { - output.extend_from_slice(current_delimiter); - } + delimiter_state.write_delimiter(&mut output); } + // TODO + // Is `map_err_context` correct here? Err(e) => return Err(e.map_err_context(String::new)), } } - if let Some(ref de) = delimiter_data_option { - // Remove trailing delimiter, if there is a delimiter - if let Some(us) = output.len().checked_sub(de.current_delimiter_length) { - output.truncate(us); - } else { - // Subtraction would have resulted in a negative number. This should never happen. - } - } + delimiter_state.remove_trailing_delimiter(&mut output); + // TODO + // Should the output be converted to UTF-8? write!( stdout, "{}{}", @@ -248,14 +179,13 @@ fn paste( let mut eof_count = 0; for (i, file) in files.iter_mut().enumerate() { - let delimiter_to_use_option = - get_delimiter_to_use_option(&mut delimiter_data_option); + delimiter_state.advance_to_next_delimiter(); if eof[i] { eof_count += 1; } else { match read_until(file.as_mut(), line_ending as u8, &mut output) { - Ok(0_usize) => { + Ok(0) => { eof[i] = true; eof_count += 1; } @@ -264,38 +194,30 @@ fn paste( output.pop(); } } + // TODO + // Is `map_err_context` correct here? Err(e) => return Err(e.map_err_context(String::new)), } } - // Write delimiter, if one exists, to output - if let Some(current_delimiter) = delimiter_to_use_option { - output.extend_from_slice(current_delimiter); - } + delimiter_state.write_delimiter(&mut output); } if files.len() == eof_count { break; } - if let &mut Some(ref mut de) = &mut delimiter_data_option { - let &mut DelimiterData { - current_delimiter_length, - delimiters_encoded, - ref mut delimiters_encoded_iter, - } = de; + // Quote: + // When the -s option is not specified: + // [...] + // The delimiter shall be reset to the first element of list after each file operand is processed. + // https://pubs.opengroup.org/onlinepubs/9799919799/utilities/paste.html + delimiter_state.reset_to_first_delimiter(); - // Reset iter after file is processed - *delimiters_encoded_iter = delimiters_encoded.iter(); - - // Remove trailing delimiter, if there is a delimiter - if let Some(us) = output.len().checked_sub(current_delimiter_length) { - output.truncate(us); - } else { - // Subtraction would have resulted in a negative number. This should never happen. - } - } + delimiter_state.remove_trailing_delimiter(&mut output); + // TODO + // Should the output be converted to UTF-8? write!( stdout, "{}{}", @@ -308,9 +230,114 @@ fn paste( Ok(()) } -// Unescape all special characters +/// Unescape all special characters fn unescape(s: &str) -> String { s.replace("\\n", "\n") .replace("\\t", "\t") .replace("\\\\", "\\") } + +fn parse_delimiters(delimiters: &str) -> Box<[Box<[u8]>]> { + let delimiters_unescaped = unescape(delimiters).chars().collect::>(); + + let delimiters_unescaped_len = delimiters_unescaped.len(); + + if delimiters_unescaped_len > 0 { + let mut vec = Vec::>::with_capacity(delimiters_unescaped_len); + + // a buffer of length four is large enough to encode any char + let mut buffer = [0; 4]; + + for delimiter in delimiters_unescaped { + let delimiter_encoded = delimiter.encode_utf8(&mut buffer); + + vec.push(Box::from(delimiter_encoded.as_bytes())); + } + + vec.into_boxed_slice() + } else { + Box::new([]) + } +} + +enum DelimiterState<'a> { + NoDelimiters, + OneDelimiter { + delimiter: &'a [u8], + }, + MultipleDelimiters { + current_delimiter: &'a [u8], + delimiters: &'a [Box<[u8]>], + delimiters_iter: Cycle>>, + }, +} + +impl<'a> DelimiterState<'a> { + /// If there are multiple delimiters, advance the iterator over the delimiter list. + /// This is a no-op unless there are multiple delimiters. + fn advance_to_next_delimiter(&mut self) { + if let DelimiterState::MultipleDelimiters { + current_delimiter, + delimiters_iter, + .. + } = self + { + // Unwrap because "delimiters_encoded_iter" is a cycle iter and was created from a non-empty slice + *current_delimiter = delimiters_iter.next().unwrap(); + } + } + + /// This should only be used to return to the start of the delimiter list after a file has been processed. + /// This should only be used when the "serial" option is disabled. + /// This is a no-op unless there are multiple delimiters. + fn reset_to_first_delimiter(&mut self) { + if let DelimiterState::MultipleDelimiters { + delimiters_iter, + delimiters, + .. + } = self + { + *delimiters_iter = delimiters.iter().cycle(); + } + } + + /// Remove the trailing delimiter. + /// If there are no delimiters, this is a no-op. + fn remove_trailing_delimiter(&mut self, output: &mut Vec) { + let delimiter_length = match self { + DelimiterState::OneDelimiter { delimiter } => delimiter.len(), + DelimiterState::MultipleDelimiters { + current_delimiter, .. + } => current_delimiter.len(), + _ => { + return; + } + }; + + let output_len = output.len(); + + if let Some(output_without_delimiter_length) = output_len.checked_sub(delimiter_length) { + output.truncate(output_without_delimiter_length); + } else { + // This branch is NOT unreachable, must be skipped + // "output" should be empty in this case + assert!(output_len == 0); + } + } + + /// Append the current delimiter to `output`. + /// If there are no delimiters, this is a no-op. + fn write_delimiter(&mut self, output: &mut Vec) { + match self { + DelimiterState::OneDelimiter { delimiter } => { + output.extend_from_slice(delimiter); + } + DelimiterState::MultipleDelimiters { + current_delimiter, .. + } => { + output.extend_from_slice(current_delimiter); + } + _ => {} + } + } +} diff --git a/tests/by-util/test_paste.rs b/tests/by-util/test_paste.rs index 674e2c74e53..81f05d37a39 100644 --- a/tests/by-util/test_paste.rs +++ b/tests/by-util/test_paste.rs @@ -195,9 +195,9 @@ fn test_delimiter_list_ending_with_unescaped_backslash() { #[test] fn test_delimiter_list_empty() { - for st in ["-d", "--delimiters"] { + for option_style in ["-d", "--delimiters"] { new_ucmd!() - .args(&[st, "", "-s", "--", "-"]) + .args(&[option_style, "", "-s"]) .pipe_in( "\ A ALPHA 1 _ @@ -214,6 +214,33 @@ A ALPHA 1 _B BRAVO 2 _C CHARLIE 3 _ } } +// Was panicking (usize subtraction that would have resulted in a negative number) +// Not observable in release builds, since integer overflow checking is not enabled +#[test] +fn test_delimiter_truncation() { + for option_style in ["-d", "--delimiters"] { + new_ucmd!() + .args(&[option_style, "!@#", "-s", "-", "-", "-"]) + .pipe_in( + "\ +FIRST +SECOND +THIRD +FOURTH +ABCDEFG +", + ) + .succeeds() + .stdout_only( + "\ +FIRST!SECOND@THIRD#FOURTH!ABCDEFG + + +", + ); + } +} + #[test] fn test_data() { for example in EXAMPLE_DATA { From f096a214e29ffd93669eb40f281bbdc25d77248d Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Tue, 8 Oct 2024 06:32:39 -0500 Subject: [PATCH 4/7] Fix additional paste bugs --- src/uu/paste/src/paste.rs | 95 ++++++++++++++++++++++++------------- tests/by-util/test_paste.rs | 48 +++++++++++++++++++ 2 files changed, 110 insertions(+), 33 deletions(-) diff --git a/src/uu/paste/src/paste.rs b/src/uu/paste/src/paste.rs index 6249b6e5ca1..de325a69ea0 100644 --- a/src/uu/paste/src/paste.rs +++ b/src/uu/paste/src/paste.rs @@ -98,6 +98,9 @@ fn paste( delimiters: &str, line_ending: LineEnding, ) -> UResult<()> { + let line_ending_byte = u8::from(line_ending); + let line_ending_byte_array_ref = &[line_ending_byte]; + let mut files = Vec::with_capacity(filenames.len()); for name in filenames { let file = if name == "-" { @@ -112,7 +115,13 @@ fn paste( files.push(file); } - if delimiters.ends_with('\\') && !delimiters.ends_with("\\\\") { + let trailing_backslashes_count = delimiters + .chars() + .rev() + .take_while(|&ch| ch == '\\') + .count(); + + if trailing_backslashes_count % 2 != 0 { return Err(USimpleError::new( 1, format!("delimiter list ends with an unescaped backslash: {delimiters}"), @@ -129,7 +138,7 @@ fn paste( [first_delimiter, ..] => DelimiterState::MultipleDelimiters { current_delimiter: first_delimiter, delimiters: &unescaped_and_encoded_delimiters, - delimiters_iter: unescaped_and_encoded_delimiters.iter().cycle(), + delimiters_iterator: unescaped_and_encoded_delimiters.iter().cycle(), }, }; @@ -144,10 +153,10 @@ fn paste( loop { delimiter_state.advance_to_next_delimiter(); - match read_until(file.as_mut(), line_ending as u8, &mut output) { + match read_until(file.as_mut(), line_ending_byte, &mut output) { Ok(0) => break, Ok(_) => { - if output.ends_with(&[line_ending as u8]) { + if output.ends_with(line_ending_byte_array_ref) { output.pop(); } @@ -161,14 +170,8 @@ fn paste( delimiter_state.remove_trailing_delimiter(&mut output); - // TODO - // Should the output be converted to UTF-8? - write!( - stdout, - "{}{}", - String::from_utf8_lossy(&output), - line_ending - )?; + stdout.write_all(&output)?; + stdout.write_all(line_ending_byte_array_ref)?; } } else { let mut eof = vec![false; files.len()]; @@ -184,13 +187,13 @@ fn paste( if eof[i] { eof_count += 1; } else { - match read_until(file.as_mut(), line_ending as u8, &mut output) { + match read_until(file.as_mut(), line_ending_byte, &mut output) { Ok(0) => { eof[i] = true; eof_count += 1; } Ok(_) => { - if output.ends_with(&[line_ending as u8]) { + if output.ends_with(line_ending_byte_array_ref) { output.pop(); } } @@ -216,14 +219,8 @@ fn paste( delimiter_state.remove_trailing_delimiter(&mut output); - // TODO - // Should the output be converted to UTF-8? - write!( - stdout, - "{}{}", - String::from_utf8_lossy(&output), - line_ending - )?; + stdout.write_all(&output)?; + stdout.write_all(line_ending_byte_array_ref)?; } } @@ -231,10 +228,42 @@ fn paste( } /// Unescape all special characters -fn unescape(s: &str) -> String { - s.replace("\\n", "\n") - .replace("\\t", "\t") - .replace("\\\\", "\\") +fn unescape(input: &str) -> String { + /// A single backslash char + const BACKSLASH: char = '\\'; + + let mut string = String::with_capacity(input.len()); + + let mut chars = input.chars(); + + while let Some(char) = chars.next() { + match char { + BACKSLASH => match chars.next() { + // Keep "\" if it is the last char + // "\\" to "\" + None | Some(BACKSLASH) => { + string.push(BACKSLASH); + } + // "\n" to U+000A + Some('n') => { + string.push('\n'); + } + // "\t" to U+0009 + Some('t') => { + string.push('\t'); + } + Some(other_char) => { + string.push(BACKSLASH); + string.push(other_char); + } + }, + non_backslash_char => { + string.push(non_backslash_char); + } + } + } + + string } fn parse_delimiters(delimiters: &str) -> Box<[Box<[u8]>]> { @@ -268,7 +297,7 @@ enum DelimiterState<'a> { MultipleDelimiters { current_delimiter: &'a [u8], delimiters: &'a [Box<[u8]>], - delimiters_iter: Cycle>>, + delimiters_iterator: Cycle>>, }, } @@ -278,12 +307,12 @@ impl<'a> DelimiterState<'a> { fn advance_to_next_delimiter(&mut self) { if let DelimiterState::MultipleDelimiters { current_delimiter, - delimiters_iter, + delimiters_iterator, .. } = self { - // Unwrap because "delimiters_encoded_iter" is a cycle iter and was created from a non-empty slice - *current_delimiter = delimiters_iter.next().unwrap(); + // Unwrap because `delimiters_iterator` is a cycle iter and was created from a non-empty slice + *current_delimiter = delimiters_iterator.next().unwrap(); } } @@ -292,12 +321,12 @@ impl<'a> DelimiterState<'a> { /// This is a no-op unless there are multiple delimiters. fn reset_to_first_delimiter(&mut self) { if let DelimiterState::MultipleDelimiters { - delimiters_iter, + delimiters_iterator, delimiters, .. } = self { - *delimiters_iter = delimiters.iter().cycle(); + *delimiters_iterator = delimiters.iter().cycle(); } } @@ -320,7 +349,7 @@ impl<'a> DelimiterState<'a> { output.truncate(output_without_delimiter_length); } else { // This branch is NOT unreachable, must be skipped - // "output" should be empty in this case + // `output` should be empty in this case assert!(output_len == 0); } } diff --git a/tests/by-util/test_paste.rs b/tests/by-util/test_paste.rs index 81f05d37a39..e7b914624bb 100644 --- a/tests/by-util/test_paste.rs +++ b/tests/by-util/test_paste.rs @@ -241,6 +241,54 @@ FIRST!SECOND@THIRD#FOURTH!ABCDEFG } } +#[test] +fn test_non_utf8_input() { + const PREFIX_LEN: usize = 16; + const MIDDLE_LEN: usize = 3; + const SUFFIX_LEN: usize = 2; + + const TOTAL_LEN: usize = PREFIX_LEN + MIDDLE_LEN + SUFFIX_LEN; + + const PREFIX: &[u8; PREFIX_LEN] = b"Non-UTF-8 test: "; + // 0xC0 is not valid UTF-8 + const MIDDLE: &[u8; MIDDLE_LEN] = &[0xC0, 0x00, 0xC0]; + const SUFFIX: &[u8; SUFFIX_LEN] = b".\n"; + + let mut input = Vec::::with_capacity(TOTAL_LEN); + + input.extend_from_slice(PREFIX); + + input.extend_from_slice(MIDDLE); + + input.extend_from_slice(SUFFIX); + + let input_clone = input.clone(); + + new_ucmd!() + .pipe_in(input_clone) + .succeeds() + .stdout_only_bytes(input); +} + +#[test] +fn test_three_trailing_backslashes_delimiter() { + const ONE_BACKSLASH_STR: &str = "\\"; + + let three_backslashes_string = ONE_BACKSLASH_STR.repeat(3); + + for option_style in ["-d", "--delimiters"] { + new_ucmd!() + .args(&[option_style, &three_backslashes_string]) + .fails() + .no_stdout() + .stderr_str_check(|st| { + st.ends_with(&format!( + ": delimiter list ends with an unescaped backslash: {three_backslashes_string}\n" + )) + }); + } +} + #[test] fn test_data() { for example in EXAMPLE_DATA { From 5a54253dc92dac5243225c9e31b3cbead85ed006 Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Tue, 8 Oct 2024 10:41:24 -0500 Subject: [PATCH 5/7] Fix additional paste bugs --- src/uu/paste/src/paste.rs | 73 +++++++++++++++++++------------------ tests/by-util/test_paste.rs | 53 +++++++++++++++++++++++++++ 2 files changed, 90 insertions(+), 36 deletions(-) diff --git a/src/uu/paste/src/paste.rs b/src/uu/paste/src/paste.rs index de325a69ea0..1acb91f6d20 100644 --- a/src/uu/paste/src/paste.rs +++ b/src/uu/paste/src/paste.rs @@ -227,66 +227,67 @@ fn paste( Ok(()) } -/// Unescape all special characters -fn unescape(input: &str) -> String { +fn parse_delimiters(delimiters: &str) -> Box<[Box<[u8]>]> { /// A single backslash char const BACKSLASH: char = '\\'; - let mut string = String::with_capacity(input.len()); + fn add_one_byte_single_char_delimiter(vec: &mut Vec>, byte: u8) { + vec.push(Box::new([byte])); + } + + // a buffer of length four is large enough to encode any char + let mut buffer = [0; 4]; + + let mut add_single_char_delimiter = |vec: &mut Vec>, ch: char| { + let delimiter_encoded = ch.encode_utf8(&mut buffer); + + vec.push(Box::from(delimiter_encoded.as_bytes())); + }; - let mut chars = input.chars(); + let mut vec = Vec::>::with_capacity(delimiters.len()); + let mut chars = delimiters.chars(); + + // Unescape all special characters while let Some(char) = chars.next() { match char { + // Empty string (not a null character) BACKSLASH => match chars.next() { - // Keep "\" if it is the last char + // "Empty string (not a null character)" + // https://pubs.opengroup.org/onlinepubs/9799919799/utilities/paste.html + Some('0') => { + vec.push(Box::<[u8; 0]>::new([])); + } // "\\" to "\" - None | Some(BACKSLASH) => { - string.push(BACKSLASH); + Some(BACKSLASH) => { + add_one_byte_single_char_delimiter(&mut vec, b'\\'); } // "\n" to U+000A Some('n') => { - string.push('\n'); + add_one_byte_single_char_delimiter(&mut vec, b'\n'); } // "\t" to U+0009 Some('t') => { - string.push('\t'); + add_one_byte_single_char_delimiter(&mut vec, b'\t'); } Some(other_char) => { - string.push(BACKSLASH); - string.push(other_char); + // "If any other characters follow the , the results are unspecified." + // https://pubs.opengroup.org/onlinepubs/9799919799/utilities/paste.html + // However, other implementations remove the backslash + // See "test_posix_unspecified_delimiter" + add_single_char_delimiter(&mut vec, other_char); + } + None => { + unreachable!("Delimiter list cannot end with an unescaped backslash"); } }, non_backslash_char => { - string.push(non_backslash_char); + add_single_char_delimiter(&mut vec, non_backslash_char); } } } - string -} - -fn parse_delimiters(delimiters: &str) -> Box<[Box<[u8]>]> { - let delimiters_unescaped = unescape(delimiters).chars().collect::>(); - - let delimiters_unescaped_len = delimiters_unescaped.len(); - - if delimiters_unescaped_len > 0 { - let mut vec = Vec::>::with_capacity(delimiters_unescaped_len); - - // a buffer of length four is large enough to encode any char - let mut buffer = [0; 4]; - - for delimiter in delimiters_unescaped { - let delimiter_encoded = delimiter.encode_utf8(&mut buffer); - - vec.push(Box::from(delimiter_encoded.as_bytes())); - } - - vec.into_boxed_slice() - } else { - Box::new([]) - } + vec.into_boxed_slice() } enum DelimiterState<'a> { diff --git a/tests/by-util/test_paste.rs b/tests/by-util/test_paste.rs index e7b914624bb..ac0cc93b8fa 100644 --- a/tests/by-util/test_paste.rs +++ b/tests/by-util/test_paste.rs @@ -289,6 +289,59 @@ fn test_three_trailing_backslashes_delimiter() { } } +// "If any other characters follow the , the results are unspecified." +// https://pubs.opengroup.org/onlinepubs/9799919799/utilities/paste.html +// However, other implementations remove the backslash +#[test] +fn test_posix_unspecified_delimiter() { + for option_style in ["-d", "--delimiters"] { + new_ucmd!() + // This is not "\\z", but "\z" + .args(&[option_style, "\\z", "-s"]) + .pipe_in( + "\ +1 +2 +3 +4 +", + ) + .succeeds() + .stdout_only( + "\ +1z2z3z4 +", + ); + } +} + +// "Empty string (not a null character)" +// https://pubs.opengroup.org/onlinepubs/9799919799/utilities/paste.html +#[test] +fn test_backslash_zero_delimiter() { + for option_style in ["-d", "--delimiters"] { + new_ucmd!() + // This is "\0z\0" + .args(&[option_style, "\\0z\\0", "-s"]) + .pipe_in( + "\ +1 +2 +3 +4 +5 +6 +", + ) + .succeeds() + .stdout_only( + "\ +12z345z6 +", + ); + } +} + #[test] fn test_data() { for example in EXAMPLE_DATA { From 6050dd034740f7f4ecd7102ebbec52907f4a1ff6 Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Wed, 9 Oct 2024 00:21:24 -0500 Subject: [PATCH 6/7] Simplify backslash delimiter validation --- src/uu/paste/src/paste.rs | 241 ++++++++++++++++++------------------ tests/by-util/test_paste.rs | 72 ++++++----- 2 files changed, 162 insertions(+), 151 deletions(-) diff --git a/src/uu/paste/src/paste.rs b/src/uu/paste/src/paste.rs index 1acb91f6d20..9d26197813b 100644 --- a/src/uu/paste/src/paste.rs +++ b/src/uu/paste/src/paste.rs @@ -3,15 +3,14 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -// spell-checker:ignore (ToDO) delim - use clap::{crate_version, Arg, ArgAction, Command}; +use std::cell::{OnceCell, RefCell}; use std::fs::File; -use std::io::{stdin, stdout, BufRead, BufReader, Read, Write}; +use std::io::{stdin, stdout, BufRead, BufReader, Stdin, Write}; use std::iter::Cycle; -use std::path::Path; +use std::rc::Rc; use std::slice::Iter; -use uucore::error::{FromIo, UResult, USimpleError}; +use uucore::error::{UResult, USimpleError}; use uucore::line_ending::LineEnding; use uucore::{format_usage, help_about, help_usage}; @@ -25,18 +24,6 @@ mod options { pub const ZERO_TERMINATED: &str = "zero-terminated"; } -// Wraps BufReader and stdin -fn read_until( - reader: Option<&mut BufReader>, - byte: u8, - buf: &mut Vec, -) -> std::io::Result { - match reader { - Some(reader) => reader.read_until(byte, buf), - None => stdin().lock().read_until(byte, buf), - } -} - #[uucore::main] pub fn uumain(args: impl uucore::Args) -> UResult<()> { let matches = uu_app().try_get_matches_from(args)?; @@ -98,73 +85,52 @@ fn paste( delimiters: &str, line_ending: LineEnding, ) -> UResult<()> { - let line_ending_byte = u8::from(line_ending); - let line_ending_byte_array_ref = &[line_ending_byte]; + let unescaped_and_encoded_delimiters = parse_delimiters(delimiters)?; + + let stdin_once_cell = OnceCell::>>::new(); + + let mut input_source_vec = Vec::with_capacity(filenames.len()); - let mut files = Vec::with_capacity(filenames.len()); - for name in filenames { - let file = if name == "-" { - None - } else { - let path = Path::new(&name); - // TODO - // Is `map_err_context` correct here? - let file = File::open(path).map_err_context(String::new)?; - Some(BufReader::new(file)) + for filename in filenames { + let input_source = match filename.as_str() { + "-" => InputSource::StandardInput( + stdin_once_cell + .get_or_init(|| Rc::new(RefCell::new(stdin()))) + .clone(), + ), + st => { + let file = File::open(st)?; + + InputSource::File(BufReader::new(file)) + } }; - files.push(file); - } - let trailing_backslashes_count = delimiters - .chars() - .rev() - .take_while(|&ch| ch == '\\') - .count(); - - if trailing_backslashes_count % 2 != 0 { - return Err(USimpleError::new( - 1, - format!("delimiter list ends with an unescaped backslash: {delimiters}"), - )); + input_source_vec.push(input_source); } - let unescaped_and_encoded_delimiters = parse_delimiters(delimiters); - - let mut delimiter_state = match unescaped_and_encoded_delimiters.as_ref() { - [] => DelimiterState::NoDelimiters, - [only_delimiter] => DelimiterState::OneDelimiter { - delimiter: only_delimiter, - }, - [first_delimiter, ..] => DelimiterState::MultipleDelimiters { - current_delimiter: first_delimiter, - delimiters: &unescaped_and_encoded_delimiters, - delimiters_iterator: unescaped_and_encoded_delimiters.iter().cycle(), - }, - }; - let mut stdout = stdout().lock(); + let line_ending_byte = u8::from(line_ending); + let line_ending_byte_array_ref = &[line_ending_byte]; + + let input_source_vec_len = input_source_vec.len(); + + let mut delimiter_state = DelimiterState::new(&unescaped_and_encoded_delimiters); + let mut output = Vec::new(); if serial { - for file in &mut files { + for input_source in &mut input_source_vec { output.clear(); loop { - delimiter_state.advance_to_next_delimiter(); - - match read_until(file.as_mut(), line_ending_byte, &mut output) { - Ok(0) => break, - Ok(_) => { - if output.ends_with(line_ending_byte_array_ref) { - output.pop(); - } + match input_source.read_until(line_ending_byte, &mut output)? { + 0 => break, + _ => { + remove_trailing_line_ending_byte(line_ending_byte, &mut output); delimiter_state.write_delimiter(&mut output); } - // TODO - // Is `map_err_context` correct here? - Err(e) => return Err(e.map_err_context(String::new)), } } @@ -174,60 +140,53 @@ fn paste( stdout.write_all(line_ending_byte_array_ref)?; } } else { - let mut eof = vec![false; files.len()]; + let mut eof = vec![false; input_source_vec_len]; loop { output.clear(); let mut eof_count = 0; - for (i, file) in files.iter_mut().enumerate() { - delimiter_state.advance_to_next_delimiter(); - + for (i, input_source) in input_source_vec.iter_mut().enumerate() { if eof[i] { eof_count += 1; } else { - match read_until(file.as_mut(), line_ending_byte, &mut output) { - Ok(0) => { + match input_source.read_until(line_ending_byte, &mut output)? { + 0 => { eof[i] = true; eof_count += 1; } - Ok(_) => { - if output.ends_with(line_ending_byte_array_ref) { - output.pop(); - } + _ => { + remove_trailing_line_ending_byte(line_ending_byte, &mut output); } - // TODO - // Is `map_err_context` correct here? - Err(e) => return Err(e.map_err_context(String::new)), } } delimiter_state.write_delimiter(&mut output); } - if files.len() == eof_count { + if eof_count == input_source_vec_len { break; } + delimiter_state.remove_trailing_delimiter(&mut output); + + stdout.write_all(&output)?; + stdout.write_all(line_ending_byte_array_ref)?; + // Quote: // When the -s option is not specified: // [...] // The delimiter shall be reset to the first element of list after each file operand is processed. // https://pubs.opengroup.org/onlinepubs/9799919799/utilities/paste.html delimiter_state.reset_to_first_delimiter(); - - delimiter_state.remove_trailing_delimiter(&mut output); - - stdout.write_all(&output)?; - stdout.write_all(line_ending_byte_array_ref)?; } } Ok(()) } -fn parse_delimiters(delimiters: &str) -> Box<[Box<[u8]>]> { +fn parse_delimiters(delimiters: &str) -> UResult]>> { /// A single backslash char const BACKSLASH: char = '\\'; @@ -251,14 +210,13 @@ fn parse_delimiters(delimiters: &str) -> Box<[Box<[u8]>]> { // Unescape all special characters while let Some(char) = chars.next() { match char { - // Empty string (not a null character) BACKSLASH => match chars.next() { // "Empty string (not a null character)" // https://pubs.opengroup.org/onlinepubs/9799919799/utilities/paste.html Some('0') => { vec.push(Box::<[u8; 0]>::new([])); } - // "\\" to "\" + // "\\" to "\" (U+005C) Some(BACKSLASH) => { add_one_byte_single_char_delimiter(&mut vec, b'\\'); } @@ -278,7 +236,10 @@ fn parse_delimiters(delimiters: &str) -> Box<[Box<[u8]>]> { add_single_char_delimiter(&mut vec, other_char); } None => { - unreachable!("Delimiter list cannot end with an unescaped backslash"); + return Err(USimpleError::new( + 1, + format!("delimiter list ends with an unescaped backslash: {delimiters}"), + )); } }, non_backslash_char => { @@ -287,14 +248,20 @@ fn parse_delimiters(delimiters: &str) -> Box<[Box<[u8]>]> { } } - vec.into_boxed_slice() + Ok(vec.into_boxed_slice()) +} + +fn remove_trailing_line_ending_byte(line_ending_byte: u8, output: &mut Vec) { + if let Some(&byte) = output.last() { + if byte == line_ending_byte { + assert!(output.pop() == Some(line_ending_byte)); + } + } } enum DelimiterState<'a> { NoDelimiters, - OneDelimiter { - delimiter: &'a [u8], - }, + OneDelimiter(&'a [u8]), MultipleDelimiters { current_delimiter: &'a [u8], delimiters: &'a [Box<[u8]>], @@ -303,17 +270,22 @@ enum DelimiterState<'a> { } impl<'a> DelimiterState<'a> { - /// If there are multiple delimiters, advance the iterator over the delimiter list. - /// This is a no-op unless there are multiple delimiters. - fn advance_to_next_delimiter(&mut self) { - if let DelimiterState::MultipleDelimiters { - current_delimiter, - delimiters_iterator, - .. - } = self - { - // Unwrap because `delimiters_iterator` is a cycle iter and was created from a non-empty slice - *current_delimiter = delimiters_iterator.next().unwrap(); + fn new(unescaped_and_encoded_delimiters: &'a [Box<[u8]>]) -> DelimiterState<'a> { + match unescaped_and_encoded_delimiters { + [] => DelimiterState::NoDelimiters, + [only_delimiter] => { + // -d '\0' is equivalent to -d '' + if only_delimiter.is_empty() { + DelimiterState::NoDelimiters + } else { + DelimiterState::OneDelimiter(only_delimiter) + } + } + [first_delimiter, ..] => DelimiterState::MultipleDelimiters { + current_delimiter: first_delimiter, + delimiters: unescaped_and_encoded_delimiters, + delimiters_iterator: unescaped_and_encoded_delimiters.iter().cycle(), + }, } } @@ -335,7 +307,7 @@ impl<'a> DelimiterState<'a> { /// If there are no delimiters, this is a no-op. fn remove_trailing_delimiter(&mut self, output: &mut Vec) { let delimiter_length = match self { - DelimiterState::OneDelimiter { delimiter } => delimiter.len(), + DelimiterState::OneDelimiter(only_delimiter) => only_delimiter.len(), DelimiterState::MultipleDelimiters { current_delimiter, .. } => current_delimiter.len(), @@ -344,14 +316,18 @@ impl<'a> DelimiterState<'a> { } }; - let output_len = output.len(); - - if let Some(output_without_delimiter_length) = output_len.checked_sub(delimiter_length) { - output.truncate(output_without_delimiter_length); - } else { - // This branch is NOT unreachable, must be skipped - // `output` should be empty in this case - assert!(output_len == 0); + // `delimiter_length` will be zero if the current delimiter is a "\0" delimiter + if delimiter_length > 0 { + let output_len = output.len(); + + if let Some(output_without_delimiter_length) = output_len.checked_sub(delimiter_length) + { + output.truncate(output_without_delimiter_length); + } else { + // This branch is NOT unreachable, must be skipped + // `output` should be empty in this case + assert!(output_len == 0); + } } } @@ -359,15 +335,42 @@ impl<'a> DelimiterState<'a> { /// If there are no delimiters, this is a no-op. fn write_delimiter(&mut self, output: &mut Vec) { match self { - DelimiterState::OneDelimiter { delimiter } => { - output.extend_from_slice(delimiter); + DelimiterState::OneDelimiter(only_delimiter) => { + output.extend_from_slice(only_delimiter); } DelimiterState::MultipleDelimiters { - current_delimiter, .. + current_delimiter, + delimiters_iterator, + .. } => { - output.extend_from_slice(current_delimiter); + // Unwrap because `delimiters_iterator` is a cycle iter and was created from a non-empty slice + let bo = delimiters_iterator.next().unwrap(); + + output.extend_from_slice(bo); + + *current_delimiter = bo; } _ => {} } } } + +enum InputSource { + File(BufReader), + StandardInput(Rc>), +} + +impl InputSource { + fn read_until(&mut self, byte: u8, buf: &mut Vec) -> UResult { + let us = match self { + Self::File(bu) => bu.read_until(byte, buf)?, + Self::StandardInput(rc) => rc + .try_borrow() + .map_err(|bo| USimpleError::new(1, format!("{bo}")))? + .lock() + .read_until(byte, buf)?, + }; + + Ok(us) + } +} diff --git a/tests/by-util/test_paste.rs b/tests/by-util/test_paste.rs index ac0cc93b8fa..0e53a5c6822 100644 --- a/tests/by-util/test_paste.rs +++ b/tests/by-util/test_paste.rs @@ -2,6 +2,7 @@ // // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. + use crate::common::util::TestScenario; struct TestData<'b> { @@ -11,7 +12,7 @@ struct TestData<'b> { out: &'b str, } -static EXAMPLE_DATA: &[TestData] = &[ +const EXAMPLE_DATA: &[TestData] = &[ // Ensure that paste properly handles files lacking a final newline. TestData { name: "no-nl-1", @@ -172,7 +173,7 @@ fn test_delimiter_list_ending_with_escaped_backslash() { at.write(&file, one_in); ins.push(file); } - ucmd.args(&[d, "\\\\"]) + ucmd.args(&[d, r#"\\"#]) .args(&ins) .succeeds() .stdout_is("a\\b\n"); @@ -183,13 +184,14 @@ fn test_delimiter_list_ending_with_escaped_backslash() { fn test_delimiter_list_ending_with_unescaped_backslash() { for d in ["-d", "--delimiters"] { new_ucmd!() - .args(&[d, "\\"]) + .args(&[d, r#"\"#]) .fails() - .stderr_contains("delimiter list ends with an unescaped backslash: \\"); + .stderr_contains(r#"delimiter list ends with an unescaped backslash: \"#); + new_ucmd!() - .args(&[d, "_\\"]) + .args(&[d, r#"_\"#]) .fails() - .stderr_contains("delimiter list ends with an unescaped backslash: _\\"); + .stderr_contains(r#"delimiter list ends with an unescaped backslash: _\"#); } } @@ -243,36 +245,18 @@ FIRST!SECOND@THIRD#FOURTH!ABCDEFG #[test] fn test_non_utf8_input() { - const PREFIX_LEN: usize = 16; - const MIDDLE_LEN: usize = 3; - const SUFFIX_LEN: usize = 2; - - const TOTAL_LEN: usize = PREFIX_LEN + MIDDLE_LEN + SUFFIX_LEN; - - const PREFIX: &[u8; PREFIX_LEN] = b"Non-UTF-8 test: "; // 0xC0 is not valid UTF-8 - const MIDDLE: &[u8; MIDDLE_LEN] = &[0xC0, 0x00, 0xC0]; - const SUFFIX: &[u8; SUFFIX_LEN] = b".\n"; - - let mut input = Vec::::with_capacity(TOTAL_LEN); - - input.extend_from_slice(PREFIX); - - input.extend_from_slice(MIDDLE); - - input.extend_from_slice(SUFFIX); - - let input_clone = input.clone(); + const INPUT: &[u8] = b"Non-UTF-8 test: \xC0\x00\xC0.\n"; new_ucmd!() - .pipe_in(input_clone) + .pipe_in(INPUT) .succeeds() - .stdout_only_bytes(input); + .stdout_only_bytes(INPUT); } #[test] fn test_three_trailing_backslashes_delimiter() { - const ONE_BACKSLASH_STR: &str = "\\"; + const ONE_BACKSLASH_STR: &str = r#"\"#; let three_backslashes_string = ONE_BACKSLASH_STR.repeat(3); @@ -296,8 +280,7 @@ fn test_three_trailing_backslashes_delimiter() { fn test_posix_unspecified_delimiter() { for option_style in ["-d", "--delimiters"] { new_ucmd!() - // This is not "\\z", but "\z" - .args(&[option_style, "\\z", "-s"]) + .args(&[option_style, r#"\z"#, "-s"]) .pipe_in( "\ 1 @@ -321,8 +304,7 @@ fn test_posix_unspecified_delimiter() { fn test_backslash_zero_delimiter() { for option_style in ["-d", "--delimiters"] { new_ucmd!() - // This is "\0z\0" - .args(&[option_style, "\\0z\\0", "-s"]) + .args(&[option_style, r#"\0z\0"#, "-s"]) .pipe_in( "\ 1 @@ -342,6 +324,32 @@ fn test_backslash_zero_delimiter() { } } +// As of 2024-10-09, only bsdutils (https://github.com/dcantrell/bsdutils, derived from FreeBSD) and toybox handle +// multibyte delimiter characters in the way a user would likely expect. BusyBox and GNU Core Utilities do not. +#[test] +fn test_multi_byte_delimiter() { + for option_style in ["-d", "--delimiters"] { + new_ucmd!() + .args(&[option_style, "!ß@", "-s"]) + .pipe_in( + "\ +1 +2 +3 +4 +5 +6 +", + ) + .succeeds() + .stdout_only( + "\ +1!2ß3@4!5ß6 +", + ); + } +} + #[test] fn test_data() { for example in EXAMPLE_DATA { From a7393ef1f897c987985cadeff1d94d57f424b2f1 Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Wed, 9 Oct 2024 00:39:16 -0500 Subject: [PATCH 7/7] Fix Clippy violations --- tests/by-util/test_paste.rs | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/tests/by-util/test_paste.rs b/tests/by-util/test_paste.rs index 0e53a5c6822..75fc9389519 100644 --- a/tests/by-util/test_paste.rs +++ b/tests/by-util/test_paste.rs @@ -3,6 +3,8 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. +// spell-checker:ignore bsdutils toybox + use crate::common::util::TestScenario; struct TestData<'b> { @@ -173,7 +175,7 @@ fn test_delimiter_list_ending_with_escaped_backslash() { at.write(&file, one_in); ins.push(file); } - ucmd.args(&[d, r#"\\"#]) + ucmd.args(&[d, r"\\"]) .args(&ins) .succeeds() .stdout_is("a\\b\n"); @@ -184,14 +186,19 @@ fn test_delimiter_list_ending_with_escaped_backslash() { fn test_delimiter_list_ending_with_unescaped_backslash() { for d in ["-d", "--delimiters"] { new_ucmd!() - .args(&[d, r#"\"#]) + .args(&[d, r"\"]) + .fails() + .stderr_contains(r"delimiter list ends with an unescaped backslash: \"); + + new_ucmd!() + .args(&[d, r"\\\"]) .fails() - .stderr_contains(r#"delimiter list ends with an unescaped backslash: \"#); + .stderr_contains(r"delimiter list ends with an unescaped backslash: \\\"); new_ucmd!() - .args(&[d, r#"_\"#]) + .args(&[d, r"_\"]) .fails() - .stderr_contains(r#"delimiter list ends with an unescaped backslash: _\"#); + .stderr_contains(r"delimiter list ends with an unescaped backslash: _\"); } } @@ -256,7 +263,7 @@ fn test_non_utf8_input() { #[test] fn test_three_trailing_backslashes_delimiter() { - const ONE_BACKSLASH_STR: &str = r#"\"#; + const ONE_BACKSLASH_STR: &str = r"\"; let three_backslashes_string = ONE_BACKSLASH_STR.repeat(3); @@ -280,7 +287,7 @@ fn test_three_trailing_backslashes_delimiter() { fn test_posix_unspecified_delimiter() { for option_style in ["-d", "--delimiters"] { new_ucmd!() - .args(&[option_style, r#"\z"#, "-s"]) + .args(&[option_style, r"\z", "-s"]) .pipe_in( "\ 1 @@ -304,7 +311,7 @@ fn test_posix_unspecified_delimiter() { fn test_backslash_zero_delimiter() { for option_style in ["-d", "--delimiters"] { new_ucmd!() - .args(&[option_style, r#"\0z\0"#, "-s"]) + .args(&[option_style, r"\0z\0", "-s"]) .pipe_in( "\ 1