diff --git a/src/uu/echo/src/echo.rs b/src/uu/echo/src/echo.rs index c94443822e0..6d8c6242111 100644 --- a/src/uu/echo/src/echo.rs +++ b/src/uu/echo/src/echo.rs @@ -3,12 +3,15 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. +use clap::builder::ValueParser; +use clap::parser::ValuesRef; use clap::{crate_version, Arg, ArgAction, Command}; -use std::io::{self, Write}; +use std::ffi::{OsStr, OsString}; +use std::io::{self, StdoutLock, Write}; use std::iter::Peekable; use std::ops::ControlFlow; -use std::str::Chars; -use uucore::error::{FromIo, UResult}; +use std::slice::Iter; +use uucore::error::{UResult, USimpleError}; use uucore::{format_usage, help_about, help_section, help_usage}; const ABOUT: &str = help_about!("echo.md"); @@ -22,94 +25,249 @@ mod options { pub const DISABLE_BACKSLASH_ESCAPE: &str = "disable_backslash_escape"; } -#[repr(u8)] -#[derive(Clone, Copy)] +enum BackslashNumberType { + OctalStartingWithNonZero(u8), + OctalStartingWithZero, + Hexadecimal, +} + +impl BackslashNumberType { + fn base(&self) -> Base { + match self { + BackslashNumberType::OctalStartingWithZero + | BackslashNumberType::OctalStartingWithNonZero(_) => Base::Octal, + BackslashNumberType::Hexadecimal => Base::Hexadecimal, + } + } +} + enum Base { - Oct = 8, - Hex = 16, + Octal, + Hexadecimal, } impl Base { - fn max_digits(&self) -> u8 { + fn ascii_to_number(&self, digit: u8) -> Option<u8> { + fn octal_ascii_digit_to_number(digit: u8) -> Option<u8> { + let number = match digit { + b'0' => 0, + b'1' => 1, + b'2' => 2, + b'3' => 3, + b'4' => 4, + b'5' => 5, + b'6' => 6, + b'7' => 7, + _ => { + return None; + } + }; + + Some(number) + } + + fn hexadecimal_ascii_digit_to_number(digit: u8) -> Option<u8> { + let number = match digit { + b'0' => 0, + b'1' => 1, + b'2' => 2, + b'3' => 3, + b'4' => 4, + b'5' => 5, + b'6' => 6, + b'7' => 7, + b'8' => 8, + b'9' => 9, + b'A' | b'a' => 10, + b'B' | b'b' => 11, + b'C' | b'c' =>
12, + b'D' | b'd' => 13, + b'E' | b'e' => 14, + b'F' | b'f' => 15, + _ => { + return None; + } + }; + + Some(number) + } + + match self { + Self::Octal => octal_ascii_digit_to_number(digit), + Self::Hexadecimal => hexadecimal_ascii_digit_to_number(digit), + } + } + + fn maximum_number_of_digits(&self) -> u8 { + match self { + Self::Octal => 3, + Self::Hexadecimal => 2, + } + } + + fn radix(&self) -> u8 { match self { - Self::Oct => 3, - Self::Hex => 2, + Self::Octal => 8, + Self::Hexadecimal => 16, } } } -/// Parse the numeric part of the `\xHHH` and `\0NNN` escape sequences -fn parse_code(input: &mut Peekable<Chars>, base: Base) -> Option<char> { - // All arithmetic on `ret` needs to be wrapping, because octal input can - // take 3 digits, which is 9 bits, and therefore more than what fits in a - // `u8`. GNU just seems to wrap these values. - // Note that if we instead make `ret` a `u32` and use `char::from_u32` will - // yield incorrect results because it will interpret values larger than - // `u8::MAX` as unicode. - let mut ret = input.peek().and_then(|c| c.to_digit(base as u32))? as u8; - - // We can safely ignore the None case because we just peeked it. - let _ = input.next(); - - for _ in 1..base.max_digits() { - match input.peek().and_then(|c| c.to_digit(base as u32)) { - Some(n) => ret = ret.wrapping_mul(base as u8).wrapping_add(n as u8), - None => break, - } - // We can safely ignore the None case because we just peeked it.
- let _ = input.next(); +/// Parse the numeric part of `\xHHH`, `\0NNN`, and `\NNN` escape sequences +fn parse_backslash_number( + input: &mut Peekable<Iter<u8>>, + backslash_number_type: BackslashNumberType, +) -> Option<u8> { + let first_digit_ascii = match backslash_number_type { + BackslashNumberType::OctalStartingWithZero | BackslashNumberType::Hexadecimal => { + match input.peek() { + Some(&&digit_ascii) => digit_ascii, + None => { + // One of the following cases: argument ends with "\0" or "\x" + // If "\0" (octal): caller will print not ASCII '0', 0x30, but ASCII '\0' (NUL), 0x00 + // If "\x" (hexadecimal): caller will print literal "\x" + return None; + } + } + } + // Never returns early when backslash number starts with "\1" through "\7", because caller provides the + // first digit + BackslashNumberType::OctalStartingWithNonZero(digit_ascii) => digit_ascii, + }; + + let base = backslash_number_type.base(); + + let first_digit_number = match base.ascii_to_number(first_digit_ascii) { + Some(digit_number) => { + // Move past byte, since it was successfully parsed + let _ = input.next(); + + digit_number + } + None => { + // The first digit was not a valid octal or hexadecimal digit + // This should never be the case when the backslash number starts with "\1" through "\7" + // (caller unwraps to verify this) + return None; + } + }; + + let radix = base.radix(); + + let mut sum = first_digit_number; + + for _ in 1..(base.maximum_number_of_digits()) { + match input + .peek() + .and_then(|&&digit_ascii| base.ascii_to_number(digit_ascii)) + { + Some(digit_number) => { + // Move past byte, since it was successfully parsed + let _ = input.next(); + + // All arithmetic on `sum` needs to be wrapping, because octal input can + // take 3 digits, which is 9 bits, and therefore more than what fits in a + // `u8`.
+ // + // GNU Core Utilities: "if nnn is a nine-bit value, the ninth bit is ignored" + // https://www.gnu.org/software/coreutils/manual/html_node/echo-invocation.html + sum = sum.wrapping_mul(radix).wrapping_add(digit_number); + } + None => { + break; + } + } } - Some(ret.into()) + Some(sum) } -fn print_escaped(input: &str, mut output: impl Write) -> io::Result<ControlFlow<()>> { - let mut iter = input.chars().peekable(); - while let Some(c) = iter.next() { - if c != '\\' { - write!(output, "{c}")?; +fn print_escaped(input: &[u8], output: &mut StdoutLock) -> io::Result<ControlFlow<()>> { + let mut iter = input.iter().peekable(); + + while let Some(&current_byte) = iter.next() { + if current_byte != b'\\' { + output.write_all(&[current_byte])?; + continue; } - // This is for the \NNN syntax for octal sequences. - // Note that '0' is intentionally omitted because that - // would be the \0NNN syntax. - if let Some('1'..='8') = iter.peek() { - if let Some(parsed) = parse_code(&mut iter, Base::Oct) { - write!(output, "{parsed}")?; - continue; - } + // This is for the \NNN syntax for octal sequences + // Note that '0' is intentionally omitted, because the \0NNN syntax is handled below + if let Some(&&first_digit @ b'1'..=b'7') = iter.peek() { + // Unwrap because anything starting with "\1" through "\7" can be successfully parsed + let parsed_octal_number = parse_backslash_number( + &mut iter, + BackslashNumberType::OctalStartingWithNonZero(first_digit), + ) + .unwrap(); + + output.write_all(&[parsed_octal_number])?; + + continue; } if let Some(next) = iter.next() { - let unescaped = match next { - '\\' => '\\', - 'a' => '\x07', - 'b' => '\x08', - 'c' => return Ok(ControlFlow::Break(())), - 'e' => '\x1b', - 'f' => '\x0c', - 'n' => '\n', - 'r' => '\r', - 't' => '\t', - 'v' => '\x0b', - 'x' => { - if let Some(c) = parse_code(&mut iter, Base::Hex) { - c + // For extending lifetime + // Unnecessary when using Rust >= 1.79.0 + // https://github.com/rust-lang/rust/pull/121346 + // TODO: when we have a MSRV >=
1.79.0, delete these "hold" bindings + let hold_one_byte_outside_of_match: [u8; 1_usize]; + let hold_two_bytes_outside_of_match: [u8; 2_usize]; + + let unescaped: &[u8] = match *next { + b'\\' => br"\", + b'a' => b"\x07", + b'b' => b"\x08", + b'c' => return Ok(ControlFlow::Break(())), + b'e' => b"\x1B", + b'f' => b"\x0C", + b'n' => b"\n", + b'r' => b"\r", + b't' => b"\t", + b'v' => b"\x0B", + b'x' => { + if let Some(parsed_hexadecimal_number) = + parse_backslash_number(&mut iter, BackslashNumberType::Hexadecimal) + { + // TODO: remove when we have a MSRV >= 1.79.0 + hold_one_byte_outside_of_match = [parsed_hexadecimal_number]; + + // TODO: when we have a MSRV >= 1.79.0, return reference to a temporary array: + // &[parsed_hexadecimal_number] + &hold_one_byte_outside_of_match } else { - write!(output, "\\")?; - 'x' + // "\x" with any non-hexadecimal digit after means "\x" is treated literally + br"\x" } } - '0' => parse_code(&mut iter, Base::Oct).unwrap_or('\0'), - c => { - write!(output, "\\")?; - c + b'0' => { + if let Some(parsed_octal_number) = parse_backslash_number( + &mut iter, + BackslashNumberType::OctalStartingWithZero, + ) { + // TODO: remove when we have a MSRV >= 1.79.0 + hold_one_byte_outside_of_match = [parsed_octal_number]; + + // TODO: when we have a MSRV >= 1.79.0, return reference to a temporary array: + // &[parsed_octal_number] + &hold_one_byte_outside_of_match + } else { + // "\0" with any non-octal digit after it means "\0" is treated as ASCII '\0' (NUL), 0x00 + b"\0" + } + } + other_byte => { + // Backslash and the following byte are treated literally + hold_two_bytes_outside_of_match = [b'\\', other_byte]; + + &hold_two_bytes_outside_of_match } }; - write!(output, "{unescaped}")?; + + output.write_all(unescaped)?; } else { - write!(output, "\\")?; + output.write_all(br"\")?; } } @@ -120,15 +278,33 @@ fn print_escaped(input: &str, mut output: impl Write) -> io::Result UResult<()> { let matches = uu_app().get_matches_from(args); + // TODO + //
"If the POSIXLY_CORRECT environment variable is set, then when echo’s first argument is not -n it outputs option-like arguments instead of treating them as options." + // https://www.gnu.org/software/coreutils/manual/html_node/echo-invocation.html + let no_newline = matches.get_flag(options::NO_NEWLINE); let escaped = matches.get_flag(options::ENABLE_BACKSLASH_ESCAPE); - let values: Vec<String> = match matches.get_many::<String>(options::STRING) { - Some(s) => s.map(|s| s.to_string()).collect(), - None => vec![String::new()], - }; - execute(no_newline, escaped, &values) - .map_err_context(|| "could not write to stdout".to_string()) + let mut stdout_lock = io::stdout().lock(); + + match matches.get_many::<OsString>(options::STRING) { + Some(arguments_after_options) => { + execute( + &mut stdout_lock, + no_newline, + escaped, + arguments_after_options, + )?; + } + None => { + // No strings to print, so just handle newline setting + if !no_newline { + stdout_lock.write_all(b"\n")?; + } + } + } + + Ok(()) } pub fn uu_app() -> Command { @@ -165,29 +341,63 @@ pub fn uu_app() -> Command { .action(ArgAction::SetTrue) .overrides_with(options::ENABLE_BACKSLASH_ESCAPE), ) - .arg(Arg::new(options::STRING).action(ArgAction::Append)) + .arg( + Arg::new(options::STRING) + .action(ArgAction::Append) + .value_parser(ValueParser::os_string()), + ) } -fn execute(no_newline: bool, escaped: bool, free: &[String]) -> io::Result<()> { - let stdout = io::stdout(); - let mut output = stdout.lock(); +fn execute( + stdout_lock: &mut StdoutLock, + no_newline: bool, + escaped: bool, + arguments_after_options: ValuesRef<'_, OsString>, +) -> UResult<()> { + for (i, input) in arguments_after_options.enumerate() { + let Some(bytes) = bytes_from_os_string(input.as_os_str()) else { + return Err(USimpleError::new( + 1, + "Non-UTF-8 arguments provided, but this platform does not support them", + )); + }; - for (i, input) in free.iter().enumerate() { if i > 0 { - write!(output, " ")?; + stdout_lock.write_all(b" ")?; } + if
escaped { - if print_escaped(input, &mut output)?.is_break() { + if print_escaped(bytes, stdout_lock)?.is_break() { return Ok(()); } } else { - write!(output, "{input}")?; + stdout_lock.write_all(bytes)?; } } if !no_newline { - writeln!(output)?; + stdout_lock.write_all(b"\n")?; } Ok(()) } + +fn bytes_from_os_string(input: &OsStr) -> Option<&[u8]> { + let option = { + #[cfg(target_family = "unix")] + { + use std::os::unix::ffi::OsStrExt; + + Some(input.as_bytes()) + } + + #[cfg(not(target_family = "unix"))] + { + // TODO + // Verify that this works correctly on these platforms + input.to_str().map(|st| st.as_bytes()) + } + }; + + option +} diff --git a/tests/by-util/test_echo.rs b/tests/by-util/test_echo.rs index 4ae623f2f6f..47240c7c056 100644 --- a/tests/by-util/test_echo.rs +++ b/tests/by-util/test_echo.rs @@ -303,3 +303,88 @@ fn partial_version_argument() { fn partial_help_argument() { new_ucmd!().arg("--he").succeeds().stdout_is("--he\n"); } + +#[test] +fn multibyte_escape_unicode() { + // spell-checker:disable-next-line + // Tests suggested by kkew3 + // https://github.com/uutils/coreutils/issues/6741 + + // \u{1F602} is: + // + // "Face with Tears of Joy" + // U+1F602 + // "😂" + + new_ucmd!() + .args(&["-e", r"\xf0\x9f\x98\x82"]) + .succeeds() + .stdout_only("\u{1F602}\n"); + + new_ucmd!() + .args(&["-e", r"\x41\xf0\x9f\x98\x82\x42"]) + .succeeds() + .stdout_only("A\u{1F602}B\n"); + + new_ucmd!() + .args(&["-e", r"\xf0\x41\x9f\x98\x82"]) + .succeeds() + .stdout_only_bytes(b"\xF0A\x9F\x98\x82\n"); + + new_ucmd!() + .args(&["-e", r"\x41\xf0\c\x9f\x98\x82"]) + .succeeds() + .stdout_only_bytes(b"A\xF0"); +} + +#[test] +fn non_utf_8_hex_round_trip() { + new_ucmd!() + .args(&["-e", r"\xFF"]) + .succeeds() + .stdout_only_bytes(b"\xFF\n"); +} + +#[test] +fn nine_bit_octal() { + const RESULT: &[u8] = b"\xFF\n"; + + new_ucmd!() + .args(&["-e", r"\0777"]) + .succeeds() + .stdout_only_bytes(RESULT); + + new_ucmd!() + .args(&["-e", r"\777"]) + .succeeds() + 
.stdout_only_bytes(RESULT); +} + +#[test] +#[cfg(target_family = "unix")] +fn non_utf_8() { + use std::ffi::OsStr; + use std::os::unix::ffi::OsStrExt; + + // ISO-8859-1 encoded text + // spell-checker:disable + const INPUT_AND_OUTPUT: &[u8] = + b"Swer an rehte g\xFCete wendet s\xEEn gem\xFCete, dem volget s\xE6lde und \xEAre."; + // spell-checker:enable + + let os_str = OsStr::from_bytes(INPUT_AND_OUTPUT); + + new_ucmd!() + .arg("-n") + .arg(os_str) + .succeeds() + .stdout_only_bytes(INPUT_AND_OUTPUT); +} + +#[test] +fn slash_eight_off_by_one() { + new_ucmd!() + .args(&["-e", "-n", r"\8"]) + .succeeds() + .stdout_only(r"\8"); +}