Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

lexer: Disallow bare CR in raw byte strings #60793

Merged
merged 11 commits into from
Jun 11, 2019
156 changes: 56 additions & 100 deletions src/libsyntax/parse/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ impl<'a> StringReader<'a> {
self.ch.is_none()
}

fn fail_unterminated_raw_string(&self, pos: BytePos, hash_count: u16) {
fn fail_unterminated_raw_string(&self, pos: BytePos, hash_count: u16) -> ! {
let mut err = self.struct_span_fatal(pos, pos, "unterminated raw string");
err.span_label(self.mk_sp(pos, pos), "unterminated raw string");

Expand Down Expand Up @@ -292,15 +292,6 @@ impl<'a> StringReader<'a> {
self.sess.span_diagnostic.struct_span_fatal(self.mk_sp(from_pos, to_pos), &m[..])
}

/// Report a lexical error spanning [`from_pos`, `to_pos`). The reported
/// message is `m`, followed by `": "` and the escaped form of `c`.
fn err_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) {
    // Build "<m>: <escaped c>" in a fresh buffer before reporting it.
    let mut message = String::from(m);
    message += ": ";
    push_escaped_char(&mut message, c);
    self.err_span_(from_pos, to_pos, message.as_str());
}

/// Advance peek_token to refer to the next token, and
/// possibly update the interner.
fn advance_token(&mut self) -> Result<(), ()> {
Expand Down Expand Up @@ -1070,7 +1061,13 @@ impl<'a> StringReader<'a> {
self.validate_byte_str_escape(start_with_quote);
(token::ByteStr, symbol)
},
Some('r') => self.scan_raw_byte_string(),
Some('r') => {
let (start, end, hash_count) = self.scan_raw_string();
let symbol = self.name_from_to(start, end);
self.validate_raw_byte_str_escape(start, end);

(token::ByteStrRaw(hash_count), symbol)
}
_ => unreachable!(), // Should have been a token::Ident above.
};
let suffix = self.scan_optional_raw_name();
Expand All @@ -1086,79 +1083,9 @@ impl<'a> StringReader<'a> {
Ok(TokenKind::lit(token::Str, symbol, suffix))
}
'r' => {
let start_bpos = self.pos;
self.bump();
let mut hash_count: u16 = 0;
while self.ch_is('#') {
if hash_count == 65535 {
let bpos = self.next_pos;
self.fatal_span_(start_bpos,
bpos,
"too many `#` symbols: raw strings may be \
delimited by up to 65535 `#` symbols").raise();
}
self.bump();
hash_count += 1;
}

if self.is_eof() {
self.fail_unterminated_raw_string(start_bpos, hash_count);
} else if !self.ch_is('"') {
let last_bpos = self.pos;
let curr_char = self.ch.unwrap();
self.fatal_span_char(start_bpos,
last_bpos,
"found invalid character; only `#` is allowed \
in raw string delimitation",
curr_char).raise();
}
self.bump();
let content_start_bpos = self.pos;
let mut content_end_bpos;
let mut valid = true;
'outer: loop {
if self.is_eof() {
self.fail_unterminated_raw_string(start_bpos, hash_count);
}
// if self.ch_is('"') {
// content_end_bpos = self.pos;
// for _ in 0..hash_count {
// self.bump();
// if !self.ch_is('#') {
// continue 'outer;
let c = self.ch.unwrap();
match c {
'"' => {
content_end_bpos = self.pos;
for _ in 0..hash_count {
self.bump();
if !self.ch_is('#') {
continue 'outer;
}
}
break;
}
'\r' => {
if !self.nextch_is('\n') {
let last_bpos = self.pos;
self.err_span_(start_bpos,
last_bpos,
"bare CR not allowed in raw string, use \\r \
instead");
valid = false;
}
}
_ => (),
}
self.bump();
}

self.bump();
let symbol = if valid {
self.name_from_to(content_start_bpos, content_end_bpos)
} else {
Symbol::intern("??")
};
let (start, end, hash_count) = self.scan_raw_string();
let symbol = self.name_from_to(start, end);
self.validate_raw_str_escape(start, end);
let suffix = self.scan_optional_raw_name();

Ok(TokenKind::lit(token::StrRaw(hash_count), symbol, suffix))
Expand Down Expand Up @@ -1315,16 +1242,16 @@ impl<'a> StringReader<'a> {
id
}

fn scan_raw_byte_string(&mut self) -> (token::LitKind, Symbol) {
fn scan_raw_string(&mut self) -> (BytePos, BytePos, u16) {
let start_bpos = self.pos;
self.bump();
let mut hash_count = 0;
let mut hash_count: u16 = 0;
while self.ch_is('#') {
if hash_count == 65535 {
let bpos = self.next_pos;
self.fatal_span_(start_bpos,
bpos,
"too many `#` symbols: raw byte strings may be \
"too many `#` symbols: raw strings may be \
delimited by up to 65535 `#` symbols").raise();
}
self.bump();
Expand All @@ -1334,13 +1261,13 @@ impl<'a> StringReader<'a> {
if self.is_eof() {
self.fail_unterminated_raw_string(start_bpos, hash_count);
} else if !self.ch_is('"') {
let pos = self.pos;
let ch = self.ch.unwrap();
let last_bpos = self.pos;
let curr_char = self.ch.unwrap();
self.fatal_span_char(start_bpos,
pos,
"found invalid character; only `#` is allowed in raw \
string delimitation",
ch).raise();
last_bpos,
"found invalid character; only `#` is allowed \
in raw string delimitation",
curr_char).raise();
}
self.bump();
let content_start_bpos = self.pos;
Expand All @@ -1360,19 +1287,14 @@ impl<'a> StringReader<'a> {
}
break;
}
Some(c) => {
if c > '\x7F' {
let pos = self.pos;
self.err_span_char(pos, pos, "raw byte string must be ASCII", c);
}
}
_ => (),
}
self.bump();
}

self.bump();

(token::ByteStrRaw(hash_count), self.name_from_to(content_start_bpos, content_end_bpos))
(content_start_bpos, content_end_bpos, hash_count)
}

fn validate_char_escape(&self, start_with_quote: BytePos) {
Expand Down Expand Up @@ -1422,6 +1344,40 @@ impl<'a> StringReader<'a> {
});
}

/// Validates the contents of a raw string literal located at
/// [`content_start`, `content_end`).
///
/// The text is run through `unescape::unescape_raw_str` in `Mode::Str`, and
/// every error it yields is forwarded to `emit_unescape_error`.
fn validate_raw_str_escape(&self, content_start: BytePos, content_end: BytePos) {
    let mode = unescape::Mode::Str;
    self.with_str_from_to(content_start, content_end, |lit: &str| {
        unescape::unescape_raw_str(lit, mode, &mut |range, unescaped| match unescaped {
            // Successfully scanned characters need no diagnostics.
            Ok(_) => {}
            Err(err) => emit_unescape_error(
                &self.sess.span_diagnostic,
                lit,
                // The span passed along is one byte wider than the contents
                // on each side.
                self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)),
                mode,
                range,
                err,
            ),
        })
    });
}

/// Validates the contents of a raw byte string literal located at
/// [`content_start`, `content_end`).
///
/// The text is run through `unescape::unescape_raw_str` in `Mode::ByteStr`,
/// and every error it yields is forwarded to `emit_unescape_error`.
fn validate_raw_byte_str_escape(&self, content_start: BytePos, content_end: BytePos) {
    let mode = unescape::Mode::ByteStr;
    self.with_str_from_to(content_start, content_end, |lit: &str| {
        unescape::unescape_raw_str(lit, mode, &mut |range, unescaped| match unescaped {
            // Successfully scanned characters need no diagnostics.
            Ok(_) => {}
            Err(err) => emit_unescape_error(
                &self.sess.span_diagnostic,
                lit,
                // The span passed along is one byte wider than the contents
                // on each side.
                self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)),
                mode,
                range,
                err,
            ),
        })
    });
}

fn validate_byte_str_escape(&self, start_with_quote: BytePos) {
self.with_str_from_to(start_with_quote + BytePos(1), self.pos - BytePos(1), |lit| {
unescape::unescape_byte_str(lit, &mut |range, c| {
Expand Down
38 changes: 13 additions & 25 deletions src/libsyntax/parse/literal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ use crate::ast::{self, Lit, LitKind};
use crate::parse::parser::Parser;
use crate::parse::PResult;
use crate::parse::token::{self, Token, TokenKind};
use crate::parse::unescape::{unescape_str, unescape_char, unescape_byte_str, unescape_byte};
use crate::parse::unescape::{self, unescape_str, unescape_byte_str, unescape_raw_str};
use crate::parse::unescape::{unescape_char, unescape_byte};
use crate::print::pprust;
use crate::symbol::{kw, sym, Symbol};
use crate::tokenstream::{TokenStream, TokenTree};
Expand Down Expand Up @@ -141,7 +142,17 @@ impl LitKind {
// Ditto.
let s = symbol.as_str();
let symbol = if s.contains('\r') {
Symbol::intern(&raw_str_lit(&s))
let mut buf = String::with_capacity(s.len());
let mut error = Ok(());
unescape_raw_str(&s, unescape::Mode::Str, &mut |_, unescaped_char| {
match unescaped_char {
Ok(c) => buf.push(c),
Err(_) => error = Err(LitError::LexerError),
}
});
error?;
buf.shrink_to_fit();
Symbol::intern(&buf)
} else {
symbol
};
Expand Down Expand Up @@ -350,29 +361,6 @@ crate fn expect_no_suffix(diag: &Handler, sp: Span, kind: &str, suffix: Option<S
}
}

/// Parses a string representing a raw string literal into its final form. The
/// only operation this does is convert embedded CRLF into a single LF.
///
/// # Panics
///
/// Panics with "lexer accepted bare CR" if `lit` contains a `\r` that is not
/// immediately followed by `\n`, including a `\r` at the very end of `lit`.
fn raw_str_lit(lit: &str) -> String {
    debug!("raw_str_lit: {:?}", lit);
    let mut res = String::with_capacity(lit.len());

    let mut chars = lit.chars().peekable();
    while let Some(c) = chars.next() {
        if c == '\r' {
            // `peek()` is `None` when the CR is the last character; treat that
            // like any other bare CR instead of unwrapping `None`, so the
            // panic carries the intended message.
            if chars.peek() != Some(&'\n') {
                panic!("lexer accepted bare CR");
            }
            // Drop the LF of the CRLF pair and emit a single LF for both.
            chars.next();
            res.push('\n');
        } else {
            res.push(c);
        }
    }

    res.shrink_to_fit();
    res
}

// Checks if `s` looks like i32 or u1234 etc.
fn looks_like_width_suffix(first_chars: &[char], s: &str) -> bool {
s.len() > 1 && s.starts_with(first_chars) && s[1..].chars().all(|c| c.is_ascii_digit())
Expand Down
32 changes: 31 additions & 1 deletion src/libsyntax/parse/unescape.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
//! Utilities for validating string and char literals and turning them into
//! Utilities for validating string and char literals and turning them into
//! values they represent.

use std::str::Chars;
Expand Down Expand Up @@ -29,6 +29,7 @@ pub(crate) enum EscapeError {

UnicodeEscapeInByte,
NonAsciiCharInByte,
NonAsciiCharInByteString,
}

/// Takes a contents of a char literal (without quotes), and returns an
Expand Down Expand Up @@ -66,6 +67,35 @@ where
})
}

/// Takes the contents of a raw string literal (without quotes) and feeds
/// `callback` a sequence of characters or errors, each paired with the byte
/// range of `literal_text` it was scanned from.
/// NOTE: Raw strings do not perform any explicit character escaping, here we
/// only translate CRLF to LF and produce errors on bare CR. When
/// `mode.is_bytes()` holds, characters above `\x7F` are reported as errors
/// as well.
pub(crate) fn unescape_raw_str<F>(literal_text: &str, mode: Mode, callback: &mut F)
where
    F: FnMut(Range<usize>, Result<char, EscapeError>),
{
    // Byte offset in `literal_text` at which the next item starts.
    let mut start: usize = 0;

    let mut chars = literal_text.chars().peekable();
    while let Some(curr) = chars.next() {
        // Number of bytes of `literal_text` consumed for this item.
        let mut consumed = curr.len_utf8();
        let result = if curr == '\r' {
            if chars.peek() == Some(&'\n') {
                // CRLF: swallow the LF too and report the pair as one LF.
                chars.next();
                consumed += '\n'.len_utf8();
                Ok('\n')
            } else {
                Err(EscapeError::BareCarriageReturn)
            }
        } else if mode.is_bytes() && curr > '\x7F' {
            Err(EscapeError::NonAsciiCharInByteString)
        } else {
            Ok(curr)
        };

        let end = start + consumed;
        callback(start..end, result);
        start = end;
    }
}

#[derive(Debug, Clone, Copy)]
pub(crate) enum Mode {
Char,
Expand Down
5 changes: 5 additions & 0 deletions src/libsyntax/parse/unescape_error_reporting.rs
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,11 @@ pub(crate) fn emit_unescape_error(
handler.span_err(span, "byte constant must be ASCII. \
Use a \\xHH escape for a non-ASCII byte")
}
EscapeError::NonAsciiCharInByteString => {
assert!(mode.is_bytes());
let (_c, span) = last_char();
handler.span_err(span, "raw byte string must be ASCII")
}
EscapeError::OutOfRangeHexEscape => {
handler.span_err(span, "this form of character escape may only be used \
with characters in the range [\\x00-\\x7f]")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ fn main() {
let _s = "foobar"; //~ ERROR: bare CR not allowed in string

// the following string literal has a bare CR in it
let _s = r"barfoo"; //~ ERROR: bare CR not allowed in raw string
let _s = r"barfoo"; //~ ERROR: bare CR not allowed in string

// the following string literal has a bare CR in it
let _s = "foo\bar"; //~ ERROR: unknown character escape: \r
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,11 @@ error: bare CR not allowed in string, use \r instead
LL | let _s = "foobar";
| ^

error: bare CR not allowed in raw string, use \r instead
--> $DIR/lex-bare-cr-string-literal-doc-comment.rs:24:14
error: bare CR not allowed in string, use \r instead
--> $DIR/lex-bare-cr-string-literal-doc-comment.rs:24:19
|
LL | let _s = r"barfoo";
| ^^^^^
| ^

error: unknown character escape: \r
--> $DIR/lex-bare-cr-string-literal-doc-comment.rs:27:19
Expand Down
3 changes: 3 additions & 0 deletions src/test/ui/parser/raw-byte-string-literals.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
// ignore-tidy-cr
// compile-flags: -Z continue-parse-after-error
pub fn main() {
br"a"; //~ ERROR bare CR not allowed in string
br"é"; //~ ERROR raw byte string must be ASCII
br##~"a"~##; //~ ERROR only `#` is allowed in raw string delimitation
}
Expand Down
Loading