Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

lexer: Disallow bare CR in raw byte strings #60793

Merged
merged 11 commits into from
Jun 11, 2019
158 changes: 58 additions & 100 deletions src/libsyntax/parse/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ impl<'a> StringReader<'a> {
self.ch.is_none()
}

fn fail_unterminated_raw_string(&self, pos: BytePos, hash_count: u16) {
fn fail_unterminated_raw_string(&self, pos: BytePos, hash_count: u16) -> ! {
let mut err = self.struct_span_fatal(pos, pos, "unterminated raw string");
err.span_label(self.mk_sp(pos, pos), "unterminated raw string");

Expand Down Expand Up @@ -292,15 +292,6 @@ impl<'a> StringReader<'a> {
self.sess.span_diagnostic.struct_span_fatal(self.mk_sp(from_pos, to_pos), &m[..])
}

/// Report a lexical error spanning [`from_pos`, `to_pos`), appending an
/// escaped character to the error message
fn err_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) {
let mut m = m.to_string();
m.push_str(": ");
push_escaped_char(&mut m, c);
self.err_span_(from_pos, to_pos, &m[..]);
}

/// Advance peek_token to refer to the next token, and
/// possibly update the interner.
fn advance_token(&mut self) -> Result<(), ()> {
Expand Down Expand Up @@ -1070,7 +1061,13 @@ impl<'a> StringReader<'a> {
self.validate_byte_str_escape(start_with_quote);
(token::ByteStr, symbol)
},
Some('r') => self.scan_raw_byte_string(),
Some('r') => {
let (start, end, hash_count) = self.scan_raw_string();
let symbol = self.name_from_to(start, end);
self.validate_raw_byte_str_escape(start, end);

(token::ByteStrRaw(hash_count), symbol)
}
_ => unreachable!(), // Should have been a token::Ident above.
};
let suffix = self.scan_optional_raw_name();
Expand All @@ -1086,79 +1083,9 @@ impl<'a> StringReader<'a> {
Ok(TokenKind::lit(token::Str, symbol, suffix))
}
'r' => {
let start_bpos = self.pos;
self.bump();
let mut hash_count: u16 = 0;
while self.ch_is('#') {
if hash_count == 65535 {
let bpos = self.next_pos;
self.fatal_span_(start_bpos,
bpos,
"too many `#` symbols: raw strings may be \
delimited by up to 65535 `#` symbols").raise();
}
self.bump();
hash_count += 1;
}

if self.is_eof() {
self.fail_unterminated_raw_string(start_bpos, hash_count);
} else if !self.ch_is('"') {
let last_bpos = self.pos;
let curr_char = self.ch.unwrap();
self.fatal_span_char(start_bpos,
last_bpos,
"found invalid character; only `#` is allowed \
in raw string delimitation",
curr_char).raise();
}
self.bump();
let content_start_bpos = self.pos;
let mut content_end_bpos;
let mut valid = true;
'outer: loop {
if self.is_eof() {
self.fail_unterminated_raw_string(start_bpos, hash_count);
}
// if self.ch_is('"') {
// content_end_bpos = self.pos;
// for _ in 0..hash_count {
// self.bump();
// if !self.ch_is('#') {
// continue 'outer;
let c = self.ch.unwrap();
match c {
'"' => {
content_end_bpos = self.pos;
for _ in 0..hash_count {
self.bump();
if !self.ch_is('#') {
continue 'outer;
}
}
break;
}
'\r' => {
if !self.nextch_is('\n') {
let last_bpos = self.pos;
self.err_span_(start_bpos,
last_bpos,
"bare CR not allowed in raw string, use \\r \
instead");
valid = false;
}
}
_ => (),
}
self.bump();
}

self.bump();
let symbol = if valid {
self.name_from_to(content_start_bpos, content_end_bpos)
} else {
Symbol::intern("??")
};
let (start, end, hash_count) = self.scan_raw_string();
let symbol = self.name_from_to(start, end);
self.validate_raw_str_escape(start, end);
let suffix = self.scan_optional_raw_name();

Ok(TokenKind::lit(token::StrRaw(hash_count), symbol, suffix))
Expand Down Expand Up @@ -1315,16 +1242,18 @@ impl<'a> StringReader<'a> {
id
}

fn scan_raw_byte_string(&mut self) -> (token::LitKind, Symbol) {
/// Scans a raw (byte) string, returning byte position range for `"<literal>"`
/// (including quotes) along with `#` character count in `(b)r##..."<literal>"##...`;
fn scan_raw_string(&mut self) -> (BytePos, BytePos, u16) {
Xanewok marked this conversation as resolved.
Show resolved Hide resolved
let start_bpos = self.pos;
self.bump();
let mut hash_count = 0;
let mut hash_count: u16 = 0;
while self.ch_is('#') {
if hash_count == 65535 {
let bpos = self.next_pos;
self.fatal_span_(start_bpos,
bpos,
"too many `#` symbols: raw byte strings may be \
"too many `#` symbols: raw strings may be \
delimited by up to 65535 `#` symbols").raise();
}
self.bump();
Expand All @@ -1334,13 +1263,13 @@ impl<'a> StringReader<'a> {
if self.is_eof() {
self.fail_unterminated_raw_string(start_bpos, hash_count);
} else if !self.ch_is('"') {
let pos = self.pos;
let ch = self.ch.unwrap();
let last_bpos = self.pos;
let curr_char = self.ch.unwrap();
self.fatal_span_char(start_bpos,
pos,
"found invalid character; only `#` is allowed in raw \
string delimitation",
ch).raise();
last_bpos,
"found invalid character; only `#` is allowed \
in raw string delimitation",
curr_char).raise();
}
self.bump();
let content_start_bpos = self.pos;
Expand All @@ -1360,19 +1289,14 @@ impl<'a> StringReader<'a> {
}
break;
}
Some(c) => {
if c > '\x7F' {
let pos = self.pos;
self.err_span_char(pos, pos, "raw byte string must be ASCII", c);
}
}
_ => (),
}
self.bump();
}

self.bump();

(token::ByteStrRaw(hash_count), self.name_from_to(content_start_bpos, content_end_bpos))
(content_start_bpos, content_end_bpos, hash_count)
}

fn validate_char_escape(&self, start_with_quote: BytePos) {
Expand Down Expand Up @@ -1422,6 +1346,40 @@ impl<'a> StringReader<'a> {
});
}

fn validate_raw_str_escape(&self, content_start: BytePos, content_end: BytePos) {
self.with_str_from_to(content_start, content_end, |lit: &str| {
unescape::unescape_raw_str(lit, &mut |range, c| {
if let Err(err) = c {
emit_unescape_error(
&self.sess.span_diagnostic,
lit,
self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)),
unescape::Mode::Str,
range,
err,
)
}
})
});
}

fn validate_raw_byte_str_escape(&self, content_start: BytePos, content_end: BytePos) {
self.with_str_from_to(content_start, content_end, |lit: &str| {
unescape::unescape_raw_byte_str(lit, &mut |range, c| {
if let Err(err) = c {
emit_unescape_error(
&self.sess.span_diagnostic,
lit,
self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)),
unescape::Mode::ByteStr,
range,
err,
)
}
})
});
}

fn validate_byte_str_escape(&self, start_with_quote: BytePos) {
self.with_str_from_to(start_with_quote + BytePos(1), self.pos - BytePos(1), |lit| {
unescape::unescape_byte_str(lit, &mut |range, c| {
Expand Down
60 changes: 34 additions & 26 deletions src/libsyntax/parse/literal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@ use crate::ast::{self, Lit, LitKind};
use crate::parse::parser::Parser;
use crate::parse::PResult;
use crate::parse::token::{self, Token, TokenKind};
use crate::parse::unescape::{unescape_str, unescape_char, unescape_byte_str, unescape_byte};
use crate::parse::unescape::{unescape_char, unescape_byte};
use crate::parse::unescape::{unescape_str, unescape_byte_str};
use crate::parse::unescape::{unescape_raw_str, unescape_raw_byte_str};
use crate::print::pprust;
use crate::symbol::{kw, sym, Symbol};
use crate::tokenstream::{TokenStream, TokenTree};
Expand Down Expand Up @@ -141,7 +143,17 @@ impl LitKind {
// Ditto.
let s = symbol.as_str();
let symbol = if s.contains('\r') {
Symbol::intern(&raw_str_lit(&s))
let mut buf = String::with_capacity(s.len());
let mut error = Ok(());
unescape_raw_str(&s, &mut |_, unescaped_char| {
match unescaped_char {
Ok(c) => buf.push(c),
Err(_) => error = Err(LitError::LexerError),
}
});
error?;
buf.shrink_to_fit();
Symbol::intern(&buf)
} else {
symbol
};
Expand All @@ -161,7 +173,26 @@ impl LitKind {
buf.shrink_to_fit();
LitKind::ByteStr(Lrc::new(buf))
}
token::ByteStrRaw(_) => LitKind::ByteStr(Lrc::new(symbol.to_string().into_bytes())),
token::ByteStrRaw(_) => {
let s = symbol.as_str();
let bytes = if s.contains('\r') {
let mut buf = Vec::with_capacity(s.len());
let mut error = Ok(());
unescape_raw_byte_str(&s, &mut |_, unescaped_byte| {
match unescaped_byte {
Ok(c) => buf.push(c),
Err(_) => error = Err(LitError::LexerError),
}
});
error?;
buf.shrink_to_fit();
buf
} else {
symbol.to_string().into_bytes()
};

LitKind::ByteStr(Lrc::new(bytes))
},
token::Err => LitKind::Err(symbol),
})
}
Expand Down Expand Up @@ -350,29 +381,6 @@ crate fn expect_no_suffix(diag: &Handler, sp: Span, kind: &str, suffix: Option<S
}
}

/// Parses a string representing a raw string literal into its final form. The
/// only operation this does is convert embedded CRLF into a single LF.
fn raw_str_lit(lit: &str) -> String {
debug!("raw_str_lit: {:?}", lit);
let mut res = String::with_capacity(lit.len());

let mut chars = lit.chars().peekable();
while let Some(c) = chars.next() {
if c == '\r' {
if *chars.peek().unwrap() != '\n' {
panic!("lexer accepted bare CR");
}
chars.next();
res.push('\n');
} else {
res.push(c);
}
}

res.shrink_to_fit();
res
}

// Checks if `s` looks like i32 or u1234 etc.
fn looks_like_width_suffix(first_chars: &[char], s: &str) -> bool {
s.len() > 1 && s.starts_with(first_chars) && s[1..].chars().all(|c| c.is_ascii_digit())
Expand Down
Loading