-
Notifications
You must be signed in to change notification settings - Fork 13.4k
Unescaping cleanups #103919
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Unescaping cleanups #103919
Changes from all commits
f32e678
84ca2c3
34b32b0
7dbf2c0
a21c045
d963686
a203482
a838952
43d21b5
d6c97a3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -52,10 +52,8 @@ pub enum EscapeError { | |
|
||
/// Unicode escape code in byte literal. | ||
UnicodeEscapeInByte, | ||
/// Non-ascii character in byte literal. | ||
/// Non-ascii character in byte literal, byte string literal, or raw byte string literal. | ||
NonAsciiCharInByte, | ||
/// Non-ascii character in byte string literal. | ||
NonAsciiCharInByteString, | ||
|
||
/// After a line ending with '\', the next line contains whitespace | ||
/// characters that are not skipped. | ||
|
@@ -78,54 +76,33 @@ impl EscapeError { | |
/// Takes a contents of a literal (without quotes) and produces a | ||
/// sequence of escaped characters or errors. | ||
/// Values are returned through invoking of the provided callback. | ||
pub fn unescape_literal<F>(literal_text: &str, mode: Mode, callback: &mut F) | ||
pub fn unescape_literal<F>(src: &str, mode: Mode, callback: &mut F) | ||
where | ||
F: FnMut(Range<usize>, Result<char, EscapeError>), | ||
{ | ||
match mode { | ||
Mode::Char | Mode::Byte => { | ||
let mut chars = literal_text.chars(); | ||
let result = unescape_char_or_byte(&mut chars, mode); | ||
// The Chars iterator moved forward. | ||
callback(0..(literal_text.len() - chars.as_str().len()), result); | ||
let mut chars = src.chars(); | ||
let res = unescape_char_or_byte(&mut chars, mode == Mode::Byte); | ||
callback(0..(src.len() - chars.as_str().len()), res); | ||
} | ||
Mode::Str | Mode::ByteStr => unescape_str_or_byte_str(literal_text, mode, callback), | ||
// NOTE: Raw strings do not perform any explicit character escaping, here we | ||
// only translate CRLF to LF and produce errors on bare CR. | ||
Mode::Str | Mode::ByteStr => unescape_str_or_byte_str(src, mode == Mode::ByteStr, callback), | ||
Mode::RawStr | Mode::RawByteStr => { | ||
unescape_raw_str_or_raw_byte_str(literal_text, mode, callback) | ||
unescape_raw_str_or_raw_byte_str(src, mode == Mode::RawByteStr, callback) | ||
} | ||
} | ||
} | ||
|
||
/// Takes a contents of a byte, byte string or raw byte string (without quotes) | ||
/// and produces a sequence of bytes or errors. | ||
/// Values are returned through invoking of the provided callback. | ||
pub fn unescape_byte_literal<F>(literal_text: &str, mode: Mode, callback: &mut F) | ||
where | ||
F: FnMut(Range<usize>, Result<u8, EscapeError>), | ||
{ | ||
debug_assert!(mode.is_bytes()); | ||
unescape_literal(literal_text, mode, &mut |range, result| { | ||
callback(range, result.map(byte_from_char)); | ||
}) | ||
} | ||
|
||
/// Takes a contents of a char literal (without quotes), and returns an | ||
/// unescaped char or an error | ||
pub fn unescape_char(literal_text: &str) -> Result<char, (usize, EscapeError)> { | ||
let mut chars = literal_text.chars(); | ||
unescape_char_or_byte(&mut chars, Mode::Char) | ||
.map_err(|err| (literal_text.len() - chars.as_str().len(), err)) | ||
/// unescaped char or an error. | ||
pub fn unescape_char(src: &str) -> Result<char, EscapeError> { | ||
unescape_char_or_byte(&mut src.chars(), false) | ||
} | ||
|
||
/// Takes a contents of a byte literal (without quotes), and returns an | ||
/// unescaped byte or an error. | ||
pub fn unescape_byte(literal_text: &str) -> Result<u8, (usize, EscapeError)> { | ||
let mut chars = literal_text.chars(); | ||
unescape_char_or_byte(&mut chars, Mode::Byte) | ||
.map(byte_from_char) | ||
.map_err(|err| (literal_text.len() - chars.as_str().len(), err)) | ||
pub fn unescape_byte(src: &str) -> Result<u8, EscapeError> { | ||
unescape_char_or_byte(&mut src.chars(), true).map(byte_from_char) | ||
} | ||
|
||
/// What kind of literal do we parse. | ||
|
@@ -147,20 +124,17 @@ impl Mode { | |
} | ||
} | ||
|
||
pub fn is_bytes(self) -> bool { | ||
pub fn is_byte(self) -> bool { | ||
match self { | ||
Mode::Byte | Mode::ByteStr | Mode::RawByteStr => true, | ||
Mode::Char | Mode::Str | Mode::RawStr => false, | ||
} | ||
} | ||
} | ||
|
||
fn scan_escape(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> { | ||
fn scan_escape(chars: &mut Chars<'_>, is_byte: bool) -> Result<char, EscapeError> { | ||
// Previous character was '\\', unescape what follows. | ||
|
||
let second_char = chars.next().ok_or(EscapeError::LoneSlash)?; | ||
|
||
let res = match second_char { | ||
let res = match chars.next().ok_or(EscapeError::LoneSlash)? { | ||
'"' => '"', | ||
'n' => '\n', | ||
'r' => '\r', | ||
|
@@ -181,7 +155,7 @@ fn scan_escape(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> { | |
let value = hi * 16 + lo; | ||
|
||
// For a non-byte literal verify that it is within ASCII range. | ||
if !mode.is_bytes() && !is_ascii(value) { | ||
if !is_byte && !is_ascii(value) { | ||
return Err(EscapeError::OutOfRangeHexEscape); | ||
} | ||
let value = value as u8; | ||
|
@@ -217,7 +191,7 @@ fn scan_escape(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> { | |
|
||
// Incorrect syntax has higher priority for error reporting | ||
// than unallowed value for a literal. | ||
if mode.is_bytes() { | ||
if is_byte { | ||
return Err(EscapeError::UnicodeEscapeInByte); | ||
} | ||
|
||
|
@@ -249,23 +223,22 @@ fn scan_escape(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> { | |
} | ||
|
||
#[inline] | ||
fn ascii_check(first_char: char, mode: Mode) -> Result<char, EscapeError> { | ||
if mode.is_bytes() && !first_char.is_ascii() { | ||
fn ascii_check(c: char, is_byte: bool) -> Result<char, EscapeError> { | ||
if is_byte && !c.is_ascii() { | ||
// Byte literal can't be a non-ascii character. | ||
Err(EscapeError::NonAsciiCharInByte) | ||
} else { | ||
Ok(first_char) | ||
Ok(c) | ||
} | ||
} | ||
|
||
fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> { | ||
debug_assert!(mode == Mode::Char || mode == Mode::Byte); | ||
let first_char = chars.next().ok_or(EscapeError::ZeroChars)?; | ||
let res = match first_char { | ||
'\\' => scan_escape(chars, mode), | ||
fn unescape_char_or_byte(chars: &mut Chars<'_>, is_byte: bool) -> Result<char, EscapeError> { | ||
let c = chars.next().ok_or(EscapeError::ZeroChars)?; | ||
let res = match c { | ||
'\\' => scan_escape(chars, is_byte), | ||
'\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar), | ||
'\r' => Err(EscapeError::BareCarriageReturn), | ||
_ => ascii_check(first_char, mode), | ||
_ => ascii_check(c, is_byte), | ||
}?; | ||
if chars.next().is_some() { | ||
return Err(EscapeError::MoreThanOneChar); | ||
|
@@ -275,20 +248,20 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, Esca | |
|
||
/// Takes a contents of a string literal (without quotes) and produces a | ||
/// sequence of escaped characters or errors. | ||
fn unescape_str_or_byte_str<F>(src: &str, mode: Mode, callback: &mut F) | ||
fn unescape_str_or_byte_str<F>(src: &str, is_byte: bool, callback: &mut F) | ||
where | ||
F: FnMut(Range<usize>, Result<char, EscapeError>), | ||
{ | ||
debug_assert!(mode == Mode::Str || mode == Mode::ByteStr); | ||
let initial_len = src.len(); | ||
let mut chars = src.chars(); | ||
while let Some(first_char) = chars.next() { | ||
let start = initial_len - chars.as_str().len() - first_char.len_utf8(); | ||
|
||
let unescaped_char = match first_char { | ||
// The `start` and `end` computation here is complicated because | ||
// `skip_ascii_whitespace` makes us to skip over chars without counting | ||
// them in the range computation. | ||
while let Some(c) = chars.next() { | ||
let start = src.len() - chars.as_str().len() - c.len_utf8(); | ||
let res = match c { | ||
'\\' => { | ||
let second_char = chars.clone().next(); | ||
match second_char { | ||
match chars.clone().next() { | ||
Some('\n') => { | ||
// Rust language specification requires us to skip whitespaces | ||
// if unescaped '\' character is followed by '\n'. | ||
|
@@ -297,17 +270,17 @@ where | |
skip_ascii_whitespace(&mut chars, start, callback); | ||
continue; | ||
} | ||
_ => scan_escape(&mut chars, mode), | ||
_ => scan_escape(&mut chars, is_byte), | ||
} | ||
} | ||
'\n' => Ok('\n'), | ||
'\t' => Ok('\t'), | ||
'"' => Err(EscapeError::EscapeOnlyChar), | ||
'\r' => Err(EscapeError::BareCarriageReturn), | ||
_ => ascii_check(first_char, mode), | ||
_ => ascii_check(c, is_byte), | ||
}; | ||
let end = initial_len - chars.as_str().len(); | ||
callback(start..end, unescaped_char); | ||
let end = src.len() - chars.as_str().len(); | ||
callback(start..end, res); | ||
} | ||
|
||
fn skip_ascii_whitespace<F>(chars: &mut Chars<'_>, start: usize, callback: &mut F) | ||
|
@@ -340,30 +313,29 @@ where | |
/// Takes a contents of a string literal (without quotes) and produces a | ||
/// sequence of characters or errors. | ||
/// NOTE: Raw strings do not perform any explicit character escaping, here we | ||
/// only translate CRLF to LF and produce errors on bare CR. | ||
fn unescape_raw_str_or_raw_byte_str<F>(literal_text: &str, mode: Mode, callback: &mut F) | ||
/// only produce errors on bare CR. | ||
fn unescape_raw_str_or_raw_byte_str<F>(src: &str, is_byte: bool, callback: &mut F) | ||
where | ||
F: FnMut(Range<usize>, Result<char, EscapeError>), | ||
{ | ||
debug_assert!(mode == Mode::RawStr || mode == Mode::RawByteStr); | ||
let initial_len = literal_text.len(); | ||
|
||
let mut chars = literal_text.chars(); | ||
while let Some(curr) = chars.next() { | ||
let start = initial_len - chars.as_str().len() - curr.len_utf8(); | ||
let mut chars = src.chars(); | ||
|
||
let result = match curr { | ||
// The `start` and `end` computation here matches the one in | ||
// `unescape_str_or_byte_str` for consistency, even though this function | ||
// doesn't have to worry about skipping any chars. | ||
while let Some(c) = chars.next() { | ||
let start = src.len() - chars.as_str().len() - c.len_utf8(); | ||
let res = match c { | ||
'\r' => Err(EscapeError::BareCarriageReturnInRawString), | ||
c if mode.is_bytes() && !c.is_ascii() => Err(EscapeError::NonAsciiCharInByteString), | ||
c => Ok(c), | ||
_ => ascii_check(c, is_byte), | ||
}; | ||
let end = initial_len - chars.as_str().len(); | ||
|
||
callback(start..end, result); | ||
let end = src.len() - chars.as_str().len(); | ||
callback(start..end, res); | ||
} | ||
} | ||
|
||
fn byte_from_char(c: char) -> u8 { | ||
#[inline] | ||
pub fn byte_from_char(c: char) -> u8 { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This one is a bit dicey. I agree that in the context of compiler we don't really care about nice abstrction boundaries, and this is a simplification. But we want No strong opinion overall. Though, if we do this, this one surely wants to be tagged as There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's a fair point. A couple of questions:
|
||
let res = c as u32; | ||
debug_assert!(res <= u8::MAX as u32, "guaranteed because of Mode::ByteStr"); | ||
res as u8 | ||
|
Uh oh!
There was an error while loading. Please reload this page.