From d6b42c2463ddfc7d16b9199f7725e704c94c2acf Mon Sep 17 00:00:00 2001 From: root Date: Thu, 17 Jul 2014 18:03:34 +0200 Subject: [PATCH 1/6] str: Add better tests for string slice's Chars iterator Test using for ch s.chars() { black_box(ch) } to have a test that should force the iterator to run its full decoding computations. --- src/libcollections/str.rs | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/src/libcollections/str.rs b/src/libcollections/str.rs index 85921f1176a3..f1bb3b408041 100644 --- a/src/libcollections/str.rs +++ b/src/libcollections/str.rs @@ -2240,16 +2240,26 @@ mod tests { #[cfg(test)] mod bench { use test::Bencher; + use test::black_box; use super::*; + use std::option::{None, Some}; use std::iter::{Iterator, DoubleEndedIterator}; use std::collections::Collection; #[bench] fn char_iterator(b: &mut Bencher) { let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb"; - let len = s.char_len(); - b.iter(|| assert_eq!(s.chars().count(), len)); + b.iter(|| s.chars().count()); + } + + #[bench] + fn char_iterator_for(b: &mut Bencher) { + let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb"; + + b.iter(|| { + for ch in s.chars() { black_box(ch) } + }); } #[bench] @@ -2260,17 +2270,24 @@ mod bench { Mary had a little lamb, Little lamb Mary had a little lamb, Little lamb Mary had a little lamb, Little lamb"; - let len = s.char_len(); - b.iter(|| assert_eq!(s.chars().count(), len)); + b.iter(|| s.chars().count()); } #[bench] fn char_iterator_rev(b: &mut Bencher) { let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb"; - let len = s.char_len(); - b.iter(|| assert_eq!(s.chars().rev().count(), len)); + b.iter(|| s.chars().rev().count()); + } + + #[bench] + fn char_iterator_rev_for(b: &mut Bencher) { + let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb"; + + b.iter(|| { + for ch in s.chars().rev() { black_box(ch) } + }); } #[bench] From 42357d772b8a3a1ce4395deeac0a5cf1f66e951d Mon Sep 17 00:00:00 2001 From: root Date: Thu, 17 Jul 2014 19:34:07 +0200 Subject: [PATCH 2/6] core::str: Implement Chars iterator using slice::Items Re-use the vector iterator to implement the chars iterator. The iterator uses our guarantee that the string contains valid UTF-8, but its only unsafe code is transmuting the decoded u32 into char. --- src/libcore/str.rs | 158 ++++++++++++++++++++++++++++++++------------- 1 file changed, 114 insertions(+), 44 deletions(-) diff --git a/src/libcore/str.rs b/src/libcore/str.rs index aa2050dacf1a..9711d2e3bcc1 100644 --- a/src/libcore/str.rs +++ b/src/libcore/str.rs @@ -97,47 +97,121 @@ impl<'a> CharEq for &'a [char] { Section: Iterators */ -/// External iterator for a string's characters. -/// Use with the `std::iter` module. +/// Iterator for the char (representing *Unicode Scalar Values*) of a string +/// +/// Created with the method `.chars()`. #[deriving(Clone)] pub struct Chars<'a> { - /// The slice remaining to be iterated - string: &'a str, + iter: slice::Items<'a, u8> +} + +// Return the initial codepoint accumulator for the first byte. +// The first byte is special, only want bottom 5 bits for width 2, 4 bits +// for width 3, and 3 bits for width 4 +macro_rules! utf8_first_byte( + ($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as u32) +) + +// return the value of $ch updated with continuation byte $byte +macro_rules! utf8_acc_cont_byte( + ($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as u32) +) + +macro_rules! utf8_is_cont_byte( + ($byte:expr) => (($byte & 192u8) == 128) +) + +#[inline] +fn unwrap_or_0(opt: Option<&u8>) -> u8 { + match opt { + Some(&byte) => byte, + None => 0, + } } impl<'a> Iterator for Chars<'a> { #[inline] fn next(&mut self) -> Option { - // Decode the next codepoint, then update - // the slice to be just the remaining part - if self.string.len() != 0 { - let CharRange {ch, next} = self.string.char_range_at(0); + // Decode UTF-8, using the valid UTF-8 invariant + #[inline] + fn decode_multibyte<'a>(x: u8, it: &mut slice::Items<'a, u8>) -> char { + // NOTE: Performance is very sensitive to the exact formulation here + // Decode from a byte combination out of: [[[x y] z] w] + let cont_mask = 0x3F; // continuation byte mask + let init = utf8_first_byte!(x, 2); + let y = unwrap_or_0(it.next()); + let mut ch = utf8_acc_cont_byte!(init, y); + if x >= 0xE0 { + /* [[x y z] w] case */ + let z = unwrap_or_0(it.next()); + + let y_z = (((y & cont_mask) as u32) << 6) | (z & cont_mask) as u32; + ch = init << 12 | y_z; + if x >= 0xF0 { + /* [x y z w] case */ + let w = unwrap_or_0(it.next()); + ch = (init & 7) << 18 | y_z << 6 | (w & cont_mask) as u32; + } + } unsafe { - self.string = raw::slice_unchecked(self.string, next, self.string.len()); + mem::transmute(ch) + } + } + + match self.iter.next() { + None => None, + Some(&next_byte) => { + if next_byte < 128 { + Some(next_byte as char) + } else { + Some(decode_multibyte(next_byte, &mut self.iter)) + } } - Some(ch) - } else { - None } } #[inline] fn size_hint(&self) -> (uint, Option) { - (self.string.len().saturating_add(3)/4, Some(self.string.len())) + let (len, _) = self.iter.size_hint(); + (len.saturating_add(3) / 4, Some(len)) } } impl<'a> DoubleEndedIterator for Chars<'a> { #[inline] fn next_back(&mut self) -> Option { - if self.string.len() != 0 { - let CharRange {ch, next} = self.string.char_range_at_reverse(self.string.len()); + #[inline] + fn decode_multibyte_back<'a>(w: u8, it: &mut slice::Items<'a, u8>) -> char { + // Decode from a byte combination out of: [x [y [z w]]] + let mut ch; + let z = unwrap_or_0(it.next_back()); + ch = utf8_first_byte!(z, 2); + if utf8_is_cont_byte!(z) { + let y = unwrap_or_0(it.next_back()); + ch = utf8_first_byte!(y, 3); + if utf8_is_cont_byte!(y) { + let x = unwrap_or_0(it.next_back()); + ch = utf8_first_byte!(x, 4); + ch = utf8_acc_cont_byte!(ch, y); + } + ch = utf8_acc_cont_byte!(ch, z); + } + ch = utf8_acc_cont_byte!(ch, w); + unsafe { - self.string = raw::slice_unchecked(self.string, 0, next); + mem::transmute(ch) + } + } + + match self.iter.next_back() { + None => None, + Some(&back_byte) => { + if back_byte < 128 { + Some(back_byte as char) + } else { + Some(decode_multibyte_back(back_byte, &mut self.iter)) + } } - Some(ch) - } else { - None } } } @@ -146,18 +220,23 @@ impl<'a> DoubleEndedIterator for Chars<'a> { /// Use with the `std::iter` module. #[deriving(Clone)] pub struct CharOffsets<'a> { - /// The original string to be iterated - string: &'a str, + front: uint, + back: uint, iter: Chars<'a>, } impl<'a> Iterator<(uint, char)> for CharOffsets<'a> { #[inline] fn next(&mut self) -> Option<(uint, char)> { - // Compute the byte offset by using the pointer offset between - // the original string slice and the iterator's remaining part - let offset = self.iter.string.as_ptr() as uint - self.string.as_ptr() as uint; - self.iter.next().map(|ch| (offset, ch)) + match self.iter.next() { + None => None, + Some(ch) => { + let index = self.front; + let (len, _) = self.iter.iter.size_hint(); + self.front += self.back - self.front - len; + Some((index, ch)) + } + } } #[inline] @@ -169,11 +248,14 @@ impl<'a> Iterator<(uint, char)> for CharOffsets<'a> { impl<'a> DoubleEndedIterator<(uint, char)> for CharOffsets<'a> { #[inline] fn next_back(&mut self) -> Option<(uint, char)> { - self.iter.next_back().map(|ch| { - let offset = self.iter.string.len() + - self.iter.string.as_ptr() as uint - self.string.as_ptr() as uint; - (offset, ch) - }) + match self.iter.next_back() { + None => None, + Some(ch) => { + let (len, _) = self.iter.iter.size_hint(); + self.back -= self.back - self.front - len; + Some((self.back, ch)) + } + } } } @@ -880,18 +962,6 @@ pub struct CharRange { pub next: uint, } -// Return the initial codepoint accumulator for the first byte. -// The first byte is special, only want bottom 5 bits for width 2, 4 bits -// for width 3, and 3 bits for width 4 -macro_rules! utf8_first_byte( - ($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as u32) -) - -// return the value of $ch updated with continuation byte $byte -macro_rules! utf8_acc_cont_byte( - ($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as u32) -) - static TAG_CONT_U8: u8 = 128u8; /// Unsafe operations @@ -1608,7 +1678,7 @@ impl<'a> StrSlice<'a> for &'a str { #[inline] fn chars(&self) -> Chars<'a> { - Chars{string: *self} + Chars{iter: self.as_bytes().iter()} } #[inline] @@ -1618,7 +1688,7 @@ impl<'a> StrSlice<'a> for &'a str { #[inline] fn char_indices(&self) -> CharOffsets<'a> { - CharOffsets{string: *self, iter: self.chars()} + CharOffsets{front: 0, back: self.len(), iter: self.chars()} } #[inline] From 9e59c7626381eff9ee9110091b68a079b8a5b7d8 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 18 Jul 2014 00:59:09 +0200 Subject: [PATCH 3/6] Add more tests for str Chars iterator Test iterating (decoding) every codepoint. --- src/libcollections/str.rs | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/libcollections/str.rs b/src/libcollections/str.rs index f1bb3b408041..b0c8f17eecc7 100644 --- a/src/libcollections/str.rs +++ b/src/libcollections/str.rs @@ -808,6 +808,7 @@ impl OwnedStr for String { #[cfg(test)] mod tests { use std::iter::AdditiveIterator; + use std::iter::range; use std::default::Default; use std::char::Char; use std::clone::Clone; @@ -1610,6 +1611,30 @@ mod tests { assert_eq!(pos, v.len()); } + #[test] + fn test_chars_decoding() { + let mut bytes = [0u8, ..4]; + for c in range(0u32, 0x110000).filter_map(|c| ::core::char::from_u32(c)) { + let len = c.encode_utf8(bytes); + let s = ::core::str::from_utf8(bytes.slice_to(len)).unwrap(); + if Some(c) != s.chars().next() { + fail!("character {:x}={} does not decode correctly", c as u32, c); + } + } + } + + #[test] + fn test_chars_rev_decoding() { + let mut bytes = [0u8, ..4]; + for c in range(0u32, 0x110000).filter_map(|c| ::core::char::from_u32(c)) { + let len = c.encode_utf8(bytes); + let s = ::core::str::from_utf8(bytes.slice_to(len)).unwrap(); + if Some(c) != s.chars().rev().next() { + fail!("character {:x}={} does not decode correctly", c as u32, c); + } + } + } + #[test] fn test_iterator_clone() { let s = "ศไทย中华Việt Nam"; From bbb299ad9840d02c52eefbd9989b5b18b51a7b8d Mon Sep 17 00:00:00 2001 From: root Date: Fri, 18 Jul 2014 00:59:49 +0200 Subject: [PATCH 4/6] Clarify str Chars iterator implementation Thanks to comments from @huonw, clarify decoding details and use statics for important constants for UTF-8 decoding. Convert some magic numbers scattered in the same file to use the statics too. --- src/libcore/str.rs | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/src/libcore/str.rs b/src/libcore/str.rs index 9711d2e3bcc1..f8ff4cfb1d9a 100644 --- a/src/libcore/str.rs +++ b/src/libcore/str.rs @@ -114,11 +114,11 @@ macro_rules! utf8_first_byte( // return the value of $ch updated with continuation byte $byte macro_rules! utf8_acc_cont_byte( - ($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as u32) + ($ch:expr, $byte:expr) => (($ch << 6) | ($byte & CONT_MASK) as u32) ) macro_rules! utf8_is_cont_byte( - ($byte:expr) => (($byte & 192u8) == 128) + ($byte:expr) => (($byte & !CONT_MASK) == TAG_CONT_U8) ) #[inline] @@ -137,20 +137,20 @@ impl<'a> Iterator for Chars<'a> { fn decode_multibyte<'a>(x: u8, it: &mut slice::Items<'a, u8>) -> char { // NOTE: Performance is very sensitive to the exact formulation here // Decode from a byte combination out of: [[[x y] z] w] - let cont_mask = 0x3F; // continuation byte mask let init = utf8_first_byte!(x, 2); let y = unwrap_or_0(it.next()); let mut ch = utf8_acc_cont_byte!(init, y); if x >= 0xE0 { - /* [[x y z] w] case */ + /* [[x y z] w] case + * 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid */ let z = unwrap_or_0(it.next()); - - let y_z = (((y & cont_mask) as u32) << 6) | (z & cont_mask) as u32; + let y_z = utf8_acc_cont_byte!((y & CONT_MASK) as u32, z); ch = init << 12 | y_z; if x >= 0xF0 { - /* [x y z w] case */ + /* [x y z w] case + * use only the lower 3 bits of `init` */ let w = unwrap_or_0(it.next()); - ch = (init & 7) << 18 | y_z << 6 | (w & cont_mask) as u32; + ch = (init & 7) << 18 | utf8_acc_cont_byte!(y_z, w); } } unsafe { @@ -754,9 +754,9 @@ fn run_utf8_validation_iterator(iter: &mut slice::Items) -> bool { // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / // %xF4 %x80-8F 2( UTF8-tail ) match w { - 2 => if second & 192 != TAG_CONT_U8 {err!()}, + 2 => if second & !CONT_MASK != TAG_CONT_U8 {err!()}, 3 => { - match (first, second, next!() & 192) { + match (first, second, next!() & !CONT_MASK) { (0xE0 , 0xA0 .. 0xBF, TAG_CONT_U8) | (0xE1 .. 0xEC, 0x80 .. 0xBF, TAG_CONT_U8) | (0xED , 0x80 .. 0x9F, TAG_CONT_U8) | @@ -765,7 +765,7 @@ fn run_utf8_validation_iterator(iter: &mut slice::Items) -> bool { } } 4 => { - match (first, second, next!() & 192, next!() & 192) { + match (first, second, next!() & !CONT_MASK, next!() & !CONT_MASK) { (0xF0 , 0x90 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) | (0xF1 .. 0xF3, 0x80 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) | (0xF4 , 0x80 .. 0x8F, TAG_CONT_U8, TAG_CONT_U8) => {} @@ -962,7 +962,10 @@ pub struct CharRange { pub next: uint, } -static TAG_CONT_U8: u8 = 128u8; +/// Mask of the value bits of a continuation byte +static CONT_MASK: u8 = 0b0011_1111u8; +/// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte +static TAG_CONT_U8: u8 = 0b1000_0000u8; /// Unsafe operations pub mod raw { @@ -1898,7 +1901,7 @@ impl<'a> StrSlice<'a> for &'a str { // Multibyte case is a fn to allow char_range_at_reverse to inline cleanly fn multibyte_char_range_at_reverse(s: &str, mut i: uint) -> CharRange { // while there is a previous byte == 10...... - while i > 0 && s.as_bytes()[i] & 192u8 == TAG_CONT_U8 { + while i > 0 && s.as_bytes()[i] & !CONT_MASK == TAG_CONT_U8 { i -= 1u; } From 45921648699a42fa1d257f6a54d2dbe9e46b0e20 Mon Sep 17 00:00:00 2001 From: root Date: Sat, 19 Jul 2014 00:02:30 +0200 Subject: [PATCH 5/6] Write multibyte case for str Chars iterator in-line Thanks to comments from @alexcrichton, write the next/next_back function bodies without nested functions in a more top-to-bottom flow style. Also improve comment style and motivate the unsafe blocks with comments. --- src/libcore/str.rs | 107 ++++++++++++++++++++------------------------- 1 file changed, 48 insertions(+), 59 deletions(-) diff --git a/src/libcore/str.rs b/src/libcore/str.rs index f8ff4cfb1d9a..293c0118af1e 100644 --- a/src/libcore/str.rs +++ b/src/libcore/str.rs @@ -133,40 +133,35 @@ impl<'a> Iterator for Chars<'a> { #[inline] fn next(&mut self) -> Option { // Decode UTF-8, using the valid UTF-8 invariant - #[inline] - fn decode_multibyte<'a>(x: u8, it: &mut slice::Items<'a, u8>) -> char { - // NOTE: Performance is very sensitive to the exact formulation here - // Decode from a byte combination out of: [[[x y] z] w] - let init = utf8_first_byte!(x, 2); - let y = unwrap_or_0(it.next()); - let mut ch = utf8_acc_cont_byte!(init, y); - if x >= 0xE0 { - /* [[x y z] w] case - * 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid */ - let z = unwrap_or_0(it.next()); - let y_z = utf8_acc_cont_byte!((y & CONT_MASK) as u32, z); - ch = init << 12 | y_z; - if x >= 0xF0 { - /* [x y z w] case - * use only the lower 3 bits of `init` */ - let w = unwrap_or_0(it.next()); - ch = (init & 7) << 18 | utf8_acc_cont_byte!(y_z, w); - } - } - unsafe { - mem::transmute(ch) + let x = match self.iter.next() { + None => return None, + Some(&next_byte) if next_byte < 128 => return Some(next_byte as char), + Some(&next_byte) => next_byte, + }; + + // Multibyte case follows + // Decode from a byte combination out of: [[[x y] z] w] + // NOTE: Performance is sensitive to the exact formulation here + let init = utf8_first_byte!(x, 2); + let y = unwrap_or_0(self.iter.next()); + let mut ch = utf8_acc_cont_byte!(init, y); + if x >= 0xE0 { + // [[x y z] w] case + // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid + let z = unwrap_or_0(self.iter.next()); + let y_z = utf8_acc_cont_byte!((y & CONT_MASK) as u32, z); + ch = init << 12 | y_z; + if x >= 0xF0 { + // [x y z w] case + // use only the lower 3 bits of `init` + let w = unwrap_or_0(self.iter.next()); + ch = (init & 7) << 18 | utf8_acc_cont_byte!(y_z, w); } } - match self.iter.next() { - None => None, - Some(&next_byte) => { - if next_byte < 128 { - Some(next_byte as char) - } else { - Some(decode_multibyte(next_byte, &mut self.iter)) - } - } + // str invariant says `ch` is a valid Unicode Scalar Value + unsafe { + Some(mem::transmute(ch)) } } @@ -180,38 +175,32 @@ impl<'a> Iterator for Chars<'a> { impl<'a> DoubleEndedIterator for Chars<'a> { #[inline] fn next_back(&mut self) -> Option { - #[inline] - fn decode_multibyte_back<'a>(w: u8, it: &mut slice::Items<'a, u8>) -> char { - // Decode from a byte combination out of: [x [y [z w]]] - let mut ch; - let z = unwrap_or_0(it.next_back()); - ch = utf8_first_byte!(z, 2); - if utf8_is_cont_byte!(z) { - let y = unwrap_or_0(it.next_back()); - ch = utf8_first_byte!(y, 3); - if utf8_is_cont_byte!(y) { - let x = unwrap_or_0(it.next_back()); - ch = utf8_first_byte!(x, 4); - ch = utf8_acc_cont_byte!(ch, y); - } - ch = utf8_acc_cont_byte!(ch, z); - } - ch = utf8_acc_cont_byte!(ch, w); + let w = match self.iter.next_back() { + None => return None, + Some(&back_byte) if back_byte < 128 => return Some(back_byte as char), + Some(&back_byte) => back_byte, + }; - unsafe { - mem::transmute(ch) + // Multibyte case follows + // Decode from a byte combination out of: [x [y [z w]]] + let mut ch; + let z = unwrap_or_0(self.iter.next_back()); + ch = utf8_first_byte!(z, 2); + if utf8_is_cont_byte!(z) { + let y = unwrap_or_0(self.iter.next_back()); + ch = utf8_first_byte!(y, 3); + if utf8_is_cont_byte!(y) { + let x = unwrap_or_0(self.iter.next_back()); + ch = utf8_first_byte!(x, 4); + ch = utf8_acc_cont_byte!(ch, y); } + ch = utf8_acc_cont_byte!(ch, z); } + ch = utf8_acc_cont_byte!(ch, w); - match self.iter.next_back() { - None => None, - Some(&back_byte) => { - if back_byte < 128 { - Some(back_byte as char) - } else { - Some(decode_multibyte_back(back_byte, &mut self.iter)) - } - } + // str invariant says `ch` is a valid Unicode Scalar Value + unsafe { + Some(mem::transmute(ch)) } } } From c5e0736c2434fbed6b7f249b42b336f0e27804a6 Mon Sep 17 00:00:00 2001 From: root Date: Sat, 19 Jul 2014 15:39:02 +0200 Subject: [PATCH 6/6] Simplify str CharOffsets iterator Only one uint is needed to keep track of the offset from the original full string. --- src/libcore/str.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/libcore/str.rs b/src/libcore/str.rs index 293c0118af1e..c6aff9c8bdac 100644 --- a/src/libcore/str.rs +++ b/src/libcore/str.rs @@ -209,20 +209,20 @@ impl<'a> DoubleEndedIterator for Chars<'a> { /// Use with the `std::iter` module. #[deriving(Clone)] pub struct CharOffsets<'a> { - front: uint, - back: uint, + front_offset: uint, iter: Chars<'a>, } impl<'a> Iterator<(uint, char)> for CharOffsets<'a> { #[inline] fn next(&mut self) -> Option<(uint, char)> { + let (pre_len, _) = self.iter.iter.size_hint(); match self.iter.next() { None => None, Some(ch) => { - let index = self.front; + let index = self.front_offset; let (len, _) = self.iter.iter.size_hint(); - self.front += self.back - self.front - len; + self.front_offset += pre_len - len; Some((index, ch)) } } @@ -241,8 +241,8 @@ impl<'a> DoubleEndedIterator<(uint, char)> for CharOffsets<'a> { None => None, Some(ch) => { let (len, _) = self.iter.iter.size_hint(); - self.back -= self.back - self.front - len; - Some((self.back, ch)) + let index = self.front_offset + len; + Some((index, ch)) } } } @@ -1680,7 +1680,7 @@ impl<'a> StrSlice<'a> for &'a str { #[inline] fn char_indices(&self) -> CharOffsets<'a> { - CharOffsets{front: 0, back: self.len(), iter: self.chars()} + CharOffsets{front_offset: 0, iter: self.chars()} } #[inline]