Apply feedback for Group/BitMask

Amanieu · Amanieu · commit f6b2ef82516a · 2018-12-11T20:56:18.000Z
diff --git a/src/raw/bitmask.rs b/src/raw/bitmask.rs
@@ -1,4 +1,4 @@
-use super::imp::{BitMaskWord, BITMASK_MASK, BITMASK_SHIFT};
+use super::imp::{BitMaskWord, BITMASK_MASK, BITMASK_STRIDE};
 #[cfg(feature = "nightly")]
 use core::intrinsics;
 
@@ -7,6 +7,12 @@ use core::intrinsics;
 ///
 /// The bit mask is arranged so that low-order bits represent lower memory
 /// addresses for group match results.
+///
+/// For implementation reasons, the bits in the set may be sparsely packed, so
+/// that there is only one bit per byte used (the high bit, 7). If this is the
+/// case, `BITMASK_STRIDE` will be 8 to indicate that counts/indices must be
+/// divided by 8 to normalize this difference. `BITMASK_MASK` is similarly a
+/// mask of all the actually-used bits.
 #[derive(Copy, Clone)]
 pub struct BitMask(pub BitMaskWord);
 
@@ -24,7 +30,7 @@ impl BitMask {
     pub fn remove_lowest_bit(self) -> BitMask {
         BitMask(self.0 & (self.0 - 1))
     }
-    /// Returns whether the `BitMask` has at least one set bits.
+    /// Returns whether the `BitMask` has at least one set bit.
     #[inline]
     pub fn any_bit_set(self) -> bool {
         self.0 != 0
@@ -36,7 +42,7 @@ impl BitMask {
         if self.0 == 0 {
             None
         } else {
-            Some(self.trailing_zeros())
+            Some(unsafe { self.lowest_set_bit_nonzero() })
         }
     }
 
@@ -45,7 +51,7 @@ impl BitMask {
     #[inline]
     #[cfg(feature = "nightly")]
     pub unsafe fn lowest_set_bit_nonzero(self) -> usize {
-        intrinsics::cttz_nonzero(self.0) as usize >> BITMASK_SHIFT
+        intrinsics::cttz_nonzero(self.0) as usize / BITMASK_STRIDE
     }
     #[cfg(not(feature = "nightly"))]
     pub unsafe fn lowest_set_bit_nonzero(self) -> usize {
@@ -55,21 +61,22 @@ impl BitMask {
     /// Returns the number of trailing zeroes in the `BitMask`.
     #[inline]
     pub fn trailing_zeros(self) -> usize {
-        // ARM doesn't have a CTZ instruction, and instead uses RBIT + CLZ.
-        // However older ARM versions (pre-ARMv7) don't have RBIT and need to
-        // emulate it instead. Since we only have 1 bit set in each byte we can
-        // use REV + CLZ instead.
-        if cfg!(target_arch = "arm") && BITMASK_SHIFT >= 3 {
-            self.0.swap_bytes().leading_zeros() as usize >> BITMASK_SHIFT
+        // ARM doesn't have a trailing_zeros instruction, and instead uses
+        // reverse_bits (RBIT) + leading_zeros (CLZ). However, older ARM
+        // versions (pre-ARMv7) don't have RBIT and need to emulate it
+        // instead. Since we only have 1 bit set in each byte on ARM, we can
+        // use swap_bytes (REV) + leading_zeros instead.
+        if cfg!(target_arch = "arm") && BITMASK_STRIDE % 8 == 0 {
+            self.0.swap_bytes().leading_zeros() as usize / BITMASK_STRIDE
         } else {
-            self.0.trailing_zeros() as usize >> BITMASK_SHIFT
+            self.0.trailing_zeros() as usize / BITMASK_STRIDE
         }
     }
 
     /// Returns the number of leading zeroes in the `BitMask`.
     #[inline]
     pub fn leading_zeros(self) -> usize {
-        self.0.leading_zeros() as usize >> BITMASK_SHIFT
+        self.0.leading_zeros() as usize / BITMASK_STRIDE
     }
 }
 
diff --git a/src/raw/generic.rs b/src/raw/generic.rs
@@ -19,8 +19,9 @@ type GroupWord = u64;
 type GroupWord = u32;
 
 pub type BitMaskWord = GroupWord;
-pub const BITMASK_SHIFT: u32 = 3;
-pub const BITMASK_MASK: GroupWord = 0x8080808080808080u64 as GroupWord;
+pub const BITMASK_STRIDE: usize = 8;
+// We only care about the highest bit of each byte for the mask.
+pub const BITMASK_MASK: BitMaskWord = 0x8080_8080_8080_8080u64 as GroupWord;
 
 /// Helper function to replicate a byte across a `GroupWord`.
 #[inline]
@@ -107,13 +108,17 @@ impl Group {
     /// `EMPTY`.
     #[inline]
     pub fn match_empty(&self) -> BitMask {
+        // If the high bit is set, then the byte must be either:
+        // 1111_1111 (EMPTY) or 1000_0000 (DELETED).
+        // So we can just check if the top two bits are 1 by ANDing them.
         BitMask((self.0 & (self.0 << 1) & repeat(0x80)).to_le())
     }
 
     /// Returns a `BitMask` indicating all bytes in the group which are
-    /// `EMPTY` pr `DELETED`.
+    /// `EMPTY` or `DELETED`.
     #[inline]
     pub fn match_empty_or_deleted(&self) -> BitMask {
+        // A byte is EMPTY or DELETED iff the high bit is set
         BitMask((self.0 & repeat(0x80)).to_le())
     }
 
@@ -123,6 +128,13 @@ impl Group {
     /// - `FULL => DELETED`
     #[inline]
     pub fn convert_special_to_empty_and_full_to_deleted(&self) -> Group {
+        // Map high_bit = 1 (EMPTY or DELETED) to 1111_1111
+        // and high_bit = 0 (FULL) to 1000_0000
+        //
+        // Here's this logic expanded to concrete values:
+        //   let full = 1000_0000 (true) or 0000_0000 (false)
+        //   !1000_0000 + 1 = 0111_1111 + 1 = 1000_0000 (no carry)
+        //   !0000_0000 + 0 = 1111_1111 + 0 = 1111_1111 (no carry)
         let full = !self.0 & repeat(0x80);
         Group(!full + (full >> 7))
     }
diff --git a/src/raw/sse2.rs b/src/raw/sse2.rs
@@ -8,8 +8,8 @@ use core::arch::x86;
 use core::arch::x86_64 as x86;
 
 pub type BitMaskWord = u16;
-pub const BITMASK_SHIFT: u32 = 0;
-pub const BITMASK_MASK: u16 = 0xffff;
+pub const BITMASK_STRIDE: usize = 1;
+pub const BITMASK_MASK: BitMaskWord = 0xffff;
 
 /// Abstraction over a group of control bytes which can be scanned in
 /// parallel.
@@ -78,9 +78,10 @@ impl Group {
     }
 
     /// Returns a `BitMask` indicating all bytes in the group which are
-    /// `EMPTY` pr `DELETED`.
+    /// `EMPTY` or `DELETED`.
     #[inline]
     pub fn match_empty_or_deleted(&self) -> BitMask {
+        // A byte is EMPTY or DELETED iff the high bit is set
         unsafe { BitMask(x86::_mm_movemask_epi8(self.0) as u16) }
     }
 
@@ -90,6 +91,13 @@ impl Group {
     /// - `FULL => DELETED`
     #[inline]
     pub fn convert_special_to_empty_and_full_to_deleted(&self) -> Group {
+        // Map high_bit = 1 (EMPTY or DELETED) to 1111_1111
+        // and high_bit = 0 (FULL) to 1000_0000
+        //
+        // Here's this logic expanded to concrete values:
+        //   let special = (0 > byte) = 1111_1111 (true) or 0000_0000 (false)
+        //   1111_1111 | 1000_0000 = 1111_1111
+        //   0000_0000 | 1000_0000 = 1000_0000
         unsafe {
             let zero = x86::_mm_setzero_si128();
             let special = x86::_mm_cmpgt_epi8(zero, self.0);