Skip to content

Commit d340b8b

Browse files
committed
Auto merge of #118273 - AngelicosPhosphoros:dedup_2_loops_version_77772_2, r=<try>
Split `Vec::dedup_by` into 2 loops. The first loop runs until we find 2 equal elements; the second runs afterwards only if any were found by the first. This allows us to avoid any memory writes until we find an item we want to remove. This leads to significant performance gains if all `Vec` items are kept: -40% on my benchmark with unique integers. Results of benchmarks before the implementation (including a new benchmark where nothing needs to be removed): * vec::bench_dedup_all_100 74.00ns/iter +/- 13.00ns * vec::bench_dedup_all_1000 572.00ns/iter +/- 272.00ns * vec::bench_dedup_all_100000 64.42µs/iter +/- 19.47µs * __vec::bench_dedup_none_100 67.00ns/iter +/- 17.00ns__ * __vec::bench_dedup_none_1000 662.00ns/iter +/- 86.00ns__ * __vec::bench_dedup_none_10000 9.16µs/iter +/- 2.71µs__ * __vec::bench_dedup_none_100000 91.25µs/iter +/- 1.82µs__ * vec::bench_dedup_random_100 105.00ns/iter +/- 11.00ns * vec::bench_dedup_random_1000 781.00ns/iter +/- 10.00ns * vec::bench_dedup_random_10000 9.00µs/iter +/- 5.62µs * vec::bench_dedup_random_100000 449.81µs/iter +/- 74.99µs * vec::bench_dedup_slice_truncate_100 105.00ns/iter +/- 16.00ns * vec::bench_dedup_slice_truncate_1000 2.65µs/iter +/- 481.00ns * vec::bench_dedup_slice_truncate_10000 18.33µs/iter +/- 5.23µs * vec::bench_dedup_slice_truncate_100000 501.12µs/iter +/- 46.97µs Results after the implementation: * vec::bench_dedup_all_100 75.00ns/iter +/- 9.00ns * vec::bench_dedup_all_1000 494.00ns/iter +/- 117.00ns * vec::bench_dedup_all_100000 58.13µs/iter +/- 8.78µs * __vec::bench_dedup_none_100 52.00ns/iter +/- 22.00ns__ * __vec::bench_dedup_none_1000 417.00ns/iter +/- 116.00ns__ * __vec::bench_dedup_none_10000 4.11µs/iter +/- 546.00ns__ * __vec::bench_dedup_none_100000 40.47µs/iter +/- 5.36µs__ * vec::bench_dedup_random_100 77.00ns/iter +/- 15.00ns * vec::bench_dedup_random_1000 681.00ns/iter +/- 86.00ns * vec::bench_dedup_random_10000 11.66µs/iter +/- 2.22µs * vec::bench_dedup_random_100000 469.35µs/iter +/- 20.53µs * 
vec::bench_dedup_slice_truncate_100 100.00ns/iter +/- 5.00ns * vec::bench_dedup_slice_truncate_1000 2.55µs/iter +/- 224.00ns * vec::bench_dedup_slice_truncate_10000 18.95µs/iter +/- 2.59µs * vec::bench_dedup_slice_truncate_100000 492.85µs/iter +/- 72.84µs Resolves #77772 P.S. Note that this is the same PR as #92104 — I just missed the review and then forgot about it. Also, I cannot reopen that pull request, so I am creating a new one. I responded to the remaining questions directly by adding commentary to my code.
2 parents 3166210 + f0c9274 commit d340b8b

File tree

2 files changed

+149
-30
lines changed

2 files changed

+149
-30
lines changed

library/alloc/benches/vec.rs

Lines changed: 105 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -658,73 +658,160 @@ fn random_sorted_fill(mut seed: u32, buf: &mut [u32]) {
658658
buf.sort();
659659
}
660660

661-
fn bench_vec_dedup_old(b: &mut Bencher, sz: usize) {
661+
// Measures performance of slice dedup impl.
662+
// "Old" implementation of Vec::dedup
663+
fn bench_dedup_slice_truncate(b: &mut Bencher, sz: usize) {
662664
let mut template = vec![0u32; sz];
663665
b.bytes = std::mem::size_of_val(template.as_slice()) as u64;
664666
random_sorted_fill(0x43, &mut template);
665667

666668
let mut vec = template.clone();
667669
b.iter(|| {
670+
let vec = black_box(&mut vec);
668671
let len = {
669672
let (dedup, _) = vec.partition_dedup();
670673
dedup.len()
671674
};
672675
vec.truncate(len);
673676

674677
black_box(vec.first());
678+
let vec = black_box(vec);
675679
vec.clear();
676680
vec.extend_from_slice(&template);
677681
});
678682
}
679683

680-
fn bench_vec_dedup_new(b: &mut Bencher, sz: usize) {
684+
// Measures performance of Vec::dedup on random data.
685+
fn bench_vec_dedup_random(b: &mut Bencher, sz: usize) {
681686
let mut template = vec![0u32; sz];
682687
b.bytes = std::mem::size_of_val(template.as_slice()) as u64;
683688
random_sorted_fill(0x43, &mut template);
684689

685690
let mut vec = template.clone();
686691
b.iter(|| {
692+
let vec = black_box(&mut vec);
687693
vec.dedup();
688694
black_box(vec.first());
695+
let vec = black_box(vec);
696+
vec.clear();
697+
vec.extend_from_slice(&template);
698+
});
699+
}
700+
701+
// Measures performance of Vec::dedup when no items are removed
702+
fn bench_vec_dedup_none(b: &mut Bencher, sz: usize) {
703+
let mut template = vec![0u32; sz];
704+
b.bytes = std::mem::size_of_val(template.as_slice()) as u64;
705+
template.chunks_exact_mut(2).for_each(|w| {
706+
w[0] = black_box(0);
707+
w[1] = black_box(5);
708+
});
709+
710+
let mut vec = template.clone();
711+
b.iter(|| {
712+
let vec = black_box(&mut vec);
713+
vec.dedup();
714+
black_box(vec.first());
715+
// Unlike other benches of `dedup`
716+
// this doesn't reinitialize vec
717+
// because we measure how efficient dedup is
718+
// when no memory is written
719+
});
720+
}
721+
722+
// Measures performance of Vec::dedup when all items are removed
723+
fn bench_vec_dedup_all(b: &mut Bencher, sz: usize) {
724+
let mut template = vec![0u32; sz];
725+
b.bytes = std::mem::size_of_val(template.as_slice()) as u64;
726+
template.iter_mut().for_each(|w| {
727+
*w = black_box(0);
728+
});
729+
730+
let mut vec = template.clone();
731+
b.iter(|| {
732+
let vec = black_box(&mut vec);
733+
vec.dedup();
734+
black_box(vec.first());
735+
let vec = black_box(vec);
689736
vec.clear();
690737
vec.extend_from_slice(&template);
691738
});
692739
}
693740

694741
#[bench]
695-
fn bench_dedup_old_100(b: &mut Bencher) {
696-
bench_vec_dedup_old(b, 100);
742+
fn bench_dedup_slice_truncate_100(b: &mut Bencher) {
743+
bench_dedup_slice_truncate(b, 100);
697744
}
698745
#[bench]
699-
fn bench_dedup_new_100(b: &mut Bencher) {
700-
bench_vec_dedup_new(b, 100);
746+
fn bench_dedup_random_100(b: &mut Bencher) {
747+
bench_vec_dedup_random(b, 100);
701748
}
702749

703750
#[bench]
704-
fn bench_dedup_old_1000(b: &mut Bencher) {
705-
bench_vec_dedup_old(b, 1000);
751+
fn bench_dedup_none_100(b: &mut Bencher) {
752+
bench_vec_dedup_none(b, 100);
706753
}
754+
755+
#[bench]
756+
fn bench_dedup_all_100(b: &mut Bencher) {
757+
bench_vec_dedup_all(b, 100);
758+
}
759+
760+
#[bench]
761+
fn bench_dedup_slice_truncate_1000(b: &mut Bencher) {
762+
bench_dedup_slice_truncate(b, 1000);
763+
}
764+
#[bench]
765+
fn bench_dedup_random_1000(b: &mut Bencher) {
766+
bench_vec_dedup_random(b, 1000);
767+
}
768+
769+
#[bench]
770+
fn bench_dedup_none_1000(b: &mut Bencher) {
771+
bench_vec_dedup_none(b, 1000);
772+
}
773+
707774
#[bench]
708-
fn bench_dedup_new_1000(b: &mut Bencher) {
709-
bench_vec_dedup_new(b, 1000);
775+
fn bench_dedup_all_1000(b: &mut Bencher) {
776+
bench_vec_dedup_all(b, 1000);
710777
}
711778

712779
#[bench]
713-
fn bench_dedup_old_10000(b: &mut Bencher) {
714-
bench_vec_dedup_old(b, 10000);
780+
fn bench_dedup_slice_truncate_10000(b: &mut Bencher) {
781+
bench_dedup_slice_truncate(b, 10000);
715782
}
716783
#[bench]
717-
fn bench_dedup_new_10000(b: &mut Bencher) {
718-
bench_vec_dedup_new(b, 10000);
784+
fn bench_dedup_random_10000(b: &mut Bencher) {
785+
bench_vec_dedup_random(b, 10000);
719786
}
720787

721788
#[bench]
722-
fn bench_dedup_old_100000(b: &mut Bencher) {
723-
bench_vec_dedup_old(b, 100000);
789+
fn bench_dedup_none_10000(b: &mut Bencher) {
790+
bench_vec_dedup_none(b, 10000);
724791
}
792+
793+
#[bench]
794+
fn bench_dedup_all_10000(b: &mut Bencher) {
795+
bench_vec_dedup_all(b, 10000);
796+
}
797+
798+
#[bench]
799+
fn bench_dedup_slice_truncate_100000(b: &mut Bencher) {
800+
bench_dedup_slice_truncate(b, 100000);
801+
}
802+
#[bench]
803+
fn bench_dedup_random_100000(b: &mut Bencher) {
804+
bench_vec_dedup_random(b, 100000);
805+
}
806+
807+
#[bench]
808+
fn bench_dedup_none_100000(b: &mut Bencher) {
809+
bench_vec_dedup_none(b, 100000);
810+
}
811+
725812
#[bench]
726-
fn bench_dedup_new_100000(b: &mut Bencher) {
727-
bench_vec_dedup_new(b, 100000);
813+
fn bench_dedup_all_100000(b: &mut Bencher) {
814+
bench_vec_dedup_all(b, 100000);
728815
}
729816

730817
#[bench]

library/alloc/src/vec/mod.rs

Lines changed: 44 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1775,7 +1775,31 @@ impl<T, A: Allocator> Vec<T, A> {
17751775
return;
17761776
}
17771777

1778-
/* INVARIANT: vec.len() > read >= write > write-1 >= 0 */
1778+
// Check if we ever want to remove anything.
1779+
// This allows using copy_nonoverlapping in the next loop.
1780+
// And avoids any memory writes if we don't need to remove anything.
1781+
let mut possible_remove_idx = 1;
1782+
let start = self.as_mut_ptr();
1783+
while possible_remove_idx < len {
1784+
let found_duplicate = unsafe {
1785+
// SAFETY: possible_remove_idx always in range [1..len)
1786+
// Note that we start iteration from 1 so we never overflow.
1787+
let prev = start.add(possible_remove_idx.wrapping_sub(1));
1788+
let current = start.add(possible_remove_idx);
1789+
// We explicitly say in docs that references are reversed.
1790+
same_bucket(&mut *current, &mut *prev)
1791+
};
1792+
if found_duplicate {
1793+
break;
1794+
}
1795+
possible_remove_idx += 1;
1796+
}
1797+
// Don't need to remove anything.
1798+
if possible_remove_idx >= len {
1799+
return;
1800+
}
1801+
1802+
/* INVARIANT: vec.len() > read > write > write-1 >= 0 */
17791803
struct FillGapOnDrop<'a, T, A: core::alloc::Allocator> {
17801804
/* Offset of the element we want to check if it is duplicate */
17811805
read: usize,
@@ -1821,31 +1845,39 @@ impl<T, A: Allocator> Vec<T, A> {
18211845
}
18221846
}
18231847

1824-
let mut gap = FillGapOnDrop { read: 1, write: 1, vec: self };
1825-
let ptr = gap.vec.as_mut_ptr();
1826-
18271848
/* Drop items while going through Vec, it should be more efficient than
18281849
* doing slice partition_dedup + truncate */
18291850

1851+
// Construct gap first and then drop item to avoid memory corruption if `T::drop` panics.
1852+
let mut gap =
1853+
FillGapOnDrop { read: possible_remove_idx + 1, write: possible_remove_idx, vec: self };
1854+
unsafe {
1855+
// SAFETY: we checked that possible_remove_idx < len before.
1856+
// If drop panics, `gap` would remove this item without drop.
1857+
ptr::drop_in_place(start.add(possible_remove_idx));
1858+
}
1859+
18301860
/* SAFETY: Because of the invariant, read_ptr, prev_ptr and write_ptr
18311861
* are always in-bounds and read_ptr never aliases prev_ptr */
18321862
unsafe {
18331863
while gap.read < len {
1834-
let read_ptr = ptr.add(gap.read);
1835-
let prev_ptr = ptr.add(gap.write.wrapping_sub(1));
1864+
let read_ptr = start.add(gap.read);
1865+
let prev_ptr = start.add(gap.write.wrapping_sub(1));
18361866

1837-
if same_bucket(&mut *read_ptr, &mut *prev_ptr) {
1867+
// We explicitly say in docs that references are reversed.
1868+
let found_duplicate = same_bucket(&mut *read_ptr, &mut *prev_ptr);
1869+
if found_duplicate {
18381870
// Increase `gap.read` now since the drop may panic.
18391871
gap.read += 1;
18401872
/* We have found duplicate, drop it in-place */
18411873
ptr::drop_in_place(read_ptr);
18421874
} else {
1843-
let write_ptr = ptr.add(gap.write);
1875+
let write_ptr = start.add(gap.write);
18441876

1845-
/* Because `read_ptr` can be equal to `write_ptr`, we either
1846-
* have to use `copy` or conditional `copy_nonoverlapping`.
1847-
* Looks like the first option is faster. */
1848-
ptr::copy(read_ptr, write_ptr, 1);
1877+
/* read_ptr cannot be equal to write_ptr because at this point
1878+
* we are guaranteed to have skipped at least one element (before the loop starts).
1879+
*/
1880+
ptr::copy_nonoverlapping(read_ptr, write_ptr, 1);
18491881

18501882
/* We have filled that place, so go further */
18511883
gap.write += 1;

0 commit comments

Comments
 (0)