
Commit 3a19de8

nominolo authored and alexcrichton committed
Implement _mm_prefetch (rust-lang#78)
This boils down to using LLVM's `prefetch` intrinsic [1].

[1]: https://llvm.org/docs/LangRef.html#llvm-prefetch-intrinsic
1 parent 4492b4c commit 3a19de8

1 file changed: +77 −0 lines changed


src/x86/sse.rs

Lines changed: 77 additions & 0 deletions
@@ -1,5 +1,6 @@
 use simd_llvm::simd_shuffle4;
 use v128::*;
+use std::os::raw::c_void;
 
 #[cfg(test)]
 use stdsimd_test::assert_instr;
@@ -267,6 +268,80 @@ pub unsafe fn _mm_movemask_ps(a: f32x4) -> i32 {
     movmskps(a)
 }
 
+
+/// See [`_mm_prefetch`](fn._mm_prefetch.html).
+pub const _MM_HINT_T0: i8 = 3;
+
+/// See [`_mm_prefetch`](fn._mm_prefetch.html).
+pub const _MM_HINT_T1: i8 = 2;
+
+/// See [`_mm_prefetch`](fn._mm_prefetch.html).
+pub const _MM_HINT_T2: i8 = 1;
+
+/// See [`_mm_prefetch`](fn._mm_prefetch.html).
+pub const _MM_HINT_NTA: i8 = 0;
+
+
+/// Fetch the cache line that contains address `p` using the given `strategy`.
+///
+/// The `strategy` must be one of:
+///
+/// * [`_MM_HINT_T0`](constant._MM_HINT_T0.html): Fetch into all levels of the
+///   cache hierarchy.
+///
+/// * [`_MM_HINT_T1`](constant._MM_HINT_T1.html): Fetch into L2 and higher.
+///
+/// * [`_MM_HINT_T2`](constant._MM_HINT_T2.html): Fetch into L3 and higher, or
+///   an implementation-specific choice (e.g., L2 if there is no L3).
+///
+/// * [`_MM_HINT_NTA`](constant._MM_HINT_NTA.html): Fetch data using the
+///   non-temporal access (NTA) hint. It may be placed closer than main memory
+///   but outside of the cache hierarchy. This is used to reduce access latency
+///   without polluting the cache.
+///
+/// The actual implementation depends on the particular CPU. This instruction
+/// is considered a hint, so the CPU is also free to simply ignore the request.
+///
+/// The amount of prefetched data depends on the cache line size of the
+/// specific CPU, but it will be at least 32 bytes.
+///
+/// Common caveats:
+///
+/// * Most modern CPUs already automatically prefetch data based on predicted
+///   access patterns.
+///
+/// * Data is usually not fetched if this would cause a TLB miss or a page
+///   fault.
+///
+/// * Too much prefetching can cause unnecessary cache evictions.
+///
+/// * Prefetching may also fail if there are not enough memory-subsystem
+///   resources (e.g., request buffers).
+///
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(prefetcht0, strategy = _MM_HINT_T0))]
+// #[cfg_attr(test, assert_instr(prefetcht1, strategy = _MM_HINT_T1))]
+// #[cfg_attr(test, assert_instr(prefetcht2, strategy = _MM_HINT_T2))]
+// #[cfg_attr(test, assert_instr(prefetchnta, strategy = _MM_HINT_NTA))]
+pub unsafe fn _mm_prefetch(p: *const c_void, strategy: i8) {
+    // The `strategy` must be a compile-time constant, so we use a short form
+    // of `constify_imm8!` for now.
+    // We use the `llvm.prefetch` intrinsic with `rw` = 0 (read) and
+    // `cache type` = 1 (data cache). `locality` is based on our `strategy`.
+    macro_rules! pref {
+        ($imm8:expr) => {
+            match $imm8 {
+                0 => prefetch(p, 0, 0, 1),
+                1 => prefetch(p, 0, 1, 1),
+                2 => prefetch(p, 0, 2, 1),
+                _ => prefetch(p, 0, 3, 1),
+            }
+        }
+    }
+    pref!(strategy)
+}
+
 #[allow(improper_ctypes)]
 extern {
     #[link_name = "llvm.x86.sse.add.ss"]
@@ -299,6 +374,8 @@ extern {
     fn maxps(a: f32x4, b: f32x4) -> f32x4;
     #[link_name = "llvm.x86.sse.movmsk.ps"]
     fn movmskps(a: f32x4) -> i32;
+    #[link_name = "llvm.prefetch"]
+    fn prefetch(p: *const c_void, rw: i32, loc: i32, ty: i32);
 }
 
 #[cfg(test)]
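
For context, here is a minimal usage sketch of the new intrinsic (not part of this commit; the helper `sum_with_prefetch` and the 16-element lookahead are illustrative assumptions, not something the commit prescribes):

use std::os::raw::c_void;

/// Sums a slice while hinting the CPU to pull upcoming elements into
/// all cache levels (`_MM_HINT_T0`).
unsafe fn sum_with_prefetch(data: &[f32]) -> f32 {
    let mut sum = 0.0;
    for (i, &x) in data.iter().enumerate() {
        // Prefetch 16 floats (64 bytes, one typical cache line) ahead;
        // the lookahead distance is an assumed tuning value.
        if i + 16 < data.len() {
            _mm_prefetch(&data[i + 16] as *const f32 as *const c_void,
                         _MM_HINT_T0);
        }
        sum += x;
    }
    sum
}

Because `_mm_prefetch` is only a hint, the sketch stays correct on CPUs that ignore the request; only the access latency changes.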
