|
1 | 1 | use simd_llvm::simd_shuffle4;
|
2 | 2 | use v128::*;
|
| 3 | +use std::os::raw::c_void; |
3 | 4 |
|
4 | 5 | #[cfg(test)]
|
5 | 6 | use stdsimd_test::assert_instr;
|
@@ -267,6 +268,80 @@ pub unsafe fn _mm_movemask_ps(a: f32x4) -> i32 {
|
267 | 268 | movmskps(a)
|
268 | 269 | }
|
269 | 270 |
|
| 271 | + |
| 272 | +/// See [`_mm_prefetch`](fn._mm_prefetch.html). |
| 273 | +pub const _MM_HINT_T0: i8 = 3; |
| 274 | + |
| 275 | +/// See [`_mm_prefetch`](fn._mm_prefetch.html). |
| 276 | +pub const _MM_HINT_T1: i8 = 2; |
| 277 | + |
| 278 | +/// See [`_mm_prefetch`](fn._mm_prefetch.html). |
| 279 | +pub const _MM_HINT_T2: i8 = 1; |
| 280 | + |
| 281 | +/// See [`_mm_prefetch`](fn._mm_prefetch.html). |
| 282 | +pub const _MM_HINT_NTA: i8 = 0; |
| 283 | + |
| 284 | + |
| 285 | +/// Fetch the cache line that contains address `p` using the given `strategy`. |
| 286 | +/// |
| 287 | +/// The `strategy` must be one of: |
| 288 | +/// |
| 289 | +/// * [`_MM_HINT_T0`](constant._MM_HINT_T0.html): Fetch into all levels of the |
| 290 | +/// cache hierarchy. |
| 291 | +/// |
| 292 | +/// * [`_MM_HINT_T1`](constant._MM_HINT_T1.html): Fetch into L2 and higher. |
| 293 | +/// |
| 294 | +/// * [`_MM_HINT_T2`](constant._MM_HINT_T2.html): Fetch into L3 and higher or an |
| 295 | +/// implementation-specific choice (e.g., L2 if there is no L3). |
| 296 | +/// |
| 297 | +/// * [`_MM_HINT_NTA`](constant._MM_HINT_NTA.html): Fetch data using the |
| 298 | +/// non-temporal access (NTA) hint. It may be a place closer than main memory |
| 299 | +/// but outside of the cache hierarchy. This is used to reduce access latency |
| 300 | +/// without polluting the cache. |
| 301 | +/// |
| 302 | +/// The actual implementation depends on the particular CPU. This instruction |
| 303 | +/// is considered a hint, so the CPU is also free to simply ignore the request. |
| 304 | +/// |
| 305 | +/// The amount of prefetched data depends on the cache line size of the specific |
| 306 | +/// CPU, but it will be at least 32 bytes. |
| 307 | +/// |
| 308 | +/// Common caveats: |
| 309 | +/// |
| 310 | +/// * Most modern CPUs already automatically prefetch data based on predicted |
| 311 | +/// access patterns. |
| 312 | +/// |
| 313 | +/// * Data is usually not fetched if this would cause a TLB miss or a page |
| 314 | +/// fault. |
| 315 | +/// |
| 316 | +/// * Too much prefetching can cause unnecessary cache evictions. |
| 317 | +/// |
| 318 | +/// * Prefetching may also fail if there are not enough memory-subsystem |
| 319 | +/// resources (e.g., request buffers). |
| 320 | +/// |
| 321 | +#[inline(always)] |
| 322 | +#[target_feature = "+sse"] |
| 323 | +#[cfg_attr(test, assert_instr(prefetcht0, strategy = _MM_HINT_T0))] |
| 324 | +// #[cfg_attr(test, assert_instr(prefetcht1, strategy = _MM_HINT_T1))] |
| 325 | +// #[cfg_attr(test, assert_instr(prefetcht2, strategy = _MM_HINT_T2))] |
| 326 | +// #[cfg_attr(test, assert_instr(prefetchnta, strategy = _MM_HINT_NTA))] |
| 327 | +pub unsafe fn _mm_prefetch(p: *const c_void, strategy: i8) { |
| 328 | + // The `strategy` must be a compile-time constant, so we use a short form of |
| 329 | + // `constify_imm8!` for now. |
| 330 | + // We use the `llvm.prefetch` intrinsic with `rw` = 0 (read), and |
| 331 | + // `cache type` = 1 (data cache). `locality` is based on our `strategy`. |
| 332 | + macro_rules! pref { |
| 333 | + ($imm8:expr) => { |
| 334 | + match $imm8 { |
| 335 | + 0 => prefetch(p, 0, 0, 1), |
| 336 | + 1 => prefetch(p, 0, 1, 1), |
| 337 | + 2 => prefetch(p, 0, 2, 1), |
| 338 | + _ => prefetch(p, 0, 3, 1), |
| 339 | + } |
| 340 | + } |
| 341 | + } |
| 342 | + pref!(strategy) |
| 343 | +} |
| 344 | + |
270 | 345 | #[allow(improper_ctypes)]
|
271 | 346 | extern {
|
272 | 347 | #[link_name = "llvm.x86.sse.add.ss"]
|
@@ -299,6 +374,8 @@ extern {
|
299 | 374 | fn maxps(a: f32x4, b: f32x4) -> f32x4;
|
300 | 375 | #[link_name = "llvm.x86.sse.movmsk.ps"]
|
301 | 376 | fn movmskps(a: f32x4) -> i32;
|
| 377 | + #[link_name = "llvm.prefetch"] |
| 378 | + fn prefetch(p: *const c_void, rw: i32, loc: i32, ty: i32); |
302 | 379 | }
|
303 | 380 |
|
304 | 381 | #[cfg(test)]
|
|
0 commit comments