[performance] write the results to memory by chunk

lovasoa · lovasoa · commit c18b947a8c4d · 2020-04-15T19:43:58.000+02:00
This requires transposing the result matrix at the end of the IDCT,
but benchmarking still shows a ~5% performance improvement
(on the 512x12 JPEG decode benchmark, when compiled on Rust nightly
with target-cpu=native and the packed_simd feature).
diff --git a/Cargo.toml b/Cargo.toml
@@ -13,7 +13,7 @@ exclude = ["tests/*"]
 [dependencies]
 byteorder = "1.0"
 rayon = { version = "1.0", optional = true }
-ssimd = { git = "https://github.com/lovasoa/ssimd.git" }
+ssimd = { git = "https://github.com/lovasoa/ssimd.git#18afc1af" }
 packed_simd = { version = "0.3", optional = true }
 
 [dev-dependencies]
diff --git a/src/idct.rs b/src/idct.rs
@@ -149,7 +149,7 @@ fn dequantize_and_idct_block_8x8(coefficients: &[i16], quantization_table: &[u16
     let x = idct_1d_x(&s, 65536 + (128 << 17));
     let t = idct_1d_t(&s);
 
-    let results = [
+    let mut results = [
         stbi_clamp_simd!(i32x8,u8x8, (x[0] + t[3]) >> 17),
         stbi_clamp_simd!(i32x8,u8x8, (x[1] + t[2]) >> 17),
         stbi_clamp_simd!(i32x8,u8x8, (x[2] + t[1]) >> 17),
@@ -160,10 +160,10 @@ fn dequantize_and_idct_block_8x8(coefficients: &[i16], quantization_table: &[u16
         stbi_clamp_simd!(i32x8,u8x8, (x[0] - t[3]) >> 17),
     ];
 
+    simd_transpose!(results);
+
     for i in 0..8 {
-        for j in 0..8 {
-            output[i * output_linestride + j] = results[j].extract(i);
-        }
+        results[i].write_to_slice_aligned(&mut output[i * output_linestride..]);
     }
 }