Skip to content

Commit c18b947

Browse files
committed
[performance] write the results to memory by chunk
This requires transposing the result matrix at the end of the IDCT, but benchmarking still shows a ~5% performance improvement (on the 512x12 JPEG decode benchmark, compiled on Rust nightly with target-cpu=native and the packed_simd feature).
1 parent e1ffc1f commit c18b947

File tree

2 files changed

+5
-5
lines changed

2 files changed

+5
-5
lines changed

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ exclude = ["tests/*"]
1313
[dependencies]
1414
byteorder = "1.0"
1515
rayon = { version = "1.0", optional = true }
16-
ssimd = { git = "https://github.com/lovasoa/ssimd.git" }
16+
ssimd = { git = "https://github.com/lovasoa/ssimd.git#18afc1af" }
1717
packed_simd = { version = "0.3", optional = true }
1818

1919
[dev-dependencies]

src/idct.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ fn dequantize_and_idct_block_8x8(coefficients: &[i16], quantization_table: &[u16
149149
let x = idct_1d_x(&s, 65536 + (128 << 17));
150150
let t = idct_1d_t(&s);
151151

152-
let results = [
152+
let mut results = [
153153
stbi_clamp_simd!(i32x8,u8x8, (x[0] + t[3]) >> 17),
154154
stbi_clamp_simd!(i32x8,u8x8, (x[1] + t[2]) >> 17),
155155
stbi_clamp_simd!(i32x8,u8x8, (x[2] + t[1]) >> 17),
@@ -160,10 +160,10 @@ fn dequantize_and_idct_block_8x8(coefficients: &[i16], quantization_table: &[u16
160160
stbi_clamp_simd!(i32x8,u8x8, (x[0] - t[3]) >> 17),
161161
];
162162

163+
simd_transpose!(results);
164+
163165
for i in 0..8 {
164-
for j in 0..8 {
165-
output[i * output_linestride + j] = results[j].extract(i);
166-
}
166+
results[i].write_to_slice_aligned(&mut output[i * output_linestride..]);
167167
}
168168
}
169169

0 commit comments

Comments
 (0)