Closed
Description
Hi!
The code: https://godbolt.org/z/8n64TrnqK
Clang introduced a lot of
vextracti128 xmm6, ymm3, 1
vpackssdw xmm3, xmm3, xmm6
And other things. It is possible that I don't understand why this is an optimization, but it looks suspicious.
Code pasted
#include "immintrin.h"
/*
 * For every 32-bit lane of `a`, report whether its value occurs in ANY of the
 * eight 32-bit lanes of `b0`.
 *
 * Returns a vector whose lane i is all-ones when a[i] equals at least one lane
 * of b0, and all-zeros otherwise.
 *
 * Approach: build every alignment of b0 against a -- the four rotations of the
 * 32-bit lanes inside each 128-bit half, each with and without the two halves
 * exchanged -- compare each alignment lane-wise with `a`, and OR the eight
 * comparison masks together.
 */
__m256i has_equal_in_u32(__m256i a, __m256i b0) {
    enum {
        ROT32_BY_1 = 57,  /* per 128-bit half: [1,2,3,0] */
        ROT32_BY_2 = 78,  /* per 128-bit half: [2,3,0,1] */
        SWAP_HALVES = 78  /* 64-bit lanes [2,3,0,1]: exchanges the 128-bit halves */
    };

    /* Rotations that stay inside each 128-bit half. */
    const __m256i near0 = b0;
    const __m256i near1 = _mm256_shuffle_epi32(near0, ROT32_BY_1);
    const __m256i near2 = _mm256_shuffle_epi32(near0, ROT32_BY_2);
    /* Rotating the rotate-by-1 result by 2 more yields [3,0,1,2]. */
    const __m256i near3 = _mm256_shuffle_epi32(near1, ROT32_BY_2);

    /* The same four rotations, taken from the opposite 128-bit half. */
    const __m256i far0 = _mm256_permute4x64_epi64(near0, SWAP_HALVES);
    const __m256i far1 = _mm256_permute4x64_epi64(near1, SWAP_HALVES);
    const __m256i far2 = _mm256_permute4x64_epi64(near2, SWAP_HALVES);
    const __m256i far3 = _mm256_permute4x64_epi64(near3, SWAP_HALVES);

    /* Pairwise OR of the equality masks for the same-half alignments... */
    const __m256i hit_near01 = _mm256_or_si256(_mm256_cmpeq_epi32(a, near0),
                                               _mm256_cmpeq_epi32(a, near1));
    const __m256i hit_near23 = _mm256_or_si256(_mm256_cmpeq_epi32(a, near2),
                                               _mm256_cmpeq_epi32(a, near3));
    /* ...and for the swapped-half alignments. */
    const __m256i hit_far01 = _mm256_or_si256(_mm256_cmpeq_epi32(a, far0),
                                              _mm256_cmpeq_epi32(a, far1));
    const __m256i hit_far23 = _mm256_or_si256(_mm256_cmpeq_epi32(a, far2),
                                              _mm256_cmpeq_epi32(a, far3));

    /* Fold the four partial masks into the final per-lane answer. */
    const __m256i hit_near = _mm256_or_si256(hit_near01, hit_near23);
    const __m256i hit_far = _mm256_or_si256(hit_far01, hit_far23);
    return _mm256_or_si256(hit_near, hit_far);
}