From e366cb1d384d08387bb2cb106c5991ec57a0fb55 Mon Sep 17 00:00:00 2001 From: Zbigniew Sarbinowski Date: Wed, 22 May 2024 18:11:27 +0000 Subject: [PATCH 1/9] Fix endianness for algorithm mismatch --- libcxx/include/__algorithm/mismatch.h | 47 ++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/libcxx/include/__algorithm/mismatch.h b/libcxx/include/__algorithm/mismatch.h index 632bec02406a4..a8219f0817a6c 100644 --- a/libcxx/include/__algorithm/mismatch.h +++ b/libcxx/include/__algorithm/mismatch.h @@ -56,6 +56,39 @@ __mismatch(_Iter1 __first1, _Sent1 __last1, _Iter2 __first2, _Pred& __pred, _Pro #if _LIBCPP_VECTORIZE_ALGORITHMS +template ::value, int> = 0> +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_Tp, 8> +__reverse_vector(__simd_vector<_Tp, 8>& __cmp_res) { +#if defined(_LIBCPP_BIG_ENDIAN) + static_assert(__native_vector_size<_Tp> == 8, "The __native_vector_size has to be 8"); + __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 7, 6, 5, 4, 3, 2, 1, 0); +#endif + return __cmp_res; +} + +template ::value, int> = 0> +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_Tp, 16> +__reverse_vector(__simd_vector<_Tp, 16> __cmp_res) { +#if defined(_LIBCPP_BIG_ENDIAN) + static_assert(__native_vector_size<_Tp> == 16, "The __native_vector_size has to be 16"); + __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); +#endif + return __cmp_res; +} + +template ::value, int> = 0> +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_Tp, 32> +__reverse_vector(__simd_vector<_Tp, 32> __cmp_res) { +#if defined(_LIBCPP_BIG_ENDIAN) + static_assert(__native_vector_size<_Tp> == 32, "The __native_vector_size has to be 32"); + __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 
5, 4, 3, 2, 1, 0); +#endif + return __cmp_res; +} + template _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Iter, _Iter> __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) { @@ -77,7 +110,9 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) { } for (size_t __i = 0; __i != __unroll_count; ++__i) { - if (auto __cmp_res = __lhs[__i] == __rhs[__i]; !std::__all_of(__cmp_res)) { + auto __cmp_res = __lhs[__i] == __rhs[__i]; + __cmp_res = __reverse_vector<_Tp>(__cmp_res); + if (!std::__all_of(__cmp_res)) { auto __offset = __i * __vec_size + std::__find_first_not_set(__cmp_res); return {__first1 + __offset, __first2 + __offset}; } @@ -89,8 +124,9 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) { // check the remaining 0-3 vectors while (static_cast(__last1 - __first1) >= __vec_size) { - if (auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2); - !std::__all_of(__cmp_res)) { + auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2); + __cmp_res = __reverse_vector<_Tp>(__cmp_res); + if (!std::__all_of(__cmp_res)) { auto __offset = std::__find_first_not_set(__cmp_res); return {__first1 + __offset, __first2 + __offset}; } @@ -106,8 +142,9 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) { if (static_cast(__first1 - __orig_first1) >= __vec_size) { __first1 = __last1 - __vec_size; __first2 = __last2 - __vec_size; - auto __offset = - std::__find_first_not_set(std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2)); + auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2); + __cmp_res = __reverse_vector<_Tp>(__cmp_res); + auto __offset = std::__find_first_not_set(__cmp_res); return {__first1 + __offset, __first2 + __offset}; } // else loop over the elements individually } From 05395e0b07312d5a7594bdc1db46cf695001242c Mon Sep 17 00:00:00 
2001 From: Zbigniew Sarbinowski Date: Wed, 22 May 2024 18:21:04 +0000 Subject: [PATCH 2/9] Update based on the latest changes --- libcxx/include/__algorithm/mismatch.h | 83 ++++++++++++++++++--------- 1 file changed, 57 insertions(+), 26 deletions(-) diff --git a/libcxx/include/__algorithm/mismatch.h b/libcxx/include/__algorithm/mismatch.h index a8219f0817a6c..7b4e7da35cf7f 100644 --- a/libcxx/include/__algorithm/mismatch.h +++ b/libcxx/include/__algorithm/mismatch.h @@ -56,36 +56,67 @@ __mismatch(_Iter1 __first1, _Sent1 __last1, _Iter2 __first2, _Pred& __pred, _Pro #if _LIBCPP_VECTORIZE_ALGORITHMS -template ::value, int> = 0> -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_Tp, 8> -__reverse_vector(__simd_vector<_Tp, 8>& __cmp_res) { -#if defined(_LIBCPP_BIG_ENDIAN) - static_assert(__native_vector_size<_Tp> == 8, "The __native_vector_size has to be 8"); +template +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_Value_type, 8> +__reverse_vector(__simd_vector<_Value_type, 8>& __cmp_res) { +# if defined(_LIBCPP_BIG_ENDIAN) + static_assert(__native_vector_size<_Value_type> == 8, "The __native_vector_size has to be 8"); __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 7, 6, 5, 4, 3, 2, 1, 0); -#endif +# endif return __cmp_res; } -template ::value, int> = 0> -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_Tp, 16> -__reverse_vector(__simd_vector<_Tp, 16> __cmp_res) { -#if defined(_LIBCPP_BIG_ENDIAN) - static_assert(__native_vector_size<_Tp> == 16, "The __native_vector_size has to be 16"); +template +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_Value_type, 16> +__reverse_vector(__simd_vector<_Value_type, 16> __cmp_res) { +# if defined(_LIBCPP_BIG_ENDIAN) + static_assert(__native_vector_size<_Value_type> == 16, "The __native_vector_size has to be 16"); __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 15, 
14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); -#endif +# endif return __cmp_res; } -template ::value, int> = 0> -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_Tp, 32> -__reverse_vector(__simd_vector<_Tp, 32> __cmp_res) { -#if defined(_LIBCPP_BIG_ENDIAN) - static_assert(__native_vector_size<_Tp> == 32, "The __native_vector_size has to be 32"); - __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); -#endif +template +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_Value_type, 32> +__reverse_vector(__simd_vector<_Value_type, 32> __cmp_res) { +# if defined(_LIBCPP_BIG_ENDIAN) + static_assert(__native_vector_size<_Value_type> == 32, "The __native_vector_size has to be 32"); + __cmp_res = __builtin_shufflevector( + __cmp_res, + __cmp_res, + 31, + 30, + 29, + 28, + 27, + 26, + 25, + 24, + 23, + 22, + 21, + 20, + 19, + 18, + 17, + 16, + 15, + 14, + 13, + 12, + 11, + 10, + 9, + 8, + 7, + 6, + 5, + 4, + 3, + 2, + 1, + 0); +# endif return __cmp_res; } @@ -111,7 +142,7 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) { for (size_t __i = 0; __i != __unroll_count; ++__i) { auto __cmp_res = __lhs[__i] == __rhs[__i]; - __cmp_res = __reverse_vector<_Tp>(__cmp_res); + __cmp_res = __reverse_vector<__value_type>(__cmp_res); if (!std::__all_of(__cmp_res)) { auto __offset = __i * __vec_size + std::__find_first_not_set(__cmp_res); return {__first1 + __offset, __first2 + __offset}; @@ -125,7 +156,7 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) { // check the remaining 0-3 vectors while (static_cast(__last1 - __first1) >= __vec_size) { auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2); - __cmp_res = __reverse_vector<_Tp>(__cmp_res); + __cmp_res = __reverse_vector<__value_type>(__cmp_res); if 
(!std::__all_of(__cmp_res)) { auto __offset = std::__find_first_not_set(__cmp_res); return {__first1 + __offset, __first2 + __offset}; @@ -143,8 +174,8 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) { __first1 = __last1 - __vec_size; __first2 = __last2 - __vec_size; auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2); - __cmp_res = __reverse_vector<_Tp>(__cmp_res); - auto __offset = std::__find_first_not_set(__cmp_res); + __cmp_res = __reverse_vector<__value_type>(__cmp_res); + auto __offset = std::__find_first_not_set(__cmp_res); return {__first1 + __offset, __first2 + __offset}; } // else loop over the elements individually } From b4ce87242c5eb820fe6fde076ecbaedeb78b427e Mon Sep 17 00:00:00 2001 From: Zbigniew Sarbinowski Date: Wed, 22 May 2024 22:45:20 +0000 Subject: [PATCH 3/9] Add more __reverse_vector overloads --- libcxx/include/__algorithm/mismatch.h | 70 +++++++++++++++++++++------ 1 file changed, 55 insertions(+), 15 deletions(-) diff --git a/libcxx/include/__algorithm/mismatch.h b/libcxx/include/__algorithm/mismatch.h index 7b4e7da35cf7f..8c4b4a0e031ed 100644 --- a/libcxx/include/__algorithm/mismatch.h +++ b/libcxx/include/__algorithm/mismatch.h @@ -56,31 +56,71 @@ __mismatch(_Iter1 __first1, _Sent1 __last1, _Iter2 __first2, _Pred& __pred, _Pro #if _LIBCPP_VECTORIZE_ALGORITHMS -template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_Value_type, 8> -__reverse_vector(__simd_vector<_Value_type, 8>& __cmp_res) { +template +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector +__reverse_vector(__simd_vector& __cmp_res) { # if defined(_LIBCPP_BIG_ENDIAN) - static_assert(__native_vector_size<_Value_type> == 8, "The __native_vector_size has to be 8"); + static_assert(__native_vector_size == 2, "The __native_vector_size has to be 2"); + __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 1, 0); +# endif + return __cmp_res; 
+} + +template +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector +__reverse_vector(__simd_vector& __cmp_res) { +# if defined(_LIBCPP_BIG_ENDIAN) + static_assert(__native_vector_size == 4, "The __native_vector_size has to be 4"); + __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 3, 2, 1, 0); +# endif + return __cmp_res; +} + +template +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector +__reverse_vector(__simd_vector& __cmp_res) { +# if defined(_LIBCPP_BIG_ENDIAN) + static_assert(__native_vector_size == 8, "The __native_vector_size has to be 8"); + __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 7, 6, 5, 4, 3, 2, 1, 0); +# endif + return __cmp_res; +} + +template +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector +__reverse_vector(__simd_vector& __cmp_res) { +# if defined(_LIBCPP_BIG_ENDIAN) + static_assert(__native_vector_size == 4, "The __native_vector_size has to be 4"); + __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 3, 2, 1, 0); +# endif + return __cmp_res; +} + +template +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_ValueType, 8> +__reverse_vector(__simd_vector<_ValueType, 8>& __cmp_res) { +# if defined(_LIBCPP_BIG_ENDIAN) + static_assert(__native_vector_size<_ValueType> == 8, "The __native_vector_size has to be 8"); __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 7, 6, 5, 4, 3, 2, 1, 0); # endif return __cmp_res; } -template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_Value_type, 16> -__reverse_vector(__simd_vector<_Value_type, 16> __cmp_res) { +template +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_ValueType, 16> +__reverse_vector(__simd_vector<_ValueType, 16> __cmp_res) { # if defined(_LIBCPP_BIG_ENDIAN) - static_assert(__native_vector_size<_Value_type> == 16, "The __native_vector_size has to be 
16"); + static_assert(__native_vector_size<_ValueType> == 16, "The __native_vector_size has to be 16"); __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); # endif return __cmp_res; } -template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_Value_type, 32> -__reverse_vector(__simd_vector<_Value_type, 32> __cmp_res) { +template +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_ValueType, 32> +__reverse_vector(__simd_vector<_ValueType, 32> __cmp_res) { # if defined(_LIBCPP_BIG_ENDIAN) - static_assert(__native_vector_size<_Value_type> == 32, "The __native_vector_size has to be 32"); + static_assert(__native_vector_size<_ValueType> == 32, "The __native_vector_size has to be 32"); __cmp_res = __builtin_shufflevector( __cmp_res, __cmp_res, @@ -142,7 +182,7 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) { for (size_t __i = 0; __i != __unroll_count; ++__i) { auto __cmp_res = __lhs[__i] == __rhs[__i]; - __cmp_res = __reverse_vector<__value_type>(__cmp_res); + __cmp_res = std::__reverse_vector<__value_type>(__cmp_res); if (!std::__all_of(__cmp_res)) { auto __offset = __i * __vec_size + std::__find_first_not_set(__cmp_res); return {__first1 + __offset, __first2 + __offset}; @@ -156,7 +196,7 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) { // check the remaining 0-3 vectors while (static_cast(__last1 - __first1) >= __vec_size) { auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2); - __cmp_res = __reverse_vector<__value_type>(__cmp_res); + __cmp_res = std::__reverse_vector<__value_type>(__cmp_res); if (!std::__all_of(__cmp_res)) { auto __offset = std::__find_first_not_set(__cmp_res); return {__first1 + __offset, __first2 + __offset}; @@ -174,7 +214,7 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) { __first1 = __last1 - __vec_size; 
__first2 = __last2 - __vec_size; auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2); - __cmp_res = __reverse_vector<__value_type>(__cmp_res); + __cmp_res = std::__reverse_vector<__value_type>(__cmp_res); auto __offset = std::__find_first_not_set(__cmp_res); return {__first1 + __offset, __first2 + __offset}; } // else loop over the elements individually From 40cab2461b7efa5861fc01272baa3b196fb91c98 Mon Sep 17 00:00:00 2001 From: Zbigniew Sarbinowski Date: Fri, 24 May 2024 19:16:57 +0000 Subject: [PATCH 4/9] Try to fix windows CI --- libcxx/include/__algorithm/mismatch.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/libcxx/include/__algorithm/mismatch.h b/libcxx/include/__algorithm/mismatch.h index 8c4b4a0e031ed..8519ef76140ed 100644 --- a/libcxx/include/__algorithm/mismatch.h +++ b/libcxx/include/__algorithm/mismatch.h @@ -56,11 +56,19 @@ __mismatch(_Iter1 __first1, _Sent1 __last1, _Iter2 __first2, _Pred& __pred, _Pro #if _LIBCPP_VECTORIZE_ALGORITHMS +template +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector +__reverse_vector(__simd_vector& __cmp_res) { +# if defined(_LIBCPP_BIG_ENDIAN) + __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 1, 0); +# endif + return __cmp_res; +} + template _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector __reverse_vector(__simd_vector& __cmp_res) { # if defined(_LIBCPP_BIG_ENDIAN) - static_assert(__native_vector_size == 2, "The __native_vector_size has to be 2"); __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 1, 0); # endif return __cmp_res; @@ -70,7 +78,6 @@ template _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector __reverse_vector(__simd_vector& __cmp_res) { # if defined(_LIBCPP_BIG_ENDIAN) - static_assert(__native_vector_size == 4, "The __native_vector_size has to be 4"); __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 3, 
2, 1, 0); # endif return __cmp_res; @@ -80,7 +87,6 @@ template _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector __reverse_vector(__simd_vector& __cmp_res) { # if defined(_LIBCPP_BIG_ENDIAN) - static_assert(__native_vector_size == 8, "The __native_vector_size has to be 8"); __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 7, 6, 5, 4, 3, 2, 1, 0); # endif return __cmp_res; @@ -90,7 +96,6 @@ template _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector __reverse_vector(__simd_vector& __cmp_res) { # if defined(_LIBCPP_BIG_ENDIAN) - static_assert(__native_vector_size == 4, "The __native_vector_size has to be 4"); __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 3, 2, 1, 0); # endif return __cmp_res; @@ -100,7 +105,6 @@ template _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_ValueType, 8> __reverse_vector(__simd_vector<_ValueType, 8>& __cmp_res) { # if defined(_LIBCPP_BIG_ENDIAN) - static_assert(__native_vector_size<_ValueType> == 8, "The __native_vector_size has to be 8"); __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 7, 6, 5, 4, 3, 2, 1, 0); # endif return __cmp_res; @@ -110,7 +114,6 @@ template _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_ValueType, 16> __reverse_vector(__simd_vector<_ValueType, 16> __cmp_res) { # if defined(_LIBCPP_BIG_ENDIAN) - static_assert(__native_vector_size<_ValueType> == 16, "The __native_vector_size has to be 16"); __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); # endif return __cmp_res; @@ -120,7 +123,6 @@ template _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_ValueType, 32> __reverse_vector(__simd_vector<_ValueType, 32> __cmp_res) { # if defined(_LIBCPP_BIG_ENDIAN) - static_assert(__native_vector_size<_ValueType> == 32, "The __native_vector_size has to be 32"); 
__cmp_res = __builtin_shufflevector( __cmp_res, __cmp_res, From 42f64c8d8ebee7b314d28bdf724a2ed2b2ed68b0 Mon Sep 17 00:00:00 2001 From: Zbigniew Sarbinowski Date: Mon, 3 Jun 2024 18:30:36 +0000 Subject: [PATCH 5/9] Based on the suggestion, apply the variadic template technique to reduce code --- libcxx/include/__algorithm/mismatch.h | 139 ++++++-------------------- 1 file changed, 33 insertions(+), 106 deletions(-) diff --git a/libcxx/include/__algorithm/mismatch.h b/libcxx/include/__algorithm/mismatch.h index 8519ef76140ed..bde3010b0c457 100644 --- a/libcxx/include/__algorithm/mismatch.h +++ b/libcxx/include/__algorithm/mismatch.h @@ -55,113 +55,34 @@ __mismatch(_Iter1 __first1, _Sent1 __last1, _Iter2 __first2, _Pred& __pred, _Pro } #if _LIBCPP_VECTORIZE_ALGORITHMS - -template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector -__reverse_vector(__simd_vector& __cmp_res) { -# if defined(_LIBCPP_BIG_ENDIAN) - __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 1, 0); -# endif - return __cmp_res; -} - -template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector -__reverse_vector(__simd_vector& __cmp_res) { -# if defined(_LIBCPP_BIG_ENDIAN) - __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 1, 0); -# endif - return __cmp_res; -} - -template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector -__reverse_vector(__simd_vector& __cmp_res) { -# if defined(_LIBCPP_BIG_ENDIAN) - __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 3, 2, 1, 0); -# endif - return __cmp_res; -} - -template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector -__reverse_vector(__simd_vector& __cmp_res) { -# if defined(_LIBCPP_BIG_ENDIAN) - __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 7, 6, 5, 4, 3, 2, 1, 0); -# endif - return __cmp_res; -} - -template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 
__simd_vector -__reverse_vector(__simd_vector& __cmp_res) { -# if defined(_LIBCPP_BIG_ENDIAN) - __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 3, 2, 1, 0); -# endif - return __cmp_res; +template +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI __simd_vector +__reverse_vector(__simd_vector __cmp_res) { + return [&](index_sequence<_Indices...>) { + return __builtin_shufflevector(__cmp_res, __cmp_res, (_Np - _Indices - 1)...); + }(make_index_sequence<_Np>{}); +} +template +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI __simd_vector +__reverse_vector(__simd_vector __cmp_res) { + return [&](index_sequence<_Indices...>) { + return __builtin_shufflevector(__cmp_res, __cmp_res, (_Np - _Indices - 1)...); + }(make_index_sequence<_Np>{}); +} +template +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI __simd_vector +__reverse_vector(__simd_vector __cmp_res) { + return [&](index_sequence<_Indices...>) { + return __builtin_shufflevector(__cmp_res, __cmp_res, (_Np - _Indices - 1)...); + }(make_index_sequence<_Np>{}); +} +template +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI __simd_vector<_ValueType, _Np> +__reverse_vector(__simd_vector<_ValueType, _Np> __cmp_res) { + return [&](index_sequence<_Indices...>) { + return __builtin_shufflevector(__cmp_res, __cmp_res, (_Np - _Indices - 1)...); + }(make_index_sequence<_Np>{}); } - -template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_ValueType, 8> -__reverse_vector(__simd_vector<_ValueType, 8>& __cmp_res) { -# if defined(_LIBCPP_BIG_ENDIAN) - __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 7, 6, 5, 4, 3, 2, 1, 0); -# endif - return __cmp_res; -} - -template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_ValueType, 16> -__reverse_vector(__simd_vector<_ValueType, 16> __cmp_res) { -# if defined(_LIBCPP_BIG_ENDIAN) - __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); -# endif - return __cmp_res; -} - 
-template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_ValueType, 32> -__reverse_vector(__simd_vector<_ValueType, 32> __cmp_res) { -# if defined(_LIBCPP_BIG_ENDIAN) - __cmp_res = __builtin_shufflevector( - __cmp_res, - __cmp_res, - 31, - 30, - 29, - 28, - 27, - 26, - 25, - 24, - 23, - 22, - 21, - 20, - 19, - 18, - 17, - 16, - 15, - 14, - 13, - 12, - 11, - 10, - 9, - 8, - 7, - 6, - 5, - 4, - 3, - 2, - 1, - 0); -# endif - return __cmp_res; -} - template _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Iter, _Iter> __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) { @@ -184,7 +105,9 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) { for (size_t __i = 0; __i != __unroll_count; ++__i) { auto __cmp_res = __lhs[__i] == __rhs[__i]; +# if defined(_LIBCPP_BIG_ENDIAN) __cmp_res = std::__reverse_vector<__value_type>(__cmp_res); +# endif if (!std::__all_of(__cmp_res)) { auto __offset = __i * __vec_size + std::__find_first_not_set(__cmp_res); return {__first1 + __offset, __first2 + __offset}; @@ -198,7 +121,9 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) { // check the remaining 0-3 vectors while (static_cast(__last1 - __first1) >= __vec_size) { auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2); +# if defined(_LIBCPP_BIG_ENDIAN) __cmp_res = std::__reverse_vector<__value_type>(__cmp_res); +# endif if (!std::__all_of(__cmp_res)) { auto __offset = std::__find_first_not_set(__cmp_res); return {__first1 + __offset, __first2 + __offset}; @@ -216,7 +141,9 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) { __first1 = __last1 - __vec_size; __first2 = __last2 - __vec_size; auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2); +# if defined(_LIBCPP_BIG_ENDIAN) __cmp_res = std::__reverse_vector<__value_type>(__cmp_res); +# endif auto __offset = 
std::__find_first_not_set(__cmp_res); return {__first1 + __offset, __first2 + __offset}; } // else loop over the elements individually From 8cddb8352696412011d1a513cb86b01af925044f Mon Sep 17 00:00:00 2001 From: Zbigniew Sarbinowski Date: Mon, 3 Jun 2024 18:43:05 +0000 Subject: [PATCH 6/9] fix formatting --- libcxx/include/__algorithm/mismatch.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/libcxx/include/__algorithm/mismatch.h b/libcxx/include/__algorithm/mismatch.h index bde3010b0c457..bdd3314ed1ec5 100644 --- a/libcxx/include/__algorithm/mismatch.h +++ b/libcxx/include/__algorithm/mismatch.h @@ -63,15 +63,13 @@ __reverse_vector(__simd_vector __cmp_res) { }(make_index_sequence<_Np>{}); } template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI __simd_vector -__reverse_vector(__simd_vector __cmp_res) { +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI __simd_vector __reverse_vector(__simd_vector __cmp_res) { return [&](index_sequence<_Indices...>) { return __builtin_shufflevector(__cmp_res, __cmp_res, (_Np - _Indices - 1)...); }(make_index_sequence<_Np>{}); } template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI __simd_vector -__reverse_vector(__simd_vector __cmp_res) { +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI __simd_vector __reverse_vector(__simd_vector __cmp_res) { return [&](index_sequence<_Indices...>) { return __builtin_shufflevector(__cmp_res, __cmp_res, (_Np - _Indices - 1)...); }(make_index_sequence<_Np>{}); @@ -106,7 +104,7 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) { for (size_t __i = 0; __i != __unroll_count; ++__i) { auto __cmp_res = __lhs[__i] == __rhs[__i]; # if defined(_LIBCPP_BIG_ENDIAN) - __cmp_res = std::__reverse_vector<__value_type>(__cmp_res); + __cmp_res = std::__reverse_vector<__value_type>(__cmp_res); # endif if (!std::__all_of(__cmp_res)) { auto __offset = __i * __vec_size + std::__find_first_not_set(__cmp_res); @@ -122,7 +120,7 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter 
__first2) { while (static_cast(__last1 - __first1) >= __vec_size) { auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2); # if defined(_LIBCPP_BIG_ENDIAN) - __cmp_res = std::__reverse_vector<__value_type>(__cmp_res); + __cmp_res = std::__reverse_vector<__value_type>(__cmp_res); # endif if (!std::__all_of(__cmp_res)) { auto __offset = std::__find_first_not_set(__cmp_res); @@ -142,9 +140,9 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) { __first2 = __last2 - __vec_size; auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2); # if defined(_LIBCPP_BIG_ENDIAN) - __cmp_res = std::__reverse_vector<__value_type>(__cmp_res); + __cmp_res = std::__reverse_vector<__value_type>(__cmp_res); # endif - auto __offset = std::__find_first_not_set(__cmp_res); + auto __offset = std::__find_first_not_set(__cmp_res); return {__first1 + __offset, __first2 + __offset}; } // else loop over the elements individually } From 9bf257d9563cbe7b1d9e1b6a349e81d249baa150 Mon Sep 17 00:00:00 2001 From: Zbigniew Sarbinowski Date: Mon, 3 Jun 2024 19:19:50 +0000 Subject: [PATCH 7/9] attempt to fix CI --- libcxx/include/__algorithm/mismatch.h | 1 + 1 file changed, 1 insertion(+) diff --git a/libcxx/include/__algorithm/mismatch.h b/libcxx/include/__algorithm/mismatch.h index bdd3314ed1ec5..3ff0f59caec2b 100644 --- a/libcxx/include/__algorithm/mismatch.h +++ b/libcxx/include/__algorithm/mismatch.h @@ -22,6 +22,7 @@ #include <__type_traits/is_constant_evaluated.h> #include <__type_traits/is_equality_comparable.h> #include <__type_traits/is_integral.h> +#include <__utility/integer_sequence.h> #include <__utility/move.h> #include <__utility/pair.h> #include <__utility/unreachable.h> From 5eba555bf6aaf042a9b3c7783bcce3efd9c04985 Mon Sep 17 00:00:00 2001 From: Zbigniew Sarbinowski Date: Mon, 10 Jun 2024 13:32:52 +0000 Subject: [PATCH 8/9] Make __find_first_set endianness aware --- 
libcxx/include/__algorithm/mismatch.h | 48 ++++--------------------- libcxx/include/__algorithm/simd_utils.h | 11 +++++- 2 files changed, 16 insertions(+), 43 deletions(-) diff --git a/libcxx/include/__algorithm/mismatch.h b/libcxx/include/__algorithm/mismatch.h index 3ff0f59caec2b..632bec02406a4 100644 --- a/libcxx/include/__algorithm/mismatch.h +++ b/libcxx/include/__algorithm/mismatch.h @@ -22,7 +22,6 @@ #include <__type_traits/is_constant_evaluated.h> #include <__type_traits/is_equality_comparable.h> #include <__type_traits/is_integral.h> -#include <__utility/integer_sequence.h> #include <__utility/move.h> #include <__utility/pair.h> #include <__utility/unreachable.h> @@ -56,32 +55,7 @@ __mismatch(_Iter1 __first1, _Sent1 __last1, _Iter2 __first2, _Pred& __pred, _Pro } #if _LIBCPP_VECTORIZE_ALGORITHMS -template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI __simd_vector -__reverse_vector(__simd_vector __cmp_res) { - return [&](index_sequence<_Indices...>) { - return __builtin_shufflevector(__cmp_res, __cmp_res, (_Np - _Indices - 1)...); - }(make_index_sequence<_Np>{}); -} -template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI __simd_vector __reverse_vector(__simd_vector __cmp_res) { - return [&](index_sequence<_Indices...>) { - return __builtin_shufflevector(__cmp_res, __cmp_res, (_Np - _Indices - 1)...); - }(make_index_sequence<_Np>{}); -} -template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI __simd_vector __reverse_vector(__simd_vector __cmp_res) { - return [&](index_sequence<_Indices...>) { - return __builtin_shufflevector(__cmp_res, __cmp_res, (_Np - _Indices - 1)...); - }(make_index_sequence<_Np>{}); -} -template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI __simd_vector<_ValueType, _Np> -__reverse_vector(__simd_vector<_ValueType, _Np> __cmp_res) { - return [&](index_sequence<_Indices...>) { - return __builtin_shufflevector(__cmp_res, __cmp_res, (_Np - _Indices - 1)...); - }(make_index_sequence<_Np>{}); -} + template _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI 
_LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Iter, _Iter> __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) { @@ -103,11 +77,7 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) { } for (size_t __i = 0; __i != __unroll_count; ++__i) { - auto __cmp_res = __lhs[__i] == __rhs[__i]; -# if defined(_LIBCPP_BIG_ENDIAN) - __cmp_res = std::__reverse_vector<__value_type>(__cmp_res); -# endif - if (!std::__all_of(__cmp_res)) { + if (auto __cmp_res = __lhs[__i] == __rhs[__i]; !std::__all_of(__cmp_res)) { auto __offset = __i * __vec_size + std::__find_first_not_set(__cmp_res); return {__first1 + __offset, __first2 + __offset}; } @@ -119,11 +89,8 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) { // check the remaining 0-3 vectors while (static_cast(__last1 - __first1) >= __vec_size) { - auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2); -# if defined(_LIBCPP_BIG_ENDIAN) - __cmp_res = std::__reverse_vector<__value_type>(__cmp_res); -# endif - if (!std::__all_of(__cmp_res)) { + if (auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2); + !std::__all_of(__cmp_res)) { auto __offset = std::__find_first_not_set(__cmp_res); return {__first1 + __offset, __first2 + __offset}; } @@ -139,11 +106,8 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) { if (static_cast(__first1 - __orig_first1) >= __vec_size) { __first1 = __last1 - __vec_size; __first2 = __last2 - __vec_size; - auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2); -# if defined(_LIBCPP_BIG_ENDIAN) - __cmp_res = std::__reverse_vector<__value_type>(__cmp_res); -# endif - auto __offset = std::__find_first_not_set(__cmp_res); + auto __offset = + std::__find_first_not_set(std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2)); return {__first1 + __offset, __first2 + __offset}; } // else loop over the elements 
individually } diff --git a/libcxx/include/__algorithm/simd_utils.h b/libcxx/include/__algorithm/simd_utils.h index aa4336a2214c8..fd1d3d439092b 100644 --- a/libcxx/include/__algorithm/simd_utils.h +++ b/libcxx/include/__algorithm/simd_utils.h @@ -11,7 +11,11 @@ #include <__algorithm/min.h> #include <__bit/bit_cast.h> -#include <__bit/countr.h> +#if defined(_LIBCPP_BIG_ENDIAN) +# include <__bit/countl.h> +#else +# include <__bit/countr.h> +#endif #include <__config> #include <__type_traits/is_arithmetic.h> #include <__type_traits/is_same.h> @@ -126,8 +130,13 @@ _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI size_t __find_first_set(__simd_vector<_T // This has MSan disabled du to https://github.com/llvm/llvm-project/issues/85876 auto __impl = [&](_MaskT) _LIBCPP_NO_SANITIZE("memory") noexcept { +# if defined(_LIBCPP_BIG_ENDIAN) + return std::min( + _Np, std::__countl_zero(__builtin_bit_cast(_MaskT, __builtin_convertvector(__vec, __mask_vec)))); +# else return std::min( _Np, std::__countr_zero(__builtin_bit_cast(_MaskT, __builtin_convertvector(__vec, __mask_vec)))); +# endif }; if constexpr (sizeof(__mask_vec) == sizeof(uint8_t)) { From cf070c46a59d4fa097dd5654cd05ea93717ad4ec Mon Sep 17 00:00:00 2001 From: Zbigniew Sarbinowski Date: Tue, 11 Jun 2024 12:31:48 +0000 Subject: [PATCH 9/9] Remove conditional statement for #include directive --- libcxx/include/__algorithm/simd_utils.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/libcxx/include/__algorithm/simd_utils.h b/libcxx/include/__algorithm/simd_utils.h index fd1d3d439092b..549197be80183 100644 --- a/libcxx/include/__algorithm/simd_utils.h +++ b/libcxx/include/__algorithm/simd_utils.h @@ -11,11 +11,8 @@ #include <__algorithm/min.h> #include <__bit/bit_cast.h> -#if defined(_LIBCPP_BIG_ENDIAN) -# include <__bit/countl.h> -#else -# include <__bit/countr.h> -#endif +#include <__bit/countl.h> +#include <__bit/countr.h> #include <__config> #include <__type_traits/is_arithmetic.h> #include 
<__type_traits/is_same.h>