diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index 2a4f9c16d..45bcd23c6 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -53,24 +53,6 @@ #include <string.h> // for memcpy -#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0 -#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0 -#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8 - -/** \brief LUT for the mask1bit functions. */ -ALIGN_CL_DIRECTIVE static const u8 simd_onebit_masks[] = { - ZEROES_32, ZEROES_32, - ZEROES_31, 0x01, ZEROES_32, - ZEROES_31, 0x02, ZEROES_32, - ZEROES_31, 0x04, ZEROES_32, - ZEROES_31, 0x08, ZEROES_32, - ZEROES_31, 0x10, ZEROES_32, - ZEROES_31, 0x20, ZEROES_32, - ZEROES_31, 0x40, ZEROES_32, - ZEROES_31, 0x80, ZEROES_32, - ZEROES_32, ZEROES_32, -}; - static really_inline m128 ones128(void) { return (m128) vdupq_n_s8(0xFF); } @@ -595,9 +577,9 @@ m128 variable_byte_shift_m128(m128 in, s32 amount) { static really_inline m128 mask1bit128(unsigned int n) { assert(n < sizeof(m128) * 8); - u32 mask_idx = ((n % 8) * 64) + 95; - mask_idx -= n / 8; - return loadu128(&simd_onebit_masks[mask_idx]); + static m128 onebit = { 1, 0 }; + m128 mask = lshiftbyte_m128( onebit, n / 8 ); + return lshift64_m128( mask, n % 8 ); } // switches on bit N in the given vector. diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h index 17de949a9..90ae80b06 100644 --- a/src/util/arch/common/simd_utils.h +++ b/src/util/arch/common/simd_utils.h @@ -88,6 +88,26 @@ static inline void print_m128_2x64(const char *label, m128 vec) { #define print_m128_2x64(label, vec) ; #endif +#if !defined(ARCH_IA32) && !defined(ARCH_X86_64) +#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0 +#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0 +#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8 + +/** \brief LUT for the mask1bit functions. */ +ALIGN_CL_DIRECTIVE static const u8 simd_onebit_masks[] = { + ZEROES_32, ZEROES_32, + ZEROES_31, 0x01, ZEROES_32, + ZEROES_31, 0x02, ZEROES_32, + ZEROES_31, 0x04, ZEROES_32, + ZEROES_31, 0x08, ZEROES_32, + ZEROES_31, 0x10, ZEROES_32, + ZEROES_31, 0x20, ZEROES_32, + ZEROES_31, 0x40, ZEROES_32, + ZEROES_31, 0x80, ZEROES_32, + ZEROES_32, ZEROES_32, +}; +#endif // !defined(ARCH_IA32) && !defined(ARCH_X86_64) + /**** **** 256-bit Primitives ****/ diff --git a/src/util/arch/ppc64el/simd_utils.h b/src/util/arch/ppc64el/simd_utils.h index d046ed47e..ea1766b26 100644 --- a/src/util/arch/ppc64el/simd_utils.h +++ b/src/util/arch/ppc64el/simd_utils.h @@ -54,34 +54,6 @@ typedef __vector signed char int8x16_t; typedef unsigned long long int ulong64_t; typedef signed long long int long64_t; -/* -typedef __vector uint64_t uint64x2_t; -typedef __vector int64_t int64x2_t; -typedef __vector uint32_t uint32x4_t; -typedef __vector int32_t int32x4_t; -typedef __vector uint16_t uint16x8_t; -typedef __vector int16_t int16x8_t; -typedef __vector uint8_t uint8x16_t; -typedef __vector int8_t int8x16_t;*/ - - -#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0 -#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0 -#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8 - -/** \brief LUT for the mask1bit functions. 
*/ -ALIGN_CL_DIRECTIVE static const u8 simd_onebit_masks[] = { - ZEROES_32, ZEROES_32, - ZEROES_31, 0x01, ZEROES_32, - ZEROES_31, 0x02, ZEROES_32, - ZEROES_31, 0x04, ZEROES_32, - ZEROES_31, 0x08, ZEROES_32, - ZEROES_31, 0x10, ZEROES_32, - ZEROES_31, 0x20, ZEROES_32, - ZEROES_31, 0x40, ZEROES_32, - ZEROES_31, 0x80, ZEROES_32, - ZEROES_32, ZEROES_32, -}; static really_inline m128 ones128(void) { return (m128) vec_splat_u8(-1); @@ -115,10 +87,6 @@ static really_inline u32 diffrich128(m128 a, m128 b) { m128 mask = (m128) vec_cmpeq(a, b); // _mm_cmpeq_epi32 (a, b); mask = vec_and(not128(mask), movemask); m128 sum = vec_sums(mask, zeroes128()); - //sum = vec_sld(zeroes128(), sum, 4); - //s32 ALIGN_ATTR(16) x; - //vec_ste(sum, 0, &x); - //return x; // it could be ~(movemask_128(mask)) & 0x; return sum[3]; } @@ -131,10 +99,6 @@ static really_inline u32 diffrich64_128(m128 a, m128 b) { uint64x2_t mask = (uint64x2_t) vec_cmpeq((uint64x2_t)a, (uint64x2_t)b); mask = (uint64x2_t) vec_and((uint64x2_t)not128((m128)mask), movemask); m128 sum = vec_sums((m128)mask, zeroes128()); - //sum = vec_sld(zeroes128(), sum, 4); - //s32 ALIGN_ATTR(16) x; - //vec_ste(sum, 0, &x); - //return x; return sum[3]; } @@ -150,46 +114,18 @@ m128 sub_2x64(m128 a, m128 b) { static really_really_inline m128 lshift_m128(m128 a, unsigned b) { - switch(b){ - case 1: return vec_sld(a, zeroes128(), 1); break; - case 2: return vec_sld(a, zeroes128(), 2); break; - case 3: return vec_sld(a, zeroes128(), 3); break; - case 4: return vec_sld(a, zeroes128(), 4); break; - case 5: return vec_sld(a, zeroes128(), 5); break; - case 6: return vec_sld(a, zeroes128(), 6); break; - case 7: return vec_sld(a, zeroes128(), 7); break; - case 8: return vec_sld(a, zeroes128(), 8); break; - case 9: return vec_sld(a, zeroes128(), 9); break; - case 10: return vec_sld(a, zeroes128(), 10); break; - case 11: return vec_sld(a, zeroes128(), 11); break; - case 12: return vec_sld(a, zeroes128(), 12); break; - case 13: return vec_sld(a, zeroes128(), 13); break; - case 14: return vec_sld(a, zeroes128(), 14); break; - case 15: return vec_sld(a, zeroes128(), 15); break; - } - return a; + if (b == 0) return a; + m128 sl = (m128) vec_splats((uint8_t) b << 3); + m128 result = (m128) vec_slo((uint8x16_t) a, (uint8x16_t) sl); + return result; } static really_really_inline m128 rshift_m128(m128 a, unsigned b) { - switch(b){ - case 1: return vec_sld(zeroes128(), a, 15); break; - case 2: return vec_sld(zeroes128(), a, 14); break; - case 3: return vec_sld(zeroes128(), a, 13); break; - case 4: return vec_sld(zeroes128(), a, 12); break; - case 5: return vec_sld(zeroes128(), a, 11); break; - case 6: return vec_sld(zeroes128(), a, 10); break; - case 7: return vec_sld(zeroes128(), a, 9); break; - case 8: return vec_sld(zeroes128(), a, 8); break; - case 9: return vec_sld(zeroes128(), a, 7); break; - case 10: return vec_sld(zeroes128(), a, 6); break; - case 11: return vec_sld(zeroes128(), a, 5); break; - case 12: return vec_sld(zeroes128(), a, 4); break; - case 13: return vec_sld(zeroes128(), a, 3); break; - case 14: return vec_sld(zeroes128(), a, 2); break; - case 15: return vec_sld(zeroes128(), a, 1); break; - } - return a; + if (b == 0) return a; + m128 sl = (m128) vec_splats((uint8_t) b << 3); + m128 result = (m128) vec_sro((uint8x16_t) a, (uint8x16_t) sl); + return result; } static really_really_inline @@ -212,27 +148,13 @@ static really_inline m128 eq64_m128(m128 a, m128 b) { return (m128) vec_cmpeq((uint64x2_t)a, (uint64x2_t)b); } - static really_inline u32 movemask128(m128 a) { 
- uint8x16_t s1 = vec_sr((uint8x16_t)a, vec_splat_u8(7)); - - uint16x8_t ss = vec_sr((uint16x8_t)s1, vec_splat_u16(7)); - uint16x8_t res_and = vec_and((uint16x8_t)s1, vec_splats((uint16_t)0xff)); - uint16x8_t s2 = vec_or((uint16x8_t)ss, res_and); - - uint32x4_t ss2 = vec_sr((uint32x4_t)s2, vec_splat_u32(14)); - uint32x4_t res_and2 = vec_and((uint32x4_t)s2, vec_splats((uint32_t)0xff)); - uint32x4_t s3 = vec_or((uint32x4_t)ss2, res_and2); - - uint64x2_t ss3 = vec_sr((uint64x2_t)s3, (uint64x2_t)vec_splats(28)); - uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((ulong64_t)0xff)); - uint64x2_t s4 = vec_or((uint64x2_t)ss3, res_and3); - - uint64x2_t ss4 = vec_sld((uint64x2_t)vec_splats(0), s4, 9); - uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((ulong64_t)0xff)); - uint64x2_t s5 = vec_or((uint64x2_t)ss4, res_and4); - - return s5[0]; + static uint8x16_t perm = { 16, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + uint8x16_t bitmask = vec_gb((uint8x16_t) a); + bitmask = (uint8x16_t) vec_perm(vec_splat_u8(0), bitmask, perm); + u32 movemask; + vec_ste((uint32x4_t) bitmask, 0, &movemask); + return movemask; } static really_inline m128 set1_16x8(u8 c) { @@ -363,7 +285,6 @@ m128 loadbytes128(const void *ptr, unsigned int n) { return a; } - #define CASE_ALIGN_VECTORS(a, b, offset) case offset: return (m128)vec_sld((int8x16_t)(b), (int8x16_t)(a), (16 - offset)); break; static really_really_inline @@ -392,42 +313,50 @@ m128 palignr_imm(m128 r, m128 l, int offset) { static really_really_inline m128 palignr(m128 r, m128 l, int offset) { -#if defined(HS_OPTIMIZE) - // need a faster way to do this. - return palignr_imm(r, l, offset); -#else - return palignr_imm(r, l, offset); + if (offset == 0) return l; + if (offset == 16) return r; +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(offset)) { + return (m128)vec_sld((int8x16_t)(r), (int8x16_t)(l), 16 - offset); + } #endif + m128 sl = (m128) vec_splats((uint8_t) (offset << 3)); + m128 sr = (m128) vec_splats((uint8_t) ((16 - offset) << 3)); + m128 rhs = (m128) vec_slo((uint8x16_t) r, (uint8x16_t) sr); + m128 lhs = (m128) vec_sro((uint8x16_t) l, (uint8x16_t) sl); + return or128(lhs, rhs); } #undef CASE_ALIGN_VECTORS static really_really_inline m128 rshiftbyte_m128(m128 a, unsigned b) { - return rshift_m128(a,b); + return palignr_imm(zeroes128(), a, b); } static really_really_inline m128 lshiftbyte_m128(m128 a, unsigned b) { - return lshift_m128(a,b); + return palignr_imm(a, zeroes128(), 16 - b); } static really_inline m128 variable_byte_shift_m128(m128 in, s32 amount) { assert(amount >= -16 && amount <= 16); - if (amount < 0){ - return palignr_imm(zeroes128(), in, -amount); - } else{ - return palignr_imm(in, zeroes128(), 16 - amount); + if (amount < 0) { + return rshiftbyte_m128(in, -amount); + } else { + return lshiftbyte_m128(in, amount); } } static really_inline m128 mask1bit128(unsigned int n) { assert(n < sizeof(m128) * 8); - u32 mask_idx = ((n % 8) * 64) + 95; - mask_idx -= n / 8; - return loadu128(&simd_onebit_masks[mask_idx]); + static uint64x2_t onebit = { 1, 0 }; + m128 octets = (m128) vec_splats((uint8_t) ((n / 8) << 3)); + m128 bits = (m128) vec_splats((uint8_t) ((n % 8))); + m128 mask = (m128) vec_slo((uint8x16_t) onebit, (uint8x16_t) octets); + return (m128) vec_sll((uint8x16_t) mask, (uint8x16_t) bits); } // switches on bit N in the given vector. 
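Note (reviewer sketch, not part of the patch): both the ARM and PPC64LE mask1bit128() rewrites above drop the simd_onebit_masks lookup table and build the mask directly, shifting first by whole octets and then by the remaining bits. A minimal scalar reference of the intended result, with a hypothetical helper name and little-endian byte order assumed:

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Reference model for mask1bit128(n): a 128-bit value, viewed as 16 bytes,
 * with only bit n set. */
static void mask1bit128_ref(unsigned int n, uint8_t out[16]) {
    assert(n < 128);
    memset(out, 0, 16);
    out[n / 8] = (uint8_t)(1u << (n % 8)); /* octet offset, then bit within the octet */
}

The vector versions follow the same two steps: shift a {1, 0} vector left by n / 8 bytes (lshiftbyte_m128 on ARM, vec_slo on Power) and then by n % 8 bits (lshift64_m128, vec_sll); the second shift never crosses a byte boundary, so the result matches the old table entries.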
diff --git a/src/util/arch/x86/simd_utils.h b/src/util/arch/x86/simd_utils.h index c4a3b97c5..924a91c6a 100644 --- a/src/util/arch/x86/simd_utils.h +++ b/src/util/arch/x86/simd_utils.h @@ -165,8 +165,67 @@ m128 load_m128_from_u64a(const u64a *p) { return _mm_set_epi64x(0LL, *p); } -#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed) -#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed) +#define CASE_RSHIFT_VECTOR(a, count) case count: return _mm_srli_si128((m128)(a), (count)); break; + +static really_inline +m128 rshiftbyte_m128(const m128 a, int count_immed) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(count_immed)) { + return _mm_srli_si128(a, count_immed); + } +#endif + switch (count_immed) { + case 0: return a; break; + CASE_RSHIFT_VECTOR(a, 1); + CASE_RSHIFT_VECTOR(a, 2); + CASE_RSHIFT_VECTOR(a, 3); + CASE_RSHIFT_VECTOR(a, 4); + CASE_RSHIFT_VECTOR(a, 5); + CASE_RSHIFT_VECTOR(a, 6); + CASE_RSHIFT_VECTOR(a, 7); + CASE_RSHIFT_VECTOR(a, 8); + CASE_RSHIFT_VECTOR(a, 9); + CASE_RSHIFT_VECTOR(a, 10); + CASE_RSHIFT_VECTOR(a, 11); + CASE_RSHIFT_VECTOR(a, 12); + CASE_RSHIFT_VECTOR(a, 13); + CASE_RSHIFT_VECTOR(a, 14); + CASE_RSHIFT_VECTOR(a, 15); + default: return zeroes128(); break; + } +} +#undef CASE_RSHIFT_VECTOR + +#define CASE_LSHIFT_VECTOR(a, count) case count: return _mm_slli_si128((m128)(a), (count)); break; + +static really_inline +m128 lshiftbyte_m128(const m128 a, int count_immed) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(count_immed)) { + return _mm_slli_si128(a, count_immed); + } +#endif + switch (count_immed) { + case 0: return a; break; + CASE_LSHIFT_VECTOR(a, 1); + CASE_LSHIFT_VECTOR(a, 2); + CASE_LSHIFT_VECTOR(a, 3); + CASE_LSHIFT_VECTOR(a, 4); + CASE_LSHIFT_VECTOR(a, 5); + CASE_LSHIFT_VECTOR(a, 6); + CASE_LSHIFT_VECTOR(a, 7); + CASE_LSHIFT_VECTOR(a, 8); + CASE_LSHIFT_VECTOR(a, 9); + CASE_LSHIFT_VECTOR(a, 10); + CASE_LSHIFT_VECTOR(a, 11); + CASE_LSHIFT_VECTOR(a, 12); + CASE_LSHIFT_VECTOR(a, 13); + CASE_LSHIFT_VECTOR(a, 14); + CASE_LSHIFT_VECTOR(a, 15); + default: return zeroes128(); break; + } +} +#undef CASE_LSHIFT_VECTOR #if defined(HAVE_SSE41) #define extract32from128(a, imm) _mm_extract_epi32(a, imm) @@ -255,14 +314,6 @@ m128 loadbytes128(const void *ptr, unsigned int n) { memcpy(&a, ptr, n); return a; } -/* -#ifdef __cplusplus -extern "C" { -#endif -extern const u8 simd_onebit_masks[]; -#ifdef __cplusplus -} -#endif*/ static really_inline m128 mask1bit128(unsigned int n) { @@ -330,6 +381,7 @@ m128 palignr_sw(m128 r, m128 l, int offset) { break; } } +#undef CASE_ALIGN_VECTORS static really_really_inline m128 palignr(m128 r, m128 l, int offset) { @@ -340,7 +392,6 @@ m128 palignr(m128 r, m128 l, int offset) { #endif return palignr_sw(r, l, offset); } -#undef CASE_ALIGN_VECTORS static really_inline m128 variable_byte_shift_m128(m128 in, s32 amount) { diff --git a/src/util/bitfield.h b/src/util/bitfield.h index a580da7b6..202232b62 100644 --- a/src/util/bitfield.h +++ b/src/util/bitfield.h @@ -189,10 +189,7 @@ class bitfield { size_t sum = 0; size_t i = 0; for (; i + 4 <= num_blocks; i += 4) { - sum += popcount64(bits[i]); - sum += popcount64(bits[i + 1]); - sum += popcount64(bits[i + 2]); - sum += popcount64(bits[i + 3]); + sum += popcount64x4(&bits[i]); } for (; i < num_blocks; i++) { sum += popcount64(bits[i]); diff --git a/src/util/popcount.h b/src/util/popcount.h index c7a69d467..d90a0d50d 100644 --- a/src/util/popcount.h +++ b/src/util/popcount.h @@ -52,6 +52,15 @@ u32 popcount32(u32 x) { 
// #endif } +static really_inline +u32 popcount32x4(u32 const *x) { + u32 sum = popcount32(x[0]); + sum += popcount32(x[1]); + sum += popcount32(x[2]); + sum += popcount32(x[3]); + return sum; +} + static really_inline u32 popcount64(u64a x) { return __builtin_popcountll(x); @@ -73,5 +82,14 @@ u32 popcount64(u64a x) { // #endif } +static really_inline +u32 popcount64x4(u64a const *x) { + volatile u32 sum = popcount64(x[0]); + sum += popcount64(x[1]); + sum += popcount64(x[2]); + sum += popcount64(x[3]); + return sum; +} + #endif /* UTIL_POPCOUNT_H_ */ diff --git a/src/util/supervector/arch/ppc64el/impl.cpp b/src/util/supervector/arch/ppc64el/impl.cpp index 5becb8f81..2eba69b2d 100644 --- a/src/util/supervector/arch/ppc64el/impl.cpp +++ b/src/util/supervector/arch/ppc64el/impl.cpp @@ -39,7 +39,7 @@ #include "util/supervector/supervector.hpp" #include -// 128-bit Powerpc64le implementation +// 128-bit IBM Power VSX implementation template<> really_inline SuperVector<16>::SuperVector(SuperVector const &other) @@ -47,6 +47,69 @@ really_inline SuperVector<16>::SuperVector(SuperVector const &other) u.v128[0] = other.u.v128[0]; } +template<> +template<> +really_inline SuperVector<16>::SuperVector(char __bool __vector v) +{ + u.u8x16[0] = (uint8x16_t) v; +}; + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int8x16_t const v) +{ + u.s8x16[0] = v; +}; + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint8x16_t const v) +{ + u.u8x16[0] = v; +}; + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int16x8_t const v) +{ + u.s16x8[0] = v; +}; + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint16x8_t const v) +{ + u.u16x8[0] = v; +}; + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int32x4_t const v) +{ + u.s32x4[0] = v; +}; + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint32x4_t const v) +{ + u.u32x4[0] = v; +}; + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int64x2_t const v) +{ + u.s64x2[0] = v; +}; + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint64x2_t const v) +{ + u.u64x2[0] = v; +}; + template<> really_inline SuperVector<16>::SuperVector(typename base_type::type const v) { @@ -57,69 +120,69 @@ template<> template<> really_inline SuperVector<16>::SuperVector(int8_t const other) { - u.v128[0] = (m128) vec_splats(other); + u.s8x16[0] = vec_splats(other); } template<> template<> really_inline SuperVector<16>::SuperVector(uint8_t const other) { - u.v128[0] = (m128) vec_splats(static_cast(other)); + u.u8x16[0] = vec_splats(static_cast(other)); } template<> template<> really_inline SuperVector<16>::SuperVector(int16_t const other) { - u.v128[0] = (m128) vec_splats(other); + u.s16x8[0] = vec_splats(other); } template<> template<> really_inline SuperVector<16>::SuperVector(uint16_t const other) { - u.v128[0] = (m128) vec_splats(static_cast(other)); + u.u16x8[0] = vec_splats(static_cast(other)); } template<> template<> really_inline SuperVector<16>::SuperVector(int32_t const other) { - u.v128[0] = (m128) vec_splats(other); + u.s32x4[0] = vec_splats(other); } template<> template<> really_inline SuperVector<16>::SuperVector(uint32_t const other) { - u.v128[0] = (m128) vec_splats(static_cast(other)); + u.u32x4[0] = vec_splats(static_cast(other)); } template<> template<> really_inline SuperVector<16>::SuperVector(int64_t const other) { - u.v128[0] = (m128) vec_splats(static_cast(other)); + u.s64x2[0] = (int64x2_t) 
vec_splats(static_cast(other)); } template<> template<> really_inline SuperVector<16>::SuperVector(uint64_t const other) { - u.v128[0] = (m128) vec_splats(static_cast(other)); + u.u64x2[0] = (uint64x2_t) vec_splats(static_cast(other)); } // Constants template<> really_inline SuperVector<16> SuperVector<16>::Ones(void) { - return {(m128) vec_splat_s8(-1)}; + return { vec_splat_s8(-1)}; } template<> really_inline SuperVector<16> SuperVector<16>::Zeroes(void) { - return {(m128) vec_splat_s8(0)}; + return { vec_splat_s8(0) }; } // Methods @@ -133,39 +196,38 @@ really_inline void SuperVector<16>::operator=(SuperVector<16> const &other) template <> really_inline SuperVector<16> SuperVector<16>::operator&(SuperVector<16> const &b) const { - return {vec_and(u.v128[0], b.u.v128[0])}; + return { vec_and(u.v128[0], b.u.v128[0]) }; } template <> really_inline SuperVector<16> SuperVector<16>::operator|(SuperVector<16> const &b) const { - return {vec_or(u.v128[0], b.u.v128[0])}; + return { vec_or(u.v128[0], b.u.v128[0]) }; } template <> really_inline SuperVector<16> SuperVector<16>::operator^(SuperVector<16> const &b) const { - return {(m128) vec_xor(u.v128[0], b.u.v128[0])}; + return { vec_xor(u.v128[0], b.u.v128[0]) }; } template <> really_inline SuperVector<16> SuperVector<16>::operator!() const { - return {(m128) vec_xor(u.v128[0], u.v128[0])}; + return { vec_xor(u.v128[0], u.v128[0]) }; } template <> really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const &b) const { - m128 not_res = vec_xor(u.v128[0], (m128)vec_splat_s8(-1)); - return {(m128) vec_and(not_res, (m128)b.u.v128[0]) }; + int8x16_t not_res = vec_xor(u.s8x16[0], vec_splat_s8(-1)); + return { vec_and(not_res, b.u.s8x16[0]) }; } - template <> really_inline SuperVector<16> SuperVector<16>::operator==(SuperVector<16> const &b) const { - return {(m128) vec_cmpeq(u.s8x16[0], b.u.s8x16[0])}; + return { vec_cmpeq(u.s8x16[0], b.u.s8x16[0])}; } template <> @@ -177,28 +239,27 @@ really_inline SuperVector<16> SuperVector<16>::operator!=(SuperVector<16> const template <> really_inline SuperVector<16> SuperVector<16>::operator>(SuperVector<16> const &b) const { - return {(m128) vec_cmpgt(u.v128[0], b.u.v128[0])}; + return { vec_cmpgt(u.s8x16[0], b.u.s8x16[0])}; } template <> really_inline SuperVector<16> SuperVector<16>::operator>=(SuperVector<16> const &b) const { - return {(m128) vec_cmpge(u.v128[0], b.u.v128[0])}; + return { vec_cmpge(u.s8x16[0], b.u.s8x16[0])}; } template <> really_inline SuperVector<16> SuperVector<16>::operator<(SuperVector<16> const &b) const { - return {(m128) vec_cmpgt(b.u.v128[0], u.v128[0])}; + return { vec_cmpgt(b.u.s8x16[0], u.s8x16[0])}; } template <> really_inline SuperVector<16> SuperVector<16>::operator<=(SuperVector<16> const &b) const { - return {(m128) vec_cmpge(b.u.v128[0], u.v128[0])}; + return { vec_cmpge(b.u.s8x16[0], u.s8x16[0])}; } - template <> really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) const { @@ -208,25 +269,12 @@ really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) cons template <> really_inline typename SuperVector<16>::comparemask_type SuperVector<16>::comparemask(void) const { - uint8x16_t s1 = vec_sr((uint8x16_t)u.v128[0], vec_splat_u8(7)); - - uint16x8_t ss = vec_sr((uint16x8_t)s1, vec_splat_u16(7)); - uint16x8_t res_and = vec_and((uint16x8_t)s1, vec_splats((uint16_t)0xff)); - uint16x8_t s2 = vec_or((uint16x8_t)ss, res_and); - - uint32x4_t ss2 = vec_sr((uint32x4_t)s2 , vec_splat_u32(14)); - uint32x4_t res_and2 = 
vec_and((uint32x4_t)s2, vec_splats((uint32_t)0xff)); - uint32x4_t s3 = vec_or((uint32x4_t)ss2, res_and2); - - uint64x2_t ss3 = vec_sr((uint64x2_t)s3, (uint64x2_t)vec_splats(28)); - uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((ulong64_t)0xff)); - uint64x2_t s4 = vec_or((uint64x2_t)ss3, res_and3); - - uint64x2_t ss4 = vec_sld((uint64x2_t) vec_splats(0), s4, 9); - uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((ulong64_t)0xff)); - uint64x2_t s5 = vec_or((uint64x2_t)ss4, res_and4); - - return s5[0]; + uint8x16_t bitmask = vec_gb( u.u8x16[0]); + static uint8x16_t perm = { 16, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + bitmask = (uint8x16_t) vec_perm(vec_splat_u8(0), bitmask, perm); + u32 movemask; + vec_ste((uint32x4_t) bitmask, 0, &movemask); + return movemask; } template <> @@ -248,35 +296,35 @@ template <> template really_inline SuperVector<16> SuperVector<16>::vshl_8_imm() const { - return { (m128) vec_sl(u.s8x16[0], vec_splats((uint8_t)N)) }; + return { vec_sl(u.s8x16[0], vec_splat_u8(N)) }; } template <> template really_inline SuperVector<16> SuperVector<16>::vshl_16_imm() const { - return { (m128) vec_sl(u.s16x8[0], vec_splats((uint16_t)N)) }; + return { vec_sl(u.s16x8[0], vec_splat_u16(N)) }; } template <> template really_inline SuperVector<16> SuperVector<16>::vshl_32_imm() const { - return { (m128) vec_sl(u.s32x4[0], vec_splats((uint32_t)N)) }; + return { vec_sl(u.s32x4[0], vec_splat_u32(N)) }; } template <> template really_inline SuperVector<16> SuperVector<16>::vshl_64_imm() const { - return { (m128) vec_sl(u.s64x2[0], vec_splats((ulong64_t)N)) }; + return { vec_sl(u.s64x2[0], vec_splats((ulong64_t) N)) }; } template <> template really_inline SuperVector<16> SuperVector<16>::vshl_128_imm() const { - return { (m128) vec_sld(u.s8x16[0], (int8x16_t)vec_splat_s8(0), N)}; + return { vec_sld(u.s8x16[0], vec_splat_s8(0), N)}; } template <> @@ -290,35 +338,35 @@ template <> template really_inline SuperVector<16> SuperVector<16>::vshr_8_imm() const { - return { (m128) vec_sr(u.s8x16[0], vec_splats((uint8_t)N)) }; + return { vec_sr(u.s8x16[0], vec_splat_u8(N)) }; } template <> template really_inline SuperVector<16> SuperVector<16>::vshr_16_imm() const { - return { (m128) vec_sr(u.s16x8[0], vec_splats((uint16_t)N)) }; + return { vec_sr(u.s16x8[0], vec_splat_u16(N)) }; } template <> template really_inline SuperVector<16> SuperVector<16>::vshr_32_imm() const { - return { (m128) vec_sr(u.s32x4[0], vec_splats((uint32_t)N)) }; + return { vec_sr(u.s32x4[0], vec_splat_u32(N)) }; } template <> template really_inline SuperVector<16> SuperVector<16>::vshr_64_imm() const { - return { (m128) vec_sr(u.s64x2[0], vec_splats((ulong64_t)N)) }; + return { vec_sr(u.s64x2[0], vec_splats((ulong64_t)N)) }; } template <> template really_inline SuperVector<16> SuperVector<16>::vshr_128_imm() const { - return { (m128) vec_sld((int8x16_t)vec_splat_s8(0), u.s8x16[0], 16 - N) }; + return { vec_sld(vec_splat_s8(0), u.s8x16[0], 16 - N) }; } template <> @@ -348,50 +396,40 @@ template <> really_inline SuperVector<16> SuperVector<16>::vshl_8 (uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s8x16[0], vec_splats((uint8_t)n))}; }); - return result; + uint8x16_t shift_indices = vec_splats((uint8_t) N); + return { vec_sl(u.u8x16[0], shift_indices) }; } template <> really_inline SuperVector<16> SuperVector<16>::vshl_16 
(uint8_t const UNUSED N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s16x8[0], vec_splats((uint16_t)n))}; }); - return result; + uint16x8_t shift_indices = vec_splats((uint16_t) N); + return { vec_sl(u.u16x8[0], shift_indices) }; } template <> really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s32x4[0], vec_splats((uint32_t)n))}; }); - return result; + uint32x4_t shift_indices = vec_splats((uint32_t) N); + return { vec_sl(u.u32x4[0], shift_indices) }; } template <> really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s64x2[0], vec_splats((ulong64_t)n))}; }); - return result; + uint64x2_t shift_indices = vec_splats((ulong64_t) N); + return { vec_sl(u.u64x2[0], shift_indices) }; } template <> really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sld(v->u.s8x16[0], (int8x16_t)vec_splat_s8(0), n)}; }); - return result; + SuperVector sl{N << 3}; + return { vec_slo(u.u8x16[0], sl.u.u8x16[0]) }; } template <> @@ -404,50 +442,40 @@ template <> really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s8x16[0], vec_splats((uint8_t)n))}; }); - return result; + uint8x16_t shift_indices = vec_splats((uint8_t) N); + return { vec_sr(u.u8x16[0], shift_indices) }; } template <> really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s16x8[0], vec_splats((uint16_t)n))}; }); - return result; + uint16x8_t shift_indices = vec_splats((uint16_t) N); + return { vec_sr(u.u16x8[0], shift_indices) }; } template <> really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s32x4[0], vec_splats((uint32_t)n))}; }); - return result; + uint32x4_t shift_indices = vec_splats((uint32_t) N); + return { vec_sr(u.u32x4[0], shift_indices) }; } template <> really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s64x2[0], vec_splats((ulong64_t)n))}; }); - return result; + 
uint64x2_t shift_indices = vec_splats((ulong64_t) N); + return { vec_sr(u.u64x2[0], shift_indices) }; } template <> -really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const UNUSED N) const +really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sld((int8x16_t)vec_splat_u8(0), v->u.s8x16[0], 16 - n)}; }); - return result; + SuperVector sr{N << 3}; + return { vec_sro(u.u8x16[0], sr.u.u8x16[0]) }; } template <> @@ -459,51 +487,25 @@ really_inline SuperVector<16> SuperVector<16>::vshr(uint8_t const N) const template <> really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const { - switch(N) { - case 1: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 15)}; break; - case 2: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 14)}; break; - case 3: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 13)}; break; - case 4: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 12)}; break; - case 5: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 11)}; break; - case 6: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 10)}; break; - case 7: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 9)}; break; - case 8: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 8)}; break; - case 9: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 7)}; break; - case 10: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 6)}; break; - case 11: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 5)}; break; - case 12: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 4)}; break; - case 13: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 3)}; break; - case 14: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 2)}; break; - case 15: return {(m128) vec_sld((int8x16_t) vec_splat_s8(0), u.s8x16[0], 1)}; break; - case 16: return Zeroes(); break; - default: break; +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (N == 0) return *this; + if (__builtin_constant_p(N)) { + return { vec_sld(vec_splat_s8(0), u.s8x16[0], 16 - N) }; } - return *this; +#endif + return vshr_128(N); } template <> really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const { - switch(N) { - case 1: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 1)}; break; - case 2: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 2)}; break; - case 3: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 3)}; break; - case 4: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 4)}; break; - case 5: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 5)}; break; - case 6: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 6)}; break; - case 7: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 7)}; break; - case 8: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 8)}; break; - case 9: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 9)}; break; - case 10: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 10)}; break; - case 11: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 11)}; break; - case 12: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 
12)}; break; - case 13: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 13)}; break; - case 14: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 14)}; break; - case 15: return {(m128) vec_sld(u.s8x16[0], (int8x16_t) vec_splat_s8(0), 15)}; break; - case 16: return Zeroes(); break; - default: break; +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (N == 0) return *this; + if (__builtin_constant_p(N)) { + return { vec_sld(u.s8x16[0], vec_splat_s8(0), N)}; } - return *this; +#endif + return vshl_128(N); } template<> @@ -521,50 +523,39 @@ really_inline SuperVector<16> SuperVector<16>::Ones_vshl(uint8_t const N) template <> really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr) { - return (m128) vec_xl(0, (const long64_t*)ptr); + return { vec_xl(0, (const long64_t*)ptr) }; } template <> really_inline SuperVector<16> SuperVector<16>::load(void const *ptr) { assert(ISALIGNED_N(ptr, alignof(SuperVector::size))); - return (m128) vec_xl(0, (const long64_t*)ptr); + return { vec_xl(0, (const long64_t*)ptr) }; } template <> really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len) { SuperVector<16> mask = Ones_vshr(16 -len); - mask.print8("mask"); SuperVector<16> v = loadu(ptr); - v.print8("v"); return mask & v; } template<> really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset) { - - switch(offset) { - case 0: return other; break; - case 1: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 15)}; break; - case 2: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 14)}; break; - case 3: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 13)}; break; - case 4: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 12)}; break; - case 5: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 11)}; break; - case 6: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 10)}; break; - case 7: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 9)}; break; - case 8: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 8)}; break; - case 9: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 7)}; break; - case 10: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 6)}; break; - case 11: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 5)}; break; - case 12: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 4)}; break; - case 13: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 3)}; break; - case 14: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 2)}; break; - case 15: return {(m128) vec_sld(u.s8x16[0], other.u.s8x16[0], 1)}; break; - default: break; + if (offset == 0) return other; + if (offset == 16) return *this; +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(offset)) { + return { vec_sld(u.s8x16[0], other.u.s8x16[0], offset) }; } - return *this; +#endif + uint8x16_t sl = vec_splats((uint8_t) (offset << 3)); + uint8x16_t sr = vec_splats((uint8_t) ((16 - offset) << 3)); + uint8x16_t rhs = vec_slo(u.u8x16[0], sr); + uint8x16_t lhs = vec_sro(other.u.u8x16[0], sl); + return { vec_or(lhs, rhs) }; } template<> @@ -574,9 +565,9 @@ really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b) /* On Intel, if bit 0x80 is set, then result is zero, otherwise which the lane it is &0xf. In NEON or PPC, if >=16, then the result is zero, otherwise it is that lane. below is the version that is converted from Intel to PPC. 
*/ - uint8x16_t mask =(uint8x16_t)vec_cmpge(b.u.u8x16[0], (uint8x16_t)vec_splats((uint8_t)0x80)); + uint8x16_t mask =(uint8x16_t)vec_cmpge(b.u.u8x16[0], vec_splats((uint8_t)0x80)); uint8x16_t res = vec_perm (u.u8x16[0], u.u8x16[0], b.u.u8x16[0]); - return (m128) vec_sel(res, (uint8x16_t)vec_splat_s8(0), mask); + return { vec_sel(res, vec_splat_u8(0), mask) }; } template<> diff --git a/src/util/supervector/supervector.hpp b/src/util/supervector/supervector.hpp index 5d066c1ab..fef5f09f6 100644 --- a/src/util/supervector/supervector.hpp +++ b/src/util/supervector/supervector.hpp @@ -177,13 +177,13 @@ class SuperVector : public BaseVector #if defined(ARCH_ARM32) || defined(ARCH_AARCH64) || defined(ARCH_PPC64EL) uint64x2_t ALIGN_ATTR(BaseVector<16>::size) u64x2[SIZE / BaseVector<16>::size]; - int64x2_t ALIGN_ATTR(BaseVector<16>::size) s64x2[SIZE / BaseVector<16>::size]; + int64x2_t ALIGN_ATTR(BaseVector<16>::size) s64x2[SIZE / BaseVector<16>::size]; uint32x4_t ALIGN_ATTR(BaseVector<16>::size) u32x4[SIZE / BaseVector<16>::size]; - int32x4_t ALIGN_ATTR(BaseVector<16>::size) s32x4[SIZE / BaseVector<16>::size]; + int32x4_t ALIGN_ATTR(BaseVector<16>::size) s32x4[SIZE / BaseVector<16>::size]; uint16x8_t ALIGN_ATTR(BaseVector<16>::size) u16x8[SIZE / BaseVector<16>::size]; - int16x8_t ALIGN_ATTR(BaseVector<16>::size) s16x8[SIZE / BaseVector<16>::size]; + int16x8_t ALIGN_ATTR(BaseVector<16>::size) s16x8[SIZE / BaseVector<16>::size]; uint8x16_t ALIGN_ATTR(BaseVector<16>::size) u8x16[SIZE / BaseVector<16>::size]; - int8x16_t ALIGN_ATTR(BaseVector<16>::size) s8x16[SIZE / BaseVector<16>::size]; + int8x16_t ALIGN_ATTR(BaseVector<16>::size) s8x16[SIZE / BaseVector<16>::size]; #endif uint64_t u64[SIZE / sizeof(uint64_t)]; @@ -204,7 +204,7 @@ class SuperVector : public BaseVector SuperVector(typename base_type::type const v); template - SuperVector(T other); + SuperVector(T const other); SuperVector(SuperVector const lo, SuperVector const hi); SuperVector(previous_type const lo, previous_type const hi); diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp index 69f1a64c3..c5cfec7b6 100644 --- a/unit/internal/simd_utils.cpp +++ b/unit/internal/simd_utils.cpp @@ -723,10 +723,59 @@ TEST(SimdUtilsTest, set2x128) { } #endif +#define TEST_LSHIFTBYTE128(v1, buf, l) { \ + m128 v_shifted = lshiftbyte_m128(v1, l); \ + storeu128(res, v_shifted); \ + int i; \ + for (i=0; i < l; i++) { \ + assert(res[i] == 0); \ + } \ + for (; i < 16; i++) { \ + assert(res[i] == vec[i - l]); \ + } \ + } + +TEST(SimdUtilsTest, lshiftbyte128){ + u8 vec[16]; + u8 res[16]; + for (int i=0; i<16; i++) { + vec[i]=i; + } + m128 v1 = loadu128(vec); + for (int j = 0; j<16; j++){ + TEST_LSHIFTBYTE128(v1, vec, j); + } +} + +#define TEST_RSHIFTBYTE128(v1, buf, l) { \ + m128 v_shifted = rshiftbyte_m128(v1, l); \ + storeu128(res, v_shifted); \ + int i; \ + for (i=15; i >= 16 - l; i--) { \ + assert(res[i] == 0); \ + } \ + for (; i >= 0; i--) { \ + assert(res[i] == vec[i + l]); \ + } \ + } + +TEST(SimdUtilsTest, rshiftbyte128){ + u8 vec[16]; + u8 res[16]; + for (int i=0; i<16; i++) { + vec[i]=i; + } + m128 v1 = loadu128(vec); + for (int j = 0; j<16; j++){ + TEST_RSHIFTBYTE128(v1, vec, j); + } +} + TEST(SimdUtilsTest, variableByteShift128) { char base[] = "0123456789ABCDEF"; m128 in = loadu128(base); + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 0), variable_byte_shift_m128(in, 0))); EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 1), @@ -773,7 +822,7 @@ TEST(SimdUtilsTest, variableByteShift128) { EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 10), 
variable_byte_shift_m128(in, 10))); - EXPECT_TRUE(!diff128(zeroes128(), variable_byte_shift_m128(in, 16))); + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 15), variable_byte_shift_m128(in, 15))); EXPECT_TRUE(!diff128(zeroes128(), variable_byte_shift_m128(in, -16))); }
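Note (reviewer sketch, not part of the patch): the new lshiftbyte128/rshiftbyte128 tests and the extended variable_byte_shift_m128 checks all rely on the same byte-wise shift semantics. A scalar model of what the assertions encode, with a hypothetical helper name and bytes in little-endian lane order:

#include <stdint.h>
#include <string.h>

/* amount > 0 behaves like lshiftbyte_m128 (bytes move to higher indices),
 * amount < 0 behaves like rshiftbyte_m128 (bytes move to lower indices),
 * and bytes shifted in are zero. */
static void byte_shift_ref(const uint8_t in[16], int amount, uint8_t out[16]) {
    memset(out, 0, 16);
    for (int i = 0; i < 16; i++) {
        int src = i - amount; /* source byte index for output byte i */
        if (src >= 0 && src < 16) {
            out[i] = in[src];
        }
    }
}

TEST_LSHIFTBYTE128 and TEST_RSHIFTBYTE128 check exactly these index relations (res[i] == vec[i - l] after a left shift, res[i] == vec[i + l] after a right shift), and variable_byte_shift_m128 is expected to reduce to the corresponding byte shift for positive and negative amounts.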