VSX optimizations #119

Merged: 15 commits, Sep 8, 2022

Changes from all commits:

24 changes: 3 additions & 21 deletions src/util/arch/arm/simd_utils.h
@@ -53,24 +53,6 @@

#include <string.h> // for memcpy

#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0
#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0
#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8

/** \brief LUT for the mask1bit functions. */
ALIGN_CL_DIRECTIVE static const u8 simd_onebit_masks[] = {
ZEROES_32, ZEROES_32,
ZEROES_31, 0x01, ZEROES_32,
ZEROES_31, 0x02, ZEROES_32,
ZEROES_31, 0x04, ZEROES_32,
ZEROES_31, 0x08, ZEROES_32,
ZEROES_31, 0x10, ZEROES_32,
ZEROES_31, 0x20, ZEROES_32,
ZEROES_31, 0x40, ZEROES_32,
ZEROES_31, 0x80, ZEROES_32,
ZEROES_32, ZEROES_32,
};

static really_inline m128 ones128(void) {
return (m128) vdupq_n_s8(0xFF);
}
@@ -595,9 +577,9 @@ m128 variable_byte_shift_m128(m128 in, s32 amount) {
static really_inline
m128 mask1bit128(unsigned int n) {
assert(n < sizeof(m128) * 8);
u32 mask_idx = ((n % 8) * 64) + 95;
mask_idx -= n / 8;
return loadu128(&simd_onebit_masks[mask_idx]);
static m128 onebit = { 1, 0 };
m128 mask = lshiftbyte_m128( onebit, n / 8 );
return lshift64_m128( mask, n % 8 );
}

// switches on bit N in the given vector.
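
Note on the ARM change above: it retires a 640-byte lookup table in favor of two shifts, moving a seeded 1-bit left by n / 8 bytes and then n % 8 bits. A minimal scalar sketch (not PR code) of why the old LUT indexing and the new shift construction agree:

/* Scalar sketch, not PR code: rebuild the retired simd_onebit_masks
 * layout and check that a 16-byte load at
 * mask_idx = (n % 8) * 64 + 95 - n / 8 equals a vector with bit n % 8
 * set in byte n / 8, which is what the new shift pair computes. */
#include <assert.h>
#include <stdint.h>
#include <string.h>

static uint8_t lut[64 * 10]; /* ZEROES_32 x2, 8 entries, ZEROES_32 x2 */

int main(void) {
    for (unsigned bit = 0; bit < 8; bit++) {
        /* each entry: 31 zeros, the mask byte, 32 zeros = 64 bytes */
        lut[64 * (bit + 1) + 31] = (uint8_t)(1u << bit);
    }
    for (unsigned n = 0; n < 128; n++) {
        uint32_t mask_idx = ((n % 8) * 64) + 95 - n / 8;
        uint8_t loaded[16];
        memcpy(loaded, &lut[mask_idx], 16);        /* old: loadu128 from LUT */
        uint8_t shifted[16] = {0};
        shifted[n / 8] = (uint8_t)(1u << (n % 8)); /* new: byte+bit shift */
        assert(memcmp(loaded, shifted, 16) == 0);
    }
    return 0;
}

The shift form trades a multi-cache-line table and an unaligned load for two cheap vector shifts.
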
20 changes: 20 additions & 0 deletions src/util/arch/common/simd_utils.h
@@ -88,6 +88,26 @@ static inline void print_m128_2x64(const char *label, m128 vec) {
#define print_m128_2x64(label, vec) ;
#endif

#if !defined(ARCH_IA32) && !defined(ARCH_X86_64)
#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0
#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0
#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8

/** \brief LUT for the mask1bit functions. */
ALIGN_CL_DIRECTIVE static const u8 simd_onebit_masks[] = {
ZEROES_32, ZEROES_32,
ZEROES_31, 0x01, ZEROES_32,
ZEROES_31, 0x02, ZEROES_32,
ZEROES_31, 0x04, ZEROES_32,
ZEROES_31, 0x08, ZEROES_32,
ZEROES_31, 0x10, ZEROES_32,
ZEROES_31, 0x20, ZEROES_32,
ZEROES_31, 0x40, ZEROES_32,
ZEROES_31, 0x80, ZEROES_32,
ZEROES_32, ZEROES_32,
};
#endif // !defined(ARCH_IA32) && !defined(ARCH_X86_64)

/****
**** 256-bit Primitives
****/
143 changes: 36 additions & 107 deletions src/util/arch/ppc64el/simd_utils.h
@@ -54,34 +54,6 @@ typedef __vector signed char int8x16_t;

typedef unsigned long long int ulong64_t;
typedef signed long long int long64_t;
/*
typedef __vector uint64_t uint64x2_t;
typedef __vector int64_t int64x2_t;
typedef __vector uint32_t uint32x4_t;
typedef __vector int32_t int32x4_t;
typedef __vector uint16_t uint16x8_t;
typedef __vector int16_t int16x8_t;
typedef __vector uint8_t uint8x16_t;
typedef __vector int8_t int8x16_t;*/


#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0
#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0
#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8

/** \brief LUT for the mask1bit functions. */
ALIGN_CL_DIRECTIVE static const u8 simd_onebit_masks[] = {
ZEROES_32, ZEROES_32,
ZEROES_31, 0x01, ZEROES_32,
ZEROES_31, 0x02, ZEROES_32,
ZEROES_31, 0x04, ZEROES_32,
ZEROES_31, 0x08, ZEROES_32,
ZEROES_31, 0x10, ZEROES_32,
ZEROES_31, 0x20, ZEROES_32,
ZEROES_31, 0x40, ZEROES_32,
ZEROES_31, 0x80, ZEROES_32,
ZEROES_32, ZEROES_32,
};

static really_inline m128 ones128(void) {
return (m128) vec_splat_u8(-1);
@@ -115,10 +87,6 @@ static really_inline u32 diffrich128(m128 a, m128 b) {
m128 mask = (m128) vec_cmpeq(a, b); // _mm_cmpeq_epi32 (a, b);
mask = vec_and(not128(mask), movemask);
m128 sum = vec_sums(mask, zeroes128());
//sum = vec_sld(zeroes128(), sum, 4);
//s32 ALIGN_ATTR(16) x;
//vec_ste(sum, 0, &x);
//return x; // it could be ~(movemask_128(mask)) & 0x;
return sum[3];
}
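
For context on the surviving return: vec_sums adds the four signed 32-bit lanes into lane 3, and assuming movemask (set up above this hunk) holds 1 << i in lane i, as the x86 version's semantics imply, sum[3] comes out as a bitmap of the differing lanes. A scalar model of that behavior, as an illustration only:

#include <assert.h>
#include <stdint.h>

static unsigned diffrich_model(const uint32_t a[4], const uint32_t b[4]) {
    unsigned sum = 0;
    for (int i = 0; i < 4; i++) {
        if (a[i] != b[i]) {          /* not128(vec_cmpeq(a, b)) */
            sum += 1u << i;          /* vec_and with movemask, vec_sums */
        }
    }
    return sum;                      /* the sum[3] the function returns */
}

int main(void) {
    uint32_t a[4] = {1, 2, 3, 4};
    uint32_t b[4] = {1, 9, 3, 9};
    assert(diffrich_model(a, b) == 0xA); /* lanes 1 and 3 differ */
    return 0;
}
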

@@ -131,10 +99,6 @@ static really_inline u32 diffrich64_128(m128 a, m128 b) {
uint64x2_t mask = (uint64x2_t) vec_cmpeq((uint64x2_t)a, (uint64x2_t)b);
mask = (uint64x2_t) vec_and((uint64x2_t)not128((m128)mask), movemask);
m128 sum = vec_sums((m128)mask, zeroes128());
//sum = vec_sld(zeroes128(), sum, 4);
//s32 ALIGN_ATTR(16) x;
//vec_ste(sum, 0, &x);
//return x;
return sum[3];
}

@@ -150,46 +114,18 @@ m128 sub_2x64(m128 a, m128 b) {

static really_really_inline
m128 lshift_m128(m128 a, unsigned b) {
switch(b){
case 1: return vec_sld(a, zeroes128(), 1); break;
case 2: return vec_sld(a, zeroes128(), 2); break;
case 3: return vec_sld(a, zeroes128(), 3); break;
case 4: return vec_sld(a, zeroes128(), 4); break;
case 5: return vec_sld(a, zeroes128(), 5); break;
case 6: return vec_sld(a, zeroes128(), 6); break;
case 7: return vec_sld(a, zeroes128(), 7); break;
case 8: return vec_sld(a, zeroes128(), 8); break;
case 9: return vec_sld(a, zeroes128(), 9); break;
case 10: return vec_sld(a, zeroes128(), 10); break;
case 11: return vec_sld(a, zeroes128(), 11); break;
case 12: return vec_sld(a, zeroes128(), 12); break;
case 13: return vec_sld(a, zeroes128(), 13); break;
case 14: return vec_sld(a, zeroes128(), 14); break;
case 15: return vec_sld(a, zeroes128(), 15); break;
}
return a;
if (b == 0) return a;
m128 sl = (m128) vec_splats((uint8_t) (b << 3));
m128 result = (m128) vec_slo((uint8x16_t) a, (uint8x16_t) sl);
return result;
}

static really_really_inline
m128 rshift_m128(m128 a, unsigned b) {
switch(b){
case 1: return vec_sld(zeroes128(), a, 15); break;
case 2: return vec_sld(zeroes128(), a, 14); break;
case 3: return vec_sld(zeroes128(), a, 13); break;
case 4: return vec_sld(zeroes128(), a, 12); break;
case 5: return vec_sld(zeroes128(), a, 11); break;
case 6: return vec_sld(zeroes128(), a, 10); break;
case 7: return vec_sld(zeroes128(), a, 9); break;
case 8: return vec_sld(zeroes128(), a, 8); break;
case 9: return vec_sld(zeroes128(), a, 7); break;
case 10: return vec_sld(zeroes128(), a, 6); break;
case 11: return vec_sld(zeroes128(), a, 5); break;
case 12: return vec_sld(zeroes128(), a, 4); break;
case 13: return vec_sld(zeroes128(), a, 3); break;
case 14: return vec_sld(zeroes128(), a, 2); break;
case 15: return vec_sld(zeroes128(), a, 1); break;
}
return a;
if (b == 0) return a;
m128 sl = (m128) vec_splats((uint8_t) (b << 3));
m128 result = (m128) vec_sro((uint8x16_t) a, (uint8x16_t) sl);
return result;
}
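
The switch ladders are gone because vec_slo/vec_sro take their octet count from the shift-control vector rather than an immediate: the count sits in bits 6:3 of the control byte, hence the b << 3 encoding before the splat. A scalar sketch of that encoding (an illustration under stated assumptions; lane order modeled little-endian):

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Model of vec_slo: shift a 16-byte vector left by the octet count
 * encoded in bits 6:3 of the control byte. */
static void slo_model(uint8_t v[16], uint8_t ctrl) {
    unsigned octets = (ctrl >> 3) & 0xF;     /* decode the count */
    uint8_t out[16] = {0};
    for (unsigned i = 0; i + octets < 16; i++) {
        out[i + octets] = v[i];              /* "left" = higher lanes */
    }
    memcpy(v, out, 16);
}

int main(void) {
    uint8_t v[16] = {1, 2, 3, 4};
    slo_model(v, (uint8_t)(2 << 3));         /* b = 2, as in lshift_m128 */
    assert(v[0] == 0 && v[2] == 1 && v[3] == 2);
    return 0;
}

A data-dependent shift also removes the fifteen-way branch from helpers that sit on hot paths.
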

static really_really_inline
@@ -212,27 +148,13 @@ static really_inline m128 eq64_m128(m128 a, m128 b) {
return (m128) vec_cmpeq((uint64x2_t)a, (uint64x2_t)b);
}


static really_inline u32 movemask128(m128 a) {
uint8x16_t s1 = vec_sr((uint8x16_t)a, vec_splat_u8(7));

uint16x8_t ss = vec_sr((uint16x8_t)s1, vec_splat_u16(7));
uint16x8_t res_and = vec_and((uint16x8_t)s1, vec_splats((uint16_t)0xff));
uint16x8_t s2 = vec_or((uint16x8_t)ss, res_and);

uint32x4_t ss2 = vec_sr((uint32x4_t)s2, vec_splat_u32(14));
uint32x4_t res_and2 = vec_and((uint32x4_t)s2, vec_splats((uint32_t)0xff));
uint32x4_t s3 = vec_or((uint32x4_t)ss2, res_and2);

uint64x2_t ss3 = vec_sr((uint64x2_t)s3, (uint64x2_t)vec_splats(28));
uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((ulong64_t)0xff));
uint64x2_t s4 = vec_or((uint64x2_t)ss3, res_and3);

uint64x2_t ss4 = vec_sld((uint64x2_t)vec_splats(0), s4, 9);
uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((ulong64_t)0xff));
uint64x2_t s5 = vec_or((uint64x2_t)ss4, res_and4);

return s5[0];
static uint8x16_t perm = { 16, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
uint8x16_t bitmask = vec_gb((uint8x16_t) a);
bitmask = (uint8x16_t) vec_perm(vec_splat_u8(0), bitmask, perm);
u32 movemask;
vec_ste((uint32x4_t) bitmask, 0, &movemask);
return movemask;
}
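
The new movemask128 leans on vgbbd (vec_gb), which transposes the 8x8 bit matrix of each doubleword, so the sign bits of bytes 0..7 collect into a single byte per doubleword; the permute then packs those two bytes together and vec_ste stores the 32-bit result. A scalar model of the idea (an illustration; the exact vgbbd bit numbering is glossed over):

#include <assert.h>
#include <stdint.h>

/* Gather the MSB of each of 8 bytes into one byte, bit i from byte i. */
static uint8_t gather_msbs(const uint8_t b[8]) {
    uint8_t r = 0;
    for (int i = 0; i < 8; i++) {
        r |= (uint8_t)((b[i] >> 7) << i);
    }
    return r;
}

int main(void) {
    uint8_t v[16] = {0};
    v[1] = 0x80;                              /* MSB of byte 1 */
    v[9] = 0xFF;                              /* MSB of byte 9 */
    uint16_t mask = (uint16_t)(gather_msbs(v) | (gather_msbs(v + 8) << 8));
    assert(mask == ((1u << 1) | (1u << 9))); /* x86-style movemask */
    return 0;
}

This replaces the previous five-stage shift-and-OR reduction with one gather, one permute, and one store.
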

static really_inline m128 set1_16x8(u8 c) {
@@ -363,7 +285,6 @@ m128 loadbytes128(const void *ptr, unsigned int n) {
return a;
}


#define CASE_ALIGN_VECTORS(a, b, offset) case offset: return (m128)vec_sld((int8x16_t)(b), (int8x16_t)(a), (16 - offset)); break;

static really_really_inline
@@ -392,42 +313,50 @@ m128 palignr_imm(m128 r, m128 l, int offset) {

static really_really_inline
m128 palignr(m128 r, m128 l, int offset) {
#if defined(HS_OPTIMIZE)
// need a faster way to do this.
return palignr_imm(r, l, offset);
#else
return palignr_imm(r, l, offset);
if (offset == 0) return l;
if (offset == 16) return r;
#if defined(HAVE__BUILTIN_CONSTANT_P)
if (__builtin_constant_p(offset)) {
return (m128)vec_sld((int8x16_t)(r), (int8x16_t)(l), 16 - offset);
}
#endif
m128 sl = (m128) vec_splats((uint8_t) (offset << 3));
m128 sr = (m128) vec_splats((uint8_t) ((16 - offset) << 3));
m128 rhs = (m128) vec_slo((uint8x16_t) r, (uint8x16_t) sr);
m128 lhs = (m128) vec_sro((uint8x16_t) l, (uint8x16_t) sl);
return or128(lhs, rhs);
}

#undef CASE_ALIGN_VECTORS
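
With _mm_alignr_epi8 semantics, palignr(r, l, offset) drops the low offset bytes of the r:l concatenation, which in byte terms is (l >> offset) | (r << (16 - offset)); the new non-constant fallback builds exactly that from one vec_sro, one vec_slo, and an OR. A scalar sketch of the semantics (illustration only, little-endian byte order assumed):

#include <assert.h>
#include <stdint.h>
#include <string.h>

static void palignr_model(uint8_t out[16], const uint8_t r[16],
                          const uint8_t l[16], unsigned offset) {
    uint8_t cat[32];
    memcpy(cat, l, 16);            /* low half: l */
    memcpy(cat + 16, r, 16);       /* high half: r */
    memcpy(out, cat + offset, 16); /* == (l >> offset) | (r << (16 - offset)) */
}

int main(void) {
    uint8_t l[16], r[16], out[16];
    for (int i = 0; i < 16; i++) {
        l[i] = (uint8_t)i;
        r[i] = (uint8_t)(16 + i);
    }
    palignr_model(out, r, l, 5);
    assert(out[0] == 5 && out[10] == 15 && out[11] == 16);
    return 0;
}

The __builtin_constant_p branch keeps compile-time offsets on the single vec_sld instruction.
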

static really_really_inline
m128 rshiftbyte_m128(m128 a, unsigned b) {
return rshift_m128(a,b);
return palignr_imm(zeroes128(), a, b);
}

static really_really_inline
m128 lshiftbyte_m128(m128 a, unsigned b) {
return lshift_m128(a,b);
return palignr_imm(a, zeroes128(), 16 - b);
}

static really_inline
m128 variable_byte_shift_m128(m128 in, s32 amount) {
assert(amount >= -16 && amount <= 16);
if (amount < 0){
return palignr_imm(zeroes128(), in, -amount);
} else{
return palignr_imm(in, zeroes128(), 16 - amount);
if (amount < 0) {
return rshiftbyte_m128(in, -amount);
} else {
return lshiftbyte_m128(in, amount);
}
}

static really_inline
m128 mask1bit128(unsigned int n) {
assert(n < sizeof(m128) * 8);
u32 mask_idx = ((n % 8) * 64) + 95;
mask_idx -= n / 8;
return loadu128(&simd_onebit_masks[mask_idx]);
static uint64x2_t onebit = { 1, 0 };
m128 octets = (m128) vec_splats((uint8_t) ((n / 8) << 3));
m128 bits = (m128) vec_splats((uint8_t) ((n % 8)));
m128 mask = (m128) vec_slo((uint8x16_t) onebit, (uint8x16_t) octets);
return (m128) vec_sll((uint8x16_t) mask, (uint8x16_t) bits);
}
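
vec_sll can only shift by 0..7 bits (its count comes from the low three bits of the control vector), so the single-bit mask is built in two stages: vec_slo moves the seed bit by whole octets, then vec_sll moves it by the remainder. A scalar sketch of the same decomposition over a pair of 64-bit words (illustration, not PR code):

#include <assert.h>
#include <stdint.h>

typedef struct { uint64_t lo, hi; } u128;

static u128 shl128(u128 v, unsigned bits) {
    u128 r = {0, 0};
    if (bits >= 64) {
        r.hi = v.lo << (bits - 64);
    } else if (bits) {
        r.hi = (v.hi << bits) | (v.lo >> (64 - bits));
        r.lo = v.lo << bits;
    } else {
        r = v;
    }
    return r;
}

int main(void) {
    for (unsigned n = 0; n < 128; n++) {
        u128 onebit = {1, 0};
        /* octets first (vec_slo), then residual bits (vec_sll) */
        u128 m = shl128(shl128(onebit, (n / 8) * 8), n % 8);
        if (n < 64) assert(m.lo == (1ULL << n) && m.hi == 0);
        else        assert(m.hi == (1ULL << (n - 64)) && m.lo == 0);
    }
    return 0;
}
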

// switches on bit N in the given vector.
73 changes: 62 additions & 11 deletions src/util/arch/x86/simd_utils.h
@@ -165,8 +165,67 @@ m128 load_m128_from_u64a(const u64a *p) {
return _mm_set_epi64x(0LL, *p);
}

#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed)
#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed)
#define CASE_RSHIFT_VECTOR(a, count) case count: return _mm_srli_si128((m128)(a), (count)); break;

static really_inline
m128 rshiftbyte_m128(const m128 a, int count_immed) {
#if defined(HAVE__BUILTIN_CONSTANT_P)
if (__builtin_constant_p(count_immed)) {
return _mm_srli_si128(a, count_immed);
}
#endif
switch (count_immed) {
case 0: return a; break;
CASE_RSHIFT_VECTOR(a, 1);
CASE_RSHIFT_VECTOR(a, 2);
CASE_RSHIFT_VECTOR(a, 3);
CASE_RSHIFT_VECTOR(a, 4);
CASE_RSHIFT_VECTOR(a, 5);
CASE_RSHIFT_VECTOR(a, 6);
CASE_RSHIFT_VECTOR(a, 7);
CASE_RSHIFT_VECTOR(a, 8);
CASE_RSHIFT_VECTOR(a, 9);
CASE_RSHIFT_VECTOR(a, 10);
CASE_RSHIFT_VECTOR(a, 11);
CASE_RSHIFT_VECTOR(a, 12);
CASE_RSHIFT_VECTOR(a, 13);
CASE_RSHIFT_VECTOR(a, 14);
CASE_RSHIFT_VECTOR(a, 15);
default: return zeroes128(); break;
}
}
#undef CASE_RSHIFT_VECTOR

#define CASE_LSHIFT_VECTOR(a, count) case count: return _mm_slli_si128((m128)(a), (count)); break;

static really_inline
m128 lshiftbyte_m128(const m128 a, int count_immed) {
#if defined(HAVE__BUILTIN_CONSTANT_P)
if (__builtin_constant_p(count_immed)) {
return _mm_slli_si128(a, count_immed);
}
#endif
switch (count_immed) {
case 0: return a; break;
CASE_LSHIFT_VECTOR(a, 1);
CASE_LSHIFT_VECTOR(a, 2);
CASE_LSHIFT_VECTOR(a, 3);
CASE_LSHIFT_VECTOR(a, 4);
CASE_LSHIFT_VECTOR(a, 5);
CASE_LSHIFT_VECTOR(a, 6);
CASE_LSHIFT_VECTOR(a, 7);
CASE_LSHIFT_VECTOR(a, 8);
CASE_LSHIFT_VECTOR(a, 9);
CASE_LSHIFT_VECTOR(a, 10);
CASE_LSHIFT_VECTOR(a, 11);
CASE_LSHIFT_VECTOR(a, 12);
CASE_LSHIFT_VECTOR(a, 13);
CASE_LSHIFT_VECTOR(a, 14);
CASE_LSHIFT_VECTOR(a, 15);
default: return zeroes128(); break;
}
}
#undef CASE_LSHIFT_VECTOR
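
_mm_srli_si128 and _mm_slli_si128 demand a compile-time immediate, which the old function-like macros silently relied on; the new wrappers keep the single-instruction path when __builtin_constant_p proves the count constant and otherwise dispatch through a switch that supplies each immediate. A reduced sketch of the same pattern (assumes a GCC-compatible compiler with optimization enabled, as the HAVE__BUILTIN_CONSTANT_P guard implies):

#include <emmintrin.h>
#include <assert.h>

/* Four-case reduction of the PR's rshiftbyte_m128 pattern. */
static inline __m128i srli_bytes(__m128i a, int n) {
#if defined(__GNUC__)
    if (__builtin_constant_p(n)) {
        return _mm_srli_si128(a, n);   /* folds to a single psrldq */
    }
#endif
    switch (n) {                       /* runtime count: pick an immediate */
    case 0: return a;
    case 1: return _mm_srli_si128(a, 1);
    case 2: return _mm_srli_si128(a, 2);
    case 3: return _mm_srli_si128(a, 3);
    default: return _mm_setzero_si128();
    }
}

int main(void) {
    volatile int n = 2;                /* defeat constant propagation */
    __m128i v = _mm_set_epi32(0, 0, 0, 0x04030201);
    __m128i s = srli_bytes(v, n);
    assert((_mm_cvtsi128_si32(s) & 0xFFFF) == 0x0403);
    return 0;
}
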

#if defined(HAVE_SSE41)
#define extract32from128(a, imm) _mm_extract_epi32(a, imm)
@@ -255,14 +314,6 @@ m128 loadbytes128(const void *ptr, unsigned int n) {
memcpy(&a, ptr, n);
return a;
}
/*
#ifdef __cplusplus
extern "C" {
#endif
extern const u8 simd_onebit_masks[];
#ifdef __cplusplus
}
#endif*/

static really_inline
m128 mask1bit128(unsigned int n) {
Expand Down Expand Up @@ -330,6 +381,7 @@ m128 palignr_sw(m128 r, m128 l, int offset) {
break;
}
}
#undef CASE_ALIGN_VECTORS

static really_really_inline
m128 palignr(m128 r, m128 l, int offset) {
@@ -340,7 +392,6 @@ m128 palignr(m128 r, m128 l, int offset) {
#endif
return palignr_sw(r, l, offset);
}
#undef CASE_ALIGN_VECTORS

static really_inline
m128 variable_byte_shift_m128(m128 in, s32 amount) {
5 changes: 1 addition & 4 deletions src/util/bitfield.h
@@ -189,10 +189,7 @@ class bitfield {
size_t sum = 0;
size_t i = 0;
for (; i + 4 <= num_blocks; i += 4) {
sum += popcount64(bits[i]);
sum += popcount64(bits[i + 1]);
sum += popcount64(bits[i + 2]);
sum += popcount64(bits[i + 3]);
sum += popcount64x4(&bits[i]);
}
for (; i < num_blocks; i++) {
sum += popcount64(bits[i]);
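
The unrolled popcount loop now defers to popcount64x4, presumably defined alongside popcount64 in util/popcount.h by this PR so each platform can specialize it (that location is an assumption). A minimal portable sketch of what such a helper can look like:

#include <stdint.h>

/* Hypothetical 4-at-a-time popcount: four independent counts with no
 * loop-carried dependency, easy for the compiler to pipeline or
 * vectorize. */
static inline uint64_t popcount64x4_sketch(const uint64_t x[4]) {
    return (uint64_t)__builtin_popcountll(x[0]) +
           (uint64_t)__builtin_popcountll(x[1]) +
           (uint64_t)__builtin_popcountll(x[2]) +
           (uint64_t)__builtin_popcountll(x[3]);
}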