From 72efebc84fbca7e679d3f21f3be7b1871a2e2506 Mon Sep 17 00:00:00 2001
From: Sun Yimin
Date: Wed, 16 Oct 2024 09:55:33 +0800
Subject: [PATCH] ppc64x: decode #18

---
 base64_ppc64x.go      |  32 +++++++
 base64_ppc64x.s       | 215 ++++++++++++++++++++++++++++++++++++++++++
 base64_ppc64x_test.go |  83 ++++++++++++++++
 3 files changed, 330 insertions(+)

diff --git a/base64_ppc64x.go b/base64_ppc64x.go
index 6354ab5..2ad4f80 100644
--- a/base64_ppc64x.go
+++ b/base64_ppc64x.go
@@ -6,9 +6,15 @@
 
 package base64
 
 //go:noescape
 func encodeAsm(dst, src []byte, lut *[16]byte) int
 
+//go:noescape
+func decodeStdAsm(dst, src []byte) int
+
+//go:noescape
+func decodeUrlAsm(dst, src []byte) int
+
 func encode(enc *Encoding, dst, src []byte) {
 	if len(src) >= 16 && enc.lut != nil {
 		encoded := encodeAsm(dst, src, enc.lut)
@@ -21,5 +27,31 @@ func encode(enc *Encoding, dst, src []byte) {
 }
 
+// decode decodes src into dst. Inputs of at least 24 bytes that use the
+// standard or URL alphabet are first handed to the vector routine, which
+// returns the number of source bytes it left undecoded; decodeGeneric then
+// finishes that tail. A CorruptInputError from the tail is rebased so its
+// offset is relative to the original src.
 func decode(enc *Encoding, dst, src []byte) (int, error) {
+	srcLen := len(src)
+	if srcLen >= 24 {
+		remain := srcLen
+		if enc.lut == &encodeStdLut {
+			remain = decodeStdAsm(dst, src)
+		} else if enc.lut == &encodeURLLut {
+			remain = decodeUrlAsm(dst, src)
+		}
+		if remain < srcLen {
+			// the vector routine consumed a prefix of src
+			remain = srcLen - remain // remain is decoded length now
+			src = src[remain:]
+			dstStart := (remain / 4) * 3 // every 4 source bytes yield 3 output bytes
+			dst = dst[dstStart:]
+			n, err := decodeGeneric(enc, dst, src)
+			if cerr, ok := err.(CorruptInputError); ok {
+				return n + dstStart, CorruptInputError(int(cerr) + remain)
+			}
+			return n + dstStart, err
+		}
+	}
 	return decodeGeneric(enc, dst, src)
 }
diff --git a/base64_ppc64x.s b/base64_ppc64x.s
index 67c43ef..112c887 100644
--- a/base64_ppc64x.s
+++ b/base64_ppc64x.s
@@ -26,6 +26,32 @@ DATA base64_const<>+0x80(SB)/8, $0x1919191919191919 // range 0 end
 DATA base64_const<>+0x88(SB)/8, $0x1919191919191919 // range 0 end
 GLOBL base64_const<>(SB), (NOPTR+RODATA), $144
 
+DATA decode_const<>+0x00(SB)/8, $0x1010010204080408 // standard decode lut hi
+DATA decode_const<>+0x08(SB)/8, $0x1010101010101010
+DATA decode_const<>+0x10(SB)/8, $0x1511111111111111 // standard decode lut lo
+DATA decode_const<>+0x18(SB)/8, $0x1111131A1B1B1B1A
+DATA decode_const<>+0x20(SB)/8, $0x2F2F2F2F2F2F2F2F // standard decode mask
+DATA decode_const<>+0x28(SB)/8, $0x2F2F2F2F2F2F2F2F
+DATA decode_const<>+0x30(SB)/8, $0x00101304BFBFB9B9 // standard decode lut roll
+DATA decode_const<>+0x38(SB)/8, $0x0000000000000000
+DATA decode_const<>+0x40(SB)/8, $0x1010010204080428 // url decode lut hi
+DATA decode_const<>+0x48(SB)/8, $0x1010101010101010
+DATA decode_const<>+0x50(SB)/8, $0x1511111111111111 // url decode lut lo
+DATA decode_const<>+0x58(SB)/8, $0x1111131B1B1A1B33
+DATA decode_const<>+0x60(SB)/8, $0x5E5E5E5E5E5E5E5E // url decode mask
+DATA decode_const<>+0x68(SB)/8, $0x5E5E5E5E5E5E5E5E
+DATA decode_const<>+0x70(SB)/8, $0x00001104BFBFE0B9 // url decode lut roll
+DATA decode_const<>+0x78(SB)/8, $0xB900000000000000
+DATA decode_const<>+0x80(SB)/8, $0x4001400140014001 // decode reshuffling constant 0
+DATA decode_const<>+0x88(SB)/8, $0x4001400140014001
+DATA decode_const<>+0x90(SB)/8, $0x1000000110000001 // decode reshuffling constant 1
+DATA decode_const<>+0x98(SB)/8, $0x1000000110000001
+DATA decode_const<>+0xA0(SB)/8, $0x0A09070605030201 // decode reshuffling mask for ppc64le
+DATA decode_const<>+0xA8(SB)/8, $0x000000000F0E0D0B
+DATA decode_const<>+0xB0(SB)/8, $0x010203050607090A // decode reshuffling mask for ppc64
+DATA decode_const<>+0xB8(SB)/8, $0x0B0D0E0F00000000
+GLOBL decode_const<>(SB), (NOPTR+RODATA), $192
+
 #define REV_BYTES V0
 #define RESHUFFLE_MASK V1
 #define SHIFT_RIGHT_MASK V2
@@ -105,3 +131,192 @@ loop:
 done:
 	MOVD R7, ret+56(FP)
 	RET
+
+#undef RESHUFFLE_MASK
+#undef SHIFT_RIGHT_MASK
+#undef MULHI_MASK
+#undef SHIFT_LEFT_MASK
+#undef MULLO_MASK
+#undef RANGE1_END
+#undef RANGE0_END
+#undef LUT
+#undef X0
+#undef X1
+#undef X2
+
+#define NIBBLE_MASK V1
+#define LUT_HI V2
+#define LUT_LO V3
+#define DECODE_END V4
+#define LUT_ROLL V5
+#define RESHUFFLE_CONST0 V6
+#define RESHUFFLE_CONST1 V7
+#define RESHUFFLE_MASK V8
+#define FOUR V9
+
+#define X0 V10
+#define X1 V11
+#define X2 V12
+#define X3 V13
+#define ZERO V14
+
+//func decodeStdAsm(dst, src []byte) int
+TEXT ·decodeStdAsm(SB),NOSPLIT,$0
+	MOVD dst_base+0(FP), R4
+	MOVD src_base+24(FP), R5
+	MOVD src_len+32(FP), R6
+
+	// Load constants
+#ifdef GOARCH_ppc64le
+	MOVD $base64_const<>(SB), R8
+	LXVD2X (R8), REV_BYTES
+#endif
+	VSPLTISB $0, ZERO
+	VSPLTISB $0x4, FOUR
+	VSPLTISB $0x0F, NIBBLE_MASK
+	MOVD $decode_const<>(SB), R8
+	LXVD2X (R8), LUT_HI
+	MOVD $0x10, R9
+	LXVD2X (R8)(R9), LUT_LO
+	MOVD $0x20, R9
+	LXVD2X (R8)(R9), DECODE_END
+	MOVD $0x30, R9
+	LXVD2X (R8)(R9), LUT_ROLL
+	MOVD $0x80, R9
+	LXVD2X (R8)(R9), RESHUFFLE_CONST0
+	MOVD $0x90, R9
+	LXVD2X (R8)(R9), RESHUFFLE_CONST1
+#ifdef GOARCH_ppc64le
+	MOVD $0xA0, R9
+#else
+	MOVD $0xB0, R9
+#endif
+	LXVD2X (R8)(R9), RESHUFFLE_MASK
+
+	MOVD $0, R7
+	MOVD R7, R8
+loop:
+	// load 16 source bytes
+	LXVD2X (R5)(R7), X0
+#ifdef GOARCH_ppc64le
+	VPERM X0, X0, REV_BYTES, X0
+#endif
+	// validate input
+	VSRW(X0, FOUR, X1)
+	VAND(X1, NIBBLE_MASK, X1) // high nibble
+	VAND(X0, NIBBLE_MASK, X2)
+	VPERM LUT_HI, LUT_HI, X1, X3
+	VPERM LUT_LO, LUT_LO, X2, X2
+	VAND(X3, X2, X2)
+	VCMPEQUBCC X2, ZERO, X3
+	BGE CR6, done
+
+	// translate
+	VCMPEQUB X0, DECODE_END, X2
+	VADDUBM X1, X2, X1
+
+	VPERM LUT_ROLL, LUT_ROLL, X1, X1
+	VADDUBM X0, X1, X0
+
+	// PMADDUBSW
+	VMULEUB X0, RESHUFFLE_CONST0, X1
+	VMULOUB X0, RESHUFFLE_CONST0, X2
+	VADDUHM X1, X2, X0
+	// PMADDWD
+	VMULEUH X0, RESHUFFLE_CONST1, X1
+	VMULOUH X0, RESHUFFLE_CONST1, X2
+	VADDUWM X1, X2, X0
+
+	VPERM X0, X0, RESHUFFLE_MASK, X0
+	STXVD2X X0, (R4)(R8) // stores 16 bytes; dst offset only advances by 12
+
+	ADD $-16, R6
+	ADD $16, R7
+	ADD $12, R8
+	CMP R6, $24
+	BGE loop
+
+done:
+	MOVD R6, ret+48(FP) // number of source bytes left undecoded
+	RET
+
+//func decodeUrlAsm(dst, src []byte) int
+TEXT ·decodeUrlAsm(SB),NOSPLIT,$0
+	MOVD dst_base+0(FP), R4
+	MOVD src_base+24(FP), R5
+	MOVD src_len+32(FP), R6
+
+	// Load constants
+#ifdef GOARCH_ppc64le
+	MOVD $base64_const<>(SB), R8
+	LXVD2X (R8), REV_BYTES
+#endif
+	VSPLTISB $0, ZERO
+	VSPLTISB $0x4, FOUR
+	VSPLTISB $0x0F, NIBBLE_MASK
+	MOVD $decode_const<>(SB), R8
+	MOVD $0x40, R9
+	LXVD2X (R8)(R9), LUT_HI
+	MOVD $0x50, R9
+	LXVD2X (R8)(R9), LUT_LO
+	MOVD $0x60, R9
+	LXVD2X (R8)(R9), DECODE_END
+	MOVD $0x70, R9
+	LXVD2X (R8)(R9), LUT_ROLL
+	MOVD $0x80, R9
+	LXVD2X (R8)(R9), RESHUFFLE_CONST0
+	MOVD $0x90, R9
+	LXVD2X (R8)(R9), RESHUFFLE_CONST1
+#ifdef GOARCH_ppc64le
+	MOVD $0xA0, R9
+#else
+	MOVD $0xB0, R9
+#endif
+	LXVD2X (R8)(R9), RESHUFFLE_MASK
+
+	MOVD $0, R7
+	MOVD R7, R8
+loop:
+	// load 16 source bytes
+	LXVD2X (R5)(R7), X0
+#ifdef GOARCH_ppc64le
+	VPERM X0, X0, REV_BYTES, X0
+#endif
+	// validate input
+	VSRW(X0, FOUR, X1)
+	VAND(X1, NIBBLE_MASK, X1) // high nibble
+	VAND(X0, NIBBLE_MASK, X2)
+	VPERM LUT_HI, LUT_HI, X1, X3
+	VPERM LUT_LO, LUT_LO, X2, X2
+	VAND(X3, X2, X2)
+	VCMPEQUBCC X2, ZERO, X3
+	BGE CR6, done
+
+	// translate
+	VCMPGTUB X0, DECODE_END, X2
+	VSUBUBM X1, X2, X1
+
+	VPERM LUT_ROLL, LUT_ROLL, X1, X1
+	VADDUBM X0, X1, X0
+
+	// PMADDUBSW
+	VMULEUB X0, RESHUFFLE_CONST0, X1
+	VMULOUB X0, RESHUFFLE_CONST0, X2
+	VADDUHM X1, X2, X0
+	// PMADDWD
+	VMULEUH X0, RESHUFFLE_CONST1, X1
+	VMULOUH X0, RESHUFFLE_CONST1, X2
+	VADDUWM X1, X2, X0
+
+	VPERM X0, X0, RESHUFFLE_MASK, X0
+	STXVD2X X0, (R4)(R8) // stores 16 bytes; dst offset only advances by 12
+
+	ADD $-16, R6
+	ADD $16, R7
+	ADD $12, R8
+	CMP R6, $24
+	BGE loop
+
+done:
+	MOVD R6, ret+48(FP) // number of source bytes left undecoded
+	RET
diff --git a/base64_ppc64x_test.go b/base64_ppc64x_test.go
index 45c3863..4fd0de6 100644
--- a/base64_ppc64x_test.go
+++ b/base64_ppc64x_test.go
@@ -35,3 +35,86 @@ func TestStdEncodeAsm(t *testing.T) {
 
 	}
 }
+
+func TestStdDecodeSIMD(t *testing.T) {
+	pairs := []testpair{
+		{"abcdefghijkl", "YWJjZGVmZ2hpamtsYWJjZGVmZ2hpamts"},
+		{"\x2b\xf7\xcc\x27\x01\xfe\x43\x97\xb4\x9e\xbe\xed", "K/fMJwH+Q5e0nr7tK/fMJwH+Q5e0nr7t"},
+		{"abcdefghijklabcdefghijklabcdefghijkl", "YWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamtsYWJjZGVmZ2hpamts"},
+	}
+	for _, p := range pairs {
+		expected := []byte(p.decoded)
+		src := []byte(p.encoded)
+		dst := make([]byte, len(expected)+4) // room for the last 16-byte vector store
+
+		ret := decodeStdAsm(dst, src)
+		if ret == len(src) {
+			t.Errorf("should return decode")
+		}
+		if !bytes.Equal(dst[:len(expected)], expected) {
+			t.Errorf("got %x, expected %x", dst[:len(expected)], expected)
+		}
+	}
+}
+
+func TestURLEncodeSIMD(t *testing.T) {
+	pairs := []testpair{
+		{"!?$*&()'-=@~0000", "IT8kKiYoKSctPUB-"},
+		{"\x2b\xf7\xcc\x27\x01\xfe\x43\x97\xb4\x9e\xbe\xed\x5a\xcc\x70\x90", "K_fMJwH-Q5e0nr7t"},
+		{"!?$*&()'-=@~!?$*&()'-=@~0000", "IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-"},
+	}
+	for _, p := range pairs {
+		src := []byte(p.decoded)
+		expected := []byte(p.encoded)
+		dst := make([]byte, len(expected))
+
+		ret := encodeAsm(dst, src, &encodeURLLut)
+		if ret != len(expected) {
+			t.Errorf("should return %v", len(expected))
+		}
+		if !bytes.Equal(dst, expected) {
+			t.Errorf("got %v", string(dst))
+		}
+
+	}
+}
+
+func TestUrlDecodeSIMD(t *testing.T) {
+	pairs := []testpair{
+		{"!?$*&()'-=@~", "IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-"},
+		{"\x2b\xf7\xcc\x27\x01\xfe\x43\x97\xb4\x9e\xbe\xed", "K_fMJwH-Q5e0nr7tK_fMJwH-Q5e0nr7t"},
+		{"!?$*&()'-=@~!?$*&()'-=@~!?$*&()'-=@~", "IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-IT8kKiYoKSctPUB-"},
+	}
+	for _, p := range pairs {
+		expected := []byte(p.decoded)
+		src := []byte(p.encoded)
+		dst := make([]byte, len(expected)+4) // room for the last 16-byte vector store
+
+		ret := decodeUrlAsm(dst, src)
+		if ret == len(src) {
+			t.Errorf("should return decode")
+		}
+		if !bytes.Equal(dst[:len(expected)], expected) {
+			t.Errorf("got %x, expected %x", dst[:len(expected)], expected)
+		}
+	}
+}
+
+func BenchmarkEncode(b *testing.B) {
+	data := make([]byte, 8192)
+	dst := make([]byte, StdEncoding.EncodedLen(8192))
+	b.SetBytes(int64(len(data)))
+	for i := 0; i < b.N; i++ {
+		StdEncoding.Encode(dst, data)
+	}
+}
+
+func BenchmarkDecode(b *testing.B) {
+	data := []byte(StdEncoding.EncodeToString(make([]byte, 8192)))
+	dbuf := make([]byte, StdEncoding.DecodedLen(len(data)))
+	b.SetBytes(int64(len(data)))
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		StdEncoding.Decode(dbuf, data)
+	}
+}