diff --git a/flate/fast_encoder.go b/flate/fast_encoder.go index 433977767..05c27b822 100644 --- a/flate/fast_encoder.go +++ b/flate/fast_encoder.go @@ -7,6 +7,7 @@ package flate import ( "fmt" + "math/bits" "github.com/klauspost/compress/internal/le" ) @@ -150,13 +151,34 @@ func (e *fastGen) matchlen(s, t int32, src []byte) int32 { panic(fmt.Sprint(s, "-", t, "(", s-t, ") > maxMatchLength (", maxMatchOffset, ")")) } } - s1 := int(s) + maxMatchLength - 4 - if s1 > len(src) { - s1 = len(src) + s1 := int32(s) + maxMatchLength - 4 + if s1 > int32(len(src)) { + s1 = int32(len(src)) } + left := s1 - s + n := 0 + for left >= 8 { + diff := le.Load64(src, s) ^ le.Load64(src, t) + if diff != 0 { + return int32(n + bits.TrailingZeros64(diff)>>3) + } + s += 8 + t += 8 + n += 8 + left -= 8 + } + + a := src[s:s1] + b := src[t:] + for i := range a { + if a[i] != b[i] { + break + } + n++ + } + return int32(n) // Extend the match to be as long as possible. - return int32(matchLen(src[s:s1], src[t:])) } // matchlenLong will return the match length between offsets and t in src. @@ -177,7 +199,28 @@ func (e *fastGen) matchlenLong(s, t int32, src []byte) int32 { } } // Extend the match to be as long as possible. - return int32(matchLen(src[s:], src[t:])) + left := int32(len(src)) - s + n := int32(0) + for left >= 8 { + diff := le.Load64(src, s) ^ le.Load64(src, t) + if diff != 0 { + return n + int32(bits.TrailingZeros64(diff)>>3) + } + s += 8 + t += 8 + n += 8 + left -= 8 + } + + a := src[s:] + b := src[t:] + for i := range a { + if a[i] != b[i] { + break + } + n++ + } + return n } // Reset the encoding table. 
diff --git a/flate/level1.go b/flate/level1.go index 61854a352..ec721d859 100644 --- a/flate/level1.go +++ b/flate/level1.go @@ -2,9 +2,6 @@ package flate import ( "fmt" - "math/bits" - - "github.com/klauspost/compress/internal/le" ) // fastGen maintains the table for matches, @@ -122,32 +119,7 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { // Extend the 4-byte match as long as possible. t := candidate.offset - e.cur - var l = int32(4) - if false { - l = e.matchlenLong(s+4, t+4, src) + 4 - } else { - // inlined: - a := src[s:] - b := src[t:] - left := len(a) - 4 - for left >= 8 { - if diff := le.Load64(a, l) ^ le.Load64(b, l); diff != 0 { - l += int32(bits.TrailingZeros64(diff) >> 3) - goto endMatch - } - l += 8 - left -= 8 - } - a = a[l:] - b = b[l:] - for i := range a { - if a[i] != b[i] { - break - } - l++ - } - endMatch: - } + l := e.matchlenLong(s+4, t+4, src) + 4 // Extend backwards for t > 0 && s > nextEmit && src[t-1] == src[s-1] { diff --git a/flate/matchlen_amd64.go b/flate/matchlen_amd64.go deleted file mode 100644 index 4bd388584..000000000 --- a/flate/matchlen_amd64.go +++ /dev/null @@ -1,16 +0,0 @@ -//go:build amd64 && !appengine && !noasm && gc -// +build amd64,!appengine,!noasm,gc - -// Copyright 2019+ Klaus Post. All rights reserved. -// License information can be found in the LICENSE file. - -package flate - -// matchLen returns how many bytes match in a and b -// -// It assumes that: -// -// len(a) <= len(b) and len(a) > 0 -// -//go:noescape -func matchLen(a []byte, b []byte) int diff --git a/flate/matchlen_amd64.s b/flate/matchlen_amd64.s deleted file mode 100644 index 0782b86e3..000000000 --- a/flate/matchlen_amd64.s +++ /dev/null @@ -1,66 +0,0 @@ -// Copied from S2 implementation. 
- -//go:build !appengine && !noasm && gc && !noasm - -#include "textflag.h" - -// func matchLen(a []byte, b []byte) int -TEXT ·matchLen(SB), NOSPLIT, $0-56 - MOVQ a_base+0(FP), AX - MOVQ b_base+24(FP), CX - MOVQ a_len+8(FP), DX - - // matchLen - XORL SI, SI - CMPL DX, $0x08 - JB matchlen_match4_standalone - -matchlen_loopback_standalone: - MOVQ (AX)(SI*1), BX - XORQ (CX)(SI*1), BX - JZ matchlen_loop_standalone - -#ifdef GOAMD64_v3 - TZCNTQ BX, BX -#else - BSFQ BX, BX -#endif - SHRL $0x03, BX - LEAL (SI)(BX*1), SI - JMP gen_match_len_end - -matchlen_loop_standalone: - LEAL -8(DX), DX - LEAL 8(SI), SI - CMPL DX, $0x08 - JAE matchlen_loopback_standalone - -matchlen_match4_standalone: - CMPL DX, $0x04 - JB matchlen_match2_standalone - MOVL (AX)(SI*1), BX - CMPL (CX)(SI*1), BX - JNE matchlen_match2_standalone - LEAL -4(DX), DX - LEAL 4(SI), SI - -matchlen_match2_standalone: - CMPL DX, $0x02 - JB matchlen_match1_standalone - MOVW (AX)(SI*1), BX - CMPW (CX)(SI*1), BX - JNE matchlen_match1_standalone - LEAL -2(DX), DX - LEAL 2(SI), SI - -matchlen_match1_standalone: - CMPL DX, $0x01 - JB gen_match_len_end - MOVB (AX)(SI*1), BL - CMPB (CX)(SI*1), BL - JNE gen_match_len_end - INCL SI - -gen_match_len_end: - MOVQ SI, ret+48(FP) - RET diff --git a/flate/matchlen_generic.go b/flate/matchlen_generic.go index 8c840f9b4..6149384aa 100644 --- a/flate/matchlen_generic.go +++ b/flate/matchlen_generic.go @@ -1,6 +1,3 @@ -//go:build !amd64 || appengine || !gc || noasm -// +build !amd64 appengine !gc noasm - // Copyright 2019+ Klaus Post. All rights reserved. // License information can be found in the LICENSE file.