
Implement SIMD-accelerated ImfZip::uncompress
The main bottleneck in ImfZip::uncompress appears not to be zlib but the
predictor & interleaving loops that run after zlib's decompression.
Fortunately, throughput in both of these loops can be improved with SIMD
operations.  Even though each trip of the predictor loop has data dependencies
on all previous values, the usual SIMD prefix-sum construction is able to
provide a significant speedup.

While the uses of SSSE3 and SSE4.1 are minor in this change and could
maybe be replaced with some slightly more complicated SSE2, SSE4.1 was
released in 2007, so it doesn't seem unreasonable to require it in 2017.
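
To make the prefix-sum construction concrete, here is a small standalone sketch (illustrative only, not part of this commit) of the same shift-and-add idea on a single 16-byte chunk of biased deltas; the main() driver and the all-ones test data are hypothetical, and only SSE2 intrinsics are used.

#include <emmintrin.h>   // SSE2 intrinsics
#include <cstdio>

int main()
{
    // Hypothetical input: 16 deltas, each stored with a +128 bias the way
    // the zip predictor writes them.  Here every delta is +1.
    unsigned char deltas[16];
    for (int i = 0; i < 16; ++i)
        deltas[i] = (unsigned char) (1 + 128);

    __m128i d = _mm_loadu_si128 ((const __m128i *) deltas);
    d = _mm_add_epi8 (d, _mm_set1_epi8 (-128));   // remove the bias

    // Four shift-and-add steps (log2 of 16): afterwards byte i holds the
    // sum of bytes 0..i, i.e. the running total the scalar loop computes.
    d = _mm_add_epi8 (d, _mm_slli_si128 (d, 1));
    d = _mm_add_epi8 (d, _mm_slli_si128 (d, 2));
    d = _mm_add_epi8 (d, _mm_slli_si128 (d, 4));
    d = _mm_add_epi8 (d, _mm_slli_si128 (d, 8));

    unsigned char out[16];
    _mm_storeu_si128 ((__m128i *) out, d);
    for (int i = 0; i < 16; ++i)
        std::printf ("%d ", out[i]);              // prints 1 2 3 ... 16
    std::printf ("\n");
    return 0;
}

Carrying the last byte of each chunk into the next one (as the committed code does with vPrev and _mm_shuffle_epi8) extends this per-chunk sum into a prefix sum over the whole buffer.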
John Loy authored and nickrasmussen committed Aug 8, 2018
1 parent e640952 commit 32f2aa5
Showing 2 changed files with 149 additions and 31 deletions.
9 changes: 9 additions & 0 deletions OpenEXR/IlmImf/ImfSimd.h
@@ -40,6 +40,7 @@
//
// Compile time SSE detection:
// IMF_HAVE_SSE2 - Defined if it's safe to compile SSE2 optimizations
// IMF_HAVE_SSE4_1 - Defined if it's safe to compile SSE4.1 optimizations
//


@@ -48,12 +49,20 @@
#define IMF_HAVE_SSE2 1
#endif

#if defined __SSE4_1__
#define IMF_HAVE_SSE4_1 1
#endif

extern "C"
{
#ifdef IMF_HAVE_SSE2
#include <emmintrin.h>
#include <mmintrin.h>
#endif

#ifdef IMF_HAVE_SSE4_1
#include <smmintrin.h>
#endif
}

#endif
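
Note that __SSE4_1__ is a compiler-provided macro: with GCC and Clang it is only defined when SSE4.1 code generation is enabled (for example via -msse4.1 or an -march value that implies it), so the new code path is opt-in at build time; other compilers may need an equivalent flag or may not define the macro at all.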
171 changes: 140 additions & 31 deletions OpenEXR/IlmImf/ImfZip.cpp
@@ -35,6 +35,7 @@
#include "ImfZip.h"
#include "ImfCheckedArithmetic.h"
#include "ImfNamespace.h"
#include "ImfSimd.h"
#include "Iex.h"

#include <math.h>
@@ -135,6 +136,130 @@ Imf::Zip::compress(const char *raw, int rawSize, char *compressed)
    return outSize;
}

#ifdef IMF_HAVE_SSE4_1

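// Undo the predictor (delta) encoding in place: 16 bytes per iteration via
// a shift-and-add prefix sum, carrying the running total across chunks in
// vPrev; any remaining bytes are finished with a scalar loop.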
static void
reconstruct_sse41(char *buf, size_t outSize)
{
    static const size_t bytesPerChunk = sizeof(__m128i);
    const size_t vOutSize = outSize / bytesPerChunk;

    const __m128i c = _mm_set1_epi8(-128);
    const __m128i shuffleMask = _mm_set1_epi8(15);

    // The first element doesn't have its high bit flipped during compression,
    // so it must not be flipped here. To make the SIMD loop nice and
    // uniform, we pre-flip the bit so that the loop will unflip it again.
    buf[0] += -128;

    __m128i *vBuf = reinterpret_cast<__m128i *>(buf);
    __m128i vPrev = _mm_setzero_si128();
    for (size_t i=0; i<vOutSize; ++i)
    {
        __m128i d = _mm_add_epi8(_mm_loadu_si128(vBuf), c);

        // Compute the prefix sum of elements.
        d = _mm_add_epi8(d, _mm_slli_si128(d, 1));
        d = _mm_add_epi8(d, _mm_slli_si128(d, 2));
        d = _mm_add_epi8(d, _mm_slli_si128(d, 4));
        d = _mm_add_epi8(d, _mm_slli_si128(d, 8));
        d = _mm_add_epi8(d, vPrev);

        _mm_storeu_si128(vBuf++, d);

        // Broadcast the high byte in our result to all lanes of the prev
        // value for the next iteration.
        vPrev = _mm_shuffle_epi8(d, shuffleMask);
    }

    unsigned char prev = _mm_extract_epi8(vPrev, 15);
    for (size_t i=vOutSize*bytesPerChunk; i<outSize; ++i)
    {
        unsigned char d = prev + buf[i] - 128;
        buf[i] = d;
        prev = d;
    }
}

#else

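// Scalar fallback: the original byte-at-a-time predictor recurrence.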
static void
reconstruct_scalar(char *buf, size_t outSize)
{
    unsigned char *t = (unsigned char *) buf + 1;
    unsigned char *stop = (unsigned char *) buf + outSize;

    while (t < stop)
    {
        int d = int (t[-1]) + int (t[0]) - 128;
        t[0] = d;
        ++t;
    }
}

#endif


#ifdef IMF_HAVE_SSE2

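// Interleave the two halves of 'source' into 'out': each iteration loads
// 16 bytes from each half and merges them with unpacklo/unpackhi, writing
// 32 output bytes; any remainder is handled by a scalar loop.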
static void
interleave_sse2(const char *source, size_t outSize, char *out)
{
    static const size_t bytesPerChunk = 2*sizeof(__m128i);

    const size_t vOutSize = outSize / bytesPerChunk;

    const __m128i *v1 = reinterpret_cast<const __m128i *>(source);
    const __m128i *v2 = reinterpret_cast<const __m128i *>(source + (outSize + 1) / 2);
    __m128i *vOut = reinterpret_cast<__m128i *>(out);

    for (size_t i=0; i<vOutSize; ++i) {
        __m128i a = _mm_loadu_si128(v1++);
        __m128i b = _mm_loadu_si128(v2++);

        __m128i lo = _mm_unpacklo_epi8(a, b);
        __m128i hi = _mm_unpackhi_epi8(a, b);

        _mm_storeu_si128(vOut++, lo);
        _mm_storeu_si128(vOut++, hi);
    }

    const char *t1 = reinterpret_cast<const char *>(v1);
    const char *t2 = reinterpret_cast<const char *>(v2);
    char *sOut = reinterpret_cast<char *>(vOut);

    for (size_t i=vOutSize*bytesPerChunk; i<outSize; ++i)
    {
        *(sOut++) = (i%2==0) ? *(t1++) : *(t2++);
    }
}

#else

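// Scalar fallback: copy bytes alternately from the two halves of 'source'.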
static void
interleave_scalar(const char *source, size_t outSize, char *out)
{
    const char *t1 = source;
    const char *t2 = source + (outSize + 1) / 2;
    char *s = out;
    char *const stop = s + outSize;

    while (true)
    {
        if (s < stop)
            *(s++) = *(t1++);
        else
            break;

        if (s < stop)
            *(s++) = *(t2++);
        else
            break;
    }
}

#endif

int
Imf::Zip::uncompress(const char *compressed, int compressedSize,
                     char *raw)
@@ -151,44 +276,28 @@ Imf::Zip::uncompress(const char *compressed, int compressedSize,
        throw Iex::InputExc ("Data decompression (zlib) failed.");
}

    if (outSize == 0)
    {
        return outSize;
    }

    //
    // Predictor.
    //
    {
        unsigned char *t = (unsigned char *) _tmpBuffer + 1;
        unsigned char *stop = (unsigned char *) _tmpBuffer + outSize;

        while (t < stop)
        {
            int d = int (t[-1]) + int (t[0]) - 128;
            t[0] = d;
            ++t;
        }
    }
#ifdef IMF_HAVE_SSE4_1
    reconstruct_sse41(_tmpBuffer, outSize);
#else
    reconstruct_scalar(_tmpBuffer, outSize);
#endif

    //
    // Reorder the pixel data.
    //

    {
        const char *t1 = _tmpBuffer;
        const char *t2 = _tmpBuffer + (outSize + 1) / 2;
        char *s = raw;
        char *stop = s + outSize;

        while (true)
        {
            if (s < stop)
                *(s++) = *(t1++);
            else
                break;

            if (s < stop)
                *(s++) = *(t2++);
            else
                break;
        }
    }
#ifdef IMF_HAVE_SSE2
    interleave_sse2(_tmpBuffer, outSize, raw);
#else
    interleave_scalar(_tmpBuffer, outSize, raw);
#endif

    return outSize;
}
