
Implement SIMD-accelerated ImfZip::uncompress
The main bottleneck in ImfZip::uncompress appears not to be zlib but the
predictor & interleaving loops that run after zlib's decompression.
Fortunately, throughput in both of these loops can be improved with SIMD
operations.  Even though each trip of the predictor loop has data dependencies
on all previous values, the usual SIMD prefix-sum construction is able to
provide a significant speedup.

While the uses of SSSE3 and SSE4.1 are minor in this change and could
maybe be replaced with some slightly more complicated SSE2, SSE4.1 was
released in 2007, so it doesn't seem unreasonable to require it in 2017.
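
To make the prefix-sum construction concrete, here is a small standalone sketch (illustrative only, not part of this commit) of the same shift-and-add idea on a single 16-byte chunk of biased deltas; the main() driver and the all-ones test data are hypothetical, and only SSE2 intrinsics are used.

#include <emmintrin.h>   // SSE2 intrinsics
#include <cstdio>

int main()
{
    // Hypothetical input: 16 deltas, each stored with a +128 bias the way
    // the zip predictor writes them.  Here every delta is +1.
    unsigned char deltas[16];
    for (int i = 0; i < 16; ++i)
        deltas[i] = (unsigned char) (1 + 128);

    __m128i d = _mm_loadu_si128 ((const __m128i *) deltas);
    d = _mm_add_epi8 (d, _mm_set1_epi8 (-128));   // remove the bias

    // Four shift-and-add steps (log2 of 16): afterwards byte i holds the
    // sum of bytes 0..i, i.e. the running total the scalar loop computes.
    d = _mm_add_epi8 (d, _mm_slli_si128 (d, 1));
    d = _mm_add_epi8 (d, _mm_slli_si128 (d, 2));
    d = _mm_add_epi8 (d, _mm_slli_si128 (d, 4));
    d = _mm_add_epi8 (d, _mm_slli_si128 (d, 8));

    unsigned char out[16];
    _mm_storeu_si128 ((__m128i *) out, d);
    for (int i = 0; i < 16; ++i)
        std::printf ("%d ", out[i]);              // prints 1 2 3 ... 16
    std::printf ("\n");
    return 0;
}

Carrying the last byte of each chunk into the next one (as the committed code does with vPrev and _mm_shuffle_epi8) extends this per-chunk sum into a prefix sum over the whole buffer.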
John Loy authored and nickrasmussen committed Aug 8, 2018
1 parent e640952 commit 32f2aa5
Showing 2 changed files with 149 additions and 31 deletions.
9 changes: 9 additions & 0 deletions OpenEXR/IlmImf/ImfSimd.h
@@ -40,6 +40,7 @@
//
// Compile time SSE detection:
// IMF_HAVE_SSE2 - Defined if it's safe to compile SSE2 optimizations
// IMF_HAVE_SSE4_1 - Defined if it's safe to compile SSE4.1 optimizations
//


@@ -48,12 +49,20 @@
#define IMF_HAVE_SSE2 1
#endif

#if defined __SSE4_1__
#define IMF_HAVE_SSE4_1 1
#endif

extern "C"
{
#ifdef IMF_HAVE_SSE2
#include <emmintrin.h>
#include <mmintrin.h>
#endif

#ifdef IMF_HAVE_SSE4_1
#include <smmintrin.h>
#endif
}

#endif
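
Note that __SSE4_1__ is a compiler-provided macro: with GCC and Clang it is only defined when SSE4.1 code generation is enabled (for example via -msse4.1 or an -march value that implies it), so the new code path is opt-in at build time; other compilers may need an equivalent flag or may not define the macro at all.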
171 changes: 140 additions & 31 deletions OpenEXR/IlmImf/ImfZip.cpp
@@ -35,6 +35,7 @@
#include "ImfZip.h"
#include "ImfCheckedArithmetic.h"
#include "ImfNamespace.h"
#include "ImfSimd.h"
#include "Iex.h"

#include <math.h>
@@ -135,6 +136,130 @@ Imf::Zip::compress(const char *raw, int rawSize, char *compressed)
    return outSize;
}

#ifdef IMF_HAVE_SSE4_1

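// Undo the predictor (delta) encoding in place: 16 bytes per iteration via
// a shift-and-add prefix sum, carrying the running total across chunks in
// vPrev; any remaining bytes are finished with a scalar loop.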
static void
reconstruct_sse41(char *buf, size_t outSize)
{
    static const size_t bytesPerChunk = sizeof(__m128i);
    const size_t vOutSize = outSize / bytesPerChunk;

    const __m128i c = _mm_set1_epi8(-128);
    const __m128i shuffleMask = _mm_set1_epi8(15);

    // The first element doesn't have its high bit flipped during compression,
    // so it must not be flipped here. To make the SIMD loop nice and
    // uniform, we pre-flip the bit so that the loop will unflip it again.
    buf[0] += -128;

    __m128i *vBuf = reinterpret_cast<__m128i *>(buf);
    __m128i vPrev = _mm_setzero_si128();
    for (size_t i=0; i<vOutSize; ++i)
    {
        __m128i d = _mm_add_epi8(_mm_loadu_si128(vBuf), c);

        // Compute the prefix sum of elements.
        d = _mm_add_epi8(d, _mm_slli_si128(d, 1));
        d = _mm_add_epi8(d, _mm_slli_si128(d, 2));
        d = _mm_add_epi8(d, _mm_slli_si128(d, 4));
        d = _mm_add_epi8(d, _mm_slli_si128(d, 8));
        d = _mm_add_epi8(d, vPrev);

        _mm_storeu_si128(vBuf++, d);

        // Broadcast the high byte in our result to all lanes of the prev
        // value for the next iteration.
        vPrev = _mm_shuffle_epi8(d, shuffleMask);
    }

    unsigned char prev = _mm_extract_epi8(vPrev, 15);
    for (size_t i=vOutSize*bytesPerChunk; i<outSize; ++i)
    {
        unsigned char d = prev + buf[i] - 128;
        buf[i] = d;
        prev = d;
    }
}

#else

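// Scalar fallback: the original byte-at-a-time predictor recurrence.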
static void
reconstruct_scalar(char *buf, size_t outSize)
{
    unsigned char *t = (unsigned char *) buf + 1;
    unsigned char *stop = (unsigned char *) buf + outSize;

    while (t < stop)
    {
        int d = int (t[-1]) + int (t[0]) - 128;
        t[0] = d;
        ++t;
    }
}

#endif


#ifdef IMF_HAVE_SSE2

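// Interleave the two halves of 'source' into 'out': each iteration loads
// 16 bytes from each half and merges them with unpacklo/unpackhi, writing
// 32 output bytes; any remainder is handled by a scalar loop.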
static void
interleave_sse2(const char *source, size_t outSize, char *out)
{
    static const size_t bytesPerChunk = 2*sizeof(__m128i);

    const size_t vOutSize = outSize / bytesPerChunk;

    const __m128i *v1 = reinterpret_cast<const __m128i *>(source);
    const __m128i *v2 = reinterpret_cast<const __m128i *>(source + (outSize + 1) / 2);
    __m128i *vOut = reinterpret_cast<__m128i *>(out);

    for (size_t i=0; i<vOutSize; ++i) {
        __m128i a = _mm_loadu_si128(v1++);
        __m128i b = _mm_loadu_si128(v2++);

        __m128i lo = _mm_unpacklo_epi8(a, b);
        __m128i hi = _mm_unpackhi_epi8(a, b);

        _mm_storeu_si128(vOut++, lo);
        _mm_storeu_si128(vOut++, hi);
    }

    const char *t1 = reinterpret_cast<const char *>(v1);
    const char *t2 = reinterpret_cast<const char *>(v2);
    char *sOut = reinterpret_cast<char *>(vOut);

    for (size_t i=vOutSize*bytesPerChunk; i<outSize; ++i)
    {
        *(sOut++) = (i%2==0) ? *(t1++) : *(t2++);
    }
}

#else

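// Scalar fallback: copy bytes alternately from the two halves of 'source'.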
static void
interleave_scalar(const char *source, size_t outSize, char *out)
{
    const char *t1 = source;
    const char *t2 = source + (outSize + 1) / 2;
    char *s = out;
    char *const stop = s + outSize;

    while (true)
    {
        if (s < stop)
            *(s++) = *(t1++);
        else
            break;

        if (s < stop)
            *(s++) = *(t2++);
        else
            break;
    }
}

#endif

int
Imf::Zip::uncompress(const char *compressed, int compressedSize,
                     char *raw)
@@ -151,44 +276,28 @@ Imf::Zip::uncompress(const char *compressed, int compressedSize,
        throw Iex::InputExc ("Data decompression (zlib) failed.");
}

    if (outSize == 0)
    {
        return outSize;
    }

    //
    // Predictor.
    //
    {
        unsigned char *t = (unsigned char *) _tmpBuffer + 1;
        unsigned char *stop = (unsigned char *) _tmpBuffer + outSize;

        while (t < stop)
        {
            int d = int (t[-1]) + int (t[0]) - 128;
            t[0] = d;
            ++t;
        }
    }
#ifdef IMF_HAVE_SSE4_1
    reconstruct_sse41(_tmpBuffer, outSize);
#else
    reconstruct_scalar(_tmpBuffer, outSize);
#endif

    //
    // Reorder the pixel data.
    //

    {
        const char *t1 = _tmpBuffer;
        const char *t2 = _tmpBuffer + (outSize + 1) / 2;
        char *s = raw;
        char *stop = s + outSize;

        while (true)
        {
            if (s < stop)
                *(s++) = *(t1++);
            else
                break;

            if (s < stop)
                *(s++) = *(t2++);
            else
                break;
        }
    }
#ifdef IMF_HAVE_SSE2
    interleave_sse2(_tmpBuffer, outSize, raw);
#else
    interleave_scalar(_tmpBuffer, outSize, raw);
#endif

    return outSize;
}
