Skip to content

Commit

Permalink
Merge pull request #137 from karlrasche/interleaveByte2_sse_bug
Browse files Browse the repository at this point in the history
Fixing SSE2 byte interleaving path to work with short runs
  • Loading branch information
ehanway-ilm committed Oct 15, 2014
2 parents 699b4a6 + da28ad8 commit f4a6d3b
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 22 deletions.
41 changes: 22 additions & 19 deletions OpenEXR/IlmImf/ImfDwaCompressorSimd.h
Original file line number Diff line number Diff line change
Expand Up @@ -334,34 +334,37 @@ interleaveByte2 (char *dst, char *src0, char *src1, int numBytes)
// use aligned loads
//

for (int x = 0; x < 8; ++x)
for (int x = 0; x < std::min (numBytes, 8); ++x)
{
dst[2 * x] = src0[x];
dst[2 * x + 1] = src1[x];
}

dst_epi8 = (__m128i*)&dst[16];
src0_epi8 = (__m128i*)&src0[8];
src1_epi8 = (__m128i*)&src1[8];
sseWidth = (numBytes - 8) / 16;

for (int x=0; x<sseWidth; ++x)
if (numBytes > 8)
{
_mm_stream_si128 (&dst_epi8[2 * x],
_mm_unpacklo_epi8 (src0_epi8[x], src1_epi8[x]));
dst_epi8 = (__m128i*)&dst[16];
src0_epi8 = (__m128i*)&src0[8];
src1_epi8 = (__m128i*)&src1[8];
sseWidth = (numBytes - 8) / 16;

_mm_stream_si128 (&dst_epi8[2 * x + 1],
_mm_unpackhi_epi8 (src0_epi8[x], src1_epi8[x]));
}
for (int x=0; x<sseWidth; ++x)
{
_mm_stream_si128 (&dst_epi8[2 * x],
_mm_unpacklo_epi8 (src0_epi8[x], src1_epi8[x]));

//
// Then run the leftovers one at a time
//
_mm_stream_si128 (&dst_epi8[2 * x + 1],
_mm_unpackhi_epi8 (src0_epi8[x], src1_epi8[x]));
}

for (int x = 16 * sseWidth + 8; x < numBytes; ++x)
{
dst[2 * x] = src0[x];
dst[2 * x + 1] = src1[x];
//
// Then run the leftovers one at a time
//

for (int x = 16 * sseWidth + 8; x < numBytes; ++x)
{
dst[2 * x] = src0[x];
dst[2 * x + 1] = src1[x];
}
}
}
else
Expand Down
47 changes: 44 additions & 3 deletions OpenEXR/IlmImfTest/testDwaCompressorSimd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -171,8 +171,9 @@ testCsc()
void
testInterleave()
{
const int bufferLen = 100000;
const int numIter = 10000;
const int bufferLen = 100000;
const int randomNumIter = 10000;
const int lengthNumIter = 128;
Rand48 rand48(0);
char *srcA = new char[bufferLen];
char *srcB = new char[bufferLen];
Expand All @@ -189,7 +190,7 @@ testInterleave()
dst[2*i+1] = srcB[i];
}

for (int iter=0; iter<numIter; ++iter)
for (int iter=0; iter<randomNumIter; ++iter)
{
memset(test, 0, 2*bufferLen);

Expand All @@ -203,6 +204,46 @@ testInterleave()
}
}

//
// Test buffers of increasing length, with varying alignment
// on all the buffers.
//
for (int len=1; len<lengthNumIter; ++len)
{
for (int offset=0; offset<16*16*16; ++offset)
{
int offsetA = offset % 16;
int offsetB = (offset / 16) % 16;
int offsetTest = (offset / 256) % 16;

memset(srcA, 255, bufferLen);
memset(srcB, 255, bufferLen);
memset(dst, 0, 2*bufferLen);
memset(test, 0, 2*bufferLen);

char *a = srcA + offsetA;
char *b = srcB + offsetB;
char *out = test + offsetTest;

for (int i=0; i<len; ++i)
{
a[i] = (char)rand48.nextf(0.0, 255.0);
b[i] = (char)rand48.nextf(0.0, 255.0);

dst[2*i] = a[i];
dst[2*i+1] = b[i];
}

interleaveByte2(out, a, b, len);

for (int i=0; i<2*len+8; ++i)
{
assert( dst[2*i] == out[2*i] );
assert( dst[2*i+1] == out[2*i+1] );
}
}
}

delete[] srcA;
delete[] srcB;
delete[] dst;
Expand Down

0 comments on commit f4a6d3b

Please sign in to comment.