Skip to content

Commit

Permalink
SSSE3: dec: unroll inner loop
Browse files (browse the repository at this point in the history)
  • Loading branch information
aklomp committed Nov 29, 2019
1 parent 495414b commit 5874921
Showing 1 changed file with 80 additions and 42 deletions.
122 changes: 80 additions & 42 deletions lib/arch/ssse3/dec_loop.c
Original file line number | Diff line number | Diff line change
Expand Up @@ -65,22 +65,9 @@
// 1110 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1111 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10

static inline void
dec_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
static inline int
dec_loop_ssse3_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
{
if (*slen < 24) {
return;
}

// Process blocks of 16 bytes per round. Because 4 extra zero bytes are
// written after the output, ensure that there will be at least 8 bytes
// of input data left to cover the gap. (6 data bytes and up to two
// end-of-string markers.)
size_t rounds = (*slen - 8) / 16;

*slen -= rounds * 16; // 16 bytes consumed per round
*olen += rounds * 12; // 12 bytes produced per round

const __m128i lut_lo = _mm_setr_epi8(
0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);
Expand All @@ -95,39 +82,90 @@ dec_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)

const __m128i mask_2F = _mm_set1_epi8(0x2F);

do {
// Load input:
__m128i str = _mm_loadu_si128((__m128i *) *s);

// Table lookups:
const __m128i hi_nibbles = _mm_and_si128(_mm_srli_epi32(str, 4), mask_2F);
const __m128i lo_nibbles = _mm_and_si128(str, mask_2F);
const __m128i hi = _mm_shuffle_epi8(lut_hi, hi_nibbles);
const __m128i lo = _mm_shuffle_epi8(lut_lo, lo_nibbles);

// Check for invalid input: if any "and" values from lo and hi
// are not zero, fall back on bytewise code to do error
// checking and reporting:
if (_mm_movemask_epi8(_mm_cmpgt_epi8(_mm_and_si128(lo, hi), _mm_setzero_si128())) != 0) {
break;
}
// Load input:
__m128i str = _mm_loadu_si128((__m128i *) *s);

const __m128i eq_2F = _mm_cmpeq_epi8(str, mask_2F);
const __m128i roll = _mm_shuffle_epi8(lut_roll, _mm_add_epi8(eq_2F, hi_nibbles));
// Table lookups:
const __m128i hi_nibbles = _mm_and_si128(_mm_srli_epi32(str, 4), mask_2F);
const __m128i lo_nibbles = _mm_and_si128(str, mask_2F);
const __m128i hi = _mm_shuffle_epi8(lut_hi, hi_nibbles);
const __m128i lo = _mm_shuffle_epi8(lut_lo, lo_nibbles);

// Now simply add the delta values to the input:
str = _mm_add_epi8(str, roll);
// Check for invalid input: if any "and" values from lo and hi are not
// zero, fall back on bytewise code to do error checking and reporting:
if (_mm_movemask_epi8(_mm_cmpgt_epi8(_mm_and_si128(lo, hi), _mm_setzero_si128())) != 0) {
return 0;
}

const __m128i eq_2F = _mm_cmpeq_epi8(str, mask_2F);
const __m128i roll = _mm_shuffle_epi8(lut_roll, _mm_add_epi8(eq_2F, hi_nibbles));

// Now simply add the delta values to the input:
str = _mm_add_epi8(str, roll);

// Reshuffle the input to packed 12-byte output format:
str = dec_reshuffle(str);
// Reshuffle the input to packed 12-byte output format:
str = dec_reshuffle(str);

// Store the output:
_mm_storeu_si128((__m128i *) *o, str);
// Store the output:
_mm_storeu_si128((__m128i *) *o, str);

*s += 16;
*o += 12;
*rounds -= 1;

return 1;
}

*s += 16;
*o += 12;
static inline void
dec_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
if (*slen < 24) {
return;
}

// Process blocks of 16 bytes per round. Because 4 extra zero bytes are
// written after the output, ensure that there will be at least 8 bytes
// of input data left to cover the gap. (6 data bytes and up to two
// end-of-string markers.)
size_t rounds = (*slen - 8) / 16;

*slen -= rounds * 16; // 16 bytes consumed per round
*olen += rounds * 12; // 12 bytes produced per round

do {
if (rounds >= 8) {
if (dec_loop_ssse3_inner(s, o, &rounds) &&
dec_loop_ssse3_inner(s, o, &rounds) &&
dec_loop_ssse3_inner(s, o, &rounds) &&
dec_loop_ssse3_inner(s, o, &rounds) &&
dec_loop_ssse3_inner(s, o, &rounds) &&
dec_loop_ssse3_inner(s, o, &rounds) &&
dec_loop_ssse3_inner(s, o, &rounds) &&
dec_loop_ssse3_inner(s, o, &rounds)) {
continue;
}
break;
}
if (rounds >= 4) {
if (dec_loop_ssse3_inner(s, o, &rounds) &&
dec_loop_ssse3_inner(s, o, &rounds) &&
dec_loop_ssse3_inner(s, o, &rounds) &&
dec_loop_ssse3_inner(s, o, &rounds)) {
continue;
}
break;
}
if (rounds >= 2) {
if (dec_loop_ssse3_inner(s, o, &rounds) &&
dec_loop_ssse3_inner(s, o, &rounds)) {
continue;
}
break;
}
dec_loop_ssse3_inner(s, o, &rounds);
break;

} while (--rounds > 0);
} while (rounds > 0);

// Adjust for any rounds that were skipped:
*slen += rounds * 16;
Expand Down

0 comments on commit 5874921

Please sign in to comment.