Skip to content

Commit 67ee3fd

Browse files
committed
SSSE3->AVX2 encoding optimization
Use Wojciech Mula's (@WojciechMula) updated implementation for AVX2 / SSSE3 encoding. The SSSE3 implementation is reused in the SSE4.1, SSE4.2 and AVX dispatched encoding loops. The SSE4.1 implementation is now unused but kept to ease integration of future updates if needed. Speed-up on i7-4870HQ @ 2.5 GHz (clang-800.0.42.1, x86_64): SSSE3 encoding +20%, SSE4.2 encoding +8%, AVX encoding +7%, AVX2 encoding +3%.
1 parent cfa8bf7 commit 67ee3fd

File tree

10 files changed

+168
-132
lines changed

10 files changed

+168
-132
lines changed

LICENSE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
Copyright (c) 2005-2007, Nick Galbreath
22
Copyright (c) 2013-2017, Alfred Klomp
3+
Copyright (c) 2015-2017, Wojciech Mula
34
Copyright (c) 2016-2017, Matthieu Darbois
45
All rights reserved.
56

README.md

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -423,22 +423,22 @@ The tables below contain some results on random machines. All numbers measured w
423423

424424
x86 processors
425425

426-
| Processor | Plain enc | Plain dec | SSSE3 enc | SSSE3 dec | SSE4.1 enc | SSE4.1 dec| SSE4.2 enc | SSE4.2 dec| AVX enc | AVX dec | AVX2 enc | AVX2 dec |
427-
|-------------------------------------------|----------:|----------:|----------:|----------:|-----------:|----------:|-----------:|----------:|--------:|--------:|---------:|---------:|
428-
| i7-4771 @ 3.5 GHz | 833 | 1111\* | 3333\* | 4444\* | TBD | TBD | TBD | TBD | TBD | TBD | 4999\* | 6666\* |
429-
| i7-4770 @ 3.4 GHz DDR1600 | 1831 | 1748\* | 3570\* | 3695\* | TBD | TBD | TBD | TBD | TBD | TBD | 6539\* | 6512\* |
430-
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 1 thread | 1779 | 1727\* | 3419\* | 3788\* | TBD | TBD | TBD | TBD | TBD | TBD | 4589\* | 5871\* |
431-
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 2 thread | 3367 | 3374\* | 4784\* | 6672\* | TBD | TBD | TBD | TBD | TBD | TBD | 5120\* | 7721\* |
432-
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 4 thread | 4834 | 6075\* | 4906\* | 8154\* | TBD | TBD | TBD | TBD | TBD | TBD | 4839\* | 6911\* |
433-
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 8 thread | 4696 | 6361\* | 5227\* | 7737\* | TBD | TBD | TBD | TBD | TBD | TBD | 4813\* | 7189\* |
434-
| i7-4870HQ @ 2.5 GHz | 1471 | 3066 | 5599 | 3886 | 5882 | 3888 | 6202 | 5098 | 6524 | 5281 | 8113 | 7063 |
435-
| i5-4590S @ 3.0 GHz | 1721 | 1643\* | 3255\* | 3404\* | TBD | TBD | TBD | TBD | TBD | TBD | 4124\* | 5403\* |
436-
| Xeon X5570 @ 2.93 GHz | 1097 | 1048\* | 2077\* | 2215\* | TBD | TBD | TBD | TBD | - | - | - | - |
437-
| Pentium4 @ 3.4 GHz | 528 | 448\* | - | - | - | - | - | - | - | - | - | - |
438-
| Atom N270 | 112 | 125\* | 331\* | 368\* | - | - | - | - | - | - | - | - |
439-
| AMD E-450 | 370 | 332\* | 405\* | 366\* | - | - | - | - | - | - | - | - |
440-
| Intel Edison @ 500 MHz | 79 | 92\* | 152\* | 172\* | TBD | TBD | TBD | TBD | - | - | - | - |
441-
| Intel Edison @ 500 MHz OPENMP 2 thread | 158 | 184\* | 300\* | 343\* | TBD | TBD | TBD | TBD | - | - | - | - |
426+
| Processor | Plain enc | Plain dec | SSSE3 enc | SSSE3 dec | SSE4.2 enc | SSE4.2 dec| AVX enc | AVX dec | AVX2 enc | AVX2 dec |
427+
|-------------------------------------------|----------:|----------:|----------:|----------:|-----------:|----------:|--------:|--------:|---------:|---------:|
428+
| i7-4771 @ 3.5 GHz | 833 | 1111\* | 3333\* | 4444\* | TBD | TBD | TBD | TBD | 4999\* | 6666\* |
429+
| i7-4770 @ 3.4 GHz DDR1600 | 1831 | 1748\* | 3570\* | 3695\* | TBD | TBD | TBD | TBD | 6539\* | 6512\* |
430+
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 1 thread | 1779 | 1727\* | 3419\* | 3788\* | TBD | TBD | TBD | TBD | 4589\* | 5871\* |
431+
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 2 thread | 3367 | 3374\* | 4784\* | 6672\* | TBD | TBD | TBD | TBD | 5120\* | 7721\* |
432+
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 4 thread | 4834 | 6075\* | 4906\* | 8154\* | TBD | TBD | TBD | TBD | 4839\* | 6911\* |
433+
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 8 thread | 4696 | 6361\* | 5227\* | 7737\* | TBD | TBD | TBD | TBD | 4813\* | 7189\* |
434+
| i7-4870HQ @ 2.5 GHz | 1471 | 3066 | 6721 | 3886 | 6701 | 5098 | 7015 | 5281 | 8328 | 7063 |
435+
| i5-4590S @ 3.0 GHz | 1721 | 1643\* | 3255\* | 3404\* | TBD | TBD | TBD | TBD | 4124\* | 5403\* |
436+
| Xeon X5570 @ 2.93 GHz | 1097 | 1048\* | 2077\* | 2215\* | TBD | TBD | - | - | - | - |
437+
| Pentium4 @ 3.4 GHz | 528 | 448\* | - | - | - | - | - | - | - | - |
438+
| Atom N270 | 112 | 125\* | 331\* | 368\* | - | - | - | - | - | - |
439+
| AMD E-450 | 370 | 332\* | 405\* | 366\* | - | - | - | - | - | - |
440+
| Intel Edison @ 500 MHz | 79 | 92\* | 152\* | 172\* | TBD | TBD | - | - | - | - |
441+
| Intel Edison @ 500 MHz OPENMP 2 thread | 158 | 184\* | 300\* | 343\* | TBD | TBD | - | - | - | - |
442442

443443
ARM processors
444444

lib/arch/avx/codec.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,9 @@
1010

1111
#include "../sse2/compare_macros.h"
1212

13-
#include "../ssse3/_mm_bswap_epi32.c"
1413
#include "../ssse3/dec_reshuffle.c"
1514
#include "../ssse3/enc_translate.c"
16-
#include "../sse41/enc_reshuffle.c"
15+
#include "../ssse3/enc_reshuffle.c"
1716

1817
#endif // __AVX__
1918

lib/arch/avx2/codec.c

Lines changed: 79 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -13,57 +13,85 @@
1313
#define REPLACE(s,n) _mm256_and_si256((s), _mm256_set1_epi8(n))
1414
#define RANGE(s,a,b) _mm256_andnot_si256(CMPGT((s), (b)), CMPGT((s), (a) - 1))
1515

16-
static inline __m256i
17-
_mm256_bswap_epi32 (const __m256i in)
18-
{
19-
// _mm256_shuffle_epi8() works on two 128-bit lanes separately:
20-
return _mm256_shuffle_epi8(in, _mm256_setr_epi8(
21-
3, 2, 1, 0,
22-
7, 6, 5, 4,
23-
11, 10, 9, 8,
24-
15, 14, 13, 12,
25-
3, 2, 1, 0,
26-
7, 6, 5, 4,
27-
11, 10, 9, 8,
28-
15, 14, 13, 12));
29-
}
30-
31-
static inline __m256i
32-
enc_reshuffle (__m256i in)
33-
{
34-
// Spread out 32-bit words over both halves of the input register:
35-
in = _mm256_permutevar8x32_epi32(in, _mm256_setr_epi32(
36-
0, 1, 2, -1,
37-
3, 4, 5, -1));
38-
39-
// Slice into 32-bit chunks and operate on all chunks in parallel.
40-
// All processing is done within the 32-bit chunk. First, shuffle:
41-
// before: [eeeeeeff|ccdddddd|bbbbcccc|aaaaaabb]
42-
// after: [00000000|aaaaaabb|bbbbcccc|ccdddddd]
43-
in = _mm256_shuffle_epi8(in, _mm256_set_epi8(
44-
-1, 9, 10, 11,
45-
-1, 6, 7, 8,
46-
-1, 3, 4, 5,
47-
-1, 0, 1, 2,
48-
-1, 9, 10, 11,
49-
-1, 6, 7, 8,
50-
-1, 3, 4, 5,
51-
-1, 0, 1, 2));
52-
53-
// merged = [0000aaaa|aabbbbbb|bbbbcccc|ccdddddd]
54-
const __m256i merged = _mm256_blend_epi16(_mm256_slli_epi32(in, 4), in, 0x55);
55-
56-
// bd = [00000000|00bbbbbb|00000000|00dddddd]
57-
const __m256i bd = _mm256_and_si256(merged, _mm256_set1_epi32(0x003F003F));
58-
59-
// ac = [00aaaaaa|00000000|00cccccc|00000000]
60-
const __m256i ac = _mm256_and_si256(_mm256_slli_epi32(merged, 2), _mm256_set1_epi32(0x3F003F00));
61-
62-
// indices = [00aaaaaa|00bbbbbb|00cccccc|00dddddd]
63-
const __m256i indices = _mm256_or_si256(ac, bd);
64-
65-
// return = [00dddddd|00cccccc|00bbbbbb|00aaaaaa]
66-
return _mm256_bswap_epi32(indices);
16+
static inline __m256i enc_reshuffle(const __m256i input) {
17+
// AVX2 translation of the SSSE3 procedure.
18+
// This version expects its input shifted by 4 bytes so that both
19+
// 128-bit lanes can be processed efficiently.
20+
21+
// input, bytes MSB to LSB:
22+
// 0 0 0 0 x w v u t s r q p o n m
23+
// l k j i h g f e d c b a 0 0 0 0
24+
25+
const __m256i in = _mm256_shuffle_epi8(input, _mm256_set_epi8(
26+
10, 11, 9, 10,
27+
7, 8, 6, 7,
28+
4, 5, 3, 4,
29+
1, 2, 0, 1,
30+
31+
14, 15, 13, 14,
32+
11, 12, 10, 11,
33+
8, 9, 7, 8,
34+
5, 6, 4, 5));
35+
// in, bytes MSB to LSB:
36+
// w x v w
37+
// t u s t
38+
// q r p q
39+
// n o m n
40+
// k l j k
41+
// h i g h
42+
// e f d e
43+
// b c a b
44+
45+
const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00));
46+
// bits, upper case are most significant bits, lower case are least significant bits.
47+
// 0000wwww XX000000 VVVVVV00 00000000
48+
// 0000tttt UU000000 SSSSSS00 00000000
49+
// 0000qqqq RR000000 PPPPPP00 00000000
50+
// 0000nnnn OO000000 MMMMMM00 00000000
51+
// 0000kkkk LL000000 JJJJJJ00 00000000
52+
// 0000hhhh II000000 GGGGGG00 00000000
53+
// 0000eeee FF000000 DDDDDD00 00000000
54+
// 0000bbbb CC000000 AAAAAA00 00000000
55+
56+
const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
57+
// 00000000 00wwwwXX 00000000 00VVVVVV
58+
// 00000000 00ttttUU 00000000 00SSSSSS
59+
// 00000000 00qqqqRR 00000000 00PPPPPP
60+
// 00000000 00nnnnOO 00000000 00MMMMMM
61+
// 00000000 00kkkkLL 00000000 00JJJJJJ
62+
// 00000000 00hhhhII 00000000 00GGGGGG
63+
// 00000000 00eeeeFF 00000000 00DDDDDD
64+
// 00000000 00bbbbCC 00000000 00AAAAAA
65+
66+
const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0));
67+
// 00000000 00xxxxxx 000000vv WWWW0000
68+
// 00000000 00uuuuuu 000000ss TTTT0000
69+
// 00000000 00rrrrrr 000000pp QQQQ0000
70+
// 00000000 00oooooo 000000mm NNNN0000
71+
// 00000000 00llllll 000000jj KKKK0000
72+
// 00000000 00iiiiii 000000gg HHHH0000
73+
// 00000000 00ffffff 000000dd EEEE0000
74+
// 00000000 00cccccc 000000aa BBBB0000
75+
76+
const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
77+
// 00xxxxxx 00000000 00vvWWWW 00000000
78+
// 00uuuuuu 00000000 00ssTTTT 00000000
79+
// 00rrrrrr 00000000 00ppQQQQ 00000000
80+
// 00oooooo 00000000 00mmNNNN 00000000
81+
// 00llllll 00000000 00jjKKKK 00000000
82+
// 00iiiiii 00000000 00ggHHHH 00000000
83+
// 00ffffff 00000000 00ddEEEE 00000000
84+
// 00cccccc 00000000 00aaBBBB 00000000
85+
86+
return _mm256_or_si256(t1, t3);
87+
// 00xxxxxx 00wwwwXX 00vvWWWW 00VVVVVV
88+
// 00uuuuuu 00ttttUU 00ssTTTT 00SSSSSS
89+
// 00rrrrrr 00qqqqRR 00ppQQQQ 00PPPPPP
90+
// 00oooooo 00nnnnOO 00mmNNNN 00MMMMMM
91+
// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
92+
// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
93+
// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
94+
// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
6795
}
6896

6997
static inline __m256i

lib/arch/avx2/enc_loop.c

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,28 @@
11
// If we have AVX2 support, pick off 24 bytes at a time for as long as we can.
22
// But because we read 32 bytes at a time, ensure we have enough room to do a
33
// full 32-byte read without segfaulting:
4-
while (srclen >= 32)
5-
{
6-
// Load string:
7-
__m256i str = _mm256_loadu_si256((__m256i *)c);
84

9-
// Reshuffle:
10-
str = enc_reshuffle(str);
5+
if (srclen >= 32) {
6+
const uint8_t* const o_orig = o;
117

12-
// Translate reshuffled bytes to the Base64 alphabet:
13-
str = enc_translate(str);
8+
// The first load is done at c - 0 (rather than c - 4) so that we never read before the start of the input:
9+
__m256i inputvector = _mm256_loadu_si256((__m256i *)(c - 0));
1410

15-
// Store:
16-
_mm256_storeu_si256((__m256i *)o, str);
11+
// shift by 4 bytes, as required by enc_reshuffle
12+
inputvector = _mm256_permutevar8x32_epi32(inputvector, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));
1713

18-
c += 24; // 6 * 4 bytes of input
19-
o += 32; // 8 * 4 bytes of output
20-
outl += 32;
21-
srclen -= 24;
14+
for (;;) {
15+
inputvector = enc_reshuffle(inputvector);
16+
inputvector = enc_translate(inputvector);
17+
_mm256_storeu_si256((__m256i *)o, inputvector);
18+
c += 24;
19+
o += 32;
20+
srclen -= 24;
21+
if(srclen < 28) {
22+
break;
23+
}
24+
// Load at c-4, as required by enc_reshuffle
25+
inputvector = _mm256_loadu_si256((__m256i *)(c - 4));
26+
}
27+
outl += (size_t)(o - o_orig);
2228
}

lib/arch/sse41/codec.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,9 @@
1010

1111
#include "../sse2/compare_macros.h"
1212

13-
#include "../ssse3/_mm_bswap_epi32.c"
1413
#include "../ssse3/dec_reshuffle.c"
1514
#include "../ssse3/enc_translate.c"
16-
#include "enc_reshuffle.c"
15+
#include "../ssse3/enc_reshuffle.c"
1716

1817
#endif // __SSE4_1__
1918

lib/arch/sse42/codec.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,9 @@
1010

1111
#include "../sse2/compare_macros.h"
1212

13-
#include "../ssse3/_mm_bswap_epi32.c"
1413
#include "../ssse3/dec_reshuffle.c"
1514
#include "../ssse3/enc_translate.c"
16-
#include "../sse41/enc_reshuffle.c"
15+
#include "../ssse3/enc_reshuffle.c"
1716

1817
#endif // __SSE4_2__
1918

lib/arch/ssse3/_mm_bswap_epi32.c

Lines changed: 0 additions & 9 deletions
This file was deleted.

lib/arch/ssse3/codec.c

Lines changed: 1 addition & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -10,45 +10,10 @@
1010

1111
#include "../sse2/compare_macros.h"
1212

13-
#include "_mm_bswap_epi32.c"
1413
#include "dec_reshuffle.c"
14+
#include "enc_reshuffle.c"
1515
#include "enc_translate.c"
1616

17-
static inline __m128i
18-
enc_reshuffle (__m128i in)
19-
{
20-
// Slice into 32-bit chunks and operate on all chunks in parallel.
21-
// All processing is done within the 32-bit chunk. First, shuffle:
22-
// before: [eeeeeeff|ccdddddd|bbbbcccc|aaaaaabb]
23-
// after: [00000000|aaaaaabb|bbbbcccc|ccdddddd]
24-
in = _mm_shuffle_epi8(in, _mm_set_epi8(
25-
-1, 9, 10, 11,
26-
-1, 6, 7, 8,
27-
-1, 3, 4, 5,
28-
-1, 0, 1, 2));
29-
30-
// cd = [00000000|00000000|0000cccc|ccdddddd]
31-
const __m128i cd = _mm_and_si128(in, _mm_set1_epi32(0x00000FFF));
32-
33-
// ab = [0000aaaa|aabbbbbb|00000000|00000000]
34-
const __m128i ab = _mm_and_si128(_mm_slli_epi32(in, 4), _mm_set1_epi32(0x0FFF0000));
35-
36-
// merged = [0000aaaa|aabbbbbb|0000cccc|ccdddddd]
37-
const __m128i merged = _mm_or_si128(ab, cd);
38-
39-
// bd = [00000000|00bbbbbb|00000000|00dddddd]
40-
const __m128i bd = _mm_and_si128(merged, _mm_set1_epi32(0x003F003F));
41-
42-
// ac = [00aaaaaa|00000000|00cccccc|00000000]
43-
const __m128i ac = _mm_and_si128(_mm_slli_epi32(merged, 2), _mm_set1_epi32(0x3F003F00));
44-
45-
// indices = [00aaaaaa|00bbbbbb|00cccccc|00dddddd]
46-
const __m128i indices = _mm_or_si128(ac, bd);
47-
48-
// return = [00dddddd|00cccccc|00bbbbbb|00aaaaaa]
49-
return _mm_bswap_epi32(indices);
50-
}
51-
5217
#endif // __SSSE3__
5318

5419
BASE64_ENC_FUNCTION(ssse3)

lib/arch/ssse3/enc_reshuffle.c

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
static inline __m128i
2+
enc_reshuffle (__m128i in)
3+
{
4+
// input, bytes MSB to LSB:
5+
// 0 0 0 0 l k j i h g f e d c b a
6+
7+
in = _mm_shuffle_epi8(in, _mm_set_epi8(
8+
10, 11, 9, 10,
9+
7, 8, 6, 7,
10+
4, 5, 3, 4,
11+
1, 2, 0, 1));
12+
// in, bytes MSB to LSB:
13+
// k l j k
14+
// h i g h
15+
// e f d e
16+
// b c a b
17+
18+
const __m128i t0 = _mm_and_si128(in, _mm_set1_epi32(0x0fc0fc00));
19+
// bits, upper case are most significant bits, lower case are least significant bits
20+
// 0000kkkk LL000000 JJJJJJ00 00000000
21+
// 0000hhhh II000000 GGGGGG00 00000000
22+
// 0000eeee FF000000 DDDDDD00 00000000
23+
// 0000bbbb CC000000 AAAAAA00 00000000
24+
25+
const __m128i t1 = _mm_mulhi_epu16(t0, _mm_set1_epi32(0x04000040));
26+
// 00000000 00kkkkLL 00000000 00JJJJJJ
27+
// 00000000 00hhhhII 00000000 00GGGGGG
28+
// 00000000 00eeeeFF 00000000 00DDDDDD
29+
// 00000000 00bbbbCC 00000000 00AAAAAA
30+
31+
const __m128i t2 = _mm_and_si128(in, _mm_set1_epi32(0x003f03f0));
32+
// 00000000 00llllll 000000jj KKKK0000
33+
// 00000000 00iiiiii 000000gg HHHH0000
34+
// 00000000 00ffffff 000000dd EEEE0000
35+
// 00000000 00cccccc 000000aa BBBB0000
36+
37+
const __m128i t3 = _mm_mullo_epi16(t2, _mm_set1_epi32(0x01000010));
38+
// 00llllll 00000000 00jjKKKK 00000000
39+
// 00iiiiii 00000000 00ggHHHH 00000000
40+
// 00ffffff 00000000 00ddEEEE 00000000
41+
// 00cccccc 00000000 00aaBBBB 00000000
42+
43+
return _mm_or_si128(t1, t3);
44+
// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
45+
// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
46+
// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
47+
// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
48+
}

0 commit comments

Comments
 (0)