SSSE3->AVX2 encoding optimization

mayeut · mayeut · commit 67ee3fdbb154 · 2017-02-21T22:50:15.000+01:00
Use Wojciech Mula (@WojciechMula) implementation update for AVX2 / SSSE3 encoding. SSSE3 implementation is reused in SSE4.1, SSE4.2 and AVX dispatched encoding loops. SSE4.1 implementation is now useless but kept to ease integration of future updates if needed. Speed-up on i7-4870HQ @ 2.5 GHz (clang-800.0.42.1, x86_64) SSSE3 encoding: +20% SSE4.2 encoding: +8% AVX encoding: +7% AVX2 encoding: +3%
diff --git a/LICENSE b/LICENSE
@@ -1,5 +1,6 @@
 Copyright (c) 2005-2007, Nick Galbreath
 Copyright (c) 2013-2017, Alfred Klomp
+Copyright (c) 2015-2017, Wojciech Mula
 Copyright (c) 2016-2017, Matthieu Darbois
 All rights reserved.
 
diff --git a/README.md b/README.md
@@ -423,22 +423,22 @@ The tables below contain some results on random machines. All numbers measured w
 
 x86 processors
 
-| Processor                                 | Plain enc | Plain dec | SSSE3 enc | SSSE3 dec | SSE4.1 enc | SSE4.1 dec| SSE4.2 enc | SSE4.2 dec| AVX enc | AVX dec | AVX2 enc | AVX2 dec |
-|-------------------------------------------|----------:|----------:|----------:|----------:|-----------:|----------:|-----------:|----------:|--------:|--------:|---------:|---------:|
-| i7-4771 @ 3.5 GHz                         | 833       | 1111\*    | 3333\*    | 4444\*    | TBD        | TBD       | TBD        | TBD       | TBD     | TBD     | 4999\*   | 6666\*   |
-| i7-4770 @ 3.4 GHz DDR1600                 | 1831      | 1748\*    | 3570\*    | 3695\*    | TBD        | TBD       | TBD        | TBD       | TBD     | TBD     | 6539\*   | 6512\*   |
-| i7-4770 @ 3.4 GHz DDR1600 OPENMP 1 thread | 1779      | 1727\*    | 3419\*    | 3788\*    | TBD        | TBD       | TBD        | TBD       | TBD     | TBD     | 4589\*   | 5871\*   |
-| i7-4770 @ 3.4 GHz DDR1600 OPENMP 2 thread | 3367      | 3374\*    | 4784\*    | 6672\*    | TBD        | TBD       | TBD        | TBD       | TBD     | TBD     | 5120\*   | 7721\*   |
-| i7-4770 @ 3.4 GHz DDR1600 OPENMP 4 thread | 4834      | 6075\*    | 4906\*    | 8154\*    | TBD        | TBD       | TBD        | TBD       | TBD     | TBD     | 4839\*   | 6911\*   |
-| i7-4770 @ 3.4 GHz DDR1600 OPENMP 8 thread | 4696      | 6361\*    | 5227\*    | 7737\*    | TBD        | TBD       | TBD        | TBD       | TBD     | TBD     | 4813\*   | 7189\*   |
-| i7-4870HQ @ 2.5 GHz                       | 1471      | 3066      | 5599      | 3886      | 5882       | 3888      | 6202       | 5098      | 6524    | 5281    | 8113     | 7063     |
-| i5-4590S @ 3.0 GHz                        | 1721      | 1643\*    | 3255\*    | 3404\*    | TBD        | TBD       | TBD        | TBD       | TBD     | TBD     | 4124\*   | 5403\*   |
-| Xeon X5570 @ 2.93 GHz                     | 1097      | 1048\*    | 2077\*    | 2215\*    | TBD        | TBD       | TBD        | TBD       | -       | -       | -        | -        |
-| Pentium4 @ 3.4 GHz                        | 528       | 448\*     | -         | -         | -          | -         | -          | -         | -       | -       | -        | -        |
-| Atom N270                                 | 112       | 125\*     | 331\*     | 368\*     | -          | -         | -          | -         | -       | -       | -        | -        |
-| AMD E-450                                 | 370       | 332\*     | 405\*     | 366\*     | -          | -         | -          | -         | -       | -       | -        | -        |
-| Intel Edison @ 500 MHz                    | 79        | 92\*      | 152\*     | 172\*     | TBD        | TBD       | TBD        | TBD       | -       | -       | -        | -        |
-| Intel Edison @ 500 MHz OPENMP 2 thread    | 158       | 184\*     | 300\*     | 343\*     | TBD        | TBD       | TBD        | TBD       | -       | -       | -        | -        |
+| Processor                                 | Plain enc | Plain dec | SSSE3 enc | SSSE3 dec | SSE4.2 enc | SSE4.2 dec| AVX enc | AVX dec | AVX2 enc | AVX2 dec |
+|-------------------------------------------|----------:|----------:|----------:|----------:|-----------:|----------:|--------:|--------:|---------:|---------:|
+| i7-4771 @ 3.5 GHz                         | 833       | 1111\*    | 3333\*    | 4444\*    | TBD        | TBD       | TBD     | TBD     | 4999\*   | 6666\*   |
+| i7-4770 @ 3.4 GHz DDR1600                 | 1831      | 1748\*    | 3570\*    | 3695\*    | TBD        | TBD       | TBD     | TBD     | 6539\*   | 6512\*   |
+| i7-4770 @ 3.4 GHz DDR1600 OPENMP 1 thread | 1779      | 1727\*    | 3419\*    | 3788\*    | TBD        | TBD       | TBD     | TBD     | 4589\*   | 5871\*   |
+| i7-4770 @ 3.4 GHz DDR1600 OPENMP 2 thread | 3367      | 3374\*    | 4784\*    | 6672\*    | TBD        | TBD       | TBD     | TBD     | 5120\*   | 7721\*   |
+| i7-4770 @ 3.4 GHz DDR1600 OPENMP 4 thread | 4834      | 6075\*    | 4906\*    | 8154\*    | TBD        | TBD       | TBD     | TBD     | 4839\*   | 6911\*   |
+| i7-4770 @ 3.4 GHz DDR1600 OPENMP 8 thread | 4696      | 6361\*    | 5227\*    | 7737\*    | TBD        | TBD       | TBD     | TBD     | 4813\*   | 7189\*   |
+| i7-4870HQ @ 2.5 GHz                       | 1471      | 3066      | 6721      | 3886      | 6701       | 5098      | 7015    | 5281    | 8328     | 7063     |
+| i5-4590S @ 3.0 GHz                        | 1721      | 1643\*    | 3255\*    | 3404\*    | TBD        | TBD       | TBD     | TBD     | 4124\*   | 5403\*   |
+| Xeon X5570 @ 2.93 GHz                     | 1097      | 1048\*    | 2077\*    | 2215\*    | TBD        | TBD       | -       | -       | -        | -        |
+| Pentium4 @ 3.4 GHz                        | 528       | 448\*     | -         | -         | -          | -         | -       | -       | -        | -        |
+| Atom N270                                 | 112       | 125\*     | 331\*     | 368\*     | -          | -         | -       | -       | -        | -        |
+| AMD E-450                                 | 370       | 332\*     | 405\*     | 366\*     | -          | -         | -       | -       | -        | -        |
+| Intel Edison @ 500 MHz                    | 79        | 92\*      | 152\*     | 172\*     | TBD        | TBD       | -       | -       | -        | -        |
+| Intel Edison @ 500 MHz OPENMP 2 thread    | 158       | 184\*     | 300\*     | 343\*     | TBD        | TBD       | -       | -       | -        | -        |
 
 ARM processors
 
diff --git a/lib/arch/avx/codec.c b/lib/arch/avx/codec.c
@@ -10,10 +10,9 @@
 
 #include "../sse2/compare_macros.h"
 
-#include "../ssse3/_mm_bswap_epi32.c"
 #include "../ssse3/dec_reshuffle.c"
 #include "../ssse3/enc_translate.c"
-#include "../sse41/enc_reshuffle.c"
+#include "../ssse3/enc_reshuffle.c"
 
 #endif	// __AVX__
 
diff --git a/lib/arch/avx2/codec.c b/lib/arch/avx2/codec.c
@@ -13,57 +13,85 @@
 #define REPLACE(s,n)	_mm256_and_si256((s), _mm256_set1_epi8(n))
 #define RANGE(s,a,b)	_mm256_andnot_si256(CMPGT((s), (b)), CMPGT((s), (a) - 1))
 
-static inline __m256i
-_mm256_bswap_epi32 (const __m256i in)
-{
-	// _mm256_shuffle_epi8() works on two 128-bit lanes separately:
-	return _mm256_shuffle_epi8(in, _mm256_setr_epi8(
-		 3,  2,  1,  0,
-		 7,  6,  5,  4,
-		11, 10,  9,  8,
-		15, 14, 13, 12,
-		 3,  2,  1,  0,
-		 7,  6,  5,  4,
-		11, 10,  9,  8,
-		15, 14, 13, 12));
-}
-
-static inline __m256i
-enc_reshuffle (__m256i in)
-{
-	// Spread out 32-bit words over both halves of the input register:
-	in = _mm256_permutevar8x32_epi32(in, _mm256_setr_epi32(
-		0, 1, 2, -1,
-		3, 4, 5, -1));
-
-	// Slice into 32-bit chunks and operate on all chunks in parallel.
-	// All processing is done within the 32-bit chunk. First, shuffle:
-	// before: [eeeeeeff|ccdddddd|bbbbcccc|aaaaaabb]
-	// after:  [00000000|aaaaaabb|bbbbcccc|ccdddddd]
-	in = _mm256_shuffle_epi8(in, _mm256_set_epi8(
-		-1, 9, 10, 11,
-		-1, 6,  7,  8,
-		-1, 3,  4,  5,
-		-1, 0,  1,  2,
-		-1, 9, 10, 11,
-		-1, 6,  7,  8,
-		-1, 3,  4,  5,
-		-1, 0,  1,  2));
-
-	// merged  = [0000aaaa|aabbbbbb|bbbbcccc|ccdddddd]
-	const __m256i merged = _mm256_blend_epi16(_mm256_slli_epi32(in, 4), in, 0x55);
-
-	// bd      = [00000000|00bbbbbb|00000000|00dddddd]
-	const __m256i bd = _mm256_and_si256(merged, _mm256_set1_epi32(0x003F003F));
-
-	// ac      = [00aaaaaa|00000000|00cccccc|00000000]
-	const __m256i ac = _mm256_and_si256(_mm256_slli_epi32(merged, 2), _mm256_set1_epi32(0x3F003F00));
-
-	// indices = [00aaaaaa|00bbbbbb|00cccccc|00dddddd]
-	const __m256i indices = _mm256_or_si256(ac, bd);
-
-	// return  = [00dddddd|00cccccc|00bbbbbb|00aaaaaa]
-	return _mm256_bswap_epi32(indices);
+static inline __m256i enc_reshuffle(const __m256i input) {
+	// translation from SSSE3 into AVX2 of procedure
+	// This one works with shifted (4 bytes) input in order to
+	// be able to work efficiently in the 2 128-bit lanes
+
+	// input, bytes MSB to LSB:
+	// 0 0 0 0 x w v u t s r q p o n m
+	// l k j i h g f e d c b a 0 0 0 0
+
+	const __m256i in = _mm256_shuffle_epi8(input, _mm256_set_epi8(
+		10, 11,  9, 10,
+		 7,  8,  6,  7,
+		 4,  5,  3,  4,
+		 1,  2,  0,  1,
+
+		14, 15, 13, 14,
+		11, 12, 10, 11,
+		 8,  9,  7,  8,
+		 5,  6,  4,  5));
+	// in, bytes MSB to LSB:
+	// w x v w
+	// t u s t
+	// q r p q
+	// n o m n
+	// k l j k
+	// h i g h
+	// e f d e
+	// b c a b
+
+	const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00));
+	// bits, upper case are most significant bits, lower case are least significant bits.
+	// 0000wwww XX000000 VVVVVV00 00000000
+	// 0000tttt UU000000 SSSSSS00 00000000
+	// 0000qqqq RR000000 PPPPPP00 00000000
+	// 0000nnnn OO000000 MMMMMM00 00000000
+	// 0000kkkk LL000000 JJJJJJ00 00000000
+	// 0000hhhh II000000 GGGGGG00 00000000
+	// 0000eeee FF000000 DDDDDD00 00000000
+	// 0000bbbb CC000000 AAAAAA00 00000000
+
+	const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
+	// 00000000 00wwwwXX 00000000 00VVVVVV
+	// 00000000 00ttttUU 00000000 00SSSSSS
+	// 00000000 00qqqqRR 00000000 00PPPPPP
+	// 00000000 00nnnnOO 00000000 00MMMMMM
+	// 00000000 00kkkkLL 00000000 00JJJJJJ
+	// 00000000 00hhhhII 00000000 00GGGGGG
+	// 00000000 00eeeeFF 00000000 00DDDDDD
+	// 00000000 00bbbbCC 00000000 00AAAAAA
+
+	const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0));
+	// 00000000 00xxxxxx 000000vv WWWW0000
+	// 00000000 00uuuuuu 000000ss TTTT0000
+	// 00000000 00rrrrrr 000000pp QQQQ0000
+	// 00000000 00oooooo 000000mm NNNN0000
+	// 00000000 00llllll 000000jj KKKK0000
+	// 00000000 00iiiiii 000000gg HHHH0000
+	// 00000000 00ffffff 000000dd EEEE0000
+	// 00000000 00cccccc 000000aa BBBB0000
+
+	const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
+	// 00xxxxxx 00000000 00vvWWWW 00000000
+	// 00uuuuuu 00000000 00ssTTTT 00000000
+	// 00rrrrrr 00000000 00ppQQQQ 00000000
+	// 00oooooo 00000000 00mmNNNN 00000000
+	// 00llllll 00000000 00jjKKKK 00000000
+	// 00iiiiii 00000000 00ggHHHH 00000000
+	// 00ffffff 00000000 00ddEEEE 00000000
+	// 00cccccc 00000000 00aaBBBB 00000000
+
+	return _mm256_or_si256(t1, t3);
+	// 00xxxxxx 00wwwwXX 00vvWWWW 00VVVVVV
+	// 00uuuuuu 00ttttUU 00ssTTTT 00SSSSSS
+	// 00rrrrrr 00qqqqRR 00ppQQQQ 00PPPPPP
+	// 00oooooo 00nnnnOO 00mmNNNN 00MMMMMM
+	// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
+	// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
+	// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
+	// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
 }
 
 static inline __m256i
diff --git a/lib/arch/avx2/enc_loop.c b/lib/arch/avx2/enc_loop.c
@@ -1,22 +1,28 @@
 // If we have AVX2 support, pick off 24 bytes at a time for as long as we can.
 // But because we read 32 bytes at a time, ensure we have enough room to do a
 // full 32-byte read without segfaulting:
-while (srclen >= 32)
-{
-	// Load string:
-	__m256i str = _mm256_loadu_si256((__m256i *)c);
 
-	// Reshuffle:
-	str = enc_reshuffle(str);
+if (srclen >= 32) {
+	const uint8_t* const o_orig = o;
 
-	// Translate reshuffled bytes to the Base64 alphabet:
-	str = enc_translate(str);
+	// first load is done at c-0 not to get a segfault
+	__m256i inputvector = _mm256_loadu_si256((__m256i *)(c - 0));
 
-	// Store:
-	_mm256_storeu_si256((__m256i *)o, str);
+	// shift by 4 bytes, as required by enc_reshuffle
+	inputvector = _mm256_permutevar8x32_epi32(inputvector, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));
 
-	c += 24;	// 6 * 4 bytes of input
-	o += 32;	// 8 * 4 bytes of output
-	outl += 32;
-	srclen -= 24;
+	for (;;) {
+		inputvector = enc_reshuffle(inputvector);
+		inputvector = enc_translate(inputvector);
+		_mm256_storeu_si256((__m256i *)o, inputvector);
+		c += 24;
+		o += 32;
+		srclen -= 24;
+		if(srclen < 28) {
+			break;
+		}
+		// Load at c-4, as required by enc_reshuffle
+		inputvector = _mm256_loadu_si256((__m256i *)(c - 4));
+	}
+	outl += (size_t)(o - o_orig);
 }
diff --git a/lib/arch/sse41/codec.c b/lib/arch/sse41/codec.c
@@ -10,10 +10,9 @@
 
 #include "../sse2/compare_macros.h"
 
-#include "../ssse3/_mm_bswap_epi32.c"
 #include "../ssse3/dec_reshuffle.c"
 #include "../ssse3/enc_translate.c"
-#include "enc_reshuffle.c"
+#include "../ssse3/enc_reshuffle.c"
 
 #endif	// __SSE4_1__
 
diff --git a/lib/arch/sse42/codec.c b/lib/arch/sse42/codec.c
@@ -10,10 +10,9 @@
 
 #include "../sse2/compare_macros.h"
 
-#include "../ssse3/_mm_bswap_epi32.c"
 #include "../ssse3/dec_reshuffle.c"
 #include "../ssse3/enc_translate.c"
-#include "../sse41/enc_reshuffle.c"
+#include "../ssse3/enc_reshuffle.c"
 
 #endif	// __SSE4_2__
 
diff --git a/lib/arch/ssse3/_mm_bswap_epi32.c b/lib/arch/ssse3/_mm_bswap_epi32.c
diff --git a/lib/arch/ssse3/codec.c b/lib/arch/ssse3/codec.c
@@ -10,45 +10,10 @@
 
 #include "../sse2/compare_macros.h"
 
-#include "_mm_bswap_epi32.c"
 #include "dec_reshuffle.c"
+#include "enc_reshuffle.c"
 #include "enc_translate.c"
 
-static inline __m128i
-enc_reshuffle (__m128i in)
-{
-	// Slice into 32-bit chunks and operate on all chunks in parallel.
-	// All processing is done within the 32-bit chunk. First, shuffle:
-	// before: [eeeeeeff|ccdddddd|bbbbcccc|aaaaaabb]
-	// after:  [00000000|aaaaaabb|bbbbcccc|ccdddddd]
-	in = _mm_shuffle_epi8(in, _mm_set_epi8(
-		-1, 9, 10, 11,
-		-1, 6,  7,  8,
-		-1, 3,  4,  5,
-		-1, 0,  1,  2));
-
-	// cd      = [00000000|00000000|0000cccc|ccdddddd]
-	const __m128i cd = _mm_and_si128(in, _mm_set1_epi32(0x00000FFF));
-
-	// ab      = [0000aaaa|aabbbbbb|00000000|00000000]
-	const __m128i ab = _mm_and_si128(_mm_slli_epi32(in, 4), _mm_set1_epi32(0x0FFF0000));
-
-	// merged  = [0000aaaa|aabbbbbb|0000cccc|ccdddddd]
-	const __m128i merged = _mm_or_si128(ab, cd);
-
-	// bd      = [00000000|00bbbbbb|00000000|00dddddd]
-	const __m128i bd = _mm_and_si128(merged, _mm_set1_epi32(0x003F003F));
-
-	// ac      = [00aaaaaa|00000000|00cccccc|00000000]
-	const __m128i ac = _mm_and_si128(_mm_slli_epi32(merged, 2), _mm_set1_epi32(0x3F003F00));
-
-	// indices = [00aaaaaa|00bbbbbb|00cccccc|00dddddd]
-	const __m128i indices = _mm_or_si128(ac, bd);
-
-	// return  = [00dddddd|00cccccc|00bbbbbb|00aaaaaa]
-	return _mm_bswap_epi32(indices);
-}
-
 #endif	// __SSSE3__
 
 BASE64_ENC_FUNCTION(ssse3)
diff --git a/lib/arch/ssse3/enc_reshuffle.c b/lib/arch/ssse3/enc_reshuffle.c
@@ -0,0 +1,48 @@
+static inline __m128i
+enc_reshuffle (__m128i in)
+{
+	// input, bytes MSB to LSB:
+	// 0 0 0 0 l k j i h g f e d c b a
+
+	in = _mm_shuffle_epi8(in, _mm_set_epi8(
+		10, 11,  9, 10,
+		 7,  8,  6,  7,
+		 4,  5,  3,  4,
+		 1,  2,  0,  1));
+	// in, bytes MSB to LSB:
+	// k l j k
+	// h i g h
+	// e f d e
+	// b c a b
+
+	const __m128i t0 = _mm_and_si128(in, _mm_set1_epi32(0x0fc0fc00));
+	// bits, upper case are most significant bits, lower case are least significant bits
+	// 0000kkkk LL000000 JJJJJJ00 00000000
+	// 0000hhhh II000000 GGGGGG00 00000000
+	// 0000eeee FF000000 DDDDDD00 00000000
+	// 0000bbbb CC000000 AAAAAA00 00000000
+
+	const __m128i t1 = _mm_mulhi_epu16(t0, _mm_set1_epi32(0x04000040));
+	// 00000000 00kkkkLL 00000000 00JJJJJJ
+	// 00000000 00hhhhII 00000000 00GGGGGG
+	// 00000000 00eeeeFF 00000000 00DDDDDD
+	// 00000000 00bbbbCC 00000000 00AAAAAA
+
+	const __m128i t2 = _mm_and_si128(in, _mm_set1_epi32(0x003f03f0));
+	// 00000000 00llllll 000000jj KKKK0000
+	// 00000000 00iiiiii 000000gg HHHH0000
+	// 00000000 00ffffff 000000dd EEEE0000
+	// 00000000 00cccccc 000000aa BBBB0000
+
+	const __m128i t3 = _mm_mullo_epi16(t2, _mm_set1_epi32(0x01000010));
+	// 00llllll 00000000 00jjKKKK 00000000
+	// 00iiiiii 00000000 00ggHHHH 00000000
+	// 00ffffff 00000000 00ddEEEE 00000000
+	// 00cccccc 00000000 00aaBBBB 00000000
+
+	return _mm_or_si128(t1, t3);
+	// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
+	// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
+	// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
+	// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
+}