Skip to content

Commit 0a69845

Browse files
committed
SSSE3->AVX2, NEON32 decoding optimization
Use Wojciech Mula (@WojciechMula) and @aqrit implementation update for AVX2 / SSSE3 / NEON32 decoding. SSSE3 implementation is reused in SSE4.1, SSE4.2 and AVX dispatched decoding loops. SSE4.2 implementation is now useless but kept to ease integration of future updates if needed. Speed-up on i7-4870HQ @ 2.5 GHz (clang-800.0.42.1, x86_64) SSSE3 decoding: +79% SSE4.2 decoding: +37% AVX decoding: +57% AVX2 decoding: +64% Speed-up on Apple iPhone SE (clang-800.0.42.1, armv7) NEON32 decoding: +66%
1 parent b6417f3 commit 0a69845

File tree

9 files changed

+257
-236
lines changed

9 files changed

+257
-236
lines changed

README.md

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -423,32 +423,32 @@ The tables below contain some results on random machines. All numbers measured w
423423

424424
x86 processors
425425

426-
| Processor | Plain enc | Plain dec | SSSE3 enc | SSSE3 dec | SSE4.2 enc | SSE4.2 dec| AVX enc | AVX dec | AVX2 enc | AVX2 dec |
427-
|-------------------------------------------|----------:|----------:|----------:|----------:|-----------:|----------:|--------:|--------:|---------:|---------:|
428-
| i7-4771 @ 3.5 GHz | 833 | 1111\* | 3333\* | 4444\* | TBD | TBD | TBD | TBD | 4999\* | 6666\* |
429-
| i7-4770 @ 3.4 GHz DDR1600 | 1790 | 3038 | 4899 | 4043 | 4938 | 4939 | 4796 | 5709 | 4681 | 6386 |
430-
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 1 thread | 1784 | 3041 | 4945 | 4035 | 4954 | 4941 | 4776 | 5719 | 4661 | 6294 |
431-
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 2 thread | 3401 | 5729 | 5489 | 7444 | 5030 | 8531 | 5003 | 8624 | 5105 | 8558 |
432-
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 4 thread | 4884 | 7099 | 4917 | 7057 | 4915 | 7541 | 4799 | 7143 | 4902 | 7219 |
433-
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 8 thread | 5212 | 8849 | 5284 | 9099 | 5245 | 9160 | 5289 | 9220 | 4849 | 9200 |
434-
| i7-4870HQ @ 2.5 GHz | 1471 | 3066 | 6721 | 3886 | 6701 | 5098 | 7015 | 5281 | 8328 | 7063 |
435-
| i5-4590S @ 3.0 GHz | 1721 | 1643\* | 3255\* | 3404\* | TBD | TBD | TBD | TBD | 4124\* | 5403\* |
436-
| Xeon X5570 @ 2.93 GHz | 1097 | 1048\* | 2077\* | 2215\* | TBD | TBD | - | - | - | - |
437-
| Pentium4 @ 3.4 GHz | 528 | 448\* | - | - | - | - | - | - | - | - |
438-
| Atom N270 | 112 | 125\* | 331\* | 368\* | - | - | - | - | - | - |
439-
| AMD E-450 | 370 | 332\* | 405\* | 366\* | - | - | - | - | - | - |
440-
| Intel Edison @ 500 MHz | 79 | 92\* | 152\* | 172\* | TBD | TBD | - | - | - | - |
441-
| Intel Edison @ 500 MHz OPENMP 2 thread | 158 | 184\* | 300\* | 343\* | TBD | TBD | - | - | - | - |
442-
| Intel Edison @ 500 MHz (x86-64) | 97 | 146 | 197 | 207 | 197 | 145 | - | - | - | - |
443-
| Intel Edison @ 500 MHz (x86-64) 2 thread | 193 | 288 | 389 | 410 | 389 | 289 | - | - | - | - |
426+
| Processor | Plain enc | Plain dec | SSSE3 enc | SSSE3 dec | AVX enc | AVX dec | AVX2 enc | AVX2 dec |
427+
|-------------------------------------------|----------:|----------:|----------:|----------:|--------:|--------:|---------:|---------:|
428+
| i7-4771 @ 3.5 GHz | 833 | 1111\* | 3333\* | 4444\* | TBD | TBD | 4999\* | 6666\* |
429+
| i7-4770 @ 3.4 GHz DDR1600 | 1790 | 3038 | 4899 | 4043\* | 4796 | 5709\* | 4681 | 6386\* |
430+
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 1 thread | 1784 | 3041 | 4945 | 4035\* | 4776 | 5719\* | 4661 | 6294\* |
431+
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 2 thread | 3401 | 5729 | 5489 | 7444\* | 5003 | 8624\* | 5105 | 8558\* |
432+
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 4 thread | 4884 | 7099 | 4917 | 7057\* | 4799 | 7143\* | 4902 | 7219\* |
433+
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 8 thread | 5212 | 8849 | 5284 | 9099\* | 5289 | 9220\* | 4849 | 9200\* |
434+
| i7-4870HQ @ 2.5 GHz | 1471 | 3066 | 6721 | 6962 | 7015 | 8267 | 8328 | 11576 |
435+
| i5-4590S @ 3.0 GHz | 1721 | 1643\* | 3255\* | 3404\* | TBD | TBD | 4124\* | 5403\* |
436+
| Xeon X5570 @ 2.93 GHz | 1097 | 1048\* | 2077\* | 2215\* | - | - | - | - |
437+
| Pentium4 @ 3.4 GHz | 528 | 448\* | - | - | - | - | - | - |
438+
| Atom N270 | 112 | 125\* | 331\* | 368\* | - | - | - | - |
439+
| AMD E-450 | 370 | 332\* | 405\* | 366\* | - | - | - | - |
440+
| Intel Edison @ 500 MHz | 79 | 92\* | 152\* | 172\* | - | - | - | - |
441+
| Intel Edison @ 500 MHz OPENMP 2 thread | 158 | 184\* | 300\* | 343\* | - | - | - | - |
442+
| Intel Edison @ 500 MHz (x86-64) | 97 | 146 | 197 | 207\* | - | - | - | - |
443+
| Intel Edison @ 500 MHz (x86-64) 2 thread | 193 | 288 | 389 | 410\* | - | - | - | - |
444444

445445
ARM processors
446446

447447
| Processor | Plain enc | Plain dec | NEON32 enc | NEON32 dec | NEON64 enc | NEON64 dec |
448448
|-------------------------------------------|----------:|----------:|-----------:|-----------:|-----------:|-----------:|
449449
| Raspberry PI B+ V1.2 | 46 | 40\* | - | - | - | - |
450450
| Raspberry PI 2 B V1.1 | 104 | 88\* | 188 | 116\* | - | - |
451-
| Apple iPhone SE armv7 | 1056 | 895 | 2943 | 1573 | - | - |
451+
| Apple iPhone SE armv7 | 1056 | 895 | 2943 | 2618 | - | - |
452452
| Apple iPhone SE arm64 | 1061 | 1239 | - | - | 4098 | 3983 |
453453

454454
PowerPC processors

lib/arch/avx/codec.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ BASE64_DEC_FUNCTION(avx)
3131
{
3232
#ifdef __AVX__
3333
#include "../generic/dec_head.c"
34-
#include "../sse42/dec_loop.c"
34+
#include "../ssse3/dec_loop.c"
3535
#include "../generic/dec_tail.c"
3636
#else
3737
BASE64_DEC_STUB

lib/arch/avx2/codec.c

Lines changed: 25 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -127,35 +127,35 @@ enc_translate (const __m256i in)
127127
static inline __m256i
128128
dec_reshuffle (__m256i in)
129129
{
130-
// Mask in a single byte per shift:
131-
const __m256i maskB2 = _mm256_set1_epi32(0x003F0000);
132-
const __m256i maskB1 = _mm256_set1_epi32(0x00003F00);
133-
134-
// Pack bytes together:
135-
__m256i out = _mm256_srli_epi32(in, 16);
136-
137-
out = _mm256_or_si256(out, _mm256_srli_epi32(_mm256_and_si256(in, maskB2), 2));
130+
// in, lower lane, bits, upper case are most significant bits, lower case are least significant bits:
131+
// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
132+
// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
133+
// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
134+
// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
138135

139-
out = _mm256_or_si256(out, _mm256_slli_epi32(_mm256_and_si256(in, maskB1), 12));
136+
const __m256i merge_ab_and_bc = _mm256_maddubs_epi16(in, _mm256_set1_epi32(0x01400140));
137+
// 0000kkkk LLllllll 0000JJJJ JJjjKKKK
138+
// 0000hhhh IIiiiiii 0000GGGG GGggHHHH
139+
// 0000eeee FFffffff 0000DDDD DDddEEEE
140+
// 0000bbbb CCcccccc 0000AAAA AAaaBBBB
140141

141-
out = _mm256_or_si256(out, _mm256_slli_epi32(in, 26));
142+
__m256i out = _mm256_madd_epi16(merge_ab_and_bc, _mm256_set1_epi32(0x00011000));
143+
// 00000000 JJJJJJjj KKKKkkkk LLllllll
144+
// 00000000 GGGGGGgg HHHHhhhh IIiiiiii
145+
// 00000000 DDDDDDdd EEEEeeee FFffffff
146+
// 00000000 AAAAAAaa BBBBbbbb CCcccccc
142147

143-
// Pack bytes together within 32-bit words, discarding words 3 and 7:
148+
// Pack bytes together in each lane:
144149
out = _mm256_shuffle_epi8(out, _mm256_setr_epi8(
145-
3, 2, 1,
146-
7, 6, 5,
147-
11, 10, 9,
148-
15, 14, 13,
149-
-1, -1, -1, -1,
150-
3, 2, 1,
151-
7, 6, 5,
152-
11, 10, 9,
153-
15, 14, 13,
154-
-1, -1, -1, -1));
155-
156-
// Pack 32-bit words together, squashing empty words 3 and 7:
157-
return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32(
158-
0, 1, 2, 4, 5, 6, -1, -1));
150+
2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1,
151+
2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1));
152+
// 00000000 00000000 00000000 00000000
153+
// LLllllll KKKKkkkk JJJJJJjj IIiiiiii
154+
// HHHHhhhh GGGGGGgg FFffffff EEEEeeee
155+
// DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa
156+
157+
// Pack lanes
158+
return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32(0, 1, 2, 4, 5, 6, -1, -1));
159159
}
160160

161161
#endif // __AVX2__

lib/arch/avx2/dec_loop.c

Lines changed: 32 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -8,38 +8,42 @@ while (srclen >= 45)
88
// Load string:
99
__m256i str = _mm256_loadu_si256((__m256i *)c);
1010

11-
// The input consists of six character sets in the Base64 alphabet,
12-
// which we need to map back to the 6-bit values they represent.
13-
// There are three ranges, two singles, and then there's the rest.
14-
//
15-
// # From To Add Characters
16-
// 1 [43] [62] +19 +
17-
// 2 [47] [63] +16 /
18-
// 3 [48..57] [52..61] +4 0..9
19-
// 4 [65..90] [0..25] -65 A..Z
20-
// 5 [97..122] [26..51] -71 a..z
21-
// (6) Everything else => invalid input
22-
23-
const __m256i set1 = CMPEQ(str, '+');
24-
const __m256i set2 = CMPEQ(str, '/');
25-
const __m256i set3 = RANGE(str, '0', '9');
26-
const __m256i set4 = RANGE(str, 'A', 'Z');
27-
const __m256i set5 = RANGE(str, 'a', 'z');
28-
29-
__m256i delta = REPLACE(set1, 19);
30-
delta = _mm256_or_si256(delta, REPLACE(set2, 16));
31-
delta = _mm256_or_si256(delta, REPLACE(set3, 4));
32-
delta = _mm256_or_si256(delta, REPLACE(set4, -65));
33-
delta = _mm256_or_si256(delta, REPLACE(set5, -71));
34-
35-
// Check for invalid input: if any of the delta values are zero,
36-
// fall back on bytewise code to do error checking and reporting:
37-
if (_mm256_movemask_epi8(CMPEQ(delta, 0))) {
11+
// see ssse3/dec_loop.c for an explanation of how the code works.
12+
13+
const __m256i lut_lo = _mm256_setr_epi8(
14+
0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
15+
0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
16+
0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
17+
0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);
18+
19+
const __m256i lut_hi = _mm256_setr_epi8(
20+
0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
21+
0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
22+
0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
23+
0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);
24+
25+
const __m256i lut_roll = _mm256_setr_epi8(
26+
0, 16, 19, 4, -65, -65, -71, -71,
27+
0, 0, 0, 0, 0, 0, 0, 0,
28+
0, 16, 19, 4, -65, -65, -71, -71,
29+
0, 0, 0, 0, 0, 0, 0, 0);
30+
31+
const __m256i mask_2F = _mm256_set1_epi8(0x2f);
32+
33+
// lookup
34+
const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F);
35+
const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F);
36+
const __m256i hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
37+
const __m256i lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles);
38+
const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F);
39+
const __m256i roll = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles));
40+
41+
if (!_mm256_testz_si256(lo, hi)) {
3842
break;
3943
}
4044

4145
// Now simply add the delta values to the input:
42-
str = _mm256_add_epi8(str, delta);
46+
str = _mm256_add_epi8(str, roll);
4347

4448
// Reshuffle the input to packed 12-byte output format:
4549
str = dec_reshuffle(str);

lib/arch/neon32/dec_loop.c

Lines changed: 75 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -3,47 +3,87 @@
33
// don't need to check if we have enough remaining input to cover them:
44
while (srclen >= 64)
55
{
6-
uint8x16x4_t set1, set2, set3, set4, set5, delta;
76
uint8x16x3_t dec;
87

98
// Load 64 bytes and deinterleave:
109
uint8x16x4_t str = vld4q_u8((uint8_t *)c);
1110

12-
// The input consists of six character sets in the Base64 alphabet,
13-
// which we need to map back to the 6-bit values they represent.
14-
// There are three ranges, two singles, and then there's the rest.
15-
//
16-
// # From To Add Characters
17-
// 1 [43] [62] +19 +
18-
// 2 [47] [63] +16 /
19-
// 3 [48..57] [52..61] +4 0..9
20-
// 4 [65..90] [0..25] -65 A..Z
21-
// 5 [97..122] [26..51] -71 a..z
22-
// (6) Everything else => invalid input
23-
24-
// Benchmarking on the Raspberry Pi 2B and Clang shows that looping
25-
// generates slightly faster code than explicit unrolling:
26-
for (int i = 0; i < 4; i++) {
27-
set1.val[i] = CMPEQ(str.val[i], '+');
28-
set2.val[i] = CMPEQ(str.val[i], '/');
29-
set3.val[i] = RANGE(str.val[i], '0', '9');
30-
set4.val[i] = RANGE(str.val[i], 'A', 'Z');
31-
set5.val[i] = RANGE(str.val[i], 'a', 'z');
32-
33-
delta.val[i] = REPLACE(set1.val[i], 19);
34-
delta.val[i] = vbslq_u8(set2.val[i], vdupq_n_u8( 16), delta.val[i]);
35-
delta.val[i] = vbslq_u8(set3.val[i], vdupq_n_u8( 4), delta.val[i]);
36-
delta.val[i] = vbslq_u8(set4.val[i], vdupq_n_u8(-65), delta.val[i]);
37-
delta.val[i] = vbslq_u8(set5.val[i], vdupq_n_u8(-71), delta.val[i]);
11+
// see ssse3/dec_loop.c for an explanation of how the code works.
12+
13+
const uint8x16_t lut_lo = {
14+
0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
15+
0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A
16+
};
17+
const uint8x16_t lut_hi = {
18+
0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
19+
0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10
20+
};
21+
22+
const uint8x16_t lut_roll = {
23+
0, 16, 19, 4, (uint8_t)-65, (uint8_t)-65, (uint8_t)-71, (uint8_t)-71,
24+
0, 0, 0, 0, 0, 0, 0, 0
25+
};
26+
27+
const uint8x16_t mask_F = vdupq_n_u8(0xf);
28+
const uint8x16_t mask_2F = vdupq_n_u8(0x2f);
29+
30+
uint8x16_t classified;
31+
32+
{
33+
const uint8x16_t hi_nibbles = vshrq_n_u8(str.val[0], 4);
34+
const uint8x16_t lo_nibbles = vandq_u8(str.val[0], mask_F);
35+
const uint8x16_t eq_2F = vceqq_u8(str.val[0], mask_2F);
36+
37+
const uint8x16_t hi = vqtbl1q_u8(lut_hi, hi_nibbles);
38+
const uint8x16_t lo = vqtbl1q_u8(lut_lo, lo_nibbles);
39+
40+
const uint8x16_t delta = vqtbl1q_u8(lut_roll, vaddq_u8(eq_2F, hi_nibbles));
41+
classified = vandq_u8(lo, hi);
42+
// Now simply add the delta values to the input:
43+
str.val[0] = vaddq_u8(str.val[0], delta);
44+
}
45+
{
46+
const uint8x16_t hi_nibbles = vshrq_n_u8(str.val[1], 4);
47+
const uint8x16_t lo_nibbles = vandq_u8(str.val[1], mask_F);
48+
const uint8x16_t eq_2F = vceqq_u8(str.val[1], mask_2F);
49+
50+
const uint8x16_t hi = vqtbl1q_u8(lut_hi, hi_nibbles);
51+
const uint8x16_t lo = vqtbl1q_u8(lut_lo, lo_nibbles);
52+
53+
const uint8x16_t delta = vqtbl1q_u8(lut_roll, vaddq_u8(eq_2F, hi_nibbles));
54+
classified = vorrq_u8(classified, vandq_u8(lo, hi));
55+
// Now simply add the delta values to the input:
56+
str.val[1] = vaddq_u8(str.val[1], delta);
57+
}
58+
{
59+
const uint8x16_t hi_nibbles = vshrq_n_u8(str.val[2], 4);
60+
const uint8x16_t lo_nibbles = vandq_u8(str.val[2], mask_F);
61+
const uint8x16_t eq_2F = vceqq_u8(str.val[2], mask_2F);
62+
63+
const uint8x16_t hi = vqtbl1q_u8(lut_hi, hi_nibbles);
64+
const uint8x16_t lo = vqtbl1q_u8(lut_lo, lo_nibbles);
65+
66+
const uint8x16_t delta = vqtbl1q_u8(lut_roll, vaddq_u8(eq_2F, hi_nibbles));
67+
classified = vorrq_u8(classified, vandq_u8(lo, hi));
68+
// Now simply add the delta values to the input:
69+
str.val[2] = vaddq_u8(str.val[2], delta);
70+
}
71+
{
72+
const uint8x16_t hi_nibbles = vshrq_n_u8(str.val[3], 4);
73+
const uint8x16_t lo_nibbles = vandq_u8(str.val[3], mask_F);
74+
const uint8x16_t eq_2F = vceqq_u8(str.val[3], mask_2F);
75+
76+
const uint8x16_t hi = vqtbl1q_u8(lut_hi, hi_nibbles);
77+
const uint8x16_t lo = vqtbl1q_u8(lut_lo, lo_nibbles);
78+
79+
const uint8x16_t delta = vqtbl1q_u8(lut_roll, vaddq_u8(eq_2F, hi_nibbles));
80+
classified = vorrq_u8(classified, vandq_u8(lo, hi));
81+
// Now simply add the delta values to the input:
82+
str.val[3] = vaddq_u8(str.val[3], delta);
3883
}
3984

4085
// Check for invalid input: if any of the delta values are zero,
4186
// fall back on bytewise code to do error checking and reporting:
42-
uint8x16_t classified = CMPEQ(delta.val[0], 0);
43-
classified = vorrq_u8(classified, CMPEQ(delta.val[1], 0));
44-
classified = vorrq_u8(classified, CMPEQ(delta.val[2], 0));
45-
classified = vorrq_u8(classified, CMPEQ(delta.val[3], 0));
46-
4787
// Extract both 32-bit halves; check that all bits are zero:
4888
if (vgetq_lane_u32((uint32x4_t)classified, 0) != 0
4989
|| vgetq_lane_u32((uint32x4_t)classified, 1) != 0
@@ -52,16 +92,10 @@ while (srclen >= 64)
5292
break;
5393
}
5494

55-
// Now simply add the delta values to the input:
56-
str.val[0] = vaddq_u8(str.val[0], delta.val[0]);
57-
str.val[1] = vaddq_u8(str.val[1], delta.val[1]);
58-
str.val[2] = vaddq_u8(str.val[2], delta.val[2]);
59-
str.val[3] = vaddq_u8(str.val[3], delta.val[3]);
60-
6195
// Compress four bytes into three:
62-
dec.val[0] = vshlq_n_u8(str.val[0], 2) | vshrq_n_u8(str.val[1], 4);
63-
dec.val[1] = vshlq_n_u8(str.val[1], 4) | vshrq_n_u8(str.val[2], 2);
64-
dec.val[2] = vshlq_n_u8(str.val[2], 6) | str.val[3];
96+
dec.val[0] = vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4));
97+
dec.val[1] = vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2));
98+
dec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]);
6599

66100
// Interleave and store decoded result:
67101
vst3q_u8((uint8_t *)o, dec);

lib/arch/sse42/codec.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ BASE64_DEC_FUNCTION(sse42)
3131
{
3232
#ifdef __SSE4_2__
3333
#include "../generic/dec_head.c"
34-
#include "dec_loop.c"
34+
#include "../ssse3/dec_loop.c"
3535
#include "../generic/dec_tail.c"
3636
#else
3737
BASE64_DEC_STUB

0 commit comments

Comments
 (0)