Skip to content

Commit cfa8bf7

Browse files
committed
Plain decoding optimization
This now uses a modified implementation of Nick Galbreath's (@client9) algorithm. Modifications to the original algorithm include: - Unaligned access is now optional (a macro enables it). - Invalid characters are mapped to 0xffffffff instead of 0x01ffffff. This removes the need to shift the value returned by the tables by 8 on big-endian architectures with fast unaligned access, at the expense of a different invalid-character check depending on endianness. Speed-up using clang-800.0.42.1 on i7-4870HQ @ 2.5 GHz or iPhone SE: - x86_64: +97% - i386: +60% - arm64: +0% - armv7: +24%. As a side note, it seems that the iPhone SE processor (Apple A9) has fast unaligned access, which gives an even higher speed-up with the corresponding macro set to 1 (+23% / +56%). This may not be the case for some other ARM processors.
1 parent f700099 commit cfa8bf7

File tree

13 files changed

+670
-143
lines changed

13 files changed

+670
-143
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
*.o
22
bin/base64
33
lib/config.h
4+
lib/table_generator
45
test/benchmark
56
test/test_base64

LICENSE

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1-
Copyright (c) 2013-2015, Alfred Klomp
1+
Copyright (c) 2005-2007, Nick Galbreath
2+
Copyright (c) 2013-2017, Alfred Klomp
3+
Copyright (c) 2016-2017, Matthieu Darbois
24
All rights reserved.
35

46
Redistribution and use in source and binary forms, with or without

Makefile

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,13 @@ HAVE_SSSE3 = 0
2222
HAVE_SSE41 = 0
2323
HAVE_SSE42 = 0
2424
HAVE_AVX = 0
25+
HAVE_FAST_UNALIGNED_ACCESS = 0
2526

2627
# The user should supply compiler flags for the codecs they want to build.
2728
# Check which codecs we're going to include:
2829
ifdef AVX2_CFLAGS
2930
HAVE_AVX2 = 1
31+
HAVE_FAST_UNALIGNED_ACCESS = 1
3032
endif
3133
ifdef NEON32_CFLAGS
3234
HAVE_NEON32 = 1
@@ -36,15 +38,19 @@ ifdef NEON64_CFLAGS
3638
endif
3739
ifdef SSSE3_CFLAGS
3840
HAVE_SSSE3 = 1
41+
HAVE_FAST_UNALIGNED_ACCESS = 1
3942
endif
4043
ifdef SSE41_CFLAGS
4144
HAVE_SSE41 = 1
45+
HAVE_FAST_UNALIGNED_ACCESS = 1
4246
endif
4347
ifdef SSE42_CFLAGS
4448
HAVE_SSE42 = 1
49+
HAVE_FAST_UNALIGNED_ACCESS = 1
4550
endif
4651
ifdef AVX_CFLAGS
4752
HAVE_AVX = 1
53+
HAVE_FAST_UNALIGNED_ACCESS = 1
4854
endif
4955
ifdef OPENMP
5056
CFLAGS += -fopenmp
@@ -63,15 +69,22 @@ lib/libbase64.o: $(OBJS)
6369
$(OBJCOPY) --keep-global-symbols=lib/exports.txt $@
6470

6571
lib/config.h:
66-
@echo "#define HAVE_AVX2 $(HAVE_AVX2)" > $@
67-
@echo "#define HAVE_NEON32 $(HAVE_NEON32)" >> $@
68-
@echo "#define HAVE_NEON64 $(HAVE_NEON64)" >> $@
69-
@echo "#define HAVE_SSSE3 $(HAVE_SSSE3)" >> $@
70-
@echo "#define HAVE_SSE41 $(HAVE_SSE41)" >> $@
71-
@echo "#define HAVE_SSE42 $(HAVE_SSE42)" >> $@
72-
@echo "#define HAVE_AVX $(HAVE_AVX)" >> $@
72+
@echo "#define HAVE_AVX2 $(HAVE_AVX2)" > $@
73+
@echo "#define HAVE_NEON32 $(HAVE_NEON32)" >> $@
74+
@echo "#define HAVE_NEON64 $(HAVE_NEON64)" >> $@
75+
@echo "#define HAVE_SSSE3 $(HAVE_SSSE3)" >> $@
76+
@echo "#define HAVE_SSE41 $(HAVE_SSE41)" >> $@
77+
@echo "#define HAVE_SSE42 $(HAVE_SSE42)" >> $@
78+
@echo "#define HAVE_AVX $(HAVE_AVX)" >> $@
79+
@echo "#define HAVE_FAST_UNALIGNED_ACCESS $(HAVE_FAST_UNALIGNED_ACCESS)" >> $@
7380

74-
lib/codec_choose.o: lib/config.h
81+
lib/tables.h: lib/table_generator.c
82+
$(CC) $(CFLAGS) -o lib/table_generator $^
83+
./lib/table_generator > $@
84+
85+
$(OBJS): lib/config.h
86+
87+
lib/lib.o: lib/tables.h
7588

7689
lib/arch/avx2/codec.o: CFLAGS += $(AVX2_CFLAGS)
7790
lib/arch/neon32/codec.o: CFLAGS += $(NEON32_CFLAGS)
@@ -88,4 +101,4 @@ analyze: clean
88101
scan-build --use-analyzer=`which clang` --status-bugs make
89102

90103
clean:
91-
rm -f bin/base64 bin/base64.o lib/libbase64.o lib/config.h $(OBJS)
104+
rm -f bin/base64 bin/base64.o lib/libbase64.o lib/table_generator.o lib/table_generator lib/config.h $(OBJS)

README.md

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -425,35 +425,35 @@ x86 processors
425425

426426
| Processor | Plain enc | Plain dec | SSSE3 enc | SSSE3 dec | SSE4.1 enc | SSE4.1 dec| SSE4.2 enc | SSE4.2 dec| AVX enc | AVX dec | AVX2 enc | AVX2 dec |
427427
|-------------------------------------------|----------:|----------:|----------:|----------:|-----------:|----------:|-----------:|----------:|--------:|--------:|---------:|---------:|
428-
| i7-4771 @ 3.5 GHz | 833 | 1111 | 3333\* | 4444\* | TBD | TBD | TBD | TBD | TBD | TBD | 4999\* | 6666\* |
429-
| i7-4770 @ 3.4 GHz DDR1600 | 1831 | 1748 | 3570\* | 3695\* | TBD | TBD | TBD | TBD | TBD | TBD | 6539\* | 6512\* |
430-
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 1 thread | 1779 | 1727 | 3419\* | 3788\* | TBD | TBD | TBD | TBD | TBD | TBD | 4589\* | 5871\* |
431-
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 2 thread | 3367 | 3374 | 4784\* | 6672\* | TBD | TBD | TBD | TBD | TBD | TBD | 5120\* | 7721\* |
432-
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 4 thread | 4834 | 6075 | 4906\* | 8154\* | TBD | TBD | TBD | TBD | TBD | TBD | 4839\* | 6911\* |
433-
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 8 thread | 4696 | 6361 | 5227\* | 7737\* | TBD | TBD | TBD | TBD | TBD | TBD | 4813\* | 7189\* |
434-
| i7-4870HQ @ 2.5 GHz | 1471 | 1558 | 5599 | 3886 | 5882 | 3888 | 6202 | 5098 | 6524 | 5281 | 8113 | 7063 |
435-
| i5-4590S @ 3.0 GHz | 1721 | 1643 | 3255\* | 3404\* | TBD | TBD | TBD | TBD | TBD | TBD | 4124\* | 5403\* |
436-
| Xeon X5570 @ 2.93 GHz | 1097 | 1048 | 2077\* | 2215\* | TBD | TBD | TBD | TBD | - | - | - | - |
437-
| Pentium4 @ 3.4 GHz | 528 | 448 | - | - | - | - | - | - | - | - | - | - |
438-
| Atom N270 | 112 | 125 | 331\* | 368\* | - | - | - | - | - | - | - | - |
439-
| AMD E-450 | 370 | 332 | 405\* | 366\* | - | - | - | - | - | - | - | - |
440-
| Intel Edison @ 500 MHz | 79 | 92 | 152\* | 172\* | TBD | TBD | TBD | TBD | - | - | - | - |
441-
| Intel Edison @ 500 MHz OPENMP 2 thread | 158 | 184 | 300\* | 343\* | TBD | TBD | TBD | TBD | - | - | - | - |
428+
| i7-4771 @ 3.5 GHz | 833 | 1111\* | 3333\* | 4444\* | TBD | TBD | TBD | TBD | TBD | TBD | 4999\* | 6666\* |
429+
| i7-4770 @ 3.4 GHz DDR1600 | 1831 | 1748\* | 3570\* | 3695\* | TBD | TBD | TBD | TBD | TBD | TBD | 6539\* | 6512\* |
430+
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 1 thread | 1779 | 1727\* | 3419\* | 3788\* | TBD | TBD | TBD | TBD | TBD | TBD | 4589\* | 5871\* |
431+
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 2 thread | 3367 | 3374\* | 4784\* | 6672\* | TBD | TBD | TBD | TBD | TBD | TBD | 5120\* | 7721\* |
432+
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 4 thread | 4834 | 6075\* | 4906\* | 8154\* | TBD | TBD | TBD | TBD | TBD | TBD | 4839\* | 6911\* |
433+
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 8 thread | 4696 | 6361\* | 5227\* | 7737\* | TBD | TBD | TBD | TBD | TBD | TBD | 4813\* | 7189\* |
434+
| i7-4870HQ @ 2.5 GHz | 1471 | 3066 | 5599 | 3886 | 5882 | 3888 | 6202 | 5098 | 6524 | 5281 | 8113 | 7063 |
435+
| i5-4590S @ 3.0 GHz | 1721 | 1643\* | 3255\* | 3404\* | TBD | TBD | TBD | TBD | TBD | TBD | 4124\* | 5403\* |
436+
| Xeon X5570 @ 2.93 GHz | 1097 | 1048\* | 2077\* | 2215\* | TBD | TBD | TBD | TBD | - | - | - | - |
437+
| Pentium4 @ 3.4 GHz | 528 | 448\* | - | - | - | - | - | - | - | - | - | - |
438+
| Atom N270 | 112 | 125\* | 331\* | 368\* | - | - | - | - | - | - | - | - |
439+
| AMD E-450 | 370 | 332\* | 405\* | 366\* | - | - | - | - | - | - | - | - |
440+
| Intel Edison @ 500 MHz | 79 | 92\* | 152\* | 172\* | TBD | TBD | TBD | TBD | - | - | - | - |
441+
| Intel Edison @ 500 MHz OPENMP 2 thread | 158 | 184\* | 300\* | 343\* | TBD | TBD | TBD | TBD | - | - | - | - |
442442

443443
ARM processors
444444

445445
| Processor | Plain enc | Plain dec | NEON32 enc | NEON32 dec | NEON64 enc | NEON64 dec |
446446
|-------------------------------------------|----------:|----------:|-----------:|-----------:|-----------:|-----------:|
447-
| Raspberry PI B+ V1.2 | 46 | 40 | - | - | - | - |
448-
| Raspberry PI 2 B V1.1 | 104 | 88 | 188 | 116\* | - | - |
449-
| Apple iPhone SE armv7 | 1056 | 722 | 2943 | 1573 | - | - |
450-
| Apple iPhone SE arm64 | 1061 | 1237 | - | - | 4098 | 3983 |
447+
| Raspberry PI B+ V1.2 | 46 | 40\* | - | - | - | - |
448+
| Raspberry PI 2 B V1.1 | 104 | 88\* | 188 | 116\* | - | - |
449+
| Apple iPhone SE armv7 | 1056 | 895 | 2943 | 1573 | - | - |
450+
| Apple iPhone SE arm64 | 1061 | 1239 | - | - | 4098 | 3983 |
451451

452452
PowerPC processors
453453

454454
| Processor | Plain enc | Plain dec |
455455
|-------------------------------------------|----------:|----------:|
456-
| PowerPC E6500 @ 1.8GHz | 270 | 265 |
456+
| PowerPC E6500 @ 1.8GHz | 270 | 265\* |
457457

458458

459459
Benchmarks on i7-4770 @ 3.4 GHz DDR1600 with varying buffer sizes:

lib/arch/generic/32/dec_loop.c

Lines changed: 39 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,43 @@
1-
// If we have native uint32's, pick off 4 bytes at a time for as long as we
2-
// can, but make sure that we quit before seeing any == markers at the end of
3-
// the string. Also, because we write a zero at the end of the output, ensure
4-
// that there are at least 2 valid bytes of input data remaining to close the
5-
// gap. 4 + 2 + 2 = 8 bytes:
6-
while (srclen >= 8)
1+
// Read source 4 bytes at a time
2+
// Since we might be writing one byte more than needed,
3+
// we need to make sure there will still be some room
4+
// for one extra byte in o.
5+
// This will be the case if srclen > 0 when the loop
6+
// is exited
7+
while (srclen > 4)
78
{
8-
uint32_t str, res, dec;
9-
10-
// Load string:
11-
str = *(uint32_t *)c;
12-
13-
// Shuffle bytes to 32-bit bigendian:
14-
str = cpu_to_be32(str);
15-
16-
// Lookup each byte in the decoding table; if we encounter any
17-
// "invalid" values, fall back on the bytewise code:
18-
if ((dec = base64_table_dec[str >> 24]) > 63) {
19-
break;
20-
}
21-
res = dec << 26;
22-
23-
if ((dec = base64_table_dec[(str >> 16) & 0xFF]) > 63) {
24-
break;
25-
}
26-
res |= dec << 20;
27-
28-
if ((dec = base64_table_dec[(str >> 8) & 0xFF]) > 63) {
29-
break;
30-
}
31-
res |= dec << 14;
32-
33-
if ((dec = base64_table_dec[str & 0xFF]) > 63) {
34-
break;
35-
}
36-
res |= dec << 8;
37-
38-
// Reshuffle and repack into 3-byte output format:
39-
res = be32_to_cpu(res);
40-
41-
// Store back:
42-
*(uint32_t *)o = res;
9+
union {
10+
uint32_t asint;
11+
uint8_t aschar[4];
12+
} x;
13+
14+
x.asint = base64_table_dec_d0[c[0]]
15+
| base64_table_dec_d1[c[1]]
16+
| base64_table_dec_d2[c[2]]
17+
| base64_table_dec_d3[c[3]];
18+
19+
#if BASE64_LITTLE_ENDIAN
20+
// LUTs for little-endian set Most Significant Bit
21+
// in case of invalid character
22+
if (x.asint & 0x80000000U) break;
23+
#else
24+
// LUTs for big-endian set Least Significant Bit
25+
// in case of invalid character
26+
if (x.asint & 1U) break;
27+
#endif
28+
29+
#if HAVE_FAST_UNALIGNED_ACCESS
30+
// This might segfault or be too slow on
31+
// some architectures, do this only if specified
32+
// with HAVE_FAST_UNALIGNED_ACCESS macro
33+
// We write one byte more than needed
34+
*(uint32_t*)o = x.asint;
35+
#else
36+
// Fallback, write bytes one by one
37+
o[0] = x.aschar[0];
38+
o[1] = x.aschar[1];
39+
o[2] = x.aschar[2];
40+
#endif
4341

4442
c += 4;
4543
o += 3;

lib/arch/generic/64/dec_loop.c

Lines changed: 0 additions & 68 deletions
This file was deleted.

lib/arch/generic/codec.c

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,8 @@ BASE64_ENC_FUNCTION(plain)
1919
BASE64_DEC_FUNCTION(plain)
2020
{
2121
#include "dec_head.c"
22-
#if BASE64_WORDSIZE == 32
22+
#if BASE64_WORDSIZE >= 32
2323
#include "32/dec_loop.c"
24-
#elif BASE64_WORDSIZE == 64
25-
#include "64/dec_loop.c"
2624
#endif
2725
#include "dec_tail.c"
2826
}

lib/arch/neon64/codec.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ BASE64_DEC_FUNCTION(neon64)
110110

111111
#include "../generic/dec_head.c"
112112
#include "dec_loop.c"
113-
#include "../generic/64/dec_loop.c"
113+
#include "../generic/32/dec_loop.c"
114114
#include "../generic/dec_tail.c"
115115
#else
116116
BASE64_DEC_STUB

lib/codec_choose.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55

66
#include "../include/libbase64.h"
77
#include "codecs.h"
8-
#include "config.h"
98

109
#if __x86_64__ || __i386__ || _M_X86 || _M_X64
1110
#ifdef _MSC_VER

lib/codecs.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
#include "config.h"
2+
13
// Function parameters for encoding functions:
24
#define BASE64_ENC_PARAMS \
35
( struct base64_state *state \
@@ -115,3 +117,8 @@ void codec_choose (struct codec *, int flags);
115117
// for fallback plain encoding/decoding:
116118
extern const uint8_t base64_table_enc[];
117119
extern const uint8_t base64_table_dec[];
120+
121+
extern const uint32_t base64_table_dec_d0[];
122+
extern const uint32_t base64_table_dec_d1[];
123+
extern const uint32_t base64_table_dec_d2[];
124+
extern const uint32_t base64_table_dec_d3[];

lib/lib.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
#include "../include/libbase64.h"
88
#include "codecs.h"
9+
#include "tables.h"
910

1011
// These static function pointers are initialized once when the library is
1112
// first used, and remain in use for the remaining lifetime of the program.

0 commit comments

Comments
 (0)