3
3
// don't need to check if we have enough remaining input to cover them:
4
4
while (srclen >= 64 )
5
5
{
6
- uint8x16x4_t set1 , set2 , set3 , set4 , set5 , delta ;
7
6
uint8x16x3_t dec ;
8
7
9
8
// Load 64 bytes and deinterleave:
10
9
uint8x16x4_t str = vld4q_u8 ((uint8_t * )c );
11
10
12
- // The input consists of six character sets in the Base64 alphabet,
13
- // which we need to map back to the 6-bit values they represent.
14
- // There are three ranges, two singles, and then there's the rest.
15
- //
16
- // # From To Add Characters
17
- // 1 [43] [62] +19 +
18
- // 2 [47] [63] +16 /
19
- // 3 [48..57] [52..61] +4 0..9
20
- // 4 [65..90] [0..25] -65 A..Z
21
- // 5 [97..122] [26..51] -71 a..z
22
- // (6) Everything else => invalid input
23
-
24
- // Benchmarking on the Raspberry Pi 2B and Clang shows that looping
25
- // generates slightly faster code than explicit unrolling:
26
- for (int i = 0 ; i < 4 ; i ++ ) {
27
- set1 .val [i ] = CMPEQ (str .val [i ], '+' );
28
- set2 .val [i ] = CMPEQ (str .val [i ], '/' );
29
- set3 .val [i ] = RANGE (str .val [i ], '0' , '9' );
30
- set4 .val [i ] = RANGE (str .val [i ], 'A' , 'Z' );
31
- set5 .val [i ] = RANGE (str .val [i ], 'a' , 'z' );
32
-
33
- delta .val [i ] = REPLACE (set1 .val [i ], 19 );
34
- delta .val [i ] = vbslq_u8 (set2 .val [i ], vdupq_n_u8 ( 16 ), delta .val [i ]);
35
- delta .val [i ] = vbslq_u8 (set3 .val [i ], vdupq_n_u8 ( 4 ), delta .val [i ]);
36
- delta .val [i ] = vbslq_u8 (set4 .val [i ], vdupq_n_u8 (-65 ), delta .val [i ]);
37
- delta .val [i ] = vbslq_u8 (set5 .val [i ], vdupq_n_u8 (-71 ), delta .val [i ]);
11
+ // see ssse3/dec_loop.c for an explanation of how the code works.
12
+
13
+ const uint8x16_t lut_lo = {
14
+ 0x15 , 0x11 , 0x11 , 0x11 , 0x11 , 0x11 , 0x11 , 0x11 ,
15
+ 0x11 , 0x11 , 0x13 , 0x1A , 0x1B , 0x1B , 0x1B , 0x1A
16
+ };
17
+ const uint8x16_t lut_hi = {
18
+ 0x10 , 0x10 , 0x01 , 0x02 , 0x04 , 0x08 , 0x04 , 0x08 ,
19
+ 0x10 , 0x10 , 0x10 , 0x10 , 0x10 , 0x10 , 0x10 , 0x10
20
+ };
21
+
22
+ const uint8x16_t lut_roll = {
23
+ 0 , 16 , 19 , 4 , (uint8_t )-65 , (uint8_t )-65 , (uint8_t )-71 , (uint8_t )-71 ,
24
+ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0
25
+ };
26
+
27
+ const uint8x16_t mask_F = vdupq_n_u8 (0xf );
28
+ const uint8x16_t mask_2F = vdupq_n_u8 (0x2f );
29
+
30
+ uint8x16_t classified ;
31
+
32
+ {
33
+ const uint8x16_t hi_nibbles = vshrq_n_u8 (str .val [0 ], 4 );
34
+ const uint8x16_t lo_nibbles = vandq_u8 (str .val [0 ], mask_F );
35
+ const uint8x16_t eq_2F = vceqq_u8 (str .val [0 ], mask_2F );
36
+
37
+ const uint8x16_t hi = vqtbl1q_u8 (lut_hi , hi_nibbles );
38
+ const uint8x16_t lo = vqtbl1q_u8 (lut_lo , lo_nibbles );
39
+
40
+ const uint8x16_t delta = vqtbl1q_u8 (lut_roll , vaddq_u8 (eq_2F , hi_nibbles ));
41
+ classified = vandq_u8 (lo , hi );
42
+ // Now simply add the delta values to the input:
43
+ str .val [0 ] = vaddq_u8 (str .val [0 ], delta );
44
+ }
45
+ {
46
+ const uint8x16_t hi_nibbles = vshrq_n_u8 (str .val [1 ], 4 );
47
+ const uint8x16_t lo_nibbles = vandq_u8 (str .val [1 ], mask_F );
48
+ const uint8x16_t eq_2F = vceqq_u8 (str .val [1 ], mask_2F );
49
+
50
+ const uint8x16_t hi = vqtbl1q_u8 (lut_hi , hi_nibbles );
51
+ const uint8x16_t lo = vqtbl1q_u8 (lut_lo , lo_nibbles );
52
+
53
+ const uint8x16_t delta = vqtbl1q_u8 (lut_roll , vaddq_u8 (eq_2F , hi_nibbles ));
54
+ classified = vorrq_u8 (classified , vandq_u8 (lo , hi ));
55
+ // Now simply add the delta values to the input:
56
+ str .val [1 ] = vaddq_u8 (str .val [1 ], delta );
57
+ }
58
+ {
59
+ const uint8x16_t hi_nibbles = vshrq_n_u8 (str .val [2 ], 4 );
60
+ const uint8x16_t lo_nibbles = vandq_u8 (str .val [2 ], mask_F );
61
+ const uint8x16_t eq_2F = vceqq_u8 (str .val [2 ], mask_2F );
62
+
63
+ const uint8x16_t hi = vqtbl1q_u8 (lut_hi , hi_nibbles );
64
+ const uint8x16_t lo = vqtbl1q_u8 (lut_lo , lo_nibbles );
65
+
66
+ const uint8x16_t delta = vqtbl1q_u8 (lut_roll , vaddq_u8 (eq_2F , hi_nibbles ));
67
+ classified = vorrq_u8 (classified , vandq_u8 (lo , hi ));
68
+ // Now simply add the delta values to the input:
69
+ str .val [2 ] = vaddq_u8 (str .val [2 ], delta );
70
+ }
71
+ {
72
+ const uint8x16_t hi_nibbles = vshrq_n_u8 (str .val [3 ], 4 );
73
+ const uint8x16_t lo_nibbles = vandq_u8 (str .val [3 ], mask_F );
74
+ const uint8x16_t eq_2F = vceqq_u8 (str .val [3 ], mask_2F );
75
+
76
+ const uint8x16_t hi = vqtbl1q_u8 (lut_hi , hi_nibbles );
77
+ const uint8x16_t lo = vqtbl1q_u8 (lut_lo , lo_nibbles );
78
+
79
+ const uint8x16_t delta = vqtbl1q_u8 (lut_roll , vaddq_u8 (eq_2F , hi_nibbles ));
80
+ classified = vorrq_u8 (classified , vandq_u8 (lo , hi ));
81
+ // Now simply add the delta values to the input:
82
+ str .val [3 ] = vaddq_u8 (str .val [3 ], delta );
38
83
}
39
84
40
85
// Check for invalid input: if any bit of the classified vector is set,
41
86
// fall back on bytewise code to do error checking and reporting:
42
- uint8x16_t classified = CMPEQ (delta .val [0 ], 0 );
43
- classified = vorrq_u8 (classified , CMPEQ (delta .val [1 ], 0 ));
44
- classified = vorrq_u8 (classified , CMPEQ (delta .val [2 ], 0 ));
45
- classified = vorrq_u8 (classified , CMPEQ (delta .val [3 ], 0 ));
46
-
47
87
// Extract both 32-bit halves; check that all bits are zero:
48
88
if (vgetq_lane_u32 ((uint32x4_t )classified , 0 ) != 0
49
89
|| vgetq_lane_u32 ((uint32x4_t )classified , 1 ) != 0
@@ -52,16 +92,10 @@ while (srclen >= 64)
52
92
break ;
53
93
}
54
94
55
- // Now simply add the delta values to the input:
56
- str .val [0 ] = vaddq_u8 (str .val [0 ], delta .val [0 ]);
57
- str .val [1 ] = vaddq_u8 (str .val [1 ], delta .val [1 ]);
58
- str .val [2 ] = vaddq_u8 (str .val [2 ], delta .val [2 ]);
59
- str .val [3 ] = vaddq_u8 (str .val [3 ], delta .val [3 ]);
60
-
61
95
// Compress four bytes into three:
62
- dec .val [0 ] = vshlq_n_u8 (str .val [0 ], 2 ) | vshrq_n_u8 (str .val [1 ], 4 );
63
- dec .val [1 ] = vshlq_n_u8 (str .val [1 ], 4 ) | vshrq_n_u8 (str .val [2 ], 2 );
64
- dec .val [2 ] = vshlq_n_u8 (str .val [2 ], 6 ) | str .val [3 ];
96
+ dec .val [0 ] = vorrq_u8 ( vshlq_n_u8 (str .val [0 ], 2 ), vshrq_n_u8 (str .val [1 ], 4 ) );
97
+ dec .val [1 ] = vorrq_u8 ( vshlq_n_u8 (str .val [1 ], 4 ), vshrq_n_u8 (str .val [2 ], 2 ) );
98
+ dec .val [2 ] = vorrq_u8 ( vshlq_n_u8 (str .val [2 ], 6 ), str .val [3 ]) ;
65
99
66
100
// Interleave and store decoded result:
67
101
vst3q_u8 ((uint8_t * )o , dec );
0 commit comments