-
Notifications
You must be signed in to change notification settings - Fork 167
Add SSE2 dec_loop.c #46
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
Closed
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
// SSE2 decode loop: translate 16 base64 characters per iteration into 6-bit
// values and hand them to dec_reshuffle() to be packed into 12 output bytes.
//
// If we have SSE2 support, pick off 16 bytes at a time for as long as we can,
// but make sure that we quit before seeing any == markers at the end of the
// string. Also, because we write four zeroes at the end of the output, ensure
// that there are at least 6 valid bytes of input data remaining to close the
// gap. 16 + 2 + 6 = 24 bytes:
//
// Character mapping performed below (every other byte value ends up with its
// high bit set, which is how invalid input is detected):
//   'A'-'Z' ->  0..25     'a'-'z' -> 26..51     '0'-'9' -> 52..61
//   '+'     -> 62         '/'     -> 63
while (srclen >= 24)
{
	// Range constants. "top" values slide the last character of a range up
	// to 0x7F; "bottom" values slide the first character back down to 0, so
	// that anything outside the range becomes negative (high bit set).
	const __m128i digit_top     = _mm_set1_epi32( 0x46464646 ); // 0x46 = 0x7F - '9'
	const __m128i lowercase_top = _mm_set1_epi32( 0x05050505 ); // 0x05 = 0x7F - 'z'
	const __m128i uppercase_top = _mm_set1_epi32( 0x25252525 ); // 0x25 = 0x7F - 'Z'
	const __m128i digit_bottom  = _mm_set1_epi32( 0x76767676 ); // 0x76 = 0x7F - 9  ('9'-'0')
	const __m128i alpha_bottom  = _mm_set1_epi32( 0x66666666 ); // 0x66 = 0x7F - 25 ('Z'-'A') ('z'-'a')
	const __m128i num_letters   = _mm_set1_epi32( 0x1A1A1A1A ); // 0x1A = 26
	const __m128i detect_plus   = _mm_set1_epi32( 0xFBFBFBFB ); // 0xFB = value of '+' after the "0-9" roll down
	const __m128i adj_plus      = _mm_set1_epi32( 0xD4D4D4D4 ); // 0xD4 = XOR mask turning '+' into 62 after the "A-Z" roll down
	const __m128i detect_slash  = _mm_set1_epi32( 0xFFFFFFFF ); // 0xFF = value of '/' after the "0-9" roll down

	__m128i m0, m1, m2, m3, m4;

	// Load string (unaligned load of 16 input characters):
	m0 = _mm_loadu_si128((__m128i *)c);

	// Map input bytes to 6-bit values. The statement order interleaves the
	// digit (m1), lowercase (m2) and uppercase (m0) paths; each path leaves
	// the high bit set for bytes that do not belong to its range.
	m1 = _mm_add_epi8(m0, digit_top);        // "0-9" roll up
	m2 = _mm_add_epi8(m0, lowercase_top);    // "a-z" roll up
	m1 = _mm_subs_epi8(m1, digit_bottom);    // "0-9" roll down: '0'..'9' -> 0..9
	m2 = _mm_subs_epi8(m2, alpha_bottom);    // "a-z" roll down: 'a'..'z' -> 0..25
	m4 = _mm_cmpeq_epi8(m1, detect_slash);   // match '/' (forward slash)
	m3 = _mm_cmpeq_epi8(m1, detect_plus);    // match '+' (plus sign)
	m1 = _mm_adds_epu8(m1, num_letters);     // '0'..'9' -> 26..35, everything else keeps its high bit
	m4 = _mm_subs_epu8(m4, uppercase_top);   // 0xDA = 0xFF - 0x25 where '/' matched, else 0
	m0 = _mm_add_epi8(m0, uppercase_top);    // "A-Z" roll up
	m1 = _mm_xor_si128(m1, m4);              // '/' -> 37 (0xFF ^ 0xDA)
	m3 = _mm_and_si128(m3, adj_plus);        // 0xD4 where '+' matched (0xFF & 0xD4), else 0
	m0 = _mm_subs_epi8(m0, alpha_bottom);    // "A-Z" roll down: 'A'..'Z' -> 0..25
	m1 = _mm_min_epu8(m1, m2);               // merge lowercase into the digit/slash lane
	m0 = _mm_xor_si128(m0, m3);              // '+' -> 62
	m1 = _mm_adds_epu8(m1, num_letters);     // 'a'..'z' -> 26..51, '0'..'9' -> 52..61, '/' -> 63
	m0 = _mm_min_epu8(m0, m1);               // merge: m0 now holds the final 6-bit values

	// Any byte with its high bit set was not a base64 character. Fall back
	// on bytewise code to do error checking and reporting:
	if (_mm_movemask_epi8(m0) != 0) {
		break;
	}

	// Pack 16 bytes into the 12-byte output format and store back.
	// The decoded values live in m0, so that is what must be reshuffled.
	_mm_storeu_si128((__m128i *)o, dec_reshuffle(m0));

	c += 16;
	o += 12;
	outl += 12;
	srclen -= 16;
}
|
||
/* | ||
// Disabled (commented-out) self-test for the SSE2 character mapping.
//
// For every 16-bit value i, the two bytes of i are run through the same
// instruction sequence as the decode loop, and each mapped byte is compared
// with the reference table base64_table_dec. In that table, 255 marks a byte
// that is not a base64 character.
//
// Returns true if all 65536 inputs agree with the table, false on the first
// mismatch.
bool Test_SSE2_Decode() | ||
{ | ||
// Reference decode table indexed by input byte value: | ||
// 'A'-'Z' -> 0..25, 'a'-'z' -> 26..51, '0'-'9' -> 52..61, '+' -> 62, '/' -> 63. | ||
const unsigned char base64_table_dec[] = | ||
{ | ||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255, 255, 63, | ||
52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 255, 255, 255, | ||
255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, | ||
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 255, 255, 255, 255, 255, | ||
255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, | ||
41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 255, 255, 255, 255, 255, | ||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
}; | ||
|
||
// Same constants as the decode loop, so the test exercises the same mapping. | ||
const __m128i digit_top = _mm_set1_epi32( 0x46464646 ); // 0x46 = 0x7F - '9' | ||
const __m128i lowercase_top = _mm_set1_epi32( 0x05050505 ); // 0x05 = 0x7F - 'z' | ||
const __m128i uppercase_top = _mm_set1_epi32( 0x25252525 ); // 0x25 = 0x7F - 'Z' | ||
const __m128i digit_bottom = _mm_set1_epi32( 0x76767676 ); // 0x76 = 0x7F - 9 ('9'-'0') | ||
const __m128i alpha_bottom = _mm_set1_epi32( 0x66666666 ); // 0x66 = 0x7F - 25 ('Z'-'A') ('z'-'a') | ||
const __m128i num_letters = _mm_set1_epi32( 0x1A1A1A1A ); // 0x1A = 26 | ||
const __m128i detect_plus = _mm_set1_epi32( 0xFBFBFBFB ); // 0xFB = plus sign value after '0-9' roll down | ||
const __m128i adj_plus = _mm_set1_epi32( 0xD4D4D4D4 ); // 0xD4 = plus sign shift after 'A-Z' roll down | ||
const __m128i detect_slash = _mm_set1_epi32( 0xFFFFFFFF ); // 0xFF = after 0-9 roll down | ||
|
||
// check all possible values of 16 bits | ||
// (two input bytes per iteration: byte 0 is i & 0xFF, byte 1 is i >> 8) | ||
for(int i = 0; i < 0x10000; i++) | ||
{ | ||
__m128i m0, m1, m2, m3, m4; | ||
|
||
m0 = _mm_cvtsi32_si128(i); // load string | ||
|
||
// Mapping sequence under test; mirrors the decode loop statement for statement. | ||
m1 = _mm_add_epi8(m0, digit_top); // "0-9" roll up | ||
m2 = _mm_add_epi8(m0, lowercase_top); // "a-z" roll up | ||
m1 = _mm_subs_epi8(m1, digit_bottom); // "0-9" roll down | ||
m2 = _mm_subs_epi8(m2, alpha_bottom); // "a-z" roll down | ||
m4 = _mm_cmpeq_epi8(m1, detect_slash); // match '/' ( forward slash ) | ||
m3 = _mm_cmpeq_epi8(m1, detect_plus); // match '+' ( plus sign ) | ||
m1 = _mm_adds_epu8(m1, num_letters); // ('0'=26 thru '9'=35) else signed | ||
m4 = _mm_subs_epu8(m4, uppercase_top); // 0xDA = 0xFF - 0x25 | ||
m0 = _mm_add_epi8(m0, uppercase_top); // "A-Z" roll up | ||
m1 = _mm_xor_si128(m1, m4); // ( '/' = 37 ) = FF ^ DA | ||
m3 = _mm_and_si128(m3, adj_plus); // = 0xFF & 0xD4 | ||
m0 = _mm_subs_epi8(m0, alpha_bottom); // "A-Z" roll down | ||
m1 = _mm_min_epu8(m1, m2); // merge | ||
m0 = _mm_xor_si128(m0, m3); // ( '+' = 62 ) | ||
m1 = _mm_adds_epu8(m1, num_letters); // ('a'=26 thru '9'=61, '/'=63) else signed | ||
m0 = _mm_min_epu8(m0, m1); // merge | ||
|
||
// set signed bytes to 255 | ||
// (bytes with the high bit set compare less than zero, giving an all-ones | ||
// mask; OR-ing it in normalizes every invalid result to the table's 255) | ||
__m128i res; | ||
res = _mm_cmpgt_epi8(_mm_setzero_si128(), m0); | ||
res = _mm_or_si128(res, m0); | ||
|
||
// get low 16 bits | ||
unsigned int r = _mm_cvtsi128_si32(res) & 0x0000FFFF; | ||
|
||
// check that byte_0 decoded correctly | ||
if((base64_table_dec[i & 0x00FF]) != (r & 0x00FF)){ | ||
return false; | ||
} | ||
|
||
// check that byte_1 decoded correctly | ||
if((base64_table_dec[i >> 8]) != ((r >> 8) & 0x00FF)){ | ||
return false; | ||
} | ||
} | ||
return true; | ||
} | ||
*/ |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `dec_reshuffle` function appears to be undefined.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This was meant for Atom CPUs, so the SSSE3 `dec_reshuffle()` would have been used.
I tried to do an SSE2 packer here, but it is not too great.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I tried to get the code working with the SSSE3 decoder to get an idea of its performance, but hit a small bug: the function call should be `dec_reshuffle(m0)`.
If the assumption is that this code would be used in conjunction with the SSSE3 reshuffler, then why not use the full SSSE3 codec?
One possible answer comes when testing this on a 32-bit Atom N270. There is a large improvement over the stock SSSE3 decoder: 591 MB/s for the SSE2 code against 427 MB/s for the SSSE3 code.
It's significantly slower on my desktop i5-4590S though: 6150 MB/s against 6800 MB/s for the stock SSSE3 codec. This makes it hard to justify merging this code for common use.
Maybe GCC's `__builtin_cpu_is("atom")` builtin would be useful here...