@@ -997,6 +997,36 @@ inline void GDALCopy4Words(const double *pValueIn, float *const pValueOut)
997
997
_mm_storeu_ps (pValueOut, val);
998
998
}
999
999
1000
+ template <>
1001
+ inline void GDALCopy4Words (const double *pValueIn, GByte *const pValueOut)
1002
+ {
1003
+ const __m128d p0d5 = _mm_set1_pd (0.5 );
1004
+ const __m128d xmm_max = _mm_set1_pd (255 );
1005
+
1006
+ __m128d val01 = _mm_loadu_pd (pValueIn);
1007
+ __m128d val23 = _mm_loadu_pd (pValueIn + 2 );
1008
+ val01 = _mm_add_pd (val01, p0d5);
1009
+ val01 = _mm_min_pd (_mm_max_pd (val01, p0d5), xmm_max);
1010
+ val23 = _mm_add_pd (val23, p0d5);
1011
+ val23 = _mm_min_pd (_mm_max_pd (val23, p0d5), xmm_max);
1012
+
1013
+ const __m128i val01_u32 = _mm_cvttpd_epi32 (val01);
1014
+ const __m128i val23_u32 = _mm_cvttpd_epi32 (val23);
1015
+
1016
+ // Merge 4 int32 values into a single register
1017
+ auto xmm_i = _mm_castpd_si128 (_mm_shuffle_pd (
1018
+ _mm_castsi128_pd (val01_u32), _mm_castsi128_pd (val23_u32), 0 ));
1019
+
1020
+ #if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
1021
+ xmm_i = _mm_shuffle_epi8 (
1022
+ xmm_i, _mm_cvtsi32_si128 (0 | (4 << 8 ) | (8 << 16 ) | (12 << 24 )));
1023
+ #else
1024
+ xmm_i = _mm_packs_epi32 (xmm_i, xmm_i); // Pack int32 to int16
1025
+ xmm_i = _mm_packus_epi16 (xmm_i, xmm_i); // Pack int16 to uint8
1026
+ #endif
1027
+ GDALCopyXMMToInt32 (xmm_i, pValueOut);
1028
+ }
1029
+
1000
1030
template <>
1001
1031
inline void GDALCopy4Words (const float *pValueIn, double *const pValueOut)
1002
1032
{
0 commit comments