Skip to content

Commit fa441d5

Browse files
committed
GDALCopyWords(): speed-up Double->Byte conversion
1 parent b61136a commit fa441d5

File tree

2 files changed

+40
-0
lines changed

2 files changed

+40
-0
lines changed

gcore/gdal_priv_templates.hpp

Lines changed: 30 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -997,6 +997,36 @@ inline void GDALCopy4Words(const double *pValueIn, float *const pValueOut)
997997
_mm_storeu_ps(pValueOut, val);
998998
}
999999

1000+
// Specialization of GDALCopy4Words for double -> GByte.
// Reads 4 consecutive doubles starting at pValueIn, adds 0.5, clamps to
// [0.5, 255], truncates to integers and hands the 4 resulting bytes (packed
// in the low 32 bits of an XMM register) to GDALCopyXMMToInt32 together with
// pValueOut.
// NOTE(review): GDALCopyXMMToInt32 is not visible here; presumably it stores
// the low 32 bits of the register to pValueOut - confirm.
template <>
1001+
inline void GDALCopy4Words(const double *pValueIn, GByte *const pValueOut)
1002+
{
1003+
const __m128d p0d5 = _mm_set1_pd(0.5);
1004+
const __m128d xmm_max = _mm_set1_pd(255);
1005+
1006+
// Unaligned loads: values 0-1 and values 2-3.
__m128d val01 = _mm_loadu_pd(pValueIn);
1007+
__m128d val23 = _mm_loadu_pd(pValueIn + 2);
1008+
// Bias by 0.5 so that the truncating conversion below rounds non-negative
// values to the nearest integer.
val01 = _mm_add_pd(val01, p0d5);
1009+
// Clamp the biased value to [0.5, 255] so that truncation yields 0..255.
// NOTE(review): for NaN input the result depends on the operand-order
// semantics of _mm_max_pd (and of its NEON emulation) - confirm.
val01 = _mm_min_pd(_mm_max_pd(val01, p0d5), xmm_max);
1010+
val23 = _mm_add_pd(val23, p0d5);
1011+
val23 = _mm_min_pd(_mm_max_pd(val23, p0d5), xmm_max);
1012+
1013+
// Truncating double -> int32 conversion (two values per register).
const __m128i val01_u32 = _mm_cvttpd_epi32(val01);
1014+
const __m128i val23_u32 = _mm_cvttpd_epi32(val23);
1015+
1016+
// Merge 4 int32 values into a single register
// (selector 0 takes the low 64-bit lane of each operand).
1017+
auto xmm_i = _mm_castpd_si128(_mm_shuffle_pd(
1018+
_mm_castsi128_pd(val01_u32), _mm_castsi128_pd(val23_u32), 0));
1019+
1020+
#if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
1021+
// Gather bytes 0, 4, 8 and 12 (the low byte of each int32) into the low
// 4 bytes of the register.
xmm_i = _mm_shuffle_epi8(
1022+
xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) | (12 << 24)));
1023+
#else
1024+
// Values were clamped to 0..255 above, so both narrowing steps are lossless.
xmm_i = _mm_packs_epi32(xmm_i, xmm_i); // Pack int32 to int16
1025+
xmm_i = _mm_packus_epi16(xmm_i, xmm_i); // Pack int16 to uint8
1026+
#endif
1027+
GDALCopyXMMToInt32(xmm_i, pValueOut);
1028+
}
1029+
10001030
template <>
10011031
inline void GDALCopy4Words(const float *pValueIn, double *const pValueOut)
10021032
{

gcore/rasterio.cpp

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2990,6 +2990,16 @@ CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
29902990
}
29912991
}
29922992

2993+
// Specialization of GDALCopyWordsT for double -> GByte.
// Forwards all arguments unchanged to GDALCopyWordsT_8atatime.
// NOTE(review): GDALCopyWordsT_8atatime is defined elsewhere; presumably it
// processes words in groups of 8 through the GDALCopy4Words/GDALCopy8Words
// specializations - confirm.
template <>
2994+
CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
2995+
int nSrcPixelStride,
2996+
GByte *const CPL_RESTRICT pDstData,
2997+
int nDstPixelStride, GPtrDiff_t nWordCount)
2998+
{
2999+
GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
3000+
nDstPixelStride, nWordCount);
3001+
}
3002+
29933003
template <>
29943004
CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
29953005
int nSrcPixelStride,

0 commit comments

Comments
 (0)