Skip to content

Commit e476290

Browse files
committed
GDALCopyWords(): implement efficient conversion of 8 Float16->Float32/Float64 conversions using SSE2
1 parent 4e8448c commit e476290

File tree

2 files changed

+98
-18
lines changed

2 files changed

+98
-18
lines changed

gcore/gdal_priv_templates.hpp

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1035,7 +1035,87 @@ inline void GDALCopy4Words(const double *pValueIn, GFloat16 *const pValueOut)
10351035
GDALCopy4Words(pValueIn, tmp);
10361036
GDALCopy4Words(tmp, pValueOut);
10371037
}
1038+
#else // !__F16C__
1039+
1040+
static inline __m128i GDALIfThenElse(__m128i mask, __m128i thenVal,
1041+
__m128i elseVal)
1042+
{
1043+
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
1044+
return _mm_blendv_epi8(elseVal, thenVal, mask);
1045+
#else
1046+
return _mm_or_si128(_mm_and_si128(mask, thenVal),
1047+
_mm_andnot_si128(mask, elseVal));
10381048
#endif
1049+
}
1050+
1051+
// Convert 4 float16 values to 4 float 32 values
1052+
// xmm must contain 4 float16 values stored in 32 bit each (with upper 16 bits at zero)
1053+
static inline __m128i GDALFourFloat16ToFloat32(__m128i xmm)
1054+
{
1055+
// Ported from https://github.com/simd-everywhere/simde/blob/51743e7920b6e867678cb50e9c62effe28f70b33/simde/simde-f16.h#L242C4-L242C68
1056+
// to SSE2 in a branch-less way
1057+
1058+
/* This code is CC0, based heavily on code by Fabian Giesen. */
1059+
const auto denorm_magic =
1060+
_mm_castsi128_ps(_mm_set1_epi32((128 - 15) << 23));
1061+
const auto shifted_exp =
1062+
_mm_set1_epi32(0x7c00 << 13); /* exponent mask after shift */
1063+
1064+
// Shift exponent and mantissa bits to their position in a float32
1065+
auto f32u = _mm_slli_epi32(_mm_and_si128(xmm, _mm_set1_epi32(0x7fff)), 13);
1066+
// Extract the (shifted) exponent
1067+
const auto exp = _mm_and_si128(shifted_exp, f32u);
1068+
// Adjust the exponent
1069+
const auto exp_adjustment = _mm_set1_epi32((127 - 15) << 23);
1070+
f32u = _mm_add_epi32(f32u, exp_adjustment);
1071+
1072+
const auto is_inf_nan = _mm_cmpeq_epi32(exp, shifted_exp); /* Inf/NaN? */
1073+
// When is_inf_nan is true: extra exponent adjustment
1074+
const auto f32u_inf_nan = _mm_add_epi32(f32u, exp_adjustment);
1075+
1076+
const auto is_denormal =
1077+
_mm_cmpeq_epi32(exp, _mm_setzero_si128()); /* Zero/Denormal? */
1078+
// When is_denormal is true:
1079+
auto f32u_denormal = _mm_add_epi32(f32u, _mm_set1_epi32(1 << 23));
1080+
f32u_denormal = _mm_castps_si128(
1081+
_mm_sub_ps(_mm_castsi128_ps(f32u_denormal), denorm_magic));
1082+
1083+
f32u = GDALIfThenElse(is_inf_nan, f32u_inf_nan, f32u);
1084+
f32u = GDALIfThenElse(is_denormal, f32u_denormal, f32u);
1085+
1086+
// Re-apply sign bit
1087+
f32u = _mm_or_si128(
1088+
f32u, _mm_slli_epi32(_mm_and_si128(xmm, _mm_set1_epi32(0x8000)), 16));
1089+
return f32u;
1090+
}
1091+
1092+
template <>
1093+
inline void GDALCopy8Words(const GFloat16 *pValueIn, float *const pValueOut)
1094+
{
1095+
__m128i xmm = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pValueIn));
1096+
const auto xmm_0 =
1097+
GDALFourFloat16ToFloat32(_mm_unpacklo_epi16(xmm, _mm_setzero_si128()));
1098+
const auto xmm_1 =
1099+
GDALFourFloat16ToFloat32(_mm_unpackhi_epi16(xmm, _mm_setzero_si128()));
1100+
_mm_storeu_ps(pValueOut + 0, _mm_castsi128_ps(xmm_0));
1101+
_mm_storeu_ps(pValueOut + 4, _mm_castsi128_ps(xmm_1));
1102+
}
1103+
1104+
template <>
1105+
inline void GDALCopy8Words(const GFloat16 *pValueIn, double *const pValueOut)
1106+
{
1107+
__m128i xmm = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pValueIn));
1108+
const auto xmm_0 = _mm_castsi128_ps(
1109+
GDALFourFloat16ToFloat32(_mm_unpacklo_epi16(xmm, _mm_setzero_si128())));
1110+
const auto xmm_1 = _mm_castsi128_ps(
1111+
GDALFourFloat16ToFloat32(_mm_unpackhi_epi16(xmm, _mm_setzero_si128())));
1112+
_mm_storeu_pd(pValueOut + 0, _mm_cvtps_pd(xmm_0));
1113+
_mm_storeu_pd(pValueOut + 2, _mm_cvtps_pd(_mm_movehl_ps(xmm_0, xmm_0)));
1114+
_mm_storeu_pd(pValueOut + 4, _mm_cvtps_pd(xmm_1));
1115+
_mm_storeu_pd(pValueOut + 6, _mm_cvtps_pd(_mm_movehl_ps(xmm_1, xmm_1)));
1116+
}
1117+
1118+
#endif // __F16C__
10391119

10401120
#ifdef __AVX2__
10411121

gcore/rasterio.cpp

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3020,42 +3020,40 @@ CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
30203020
nDstPixelStride, nWordCount);
30213021
}
30223022

3023-
#endif // HAVE_SSE2
3024-
30253023
template <>
3026-
CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3024+
CPL_NOINLINE void GDALCopyWordsT(const GFloat16 *const CPL_RESTRICT pSrcData,
30273025
int nSrcPixelStride,
3028-
GByte *const CPL_RESTRICT pDstData,
3026+
float *const CPL_RESTRICT pDstData,
30293027
int nDstPixelStride, GPtrDiff_t nWordCount)
30303028
{
30313029
GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
30323030
nDstPixelStride, nWordCount);
30333031
}
30343032

30353033
template <>
3036-
CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3034+
CPL_NOINLINE void GDALCopyWordsT(const GFloat16 *const CPL_RESTRICT pSrcData,
30373035
int nSrcPixelStride,
3038-
GInt16 *const CPL_RESTRICT pDstData,
3036+
double *const CPL_RESTRICT pDstData,
30393037
int nDstPixelStride, GPtrDiff_t nWordCount)
30403038
{
30413039
GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
30423040
nDstPixelStride, nWordCount);
30433041
}
30443042

3043+
#ifdef __F16C__
3044+
30453045
template <>
30463046
CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
30473047
int nSrcPixelStride,
3048-
GUInt16 *const CPL_RESTRICT pDstData,
3048+
GFloat16 *const CPL_RESTRICT pDstData,
30493049
int nDstPixelStride, GPtrDiff_t nWordCount)
30503050
{
30513051
GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
30523052
nDstPixelStride, nWordCount);
30533053
}
30543054

3055-
#ifdef __F16C__
3056-
30573055
template <>
3058-
CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
3056+
CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
30593057
int nSrcPixelStride,
30603058
GFloat16 *const CPL_RESTRICT pDstData,
30613059
int nDstPixelStride, GPtrDiff_t nWordCount)
@@ -3064,38 +3062,40 @@ CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
30643062
nDstPixelStride, nWordCount);
30653063
}
30663064

3065+
#endif // __F16C__
3066+
3067+
#endif // HAVE_SSE2
3068+
30673069
template <>
3068-
CPL_NOINLINE void GDALCopyWordsT(const GFloat16 *const CPL_RESTRICT pSrcData,
3070+
CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
30693071
int nSrcPixelStride,
3070-
float *const CPL_RESTRICT pDstData,
3072+
GByte *const CPL_RESTRICT pDstData,
30713073
int nDstPixelStride, GPtrDiff_t nWordCount)
30723074
{
30733075
GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
30743076
nDstPixelStride, nWordCount);
30753077
}
30763078

30773079
template <>
3078-
CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
3080+
CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
30793081
int nSrcPixelStride,
3080-
GFloat16 *const CPL_RESTRICT pDstData,
3082+
GInt16 *const CPL_RESTRICT pDstData,
30813083
int nDstPixelStride, GPtrDiff_t nWordCount)
30823084
{
30833085
GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
30843086
nDstPixelStride, nWordCount);
30853087
}
30863088

30873089
template <>
3088-
CPL_NOINLINE void GDALCopyWordsT(const GFloat16 *const CPL_RESTRICT pSrcData,
3090+
CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
30893091
int nSrcPixelStride,
3090-
double *const CPL_RESTRICT pDstData,
3092+
GUInt16 *const CPL_RESTRICT pDstData,
30913093
int nDstPixelStride, GPtrDiff_t nWordCount)
30923094
{
30933095
GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
30943096
nDstPixelStride, nWordCount);
30953097
}
30963098

3097-
#endif
3098-
30993099
/************************************************************************/
31003100
/* GDALCopyWordsComplexT() */
31013101
/************************************************************************/

0 commit comments

Comments
 (0)