Skip to content

Commit 502fe6d

Browse files
authored
Merge pull request #12630 from rouault/GDALCopyWords_improvements
GDALCopyWords(): perf improvements for some packed conversions
2 parents f10cd5b + fa441d5 commit 502fe6d

File tree

4 files changed

+659
-158
lines changed

4 files changed

+659
-158
lines changed

autotest/cpp/testcopywords.cpp

Lines changed: 61 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include <cstdint>
1919
#include <iostream>
2020
#include <limits>
21+
#include <type_traits>
2122

2223
#include "gtest_include.h"
2324

@@ -26,9 +27,9 @@ namespace
2627

2728
// ---------------------------------------------------------------------------
2829

29-
template <class OutType, class ConstantType>
30-
void AssertRes(GDALDataType intype, ConstantType inval, GDALDataType outtype,
31-
ConstantType expected_outval, OutType outval, int numLine)
30+
template <class OutType, class CT1, class CT2>
31+
void AssertRes(GDALDataType intype, CT1 inval, GDALDataType outtype,
32+
CT2 expected_outval, OutType outval, int numLine)
3233
{
3334
if (static_cast<double>(expected_outval) == static_cast<double>(outval) ||
3435
(std::isnan(static_cast<double>(expected_outval)) &&
@@ -55,8 +56,8 @@ class TestCopyWords : public ::testing::Test
5556
protected:
5657
void SetUp() override
5758
{
58-
pIn = (GByte *)malloc(256);
59-
pOut = (GByte *)malloc(256);
59+
pIn = (GByte *)malloc(2048);
60+
pOut = (GByte *)malloc(2048);
6061
}
6162

6263
void TearDown() override
@@ -74,8 +75,8 @@ class TestCopyWords : public ::testing::Test
7475
GDALDataType outtype, ConstantType outval, ConstantType outvali,
7576
int numLine)
7677
{
77-
memset(pIn, 0xff, 128);
78-
memset(pOut, 0xff, 128);
78+
memset(pIn, 0xff, 1024);
79+
memset(pOut, 0xff, 1024);
7980

8081
*(InType *)(pIn) = (InType)inval;
8182
*(InType *)(pIn + 32) = (InType)inval;
@@ -89,14 +90,14 @@ class TestCopyWords : public ::testing::Test
8990
GDALCopyWords(pIn, intype, 32, pOut, outtype, 32, 2);
9091

9192
/* Test negative offsets */
92-
GDALCopyWords(pIn + 32, intype, -32, pOut + 128 - 16, outtype, -32, 2);
93+
GDALCopyWords(pIn + 32, intype, -32, pOut + 1024 - 16, outtype, -32, 2);
9394

9495
MY_EXPECT(intype, inval, outtype, outval, *(OutType *)(pOut));
9596
MY_EXPECT(intype, inval, outtype, outval, *(OutType *)(pOut + 32));
9697
MY_EXPECT(intype, inval, outtype, outval,
97-
*(OutType *)(pOut + 128 - 16));
98+
*(OutType *)(pOut + 1024 - 16));
9899
MY_EXPECT(intype, inval, outtype, outval,
99-
*(OutType *)(pOut + 128 - 16 - 32));
100+
*(OutType *)(pOut + 1024 - 16 - 32));
100101

101102
if (GDALDataTypeIsComplex(outtype))
102103
{
@@ -105,38 +106,29 @@ class TestCopyWords : public ::testing::Test
105106
((OutType *)(pOut + 32))[1]);
106107

107108
MY_EXPECT(intype, invali, outtype, outvali,
108-
((OutType *)(pOut + 128 - 16))[1]);
109+
((OutType *)(pOut + 1024 - 16))[1]);
109110
MY_EXPECT(intype, invali, outtype, outvali,
110-
((OutType *)(pOut + 128 - 16 - 32))[1]);
111+
((OutType *)(pOut + 1024 - 16 - 32))[1]);
111112
}
112113
else
113114
{
114-
*(InType *)(pIn + GDALGetDataTypeSizeBytes(intype)) = (InType)inval;
115-
/* Test packed offsets */
116-
GDALCopyWords(pIn, intype, GDALGetDataTypeSizeBytes(intype), pOut,
117-
outtype, GDALGetDataTypeSizeBytes(outtype), 2);
118-
119-
MY_EXPECT(intype, inval, outtype, outval, *(OutType *)(pOut));
120-
MY_EXPECT(intype, inval, outtype, outval,
121-
*(OutType *)(pOut + GDALGetDataTypeSizeBytes(outtype)));
115+
constexpr int N = 32 + 31;
116+
for (int i = 0; i < N; ++i)
117+
{
118+
*(InType *)(pIn + i * GDALGetDataTypeSizeBytes(intype)) =
119+
(InType)inval;
120+
}
122121

123-
*(InType *)(pIn + 2 * GDALGetDataTypeSizeBytes(intype)) =
124-
(InType)inval;
125-
*(InType *)(pIn + 3 * GDALGetDataTypeSizeBytes(intype)) =
126-
(InType)inval;
127122
/* Test packed offsets */
128123
GDALCopyWords(pIn, intype, GDALGetDataTypeSizeBytes(intype), pOut,
129-
outtype, GDALGetDataTypeSizeBytes(outtype), 4);
130-
131-
MY_EXPECT(intype, inval, outtype, outval, *(OutType *)(pOut));
132-
MY_EXPECT(intype, inval, outtype, outval,
133-
*(OutType *)(pOut + GDALGetDataTypeSizeBytes(outtype)));
134-
MY_EXPECT(
135-
intype, inval, outtype, outval,
136-
*(OutType *)(pOut + 2 * GDALGetDataTypeSizeBytes(outtype)));
137-
MY_EXPECT(
138-
intype, inval, outtype, outval,
139-
*(OutType *)(pOut + 3 * GDALGetDataTypeSizeBytes(outtype)));
124+
outtype, GDALGetDataTypeSizeBytes(outtype), N);
125+
126+
for (int i = 0; i < N; ++i)
127+
{
128+
MY_EXPECT(
129+
intype, inval, outtype, outval,
130+
*(OutType *)(pOut + i * GDALGetDataTypeSizeBytes(outtype)));
131+
}
140132
}
141133
}
142134

@@ -1080,15 +1072,47 @@ void CheckPackedGeneric(GDALDataType eIn, GDALDataType eOut)
10801072
Tout arrayOut[N];
10811073
for (int i = 0; i < N; i++)
10821074
{
1083-
arrayIn[i] = static_cast<Tin>(i + 1);
1075+
if constexpr (!std::is_integral_v<Tin> && std::is_integral_v<Tout>)
1076+
{
1077+
// Test correct rounding
1078+
if (i == 0 && std::is_unsigned_v<Tout>)
1079+
arrayIn[i] = cpl::NumericLimits<Tin>::quiet_NaN();
1080+
else if ((i % 2) != 0)
1081+
arrayIn[i] = static_cast<Tin>(i + 0.4);
1082+
else
1083+
arrayIn[i] = static_cast<Tin>(i + 0.6);
1084+
}
1085+
else
1086+
{
1087+
arrayIn[i] = static_cast<Tin>(i + 1);
1088+
}
10841089
arrayOut[i] = 0;
10851090
}
10861091
GDALCopyWords(arrayIn, eIn, GDALGetDataTypeSizeBytes(eIn), arrayOut, eOut,
10871092
GDALGetDataTypeSizeBytes(eOut), N);
10881093
int numLine = 0;
10891094
for (int i = 0; i < N; i++)
10901095
{
1091-
MY_EXPECT(eIn, i + 1, eOut, i + 1, arrayOut[i]);
1096+
if constexpr (!std::is_integral_v<Tin> && std::is_integral_v<Tout>)
1097+
{
1098+
if (i == 0 && std::is_unsigned_v<Tout>)
1099+
{
1100+
MY_EXPECT(eIn, cpl::NumericLimits<Tin>::quiet_NaN(), eOut, 0,
1101+
arrayOut[i]);
1102+
}
1103+
else if ((i % 2) != 0)
1104+
{
1105+
MY_EXPECT(eIn, i + 0.4, eOut, i, arrayOut[i]);
1106+
}
1107+
else
1108+
{
1109+
MY_EXPECT(eIn, i + 0.6, eOut, i + 1, arrayOut[i]);
1110+
}
1111+
}
1112+
else
1113+
{
1114+
MY_EXPECT(eIn, i + 1, eOut, i + 1, arrayOut[i]);
1115+
}
10921116
}
10931117
}
10941118

gcore/gdal_priv_templates.hpp

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -997,6 +997,36 @@ inline void GDALCopy4Words(const double *pValueIn, float *const pValueOut)
997997
_mm_storeu_ps(pValueOut, val);
998998
}
999999

1000+
template <>
1001+
inline void GDALCopy4Words(const double *pValueIn, GByte *const pValueOut)
1002+
{
1003+
const __m128d p0d5 = _mm_set1_pd(0.5);
1004+
const __m128d xmm_max = _mm_set1_pd(255);
1005+
1006+
__m128d val01 = _mm_loadu_pd(pValueIn);
1007+
__m128d val23 = _mm_loadu_pd(pValueIn + 2);
1008+
val01 = _mm_add_pd(val01, p0d5);
1009+
val01 = _mm_min_pd(_mm_max_pd(val01, p0d5), xmm_max);
1010+
val23 = _mm_add_pd(val23, p0d5);
1011+
val23 = _mm_min_pd(_mm_max_pd(val23, p0d5), xmm_max);
1012+
1013+
const __m128i val01_u32 = _mm_cvttpd_epi32(val01);
1014+
const __m128i val23_u32 = _mm_cvttpd_epi32(val23);
1015+
1016+
// Merge 4 int32 values into a single register
1017+
auto xmm_i = _mm_castpd_si128(_mm_shuffle_pd(
1018+
_mm_castsi128_pd(val01_u32), _mm_castsi128_pd(val23_u32), 0));
1019+
1020+
#if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
1021+
xmm_i = _mm_shuffle_epi8(
1022+
xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) | (12 << 24)));
1023+
#else
1024+
xmm_i = _mm_packs_epi32(xmm_i, xmm_i); // Pack int32 to int16
1025+
xmm_i = _mm_packus_epi16(xmm_i, xmm_i); // Pack int16 to uint8
1026+
#endif
1027+
GDALCopyXMMToInt32(xmm_i, pValueOut);
1028+
}
1029+
10001030
template <>
10011031
inline void GDALCopy4Words(const float *pValueIn, double *const pValueOut)
10021032
{
@@ -1035,7 +1065,87 @@ inline void GDALCopy4Words(const double *pValueIn, GFloat16 *const pValueOut)
10351065
GDALCopy4Words(pValueIn, tmp);
10361066
GDALCopy4Words(tmp, pValueOut);
10371067
}
1068+
#else // !__F16C__
1069+
1070+
static inline __m128i GDALIfThenElse(__m128i mask, __m128i thenVal,
1071+
__m128i elseVal)
1072+
{
1073+
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
1074+
return _mm_blendv_epi8(elseVal, thenVal, mask);
1075+
#else
1076+
return _mm_or_si128(_mm_and_si128(mask, thenVal),
1077+
_mm_andnot_si128(mask, elseVal));
10381078
#endif
1079+
}
1080+
1081+
// Convert 4 float16 values to 4 float 32 values
1082+
// xmm must contain 4 float16 values stored in 32 bit each (with upper 16 bits at zero)
1083+
static inline __m128i GDALFourFloat16ToFloat32(__m128i xmm)
1084+
{
1085+
// Ported from https://github.com/simd-everywhere/simde/blob/51743e7920b6e867678cb50e9c62effe28f70b33/simde/simde-f16.h#L242C4-L242C68
1086+
// to SSE2 in a branch-less way
1087+
1088+
/* This code is CC0, based heavily on code by Fabian Giesen. */
1089+
const auto denorm_magic =
1090+
_mm_castsi128_ps(_mm_set1_epi32((128 - 15) << 23));
1091+
const auto shifted_exp =
1092+
_mm_set1_epi32(0x7c00 << 13); /* exponent mask after shift */
1093+
1094+
// Shift exponent and mantissa bits to their position in a float32
1095+
auto f32u = _mm_slli_epi32(_mm_and_si128(xmm, _mm_set1_epi32(0x7fff)), 13);
1096+
// Extract the (shifted) exponent
1097+
const auto exp = _mm_and_si128(shifted_exp, f32u);
1098+
// Adjust the exponent
1099+
const auto exp_adjustment = _mm_set1_epi32((127 - 15) << 23);
1100+
f32u = _mm_add_epi32(f32u, exp_adjustment);
1101+
1102+
const auto is_inf_nan = _mm_cmpeq_epi32(exp, shifted_exp); /* Inf/NaN? */
1103+
// When is_inf_nan is true: extra exponent adjustment
1104+
const auto f32u_inf_nan = _mm_add_epi32(f32u, exp_adjustment);
1105+
1106+
const auto is_denormal =
1107+
_mm_cmpeq_epi32(exp, _mm_setzero_si128()); /* Zero/Denormal? */
1108+
// When is_denormal is true:
1109+
auto f32u_denormal = _mm_add_epi32(f32u, _mm_set1_epi32(1 << 23));
1110+
f32u_denormal = _mm_castps_si128(
1111+
_mm_sub_ps(_mm_castsi128_ps(f32u_denormal), denorm_magic));
1112+
1113+
f32u = GDALIfThenElse(is_inf_nan, f32u_inf_nan, f32u);
1114+
f32u = GDALIfThenElse(is_denormal, f32u_denormal, f32u);
1115+
1116+
// Re-apply sign bit
1117+
f32u = _mm_or_si128(
1118+
f32u, _mm_slli_epi32(_mm_and_si128(xmm, _mm_set1_epi32(0x8000)), 16));
1119+
return f32u;
1120+
}
1121+
1122+
template <>
1123+
inline void GDALCopy8Words(const GFloat16 *pValueIn, float *const pValueOut)
1124+
{
1125+
__m128i xmm = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pValueIn));
1126+
const auto xmm_0 =
1127+
GDALFourFloat16ToFloat32(_mm_unpacklo_epi16(xmm, _mm_setzero_si128()));
1128+
const auto xmm_1 =
1129+
GDALFourFloat16ToFloat32(_mm_unpackhi_epi16(xmm, _mm_setzero_si128()));
1130+
_mm_storeu_ps(pValueOut + 0, _mm_castsi128_ps(xmm_0));
1131+
_mm_storeu_ps(pValueOut + 4, _mm_castsi128_ps(xmm_1));
1132+
}
1133+
1134+
template <>
1135+
inline void GDALCopy8Words(const GFloat16 *pValueIn, double *const pValueOut)
1136+
{
1137+
__m128i xmm = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pValueIn));
1138+
const auto xmm_0 = _mm_castsi128_ps(
1139+
GDALFourFloat16ToFloat32(_mm_unpacklo_epi16(xmm, _mm_setzero_si128())));
1140+
const auto xmm_1 = _mm_castsi128_ps(
1141+
GDALFourFloat16ToFloat32(_mm_unpackhi_epi16(xmm, _mm_setzero_si128())));
1142+
_mm_storeu_pd(pValueOut + 0, _mm_cvtps_pd(xmm_0));
1143+
_mm_storeu_pd(pValueOut + 2, _mm_cvtps_pd(_mm_movehl_ps(xmm_0, xmm_0)));
1144+
_mm_storeu_pd(pValueOut + 4, _mm_cvtps_pd(xmm_1));
1145+
_mm_storeu_pd(pValueOut + 6, _mm_cvtps_pd(_mm_movehl_ps(xmm_1, xmm_1)));
1146+
}
1147+
1148+
#endif // __F16C__
10391149

10401150
#ifdef __AVX2__
10411151

0 commit comments

Comments
 (0)