Skip to content

Commit 62af030

Browse files
committed
Added a clamp-i64-to-i16 GlobalISel combine pattern.
1 parent e7f9a83 commit 62af030

File tree

4 files changed

+290
-1
lines changed

4 files changed

+290
-1
lines changed

llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -468,6 +468,14 @@ m_GInsertVecElt(const Src0Ty &Src0, const Src1Ty &Src1, const Src2Ty &Src2) {
468468
TargetOpcode::G_INSERT_VECTOR_ELT>(Src0, Src1, Src2);
469469
}
470470

471+
template <typename Src0Ty, typename Src1Ty, typename Src2Ty>
472+
inline TernaryOp_match<Src0Ty, Src1Ty, Src2Ty,
473+
TargetOpcode::G_SELECT>
474+
m_GISelect(const Src0Ty &Src0, const Src1Ty &Src1, const Src2Ty &Src2) {
475+
return TernaryOp_match<Src0Ty, Src1Ty, Src2Ty,
476+
TargetOpcode::G_SELECT>(Src0, Src1, Src2);
477+
}
478+
471479
/// Matches a register negated by a G_SUB.
472480
/// G_SUB 0, %negated_reg
473481
template <typename SrcTy>
@@ -484,6 +492,50 @@ m_Not(const SrcTy &&Src) {
484492
return m_GXor(Src, m_AllOnesInt());
485493
}
486494

495+
// class that allows to match one of the following patterns:
496+
// select (pred, x, value1) -> cmp slt -> select (pred, origin, value2) ->
497+
// cmp sgt OR select (pred, x, value1) -> cmp sgt -> select (pred, origin,
498+
// value2) -> cmp slt
499+
// also binds the boundary values and the origin.
500+
template <typename Boundary1,
501+
typename Boundary2, typename Origin>
502+
struct MaxMin_match_helper {
503+
Boundary1 B1;
504+
Boundary2 B2;
505+
Origin O;
506+
507+
MaxMin_match_helper(const Boundary1 &FirstBoundary,
508+
const Boundary2 &SecondBoundary, const Origin &Or)
509+
: B1(FirstBoundary), B2(SecondBoundary), O(Or) {}
510+
511+
template <typename OpTy>
512+
bool match(const MachineRegisterInfo &MRI, OpTy &&Op) {
513+
CmpInst::Predicate Predicate1;
514+
Register Base;
515+
516+
if (mi_match(Op, MRI,
517+
m_GISelect(m_GICmp(m_Pred(Predicate1), m_Reg(), m_Reg()),
518+
m_Reg(Base), B1))) {
519+
CmpInst::Predicate Predicate2;
520+
521+
if (mi_match(Base, MRI, m_GISelect(m_GICmp(m_Pred(Predicate2), m_Reg(), m_Reg()), O, B2))) {
522+
if ((Predicate1 == CmpInst::ICMP_SLT && Predicate2 == CmpInst::ICMP_SGT) ||
523+
(Predicate1 == CmpInst::ICMP_SGT && Predicate2 == CmpInst::ICMP_SLT)) {
524+
return true;
525+
}
526+
}
527+
}
528+
529+
return false;
530+
}
531+
};
532+
533+
template <typename Boundary1, typename Boundary2, typename Origin>
534+
inline MaxMin_match_helper<Boundary1, Boundary2, Origin>
535+
m_MaxMin(const Boundary1 &B1, const Boundary2 &B2, const Origin &O) {
536+
return MaxMin_match_helper<Boundary1, Boundary2, Origin>(B1, B2, O);
537+
}
538+
487539
} // namespace GMIPatternMatch
488540
} // namespace llvm
489541

llvm/lib/Target/AMDGPU/AMDGPUCombine.td

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,14 @@ def cvt_f32_ubyteN : GICombineRule<
3737
[{ return PostLegalizerHelper.matchCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }]),
3838
(apply [{ PostLegalizerHelper.applyCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }])>;
3939

40+
// Match data carried from matchClampI64ToI16 to applyClampI64ToI16.
def clamp_i64_to_i16_matchdata : GIDefMatchData<"AMDGPUPostLegalizerCombinerHelper::ClampI64ToI16MatchInfo">;

// Combine an i64 clamp-to-i16 (icmp/select chain feeding a G_TRUNC) into
// target instructions; matching and rewriting live in the helper class.
def clamp_i64_to_i16 : GICombineRule<
  (defs root:$clamp_i64_to_i16, clamp_i64_to_i16_matchdata:$matchinfo),
  (match (wip_match_opcode G_TRUNC):$clamp_i64_to_i16,
    [{ return PostLegalizerHelper.matchClampI64ToI16(*${clamp_i64_to_i16}, MRI, *MF, ${matchinfo}); }]),
  (apply [{ PostLegalizerHelper.applyClampI64ToI16(*${clamp_i64_to_i16}, ${matchinfo}); }])>;
47+
4048
// Combines which should only apply on SI/VI
4149
def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
4250

@@ -49,7 +57,7 @@ def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper<
4957
def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
5058
"AMDGPUGenPostLegalizerCombinerHelper",
5159
[all_combines, gfx6gfx7_combines,
52-
uchar_to_float, cvt_f32_ubyteN]> {
60+
uchar_to_float, cvt_f32_ubyteN, clamp_i64_to_i16]> {
5361
let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule";
5462
let StateClass = "AMDGPUPostLegalizerCombinerHelperState";
5563
let AdditionalArguments = [];

llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,19 @@ class AMDGPUPostLegalizerCombinerHelper {
6666
bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
6767
void applyCvtF32UByteN(MachineInstr &MI,
6868
const CvtF32UByteMatchInfo &MatchInfo);
69+
70+
  // Values bound by matchClampI64ToI16 and consumed by applyClampI64ToI16.
  struct ClampI64ToI16MatchInfo {
    int64_t Cmp1;    // first clamp boundary constant (order unspecified)
    int64_t Cmp2;    // second clamp boundary constant
    Register Origin; // the i64 register being clamped
  };
75+
76+
bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI,
77+
MachineFunction &MF,
78+
ClampI64ToI16MatchInfo &MatchInfo);
79+
80+
void applyClampI64ToI16(MachineInstr &MI,
81+
const ClampI64ToI16MatchInfo &MatchInfo);
6982
};
7083

7184
bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
@@ -245,6 +258,91 @@ void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
245258
MI.eraseFromParent();
246259
}
247260

261+
// Match an i64 -> i16 signed clamp: a G_TRUNC whose s64 source is a
// min/max select chain clamping against two constants. On success the
// constants and the clamped register are recorded in MatchInfo.
// Returns false unless both constants lie within the signed i16 range.
bool AMDGPUPostLegalizerCombinerHelper::matchClampI64ToI16(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF,
    ClampI64ToI16MatchInfo &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");

  // Only an s64 truncation source can be an i64 clamp.
  if (MRI.getType(MI.getOperand(1).getReg()) != LLT::scalar(64))
    return false;

  LLVM_DEBUG(dbgs() << "Matching Clamp i64 to i16\n");

  if (!mi_match(MI.getOperand(1).getReg(), MRI,
                m_MaxMin(m_ICst(MatchInfo.Cmp1), m_ICst(MatchInfo.Cmp2),
                         m_Reg(MatchInfo.Origin))))
    return false;

  // The boundaries may be bound in either order; are we really trying to
  // clamp against short boundaries?
  const int64_t Lo = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
  const int64_t Hi = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);
  return Lo >= std::numeric_limits<int16_t>::min() &&
         Hi <= std::numeric_limits<int16_t>::max();
}
290+
291+
// Rewrite the matched i64 -> i16 clamp: unmerge the s64 source into two
// s32 halves, feed them to V_CVT_PK_I16_I32 (packed saturating i32 -> i16
// conversion), clamp the result to the exact [Min, Max] boundaries with
// V_MED3_I32, copy into the original destination, and erase the G_TRUNC.
void AMDGPUPostLegalizerCombinerHelper::applyClampI64ToI16(
    MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) {
  LLVM_DEBUG(dbgs() << "Combining MI\n");

  MachineIRBuilder B(MI);
  MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();

  Register Src = MatchInfo.Origin;
  assert(MRI.getType(Src) == LLT::scalar(64));
  const LLT S32 = LLT::scalar(32);

  // G_UNMERGE_VALUES defines the least-significant half first.
  auto Unmerge = B.buildUnmerge(S32, Src);
  Register Lo32 = Unmerge->getOperand(0).getReg();
  Register Hi32 = Unmerge->getOperand(1).getReg();
  MRI.setRegClass(Lo32, &AMDGPU::VGPR_32RegClass);
  MRI.setRegClass(Hi32, &AMDGPU::VGPR_32RegClass);

  Register CvtDst = MRI.createGenericVirtualRegister(S32);
  MRI.setRegClass(CvtDst, &AMDGPU::VGPR_32RegClass);

  // NOTE(review): operand positions are unchanged from the original code,
  // which named the first (low-half) register "Hi32" — confirm this is the
  // intended V_CVT_PK_I16_I32 source order.
  auto CvtPk = B.buildInstr(AMDGPU::V_CVT_PK_I16_I32_e64);
  CvtPk.addDef(CvtDst);
  CvtPk.addReg(Lo32);
  CvtPk.addReg(Hi32);
  CvtPk.setMIFlags(MI.getFlags());

  const int64_t Min = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
  const int64_t Max = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);

  Register MinBoundaryDst = MRI.createGenericVirtualRegister(S32);
  MRI.setRegClass(MinBoundaryDst, &AMDGPU::VGPR_32RegClass);
  B.buildConstant(MinBoundaryDst, Min);

  Register MaxBoundaryDst = MRI.createGenericVirtualRegister(S32);
  MRI.setRegClass(MaxBoundaryDst, &AMDGPU::VGPR_32RegClass);
  B.buildConstant(MaxBoundaryDst, Max);

  Register MedDst = MRI.createGenericVirtualRegister(S32);
  MRI.setRegClass(MedDst, &AMDGPU::VGPR_32RegClass);

  // med3(Min, x, Max) clamps x into [Min, Max].
  auto Med = B.buildInstr(AMDGPU::V_MED3_I32);
  Med.addDef(MedDst);
  Med.addReg(MinBoundaryDst);
  Med.addReg(CvtDst);
  Med.addReg(MaxBoundaryDst);
  Med.setMIFlags(MI.getFlags());

  B.buildCopy(MI.getOperand(0).getReg(), MedDst);
  MI.eraseFromParent();
}
345+
248346
class AMDGPUPostLegalizerCombinerHelperState {
249347
protected:
250348
CombinerHelper &Helper;
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
; RUN: llc -global-isel -mcpu=tahiti -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX678,GFX6789 %s
2+
; RUN: llc -global-isel -mcpu=gfx900 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9,GFX6789 %s
3+
; RUN: llc -global-isel -mcpu=gfx1010 -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
4+
5+
; Clamp to exactly the signed i16 range: the combine should fire and emit
; v_cvt_pk_i16_i32 + v_med3_i32.
; GCN-LABEL: {{^}}v_clamp_i64_i16
; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
; GFX6789: v_mov_b32_e32 [[B]], 0xffff8000
; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x7fff
; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]]
; GCN: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
; GCN: v_mov_b32_e32 [[C:v[0-9]+]], 0x7fff
; GCN: v_med3_i32 [[A]], 0xffff8000, [[A]], [[C]]
define i16 @v_clamp_i64_i16(i64 %in) nounwind {
entry:
  %0 = icmp sgt i64 %in, -32768
  %1 = select i1 %0, i64 %in, i64 -32768
  %2 = icmp slt i64 %1, 32767
  %3 = select i1 %2, i64 %1, i64 32767
  %4 = trunc i64 %3 to i16

  ret i16 %4
}
24+
25+
; Same i16-range clamp with the compare order reversed (slt first, then
; sgt): the combine should still fire.
; GCN-LABEL: {{^}}v_clamp_i64_i16_reverse
; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
; GFX6789: v_mov_b32_e32 [[B]], 0xffff8000
; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x7fff
; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]]
; GCN: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
; GCN: v_mov_b32_e32 [[C:v[0-9]+]], 0x7fff
; GCN: v_med3_i32 [[A]], 0xffff8000, [[A]], [[C]]
define i16 @v_clamp_i64_i16_reverse(i64 %in) nounwind {
entry:
  %0 = icmp slt i64 %in, 32767
  %1 = select i1 %0, i64 %in, i64 32767
  %2 = icmp sgt i64 %1, -32768
  %3 = select i1 %2, i64 %1, i64 -32768
  %4 = trunc i64 %3 to i16

  ret i16 %4
}
44+
45+
; Upper bound 32769 is outside the signed i16 range: the combine must NOT
; fire, so plain cndmask selects remain.
; GCN-LABEL: {{^}}v_clamp_i64_i16_wrong_lower
; GFX6789: v_mov_b32_e32 [[B:v[0-9]+]], 0x8001
; GFX6789: v_cndmask_b32_e32 [[A:v[0-9]+]], [[B]], [[A]], vcc
; GFX6789: v_cndmask_b32_e32 [[C:v[0-9]+]], 0, [[C]], vcc

; GCN: v_cndmask_b32_e32 [[A:v[0-9]+]], 0x8001, [[A]], vcc_lo
; GCN: v_cndmask_b32_e32 [[B:v[0-9]+]], 0, [[B]], vcc_lo
define i16 @v_clamp_i64_i16_wrong_lower(i64 %in) nounwind {
entry:
  %0 = icmp slt i64 %in, 32769
  %1 = select i1 %0, i64 %in, i64 32769
  %2 = icmp sgt i64 %1, -32768
  %3 = select i1 %2, i64 %1, i64 -32768
  %4 = trunc i64 %3 to i16

  ret i16 %4
}
62+
63+
; Both bounds (-32769 and 32768) are outside the signed i16 range: the
; combine must NOT fire.
; GCN-LABEL: {{^}}v_clamp_i64_i16_wrong_lower_and_higher
; GFX6789: v_mov_b32_e32 [[B:v[0-9]+]], 0x8000
; GFX6789: v_cndmask_b32_e32 [[A:v[0-9]+]], [[B]], [[A]], vcc

; GCN: v_cndmask_b32_e32 [[A:v[0-9]+]], 0x8000, [[A]], vcc_lo
define i16 @v_clamp_i64_i16_wrong_lower_and_higher(i64 %in) nounwind {
entry:
  %0 = icmp sgt i64 %in, -32769
  %1 = select i1 %0, i64 %in, i64 -32769
  %2 = icmp slt i64 %1, 32768
  %3 = select i1 %2, i64 %1, i64 32768
  %4 = trunc i64 %3 to i16

  ret i16 %4
}
78+
79+
; Bounds (-255, 256) lie strictly inside the i16 range: the combine fires
; and v_med3_i32 uses the exact bounds, not the i16 limits.
; GCN-LABEL: {{^}}v_clamp_i64_i16_lower_than_short
; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
; GFX6789: v_mov_b32_e32 [[B]], 0xffffff01
; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x100
; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]]
; GCN: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
; GCN: v_mov_b32_e32 [[C:v[0-9]+]], 0x100
; GCN: v_med3_i32 [[A]], 0xffffff01, [[A]], [[C]]
define i16 @v_clamp_i64_i16_lower_than_short(i64 %in) nounwind {
entry:
  %0 = icmp slt i64 %in, 256
  %1 = select i1 %0, i64 %in, i64 256
  %2 = icmp sgt i64 %1, -255
  %3 = select i1 %2, i64 %1, i64 -255
  %4 = trunc i64 %3 to i16

  ret i16 %4
}
98+
99+
; Same narrow (-255, 256) clamp with the compare order reversed: the
; combine should still fire with the exact bounds.
; GCN-LABEL: {{^}}v_clamp_i64_i16_lower_than_short_reverse
; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
; GFX6789: v_mov_b32_e32 [[B]], 0xffffff01
; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x100
; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]]
; GCN: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
; GCN: v_mov_b32_e32 [[C:v[0-9]+]], 0x100
; GCN: v_med3_i32 [[A]], 0xffffff01, [[A]], [[C]]
define i16 @v_clamp_i64_i16_lower_than_short_reverse(i64 %in) nounwind {
entry:
  %0 = icmp sgt i64 %in, -255
  %1 = select i1 %0, i64 %in, i64 -255
  %2 = icmp slt i64 %1, 256
  %3 = select i1 %2, i64 %1, i64 256
  %4 = trunc i64 %3 to i16

  ret i16 %4
}
118+
119+
; Degenerate clamp to [0, 0]: the whole chain folds to a constant zero.
; GCN-LABEL: {{^}}v_clamp_i64_i16_zero
; GFX678: v_mov_b32_e32 [[A:v[0-9]+]], 0
; GCN: v_mov_b32_e32 [[A:v[0-9]+]], 0
define i16 @v_clamp_i64_i16_zero(i64 %in) nounwind {
entry:
  %0 = icmp sgt i64 %in, 0
  %1 = select i1 %0, i64 %in, i64 0
  %2 = icmp slt i64 %1, 0
  %3 = select i1 %2, i64 %1, i64 0
  %4 = trunc i64 %3 to i16

  ret i16 %4
}

0 commit comments

Comments
 (0)