diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index b6d5b309ceb92..326a004d06f20 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -15459,6 +15459,12 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) { if (DAG.isGuaranteedNotToBeUndefOrPoison(N0, /*PoisonOnly*/ false)) return N0; + // We currently avoid folding freeze over SRA/SRL, due to the problems seen + // with (freeze (assert ext)) blocking simplifications of SRA/SRL. See for + // example https://reviews.llvm.org/D136529#4120959. + if (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL) + return SDValue(); + // Fold freeze(op(x, ...)) -> op(freeze(x), ...). // Try to push freeze through instructions that propagate but don't produce // poison as far as possible. If an operand of freeze follows three diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index dde10fd4b8c8a..dfbfaa8c894f5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5137,6 +5137,16 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, case ISD::FREEZE: case ISD::CONCAT_VECTORS: case ISD::INSERT_SUBVECTOR: + case ISD::SADDSAT: + case ISD::UADDSAT: + case ISD::SSUBSAT: + case ISD::USUBSAT: + case ISD::MULHU: + case ISD::MULHS: + case ISD::SMIN: + case ISD::SMAX: + case ISD::UMIN: + case ISD::UMAX: case ISD::AND: case ISD::XOR: case ISD::ROTL: @@ -5157,6 +5167,7 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, case ISD::BUILD_PAIR: return false; + case ISD::SELECT_CC: case ISD::SETCC: { // Integer setcc cannot create undef or poison. if (Op.getOperand(0).getValueType().isInteger()) @@ -5166,7 +5177,8 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, // based on options and flags. The options and flags also cause special // nonan condition codes to be used. Those condition codes may be preserved // even if the nonan flag is dropped somewhere. - ISD::CondCode CCCode = cast(Op.getOperand(2))->get(); + unsigned CCOp = Opcode == ISD::SETCC ? 2 : 4; + ISD::CondCode CCCode = cast(Op.getOperand(CCOp))->get(); if (((unsigned)CCCode & 0x10U)) return true; @@ -5183,6 +5195,8 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, return false; case ISD::SHL: + case ISD::SRL: + case ISD::SRA: // If the max shift amount isn't in range, then the shift can create poison. return !getValidMaximumShiftAmountConstant(Op, DemandedElts); diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index cf99b5d80e13a..b2f9bf89d9ec6 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -282,21 +282,21 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v16 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v14 ; GFX9-O0-NEXT: v_sub_co_u32_e32 v9, vcc, v9, v4 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v13, vcc, v10, v4, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v6, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v13, vcc, v10, v4, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 ; GFX9-O0-NEXT: v_xor_b32_e64 v1, v5, v1 @@ -312,21 +312,21 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v12 ; GFX9-O0-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v3 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v8, v3, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v5, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v8, v5, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v7, v3, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v2, v5, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7 ; GFX9-O0-NEXT: v_xor_b32_e64 v5, v5, v6 ; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v4 ; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec @@ -339,18 +339,26 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2 ; GFX9-O0-NEXT: v_or_b32_e64 v3, v8, v7 @@ -403,7 +411,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[6:7] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[12:13] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[12:13] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 @@ -439,7 +448,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[8:9] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 @@ -690,10 +700,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload @@ -903,14 +913,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(9) ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) @@ -1028,10 +1038,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index 6ba66ccf71868..b068d87c4d6f4 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -242,130 +242,137 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0: ; %bb.0: ; %_udiv-special-cases ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v2 -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15 -; GFX9-O0-NEXT: v_ashrrev_i64 v[12:13], s4, v[6:7] -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-O0-NEXT: v_ashrrev_i64 v[11:12], s4, v[10:11] +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9 -; GFX9-O0-NEXT: v_ashrrev_i64 v[6:7], s4, v[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13 -; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 -; GFX9-O0-NEXT: v_xor_b32_e64 v13, v11, v12 +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-O0-NEXT: v_ashrrev_i64 v[15:16], s4, v[13:14] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v12 +; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v11 +; GFX9-O0-NEXT: v_xor_b32_e64 v13, v8, v12 ; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v10 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_xor_b32_e64 v15, v4, v12 -; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v7 -; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_xor_b32_e64 v7, v5, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v12 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v16 +; GFX9-O0-NEXT: v_xor_b32_e64 v9, v8, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v15 +; GFX9-O0-NEXT: v_xor_b32_e64 v7, v7, v6 ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v4 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_xor_b32_e64 v2, v2, v6 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: v_xor_b32_e64 v9, v9, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v14 ; GFX9-O0-NEXT: v_sub_co_u32_e32 v9, vcc, v9, v12 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v10, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v10, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v13, vcc, v11, v12, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v10, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v10, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 -; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 ; GFX9-O0-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v6 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v3, vcc ; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v5, v6, vcc -; GFX9-O0-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v2, v3, vcc ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -438,7 +445,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[6:7] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[12:13] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[12:13] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 @@ -474,7 +482,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[8:9] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 @@ -589,27 +598,27 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 5 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: ; %bb.2: ; %Flow -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(6) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_5 ; GFX9-O0-NEXT: .LBB0_3: ; %Flow2 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 @@ -624,22 +633,22 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_9 ; GFX9-O0-NEXT: .LBB0_4: ; %udiv-loop-exit -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 1 ; GFX9-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1] @@ -679,27 +688,27 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 6 ; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 7 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_4 ; GFX9-O0-NEXT: .LBB0_6: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -709,30 +718,30 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 8 ; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 9 -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 63 ; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: v_lshrrev_b64 v[29:30], s4, v[2:3] @@ -872,24 +881,24 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15 ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v14 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v13 ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] ; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 4 ; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 5 @@ -899,42 +908,42 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execnz .LBB0_6 ; GFX9-O0-NEXT: s_branch .LBB0_1 ; GFX9-O0-NEXT: .LBB0_7: ; %udiv-preheader -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] @@ -1018,12 +1027,12 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 ; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v15, s9 ; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8 @@ -1034,30 +1043,30 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_6 ; GFX9-O0-NEXT: .LBB0_8: ; %udiv-bb1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 @@ -1099,14 +1108,14 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f ; GFX9-O0-NEXT: v_sub_u32_e64 v3, s4, v4 ; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], v3, v[11:12] @@ -1152,12 +1161,12 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 ; GFX9-O0-NEXT: v_or_b32_e64 v3, v3, v4 @@ -1172,18 +1181,18 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] @@ -1203,18 +1212,18 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 32 ; GFX9-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[5:6] @@ -1486,11 +1495,11 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 ; GFX9-O0-NEXT: ; kill: killed $vgpr4 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll index bf7c1c00c71df..33cc8e96f663f 100644 --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll @@ -178,18 +178,18 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $152, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sarl $31, %eax -; X86-NEXT: movl %ebp, %edx -; X86-NEXT: sarl $31, %edx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: sarl $31, %ebx ; X86-NEXT: movl %eax, %esi ; X86-NEXT: xorl %ecx, %esi -; X86-NEXT: movl %esi, %edi +; X86-NEXT: movl %esi, %ebp ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: movl %ecx, %edi ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %eax, %esi @@ -198,28 +198,29 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %eax, %ebx -; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill ; X86-NEXT: sbbl %eax, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %edi -; X86-NEXT: xorl %ebp, %edi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: xorl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-NEXT: sbbl %eax, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: xorl %edx, %esi +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: xorl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: xorl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %ebx, %ebp ; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: subl %edx, %ebp -; X86-NEXT: sbbl %edx, %esi -; X86-NEXT: sbbl %edx, %ebx -; X86-NEXT: sbbl %edx, %edi -; X86-NEXT: xorl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %eax -; X86-NEXT: orl %edi, %eax +; X86-NEXT: subl %ebx, %ebp +; X86-NEXT: sbbl %ebx, %edi +; X86-NEXT: sbbl %ebx, %edx +; X86-NEXT: sbbl %ebx, %esi +; X86-NEXT: xorl %eax, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, %eax +; X86-NEXT: orl %esi, %eax ; X86-NEXT: movl %ebp, %ecx -; X86-NEXT: orl %ebx, %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: sete %cl ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -230,91 +231,87 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: sete %al ; X86-NEXT: orb %cl, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: bsrl %edi, %edx +; X86-NEXT: bsrl %esi, %edx ; X86-NEXT: xorl $31, %edx ; X86-NEXT: bsrl %ebx, %ecx ; X86-NEXT: xorl $31, %ecx ; X86-NEXT: addl $32, %ecx -; X86-NEXT: testl %edi, %edi +; X86-NEXT: testl %esi, %esi ; X86-NEXT: cmovnel %edx, %ecx -; X86-NEXT: bsrl %esi, %edx +; X86-NEXT: bsrl %edi, %edx ; X86-NEXT: xorl $31, %edx ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: bsrl %ebp, %ebp ; X86-NEXT: xorl $31, %ebp ; X86-NEXT: addl $32, %ebp -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: testl %esi, %esi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: testl %edi, %edi ; X86-NEXT: cmovnel %edx, %ebp ; X86-NEXT: addl $64, %ebp ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %esi, %ebx ; X86-NEXT: cmovnel %ecx, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: bsrl %edi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: bsrl %ebx, %edx ; X86-NEXT: xorl $31, %edx ; X86-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NEXT: bsrl %eax, %ecx ; X86-NEXT: xorl $31, %ecx ; X86-NEXT: addl $32, %ecx -; X86-NEXT: testl %edi, %edi +; X86-NEXT: testl %ebx, %ebx ; X86-NEXT: cmovnel %edx, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: bsrl %ebx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: bsrl %edi, %esi ; X86-NEXT: xorl $31, %esi ; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: xorl $31, %edx ; X86-NEXT: addl $32, %edx -; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: testl %edi, %edi ; X86-NEXT: cmovnel %esi, %edx ; X86-NEXT: addl $64, %edx ; X86-NEXT: movl %eax, %esi -; X86-NEXT: orl %edi, %esi -; X86-NEXT: movl %edi, %ebx +; X86-NEXT: orl %ebx, %esi ; X86-NEXT: cmovnel %ecx, %edx ; X86-NEXT: xorl %esi, %esi ; X86-NEXT: subl %edx, %ebp +; X86-NEXT: movl $0, %edi +; X86-NEXT: sbbl %edi, %edi ; X86-NEXT: movl $0, %edx ; X86-NEXT: sbbl %edx, %edx ; X86-NEXT: movl $0, %eax ; X86-NEXT: sbbl %eax, %eax -; X86-NEXT: movl $0, %edi -; X86-NEXT: sbbl %edi, %edi ; X86-NEXT: movl $127, %ecx ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: cmpl %ebp, %ecx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl $0, %ecx +; X86-NEXT: sbbl %edi, %ecx ; X86-NEXT: movl $0, %ecx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl $0, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %eax, %ecx -; X86-NEXT: movl $0, %ecx -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %edi, %ecx ; X86-NEXT: setb %cl ; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: cmovnel %esi, %edi -; X86-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-NEXT: movl %ebx, %edx ; X86-NEXT: cmovnel %esi, %edx +; X86-NEXT: movl (%esp), %ebx # 4-byte Reload +; X86-NEXT: cmovnel %esi, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: cmovnel %esi, %eax ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: jne .LBB4_1 -; X86-NEXT: # %bb.8: # %_udiv-special-cases -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: xorl $127, %ebp -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: jne .LBB4_8 +; X86-NEXT: # %bb.1: # %_udiv-special-cases +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: xorl $127, %edi +; X86-NEXT: orl %ebp, %edi +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: orl %ebp, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: je .LBB4_9 -; X86-NEXT: # %bb.5: # %udiv-bb1 +; X86-NEXT: orl %edi, %ecx +; X86-NEXT: je .LBB4_8 +; X86-NEXT: # %bb.2: # %udiv-bb1 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -344,225 +341,224 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: shll %cl, %edx ; X86-NEXT: notb %cl ; X86-NEXT: movl 140(%esp,%edi), %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: shrl %esi -; X86-NEXT: shrl %cl, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl %esi, %edx -; X86-NEXT: movl 136(%esp,%edi), %esi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: shrl %ebx +; X86-NEXT: shrl %cl, %ebx +; X86-NEXT: orl %edx, %ebx +; X86-NEXT: movl 136(%esp,%edi), %edx ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: shll %cl, %esi +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: adcl $0, %edi -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: jae .LBB4_2 +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: adcl $0, %edx +; X86-NEXT: jae .LBB4_3 ; X86-NEXT: # %bb.6: -; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: xorl %edi, %edi ; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: jmp .LBB4_7 -; X86-NEXT: .LBB4_1: -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: jmp .LBB4_9 -; X86-NEXT: .LBB4_2: # %udiv-preheader -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NEXT: movl (%esp), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: .LBB4_3: # %udiv-preheader +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl (%esp), %esi # 4-byte Reload +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movb %bl, %ch +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movb %dl, %ch ; X86-NEXT: andb $7, %ch -; X86-NEXT: movb %bl, %cl +; X86-NEXT: movb %dl, %cl ; X86-NEXT: shrb $3, %cl ; X86-NEXT: andb $15, %cl -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl %cl, %ebx -; X86-NEXT: movl 100(%esp,%ebx), %ebp -; X86-NEXT: movl %ebp, (%esp) # 4-byte Spill -; X86-NEXT: movl 96(%esp,%ebx), %edi -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, %edx +; X86-NEXT: movzbl %cl, %edx +; X86-NEXT: movl 100(%esp,%edx), %esi +; X86-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-NEXT: movl 96(%esp,%edx), %edi +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, %ebp ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shrdl %cl, %ebp, %edx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 88(%esp,%ebx), %edx -; X86-NEXT: movl 92(%esp,%ebx), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: shrl %cl, %eax +; X86-NEXT: shrdl %cl, %esi, %ebp +; X86-NEXT: movl 88(%esp,%edx), %ebx +; X86-NEXT: movl 92(%esp,%edx), %esi +; X86-NEXT: movl %esi, %edx +; X86-NEXT: shrl %cl, %edx ; X86-NEXT: notb %cl ; X86-NEXT: addl %edi, %edi ; X86-NEXT: shll %cl, %edi -; X86-NEXT: orl %eax, %edi +; X86-NEXT: orl %edx, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movb %ch, %cl ; X86-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill -; X86-NEXT: shrdl %cl, %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: addl $-1, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: adcl $-1, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: adcl $-1, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: adcl $-1, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shrdl %cl, %esi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: addl $-1, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: adcl $-1, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: adcl $-1, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: adcl $-1, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB4_3: # %udiv-do-while +; X86-NEXT: .LBB4_4: # %udiv-do-while ; X86-NEXT: # =>This Inner Loop Header: Depth=1 ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebp, %edx +; X86-NEXT: movl %ebp, %esi ; X86-NEXT: shldl $1, %ebp, (%esp) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: shldl $1, %ebp, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl $1, %ebx, %ebp -; X86-NEXT: shldl $1, %edi, %ebx +; X86-NEXT: shldl $1, %ebp, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl $1, %edx, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl $1, %edi, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: shldl $1, %ecx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: orl %eax, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: orl %ebx, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl $1, %edi, %ecx -; X86-NEXT: orl %eax, %ecx +; X86-NEXT: shldl $1, %eax, %ecx +; X86-NEXT: orl %ebx, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl $1, %esi, %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %esi, %esi -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: shldl $1, %ecx, %eax +; X86-NEXT: orl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: sbbl %ebp, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl %edx, %ecx +; X86-NEXT: sbbl %esi, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: sbbl (%esp), %ecx # 4-byte Folded Reload ; X86-NEXT: sarl $31, %ecx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: andl $1, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: andl $1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: movl %ecx, %edi ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: subl %ecx, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: subl %ecx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: sbbl %eax, %ebp ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %edi, %edx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: sbbl %esi, (%esp) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: sbbl %edi, %esi +; X86-NEXT: movl %esi, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: sbbl %ebx, (%esp) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: addl $-1, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: adcl $-1, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: adcl $-1, %edi ; X86-NEXT: adcl $-1, %edx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %edx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: adcl $-1, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: adcl $-1, %esi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %esi, %edi ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %ebx, %ecx ; X86-NEXT: orl %edi, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: jne .LBB4_3 -; X86-NEXT: # %bb.4: +; X86-NEXT: jne .LBB4_4 +; X86-NEXT: # %bb.5: ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: .LBB4_7: # %udiv-loop-exit ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl $1, %edx, %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: shldl $1, %eax, %edx +; X86-NEXT: .LBB4_7: # %udiv-loop-exit +; X86-NEXT: shldl $1, %ebx, %edx ; X86-NEXT: orl %ecx, %edx +; X86-NEXT: shldl $1, %eax, %ebx +; X86-NEXT: orl %ecx, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: shldl $1, %esi, %eax ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: addl %esi, %esi -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: .LBB4_9: # %udiv-end -; X86-NEXT: xorl %ebx, %edi -; X86-NEXT: xorl %ebx, %edx -; X86-NEXT: xorl %ebx, %eax -; X86-NEXT: xorl %ebx, %esi -; X86-NEXT: subl %ebx, %esi +; X86-NEXT: orl %edi, %esi +; X86-NEXT: .LBB4_8: # %udiv-end +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: xorl %ecx, %edx +; X86-NEXT: xorl %ecx, %ebx +; X86-NEXT: xorl %ecx, %eax +; X86-NEXT: xorl %ecx, %esi +; X86-NEXT: subl %ecx, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: sbbl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: sbbl %ebx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, (%ebp) -; X86-NEXT: movl %eax, 4(%ebp) -; X86-NEXT: movl %edx, 8(%ebp) -; X86-NEXT: movl %edi, 12(%ebp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: sbbl %ecx, %ebx +; X86-NEXT: sbbl %ecx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %esi, (%ecx) +; X86-NEXT: movl %eax, 4(%ecx) +; X86-NEXT: movl %ebx, 8(%ecx) +; X86-NEXT: movl %edx, 12(%ecx) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ebx -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %edi +; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl %esi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %esi -; X86-NEXT: addl %edi, %eax +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: adcl %ebx, %edx +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %edi, %eax +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: imull %eax, %ecx -; X86-NEXT: mull %ebx +; X86-NEXT: mull %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: imull {{[0-9]+}}(%esp), %edi +; X86-NEXT: addl %edx, %edi +; X86-NEXT: addl %ecx, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: imull %esi, %ecx @@ -572,7 +568,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: addl %edx, %ebp ; X86-NEXT: addl %ecx, %ebp ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl %ebx, %ebp +; X86-NEXT: adcl %edi, %ebp ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll index b212e9438e1b5..c79da37988e40 100644 --- a/llvm/test/CodeGen/X86/freeze-binary.ll +++ b/llvm/test/CodeGen/X86/freeze-binary.ll @@ -459,8 +459,7 @@ define i32 @freeze_ashr(i32 %a0) nounwind { ; X64-LABEL: freeze_ashr: ; X64: # %bb.0: ; X64-NEXT: movl %edi, %eax -; X64-NEXT: sarl $3, %eax -; X64-NEXT: sarl $3, %eax +; X64-NEXT: sarl $6, %eax ; X64-NEXT: retq %x = ashr i32 %a0, 3 %y = freeze i32 %x @@ -531,30 +530,12 @@ define i32 @freeze_ashr_outofrange(i32 %a0) nounwind { define <8 x i16> @freeze_ashr_vec(<8 x i16> %a0) nounwind { ; X86-LABEL: freeze_ashr_vec: ; X86: # %bb.0: -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: psraw $1, %xmm2 -; X86-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0] -; X86-NEXT: movdqa %xmm1, %xmm3 -; X86-NEXT: pandn %xmm2, %xmm3 -; X86-NEXT: psraw $3, %xmm0 -; X86-NEXT: pand %xmm1, %xmm0 -; X86-NEXT: por %xmm3, %xmm0 -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: psraw $3, %xmm2 -; X86-NEXT: psraw $1, %xmm0 -; X86-NEXT: pand %xmm1, %xmm0 -; X86-NEXT: pandn %xmm2, %xmm1 -; X86-NEXT: por %xmm1, %xmm0 +; X86-NEXT: psraw $4, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: freeze_ashr_vec: ; X64: # %bb.0: -; X64-NEXT: vpsraw $1, %xmm0, %xmm1 -; X64-NEXT: vpsraw $3, %xmm0, %xmm0 -; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; X64-NEXT: vpsraw $3, %xmm0, %xmm1 -; X64-NEXT: vpsraw $1, %xmm0, %xmm0 -; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; X64-NEXT: vpsraw $4, %xmm0, %xmm0 ; X64-NEXT: retq %x = ashr <8 x i16> %a0, %y = freeze <8 x i16> %x @@ -592,8 +573,7 @@ define i32 @freeze_lshr(i32 %a0) nounwind { ; X64-LABEL: freeze_lshr: ; X64: # %bb.0: ; X64-NEXT: movl %edi, %eax -; X64-NEXT: shrl $2, %eax -; X64-NEXT: shrl %eax +; X64-NEXT: shrl $3, %eax ; X64-NEXT: retq %x = lshr i32 %a0, 2 %y = freeze i32 %x @@ -664,30 +644,12 @@ define i32 @freeze_lshr_outofrange(i32 %a0) nounwind { define <8 x i16> @freeze_lshr_vec(<8 x i16> %a0) nounwind { ; X86-LABEL: freeze_lshr_vec: ; X86: # %bb.0: -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: psrlw $1, %xmm2 -; X86-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0] -; X86-NEXT: movdqa %xmm1, %xmm3 -; X86-NEXT: pandn %xmm2, %xmm3 -; X86-NEXT: psrlw $2, %xmm0 -; X86-NEXT: pand %xmm1, %xmm0 -; X86-NEXT: por %xmm3, %xmm0 -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: psrlw $2, %xmm2 -; X86-NEXT: psrlw $1, %xmm0 -; X86-NEXT: pand %xmm1, %xmm0 -; X86-NEXT: pandn %xmm2, %xmm1 -; X86-NEXT: por %xmm1, %xmm0 +; X86-NEXT: psrlw $3, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: freeze_lshr_vec: ; X64: # %bb.0: -; X64-NEXT: vpsrlw $1, %xmm0, %xmm1 -; X64-NEXT: vpsrlw $2, %xmm0, %xmm0 -; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; X64-NEXT: vpsrlw $2, %xmm0, %xmm1 -; X64-NEXT: vpsrlw $1, %xmm0, %xmm0 -; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; X64-NEXT: vpsrlw $3, %xmm0, %xmm0 ; X64-NEXT: retq %x = lshr <8 x i16> %a0, %y = freeze <8 x i16> %x diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll index 2fdf6ef224ca9..366dad1612b43 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll @@ -684,22 +684,21 @@ define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwin ; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4 ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm5 -; AVX512F-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2 ; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm5 ; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm5 +; AVX512F-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512F-NEXT: vpandq %zmm6, %zmm5, %zmm5 -; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512F-NEXT: vpsubb %ymm2, %ymm7, %ymm2 -; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1 +; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2 +; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 @@ -715,22 +714,21 @@ define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwin ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm5 ; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512VL-FALLBACK-NEXT: vpandq %zmm6, %zmm5, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm7, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm6, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm6, %ymm1 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 @@ -772,20 +770,19 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounw ; AVX512F-NEXT: vpminub %ymm1, %ymm0, %ymm6 ; AVX512F-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm7 ; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 -; AVX512F-NEXT: vpmaxub %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsubb %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpsubb %ymm6, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512F-NEXT: vpmaxub %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpsubb %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm4 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512F-NEXT: vpandq %zmm6, %zmm4, %zmm4 -; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512F-NEXT: vpsubb %ymm2, %ymm7, %ymm2 -; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1 +; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2 +; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512F-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 @@ -803,20 +800,19 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounw ; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm6 ; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm7 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm2, %ymm3, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm4, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm2, %ymm3, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm4, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512VL-FALLBACK-NEXT: vpandq %zmm6, %zmm4, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm7, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm6, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm6, %ymm1 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512VL-FALLBACK-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2