[LegalizeTypes][DAG] Use SHL(X,1) instead of ADD(X,X) for variable vector indices for extraction/insertion legalization (#188277)

Avoid ADD(X,X) because it doesn't correctly handle undef elements; using SHL(X,1) instead also helps avoid some FREEZE() fold headaches.

Resurrects #86857
This commit is contained in:
Simon Pilgrim
2026-04-22 19:05:16 +01:00
committed by GitHub
parent 8f1b0f6327
commit d9bbb902fe
16 changed files with 165 additions and 266 deletions

View File

@@ -231,8 +231,8 @@ void DAGTypeLegalizer::ExpandRes_EXTRACT_VECTOR_ELT(SDNode *N, SDValue &Lo,
// Extract the elements at 2 * Idx and 2 * Idx + 1 from the new vector.
SDValue Idx = N->getOperand(1);
Idx = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx, Idx);
Idx = DAG.getNode(ISD::SHL, dl, Idx.getValueType(), Idx,
DAG.getShiftAmountConstant(1, Idx.getValueType(), dl));
Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NewVT, NewVec, Idx);
Idx = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx,
@@ -446,12 +446,12 @@ SDValue DAGTypeLegalizer::ExpandOp_INSERT_VECTOR_ELT(SDNode *N) {
std::swap(Lo, Hi);
SDValue Idx = N->getOperand(2);
Idx = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx, Idx);
Idx = DAG.getNode(ISD::SHL, dl, Idx.getValueType(), Idx,
DAG.getShiftAmountConstant(1, Idx.getValueType(), dl));
NewVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NewVecVT, NewVec, Lo, Idx);
Idx = DAG.getNode(ISD::ADD, dl,
Idx.getValueType(), Idx,
Idx = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx,
DAG.getConstant(1, dl, Idx.getValueType()));
NewVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NewVecVT, NewVec, Hi, Idx);
NewVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NewVecVT, NewVec, Hi, Idx);
// Convert the new vector to the old vector type.
return DAG.getNode(ISD::BITCAST, dl, VecVT, NewVec);

View File

@@ -2856,23 +2856,23 @@ define i128 @extract_v2i128_c(<2 x i128> %a, i32 %c) {
; CHECK-SD-NEXT: sub sp, sp, #64
; CHECK-SD-NEXT: .cfi_def_cfa_offset 64
; CHECK-SD-NEXT: adds x9, x0, x0
; CHECK-SD-NEXT: mov w8, w4
; CHECK-SD-NEXT: mov w8, #1 // =0x1
; CHECK-SD-NEXT: // kill: def $w4 killed $w4 def $x4
; CHECK-SD-NEXT: adc x10, x1, x1
; CHECK-SD-NEXT: adds x11, x2, x2
; CHECK-SD-NEXT: fmov d1, x9
; CHECK-SD-NEXT: fmov d0, x11
; CHECK-SD-NEXT: adc x12, x3, x3
; CHECK-SD-NEXT: add x8, x8, x8
; CHECK-SD-NEXT: and x9, x8, #0x2
; CHECK-SD-NEXT: orr w8, w8, #0x1
; CHECK-SD-NEXT: mov x11, sp
; CHECK-SD-NEXT: adc x11, x3, x3
; CHECK-SD-NEXT: orr w8, w8, w4, lsl #1
; CHECK-SD-NEXT: ubfiz x9, x4, #4, #1
; CHECK-SD-NEXT: mov v1.d[1], x10
; CHECK-SD-NEXT: add x10, sp, #32
; CHECK-SD-NEXT: and x8, x8, #0x3
; CHECK-SD-NEXT: mov v0.d[1], x12
; CHECK-SD-NEXT: mov v0.d[1], x11
; CHECK-SD-NEXT: mov x11, sp
; CHECK-SD-NEXT: stp q1, q0, [sp]
; CHECK-SD-NEXT: stp q1, q0, [sp, #32]
; CHECK-SD-NEXT: ldr x0, [x10, x9, lsl #3]
; CHECK-SD-NEXT: ldr x0, [x10, x9]
; CHECK-SD-NEXT: ldr x1, [x11, x8, lsl #3]
; CHECK-SD-NEXT: add sp, sp, #64
; CHECK-SD-NEXT: ret

View File

@@ -6,18 +6,15 @@
define amdgpu_kernel void @test_bitcast_llc_v128i8_v16i8(ptr addrspace(1) %out, i32 %idx) {
; GFX9-LABEL: test_bitcast_llc_v128i8_v16i8:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_lshl_b32 s0, s0, 8
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s33, s[4:5], 0x8
; GFX9-NEXT: s_lshl_b32 s0, s0, 8
; GFX9-NEXT: s_and_b32 s1, s0, 0xff
; GFX9-NEXT: s_or_b32 s0, s1, s0
; GFX9-NEXT: s_and_b32 s1, s0, 0xffff
; GFX9-NEXT: s_lshl_b32 s0, s0, 16
; GFX9-NEXT: s_or_b32 s0, s1, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_i32 s33, s33, s33
; GFX9-NEXT: s_mov_b32 s1, s0
; GFX9-NEXT: s_lshl_b32 s33, s33, 1
; GFX9-NEXT: s_mov_b32 s2, s0
; GFX9-NEXT: s_mov_b32 s3, s0
; GFX9-NEXT: s_mov_b32 s4, s0
@@ -48,7 +45,8 @@ define amdgpu_kernel void @test_bitcast_llc_v128i8_v16i8(ptr addrspace(1) %out,
; GFX9-NEXT: s_mov_b32 s29, s0
; GFX9-NEXT: s_mov_b32 s30, s0
; GFX9-NEXT: s_mov_b32 s31, s0
; GFX9-NEXT: s_add_i32 s36, s33, 3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s33, s33, 2
; GFX9-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX9-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX9-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
@@ -65,12 +63,9 @@ define amdgpu_kernel void @test_bitcast_llc_v128i8_v16i8(ptr addrspace(1) %out,
; GFX9-NEXT: v_mov_b64_e32 v[26:27], s[26:27]
; GFX9-NEXT: v_mov_b64_e32 v[28:29], s[28:29]
; GFX9-NEXT: v_mov_b64_e32 v[30:31], s[30:31]
; GFX9-NEXT: s_set_gpr_idx_on s36, gpr_idx(SRC0)
; GFX9-NEXT: v_mov_b32_e32 v35, v0
; GFX9-NEXT: s_set_gpr_idx_off
; GFX9-NEXT: s_add_i32 s0, s33, 2
; GFX9-NEXT: s_set_gpr_idx_on s0, gpr_idx(SRC0)
; GFX9-NEXT: v_mov_b32_e32 v34, v0
; GFX9-NEXT: s_set_gpr_idx_on s33, gpr_idx(SRC0)
; GFX9-NEXT: v_mov_b32_e32 v35, v3
; GFX9-NEXT: v_mov_b32_e32 v34, v2
; GFX9-NEXT: s_set_gpr_idx_off
; GFX9-NEXT: v_mov_b32_e32 v36, 0
; GFX9-NEXT: s_set_gpr_idx_on s33, gpr_idx(SRC0)
@@ -125,10 +120,11 @@ define amdgpu_kernel void @test_bitcast_llc_v128i8_v16i8(ptr addrspace(1) %out,
; GFX11-NEXT: s_mov_b32 s29, s0
; GFX11-NEXT: s_mov_b32 s30, s0
; GFX11-NEXT: s_mov_b32 s31, s0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_i32 s33, s33, s33
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshl_b32 m0, s33, 2
; GFX11-NEXT: v_dual_mov_b32 v30, s30 :: v_dual_mov_b32 v31, s31
; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
@@ -142,14 +138,8 @@ define amdgpu_kernel void @test_bitcast_llc_v128i8_v16i8(ptr addrspace(1) %out,
; GFX11-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25
; GFX11-NEXT: v_dual_mov_b32 v26, s26 :: v_dual_mov_b32 v27, s27
; GFX11-NEXT: v_dual_mov_b32 v28, s28 :: v_dual_mov_b32 v29, s29
; GFX11-NEXT: v_dual_mov_b32 v30, s30 :: v_dual_mov_b32 v31, s31
; GFX11-NEXT: s_lshl_b32 s0, s33, 1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 m0, s0, 3
; GFX11-NEXT: v_movrels_b32_e32 v34, v0
; GFX11-NEXT: s_add_i32 m0, s0, 2
; GFX11-NEXT: v_movrels_b32_e32 v33, v0
; GFX11-NEXT: s_mov_b32 m0, s0
; GFX11-NEXT: v_movrels_b32_e32 v34, v3
; GFX11-NEXT: v_movrels_b32_e32 v33, v2
; GFX11-NEXT: v_movrels_b32_e32 v32, v1
; GFX11-NEXT: v_movrels_b32_e32 v31, v0
; GFX11-NEXT: global_store_b128 v35, v[31:34], s[34:35]
@@ -198,10 +188,11 @@ define amdgpu_kernel void @test_bitcast_llc_v128i8_v16i8(ptr addrspace(1) %out,
; GFX12-NEXT: s_mov_b32 s29, s0
; GFX12-NEXT: s_mov_b32 s30, s0
; GFX12-NEXT: s_mov_b32 s31, s0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_co_i32 s33, s38, s38
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b32 m0, s38, 2
; GFX12-NEXT: v_dual_mov_b32 v30, s30 :: v_dual_mov_b32 v31, s31
; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
; GFX12-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
; GFX12-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
@@ -215,14 +206,8 @@ define amdgpu_kernel void @test_bitcast_llc_v128i8_v16i8(ptr addrspace(1) %out,
; GFX12-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25
; GFX12-NEXT: v_dual_mov_b32 v26, s26 :: v_dual_mov_b32 v27, s27
; GFX12-NEXT: v_dual_mov_b32 v28, s28 :: v_dual_mov_b32 v29, s29
; GFX12-NEXT: v_dual_mov_b32 v30, s30 :: v_dual_mov_b32 v31, s31
; GFX12-NEXT: s_lshl_b32 s0, s33, 1
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 m0, s0, 3
; GFX12-NEXT: v_movrels_b32_e32 v34, v0
; GFX12-NEXT: s_add_co_i32 m0, s0, 2
; GFX12-NEXT: v_movrels_b32_e32 v33, v0
; GFX12-NEXT: s_mov_b32 m0, s0
; GFX12-NEXT: v_movrels_b32_e32 v34, v3
; GFX12-NEXT: v_movrels_b32_e32 v33, v2
; GFX12-NEXT: v_movrels_b32_e32 v32, v1
; GFX12-NEXT: v_movrels_b32_e32 v31, v0
; GFX12-NEXT: global_store_b128 v35, v[31:34], s[36:37]
@@ -243,20 +228,13 @@ define amdgpu_kernel void @test_bitcast_llc_v64i16_v8i16(ptr addrspace(1) %out,
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_i32 s2, s2, s2
; GFX9-NEXT: s_lshl_b32 s2, s2, 1
; GFX9-NEXT: s_lshl_b32 s2, s2, 2
; GFX9-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0)
; GFX9-NEXT: v_mov_b32_e32 v3, v3
; GFX9-NEXT: v_mov_b32_e32 v2, v2
; GFX9-NEXT: v_mov_b32_e32 v1, v1
; GFX9-NEXT: s_add_i32 s3, s2, 3
; GFX9-NEXT: v_mov_b32_e32 v0, v0
; GFX9-NEXT: s_set_gpr_idx_off
; GFX9-NEXT: s_set_gpr_idx_on s3, gpr_idx(SRC0)
; GFX9-NEXT: v_mov_b32_e32 v3, v0
; GFX9-NEXT: s_set_gpr_idx_off
; GFX9-NEXT: s_add_i32 s2, s2, 2
; GFX9-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0)
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_set_gpr_idx_off
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -265,38 +243,26 @@ define amdgpu_kernel void @test_bitcast_llc_v64i16_v8i16(ptr addrspace(1) %out,
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_i32 s2, s2, s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_lshl_b32 s2, s2, 1
; GFX11-NEXT: s_mov_b32 m0, s2
; GFX11-NEXT: s_lshl_b32 m0, s2, 2
; GFX11-NEXT: v_movrels_b32_e32 v3, v3
; GFX11-NEXT: v_movrels_b32_e32 v2, v2
; GFX11-NEXT: v_movrels_b32_e32 v1, v1
; GFX11-NEXT: v_movrels_b32_e32 v0, v0
; GFX11-NEXT: s_add_i32 m0, s2, 3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_movrels_b32_e32 v3, v0
; GFX11-NEXT: s_add_i32 m0, s2, 2
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: v_movrels_b32_e32 v2, v0
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_bitcast_llc_v64i16_v8i16:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_co_i32 s2, s2, s2
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_lshl_b32 s2, s2, 1
; GFX12-NEXT: s_mov_b32 m0, s2
; GFX12-NEXT: s_lshl_b32 m0, s2, 2
; GFX12-NEXT: v_movrels_b32_e32 v3, v3
; GFX12-NEXT: v_movrels_b32_e32 v2, v2
; GFX12-NEXT: v_movrels_b32_e32 v1, v1
; GFX12-NEXT: v_movrels_b32_e32 v0, v0
; GFX12-NEXT: s_add_co_i32 m0, s2, 3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_movrels_b32_e32 v3, v0
; GFX12-NEXT: s_add_co_i32 m0, s2, 2
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: v_movrels_b32_e32 v2, v0
; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -315,20 +281,13 @@ define amdgpu_kernel void @test_bitcast_llc_v32i32_v4i32(ptr addrspace(1) %out,
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_i32 s2, s2, s2
; GFX9-NEXT: s_lshl_b32 s2, s2, 1
; GFX9-NEXT: s_lshl_b32 s2, s2, 2
; GFX9-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0)
; GFX9-NEXT: v_mov_b32_e32 v3, v3
; GFX9-NEXT: v_mov_b32_e32 v2, v2
; GFX9-NEXT: v_mov_b32_e32 v1, v1
; GFX9-NEXT: s_add_i32 s3, s2, 3
; GFX9-NEXT: v_mov_b32_e32 v0, v0
; GFX9-NEXT: s_set_gpr_idx_off
; GFX9-NEXT: s_set_gpr_idx_on s3, gpr_idx(SRC0)
; GFX9-NEXT: v_mov_b32_e32 v3, v0
; GFX9-NEXT: s_set_gpr_idx_off
; GFX9-NEXT: s_add_i32 s2, s2, 2
; GFX9-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0)
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_set_gpr_idx_off
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -337,38 +296,26 @@ define amdgpu_kernel void @test_bitcast_llc_v32i32_v4i32(ptr addrspace(1) %out,
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_i32 s2, s2, s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_lshl_b32 s2, s2, 1
; GFX11-NEXT: s_mov_b32 m0, s2
; GFX11-NEXT: s_lshl_b32 m0, s2, 2
; GFX11-NEXT: v_movrels_b32_e32 v3, v3
; GFX11-NEXT: v_movrels_b32_e32 v2, v2
; GFX11-NEXT: v_movrels_b32_e32 v1, v1
; GFX11-NEXT: v_movrels_b32_e32 v0, v0
; GFX11-NEXT: s_add_i32 m0, s2, 3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_movrels_b32_e32 v3, v0
; GFX11-NEXT: s_add_i32 m0, s2, 2
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: v_movrels_b32_e32 v2, v0
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_bitcast_llc_v32i32_v4i32:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_co_i32 s2, s2, s2
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_lshl_b32 s2, s2, 1
; GFX12-NEXT: s_mov_b32 m0, s2
; GFX12-NEXT: s_lshl_b32 m0, s2, 2
; GFX12-NEXT: v_movrels_b32_e32 v3, v3
; GFX12-NEXT: v_movrels_b32_e32 v2, v2
; GFX12-NEXT: v_movrels_b32_e32 v1, v1
; GFX12-NEXT: v_movrels_b32_e32 v0, v0
; GFX12-NEXT: s_add_co_i32 m0, s2, 3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_movrels_b32_e32 v3, v0
; GFX12-NEXT: s_add_co_i32 m0, s2, 2
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: v_movrels_b32_e32 v2, v0
; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -387,104 +334,59 @@ define amdgpu_kernel void @test_bitcast_llc_v16i64_v4i256(ptr addrspace(1) %out,
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_i32 s2, s2, s2
; GFX9-NEXT: s_add_i32 s3, s2, 1
; GFX9-NEXT: s_add_i32 s3, s3, s3
; GFX9-NEXT: s_lshl_b32 s3, s3, 1
; GFX9-NEXT: s_set_gpr_idx_on s3, gpr_idx(SRC0)
; GFX9-NEXT: s_lshl_b32 s2, s2, 3
; GFX9-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0)
; GFX9-NEXT: v_mov_b32_e32 v3, v3
; GFX9-NEXT: v_mov_b32_e32 v2, v2
; GFX9-NEXT: v_mov_b32_e32 v1, v1
; GFX9-NEXT: s_add_i32 s4, s3, 3
; GFX9-NEXT: v_mov_b32_e32 v0, v0
; GFX9-NEXT: v_mov_b32_e32 v7, v7
; GFX9-NEXT: v_mov_b32_e32 v6, v6
; GFX9-NEXT: v_mov_b32_e32 v5, v5
; GFX9-NEXT: v_mov_b32_e32 v4, v4
; GFX9-NEXT: s_set_gpr_idx_off
; GFX9-NEXT: s_add_i32 s5, s3, 2
; GFX9-NEXT: s_set_gpr_idx_on s4, gpr_idx(SRC0)
; GFX9-NEXT: v_mov_b32_e32 v3, v0
; GFX9-NEXT: s_set_gpr_idx_off
; GFX9-NEXT: s_add_i32 s2, s2, s2
; GFX9-NEXT: s_set_gpr_idx_on s5, gpr_idx(SRC0)
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_set_gpr_idx_off
; GFX9-NEXT: s_lshl_b32 s2, s2, 1
; GFX9-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0)
; GFX9-NEXT: v_mov_b32_e32 v5, v1
; GFX9-NEXT: v_mov_b32_e32 v4, v0
; GFX9-NEXT: s_set_gpr_idx_off
; GFX9-NEXT: s_add_i32 s3, s2, 3
; GFX9-NEXT: s_set_gpr_idx_on s3, gpr_idx(SRC0)
; GFX9-NEXT: v_mov_b32_e32 v7, v0
; GFX9-NEXT: s_set_gpr_idx_off
; GFX9-NEXT: s_add_i32 s2, s2, 2
; GFX9-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0)
; GFX9-NEXT: v_mov_b32_e32 v6, v0
; GFX9-NEXT: s_set_gpr_idx_off
; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_bitcast_llc_v16i64_v4i256:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x8
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_i32 s2, s0, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 s0, s2, 1
; GFX11-NEXT: s_add_i32 s2, s2, s2
; GFX11-NEXT: s_add_i32 s0, s0, s0
; GFX11-NEXT: s_lshl_b32 s2, s2, 1
; GFX11-NEXT: s_lshl_b32 s3, s0, 1
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: s_mov_b32 m0, s3
; GFX11-NEXT: v_movrels_b32_e32 v1, v1
; GFX11-NEXT: v_movrels_b32_e32 v0, v0
; GFX11-NEXT: s_add_i32 m0, s3, 3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_movrels_b32_e32 v3, v0
; GFX11-NEXT: s_add_i32 m0, s3, 2
; GFX11-NEXT: v_movrels_b32_e32 v2, v0
; GFX11-NEXT: s_mov_b32 m0, s2
; GFX11-NEXT: v_movrels_b32_e32 v5, v1
; GFX11-NEXT: v_movrels_b32_e32 v4, v0
; GFX11-NEXT: s_add_i32 m0, s2, 3
; GFX11-NEXT: v_movrels_b32_e32 v7, v0
; GFX11-NEXT: s_add_i32 m0, s2, 2
; GFX11-NEXT: v_mov_b32_e32 v8, 0
; GFX11-NEXT: v_movrels_b32_e32 v6, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1]
; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v8, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshl_b32 m0, s2, 3
; GFX11-NEXT: v_movrels_b32_e32 v3, v3
; GFX11-NEXT: v_movrels_b32_e32 v2, v2
; GFX11-NEXT: v_movrels_b32_e32 v1, v1
; GFX11-NEXT: v_movrels_b32_e32 v7, v7
; GFX11-NEXT: v_movrels_b32_e32 v6, v6
; GFX11-NEXT: v_movrels_b32_e32 v5, v5
; GFX11-NEXT: v_movrels_b32_e32 v4, v4
; GFX11-NEXT: v_movrels_b32_e32 v0, v0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_bitcast_llc_v16i64_v4i256:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_co_i32 s2, s2, s2
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_co_i32 s3, s2, 1
; GFX12-NEXT: s_add_co_i32 s2, s2, s2
; GFX12-NEXT: s_add_co_i32 s3, s3, s3
; GFX12-NEXT: s_lshl_b32 s2, s2, 1
; GFX12-NEXT: s_lshl_b32 s3, s3, 1
; GFX12-NEXT: s_mov_b32 m0, s3
; GFX12-NEXT: v_movrels_b32_e32 v1, v1
; GFX12-NEXT: v_movrels_b32_e32 v0, v0
; GFX12-NEXT: s_add_co_i32 m0, s3, 3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_movrels_b32_e32 v3, v0
; GFX12-NEXT: s_add_co_i32 m0, s3, 2
; GFX12-NEXT: v_movrels_b32_e32 v2, v0
; GFX12-NEXT: s_mov_b32 m0, s2
; GFX12-NEXT: v_movrels_b32_e32 v5, v1
; GFX12-NEXT: v_movrels_b32_e32 v4, v0
; GFX12-NEXT: s_add_co_i32 m0, s2, 3
; GFX12-NEXT: v_movrels_b32_e32 v7, v0
; GFX12-NEXT: s_add_co_i32 m0, s2, 2
; GFX12-NEXT: v_mov_b32_e32 v8, 0
; GFX12-NEXT: v_movrels_b32_e32 v6, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b32 m0, s2, 3
; GFX12-NEXT: v_movrels_b32_e32 v3, v3
; GFX12-NEXT: v_movrels_b32_e32 v2, v2
; GFX12-NEXT: v_movrels_b32_e32 v1, v1
; GFX12-NEXT: v_movrels_b32_e32 v7, v7
; GFX12-NEXT: v_movrels_b32_e32 v6, v6
; GFX12-NEXT: v_movrels_b32_e32 v5, v5
; GFX12-NEXT: v_movrels_b32_e32 v4, v4
; GFX12-NEXT: v_movrels_b32_e32 v0, v0
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1]
; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
entry:
%alloca = freeze <16 x i64> poison

View File

@@ -39,20 +39,21 @@ define void @dynamicIndex(ptr %addr, ptr %addr2, i32 %index) {
; CHECK-NEXT: mov r4, sp
; CHECK-NEXT: bfc r4, #0, #4
; CHECK-NEXT: mov sp, r4
; CHECK-NEXT: movs r3, #2
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
; CHECK-NEXT: adds r0, r2, r2
; CHECK-NEXT: and r2, r0, #2
; CHECK-NEXT: adds r0, #1
; CHECK-NEXT: and.w r0, r3, r2, lsl #1
; CHECK-NEXT: mov r12, sp
; CHECK-NEXT: and r0, r0, #3
; CHECK-NEXT: lsls r2, r2, #2
; CHECK-NEXT: mov r3, r12
; CHECK-NEXT: vst1.64 {d16, d17}, [r3:128], r2
; CHECK-NEXT: orr.w r0, r12, r0, lsl #2
; CHECK-NEXT: sub.w r4, r7, #8
; CHECK-NEXT: lsls r0, r0, #2
; CHECK-NEXT: vst1.64 {d16, d17}, [r3:128], r0
; CHECK-NEXT: lsls r0, r2, #1
; CHECK-NEXT: adds r0, #1
; CHECK-NEXT: and r0, r0, #3
; CHECK-NEXT: ldr r2, [r3]
; CHECK-NEXT: ldr r0, [r0]
; CHECK-NEXT: vldr d18, [r1]
; CHECK-NEXT: orr.w r0, r12, r0, lsl #2
; CHECK-NEXT: ldr r0, [r0]
; CHECK-NEXT: vmov d16, r2, r0
; CHECK-NEXT: vtbl.8 d16, {d16, d17}, d18
; CHECK-NEXT: vstr d16, [r1]

View File

@@ -8,8 +8,8 @@
define i8 @baz(ptr %ptr, i32 %arg) {
; CHECK-LABEL: baz:
; CHECK: @ %bb.0: @ %bb
; CHECK-NEXT: add r1, r1, r1
; CHECK-NEXT: and r1, r1, #2
; CHECK-NEXT: mov r2, #2
; CHECK-NEXT: and r1, r2, r1, lsl #1
; CHECK-NEXT: ldr r0, [r0, r1, lsl #2]
; CHECK-NEXT: bx lr
bb:

View File

@@ -168,7 +168,7 @@ define void @extract_4xi64_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
; LA32-LABEL: extract_4xi64_idx:
; LA32: # %bb.0:
; LA32-NEXT: xvld $xr0, $a0, 0
; LA32-NEXT: add.w $a0, $a2, $a2
; LA32-NEXT: slli.w $a0, $a2, 1
; LA32-NEXT: addi.w $a2, $a0, 1
; LA32-NEXT: xvreplgr2vr.w $xr1, $a2
; LA32-NEXT: xvperm.w $xr1, $xr0, $xr1

View File

@@ -268,7 +268,7 @@ define void @insert_4xi64_idx(ptr %src, ptr %dst, i64 %in, i32 %idx) nounwind {
; LA32: # %bb.0:
; LA32-NEXT: pcalau12i $a5, %pc_hi20(.LCPI15_0)
; LA32-NEXT: xvld $xr0, $a5, %pc_lo12(.LCPI15_0)
; LA32-NEXT: add.w $a4, $a4, $a4
; LA32-NEXT: slli.w $a4, $a4, 1
; LA32-NEXT: xvld $xr1, $a0, 0
; LA32-NEXT: xvreplgr2vr.w $xr2, $a4
; LA32-NEXT: xvseq.w $xr2, $xr2, $xr0

View File

@@ -167,7 +167,7 @@ define void @extract_2xi64_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
; LA32-LABEL: extract_2xi64_idx:
; LA32: # %bb.0:
; LA32-NEXT: vld $vr0, $a0, 0
; LA32-NEXT: add.w $a0, $a2, $a2
; LA32-NEXT: slli.w $a0, $a2, 1
; LA32-NEXT: addi.w $a2, $a0, 1
; LA32-NEXT: vreplve.w $vr1, $vr0, $a2
; LA32-NEXT: vreplve.w $vr0, $vr0, $a0

View File

@@ -188,7 +188,7 @@ define void @insert_2xi64_idx(ptr %src, ptr %dst, i64 %ins, i32 %idx) nounwind {
; LA32: # %bb.0:
; LA32-NEXT: pcalau12i $a5, %pc_hi20(.LCPI9_0)
; LA32-NEXT: vld $vr0, $a5, %pc_lo12(.LCPI9_0)
; LA32-NEXT: add.w $a4, $a4, $a4
; LA32-NEXT: slli.w $a4, $a4, 1
; LA32-NEXT: vld $vr1, $a0, 0
; LA32-NEXT: vreplgr2vr.w $vr2, $a4
; LA32-NEXT: vseq.w $vr2, $vr2, $vr0

View File

@@ -1442,8 +1442,8 @@ define i64 @extract_sext_v2i64_vidx() nounwind {
; O32-BE-NEXT: addu $1, $2, $25
; O32-BE-NEXT: lw $2, %got(i32)($1)
; O32-BE-NEXT: lw $2, 0($2)
; O32-BE-NEXT: addu $2, $2, $2
; O32-BE-NEXT: addiu $3, $2, 1
; O32-BE-NEXT: sll $2, $2, 1
; O32-BE-NEXT: ori $3, $2, 1
; O32-BE-NEXT: lw $1, %got(v2i64)($1)
; O32-BE-NEXT: ld.d $w0, 0($1)
; O32-BE-NEXT: addv.d $w0, $w0, $w0
@@ -1461,8 +1461,8 @@ define i64 @extract_sext_v2i64_vidx() nounwind {
; O32-LE-NEXT: addu $1, $2, $25
; O32-LE-NEXT: lw $2, %got(i32)($1)
; O32-LE-NEXT: lw $2, 0($2)
; O32-LE-NEXT: addu $2, $2, $2
; O32-LE-NEXT: addiu $3, $2, 1
; O32-LE-NEXT: sll $2, $2, 1
; O32-LE-NEXT: ori $3, $2, 1
; O32-LE-NEXT: lw $1, %got(v2i64)($1)
; O32-LE-NEXT: ld.d $w0, 0($1)
; O32-LE-NEXT: addv.d $w0, $w0, $w0
@@ -1669,8 +1669,8 @@ define i64 @extract_zext_v2i64_vidx() nounwind {
; O32-BE-NEXT: addu $1, $2, $25
; O32-BE-NEXT: lw $2, %got(i32)($1)
; O32-BE-NEXT: lw $2, 0($2)
; O32-BE-NEXT: addu $2, $2, $2
; O32-BE-NEXT: addiu $3, $2, 1
; O32-BE-NEXT: sll $2, $2, 1
; O32-BE-NEXT: ori $3, $2, 1
; O32-BE-NEXT: lw $1, %got(v2i64)($1)
; O32-BE-NEXT: ld.d $w0, 0($1)
; O32-BE-NEXT: addv.d $w0, $w0, $w0
@@ -1688,8 +1688,8 @@ define i64 @extract_zext_v2i64_vidx() nounwind {
; O32-LE-NEXT: addu $1, $2, $25
; O32-LE-NEXT: lw $2, %got(i32)($1)
; O32-LE-NEXT: lw $2, 0($2)
; O32-LE-NEXT: addu $2, $2, $2
; O32-LE-NEXT: addiu $3, $2, 1
; O32-LE-NEXT: sll $2, $2, 1
; O32-LE-NEXT: ori $3, $2, 1
; O32-LE-NEXT: lw $1, %got(v2i64)($1)
; O32-LE-NEXT: ld.d $w0, 0($1)
; O32-LE-NEXT: addv.d $w0, $w0, $w0

View File

@@ -165,14 +165,14 @@ define <2 x i64> @testDoubleword(<2 x i64> %a, i64 %b, i64 %idx) {
;
; CHECK-32-LABEL: testDoubleword:
; CHECK-32: # %bb.0: # %entry
; CHECK-32-NEXT: add 5, 6, 6
; CHECK-32-NEXT: addi 7, 1, -32
; CHECK-32-NEXT: rlwinm 5, 6, 3, 28, 28
; CHECK-32-NEXT: stxv 34, -32(1)
; CHECK-32-NEXT: rlwinm 6, 5, 2, 28, 28
; CHECK-32-NEXT: stwx 3, 7, 6
; CHECK-32-NEXT: addi 3, 5, 1
; CHECK-32-NEXT: stwx 3, 7, 5
; CHECK-32-NEXT: slwi 3, 6, 1
; CHECK-32-NEXT: addi 5, 1, -16
; CHECK-32-NEXT: lxv 0, -32(1)
; CHECK-32-NEXT: addi 3, 3, 1
; CHECK-32-NEXT: rlwinm 3, 3, 2, 28, 29
; CHECK-32-NEXT: stxv 0, -16(1)
; CHECK-32-NEXT: stwx 4, 5, 3
@@ -187,8 +187,8 @@ define <2 x i64> @testDoubleword(<2 x i64> %a, i64 %b, i64 %idx) {
;
; CHECK-32-P10-LABEL: testDoubleword:
; CHECK-32-P10: # %bb.0: # %entry
; CHECK-32-P10-NEXT: add 5, 6, 6
; CHECK-32-P10-NEXT: slwi 6, 5, 2
; CHECK-32-P10-NEXT: slwi 5, 6, 1
; CHECK-32-P10-NEXT: slwi 6, 6, 3
; CHECK-32-P10-NEXT: vinswlx 2, 6, 3
; CHECK-32-P10-NEXT: addi 3, 5, 1
; CHECK-32-P10-NEXT: slwi 3, 3, 2

View File

@@ -990,14 +990,15 @@ entry:
define i64 @getvelsl(<2 x i64> %vsl, i32 signext %i) {
; CHECK-LABEL: getvelsl:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: add 5, 3, 3
; CHECK-NEXT: rlwinm 5, 3, 3, 28, 28
; CHECK-NEXT: slwi 3, 3, 1
; CHECK-NEXT: addi 4, 1, -16
; CHECK-NEXT: rlwinm 3, 5, 2, 28, 28
; CHECK-NEXT: addi 5, 5, 1
; CHECK-NEXT: addi 3, 3, 1
; CHECK-NEXT: stxvw4x 34, 0, 4
; CHECK-NEXT: rlwinm 5, 5, 2, 28, 29
; CHECK-NEXT: lwzx 3, 4, 3
; CHECK-NEXT: lwzx 4, 4, 5
; CHECK-NEXT: lwzx 5, 4, 5
; CHECK-NEXT: rlwinm 3, 3, 2, 28, 29
; CHECK-NEXT: lwzx 4, 4, 3
; CHECK-NEXT: mr 3, 5
; CHECK-NEXT: blr
entry:
%vecext = extractelement <2 x i64> %vsl, i32 %i
@@ -1008,14 +1009,15 @@ entry:
define i64 @getvelul(<2 x i64> %vul, i32 signext %i) {
; CHECK-LABEL: getvelul:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: add 5, 3, 3
; CHECK-NEXT: rlwinm 5, 3, 3, 28, 28
; CHECK-NEXT: slwi 3, 3, 1
; CHECK-NEXT: addi 4, 1, -16
; CHECK-NEXT: rlwinm 3, 5, 2, 28, 28
; CHECK-NEXT: addi 5, 5, 1
; CHECK-NEXT: addi 3, 3, 1
; CHECK-NEXT: stxvw4x 34, 0, 4
; CHECK-NEXT: rlwinm 5, 5, 2, 28, 29
; CHECK-NEXT: lwzx 3, 4, 3
; CHECK-NEXT: lwzx 4, 4, 5
; CHECK-NEXT: lwzx 5, 4, 5
; CHECK-NEXT: rlwinm 3, 3, 2, 28, 29
; CHECK-NEXT: lwzx 4, 4, 3
; CHECK-NEXT: mr 3, 5
; CHECK-NEXT: blr
entry:
%vecext = extractelement <2 x i64> %vul, i32 %i

View File

@@ -241,12 +241,12 @@ define <2 x i64> @testDoubleword(<2 x i64> %a, i64 %b, i64 %idx) {
;
; AIX-P8-32-LABEL: testDoubleword:
; AIX-P8-32: # %bb.0: # %entry
; AIX-P8-32-NEXT: add r6, r6, r6
; AIX-P8-32-NEXT: addi r5, r1, -16
; AIX-P8-32-NEXT: rlwinm r7, r6, 2, 28, 28
; AIX-P8-32-NEXT: rlwinm r7, r6, 3, 28, 28
; AIX-P8-32-NEXT: stxvd2x v2, 0, r5
; AIX-P8-32-NEXT: stwx r3, r5, r7
; AIX-P8-32-NEXT: addi r3, r6, 1
; AIX-P8-32-NEXT: slwi r3, r6, 1
; AIX-P8-32-NEXT: addi r3, r3, 1
; AIX-P8-32-NEXT: rlwinm r3, r3, 2, 28, 29
; AIX-P8-32-NEXT: stwx r4, r5, r3
; AIX-P8-32-NEXT: lxvd2x v2, 0, r5

View File

@@ -621,7 +621,7 @@ define i64 @extractelt_v3i64_idx(<3 x i64> %a, i32 zeroext %idx) nounwind {
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; RV32-NEXT: vadd.vv v8, v8, v8
; RV32-NEXT: add a0, a0, a0
; RV32-NEXT: slli a0, a0, 1
; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; RV32-NEXT: vslidedown.vx v10, v8, a0
; RV32-NEXT: addi a1, a0, 1

View File

@@ -1013,14 +1013,13 @@ define <2 x i64> @arg_i64_v2i64(<2 x i64> %v, i64 %x, i32 %y) nounwind {
; X86AVX2-NEXT: movl 8(%ebp), %ecx
; X86AVX2-NEXT: movl 12(%ebp), %edx
; X86AVX2-NEXT: vmovaps %xmm0, (%esp)
; X86AVX2-NEXT: leal (%eax,%eax), %esi
; X86AVX2-NEXT: andl $2, %esi
; X86AVX2-NEXT: movl %ecx, (%esp,%esi,4)
; X86AVX2-NEXT: leal 1(%eax,%eax), %esi
; X86AVX2-NEXT: andl $1, %eax
; X86AVX2-NEXT: movl %ecx, (%esp,%eax,8)
; X86AVX2-NEXT: vmovaps (%esp), %xmm0
; X86AVX2-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
; X86AVX2-NEXT: leal 1(%eax,%eax), %eax
; X86AVX2-NEXT: andl $3, %eax
; X86AVX2-NEXT: movl %edx, 16(%esp,%eax,4)
; X86AVX2-NEXT: andl $3, %esi
; X86AVX2-NEXT: movl %edx, 16(%esp,%esi,4)
; X86AVX2-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0
; X86AVX2-NEXT: leal -4(%ebp), %esp
; X86AVX2-NEXT: popl %esi
@@ -1362,14 +1361,13 @@ define <2 x i64> @load_i64_v2i64(<2 x i64> %v, ptr %p, i32 %y) nounwind {
; X86AVX2-NEXT: movl (%ecx), %edx
; X86AVX2-NEXT: movl 4(%ecx), %ecx
; X86AVX2-NEXT: vmovaps %xmm0, (%esp)
; X86AVX2-NEXT: leal (%eax,%eax), %esi
; X86AVX2-NEXT: andl $2, %esi
; X86AVX2-NEXT: movl %edx, (%esp,%esi,4)
; X86AVX2-NEXT: leal 1(%eax,%eax), %esi
; X86AVX2-NEXT: andl $1, %eax
; X86AVX2-NEXT: movl %edx, (%esp,%eax,8)
; X86AVX2-NEXT: vmovaps (%esp), %xmm0
; X86AVX2-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
; X86AVX2-NEXT: leal 1(%eax,%eax), %eax
; X86AVX2-NEXT: andl $3, %eax
; X86AVX2-NEXT: movl %ecx, 16(%esp,%eax,4)
; X86AVX2-NEXT: andl $3, %esi
; X86AVX2-NEXT: movl %ecx, 16(%esp,%esi,4)
; X86AVX2-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0
; X86AVX2-NEXT: leal -4(%ebp), %esp
; X86AVX2-NEXT: popl %esi
@@ -1746,14 +1744,13 @@ define <4 x i64> @arg_i64_v4i64(<4 x i64> %v, i64 %x, i32 %y) nounwind {
; X86AVX2-NEXT: movl 8(%ebp), %ecx
; X86AVX2-NEXT: movl 12(%ebp), %edx
; X86AVX2-NEXT: vmovaps %ymm0, (%esp)
; X86AVX2-NEXT: leal (%eax,%eax), %esi
; X86AVX2-NEXT: andl $6, %esi
; X86AVX2-NEXT: movl %ecx, (%esp,%esi,4)
; X86AVX2-NEXT: leal 1(%eax,%eax), %esi
; X86AVX2-NEXT: andl $3, %eax
; X86AVX2-NEXT: movl %ecx, (%esp,%eax,8)
; X86AVX2-NEXT: vmovaps (%esp), %ymm0
; X86AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp)
; X86AVX2-NEXT: leal 1(%eax,%eax), %eax
; X86AVX2-NEXT: andl $7, %eax
; X86AVX2-NEXT: movl %edx, 32(%esp,%eax,4)
; X86AVX2-NEXT: andl $7, %esi
; X86AVX2-NEXT: movl %edx, 32(%esp,%esi,4)
; X86AVX2-NEXT: vmovaps {{[0-9]+}}(%esp), %ymm0
; X86AVX2-NEXT: leal -4(%ebp), %esp
; X86AVX2-NEXT: popl %esi
@@ -2128,14 +2125,13 @@ define <4 x i64> @load_i64_v4i64(<4 x i64> %v, ptr %p, i32 %y) nounwind {
; X86AVX2-NEXT: movl (%ecx), %edx
; X86AVX2-NEXT: movl 4(%ecx), %ecx
; X86AVX2-NEXT: vmovaps %ymm0, (%esp)
; X86AVX2-NEXT: leal (%eax,%eax), %esi
; X86AVX2-NEXT: andl $6, %esi
; X86AVX2-NEXT: movl %edx, (%esp,%esi,4)
; X86AVX2-NEXT: leal 1(%eax,%eax), %esi
; X86AVX2-NEXT: andl $3, %eax
; X86AVX2-NEXT: movl %edx, (%esp,%eax,8)
; X86AVX2-NEXT: vmovaps (%esp), %ymm0
; X86AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp)
; X86AVX2-NEXT: leal 1(%eax,%eax), %eax
; X86AVX2-NEXT: andl $7, %eax
; X86AVX2-NEXT: movl %ecx, 32(%esp,%eax,4)
; X86AVX2-NEXT: andl $7, %esi
; X86AVX2-NEXT: movl %ecx, 32(%esp,%esi,4)
; X86AVX2-NEXT: vmovaps {{[0-9]+}}(%esp), %ymm0
; X86AVX2-NEXT: leal -4(%ebp), %esp
; X86AVX2-NEXT: popl %esi

View File

@@ -11,7 +11,7 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) noun
; X86-SSE-NEXT: movl %esp, %ebp
; X86-SSE-NEXT: andl $-16, %esp
; X86-SSE-NEXT: subl $272, %esp # imm = 0x110
; X86-SSE-NEXT: movl 88(%ebp), %ecx
; X86-SSE-NEXT: movl 88(%ebp), %eax
; X86-SSE-NEXT: movdqa 72(%ebp), %xmm0
; X86-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
; X86-SSE-NEXT: xorps %xmm1, %xmm1
@@ -31,10 +31,9 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) noun
; X86-SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X86-SSE-NEXT: movaps %xmm1, (%esp)
; X86-SSE-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp)
; X86-SSE-NEXT: leal (%ecx,%ecx), %eax
; X86-SSE-NEXT: andl $31, %eax
; X86-SSE-NEXT: movl 128(%esp,%eax,4), %eax
; X86-SSE-NEXT: leal 1(%ecx,%ecx), %ecx
; X86-SSE-NEXT: leal 1(%eax,%eax), %ecx
; X86-SSE-NEXT: andl $15, %eax
; X86-SSE-NEXT: movl 128(%esp,%eax,8), %eax
; X86-SSE-NEXT: andl $31, %ecx
; X86-SSE-NEXT: movl (%esp,%ecx,4), %edx
; X86-SSE-NEXT: movl %ebp, %esp
@@ -66,7 +65,7 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) noun
; X86-AVX-NEXT: movl %esp, %ebp
; X86-AVX-NEXT: andl $-32, %esp
; X86-AVX-NEXT: subl $288, %esp # imm = 0x120
; X86-AVX-NEXT: movl 40(%ebp), %ecx
; X86-AVX-NEXT: movl 40(%ebp), %eax
; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X86-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
@@ -77,10 +76,9 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) noun
; X86-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: vmovaps %ymm1, (%esp)
; X86-AVX-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: leal (%ecx,%ecx), %eax
; X86-AVX-NEXT: andl $31, %eax
; X86-AVX-NEXT: movl 128(%esp,%eax,4), %eax
; X86-AVX-NEXT: leal 1(%ecx,%ecx), %ecx
; X86-AVX-NEXT: leal 1(%eax,%eax), %ecx
; X86-AVX-NEXT: andl $15, %eax
; X86-AVX-NEXT: movl 128(%esp,%eax,8), %eax
; X86-AVX-NEXT: andl $31, %ecx
; X86-AVX-NEXT: movl (%esp,%ecx,4), %edx
; X86-AVX-NEXT: movl %ebp, %esp