; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-loop-prefetch < %s | FileCheck --check-prefix=GFX12 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-loop-prefetch -mattr=+safe-smem-prefetch < %s | FileCheck --check-prefix=GFX12-SPREFETCH %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-loop-prefetch -mattr=+safe-smem-prefetch -amdgpu-expert-scheduling-mode < %s | FileCheck --check-prefix=GFX12ES2-SPREFETCH %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-loop-prefetch < %s | FileCheck --check-prefix=GFX1250 %s

; copy_flat - uniform copy loop through flat (generic address space) pointers.
;
; Each iteration loads one <4 x i32> (16 bytes) from %s[i] and stores it to
; %d[i], for i in [0, %n); the loop is skipped entirely when %n == 0.
;
; Expected codegen, as pinned by the autogenerated assertions below:
;   * plain gfx1200 run               - no prefetch instruction in the loop.
;   * +safe-smem-prefetch runs        - one s_prefetch_data per iteration, whose
;                                       base register is 0xb0 (176) bytes ahead
;                                       of the address being loaded.
;   * gfx1250 run                     - one flat_prefetch_b8 per iteration at
;                                       the same 176-byte lead.
; The expert-scheduling-mode run additionally expects the s_setreg_imm32_b32
; of HW_REG_WAVE_SCHED_MODE in the entry block and extra s_wait_alu
; instructions in the loop.
define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s, i32 %n) {
; GFX12-LABEL: copy_flat:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_eq_u32 s6, 0
; GFX12-NEXT: s_cbranch_scc1 .LBB0_3
; GFX12-NEXT: ; %bb.1: ; %for.body.preheader
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_movk_i32 s4, 0xff50
; GFX12-NEXT: s_mov_b32 s5, -1
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0
; GFX12-NEXT: .LBB0_2: ; %for.body
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[8:9], s[2:3], s[4:5]
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
; GFX12-NEXT: s_add_co_i32 s6, s6, -1
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
; GFX12-NEXT: s_cmp_lg_u32 s6, 0
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
; GFX12-NEXT: flat_load_b128 v[0:3], v[0:1]
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b128 v[4:5], v[0:3]
; GFX12-NEXT: s_cbranch_scc1 .LBB0_2
; GFX12-NEXT: .LBB0_3: ; %for.end
; GFX12-NEXT: s_endpgm
;
; GFX12-SPREFETCH-LABEL: copy_flat:
; GFX12-SPREFETCH: ; %bb.0: ; %entry
; GFX12-SPREFETCH-NEXT: s_load_b32 s6, s[4:5], 0x34
; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0
; GFX12-SPREFETCH-NEXT: s_cmp_eq_u32 s6, 0
; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB0_3
; GFX12-SPREFETCH-NEXT: ; %bb.1: ; %for.body.preheader
; GFX12-SPREFETCH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-SPREFETCH-NEXT: s_movk_i32 s4, 0xff50
; GFX12-SPREFETCH-NEXT: s_mov_b32 s5, -1
; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0
; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0
; GFX12-SPREFETCH-NEXT: .LBB0_2: ; %for.body
; GFX12-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[8:9], s[2:3], s[4:5]
; GFX12-SPREFETCH-NEXT: s_prefetch_data s[2:3], 0x0, null, 0
; GFX12-SPREFETCH-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
; GFX12-SPREFETCH-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
; GFX12-SPREFETCH-NEXT: s_add_co_i32 s6, s6, -1
; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
; GFX12-SPREFETCH-NEXT: flat_load_b128 v[0:3], v[0:1]
; GFX12-SPREFETCH-NEXT: s_cmp_lg_u32 s6, 0
; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
; GFX12-SPREFETCH-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SPREFETCH-NEXT: flat_store_b128 v[4:5], v[0:3]
; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB0_2
; GFX12-SPREFETCH-NEXT: .LBB0_3: ; %for.end
; GFX12-SPREFETCH-NEXT: s_endpgm
;
; GFX12ES2-SPREFETCH-LABEL: copy_flat:
; GFX12ES2-SPREFETCH: ; %bb.0: ; %entry
; GFX12ES2-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_SCHED_MODE, 0, 2), 2
; GFX12ES2-SPREFETCH-NEXT: s_load_b32 s6, s[4:5], 0x34
; GFX12ES2-SPREFETCH-NEXT: s_wait_kmcnt 0x0
; GFX12ES2-SPREFETCH-NEXT: s_cmp_eq_u32 s6, 0
; GFX12ES2-SPREFETCH-NEXT: s_cbranch_scc1 .LBB0_3
; GFX12ES2-SPREFETCH-NEXT: ; %bb.1: ; %for.body.preheader
; GFX12ES2-SPREFETCH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12ES2-SPREFETCH-NEXT: s_movk_i32 s4, 0xff50
; GFX12ES2-SPREFETCH-NEXT: s_mov_b32 s5, -1
; GFX12ES2-SPREFETCH-NEXT: s_wait_kmcnt 0x0
; GFX12ES2-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0
; GFX12ES2-SPREFETCH-NEXT: .LBB0_2: ; %for.body
; GFX12ES2-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12ES2-SPREFETCH-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12ES2-SPREFETCH-NEXT: s_add_nc_u64 s[8:9], s[2:3], s[4:5]
; GFX12ES2-SPREFETCH-NEXT: s_prefetch_data s[2:3], 0x0, null, 0
; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_vm_vsrc(0)
; GFX12ES2-SPREFETCH-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
; GFX12ES2-SPREFETCH-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
; GFX12ES2-SPREFETCH-NEXT: s_add_co_i32 s6, s6, -1
; GFX12ES2-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_va_vdst(1)
; GFX12ES2-SPREFETCH-NEXT: flat_load_b128 v[0:3], v[0:1]
; GFX12ES2-SPREFETCH-NEXT: s_cmp_lg_u32 s6, 0
; GFX12ES2-SPREFETCH-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
; GFX12ES2-SPREFETCH-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_va_vdst(0)
; GFX12ES2-SPREFETCH-NEXT: flat_store_b128 v[4:5], v[0:3]
; GFX12ES2-SPREFETCH-NEXT: s_cbranch_scc1 .LBB0_2
; GFX12ES2-SPREFETCH-NEXT: .LBB0_3: ; %for.end
; GFX12ES2-SPREFETCH-NEXT: s_endpgm
;
; GFX1250-LABEL: copy_flat:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_cmp_eq_u32 s6, 0
; GFX1250-NEXT: s_cbranch_scc1 .LBB0_3
; GFX1250-NEXT: ; %bb.1: ; %for.body.preheader
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0
; GFX1250-NEXT: .LBB0_2: ; %for.body
; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1250-NEXT: flat_load_b128 v[2:5], v0, s[2:3] offset:-176
; GFX1250-NEXT: flat_prefetch_b8 v0, s[2:3] scope:SCOPE_SE
; GFX1250-NEXT: s_add_co_i32 s6, s6, -1
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
; GFX1250-NEXT: s_cmp_lg_u32 s6, 0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_store_b128 v0, v[2:5], s[0:1]
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
; GFX1250-NEXT: s_cbranch_scc1 .LBB0_2
; GFX1250-NEXT: .LBB0_3: ; %for.end
; GFX1250-NEXT: s_endpgm
entry:
  ; Zero-trip guard: branch straight to the exit when %n == 0.
  %cmp6.not = icmp eq i32 %n, 0
  br i1 %cmp6.not, label %for.end, label %for.body

for.body: ; preds = %entry, %for.body
  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %idxprom = zext i32 %i.07 to i64
  ; Copy element i: one 16-byte vector from %s[i] to %d[i].
  %arrayidx = getelementptr inbounds <4 x i32>, ptr %s, i64 %idxprom
  %ld = load <4 x i32>, ptr %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds <4 x i32>, ptr %d, i64 %idxprom
  store <4 x i32> %ld, ptr %arrayidx2, align 4
  %inc = add nuw i32 %i.07, 1
  %exitcond.not = icmp eq i32 %inc, %n
  br i1 %exitcond.not, label %for.end, label %for.body

for.end: ; preds = %for.body, %entry
  ret void
}

; copy_global - uniform copy loop through global (addrspace(1)) pointers.
;
; Each iteration loads one <4 x i32> (16 bytes) from %s[i] and stores it to
; %d[i], for i in [0, %n); the loop is skipped entirely when %n == 0.
;
; Expected codegen, as pinned by the autogenerated assertions below:
;   * plain gfx1200 run               - no prefetch instruction in the loop.
;   * +safe-smem-prefetch runs        - one s_prefetch_data per iteration; its
;                                       base register is 0xb0 (176) bytes ahead
;                                       of the global_load address.
;   * gfx1250 run                     - one global_prefetch_b8 per iteration at
;                                       the same 176-byte lead.
; The expert-scheduling-mode run additionally expects the s_setreg_imm32_b32
; of HW_REG_WAVE_SCHED_MODE in the entry block and an s_wait_alu at the loop
; header.
define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrspace(1) nocapture readonly %s, i32 %n) {
; GFX12-LABEL: copy_global:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_eq_u32 s6, 0
; GFX12-NEXT: s_cbranch_scc1 .LBB1_3
; GFX12-NEXT: ; %bb.1: ; %for.body.preheader
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0
; GFX12-NEXT: .LBB1_2: ; %for.body
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: global_load_b128 v[1:4], v0, s[2:3] offset:-176
; GFX12-NEXT: s_add_co_i32 s6, s6, -1
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
; GFX12-NEXT: s_cmp_lg_u32 s6, 0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1]
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
; GFX12-NEXT: s_cbranch_scc1 .LBB1_2
; GFX12-NEXT: .LBB1_3: ; %for.end
; GFX12-NEXT: s_endpgm
;
; GFX12-SPREFETCH-LABEL: copy_global:
; GFX12-SPREFETCH: ; %bb.0: ; %entry
; GFX12-SPREFETCH-NEXT: s_load_b32 s6, s[4:5], 0x34
; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0
; GFX12-SPREFETCH-NEXT: s_cmp_eq_u32 s6, 0
; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB1_3
; GFX12-SPREFETCH-NEXT: ; %bb.1: ; %for.body.preheader
; GFX12-SPREFETCH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-SPREFETCH-NEXT: v_mov_b32_e32 v0, 0
; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0
; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0
; GFX12-SPREFETCH-NEXT: .LBB1_2: ; %for.body
; GFX12-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-SPREFETCH-NEXT: global_load_b128 v[1:4], v0, s[2:3] offset:-176
; GFX12-SPREFETCH-NEXT: s_prefetch_data s[2:3], 0x0, null, 0
; GFX12-SPREFETCH-NEXT: s_add_co_i32 s6, s6, -1
; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
; GFX12-SPREFETCH-NEXT: s_cmp_lg_u32 s6, 0
; GFX12-SPREFETCH-NEXT: s_wait_loadcnt 0x0
; GFX12-SPREFETCH-NEXT: global_store_b128 v0, v[1:4], s[0:1]
; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB1_2
; GFX12-SPREFETCH-NEXT: .LBB1_3: ; %for.end
; GFX12-SPREFETCH-NEXT: s_endpgm
;
; GFX12ES2-SPREFETCH-LABEL: copy_global:
; GFX12ES2-SPREFETCH: ; %bb.0: ; %entry
; GFX12ES2-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_SCHED_MODE, 0, 2), 2
; GFX12ES2-SPREFETCH-NEXT: s_load_b32 s6, s[4:5], 0x34
; GFX12ES2-SPREFETCH-NEXT: s_wait_kmcnt 0x0
; GFX12ES2-SPREFETCH-NEXT: s_cmp_eq_u32 s6, 0
; GFX12ES2-SPREFETCH-NEXT: s_cbranch_scc1 .LBB1_3
; GFX12ES2-SPREFETCH-NEXT: ; %bb.1: ; %for.body.preheader
; GFX12ES2-SPREFETCH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12ES2-SPREFETCH-NEXT: v_mov_b32_e32 v0, 0
; GFX12ES2-SPREFETCH-NEXT: s_wait_kmcnt 0x0
; GFX12ES2-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0
; GFX12ES2-SPREFETCH-NEXT: .LBB1_2: ; %for.body
; GFX12ES2-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_va_vdst(0) depctr_vm_vsrc(0)
; GFX12ES2-SPREFETCH-NEXT: global_load_b128 v[1:4], v0, s[2:3] offset:-176
; GFX12ES2-SPREFETCH-NEXT: s_prefetch_data s[2:3], 0x0, null, 0
; GFX12ES2-SPREFETCH-NEXT: s_add_co_i32 s6, s6, -1
; GFX12ES2-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
; GFX12ES2-SPREFETCH-NEXT: s_cmp_lg_u32 s6, 0
; GFX12ES2-SPREFETCH-NEXT: s_wait_loadcnt 0x0
; GFX12ES2-SPREFETCH-NEXT: global_store_b128 v0, v[1:4], s[0:1]
; GFX12ES2-SPREFETCH-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
; GFX12ES2-SPREFETCH-NEXT: s_cbranch_scc1 .LBB1_2
; GFX12ES2-SPREFETCH-NEXT: .LBB1_3: ; %for.end
; GFX12ES2-SPREFETCH-NEXT: s_endpgm
;
; GFX1250-LABEL: copy_global:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_cmp_eq_u32 s6, 0
; GFX1250-NEXT: s_cbranch_scc1 .LBB1_3
; GFX1250-NEXT: ; %bb.1: ; %for.body.preheader
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0
; GFX1250-NEXT: .LBB1_2: ; %for.body
; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1250-NEXT: global_load_b128 v[2:5], v0, s[2:3] offset:-176
; GFX1250-NEXT: global_prefetch_b8 v0, s[2:3] scope:SCOPE_SE
; GFX1250-NEXT: s_add_co_i32 s6, s6, -1
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
; GFX1250-NEXT: s_cmp_lg_u32 s6, 0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_store_b128 v0, v[2:5], s[0:1]
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
; GFX1250-NEXT: s_cbranch_scc1 .LBB1_2
; GFX1250-NEXT: .LBB1_3: ; %for.end
; GFX1250-NEXT: s_endpgm
entry:
  ; Zero-trip guard: branch straight to the exit when %n == 0.
  %cmp6.not = icmp eq i32 %n, 0
  br i1 %cmp6.not, label %for.end, label %for.body

for.body: ; preds = %entry, %for.body
  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %idxprom = zext i32 %i.07 to i64
  ; Copy element i: one 16-byte vector from %s[i] to %d[i].
  %arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(1) %s, i64 %idxprom
  %ld = load <4 x i32>, ptr addrspace(1) %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %d, i64 %idxprom
  store <4 x i32> %ld, ptr addrspace(1) %arrayidx2, align 4
  %inc = add nuw i32 %i.07, 1
  %exitcond.not = icmp eq i32 %inc, %n
  br i1 %exitcond.not, label %for.end, label %for.body

for.end: ; preds = %for.body, %entry
  ret void
}

; copy_constant - uniform copy loop reading from a constant (addrspace(4))
; pointer and writing to a global (addrspace(1)) pointer.
;
; Each iteration loads one <4 x i32> (16 bytes) from %s[i] and stores it to
; %d[i], for i in [0, %n); the loop is skipped entirely when %n == 0.
;
; Expected codegen, as pinned by the autogenerated assertions below (the
; source load is selected as a scalar s_load_b128 in every configuration):
;   * plain gfx1200 run               - no prefetch instruction in the loop.
;   * +safe-smem-prefetch runs        - one s_prefetch_data per iteration with
;                                       an immediate offset of 0xb0 (176) bytes
;                                       from the load address.
;   * gfx1250 run                     - one global_prefetch_b8 per iteration
;                                       with an offset of 176 bytes.
; The expert-scheduling-mode run additionally expects the s_setreg_imm32_b32
; of HW_REG_WAVE_SCHED_MODE in the entry block and extra s_wait_alu
; instructions in the loop.
define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addrspace(4) nocapture readonly %s, i32 %n) {
; GFX12-LABEL: copy_constant:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_eq_u32 s6, 0
; GFX12-NEXT: s_cbranch_scc1 .LBB2_3
; GFX12-NEXT: ; %bb.1: ; %for.body.preheader
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: .LBB2_2: ; %for.body
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x0
; GFX12-NEXT: s_add_co_i32 s6, s6, -1
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
; GFX12-NEXT: s_cmp_lg_u32 s6, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s9
; GFX12-NEXT: v_dual_mov_b32 v3, s10 :: v_dual_mov_b32 v4, s11
; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1]
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
; GFX12-NEXT: s_cbranch_scc1 .LBB2_2
; GFX12-NEXT: .LBB2_3: ; %for.end
; GFX12-NEXT: s_endpgm
;
; GFX12-SPREFETCH-LABEL: copy_constant:
; GFX12-SPREFETCH: ; %bb.0: ; %entry
; GFX12-SPREFETCH-NEXT: s_load_b32 s6, s[4:5], 0x34
; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0
; GFX12-SPREFETCH-NEXT: s_cmp_eq_u32 s6, 0
; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB2_3
; GFX12-SPREFETCH-NEXT: ; %bb.1: ; %for.body.preheader
; GFX12-SPREFETCH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-SPREFETCH-NEXT: v_mov_b32_e32 v0, 0
; GFX12-SPREFETCH-NEXT: .LBB2_2: ; %for.body
; GFX12-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0
; GFX12-SPREFETCH-NEXT: s_load_b128 s[8:11], s[2:3], 0x0
; GFX12-SPREFETCH-NEXT: s_prefetch_data s[2:3], 0xb0, null, 0
; GFX12-SPREFETCH-NEXT: s_add_co_i32 s6, s6, -1
; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
; GFX12-SPREFETCH-NEXT: s_cmp_lg_u32 s6, 0
; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0
; GFX12-SPREFETCH-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s9
; GFX12-SPREFETCH-NEXT: v_dual_mov_b32 v3, s10 :: v_dual_mov_b32 v4, s11
; GFX12-SPREFETCH-NEXT: global_store_b128 v0, v[1:4], s[0:1]
; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB2_2
; GFX12-SPREFETCH-NEXT: .LBB2_3: ; %for.end
; GFX12-SPREFETCH-NEXT: s_endpgm
;
; GFX12ES2-SPREFETCH-LABEL: copy_constant:
; GFX12ES2-SPREFETCH: ; %bb.0: ; %entry
; GFX12ES2-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_SCHED_MODE, 0, 2), 2
; GFX12ES2-SPREFETCH-NEXT: s_load_b32 s6, s[4:5], 0x34
; GFX12ES2-SPREFETCH-NEXT: s_wait_kmcnt 0x0
; GFX12ES2-SPREFETCH-NEXT: s_cmp_eq_u32 s6, 0
; GFX12ES2-SPREFETCH-NEXT: s_cbranch_scc1 .LBB2_3
; GFX12ES2-SPREFETCH-NEXT: ; %bb.1: ; %for.body.preheader
; GFX12ES2-SPREFETCH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12ES2-SPREFETCH-NEXT: v_mov_b32_e32 v0, 0
; GFX12ES2-SPREFETCH-NEXT: .LBB2_2: ; %for.body
; GFX12ES2-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12ES2-SPREFETCH-NEXT: s_wait_kmcnt 0x0
; GFX12ES2-SPREFETCH-NEXT: s_load_b128 s[8:11], s[2:3], 0x0
; GFX12ES2-SPREFETCH-NEXT: s_prefetch_data s[2:3], 0xb0, null, 0
; GFX12ES2-SPREFETCH-NEXT: s_add_co_i32 s6, s6, -1
; GFX12ES2-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
; GFX12ES2-SPREFETCH-NEXT: s_cmp_lg_u32 s6, 0
; GFX12ES2-SPREFETCH-NEXT: s_wait_kmcnt 0x0
; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_vm_vsrc(0)
; GFX12ES2-SPREFETCH-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s9
; GFX12ES2-SPREFETCH-NEXT: v_dual_mov_b32 v3, s10 :: v_dual_mov_b32 v4, s11
; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_va_vdst(0)
; GFX12ES2-SPREFETCH-NEXT: global_store_b128 v0, v[1:4], s[0:1]
; GFX12ES2-SPREFETCH-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
; GFX12ES2-SPREFETCH-NEXT: s_cbranch_scc1 .LBB2_2
; GFX12ES2-SPREFETCH-NEXT: .LBB2_3: ; %for.end
; GFX12ES2-SPREFETCH-NEXT: s_endpgm
;
; GFX1250-LABEL: copy_constant:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_cmp_eq_u32 s6, 0
; GFX1250-NEXT: s_cbranch_scc1 .LBB2_3
; GFX1250-NEXT: ; %bb.1: ; %for.body.preheader
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: .LBB2_2: ; %for.body
; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_prefetch_b8 v0, s[2:3] offset:176 scope:SCOPE_SE
; GFX1250-NEXT: s_load_b128 s[8:11], s[2:3], 0x0
; GFX1250-NEXT: s_add_co_i32 s6, s6, -1
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
; GFX1250-NEXT: s_cmp_lg_u32 s6, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], s[8:9]
; GFX1250-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX1250-NEXT: global_store_b128 v0, v[2:5], s[0:1]
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
; GFX1250-NEXT: s_cbranch_scc1 .LBB2_2
; GFX1250-NEXT: .LBB2_3: ; %for.end
; GFX1250-NEXT: s_endpgm
entry:
  ; Zero-trip guard: branch straight to the exit when %n == 0.
  %cmp6.not = icmp eq i32 %n, 0
  br i1 %cmp6.not, label %for.end, label %for.body

for.body: ; preds = %entry, %for.body
  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %idxprom = zext i32 %i.07 to i64
  ; Copy element i: one 16-byte vector from constant %s[i] to global %d[i].
  %arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(4) %s, i64 %idxprom
  %ld = load <4 x i32>, ptr addrspace(4) %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %d, i64 %idxprom
  store <4 x i32> %ld, ptr addrspace(1) %arrayidx2, align 4
  %inc = add nuw i32 %i.07, 1
  %exitcond.not = icmp eq i32 %inc, %n
  br i1 %exitcond.not, label %for.end, label %for.body

for.end: ; preds = %for.body, %entry
  ret void
}

; copy_local - uniform copy loop through local/LDS (addrspace(3)) pointers.
;
; Each iteration loads one <4 x i32> (16 bytes) from %s[i] and stores it to
; %d[i], for i in [0, %n); the loop is skipped entirely when %n == 0.
;
; Negative test for prefetch insertion: as pinned by the autogenerated
; assertions below, none of the four run configurations is expected to emit a
; prefetch instruction for this loop. The copy is selected as pairs of
; ds_load_2addr_b32 / ds_store_2addr_b32. The expert-scheduling-mode run
; differs only by the s_setreg_imm32_b32 of HW_REG_WAVE_SCHED_MODE and extra
; s_wait_alu instructions.
define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspace(3) nocapture readonly %s, i32 %n) {
; GFX12-LABEL: copy_local:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_eq_u32 s2, 0
; GFX12-NEXT: s_cbranch_scc1 .LBB3_2
; GFX12-NEXT: .LBB3_1: ; %for.body
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v2, s1
; GFX12-NEXT: v_mov_b32_e32 v4, s0
; GFX12-NEXT: s_add_co_i32 s2, s2, -1
; GFX12-NEXT: s_add_co_i32 s0, s0, 16
; GFX12-NEXT: s_add_co_i32 s1, s1, 16
; GFX12-NEXT: ds_load_2addr_b32 v[0:1], v2 offset0:2 offset1:3
; GFX12-NEXT: ds_load_2addr_b32 v[2:3], v2 offset1:1
; GFX12-NEXT: s_cmp_lg_u32 s2, 0
; GFX12-NEXT: s_wait_dscnt 0x1
; GFX12-NEXT: ds_store_2addr_b32 v4, v0, v1 offset0:2 offset1:3
; GFX12-NEXT: s_wait_dscnt 0x1
; GFX12-NEXT: ds_store_2addr_b32 v4, v2, v3 offset1:1
; GFX12-NEXT: s_cbranch_scc1 .LBB3_1
; GFX12-NEXT: .LBB3_2: ; %for.end
; GFX12-NEXT: s_endpgm
;
; GFX12-SPREFETCH-LABEL: copy_local:
; GFX12-SPREFETCH: ; %bb.0: ; %entry
; GFX12-SPREFETCH-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0
; GFX12-SPREFETCH-NEXT: s_cmp_eq_u32 s2, 0
; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB3_2
; GFX12-SPREFETCH-NEXT: .LBB3_1: ; %for.body
; GFX12-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-SPREFETCH-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-SPREFETCH-NEXT: v_mov_b32_e32 v2, s1
; GFX12-SPREFETCH-NEXT: v_mov_b32_e32 v4, s0
; GFX12-SPREFETCH-NEXT: s_add_co_i32 s2, s2, -1
; GFX12-SPREFETCH-NEXT: s_add_co_i32 s0, s0, 16
; GFX12-SPREFETCH-NEXT: s_add_co_i32 s1, s1, 16
; GFX12-SPREFETCH-NEXT: ds_load_2addr_b32 v[0:1], v2 offset0:2 offset1:3
; GFX12-SPREFETCH-NEXT: ds_load_2addr_b32 v[2:3], v2 offset1:1
; GFX12-SPREFETCH-NEXT: s_cmp_lg_u32 s2, 0
; GFX12-SPREFETCH-NEXT: s_wait_dscnt 0x1
; GFX12-SPREFETCH-NEXT: ds_store_2addr_b32 v4, v0, v1 offset0:2 offset1:3
; GFX12-SPREFETCH-NEXT: s_wait_dscnt 0x1
; GFX12-SPREFETCH-NEXT: ds_store_2addr_b32 v4, v2, v3 offset1:1
; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB3_1
; GFX12-SPREFETCH-NEXT: .LBB3_2: ; %for.end
; GFX12-SPREFETCH-NEXT: s_endpgm
;
; GFX12ES2-SPREFETCH-LABEL: copy_local:
; GFX12ES2-SPREFETCH: ; %bb.0: ; %entry
; GFX12ES2-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_SCHED_MODE, 0, 2), 2
; GFX12ES2-SPREFETCH-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12ES2-SPREFETCH-NEXT: s_wait_kmcnt 0x0
; GFX12ES2-SPREFETCH-NEXT: s_cmp_eq_u32 s2, 0
; GFX12ES2-SPREFETCH-NEXT: s_cbranch_scc1 .LBB3_2
; GFX12ES2-SPREFETCH-NEXT: .LBB3_1: ; %for.body
; GFX12ES2-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_sa_sdst(0) depctr_vm_vsrc(0)
; GFX12ES2-SPREFETCH-NEXT: v_mov_b32_e32 v2, s1
; GFX12ES2-SPREFETCH-NEXT: v_mov_b32_e32 v4, s0
; GFX12ES2-SPREFETCH-NEXT: s_add_co_i32 s2, s2, -1
; GFX12ES2-SPREFETCH-NEXT: s_add_co_i32 s0, s0, 16
; GFX12ES2-SPREFETCH-NEXT: s_add_co_i32 s1, s1, 16
; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_va_vdst(1)
; GFX12ES2-SPREFETCH-NEXT: ds_load_2addr_b32 v[0:1], v2 offset0:2 offset1:3
; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_vm_vsrc(0)
; GFX12ES2-SPREFETCH-NEXT: ds_load_2addr_b32 v[2:3], v2 offset1:1
; GFX12ES2-SPREFETCH-NEXT: s_cmp_lg_u32 s2, 0
; GFX12ES2-SPREFETCH-NEXT: s_wait_dscnt 0x1
; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_va_vdst(0)
; GFX12ES2-SPREFETCH-NEXT: ds_store_2addr_b32 v4, v0, v1 offset0:2 offset1:3
; GFX12ES2-SPREFETCH-NEXT: s_wait_dscnt 0x1
; GFX12ES2-SPREFETCH-NEXT: ds_store_2addr_b32 v4, v2, v3 offset1:1
; GFX12ES2-SPREFETCH-NEXT: s_cbranch_scc1 .LBB3_1
; GFX12ES2-SPREFETCH-NEXT: .LBB3_2: ; %for.end
; GFX12ES2-SPREFETCH-NEXT: s_endpgm
;
; GFX1250-LABEL: copy_local:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_cmp_eq_u32 s2, 0
; GFX1250-NEXT: s_cbranch_scc1 .LBB3_2
; GFX1250-NEXT: .LBB3_1: ; %for.body
; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1250-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v4, s0
; GFX1250-NEXT: s_add_co_i32 s2, s2, -1
; GFX1250-NEXT: s_add_co_i32 s0, s0, 16
; GFX1250-NEXT: s_add_co_i32 s1, s1, 16
; GFX1250-NEXT: ds_load_2addr_b32 v[0:1], v2 offset0:2 offset1:3
; GFX1250-NEXT: ds_load_2addr_b32 v[2:3], v2 offset1:1
; GFX1250-NEXT: s_cmp_lg_u32 s2, 0
; GFX1250-NEXT: s_wait_dscnt 0x1
; GFX1250-NEXT: ds_store_2addr_b32 v4, v0, v1 offset0:2 offset1:3
; GFX1250-NEXT: s_wait_dscnt 0x1
; GFX1250-NEXT: ds_store_2addr_b32 v4, v2, v3 offset1:1
; GFX1250-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1250-NEXT: .LBB3_2: ; %for.end
; GFX1250-NEXT: s_endpgm
entry:
  ; Zero-trip guard: branch straight to the exit when %n == 0.
  %cmp6.not = icmp eq i32 %n, 0
  br i1 %cmp6.not, label %for.end, label %for.body

for.body: ; preds = %entry, %for.body
  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %idxprom = zext i32 %i.07 to i64
  ; Copy element i: one 16-byte vector from %s[i] to %d[i].
  %arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(3) %s, i64 %idxprom
  %ld = load <4 x i32>, ptr addrspace(3) %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(3) %d, i64 %idxprom
  store <4 x i32> %ld, ptr addrspace(3) %arrayidx2, align 4
  %inc = add nuw i32 %i.07, 1
  %exitcond.not = icmp eq i32 %inc, %n
  br i1 %exitcond.not, label %for.end, label %for.body

for.end: ; preds = %for.body, %entry
  ret void
}

define amdgpu_kernel void @copy_flat_divergent(ptr nocapture %d, ptr nocapture readonly %s, i32 %n) {
|
|
; GFX12-LABEL: copy_flat_divergent:
|
|
; GFX12: ; %bb.0: ; %entry
|
|
; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x34
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: s_cmp_eq_u32 s0, 0
|
|
; GFX12-NEXT: s_cbranch_scc1 .LBB4_3
|
|
; GFX12-NEXT: ; %bb.1: ; %for.body.preheader
|
|
; GFX12-NEXT: s_load_b128 s[4:7], s[4:5], 0x24
|
|
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
|
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: v_add_co_u32 v2, s1, s6, v0
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
|
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1
|
|
; GFX12-NEXT: v_add_co_u32 v0, s1, s4, v0
|
|
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2
|
|
; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
|
|
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s1
|
|
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
|
|
; GFX12-NEXT: .LBB4_2: ; %for.body
|
|
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
|
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xffffff50, v2
|
|
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v3, vcc_lo
|
|
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, 16
|
|
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
|
|
; GFX12-NEXT: flat_load_b128 v[4:7], v[4:5]
|
|
; GFX12-NEXT: s_add_co_i32 s0, s0, -1
|
|
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
|
|
; GFX12-NEXT: s_cmp_lg_u32 s0, 0
|
|
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-NEXT: flat_store_b128 v[0:1], v[4:7]
|
|
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 16
|
|
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
|
|
; GFX12-NEXT: s_cbranch_scc1 .LBB4_2
|
|
; GFX12-NEXT: .LBB4_3: ; %for.end
|
|
; GFX12-NEXT: s_endpgm
|
|
;
|
|
; GFX12-SPREFETCH-LABEL: copy_flat_divergent:
|
|
; GFX12-SPREFETCH: ; %bb.0: ; %entry
|
|
; GFX12-SPREFETCH-NEXT: s_load_b32 s0, s[4:5], 0x34
|
|
; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-SPREFETCH-NEXT: s_cmp_eq_u32 s0, 0
|
|
; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB4_3
|
|
; GFX12-SPREFETCH-NEXT: ; %bb.1: ; %for.body.preheader
|
|
; GFX12-SPREFETCH-NEXT: s_load_b128 s[4:7], s[4:5], 0x24
|
|
; GFX12-SPREFETCH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
|
; GFX12-SPREFETCH-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
|
; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, s1, s6, v0
|
|
; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
|
; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1
|
|
; GFX12-SPREFETCH-NEXT: v_add_co_u32 v0, s1, s4, v0
|
|
; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2
|
|
; GFX12-SPREFETCH-NEXT: s_wait_alu depctr_va_sdst(0)
|
|
; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s1
|
|
; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
|
|
; GFX12-SPREFETCH-NEXT: .LBB4_2: ; %for.body
|
|
; GFX12-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
|
; GFX12-SPREFETCH-NEXT: v_add_co_u32 v4, vcc_lo, 0xffffff50, v2
|
|
; GFX12-SPREFETCH-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v3, vcc_lo
|
|
; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, vcc_lo, v2, 16
|
|
; GFX12-SPREFETCH-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
|
|
; GFX12-SPREFETCH-NEXT: flat_load_b128 v[4:7], v[4:5]
|
|
; GFX12-SPREFETCH-NEXT: s_add_co_i32 s0, s0, -1
|
|
; GFX12-SPREFETCH-NEXT: s_wait_alu depctr_sa_sdst(0)
|
|
; GFX12-SPREFETCH-NEXT: s_cmp_lg_u32 s0, 0
|
|
; GFX12-SPREFETCH-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-SPREFETCH-NEXT: flat_store_b128 v[0:1], v[4:7]
|
|
; GFX12-SPREFETCH-NEXT: v_add_co_u32 v0, vcc_lo, v0, 16
|
|
; GFX12-SPREFETCH-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
|
|
; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB4_2
|
|
; GFX12-SPREFETCH-NEXT: .LBB4_3: ; %for.end
|
|
; GFX12-SPREFETCH-NEXT: s_endpgm
|
|
;
|
|
; GFX12ES2-SPREFETCH-LABEL: copy_flat_divergent:
|
|
; GFX12ES2-SPREFETCH: ; %bb.0: ; %entry
|
|
; GFX12ES2-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_SCHED_MODE, 0, 2), 2
|
|
; GFX12ES2-SPREFETCH-NEXT: s_load_b32 s0, s[4:5], 0x34
|
|
; GFX12ES2-SPREFETCH-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12ES2-SPREFETCH-NEXT: s_cmp_eq_u32 s0, 0
|
|
; GFX12ES2-SPREFETCH-NEXT: s_cbranch_scc1 .LBB4_3
|
|
; GFX12ES2-SPREFETCH-NEXT: ; %bb.1: ; %for.body.preheader
|
|
; GFX12ES2-SPREFETCH-NEXT: s_load_b128 s[4:7], s[4:5], 0x24
|
|
; GFX12ES2-SPREFETCH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GFX12ES2-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
|
; GFX12ES2-SPREFETCH-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
|
; GFX12ES2-SPREFETCH-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12ES2-SPREFETCH-NEXT: v_add_co_u32 v2, s1, s6, v0
|
|
; GFX12ES2-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
|
; GFX12ES2-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1
|
|
; GFX12ES2-SPREFETCH-NEXT: v_add_co_u32 v0, s1, s4, v0
|
|
; GFX12ES2-SPREFETCH-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2
|
|
; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_va_sdst(0)
|
|
; GFX12ES2-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s1
|
|
; GFX12ES2-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
|
|
; GFX12ES2-SPREFETCH-NEXT: .LBB4_2: ; %for.body
|
|
; GFX12ES2-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX12ES2-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
|
; GFX12ES2-SPREFETCH-NEXT: v_add_co_u32 v4, vcc_lo, 0xffffff50, v2
|
|
; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12ES2-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v3, vcc_lo
|
|
; GFX12ES2-SPREFETCH-NEXT: v_add_co_u32 v2, vcc_lo, v2, 16
|
|
; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12ES2-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
|
|
; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_va_vdst(2)
|
|
; GFX12ES2-SPREFETCH-NEXT: flat_load_b128 v[4:7], v[4:5]
|
|
; GFX12ES2-SPREFETCH-NEXT: s_add_co_i32 s0, s0, -1
|
|
; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_sa_sdst(0)
|
|
; GFX12ES2-SPREFETCH-NEXT: s_cmp_lg_u32 s0, 0
|
|
; GFX12ES2-SPREFETCH-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12ES2-SPREFETCH-NEXT: flat_store_b128 v[0:1], v[4:7]
|
|
; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_vm_vsrc(0)
|
|
; GFX12ES2-SPREFETCH-NEXT: v_add_co_u32 v0, vcc_lo, v0, 16
|
|
; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12ES2-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
|
|
; GFX12ES2-SPREFETCH-NEXT: s_cbranch_scc1 .LBB4_2
|
|
; GFX12ES2-SPREFETCH-NEXT: .LBB4_3: ; %for.end
|
|
; GFX12ES2-SPREFETCH-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: copy_flat_divergent:
|
|
; GFX1250: ; %bb.0: ; %entry
|
|
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
|
|
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x34
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: s_cmp_eq_u32 s2, 0
|
|
; GFX1250-NEXT: s_cbranch_scc1 .LBB4_3
|
|
; GFX1250-NEXT: ; %bb.1: ; %for.body.preheader
|
|
; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
|
|
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GFX1250-NEXT: v_mov_b32_e32 v1, 0
|
|
; GFX1250-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff50
|
|
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
|
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[10:11], v[0:1]
|
|
; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], s[8:9], v[0:1]
|
|
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
|
; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], 0xb0, v[2:3]
|
|
; GFX1250-NEXT: .LBB4_2: ; %for.body
|
|
; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
|
|
; GFX1250-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[2:3]
|
|
; GFX1250-NEXT: flat_prefetch_b8 v[2:3] scope:SCOPE_SE
|
|
; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], 16, v[2:3]
|
|
; GFX1250-NEXT: s_add_co_i32 s2, s2, -1
|
|
; GFX1250-NEXT: s_cmp_lg_u32 s2, 0
|
|
; GFX1250-NEXT: flat_load_b128 v[4:7], v[4:5]
|
|
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX1250-NEXT: flat_store_b128 v[0:1], v[4:7]
|
|
; GFX1250-NEXT: s_wait_xcnt 0x0
|
|
; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], 16, v[0:1]
|
|
; GFX1250-NEXT: s_cbranch_scc1 .LBB4_2
|
|
; GFX1250-NEXT: .LBB4_3: ; %for.end
|
|
; GFX1250-NEXT: s_endpgm
|
|
entry:
|
|
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
|
%s.tid = getelementptr inbounds <4 x i32>, ptr %s, i32 %tid
|
|
%d.tid = getelementptr inbounds <4 x i32>, ptr %d, i32 %tid
|
|
%cmp6.not = icmp eq i32 %n, 0
|
|
br i1 %cmp6.not, label %for.end, label %for.body
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
%i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%idxprom = zext i32 %i.07 to i64
|
|
%arrayidx = getelementptr inbounds <4 x i32>, ptr %s.tid, i64 %idxprom
|
|
%ld = load <4 x i32>, ptr %arrayidx, align 4
|
|
%arrayidx2 = getelementptr inbounds <4 x i32>, ptr %d.tid, i64 %idxprom
|
|
store <4 x i32> %ld, ptr %arrayidx2, align 4
|
|
%inc = add nuw i32 %i.07, 1
|
|
%exitcond.not = icmp eq i32 %inc, %n
|
|
br i1 %exitcond.not, label %for.end, label %for.body
|
|
|
|
for.end: ; preds = %for.body, %entry
|
|
ret void
|
|
}
|
|
|
|
; Divergent global-memory copy loop: every work-item copies %n <4 x i32>
; elements from %s to %d, starting at the element selected by its workitem id.
; The autogenerated assertions below record the llc output for each RUN
; configuration; among them only the gfx1250 output contains a
; global_prefetch_b8 inside the loop body.
define amdgpu_kernel void @copy_global_divergent(ptr addrspace(1) nocapture %d, ptr addrspace(1) nocapture readonly %s, i32 %n) {
|
|
; GFX12-LABEL: copy_global_divergent:
|
|
; GFX12: ; %bb.0: ; %entry
|
|
; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x34
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: s_cmp_eq_u32 s0, 0
|
|
; GFX12-NEXT: s_cbranch_scc1 .LBB5_3
|
|
; GFX12-NEXT: ; %bb.1: ; %for.body.preheader
|
|
; GFX12-NEXT: s_load_b128 s[4:7], s[4:5], 0x24
|
|
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
|
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: v_add_co_u32 v2, s1, s6, v0
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
|
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1
|
|
; GFX12-NEXT: v_add_co_u32 v0, s1, s4, v0
|
|
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2
|
|
; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
|
|
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s1
|
|
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
|
|
; GFX12-NEXT: .LBB5_2: ; %for.body
|
|
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX12-NEXT: global_load_b128 v[4:7], v[2:3], off offset:-176
|
|
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, 16
|
|
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
|
|
; GFX12-NEXT: s_add_co_i32 s0, s0, -1
|
|
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
|
|
; GFX12-NEXT: s_cmp_lg_u32 s0, 0
|
|
; GFX12-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12-NEXT: global_store_b128 v[0:1], v[4:7], off
|
|
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 16
|
|
; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
|
|
; GFX12-NEXT: s_cbranch_scc1 .LBB5_2
|
|
; GFX12-NEXT: .LBB5_3: ; %for.end
|
|
; GFX12-NEXT: s_endpgm
|
|
;
|
|
; GFX12-SPREFETCH-LABEL: copy_global_divergent:
|
|
; GFX12-SPREFETCH: ; %bb.0: ; %entry
|
|
; GFX12-SPREFETCH-NEXT: s_load_b32 s0, s[4:5], 0x34
|
|
; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-SPREFETCH-NEXT: s_cmp_eq_u32 s0, 0
|
|
; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB5_3
|
|
; GFX12-SPREFETCH-NEXT: ; %bb.1: ; %for.body.preheader
|
|
; GFX12-SPREFETCH-NEXT: s_load_b128 s[4:7], s[4:5], 0x24
|
|
; GFX12-SPREFETCH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
|
; GFX12-SPREFETCH-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
|
; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, s1, s6, v0
|
|
; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
|
; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1
|
|
; GFX12-SPREFETCH-NEXT: v_add_co_u32 v0, s1, s4, v0
|
|
; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2
|
|
; GFX12-SPREFETCH-NEXT: s_wait_alu depctr_va_sdst(0)
|
|
; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s1
|
|
; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
|
|
; GFX12-SPREFETCH-NEXT: .LBB5_2: ; %for.body
|
|
; GFX12-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX12-SPREFETCH-NEXT: global_load_b128 v[4:7], v[2:3], off offset:-176
|
|
; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, vcc_lo, v2, 16
|
|
; GFX12-SPREFETCH-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
|
|
; GFX12-SPREFETCH-NEXT: s_add_co_i32 s0, s0, -1
|
|
; GFX12-SPREFETCH-NEXT: s_wait_alu depctr_sa_sdst(0)
|
|
; GFX12-SPREFETCH-NEXT: s_cmp_lg_u32 s0, 0
|
|
; GFX12-SPREFETCH-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12-SPREFETCH-NEXT: global_store_b128 v[0:1], v[4:7], off
|
|
; GFX12-SPREFETCH-NEXT: v_add_co_u32 v0, vcc_lo, v0, 16
|
|
; GFX12-SPREFETCH-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
|
|
; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB5_2
|
|
; GFX12-SPREFETCH-NEXT: .LBB5_3: ; %for.end
|
|
; GFX12-SPREFETCH-NEXT: s_endpgm
|
|
;
|
|
; GFX12ES2-SPREFETCH-LABEL: copy_global_divergent:
|
|
; GFX12ES2-SPREFETCH: ; %bb.0: ; %entry
|
|
; GFX12ES2-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_SCHED_MODE, 0, 2), 2
|
|
; GFX12ES2-SPREFETCH-NEXT: s_load_b32 s0, s[4:5], 0x34
|
|
; GFX12ES2-SPREFETCH-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12ES2-SPREFETCH-NEXT: s_cmp_eq_u32 s0, 0
|
|
; GFX12ES2-SPREFETCH-NEXT: s_cbranch_scc1 .LBB5_3
|
|
; GFX12ES2-SPREFETCH-NEXT: ; %bb.1: ; %for.body.preheader
|
|
; GFX12ES2-SPREFETCH-NEXT: s_load_b128 s[4:7], s[4:5], 0x24
|
|
; GFX12ES2-SPREFETCH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GFX12ES2-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
|
; GFX12ES2-SPREFETCH-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
|
; GFX12ES2-SPREFETCH-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12ES2-SPREFETCH-NEXT: v_add_co_u32 v2, s1, s6, v0
|
|
; GFX12ES2-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
|
; GFX12ES2-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1
|
|
; GFX12ES2-SPREFETCH-NEXT: v_add_co_u32 v0, s1, s4, v0
|
|
; GFX12ES2-SPREFETCH-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2
|
|
; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_va_sdst(0)
|
|
; GFX12ES2-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s1
|
|
; GFX12ES2-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
|
|
; GFX12ES2-SPREFETCH-NEXT: .LBB5_2: ; %for.body
|
|
; GFX12ES2-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_va_vdst(0)
|
|
; GFX12ES2-SPREFETCH-NEXT: global_load_b128 v[4:7], v[2:3], off offset:-176
|
|
; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_vm_vsrc(0)
|
|
; GFX12ES2-SPREFETCH-NEXT: v_add_co_u32 v2, vcc_lo, v2, 16
|
|
; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12ES2-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
|
|
; GFX12ES2-SPREFETCH-NEXT: s_add_co_i32 s0, s0, -1
|
|
; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_sa_sdst(0)
|
|
; GFX12ES2-SPREFETCH-NEXT: s_cmp_lg_u32 s0, 0
|
|
; GFX12ES2-SPREFETCH-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12ES2-SPREFETCH-NEXT: global_store_b128 v[0:1], v[4:7], off
|
|
; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_vm_vsrc(0)
|
|
; GFX12ES2-SPREFETCH-NEXT: v_add_co_u32 v0, vcc_lo, v0, 16
|
|
; GFX12ES2-SPREFETCH-NEXT: s_wait_alu depctr_va_vcc(0)
|
|
; GFX12ES2-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
|
|
; GFX12ES2-SPREFETCH-NEXT: s_cbranch_scc1 .LBB5_2
|
|
; GFX12ES2-SPREFETCH-NEXT: .LBB5_3: ; %for.end
|
|
; GFX12ES2-SPREFETCH-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: copy_global_divergent:
|
|
; GFX1250: ; %bb.0: ; %entry
|
|
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
|
|
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x34
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: s_cmp_eq_u32 s0, 0
|
|
; GFX1250-NEXT: s_cbranch_scc1 .LBB5_3
|
|
; GFX1250-NEXT: ; %bb.1: ; %for.body.preheader
|
|
; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
|
|
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
|
; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 4, v0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[10:11], v[0:1]
|
|
; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], s[8:9], v[0:1]
|
|
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
|
; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], 0xb0, v[2:3]
|
|
; GFX1250-NEXT: .LBB5_2: ; %for.body
|
|
; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX1250-NEXT: global_load_b128 v[4:7], v[2:3], off offset:-176
|
|
; GFX1250-NEXT: global_prefetch_b8 v[2:3], off scope:SCOPE_SE
|
|
; GFX1250-NEXT: s_wait_xcnt 0x0
|
|
; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], 16, v[2:3]
|
|
; GFX1250-NEXT: s_add_co_i32 s0, s0, -1
|
|
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
|
; GFX1250-NEXT: s_cmp_lg_u32 s0, 0
|
|
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-NEXT: global_store_b128 v[0:1], v[4:7], off
|
|
; GFX1250-NEXT: s_wait_xcnt 0x0
|
|
; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], 16, v[0:1]
|
|
; GFX1250-NEXT: s_cbranch_scc1 .LBB5_2
|
|
; GFX1250-NEXT: .LBB5_3: ; %for.end
|
|
; GFX1250-NEXT: s_endpgm
|
|
entry:
|
|
; Offset both pointers by the workitem id so each work-item has its own
; <4 x i32> starting element (this is what makes the addresses divergent).
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
|
%s.tid = getelementptr inbounds <4 x i32>, ptr addrspace(1) %s, i32 %tid
|
|
%d.tid = getelementptr inbounds <4 x i32>, ptr addrspace(1) %d, i32 %tid
|
|
; Skip the loop entirely when %n is zero.
%cmp6.not = icmp eq i32 %n, 0
|
|
br i1 %cmp6.not, label %for.end, label %for.body
|
|
|
|
for.body: ; preds = %entry, %for.body
|
|
; %i.07 is the induction variable: 0 on entry, %inc on the back edge.
%i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
|
|
%idxprom = zext i32 %i.07 to i64
|
|
; Load one <4 x i32> from the source and store it at the same index in the
; destination.
%arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(1) %s.tid, i64 %idxprom
|
|
%ld = load <4 x i32>, ptr addrspace(1) %arrayidx, align 4
|
|
%arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %d.tid, i64 %idxprom
|
|
store <4 x i32> %ld, ptr addrspace(1) %arrayidx2, align 4
|
|
; Advance the index and leave the loop once it reaches %n.
%inc = add nuw i32 %i.07, 1
|
|
%exitcond.not = icmp eq i32 %inc, %n
|
|
br i1 %exitcond.not, label %for.end, label %for.body
|
|
|
|
for.end: ; preds = %for.body, %entry
|
|
ret void
|
|
}
|
|
|
|
; Intrinsic called by the divergent kernels in this file to obtain %tid, the
; value used to offset each work-item's source and destination pointers.
declare i32 @llvm.amdgcn.workitem.id.x()
|