The main improvement is to the mfma tests. There are some mild regressions scattered around, and a few major ones. The worst regressions are in some of the bitcast tests; these are cases where the SGPR argument list runs out and uses VGPRs, and the copies-from-VGPR are misidentified as divergent. Most of the shufflevector tests are also regressions. These end up with cleaner MIR, but then get poor regalloc decisions.
116 lines
4.3 KiB
LLVM
116 lines
4.3 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
|
|
; Ensure that range metadata is handled correctly for vector loads.
|
|
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
|
|
|
|
; test_add2x16: adds two <2 x i16> vectors loaded through %a_ptr and %b_ptr.
; Both loads carry !range metadata (!0 on %a, !1 on %b) and !noundef.
; For gfx900 the checks expect no load instruction at all: v0 is set to the
; immediate 0x300030 (0x30 = 48 in each 16-bit half) before the return.
define <2 x i16> @test_add2x16(ptr %a_ptr, ptr %b_ptr) {
|
|
; CHECK-LABEL: test_add2x16:
|
|
; CHECK: ; %bb.0:
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: v_mov_b32_e32 v0, 0x300030
|
|
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
|
%a = load <2 x i16>, ptr %a_ptr, !range !0, !noundef !{}
|
|
%b = load <2 x i16>, ptr %b_ptr, !range !1, !noundef !{}
|
|
%result = add <2 x i16> %a, %b
|
|
ret <2 x i16> %result
|
|
}
|
|
|
|
; test_add2x32: adds two <2 x i32> vectors whose loads carry !range !2 and
; !range !3 respectively.
; The checks expect both vectors to be loaded with flat_load_dwordx2 and each
; result dword (v0, v1) to be produced by a v_or_b32 of the loaded values;
; nothing is folded to a constant in this case.
; NOTE(review): presumably the add is emitted as an or because the two ranges
; share no set bits (16 vs. 32) -- confirm against the DAG combine.
define <2 x i32> @test_add2x32(ptr %a_ptr, ptr %b_ptr) {
|
|
; CHECK-LABEL: test_add2x32:
|
|
; CHECK: ; %bb.0:
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
|
|
; CHECK-NEXT: flat_load_dwordx2 v[6:7], v[2:3]
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: v_or_b32_e32 v1, v5, v7
|
|
; CHECK-NEXT: v_or_b32_e32 v0, v4, v6
|
|
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
|
%a = load <2 x i32>, ptr %a_ptr, !range !2, !noundef !{}
|
|
%b = load <2 x i32>, ptr %b_ptr, !range !3, !noundef !{}
|
|
%result = add <2 x i32> %a, %b
|
|
ret <2 x i32> %result
|
|
}
|
|
|
|
; test_add2x64: adds two <2 x i64> vectors whose loads carry !range !4 and
; !range !5 respectively.
; Expected code: two flat_load_dwordx4, v2/v3 set to the constants 48 and 0,
; and v0/v1 produced by v_or_b32 of loaded dwords.
; The second load's destination v[6:9] overlaps the first load's v[4:7], and
; the checks expect an s_waitcnt between the two loads.
; NOTE(review): the intermediate wait is presumably a consequence of that
; register overlap -- confirm when the checks are regenerated.
define <2 x i64> @test_add2x64(ptr %a_ptr, ptr %b_ptr) {
|
|
; CHECK-LABEL: test_add2x64:
|
|
; CHECK: ; %bb.0:
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_load_dwordx4 v[6:9], v[2:3]
|
|
; CHECK-NEXT: ; kill: killed $vgpr0 killed $vgpr1
|
|
; CHECK-NEXT: ; kill: killed $vgpr2 killed $vgpr3
|
|
; CHECK-NEXT: v_mov_b32_e32 v2, 48
|
|
; CHECK-NEXT: v_mov_b32_e32 v3, 0
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: v_or_b32_e32 v1, v5, v7
|
|
; CHECK-NEXT: v_or_b32_e32 v0, v4, v6
|
|
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
|
%a = load <2 x i64>, ptr %a_ptr, !range !4, !noundef !{}
|
|
%b = load <2 x i64>, ptr %b_ptr, !range !5, !noundef !{}
|
|
%result = add <2 x i64> %a, %b
|
|
ret <2 x i64> %result
|
|
}
|
|
|
|
; test_add3x16: adds two <3 x i16> vectors whose loads carry the i16 ranges
; !0 and !1 (the same metadata nodes used by test_add2x16).
; The checks expect each vector to be loaded with flat_load_dwordx2 and both
; result dwords (v0, v1) to be produced by v_or_b32 of the loaded values;
; unlike test_add2x16, no constant is materialized.
define <3 x i16> @test_add3x16(ptr %a_ptr, ptr %b_ptr) {
|
|
; CHECK-LABEL: test_add3x16:
|
|
; CHECK: ; %bb.0:
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
|
|
; CHECK-NEXT: flat_load_dwordx2 v[6:7], v[2:3]
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: v_or_b32_e32 v1, v5, v7
|
|
; CHECK-NEXT: v_or_b32_e32 v0, v4, v6
|
|
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
|
%a = load <3 x i16>, ptr %a_ptr, !range !0, !noundef !{}
|
|
%b = load <3 x i16>, ptr %b_ptr, !range !1, !noundef !{}
|
|
%result = add <3 x i16> %a, %b
|
|
ret <3 x i16> %result
|
|
}
|
|
|
|
; test_add3x32: adds two <3 x i32> vectors whose loads carry !range !2 and
; !range !3 respectively.
; The checks expect only a single dword to be loaded through each pointer
; (the %b_ptr address in v[2:3] first, then the %a_ptr address in v[0:1]).
; v1 and v2 are set to the constant 48, while v0 is the v_or_b32 of the two
; loaded dwords.
define <3 x i32> @test_add3x32(ptr %a_ptr, ptr %b_ptr) {
|
|
; CHECK-LABEL: test_add3x32:
|
|
; CHECK: ; %bb.0:
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_load_dword v4, v[2:3]
|
|
; CHECK-NEXT: flat_load_dword v5, v[0:1]
|
|
; CHECK-NEXT: v_mov_b32_e32 v1, 48
|
|
; CHECK-NEXT: v_mov_b32_e32 v2, 48
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: v_or_b32_e32 v0, v5, v4
|
|
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
|
%a = load <3 x i32>, ptr %a_ptr, !range !2, !noundef !{}
|
|
%b = load <3 x i32>, ptr %b_ptr, !range !3, !noundef !{}
|
|
%result = add <3 x i32> %a, %b
|
|
ret <3 x i32> %result
|
|
}
|
|
|
|
; test_add3x64: adds two <3 x i64> vectors whose loads carry !range !4 and
; !range !5 respectively.
; Expected code: two flat_load_dwordx4, v0/v1 produced by v_or_b32 of loaded
; dwords, and both v[2:3] and v[4:5] set to the constants 48 and 0.
; As in test_add2x64, the second load's destination v[6:9] overlaps the first
; load's v[4:7] and the checks expect an s_waitcnt between the two loads.
; NOTE(review): the intermediate wait is presumably a consequence of that
; register overlap -- confirm when the checks are regenerated.
define <3 x i64> @test_add3x64(ptr %a_ptr, ptr %b_ptr) {
|
|
; CHECK-LABEL: test_add3x64:
|
|
; CHECK: ; %bb.0:
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_load_dwordx4 v[6:9], v[2:3]
|
|
; CHECK-NEXT: ; kill: killed $vgpr0 killed $vgpr1
|
|
; CHECK-NEXT: ; kill: killed $vgpr2 killed $vgpr3
|
|
; CHECK-NEXT: v_mov_b32_e32 v2, 48
|
|
; CHECK-NEXT: v_mov_b32_e32 v3, 0
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: v_or_b32_e32 v1, v5, v7
|
|
; CHECK-NEXT: v_or_b32_e32 v0, v4, v6
|
|
; CHECK-NEXT: v_mov_b32_e32 v4, 48
|
|
; CHECK-NEXT: v_mov_b32_e32 v5, 0
|
|
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
|
%a = load <3 x i64>, ptr %a_ptr, !range !4, !noundef !{}
|
|
%b = load <3 x i64>, ptr %b_ptr, !range !5, !noundef !{}
|
|
%result = add <3 x i64> %a, %b
|
|
ret <3 x i64> %result
|
|
}
|
|
|
|
; Range metadata nodes referenced by the !range attachments on the loads above.
; Each node is a {lo, hi} pair; per the LLVM LangRef a !range pair describes
; the half-open interval [lo, hi), so every node here admits one value only:
;   !0 / !2 / !4  -> 16  (i16 / i32 / i64), attached to the %a loads
;   !1 / !3 / !5  -> 32  (i16 / i32 / i64), attached to the %b loads
!0 = !{i16 16, i16 17 }
|
|
!1 = !{i16 32, i16 33 }
|
|
!2 = !{i32 16, i32 17 }
|
|
!3 = !{i32 32, i32 33 }
|
|
!4 = !{i64 16, i64 17 }
|
|
!5 = !{i64 32, i64 33 }
|