exp2 maps to a single v_exp_f32 (quarter rate), while exp and exp10 need additional full-rate costs for the base conversion. The cost is also adjusted for non-afn calls and for the denormal mode. This enables SLP vectorization of exp2 on targets with packed f32 ops.
84 lines
4.8 KiB
LLVM
84 lines
4.8 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer %s | FileCheck -check-prefix=GFX9 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -passes=slp-vectorizer %s | FileCheck -check-prefix=GFX1250 %s

; Two adjacent floats, %arg[tid] and %arg[tid + 1], are each loaded, passed
; through a scalar @llvm.exp2.f32 call and stored back to the address they were
; loaded from. The recorded assertions show the expected slp-vectorizer result
; per target: the gfx900 invocation keeps both scalar calls as written, while
; the gfx1250 invocation replaces them with a single <2 x float> load,
; @llvm.exp2.v2f32 call and <2 x float> store.
define amdgpu_kernel void @exp2_combine(ptr addrspace(1) %arg) {
; GFX9-LABEL: define amdgpu_kernel void @exp2_combine(
; GFX9-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
; GFX9-NEXT:    [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; GFX9-NEXT:    [[IDX:%.*]] = zext i32 [[TID]] to i64
; GFX9-NEXT:    [[PTR0:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[ARG]], i64 [[IDX]]
; GFX9-NEXT:    [[VAL0:%.*]] = load float, ptr addrspace(1) [[PTR0]], align 4
; GFX9-NEXT:    [[EXP0:%.*]] = call float @llvm.exp2.f32(float [[VAL0]])
; GFX9-NEXT:    store float [[EXP0]], ptr addrspace(1) [[PTR0]], align 4
; GFX9-NEXT:    [[IDX1:%.*]] = add nuw nsw i64 [[IDX]], 1
; GFX9-NEXT:    [[PTR1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[ARG]], i64 [[IDX1]]
; GFX9-NEXT:    [[VAL1:%.*]] = load float, ptr addrspace(1) [[PTR1]], align 4
; GFX9-NEXT:    [[EXP1:%.*]] = call float @llvm.exp2.f32(float [[VAL1]])
; GFX9-NEXT:    store float [[EXP1]], ptr addrspace(1) [[PTR1]], align 4
; GFX9-NEXT:    ret void
;
; GFX1250-LABEL: define amdgpu_kernel void @exp2_combine(
; GFX1250-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
; GFX1250-NEXT:    [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; GFX1250-NEXT:    [[IDX:%.*]] = zext i32 [[TID]] to i64
; GFX1250-NEXT:    [[PTR0:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[ARG]], i64 [[IDX]]
; GFX1250-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[PTR0]], align 4
; GFX1250-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.exp2.v2f32(<2 x float> [[TMP1]])
; GFX1250-NEXT:    store <2 x float> [[TMP2]], ptr addrspace(1) [[PTR0]], align 4
; GFX1250-NEXT:    ret void
;
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %idx = zext i32 %tid to i64
  ; Element 0: %arg[tid], updated in place.
  %ptr0 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 %idx
  %val0 = load float, ptr addrspace(1) %ptr0, align 4
  %exp0 = call float @llvm.exp2.f32(float %val0)
  store float %exp0, ptr addrspace(1) %ptr0, align 4
  ; Element 1: %arg[tid + 1], the adjacent float, updated the same way.
  %idx1 = add nuw nsw i64 %idx, 1
  %ptr1 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 %idx1
  %val1 = load float, ptr addrspace(1) %ptr1, align 4
  %exp1 = call float @llvm.exp2.f32(float %val1)
  store float %exp1, ptr addrspace(1) %ptr1, align 4
  ret void
}
|
|
|
|
; Two adjacent floats, %arg[tid] and %arg[tid + 1], are each loaded, passed
; through a scalar `call afn float @llvm.exp.f32` and stored back to the address
; they were loaded from. The recorded assertions show the expected
; slp-vectorizer result per target: the gfx900 invocation keeps both scalar afn
; calls as written, while the gfx1250 invocation replaces them with a single
; <2 x float> load, `call afn` to @llvm.exp.v2f32 and <2 x float> store.
; Review note: only the afn form of exp is exercised by this function; whether
; a non-afn counterpart exists elsewhere in the file should be confirmed.
define amdgpu_kernel void @exp_afn_combine(ptr addrspace(1) %arg) {
; GFX9-LABEL: define amdgpu_kernel void @exp_afn_combine(
; GFX9-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
; GFX9-NEXT:    [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; GFX9-NEXT:    [[IDX:%.*]] = zext i32 [[TID]] to i64
; GFX9-NEXT:    [[PTR0:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[ARG]], i64 [[IDX]]
; GFX9-NEXT:    [[VAL0:%.*]] = load float, ptr addrspace(1) [[PTR0]], align 4
; GFX9-NEXT:    [[EXP0:%.*]] = call afn float @llvm.exp.f32(float [[VAL0]])
; GFX9-NEXT:    store float [[EXP0]], ptr addrspace(1) [[PTR0]], align 4
; GFX9-NEXT:    [[IDX1:%.*]] = add nuw nsw i64 [[IDX]], 1
; GFX9-NEXT:    [[PTR1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[ARG]], i64 [[IDX1]]
; GFX9-NEXT:    [[VAL1:%.*]] = load float, ptr addrspace(1) [[PTR1]], align 4
; GFX9-NEXT:    [[EXP1:%.*]] = call afn float @llvm.exp.f32(float [[VAL1]])
; GFX9-NEXT:    store float [[EXP1]], ptr addrspace(1) [[PTR1]], align 4
; GFX9-NEXT:    ret void
;
; GFX1250-LABEL: define amdgpu_kernel void @exp_afn_combine(
; GFX1250-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
; GFX1250-NEXT:    [[TID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; GFX1250-NEXT:    [[IDX:%.*]] = zext i32 [[TID]] to i64
; GFX1250-NEXT:    [[PTR0:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[ARG]], i64 [[IDX]]
; GFX1250-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[PTR0]], align 4
; GFX1250-NEXT:    [[TMP2:%.*]] = call afn <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP1]])
; GFX1250-NEXT:    store <2 x float> [[TMP2]], ptr addrspace(1) [[PTR0]], align 4
; GFX1250-NEXT:    ret void
;
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %idx = zext i32 %tid to i64
  ; Element 0: %arg[tid], updated in place.
  %ptr0 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 %idx
  %val0 = load float, ptr addrspace(1) %ptr0, align 4
  %exp0 = call afn float @llvm.exp.f32(float %val0)
  store float %exp0, ptr addrspace(1) %ptr0, align 4
  ; Element 1: %arg[tid + 1], the adjacent float, updated the same way.
  %idx1 = add nuw nsw i64 %idx, 1
  %ptr1 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 %idx1
  %val1 = load float, ptr addrspace(1) %ptr1, align 4
  %exp1 = call afn float @llvm.exp.f32(float %val1)
  store float %exp1, ptr addrspace(1) %ptr1, align 4
  ret void
}
|