[AArch64] Use SVE for fixed-length bf16 operations with +sve-b16b16 (#169329)
This avoids the bf16 -> f32 -> bf16 promotion round trip (or a costly scalarized expansion) for fixed-length bf16 vectors.
This commit is contained in:
@@ -1783,9 +1783,13 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
|
||||
setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom);
|
||||
setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
|
||||
setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
|
||||
}
|
||||
|
||||
if (Subtarget->hasSVEB16B16() &&
|
||||
Subtarget->isNonStreamingSVEorSME2Available()) {
|
||||
if (Subtarget->hasSVEB16B16() &&
|
||||
Subtarget->isNonStreamingSVEorSME2Available()) {
|
||||
// Note: Use SVE for bfloat16 operations when +sve-b16b16 is available.
|
||||
for (auto VT : {MVT::v4bf16, MVT::v8bf16, MVT::nxv2bf16, MVT::nxv4bf16,
|
||||
MVT::nxv8bf16}) {
|
||||
setOperationAction(ISD::FADD, VT, Custom);
|
||||
setOperationAction(ISD::FMA, VT, Custom);
|
||||
setOperationAction(ISD::FMAXIMUM, VT, Custom);
|
||||
|
||||
936
llvm/test/CodeGen/AArch64/fixed-length-bf16-arith.ll
Normal file
936
llvm/test/CodeGen/AArch64/fixed-length-bf16-arith.ll
Normal file
@@ -0,0 +1,936 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
|
||||
; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s --check-prefixes=CHECK,NOB16B16
|
||||
; RUN: llc -mattr=+sve,+bf16,+sve-b16b16 < %s | FileCheck %s --check-prefixes=CHECK,B16B16
|
||||
|
||||
target triple = "aarch64-unknown-linux-gnu"
|
||||
|
||||
;
|
||||
; FABS (llvm.fabs): sign bit cleared with NEON bic in both configurations
|
||||
;
|
||||
|
||||
define <4 x bfloat> @fabs_v4bf16(<4 x bfloat> %a) {
|
||||
; CHECK-LABEL: fabs_v4bf16:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: bic v0.4h, #128, lsl #8
|
||||
; CHECK-NEXT: ret
|
||||
%res = call <4 x bfloat> @llvm.fabs.v4bf16(<4 x bfloat> %a)
|
||||
ret <4 x bfloat> %res
|
||||
}
|
||||
|
||||
define <8 x bfloat> @fabs_v8bf16(<8 x bfloat> %a) {
|
||||
; CHECK-LABEL: fabs_v8bf16:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: bic v0.8h, #128, lsl #8
|
||||
; CHECK-NEXT: ret
|
||||
%res = call <8 x bfloat> @llvm.fabs.v8bf16(<8 x bfloat> %a)
|
||||
ret <8 x bfloat> %res
|
||||
}
|
||||
|
||||
;
|
||||
; FADD (fadd): promoted to f32 without +sve-b16b16; predicated SVE bfadd with it
|
||||
;
|
||||
|
||||
define <4 x bfloat> @fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
|
||||
; NOB16B16-LABEL: fadd_v4bf16:
|
||||
; NOB16B16: // %bb.0:
|
||||
; NOB16B16-NEXT: shll v1.4s, v1.4h, #16
|
||||
; NOB16B16-NEXT: shll v0.4s, v0.4h, #16
|
||||
; NOB16B16-NEXT: fadd v0.4s, v0.4s, v1.4s
|
||||
; NOB16B16-NEXT: bfcvtn v0.4h, v0.4s
|
||||
; NOB16B16-NEXT: ret
|
||||
;
|
||||
; B16B16-LABEL: fadd_v4bf16:
|
||||
; B16B16: // %bb.0:
|
||||
; B16B16-NEXT: ptrue p0.h, vl4
|
||||
; B16B16-NEXT: // kill: def $d0 killed $d0 def $z0
|
||||
; B16B16-NEXT: // kill: def $d1 killed $d1 def $z1
|
||||
; B16B16-NEXT: bfadd z0.h, p0/m, z0.h, z1.h
|
||||
; B16B16-NEXT: // kill: def $d0 killed $d0 killed $z0
|
||||
; B16B16-NEXT: ret
|
||||
%res = fadd <4 x bfloat> %a, %b
|
||||
ret <4 x bfloat> %res
|
||||
}
|
||||
|
||||
define <8 x bfloat> @fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
|
||||
; NOB16B16-LABEL: fadd_v8bf16:
|
||||
; NOB16B16: // %bb.0:
|
||||
; NOB16B16-NEXT: shll v2.4s, v1.4h, #16
|
||||
; NOB16B16-NEXT: shll v3.4s, v0.4h, #16
|
||||
; NOB16B16-NEXT: shll2 v1.4s, v1.8h, #16
|
||||
; NOB16B16-NEXT: shll2 v0.4s, v0.8h, #16
|
||||
; NOB16B16-NEXT: fadd v2.4s, v3.4s, v2.4s
|
||||
; NOB16B16-NEXT: fadd v1.4s, v0.4s, v1.4s
|
||||
; NOB16B16-NEXT: bfcvtn v0.4h, v2.4s
|
||||
; NOB16B16-NEXT: bfcvtn2 v0.8h, v1.4s
|
||||
; NOB16B16-NEXT: ret
|
||||
;
|
||||
; B16B16-LABEL: fadd_v8bf16:
|
||||
; B16B16: // %bb.0:
|
||||
; B16B16-NEXT: ptrue p0.h, vl8
|
||||
; B16B16-NEXT: // kill: def $q0 killed $q0 def $z0
|
||||
; B16B16-NEXT: // kill: def $q1 killed $q1 def $z1
|
||||
; B16B16-NEXT: bfadd z0.h, p0/m, z0.h, z1.h
|
||||
; B16B16-NEXT: // kill: def $q0 killed $q0 killed $z0
|
||||
; B16B16-NEXT: ret
|
||||
%res = fadd <8 x bfloat> %a, %b
|
||||
ret <8 x bfloat> %res
|
||||
}
|
||||
|
||||
;
|
||||
; FDIV (fdiv): promoted to f32 NEON fdiv in both configurations
|
||||
;
|
||||
|
||||
define <4 x bfloat> @fdiv_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
|
||||
; CHECK-LABEL: fdiv_v4bf16:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: shll v1.4s, v1.4h, #16
|
||||
; CHECK-NEXT: shll v0.4s, v0.4h, #16
|
||||
; CHECK-NEXT: fdiv v0.4s, v0.4s, v1.4s
|
||||
; CHECK-NEXT: bfcvtn v0.4h, v0.4s
|
||||
; CHECK-NEXT: ret
|
||||
%res = fdiv <4 x bfloat> %a, %b
|
||||
ret <4 x bfloat> %res
|
||||
}
|
||||
|
||||
define <8 x bfloat> @fdiv_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
|
||||
; CHECK-LABEL: fdiv_v8bf16:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: shll v2.4s, v1.4h, #16
|
||||
; CHECK-NEXT: shll v3.4s, v0.4h, #16
|
||||
; CHECK-NEXT: shll2 v1.4s, v1.8h, #16
|
||||
; CHECK-NEXT: shll2 v0.4s, v0.8h, #16
|
||||
; CHECK-NEXT: fdiv v2.4s, v3.4s, v2.4s
|
||||
; CHECK-NEXT: fdiv v1.4s, v0.4s, v1.4s
|
||||
; CHECK-NEXT: bfcvtn v0.4h, v2.4s
|
||||
; CHECK-NEXT: bfcvtn2 v0.8h, v1.4s
|
||||
; CHECK-NEXT: ret
|
||||
%res = fdiv <8 x bfloat> %a, %b
|
||||
ret <8 x bfloat> %res
|
||||
}
|
||||
|
||||
;
|
||||
; FMAX (llvm.maximum): scalarized via f32 without +sve-b16b16; SVE bfmax with it
|
||||
;
|
||||
|
||||
define <4 x bfloat> @fmax_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
|
||||
; NOB16B16-LABEL: fmax_v4bf16:
|
||||
; NOB16B16: // %bb.0:
|
||||
; NOB16B16-NEXT: // kill: def $d1 killed $d1 def $q1
|
||||
; NOB16B16-NEXT: // kill: def $d0 killed $d0 def $q0
|
||||
; NOB16B16-NEXT: mov h2, v1.h[1]
|
||||
; NOB16B16-NEXT: mov h3, v0.h[1]
|
||||
; NOB16B16-NEXT: mov h4, v1.h[2]
|
||||
; NOB16B16-NEXT: shll v5.4s, v1.4h, #16
|
||||
; NOB16B16-NEXT: shll v6.4s, v0.4h, #16
|
||||
; NOB16B16-NEXT: mov h7, v0.h[2]
|
||||
; NOB16B16-NEXT: mov h1, v1.h[3]
|
||||
; NOB16B16-NEXT: shll v2.4s, v2.4h, #16
|
||||
; NOB16B16-NEXT: shll v3.4s, v3.4h, #16
|
||||
; NOB16B16-NEXT: shll v4.4s, v4.4h, #16
|
||||
; NOB16B16-NEXT: shll v1.4s, v1.4h, #16
|
||||
; NOB16B16-NEXT: fmax s2, s3, s2
|
||||
; NOB16B16-NEXT: fmax s3, s6, s5
|
||||
; NOB16B16-NEXT: shll v5.4s, v7.4h, #16
|
||||
; NOB16B16-NEXT: mov h6, v0.h[3]
|
||||
; NOB16B16-NEXT: fmax s4, s5, s4
|
||||
; NOB16B16-NEXT: bfcvt h2, s2
|
||||
; NOB16B16-NEXT: bfcvt h0, s3
|
||||
; NOB16B16-NEXT: shll v3.4s, v6.4h, #16
|
||||
; NOB16B16-NEXT: mov v0.h[1], v2.h[0]
|
||||
; NOB16B16-NEXT: bfcvt h2, s4
|
||||
; NOB16B16-NEXT: fmax s1, s3, s1
|
||||
; NOB16B16-NEXT: mov v0.h[2], v2.h[0]
|
||||
; NOB16B16-NEXT: bfcvt h1, s1
|
||||
; NOB16B16-NEXT: mov v0.h[3], v1.h[0]
|
||||
; NOB16B16-NEXT: // kill: def $d0 killed $d0 killed $q0
|
||||
; NOB16B16-NEXT: ret
|
||||
;
|
||||
; B16B16-LABEL: fmax_v4bf16:
|
||||
; B16B16: // %bb.0:
|
||||
; B16B16-NEXT: ptrue p0.h, vl4
|
||||
; B16B16-NEXT: // kill: def $d0 killed $d0 def $z0
|
||||
; B16B16-NEXT: // kill: def $d1 killed $d1 def $z1
|
||||
; B16B16-NEXT: bfmax z0.h, p0/m, z0.h, z1.h
|
||||
; B16B16-NEXT: // kill: def $d0 killed $d0 killed $z0
|
||||
; B16B16-NEXT: ret
|
||||
%res = call <4 x bfloat> @llvm.maximum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
|
||||
ret <4 x bfloat> %res
|
||||
}
|
||||
|
||||
define <8 x bfloat> @fmax_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
|
||||
; NOB16B16-LABEL: fmax_v8bf16:
|
||||
; NOB16B16: // %bb.0:
|
||||
; NOB16B16-NEXT: mov h2, v1.h[1]
|
||||
; NOB16B16-NEXT: mov h3, v0.h[1]
|
||||
; NOB16B16-NEXT: shll v4.4s, v1.4h, #16
|
||||
; NOB16B16-NEXT: shll v5.4s, v0.4h, #16
|
||||
; NOB16B16-NEXT: mov h6, v1.h[2]
|
||||
; NOB16B16-NEXT: mov h7, v0.h[2]
|
||||
; NOB16B16-NEXT: mov h16, v1.h[3]
|
||||
; NOB16B16-NEXT: shll v2.4s, v2.4h, #16
|
||||
; NOB16B16-NEXT: shll v3.4s, v3.4h, #16
|
||||
; NOB16B16-NEXT: fmax s4, s5, s4
|
||||
; NOB16B16-NEXT: mov h5, v0.h[3]
|
||||
; NOB16B16-NEXT: shll v6.4s, v6.4h, #16
|
||||
; NOB16B16-NEXT: shll v7.4s, v7.4h, #16
|
||||
; NOB16B16-NEXT: fmax s3, s3, s2
|
||||
; NOB16B16-NEXT: bfcvt h2, s4
|
||||
; NOB16B16-NEXT: fmax s4, s7, s6
|
||||
; NOB16B16-NEXT: shll v6.4s, v16.4h, #16
|
||||
; NOB16B16-NEXT: shll v5.4s, v5.4h, #16
|
||||
; NOB16B16-NEXT: mov h7, v1.h[4]
|
||||
; NOB16B16-NEXT: mov h16, v0.h[4]
|
||||
; NOB16B16-NEXT: bfcvt h3, s3
|
||||
; NOB16B16-NEXT: fmax s5, s5, s6
|
||||
; NOB16B16-NEXT: bfcvt h4, s4
|
||||
; NOB16B16-NEXT: mov h6, v0.h[5]
|
||||
; NOB16B16-NEXT: shll v7.4s, v7.4h, #16
|
||||
; NOB16B16-NEXT: shll v16.4s, v16.4h, #16
|
||||
; NOB16B16-NEXT: mov v2.h[1], v3.h[0]
|
||||
; NOB16B16-NEXT: mov h3, v1.h[5]
|
||||
; NOB16B16-NEXT: bfcvt h5, s5
|
||||
; NOB16B16-NEXT: fmax s7, s16, s7
|
||||
; NOB16B16-NEXT: mov h16, v0.h[6]
|
||||
; NOB16B16-NEXT: shll v6.4s, v6.4h, #16
|
||||
; NOB16B16-NEXT: mov h0, v0.h[7]
|
||||
; NOB16B16-NEXT: mov v2.h[2], v4.h[0]
|
||||
; NOB16B16-NEXT: mov h4, v1.h[6]
|
||||
; NOB16B16-NEXT: shll v3.4s, v3.4h, #16
|
||||
; NOB16B16-NEXT: mov h1, v1.h[7]
|
||||
; NOB16B16-NEXT: shll v0.4s, v0.4h, #16
|
||||
; NOB16B16-NEXT: fmax s3, s6, s3
|
||||
; NOB16B16-NEXT: shll v6.4s, v16.4h, #16
|
||||
; NOB16B16-NEXT: mov v2.h[3], v5.h[0]
|
||||
; NOB16B16-NEXT: bfcvt h5, s7
|
||||
; NOB16B16-NEXT: shll v4.4s, v4.4h, #16
|
||||
; NOB16B16-NEXT: shll v1.4s, v1.4h, #16
|
||||
; NOB16B16-NEXT: fmax s4, s6, s4
|
||||
; NOB16B16-NEXT: bfcvt h3, s3
|
||||
; NOB16B16-NEXT: mov v2.h[4], v5.h[0]
|
||||
; NOB16B16-NEXT: fmax s0, s0, s1
|
||||
; NOB16B16-NEXT: mov v2.h[5], v3.h[0]
|
||||
; NOB16B16-NEXT: bfcvt h3, s4
|
||||
; NOB16B16-NEXT: bfcvt h0, s0
|
||||
; NOB16B16-NEXT: mov v2.h[6], v3.h[0]
|
||||
; NOB16B16-NEXT: mov v2.h[7], v0.h[0]
|
||||
; NOB16B16-NEXT: mov v0.16b, v2.16b
|
||||
; NOB16B16-NEXT: ret
|
||||
;
|
||||
; B16B16-LABEL: fmax_v8bf16:
|
||||
; B16B16: // %bb.0:
|
||||
; B16B16-NEXT: ptrue p0.h, vl8
|
||||
; B16B16-NEXT: // kill: def $q0 killed $q0 def $z0
|
||||
; B16B16-NEXT: // kill: def $q1 killed $q1 def $z1
|
||||
; B16B16-NEXT: bfmax z0.h, p0/m, z0.h, z1.h
|
||||
; B16B16-NEXT: // kill: def $q0 killed $q0 killed $z0
|
||||
; B16B16-NEXT: ret
|
||||
%res = call <8 x bfloat> @llvm.maximum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
|
||||
ret <8 x bfloat> %res
|
||||
}
|
||||
|
||||
;
|
||||
; FMAXNM (llvm.maxnum): scalarized via f32 without +sve-b16b16; SVE bfmaxnm with it
|
||||
;
|
||||
|
||||
define <4 x bfloat> @fmaxnm_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
|
||||
; NOB16B16-LABEL: fmaxnm_v4bf16:
|
||||
; NOB16B16: // %bb.0:
|
||||
; NOB16B16-NEXT: // kill: def $d1 killed $d1 def $q1
|
||||
; NOB16B16-NEXT: // kill: def $d0 killed $d0 def $q0
|
||||
; NOB16B16-NEXT: mov h2, v1.h[1]
|
||||
; NOB16B16-NEXT: mov h3, v0.h[1]
|
||||
; NOB16B16-NEXT: mov h4, v1.h[2]
|
||||
; NOB16B16-NEXT: shll v5.4s, v1.4h, #16
|
||||
; NOB16B16-NEXT: shll v6.4s, v0.4h, #16
|
||||
; NOB16B16-NEXT: mov h7, v0.h[2]
|
||||
; NOB16B16-NEXT: mov h1, v1.h[3]
|
||||
; NOB16B16-NEXT: shll v2.4s, v2.4h, #16
|
||||
; NOB16B16-NEXT: shll v3.4s, v3.4h, #16
|
||||
; NOB16B16-NEXT: shll v4.4s, v4.4h, #16
|
||||
; NOB16B16-NEXT: shll v1.4s, v1.4h, #16
|
||||
; NOB16B16-NEXT: fmaxnm s2, s3, s2
|
||||
; NOB16B16-NEXT: fmaxnm s3, s6, s5
|
||||
; NOB16B16-NEXT: shll v5.4s, v7.4h, #16
|
||||
; NOB16B16-NEXT: mov h6, v0.h[3]
|
||||
; NOB16B16-NEXT: fmaxnm s4, s5, s4
|
||||
; NOB16B16-NEXT: bfcvt h2, s2
|
||||
; NOB16B16-NEXT: bfcvt h0, s3
|
||||
; NOB16B16-NEXT: shll v3.4s, v6.4h, #16
|
||||
; NOB16B16-NEXT: mov v0.h[1], v2.h[0]
|
||||
; NOB16B16-NEXT: bfcvt h2, s4
|
||||
; NOB16B16-NEXT: fmaxnm s1, s3, s1
|
||||
; NOB16B16-NEXT: mov v0.h[2], v2.h[0]
|
||||
; NOB16B16-NEXT: bfcvt h1, s1
|
||||
; NOB16B16-NEXT: mov v0.h[3], v1.h[0]
|
||||
; NOB16B16-NEXT: // kill: def $d0 killed $d0 killed $q0
|
||||
; NOB16B16-NEXT: ret
|
||||
;
|
||||
; B16B16-LABEL: fmaxnm_v4bf16:
|
||||
; B16B16: // %bb.0:
|
||||
; B16B16-NEXT: ptrue p0.h, vl4
|
||||
; B16B16-NEXT: // kill: def $d0 killed $d0 def $z0
|
||||
; B16B16-NEXT: // kill: def $d1 killed $d1 def $z1
|
||||
; B16B16-NEXT: bfmaxnm z0.h, p0/m, z0.h, z1.h
|
||||
; B16B16-NEXT: // kill: def $d0 killed $d0 killed $z0
|
||||
; B16B16-NEXT: ret
|
||||
%res = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
|
||||
ret <4 x bfloat> %res
|
||||
}
|
||||
|
||||
define <8 x bfloat> @fmaxnm_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
|
||||
; NOB16B16-LABEL: fmaxnm_v8bf16:
|
||||
; NOB16B16: // %bb.0:
|
||||
; NOB16B16-NEXT: mov h2, v1.h[1]
|
||||
; NOB16B16-NEXT: mov h3, v0.h[1]
|
||||
; NOB16B16-NEXT: shll v4.4s, v1.4h, #16
|
||||
; NOB16B16-NEXT: shll v5.4s, v0.4h, #16
|
||||
; NOB16B16-NEXT: mov h6, v1.h[2]
|
||||
; NOB16B16-NEXT: mov h7, v0.h[2]
|
||||
; NOB16B16-NEXT: mov h16, v1.h[3]
|
||||
; NOB16B16-NEXT: shll v2.4s, v2.4h, #16
|
||||
; NOB16B16-NEXT: shll v3.4s, v3.4h, #16
|
||||
; NOB16B16-NEXT: fmaxnm s4, s5, s4
|
||||
; NOB16B16-NEXT: mov h5, v0.h[3]
|
||||
; NOB16B16-NEXT: shll v6.4s, v6.4h, #16
|
||||
; NOB16B16-NEXT: shll v7.4s, v7.4h, #16
|
||||
; NOB16B16-NEXT: fmaxnm s3, s3, s2
|
||||
; NOB16B16-NEXT: bfcvt h2, s4
|
||||
; NOB16B16-NEXT: fmaxnm s4, s7, s6
|
||||
; NOB16B16-NEXT: shll v6.4s, v16.4h, #16
|
||||
; NOB16B16-NEXT: shll v5.4s, v5.4h, #16
|
||||
; NOB16B16-NEXT: mov h7, v1.h[4]
|
||||
; NOB16B16-NEXT: mov h16, v0.h[4]
|
||||
; NOB16B16-NEXT: bfcvt h3, s3
|
||||
; NOB16B16-NEXT: fmaxnm s5, s5, s6
|
||||
; NOB16B16-NEXT: bfcvt h4, s4
|
||||
; NOB16B16-NEXT: mov h6, v0.h[5]
|
||||
; NOB16B16-NEXT: shll v7.4s, v7.4h, #16
|
||||
; NOB16B16-NEXT: shll v16.4s, v16.4h, #16
|
||||
; NOB16B16-NEXT: mov v2.h[1], v3.h[0]
|
||||
; NOB16B16-NEXT: mov h3, v1.h[5]
|
||||
; NOB16B16-NEXT: bfcvt h5, s5
|
||||
; NOB16B16-NEXT: fmaxnm s7, s16, s7
|
||||
; NOB16B16-NEXT: mov h16, v0.h[6]
|
||||
; NOB16B16-NEXT: shll v6.4s, v6.4h, #16
|
||||
; NOB16B16-NEXT: mov h0, v0.h[7]
|
||||
; NOB16B16-NEXT: mov v2.h[2], v4.h[0]
|
||||
; NOB16B16-NEXT: mov h4, v1.h[6]
|
||||
; NOB16B16-NEXT: shll v3.4s, v3.4h, #16
|
||||
; NOB16B16-NEXT: mov h1, v1.h[7]
|
||||
; NOB16B16-NEXT: shll v0.4s, v0.4h, #16
|
||||
; NOB16B16-NEXT: fmaxnm s3, s6, s3
|
||||
; NOB16B16-NEXT: shll v6.4s, v16.4h, #16
|
||||
; NOB16B16-NEXT: mov v2.h[3], v5.h[0]
|
||||
; NOB16B16-NEXT: bfcvt h5, s7
|
||||
; NOB16B16-NEXT: shll v4.4s, v4.4h, #16
|
||||
; NOB16B16-NEXT: shll v1.4s, v1.4h, #16
|
||||
; NOB16B16-NEXT: fmaxnm s4, s6, s4
|
||||
; NOB16B16-NEXT: bfcvt h3, s3
|
||||
; NOB16B16-NEXT: mov v2.h[4], v5.h[0]
|
||||
; NOB16B16-NEXT: fmaxnm s0, s0, s1
|
||||
; NOB16B16-NEXT: mov v2.h[5], v3.h[0]
|
||||
; NOB16B16-NEXT: bfcvt h3, s4
|
||||
; NOB16B16-NEXT: bfcvt h0, s0
|
||||
; NOB16B16-NEXT: mov v2.h[6], v3.h[0]
|
||||
; NOB16B16-NEXT: mov v2.h[7], v0.h[0]
|
||||
; NOB16B16-NEXT: mov v0.16b, v2.16b
|
||||
; NOB16B16-NEXT: ret
|
||||
;
|
||||
; B16B16-LABEL: fmaxnm_v8bf16:
|
||||
; B16B16: // %bb.0:
|
||||
; B16B16-NEXT: ptrue p0.h, vl8
|
||||
; B16B16-NEXT: // kill: def $q0 killed $q0 def $z0
|
||||
; B16B16-NEXT: // kill: def $q1 killed $q1 def $z1
|
||||
; B16B16-NEXT: bfmaxnm z0.h, p0/m, z0.h, z1.h
|
||||
; B16B16-NEXT: // kill: def $q0 killed $q0 killed $z0
|
||||
; B16B16-NEXT: ret
|
||||
%res = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
|
||||
ret <8 x bfloat> %res
|
||||
}
|
||||
|
||||
;
|
||||
; FMIN (llvm.minimum): scalarized via f32 without +sve-b16b16; SVE bfmin with it
|
||||
;
|
||||
|
||||
define <4 x bfloat> @fmin_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
|
||||
; NOB16B16-LABEL: fmin_v4bf16:
|
||||
; NOB16B16: // %bb.0:
|
||||
; NOB16B16-NEXT: // kill: def $d1 killed $d1 def $q1
|
||||
; NOB16B16-NEXT: // kill: def $d0 killed $d0 def $q0
|
||||
; NOB16B16-NEXT: mov h2, v1.h[1]
|
||||
; NOB16B16-NEXT: mov h3, v0.h[1]
|
||||
; NOB16B16-NEXT: mov h4, v1.h[2]
|
||||
; NOB16B16-NEXT: shll v5.4s, v1.4h, #16
|
||||
; NOB16B16-NEXT: shll v6.4s, v0.4h, #16
|
||||
; NOB16B16-NEXT: mov h7, v0.h[2]
|
||||
; NOB16B16-NEXT: mov h1, v1.h[3]
|
||||
; NOB16B16-NEXT: shll v2.4s, v2.4h, #16
|
||||
; NOB16B16-NEXT: shll v3.4s, v3.4h, #16
|
||||
; NOB16B16-NEXT: shll v4.4s, v4.4h, #16
|
||||
; NOB16B16-NEXT: shll v1.4s, v1.4h, #16
|
||||
; NOB16B16-NEXT: fmin s2, s3, s2
|
||||
; NOB16B16-NEXT: fmin s3, s6, s5
|
||||
; NOB16B16-NEXT: shll v5.4s, v7.4h, #16
|
||||
; NOB16B16-NEXT: mov h6, v0.h[3]
|
||||
; NOB16B16-NEXT: fmin s4, s5, s4
|
||||
; NOB16B16-NEXT: bfcvt h2, s2
|
||||
; NOB16B16-NEXT: bfcvt h0, s3
|
||||
; NOB16B16-NEXT: shll v3.4s, v6.4h, #16
|
||||
; NOB16B16-NEXT: mov v0.h[1], v2.h[0]
|
||||
; NOB16B16-NEXT: bfcvt h2, s4
|
||||
; NOB16B16-NEXT: fmin s1, s3, s1
|
||||
; NOB16B16-NEXT: mov v0.h[2], v2.h[0]
|
||||
; NOB16B16-NEXT: bfcvt h1, s1
|
||||
; NOB16B16-NEXT: mov v0.h[3], v1.h[0]
|
||||
; NOB16B16-NEXT: // kill: def $d0 killed $d0 killed $q0
|
||||
; NOB16B16-NEXT: ret
|
||||
;
|
||||
; B16B16-LABEL: fmin_v4bf16:
|
||||
; B16B16: // %bb.0:
|
||||
; B16B16-NEXT: ptrue p0.h, vl4
|
||||
; B16B16-NEXT: // kill: def $d0 killed $d0 def $z0
|
||||
; B16B16-NEXT: // kill: def $d1 killed $d1 def $z1
|
||||
; B16B16-NEXT: bfmin z0.h, p0/m, z0.h, z1.h
|
||||
; B16B16-NEXT: // kill: def $d0 killed $d0 killed $z0
|
||||
; B16B16-NEXT: ret
|
||||
%res = call <4 x bfloat> @llvm.minimum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
|
||||
ret <4 x bfloat> %res
|
||||
}
|
||||
|
||||
define <8 x bfloat> @fmin_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
|
||||
; NOB16B16-LABEL: fmin_v8bf16:
|
||||
; NOB16B16: // %bb.0:
|
||||
; NOB16B16-NEXT: mov h2, v1.h[1]
|
||||
; NOB16B16-NEXT: mov h3, v0.h[1]
|
||||
; NOB16B16-NEXT: shll v4.4s, v1.4h, #16
|
||||
; NOB16B16-NEXT: shll v5.4s, v0.4h, #16
|
||||
; NOB16B16-NEXT: mov h6, v1.h[2]
|
||||
; NOB16B16-NEXT: mov h7, v0.h[2]
|
||||
; NOB16B16-NEXT: mov h16, v1.h[3]
|
||||
; NOB16B16-NEXT: shll v2.4s, v2.4h, #16
|
||||
; NOB16B16-NEXT: shll v3.4s, v3.4h, #16
|
||||
; NOB16B16-NEXT: fmin s4, s5, s4
|
||||
; NOB16B16-NEXT: mov h5, v0.h[3]
|
||||
; NOB16B16-NEXT: shll v6.4s, v6.4h, #16
|
||||
; NOB16B16-NEXT: shll v7.4s, v7.4h, #16
|
||||
; NOB16B16-NEXT: fmin s3, s3, s2
|
||||
; NOB16B16-NEXT: bfcvt h2, s4
|
||||
; NOB16B16-NEXT: fmin s4, s7, s6
|
||||
; NOB16B16-NEXT: shll v6.4s, v16.4h, #16
|
||||
; NOB16B16-NEXT: shll v5.4s, v5.4h, #16
|
||||
; NOB16B16-NEXT: mov h7, v1.h[4]
|
||||
; NOB16B16-NEXT: mov h16, v0.h[4]
|
||||
; NOB16B16-NEXT: bfcvt h3, s3
|
||||
; NOB16B16-NEXT: fmin s5, s5, s6
|
||||
; NOB16B16-NEXT: bfcvt h4, s4
|
||||
; NOB16B16-NEXT: mov h6, v0.h[5]
|
||||
; NOB16B16-NEXT: shll v7.4s, v7.4h, #16
|
||||
; NOB16B16-NEXT: shll v16.4s, v16.4h, #16
|
||||
; NOB16B16-NEXT: mov v2.h[1], v3.h[0]
|
||||
; NOB16B16-NEXT: mov h3, v1.h[5]
|
||||
; NOB16B16-NEXT: bfcvt h5, s5
|
||||
; NOB16B16-NEXT: fmin s7, s16, s7
|
||||
; NOB16B16-NEXT: mov h16, v0.h[6]
|
||||
; NOB16B16-NEXT: shll v6.4s, v6.4h, #16
|
||||
; NOB16B16-NEXT: mov h0, v0.h[7]
|
||||
; NOB16B16-NEXT: mov v2.h[2], v4.h[0]
|
||||
; NOB16B16-NEXT: mov h4, v1.h[6]
|
||||
; NOB16B16-NEXT: shll v3.4s, v3.4h, #16
|
||||
; NOB16B16-NEXT: mov h1, v1.h[7]
|
||||
; NOB16B16-NEXT: shll v0.4s, v0.4h, #16
|
||||
; NOB16B16-NEXT: fmin s3, s6, s3
|
||||
; NOB16B16-NEXT: shll v6.4s, v16.4h, #16
|
||||
; NOB16B16-NEXT: mov v2.h[3], v5.h[0]
|
||||
; NOB16B16-NEXT: bfcvt h5, s7
|
||||
; NOB16B16-NEXT: shll v4.4s, v4.4h, #16
|
||||
; NOB16B16-NEXT: shll v1.4s, v1.4h, #16
|
||||
; NOB16B16-NEXT: fmin s4, s6, s4
|
||||
; NOB16B16-NEXT: bfcvt h3, s3
|
||||
; NOB16B16-NEXT: mov v2.h[4], v5.h[0]
|
||||
; NOB16B16-NEXT: fmin s0, s0, s1
|
||||
; NOB16B16-NEXT: mov v2.h[5], v3.h[0]
|
||||
; NOB16B16-NEXT: bfcvt h3, s4
|
||||
; NOB16B16-NEXT: bfcvt h0, s0
|
||||
; NOB16B16-NEXT: mov v2.h[6], v3.h[0]
|
||||
; NOB16B16-NEXT: mov v2.h[7], v0.h[0]
|
||||
; NOB16B16-NEXT: mov v0.16b, v2.16b
|
||||
; NOB16B16-NEXT: ret
|
||||
;
|
||||
; B16B16-LABEL: fmin_v8bf16:
|
||||
; B16B16: // %bb.0:
|
||||
; B16B16-NEXT: ptrue p0.h, vl8
|
||||
; B16B16-NEXT: // kill: def $q0 killed $q0 def $z0
|
||||
; B16B16-NEXT: // kill: def $q1 killed $q1 def $z1
|
||||
; B16B16-NEXT: bfmin z0.h, p0/m, z0.h, z1.h
|
||||
; B16B16-NEXT: // kill: def $q0 killed $q0 killed $z0
|
||||
; B16B16-NEXT: ret
|
||||
%res = call <8 x bfloat> @llvm.minimum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
|
||||
ret <8 x bfloat> %res
|
||||
}
|
||||
|
||||
;
|
||||
; FMINNM (llvm.minnum): scalarized via f32 without +sve-b16b16; SVE bfminnm with it
|
||||
;
|
||||
|
||||
define <4 x bfloat> @fminnm_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
|
||||
; NOB16B16-LABEL: fminnm_v4bf16:
|
||||
; NOB16B16: // %bb.0:
|
||||
; NOB16B16-NEXT: // kill: def $d1 killed $d1 def $q1
|
||||
; NOB16B16-NEXT: // kill: def $d0 killed $d0 def $q0
|
||||
; NOB16B16-NEXT: mov h2, v1.h[1]
|
||||
; NOB16B16-NEXT: mov h3, v0.h[1]
|
||||
; NOB16B16-NEXT: mov h4, v1.h[2]
|
||||
; NOB16B16-NEXT: shll v5.4s, v1.4h, #16
|
||||
; NOB16B16-NEXT: shll v6.4s, v0.4h, #16
|
||||
; NOB16B16-NEXT: mov h7, v0.h[2]
|
||||
; NOB16B16-NEXT: mov h1, v1.h[3]
|
||||
; NOB16B16-NEXT: shll v2.4s, v2.4h, #16
|
||||
; NOB16B16-NEXT: shll v3.4s, v3.4h, #16
|
||||
; NOB16B16-NEXT: shll v4.4s, v4.4h, #16
|
||||
; NOB16B16-NEXT: shll v1.4s, v1.4h, #16
|
||||
; NOB16B16-NEXT: fminnm s2, s3, s2
|
||||
; NOB16B16-NEXT: fminnm s3, s6, s5
|
||||
; NOB16B16-NEXT: shll v5.4s, v7.4h, #16
|
||||
; NOB16B16-NEXT: mov h6, v0.h[3]
|
||||
; NOB16B16-NEXT: fminnm s4, s5, s4
|
||||
; NOB16B16-NEXT: bfcvt h2, s2
|
||||
; NOB16B16-NEXT: bfcvt h0, s3
|
||||
; NOB16B16-NEXT: shll v3.4s, v6.4h, #16
|
||||
; NOB16B16-NEXT: mov v0.h[1], v2.h[0]
|
||||
; NOB16B16-NEXT: bfcvt h2, s4
|
||||
; NOB16B16-NEXT: fminnm s1, s3, s1
|
||||
; NOB16B16-NEXT: mov v0.h[2], v2.h[0]
|
||||
; NOB16B16-NEXT: bfcvt h1, s1
|
||||
; NOB16B16-NEXT: mov v0.h[3], v1.h[0]
|
||||
; NOB16B16-NEXT: // kill: def $d0 killed $d0 killed $q0
|
||||
; NOB16B16-NEXT: ret
|
||||
;
|
||||
; B16B16-LABEL: fminnm_v4bf16:
|
||||
; B16B16: // %bb.0:
|
||||
; B16B16-NEXT: ptrue p0.h, vl4
|
||||
; B16B16-NEXT: // kill: def $d0 killed $d0 def $z0
|
||||
; B16B16-NEXT: // kill: def $d1 killed $d1 def $z1
|
||||
; B16B16-NEXT: bfminnm z0.h, p0/m, z0.h, z1.h
|
||||
; B16B16-NEXT: // kill: def $d0 killed $d0 killed $z0
|
||||
; B16B16-NEXT: ret
|
||||
%res = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
|
||||
ret <4 x bfloat> %res
|
||||
}
|
||||
|
||||
define <8 x bfloat> @fminnm_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
|
||||
; NOB16B16-LABEL: fminnm_v8bf16:
|
||||
; NOB16B16: // %bb.0:
|
||||
; NOB16B16-NEXT: mov h2, v1.h[1]
|
||||
; NOB16B16-NEXT: mov h3, v0.h[1]
|
||||
; NOB16B16-NEXT: shll v4.4s, v1.4h, #16
|
||||
; NOB16B16-NEXT: shll v5.4s, v0.4h, #16
|
||||
; NOB16B16-NEXT: mov h6, v1.h[2]
|
||||
; NOB16B16-NEXT: mov h7, v0.h[2]
|
||||
; NOB16B16-NEXT: mov h16, v1.h[3]
|
||||
; NOB16B16-NEXT: shll v2.4s, v2.4h, #16
|
||||
; NOB16B16-NEXT: shll v3.4s, v3.4h, #16
|
||||
; NOB16B16-NEXT: fminnm s4, s5, s4
|
||||
; NOB16B16-NEXT: mov h5, v0.h[3]
|
||||
; NOB16B16-NEXT: shll v6.4s, v6.4h, #16
|
||||
; NOB16B16-NEXT: shll v7.4s, v7.4h, #16
|
||||
; NOB16B16-NEXT: fminnm s3, s3, s2
|
||||
; NOB16B16-NEXT: bfcvt h2, s4
|
||||
; NOB16B16-NEXT: fminnm s4, s7, s6
|
||||
; NOB16B16-NEXT: shll v6.4s, v16.4h, #16
|
||||
; NOB16B16-NEXT: shll v5.4s, v5.4h, #16
|
||||
; NOB16B16-NEXT: mov h7, v1.h[4]
|
||||
; NOB16B16-NEXT: mov h16, v0.h[4]
|
||||
; NOB16B16-NEXT: bfcvt h3, s3
|
||||
; NOB16B16-NEXT: fminnm s5, s5, s6
|
||||
; NOB16B16-NEXT: bfcvt h4, s4
|
||||
; NOB16B16-NEXT: mov h6, v0.h[5]
|
||||
; NOB16B16-NEXT: shll v7.4s, v7.4h, #16
|
||||
; NOB16B16-NEXT: shll v16.4s, v16.4h, #16
|
||||
; NOB16B16-NEXT: mov v2.h[1], v3.h[0]
|
||||
; NOB16B16-NEXT: mov h3, v1.h[5]
|
||||
; NOB16B16-NEXT: bfcvt h5, s5
|
||||
; NOB16B16-NEXT: fminnm s7, s16, s7
|
||||
; NOB16B16-NEXT: mov h16, v0.h[6]
|
||||
; NOB16B16-NEXT: shll v6.4s, v6.4h, #16
|
||||
; NOB16B16-NEXT: mov h0, v0.h[7]
|
||||
; NOB16B16-NEXT: mov v2.h[2], v4.h[0]
|
||||
; NOB16B16-NEXT: mov h4, v1.h[6]
|
||||
; NOB16B16-NEXT: shll v3.4s, v3.4h, #16
|
||||
; NOB16B16-NEXT: mov h1, v1.h[7]
|
||||
; NOB16B16-NEXT: shll v0.4s, v0.4h, #16
|
||||
; NOB16B16-NEXT: fminnm s3, s6, s3
|
||||
; NOB16B16-NEXT: shll v6.4s, v16.4h, #16
|
||||
; NOB16B16-NEXT: mov v2.h[3], v5.h[0]
|
||||
; NOB16B16-NEXT: bfcvt h5, s7
|
||||
; NOB16B16-NEXT: shll v4.4s, v4.4h, #16
|
||||
; NOB16B16-NEXT: shll v1.4s, v1.4h, #16
|
||||
; NOB16B16-NEXT: fminnm s4, s6, s4
|
||||
; NOB16B16-NEXT: bfcvt h3, s3
|
||||
; NOB16B16-NEXT: mov v2.h[4], v5.h[0]
|
||||
; NOB16B16-NEXT: fminnm s0, s0, s1
|
||||
; NOB16B16-NEXT: mov v2.h[5], v3.h[0]
|
||||
; NOB16B16-NEXT: bfcvt h3, s4
|
||||
; NOB16B16-NEXT: bfcvt h0, s0
|
||||
; NOB16B16-NEXT: mov v2.h[6], v3.h[0]
|
||||
; NOB16B16-NEXT: mov v2.h[7], v0.h[0]
|
||||
; NOB16B16-NEXT: mov v0.16b, v2.16b
|
||||
; NOB16B16-NEXT: ret
|
||||
;
|
||||
; B16B16-LABEL: fminnm_v8bf16:
|
||||
; B16B16: // %bb.0:
|
||||
; B16B16-NEXT: ptrue p0.h, vl8
|
||||
; B16B16-NEXT: // kill: def $q0 killed $q0 def $z0
|
||||
; B16B16-NEXT: // kill: def $q1 killed $q1 def $z1
|
||||
; B16B16-NEXT: bfminnm z0.h, p0/m, z0.h, z1.h
|
||||
; B16B16-NEXT: // kill: def $q0 killed $q0 killed $z0
|
||||
; B16B16-NEXT: ret
|
||||
%res = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
|
||||
ret <8 x bfloat> %res
|
||||
}
|
||||
|
||||
;
|
||||
; FMLA (llvm.fma): scalarized f32 fmadd without +sve-b16b16; SVE bfmla with it
|
||||
;
|
||||
|
||||
define <4 x bfloat> @fmla_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) {
|
||||
; NOB16B16-LABEL: fmla_v4bf16:
|
||||
; NOB16B16: // %bb.0:
|
||||
; NOB16B16-NEXT: // kill: def $d2 killed $d2 def $q2
|
||||
; NOB16B16-NEXT: // kill: def $d1 killed $d1 def $q1
|
||||
; NOB16B16-NEXT: // kill: def $d0 killed $d0 def $q0
|
||||
; NOB16B16-NEXT: mov h3, v2.h[1]
|
||||
; NOB16B16-NEXT: mov h4, v1.h[1]
|
||||
; NOB16B16-NEXT: mov h5, v0.h[1]
|
||||
; NOB16B16-NEXT: shll v6.4s, v2.4h, #16
|
||||
; NOB16B16-NEXT: shll v7.4s, v1.4h, #16
|
||||
; NOB16B16-NEXT: shll v16.4s, v0.4h, #16
|
||||
; NOB16B16-NEXT: mov h17, v2.h[2]
|
||||
; NOB16B16-NEXT: mov h18, v1.h[2]
|
||||
; NOB16B16-NEXT: mov h19, v0.h[2]
|
||||
; NOB16B16-NEXT: mov h2, v2.h[3]
|
||||
; NOB16B16-NEXT: mov h1, v1.h[3]
|
||||
; NOB16B16-NEXT: fmadd s6, s16, s7, s6
|
||||
; NOB16B16-NEXT: shll v3.4s, v3.4h, #16
|
||||
; NOB16B16-NEXT: shll v4.4s, v4.4h, #16
|
||||
; NOB16B16-NEXT: shll v5.4s, v5.4h, #16
|
||||
; NOB16B16-NEXT: mov h16, v0.h[3]
|
||||
; NOB16B16-NEXT: shll v7.4s, v19.4h, #16
|
||||
; NOB16B16-NEXT: shll v2.4s, v2.4h, #16
|
||||
; NOB16B16-NEXT: shll v1.4s, v1.4h, #16
|
||||
; NOB16B16-NEXT: fmadd s3, s5, s4, s3
|
||||
; NOB16B16-NEXT: shll v4.4s, v17.4h, #16
|
||||
; NOB16B16-NEXT: shll v5.4s, v18.4h, #16
|
||||
; NOB16B16-NEXT: bfcvt h0, s6
|
||||
; NOB16B16-NEXT: fmadd s4, s7, s5, s4
|
||||
; NOB16B16-NEXT: shll v5.4s, v16.4h, #16
|
||||
; NOB16B16-NEXT: bfcvt h3, s3
|
||||
; NOB16B16-NEXT: fmadd s1, s5, s1, s2
|
||||
; NOB16B16-NEXT: mov v0.h[1], v3.h[0]
|
||||
; NOB16B16-NEXT: bfcvt h3, s4
|
||||
; NOB16B16-NEXT: bfcvt h1, s1
|
||||
; NOB16B16-NEXT: mov v0.h[2], v3.h[0]
|
||||
; NOB16B16-NEXT: mov v0.h[3], v1.h[0]
|
||||
; NOB16B16-NEXT: // kill: def $d0 killed $d0 killed $q0
|
||||
; NOB16B16-NEXT: ret
|
||||
;
|
||||
; B16B16-LABEL: fmla_v4bf16:
|
||||
; B16B16: // %bb.0:
|
||||
; B16B16-NEXT: ptrue p0.h, vl4
|
||||
; B16B16-NEXT: // kill: def $d0 killed $d0 def $z0
|
||||
; B16B16-NEXT: // kill: def $d2 killed $d2 def $z2
|
||||
; B16B16-NEXT: // kill: def $d1 killed $d1 def $z1
|
||||
; B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
|
||||
; B16B16-NEXT: // kill: def $d0 killed $d0 killed $z0
|
||||
; B16B16-NEXT: ret
|
||||
%res = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c)
|
||||
ret <4 x bfloat> %res
|
||||
}
|
||||
|
||||
define <8 x bfloat> @fmla_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) {
|
||||
; NOB16B16-LABEL: fmla_v8bf16:
|
||||
; NOB16B16: // %bb.0:
|
||||
; NOB16B16-NEXT: mov h3, v2.h[1]
|
||||
; NOB16B16-NEXT: mov h4, v1.h[1]
|
||||
; NOB16B16-NEXT: mov h5, v0.h[1]
|
||||
; NOB16B16-NEXT: shll v6.4s, v2.4h, #16
|
||||
; NOB16B16-NEXT: shll v7.4s, v1.4h, #16
|
||||
; NOB16B16-NEXT: shll v16.4s, v0.4h, #16
|
||||
; NOB16B16-NEXT: mov h17, v2.h[2]
|
||||
; NOB16B16-NEXT: mov h18, v1.h[2]
|
||||
; NOB16B16-NEXT: mov h19, v0.h[2]
|
||||
; NOB16B16-NEXT: mov h20, v2.h[3]
|
||||
; NOB16B16-NEXT: mov h21, v1.h[3]
|
||||
; NOB16B16-NEXT: fmadd s6, s16, s7, s6
|
||||
; NOB16B16-NEXT: shll v3.4s, v3.4h, #16
|
||||
; NOB16B16-NEXT: shll v4.4s, v4.4h, #16
|
||||
; NOB16B16-NEXT: shll v5.4s, v5.4h, #16
|
||||
; NOB16B16-NEXT: mov h7, v0.h[3]
|
||||
; NOB16B16-NEXT: shll v16.4s, v19.4h, #16
|
||||
; NOB16B16-NEXT: mov h19, v0.h[4]
|
||||
; NOB16B16-NEXT: fmadd s4, s5, s4, s3
|
||||
; NOB16B16-NEXT: shll v3.4s, v17.4h, #16
|
||||
; NOB16B16-NEXT: shll v5.4s, v18.4h, #16
|
||||
; NOB16B16-NEXT: mov h17, v2.h[4]
|
||||
; NOB16B16-NEXT: mov h18, v1.h[4]
|
||||
; NOB16B16-NEXT: shll v7.4s, v7.4h, #16
|
||||
; NOB16B16-NEXT: shll v19.4s, v19.4h, #16
|
||||
; NOB16B16-NEXT: fmadd s5, s16, s5, s3
|
||||
; NOB16B16-NEXT: bfcvt h3, s6
|
||||
; NOB16B16-NEXT: shll v6.4s, v20.4h, #16
|
||||
; NOB16B16-NEXT: bfcvt h4, s4
|
||||
; NOB16B16-NEXT: shll v16.4s, v21.4h, #16
|
||||
; NOB16B16-NEXT: shll v17.4s, v17.4h, #16
|
||||
; NOB16B16-NEXT: shll v18.4s, v18.4h, #16
|
||||
; NOB16B16-NEXT: fmadd s6, s7, s16, s6
|
||||
; NOB16B16-NEXT: bfcvt h5, s5
|
||||
; NOB16B16-NEXT: mov h7, v1.h[5]
|
||||
; NOB16B16-NEXT: mov v3.h[1], v4.h[0]
|
||||
; NOB16B16-NEXT: mov h4, v2.h[5]
|
||||
; NOB16B16-NEXT: mov h16, v0.h[5]
|
||||
; NOB16B16-NEXT: fmadd s17, s19, s18, s17
|
||||
; NOB16B16-NEXT: mov h18, v2.h[6]
|
||||
; NOB16B16-NEXT: mov h19, v1.h[6]
|
||||
; NOB16B16-NEXT: mov h2, v2.h[7]
|
||||
; NOB16B16-NEXT: mov h1, v1.h[7]
|
||||
; NOB16B16-NEXT: bfcvt h6, s6
|
||||
; NOB16B16-NEXT: shll v7.4s, v7.4h, #16
|
||||
; NOB16B16-NEXT: mov v3.h[2], v5.h[0]
|
||||
; NOB16B16-NEXT: mov h5, v0.h[6]
|
||||
; NOB16B16-NEXT: shll v4.4s, v4.4h, #16
|
||||
; NOB16B16-NEXT: shll v16.4s, v16.4h, #16
|
||||
; NOB16B16-NEXT: mov h0, v0.h[7]
|
||||
; NOB16B16-NEXT: shll v2.4s, v2.4h, #16
|
||||
; NOB16B16-NEXT: shll v1.4s, v1.4h, #16
|
||||
; NOB16B16-NEXT: fmadd s4, s16, s7, s4
|
||||
; NOB16B16-NEXT: mov v3.h[3], v6.h[0]
|
||||
; NOB16B16-NEXT: bfcvt h6, s17
|
||||
; NOB16B16-NEXT: shll v7.4s, v18.4h, #16
|
||||
; NOB16B16-NEXT: shll v16.4s, v19.4h, #16
|
||||
; NOB16B16-NEXT: shll v5.4s, v5.4h, #16
|
||||
; NOB16B16-NEXT: shll v0.4s, v0.4h, #16
|
||||
; NOB16B16-NEXT: fmadd s5, s5, s16, s7
|
||||
; NOB16B16-NEXT: mov v3.h[4], v6.h[0]
|
||||
; NOB16B16-NEXT: bfcvt h4, s4
|
||||
; NOB16B16-NEXT: fmadd s0, s0, s1, s2
|
||||
; NOB16B16-NEXT: mov v3.h[5], v4.h[0]
|
||||
; NOB16B16-NEXT: bfcvt h4, s5
|
||||
; NOB16B16-NEXT: bfcvt h0, s0
|
||||
; NOB16B16-NEXT: mov v3.h[6], v4.h[0]
|
||||
; NOB16B16-NEXT: mov v3.h[7], v0.h[0]
|
||||
; NOB16B16-NEXT: mov v0.16b, v3.16b
|
||||
; NOB16B16-NEXT: ret
|
||||
;
|
||||
; B16B16-LABEL: fmla_v8bf16:
|
||||
; B16B16: // %bb.0:
|
||||
; B16B16-NEXT: ptrue p0.h, vl8
|
||||
; B16B16-NEXT: // kill: def $q0 killed $q0 def $z0
|
||||
; B16B16-NEXT: // kill: def $q2 killed $q2 def $z2
|
||||
; B16B16-NEXT: // kill: def $q1 killed $q1 def $z1
|
||||
; B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
|
||||
; B16B16-NEXT: // kill: def $q0 killed $q0 killed $z0
|
||||
; B16B16-NEXT: ret
|
||||
%res = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c)
|
||||
ret <8 x bfloat> %res
|
||||
}
|
||||
|
||||
;
|
||||
; FMUL (fmul): promoted to f32 without +sve-b16b16; predicated SVE bfmul with it
|
||||
;
|
||||
|
||||
define <4 x bfloat> @fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
|
||||
; NOB16B16-LABEL: fmul_v4bf16:
|
||||
; NOB16B16: // %bb.0:
|
||||
; NOB16B16-NEXT: shll v1.4s, v1.4h, #16
|
||||
; NOB16B16-NEXT: shll v0.4s, v0.4h, #16
|
||||
; NOB16B16-NEXT: fmul v0.4s, v0.4s, v1.4s
|
||||
; NOB16B16-NEXT: bfcvtn v0.4h, v0.4s
|
||||
; NOB16B16-NEXT: ret
|
||||
;
|
||||
; B16B16-LABEL: fmul_v4bf16:
|
||||
; B16B16: // %bb.0:
|
||||
; B16B16-NEXT: ptrue p0.h, vl4
|
||||
; B16B16-NEXT: // kill: def $d0 killed $d0 def $z0
|
||||
; B16B16-NEXT: // kill: def $d1 killed $d1 def $z1
|
||||
; B16B16-NEXT: bfmul z0.h, p0/m, z0.h, z1.h
|
||||
; B16B16-NEXT: // kill: def $d0 killed $d0 killed $z0
|
||||
; B16B16-NEXT: ret
|
||||
%res = fmul <4 x bfloat> %a, %b
|
||||
ret <4 x bfloat> %res
|
||||
}
|
||||
|
||||
; Element-wise fmul of two <8 x bfloat> vectors.
; Expected lowering without +sve-b16b16: the low and high halves are widened
; separately (shll / shll2), multiplied as two f32 vectors, and narrowed back
; with bfcvtn / bfcvtn2.
; Expected lowering with +sve-b16b16: a single predicated SVE bfmul under a
; vl8 predicate.
define <8 x bfloat> @fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
|
||||
; NOB16B16-LABEL: fmul_v8bf16:
|
||||
; NOB16B16: // %bb.0:
|
||||
; NOB16B16-NEXT: shll v2.4s, v1.4h, #16
|
||||
; NOB16B16-NEXT: shll v3.4s, v0.4h, #16
|
||||
; NOB16B16-NEXT: shll2 v1.4s, v1.8h, #16
|
||||
; NOB16B16-NEXT: shll2 v0.4s, v0.8h, #16
|
||||
; NOB16B16-NEXT: fmul v2.4s, v3.4s, v2.4s
|
||||
; NOB16B16-NEXT: fmul v1.4s, v0.4s, v1.4s
|
||||
; NOB16B16-NEXT: bfcvtn v0.4h, v2.4s
|
||||
; NOB16B16-NEXT: bfcvtn2 v0.8h, v1.4s
|
||||
; NOB16B16-NEXT: ret
|
||||
;
|
||||
; B16B16-LABEL: fmul_v8bf16:
|
||||
; B16B16: // %bb.0:
|
||||
; B16B16-NEXT: ptrue p0.h, vl8
|
||||
; B16B16-NEXT: // kill: def $q0 killed $q0 def $z0
|
||||
; B16B16-NEXT: // kill: def $q1 killed $q1 def $z1
|
||||
; B16B16-NEXT: bfmul z0.h, p0/m, z0.h, z1.h
|
||||
; B16B16-NEXT: // kill: def $q0 killed $q0 killed $z0
|
||||
; B16B16-NEXT: ret
|
||||
%res = fmul <8 x bfloat> %a, %b
|
||||
ret <8 x bfloat> %res
|
||||
}
|
||||
|
||||
;
|
||||
; FNEG
|
||||
;
|
||||
|
||||
; fneg of a <4 x bfloat> vector.
; Both run configurations share the same expectations: no floating-point
; instruction is emitted; each 16-bit lane is XORed (eor) with the constant
; #128, lsl #8 (0x8000, i.e. bit 15 of the lane).
define <4 x bfloat> @fneg_v4bf16(<4 x bfloat> %a) {
|
||||
; CHECK-LABEL: fneg_v4bf16:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: movi v1.4h, #128, lsl #8
|
||||
; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
|
||||
; CHECK-NEXT: ret
|
||||
%res = fneg <4 x bfloat> %a
|
||||
ret <4 x bfloat> %res
|
||||
}
|
||||
|
||||
; fneg of an <8 x bfloat> vector.
; Both run configurations share the same expectations: each 16-bit lane of the
; full 128-bit register is XORed (eor) with the constant #128, lsl #8
; (0x8000, i.e. bit 15 of the lane).
define <8 x bfloat> @fneg_v8bf16(<8 x bfloat> %a) {
|
||||
; CHECK-LABEL: fneg_v8bf16:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: movi v1.8h, #128, lsl #8
|
||||
; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
|
||||
; CHECK-NEXT: ret
|
||||
%res = fneg <8 x bfloat> %a
|
||||
ret <8 x bfloat> %res
|
||||
}
|
||||
|
||||
;
|
||||
; FSQRT
|
||||
;
|
||||
|
||||
; llvm.sqrt on a <4 x bfloat> vector.
; Both run configurations share the same expectations, so +sve-b16b16 does not
; change the output here. The operation is scalarized: each lane is extracted,
; widened to f32 with shll, passed through a scalar fsqrt, converted back with
; bfcvt, and reinserted into the result vector.
define <4 x bfloat> @fsqrt_v4bf16(<4 x bfloat> %a) {
|
||||
; CHECK-LABEL: fsqrt_v4bf16:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
|
||||
; CHECK-NEXT: mov h1, v0.h[1]
|
||||
; CHECK-NEXT: shll v2.4s, v0.4h, #16
|
||||
; CHECK-NEXT: mov h3, v0.h[2]
|
||||
; CHECK-NEXT: mov h0, v0.h[3]
|
||||
; CHECK-NEXT: fsqrt s2, s2
|
||||
; CHECK-NEXT: shll v1.4s, v1.4h, #16
|
||||
; CHECK-NEXT: shll v3.4s, v3.4h, #16
|
||||
; CHECK-NEXT: shll v0.4s, v0.4h, #16
|
||||
; CHECK-NEXT: fsqrt s1, s1
|
||||
; CHECK-NEXT: bfcvt h1, s1
|
||||
; CHECK-NEXT: fsqrt s3, s3
|
||||
; CHECK-NEXT: fsqrt s4, s0
|
||||
; CHECK-NEXT: bfcvt h0, s2
|
||||
; CHECK-NEXT: mov v0.h[1], v1.h[0]
|
||||
; CHECK-NEXT: bfcvt h1, s3
|
||||
; CHECK-NEXT: mov v0.h[2], v1.h[0]
|
||||
; CHECK-NEXT: bfcvt h1, s4
|
||||
; CHECK-NEXT: mov v0.h[3], v1.h[0]
|
||||
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
|
||||
; CHECK-NEXT: ret
|
||||
%res = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> %a)
|
||||
ret <4 x bfloat> %res
|
||||
}
|
||||
|
||||
; llvm.sqrt on an <8 x bfloat> vector.
; Both run configurations share the same expectations, so +sve-b16b16 does not
; change the output here. The operation is scalarized across all eight lanes:
; each lane is extracted, widened to f32 with shll, passed through a scalar
; fsqrt, converted back with bfcvt, and reinserted into the result vector.
define <8 x bfloat> @fsqrt_v8bf16(<8 x bfloat> %a) {
|
||||
; CHECK-LABEL: fsqrt_v8bf16:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: mov h1, v0.h[1]
|
||||
; CHECK-NEXT: shll v2.4s, v0.4h, #16
|
||||
; CHECK-NEXT: mov h3, v0.h[2]
|
||||
; CHECK-NEXT: mov h4, v0.h[3]
|
||||
; CHECK-NEXT: mov h5, v0.h[4]
|
||||
; CHECK-NEXT: mov h6, v0.h[5]
|
||||
; CHECK-NEXT: mov h7, v0.h[6]
|
||||
; CHECK-NEXT: mov h0, v0.h[7]
|
||||
; CHECK-NEXT: fsqrt s2, s2
|
||||
; CHECK-NEXT: shll v1.4s, v1.4h, #16
|
||||
; CHECK-NEXT: shll v3.4s, v3.4h, #16
|
||||
; CHECK-NEXT: shll v4.4s, v4.4h, #16
|
||||
; CHECK-NEXT: shll v5.4s, v5.4h, #16
|
||||
; CHECK-NEXT: shll v6.4s, v6.4h, #16
|
||||
; CHECK-NEXT: shll v7.4s, v7.4h, #16
|
||||
; CHECK-NEXT: shll v16.4s, v0.4h, #16
|
||||
; CHECK-NEXT: bfcvt h0, s2
|
||||
; CHECK-NEXT: fsqrt s1, s1
|
||||
; CHECK-NEXT: bfcvt h1, s1
|
||||
; CHECK-NEXT: mov v0.h[1], v1.h[0]
|
||||
; CHECK-NEXT: fsqrt s3, s3
|
||||
; CHECK-NEXT: bfcvt h1, s3
|
||||
; CHECK-NEXT: mov v0.h[2], v1.h[0]
|
||||
; CHECK-NEXT: fsqrt s4, s4
|
||||
; CHECK-NEXT: bfcvt h1, s4
|
||||
; CHECK-NEXT: mov v0.h[3], v1.h[0]
|
||||
; CHECK-NEXT: fsqrt s5, s5
|
||||
; CHECK-NEXT: bfcvt h1, s5
|
||||
; CHECK-NEXT: mov v0.h[4], v1.h[0]
|
||||
; CHECK-NEXT: fsqrt s6, s6
|
||||
; CHECK-NEXT: bfcvt h1, s6
|
||||
; CHECK-NEXT: mov v0.h[5], v1.h[0]
|
||||
; CHECK-NEXT: fsqrt s7, s7
|
||||
; CHECK-NEXT: bfcvt h1, s7
|
||||
; CHECK-NEXT: mov v0.h[6], v1.h[0]
|
||||
; CHECK-NEXT: fsqrt s2, s16
|
||||
; CHECK-NEXT: bfcvt h1, s2
|
||||
; CHECK-NEXT: mov v0.h[7], v1.h[0]
|
||||
; CHECK-NEXT: ret
|
||||
%res = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> %a)
|
||||
ret <8 x bfloat> %res
|
||||
}
|
||||
|
||||
;
|
||||
; FSUB
|
||||
;
|
||||
|
||||
; Element-wise fsub (%a - %b) of two <4 x bfloat> vectors.
; Expected lowering without +sve-b16b16: widen both operands to f32 with shll,
; do a NEON fsub, and narrow back with bfcvtn.
; Expected lowering with +sve-b16b16: a single predicated SVE bfsub under a
; vl4 predicate, operating directly on the bf16 lanes.
define <4 x bfloat> @fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
|
||||
; NOB16B16-LABEL: fsub_v4bf16:
|
||||
; NOB16B16: // %bb.0:
|
||||
; NOB16B16-NEXT: shll v1.4s, v1.4h, #16
|
||||
; NOB16B16-NEXT: shll v0.4s, v0.4h, #16
|
||||
; NOB16B16-NEXT: fsub v0.4s, v0.4s, v1.4s
|
||||
; NOB16B16-NEXT: bfcvtn v0.4h, v0.4s
|
||||
; NOB16B16-NEXT: ret
|
||||
;
|
||||
; B16B16-LABEL: fsub_v4bf16:
|
||||
; B16B16: // %bb.0:
|
||||
; B16B16-NEXT: ptrue p0.h, vl4
|
||||
; B16B16-NEXT: // kill: def $d0 killed $d0 def $z0
|
||||
; B16B16-NEXT: // kill: def $d1 killed $d1 def $z1
|
||||
; B16B16-NEXT: bfsub z0.h, p0/m, z0.h, z1.h
|
||||
; B16B16-NEXT: // kill: def $d0 killed $d0 killed $z0
|
||||
; B16B16-NEXT: ret
|
||||
%res = fsub <4 x bfloat> %a, %b
|
||||
ret <4 x bfloat> %res
|
||||
}
|
||||
|
||||
; Element-wise fsub (%a - %b) of two <8 x bfloat> vectors.
; Expected lowering without +sve-b16b16: the low and high halves are widened
; separately (shll / shll2), subtracted as two f32 vectors, and narrowed back
; with bfcvtn / bfcvtn2.
; Expected lowering with +sve-b16b16: a single predicated SVE bfsub under a
; vl8 predicate.
define <8 x bfloat> @fsub_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
|
||||
; NOB16B16-LABEL: fsub_v8bf16:
|
||||
; NOB16B16: // %bb.0:
|
||||
; NOB16B16-NEXT: shll v2.4s, v1.4h, #16
|
||||
; NOB16B16-NEXT: shll v3.4s, v0.4h, #16
|
||||
; NOB16B16-NEXT: shll2 v1.4s, v1.8h, #16
|
||||
; NOB16B16-NEXT: shll2 v0.4s, v0.8h, #16
|
||||
; NOB16B16-NEXT: fsub v2.4s, v3.4s, v2.4s
|
||||
; NOB16B16-NEXT: fsub v1.4s, v0.4s, v1.4s
|
||||
; NOB16B16-NEXT: bfcvtn v0.4h, v2.4s
|
||||
; NOB16B16-NEXT: bfcvtn2 v0.8h, v1.4s
|
||||
; NOB16B16-NEXT: ret
|
||||
;
|
||||
; B16B16-LABEL: fsub_v8bf16:
|
||||
; B16B16: // %bb.0:
|
||||
; B16B16-NEXT: ptrue p0.h, vl8
|
||||
; B16B16-NEXT: // kill: def $q0 killed $q0 def $z0
|
||||
; B16B16-NEXT: // kill: def $q1 killed $q1 def $z1
|
||||
; B16B16-NEXT: bfsub z0.h, p0/m, z0.h, z1.h
|
||||
; B16B16-NEXT: // kill: def $q0 killed $q0 killed $z0
|
||||
; B16B16-NEXT: ret
|
||||
%res = fsub <8 x bfloat> %a, %b
|
||||
ret <8 x bfloat> %res
|
||||
}
|
||||
@@ -22,26 +22,13 @@ define void @fmul_indexed_bf16_256b(ptr %a, ptr %b, ptr %c) #0 {
|
||||
; CHECK-LABEL: fmul_indexed_bf16_256b:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: ldp q0, q1, [x0]
|
||||
; CHECK-NEXT: ptrue p0.h, vl8
|
||||
; CHECK-NEXT: ldp q2, q3, [x1]
|
||||
; CHECK-NEXT: dup v0.8h, v0.h[2]
|
||||
; CHECK-NEXT: dup v1.8h, v1.h[2]
|
||||
; CHECK-NEXT: shll v4.4s, v2.4h, #16
|
||||
; CHECK-NEXT: shll v6.4s, v3.4h, #16
|
||||
; CHECK-NEXT: shll2 v2.4s, v2.8h, #16
|
||||
; CHECK-NEXT: shll2 v3.4s, v3.8h, #16
|
||||
; CHECK-NEXT: shll v5.4s, v0.4h, #16
|
||||
; CHECK-NEXT: shll v7.4s, v1.4h, #16
|
||||
; CHECK-NEXT: shll2 v0.4s, v0.8h, #16
|
||||
; CHECK-NEXT: shll2 v1.4s, v1.8h, #16
|
||||
; CHECK-NEXT: fmul v4.4s, v4.4s, v5.4s
|
||||
; CHECK-NEXT: fmul v5.4s, v6.4s, v7.4s
|
||||
; CHECK-NEXT: fmul v0.4s, v2.4s, v0.4s
|
||||
; CHECK-NEXT: fmul v1.4s, v3.4s, v1.4s
|
||||
; CHECK-NEXT: bfcvtn v2.4h, v4.4s
|
||||
; CHECK-NEXT: bfcvtn v3.4h, v5.4s
|
||||
; CHECK-NEXT: bfcvtn2 v2.8h, v0.4s
|
||||
; CHECK-NEXT: bfcvtn2 v3.8h, v1.4s
|
||||
; CHECK-NEXT: stp q2, q3, [x2]
|
||||
; CHECK-NEXT: bfmul z0.h, p0/m, z0.h, z2.h
|
||||
; CHECK-NEXT: bfmul z1.h, p0/m, z1.h, z3.h
|
||||
; CHECK-NEXT: stp q0, q1, [x2]
|
||||
; CHECK-NEXT: ret
|
||||
%ld.a = load <16 x bfloat>, ptr %a
|
||||
%ld.b = load <16 x bfloat>, ptr %b
|
||||
@@ -124,43 +111,16 @@ define void @fmla_indexed_bf16_256b(ptr %a, ptr %b, ptr %c) #0 {
|
||||
; CHECK-LABEL: fmla_indexed_bf16_256b:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: ldp q0, q1, [x0]
|
||||
; CHECK-NEXT: ptrue p0.h, vl8
|
||||
; CHECK-NEXT: ldp q2, q3, [x1]
|
||||
; CHECK-NEXT: ldp q4, q5, [x2]
|
||||
; CHECK-NEXT: dup v0.8h, v0.h[2]
|
||||
; CHECK-NEXT: dup v1.8h, v1.h[2]
|
||||
; CHECK-NEXT: shll v4.4s, v2.4h, #16
|
||||
; CHECK-NEXT: shll v6.4s, v3.4h, #16
|
||||
; CHECK-NEXT: shll2 v2.4s, v2.8h, #16
|
||||
; CHECK-NEXT: shll2 v3.4s, v3.8h, #16
|
||||
; CHECK-NEXT: shll v5.4s, v0.4h, #16
|
||||
; CHECK-NEXT: shll v7.4s, v1.4h, #16
|
||||
; CHECK-NEXT: shll2 v0.4s, v0.8h, #16
|
||||
; CHECK-NEXT: shll2 v1.4s, v1.8h, #16
|
||||
; CHECK-NEXT: fmul v4.4s, v4.4s, v5.4s
|
||||
; CHECK-NEXT: fmul v5.4s, v6.4s, v7.4s
|
||||
; CHECK-NEXT: fmul v0.4s, v2.4s, v0.4s
|
||||
; CHECK-NEXT: fmul v1.4s, v3.4s, v1.4s
|
||||
; CHECK-NEXT: bfcvtn v2.4h, v4.4s
|
||||
; CHECK-NEXT: bfcvtn v3.4h, v5.4s
|
||||
; CHECK-NEXT: bfcvtn2 v2.8h, v0.4s
|
||||
; CHECK-NEXT: bfcvtn2 v3.8h, v1.4s
|
||||
; CHECK-NEXT: ldp q0, q1, [x2]
|
||||
; CHECK-NEXT: shll v4.4s, v0.4h, #16
|
||||
; CHECK-NEXT: shll v5.4s, v2.4h, #16
|
||||
; CHECK-NEXT: shll v6.4s, v1.4h, #16
|
||||
; CHECK-NEXT: shll v7.4s, v3.4h, #16
|
||||
; CHECK-NEXT: shll2 v0.4s, v0.8h, #16
|
||||
; CHECK-NEXT: shll2 v2.4s, v2.8h, #16
|
||||
; CHECK-NEXT: shll2 v1.4s, v1.8h, #16
|
||||
; CHECK-NEXT: shll2 v3.4s, v3.8h, #16
|
||||
; CHECK-NEXT: fadd v4.4s, v5.4s, v4.4s
|
||||
; CHECK-NEXT: fadd v5.4s, v7.4s, v6.4s
|
||||
; CHECK-NEXT: fadd v0.4s, v2.4s, v0.4s
|
||||
; CHECK-NEXT: fadd v1.4s, v3.4s, v1.4s
|
||||
; CHECK-NEXT: bfcvtn v2.4h, v4.4s
|
||||
; CHECK-NEXT: bfcvtn v3.4h, v5.4s
|
||||
; CHECK-NEXT: bfcvtn2 v2.8h, v0.4s
|
||||
; CHECK-NEXT: bfcvtn2 v3.8h, v1.4s
|
||||
; CHECK-NEXT: stp q2, q3, [x2]
|
||||
; CHECK-NEXT: bfmul z0.h, p0/m, z0.h, z2.h
|
||||
; CHECK-NEXT: bfmul z1.h, p0/m, z1.h, z3.h
|
||||
; CHECK-NEXT: bfadd z0.h, p0/m, z0.h, z4.h
|
||||
; CHECK-NEXT: bfadd z1.h, p0/m, z1.h, z5.h
|
||||
; CHECK-NEXT: stp q0, q1, [x2]
|
||||
; CHECK-NEXT: ret
|
||||
%ld.a = load <16 x bfloat>, ptr %a
|
||||
%ld.b = load <16 x bfloat>, ptr %b
|
||||
@@ -251,43 +211,16 @@ define void @fmls_indexed_bf16_256b(ptr %a, ptr %b, ptr %c) #0 {
|
||||
; CHECK-LABEL: fmls_indexed_bf16_256b:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: ldp q0, q1, [x0]
|
||||
; CHECK-NEXT: ptrue p0.h, vl8
|
||||
; CHECK-NEXT: ldp q2, q3, [x1]
|
||||
; CHECK-NEXT: ldp q4, q5, [x2]
|
||||
; CHECK-NEXT: dup v0.8h, v0.h[2]
|
||||
; CHECK-NEXT: dup v1.8h, v1.h[2]
|
||||
; CHECK-NEXT: shll v4.4s, v2.4h, #16
|
||||
; CHECK-NEXT: shll v6.4s, v3.4h, #16
|
||||
; CHECK-NEXT: shll2 v2.4s, v2.8h, #16
|
||||
; CHECK-NEXT: shll2 v3.4s, v3.8h, #16
|
||||
; CHECK-NEXT: shll v5.4s, v0.4h, #16
|
||||
; CHECK-NEXT: shll v7.4s, v1.4h, #16
|
||||
; CHECK-NEXT: shll2 v0.4s, v0.8h, #16
|
||||
; CHECK-NEXT: shll2 v1.4s, v1.8h, #16
|
||||
; CHECK-NEXT: fmul v4.4s, v4.4s, v5.4s
|
||||
; CHECK-NEXT: fmul v5.4s, v6.4s, v7.4s
|
||||
; CHECK-NEXT: fmul v0.4s, v2.4s, v0.4s
|
||||
; CHECK-NEXT: fmul v1.4s, v3.4s, v1.4s
|
||||
; CHECK-NEXT: bfcvtn v2.4h, v4.4s
|
||||
; CHECK-NEXT: bfcvtn v3.4h, v5.4s
|
||||
; CHECK-NEXT: bfcvtn2 v2.8h, v0.4s
|
||||
; CHECK-NEXT: bfcvtn2 v3.8h, v1.4s
|
||||
; CHECK-NEXT: ldp q0, q1, [x2]
|
||||
; CHECK-NEXT: shll v4.4s, v0.4h, #16
|
||||
; CHECK-NEXT: shll v5.4s, v2.4h, #16
|
||||
; CHECK-NEXT: shll v6.4s, v1.4h, #16
|
||||
; CHECK-NEXT: shll v7.4s, v3.4h, #16
|
||||
; CHECK-NEXT: shll2 v0.4s, v0.8h, #16
|
||||
; CHECK-NEXT: shll2 v2.4s, v2.8h, #16
|
||||
; CHECK-NEXT: shll2 v1.4s, v1.8h, #16
|
||||
; CHECK-NEXT: shll2 v3.4s, v3.8h, #16
|
||||
; CHECK-NEXT: fsub v4.4s, v4.4s, v5.4s
|
||||
; CHECK-NEXT: fsub v5.4s, v6.4s, v7.4s
|
||||
; CHECK-NEXT: fsub v0.4s, v0.4s, v2.4s
|
||||
; CHECK-NEXT: fsub v1.4s, v1.4s, v3.4s
|
||||
; CHECK-NEXT: bfcvtn v2.4h, v4.4s
|
||||
; CHECK-NEXT: bfcvtn v3.4h, v5.4s
|
||||
; CHECK-NEXT: bfcvtn2 v2.8h, v0.4s
|
||||
; CHECK-NEXT: bfcvtn2 v3.8h, v1.4s
|
||||
; CHECK-NEXT: stp q2, q3, [x2]
|
||||
; CHECK-NEXT: bfmul z0.h, p0/m, z0.h, z2.h
|
||||
; CHECK-NEXT: bfmul z1.h, p0/m, z1.h, z3.h
|
||||
; CHECK-NEXT: bfsub z0.h, p0/m, z0.h, z4.h
|
||||
; CHECK-NEXT: bfsub z1.h, p0/m, z1.h, z5.h
|
||||
; CHECK-NEXT: stp q0, q1, [x2]
|
||||
; CHECK-NEXT: ret
|
||||
%ld.a = load <16 x bfloat>, ptr %a
|
||||
%ld.b = load <16 x bfloat>, ptr %b
|
||||
|
||||
Reference in New Issue
Block a user