Files
Jonas Paulsson 56a4315ee0 [SystemZ] Add a SystemZ specific pre-RA scheduling strategy. (#135076)
This is a relatively simple strategy as it is omitting any heuristics for
liveness and register pressure reduction. This works well as the SystemZ ISel
scheduler is using Sched::RegPressure which gives a good input order to begin
with.

It is trying harder with biasing phys regs than GenericScheduler as it also
considers other instructions such as immediate loads directly into phys-regs
produced by the register coalescer. This can hopefully be refactored into 
MachineScheduler.cpp.

It has a latency heuristic that is slightly different from the one in
GenericScheduler: It is activated for a specific type of region that has
many "data sequences" consisting of SUs connected only with a single
data-edge that are next to each other in the input order. This is only 3% of
all the scheduling regions, but when activated it is applied on all the
candidates (not just once per cycle). At the same time it is a bit more
careful by checking not only the SU Height against the scheduled latency but
also its Depth against the remaining latency.

It reuses the GenericScheduler handling of weak edges to help copy
coalescing.

It also helps with compare zero elimination as it tries to put a CC-defining
instruction that produces the compare source value above the compare before
any other instruction clobbering CC or the value.

This work was started after observing heavy spilling in Cactus, which was
actually *caused* by GenericScheduler - disabling it (no pre-RA scheduling)
remedied it and gave a 7% improvement in performance on that benchmark. Many
different versions have been tried, which have evolved into this initial
simplistic MachineSchedStrategy that does relatively little and yet achieves
double-digit improvements on Cactus and Imagick compared to GenericSched
(which is OTOH 3% better on Blender). There will hopefully be more
improvements added later on as there seems to be potential for it.

It would be very interesting to have other OOO targets try this as well and
perhaps make this available in MachineScheduler.cpp.

(A first attempt with improving the pre-RA scheduling was made with #90181,
which however did not materialize in anything actually useful.)
2026-03-10 15:38:05 +01:00

157 lines
4.4 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; Test 128-bit shift right arithmetic in vector registers on z13
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
; Shift right arithmetic immediate (general case).
; The constant amount 100 is neither below 8 nor a multiple of 8, so the
; expected assembly uses both vsrab and vsra, each taking the same replicated
; amount from %v1.
; NOTE(review): presumably vsrab applies the whole-byte part of the amount and
; vsra the remaining bit part -- confirm against the z/Architecture manual.
define i128 @f1(i128 %a) {
; CHECK-LABEL: f1:
; CHECK: # %bb.0:
; CHECK-NEXT: vl %v0, 0(%r3), 3
; CHECK-NEXT: vrepib %v1, 100
; CHECK-NEXT: vsrab %v0, %v0, %v1
; CHECK-NEXT: vsra %v0, %v0, %v1
; CHECK-NEXT: vst %v0, 0(%r2), 3
; CHECK-NEXT: br %r14
%res = ashr i128 %a, 100
ret i128 %res
}
; Shift right arithmetic immediate (< 8 bits).
; With a constant amount of 7 the expected assembly contains a single vsra
; and no vsrab.
define i128 @f2(i128 %a) {
; CHECK-LABEL: f2:
; CHECK: # %bb.0:
; CHECK-NEXT: vl %v0, 0(%r3), 3
; CHECK-NEXT: vrepib %v1, 7
; CHECK-NEXT: vsra %v0, %v0, %v1
; CHECK-NEXT: vst %v0, 0(%r2), 3
; CHECK-NEXT: br %r14
%res = ashr i128 %a, 7
ret i128 %res
}
; Shift right arithmetic immediate (full bytes).
; The constant amount 32 is a multiple of 8 (four whole bytes), so the expected
; assembly contains a single vsrab and no vsra.
define i128 @f3(i128 %a) {
; CHECK-LABEL: f3:
; CHECK: # %bb.0:
; CHECK-NEXT: vl %v0, 0(%r3), 3
; CHECK-NEXT: vrepib %v1, 32
; CHECK-NEXT: vsrab %v0, %v0, %v1
; CHECK-NEXT: vst %v0, 0(%r2), 3
; CHECK-NEXT: br %r14
%res = ashr i128 %a, 32
ret i128 %res
}
; Shift right arithmetic variable.
; The expected assembly loads only one 32-bit word (offset 12 from %r4) as the
; shift amount, moves it into %v1, replicates it with vrepb, and then applies
; the vsrab + vsra pair.
; NOTE(review): presumably %r4 holds the address of %sh (i128 arguments passed
; indirectly) and offset 12 is its least significant word -- confirm against
; the s390x ABI.
define i128 @f4(i128 %a, i128 %sh) {
; CHECK-LABEL: f4:
; CHECK: # %bb.0:
; CHECK-NEXT: l %r0, 12(%r4)
; CHECK-NEXT: vlvgp %v1, %r0, %r0
; CHECK-NEXT: vl %v0, 0(%r3), 3
; CHECK-NEXT: vrepb %v1, %v1, 15
; CHECK-NEXT: vsrab %v0, %v0, %v1
; CHECK-NEXT: vsra %v0, %v0, %v1
; CHECK-NEXT: vst %v0, 0(%r2), 3
; CHECK-NEXT: br %r14
%res = ashr i128 %a, %sh
ret i128 %res
}
; Test removal of AND mask with only bottom 7 bits set.
; The mask is 127 (0x7f). The expected assembly contains no AND instruction and
; is the same sequence as expected for the unmasked variable shift in @f4.
define i128 @f5(i128 %a, i128 %sh) {
; CHECK-LABEL: f5:
; CHECK: # %bb.0:
; CHECK-NEXT: l %r0, 12(%r4)
; CHECK-NEXT: vlvgp %v1, %r0, %r0
; CHECK-NEXT: vl %v0, 0(%r3), 3
; CHECK-NEXT: vrepb %v1, %v1, 15
; CHECK-NEXT: vsrab %v0, %v0, %v1
; CHECK-NEXT: vsra %v0, %v0, %v1
; CHECK-NEXT: vst %v0, 0(%r2), 3
; CHECK-NEXT: br %r14
%and = and i128 %sh, 127
%shift = ashr i128 %a, %and
ret i128 %shift
}
; Test removal of AND mask including but not limited to bottom 7 bits.
; The mask is 511 (0x1ff): all of the bottom 7 bits plus two higher bits. The
; expected assembly again contains no AND instruction, matching @f4.
define i128 @f6(i128 %a, i128 %sh) {
; CHECK-LABEL: f6:
; CHECK: # %bb.0:
; CHECK-NEXT: l %r0, 12(%r4)
; CHECK-NEXT: vlvgp %v1, %r0, %r0
; CHECK-NEXT: vl %v0, 0(%r3), 3
; CHECK-NEXT: vrepb %v1, %v1, 15
; CHECK-NEXT: vsrab %v0, %v0, %v1
; CHECK-NEXT: vsra %v0, %v0, %v1
; CHECK-NEXT: vst %v0, 0(%r2), 3
; CHECK-NEXT: br %r14
%and = and i128 %sh, 511
%shift = ashr i128 %a, %and
ret i128 %shift
}
; Test that AND is not removed when some of the lower 7 bits are not set.
; The mask is 63 (0x3f), which leaves bit 6 clear, so the masking must stay.
; The expected assembly performs it as a 32-bit scalar AND (lhi of 63 followed
; by n with the word at offset 12 from %r4) before the usual shift sequence.
define i128 @f7(i128 %a, i128 %sh) {
; CHECK-LABEL: f7:
; CHECK: # %bb.0:
; CHECK-NEXT: lhi %r0, 63
; CHECK-NEXT: n %r0, 12(%r4)
; CHECK-NEXT: vlvgp %v1, %r0, %r0
; CHECK-NEXT: vl %v0, 0(%r3), 3
; CHECK-NEXT: vrepb %v1, %v1, 15
; CHECK-NEXT: vsrab %v0, %v0, %v1
; CHECK-NEXT: vsra %v0, %v0, %v1
; CHECK-NEXT: vst %v0, 0(%r2), 3
; CHECK-NEXT: br %r14
%and = and i128 %sh, 63
%shift = ashr i128 %a, %and
ret i128 %shift
}
; Test that AND with two register operands is not affected.
; Here the mask is the variable %b rather than a constant. The expected
; assembly loads both AND operands into vector registers, combines them with
; vn, and extracts one 32-bit element with vlgvf to form the shift amount.
define i128 @f8(i128 %a, i128 %b, i128 %sh) {
; CHECK-LABEL: f8:
; CHECK: # %bb.0:
; CHECK-NEXT: vl %v1, 0(%r4), 3
; CHECK-NEXT: vl %v2, 0(%r5), 3
; CHECK-NEXT: vn %v1, %v2, %v1
; CHECK-NEXT: vlgvf %r0, %v1, 3
; CHECK-NEXT: vlvgp %v1, %r0, %r0
; CHECK-NEXT: vl %v0, 0(%r3), 3
; CHECK-NEXT: vrepb %v1, %v1, 15
; CHECK-NEXT: vsrab %v0, %v0, %v1
; CHECK-NEXT: vsra %v0, %v0, %v1
; CHECK-NEXT: vst %v0, 0(%r2), 3
; CHECK-NEXT: br %r14
%and = and i128 %sh, %b
%shift = ashr i128 %a, %and
ret i128 %shift
}
; Test that AND is not entirely removed if the result is reused.
; The mask is 127 as in @f5, but %and also feeds the final add, so its value
; must be materialized. The expected assembly computes it with vn against a
; vector loaded from .LCPI8_0, takes the shift amount from that result via
; vlgvf, and adds the masked value to the shifted value with vaq.
; NOTE(review): .LCPI8_0 is presumably the constant-pool entry holding 127;
; its definition is not checked here.
define i128 @f9(i128 %a, i128 %sh) {
; CHECK-LABEL: f9:
; CHECK: # %bb.0:
; CHECK-NEXT: larl %r1, .LCPI8_0
; CHECK-NEXT: vl %v0, 0(%r4), 3
; CHECK-NEXT: vl %v1, 0(%r1), 3
; CHECK-NEXT: vn %v0, %v0, %v1
; CHECK-NEXT: vlgvf %r0, %v0, 3
; CHECK-NEXT: vlvgp %v1, %r0, %r0
; CHECK-NEXT: vl %v2, 0(%r3), 3
; CHECK-NEXT: vrepb %v1, %v1, 15
; CHECK-NEXT: vsrab %v2, %v2, %v1
; CHECK-NEXT: vsra %v1, %v2, %v1
; CHECK-NEXT: vaq %v0, %v0, %v1
; CHECK-NEXT: vst %v0, 0(%r2), 3
; CHECK-NEXT: br %r14
%and = and i128 %sh, 127
%shift = ashr i128 %a, %and
%reuse = add i128 %and, %shift
ret i128 %reuse
}