[Inliner] Use store-to-load forwarding to resolve call arguments (#190607)

Uses `FindAvailableLoadedValue` to resolve load instructions in call
arguments to constants before inline cost analysis. This gives the
inliner more precise cost estimate and option to inline functions which
would not be inlined otherwise.

The `-O3` doesn't inline empty `std::set` and `std::map` because node
deletion is recursive. The inliner doesn't know that `nullptr` is passed
in as it is a `load` from a member.

This addresses both `libstdc++` and `libc++`:
- `libstdc++` - `FindAvailableLoadedValue` requires `MaxInstsToScan=0`,
because the relevant store is 7 instructions away and `DefMaxInstsToScan =
6`. Benchmarking on large LLVM TUs showed no measurable compile-time
difference between limit=6 and scanning the whole basic block.
- `libc++` - uses `memset` to zero all members in ctor, this patch
handles only `memset` to zero (the type mismatch case), which could be
generalized but seems very rare

The store-to-load pattern is created and consumed within the same CGSCC
inliner invocation: the ctor is inlined first (creating stores to the
object), and then the dtor's inline cost is evaluated (seeing loads from
the same object). No pass has an opportunity to simplify the IR in
between.

The `-flto` build eliminates empty `std::set` because the IR is
simplified enough in the regular optimization pass. However, when the
code is not header-only in a different TU, `-flto` doesn't help.

The change is much more general than just `std::set` and `std::map`. I
saw several impacts of it on the LLVM codebase with `-O3`. Some functions
shrink due to better dead-code elimination, some grow due to
more aggressive inlining opportunities, and some are greatly simplified.

In my experiments I saw no measurable regression in compile times
compiling many large LLVM TUs. I measured ~1% faster compilation because
the subsequent opt passes run faster. However, this needs more benchmarks.

Closes #183994
This commit is contained in:
Jiří Filek
2026-04-30 19:41:23 +02:00
committed by GitHub
parent ca9f6c5bcc
commit 0aef0f274b
3 changed files with 251 additions and 36 deletions

View File

@@ -30,6 +30,7 @@
#include "llvm/Analysis/InlineAdvisor.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LazyCallGraph.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ReplayInlineAdvisor.h"
@@ -80,6 +81,14 @@ static cl::opt<int> IntraSCCCostMultiplier(
"multiplied by intra-scc-cost-multiplier). This is to prevent tons of "
"inlining through a child SCC which can cause terrible compile times"));
// Backward-scan budget for FindAvailableLoadedValue when resolving call
// arguments before inline cost analysis. The library default
// (DefMaxInstsToScan = 6) is too small for the motivating pattern: a ctor
// inlined earlier stores into several members, and a later call reloads one
// of them more than 6 instructions downstream.
static cl::opt<unsigned> InlinerForwardingScanLimit(
    "inliner-forwarding-scan-limit", cl::init(16), cl::Hidden,
    cl::desc("Maximum number of instructions to scan backward for "
             "store-to-load forwarding in subsequent inlining decisions. "
             "DefMaxInstsToScan=6 is not enough and misses inlining "
             "opportunities (e.g. when class stores into multiple members in "
             "ctor and afterwards calls a function reading those members)"));
/// A flag for test, so we can print the content of the advisor when running it
/// as part of the default (e.g. -O3) pipeline.
static cl::opt<bool> KeepAdvisorForPrinting("keep-inline-advisor-for-printing",
@@ -325,6 +334,34 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
continue;
}
// Store-to-load forwarding, loads can be sometimes simplified to
// constants from stores introduced by previous inlining
// (e.g. a ctor inlined earlier in this walk stored a constant into a
// member that a later call argument reloads). Resolving the load to a
// constant gives the inline cost analysis below a precise argument value.
if (DidInline) {
for (Value *Arg : CB->args()) {
// Only simple (non-volatile, non-atomic, unordered) loads are safe to
// forward.
auto *LI = dyn_cast<LoadInst>(Arg);
if (!LI || !LI->isSimple())
continue;
// Scan backward from the load (within its block) for an instruction
// that makes the loaded value available.
BasicBlock::iterator BBI = LI->getIterator();
Value *Available = FindAvailableLoadedValue(
LI, LI->getParent(), BBI, InlinerForwardingScanLimit);
if (!Available)
continue;
// Only constants help the cost model; a non-constant available value
// would not change any inlining decision, so leave the IR alone.
auto *C = dyn_cast<Constant>(Available);
if (!C)
continue;
// Handle type mismatches from memset forwarding (e.g. memset
// writes i64 0 but the load type is ptr).
if (C->getType() != LI->getType()) {
if (C->isNullValue())
C = Constant::getNullValue(LI->getType());
else
continue;
}
LI->replaceAllUsesWith(C);
LI->eraseFromParent();
}
}
std::unique_ptr<InlineAdvice> Advice =
Advisor.getAdvice(*CB, OnlyMandatory);

View File

@@ -0,0 +1,212 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt -S -passes="cgscc(inline)" < %s | FileCheck %s
; Test that the CGSCC inliner performs store-to-load forwarding for call
; arguments after inlining a function in the same caller. The first call
; (@init or @init_memset) is inlined, producing stores. Forwarding then
; resolves the subsequent load to a constant, enabling inlining of the
; second callee.
target datalayout = "p:64:64"
; Two paths: mode==0 is trivial (ret), otherwise too expensive to inline.
; The slow path uses volatile loads so no pass can shrink it; inlining
; @callee cheaply therefore requires resolving %mode to the constant 0.
define internal i32 @callee(i32 %mode, ptr %p) {
entry:
%cmp = icmp eq i32 %mode, 0
br i1 %cmp, label %fast, label %slow
fast:
; Cheap path: a single non-volatile load.
%v = load i32, ptr %p
ret i32 %v
slow:
; Expensive path: five volatile loads that cannot be merged or removed.
%a1 = load volatile i32, ptr %p
%a2 = load volatile i32, ptr %p
%x1 = add i32 %a1, %a2
%a3 = load volatile i32, ptr %p
%x2 = add i32 %x1, %a3
%a4 = load volatile i32, ptr %p
%x3 = add i32 %x2, %a4
%a5 = load volatile i32, ptr %p
%x4 = add i32 %x3, %a5
ret i32 %x4
}
; Trivial when called with null (icmp + ret), otherwise too expensive to
; inline (self-recursion plus a volatile load). Mirrors recursive node
; deletion as in std::set/std::map destructors (see commit description).
define internal void @recursive_callee(ptr %x) {
; CHECK-LABEL: define internal void @recursive_callee(
; CHECK-SAME: ptr [[X:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[X]], null
; CHECK-NEXT: br i1 [[CMP]], label %[[DONE:.*]], label %[[RECURSE:.*]]
; CHECK: [[RECURSE]]:
; CHECK-NEXT: [[NEXT:%.*]] = load ptr, ptr [[X]], align 8
; CHECK-NEXT: call void @recursive_callee(ptr [[NEXT]])
; CHECK-NEXT: [[V:%.*]] = load volatile i32, ptr [[X]], align 4
; CHECK-NEXT: br label %[[DONE]]
; CHECK: [[DONE]]:
; CHECK-NEXT: ret void
;
entry:
%cmp = icmp eq ptr %x, null
br i1 %cmp, label %done, label %recurse
recurse:
; Walk to the next node and recurse; the volatile load keeps this path
; expensive for the inline cost model.
%next = load ptr, ptr %x
call void @recursive_callee(ptr %next)
%v = load volatile i32, ptr %x
br label %done
done:
ret void
}
; Trivially cheap — inlined first, producing a store of constant 0 that
; store-to-load forwarding later resolves in the caller.
define internal void @init_i32(ptr %p) {
store i32 0, ptr %p
ret void
}
; Trivially cheap — inlined first, producing a memset. Eight zero bytes
; cover the caller's ptr-sized slot (datalayout p:64:64), exercising the
; type-mismatch forwarding case (i64 0 written, ptr loaded).
define internal void @init_memset(ptr %p) {
call void @llvm.memset.p0.i64(ptr %p, i8 0, i64 8, i1 false)
ret void
}
; Trivially cheap — stores a non-constant argument value, so forwarding
; finds an available value that is not a Constant (negative-test input).
define internal void @init_arg(ptr %p, i32 %v) {
store i32 %v, ptr %p
ret void
}
; Trivially cheap — stores a non-zero i64 at a slot later loaded as ptr.
; Being non-zero, the type-mismatched value must not be forwarded (only
; null values may be retyped).
define internal void @init_i64_nonzero(ptr %p) {
store i64 42, ptr %p
ret void
}
declare void @llvm.memset.p0.i64(ptr, i8, i64, i1)
; After inlining @init_i32: store 0 → %p.
; Forwarding resolves %mode to 0, making only the fast path reachable.
; The CHECK lines confirm @callee collapsed to the fast path: only the
; forwarded store and the single cheap load remain.
define i32 @caller_store_forward() {
; CHECK-LABEL: define i32 @caller_store_forward() {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[P:%.*]] = alloca i32, align 4
; CHECK-NEXT: store i32 0, ptr [[P]], align 4
; CHECK-NEXT: [[V_I:%.*]] = load i32, ptr [[P]], align 4
; CHECK-NEXT: ret i32 [[V_I]]
;
entry:
%p = alloca i32
call void @init_i32(ptr %p)
%mode = load i32, ptr %p
%r = call i32 @callee(i32 %mode, ptr %p)
ret i32 %r
}
; Memset-to-load forwarding converts the zero-filled integer to a null
; pointer, making the null check take the early exit.
; The CHECK lines show the @recursive_callee call vanished entirely: with
; a null argument it inlines to nothing.
define void @caller_memset_forward() {
; CHECK-LABEL: define void @caller_memset_forward() {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[P:%.*]] = alloca ptr, align 8
; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr [[P]], i8 0, i64 8, i1 false)
; CHECK-NEXT: ret void
;
entry:
%p = alloca ptr
call void @init_memset(ptr %p)
%x = load ptr, ptr %p
call void @recursive_callee(ptr %x)
ret void
}
; Negative: the available value is non-constant (the function argument),
; so no forwarding happens and %mode survives as a load. Note that the
; regular cost model still inlines @callee here, but with BOTH paths
; intact (see the phi in the CHECK lines) — the point of this test is
; only that %mode was not resolved to a constant.
define i32 @caller_non_constant(i32 %v) {
; CHECK-LABEL: define i32 @caller_non_constant(
; CHECK-SAME: i32 [[V:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[P:%.*]] = alloca i32, align 4
; CHECK-NEXT: store i32 [[V]], ptr [[P]], align 4
; CHECK-NEXT: [[MODE:%.*]] = load i32, ptr [[P]], align 4
; CHECK-NEXT: [[CMP_I:%.*]] = icmp eq i32 [[MODE]], 0
; CHECK-NEXT: br i1 [[CMP_I]], label %[[FAST_I:.*]], label %[[SLOW_I:.*]]
; CHECK: [[FAST_I]]:
; CHECK-NEXT: [[V_I:%.*]] = load i32, ptr [[P]], align 4
; CHECK-NEXT: br label %[[CALLEE_EXIT:.*]]
; CHECK: [[SLOW_I]]:
; CHECK-NEXT: [[A1_I:%.*]] = load volatile i32, ptr [[P]], align 4
; CHECK-NEXT: [[A2_I:%.*]] = load volatile i32, ptr [[P]], align 4
; CHECK-NEXT: [[X1_I:%.*]] = add i32 [[A1_I]], [[A2_I]]
; CHECK-NEXT: [[A3_I:%.*]] = load volatile i32, ptr [[P]], align 4
; CHECK-NEXT: [[X2_I:%.*]] = add i32 [[X1_I]], [[A3_I]]
; CHECK-NEXT: [[A4_I:%.*]] = load volatile i32, ptr [[P]], align 4
; CHECK-NEXT: [[X3_I:%.*]] = add i32 [[X2_I]], [[A4_I]]
; CHECK-NEXT: [[A5_I:%.*]] = load volatile i32, ptr [[P]], align 4
; CHECK-NEXT: [[X4_I:%.*]] = add i32 [[X3_I]], [[A5_I]]
; CHECK-NEXT: br label %[[CALLEE_EXIT]]
; CHECK: [[CALLEE_EXIT]]:
; CHECK-NEXT: [[R1:%.*]] = phi i32 [ [[V_I]], %[[FAST_I]] ], [ [[X4_I]], %[[SLOW_I]] ]
; CHECK-NEXT: ret i32 [[R1]]
;
entry:
%p = alloca i32
call void @init_arg(ptr %p, i32 %v)
%mode = load i32, ptr %p
%r = call i32 @callee(i32 %mode, ptr %p)
ret i32 %r
}
; Negative: available value has a type different from the load's and is not
; a null value (store i64 42, load ptr).
; The type-mismatch blocks forwarding of a non-zero integer as a pointer
; (only null values may be retyped), so the load and the call to
; @recursive_callee both survive, as the CHECK lines show.
define void @caller_type_mismatch() {
; CHECK-LABEL: define void @caller_type_mismatch() {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[P:%.*]] = alloca ptr, align 8
; CHECK-NEXT: store i64 42, ptr [[P]], align 4
; CHECK-NEXT: [[X:%.*]] = load ptr, ptr [[P]], align 8
; CHECK-NEXT: call void @recursive_callee(ptr [[X]])
; CHECK-NEXT: ret void
;
entry:
%p = alloca ptr
call void @init_i64_nonzero(ptr %p)
%x = load ptr, ptr %p
call void @recursive_callee(ptr %x)
ret void
}
; Negative: a volatile load must not be forwarded (the isSimple() check),
; so %mode cannot be resolved to a constant. Note that the regular cost
; model still inlines @callee here, but with BOTH paths intact (see the
; phi in the CHECK lines) — the volatile %mode load itself survives.
define i32 @caller_volatile_load() {
; CHECK-LABEL: define i32 @caller_volatile_load() {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[P:%.*]] = alloca i32, align 4
; CHECK-NEXT: store i32 0, ptr [[P]], align 4
; CHECK-NEXT: [[MODE:%.*]] = load volatile i32, ptr [[P]], align 4
; CHECK-NEXT: [[CMP_I:%.*]] = icmp eq i32 [[MODE]], 0
; CHECK-NEXT: br i1 [[CMP_I]], label %[[FAST_I:.*]], label %[[SLOW_I:.*]]
; CHECK: [[FAST_I]]:
; CHECK-NEXT: [[V_I:%.*]] = load i32, ptr [[P]], align 4
; CHECK-NEXT: br label %[[CALLEE_EXIT:.*]]
; CHECK: [[SLOW_I]]:
; CHECK-NEXT: [[A1_I:%.*]] = load volatile i32, ptr [[P]], align 4
; CHECK-NEXT: [[A2_I:%.*]] = load volatile i32, ptr [[P]], align 4
; CHECK-NEXT: [[X1_I:%.*]] = add i32 [[A1_I]], [[A2_I]]
; CHECK-NEXT: [[A3_I:%.*]] = load volatile i32, ptr [[P]], align 4
; CHECK-NEXT: [[X2_I:%.*]] = add i32 [[X1_I]], [[A3_I]]
; CHECK-NEXT: [[A4_I:%.*]] = load volatile i32, ptr [[P]], align 4
; CHECK-NEXT: [[X3_I:%.*]] = add i32 [[X2_I]], [[A4_I]]
; CHECK-NEXT: [[A5_I:%.*]] = load volatile i32, ptr [[P]], align 4
; CHECK-NEXT: [[X4_I:%.*]] = add i32 [[X3_I]], [[A5_I]]
; CHECK-NEXT: br label %[[CALLEE_EXIT]]
; CHECK: [[CALLEE_EXIT]]:
; CHECK-NEXT: [[R1:%.*]] = phi i32 [ [[V_I]], %[[FAST_I]] ], [ [[X4_I]], %[[SLOW_I]] ]
; CHECK-NEXT: ret i32 [[R1]]
;
entry:
%p = alloca i32
call void @init_i32(ptr %p)
%mode = load volatile i32, ptr %p
%r = call i32 @callee(i32 %mode, ptr %p)
ret i32 %r
}

View File

@@ -1,10 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -passes="default<O3>" < %s | FileCheck %s
; FIXME: Mirrors missing optimization on empty std::set
; The constructor stores null, so the destructor's erase call
; should be simplified to a no-op, making the entire function ret void.
; Test that the CGSCC inliner forwards stores to load arguments after
; inlining. This addresses a phase ordering issue: the constructor is
; inlined first (producing stores), and then the destructor's load
@@ -40,36 +36,6 @@ define internal void @dtor(ptr %this) {
; With null: icmp + branch to done → trivially cheap.
; With unknown ptr: recursive calls + external calls → too expensive.
define internal void @erase(ptr %node) {
; CHECK-LABEL: define internal fastcc void @erase(
; CHECK-SAME: ptr captures(address_is_null) [[NODE:%.*]]) unnamed_addr {
; CHECK-NEXT: [[IS_NULL:%.*]] = icmp eq ptr [[NODE]], null
; CHECK-NEXT: br i1 [[IS_NULL]], label %[[COMMON_RET1:.*]], label %[[RECURSE:.*]]
; CHECK: [[COMMON_RET1]]:
; CHECK-NEXT: ret void
; CHECK: [[RECURSE]]:
; CHECK-NEXT: [[LEFT_VAL:%.*]] = load ptr, ptr [[NODE]], align 8
; CHECK-NEXT: tail call fastcc void @erase(ptr [[LEFT_VAL]])
; CHECK-NEXT: [[RIGHT_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[NODE]], i64 8
; CHECK-NEXT: [[RIGHT_VAL:%.*]] = load ptr, ptr [[RIGHT_PTR]], align 8
; CHECK-NEXT: tail call fastcc void @erase(ptr [[RIGHT_VAL]])
; CHECK-NEXT: [[D1:%.*]] = getelementptr inbounds nuw i8, ptr [[NODE]], i64 16
; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[D1]], align 8
; CHECK-NEXT: tail call void @use(i64 [[V1]])
; CHECK-NEXT: [[D2:%.*]] = getelementptr inbounds nuw i8, ptr [[NODE]], i64 24
; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[D2]], align 8
; CHECK-NEXT: tail call void @use(i64 [[V2]])
; CHECK-NEXT: [[D3:%.*]] = getelementptr inbounds nuw i8, ptr [[NODE]], i64 32
; CHECK-NEXT: [[V3:%.*]] = load i64, ptr [[D3]], align 8
; CHECK-NEXT: tail call void @use(i64 [[V3]])
; CHECK-NEXT: [[D4:%.*]] = getelementptr inbounds nuw i8, ptr [[NODE]], i64 40
; CHECK-NEXT: [[V4:%.*]] = load i64, ptr [[D4]], align 8
; CHECK-NEXT: tail call void @use(i64 [[V4]])
; CHECK-NEXT: [[D5:%.*]] = getelementptr inbounds nuw i8, ptr [[NODE]], i64 48
; CHECK-NEXT: [[V5:%.*]] = load i64, ptr [[D5]], align 8
; CHECK-NEXT: tail call void @use(i64 [[V5]])
; CHECK-NEXT: tail call void @free(ptr nonnull [[NODE]])
; CHECK-NEXT: br label %[[COMMON_RET1]]
;
%is_null = icmp eq ptr %node, null
br i1 %is_null, label %done, label %recurse
@@ -102,8 +68,8 @@ done:
}
define void @test_empty_tree() {
; CHECK-LABEL: define void @test_empty_tree() local_unnamed_addr {
; CHECK-NEXT: tail call fastcc void @erase(ptr null)
; CHECK-LABEL: define void @test_empty_tree(
; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: ret void
;
%tree = alloca ptr, align 8