[Inliner] Use store-to-load forwarding to resolve call arguments (#190607)

Uses `FindAvailableLoadedValue` to resolve load instructions in call
arguments to constants before inline cost analysis. This gives the
inliner more precise cost estimate and option to inline functions which
would not be inlined otherwise.

The `-O3` doesn't inline empty `std::set` and `std::map` because node
deletion is recursive. The inliner doesn't know that `nullptr` is passed
in as it is a `load` from a member.

This addresses both `libstdc++` and `libc++`:
- `libstdc++` - `FindAvailableLoadedValue` requires `MaxInstsToScan=0`,
because the relevant store is 7 instructions away and `DefMaxInstsToScan =
6`. Benchmarking on large LLVM TUs showed no measurable compile-time
difference between limit=6 and scanning the whole basic block.
- `libc++` - uses `memset` to zero all members in ctor, this patch
handles only `memset` to zero (the type mismatch case), which could be
generalized but seems very rare

The store-to-load pattern is created and consumed within the same CGSCC
inliner invocation: the ctor is inlined first (creating stores to the
object), and then the dtor's inline cost is evaluated (seeing loads from
the same object). No pass has an opportunity to simplify the IR in
between.

The `-flto` build eliminates empty `std::set` because the IR is
simplified enough in the regular optimization pass. However, when the
code is not header-only in a different TU, `-flto` doesn't help.

The change is much more general than just `std::set` and `std::map`. I
saw several impacts of it on the LLVM codebase with `-O3`. Some functions
shrink due to better dead-code elimination, some grow due to
more aggressive inlining opportunities, and some are greatly simplified.

In my experiments I saw no measurable regression in compile times
compiling many large LLVM TUs. I measured ~1% faster compilation because
the subsequent opt passes run faster. However, this needs more benchmarks.

Closes #183994
This commit is contained in:
Jiří Filek
2026-04-30 19:41:23 +02:00
committed by GitHub
parent ca9f6c5bcc
commit 0aef0f274b
3 changed files with 251 additions and 36 deletions

View File

@@ -30,6 +30,7 @@
#include "llvm/Analysis/InlineAdvisor.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LazyCallGraph.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ReplayInlineAdvisor.h"
@@ -80,6 +81,14 @@ static cl::opt<int> IntraSCCCostMultiplier(
"multiplied by intra-scc-cost-multiplier). This is to prevent tons of "
"inlining through a child SCC which can cause terrible compile times"));
// Backward-scan budget for FindAvailableLoadedValue when resolving call
// arguments before inline cost analysis. The library default
// (DefMaxInstsToScan = 6) is too small for the motivating pattern: a ctor
// inlined earlier stores into several members, and a later call reloads one
// of them more than 6 instructions downstream.
static cl::opt<unsigned> InlinerForwardingScanLimit(
    "inliner-forwarding-scan-limit", cl::init(16), cl::Hidden,
    cl::desc("Maximum number of instructions to scan backward for "
             "store-to-load forwarding in subsequent inlining decisions. "
             "DefMaxInstsToScan=6 is not enough and misses inlining "
             "opportunities (e.g. when class stores into multiple members in "
             "ctor and afterwards calls a function reading those members)"));
/// A flag for test, so we can print the content of the advisor when running it
/// as part of the default (e.g. -O3) pipeline.
static cl::opt<bool> KeepAdvisorForPrinting("keep-inline-advisor-for-printing",
@@ -325,6 +334,34 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
continue;
}
// Store-to-load forwarding, loads can be sometimes simplified to
// constants from stores introduced by previous inlining
// (e.g. a ctor inlined earlier in this walk stored a constant into a
// member that a later call argument reloads). Resolving the load to a
// constant gives the inline cost analysis below a precise argument value.
if (DidInline) {
for (Value *Arg : CB->args()) {
// Only simple (non-volatile, non-atomic, unordered) loads are safe to
// forward.
auto *LI = dyn_cast<LoadInst>(Arg);
if (!LI || !LI->isSimple())
continue;
// Scan backward from the load (within its block) for an instruction
// that makes the loaded value available.
BasicBlock::iterator BBI = LI->getIterator();
Value *Available = FindAvailableLoadedValue(
LI, LI->getParent(), BBI, InlinerForwardingScanLimit);
if (!Available)
continue;
// Only constants help the cost model; a non-constant available value
// would not change any inlining decision, so leave the IR alone.
auto *C = dyn_cast<Constant>(Available);
if (!C)
continue;
// Handle type mismatches from memset forwarding (e.g. memset
// writes i64 0 but the load type is ptr).
if (C->getType() != LI->getType()) {
if (C->isNullValue())
C = Constant::getNullValue(LI->getType());
else
continue;
}
LI->replaceAllUsesWith(C);
LI->eraseFromParent();
}
}
std::unique_ptr<InlineAdvice> Advice =
Advisor.getAdvice(*CB, OnlyMandatory);

View File

@@ -0,0 +1,212 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt -S -passes="cgscc(inline)" < %s | FileCheck %s
; Test that the CGSCC inliner performs store-to-load forwarding for call
; arguments after inlining a function in the same caller. The first call
; (@init or @init_memset) is inlined, producing stores. Forwarding then
; resolves the subsequent load to a constant, enabling inlining of the
; second callee.
target datalayout = "p:64:64"
; Two paths: mode==0 is trivial (ret), otherwise too expensive to inline.
; The slow path uses volatile loads so no pass can shrink it; inlining
; @callee cheaply therefore requires resolving %mode to the constant 0.
define internal i32 @callee(i32 %mode, ptr %p) {
entry:
%cmp = icmp eq i32 %mode, 0
br i1 %cmp, label %fast, label %slow
fast:
; Cheap path: a single non-volatile load.
%v = load i32, ptr %p
ret i32 %v
slow:
; Expensive path: five volatile loads that cannot be merged or removed.
%a1 = load volatile i32, ptr %p
%a2 = load volatile i32, ptr %p
%x1 = add i32 %a1, %a2
%a3 = load volatile i32, ptr %p
%x2 = add i32 %x1, %a3
%a4 = load volatile i32, ptr %p
%x3 = add i32 %x2, %a4
%a5 = load volatile i32, ptr %p
%x4 = add i32 %x3, %a5
ret i32 %x4
}
; Trivial when called with null (icmp + ret), otherwise too expensive to
; inline (self-recursion plus a volatile load). Mirrors recursive node
; deletion as in std::set/std::map destructors (see commit description).
define internal void @recursive_callee(ptr %x) {
; CHECK-LABEL: define internal void @recursive_callee(
; CHECK-SAME: ptr [[X:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[X]], null
; CHECK-NEXT: br i1 [[CMP]], label %[[DONE:.*]], label %[[RECURSE:.*]]
; CHECK: [[RECURSE]]:
; CHECK-NEXT: [[NEXT:%.*]] = load ptr, ptr [[X]], align 8
; CHECK-NEXT: call void @recursive_callee(ptr [[NEXT]])
; CHECK-NEXT: [[V:%.*]] = load volatile i32, ptr [[X]], align 4
; CHECK-NEXT: br label %[[DONE]]
; CHECK: [[DONE]]:
; CHECK-NEXT: ret void
;
entry:
%cmp = icmp eq ptr %x, null
br i1 %cmp, label %done, label %recurse
recurse:
; Walk to the next node and recurse; the volatile load keeps this path
; expensive for the inline cost model.
%next = load ptr, ptr %x
call void @recursive_callee(ptr %next)
%v = load volatile i32, ptr %x
br label %done
done:
ret void
}
; Trivially cheap — inlined first, producing a store of constant 0 that
; store-to-load forwarding later resolves in the caller.
define internal void @init_i32(ptr %p) {
store i32 0, ptr %p
ret void
}
; Trivially cheap — inlined first, producing a memset. Eight zero bytes
; cover the caller's ptr-sized slot (datalayout p:64:64), exercising the
; type-mismatch forwarding case (i64 0 written, ptr loaded).
define internal void @init_memset(ptr %p) {
call void @llvm.memset.p0.i64(ptr %p, i8 0, i64 8, i1 false)
ret void
}
; Trivially cheap — stores a non-constant argument value, so forwarding
; finds an available value that is not a Constant (negative-test input).
define internal void @init_arg(ptr %p, i32 %v) {
store i32 %v, ptr %p
ret void
}
; Trivially cheap — stores a non-zero i64 at a slot later loaded as ptr.
; Being non-zero, the type-mismatched value must not be forwarded (only
; null values may be retyped).
define internal void @init_i64_nonzero(ptr %p) {
store i64 42, ptr %p
ret void
}
declare void @llvm.memset.p0.i64(ptr, i8, i64, i1)
; After inlining @init_i32: store 0 → %p.
; Forwarding resolves %mode to 0, making only the fast path reachable.
; The CHECK lines confirm @callee collapsed to the fast path: only the
; forwarded store and the single cheap load remain.
define i32 @caller_store_forward() {
; CHECK-LABEL: define i32 @caller_store_forward() {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[P:%.*]] = alloca i32, align 4
; CHECK-NEXT: store i32 0, ptr [[P]], align 4
; CHECK-NEXT: [[V_I:%.*]] = load i32, ptr [[P]], align 4
; CHECK-NEXT: ret i32 [[V_I]]
;
entry:
%p = alloca i32
call void @init_i32(ptr %p)
%mode = load i32, ptr %p
%r = call i32 @callee(i32 %mode, ptr %p)
ret i32 %r
}
; Memset-to-load forwarding converts the zero-filled integer to a null
; pointer, making the null check take the early exit.
; The CHECK lines show the @recursive_callee call vanished entirely: with
; a null argument it inlines to nothing.
define void @caller_memset_forward() {
; CHECK-LABEL: define void @caller_memset_forward() {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[P:%.*]] = alloca ptr, align 8
; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr [[P]], i8 0, i64 8, i1 false)
; CHECK-NEXT: ret void
;
entry:
%p = alloca ptr
call void @init_memset(ptr %p)
%x = load ptr, ptr %p
call void @recursive_callee(ptr %x)
ret void
}
; Negative: the available value is non-constant (the function argument),
; so no forwarding happens and %mode survives as a load. Note that the
; regular cost model still inlines @callee here, but with BOTH paths
; intact (see the phi in the CHECK lines) — the point of this test is
; only that %mode was not resolved to a constant.
define i32 @caller_non_constant(i32 %v) {
; CHECK-LABEL: define i32 @caller_non_constant(
; CHECK-SAME: i32 [[V:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[P:%.*]] = alloca i32, align 4
; CHECK-NEXT: store i32 [[V]], ptr [[P]], align 4
; CHECK-NEXT: [[MODE:%.*]] = load i32, ptr [[P]], align 4
; CHECK-NEXT: [[CMP_I:%.*]] = icmp eq i32 [[MODE]], 0
; CHECK-NEXT: br i1 [[CMP_I]], label %[[FAST_I:.*]], label %[[SLOW_I:.*]]
; CHECK: [[FAST_I]]:
; CHECK-NEXT: [[V_I:%.*]] = load i32, ptr [[P]], align 4
; CHECK-NEXT: br label %[[CALLEE_EXIT:.*]]
; CHECK: [[SLOW_I]]:
; CHECK-NEXT: [[A1_I:%.*]] = load volatile i32, ptr [[P]], align 4
; CHECK-NEXT: [[A2_I:%.*]] = load volatile i32, ptr [[P]], align 4
; CHECK-NEXT: [[X1_I:%.*]] = add i32 [[A1_I]], [[A2_I]]
; CHECK-NEXT: [[A3_I:%.*]] = load volatile i32, ptr [[P]], align 4
; CHECK-NEXT: [[X2_I:%.*]] = add i32 [[X1_I]], [[A3_I]]
; CHECK-NEXT: [[A4_I:%.*]] = load volatile i32, ptr [[P]], align 4
; CHECK-NEXT: [[X3_I:%.*]] = add i32 [[X2_I]], [[A4_I]]
; CHECK-NEXT: [[A5_I:%.*]] = load volatile i32, ptr [[P]], align 4
; CHECK-NEXT: [[X4_I:%.*]] = add i32 [[X3_I]], [[A5_I]]
; CHECK-NEXT: br label %[[CALLEE_EXIT]]
; CHECK: [[CALLEE_EXIT]]:
; CHECK-NEXT: [[R1:%.*]] = phi i32 [ [[V_I]], %[[FAST_I]] ], [ [[X4_I]], %[[SLOW_I]] ]
; CHECK-NEXT: ret i32 [[R1]]
;
entry:
%p = alloca i32
call void @init_arg(ptr %p, i32 %v)
%mode = load i32, ptr %p
%r = call i32 @callee(i32 %mode, ptr %p)
ret i32 %r
}
; Negative: available value has a type different from the load's and is not
; a null value (store i64 42, load ptr).
; The type-mismatch blocks forwarding of a non-zero integer as a pointer
; (only null values may be retyped), so the load and the call to
; @recursive_callee both survive, as the CHECK lines show.
define void @caller_type_mismatch() {
; CHECK-LABEL: define void @caller_type_mismatch() {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[P:%.*]] = alloca ptr, align 8
; CHECK-NEXT: store i64 42, ptr [[P]], align 4
; CHECK-NEXT: [[X:%.*]] = load ptr, ptr [[P]], align 8
; CHECK-NEXT: call void @recursive_callee(ptr [[X]])
; CHECK-NEXT: ret void
;
entry:
%p = alloca ptr
call void @init_i64_nonzero(ptr %p)
%x = load ptr, ptr %p
call void @recursive_callee(ptr %x)
ret void
}
; Negative: a volatile load must not be forwarded (the isSimple() check),
; so %mode cannot be resolved to a constant. Note that the regular cost
; model still inlines @callee here, but with BOTH paths intact (see the
; phi in the CHECK lines) — the volatile %mode load itself survives.
define i32 @caller_volatile_load() {
; CHECK-LABEL: define i32 @caller_volatile_load() {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[P:%.*]] = alloca i32, align 4
; CHECK-NEXT: store i32 0, ptr [[P]], align 4
; CHECK-NEXT: [[MODE:%.*]] = load volatile i32, ptr [[P]], align 4
; CHECK-NEXT: [[CMP_I:%.*]] = icmp eq i32 [[MODE]], 0
; CHECK-NEXT: br i1 [[CMP_I]], label %[[FAST_I:.*]], label %[[SLOW_I:.*]]
; CHECK: [[FAST_I]]:
; CHECK-NEXT: [[V_I:%.*]] = load i32, ptr [[P]], align 4
; CHECK-NEXT: br label %[[CALLEE_EXIT:.*]]
; CHECK: [[SLOW_I]]:
; CHECK-NEXT: [[A1_I:%.*]] = load volatile i32, ptr [[P]], align 4
; CHECK-NEXT: [[A2_I:%.*]] = load volatile i32, ptr [[P]], align 4
; CHECK-NEXT: [[X1_I:%.*]] = add i32 [[A1_I]], [[A2_I]]
; CHECK-NEXT: [[A3_I:%.*]] = load volatile i32, ptr [[P]], align 4
; CHECK-NEXT: [[X2_I:%.*]] = add i32 [[X1_I]], [[A3_I]]
; CHECK-NEXT: [[A4_I:%.*]] = load volatile i32, ptr [[P]], align 4
; CHECK-NEXT: [[X3_I:%.*]] = add i32 [[X2_I]], [[A4_I]]
; CHECK-NEXT: [[A5_I:%.*]] = load volatile i32, ptr [[P]], align 4
; CHECK-NEXT: [[X4_I:%.*]] = add i32 [[X3_I]], [[A5_I]]
; CHECK-NEXT: br label %[[CALLEE_EXIT]]
; CHECK: [[CALLEE_EXIT]]:
; CHECK-NEXT: [[R1:%.*]] = phi i32 [ [[V_I]], %[[FAST_I]] ], [ [[X4_I]], %[[SLOW_I]] ]
; CHECK-NEXT: ret i32 [[R1]]
;
entry:
%p = alloca i32
call void @init_i32(ptr %p)
%mode = load volatile i32, ptr %p
%r = call i32 @callee(i32 %mode, ptr %p)
ret i32 %r
}

View File

@@ -1,10 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -passes="default<O3>" < %s | FileCheck %s
; FIXME: Mirrors missing optimization on empty std::set
; The constructor stores null, so the destructor's erase call
; should be simplified to a no-op, making the entire function ret void.
; Test that the CGSCC inliner forwards stores to load arguments after
; inlining. This addresses a phase ordering issue: the constructor is
; inlined first (producing stores), and then the destructor's load
@@ -40,36 +36,6 @@ define internal void @dtor(ptr %this) {
; With null: icmp + branch to done → trivially cheap.
; With unknown ptr: recursive calls + external calls → too expensive.
define internal void @erase(ptr %node) {
; CHECK-LABEL: define internal fastcc void @erase(
; CHECK-SAME: ptr captures(address_is_null) [[NODE:%.*]]) unnamed_addr {
; CHECK-NEXT: [[IS_NULL:%.*]] = icmp eq ptr [[NODE]], null
; CHECK-NEXT: br i1 [[IS_NULL]], label %[[COMMON_RET1:.*]], label %[[RECURSE:.*]]
; CHECK: [[COMMON_RET1]]:
; CHECK-NEXT: ret void
; CHECK: [[RECURSE]]:
; CHECK-NEXT: [[LEFT_VAL:%.*]] = load ptr, ptr [[NODE]], align 8
; CHECK-NEXT: tail call fastcc void @erase(ptr [[LEFT_VAL]])
; CHECK-NEXT: [[RIGHT_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[NODE]], i64 8
; CHECK-NEXT: [[RIGHT_VAL:%.*]] = load ptr, ptr [[RIGHT_PTR]], align 8
; CHECK-NEXT: tail call fastcc void @erase(ptr [[RIGHT_VAL]])
; CHECK-NEXT: [[D1:%.*]] = getelementptr inbounds nuw i8, ptr [[NODE]], i64 16
; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[D1]], align 8
; CHECK-NEXT: tail call void @use(i64 [[V1]])
; CHECK-NEXT: [[D2:%.*]] = getelementptr inbounds nuw i8, ptr [[NODE]], i64 24
; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[D2]], align 8
; CHECK-NEXT: tail call void @use(i64 [[V2]])
; CHECK-NEXT: [[D3:%.*]] = getelementptr inbounds nuw i8, ptr [[NODE]], i64 32
; CHECK-NEXT: [[V3:%.*]] = load i64, ptr [[D3]], align 8
; CHECK-NEXT: tail call void @use(i64 [[V3]])
; CHECK-NEXT: [[D4:%.*]] = getelementptr inbounds nuw i8, ptr [[NODE]], i64 40
; CHECK-NEXT: [[V4:%.*]] = load i64, ptr [[D4]], align 8
; CHECK-NEXT: tail call void @use(i64 [[V4]])
; CHECK-NEXT: [[D5:%.*]] = getelementptr inbounds nuw i8, ptr [[NODE]], i64 48
; CHECK-NEXT: [[V5:%.*]] = load i64, ptr [[D5]], align 8
; CHECK-NEXT: tail call void @use(i64 [[V5]])
; CHECK-NEXT: tail call void @free(ptr nonnull [[NODE]])
; CHECK-NEXT: br label %[[COMMON_RET1]]
;
%is_null = icmp eq ptr %node, null
br i1 %is_null, label %done, label %recurse
@@ -102,8 +68,8 @@ done:
}
define void @test_empty_tree() {
; CHECK-LABEL: define void @test_empty_tree() local_unnamed_addr {
; CHECK-NEXT: tail call fastcc void @erase(ptr null)
; CHECK-LABEL: define void @test_empty_tree(
; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: ret void
;
%tree = alloca ptr, align 8