[ELF] Parallelize --gc-sections mark phase (#189321)

Add `markParallel` using level-synchronized `parallelFor`. Each BFS
level is processed in parallel; newly discovered sections are collected
in per-thread queues and merged for the next level.

The parallel path is used when `!TrackWhyLive && partitions.size()==1`.
`parallelFor` naturally degrades to serial when `--threads=1`.

Uses depth-limited inline recursion (depth<3) and optimistic
load-then-exchange dedup for best performance.

Linking a Release+Asserts clang (--gc-sections, --time-trace) on an old
x86-64:

8 threads: markLive 315ms -> 82ms (-234ms). Total 1562ms -> 1350ms
(1.16x).
16 threads: markLive 199ms -> 50ms (-149ms). Total 1017ms -> 862ms
(1.18x).

and on Apple M4: markLive 61ms -> 13ms. Total 317.3ms -> 272.7ms
(1.16x).
This commit is contained in:
Fangrui Song
2026-04-01 23:42:00 -07:00
committed by GitHub
parent 083f9c158a
commit 6f9646a598

View File

@@ -30,6 +30,7 @@
#include "lld/Common/Strings.h"
#include "llvm/ADT/DenseMapInfoVariant.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Parallel.h"
#include "llvm/Support/TimeProfiler.h"
#include <variant>
#include <vector>
@@ -66,6 +67,7 @@ private:
LiveReason reason);
void markSymbol(Symbol *sym, StringRef reason);
void mark();
void markParallel();
template <class RelTy>
void resolveReloc(InputSectionBase &sec, const RelTy &rel, bool fromFDE);
@@ -460,7 +462,12 @@ void MarkLive<ELFT, TrackWhyLive>::run() {
template <class ELFT, bool TrackWhyLive>
void MarkLive<ELFT, TrackWhyLive>::mark() {
// Mark all reachable sections.
if constexpr (!TrackWhyLive) {
if (ctx.partitions.size() == 1) {
markParallel();
return;
}
}
while (!queue.empty()) {
InputSectionBase &sec = *queue.pop_back_val();
@@ -483,6 +490,97 @@ void MarkLive<ELFT, TrackWhyLive>::mark() {
}
}
// Helper function for markParallel. Walk all GC edges from sec, marking
// everything that needs to be live. Call fn(target section, offset) for each
// edge, which will mark the section live and handle further processing of edges
// from that section.
//
// Edges come from four places: Rel/Rela/Crel relocations, dependentSections,
// and nextInSectionGroup. cNamedSections maps a symbol name to the special
// C-named sections (e.g. __start_/__stop_ style references) that the symbol
// keeps alive; those are reported with offset 0.
//
// NOTE(review): this runs concurrently from markParallel, so everything it
// mutates must be safe under unsynchronized parallel calls — the USED flag
// write and the merge-piece live bit below; fn is responsible for its own
// thread safety. Presumably sym.setFlags is atomic — confirm in Symbols.h.
template <class ELFT, class Fn>
static void processSectionEdges(
    Ctx &ctx, InputSectionBase &sec,
    const DenseMap<StringRef, SmallVector<InputSectionBase *, 0>>
        &cNamedSections,
    Fn fn) {
  // Process a single relocation: mark the target symbol USED and forward the
  // (section, offset) edge to fn.
  auto resolveEdge = [&](const auto &rel) {
    Symbol &sym = sec.file->getRelocTargetSym(rel);
    // Check before setting to avoid a redundant write (and, in the parallel
    // caller, needless cache-line contention) when the flag is already set.
    if (!sym.hasFlag(USED))
      sym.setFlags(USED);
    if (auto *d = dyn_cast<Defined>(&sym)) {
      if (auto *relSec = dyn_cast_or_null<InputSectionBase>(d->section)) {
        uint64_t offset = d->value;
        if (d->isSection()) {
          // For section symbols the addend selects the referenced location.
          offset += getAddend<ELFT>(ctx, sec, rel);
          // An out-of-bounds offset into a merge section (e.g. a reference
          // past the end) marks nothing; drop the edge.
          if (auto *ms = dyn_cast<MergeInputSection>(relSec);
              ms && offset >= ms->content().size())
            return;
        }
        if (auto *ms = dyn_cast<MergeInputSection>(relSec)) {
          // Mark the individual section piece live. The live flag is a bit in
          // the 32-bit word that follows piece.inputOff; which bit it is
          // within that word depends on host endianness, hence the
          // IsBigEndianHost selection. fetch_or makes this safe against
          // concurrent marking of pieces sharing the same word.
          // NOTE(review): assumes SectionPiece's layout keeps the live
          // bitfield in the word right after inputOff — verify if
          // SectionPiece changes.
          auto &piece = ms->getSectionPiece(offset);
          auto *word =
              reinterpret_cast<std::atomic<uint32_t> *>(&piece.inputOff + 1);
          constexpr uint32_t liveBit = sys::IsBigEndianHost ? (1U << 31) : 1U;
          word->fetch_or(liveBit, std::memory_order_relaxed);
        }
        fn(relSec, offset);
      }
      return;
    }
    // Undefined/shared symbol: it may still keep C-named sections alive.
    for (InputSectionBase *csec : cNamedSections.lookup(sym.getName()))
      fn(csec, 0);
  };
  // Exactly one of rels/relas/crels is non-empty for a given section.
  const RelsOrRelas<ELFT> rels = sec.template relsOrRelas<ELFT>();
  for (const typename ELFT::Rel &rel : rels.rels)
    resolveEdge(rel);
  for (const typename ELFT::Rela &rel : rels.relas)
    resolveEdge(rel);
  for (const typename ELFT::Crel &rel : rels.crels)
    resolveEdge(rel);
  // Non-relocation edges: sections this one depends on, and the next member
  // of the same section group (groups live or die as a unit).
  for (InputSectionBase *isec : sec.dependentSections)
    fn(isec, 0);
  if (sec.nextInSectionGroup)
    fn(sec.nextInSectionGroup, 0);
}
// Parallel mark using level-synchronized BFS with depth-limited inline
// recursion. Each parallelFor iteration processes a subtree up to depth 3
// (DFS for cache locality), then queues deeper discoveries for the next level.
//
// Only reached when ctx.partitions.size() == 1 (see mark()), which lets
// section->partition double as the "visited/live" flag: 0 = dead/unvisited,
// 1 = live. The roots are the sections already in `queue` when we get here.
template <class ELFT, bool TrackWhyLive>
void MarkLive<ELFT, TrackWhyLive>::markParallel() {
  const size_t numThreads = parallel::getThreadCount();
  // Walk the edges of sec. Newly discovered sections are either recursed into
  // (shallow) or pushed onto this worker's localQueue for the next BFS level.
  // `self` is the lambda itself, passed explicitly to allow recursion.
  auto visit = [&](InputSection *sec, int depth,
                   SmallVector<InputSection *, 0> &localQueue,
                   auto &self) -> void {
    processSectionEdges<ELFT>(
        ctx, *sec, cNamedSections,
        // `offset` is unused here; it exists to satisfy the fn(section,
        // offset) edge-callback contract of processSectionEdges.
        [&](InputSectionBase *target, uint64_t offset) {
          auto &part =
              reinterpret_cast<std::atomic<uint8_t> &>(target->partition);
          // Optimistic load-then-exchange avoids expensive atomic
          // RMW on already-visited sections.
          // The exchange arbitrates races: exactly one thread sees 0 and
          // claims the section; everyone else returns early.
          if (part.load(std::memory_order_relaxed) != 0 ||
              part.exchange(1, std::memory_order_relaxed) != 0)
            return;
          // Only InputSections have outgoing edges to follow; other
          // InputSectionBase kinds are done once marked live above.
          if (auto *s = dyn_cast<InputSection>(target)) {
            if (depth < 3)
              self(s, depth + 1, localQueue, self);
            else
              localQueue.push_back(s);
          }
        });
  };
  // One BFS level per outer iteration: process the current frontier in
  // parallel, then merge the per-thread discoveries into the next frontier.
  while (!queue.empty()) {
    // Per-thread output queues so workers never contend on a shared vector.
    auto queues =
        std::make_unique<SmallVector<InputSection *, 0>[]>(numThreads);
    parallelFor(0, queue.size(), [&](size_t i) {
      const unsigned tid = parallel::getThreadIndex();
      visit(queue[i], 0, queues[tid], visit);
    });
    // Serial merge: build the next level's frontier from all workers.
    queue.clear();
    for (size_t t = 0; t < numThreads; ++t)
      queue.append(std::move(queues[t]));
  }
}
// Move the sections for some symbols to the main partition, specifically ifuncs
// (because they can result in an IRELATIVE being added to the main partition's
// GOT, which means that the ifunc must be available when the main partition is
@@ -527,8 +625,8 @@ template <class ELFT> void elf::markLive(Ctx &ctx) {
return;
}
for (InputSectionBase *sec : ctx.inputSections)
sec->markDead();
parallelForEach(ctx.inputSections,
[](InputSectionBase *sec) { sec->markDead(); });
// Follow the graph to mark all live sections.
for (unsigned i = 1, e = ctx.partitions.size(); i <= e; ++i)