[ELF] Parallelize --gc-sections mark phase (#189321)

Add `markParallel` using level-synchronized `parallelFor`. Each BFS
level is processed in parallel; newly discovered sections are collected
in per-thread queues and merged for the next level.

The parallel path is used when `!TrackWhyLive && partitions.size()==1`.
`parallelFor` naturally degrades to serial when `--threads=1`.

Uses depth-limited inline recursion (depth<3) and optimistic
load-then-exchange dedup for best performance.

Linking a Release+Asserts clang (--gc-sections, --time-trace) on an old
x86-64:

8 threads: markLive 315ms -> 82ms (-234ms). Total 1562ms -> 1350ms
(1.16x).
16 threads: markLive 199ms -> 50ms (-149ms). Total 1017ms -> 862ms
(1.18x).

and on Apple M4: markLive 61ms -> 13ms. Total 317.3ms -> 272.7ms
(1.16x).
This commit is contained in:
Fangrui Song
2026-04-01 23:42:00 -07:00
committed by GitHub
parent 083f9c158a
commit 6f9646a598

View File

@@ -30,6 +30,7 @@
#include "lld/Common/Strings.h"
#include "llvm/ADT/DenseMapInfoVariant.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Parallel.h"
#include "llvm/Support/TimeProfiler.h"
#include <variant>
#include <vector>
@@ -66,6 +67,7 @@ private:
LiveReason reason);
void markSymbol(Symbol *sym, StringRef reason);
void mark();
void markParallel();
template <class RelTy>
void resolveReloc(InputSectionBase &sec, const RelTy &rel, bool fromFDE);
@@ -460,7 +462,12 @@ void MarkLive<ELFT, TrackWhyLive>::run() {
template <class ELFT, bool TrackWhyLive>
void MarkLive<ELFT, TrackWhyLive>::mark() {
// Mark all reachable sections.
if constexpr (!TrackWhyLive) {
if (ctx.partitions.size() == 1) {
markParallel();
return;
}
}
while (!queue.empty()) {
InputSectionBase &sec = *queue.pop_back_val();
@@ -483,6 +490,97 @@ void MarkLive<ELFT, TrackWhyLive>::mark() {
}
}
// Helper function for markParallel. Walk all GC edges from sec, marking
// everything that needs to be live. Call fn(target section, offset) for each
// edge, which will mark the section live and handle further processing of edges
// from that section.
//
// Edges come from four places: Rel/Rela/Crel relocations, dependentSections,
// and nextInSectionGroup. cNamedSections maps a symbol name to the special
// C-named sections (e.g. __start_/__stop_ style references) that the symbol
// keeps alive; those are reported with offset 0.
//
// NOTE(review): this runs concurrently from markParallel, so everything it
// mutates must be safe under unsynchronized parallel calls — the USED flag
// write and the merge-piece live bit below; fn is responsible for its own
// thread safety. Presumably sym.setFlags is atomic — confirm in Symbols.h.
template <class ELFT, class Fn>
static void processSectionEdges(
    Ctx &ctx, InputSectionBase &sec,
    const DenseMap<StringRef, SmallVector<InputSectionBase *, 0>>
        &cNamedSections,
    Fn fn) {
  // Process a single relocation: mark the target symbol USED and forward the
  // (section, offset) edge to fn.
  auto resolveEdge = [&](const auto &rel) {
    Symbol &sym = sec.file->getRelocTargetSym(rel);
    // Check before setting to avoid a redundant write (and, in the parallel
    // caller, needless cache-line contention) when the flag is already set.
    if (!sym.hasFlag(USED))
      sym.setFlags(USED);
    if (auto *d = dyn_cast<Defined>(&sym)) {
      if (auto *relSec = dyn_cast_or_null<InputSectionBase>(d->section)) {
        uint64_t offset = d->value;
        if (d->isSection()) {
          // For section symbols the addend selects the referenced location.
          offset += getAddend<ELFT>(ctx, sec, rel);
          // An out-of-bounds offset into a merge section (e.g. a reference
          // past the end) marks nothing; drop the edge.
          if (auto *ms = dyn_cast<MergeInputSection>(relSec);
              ms && offset >= ms->content().size())
            return;
        }
        if (auto *ms = dyn_cast<MergeInputSection>(relSec)) {
          // Mark the individual section piece live. The live flag is a bit in
          // the 32-bit word that follows piece.inputOff; which bit it is
          // within that word depends on host endianness, hence the
          // IsBigEndianHost selection. fetch_or makes this safe against
          // concurrent marking of pieces sharing the same word.
          // NOTE(review): assumes SectionPiece's layout keeps the live
          // bitfield in the word right after inputOff — verify if
          // SectionPiece changes.
          auto &piece = ms->getSectionPiece(offset);
          auto *word =
              reinterpret_cast<std::atomic<uint32_t> *>(&piece.inputOff + 1);
          constexpr uint32_t liveBit = sys::IsBigEndianHost ? (1U << 31) : 1U;
          word->fetch_or(liveBit, std::memory_order_relaxed);
        }
        fn(relSec, offset);
      }
      return;
    }
    // Undefined/shared symbol: it may still keep C-named sections alive.
    for (InputSectionBase *csec : cNamedSections.lookup(sym.getName()))
      fn(csec, 0);
  };
  // Exactly one of rels/relas/crels is non-empty for a given section.
  const RelsOrRelas<ELFT> rels = sec.template relsOrRelas<ELFT>();
  for (const typename ELFT::Rel &rel : rels.rels)
    resolveEdge(rel);
  for (const typename ELFT::Rela &rel : rels.relas)
    resolveEdge(rel);
  for (const typename ELFT::Crel &rel : rels.crels)
    resolveEdge(rel);
  // Non-relocation edges: sections this one depends on, and the next member
  // of the same section group (groups live or die as a unit).
  for (InputSectionBase *isec : sec.dependentSections)
    fn(isec, 0);
  if (sec.nextInSectionGroup)
    fn(sec.nextInSectionGroup, 0);
}
// Parallel mark using level-synchronized BFS with depth-limited inline
// recursion. Each parallelFor iteration processes a subtree up to depth 3
// (DFS for cache locality), then queues deeper discoveries for the next level.
//
// Only reached when ctx.partitions.size() == 1 (see mark()), which lets
// section->partition double as the "visited/live" flag: 0 = dead/unvisited,
// 1 = live. The roots are the sections already in `queue` when we get here.
template <class ELFT, bool TrackWhyLive>
void MarkLive<ELFT, TrackWhyLive>::markParallel() {
  const size_t numThreads = parallel::getThreadCount();
  // Walk the edges of sec. Newly discovered sections are either recursed into
  // (shallow) or pushed onto this worker's localQueue for the next BFS level.
  // `self` is the lambda itself, passed explicitly to allow recursion.
  auto visit = [&](InputSection *sec, int depth,
                   SmallVector<InputSection *, 0> &localQueue,
                   auto &self) -> void {
    processSectionEdges<ELFT>(
        ctx, *sec, cNamedSections,
        // `offset` is unused here; it exists to satisfy the fn(section,
        // offset) edge-callback contract of processSectionEdges.
        [&](InputSectionBase *target, uint64_t offset) {
          auto &part =
              reinterpret_cast<std::atomic<uint8_t> &>(target->partition);
          // Optimistic load-then-exchange avoids expensive atomic
          // RMW on already-visited sections.
          // The exchange arbitrates races: exactly one thread sees 0 and
          // claims the section; everyone else returns early.
          if (part.load(std::memory_order_relaxed) != 0 ||
              part.exchange(1, std::memory_order_relaxed) != 0)
            return;
          // Only InputSections have outgoing edges to follow; other
          // InputSectionBase kinds are done once marked live above.
          if (auto *s = dyn_cast<InputSection>(target)) {
            if (depth < 3)
              self(s, depth + 1, localQueue, self);
            else
              localQueue.push_back(s);
          }
        });
  };
  // One BFS level per outer iteration: process the current frontier in
  // parallel, then merge the per-thread discoveries into the next frontier.
  while (!queue.empty()) {
    // Per-thread output queues so workers never contend on a shared vector.
    auto queues =
        std::make_unique<SmallVector<InputSection *, 0>[]>(numThreads);
    parallelFor(0, queue.size(), [&](size_t i) {
      const unsigned tid = parallel::getThreadIndex();
      visit(queue[i], 0, queues[tid], visit);
    });
    // Serial merge: build the next level's frontier from all workers.
    queue.clear();
    for (size_t t = 0; t < numThreads; ++t)
      queue.append(std::move(queues[t]));
  }
}
// Move the sections for some symbols to the main partition, specifically ifuncs
// (because they can result in an IRELATIVE being added to the main partition's
// GOT, which means that the ifunc must be available when the main partition is
@@ -527,8 +625,8 @@ template <class ELFT> void elf::markLive(Ctx &ctx) {
return;
}
for (InputSectionBase *sec : ctx.inputSections)
sec->markDead();
parallelForEach(ctx.inputSections,
[](InputSectionBase *sec) { sec->markDead(); });
// Follow the graph to mark all live sections.
for (unsigned i = 1, e = ctx.partitions.size(); i <= e; ++i)