[ELF] Parallelize --gc-sections mark phase (#189321)
Add `markParallel` using level-synchronized `parallelFor`. Each BFS level is processed in parallel; newly discovered sections are collected in per-thread queues and merged for the next level. The parallel path is used when `!TrackWhyLive && partitions.size()==1`. `parallelFor` naturally degrades to serial when `--threads=1`. Uses depth-limited inline recursion (depth<3) and optimistic load-then-exchange dedup for best performance. Linking a Release+Asserts clang (--gc-sections, --time-trace) on an old x86-64: 8 threads: markLive 315ms -> 82ms (-234ms). Total 1562ms -> 1350ms (1.16x). 16 threads: markLive 199ms -> 50ms (-149ms). Total 1017ms -> 862ms (1.18x). and on Apple M4: markLive 61ms -> 13ms. Total 317.3ms -> 272.7ms (1.16x).
This commit is contained in:
@@ -30,6 +30,7 @@
|
||||
#include "lld/Common/Strings.h"
|
||||
#include "llvm/ADT/DenseMapInfoVariant.h"
|
||||
#include "llvm/ADT/STLExtras.h"
|
||||
#include "llvm/Support/Parallel.h"
|
||||
#include "llvm/Support/TimeProfiler.h"
|
||||
#include <variant>
|
||||
#include <vector>
|
||||
@@ -66,6 +67,7 @@ private:
|
||||
LiveReason reason);
|
||||
void markSymbol(Symbol *sym, StringRef reason);
|
||||
void mark();
|
||||
void markParallel();
|
||||
|
||||
template <class RelTy>
|
||||
void resolveReloc(InputSectionBase &sec, const RelTy &rel, bool fromFDE);
|
||||
@@ -460,7 +462,12 @@ void MarkLive<ELFT, TrackWhyLive>::run() {
|
||||
|
||||
template <class ELFT, bool TrackWhyLive>
|
||||
void MarkLive<ELFT, TrackWhyLive>::mark() {
|
||||
// Mark all reachable sections.
|
||||
if constexpr (!TrackWhyLive) {
|
||||
if (ctx.partitions.size() == 1) {
|
||||
markParallel();
|
||||
return;
|
||||
}
|
||||
}
|
||||
while (!queue.empty()) {
|
||||
InputSectionBase &sec = *queue.pop_back_val();
|
||||
|
||||
@@ -483,6 +490,97 @@ void MarkLive<ELFT, TrackWhyLive>::mark() {
|
||||
}
|
||||
}
|
||||
|
||||
// Helper function for markParallel. Walk all GC edges from sec, marking
|
||||
// everything that needs to be live. Call fn(target section, offset) for each
|
||||
// edge, which will mark the section live and handle further processing of edges
|
||||
// from that section.
|
||||
template <class ELFT, class Fn>
|
||||
static void processSectionEdges(
|
||||
Ctx &ctx, InputSectionBase &sec,
|
||||
const DenseMap<StringRef, SmallVector<InputSectionBase *, 0>>
|
||||
&cNamedSections,
|
||||
Fn fn) {
|
||||
auto resolveEdge = [&](const auto &rel) {
|
||||
Symbol &sym = sec.file->getRelocTargetSym(rel);
|
||||
if (!sym.hasFlag(USED))
|
||||
sym.setFlags(USED);
|
||||
if (auto *d = dyn_cast<Defined>(&sym)) {
|
||||
if (auto *relSec = dyn_cast_or_null<InputSectionBase>(d->section)) {
|
||||
uint64_t offset = d->value;
|
||||
if (d->isSection()) {
|
||||
offset += getAddend<ELFT>(ctx, sec, rel);
|
||||
if (auto *ms = dyn_cast<MergeInputSection>(relSec);
|
||||
ms && offset >= ms->content().size())
|
||||
return;
|
||||
}
|
||||
if (auto *ms = dyn_cast<MergeInputSection>(relSec)) {
|
||||
auto &piece = ms->getSectionPiece(offset);
|
||||
auto *word =
|
||||
reinterpret_cast<std::atomic<uint32_t> *>(&piece.inputOff + 1);
|
||||
constexpr uint32_t liveBit = sys::IsBigEndianHost ? (1U << 31) : 1U;
|
||||
word->fetch_or(liveBit, std::memory_order_relaxed);
|
||||
}
|
||||
fn(relSec, offset);
|
||||
}
|
||||
return;
|
||||
}
|
||||
for (InputSectionBase *csec : cNamedSections.lookup(sym.getName()))
|
||||
fn(csec, 0);
|
||||
};
|
||||
const RelsOrRelas<ELFT> rels = sec.template relsOrRelas<ELFT>();
|
||||
for (const typename ELFT::Rel &rel : rels.rels)
|
||||
resolveEdge(rel);
|
||||
for (const typename ELFT::Rela &rel : rels.relas)
|
||||
resolveEdge(rel);
|
||||
for (const typename ELFT::Crel &rel : rels.crels)
|
||||
resolveEdge(rel);
|
||||
for (InputSectionBase *isec : sec.dependentSections)
|
||||
fn(isec, 0);
|
||||
if (sec.nextInSectionGroup)
|
||||
fn(sec.nextInSectionGroup, 0);
|
||||
}
|
||||
|
||||
// Parallel mark using level-synchronized BFS with depth-limited inline
|
||||
// recursion. Each parallelFor iteration processes a subtree up to depth 3
|
||||
// (DFS for cache locality), then queues deeper discoveries for the next level.
|
||||
template <class ELFT, bool TrackWhyLive>
|
||||
void MarkLive<ELFT, TrackWhyLive>::markParallel() {
|
||||
const size_t numThreads = parallel::getThreadCount();
|
||||
auto visit = [&](InputSection *sec, int depth,
|
||||
SmallVector<InputSection *, 0> &localQueue,
|
||||
auto &self) -> void {
|
||||
processSectionEdges<ELFT>(
|
||||
ctx, *sec, cNamedSections,
|
||||
[&](InputSectionBase *target, uint64_t offset) {
|
||||
auto &part =
|
||||
reinterpret_cast<std::atomic<uint8_t> &>(target->partition);
|
||||
// Optimistic load-then-exchange avoids expensive atomic
|
||||
// RMW on already-visited sections.
|
||||
if (part.load(std::memory_order_relaxed) != 0 ||
|
||||
part.exchange(1, std::memory_order_relaxed) != 0)
|
||||
return;
|
||||
if (auto *s = dyn_cast<InputSection>(target)) {
|
||||
if (depth < 3)
|
||||
self(s, depth + 1, localQueue, self);
|
||||
else
|
||||
localQueue.push_back(s);
|
||||
}
|
||||
});
|
||||
};
|
||||
|
||||
while (!queue.empty()) {
|
||||
auto queues =
|
||||
std::make_unique<SmallVector<InputSection *, 0>[]>(numThreads);
|
||||
parallelFor(0, queue.size(), [&](size_t i) {
|
||||
const unsigned tid = parallel::getThreadIndex();
|
||||
visit(queue[i], 0, queues[tid], visit);
|
||||
});
|
||||
queue.clear();
|
||||
for (size_t t = 0; t < numThreads; ++t)
|
||||
queue.append(std::move(queues[t]));
|
||||
}
|
||||
}
|
||||
|
||||
// Move the sections for some symbols to the main partition, specifically ifuncs
|
||||
// (because they can result in an IRELATIVE being added to the main partition's
|
||||
// GOT, which means that the ifunc must be available when the main partition is
|
||||
@@ -527,8 +625,8 @@ template <class ELFT> void elf::markLive(Ctx &ctx) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (InputSectionBase *sec : ctx.inputSections)
|
||||
sec->markDead();
|
||||
parallelForEach(ctx.inputSections,
|
||||
[](InputSectionBase *sec) { sec->markDead(); });
|
||||
|
||||
// Follow the graph to mark all live sections.
|
||||
for (unsigned i = 1, e = ctx.partitions.size(); i <= e; ++i)
|
||||
|
||||
Reference in New Issue
Block a user