[ELF] Parallelize input file loading (#191690)

During `createFiles`, `addFile()` records a `LoadJob` for each
non-script input (archive, relocatable, DSO, bitcode, binary) with a
state-machine snapshot (`inWholeArchive`, `inLib`, `asNeeded`,
`withLOption`, `groupId`) and expands them on worker threads in
`loadFiles()`. Linker scripts are still processed inline since their
`INPUT()` and `GROUP()` commands recursively call `addFile()`.

Outside `createFiles()`, `loadFiles()` is called with a single job and
drained immediately (`deferLoad` is false). Two cases:
- `addDependentLibrary()`: `.deplibs` sections trigger `addFile()`
  during the serial `doParseFiles()` loop.
- `--just-symbols`: pushes files directly, bypassing
  `addFile`/`LoadJob`.

Thread-safety:
- A mutex serializes `BitcodeFile` / fatLTO constructors that call
  `ctx.saver` / `ctx.uniqueSaver`. Zero contention on pure ELF links.
- Thin-archive member buffers accumulate in per-job `SmallVector`s and
  are merged into `ctx.memoryBuffers` in command-line order.
- `groupId` is pre-claimed during the serial walk and written to each
  produced file after construction (the `InputFile` constructor no
  longer reads `nextGroupId`).

Performance (--threads=8):

```
  clang-relassert (267 thin archives, 10 .o, 2 .so):
    965 +/- 32 ms -> 924 +/- 24 ms (1.05x, 80 runs)

    (Apple M4) 249.7ms ± 2.5ms -> 221.2ms ± 1.4ms (1.13x, 10 runs)

  chromium (532 .a, 3314 .o, 343 .so):
    8.071 +/- 0.472 s -> 7.370 +/- 0.198 s (1.10x, 20 runs)
```

Parallelizing all file kinds (not just archives) matters for
.o-dominated workloads like chromium where archive-only parallelization
shows no improvement.

Output is byte-identical to the old lld and deterministic across
`--threads` values.
This commit is contained in:
Fangrui Song
2026-04-20 21:07:34 -07:00
committed by GitHub
parent b2d7d892c9
commit 83f8eee57d
4 changed files with 177 additions and 107 deletions

View File

@@ -175,6 +175,21 @@ struct VersionDefinition {
SmallVector<SymbolVersion, 0> localPatterns;
};
// Deferred file-load job: one per input, expanded by loadFiles().
// addFile() fills in the input description and the option-state snapshot;
// loadFiles() fills in `out` and `thinBufs`. addFile() builds jobs with
// positional brace initializers, so the member order below matters.
struct LoadJob {
// Kind of input; selects which branch of loadFiles() expands the job.
enum Kind : uint8_t { Obj, Bitcode, Archive, Shared, Binary };
// Contents of the input file.
llvm::MemoryBufferRef mbref;
// Path given to addFile(). For archives, loadFiles() passes it to the member
// files as the archive name and uses it in diagnostics and archiveFiles.
llvm::StringRef path;
Kind kind;
// Snapshot of the --whole-archive state. When set, archive members are
// created non-lazily and members of any type are loaded.
bool inWholeArchive;
// Snapshot of the --start-lib state (addFile() stores inLib here). Passed as
// the `lazy` argument for Obj and Bitcode jobs.
bool lazy;
// Snapshot of ctx.arg.asNeeded; a Shared job sets isNeeded to its negation.
bool asNeeded;
// If true, a Shared job uses only the filename part of the buffer identifier
// as the name handed to SharedFile.
bool withLOption;
// Group ID captured when the job was recorded; loadFiles() writes it to
// every file in `out`.
uint32_t groupId;
// Files produced by expanding this job, in creation order.
SmallVector<std::unique_ptr<InputFile>, 0> out;
// Buffers taken from a thin archive (see getArchiveMembers()); loadFiles()
// moves them into ctx.memoryBuffers.
std::vector<std::unique_ptr<llvm::MemoryBuffer>> thinBufs;
};
class LinkerDriver {
public:
LinkerDriver(Ctx &ctx);
@@ -186,17 +201,20 @@ public:
private:
Ctx &ctx;
void createFiles(llvm::opt::InputArgList &args);
void loadFiles();
void inferMachineType();
template <class ELFT> void link(llvm::opt::InputArgList &args);
template <class ELFT> void compileBitcodeFiles(bool skipLinkedOutput);
bool tryAddFatLTOFile(MemoryBufferRef mb, StringRef archiveName,
uint64_t offsetInArchive, bool lazy);
// True if we are between --whole-archive and --no-whole-archive.
bool inWholeArchive = false;
// True if we are between --start-lib and --end-lib.
bool inLib = false;
// True inside createFiles(): defers to loadFiles().
bool deferLoad = false;
SmallVector<LoadJob, 0> loadJobs;
std::unique_ptr<BitcodeCompiler> lto;
SmallVector<std::unique_ptr<InputFile>, 0> files, ltoObjectFiles;

View File

@@ -195,7 +195,8 @@ static std::tuple<ELFKind, uint16_t, uint8_t> parseEmulation(Ctx &ctx,
// Returns slices of MB by parsing MB as an archive file.
// Each slice consists of a member file in the archive.
std::vector<std::pair<MemoryBufferRef, uint64_t>> static getArchiveMembers(
Ctx &ctx, MemoryBufferRef mb) {
Ctx &ctx, LoadJob &job) {
MemoryBufferRef mb = job.mbref;
std::unique_ptr<Archive> file =
CHECK(Archive::create(mb),
mb.getBufferIdentifier() + ": failed to parse archive");
@@ -218,32 +219,16 @@ std::vector<std::pair<MemoryBufferRef, uint64_t>> static getArchiveMembers(
<< ": Archive::children failed: " << std::move(err);
// Take ownership of memory buffers created for members of thin archives.
std::vector<std::unique_ptr<MemoryBuffer>> mbs = file->takeThinBuffers();
std::move(mbs.begin(), mbs.end(), std::back_inserter(ctx.memoryBuffers));
job.thinBufs = file->takeThinBuffers();
return v;
}
// Returns true if the leading magic bytes of `mb` identify it as an LLVM
// bitcode file.
static bool isBitcode(MemoryBufferRef mb) {
  const auto detected = identify_magic(mb.getBuffer());
  return detected == llvm::file_magic::bitcode;
}
// Tries to load `mb` as a fat LTO object. When fat LTO objects are enabled
// and bitcode is found inside the buffer, a BitcodeFile flagged as a fat LTO
// object is appended to `files` and true is returned. Otherwise nothing is
// added and false is returned, leaving the caller to handle `mb` itself.
bool LinkerDriver::tryAddFatLTOFile(MemoryBufferRef mb, StringRef archiveName,
                                    uint64_t offsetInArchive, bool lazy) {
  if (ctx.arg.fatLTOObjects) {
    Expected<MemoryBufferRef> embedded =
        IRObjectFile::findBitcodeInMemBuffer(mb);
    // takeError() marks the Expected as checked; any lookup failure simply
    // means there is no embedded bitcode.
    const bool lookupFailed = errorToBool(embedded.takeError());
    if (!lookupFailed) {
      auto bitcodeFile = std::make_unique<BitcodeFile>(
          ctx, *embedded, archiveName, offsetInArchive, lazy);
      bitcodeFile->obj->fatLTOObject(true);
      files.push_back(std::move(bitcodeFile));
      return true;
    }
  }
  return false;
}
// Opens a file and creates a file object. Path has to be resolved already.
// Every input other than a linker script (including binary-format inputs) is
// recorded as a LoadJob; linker scripts are processed inline. Inside
// createFiles() jobs batch up and are expanded in parallel at the end. Outside
// createFiles() (e.g. addDependentLibrary during parseFiles) the single job is
// expanded immediately.
void LinkerDriver::addFile(StringRef path, bool withLOption) {
using namespace sys::fs;
@@ -253,93 +238,59 @@ void LinkerDriver::addFile(StringRef path, bool withLOption) {
MemoryBufferRef mbref = *buffer;
if (ctx.arg.formatBinary) {
files.push_back(std::make_unique<BinaryFile>(ctx, mbref));
if (!isInGroup)
++nextGroupId;
return;
}
switch (identify_magic(mbref.getBuffer())) {
case file_magic::unknown:
readLinkerScript(ctx, mbref);
return;
case file_magic::archive: {
auto members = getArchiveMembers(ctx, mbref);
if (inWholeArchive) {
for (const std::pair<MemoryBufferRef, uint64_t> &p : members) {
if (isBitcode(p.first))
files.push_back(std::make_unique<BitcodeFile>(ctx, p.first, path,
p.second, false));
else if (!tryAddFatLTOFile(p.first, path, p.second, false))
files.push_back(createObjFile(ctx, p.first, path));
loadJobs.push_back({mbref,
path,
LoadJob::Binary,
/*inWholeArchive=*/false,
/*lazy=*/false,
/*asNeeded=*/false,
/*withLOption=*/false,
nextGroupId,
{},
{}});
} else {
auto magic = identify_magic(mbref.getBuffer());
if (magic == file_magic::unknown) {
readLinkerScript(ctx, mbref);
return;
}
LoadJob::Kind kind;
switch (magic) {
case file_magic::archive:
kind = LoadJob::Archive;
break;
case file_magic::elf_relocatable:
kind = LoadJob::Obj;
break;
case file_magic::bitcode:
kind = LoadJob::Bitcode;
break;
case file_magic::elf_shared_object:
if (ctx.arg.isStatic) {
Err(ctx) << "attempted static link of dynamic object " << path;
return;
}
kind = LoadJob::Shared;
break;
default:
Err(ctx) << path << ": unknown file type";
return;
}
archiveFiles.emplace_back(path, members.size());
// Handle archives and --start-lib/--end-lib using the same code path. This
// scans all the ELF relocatable object files and bitcode files in the
// archive rather than just the index file, with the benefit that the
// symbols are only loaded once. For many projects archives see high
// utilization rates and it is a net performance win. --start-lib scans
// symbols in the same order that llvm-ar adds them to the index, so in the
// common case the semantics are identical. If the archive symbol table was
// created in a different order, or is incomplete, this strategy has
// different semantics. Such output differences are considered user error.
//
// All files within the archive get the same group ID to allow mutual
// references for --warn-backrefs.
SaveAndRestore saved(isInGroup, true);
for (const std::pair<MemoryBufferRef, uint64_t> &p : members) {
auto magic = identify_magic(p.first.getBuffer());
if (magic == file_magic::elf_relocatable) {
if (!tryAddFatLTOFile(p.first, path, p.second, true))
files.push_back(createObjFile(ctx, p.first, path, true));
} else if (magic == file_magic::bitcode)
files.push_back(
std::make_unique<BitcodeFile>(ctx, p.first, path, p.second, true));
else
Warn(ctx) << path << ": archive member '"
<< p.first.getBufferIdentifier()
<< "' is neither ET_REL nor LLVM bitcode";
}
if (!saved.get())
++nextGroupId;
return;
loadJobs.push_back({mbref,
path,
kind,
inWholeArchive,
inLib,
ctx.arg.asNeeded,
withLOption,
nextGroupId,
{},
{}});
}
case file_magic::elf_shared_object: {
if (ctx.arg.isStatic) {
ErrAlways(ctx) << "attempted static link of dynamic object " << path;
return;
}
// Shared objects are identified by soname. soname is (if specified)
// DT_SONAME and falls back to filename. If a file was specified by -lfoo,
// the directory part is ignored. Note that path may be a temporary and
// cannot be stored into SharedFile::soName.
path = mbref.getBufferIdentifier();
auto f = std::make_unique<SharedFile>(
ctx, mbref, withLOption ? path::filename(path) : path);
f->init();
files.push_back(std::move(f));
break;
}
case file_magic::bitcode:
files.push_back(std::make_unique<BitcodeFile>(ctx, mbref, "", 0, inLib));
break;
case file_magic::elf_relocatable:
if (!tryAddFatLTOFile(mbref, "", 0, inLib))
files.push_back(createObjFile(ctx, mbref, "", inLib));
break;
default:
ErrAlways(ctx) << path << ": unknown file type";
return;
}
// All files within the same --{start,end}-group get the same group ID.
// Otherwise, a new file will get a new group ID.
if (!isInGroup)
++nextGroupId;
if (!deferLoad)
loadFiles();
}
// Add a given library by searching it from input search paths.
@@ -2160,8 +2111,108 @@ static bool isFormatBinary(Ctx &ctx, StringRef s) {
return false;
}
// Expand LoadJob entries recorded by addFile(). Called in batch from
// createFiles() (parallel), or immediately from addFile() for late additions
// like dependent libraries (single job, runs inline).
//
// Runs in two phases:
//  1. A parallelFor over loadJobs in which each iteration creates the input
//     files for one job and stores them in that job's `out`.
//  2. A serial pass that moves each job's results into `files`,
//     `archiveFiles` and `ctx.memoryBuffers` in loadJobs order, then clears
//     loadJobs.
void LinkerDriver::loadFiles() {
// BitcodeFile / fatLTO constructors call ctx.saver which is not thread-safe.
// SharedFile and ObjFile constructors are safe without the mutex.
std::mutex mu;
// Creates the InputFile for one relocatable/bitcode buffer. `magic` is the
// buffer's file type; `arPath` and `offset` name the containing archive and
// are "" and 0 for stand-alone files. Bitcode becomes a BitcodeFile. With
// fatLTOObjects enabled, a buffer containing embedded bitcode becomes a
// BitcodeFile marked as a fat LTO object. Anything else goes through
// createObjFile().
auto makeFile = [&](MemoryBufferRef mb, file_magic magic, StringRef arPath,
uint64_t offset,
bool lazy) -> std::unique_ptr<InputFile> {
if (magic == file_magic::bitcode) {
// Only the BitcodeFile construction is done under the lock.
std::lock_guard<std::mutex> lk(mu);
return std::make_unique<BitcodeFile>(ctx, mb, arPath, offset, lazy);
}
if (ctx.arg.fatLTOObjects) {
// The search for embedded bitcode runs outside the lock; the lock is taken
// only once bitcode has been found.
Expected<MemoryBufferRef> fatLTOData =
IRObjectFile::findBitcodeInMemBuffer(mb);
if (!errorToBool(fatLTOData.takeError())) {
std::lock_guard<std::mutex> lk(mu);
auto f = std::make_unique<BitcodeFile>(ctx, *fatLTOData, arPath, offset,
lazy);
f->obj->fatLTOObject(true);
return f;
}
}
return createObjFile(ctx, mb, arPath, lazy);
};
{
llvm::TimeTraceScope timeScope("Parallel load");
// Each iteration appends only to its own job's `out` (and `thinBufs`, via
// getArchiveMembers()); `files` is not touched until the serial merge below.
parallelFor(0, loadJobs.size(), [&](size_t i) {
LoadJob &job = loadJobs[i];
switch (job.kind) {
case LoadJob::Obj:
case LoadJob::Bitcode:
// Stand-alone object or bitcode file: no archive name or offset.
job.out.push_back(makeFile(job.mbref,
job.kind == LoadJob::Bitcode
? file_magic::bitcode
: file_magic::elf_relocatable,
"", 0, job.lazy));
break;
case LoadJob::Archive: {
// Scan all archive members rather than using the archive symbol
// index. We assume the archive symbol table order matches the order
// of symbols in the member symbol tables. All files within the
// archive share the same group ID to allow mutual references for
// --warn-backrefs.
auto members = getArchiveMembers(ctx, job);
job.out.reserve(members.size());
// Members are lazy unless the archive was given under --whole-archive;
// job.lazy is not consulted for archives.
bool lazy = !job.inWholeArchive;
for (const auto &[mb, offset] : members) {
auto mm = identify_magic(mb.getBuffer());
// Under --whole-archive every member is handed to makeFile() whatever its
// type; otherwise members that are neither ET_REL nor bitcode are skipped
// with a warning.
if (mm == file_magic::elf_relocatable || mm == file_magic::bitcode ||
job.inWholeArchive)
job.out.push_back(makeFile(mb, mm, job.path, offset, lazy));
else
Warn(ctx) << job.path << ": archive member '"
<< mb.getBufferIdentifier()
<< "' is neither ET_REL nor LLVM bitcode";
}
break;
}
case LoadJob::Shared: {
// Shared objects are identified by soname. soname is (if specified)
// DT_SONAME and falls back to filename. If a file was specified by
// -lfoo, the directory part is ignored.
StringRef bufPath = job.mbref.getBufferIdentifier();
auto f = std::make_unique<SharedFile>(
ctx, job.mbref,
job.withLOption ? path::filename(bufPath) : bufPath);
f->init();
// Use the as-needed state captured when the job was recorded.
f->isNeeded = !job.asNeeded;
job.out.push_back(std::move(f));
break;
}
case LoadJob::Binary:
job.out.push_back(std::make_unique<BinaryFile>(ctx, job.mbref));
break;
}
// Stamp every file produced by this job with the group ID captured when the
// job was recorded.
for (auto &m : job.out)
m->groupId = job.groupId;
});
}
// Serial merge: count the produced files first so `files` grows only once.
size_t numFiles = 0;
for (auto &job : loadJobs)
numFiles += job.out.size();
files.reserve(files.size() + numFiles);
// Move the results out in loadJobs order.
for (auto &job : loadJobs) {
// The count recorded for an archive is the number of files actually created
// from it, which excludes members skipped with a warning above.
if (job.kind == LoadJob::Archive)
archiveFiles.emplace_back(job.path, (unsigned)job.out.size());
files.append(std::make_move_iterator(job.out.begin()),
std::make_move_iterator(job.out.end()));
ctx.memoryBuffers.append(std::make_move_iterator(job.thinBufs.begin()),
std::make_move_iterator(job.thinBufs.end()));
}
// All jobs are consumed; leave the list empty for the next call.
loadJobs.clear();
}
void LinkerDriver::createFiles(opt::InputArgList &args) {
llvm::TimeTraceScope timeScope("Load input files");
SaveAndRestore saveDefer(deferLoad, true);
// For --{push,pop}-state.
std::vector<std::tuple<bool, bool, bool>> stack;
@@ -2283,6 +2334,7 @@ void LinkerDriver::createFiles(opt::InputArgList &args) {
if (defaultScript && !hasScript)
readLinkerScript(ctx, *defaultScript);
loadFiles();
if (files.empty() && !hasInput && errCount(ctx) == 0)
ErrAlways(ctx) << "no input files";
}

View File

@@ -207,7 +207,7 @@ static void updateSupportedARMFeatures(Ctx &ctx,
}
InputFile::InputFile(Ctx &ctx, Kind k, MemoryBufferRef m)
: ctx(ctx), mb(m), groupId(ctx.driver.nextGroupId), fileKind(k) {}
: ctx(ctx), mb(m), fileKind(k) {}
InputFile::~InputFile() {}

View File

@@ -130,7 +130,7 @@ public:
// checking feature. All files within the same --{start,end}-group or
// --{start,end}-lib get the same group ID. Otherwise, each file gets a new
// group ID. For more info, see checkDependency() in SymbolTable.cpp.
uint32_t groupId;
uint32_t groupId = 0;
// If this is an architecture-specific file, the following members
// have ELF type (i.e. ELF{32,64}{LE,BE}) and target machine type.