[offload] Allow replay repetitions and report basic timing (#193388)

This commit extends the kernel replay tool to perform multiple replay
repetitions on the same process. It also prints the execution time of
the kernel replay, which includes the kernel launch and kernel
synchronization (replay I/O time is excluded). Precise kernel timing
should be obtained through the corresponding profiling tools for now.

The output report after recording has been improved as well.
This commit is contained in:
Kevin Sala Penades
2026-04-22 15:22:23 -07:00
committed by GitHub
parent e68d91afdf
commit 802de7ebd1
8 changed files with 155 additions and 55 deletions

View File

@@ -134,6 +134,12 @@ struct KernelReplayOutcomeTy {
/// The path to the file that stores the output memory snapshot after the /// The path to the file that stores the output memory snapshot after the
/// kernel has been replayed. /// kernel has been replayed.
llvm::SmallString<128> OutputFilepath; llvm::SmallString<128> OutputFilepath;
/// The execution time of the kernel replay in nanoseconds. This time includes
/// the kernel launch and synchronization time. Replay I/O is excluded.
uint64_t KernelReplayTimeNs = 0;
/// The pointer to the device memory allocation used to replay. This can be
/// reused for future replays of the same kernel.
void *ReplayDeviceAlloc = nullptr;
}; };
/// Extra kernel arguments managed by the runtime components. Notice these /// Extra kernel arguments managed by the runtime components. Notice these

View File

@@ -428,10 +428,11 @@ void __tgt_target_nowait_query(void **AsyncHandle);
/// device memory. /// device memory.
int __tgt_target_kernel_replay( int __tgt_target_kernel_replay(
ident_t *Loc, int64_t DeviceId, void *HostPtr, void *DeviceMemory, ident_t *Loc, int64_t DeviceId, void *HostPtr, void *DeviceMemory,
int64_t DeviceMemorySize, const llvm::offloading::EntryTy *Globals, void *ReuseDeviceAlloc, int64_t DeviceMemorySize,
int32_t NumGlobals, void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs, const llvm::offloading::EntryTy *Globals, int32_t NumGlobals,
int32_t NumTeams, int32_t ThreadLimit, uint32_t SharedMemorySize, void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams,
uint64_t LoopTripCount, KernelReplayOutcomeTy *ReplayOutcome); int32_t ThreadLimit, uint32_t SharedMemorySize, uint64_t LoopTripCount,
KernelReplayOutcomeTy *ReplayOutcome);
void __tgt_set_info_flag(uint32_t); void __tgt_set_info_flag(uint32_t);

View File

@@ -509,6 +509,9 @@ EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
/// \param DeviceMemory A pointer to an array storing device memory data to move /// \param DeviceMemory A pointer to an array storing device memory data to move
/// prior to kernel execution. /// prior to kernel execution.
/// \param DeviceMemorySize The size of the above device memory data in bytes. /// \param DeviceMemorySize The size of the above device memory data in bytes.
/// \param ReuseDeviceAlloc Pointer to a device memory allocation that should be
/// reused for the replay. If null, the replay will
/// allocate the necessary device buffer.
/// \param TgtArgs An array of pointers of the pre-recorded target kernel /// \param TgtArgs An array of pointers of the pre-recorded target kernel
/// arguments. /// arguments.
/// \param TgtOffsets An array of pointers of the pre-recorded target kernel /// \param TgtOffsets An array of pointers of the pre-recorded target kernel
@@ -521,10 +524,11 @@ EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
/// \return OMP_TGT_SUCCESS on success, OMP_TGT_FAIL on failure. /// \return OMP_TGT_SUCCESS on success, OMP_TGT_FAIL on failure.
EXTERN int __tgt_target_kernel_replay( EXTERN int __tgt_target_kernel_replay(
ident_t *Loc, int64_t DeviceId, void *HostPtr, void *DeviceMemory, ident_t *Loc, int64_t DeviceId, void *HostPtr, void *DeviceMemory,
int64_t DeviceMemorySize, const llvm::offloading::EntryTy *Globals, void *ReuseDeviceAlloc, int64_t DeviceMemorySize,
int32_t NumGlobals, void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs, const llvm::offloading::EntryTy *Globals, int32_t NumGlobals,
int32_t NumTeams, int32_t ThreadLimit, uint32_t SharedMemorySize, void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams,
uint64_t LoopTripCount, KernelReplayOutcomeTy *ReplayOutcome) { int32_t ThreadLimit, uint32_t SharedMemorySize, uint64_t LoopTripCount,
KernelReplayOutcomeTy *ReplayOutcome) {
assert(PM && "Runtime not initialized"); assert(PM && "Runtime not initialized");
OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
if (checkDevice(DeviceId, Loc)) { if (checkDevice(DeviceId, Loc)) {
@@ -541,10 +545,11 @@ EXTERN int __tgt_target_kernel_replay(
/*CodePtr=*/OMPT_GET_RETURN_ADDRESS);) /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)
AsyncInfoTy AsyncInfo(*DeviceOrErr); AsyncInfoTy AsyncInfo(*DeviceOrErr);
int Rc = target_replay( int Rc =
Loc, *DeviceOrErr, HostPtr, DeviceMemory, DeviceMemorySize, Globals, target_replay(Loc, *DeviceOrErr, HostPtr, DeviceMemory, DeviceMemorySize,
NumGlobals, TgtArgs, TgtOffsets, NumArgs, NumTeams, ThreadLimit, ReuseDeviceAlloc, Globals, NumGlobals, TgtArgs, TgtOffsets,
SharedMemorySize, LoopTripCount, AsyncInfo, ReplayOutcome); NumArgs, NumTeams, ThreadLimit, SharedMemorySize,
LoopTripCount, AsyncInfo, ReplayOutcome);
if (Rc == OFFLOAD_SUCCESS) if (Rc == OFFLOAD_SUCCESS)
Rc = AsyncInfo.synchronize(); Rc = AsyncInfo.synchronize();

View File

@@ -2391,6 +2391,7 @@ int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, void *VAddr,
/// configuration. /// configuration.
int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr, int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
void *DeviceMemory, int64_t DeviceMemorySize, void *DeviceMemory, int64_t DeviceMemorySize,
void *ReuseDeviceAlloc,
const llvm::offloading::EntryTy *Globals, int32_t NumGlobals, const llvm::offloading::EntryTy *Globals, int32_t NumGlobals,
void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs, void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs,
int32_t NumTeams, int32_t ThreadLimit, int32_t NumTeams, int32_t ThreadLimit,
@@ -2448,13 +2449,20 @@ int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
} }
} }
void *TgtPtr = Device.allocData(DeviceMemorySize, /*HstPtr=*/nullptr, // Reuse a previous device allocation or allocate a new device buffer.
TARGET_ALLOC_DEFAULT); void *&TgtPtr = ReuseDeviceAlloc;
if (!TgtPtr)
TgtPtr = Device.allocData(DeviceMemorySize, /*HstPtr=*/nullptr,
TARGET_ALLOC_DEFAULT);
if (!TgtPtr) { if (!TgtPtr) {
REPORT() << "Failed to allocate device memory."; REPORT() << "Failed to allocate device memory.";
return OFFLOAD_FAIL; return OFFLOAD_FAIL;
} }
// Save the device allocation for future replays of the same kernel.
if (ReplayOutcome)
ReplayOutcome->ReplayDeviceAlloc = TgtPtr;
int Ret = int Ret =
Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo); Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo);
if (Ret != OFFLOAD_SUCCESS) { if (Ret != OFFLOAD_SUCCESS) {

View File

@@ -30,15 +30,14 @@ extern int target_activate_rr(DeviceTy &Device, uint64_t MemorySize,
void *ReqAddr, bool IsRecord, bool SaveOutput, void *ReqAddr, bool IsRecord, bool SaveOutput,
bool EmitReport, const char *OutputDirPath); bool EmitReport, const char *OutputDirPath);
extern int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr, extern int
void *DeviceMemory, int64_t DeviceMemorySize, target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr, void *DeviceMemory,
const llvm::offloading::EntryTy *Globals, int64_t DeviceMemorySize, void *ReuseDeviceAlloc,
int32_t NumGlobals, void **TgtArgs, const llvm::offloading::EntryTy *Globals, int32_t NumGlobals,
ptrdiff_t *TgtOffsets, int32_t NumArgs, void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs,
int32_t NumTeams, int32_t ThreadLimit, int32_t NumTeams, int32_t ThreadLimit, uint32_t SharedMemorySize,
uint32_t SharedMemorySize, uint64_t LoopTripCount, uint64_t LoopTripCount, AsyncInfoTy &AsyncInfo,
AsyncInfoTy &AsyncInfo, KernelReplayOutcomeTy *ReplayOutcome);
KernelReplayOutcomeTy *ReplayOutcome);
extern void handleTargetOutcome(bool Success, ident_t *Loc); extern void handleTargetOutcome(bool Success, ident_t *Loc);

View File

@@ -11,6 +11,7 @@
#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_RECORDREPLAY_H #ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_RECORDREPLAY_H
#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_RECORDREPLAY_H #define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_RECORDREPLAY_H
#include <chrono>
#include <cstddef> #include <cstddef>
#include <cstdint> #include <cstdint>
#include <filesystem> #include <filesystem>
@@ -116,6 +117,10 @@ protected:
/// information about the kernel's replay, such as the snapshot file. /// information about the kernel's replay, such as the snapshot file.
KernelReplayOutcomeTy *ReplayOutcome = nullptr; KernelReplayOutcomeTy *ReplayOutcome = nullptr;
/// The begin and end time points of the kernel execution.
using ClockTy = std::chrono::steady_clock;
mutable std::chrono::time_point<ClockTy> BeginTime, EndTime;
/// The number of occurrences during the execution. /// The number of occurrences during the execution.
mutable size_t Occurrences = 0; mutable size_t Occurrences = 0;
@@ -129,6 +134,17 @@ protected:
NumTeams == Other.NumTeams && NumThreads == Other.NumThreads && NumTeams == Other.NumTeams && NumThreads == Other.NumThreads &&
SharedMemorySize == Other.SharedMemorySize); SharedMemorySize == Other.SharedMemorySize);
} }
/// Capture the current clock reading as the beginning of the kernel
/// execution. The method is const because the stored time points are
/// declared mutable.
void recordBeginTime() const {
  BeginTime = ClockTy::now();
}
/// Capture the current clock reading as the end of the kernel execution.
void recordEndTime() const {
  EndTime = ClockTy::now();
}
/// Get the kernel execution time in nanoseconds.
///
/// \return The elapsed time between the recorded begin and end time points,
///         or zero when the end time point does not come after the begin
///         time point (e.g., the end was never recorded).
uint64_t getRecordedTimeNs() const {
  // The result has an unsigned representation, so a negative difference
  // would wrap around to a huge bogus value. Report zero instead.
  if (EndTime <= BeginTime)
    return 0;

  using DurationNsTy = std::chrono::duration<uint64_t, std::nano>;
  return std::chrono::duration_cast<DurationNsTy>(EndTime - BeginTime)
      .count();
}
}; };
struct InstanceHasher { struct InstanceHasher {
@@ -210,6 +226,15 @@ private:
uint32_t NumThreads, uint32_t SharedMemorySize, uint32_t NumThreads, uint32_t SharedMemorySize,
KernelReplayOutcomeTy *ReplayOutcome); KernelReplayOutcomeTy *ReplayOutcome);
/// Unregister an instance once it has been replayed. Instances during
/// recording cannot be unregistered. Accessing the instance beyond this point
/// is invalid.
Error unregisterInstance(const InstanceTy &Instance);
/// Populate the replay outcome struct to forward some replay information.
void populateReplayOutcome(const InstanceTy &Instance,
KernelReplayOutcomeTy &Outcome);
/// Record the prologue data. /// Record the prologue data.
virtual Error virtual Error
recordPrologueImpl(const GenericKernelTy &Kernel, const InstanceTy &Instance, recordPrologueImpl(const GenericKernelTy &Kernel, const InstanceTy &Instance,

View File

@@ -89,17 +89,20 @@ Error RecordReplayTy::deinit() {
Error RecordReplayTy::emitInstanceReport() { Error RecordReplayTy::emitInstanceReport() {
std::lock_guard<std::mutex> LG(InstancesLock); std::lock_guard<std::mutex> LG(InstancesLock);
llvm::outs() << "=== record report begin ===\n"; llvm::outs() << "=== Kernel Record Report ===\n";
llvm::outs() << "directory: " llvm::outs() << "Directory: "
<< std::filesystem::absolute(OutputDirectory).string() << "\n"; << std::filesystem::absolute(OutputDirectory).string() << "\n";
llvm::outs() << "kernels: " << Instances.size() << "\n"; llvm::outs() << "Total Instances: " << Instances.size() << "\n";
llvm::outs() << "JSON Filename, Kernel Name, Time (ns), Occurrences:\n";
SmallString<128> Filename; SmallString<128> Filename;
for (const auto &Inst : Instances) for (const auto &Inst : Instances)
llvm::outs() llvm::outs()
<< getFilename(Inst, FileTy::Descriptor, /*IncludeDir=*/false).c_str() << getFilename(Inst, FileTy::Descriptor, /*IncludeDir=*/false).c_str()
<< ": " << Inst.Kernel.getName() << "\n"; << ", " << Inst.Kernel.getName() << ", " << Inst.getRecordedTimeNs()
llvm::outs() << "=== record report end ===\n"; << ", " << Inst.Occurrences << "\n";
llvm::outs() << "=== End Kernel Record Report ===\n";
return Plugin::success(); return Plugin::success();
} }
@@ -116,6 +119,16 @@ RecordReplayTy::registerInstance(const GenericKernelTy &Kernel,
return {*It, Inserted}; return {*It, Inserted};
} }
/// Remove \p Instance from the set of registered instances. Only valid while
/// replaying (asserted). Fails with INVALID_ARGUMENT unless exactly one entry
/// was erased.
Error RecordReplayTy::unregisterInstance(const InstanceTy &Instance) {
  assert(isReplaying() && "Cannot unregister instance when recording.");

  // The instance container is guarded by InstancesLock.
  std::lock_guard<std::mutex> LG(InstancesLock);

  if (Instances.erase(Instance) == 1)
    return Plugin::success();

  return Plugin::error(ErrorCode::INVALID_ARGUMENT, "invalid instance");
}
Expected<void *> RecordReplayTy::allocate(uint64_t Size) { Expected<void *> RecordReplayTy::allocate(uint64_t Size) {
assert(StartAddr && "Expected memory has been pre-allocated"); assert(StartAddr && "Expected memory has been pre-allocated");
constexpr int Alignment = 16; constexpr int Alignment = 16;
@@ -147,36 +160,59 @@ Expected<RecordReplayTy::HandleTy> RecordReplayTy::recordPrologue(
(KernelExtraArgs) ? KernelExtraArgs->ReplayOutcome : nullptr); (KernelExtraArgs) ? KernelExtraArgs->ReplayOutcome : nullptr);
HandleTy Handle{&Instance, First}; HandleTy Handle{&Instance, First};
if (isReplaying() || !First) if (!First)
return Handle; return Handle;
if (auto Err = recordDescImpl(Kernel, Instance, KernelArgs, LaunchParams)) if (isRecording()) {
return Err; if (auto Err = recordDescImpl(Kernel, Instance, KernelArgs, LaunchParams))
return Err;
if (auto Err = recordPrologueImpl(Kernel, Instance, KernelArgs, LaunchParams)) if (auto Err =
return Err; recordPrologueImpl(Kernel, Instance, KernelArgs, LaunchParams))
return Err;
}
// Start the timer for the kernel execution.
Instance.recordBeginTime();
return Handle; return Handle;
} }
Error RecordReplayTy::recordEpilogue(const GenericKernelTy &Kernel, Error RecordReplayTy::recordEpilogue(const GenericKernelTy &Kernel,
HandleTy Handle) { HandleTy Handle) {
if (!shouldRecordEpilogue() || !Handle.Active) if (!Handle.Active)
return Plugin::success(); return Plugin::success();
// Stop the timer for the kernel execution.
const InstanceTy &Instance = *Handle.Instance; const InstanceTy &Instance = *Handle.Instance;
if (auto Err = recordEpilogueImpl(Kernel, Instance)) Instance.recordEndTime();
return Err;
if (shouldRecordEpilogue())
if (auto Err = recordEpilogueImpl(Kernel, Instance))
return Err;
if (isReplaying() && Instance.ReplayOutcome)
populateReplayOutcome(Instance, *Instance.ReplayOutcome);
// After a replay, unregister the instance so it can be replayed again. Do
// not access the instance object beyond this point.
if (isReplaying())
return unregisterInstance(Instance);
// If necessary, inform the replaying tool about where the epilogue snapshot
// file has been stored.
if (isReplaying() && Instance.ReplayOutcome) {
SmallString<128> Filename = getFilename(Instance, FileTy::EpilogueSnapshot);
Instance.ReplayOutcome->OutputFilepath = Filename;
}
return Plugin::success(); return Plugin::success();
} }
void RecordReplayTy::populateReplayOutcome(const InstanceTy &Instance,
KernelReplayOutcomeTy &Outcome) {
// Only save the epilogue output filename if it was recorded.
if (shouldRecordEpilogue()) {
SmallString<128> Filename = getFilename(Instance, FileTy::EpilogueSnapshot);
Outcome.OutputFilepath = Filename;
}
// Save the kernel replay time.
Outcome.KernelReplayTimeNs = Instance.getRecordedTimeNs();
}
Error NativeRecordReplayTy::recordPrologueImpl( Error NativeRecordReplayTy::recordPrologueImpl(
const GenericKernelTy &Kernel, const InstanceTy &Instance, const GenericKernelTy &Kernel, const InstanceTy &Instance,
const KernelArgsTy &KernelArgs, const KernelLaunchParamsTy &LaunchParams) { const KernelArgsTy &KernelArgs, const KernelLaunchParamsTy &LaunchParams) {

View File

@@ -58,6 +58,11 @@ static cl::opt<uint32_t> NumThreadsOpt("num-threads",
static cl::opt<int32_t> DeviceIdOpt("device-id", cl::desc("Set the device id."), static cl::opt<int32_t> DeviceIdOpt("device-id", cl::desc("Set the device id."),
cl::init(-1), cl::cat(ReplayOptions)); cl::init(-1), cl::cat(ReplayOptions));
// Command-line option "repetitions": how many times the kernel replay is
// repeated. Defaults to a single replay.
static cl::opt<uint32_t> RepetitionsOpt(
    "repetitions", cl::init(1), cl::cat(ReplayOptions),
    cl::desc("Set the number of replay repetitions."));
template <typename... ArgsTy> template <typename... ArgsTy>
Error createErr(const char *ErrFmt, ArgsTy &&...Args) { Error createErr(const char *ErrFmt, ArgsTy &&...Args) {
return llvm::createStringError(llvm::inconvertibleErrorCode(), ErrFmt, return llvm::createStringError(llvm::inconvertibleErrorCode(), ErrFmt,
@@ -132,12 +137,14 @@ Error verifyReplayOutput(StringRef RecordOutputFilename,
return createErr("replay device memory failed to verify"); return createErr("replay device memory failed to verify");
// Successfully verified. // Successfully verified.
outs() << TOOL_PREFIX << "Replay device memory verified\n";
return Error::success(); return Error::success();
} }
/// Replay the kernel and, if requested, verify the replay output. /// Replay the kernel and, if requested, verify the replay output.
Error replayKernel() { Error replayKernel() {
if (RepetitionsOpt == 0)
return createErr("invalid number of repetitions");
// Load the kernel descriptor JSON file. // Load the kernel descriptor JSON file.
auto KernelDescrBufferOrErr = auto KernelDescrBufferOrErr =
MemoryBuffer::getFile(JsonFilename, /*isText=*/true, MemoryBuffer::getFile(JsonFilename, /*isText=*/true,
@@ -315,15 +322,23 @@ Error replayKernel() {
auto RecordInputBuffer = std::move(RecordInputBufferOrErr.get()); auto RecordInputBuffer = std::move(RecordInputBufferOrErr.get());
KernelReplayOutcomeTy Outcome; KernelReplayOutcomeTy Outcome;
Rc = __tgt_target_kernel_replay(
/*Loc=*/nullptr, DeviceId, OffloadEntries[0].Address, // Perform the kernel replay and verification (if needed) for each repetition.
const_cast<char *>(RecordInputBuffer->getBufferStart()), for (uint32_t R = 1; R <= RepetitionsOpt; ++R) {
RecordInputBuffer->getBufferSize(), Rc = __tgt_target_kernel_replay(
NumGlobals ? &OffloadEntries[1] : nullptr, NumGlobals, TgtArgs.data(), /*Loc=*/nullptr, DeviceId, OffloadEntries[0].Address,
TgtArgOffsets.data(), NumArgs, NumTeams, NumThreads, SharedMemorySize, const_cast<char *>(RecordInputBuffer->getBufferStart()),
LoopTripCount, &Outcome); R > 0 ? Outcome.ReplayDeviceAlloc : nullptr,
if (Rc != OMP_TGT_SUCCESS) RecordInputBuffer->getBufferSize(),
return createErr("failed to replay kernel"); NumGlobals ? &OffloadEntries[1] : nullptr, NumGlobals, TgtArgs.data(),
TgtArgOffsets.data(), NumArgs, NumTeams, NumThreads, SharedMemorySize,
LoopTripCount, &Outcome);
if (Rc != OMP_TGT_SUCCESS)
return createErr("failed to replay kernel");
outs() << TOOL_PREFIX << " Replay time (" << R
<< "): " << Outcome.KernelReplayTimeNs << " ns\n";
}
// Verify the replay output if requested. // Verify the replay output if requested.
if (VerifyOpt) { if (VerifyOpt) {
@@ -331,10 +346,15 @@ Error replayKernel() {
return createErr("replay output file was not generated"); return createErr("replay output file was not generated");
Filepath.replace_extension("record_output"); Filepath.replace_extension("record_output");
return verifyReplayOutput(Filepath.c_str(), Outcome.OutputFilepath.c_str()); if (auto Err = verifyReplayOutput(Filepath.c_str(),
} Outcome.OutputFilepath.c_str()))
return Err;
outs() << TOOL_PREFIX << "Replay finished (verification skipped)\n"; // The verification was successful.
outs() << TOOL_PREFIX << " Replay done, device memory verified\n";
} else {
outs() << TOOL_PREFIX << " Replay done, verification skipped\n";
}
return Error::success(); return Error::success();
} }