[offload] Allow replay repetitions and report basic timing (#193388)
This commit extends the kernel replay tool to perform multiple replay repetitions on the same process. It also prints the execution time of the kernel replay, which includes the kernel launch and kernel synchronization (replay I/O time is excluded). Precise kernel timing should be obtained through the corresponding profiling tools for now. The output report after recording has been improved as well.
This commit is contained in:
committed by
GitHub
parent
e68d91afdf
commit
802de7ebd1
@@ -134,6 +134,12 @@ struct KernelReplayOutcomeTy {
|
||||
/// The path to the file that stores the output memory snapshot after the
|
||||
/// kernel has been replayed.
|
||||
llvm::SmallString<128> OutputFilepath;
|
||||
/// The execution time of the kernel replay in nanoseconds. This time includes
|
||||
/// the kernel launch and synchronization time. Replay I/O is excluded.
|
||||
uint64_t KernelReplayTimeNs = 0;
|
||||
/// The pointer to the device memory allocation used to replay. This can be
|
||||
/// reused for future replays of the same kernel.
|
||||
void *ReplayDeviceAlloc = nullptr;
|
||||
};
|
||||
|
||||
/// Extra kernel arguments managed by the runtime components. Notice these
|
||||
|
||||
@@ -428,10 +428,11 @@ void __tgt_target_nowait_query(void **AsyncHandle);
|
||||
/// device memory.
|
||||
int __tgt_target_kernel_replay(
|
||||
ident_t *Loc, int64_t DeviceId, void *HostPtr, void *DeviceMemory,
|
||||
int64_t DeviceMemorySize, const llvm::offloading::EntryTy *Globals,
|
||||
int32_t NumGlobals, void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs,
|
||||
int32_t NumTeams, int32_t ThreadLimit, uint32_t SharedMemorySize,
|
||||
uint64_t LoopTripCount, KernelReplayOutcomeTy *ReplayOutcome);
|
||||
void *ReuseDeviceAlloc, int64_t DeviceMemorySize,
|
||||
const llvm::offloading::EntryTy *Globals, int32_t NumGlobals,
|
||||
void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams,
|
||||
int32_t ThreadLimit, uint32_t SharedMemorySize, uint64_t LoopTripCount,
|
||||
KernelReplayOutcomeTy *ReplayOutcome);
|
||||
|
||||
void __tgt_set_info_flag(uint32_t);
|
||||
|
||||
|
||||
@@ -509,6 +509,9 @@ EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
|
||||
/// \param DeviceMemory A pointer to an array storing device memory data to move
|
||||
/// prior to kernel execution.
|
||||
/// \param DeviceMemorySize The size of the above device memory data in bytes.
|
||||
/// \param ReuseDeviceAlloc Pointer to a device memory allocation that should be
|
||||
/// reused for the replay. If null, the replay will
|
||||
/// allocate the necessary device buffer.
|
||||
/// \param TgtArgs An array of pointers of the pre-recorded target kernel
|
||||
/// arguments.
|
||||
/// \param TgtOffsets An array of pointers of the pre-recorded target kernel
|
||||
@@ -521,10 +524,11 @@ EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
|
||||
/// \return OMP_TGT_SUCCESS on success, OMP_TGT_FAIL on failure.
|
||||
EXTERN int __tgt_target_kernel_replay(
|
||||
ident_t *Loc, int64_t DeviceId, void *HostPtr, void *DeviceMemory,
|
||||
int64_t DeviceMemorySize, const llvm::offloading::EntryTy *Globals,
|
||||
int32_t NumGlobals, void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs,
|
||||
int32_t NumTeams, int32_t ThreadLimit, uint32_t SharedMemorySize,
|
||||
uint64_t LoopTripCount, KernelReplayOutcomeTy *ReplayOutcome) {
|
||||
void *ReuseDeviceAlloc, int64_t DeviceMemorySize,
|
||||
const llvm::offloading::EntryTy *Globals, int32_t NumGlobals,
|
||||
void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams,
|
||||
int32_t ThreadLimit, uint32_t SharedMemorySize, uint64_t LoopTripCount,
|
||||
KernelReplayOutcomeTy *ReplayOutcome) {
|
||||
assert(PM && "Runtime not initialized");
|
||||
OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
|
||||
if (checkDevice(DeviceId, Loc)) {
|
||||
@@ -541,10 +545,11 @@ EXTERN int __tgt_target_kernel_replay(
|
||||
/*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)
|
||||
|
||||
AsyncInfoTy AsyncInfo(*DeviceOrErr);
|
||||
int Rc = target_replay(
|
||||
Loc, *DeviceOrErr, HostPtr, DeviceMemory, DeviceMemorySize, Globals,
|
||||
NumGlobals, TgtArgs, TgtOffsets, NumArgs, NumTeams, ThreadLimit,
|
||||
SharedMemorySize, LoopTripCount, AsyncInfo, ReplayOutcome);
|
||||
int Rc =
|
||||
target_replay(Loc, *DeviceOrErr, HostPtr, DeviceMemory, DeviceMemorySize,
|
||||
ReuseDeviceAlloc, Globals, NumGlobals, TgtArgs, TgtOffsets,
|
||||
NumArgs, NumTeams, ThreadLimit, SharedMemorySize,
|
||||
LoopTripCount, AsyncInfo, ReplayOutcome);
|
||||
|
||||
if (Rc == OFFLOAD_SUCCESS)
|
||||
Rc = AsyncInfo.synchronize();
|
||||
|
||||
@@ -2391,6 +2391,7 @@ int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, void *VAddr,
|
||||
/// configuration.
|
||||
int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
|
||||
void *DeviceMemory, int64_t DeviceMemorySize,
|
||||
void *ReuseDeviceAlloc,
|
||||
const llvm::offloading::EntryTy *Globals, int32_t NumGlobals,
|
||||
void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs,
|
||||
int32_t NumTeams, int32_t ThreadLimit,
|
||||
@@ -2448,13 +2449,20 @@ int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
|
||||
}
|
||||
}
|
||||
|
||||
void *TgtPtr = Device.allocData(DeviceMemorySize, /*HstPtr=*/nullptr,
|
||||
// Reuse a previous device allocation or allocate a new device buffer.
|
||||
void *&TgtPtr = ReuseDeviceAlloc;
|
||||
if (!TgtPtr)
|
||||
TgtPtr = Device.allocData(DeviceMemorySize, /*HstPtr=*/nullptr,
|
||||
TARGET_ALLOC_DEFAULT);
|
||||
if (!TgtPtr) {
|
||||
REPORT() << "Failed to allocate device memory.";
|
||||
return OFFLOAD_FAIL;
|
||||
}
|
||||
|
||||
// Save the device allocation for future replays of the same kernel.
|
||||
if (ReplayOutcome)
|
||||
ReplayOutcome->ReplayDeviceAlloc = TgtPtr;
|
||||
|
||||
int Ret =
|
||||
Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo);
|
||||
if (Ret != OFFLOAD_SUCCESS) {
|
||||
|
||||
@@ -30,14 +30,13 @@ extern int target_activate_rr(DeviceTy &Device, uint64_t MemorySize,
|
||||
void *ReqAddr, bool IsRecord, bool SaveOutput,
|
||||
bool EmitReport, const char *OutputDirPath);
|
||||
|
||||
extern int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
|
||||
void *DeviceMemory, int64_t DeviceMemorySize,
|
||||
const llvm::offloading::EntryTy *Globals,
|
||||
int32_t NumGlobals, void **TgtArgs,
|
||||
ptrdiff_t *TgtOffsets, int32_t NumArgs,
|
||||
int32_t NumTeams, int32_t ThreadLimit,
|
||||
uint32_t SharedMemorySize, uint64_t LoopTripCount,
|
||||
AsyncInfoTy &AsyncInfo,
|
||||
extern int
|
||||
target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr, void *DeviceMemory,
|
||||
int64_t DeviceMemorySize, void *ReuseDeviceAlloc,
|
||||
const llvm::offloading::EntryTy *Globals, int32_t NumGlobals,
|
||||
void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs,
|
||||
int32_t NumTeams, int32_t ThreadLimit, uint32_t SharedMemorySize,
|
||||
uint64_t LoopTripCount, AsyncInfoTy &AsyncInfo,
|
||||
KernelReplayOutcomeTy *ReplayOutcome);
|
||||
|
||||
extern void handleTargetOutcome(bool Success, ident_t *Loc);
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_RECORDREPLAY_H
|
||||
#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_RECORDREPLAY_H
|
||||
|
||||
#include <chrono>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <filesystem>
|
||||
@@ -116,6 +117,10 @@ protected:
|
||||
/// information about the kernel's replay, such as the snapshot file.
|
||||
KernelReplayOutcomeTy *ReplayOutcome = nullptr;
|
||||
|
||||
/// The begin and end time points of the kernel execution.
|
||||
using ClockTy = std::chrono::steady_clock;
|
||||
mutable std::chrono::time_point<ClockTy> BeginTime, EndTime;
|
||||
|
||||
/// The number of occurrences during the execution.
|
||||
mutable size_t Occurrences = 0;
|
||||
|
||||
@@ -129,6 +134,17 @@ protected:
|
||||
NumTeams == Other.NumTeams && NumThreads == Other.NumThreads &&
|
||||
SharedMemorySize == Other.SharedMemorySize);
|
||||
}
|
||||
|
||||
/// Record the begin and end time points of the kernel execution. Each call
/// samples ClockTy::now() and overwrites the previously stored time point.
/// Both methods are const; they only write the mutable BeginTime and EndTime
/// members.
|
||||
void recordBeginTime() const { BeginTime = ClockTy::now(); }
|
||||
void recordEndTime() const { EndTime = ClockTy::now(); }
|
||||
|
||||
/// Get the kernel execution time in nanoseconds.
|
||||
uint64_t getRecordedTimeNs() const {
|
||||
using DurationNsTy = std::chrono::duration<uint64_t, std::nano>;
|
||||
return std::chrono::duration_cast<DurationNsTy>(EndTime - BeginTime)
|
||||
.count();
|
||||
}
|
||||
};
|
||||
|
||||
struct InstanceHasher {
|
||||
@@ -210,6 +226,15 @@ private:
|
||||
uint32_t NumThreads, uint32_t SharedMemorySize,
|
||||
KernelReplayOutcomeTy *ReplayOutcome);
|
||||
|
||||
/// Unregister an instance once it has been replayed. Instances during
|
||||
/// recording cannot be unregistered. Accessing the instance beyond this point
|
||||
/// is invalid.
|
||||
Error unregisterInstance(const InstanceTy &Instance);
|
||||
|
||||
/// Populate the replay outcome struct to forward some replay information.
|
||||
void populateReplayOutcome(const InstanceTy &Instance,
|
||||
KernelReplayOutcomeTy &Outcome);
|
||||
|
||||
/// Record the prologue data.
|
||||
virtual Error
|
||||
recordPrologueImpl(const GenericKernelTy &Kernel, const InstanceTy &Instance,
|
||||
|
||||
@@ -89,17 +89,20 @@ Error RecordReplayTy::deinit() {
|
||||
|
||||
Error RecordReplayTy::emitInstanceReport() {
|
||||
std::lock_guard<std::mutex> LG(InstancesLock);
|
||||
llvm::outs() << "=== record report begin ===\n";
|
||||
llvm::outs() << "directory: "
|
||||
llvm::outs() << "=== Kernel Record Report ===\n";
|
||||
llvm::outs() << "Directory: "
|
||||
<< std::filesystem::absolute(OutputDirectory).string() << "\n";
|
||||
llvm::outs() << "kernels: " << Instances.size() << "\n";
|
||||
llvm::outs() << "Total Instances: " << Instances.size() << "\n";
|
||||
llvm::outs() << "JSON Filename, Kernel Name, Time (ns), Occurrences:\n";
|
||||
|
||||
SmallString<128> Filename;
|
||||
for (const auto &Inst : Instances)
|
||||
llvm::outs()
|
||||
<< getFilename(Inst, FileTy::Descriptor, /*IncludeDir=*/false).c_str()
|
||||
<< ": " << Inst.Kernel.getName() << "\n";
|
||||
llvm::outs() << "=== record report end ===\n";
|
||||
<< ", " << Inst.Kernel.getName() << ", " << Inst.getRecordedTimeNs()
|
||||
<< ", " << Inst.Occurrences << "\n";
|
||||
llvm::outs() << "=== End Kernel Record Report ===\n";
|
||||
|
||||
return Plugin::success();
|
||||
}
|
||||
|
||||
@@ -116,6 +119,16 @@ RecordReplayTy::registerInstance(const GenericKernelTy &Kernel,
|
||||
return {*It, Inserted};
|
||||
}
|
||||
|
||||
/// Drop \p Instance from the registered instances while holding
/// InstancesLock. Only valid while replaying (asserted). Fails with
/// INVALID_ARGUMENT unless exactly one entry was erased.
Error RecordReplayTy::unregisterInstance(const InstanceTy &Instance) {
  assert(isReplaying() && "Cannot unregister instance when recording.");

  std::lock_guard<std::mutex> Guard(InstancesLock);

  // Exactly one registered entry must match the instance being removed.
  const bool RemovedOne = (Instances.erase(Instance) == 1);
  if (RemovedOne)
    return Plugin::success();

  return Plugin::error(ErrorCode::INVALID_ARGUMENT, "invalid instance");
}
|
||||
|
||||
Expected<void *> RecordReplayTy::allocate(uint64_t Size) {
|
||||
assert(StartAddr && "Expected memory has been pre-allocated");
|
||||
constexpr int Alignment = 16;
|
||||
@@ -147,36 +160,59 @@ Expected<RecordReplayTy::HandleTy> RecordReplayTy::recordPrologue(
|
||||
(KernelExtraArgs) ? KernelExtraArgs->ReplayOutcome : nullptr);
|
||||
|
||||
HandleTy Handle{&Instance, First};
|
||||
if (isReplaying() || !First)
|
||||
if (!First)
|
||||
return Handle;
|
||||
|
||||
if (isRecording()) {
|
||||
if (auto Err = recordDescImpl(Kernel, Instance, KernelArgs, LaunchParams))
|
||||
return Err;
|
||||
|
||||
if (auto Err = recordPrologueImpl(Kernel, Instance, KernelArgs, LaunchParams))
|
||||
if (auto Err =
|
||||
recordPrologueImpl(Kernel, Instance, KernelArgs, LaunchParams))
|
||||
return Err;
|
||||
}
|
||||
|
||||
// Start the timer for the kernel execution.
|
||||
Instance.recordBeginTime();
|
||||
|
||||
return Handle;
|
||||
}
|
||||
|
||||
Error RecordReplayTy::recordEpilogue(const GenericKernelTy &Kernel,
|
||||
HandleTy Handle) {
|
||||
if (!shouldRecordEpilogue() || !Handle.Active)
|
||||
if (!Handle.Active)
|
||||
return Plugin::success();
|
||||
|
||||
// Stop the timer for the kernel execution.
|
||||
const InstanceTy &Instance = *Handle.Instance;
|
||||
Instance.recordEndTime();
|
||||
|
||||
if (shouldRecordEpilogue())
|
||||
if (auto Err = recordEpilogueImpl(Kernel, Instance))
|
||||
return Err;
|
||||
|
||||
// If necessary, inform the replaying tool about where the epilogue snapshot
|
||||
// file has been stored.
|
||||
if (isReplaying() && Instance.ReplayOutcome) {
|
||||
SmallString<128> Filename = getFilename(Instance, FileTy::EpilogueSnapshot);
|
||||
Instance.ReplayOutcome->OutputFilepath = Filename;
|
||||
}
|
||||
if (isReplaying() && Instance.ReplayOutcome)
|
||||
populateReplayOutcome(Instance, *Instance.ReplayOutcome);
|
||||
|
||||
// After a replay, unregister the instance so it can be replayed again. Do
|
||||
// not access the instance object beyond this point.
|
||||
if (isReplaying())
|
||||
return unregisterInstance(Instance);
|
||||
|
||||
return Plugin::success();
|
||||
}
|
||||
|
||||
void RecordReplayTy::populateReplayOutcome(const InstanceTy &Instance,
|
||||
KernelReplayOutcomeTy &Outcome) {
|
||||
// Only save the epilogue output filename if it was recorded.
|
||||
if (shouldRecordEpilogue()) {
|
||||
SmallString<128> Filename = getFilename(Instance, FileTy::EpilogueSnapshot);
|
||||
Outcome.OutputFilepath = Filename;
|
||||
}
|
||||
// Save the kernel replay time.
|
||||
Outcome.KernelReplayTimeNs = Instance.getRecordedTimeNs();
|
||||
}
|
||||
|
||||
Error NativeRecordReplayTy::recordPrologueImpl(
|
||||
const GenericKernelTy &Kernel, const InstanceTy &Instance,
|
||||
const KernelArgsTy &KernelArgs, const KernelLaunchParamsTy &LaunchParams) {
|
||||
|
||||
@@ -58,6 +58,11 @@ static cl::opt<uint32_t> NumThreadsOpt("num-threads",
|
||||
static cl::opt<int32_t> DeviceIdOpt("device-id", cl::desc("Set the device id."),
|
||||
cl::init(-1), cl::cat(ReplayOptions));
|
||||
|
||||
/// Command-line option "repetitions": the number of replay repetitions.
/// Defaults to 1 and belongs to the ReplayOptions category.
static cl::opt<uint32_t>
|
||||
RepetitionsOpt("repetitions",
|
||||
cl::desc("Set the number of replay repetitions."),
|
||||
cl::init(1), cl::cat(ReplayOptions));
|
||||
|
||||
template <typename... ArgsTy>
|
||||
Error createErr(const char *ErrFmt, ArgsTy &&...Args) {
|
||||
return llvm::createStringError(llvm::inconvertibleErrorCode(), ErrFmt,
|
||||
@@ -132,12 +137,14 @@ Error verifyReplayOutput(StringRef RecordOutputFilename,
|
||||
return createErr("replay device memory failed to verify");
|
||||
|
||||
// Successfully verified.
|
||||
outs() << TOOL_PREFIX << "Replay device memory verified\n";
|
||||
return Error::success();
|
||||
}
|
||||
|
||||
/// Replay the kernel and, if requested, verify its output. Returns an error on failure.
|
||||
Error replayKernel() {
|
||||
if (RepetitionsOpt == 0)
|
||||
return createErr("invalid number of repetitions");
|
||||
|
||||
// Load the kernel descriptor JSON file.
|
||||
auto KernelDescrBufferOrErr =
|
||||
MemoryBuffer::getFile(JsonFilename, /*isText=*/true,
|
||||
@@ -315,9 +322,13 @@ Error replayKernel() {
|
||||
auto RecordInputBuffer = std::move(RecordInputBufferOrErr.get());
|
||||
|
||||
KernelReplayOutcomeTy Outcome;
|
||||
|
||||
// Perform the kernel replay and verification (if needed) for each repetition.
|
||||
for (uint32_t R = 1; R <= RepetitionsOpt; ++R) {
|
||||
Rc = __tgt_target_kernel_replay(
|
||||
/*Loc=*/nullptr, DeviceId, OffloadEntries[0].Address,
|
||||
const_cast<char *>(RecordInputBuffer->getBufferStart()),
|
||||
R > 0 ? Outcome.ReplayDeviceAlloc : nullptr,
|
||||
RecordInputBuffer->getBufferSize(),
|
||||
NumGlobals ? &OffloadEntries[1] : nullptr, NumGlobals, TgtArgs.data(),
|
||||
TgtArgOffsets.data(), NumArgs, NumTeams, NumThreads, SharedMemorySize,
|
||||
@@ -325,16 +336,25 @@ Error replayKernel() {
|
||||
if (Rc != OMP_TGT_SUCCESS)
|
||||
return createErr("failed to replay kernel");
|
||||
|
||||
outs() << TOOL_PREFIX << " Replay time (" << R
|
||||
<< "): " << Outcome.KernelReplayTimeNs << " ns\n";
|
||||
}
|
||||
|
||||
// Verify the replay output if requested.
|
||||
if (VerifyOpt) {
|
||||
if (Outcome.OutputFilepath.empty())
|
||||
return createErr("replay output file was not generated");
|
||||
|
||||
Filepath.replace_extension("record_output");
|
||||
return verifyReplayOutput(Filepath.c_str(), Outcome.OutputFilepath.c_str());
|
||||
}
|
||||
if (auto Err = verifyReplayOutput(Filepath.c_str(),
|
||||
Outcome.OutputFilepath.c_str()))
|
||||
return Err;
|
||||
|
||||
outs() << TOOL_PREFIX << "Replay finished (verification skipped)\n";
|
||||
// The verification was successful.
|
||||
outs() << TOOL_PREFIX << " Replay done, device memory verified\n";
|
||||
} else {
|
||||
outs() << TOOL_PREFIX << " Replay done, verification skipped\n";
|
||||
}
|
||||
return Error::success();
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user