diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h index 40824596c3b9..5e61bc7c842e 100644 --- a/offload/include/Shared/APITypes.h +++ b/offload/include/Shared/APITypes.h @@ -134,6 +134,12 @@ struct KernelReplayOutcomeTy { /// The path to the file that stores the output memory snapshot after the /// kernel has been replayed. llvm::SmallString<128> OutputFilepath; + /// The execution time of the kernel replay in nanoseconds. This time includes + /// the kernel launch and synchronization time. Replay I/O is excluded. + uint64_t KernelReplayTimeNs = 0; + /// The pointer to the device memory allocation used to replay. This can be + /// reused for future replays of the same kernel. + void *ReplayDeviceAlloc = nullptr; }; /// Extra kernel arguments managed by the runtime components. Notice these diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h index 0234e8fc5524..e5d9852ad48a 100644 --- a/offload/include/omptarget.h +++ b/offload/include/omptarget.h @@ -428,10 +428,11 @@ void __tgt_target_nowait_query(void **AsyncHandle); /// device memory. 
int __tgt_target_kernel_replay( ident_t *Loc, int64_t DeviceId, void *HostPtr, void *DeviceMemory, - int64_t DeviceMemorySize, const llvm::offloading::EntryTy *Globals, - int32_t NumGlobals, void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs, - int32_t NumTeams, int32_t ThreadLimit, uint32_t SharedMemorySize, - uint64_t LoopTripCount, KernelReplayOutcomeTy *ReplayOutcome); + void *ReuseDeviceAlloc, int64_t DeviceMemorySize, + const llvm::offloading::EntryTy *Globals, int32_t NumGlobals, + void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams, + int32_t ThreadLimit, uint32_t SharedMemorySize, uint64_t LoopTripCount, + KernelReplayOutcomeTy *ReplayOutcome); void __tgt_set_info_flag(uint32_t); diff --git a/offload/libomptarget/interface.cpp b/offload/libomptarget/interface.cpp index 9dd206d140c1..f65ca3cadee8 100644 --- a/offload/libomptarget/interface.cpp +++ b/offload/libomptarget/interface.cpp @@ -509,6 +509,9 @@ EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize, /// \param DeviceMemory A pointer to an array storing device memory data to move /// prior to kernel execution. /// \param DeviceMemorySize The size of the above device memory data in bytes. +/// \param ReuseDeviceAlloc Pointer to a device memory allocation that should be +/// reused for the replay. If null, the replay will +/// allocate the necessary device buffer. /// \param TgtArgs An array of pointers of the pre-recorded target kernel /// arguments. /// \param TgtOffsets An array of pointers of the pre-recorded target kernel @@ -521,10 +524,11 @@ EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize, /// \return OMP_TGT_SUCCESS on success, OMP_TGT_FAIL on failure. 
EXTERN int __tgt_target_kernel_replay( ident_t *Loc, int64_t DeviceId, void *HostPtr, void *DeviceMemory, - int64_t DeviceMemorySize, const llvm::offloading::EntryTy *Globals, - int32_t NumGlobals, void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs, - int32_t NumTeams, int32_t ThreadLimit, uint32_t SharedMemorySize, - uint64_t LoopTripCount, KernelReplayOutcomeTy *ReplayOutcome) { + void *ReuseDeviceAlloc, int64_t DeviceMemorySize, + const llvm::offloading::EntryTy *Globals, int32_t NumGlobals, + void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams, + int32_t ThreadLimit, uint32_t SharedMemorySize, uint64_t LoopTripCount, + KernelReplayOutcomeTy *ReplayOutcome) { assert(PM && "Runtime not initialized"); OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); if (checkDevice(DeviceId, Loc)) { @@ -541,10 +545,11 @@ EXTERN int __tgt_target_kernel_replay( /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);) AsyncInfoTy AsyncInfo(*DeviceOrErr); - int Rc = target_replay( - Loc, *DeviceOrErr, HostPtr, DeviceMemory, DeviceMemorySize, Globals, - NumGlobals, TgtArgs, TgtOffsets, NumArgs, NumTeams, ThreadLimit, - SharedMemorySize, LoopTripCount, AsyncInfo, ReplayOutcome); + int Rc = + target_replay(Loc, *DeviceOrErr, HostPtr, DeviceMemory, DeviceMemorySize, + ReuseDeviceAlloc, Globals, NumGlobals, TgtArgs, TgtOffsets, + NumArgs, NumTeams, ThreadLimit, SharedMemorySize, + LoopTripCount, AsyncInfo, ReplayOutcome); if (Rc == OFFLOAD_SUCCESS) Rc = AsyncInfo.synchronize(); diff --git a/offload/libomptarget/omptarget.cpp b/offload/libomptarget/omptarget.cpp index f06654c639a8..82a0ed73317d 100644 --- a/offload/libomptarget/omptarget.cpp +++ b/offload/libomptarget/omptarget.cpp @@ -2391,6 +2391,7 @@ int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, void *VAddr, /// configuration. 
int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr, void *DeviceMemory, int64_t DeviceMemorySize, + void *ReuseDeviceAlloc, const llvm::offloading::EntryTy *Globals, int32_t NumGlobals, void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams, int32_t ThreadLimit, @@ -2448,13 +2449,20 @@ int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr, } } - void *TgtPtr = Device.allocData(DeviceMemorySize, /*HstPtr=*/nullptr, - TARGET_ALLOC_DEFAULT); + // Reuse a previous device allocation or allocate a new device buffer. + void *&TgtPtr = ReuseDeviceAlloc; + if (!TgtPtr) + TgtPtr = Device.allocData(DeviceMemorySize, /*HstPtr=*/nullptr, + TARGET_ALLOC_DEFAULT); if (!TgtPtr) { REPORT() << "Failed to allocate device memory."; return OFFLOAD_FAIL; } + // Save the device allocation for future replays of the same kernel. + if (ReplayOutcome) + ReplayOutcome->ReplayDeviceAlloc = TgtPtr; + int Ret = Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { diff --git a/offload/libomptarget/private.h b/offload/libomptarget/private.h index 31b295bda613..e52028cc060d 100644 --- a/offload/libomptarget/private.h +++ b/offload/libomptarget/private.h @@ -30,15 +30,14 @@ extern int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, void *ReqAddr, bool IsRecord, bool SaveOutput, bool EmitReport, const char *OutputDirPath); -extern int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr, - void *DeviceMemory, int64_t DeviceMemorySize, - const llvm::offloading::EntryTy *Globals, - int32_t NumGlobals, void **TgtArgs, - ptrdiff_t *TgtOffsets, int32_t NumArgs, - int32_t NumTeams, int32_t ThreadLimit, - uint32_t SharedMemorySize, uint64_t LoopTripCount, - AsyncInfoTy &AsyncInfo, - KernelReplayOutcomeTy *ReplayOutcome); +extern int +target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr, void *DeviceMemory, + int64_t DeviceMemorySize, void *ReuseDeviceAlloc, + const llvm::offloading::EntryTy 
*Globals, int32_t NumGlobals, + void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs, + int32_t NumTeams, int32_t ThreadLimit, uint32_t SharedMemorySize, + uint64_t LoopTripCount, AsyncInfoTy &AsyncInfo, + KernelReplayOutcomeTy *ReplayOutcome); extern void handleTargetOutcome(bool Success, ident_t *Loc); diff --git a/offload/plugins-nextgen/common/include/RecordReplay.h b/offload/plugins-nextgen/common/include/RecordReplay.h index 0929a533effa..65a861cc8a0c 100644 --- a/offload/plugins-nextgen/common/include/RecordReplay.h +++ b/offload/plugins-nextgen/common/include/RecordReplay.h @@ -11,6 +11,7 @@ #ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_RECORDREPLAY_H #define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_RECORDREPLAY_H +#include #include #include #include @@ -116,6 +117,10 @@ protected: /// information about the the kernel's replay, such as the snapshot file. KernelReplayOutcomeTy *ReplayOutcome = nullptr; + /// The begin and end time points of the kernel execution. + using ClockTy = std::chrono::steady_clock; + mutable std::chrono::time_point BeginTime, EndTime; + /// The number of occurrences during the execution. mutable size_t Occurrences = 0; @@ -129,6 +134,17 @@ protected: NumTeams == Other.NumTeams && NumThreads == Other.NumThreads && SharedMemorySize == Other.SharedMemorySize); } + + /// Record the begin and ending of the kernel execution. + void recordBeginTime() const { BeginTime = ClockTy::now(); } + void recordEndTime() const { EndTime = ClockTy::now(); } + + /// Get the kernel execution time in nanoseconds. + uint64_t getRecordedTimeNs() const { + using DurationNsTy = std::chrono::duration; + return std::chrono::duration_cast(EndTime - BeginTime) + .count(); + } }; struct InstanceHasher { @@ -210,6 +226,15 @@ private: uint32_t NumThreads, uint32_t SharedMemorySize, KernelReplayOutcomeTy *ReplayOutcome); + /// Unregister an instance once it has been replayed. Instances during + /// recording cannot be unregistered. 
Accessing the instance beyond this point + /// is invalid. + Error unregisterInstance(const InstanceTy &Instance); + + /// Populate the replay outcome struct to forward some replay information. + void populateReplayOutcome(const InstanceTy &Instance, + KernelReplayOutcomeTy &Outcome); + /// Record the prologue data. virtual Error recordPrologueImpl(const GenericKernelTy &Kernel, const InstanceTy &Instance, diff --git a/offload/plugins-nextgen/common/src/RecordReplay.cpp b/offload/plugins-nextgen/common/src/RecordReplay.cpp index 0b03e3405ce5..8c4b012fdc9e 100644 --- a/offload/plugins-nextgen/common/src/RecordReplay.cpp +++ b/offload/plugins-nextgen/common/src/RecordReplay.cpp @@ -89,17 +89,20 @@ Error RecordReplayTy::deinit() { Error RecordReplayTy::emitInstanceReport() { std::lock_guard LG(InstancesLock); - llvm::outs() << "=== record report begin ===\n"; - llvm::outs() << "directory: " + llvm::outs() << "=== Kernel Record Report ===\n"; + llvm::outs() << "Directory: " << std::filesystem::absolute(OutputDirectory).string() << "\n"; - llvm::outs() << "kernels: " << Instances.size() << "\n"; + llvm::outs() << "Total Instances: " << Instances.size() << "\n"; + llvm::outs() << "JSON Filename, Kernel Name, Time (ns), Occurrences:\n"; SmallString<128> Filename; for (const auto &Inst : Instances) llvm::outs() << getFilename(Inst, FileTy::Descriptor, /*IncludeDir=*/false).c_str() - << ": " << Inst.Kernel.getName() << "\n"; - llvm::outs() << "=== record report end ===\n"; + << ", " << Inst.Kernel.getName() << ", " << Inst.getRecordedTimeNs() + << ", " << Inst.Occurrences << "\n"; + llvm::outs() << "=== End Kernel Record Report ===\n"; + return Plugin::success(); } @@ -116,6 +119,16 @@ RecordReplayTy::registerInstance(const GenericKernelTy &Kernel, return {*It, Inserted}; } +Error RecordReplayTy::unregisterInstance(const InstanceTy &Instance) { + assert(isReplaying() && "Cannot unregister instance when recording."); + + std::lock_guard LG(InstancesLock); + size_t Erased = 
Instances.erase(Instance); + if (Erased != 1) + return Plugin::error(ErrorCode::INVALID_ARGUMENT, "invalid instance"); + return Plugin::success(); +} + Expected RecordReplayTy::allocate(uint64_t Size) { assert(StartAddr && "Expected memory has been pre-allocated"); constexpr int Alignment = 16; @@ -147,36 +160,59 @@ Expected RecordReplayTy::recordPrologue( (KernelExtraArgs) ? KernelExtraArgs->ReplayOutcome : nullptr); HandleTy Handle{&Instance, First}; - if (isReplaying() || !First) + if (!First) return Handle; - if (auto Err = recordDescImpl(Kernel, Instance, KernelArgs, LaunchParams)) - return Err; + if (isRecording()) { + if (auto Err = recordDescImpl(Kernel, Instance, KernelArgs, LaunchParams)) + return Err; - if (auto Err = recordPrologueImpl(Kernel, Instance, KernelArgs, LaunchParams)) - return Err; + if (auto Err = + recordPrologueImpl(Kernel, Instance, KernelArgs, LaunchParams)) + return Err; + } + + // Start the timer for the kernel execution. + Instance.recordBeginTime(); return Handle; } Error RecordReplayTy::recordEpilogue(const GenericKernelTy &Kernel, HandleTy Handle) { - if (!shouldRecordEpilogue() || !Handle.Active) + if (!Handle.Active) return Plugin::success(); + // Stop the timer for the kernel execution. const InstanceTy &Instance = *Handle.Instance; - if (auto Err = recordEpilogueImpl(Kernel, Instance)) - return Err; + Instance.recordEndTime(); + + if (shouldRecordEpilogue()) + if (auto Err = recordEpilogueImpl(Kernel, Instance)) + return Err; + + if (isReplaying() && Instance.ReplayOutcome) + populateReplayOutcome(Instance, *Instance.ReplayOutcome); + + // After a replay, unregister the instance so it can be replayed again. Do + // not access the instance object beyond this point. + if (isReplaying()) + return unregisterInstance(Instance); - // If necessary, inform the replaying tool about where the epilogue snapshot - // file has been stored. 
- if (isReplaying() && Instance.ReplayOutcome) { - SmallString<128> Filename = getFilename(Instance, FileTy::EpilogueSnapshot); - Instance.ReplayOutcome->OutputFilepath = Filename; - } return Plugin::success(); } +void RecordReplayTy::populateReplayOutcome(const InstanceTy &Instance, + KernelReplayOutcomeTy &Outcome) { + // Only save the epilogue output filename if it was recorded. + if (shouldRecordEpilogue()) { + SmallString<128> Filename = getFilename(Instance, FileTy::EpilogueSnapshot); + Outcome.OutputFilepath = Filename; + } + // Save the kernel replay time. + Outcome.KernelReplayTimeNs = Instance.getRecordedTimeNs(); +} + Error NativeRecordReplayTy::recordPrologueImpl( const GenericKernelTy &Kernel, const InstanceTy &Instance, const KernelArgsTy &KernelArgs, const KernelLaunchParamsTy &LaunchParams) { diff --git a/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp b/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp index 7e8ceb7c24c0..a5bda7a0f044 100644 --- a/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp +++ b/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp @@ -58,6 +58,11 @@ static cl::opt NumThreadsOpt("num-threads", static cl::opt DeviceIdOpt("device-id", cl::desc("Set the device id."), cl::init(-1), cl::cat(ReplayOptions)); +static cl::opt + RepetitionsOpt("repetitions", + cl::desc("Set the number of replay repetitions."), + cl::init(1), cl::cat(ReplayOptions)); + template Error createErr(const char *ErrFmt, ArgsTy &&...Args) { return llvm::createStringError(llvm::inconvertibleErrorCode(), ErrFmt, @@ -132,12 +137,14 @@ Error verifyReplayOutput(StringRef RecordOutputFilename, return createErr("replay device memory failed to verify"); // Sucessfully verified. - outs() << TOOL_PREFIX << "Replay device memory verified\n"; return Error::success(); } /// Replay the kernel and return whether verification occurred. 
Error replayKernel() { + if (RepetitionsOpt == 0) + return createErr("invalid number of repetitions"); + // Load the kernel descriptor JSON file. auto KernelDescrBufferOrErr = MemoryBuffer::getFile(JsonFilename, /*isText=*/true, @@ -315,15 +322,23 @@ Error replayKernel() { auto RecordInputBuffer = std::move(RecordInputBufferOrErr.get()); KernelReplayOutcomeTy Outcome; - Rc = __tgt_target_kernel_replay( - /*Loc=*/nullptr, DeviceId, OffloadEntries[0].Address, - const_cast<char *>(RecordInputBuffer->getBufferStart()), - RecordInputBuffer->getBufferSize(), - NumGlobals ? &OffloadEntries[1] : nullptr, NumGlobals, TgtArgs.data(), - TgtArgOffsets.data(), NumArgs, NumTeams, NumThreads, SharedMemorySize, - LoopTripCount, &Outcome); - if (Rc != OMP_TGT_SUCCESS) - return createErr("failed to replay kernel"); + + // Replay the kernel once per repetition; the output is verified afterwards. + for (uint32_t R = 1; R <= RepetitionsOpt; ++R) { + Rc = __tgt_target_kernel_replay( + /*Loc=*/nullptr, DeviceId, OffloadEntries[0].Address, + const_cast<char *>(RecordInputBuffer->getBufferStart()), + R > 1 ? Outcome.ReplayDeviceAlloc : nullptr, + RecordInputBuffer->getBufferSize(), + NumGlobals ? &OffloadEntries[1] : nullptr, NumGlobals, TgtArgs.data(), + TgtArgOffsets.data(), NumArgs, NumTeams, NumThreads, SharedMemorySize, + LoopTripCount, &Outcome); + if (Rc != OMP_TGT_SUCCESS) + return createErr("failed to replay kernel"); + + outs() << TOOL_PREFIX << " Replay time (" << R + << "): " << Outcome.KernelReplayTimeNs << " ns\n"; + } // Verify the replay output if requested. 
if (VerifyOpt) { @@ -331,10 +346,15 @@ Error replayKernel() { return createErr("replay output file was not generated"); Filepath.replace_extension("record_output"); - return verifyReplayOutput(Filepath.c_str(), Outcome.OutputFilepath.c_str()); - } + if (auto Err = verifyReplayOutput(Filepath.c_str(), + Outcome.OutputFilepath.c_str())) + return Err; - outs() << TOOL_PREFIX << "Replay finished (verification skipped)\n"; + // The verification was successful. + outs() << TOOL_PREFIX << " Replay done, device memory verified\n"; + } else { + outs() << TOOL_PREFIX << " Replay done, verification skipped\n"; + } return Error::success(); }