[offload] Add mechanism to return info to kernel replay tool (#192611)

This commit adds a mechanism to return information about a kernel replay
to outer replay tool. This mechanism allows verifying the replay memory
output when using different launch configurations (e.g., different number
of teams or threads) than the one used for recording.

It also adds a new KernelExtraArgsTy structure that is only generated by
the offload/libomptarget runtime components (unlike KernelArgsTy). These
runtime arguments can be used by future extensions, including extensions
not related to kernel record replay.
diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index 6183686..4082459 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -16,6 +16,7 @@
 
 #include "Environment.h"
 
+#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Frontend/Offloading/Utility.h"
 
@@ -127,6 +128,20 @@
   /// Ptrs to the Data entries. Only strictly required for the host plugin.
   void **Ptrs = nullptr;
 };
+
+/// The outcome of a kernel replay.
+struct KernelReplayOutcomeTy {
+  /// The path to the file that stores the output memory snapshot after the
+  /// kernel has been replayed.
+  llvm::SmallString<128> OutputFilepath;
+};
+
+/// Extra kernel arguments managed by the runtime components. Notice these
+/// arguments are additional to the ones in KernelArgsTy, which are usually
+/// generated by the compiler.
+struct KernelExtraArgsTy {
+  KernelReplayOutcomeTy *ReplayOutcome = nullptr;
+};
 }
 
 #endif // OMPTARGET_SHARED_API_TYPES_H
diff --git a/offload/include/device.h b/offload/include/device.h
index 06d2139..af103c3 100644
--- a/offload/include/device.h
+++ b/offload/include/device.h
@@ -117,6 +117,7 @@
   // Launch the kernel identified by \p TgtEntryPtr with the given arguments.
   int32_t launchKernel(void *TgtEntryPtr, void **TgtVarsPtr,
                        ptrdiff_t *TgtOffsets, KernelArgsTy &KernelArgs,
+                       KernelExtraArgsTy *KernelExtraArgs,
                        AsyncInfoTy &AsyncInfo);
 
   /// Synchronize device/queue/event based on \p AsyncInfo and return
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index c2233c5..0234e8f 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -426,14 +426,12 @@
 
 /// Executes a target kernel by replaying recorded kernel arguments and
 /// device memory.
-int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId, void *HostPtr,
-                               void *DeviceMemory, int64_t DeviceMemorySize,
-                               const llvm::offloading::EntryTy *Globals,
-                               int32_t NumGlobals, void **TgtArgs,
-                               ptrdiff_t *TgtOffsets, int32_t NumArgs,
-                               int32_t NumTeams, int32_t ThreadLimit,
-                               uint32_t SharedMemorySize,
-                               uint64_t LoopTripCount);
+int __tgt_target_kernel_replay(
+    ident_t *Loc, int64_t DeviceId, void *HostPtr, void *DeviceMemory,
+    int64_t DeviceMemorySize, const llvm::offloading::EntryTy *Globals,
+    int32_t NumGlobals, void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs,
+    int32_t NumTeams, int32_t ThreadLimit, uint32_t SharedMemorySize,
+    uint64_t LoopTripCount, KernelReplayOutcomeTy *ReplayOutcome);
 
 void __tgt_set_info_flag(uint32_t);
 
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 77933e6..3147887 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -1087,7 +1087,7 @@
 
   auto *KernelImpl = std::get<GenericKernelTy *>(Kernel->PluginImpl);
   auto Err = KernelImpl->launch(*DeviceImpl, LaunchArgs.ArgPtrs, nullptr,
-                                LaunchArgs, AsyncInfoWrapper);
+                                LaunchArgs, nullptr, AsyncInfoWrapper);
 
   AsyncInfoWrapper.finalize(Err);
   if (Err)
diff --git a/offload/libomptarget/device.cpp b/offload/libomptarget/device.cpp
index 277d4a4..48aa29c 100644
--- a/offload/libomptarget/device.cpp
+++ b/offload/libomptarget/device.cpp
@@ -350,9 +350,10 @@
 // Run region on device
 int32_t DeviceTy::launchKernel(void *TgtEntryPtr, void **TgtVarsPtr,
                                ptrdiff_t *TgtOffsets, KernelArgsTy &KernelArgs,
+                               KernelExtraArgsTy *KernelExtraArgs,
                                AsyncInfoTy &AsyncInfo) {
   return RTL->launch_kernel(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
-                            &KernelArgs, AsyncInfo);
+                            &KernelArgs, KernelExtraArgs, AsyncInfo);
 }
 
 // Run region on device
diff --git a/offload/libomptarget/interface.cpp b/offload/libomptarget/interface.cpp
index 9f003ad..9dd206d 100644
--- a/offload/libomptarget/interface.cpp
+++ b/offload/libomptarget/interface.cpp
@@ -524,7 +524,7 @@
     int64_t DeviceMemorySize, const llvm::offloading::EntryTy *Globals,
     int32_t NumGlobals, void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs,
     int32_t NumTeams, int32_t ThreadLimit, uint32_t SharedMemorySize,
-    uint64_t LoopTripCount) {
+    uint64_t LoopTripCount, KernelReplayOutcomeTy *ReplayOutcome) {
   assert(PM && "Runtime not initialized");
   OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
   if (checkDevice(DeviceId, Loc)) {
@@ -541,10 +541,10 @@
                     /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)
 
   AsyncInfoTy AsyncInfo(*DeviceOrErr);
-  int Rc =
-      target_replay(Loc, *DeviceOrErr, HostPtr, DeviceMemory, DeviceMemorySize,
-                    Globals, NumGlobals, TgtArgs, TgtOffsets, NumArgs, NumTeams,
-                    ThreadLimit, SharedMemorySize, LoopTripCount, AsyncInfo);
+  int Rc = target_replay(
+      Loc, *DeviceOrErr, HostPtr, DeviceMemory, DeviceMemorySize, Globals,
+      NumGlobals, TgtArgs, TgtOffsets, NumArgs, NumTeams, ThreadLimit,
+      SharedMemorySize, LoopTripCount, AsyncInfo, ReplayOutcome);
 
   if (Rc == OFFLOAD_SUCCESS)
     Rc = AsyncInfo.synchronize();
diff --git a/offload/libomptarget/omptarget.cpp b/offload/libomptarget/omptarget.cpp
index 4f76976..f06654c 100644
--- a/offload/libomptarget/omptarget.cpp
+++ b/offload/libomptarget/omptarget.cpp
@@ -2350,7 +2350,7 @@
 #endif
 
     Ret = Device.launchKernel(TgtEntryPtr, TgtArgs.data(), TgtOffsets.data(),
-                              KernelArgs, AsyncInfo);
+                              KernelArgs, nullptr, AsyncInfo);
   }
 
   if (Ret != OFFLOAD_SUCCESS) {
@@ -2395,7 +2395,8 @@
                   void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs,
                   int32_t NumTeams, int32_t ThreadLimit,
                   uint32_t SharedMemorySize, uint64_t LoopTripCount,
-                  AsyncInfoTy &AsyncInfo) {
+                  AsyncInfoTy &AsyncInfo,
+                  KernelReplayOutcomeTy *ReplayOutcome) {
   int32_t DeviceId = Device.DeviceID;
   int32_t NumSymbols = NumGlobals + 1;
 
@@ -2473,8 +2474,11 @@
   KernelArgs.ThreadLimit[2] = 1;
   KernelArgs.DynCGroupMem = SharedMemorySize;
 
+  KernelExtraArgsTy KernelExtraArgs{};
+  KernelExtraArgs.ReplayOutcome = ReplayOutcome;
+
   Ret = Device.launchKernel(Symbols[0].DevPtr, TgtArgs, TgtOffsets, KernelArgs,
-                            AsyncInfo);
+                            &KernelExtraArgs, AsyncInfo);
   if (Ret != OFFLOAD_SUCCESS) {
     REPORT() << "Failed to launch kernel replay.";
     return OFFLOAD_FAIL;
diff --git a/offload/libomptarget/private.h b/offload/libomptarget/private.h
index 7c2179d..31b295b 100644
--- a/offload/libomptarget/private.h
+++ b/offload/libomptarget/private.h
@@ -37,7 +37,8 @@
                          ptrdiff_t *TgtOffsets, int32_t NumArgs,
                          int32_t NumTeams, int32_t ThreadLimit,
                          uint32_t SharedMemorySize, uint64_t LoopTripCount,
-                         AsyncInfoTy &AsyncInfo);
+                         AsyncInfoTy &AsyncInfo,
+                         KernelReplayOutcomeTy *ReplayOutcome);
 
 extern void handleTargetOutcome(bool Success, ident_t *Loc);
 
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 7cd0eba..9e84d78 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -415,6 +415,7 @@
   /// one used to initialize the kernel.
   Error launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
                ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs,
+               KernelExtraArgsTy *KernelExtraArgs,
                AsyncInfoWrapperTy &AsyncInfoWrapper,
                RecordReplayTy::HandleTy *RRHandle = nullptr) const;
   virtual Error launchImpl(GenericDeviceTy &GenericDevice,
@@ -1019,7 +1020,9 @@
 
   /// Run the kernel associated with \p EntryPtr
   Error launchKernel(void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets,
-                     KernelArgsTy &KernelArgs, __tgt_async_info *AsyncInfo);
+                     KernelArgsTy &KernelArgs,
+                     KernelExtraArgsTy *KernelExtraArgs,
+                     __tgt_async_info *AsyncInfo);
 
   /// Initialize a __tgt_async_info structure.
   Error initAsyncInfo(__tgt_async_info **AsyncInfoPtr);
@@ -1615,6 +1618,7 @@
   /// Begin executing a kernel on the given device.
   int32_t launch_kernel(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
                         ptrdiff_t *TgtOffsets, KernelArgsTy *KernelArgs,
+                        KernelExtraArgsTy *KernelExtraArgs,
                         __tgt_async_info *AsyncInfoPtr);
 
   /// Synchronize an asyncrhonous queue with the plugin runtime.
diff --git a/offload/plugins-nextgen/common/include/RecordReplay.h b/offload/plugins-nextgen/common/include/RecordReplay.h
index 2760e10..0929a53 100644
--- a/offload/plugins-nextgen/common/include/RecordReplay.h
+++ b/offload/plugins-nextgen/common/include/RecordReplay.h
@@ -50,6 +50,15 @@
   /// Describes the format of the recording and replaying.
   enum FormatTy { Native = 0 };
 
+  /// Describes the file types that can be recorded.
+  enum FileTy {
+    PrologueSnapshot = 0,
+    EpilogueSnapshot,
+    Descriptor,
+    Globals,
+    Program
+  };
+
   struct HandleTy {
     const InstanceTy *Instance = nullptr;
     bool Active = false;
@@ -103,11 +112,16 @@
     size_t KernelHash = 0;
     size_t LaunchConfigHash = 0;
 
+    /// When replaying, the entity requesting the replay may also need further
+    /// information about the the kernel's replay, such as the snapshot file.
+    KernelReplayOutcomeTy *ReplayOutcome = nullptr;
+
     /// The number of occurrences during the execution.
     mutable size_t Occurrences = 0;
 
     InstanceTy(const GenericKernelTy &Kernel, uint32_t NumTeams,
-               uint32_t NumThreads, uint32_t SharedMemorySize);
+               uint32_t NumThreads, uint32_t SharedMemorySize,
+               KernelReplayOutcomeTy *ReplayOutcome);
 
     bool operator==(const InstanceTy &Other) const {
       return (KernelHash == Other.KernelHash &&
@@ -170,6 +184,7 @@
   /// instance is registered.
   Expected<HandleTy>
   recordPrologue(const GenericKernelTy &Kernel, const KernelArgsTy &KernelArgs,
+                 const KernelExtraArgsTy *KernelExtraArgs,
                  const KernelLaunchParamsTy &LaunchParams, uint32_t NumTeams[3],
                  uint32_t NumThreads[3], uint32_t SharedMemorySize);
 
@@ -178,9 +193,9 @@
   Error recordEpilogue(const GenericKernelTy &Kernel, HandleTy Handle);
 
   /// Get a string with the filename.
-  std::string getFilename(const InstanceTy &Instance, StringRef Suffix,
-                          bool IncludeDirectory = true) {
-    return getFilenameImpl(Instance, Suffix, IncludeDirectory);
+  SmallString<128> getFilename(const InstanceTy &Instance, FileTy FileType,
+                               bool IncludeDirectory = true) {
+    return getFilenameImpl(Instance, FileType, IncludeDirectory);
   }
 
   /// Allocates device memory from the record replay space.
@@ -192,7 +207,8 @@
   /// as a new instance.
   std::pair<const InstanceTy &, bool>
   registerInstance(const GenericKernelTy &Kernel, uint32_t NumTeams,
-                   uint32_t NumThreads, uint32_t SharedMemorySize);
+                   uint32_t NumThreads, uint32_t SharedMemorySize,
+                   KernelReplayOutcomeTy *ReplayOutcome);
 
   /// Record the prologue data.
   virtual Error
@@ -211,9 +227,9 @@
                                const KernelLaunchParamsTy &LaunchParams) = 0;
 
   /// Get a string with the filename.
-  virtual std::string getFilenameImpl(const InstanceTy &Instance,
-                                      StringRef Suffix,
-                                      bool IncludeDirectory) = 0;
+  virtual SmallString<128> getFilenameImpl(const InstanceTy &Instance,
+                                           FileTy FileType,
+                                           bool IncludeDirectory) = 0;
 };
 
 /// The native kernel record replay support.
@@ -237,17 +253,20 @@
                        const KernelLaunchParamsTy &LaunchParams) override;
 
   /// Get a string with the filename.
-  std::string getFilenameImpl(const InstanceTy &Instance, StringRef Suffix,
-                              bool IncludeDirectory) override;
+  SmallString<128> getFilenameImpl(const InstanceTy &Instance, FileTy FileType,
+                                   bool IncludeDirectory) override;
+
+  /// Get the extension for a recording file type.
+  StringRef getExtension(FileTy FileType);
 
   /// Record a memory snapshot to a file.
-  Error recordSnapshot(const std::string &Filename);
+  Error recordSnapshot(StringRef Filename);
 
   /// Record the globals to a file.
-  Error recordGlobals(const std::string &Filename);
+  Error recordGlobals(StringRef Filename);
 
   /// Record the device image to a file.
-  Error recordImage(const GenericKernelTy &Kernel, const std::string &Filename);
+  Error recordImage(const GenericKernelTy &Kernel, StringRef Filename);
 };
 
 } // namespace plugin
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 49b5edc..0d5cbf2 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -231,6 +231,7 @@
 
 Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
                               ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs,
+                              KernelExtraArgsTy *KernelExtraArgs,
                               AsyncInfoWrapperTy &AsyncInfoWrapper,
                               RecordReplayTy::HandleTy *RRHandle) const {
   llvm::SmallVector<void *, 16> Args;
@@ -278,9 +279,9 @@
   // blocks/threads.
   RecordReplayTy *RecordReplay = GenericDevice.getRecordReplay();
   if (RecordReplay) {
-    auto RRHandleOrErr =
-        RecordReplay->recordPrologue(*this, KernelArgs, LaunchParams, NumBlocks,
-                                     NumThreads, DynBlockMemConf.NativeSize);
+    auto RRHandleOrErr = RecordReplay->recordPrologue(
+        *this, KernelArgs, KernelExtraArgs, LaunchParams, NumBlocks, NumThreads,
+        DynBlockMemConf.NativeSize);
     if (!RRHandleOrErr)
       return RRHandleOrErr.takeError();
     if (RRHandle)
@@ -1105,6 +1106,7 @@
 Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs,
                                     ptrdiff_t *ArgOffsets,
                                     KernelArgsTy &KernelArgs,
+                                    KernelExtraArgsTy *KernelExtraArgs,
                                     __tgt_async_info *AsyncInfo) {
   AsyncInfoWrapperTy AsyncInfoWrapper(*this,
                                       RecordReplay ? nullptr : AsyncInfo);
@@ -1126,7 +1128,7 @@
 
   RecordReplayTy::HandleTy RRHandle;
   auto Err = GenericKernel.launch(*this, ArgPtrs, ArgOffsets, KernelArgs,
-                                  AsyncInfoWrapper, &RRHandle);
+                                  KernelExtraArgs, AsyncInfoWrapper, &RRHandle);
 
   // 'finalize' here to guarantee next record-replay actions are in-sync
   AsyncInfoWrapper.finalize(Err);
@@ -1654,9 +1656,11 @@
 int32_t GenericPluginTy::launch_kernel(int32_t DeviceId, void *TgtEntryPtr,
                                        void **TgtArgs, ptrdiff_t *TgtOffsets,
                                        KernelArgsTy *KernelArgs,
+                                       KernelExtraArgsTy *KernelExtraArgs,
                                        __tgt_async_info *AsyncInfoPtr) {
   auto Err = getDevice(DeviceId).launchKernel(TgtEntryPtr, TgtArgs, TgtOffsets,
-                                              *KernelArgs, AsyncInfoPtr);
+                                              *KernelArgs, KernelExtraArgs,
+                                              AsyncInfoPtr);
   if (Err) {
     REPORT() << "Failure to run target region " << TgtEntryPtr << " in device "
              << DeviceId << ": " << toString(std::move(Err));
diff --git a/offload/plugins-nextgen/common/src/RecordReplay.cpp b/offload/plugins-nextgen/common/src/RecordReplay.cpp
index fc425b1..d4ae33c 100644
--- a/offload/plugins-nextgen/common/src/RecordReplay.cpp
+++ b/offload/plugins-nextgen/common/src/RecordReplay.cpp
@@ -31,9 +31,10 @@
 
 RecordReplayTy::InstanceTy::InstanceTy(const GenericKernelTy &Kernel,
                                        uint32_t NumTeams, uint32_t NumThreads,
-                                       uint32_t SharedMemorySize)
+                                       uint32_t SharedMemorySize,
+                                       KernelReplayOutcomeTy *ReplayOutcome)
     : Kernel(Kernel), NumTeams(NumTeams), NumThreads(NumThreads),
-      SharedMemorySize(SharedMemorySize) {
+      SharedMemorySize(SharedMemorySize), ReplayOutcome(ReplayOutcome) {
   KernelHash = stable_hash_name(Kernel.getName());
   LaunchConfigHash =
       stable_hash_combine((stable_hash)NumTeams, (stable_hash)NumThreads,
@@ -92,10 +93,12 @@
   llvm::outs() << "directory: "
                << std::filesystem::absolute(OutputDirectory).string() << "\n";
   llvm::outs() << "kernels: " << Instances.size() << "\n";
-  for (const auto &Inst : Instances) {
-    llvm::outs() << getFilename(Inst, "json", /*IncludeDir=*/false) << ": "
-                 << Inst.Kernel.getName() << "\n";
-  }
+
+  SmallString<128> Filename;
+  for (const auto &Inst : Instances)
+    llvm::outs()
+        << getFilename(Inst, FileTy::Descriptor, /*IncludeDir=*/false).c_str()
+        << ": " << Inst.Kernel.getName() << "\n";
   llvm::outs() << "=== record report end ===\n";
   return Plugin::success();
 }
@@ -103,10 +106,11 @@
 std::pair<const RecordReplayTy::InstanceTy &, bool>
 RecordReplayTy::registerInstance(const GenericKernelTy &Kernel,
                                  uint32_t NumTeams, uint32_t NumThreads,
-                                 uint32_t SharedMemorySize) {
+                                 uint32_t SharedMemorySize,
+                                 KernelReplayOutcomeTy *ReplayOutcome) {
   std::lock_guard<std::mutex> LG(InstancesLock);
-  auto [It, Inserted] =
-      Instances.emplace(Kernel, NumTeams, NumThreads, SharedMemorySize);
+  auto [It, Inserted] = Instances.emplace(Kernel, NumTeams, NumThreads,
+                                          SharedMemorySize, ReplayOutcome);
   // Increase the number of occurrences.
   It->Occurrences += 1;
   return {*It, Inserted};
@@ -131,14 +135,16 @@
 
 Expected<RecordReplayTy::HandleTy> RecordReplayTy::recordPrologue(
     const GenericKernelTy &Kernel, const KernelArgsTy &KernelArgs,
+    const KernelExtraArgsTy *KernelExtraArgs,
     const KernelLaunchParamsTy &LaunchParams, uint32_t NumTeams[3],
     uint32_t NumThreads[3], uint32_t SharedMemorySize) {
   if (!isRecordingOrReplaying())
     return HandleTy{nullptr, false};
 
   // Register the instance and avoid recording if it is inactive or replaying.
-  auto [Instance, First] =
-      registerInstance(Kernel, NumTeams[0], NumThreads[0], SharedMemorySize);
+  auto [Instance, First] = registerInstance(
+      Kernel, NumTeams[0], NumThreads[0], SharedMemorySize,
+      (KernelExtraArgs) ? KernelExtraArgs->ReplayOutcome : nullptr);
 
   HandleTy Handle{&Instance, First};
   if (isReplaying() || !First)
@@ -158,28 +164,39 @@
   if (!shouldRecordEpilogue() || !Handle.Active)
     return Plugin::success();
 
-  return recordEpilogueImpl(Kernel, *Handle.Instance);
+  const InstanceTy &Instance = *Handle.Instance;
+  if (auto Err = recordEpilogueImpl(Kernel, Instance))
+    return Err;
+
+  // If necessary, inform the replaying tool about where the epilogue snapshot
+  // file has been stored.
+  if (isReplaying() && Instance.ReplayOutcome) {
+    SmallString<128> Filename = getFilename(Instance, FileTy::EpilogueSnapshot);
+    Instance.ReplayOutcome->OutputFilepath = Filename;
+  }
+  return Plugin::success();
 }
 
 Error NativeRecordReplayTy::recordPrologueImpl(
     const GenericKernelTy &Kernel, const InstanceTy &Instance,
     const KernelArgsTy &KernelArgs, const KernelLaunchParamsTy &LaunchParams) {
-  std::string SnapshotFilename = getFilename(Instance, "record_input");
-  if (auto Err = recordSnapshot(SnapshotFilename))
+  SmallString<128> SnapshotFilename =
+      getFilename(Instance, FileTy::PrologueSnapshot);
+  if (auto Err = recordSnapshot(SnapshotFilename.c_str()))
     return Err;
 
-  std::string GlobalsFilename = getFilename(Instance, "globals");
-  if (auto Err = recordGlobals(GlobalsFilename))
+  SmallString<128> GlobalsFilename = getFilename(Instance, FileTy::Globals);
+  if (auto Err = recordGlobals(GlobalsFilename.c_str()))
     return Err;
 
-  std::string ImageFilename = getFilename(Instance, "image");
-  return recordImage(Kernel, ImageFilename);
+  SmallString<128> ImageFilename = getFilename(Instance, FileTy::Program);
+  return recordImage(Kernel, ImageFilename.c_str());
 }
 
 Error NativeRecordReplayTy::recordEpilogueImpl(const GenericKernelTy &Kernel,
                                                const InstanceTy &Instance) {
-  std::string SnapshotFilename =
-      getFilename(Instance, isRecording() ? "record_output" : "replay_output");
+  SmallString<128> SnapshotFilename =
+      getFilename(Instance, FileTy::EpilogueSnapshot);
   return recordSnapshot(SnapshotFilename);
 }
 
@@ -207,9 +224,10 @@
     JsonArgOffsets.push_back(0);
   JsonKernelInfo["ArgOffsets"] = json::Value(std::move(JsonArgOffsets));
 
-  std::string JsonFilename = getFilename(Instance, "json");
+  SmallString<128> JsonFilename = getFilename(Instance, FileTy::Descriptor);
+
   std::error_code EC;
-  raw_fd_ostream JsonOS(JsonFilename, EC);
+  raw_fd_ostream JsonOS(JsonFilename.c_str(), EC);
   if (EC)
     return Plugin::error(ErrorCode::HOST_IO, "saving kernel json file");
   JsonOS << json::Value(std::move(JsonKernelInfo));
@@ -217,17 +235,34 @@
   return Plugin::success();
 }
 
-std::string NativeRecordReplayTy::getFilenameImpl(const InstanceTy &Instance,
-                                                  StringRef Suffix,
-                                                  bool IncludeDirectory) {
+StringRef NativeRecordReplayTy::getExtension(FileTy FileType) {
+  switch (FileType) {
+  case FileTy::PrologueSnapshot:
+    return "record_input";
+  case FileTy::EpilogueSnapshot:
+    return isRecording() ? "record_output" : "replay_output";
+  case FileTy::Descriptor:
+    return "json";
+  case FileTy::Globals:
+    return "globals";
+  case FileTy::Program:
+    return "image";
+  }
+  return "";
+}
+
+SmallString<128>
+NativeRecordReplayTy::getFilenameImpl(const InstanceTy &Instance,
+                                      FileTy FileType, bool IncludeDirectory) {
   std::filesystem::path Filepath = IncludeDirectory ? OutputDirectory : "";
   Filepath /= std::to_string(Instance.KernelHash) + "_" +
               std::to_string(Instance.LaunchConfigHash);
-  Filepath.replace_extension(Suffix.data());
-  return Filepath.string();
+  Filepath.replace_extension(getExtension(FileType).data());
+  SmallString<128> Filename(Filepath.c_str());
+  return Filename;
 }
 
-Error NativeRecordReplayTy::recordSnapshot(const std::string &Filename) {
+Error NativeRecordReplayTy::recordSnapshot(StringRef Filename) {
   // Another thread may be allocating memory. The size can only increase.
   AllocationLock.lock();
   uint64_t RecordSize = CurrentSize;
@@ -255,7 +290,7 @@
 }
 
 Error NativeRecordReplayTy::recordImage(const GenericKernelTy &Kernel,
-                                        const std::string &Filename) {
+                                        StringRef Filename) {
   std::error_code EC;
   raw_fd_ostream OS(Filename, EC);
   if (EC)
@@ -265,7 +300,7 @@
   return Plugin::success();
 }
 
-Error NativeRecordReplayTy::recordGlobals(const std::string &Filename) {
+Error NativeRecordReplayTy::recordGlobals(StringRef Filename) {
   AllocationLock.lock();
   // Copy the globals into a local vector so we can read it safely from this
   // thread. This vector should have a few entries in general. No need to lock
diff --git a/offload/test/tools/omp-kernel-replay/record-replay-diff-teams-threads.cpp b/offload/test/tools/omp-kernel-replay/record-replay-diff-teams-threads.cpp
new file mode 100644
index 0000000..803b633
--- /dev/null
+++ b/offload/test/tools/omp-kernel-replay/record-replay-diff-teams-threads.cpp
@@ -0,0 +1,46 @@
+// clang-format off
+// RUN: %libomptarget-compilexx-generic
+// RUN: rm -rf %t.testdir
+// RUN: mkdir -p %t.testdir
+// RUN: env LIBOMPTARGET_RECORD=1 LIBOMPTARGET_RECORD_MEMSIZE=536870912 LIBOMPTARGET_RECORD_DIR=%t.testdir %libomptarget-run-generic 2>&1 | %fcheck-generic
+// RUN: ls -t %t.testdir/*.json | sed -n '1p' | grep . | xargs -I {} %omp-kernel-replay --verify {}
+// RUN: ls -t %t.testdir/*.json | sed -n '1p' | grep . | xargs -I {} %omp-kernel-replay --verify --num-teams=1 --num-threads=1 {}
+// RUN: ls -t %t.testdir/*.json | sed -n '1p' | grep . | xargs -I {} %omp-kernel-replay --verify --num-teams=2 --num-threads=32 {}
+// RUN: ls -t %t.testdir/*.json | sed -n '1p' | grep . | xargs -I {} %omp-kernel-replay --verify --num-teams=32 --num-threads=64 {}
+// clang-format on
+
+// REQUIRES: gpu
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: x86_64-unknown-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: intelgpu
+
+#include <cstdint>
+#include <cstdio>
+
+int main() {
+  size_t Size = 1000;
+  uint64_t *Data = new uint64_t[Size];
+
+  for (size_t I = 0; I < Size; ++I) {
+    Data[I] = 20;
+  }
+
+#pragma omp target teams distribute parallel for thread_limit(128)             \
+    map(tofrom : Data[0 : Size])
+  for (size_t I = 0; I < Size; ++I) {
+    Data[I] = 10 + (uint64_t)I;
+  }
+
+  uint64_t Sum = 0;
+  for (size_t I = 0; I < Size; ++I) {
+    Sum += Data[I];
+  }
+
+  // CHECK: PASS
+  if (Sum == 509500)
+    printf("PASS\n");
+
+  delete[] Data;
+}
diff --git a/offload/test/tools/omp-kernel-replay/record-replay-diff-threads.cpp b/offload/test/tools/omp-kernel-replay/record-replay-diff-threads.cpp
new file mode 100644
index 0000000..9b65c38f
--- /dev/null
+++ b/offload/test/tools/omp-kernel-replay/record-replay-diff-threads.cpp
@@ -0,0 +1,46 @@
+// clang-format off
+// RUN: %libomptarget-compilexx-generic
+// RUN: rm -rf %t.testdir
+// RUN: mkdir -p %t.testdir
+// RUN: env LIBOMPTARGET_RECORD=1 LIBOMPTARGET_RECORD_MEMSIZE=536870912 LIBOMPTARGET_RECORD_DIR=%t.testdir %libomptarget-run-generic 2>&1 | %fcheck-generic
+// RUN: ls -t %t.testdir/*.json | sed -n '1p' | grep . | xargs -I {} %omp-kernel-replay --verify {}
+// RUN: ls -t %t.testdir/*.json | sed -n '1p' | grep . | xargs -I {} %omp-kernel-replay --verify --num-threads=1 {}
+// RUN: ls -t %t.testdir/*.json | sed -n '1p' | grep . | xargs -I {} %omp-kernel-replay --verify --num-threads=32 {}
+// RUN: ls -t %t.testdir/*.json | sed -n '1p' | grep . | xargs -I {} %omp-kernel-replay --verify --num-threads=64 {}
+// clang-format on
+
+// REQUIRES: gpu
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: x86_64-unknown-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: intelgpu
+
+#include <cstdint>
+#include <cstdio>
+
+int main() {
+  size_t Size = 1000;
+  uint64_t *Data = new uint64_t[Size];
+
+  for (size_t I = 0; I < Size; ++I) {
+    Data[I] = 20;
+  }
+
+#pragma omp target teams distribute parallel for num_teams(256)                \
+    thread_limit(128) map(tofrom : Data[0 : Size])
+  for (size_t I = 0; I < Size; ++I) {
+    Data[I] = 10 + (uint64_t)I;
+  }
+
+  uint64_t Sum = 0;
+  for (size_t I = 0; I < Size; ++I) {
+    Sum += Data[I];
+  }
+
+  // CHECK: PASS
+  if (Sum == 509500)
+    printf("PASS\n");
+
+  delete[] Data;
+}
diff --git a/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp b/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp
index 1d364bb..3f22c7c 100644
--- a/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp
+++ b/offload/tools/kernelreplay/llvm-omp-kernel-replay.cpp
@@ -197,17 +197,22 @@
               const_cast<char *>(DeviceMemoryMB.get()->getBuffer().data()),
               DeviceMemoryMB.get()->getBufferSize());
 
+  KernelReplayOutcomeTy ReplayOutcome;
+
   Rc = __tgt_target_kernel_replay(
       /*Loc=*/nullptr, DeviceId, OffloadEntries[0].Address,
       (char *)RecordedData, DeviceMemoryMB.get()->getBufferSize(),
       NumGlobals ? &OffloadEntries[1] : nullptr, NumGlobals, TgtArgs.data(),
       TgtArgOffsets.data(), NumArgs.value(), NumTeams, NumThreads,
-      SharedMemorySize, LoopTripCount.value());
+      SharedMemorySize, LoopTripCount.value(), &ReplayOutcome);
   if (Rc != OMP_TGT_SUCCESS)
     reportFatalUsageError("Error replaying kernel");
 
   int ErrorDetected = 0;
   if (VerifyOpt) {
+    if (ReplayOutcome.OutputFilepath.empty())
+      reportFatalUsageError("Replay output file was not generated");
+
     Filepath.replace_extension("record_output");
     ErrorOr<std::unique_ptr<MemoryBuffer>> OriginalOutputMB =
         MemoryBuffer::getFile(Filepath.string(),
@@ -218,9 +223,8 @@
           "Error reading the kernel record output file. Make sure "
           "LIBOMPTARGET_RECORD_OUTPUT is set when recording");
 
-    Filepath.replace_extension("replay_output");
     ErrorOr<std::unique_ptr<MemoryBuffer>> ReplayOutputMB =
-        MemoryBuffer::getFile(Filepath.string(),
+        MemoryBuffer::getFile(ReplayOutcome.OutputFilepath.c_str(),
                               /*isText=*/false,
                               /*RequiresNullTerminator=*/false);
     if (!ReplayOutputMB)