lib/Support/Windows/Threading.inc - llvm-project/llvm - Git at Google

 //===- Windows/Threading.inc - Win32 Threading Implementation - -*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // This file provides the Win32 specific implementation of Threading functions.
 //
 //===----------------------------------------------------------------------===//

 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/Twine.h"

 #include "llvm/Support/Windows/WindowsSupport.h"
 #include <process.h>

 #include <bitset>

 // Windows will at times define MemoryFence.
 #ifdef MemoryFence
 #undef MemoryFence
 #endif

 // Thread entry point: treats Arg as a SyncThreadInfo, invokes its UserFn with
 // its UserData, and always reports 0 as the thread exit code.
 static unsigned __stdcall threadFuncSync(void *Arg) {
   auto *Info = static_cast<SyncThreadInfo *>(Arg);
   Info->UserFn(Info->UserData);
   return 0;
 }

 // Thread entry point: takes ownership of the AsyncThreadInfo passed through
 // Arg, invokes it, and always reports 0 as the thread exit code.
 static unsigned __stdcall threadFuncAsync(void *Arg) {
   // The unique_ptr deletes the callable when this function returns.
   std::unique_ptr<AsyncThreadInfo> Owned(static_cast<AsyncThreadInfo *>(Arg));
   AsyncThreadInfo &Task = *Owned;
   Task();
   return 0;
 }

 // Starts ThreadFunc(Arg) on a new thread created with _beginthreadex, using
 // StackSizeInBytes as the stack size (0 when no value is given). When JP is
 // JoiningPolicy::Join the call waits for the thread before returning. The
 // thread handle is closed in either case. Every failure is reported through
 // ReportLastErrorFatal.
 static void
 llvm_execute_on_thread_impl(unsigned (__stdcall *ThreadFunc)(void *), void *Arg,
                             llvm::Optional<unsigned> StackSizeInBytes,
                             JoiningPolicy JP) {
   const unsigned StackSize = StackSizeInBytes.getValueOr(0);
   auto RawHandle =
       ::_beginthreadex(nullptr, StackSize, ThreadFunc, Arg, 0, nullptr);
   HANDLE ThreadHandle = reinterpret_cast<HANDLE>(RawHandle);
   if (ThreadHandle == nullptr)
     ReportLastErrorFatal("_beginthreadex failed");

   const bool MustJoin = (JP == JoiningPolicy::Join);
   if (MustJoin &&
       ::WaitForSingleObject(ThreadHandle, INFINITE) == WAIT_FAILED)
     ReportLastErrorFatal("WaitForSingleObject failed");

   if (!::CloseHandle(ThreadHandle))
     ReportLastErrorFatal("CloseHandle failed");
 }

 uint64_t llvm::get_threadid() {
   return uint64_t(::GetCurrentThreadId());
 }

 // Always reports 0 on Windows.
 // NOTE(review): 0 presumably means "no fixed limit" to callers - confirm
 // against the declaration in Threading.h.
 uint32_t llvm::get_max_thread_name_length() {
   return 0;
 }

 #if defined(_MSC_VER)
 // Raises the 0x406D1388 exception carrying a THREADNAME_INFO record that
 // holds Name and the thread Id. The exception is swallowed by the empty
 // __except handler below, so the call has no other visible effect here.
 static void SetThreadName(DWORD Id, LPCSTR Name) {
   constexpr DWORD MS_VC_EXCEPTION = 0x406D1388;

 #pragma pack(push, 8)
   struct THREADNAME_INFO {
     DWORD dwType;     // Must be 0x1000.
     LPCSTR szName;    // Pointer to thread name
     DWORD dwThreadId; // Thread ID (-1 == current thread)
     DWORD dwFlags;    // Reserved.  Do not use.
   };
 #pragma pack(pop)

   THREADNAME_INFO Payload = {0x1000, Name, Id, 0};

   // The record is handed over as an array of ULONG_PTR-sized arguments.
   const DWORD ArgCount = sizeof(Payload) / sizeof(ULONG_PTR);
   const ULONG_PTR *Args = reinterpret_cast<const ULONG_PTR *>(&Payload);

   __try {
     ::RaiseException(MS_VC_EXCEPTION, 0, ArgCount, Args);
   } __except (EXCEPTION_EXECUTE_HANDLER) {
   }
 }
 #endif

 // Names the calling thread via SetThreadName. Compiles to a no-op unless
 // _MSC_VER is defined.
 void llvm::set_thread_name(const Twine &Name) {
 #if defined(_MSC_VER)
   // Render the Twine into a local buffer so a null-terminated C string can be
   // handed to SetThreadName.
   SmallString<64> Buffer;
   const char *CName = Name.toNullTerminatedStringRef(Buffer).data();
   const DWORD Self = ::GetCurrentThreadId();
   SetThreadName(Self, CName);
 #endif
 }

 // Always yields an empty name.
 void llvm::get_thread_name(SmallVectorImpl<char> &Name) {
   // Windows does not treat a name as an inherent property of a thread:
   // "setting" one only fires a one-time message that a debugger interprets
   // as the program naming its thread. A TLS entry written by set_thread_name
   // could let this function return that value later; until then there is
   // nothing to report.
   Name.clear();
 }

 // Switches the calling thread into background processing mode when Priority
 // is ThreadPriority::Background, and out of it for any other value. Returns
 // SUCCESS or FAILURE according to the result of SetThreadPriority.
 SetThreadPriorityResult llvm::set_thread_priority(ThreadPriority Priority) {
   // https://docs.microsoft.com/en-us/windows/desktop/api/processthreadsapi/nf-processthreadsapi-setthreadpriority
   // BEGIN: the system lowers the thread's resource scheduling priorities so
   // that background work does not significantly affect foreground activity.
   // END: the system restores the priorities the thread had before it entered
   // background processing mode.
   int Mode = THREAD_MODE_BACKGROUND_END;
   if (Priority == ThreadPriority::Background)
     Mode = THREAD_MODE_BACKGROUND_BEGIN;

   if (!SetThreadPriority(GetCurrentThread(), Mode))
     return SetThreadPriorityResult::FAILURE;
   return SetThreadPriorityResult::SUCCESS;
 }

 // Description of one Windows processor group as used by this file.
 // Every member has a default so a freshly created object never holds
 // indeterminate values; ThreadsPerCore defaults to 1 so that the divisions
 // based on it are well defined even before the core topology is known.
 struct ProcessorGroup {
   unsigned ID = 0;             // Index of the group.
   unsigned AllThreads = 0;     // Maximum processor count of the group.
   unsigned UsableThreads = 0;  // Processors currently usable in the group.
   unsigned ThreadsPerCore = 1; // Hardware threads sharing one core.
   uint64_t Affinity = 0;       // Bit mask of the usable processors.

   // Number of usable cores, never less than 1. A ThreadsPerCore of 0 is
   // treated as 1 to avoid dividing by zero.
   unsigned useableCores() const {
     return std::max(1U, UsableThreads / std::max(1U, ThreadsPerCore));
   }
 };

 // Calls Fn on every record of kind Relationship reported by
 // GetLogicalProcessorInformationEx.
 //
 // Returns false when the topology could not be read: the size query did not
 // fail with ERROR_INSUFFICIENT_BUFFER, the buffer could not be allocated, or
 // the data query itself failed. Returns true otherwise.
 template <typename F>
 static bool IterateProcInfo(LOGICAL_PROCESSOR_RELATIONSHIP Relationship, F Fn) {
   // Size query: with no buffer the call is expected to fail and to store the
   // required byte count in Len.
   DWORD Len = 0;
   BOOL R = ::GetLogicalProcessorInformationEx(Relationship, NULL, &Len);
   if (R || GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
     return false;
   }
   auto *Info = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)calloc(1, Len);
   if (!Info) {
     return false;
   }
   R = ::GetLogicalProcessorInformationEx(Relationship, Info, &Len);
   if (R) {
     auto *End =
         (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)((uint8_t *)Info + Len);
     // Records are variable-sized; each one states its own length in Size.
     for (auto *Curr = Info; Curr < End;
          Curr = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)((uint8_t *)Curr +
                                                             Curr->Size)) {
       // A zero-sized record would never advance the cursor.
       if (Curr->Size == 0)
         break;
       if (Curr->Relationship != Relationship)
         continue;
       Fn(Curr);
     }
   }
   free(Info);
   // Report a failed data query instead of pretending the walk succeeded.
   return R != FALSE;
 }

 // Returns the processor groups usable by this process. The list is computed
 // on the first call and cached in a function-local static. It is empty when
 // IterateProcInfo reports that the topology could not be read.
 static ArrayRef<ProcessorGroup> getProcessorGroups() {
   auto computeGroups = []() {
     SmallVector<ProcessorGroup, 4> Groups;

     // Appends one ProcessorGroup per active group, numbered in the order in
     // which the groups are reported.
     auto HandleGroup = [&](SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *ProcInfo) {
       GROUP_RELATIONSHIP &El = ProcInfo->Group;
       for (unsigned J = 0; J < El.ActiveGroupCount; ++J) {
         ProcessorGroup G;
         G.ID = Groups.size();
         G.AllThreads = El.GroupInfo[J].MaximumProcessorCount;
         G.UsableThreads = El.GroupInfo[J].ActiveProcessorCount;
         assert(G.UsableThreads <= 64);
         // Start at one thread per core so the field is never read
         // uninitialized if no core record refers to this group.
         G.ThreadsPerCore = 1;
         G.Affinity = El.GroupInfo[J].ActiveProcessorMask;
         Groups.push_back(G);
       }
     };

     if (!IterateProcInfo(RelationGroup, HandleGroup))
       return std::vector<ProcessorGroup>();

     // Records, per group, how many hardware threads share one core.
     auto HandleProc = [&](SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *ProcInfo) {
       PROCESSOR_RELATIONSHIP &El = ProcInfo->Processor;
       assert(El.GroupCount == 1);
       unsigned NumHyperThreads = 1;
       // If the flag is set, each core supports more than one hyper-thread.
       if (El.Flags & LTP_PC_SMT)
         NumHyperThreads = std::bitset<64>(El.GroupMask[0].Mask).count();
       unsigned I = El.GroupMask[0].Group;
       // The group index comes from a separate query; never index past the
       // groups that were actually collected.
       assert(I < Groups.size() && "Core refers to an unknown processor group!");
       if (I < Groups.size())
         Groups[I].ThreadsPerCore = NumHyperThreads;
     };

     if (!IterateProcInfo(RelationProcessorCore, HandleProc))
       return std::vector<ProcessorGroup>();

     // If there's an affinity mask set, assume the user wants to constrain the
     // current process to only a single CPU group. On Windows, it is not
     // possible for affinity masks to cross CPU group boundaries.
     DWORD_PTR ProcessAffinityMask = 0, SystemAffinityMask = 0;
     if (::GetProcessAffinityMask(GetCurrentProcess(), &ProcessAffinityMask,
                                  &SystemAffinityMask) &&
         ProcessAffinityMask != SystemAffinityMask) {
       // We don't expect more than 4 CPU groups on Windows (256 processors).
       USHORT GroupCount = 4;
       USHORT GroupArray[4]{};
       if (::GetProcessGroupAffinity(GetCurrentProcess(), &GroupCount,
                                     GroupArray)) {
         assert(GroupCount == 1 &&
                "On startup, a program is expected to be assigned only to "
                "one processor group!");
         unsigned CurrentGroupID = GroupArray[0];
         // Keep the full group list if the reported group is unknown rather
         // than reading past the end of Groups.
         if (CurrentGroupID < Groups.size()) {
           ProcessorGroup NewG{Groups[CurrentGroupID]};
           NewG.Affinity = ProcessAffinityMask;
           NewG.UsableThreads = countPopulation(ProcessAffinityMask);
           Groups.clear();
           Groups.push_back(NewG);
         }
       }
     }

     return std::vector<ProcessorGroup>(Groups.begin(), Groups.end());
   };
   static auto Groups = computeGroups();
   return ArrayRef<ProcessorGroup>(Groups);
 }

 // Sums P(Element) over every element of Range and returns the total as an
 // unsigned value; an empty range yields 0.
 template <typename R, typename UnaryPredicate>
 static unsigned aggregate(R &&Range, UnaryPredicate P) {
   unsigned Total = 0;
   for (const auto &Element : Range) {
     Total = Total + P(Element);
   }
   return Total;
 }

 // for sys::getHostNumPhysicalCores
 int computeHostNumPhysicalCores() {
   static unsigned Cores =
       aggregate(getProcessorGroups(), [](const ProcessorGroup &G) {
         return G.UsableThreads / G.ThreadsPerCore;
       });
   return Cores;
 }

 int computeHostNumHardwareThreads() {
   static unsigned Threads =
       aggregate(getProcessorGroups(),
                 [](const ProcessorGroup &G) { return G.UsableThreads; });
   return Threads;
 }

 // Picks the CPU socket (processor group) that thread number ThreadPoolNum
 // should run on. Yields 'None' when the thread should stay where it is.
 Optional<unsigned>
 llvm::ThreadPoolStrategy::compute_cpu_socket(unsigned ThreadPoolNum) const {
   ArrayRef<ProcessorGroup> Sockets = getProcessorGroups();

   // With at most one group (single socket, or process affinity was set)
   // there is nowhere else to move the thread.
   const bool SingleSocket = Sockets.size() <= 1;
   if (SingleSocket)
     return None;

   // If the first socket alone can host every requested thread, nothing has
   // to be dispatched to the other sockets.
   const ProcessorGroup &First = Sockets[0];
   unsigned PerSocketCapacity;
   if (UseHyperThreads)
     PerSocketCapacity = First.UsableThreads;
   else
     PerSocketCapacity = First.useableCores();
   if (!(compute_thread_count() > PerSocketCapacity))
     return None;

   assert(ThreadPoolNum < compute_thread_count() &&
          "The thread index is not within thread strategy's range!");

   // Spread the threads evenly; this assumes that every socket offers the
   // same number of hardware threads.
   return (ThreadPoolNum * Sockets.size()) / compute_thread_count();
 }

 // Assign the current thread to a more appropriate CPU socket or CPU group
 void llvm::ThreadPoolStrategy::apply_thread_strategy(
     unsigned ThreadPoolNum) const {
   Optional<unsigned> Socket = compute_cpu_socket(ThreadPoolNum);
   if (!Socket)
     return;
   ArrayRef<ProcessorGroup> Groups = getProcessorGroups();
   GROUP_AFFINITY Affinity{};
   Affinity.Group = Groups[*Socket].ID;
   Affinity.Mask = Groups[*Socket].Affinity;
   SetThreadGroupAffinity(GetCurrentThread(), &Affinity, nullptr);
 }

 // Returns the calling thread's affinity as a bit vector with one bit per
 // hardware thread across all known processor groups (sized by the sum of
 // AllThreads). The bits of the thread's own group start at the offset formed
 // by the AllThreads of every group with a smaller ID. If the affinity cannot
 // be queried, every bit is left clear.
 llvm::BitVector llvm::get_thread_affinity_mask() {
   static unsigned All =
       aggregate(getProcessorGroups(),
                 [](const ProcessorGroup &G) { return G.AllThreads; });

   llvm::BitVector V;
   V.resize(All);

   GROUP_AFFINITY Affinity{};
   if (!GetThreadGroupAffinity(GetCurrentThread(), &Affinity))
     return V;

   unsigned StartOffset =
       aggregate(getProcessorGroups(), [&](const ProcessorGroup &G) {
         return G.ID < Affinity.Group ? G.AllThreads : 0;
       });

   for (unsigned I = 0; I < sizeof(KAFFINITY) * 8; ++I) {
     if (!((Affinity.Mask >> I) & 1))
       continue;
     // Never set a bit outside the vector, e.g. when the group list is empty
     // because the processor topology could not be read.
     unsigned Bit = StartOffset + I;
     if (Bit < All)
       V.set(Bit);
   }
   return V;
 }

 unsigned llvm::get_cpus() { return getProcessorGroups().size(); }
	//===- Windows/Threading.inc - Win32 Threading Implementation - -*- C++ -*-===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file provides the Win32 specific implementation of Threading functions.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/ADT/SmallString.h"
	#include "llvm/ADT/Twine.h"

	#include "llvm/Support/Windows/WindowsSupport.h"
	#include <process.h>

	#include <bitset>

	// Windows will at times define MemoryFence.
	#ifdef MemoryFence
	#undef MemoryFence
	#endif

	// Thread entry point: treats Arg as a pointer to a SyncThreadInfo, invokes
	// its UserFn with its UserData, and always reports 0 as the exit code.
	static unsigned __stdcall threadFuncSync(void *Arg) {
	  // Arg is an untyped pointer; it must be cast to a pointer type before the
	  // members can be reached with '->'.
	  SyncThreadInfo *TI = static_cast<SyncThreadInfo *>(Arg);
	  TI->UserFn(TI->UserData);
	  return 0;
	}

	// Thread entry point: takes ownership of the AsyncThreadInfo passed through
	// Arg, invokes it, and always reports 0 as the thread exit code.
	static unsigned __stdcall threadFuncAsync(void *Arg) {
	  // The unique_ptr deletes the callable when this function returns.
	  std::unique_ptr<AsyncThreadInfo> Owned(static_cast<AsyncThreadInfo *>(Arg));
	  AsyncThreadInfo &Task = *Owned;
	  Task();
	  return 0;
	}

	static void
	llvm_execute_on_thread_impl(unsigned (__stdcall ThreadFunc)(void ), void *Arg,
	llvm::Optional<unsigned> StackSizeInBytes,
	JoiningPolicy JP) {
	HANDLE hThread = (HANDLE)::_beginthreadex(
	NULL, StackSizeInBytes.getValueOr(0), ThreadFunc, Arg, 0, NULL);

	if (!hThread) {
	ReportLastErrorFatal("_beginthreadex failed");
	}

	if (JP == JoiningPolicy::Join) {
	if (::WaitForSingleObject(hThread, INFINITE) == WAIT_FAILED) {
	ReportLastErrorFatal("WaitForSingleObject failed");
	}
	}
	if (::CloseHandle(hThread) == FALSE) {
	ReportLastErrorFatal("CloseHandle failed");
	}
	}

	uint64_t llvm::get_threadid() {
	return uint64_t(::GetCurrentThreadId());
	}

	// Always reports 0 on Windows.
	// NOTE(review): 0 presumably means "no fixed limit" to callers - confirm
	// against the declaration in Threading.h.
	uint32_t llvm::get_max_thread_name_length() {
	  return 0;
	}

	#if defined(_MSC_VER)
	// Raises the 0x406D1388 exception carrying a THREADNAME_INFO record that
	// holds Name and the thread Id. The exception is swallowed by the empty
	// __except handler below, so the call has no other visible effect here.
	static void SetThreadName(DWORD Id, LPCSTR Name) {
	  constexpr DWORD MS_VC_EXCEPTION = 0x406D1388;

	#pragma pack(push, 8)
	  struct THREADNAME_INFO {
	    DWORD dwType;     // Must be 0x1000.
	    LPCSTR szName;    // Pointer to thread name
	    DWORD dwThreadId; // Thread ID (-1 == current thread)
	    DWORD dwFlags;    // Reserved. Do not use.
	  };
	#pragma pack(pop)

	  THREADNAME_INFO Payload = {0x1000, Name, Id, 0};

	  // The record is handed over as an array of ULONG_PTR-sized arguments.
	  const DWORD ArgCount = sizeof(Payload) / sizeof(ULONG_PTR);
	  const ULONG_PTR *Args = reinterpret_cast<const ULONG_PTR *>(&Payload);

	  __try {
	    ::RaiseException(MS_VC_EXCEPTION, 0, ArgCount, Args);
	  } __except (EXCEPTION_EXECUTE_HANDLER) {
	  }
	}
	#endif

	// Names the calling thread via SetThreadName. Compiles to a no-op unless
	// _MSC_VER is defined.
	void llvm::set_thread_name(const Twine &Name) {
	#if defined(_MSC_VER)
	  // Render the Twine into a local buffer so a null-terminated C string can
	  // be handed to SetThreadName.
	  SmallString<64> Buffer;
	  const char *CName = Name.toNullTerminatedStringRef(Buffer).data();
	  const DWORD Self = ::GetCurrentThreadId();
	  SetThreadName(Self, CName);
	#endif
	}

	// Always yields an empty name.
	void llvm::get_thread_name(SmallVectorImpl<char> &Name) {
	  // Windows does not treat a name as an inherent property of a thread:
	  // "setting" one only fires a one-time message that a debugger interprets
	  // as the program naming its thread. A TLS entry written by
	  // set_thread_name could let this function return that value later; until
	  // then there is nothing to report.
	  Name.clear();
	}

	// Switches the calling thread into background processing mode when Priority
	// is ThreadPriority::Background, and out of it for any other value. Returns
	// SUCCESS or FAILURE according to the result of SetThreadPriority.
	SetThreadPriorityResult llvm::set_thread_priority(ThreadPriority Priority) {
	  // https://docs.microsoft.com/en-us/windows/desktop/api/processthreadsapi/nf-processthreadsapi-setthreadpriority
	  // BEGIN: the system lowers the thread's resource scheduling priorities so
	  // that background work does not significantly affect foreground activity.
	  // END: the system restores the priorities the thread had before it
	  // entered background processing mode.
	  int Mode = THREAD_MODE_BACKGROUND_END;
	  if (Priority == ThreadPriority::Background)
	    Mode = THREAD_MODE_BACKGROUND_BEGIN;

	  if (!SetThreadPriority(GetCurrentThread(), Mode))
	    return SetThreadPriorityResult::FAILURE;
	  return SetThreadPriorityResult::SUCCESS;
	}

	// Description of one Windows processor group as used by this file.
	// Every member has a default so a freshly created object never holds
	// indeterminate values; ThreadsPerCore defaults to 1 so that the divisions
	// based on it are well defined even before the core topology is known.
	struct ProcessorGroup {
	  unsigned ID = 0;             // Index of the group.
	  unsigned AllThreads = 0;     // Maximum processor count of the group.
	  unsigned UsableThreads = 0;  // Processors currently usable in the group.
	  unsigned ThreadsPerCore = 1; // Hardware threads sharing one core.
	  uint64_t Affinity = 0;       // Bit mask of the usable processors.

	  // Number of usable cores, never less than 1. A ThreadsPerCore of 0 is
	  // treated as 1 to avoid dividing by zero.
	  unsigned useableCores() const {
	    return std::max(1U, UsableThreads / std::max(1U, ThreadsPerCore));
	  }
	};

	// Calls Fn on every record of kind Relationship reported by
	// GetLogicalProcessorInformationEx.
	//
	// Returns false when the topology could not be read: the size query did not
	// fail with ERROR_INSUFFICIENT_BUFFER, the buffer could not be allocated, or
	// the data query itself failed. Returns true otherwise.
	template <typename F>
	static bool IterateProcInfo(LOGICAL_PROCESSOR_RELATIONSHIP Relationship, F Fn) {
	  // Size query: with no buffer the call is expected to fail and to store the
	  // required byte count in Len.
	  DWORD Len = 0;
	  BOOL R = ::GetLogicalProcessorInformationEx(Relationship, NULL, &Len);
	  if (R || GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
	    return false;
	  }
	  auto *Info = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)calloc(1, Len);
	  if (!Info) {
	    return false;
	  }
	  R = ::GetLogicalProcessorInformationEx(Relationship, Info, &Len);
	  if (R) {
	    auto *End =
	        (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)((uint8_t *)Info + Len);
	    // Records are variable-sized; each one states its own length in Size.
	    for (auto *Curr = Info; Curr < End;
	         Curr = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)((uint8_t *)Curr +
	                                                            Curr->Size)) {
	      // A zero-sized record would never advance the cursor.
	      if (Curr->Size == 0)
	        break;
	      if (Curr->Relationship != Relationship)
	        continue;
	      Fn(Curr);
	    }
	  }
	  free(Info);
	  // Report a failed data query instead of pretending the walk succeeded.
	  return R != FALSE;
	}

	// Returns the processor groups usable by this process. The list is computed
	// on the first call and cached in a function-local static. It is empty when
	// IterateProcInfo reports that the topology could not be read.
	static ArrayRef<ProcessorGroup> getProcessorGroups() {
	  auto computeGroups = []() {
	    SmallVector<ProcessorGroup, 4> Groups;

	    // Appends one ProcessorGroup per active group, numbered in the order in
	    // which the groups are reported.
	    auto HandleGroup = [&](SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *ProcInfo) {
	      GROUP_RELATIONSHIP &El = ProcInfo->Group;
	      for (unsigned J = 0; J < El.ActiveGroupCount; ++J) {
	        ProcessorGroup G;
	        G.ID = Groups.size();
	        G.AllThreads = El.GroupInfo[J].MaximumProcessorCount;
	        G.UsableThreads = El.GroupInfo[J].ActiveProcessorCount;
	        assert(G.UsableThreads <= 64);
	        // Start at one thread per core so the field is never read
	        // uninitialized if no core record refers to this group.
	        G.ThreadsPerCore = 1;
	        G.Affinity = El.GroupInfo[J].ActiveProcessorMask;
	        Groups.push_back(G);
	      }
	    };

	    if (!IterateProcInfo(RelationGroup, HandleGroup))
	      return std::vector<ProcessorGroup>();

	    // Records, per group, how many hardware threads share one core.
	    auto HandleProc = [&](SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *ProcInfo) {
	      PROCESSOR_RELATIONSHIP &El = ProcInfo->Processor;
	      assert(El.GroupCount == 1);
	      unsigned NumHyperThreads = 1;
	      // If the flag is set, each core supports more than one hyper-thread.
	      if (El.Flags & LTP_PC_SMT)
	        NumHyperThreads = std::bitset<64>(El.GroupMask[0].Mask).count();
	      unsigned I = El.GroupMask[0].Group;
	      // The group index comes from a separate query; never index past the
	      // groups that were actually collected.
	      assert(I < Groups.size() && "Core refers to an unknown processor group!");
	      if (I < Groups.size())
	        Groups[I].ThreadsPerCore = NumHyperThreads;
	    };

	    if (!IterateProcInfo(RelationProcessorCore, HandleProc))
	      return std::vector<ProcessorGroup>();

	    // If there's an affinity mask set, assume the user wants to constrain
	    // the current process to only a single CPU group. On Windows, it is not
	    // possible for affinity masks to cross CPU group boundaries.
	    DWORD_PTR ProcessAffinityMask = 0, SystemAffinityMask = 0;
	    if (::GetProcessAffinityMask(GetCurrentProcess(), &ProcessAffinityMask,
	                                 &SystemAffinityMask) &&
	        ProcessAffinityMask != SystemAffinityMask) {
	      // We don't expect more than 4 CPU groups on Windows (256 processors).
	      USHORT GroupCount = 4;
	      USHORT GroupArray[4]{};
	      if (::GetProcessGroupAffinity(GetCurrentProcess(), &GroupCount,
	                                    GroupArray)) {
	        assert(GroupCount == 1 &&
	               "On startup, a program is expected to be assigned only to "
	               "one processor group!");
	        unsigned CurrentGroupID = GroupArray[0];
	        // Keep the full group list if the reported group is unknown rather
	        // than reading past the end of Groups.
	        if (CurrentGroupID < Groups.size()) {
	          ProcessorGroup NewG{Groups[CurrentGroupID]};
	          NewG.Affinity = ProcessAffinityMask;
	          NewG.UsableThreads = countPopulation(ProcessAffinityMask);
	          Groups.clear();
	          Groups.push_back(NewG);
	        }
	      }
	    }

	    return std::vector<ProcessorGroup>(Groups.begin(), Groups.end());
	  };
	  static auto Groups = computeGroups();
	  return ArrayRef<ProcessorGroup>(Groups);
	}

	// Sums P(Element) over every element of Range and returns the total as an
	// unsigned value; an empty range yields 0.
	template <typename R, typename UnaryPredicate>
	static unsigned aggregate(R &&Range, UnaryPredicate P) {
	  unsigned Total = 0;
	  for (const auto &Element : Range) {
	    Total = Total + P(Element);
	  }
	  return Total;
	}

	// for sys::getHostNumPhysicalCores
	int computeHostNumPhysicalCores() {
	static unsigned Cores =
	aggregate(getProcessorGroups(), [](const ProcessorGroup &G) {
	return G.UsableThreads / G.ThreadsPerCore;
	});
	return Cores;
	}

	int computeHostNumHardwareThreads() {
	static unsigned Threads =
	aggregate(getProcessorGroups(),
	[](const ProcessorGroup &G) { return G.UsableThreads; });
	return Threads;
	}

	// Picks the CPU socket (processor group) that thread number ThreadPoolNum
	// should run on. Yields 'None' when the thread should stay where it is.
	Optional<unsigned>
	llvm::ThreadPoolStrategy::compute_cpu_socket(unsigned ThreadPoolNum) const {
	  ArrayRef<ProcessorGroup> Sockets = getProcessorGroups();

	  // With at most one group (single socket, or process affinity was set)
	  // there is nowhere else to move the thread.
	  const bool SingleSocket = Sockets.size() <= 1;
	  if (SingleSocket)
	    return None;

	  // If the first socket alone can host every requested thread, nothing has
	  // to be dispatched to the other sockets.
	  const ProcessorGroup &First = Sockets[0];
	  unsigned PerSocketCapacity;
	  if (UseHyperThreads)
	    PerSocketCapacity = First.UsableThreads;
	  else
	    PerSocketCapacity = First.useableCores();
	  if (!(compute_thread_count() > PerSocketCapacity))
	    return None;

	  assert(ThreadPoolNum < compute_thread_count() &&
	         "The thread index is not within thread strategy's range!");

	  // Spread the threads evenly; this assumes that every socket offers the
	  // same number of hardware threads.
	  return (ThreadPoolNum * Sockets.size()) / compute_thread_count();
	}

	// Assign the current thread to a more appropriate CPU socket or CPU group
	void llvm::ThreadPoolStrategy::apply_thread_strategy(
	unsigned ThreadPoolNum) const {
	Optional<unsigned> Socket = compute_cpu_socket(ThreadPoolNum);
	if (!Socket)
	return;
	ArrayRef<ProcessorGroup> Groups = getProcessorGroups();
	GROUP_AFFINITY Affinity{};
	Affinity.Group = Groups[*Socket].ID;
	Affinity.Mask = Groups[*Socket].Affinity;
	SetThreadGroupAffinity(GetCurrentThread(), &Affinity, nullptr);
	}

	// Returns the calling thread's affinity as a bit vector with one bit per
	// hardware thread across all known processor groups (sized by the sum of
	// AllThreads). The bits of the thread's own group start at the offset
	// formed by the AllThreads of every group with a smaller ID. If the
	// affinity cannot be queried, every bit is left clear.
	llvm::BitVector llvm::get_thread_affinity_mask() {
	  static unsigned All =
	      aggregate(getProcessorGroups(),
	                [](const ProcessorGroup &G) { return G.AllThreads; });

	  llvm::BitVector V;
	  V.resize(All);

	  GROUP_AFFINITY Affinity{};
	  if (!GetThreadGroupAffinity(GetCurrentThread(), &Affinity))
	    return V;

	  unsigned StartOffset =
	      aggregate(getProcessorGroups(), [&](const ProcessorGroup &G) {
	        return G.ID < Affinity.Group ? G.AllThreads : 0;
	      });

	  for (unsigned I = 0; I < sizeof(KAFFINITY) * 8; ++I) {
	    if (!((Affinity.Mask >> I) & 1))
	      continue;
	    // Never set a bit outside the vector, e.g. when the group list is
	    // empty because the processor topology could not be read.
	    unsigned Bit = StartOffset + I;
	    if (Bit < All)
	      V.set(Bit);
	  }
	  return V;
	}

	unsigned llvm::get_cpus() { return getProcessorGroups().size(); }