[clang] [AArch64] Fix Windows va_arg handling for larger structs

Aggregate types over 16 bytes are passed by reference.

Contrary to the x86_64 ABI, smaller structs with an odd (non-power-of-two)
size are padded and passed in registers.

Differential Revision: https://reviews.llvm.org/D100374

GitOrigin-RevId: 3637c5c8ec3d4dc0b87eb4e3ee9c9ae8816cade2
diff --git a/lib/CodeGen/TargetInfo.cpp b/lib/CodeGen/TargetInfo.cpp
index 55e3874..3ff3eed 100644
--- a/lib/CodeGen/TargetInfo.cpp
+++ b/lib/CodeGen/TargetInfo.cpp
@@ -6121,7 +6121,13 @@
 
 Address AArch64ABIInfo::EmitMSVAArg(CodeGenFunction &CGF, Address VAListAddr,
                                     QualType Ty) const {
-  return emitVoidPtrVAArg(CGF, VAListAddr, Ty, /*indirect*/ false,
+  bool IsIndirect = false;
+
+  // Composites larger than 16 bytes are passed by reference.
+  if (isAggregateTypeForABI(Ty) && getContext().getTypeSize(Ty) > 128)
+    IsIndirect = true;
+
+  return emitVoidPtrVAArg(CGF, VAListAddr, Ty, IsIndirect,
                           CGF.getContext().getTypeInfoInChars(Ty),
                           CharUnits::fromQuantity(8),
                           /*allowHigherAlign*/ false);
diff --git a/test/CodeGen/ms_abi_aarch64.c b/test/CodeGen/ms_abi_aarch64.c
index bbb4be9..5268561 100644
--- a/test/CodeGen/ms_abi_aarch64.c
+++ b/test/CodeGen/ms_abi_aarch64.c
@@ -1,5 +1,13 @@
-// RUN: %clang_cc1 -triple aarch64-linux-gnu -emit-llvm < %s | FileCheck -check-prefix=LINUX %s
-// RUN: %clang_cc1 -triple aarch64-pc-win32 -emit-llvm < %s | FileCheck -check-prefix=WIN64 %s
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -emit-llvm < %s | FileCheck -check-prefixes=LINUX,COMMON %s
+// RUN: %clang_cc1 -triple aarch64-pc-win32 -emit-llvm < %s | FileCheck -check-prefixes=WIN64,COMMON %s
+
+struct small_odd {
+  char a, b, c;
+};
+
+struct larger {
+  int a, b, c, d, e;
+};
 
 void __attribute__((ms_abi)) f1(void);
 void f2(void);
@@ -48,6 +56,26 @@
   // WIN64: call void @llvm.va_end
 }
 
+void __attribute__((ms_abi)) f4_2(int a, ...) {
+  // LINUX-LABEL: define{{.*}} win64cc void @f4_2
+  // WIN64-LABEL: define dso_local void @f4_2
+  __builtin_ms_va_list ap;
+  __builtin_ms_va_start(ap, a);
+  // COMMON: %[[AP:.*]] = alloca i8*
+  // COMMON: call void @llvm.va_start
+  struct small_odd s1 = __builtin_va_arg(ap, struct small_odd);
+  // COMMON: %[[AP_CUR:.*]] = load i8*, i8** %[[AP]]
+  // COMMON-NEXT: %[[AP_NEXT:.*]] = getelementptr inbounds i8, i8* %[[AP_CUR]], i64 8
+  // COMMON-NEXT: store i8* %[[AP_NEXT]], i8** %[[AP]]
+  // COMMON-NEXT: bitcast i8* %[[AP_CUR]] to %struct.small_odd*
+  struct larger s2 = __builtin_va_arg(ap, struct larger);
+  // COMMON: %[[AP_CUR2:.*]] = load i8*, i8** %[[AP]]
+  // COMMON-NEXT: %[[AP_NEXT3:.*]] = getelementptr inbounds i8, i8* %[[AP_CUR2]], i64 8
+  // COMMON-NEXT: store i8* %[[AP_NEXT3]], i8** %[[AP]]
+  // COMMON-NEXT: bitcast i8* %[[AP_CUR2]] to %struct.larger**
+  __builtin_ms_va_end(ap);
+}
+
 // Let's verify that normal va_lists work right on Win64, too.
 void f5(int a, ...) {
   // WIN64-LABEL: define dso_local void @f5