| //===---------- X86.cpp - Emit LLVM Code for builtins ---------------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This contains code to emit X86 Builtin calls as LLVM code. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "CGBuiltin.h" |
| #include "clang/Basic/TargetBuiltins.h" |
| #include "llvm/IR/InlineAsm.h" |
| #include "llvm/IR/IntrinsicsX86.h" |
| #include "llvm/TargetParser/X86TargetParser.h" |
| |
| using namespace clang; |
| using namespace CodeGen; |
| using namespace llvm; |
| |
| static std::optional<CodeGenFunction::MSVCIntrin> |
| translateX86ToMsvcIntrin(unsigned BuiltinID) { |
| using MSVCIntrin = CodeGenFunction::MSVCIntrin; |
| switch (BuiltinID) { |
| default: |
| return std::nullopt; |
| case clang::X86::BI_BitScanForward: |
| case clang::X86::BI_BitScanForward64: |
| return MSVCIntrin::_BitScanForward; |
| case clang::X86::BI_BitScanReverse: |
| case clang::X86::BI_BitScanReverse64: |
| return MSVCIntrin::_BitScanReverse; |
| case clang::X86::BI_InterlockedAnd64: |
| return MSVCIntrin::_InterlockedAnd; |
| case clang::X86::BI_InterlockedCompareExchange128: |
| return MSVCIntrin::_InterlockedCompareExchange128; |
| case clang::X86::BI_InterlockedExchange64: |
| return MSVCIntrin::_InterlockedExchange; |
| case clang::X86::BI_InterlockedExchangeAdd64: |
| return MSVCIntrin::_InterlockedExchangeAdd; |
| case clang::X86::BI_InterlockedExchangeSub64: |
| return MSVCIntrin::_InterlockedExchangeSub; |
| case clang::X86::BI_InterlockedOr64: |
| return MSVCIntrin::_InterlockedOr; |
| case clang::X86::BI_InterlockedXor64: |
| return MSVCIntrin::_InterlockedXor; |
| case clang::X86::BI_InterlockedDecrement64: |
| return MSVCIntrin::_InterlockedDecrement; |
| case clang::X86::BI_InterlockedIncrement64: |
| return MSVCIntrin::_InterlockedIncrement; |
| } |
| llvm_unreachable("must return from switch"); |
| } |
| |
| // Convert the mask from an integer type to a vector of i1. |
| static Value *getMaskVecValue(CodeGenFunction &CGF, Value *Mask, |
| unsigned NumElts) { |
| |
| auto *MaskTy = llvm::FixedVectorType::get( |
| CGF.Builder.getInt1Ty(), |
| cast<IntegerType>(Mask->getType())->getBitWidth()); |
| Value *MaskVec = CGF.Builder.CreateBitCast(Mask, MaskTy); |
| |
|   // If we have fewer than 8 elements, then the starting mask was an i8 and |
|   // we need to extract down to the right number of elements. |
| if (NumElts < 8) { |
| int Indices[4]; |
| for (unsigned i = 0; i != NumElts; ++i) |
| Indices[i] = i; |
| MaskVec = CGF.Builder.CreateShuffleVector( |
| MaskVec, MaskVec, ArrayRef(Indices, NumElts), "extract"); |
| } |
| return MaskVec; |
| } |
| |
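| // Emit a masked vector store via the llvm.masked.store intrinsic. Ops is |
| // (pointer, data vector, integer mask). |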
| static Value *EmitX86MaskedStore(CodeGenFunction &CGF, ArrayRef<Value *> Ops, |
| Align Alignment) { |
| Value *Ptr = Ops[0]; |
| |
| Value *MaskVec = getMaskVecValue( |
| CGF, Ops[2], |
| cast<llvm::FixedVectorType>(Ops[1]->getType())->getNumElements()); |
| |
| return CGF.Builder.CreateMaskedStore(Ops[1], Ptr, Alignment, MaskVec); |
| } |
| |
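| // Emit a masked vector load via the llvm.masked.load intrinsic. Ops is |
| // (pointer, passthru vector, integer mask). |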
| static Value *EmitX86MaskedLoad(CodeGenFunction &CGF, ArrayRef<Value *> Ops, |
| Align Alignment) { |
| llvm::Type *Ty = Ops[1]->getType(); |
| Value *Ptr = Ops[0]; |
| |
| Value *MaskVec = getMaskVecValue( |
| CGF, Ops[2], cast<llvm::FixedVectorType>(Ty)->getNumElements()); |
| |
| return CGF.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, MaskVec, Ops[1]); |
| } |
| |
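| // Emit a masked expanding load via llvm.masked.expandload. Ops is |
| // (pointer, passthru vector, integer mask). |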
| static Value *EmitX86ExpandLoad(CodeGenFunction &CGF, |
| ArrayRef<Value *> Ops) { |
| auto *ResultTy = cast<llvm::VectorType>(Ops[1]->getType()); |
| Value *Ptr = Ops[0]; |
| |
| Value *MaskVec = getMaskVecValue( |
| CGF, Ops[2], cast<FixedVectorType>(ResultTy)->getNumElements()); |
| |
| llvm::Function *F = CGF.CGM.getIntrinsic(Intrinsic::masked_expandload, |
| ResultTy); |
| return CGF.Builder.CreateCall(F, { Ptr, MaskVec, Ops[1] }); |
| } |
| |
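| // Emit an in-register AVX-512 compress or expand. Ops is |
| // (source vector, passthru vector, integer mask). |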
| static Value *EmitX86CompressExpand(CodeGenFunction &CGF, |
| ArrayRef<Value *> Ops, |
| bool IsCompress) { |
| auto *ResultTy = cast<llvm::FixedVectorType>(Ops[1]->getType()); |
| |
| Value *MaskVec = getMaskVecValue(CGF, Ops[2], ResultTy->getNumElements()); |
| |
| Intrinsic::ID IID = IsCompress ? Intrinsic::x86_avx512_mask_compress |
| : Intrinsic::x86_avx512_mask_expand; |
| llvm::Function *F = CGF.CGM.getIntrinsic(IID, ResultTy); |
| return CGF.Builder.CreateCall(F, { Ops[0], Ops[1], MaskVec }); |
| } |
| |
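| // Emit a masked compressing store via llvm.masked.compressstore. Ops is |
| // (pointer, data vector, integer mask). |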
| static Value *EmitX86CompressStore(CodeGenFunction &CGF, |
| ArrayRef<Value *> Ops) { |
| auto *ResultTy = cast<llvm::FixedVectorType>(Ops[1]->getType()); |
| Value *Ptr = Ops[0]; |
| |
| Value *MaskVec = getMaskVecValue(CGF, Ops[2], ResultTy->getNumElements()); |
| |
| llvm::Function *F = CGF.CGM.getIntrinsic(Intrinsic::masked_compressstore, |
| ResultTy); |
| return CGF.Builder.CreateCall(F, { Ops[1], Ptr, MaskVec }); |
| } |
| |
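| // Emit a bitwise binary operation on two integer masks by performing it on |
| // <N x i1> vectors, optionally inverting the LHS first, and bitcasting the |
| // result back to the original integer mask type. |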
| static Value *EmitX86MaskLogic(CodeGenFunction &CGF, Instruction::BinaryOps Opc, |
| ArrayRef<Value *> Ops, |
| bool InvertLHS = false) { |
| unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth(); |
| Value *LHS = getMaskVecValue(CGF, Ops[0], NumElts); |
| Value *RHS = getMaskVecValue(CGF, Ops[1], NumElts); |
| |
| if (InvertLHS) |
| LHS = CGF.Builder.CreateNot(LHS); |
| |
| return CGF.Builder.CreateBitCast(CGF.Builder.CreateBinOp(Opc, LHS, RHS), |
| Ops[0]->getType()); |
| } |
| |
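| // Emit a left or right funnel shift via llvm.fshl/llvm.fshr, splatting a |
| // scalar shift amount to the vector type when necessary. |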
| static Value *EmitX86FunnelShift(CodeGenFunction &CGF, Value *Op0, Value *Op1, |
| Value *Amt, bool IsRight) { |
| llvm::Type *Ty = Op0->getType(); |
| |
|   // The shift amount may be a scalar immediate, in which case we create a |
|   // splat vector. Funnel shift amounts are treated as modulo the bit width, |
|   // and the types are all power-of-2 sized, so we only care about the lowest |
|   // log2 bits anyway. |
| if (Amt->getType() != Ty) { |
| unsigned NumElts = cast<llvm::FixedVectorType>(Ty)->getNumElements(); |
| Amt = CGF.Builder.CreateIntCast(Amt, Ty->getScalarType(), false); |
| Amt = CGF.Builder.CreateVectorSplat(NumElts, Amt); |
| } |
| |
| unsigned IID = IsRight ? Intrinsic::fshr : Intrinsic::fshl; |
| Function *F = CGF.CGM.getIntrinsic(IID, Ty); |
| return CGF.Builder.CreateCall(F, {Op0, Op1, Amt}); |
| } |
| |
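| // Lower the XOP vpcom/vpcomu builtins: decode the immediate into an integer |
| // comparison predicate (or constant FALSE/TRUE) and sign-extend the i1 |
| // comparison result to the operand type. |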
| static Value *EmitX86vpcom(CodeGenFunction &CGF, ArrayRef<Value *> Ops, |
| bool IsSigned) { |
| Value *Op0 = Ops[0]; |
| Value *Op1 = Ops[1]; |
| llvm::Type *Ty = Op0->getType(); |
| uint64_t Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x7; |
| |
| CmpInst::Predicate Pred; |
| switch (Imm) { |
| case 0x0: |
| Pred = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; |
| break; |
| case 0x1: |
| Pred = IsSigned ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; |
| break; |
| case 0x2: |
| Pred = IsSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; |
| break; |
| case 0x3: |
| Pred = IsSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; |
| break; |
| case 0x4: |
| Pred = ICmpInst::ICMP_EQ; |
| break; |
| case 0x5: |
| Pred = ICmpInst::ICMP_NE; |
| break; |
| case 0x6: |
| return llvm::Constant::getNullValue(Ty); // FALSE |
| case 0x7: |
| return llvm::Constant::getAllOnesValue(Ty); // TRUE |
| default: |
| llvm_unreachable("Unexpected XOP vpcom/vpcomu predicate"); |
| } |
| |
| Value *Cmp = CGF.Builder.CreateICmp(Pred, Op0, Op1); |
| Value *Res = CGF.Builder.CreateSExt(Cmp, Ty); |
| return Res; |
| } |
| |
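| // Emit a mask-predicated vector select: elements of Op0 where the mask bit |
| // is set, elements of Op1 otherwise. An all-ones mask folds to Op0. |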
| static Value *EmitX86Select(CodeGenFunction &CGF, |
| Value *Mask, Value *Op0, Value *Op1) { |
| |
|   // If the mask is all ones, just return the first argument. |
| if (const auto *C = dyn_cast<Constant>(Mask)) |
| if (C->isAllOnesValue()) |
| return Op0; |
| |
| Mask = getMaskVecValue( |
| CGF, Mask, cast<llvm::FixedVectorType>(Op0->getType())->getNumElements()); |
| |
| return CGF.Builder.CreateSelect(Mask, Op0, Op1); |
| } |
| |
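| // Emit a select of the lowest vector element only, predicated on bit 0 of |
| // the mask; used for the masked scalar (ss/sd/sh) builtins. |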
| static Value *EmitX86ScalarSelect(CodeGenFunction &CGF, |
| Value *Mask, Value *Op0, Value *Op1) { |
|   // If the mask is all ones, just return the first argument. |
| if (const auto *C = dyn_cast<Constant>(Mask)) |
| if (C->isAllOnesValue()) |
| return Op0; |
| |
| auto *MaskTy = llvm::FixedVectorType::get( |
| CGF.Builder.getInt1Ty(), Mask->getType()->getIntegerBitWidth()); |
| Mask = CGF.Builder.CreateBitCast(Mask, MaskTy); |
| Mask = CGF.Builder.CreateExtractElement(Mask, (uint64_t)0); |
| return CGF.Builder.CreateSelect(Mask, Op0, Op1); |
| } |
| |
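| // Convert a vector of i1 comparison results into an integer mask, ANDing in |
| // MaskIn if present and zero-padding the result to a minimum width of 8 bits. |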
| static Value *EmitX86MaskedCompareResult(CodeGenFunction &CGF, Value *Cmp, |
| unsigned NumElts, Value *MaskIn) { |
| if (MaskIn) { |
| const auto *C = dyn_cast<Constant>(MaskIn); |
| if (!C || !C->isAllOnesValue()) |
| Cmp = CGF.Builder.CreateAnd(Cmp, getMaskVecValue(CGF, MaskIn, NumElts)); |
| } |
| |
| if (NumElts < 8) { |
| int Indices[8]; |
| for (unsigned i = 0; i != NumElts; ++i) |
| Indices[i] = i; |
| for (unsigned i = NumElts; i != 8; ++i) |
| Indices[i] = i % NumElts + NumElts; |
| Cmp = CGF.Builder.CreateShuffleVector( |
| Cmp, llvm::Constant::getNullValue(Cmp->getType()), Indices); |
| } |
| |
| return CGF.Builder.CreateBitCast(Cmp, |
| IntegerType::get(CGF.getLLVMContext(), |
| std::max(NumElts, 8U))); |
| } |
| |
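| // Emit an AVX-512 style integer comparison selected by the condition code CC |
| // (0=eq, 1=lt, 2=le, 3=false, 4=ne, 5=ge, 6=gt, 7=true), returning the |
| // result as an integer mask. |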
| static Value *EmitX86MaskedCompare(CodeGenFunction &CGF, unsigned CC, |
| bool Signed, ArrayRef<Value *> Ops) { |
| assert((Ops.size() == 2 || Ops.size() == 4) && |
| "Unexpected number of arguments"); |
| unsigned NumElts = |
| cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(); |
| Value *Cmp; |
| |
| if (CC == 3) { |
| Cmp = Constant::getNullValue( |
| llvm::FixedVectorType::get(CGF.Builder.getInt1Ty(), NumElts)); |
| } else if (CC == 7) { |
| Cmp = Constant::getAllOnesValue( |
| llvm::FixedVectorType::get(CGF.Builder.getInt1Ty(), NumElts)); |
| } else { |
| ICmpInst::Predicate Pred; |
| switch (CC) { |
| default: llvm_unreachable("Unknown condition code"); |
| case 0: Pred = ICmpInst::ICMP_EQ; break; |
| case 1: Pred = Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; break; |
| case 2: Pred = Signed ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; break; |
| case 4: Pred = ICmpInst::ICMP_NE; break; |
| case 5: Pred = Signed ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; break; |
| case 6: Pred = Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; break; |
| } |
| Cmp = CGF.Builder.CreateICmp(Pred, Ops[0], Ops[1]); |
| } |
| |
| Value *MaskIn = nullptr; |
| if (Ops.size() == 4) |
| MaskIn = Ops[3]; |
| |
| return EmitX86MaskedCompareResult(CGF, Cmp, NumElts, MaskIn); |
| } |
| |
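| // Convert a vector to a mask by testing the sign bit of each element |
| // (signed compare against zero). |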
| static Value *EmitX86ConvertToMask(CodeGenFunction &CGF, Value *In) { |
| Value *Zero = Constant::getNullValue(In->getType()); |
| return EmitX86MaskedCompare(CGF, 1, true, { In, Zero }); |
| } |
| |
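| // Emit a masked integer-to-floating-point conversion. A non-default rounding |
| // mode uses the AVX-512 rounding intrinsics; otherwise a plain sitofp/uitofp |
| // is emitted. The result is blended with Ops[1] under the mask in Ops[2]. |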
| static Value *EmitX86ConvertIntToFp(CodeGenFunction &CGF, const CallExpr *E, |
| ArrayRef<Value *> Ops, bool IsSigned) { |
| unsigned Rnd = cast<llvm::ConstantInt>(Ops[3])->getZExtValue(); |
| llvm::Type *Ty = Ops[1]->getType(); |
| |
| Value *Res; |
| if (Rnd != 4) { |
| Intrinsic::ID IID = IsSigned ? Intrinsic::x86_avx512_sitofp_round |
| : Intrinsic::x86_avx512_uitofp_round; |
| Function *F = CGF.CGM.getIntrinsic(IID, { Ty, Ops[0]->getType() }); |
| Res = CGF.Builder.CreateCall(F, { Ops[0], Ops[3] }); |
| } else { |
| CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E); |
| Res = IsSigned ? CGF.Builder.CreateSIToFP(Ops[0], Ty) |
| : CGF.Builder.CreateUIToFP(Ops[0], Ty); |
| } |
| |
| return EmitX86Select(CGF, Ops[2], Res, Ops[1]); |
| } |
| |
| // Lowers X86 FMA intrinsics to IR. |
| static Value *EmitX86FMAExpr(CodeGenFunction &CGF, const CallExpr *E, |
| ArrayRef<Value *> Ops, unsigned BuiltinID, |
| bool IsAddSub) { |
| |
| bool Subtract = false; |
| Intrinsic::ID IID = Intrinsic::not_intrinsic; |
| switch (BuiltinID) { |
| default: break; |
| case clang::X86::BI__builtin_ia32_vfmsubph512_mask3: |
| Subtract = true; |
| [[fallthrough]]; |
| case clang::X86::BI__builtin_ia32_vfmaddph512_mask: |
| case clang::X86::BI__builtin_ia32_vfmaddph512_maskz: |
| case clang::X86::BI__builtin_ia32_vfmaddph512_mask3: |
| IID = Intrinsic::x86_avx512fp16_vfmadd_ph_512; |
| break; |
| case clang::X86::BI__builtin_ia32_vfmsubaddph512_mask3: |
| Subtract = true; |
| [[fallthrough]]; |
| case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask: |
| case clang::X86::BI__builtin_ia32_vfmaddsubph512_maskz: |
| case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask3: |
| IID = Intrinsic::x86_avx512fp16_vfmaddsub_ph_512; |
| break; |
| case clang::X86::BI__builtin_ia32_vfmsubps512_mask3: |
| Subtract = true; |
| [[fallthrough]]; |
| case clang::X86::BI__builtin_ia32_vfmaddps512_mask: |
| case clang::X86::BI__builtin_ia32_vfmaddps512_maskz: |
| case clang::X86::BI__builtin_ia32_vfmaddps512_mask3: |
|     IID = Intrinsic::x86_avx512_vfmadd_ps_512; |
|     break; |
| case clang::X86::BI__builtin_ia32_vfmsubpd512_mask3: |
| Subtract = true; |
| [[fallthrough]]; |
| case clang::X86::BI__builtin_ia32_vfmaddpd512_mask: |
| case clang::X86::BI__builtin_ia32_vfmaddpd512_maskz: |
| case clang::X86::BI__builtin_ia32_vfmaddpd512_mask3: |
|     IID = Intrinsic::x86_avx512_vfmadd_pd_512; |
|     break; |
| case clang::X86::BI__builtin_ia32_vfmsubaddps512_mask3: |
| Subtract = true; |
| [[fallthrough]]; |
| case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask: |
| case clang::X86::BI__builtin_ia32_vfmaddsubps512_maskz: |
| case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask3: |
| IID = Intrinsic::x86_avx512_vfmaddsub_ps_512; |
| break; |
| case clang::X86::BI__builtin_ia32_vfmsubaddpd512_mask3: |
| Subtract = true; |
| [[fallthrough]]; |
| case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask: |
| case clang::X86::BI__builtin_ia32_vfmaddsubpd512_maskz: |
| case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask3: |
| IID = Intrinsic::x86_avx512_vfmaddsub_pd_512; |
| break; |
| } |
| |
| Value *A = Ops[0]; |
| Value *B = Ops[1]; |
| Value *C = Ops[2]; |
| |
| if (Subtract) |
| C = CGF.Builder.CreateFNeg(C); |
| |
| Value *Res; |
| |
|   // If the builtin maps to a target intrinsic and either requests a rounding |
|   // mode other than _MM_FROUND_CUR_DIRECTION/4 or is an add/sub form, call |
|   // that intrinsic; otherwise lower to a generic llvm.fma. |
| if (IID != Intrinsic::not_intrinsic && |
| (cast<llvm::ConstantInt>(Ops.back())->getZExtValue() != (uint64_t)4 || |
| IsAddSub)) { |
| Function *Intr = CGF.CGM.getIntrinsic(IID); |
| Res = CGF.Builder.CreateCall(Intr, {A, B, C, Ops.back() }); |
| } else { |
| llvm::Type *Ty = A->getType(); |
| Function *FMA; |
| if (CGF.Builder.getIsFPConstrained()) { |
| CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E); |
| FMA = CGF.CGM.getIntrinsic(Intrinsic::experimental_constrained_fma, Ty); |
| Res = CGF.Builder.CreateConstrainedFPCall(FMA, {A, B, C}); |
| } else { |
| FMA = CGF.CGM.getIntrinsic(Intrinsic::fma, Ty); |
| Res = CGF.Builder.CreateCall(FMA, {A, B, C}); |
| } |
| } |
| |
| // Handle any required masking. |
| Value *MaskFalseVal = nullptr; |
| switch (BuiltinID) { |
| case clang::X86::BI__builtin_ia32_vfmaddph512_mask: |
| case clang::X86::BI__builtin_ia32_vfmaddps512_mask: |
| case clang::X86::BI__builtin_ia32_vfmaddpd512_mask: |
| case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask: |
| case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask: |
| case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask: |
| MaskFalseVal = Ops[0]; |
| break; |
| case clang::X86::BI__builtin_ia32_vfmaddph512_maskz: |
| case clang::X86::BI__builtin_ia32_vfmaddps512_maskz: |
| case clang::X86::BI__builtin_ia32_vfmaddpd512_maskz: |
| case clang::X86::BI__builtin_ia32_vfmaddsubph512_maskz: |
| case clang::X86::BI__builtin_ia32_vfmaddsubps512_maskz: |
| case clang::X86::BI__builtin_ia32_vfmaddsubpd512_maskz: |
| MaskFalseVal = Constant::getNullValue(Ops[0]->getType()); |
| break; |
| case clang::X86::BI__builtin_ia32_vfmsubph512_mask3: |
| case clang::X86::BI__builtin_ia32_vfmaddph512_mask3: |
| case clang::X86::BI__builtin_ia32_vfmsubps512_mask3: |
| case clang::X86::BI__builtin_ia32_vfmaddps512_mask3: |
| case clang::X86::BI__builtin_ia32_vfmsubpd512_mask3: |
| case clang::X86::BI__builtin_ia32_vfmaddpd512_mask3: |
| case clang::X86::BI__builtin_ia32_vfmsubaddph512_mask3: |
| case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask3: |
| case clang::X86::BI__builtin_ia32_vfmsubaddps512_mask3: |
| case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask3: |
| case clang::X86::BI__builtin_ia32_vfmsubaddpd512_mask3: |
| case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask3: |
| MaskFalseVal = Ops[2]; |
| break; |
| } |
| |
| if (MaskFalseVal) |
| return EmitX86Select(CGF, Ops[3], Res, MaskFalseVal); |
| |
| return Res; |
| } |
| |
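| // Lower the scalar (ss/sd/sh) FMA builtins: operate on element 0 of the |
| // vector operands, apply optional accumulator negation and masking, then |
| // insert the result back into Upper. |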
| static Value *EmitScalarFMAExpr(CodeGenFunction &CGF, const CallExpr *E, |
| MutableArrayRef<Value *> Ops, Value *Upper, |
| bool ZeroMask = false, unsigned PTIdx = 0, |
| bool NegAcc = false) { |
| unsigned Rnd = 4; |
| if (Ops.size() > 4) |
| Rnd = cast<llvm::ConstantInt>(Ops[4])->getZExtValue(); |
| |
| if (NegAcc) |
| Ops[2] = CGF.Builder.CreateFNeg(Ops[2]); |
| |
| Ops[0] = CGF.Builder.CreateExtractElement(Ops[0], (uint64_t)0); |
| Ops[1] = CGF.Builder.CreateExtractElement(Ops[1], (uint64_t)0); |
| Ops[2] = CGF.Builder.CreateExtractElement(Ops[2], (uint64_t)0); |
| Value *Res; |
| if (Rnd != 4) { |
| Intrinsic::ID IID; |
| |
| switch (Ops[0]->getType()->getPrimitiveSizeInBits()) { |
| case 16: |
| IID = Intrinsic::x86_avx512fp16_vfmadd_f16; |
| break; |
| case 32: |
| IID = Intrinsic::x86_avx512_vfmadd_f32; |
| break; |
| case 64: |
| IID = Intrinsic::x86_avx512_vfmadd_f64; |
| break; |
| default: |
| llvm_unreachable("Unexpected size"); |
| } |
| Res = CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IID), |
| {Ops[0], Ops[1], Ops[2], Ops[4]}); |
| } else if (CGF.Builder.getIsFPConstrained()) { |
| CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E); |
| Function *FMA = CGF.CGM.getIntrinsic( |
| Intrinsic::experimental_constrained_fma, Ops[0]->getType()); |
| Res = CGF.Builder.CreateConstrainedFPCall(FMA, Ops.slice(0, 3)); |
| } else { |
| Function *FMA = CGF.CGM.getIntrinsic(Intrinsic::fma, Ops[0]->getType()); |
| Res = CGF.Builder.CreateCall(FMA, Ops.slice(0, 3)); |
| } |
| // If we have more than 3 arguments, we need to do masking. |
| if (Ops.size() > 3) { |
| Value *PassThru = ZeroMask ? Constant::getNullValue(Res->getType()) |
| : Ops[PTIdx]; |
| |
|     // If we negated the accumulator and it is also the PassThru value, we |
|     // need to bypass the negate. Conveniently, Upper should be the same |
|     // thing in this case. |
| if (NegAcc && PTIdx == 2) |
| PassThru = CGF.Builder.CreateExtractElement(Upper, (uint64_t)0); |
| |
| Res = EmitX86ScalarSelect(CGF, Ops[3], Res, PassThru); |
| } |
| return CGF.Builder.CreateInsertElement(Upper, Res, (uint64_t)0); |
| } |
| |
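| // Emit a pmuldq/pmuludq style multiply: reinterpret the vXi32 operands as |
| // vXi64 and multiply the low 32 bits of each 64-bit lane, sign- or |
| // zero-extended as requested. |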
| static Value *EmitX86Muldq(CodeGenFunction &CGF, bool IsSigned, |
| ArrayRef<Value *> Ops) { |
| llvm::Type *Ty = Ops[0]->getType(); |
| // Arguments have a vXi32 type so cast to vXi64. |
| Ty = llvm::FixedVectorType::get(CGF.Int64Ty, |
| Ty->getPrimitiveSizeInBits() / 64); |
| Value *LHS = CGF.Builder.CreateBitCast(Ops[0], Ty); |
| Value *RHS = CGF.Builder.CreateBitCast(Ops[1], Ty); |
| |
| if (IsSigned) { |
| // Shift left then arithmetic shift right. |
| Constant *ShiftAmt = ConstantInt::get(Ty, 32); |
| LHS = CGF.Builder.CreateShl(LHS, ShiftAmt); |
| LHS = CGF.Builder.CreateAShr(LHS, ShiftAmt); |
| RHS = CGF.Builder.CreateShl(RHS, ShiftAmt); |
| RHS = CGF.Builder.CreateAShr(RHS, ShiftAmt); |
| } else { |
| // Clear the upper bits. |
| Constant *Mask = ConstantInt::get(Ty, 0xffffffff); |
| LHS = CGF.Builder.CreateAnd(LHS, Mask); |
| RHS = CGF.Builder.CreateAnd(RHS, Mask); |
| } |
| |
| return CGF.Builder.CreateMul(LHS, RHS); |
| } |
| |
| // Emit a masked pternlog intrinsic. This helper only exists because the |
| // header has to use a macro, and we aren't able to pass the input argument |
| // to both a pternlog builtin and a select builtin without evaluating it |
| // twice. |
| static Value *EmitX86Ternlog(CodeGenFunction &CGF, bool ZeroMask, |
| ArrayRef<Value *> Ops) { |
| llvm::Type *Ty = Ops[0]->getType(); |
| |
| unsigned VecWidth = Ty->getPrimitiveSizeInBits(); |
| unsigned EltWidth = Ty->getScalarSizeInBits(); |
| Intrinsic::ID IID; |
| if (VecWidth == 128 && EltWidth == 32) |
| IID = Intrinsic::x86_avx512_pternlog_d_128; |
| else if (VecWidth == 256 && EltWidth == 32) |
| IID = Intrinsic::x86_avx512_pternlog_d_256; |
| else if (VecWidth == 512 && EltWidth == 32) |
| IID = Intrinsic::x86_avx512_pternlog_d_512; |
| else if (VecWidth == 128 && EltWidth == 64) |
| IID = Intrinsic::x86_avx512_pternlog_q_128; |
| else if (VecWidth == 256 && EltWidth == 64) |
| IID = Intrinsic::x86_avx512_pternlog_q_256; |
| else if (VecWidth == 512 && EltWidth == 64) |
| IID = Intrinsic::x86_avx512_pternlog_q_512; |
| else |
| llvm_unreachable("Unexpected intrinsic"); |
| |
| Value *Ternlog = CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IID), |
| Ops.drop_back()); |
| Value *PassThru = ZeroMask ? ConstantAggregateZero::get(Ty) : Ops[0]; |
| return EmitX86Select(CGF, Ops[4], Ternlog, PassThru); |
| } |
| |
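| // Sign-extend an integer mask into a vector whose elements are all-zeros or |
| // all-ones (vpmovm2* style). |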
| static Value *EmitX86SExtMask(CodeGenFunction &CGF, Value *Op, |
| llvm::Type *DstTy) { |
| unsigned NumberOfElements = |
| cast<llvm::FixedVectorType>(DstTy)->getNumElements(); |
| Value *Mask = getMaskVecValue(CGF, Op, NumberOfElements); |
| return CGF.Builder.CreateSExt(Mask, DstTy, "vpmovm2"); |
| } |
| |
| Value *CodeGenFunction::EmitX86CpuIs(const CallExpr *E) { |
| const Expr *CPUExpr = E->getArg(0)->IgnoreParenCasts(); |
| StringRef CPUStr = cast<clang::StringLiteral>(CPUExpr)->getString(); |
| return EmitX86CpuIs(CPUStr); |
| } |
| |
| // Convert F16 halves to floats. |
| static Value *EmitX86CvtF16ToFloatExpr(CodeGenFunction &CGF, |
| ArrayRef<Value *> Ops, |
| llvm::Type *DstTy) { |
| assert((Ops.size() == 1 || Ops.size() == 3 || Ops.size() == 4) && |
| "Unknown cvtph2ps intrinsic"); |
| |
|   // If the SAE intrinsic doesn't use default rounding, then we can't lower it |
|   // generically; call the target-specific intrinsic instead. |
| if (Ops.size() == 4 && cast<llvm::ConstantInt>(Ops[3])->getZExtValue() != 4) { |
| Function *F = |
| CGF.CGM.getIntrinsic(Intrinsic::x86_avx512_mask_vcvtph2ps_512); |
| return CGF.Builder.CreateCall(F, {Ops[0], Ops[1], Ops[2], Ops[3]}); |
| } |
| |
| unsigned NumDstElts = cast<llvm::FixedVectorType>(DstTy)->getNumElements(); |
| Value *Src = Ops[0]; |
| |
| // Extract the subvector. |
| if (NumDstElts != |
| cast<llvm::FixedVectorType>(Src->getType())->getNumElements()) { |
| assert(NumDstElts == 4 && "Unexpected vector size"); |
| Src = CGF.Builder.CreateShuffleVector(Src, {0, 1, 2, 3}); |
| } |
| |
| // Bitcast from vXi16 to vXf16. |
| auto *HalfTy = llvm::FixedVectorType::get( |
| llvm::Type::getHalfTy(CGF.getLLVMContext()), NumDstElts); |
| Src = CGF.Builder.CreateBitCast(Src, HalfTy); |
| |
| // Perform the fp-extension. |
| Value *Res = CGF.Builder.CreateFPExt(Src, DstTy, "cvtph2ps"); |
| |
| if (Ops.size() >= 3) |
| Res = EmitX86Select(CGF, Ops[2], Res, Ops[1]); |
| return Res; |
| } |
| |
| Value *CodeGenFunction::EmitX86CpuIs(StringRef CPUStr) { |
| |
| llvm::Type *Int32Ty = Builder.getInt32Ty(); |
| |
| // Matching the struct layout from the compiler-rt/libgcc structure that is |
| // filled in: |
| // unsigned int __cpu_vendor; |
| // unsigned int __cpu_type; |
| // unsigned int __cpu_subtype; |
| // unsigned int __cpu_features[1]; |
| llvm::Type *STy = llvm::StructType::get(Int32Ty, Int32Ty, Int32Ty, |
| llvm::ArrayType::get(Int32Ty, 1)); |
| |
| // Grab the global __cpu_model. |
| llvm::Constant *CpuModel = CGM.CreateRuntimeVariable(STy, "__cpu_model"); |
| cast<llvm::GlobalValue>(CpuModel)->setDSOLocal(true); |
| |
| // Calculate the index needed to access the correct field based on the |
| // range. Also adjust the expected value. |
| auto [Index, Value] = StringSwitch<std::pair<unsigned, unsigned>>(CPUStr) |
| #define X86_VENDOR(ENUM, STRING) \ |
| .Case(STRING, {0u, static_cast<unsigned>(llvm::X86::ENUM)}) |
| #define X86_CPU_TYPE_ALIAS(ENUM, ALIAS) \ |
| .Case(ALIAS, {1u, static_cast<unsigned>(llvm::X86::ENUM)}) |
| #define X86_CPU_TYPE(ENUM, STR) \ |
| .Case(STR, {1u, static_cast<unsigned>(llvm::X86::ENUM)}) |
| #define X86_CPU_SUBTYPE_ALIAS(ENUM, ALIAS) \ |
| .Case(ALIAS, {2u, static_cast<unsigned>(llvm::X86::ENUM)}) |
| #define X86_CPU_SUBTYPE(ENUM, STR) \ |
| .Case(STR, {2u, static_cast<unsigned>(llvm::X86::ENUM)}) |
| #include "llvm/TargetParser/X86TargetParser.def" |
| .Default({0, 0}); |
| assert(Value != 0 && "Invalid CPUStr passed to CpuIs"); |
| |
| // Grab the appropriate field from __cpu_model. |
| llvm::Value *Idxs[] = {ConstantInt::get(Int32Ty, 0), |
| ConstantInt::get(Int32Ty, Index)}; |
| llvm::Value *CpuValue = Builder.CreateInBoundsGEP(STy, CpuModel, Idxs); |
| CpuValue = Builder.CreateAlignedLoad(Int32Ty, CpuValue, |
| CharUnits::fromQuantity(4)); |
| |
| // Check the value of the field against the requested value. |
| return Builder.CreateICmpEQ(CpuValue, |
| llvm::ConstantInt::get(Int32Ty, Value)); |
| } |
| |
| Value *CodeGenFunction::EmitX86CpuSupports(const CallExpr *E) { |
| const Expr *FeatureExpr = E->getArg(0)->IgnoreParenCasts(); |
| StringRef FeatureStr = cast<StringLiteral>(FeatureExpr)->getString(); |
| if (!getContext().getTargetInfo().validateCpuSupports(FeatureStr)) |
| return Builder.getFalse(); |
| return EmitX86CpuSupports(FeatureStr); |
| } |
| |
| Value *CodeGenFunction::EmitX86CpuSupports(ArrayRef<StringRef> FeatureStrs) { |
| return EmitX86CpuSupports(llvm::X86::getCpuSupportsMask(FeatureStrs)); |
| } |
| |
| llvm::Value * |
| CodeGenFunction::EmitX86CpuSupports(std::array<uint32_t, 4> FeatureMask) { |
| Value *Result = Builder.getTrue(); |
| if (FeatureMask[0] != 0) { |
| // Matching the struct layout from the compiler-rt/libgcc structure that is |
| // filled in: |
| // unsigned int __cpu_vendor; |
| // unsigned int __cpu_type; |
| // unsigned int __cpu_subtype; |
| // unsigned int __cpu_features[1]; |
| llvm::Type *STy = llvm::StructType::get(Int32Ty, Int32Ty, Int32Ty, |
| llvm::ArrayType::get(Int32Ty, 1)); |
| |
| // Grab the global __cpu_model. |
| llvm::Constant *CpuModel = CGM.CreateRuntimeVariable(STy, "__cpu_model"); |
| cast<llvm::GlobalValue>(CpuModel)->setDSOLocal(true); |
| |
|     // Grab the first (0th) element of the __cpu_features field from the |
|     // __cpu_model global of type STy. |
| Value *Idxs[] = {Builder.getInt32(0), Builder.getInt32(3), |
| Builder.getInt32(0)}; |
| Value *CpuFeatures = Builder.CreateInBoundsGEP(STy, CpuModel, Idxs); |
| Value *Features = Builder.CreateAlignedLoad(Int32Ty, CpuFeatures, |
| CharUnits::fromQuantity(4)); |
| |
| // Check the value of the bit corresponding to the feature requested. |
| Value *Mask = Builder.getInt32(FeatureMask[0]); |
| Value *Bitset = Builder.CreateAnd(Features, Mask); |
| Value *Cmp = Builder.CreateICmpEQ(Bitset, Mask); |
| Result = Builder.CreateAnd(Result, Cmp); |
| } |
| |
| llvm::Type *ATy = llvm::ArrayType::get(Int32Ty, 3); |
| llvm::Constant *CpuFeatures2 = |
| CGM.CreateRuntimeVariable(ATy, "__cpu_features2"); |
| cast<llvm::GlobalValue>(CpuFeatures2)->setDSOLocal(true); |
| for (int i = 1; i != 4; ++i) { |
| const uint32_t M = FeatureMask[i]; |
| if (!M) |
| continue; |
| Value *Idxs[] = {Builder.getInt32(0), Builder.getInt32(i - 1)}; |
| Value *Features = Builder.CreateAlignedLoad( |
| Int32Ty, Builder.CreateInBoundsGEP(ATy, CpuFeatures2, Idxs), |
| CharUnits::fromQuantity(4)); |
| // Check the value of the bit corresponding to the feature requested. |
| Value *Mask = Builder.getInt32(M); |
| Value *Bitset = Builder.CreateAnd(Features, Mask); |
| Value *Cmp = Builder.CreateICmpEQ(Bitset, Mask); |
| Result = Builder.CreateAnd(Result, Cmp); |
| } |
| |
| return Result; |
| } |
| |
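| // Emit a call to the compiler-rt/libgcc __cpu_indicator_init routine, which |
| // populates the __cpu_model and __cpu_features2 globals. |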
| Value *CodeGenFunction::EmitX86CpuInit() { |
| llvm::FunctionType *FTy = llvm::FunctionType::get(VoidTy, |
| /*Variadic*/ false); |
| llvm::FunctionCallee Func = |
| CGM.CreateRuntimeFunction(FTy, "__cpu_indicator_init"); |
| cast<llvm::GlobalValue>(Func.getCallee())->setDSOLocal(true); |
| cast<llvm::GlobalValue>(Func.getCallee()) |
| ->setDLLStorageClass(llvm::GlobalValue::DefaultStorageClass); |
| return Builder.CreateCall(Func); |
| } |
| |
| |
| Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, |
| const CallExpr *E) { |
| if (BuiltinID == Builtin::BI__builtin_cpu_is) |
| return EmitX86CpuIs(E); |
| if (BuiltinID == Builtin::BI__builtin_cpu_supports) |
| return EmitX86CpuSupports(E); |
| if (BuiltinID == Builtin::BI__builtin_cpu_init) |
| return EmitX86CpuInit(); |
| |
| // Handle MSVC intrinsics before argument evaluation to prevent double |
| // evaluation. |
| if (std::optional<MSVCIntrin> MsvcIntId = translateX86ToMsvcIntrin(BuiltinID)) |
| return EmitMSVCBuiltinExpr(*MsvcIntId, E); |
| |
| SmallVector<Value*, 4> Ops; |
| bool IsMaskFCmp = false; |
| bool IsConjFMA = false; |
| |
| // Find out if any arguments are required to be integer constant expressions. |
| unsigned ICEArguments = 0; |
| ASTContext::GetBuiltinTypeError Error; |
| getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments); |
| assert(Error == ASTContext::GE_None && "Should not codegen an error"); |
| |
| for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) { |
| Ops.push_back(EmitScalarOrConstFoldImmArg(ICEArguments, i, E)); |
| } |
| |
|   // These exist so that the builtin that takes an immediate can be bounds |
|   // checked by clang to avoid passing bad immediates to the backend. Since |
|   // AVX has a larger immediate than SSE, we would need separate builtins to |
|   // do the different bounds checking. Rather than create a clang-specific |
|   // SSE-only builtin, this implements eight separate builtins to match the |
|   // gcc implementation. |
| auto getCmpIntrinsicCall = [this, &Ops](Intrinsic::ID ID, unsigned Imm) { |
| Ops.push_back(llvm::ConstantInt::get(Int8Ty, Imm)); |
| llvm::Function *F = CGM.getIntrinsic(ID); |
| return Builder.CreateCall(F, Ops); |
| }; |
| |
| // For the vector forms of FP comparisons, translate the builtins directly to |
| // IR. |
| // TODO: The builtins could be removed if the SSE header files used vector |
| // extension comparisons directly (vector ordered/unordered may need |
| // additional support via __builtin_isnan()). |
| auto getVectorFCmpIR = [this, &Ops, E](CmpInst::Predicate Pred, |
| bool IsSignaling) { |
| CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E); |
| Value *Cmp; |
| if (IsSignaling) |
| Cmp = Builder.CreateFCmpS(Pred, Ops[0], Ops[1]); |
| else |
| Cmp = Builder.CreateFCmp(Pred, Ops[0], Ops[1]); |
| llvm::VectorType *FPVecTy = cast<llvm::VectorType>(Ops[0]->getType()); |
| llvm::VectorType *IntVecTy = llvm::VectorType::getInteger(FPVecTy); |
| Value *Sext = Builder.CreateSExt(Cmp, IntVecTy); |
| return Builder.CreateBitCast(Sext, FPVecTy); |
| }; |
| |
| switch (BuiltinID) { |
| default: return nullptr; |
| case X86::BI_mm_prefetch: { |
| Value *Address = Ops[0]; |
| ConstantInt *C = cast<ConstantInt>(Ops[1]); |
| Value *RW = ConstantInt::get(Int32Ty, (C->getZExtValue() >> 2) & 0x1); |
| Value *Locality = ConstantInt::get(Int32Ty, C->getZExtValue() & 0x3); |
| Value *Data = ConstantInt::get(Int32Ty, 1); |
| Function *F = CGM.getIntrinsic(Intrinsic::prefetch, Address->getType()); |
| return Builder.CreateCall(F, {Address, RW, Locality, Data}); |
| } |
| case X86::BI_mm_clflush: { |
| return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_clflush), |
| Ops[0]); |
| } |
| case X86::BI_mm_lfence: { |
| return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_lfence)); |
| } |
| case X86::BI_mm_mfence: { |
| return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_mfence)); |
| } |
| case X86::BI_mm_sfence: { |
| return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_sfence)); |
| } |
| case X86::BI_mm_pause: { |
| return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_pause)); |
| } |
| case X86::BI__rdtsc: { |
| return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_rdtsc)); |
| } |
| case X86::BI__builtin_ia32_rdtscp: { |
| Value *Call = Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_rdtscp)); |
| Builder.CreateDefaultAlignedStore(Builder.CreateExtractValue(Call, 1), |
| Ops[0]); |
| return Builder.CreateExtractValue(Call, 0); |
| } |
| case X86::BI__builtin_ia32_lzcnt_u16: |
| case X86::BI__builtin_ia32_lzcnt_u32: |
| case X86::BI__builtin_ia32_lzcnt_u64: { |
| Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Ops[0]->getType()); |
| return Builder.CreateCall(F, {Ops[0], Builder.getInt1(false)}); |
| } |
| case X86::BI__builtin_ia32_tzcnt_u16: |
| case X86::BI__builtin_ia32_tzcnt_u32: |
| case X86::BI__builtin_ia32_tzcnt_u64: { |
| Function *F = CGM.getIntrinsic(Intrinsic::cttz, Ops[0]->getType()); |
| return Builder.CreateCall(F, {Ops[0], Builder.getInt1(false)}); |
| } |
| case X86::BI__builtin_ia32_undef128: |
| case X86::BI__builtin_ia32_undef256: |
| case X86::BI__builtin_ia32_undef512: |
| // The x86 definition of "undef" is not the same as the LLVM definition |
| // (PR32176). We leave optimizing away an unnecessary zero constant to the |
| // IR optimizer and backend. |
| // TODO: If we had a "freeze" IR instruction to generate a fixed undef |
| // value, we should use that here instead of a zero. |
| return llvm::Constant::getNullValue(ConvertType(E->getType())); |
| case X86::BI__builtin_ia32_vec_ext_v4hi: |
| case X86::BI__builtin_ia32_vec_ext_v16qi: |
| case X86::BI__builtin_ia32_vec_ext_v8hi: |
| case X86::BI__builtin_ia32_vec_ext_v4si: |
| case X86::BI__builtin_ia32_vec_ext_v4sf: |
| case X86::BI__builtin_ia32_vec_ext_v2di: |
| case X86::BI__builtin_ia32_vec_ext_v32qi: |
| case X86::BI__builtin_ia32_vec_ext_v16hi: |
| case X86::BI__builtin_ia32_vec_ext_v8si: |
| case X86::BI__builtin_ia32_vec_ext_v4di: { |
| unsigned NumElts = |
| cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(); |
| uint64_t Index = cast<ConstantInt>(Ops[1])->getZExtValue(); |
| Index &= NumElts - 1; |
| // These builtins exist so we can ensure the index is an ICE and in range. |
| // Otherwise we could just do this in the header file. |
| return Builder.CreateExtractElement(Ops[0], Index); |
| } |
| case X86::BI__builtin_ia32_vec_set_v4hi: |
| case X86::BI__builtin_ia32_vec_set_v16qi: |
| case X86::BI__builtin_ia32_vec_set_v8hi: |
| case X86::BI__builtin_ia32_vec_set_v4si: |
| case X86::BI__builtin_ia32_vec_set_v2di: |
| case X86::BI__builtin_ia32_vec_set_v32qi: |
| case X86::BI__builtin_ia32_vec_set_v16hi: |
| case X86::BI__builtin_ia32_vec_set_v8si: |
| case X86::BI__builtin_ia32_vec_set_v4di: { |
| unsigned NumElts = |
| cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(); |
| unsigned Index = cast<ConstantInt>(Ops[2])->getZExtValue(); |
| Index &= NumElts - 1; |
| // These builtins exist so we can ensure the index is an ICE and in range. |
| // Otherwise we could just do this in the header file. |
| return Builder.CreateInsertElement(Ops[0], Ops[1], Index); |
| } |
| case X86::BI_mm_setcsr: |
| case X86::BI__builtin_ia32_ldmxcsr: { |
| RawAddress Tmp = CreateMemTemp(E->getArg(0)->getType()); |
| Builder.CreateStore(Ops[0], Tmp); |
| return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_ldmxcsr), |
| Tmp.getPointer()); |
| } |
| case X86::BI_mm_getcsr: |
| case X86::BI__builtin_ia32_stmxcsr: { |
| RawAddress Tmp = CreateMemTemp(E->getType()); |
| Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_stmxcsr), |
| Tmp.getPointer()); |
| return Builder.CreateLoad(Tmp, "stmxcsr"); |
| } |
| case X86::BI__builtin_ia32_xsave: |
| case X86::BI__builtin_ia32_xsave64: |
| case X86::BI__builtin_ia32_xrstor: |
| case X86::BI__builtin_ia32_xrstor64: |
| case X86::BI__builtin_ia32_xsaveopt: |
| case X86::BI__builtin_ia32_xsaveopt64: |
| case X86::BI__builtin_ia32_xrstors: |
| case X86::BI__builtin_ia32_xrstors64: |
| case X86::BI__builtin_ia32_xsavec: |
| case X86::BI__builtin_ia32_xsavec64: |
| case X86::BI__builtin_ia32_xsaves: |
| case X86::BI__builtin_ia32_xsaves64: |
| case X86::BI__builtin_ia32_xsetbv: |
| case X86::BI_xsetbv: { |
| Intrinsic::ID ID; |
| #define INTRINSIC_X86_XSAVE_ID(NAME) \ |
| case X86::BI__builtin_ia32_##NAME: \ |
| ID = Intrinsic::x86_##NAME; \ |
| break |
| switch (BuiltinID) { |
| default: llvm_unreachable("Unsupported intrinsic!"); |
| INTRINSIC_X86_XSAVE_ID(xsave); |
| INTRINSIC_X86_XSAVE_ID(xsave64); |
| INTRINSIC_X86_XSAVE_ID(xrstor); |
| INTRINSIC_X86_XSAVE_ID(xrstor64); |
| INTRINSIC_X86_XSAVE_ID(xsaveopt); |
| INTRINSIC_X86_XSAVE_ID(xsaveopt64); |
| INTRINSIC_X86_XSAVE_ID(xrstors); |
| INTRINSIC_X86_XSAVE_ID(xrstors64); |
| INTRINSIC_X86_XSAVE_ID(xsavec); |
| INTRINSIC_X86_XSAVE_ID(xsavec64); |
| INTRINSIC_X86_XSAVE_ID(xsaves); |
| INTRINSIC_X86_XSAVE_ID(xsaves64); |
| INTRINSIC_X86_XSAVE_ID(xsetbv); |
| case X86::BI_xsetbv: |
| ID = Intrinsic::x86_xsetbv; |
| break; |
| } |
| #undef INTRINSIC_X86_XSAVE_ID |
| Value *Mhi = Builder.CreateTrunc( |
| Builder.CreateLShr(Ops[1], ConstantInt::get(Int64Ty, 32)), Int32Ty); |
| Value *Mlo = Builder.CreateTrunc(Ops[1], Int32Ty); |
| Ops[1] = Mhi; |
| Ops.push_back(Mlo); |
| return Builder.CreateCall(CGM.getIntrinsic(ID), Ops); |
| } |
| case X86::BI__builtin_ia32_xgetbv: |
| case X86::BI_xgetbv: |
| return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_xgetbv), Ops); |
| case X86::BI__builtin_ia32_storedqudi128_mask: |
| case X86::BI__builtin_ia32_storedqusi128_mask: |
| case X86::BI__builtin_ia32_storedquhi128_mask: |
| case X86::BI__builtin_ia32_storedquqi128_mask: |
| case X86::BI__builtin_ia32_storeupd128_mask: |
| case X86::BI__builtin_ia32_storeups128_mask: |
| case X86::BI__builtin_ia32_storedqudi256_mask: |
| case X86::BI__builtin_ia32_storedqusi256_mask: |
| case X86::BI__builtin_ia32_storedquhi256_mask: |
| case X86::BI__builtin_ia32_storedquqi256_mask: |
| case X86::BI__builtin_ia32_storeupd256_mask: |
| case X86::BI__builtin_ia32_storeups256_mask: |
| case X86::BI__builtin_ia32_storedqudi512_mask: |
| case X86::BI__builtin_ia32_storedqusi512_mask: |
| case X86::BI__builtin_ia32_storedquhi512_mask: |
| case X86::BI__builtin_ia32_storedquqi512_mask: |
| case X86::BI__builtin_ia32_storeupd512_mask: |
| case X86::BI__builtin_ia32_storeups512_mask: |
| return EmitX86MaskedStore(*this, Ops, Align(1)); |
| |
| case X86::BI__builtin_ia32_storesbf16128_mask: |
| case X86::BI__builtin_ia32_storesh128_mask: |
| case X86::BI__builtin_ia32_storess128_mask: |
| case X86::BI__builtin_ia32_storesd128_mask: |
| return EmitX86MaskedStore(*this, Ops, Align(1)); |
| |
| case X86::BI__builtin_ia32_cvtmask2b128: |
| case X86::BI__builtin_ia32_cvtmask2b256: |
| case X86::BI__builtin_ia32_cvtmask2b512: |
| case X86::BI__builtin_ia32_cvtmask2w128: |
| case X86::BI__builtin_ia32_cvtmask2w256: |
| case X86::BI__builtin_ia32_cvtmask2w512: |
| case X86::BI__builtin_ia32_cvtmask2d128: |
| case X86::BI__builtin_ia32_cvtmask2d256: |
| case X86::BI__builtin_ia32_cvtmask2d512: |
| case X86::BI__builtin_ia32_cvtmask2q128: |
| case X86::BI__builtin_ia32_cvtmask2q256: |
| case X86::BI__builtin_ia32_cvtmask2q512: |
| return EmitX86SExtMask(*this, Ops[0], ConvertType(E->getType())); |
| |
| case X86::BI__builtin_ia32_cvtb2mask128: |
| case X86::BI__builtin_ia32_cvtb2mask256: |
| case X86::BI__builtin_ia32_cvtb2mask512: |
| case X86::BI__builtin_ia32_cvtw2mask128: |
| case X86::BI__builtin_ia32_cvtw2mask256: |
| case X86::BI__builtin_ia32_cvtw2mask512: |
| case X86::BI__builtin_ia32_cvtd2mask128: |
| case X86::BI__builtin_ia32_cvtd2mask256: |
| case X86::BI__builtin_ia32_cvtd2mask512: |
| case X86::BI__builtin_ia32_cvtq2mask128: |
| case X86::BI__builtin_ia32_cvtq2mask256: |
| case X86::BI__builtin_ia32_cvtq2mask512: |
| return EmitX86ConvertToMask(*this, Ops[0]); |
| |
| case X86::BI__builtin_ia32_cvtdq2ps512_mask: |
| case X86::BI__builtin_ia32_cvtqq2ps512_mask: |
| case X86::BI__builtin_ia32_cvtqq2pd512_mask: |
| case X86::BI__builtin_ia32_vcvtw2ph512_mask: |
| case X86::BI__builtin_ia32_vcvtdq2ph512_mask: |
| case X86::BI__builtin_ia32_vcvtqq2ph512_mask: |
| return EmitX86ConvertIntToFp(*this, E, Ops, /*IsSigned*/ true); |
| case X86::BI__builtin_ia32_cvtudq2ps512_mask: |
| case X86::BI__builtin_ia32_cvtuqq2ps512_mask: |
| case X86::BI__builtin_ia32_cvtuqq2pd512_mask: |
| case X86::BI__builtin_ia32_vcvtuw2ph512_mask: |
| case X86::BI__builtin_ia32_vcvtudq2ph512_mask: |
| case X86::BI__builtin_ia32_vcvtuqq2ph512_mask: |
| return EmitX86ConvertIntToFp(*this, E, Ops, /*IsSigned*/ false); |
| |
| case X86::BI__builtin_ia32_vfmaddss3: |
| case X86::BI__builtin_ia32_vfmaddsd3: |
| case X86::BI__builtin_ia32_vfmaddsh3_mask: |
| case X86::BI__builtin_ia32_vfmaddss3_mask: |
| case X86::BI__builtin_ia32_vfmaddsd3_mask: |
| return EmitScalarFMAExpr(*this, E, Ops, Ops[0]); |
| case X86::BI__builtin_ia32_vfmaddss: |
| case X86::BI__builtin_ia32_vfmaddsd: |
| return EmitScalarFMAExpr(*this, E, Ops, |
| Constant::getNullValue(Ops[0]->getType())); |
| case X86::BI__builtin_ia32_vfmaddsh3_maskz: |
| case X86::BI__builtin_ia32_vfmaddss3_maskz: |
| case X86::BI__builtin_ia32_vfmaddsd3_maskz: |
| return EmitScalarFMAExpr(*this, E, Ops, Ops[0], /*ZeroMask*/ true); |
| case X86::BI__builtin_ia32_vfmaddsh3_mask3: |
| case X86::BI__builtin_ia32_vfmaddss3_mask3: |
| case X86::BI__builtin_ia32_vfmaddsd3_mask3: |
| return EmitScalarFMAExpr(*this, E, Ops, Ops[2], /*ZeroMask*/ false, 2); |
| case X86::BI__builtin_ia32_vfmsubsh3_mask3: |
| case X86::BI__builtin_ia32_vfmsubss3_mask3: |
| case X86::BI__builtin_ia32_vfmsubsd3_mask3: |
| return EmitScalarFMAExpr(*this, E, Ops, Ops[2], /*ZeroMask*/ false, 2, |
| /*NegAcc*/ true); |
| case X86::BI__builtin_ia32_vfmaddph: |
| case X86::BI__builtin_ia32_vfmaddps: |
| case X86::BI__builtin_ia32_vfmaddpd: |
| case X86::BI__builtin_ia32_vfmaddph256: |
| case X86::BI__builtin_ia32_vfmaddps256: |
| case X86::BI__builtin_ia32_vfmaddpd256: |
| case X86::BI__builtin_ia32_vfmaddph512_mask: |
| case X86::BI__builtin_ia32_vfmaddph512_maskz: |
| case X86::BI__builtin_ia32_vfmaddph512_mask3: |
| case X86::BI__builtin_ia32_vfmaddbf16128: |
| case X86::BI__builtin_ia32_vfmaddbf16256: |
| case X86::BI__builtin_ia32_vfmaddbf16512: |
| case X86::BI__builtin_ia32_vfmaddps512_mask: |
| case X86::BI__builtin_ia32_vfmaddps512_maskz: |
| case X86::BI__builtin_ia32_vfmaddps512_mask3: |
| case X86::BI__builtin_ia32_vfmsubps512_mask3: |
| case X86::BI__builtin_ia32_vfmaddpd512_mask: |
| case X86::BI__builtin_ia32_vfmaddpd512_maskz: |
| case X86::BI__builtin_ia32_vfmaddpd512_mask3: |
| case X86::BI__builtin_ia32_vfmsubpd512_mask3: |
| case X86::BI__builtin_ia32_vfmsubph512_mask3: |
| return EmitX86FMAExpr(*this, E, Ops, BuiltinID, /*IsAddSub*/ false); |
| case X86::BI__builtin_ia32_vfmaddsubph512_mask: |
| case X86::BI__builtin_ia32_vfmaddsubph512_maskz: |
| case X86::BI__builtin_ia32_vfmaddsubph512_mask3: |
| case X86::BI__builtin_ia32_vfmsubaddph512_mask3: |
| case X86::BI__builtin_ia32_vfmaddsubps512_mask: |
| case X86::BI__builtin_ia32_vfmaddsubps512_maskz: |
| case X86::BI__builtin_ia32_vfmaddsubps512_mask3: |
| case X86::BI__builtin_ia32_vfmsubaddps512_mask3: |
| case X86::BI__builtin_ia32_vfmaddsubpd512_mask: |
| case X86::BI__builtin_ia32_vfmaddsubpd512_maskz: |
| case X86::BI__builtin_ia32_vfmaddsubpd512_mask3: |
| case X86::BI__builtin_ia32_vfmsubaddpd512_mask3: |
| return EmitX86FMAExpr(*this, E, Ops, BuiltinID, /*IsAddSub*/ true); |
| |
| case X86::BI__builtin_ia32_movdqa32store128_mask: |
| case X86::BI__builtin_ia32_movdqa64store128_mask: |
| case X86::BI__builtin_ia32_storeaps128_mask: |
| case X86::BI__builtin_ia32_storeapd128_mask: |
| case X86::BI__builtin_ia32_movdqa32store256_mask: |
| case X86::BI__builtin_ia32_movdqa64store256_mask: |
| case X86::BI__builtin_ia32_storeaps256_mask: |
| case X86::BI__builtin_ia32_storeapd256_mask: |
| case X86::BI__builtin_ia32_movdqa32store512_mask: |
| case X86::BI__builtin_ia32_movdqa64store512_mask: |
| case X86::BI__builtin_ia32_storeaps512_mask: |
| case X86::BI__builtin_ia32_storeapd512_mask: |
| return EmitX86MaskedStore( |
| *this, Ops, |
| getContext().getTypeAlignInChars(E->getArg(1)->getType()).getAsAlign()); |
| |
| case X86::BI__builtin_ia32_loadups128_mask: |
| case X86::BI__builtin_ia32_loadups256_mask: |
| case X86::BI__builtin_ia32_loadups512_mask: |
| case X86::BI__builtin_ia32_loadupd128_mask: |
| case X86::BI__builtin_ia32_loadupd256_mask: |
| case X86::BI__builtin_ia32_loadupd512_mask: |
| case X86::BI__builtin_ia32_loaddquqi128_mask: |
| case X86::BI__builtin_ia32_loaddquqi256_mask: |
| case X86::BI__builtin_ia32_loaddquqi512_mask: |
| case X86::BI__builtin_ia32_loaddquhi128_mask: |
| case X86::BI__builtin_ia32_loaddquhi256_mask: |
| case X86::BI__builtin_ia32_loaddquhi512_mask: |
| case X86::BI__builtin_ia32_loaddqusi128_mask: |
| case X86::BI__builtin_ia32_loaddqusi256_mask: |
| case X86::BI__builtin_ia32_loaddqusi512_mask: |
| case X86::BI__builtin_ia32_loaddqudi128_mask: |
| case X86::BI__builtin_ia32_loaddqudi256_mask: |
| case X86::BI__builtin_ia32_loaddqudi512_mask: |
| return EmitX86MaskedLoad(*this, Ops, Align(1)); |
| |
| case X86::BI__builtin_ia32_loadsbf16128_mask: |
| case X86::BI__builtin_ia32_loadsh128_mask: |
| case X86::BI__builtin_ia32_loadss128_mask: |
| case X86::BI__builtin_ia32_loadsd128_mask: |
| return EmitX86MaskedLoad(*this, Ops, Align(1)); |
| |
| case X86::BI__builtin_ia32_loadaps128_mask: |
| case X86::BI__builtin_ia32_loadaps256_mask: |
| case X86::BI__builtin_ia32_loadaps512_mask: |
| case X86::BI__builtin_ia32_loadapd128_mask: |
| case X86::BI__builtin_ia32_loadapd256_mask: |
| case X86::BI__builtin_ia32_loadapd512_mask: |
| case X86::BI__builtin_ia32_movdqa32load128_mask: |
| case X86::BI__builtin_ia32_movdqa32load256_mask: |
| case X86::BI__builtin_ia32_movdqa32load512_mask: |
| case X86::BI__builtin_ia32_movdqa64load128_mask: |
| case X86::BI__builtin_ia32_movdqa64load256_mask: |
| case X86::BI__builtin_ia32_movdqa64load512_mask: |
| return EmitX86MaskedLoad( |
| *this, Ops, |
| getContext().getTypeAlignInChars(E->getArg(1)->getType()).getAsAlign()); |
| |
| case X86::BI__builtin_ia32_expandloaddf128_mask: |
| case X86::BI__builtin_ia32_expandloaddf256_mask: |
| case X86::BI__builtin_ia32_expandloaddf512_mask: |
| case X86::BI__builtin_ia32_expandloadsf128_mask: |
| case X86::BI__builtin_ia32_expandloadsf256_mask: |
| case X86::BI__builtin_ia32_expandloadsf512_mask: |
| case X86::BI__builtin_ia32_expandloaddi128_mask: |
| case X86::BI__builtin_ia32_expandloaddi256_mask: |
| case X86::BI__builtin_ia32_expandloaddi512_mask: |
| case X86::BI__builtin_ia32_expandloadsi128_mask: |
| case X86::BI__builtin_ia32_expandloadsi256_mask: |
| case X86::BI__builtin_ia32_expandloadsi512_mask: |
| case X86::BI__builtin_ia32_expandloadhi128_mask: |
| case X86::BI__builtin_ia32_expandloadhi256_mask: |
| case X86::BI__builtin_ia32_expandloadhi512_mask: |
| case X86::BI__builtin_ia32_expandloadqi128_mask: |
| case X86::BI__builtin_ia32_expandloadqi256_mask: |
| case X86::BI__builtin_ia32_expandloadqi512_mask: |
| return EmitX86ExpandLoad(*this, Ops); |
| |
| case X86::BI__builtin_ia32_compressstoredf128_mask: |
| case X86::BI__builtin_ia32_compressstoredf256_mask: |
| case X86::BI__builtin_ia32_compressstoredf512_mask: |
| case X86::BI__builtin_ia32_compressstoresf128_mask: |
| case X86::BI__builtin_ia32_compressstoresf256_mask: |
| case X86::BI__builtin_ia32_compressstoresf512_mask: |
| case X86::BI__builtin_ia32_compressstoredi128_mask: |
| case X86::BI__builtin_ia32_compressstoredi256_mask: |
| case X86::BI__builtin_ia32_compressstoredi512_mask: |
| case X86::BI__builtin_ia32_compressstoresi128_mask: |
| case X86::BI__builtin_ia32_compressstoresi256_mask: |
| case X86::BI__builtin_ia32_compressstoresi512_mask: |
| case X86::BI__builtin_ia32_compressstorehi128_mask: |
| case X86::BI__builtin_ia32_compressstorehi256_mask: |
| case X86::BI__builtin_ia32_compressstorehi512_mask: |
| case X86::BI__builtin_ia32_compressstoreqi128_mask: |
| case X86::BI__builtin_ia32_compressstoreqi256_mask: |
| case X86::BI__builtin_ia32_compressstoreqi512_mask: |
| return EmitX86CompressStore(*this, Ops); |
| |
| case X86::BI__builtin_ia32_expanddf128_mask: |
| case X86::BI__builtin_ia32_expanddf256_mask: |
| case X86::BI__builtin_ia32_expanddf512_mask: |
| case X86::BI__builtin_ia32_expandsf128_mask: |
| case X86::BI__builtin_ia32_expandsf256_mask: |
| case X86::BI__builtin_ia32_expandsf512_mask: |
| case X86::BI__builtin_ia32_expanddi128_mask: |
| case X86::BI__builtin_ia32_expanddi256_mask: |
| case X86::BI__builtin_ia32_expanddi512_mask: |
| case X86::BI__builtin_ia32_expandsi128_mask: |
| case X86::BI__builtin_ia32_expandsi256_mask: |
| case X86::BI__builtin_ia32_expandsi512_mask: |
| case X86::BI__builtin_ia32_expandhi128_mask: |
| case X86::BI__builtin_ia32_expandhi256_mask: |
| case X86::BI__builtin_ia32_expandhi512_mask: |
| case X86::BI__builtin_ia32_expandqi128_mask: |
| case X86::BI__builtin_ia32_expandqi256_mask: |
| case X86::BI__builtin_ia32_expandqi512_mask: |
| return EmitX86CompressExpand(*this, Ops, /*IsCompress*/false); |
| |
| case X86::BI__builtin_ia32_compressdf128_mask: |
| case X86::BI__builtin_ia32_compressdf256_mask: |
| case X86::BI__builtin_ia32_compressdf512_mask: |
| case X86::BI__builtin_ia32_compresssf128_mask: |
| case X86::BI__builtin_ia32_compresssf256_mask: |
| case X86::BI__builtin_ia32_compresssf512_mask: |
| case X86::BI__builtin_ia32_compressdi128_mask: |
| case X86::BI__builtin_ia32_compressdi256_mask: |
| case X86::BI__builtin_ia32_compressdi512_mask: |
| case X86::BI__builtin_ia32_compresssi128_mask: |
| case X86::BI__builtin_ia32_compresssi256_mask: |
| case X86::BI__builtin_ia32_compresssi512_mask: |
| case X86::BI__builtin_ia32_compresshi128_mask: |
| case X86::BI__builtin_ia32_compresshi256_mask: |
| case X86::BI__builtin_ia32_compresshi512_mask: |
| case X86::BI__builtin_ia32_compressqi128_mask: |
| case X86::BI__builtin_ia32_compressqi256_mask: |
| case X86::BI__builtin_ia32_compressqi512_mask: |
| return EmitX86CompressExpand(*this, Ops, /*IsCompress*/true); |
| |
| case X86::BI__builtin_ia32_gather3div2df: |
| case X86::BI__builtin_ia32_gather3div2di: |
| case X86::BI__builtin_ia32_gather3div4df: |
| case X86::BI__builtin_ia32_gather3div4di: |
| case X86::BI__builtin_ia32_gather3div4sf: |
| case X86::BI__builtin_ia32_gather3div4si: |
| case X86::BI__builtin_ia32_gather3div8sf: |
| case X86::BI__builtin_ia32_gather3div8si: |
| case X86::BI__builtin_ia32_gather3siv2df: |
| case X86::BI__builtin_ia32_gather3siv2di: |
| case X86::BI__builtin_ia32_gather3siv4df: |
| case X86::BI__builtin_ia32_gather3siv4di: |
| case X86::BI__builtin_ia32_gather3siv4sf: |
| case X86::BI__builtin_ia32_gather3siv4si: |
| case X86::BI__builtin_ia32_gather3siv8sf: |
| case X86::BI__builtin_ia32_gather3siv8si: |
| case X86::BI__builtin_ia32_gathersiv8df: |
| case X86::BI__builtin_ia32_gathersiv16sf: |
| case X86::BI__builtin_ia32_gatherdiv8df: |
| case X86::BI__builtin_ia32_gatherdiv16sf: |
| case X86::BI__builtin_ia32_gathersiv8di: |
| case X86::BI__builtin_ia32_gathersiv16si: |
| case X86::BI__builtin_ia32_gatherdiv8di: |
| case X86::BI__builtin_ia32_gatherdiv16si: { |
| Intrinsic::ID IID; |
| switch (BuiltinID) { |
| default: llvm_unreachable("Unexpected builtin"); |
| case X86::BI__builtin_ia32_gather3div2df: |
| IID = Intrinsic::x86_avx512_mask_gather3div2_df; |
| break; |
| case X86::BI__builtin_ia32_gather3div2di: |
| IID = Intrinsic::x86_avx512_mask_gather3div2_di; |
| break; |
| case X86::BI__builtin_ia32_gather3div4df: |
| IID = Intrinsic::x86_avx512_mask_gather3div4_df; |
| break; |
| case X86::BI__builtin_ia32_gather3div4di: |
| IID = Intrinsic::x86_avx512_mask_gather3div4_di; |
| break; |
| case X86::BI__builtin_ia32_gather3div4sf: |
| IID = Intrinsic::x86_avx512_mask_gather3div4_sf; |
| break; |
| case X86::BI__builtin_ia32_gather3div4si: |
| IID = Intrinsic::x86_avx512_mask_gather3div4_si; |
| break; |
| case X86::BI__builtin_ia32_gather3div8sf: |
| IID = Intrinsic::x86_avx512_mask_gather3div8_sf; |
| break; |
| case X86::BI__builtin_ia32_gather3div8si: |
| IID = Intrinsic::x86_avx512_mask_gather3div8_si; |
| break; |
| case X86::BI__builtin_ia32_gather3siv2df: |
| IID = Intrinsic::x86_avx512_mask_gather3siv2_df; |
| break; |
| case X86::BI__builtin_ia32_gather3siv2di: |
| IID = Intrinsic::x86_avx512_mask_gather3siv2_di; |
| break; |
| case X86::BI__builtin_ia32_gather3siv4df: |
| IID = Intrinsic::x86_avx512_mask_gather3siv4_df; |
| break; |
| case X86::BI__builtin_ia32_gather3siv4di: |
| IID = Intrinsic::x86_avx512_mask_gather3siv4_di; |
| break; |
| case X86::BI__builtin_ia32_gather3siv4sf: |
| IID = Intrinsic::x86_avx512_mask_gather3siv4_sf; |
| break; |
| case X86::BI__builtin_ia32_gather3siv4si: |
| IID = Intrinsic::x86_avx512_mask_gather3siv4_si; |
| break; |
| case X86::BI__builtin_ia32_gather3siv8sf: |
| IID = Intrinsic::x86_avx512_mask_gather3siv8_sf; |
| break; |
| case X86::BI__builtin_ia32_gather3siv8si: |
| IID = Intrinsic::x86_avx512_mask_gather3siv8_si; |
| break; |
| case X86::BI__builtin_ia32_gathersiv8df: |
| IID = Intrinsic::x86_avx512_mask_gather_dpd_512; |
| break; |
| case X86::BI__builtin_ia32_gathersiv16sf: |
| IID = Intrinsic::x86_avx512_mask_gather_dps_512; |
| break; |
| case X86::BI__builtin_ia32_gatherdiv8df: |
| IID = Intrinsic::x86_avx512_mask_gather_qpd_512; |
| break; |
| case X86::BI__builtin_ia32_gatherdiv16sf: |
| IID = Intrinsic::x86_avx512_mask_gather_qps_512; |
| break; |
| case X86::BI__builtin_ia32_gathersiv8di: |
| IID = Intrinsic::x86_avx512_mask_gather_dpq_512; |
| break; |
| case X86::BI__builtin_ia32_gathersiv16si: |
| IID = Intrinsic::x86_avx512_mask_gather_dpi_512; |
| break; |
| case X86::BI__builtin_ia32_gatherdiv8di: |
| IID = Intrinsic::x86_avx512_mask_gather_qpq_512; |
| break; |
| case X86::BI__builtin_ia32_gatherdiv16si: |
| IID = Intrinsic::x86_avx512_mask_gather_qpi_512; |
| break; |
| } |
| |
| unsigned MinElts = std::min( |
| cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(), |
| cast<llvm::FixedVectorType>(Ops[2]->getType())->getNumElements()); |
| Ops[3] = getMaskVecValue(*this, Ops[3], MinElts); |
| Function *Intr = CGM.getIntrinsic(IID); |
| return Builder.CreateCall(Intr, Ops); |
| } |
| |
| case X86::BI__builtin_ia32_scattersiv8df: |
| case X86::BI__builtin_ia32_scattersiv16sf: |
| case X86::BI__builtin_ia32_scatterdiv8df: |
| case X86::BI__builtin_ia32_scatterdiv16sf: |
| case X86::BI__builtin_ia32_scattersiv8di: |
| case X86::BI__builtin_ia32_scattersiv16si: |
| case X86::BI__builtin_ia32_scatterdiv8di: |
| case X86::BI__builtin_ia32_scatterdiv16si: |
| case X86::BI__builtin_ia32_scatterdiv2df: |
| case X86::BI__builtin_ia32_scatterdiv2di: |
| case X86::BI__builtin_ia32_scatterdiv4df: |
| case X86::BI__builtin_ia32_scatterdiv4di: |
| case X86::BI__builtin_ia32_scatterdiv4sf: |
| case X86::BI__builtin_ia32_scatterdiv4si: |
| case X86::BI__builtin_ia32_scatterdiv8sf: |
| case X86::BI__builtin_ia32_scatterdiv8si: |
| case X86::BI__builtin_ia32_scattersiv2df: |
| case X86::BI__builtin_ia32_scattersiv2di: |
| case X86::BI__builtin_ia32_scattersiv4df: |
| case X86::BI__builtin_ia32_scattersiv4di: |
| case X86::BI__builtin_ia32_scattersiv4sf: |
| case X86::BI__builtin_ia32_scattersiv4si: |
| case X86::BI__builtin_ia32_scattersiv8sf: |
| case X86::BI__builtin_ia32_scattersiv8si: { |
| Intrinsic::ID IID; |
| switch (BuiltinID) { |
| default: llvm_unreachable("Unexpected builtin"); |
| case X86::BI__builtin_ia32_scattersiv8df: |
| IID = Intrinsic::x86_avx512_mask_scatter_dpd_512; |
| break; |
| case X86::BI__builtin_ia32_scattersiv16sf: |
| IID = Intrinsic::x86_avx512_mask_scatter_dps_512; |
| break; |
| case X86::BI__builtin_ia32_scatterdiv8df: |
| IID = Intrinsic::x86_avx512_mask_scatter_qpd_512; |
| break; |
| case X86::BI__builtin_ia32_scatterdiv16sf: |
| IID = Intrinsic::x86_avx512_mask_scatter_qps_512; |
| break; |
| case X86::BI__builtin_ia32_scattersiv8di: |
| IID = Intrinsic::x86_avx512_mask_scatter_dpq_512; |
| break; |
| case X86::BI__builtin_ia32_scattersiv16si: |
| IID = Intrinsic::x86_avx512_mask_scatter_dpi_512; |
| break; |
| case X86::BI__builtin_ia32_scatterdiv8di: |
| IID = Intrinsic::x86_avx512_mask_scatter_qpq_512; |
| break; |
| case X86::BI__builtin_ia32_scatterdiv16si: |
| IID = Intrinsic::x86_avx512_mask_scatter_qpi_512; |
| break; |
| case X86::BI__builtin_ia32_scatterdiv2df: |
| IID = Intrinsic::x86_avx512_mask_scatterdiv2_df; |
| break; |
| case X86::BI__builtin_ia32_scatterdiv2di: |
| IID = Intrinsic::x86_avx512_mask_scatterdiv2_di; |
| break; |
| case X86::BI__builtin_ia32_scatterdiv4df: |
| IID = Intrinsic::x86_avx512_mask_scatterdiv4_df; |
| break; |
| case X86::BI__builtin_ia32_scatterdiv4di: |
| IID = Intrinsic::x86_avx512_mask_scatterdiv4_di; |
| break; |
| case X86::BI__builtin_ia32_scatterdiv4sf: |
| IID = Intrinsic::x86_avx512_mask_scatterdiv4_sf; |
| break; |
| case X86::BI__builtin_ia32_scatterdiv4si: |
| IID = Intrinsic::x86_avx512_mask_scatterdiv4_si; |
| break; |
| case X86::BI__builtin_ia32_scatterdiv8sf: |
| IID = Intrinsic::x86_avx512_mask_scatterdiv8_sf; |
| break; |
| case X86::BI__builtin_ia32_scatterdiv8si: |
| IID = Intrinsic::x86_avx512_mask_scatterdiv8_si; |
| break; |
| case X86::BI__builtin_ia32_scattersiv2df: |
| IID = Intrinsic::x86_avx512_mask_scattersiv2_df; |
| break; |
| case X86::BI__builtin_ia32_scattersiv2di: |
| IID = Intrinsic::x86_avx512_mask_scattersiv2_di; |
| break; |
| case X86::BI__builtin_ia32_scattersiv4df: |
| IID = Intrinsic::x86_avx512_mask_scattersiv4_df; |
| break; |
| case X86::BI__builtin_ia32_scattersiv4di: |
| IID = Intrinsic::x86_avx512_mask_scattersiv4_di; |
| break; |
| case X86::BI__builtin_ia32_scattersiv4sf: |
| IID = Intrinsic::x86_avx512_mask_scattersiv4_sf; |
| break; |
| case X86::BI__builtin_ia32_scattersiv4si: |
| IID = Intrinsic::x86_avx512_mask_scattersiv4_si; |
| break; |
| case X86::BI__builtin_ia32_scattersiv8sf: |
| IID = Intrinsic::x86_avx512_mask_scattersiv8_sf; |
| break; |
| case X86::BI__builtin_ia32_scattersiv8si: |
| IID = Intrinsic::x86_avx512_mask_scattersiv8_si; |
| break; |
| } |
| |
| unsigned MinElts = std::min( |
| cast<llvm::FixedVectorType>(Ops[2]->getType())->getNumElements(), |
| cast<llvm::FixedVectorType>(Ops[3]->getType())->getNumElements()); |
| Ops[1] = getMaskVecValue(*this, Ops[1], MinElts); |
| Function *Intr = CGM.getIntrinsic(IID); |
| return Builder.CreateCall(Intr, Ops); |
| } |
| |
| case X86::BI__builtin_ia32_vextractf128_pd256: |
| case X86::BI__builtin_ia32_vextractf128_ps256: |
| case X86::BI__builtin_ia32_vextractf128_si256: |
| case X86::BI__builtin_ia32_extract128i256: |
| case X86::BI__builtin_ia32_extractf64x4_mask: |
| case X86::BI__builtin_ia32_extractf32x4_mask: |
| case X86::BI__builtin_ia32_extracti64x4_mask: |
| case X86::BI__builtin_ia32_extracti32x4_mask: |
| case X86::BI__builtin_ia32_extractf32x8_mask: |
| case X86::BI__builtin_ia32_extracti32x8_mask: |
| case X86::BI__builtin_ia32_extractf32x4_256_mask: |
| case X86::BI__builtin_ia32_extracti32x4_256_mask: |
| case X86::BI__builtin_ia32_extractf64x2_256_mask: |
| case X86::BI__builtin_ia32_extracti64x2_256_mask: |
| case X86::BI__builtin_ia32_extractf64x2_512_mask: |
| case X86::BI__builtin_ia32_extracti64x2_512_mask: { |
| auto *DstTy = cast<llvm::FixedVectorType>(ConvertType(E->getType())); |
| unsigned NumElts = DstTy->getNumElements(); |
| unsigned SrcNumElts = |
| cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(); |
| unsigned SubVectors = SrcNumElts / NumElts; |
| unsigned Index = cast<ConstantInt>(Ops[1])->getZExtValue(); |
| assert(llvm::isPowerOf2_32(SubVectors) && "Expected power of 2 subvectors"); |
| Index &= SubVectors - 1; // Remove any extra bits. |
| Index *= NumElts; |
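| // For example, extracting the third 128-bit subvector (immediate == 2) from |
| // a v16f32 source selects elements <8, 9, 10, 11>. |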
| |
| int Indices[16]; |
| for (unsigned i = 0; i != NumElts; ++i) |
| Indices[i] = i + Index; |
| |
| Value *Res = Builder.CreateShuffleVector(Ops[0], ArrayRef(Indices, NumElts), |
| "extract"); |
| |
| if (Ops.size() == 4) |
| Res = EmitX86Select(*this, Ops[3], Res, Ops[2]); |
| |
| return Res; |
| } |
| case X86::BI__builtin_ia32_vinsertf128_pd256: |
| case X86::BI__builtin_ia32_vinsertf128_ps256: |
| case X86::BI__builtin_ia32_vinsertf128_si256: |
| case X86::BI__builtin_ia32_insert128i256: |
| case X86::BI__builtin_ia32_insertf64x4: |
| case X86::BI__builtin_ia32_insertf32x4: |
| case X86::BI__builtin_ia32_inserti64x4: |
| case X86::BI__builtin_ia32_inserti32x4: |
| case X86::BI__builtin_ia32_insertf32x8: |
| case X86::BI__builtin_ia32_inserti32x8: |
| case X86::BI__builtin_ia32_insertf32x4_256: |
| case X86::BI__builtin_ia32_inserti32x4_256: |
| case X86::BI__builtin_ia32_insertf64x2_256: |
| case X86::BI__builtin_ia32_inserti64x2_256: |
| case X86::BI__builtin_ia32_insertf64x2_512: |
| case X86::BI__builtin_ia32_inserti64x2_512: { |
| unsigned DstNumElts = |
| cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(); |
| unsigned SrcNumElts = |
| cast<llvm::FixedVectorType>(Ops[1]->getType())->getNumElements(); |
| unsigned SubVectors = DstNumElts / SrcNumElts; |
| unsigned Index = cast<ConstantInt>(Ops[2])->getZExtValue(); |
| assert(llvm::isPowerOf2_32(SubVectors) && "Expected power of 2 subvectors"); |
| Index &= SubVectors - 1; // Remove any extra bits. |
| Index *= SrcNumElts; |
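| // For example, inserting a 128-bit subvector into a v16f32 destination with |
| // an immediate of 1 overwrites elements <4, 5, 6, 7>: the first shuffle |
| // widens the source to the destination width, the second blends it in. |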
| |
| int Indices[16]; |
| for (unsigned i = 0; i != DstNumElts; ++i) |
| Indices[i] = (i >= SrcNumElts) ? SrcNumElts + (i % SrcNumElts) : i; |
| |
| Value *Op1 = Builder.CreateShuffleVector( |
| Ops[1], ArrayRef(Indices, DstNumElts), "widen"); |
| |
| for (unsigned i = 0; i != DstNumElts; ++i) { |
| if (i >= Index && i < (Index + SrcNumElts)) |
| Indices[i] = (i - Index) + DstNumElts; |
| else |
| Indices[i] = i; |
| } |
| |
| return Builder.CreateShuffleVector(Ops[0], Op1, |
| ArrayRef(Indices, DstNumElts), "insert"); |
| } |
| case X86::BI__builtin_ia32_pmovqd512_mask: |
| case X86::BI__builtin_ia32_pmovwb512_mask: { |
| Value *Res = Builder.CreateTrunc(Ops[0], Ops[1]->getType()); |
| return EmitX86Select(*this, Ops[2], Res, Ops[1]); |
| } |
| case X86::BI__builtin_ia32_pmovdb512_mask: |
| case X86::BI__builtin_ia32_pmovdw512_mask: |
| case X86::BI__builtin_ia32_pmovqw512_mask: { |
| if (const auto *C = dyn_cast<Constant>(Ops[2])) |
| if (C->isAllOnesValue()) |
| return Builder.CreateTrunc(Ops[0], Ops[1]->getType()); |
| |
| Intrinsic::ID IID; |
| switch (BuiltinID) { |
| default: llvm_unreachable("Unsupported intrinsic!"); |
| case X86::BI__builtin_ia32_pmovdb512_mask: |
| IID = Intrinsic::x86_avx512_mask_pmov_db_512; |
| break; |
| case X86::BI__builtin_ia32_pmovdw512_mask: |
| IID = Intrinsic::x86_avx512_mask_pmov_dw_512; |
| break; |
| case X86::BI__builtin_ia32_pmovqw512_mask: |
| IID = Intrinsic::x86_avx512_mask_pmov_qw_512; |
| break; |
| } |
| |
| Function *Intr = CGM.getIntrinsic(IID); |
| return Builder.CreateCall(Intr, Ops); |
| } |
| case X86::BI__builtin_ia32_pblendw128: |
| case X86::BI__builtin_ia32_blendpd: |
| case X86::BI__builtin_ia32_blendps: |
| case X86::BI__builtin_ia32_blendpd256: |
| case X86::BI__builtin_ia32_blendps256: |
| case X86::BI__builtin_ia32_pblendw256: |
| case X86::BI__builtin_ia32_pblendd128: |
| case X86::BI__builtin_ia32_pblendd256: { |
| unsigned NumElts = |
| cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(); |
| unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue(); |
| |
| int Indices[16]; |
| // If there are more than 8 elements, the immediate is used twice so make |
| // sure we handle that. |
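| // For example, blendps with Imm == 0b0101 produces the mask <4, 1, 6, 3>, |
| // taking elements 0 and 2 from Ops[1] and elements 1 and 3 from Ops[0]. |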
| for (unsigned i = 0; i != NumElts; ++i) |
| Indices[i] = ((Imm >> (i % 8)) & 0x1) ? NumElts + i : i; |
| |
| return Builder.CreateShuffleVector(Ops[0], Ops[1], |
| ArrayRef(Indices, NumElts), "blend"); |
| } |
| case X86::BI__builtin_ia32_pshuflw: |
| case X86::BI__builtin_ia32_pshuflw256: |
| case X86::BI__builtin_ia32_pshuflw512: { |
| uint32_t Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue(); |
| auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType()); |
| unsigned NumElts = Ty->getNumElements(); |
| |
| // Splat the 8 bits of the immediate 4 times so the loop can wrap around. |
| Imm = (Imm & 0xff) * 0x01010101; |
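| // For example, Imm == 0x1B reverses the low four words of each 128-bit lane, |
| // giving per-lane indices <3, 2, 1, 0, 4, 5, 6, 7>. |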
| |
| int Indices[32]; |
| for (unsigned l = 0; l != NumElts; l += 8) { |
| for (unsigned i = 0; i != 4; ++i) { |
| Indices[l + i] = l + (Imm & 3); |
| Imm >>= 2; |
| } |
| for (unsigned i = 4; i != 8; ++i) |
| Indices[l + i] = l + i; |
| } |
| |
| return Builder.CreateShuffleVector(Ops[0], ArrayRef(Indices, NumElts), |
| "pshuflw"); |
| } |
| case X86::BI__builtin_ia32_pshufhw: |
| case X86::BI__builtin_ia32_pshufhw256: |
| case X86::BI__builtin_ia32_pshufhw512: { |
| uint32_t Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue(); |
| auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType()); |
| unsigned NumElts = Ty->getNumElements(); |
| |
| // Splat the 8 bits of the immediate 4 times so the loop can wrap around. |
| Imm = (Imm & 0xff) * 0x01010101; |
| |
| int Indices[32]; |
| for (unsigned l = 0; l != NumElts; l += 8) { |
| for (unsigned i = 0; i != 4; ++i) |
| Indices[l + i] = l + i; |
| for (unsigned i = 4; i != 8; ++i) { |
| Indices[l + i] = l + 4 + (Imm & 3); |
| Imm >>= 2; |
| } |
| } |
| |
| return Builder.CreateShuffleVector(Ops[0], ArrayRef(Indices, NumElts), |
| "pshufhw"); |
| } |
| case X86::BI__builtin_ia32_pshufd: |
| case X86::BI__builtin_ia32_pshufd256: |
| case X86::BI__builtin_ia32_pshufd512: |
| case X86::BI__builtin_ia32_vpermilpd: |
| case X86::BI__builtin_ia32_vpermilps: |
| case X86::BI__builtin_ia32_vpermilpd256: |
| case X86::BI__builtin_ia32_vpermilps256: |
| case X86::BI__builtin_ia32_vpermilpd512: |
| case X86::BI__builtin_ia32_vpermilps512: { |
| uint32_t Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue(); |
| auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType()); |
| unsigned NumElts = Ty->getNumElements(); |
| unsigned NumLanes = Ty->getPrimitiveSizeInBits() / 128; |
| unsigned NumLaneElts = NumElts / NumLanes; |
| |
| // Splat the 8 bits of the immediate 4 times so the loop can wrap around. |
| Imm = (Imm & 0xff) * 0x01010101; |
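| // For example, pshufd with Imm == 0x1B reverses each 128-bit lane, giving |
| // per-lane indices <3, 2, 1, 0>. |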
| |
| int Indices[16]; |
| for (unsigned l = 0; l != NumElts; l += NumLaneElts) { |
| for (unsigned i = 0; i != NumLaneElts; ++i) { |
| Indices[i + l] = (Imm % NumLaneElts) + l; |
| Imm /= NumLaneElts; |
| } |
| } |
| |
| return Builder.CreateShuffleVector(Ops[0], ArrayRef(Indices, NumElts), |
| "permil"); |
| } |
| case X86::BI__builtin_ia32_shufpd: |
| case X86::BI__builtin_ia32_shufpd256: |
| case X86::BI__builtin_ia32_shufpd512: |
| case X86::BI__builtin_ia32_shufps: |
| case X86::BI__builtin_ia32_shufps256: |
| case X86::BI__builtin_ia32_shufps512: { |
| uint32_t Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue(); |
| auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType()); |
| unsigned NumElts = Ty->getNumElements(); |
| unsigned NumLanes = Ty->getPrimitiveSizeInBits() / 128; |
| unsigned NumLaneElts = NumElts / NumLanes; |
| |
| // Splat the 8 bits of the immediate 4 times so the loop can wrap around. |
| Imm = (Imm & 0xff) * 0x01010101; |
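| // For example, shufps with Imm == 0x4E produces the mask <2, 3, 4, 5>: the |
| // low half of each result lane comes from Ops[0], the high half from Ops[1]. |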
| |
| int Indices[16]; |
| for (unsigned l = 0; l != NumElts; l += NumLaneElts) { |
| for (unsigned i = 0; i != NumLaneElts; ++i) { |
| unsigned Index = Imm % NumLaneElts; |
| Imm /= NumLaneElts; |
| if (i >= (NumLaneElts / 2)) |
| Index += NumElts; |
| Indices[l + i] = l + Index; |
| } |
| } |
| |
| return Builder.CreateShuffleVector(Ops[0], Ops[1], |
| ArrayRef(Indices, NumElts), "shufp"); |
| } |
| case X86::BI__builtin_ia32_permdi256: |
| case X86::BI__builtin_ia32_permdf256: |
| case X86::BI__builtin_ia32_permdi512: |
| case X86::BI__builtin_ia32_permdf512: { |
| unsigned Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue(); |
| auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType()); |
| unsigned NumElts = Ty->getNumElements(); |
| |
| // These intrinsics operate on 256-bit lanes of four 64-bit elements. |
| int Indices[8]; |
| for (unsigned l = 0; l != NumElts; l += 4) |
| for (unsigned i = 0; i != 4; ++i) |
| Indices[l + i] = l + ((Imm >> (2 * i)) & 0x3); |
| |
| return Builder.CreateShuffleVector(Ops[0], ArrayRef(Indices, NumElts), |
| "perm"); |
| } |
| case X86::BI__builtin_ia32_palignr128: |
| case X86::BI__builtin_ia32_palignr256: |
| case X86::BI__builtin_ia32_palignr512: { |
| unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0xff; |
| |
| unsigned NumElts = |
| cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(); |
| assert(NumElts % 16 == 0); |
| |
| // If palignr is shifting the pair of vectors more than the size of two |
| // lanes, emit zero. |
| if (ShiftVal >= 32) |
| return llvm::Constant::getNullValue(ConvertType(E->getType())); |
| |
| // If palignr is shifting the pair of input vectors more than one lane, |
| // but less than two lanes, convert to shifting in zeroes. |
| if (ShiftVal > 16) { |
| ShiftVal -= 16; |
| Ops[1] = Ops[0]; |
| Ops[0] = llvm::Constant::getNullValue(Ops[0]->getType()); |
| } |
| |
| int Indices[64]; |
| // 256/512-bit palignr operates on 128-bit lanes so we need to handle that |
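| // For the 128-bit case with ShiftVal == 4, the result is |
| // { Ops[1][4..15], Ops[0][0..3] }, i.e. the concatenation Ops[0]:Ops[1] |
| // shifted right by 4 bytes. |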
| for (unsigned l = 0; l != NumElts; l += 16) { |
| for (unsigned i = 0; i != 16; ++i) { |
| unsigned Idx = ShiftVal + i; |
| if (Idx >= 16) |
| Idx += NumElts - 16; // End of lane, switch operand. |
| Indices[l + i] = Idx + l; |
| } |
| } |
| |
| return Builder.CreateShuffleVector(Ops[1], Ops[0], |
| ArrayRef(Indices, NumElts), "palignr"); |
| } |
| case X86::BI__builtin_ia32_alignd128: |
| case X86::BI__builtin_ia32_alignd256: |
| case X86::BI__builtin_ia32_alignd512: |
| case X86::BI__builtin_ia32_alignq128: |
| case X86::BI__builtin_ia32_alignq256: |
| case X86::BI__builtin_ia32_alignq512: { |
| unsigned NumElts = |
| cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(); |
| unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0xff; |
| |
| // Mask the shift amount to the width of the vector. |
| ShiftVal &= NumElts - 1; |
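| // For example, a v4i32 valignd with ShiftVal == 1 uses the mask <1, 2, 3, 4>: |
| // elements 1..3 of Ops[1] followed by element 0 of Ops[0]. |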
| |
| int Indices[16]; |
| for (unsigned i = 0; i != NumElts; ++i) |
| Indices[i] = i + ShiftVal; |
| |
| return Builder.CreateShuffleVector(Ops[1], Ops[0], |
| ArrayRef(Indices, NumElts), "valign"); |
| } |
| case X86::BI__builtin_ia32_shuf_f32x4_256: |
| case X86::BI__builtin_ia32_shuf_f64x2_256: |
| case X86::BI__builtin_ia32_shuf_i32x4_256: |
| case X86::BI__builtin_ia32_shuf_i64x2_256: |
| case X86::BI__builtin_ia32_shuf_f32x4: |
| case X86::BI__builtin_ia32_shuf_f64x2: |
| case X86::BI__builtin_ia32_shuf_i32x4: |
| case X86::BI__builtin_ia32_shuf_i64x2: { |
| unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue(); |
| auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType()); |
| unsigned NumElts = Ty->getNumElements(); |
| unsigned NumLanes = Ty->getPrimitiveSizeInBits() == 512 ? 4 : 2; |
| unsigned NumLaneElts = NumElts / NumLanes; |
| |
| int Indices[16]; |
| for (unsigned l = 0; l != NumElts; l += NumLaneElts) { |
| unsigned Index = (Imm % NumLanes) * NumLaneElts; |
| Imm /= NumLanes; // Discard the bits we just used. |
| if (l >= (NumElts / 2)) |
| Index += NumElts; // Switch to other source. |
| for (unsigned i = 0; i != NumLaneElts; ++i) { |
| Indices[l + i] = Index + i; |
| } |
| } |
| |
| return Builder.CreateShuffleVector(Ops[0], Ops[1], |
| ArrayRef(Indices, NumElts), "shuf"); |
| } |
| |
| case X86::BI__builtin_ia32_vperm2f128_pd256: |
| case X86::BI__builtin_ia32_vperm2f128_ps256: |
| case X86::BI__builtin_ia32_vperm2f128_si256: |
| case X86::BI__builtin_ia32_permti256: { |
| unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue(); |
| unsigned NumElts = |
| cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(); |
| |
| // This takes a very simple approach since there are two lanes and a shuffle |
| // can have two inputs: reserve the first input for the first lane and the |
| // second input for the second lane. This may result in duplicate sources, |
| // but the backend can deal with that. |
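| // For example, Imm == 0x21 selects the high half of Ops[0] for the low |
| // result lane and the low half of Ops[1] for the high result lane. |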
| |
| Value *OutOps[2]; |
| int Indices[8]; |
| for (unsigned l = 0; l != 2; ++l) { |
| // Determine the source for this lane. |
| if (Imm & (1 << ((l * 4) + 3))) |
| OutOps[l] = llvm::ConstantAggregateZero::get(Ops[0]->getType()); |
| else if (Imm & (1 << ((l * 4) + 1))) |
| OutOps[l] = Ops[1]; |
| else |
| OutOps[l] = Ops[0]; |
| |
| for (unsigned i = 0; i != NumElts/2; ++i) { |
| // Start with the ith element of the source for this lane. |
| unsigned Idx = (l * NumElts) + i; |
| // If bit 0 of the immediate half is set, switch to the high half of |
| // the source. |
| if (Imm & (1 << (l * 4))) |
| Idx += NumElts/2; |
| Indices[(l * (NumElts/2)) + i] = Idx; |
| } |
| } |
| |
| return Builder.CreateShuffleVector(OutOps[0], OutOps[1], |
| ArrayRef(Indices, NumElts), "vperm"); |
| } |
| |
| case X86::BI__builtin_ia32_pslldqi128_byteshift: |
| case X86::BI__builtin_ia32_pslldqi256_byteshift: |
| case X86::BI__builtin_ia32_pslldqi512_byteshift: { |
| unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff; |
| auto *ResultType = cast<llvm::FixedVectorType>(Ops[0]->getType()); |
| // The builtin type is vXi64, so multiply by 8 to get the byte count. |
| unsigned NumElts = ResultType->getNumElements() * 8; |
| |
| // If pslldq is shifting the vector more than 15 bytes, emit zero. |
| if (ShiftVal >= 16) |
| return llvm::Constant::getNullValue(ResultType); |
| |
| int Indices[64]; |
| // 256/512-bit pslldq operates on 128-bit lanes so we need to handle that |
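| // For example, with ShiftVal == 4 each 128-bit lane of the result is four |
| // zero bytes followed by the low 12 bytes of the corresponding source lane. |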
| for (unsigned l = 0; l != NumElts; l += 16) { |
| for (unsigned i = 0; i != 16; ++i) { |
| unsigned Idx = NumElts + i - ShiftVal; |
| if (Idx < NumElts) Idx -= NumElts - 16; // end of lane, switch operand. |
| Indices[l + i] = Idx + l; |
| } |
| } |
| |
| auto *VecTy = llvm::FixedVectorType::get(Int8Ty, NumElts); |
| Value *Cast = Builder.CreateBitCast(Ops[0], VecTy, "cast"); |
| Value *Zero = llvm::Constant::getNullValue(VecTy); |
| Value *SV = Builder.CreateShuffleVector( |
| Zero, Cast, ArrayRef(Indices, NumElts), "pslldq"); |
| return Builder.CreateBitCast(SV, Ops[0]->getType(), "cast"); |
| } |
| case X86::BI__builtin_ia32_psrldqi128_byteshift: |
| case X86::BI__builtin_ia32_psrldqi256_byteshift: |
| case X86::BI__builtin_ia32_psrldqi512_byteshift: { |
| unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff; |
| auto *ResultType = cast<llvm::FixedVectorType>(Ops[0]->getType()); |
| // The builtin type is vXi64, so multiply by 8 to get the byte count. |
| unsigned NumElts = ResultType->getNumElements() * 8; |
| |
| // If psrldq is shifting the vector more than 15 bytes, emit zero. |
| if (ShiftVal >= 16) |
| return llvm::Constant::getNullValue(ResultType); |
| |
| int Indices[64]; |
| // 256/512-bit psrldq operates on 128-bit lanes so we need to handle that |
| for (unsigned l = 0; l != NumElts; l += 16) { |
| for (unsigned i = 0; i != 16; ++i) { |
| unsigned Idx = i + ShiftVal; |
| if (Idx >= 16) Idx += NumElts - 16; // end of lane, switch operand. |
| Indices[l + i] = Idx + l; |
| } |
| } |
| |
| auto *VecTy = llvm::FixedVectorType::get(Int8Ty, NumElts); |
| Value *Cast = Builder.CreateBitCast(Ops[0], VecTy, "cast"); |
| Value *Zero = llvm::Constant::getNullValue(VecTy); |
| Value *SV = Builder.CreateShuffleVector( |
| Cast, Zero, ArrayRef(Indices, NumElts), "psrldq"); |
| return Builder.CreateBitCast(SV, ResultType, "cast"); |
| } |
| case X86::BI__builtin_ia32_kshiftliqi: |
| case X86::BI__builtin_ia32_kshiftlihi: |
| case X86::BI__builtin_ia32_kshiftlisi: |
| case X86::BI__builtin_ia32_kshiftlidi: { |
| unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff; |
| unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth(); |
| |
| if (ShiftVal >= NumElts) |
| return llvm::Constant::getNullValue(Ops[0]->getType()); |
| |
| Value *In = getMaskVecValue(*this, Ops[0], NumElts); |
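| // Shift the mask elements up by ShiftVal, pulling in zeros at the low end: |
| // result element i is In[i - ShiftVal] for i >= ShiftVal and 0 otherwise. |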
| |
| int Indices[64]; |
| for (unsigned i = 0; i != NumElts; ++i) |
| Indices[i] = NumElts + i - ShiftVal; |
| |
| Value *Zero = llvm::Constant::getNullValue(In->getType()); |
| Value *SV = Builder.CreateShuffleVector( |
| Zero, In, ArrayRef(Indices, NumElts), "kshiftl"); |
| return Builder.CreateBitCast(SV, Ops[0]->getType()); |
| } |
| case X86::BI__builtin_ia32_kshiftriqi: |
| case X86::BI__builtin_ia32_kshiftrihi: |
| case X86::BI__builtin_ia32_kshiftrisi: |
| case X86::BI__builtin_ia32_kshiftridi: { |
| unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff; |
| unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth(); |
| |
| if (ShiftVal >= NumElts) |
| return llvm::Constant::getNullValue(Ops[0]->getType()); |
| |
| Value *In = getMaskVecValue(*this, Ops[0], NumElts); |
| |
| int Indices[64]; |
| for (unsigned i = 0; i != NumElts; ++i) |
| Indices[i] = i + ShiftVal; |
| |
| Value *Zero = llvm::Constant::getNullValue(In->getType()); |
| Value *SV = Builder.CreateShuffleVector( |
| In, Zero, ArrayRef(Indices, NumElts), "kshiftr"); |
| return Builder.CreateBitCast(SV, Ops[0]->getType()); |
| } |
| case X86::BI__builtin_ia32_movnti: |
| case X86::BI__builtin_ia32_movnti64: |
| case X86::BI__builtin_ia32_movntsd: |
| case X86::BI__builtin_ia32_movntss: { |
| llvm::MDNode *Node = llvm::MDNode::get( |
| getLLVMContext(), llvm::ConstantAsMetadata::get(Builder.getInt32(1))); |
| |
| Value *Ptr = Ops[0]; |
| Value *Src = Ops[1]; |
| |
| // Extract the 0th element of the source vector. |
| if (BuiltinID == X86::BI__builtin_ia32_movntsd || |
| BuiltinID == X86::BI__builtin_ia32_movntss) |
| Src = Builder.CreateExtractElement(Src, (uint64_t)0, "extract"); |
| |
| // Unaligned nontemporal store of the scalar value. |
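| // This emits roughly: store <ty> %src, ptr %dst, align 1, !nontemporal !N |
| // with !N = !{i32 1}. |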
| StoreInst *SI = Builder.CreateDefaultAlignedStore(Src, Ptr); |
| SI->setMetadata(llvm::LLVMContext::MD_nontemporal, Node); |
| SI->setAlignment(llvm::Align(1)); |
| return SI; |
| } |
| // Rotate is a special case of funnel shift: the first two args are the same. |
| case X86::BI__builtin_ia32_vprotb: |
| case X86::BI__builtin_ia32_vprotw: |
| case X86::BI__builtin_ia32_vprotd: |
| case X86::BI__builtin_ia32_vprotq: |
| case X86::BI__builtin_ia32_vprotbi: |
| case X86::BI__builtin_ia32_vprotwi: |
| case X86::BI__builtin_ia32_vprotdi: |
| case X86::BI__builtin_ia32_vprotqi: |
| case X86::BI__builtin_ia32_prold128: |
| case X86::BI__builtin_ia32_prold256: |
| case X86::BI__builtin_ia32_prold512: |
| case X86::BI__builtin_ia32_prolq128: |
| case X86::BI__builtin_ia32_prolq256: |
| case X86::BI__builtin_ia32_prolq512: |
| case X86::BI__builtin_ia32_prolvd128: |
| case X86::BI__builtin_ia32_prolvd256: |
| case X86::BI__builtin_ia32_prolvd512: |
| case X86::BI__builtin_ia32_prolvq128: |
| case X86::BI__builtin_ia32_prolvq256: |
| case X86::BI__builtin_ia32_prolvq512: |
| return EmitX86FunnelShift(*this, Ops[0], Ops[0], Ops[1], false); |
| case X86::BI__builtin_ia32_prord128: |
| case X86::BI__builtin_ia32_prord256: |
| case X86::BI__builtin_ia32_prord512: |
| case X86::BI__builtin_ia32_prorq128: |
| case X86::BI__builtin_ia32_prorq256: |
| case X86::BI__builtin_ia32_prorq512: |
| case X86::BI__builtin_ia32_prorvd128: |
| case X86::BI__builtin_ia32_prorvd256: |
| case X86::BI__builtin_ia32_prorvd512: |
| case X86::BI__builtin_ia32_prorvq128: |
| case X86::BI__builtin_ia32_prorvq256: |
| case X86::BI__builtin_ia32_prorvq512: |
| return EmitX86FunnelShift(*this, Ops[0], Ops[0], Ops[1], true); |
| case X86::BI__builtin_ia32_selectb_128: |
| case X86::BI__builtin_ia32_selectb_256: |
| case X86::BI__builtin_ia32_selectb_512: |
| case X86::BI__builtin_ia32_selectw_128: |
| case X86::BI__builtin_ia32_selectw_256: |
| case X86::BI__builtin_ia32_selectw_512: |
| case X86::BI__builtin_ia32_selectd_128: |
| case X86::BI__builtin_ia32_selectd_256: |
| case X86::BI__builtin_ia32_selectd_512: |
| case X86::BI__builtin_ia32_selectq_128: |
| case X86::BI__builtin_ia32_selectq_256: |
| case X86::BI__builtin_ia32_selectq_512: |
| case X86::BI__builtin_ia32_selectph_128: |
| case X86::BI__builtin_ia32_selectph_256: |
| case X86::BI__builtin_ia32_selectph_512: |
| case X86::BI__builtin_ia32_selectpbf_128: |
| case X86::BI__builtin_ia32_selectpbf_256: |
| case X86::BI__builtin_ia32_selectpbf_512: |
| case X86::BI__builtin_ia32_selectps_128: |
| case X86::BI__builtin_ia32_selectps_256: |
| case X86::BI__builtin_ia32_selectps_512: |
| case X86::BI__builtin_ia32_selectpd_128: |
| case X86::BI__builtin_ia32_selectpd_256: |
| case X86::BI__builtin_ia32_selectpd_512: |
| return EmitX86Select(*this, Ops[0], Ops[1], Ops[2]); |
| case X86::BI__builtin_ia32_selectsh_128: |
| case X86::BI__builtin_ia32_selectsbf_128: |
| case X86::BI__builtin_ia32_selectss_128: |
| case X86::BI__builtin_ia32_selectsd_128: { |
| Value *A = Builder.CreateExtractElement(Ops[1], (uint64_t)0); |
| Value *B = Builder.CreateExtractElement(Ops[2], (uint64_t)0); |
| A = EmitX86ScalarSelect(*this, Ops[0], A, B); |
| return Builder.CreateInsertElement(Ops[1], A, (uint64_t)0); |
| } |
| case X86::BI__builtin_ia32_cmpb128_mask: |
| case X86::BI__builtin_ia32_cmpb256_mask: |
| case X86::BI__builtin_ia32_cmpb512_mask: |
| case X86::BI__builtin_ia32_cmpw128_mask: |
| case X86::BI__builtin_ia32_cmpw256_mask: |
| case X86::BI__builtin_ia32_cmpw512_mask: |
| case X86::BI__builtin_ia32_cmpd128_mask: |
| case X86::BI__builtin_ia32_cmpd256_mask: |
| case X86::BI__builtin_ia32_cmpd512_mask: |
| case X86::BI__builtin_ia32_cmpq128_mask: |
| case X86::BI__builtin_ia32_cmpq256_mask: |
| case X86::BI__builtin_ia32_cmpq512_mask: { |
| unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x7; |
| return EmitX86MaskedCompare(*this, CC, true, Ops); |
| } |
| case X86::BI__builtin_ia32_ucmpb128_mask: |
| case X86::BI__builtin_ia32_ucmpb256_mask: |
| case X86::BI__builtin_ia32_ucmpb512_mask: |
| case X86::BI__builtin_ia32_ucmpw128_mask: |
| case X86::BI__builtin_ia32_ucmpw256_mask: |
| case X86::BI__builtin_ia32_ucmpw512_mask: |
| case X86::BI__builtin_ia32_ucmpd128_mask: |
| case X86::BI__builtin_ia32_ucmpd256_mask: |
| case X86::BI__builtin_ia32_ucmpd512_mask: |
| case X86::BI__builtin_ia32_ucmpq128_mask: |
| case X86::BI__builtin_ia32_ucmpq256_mask: |
| case X86::BI__builtin_ia32_ucmpq512_mask: { |
| unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x7; |
| return EmitX86MaskedCompare(*this, CC, false, Ops); |
| } |
| case X86::BI__builtin_ia32_vpcomb: |
| case X86::BI__builtin_ia32_vpcomw: |
| case X86::BI__builtin_ia32_vpcomd: |
| case X86::BI__builtin_ia32_vpcomq: |
| return EmitX86vpcom(*this, Ops, true); |
| case X86::BI__builtin_ia32_vpcomub: |
| case X86::BI__builtin_ia32_vpcomuw: |
| case X86::BI__builtin_ia32_vpcomud: |
| case X86::BI__builtin_ia32_vpcomuq: |
| return EmitX86vpcom(*this, Ops, false); |
| |
| case X86::BI__builtin_ia32_kortestcqi: |
| case X86::BI__builtin_ia32_kortestchi: |
| case X86::BI__builtin_ia32_kortestcsi: |
| case X86::BI__builtin_ia32_kortestcdi: { |
| Value *Or = EmitX86MaskLogic(*this, Instruction::Or, Ops); |
| Value *C = llvm::Constant::getAllOnesValue(Ops[0]->getType()); |
| Value *Cmp = Builder.CreateICmpEQ(Or, C); |
| return Builder.CreateZExt(Cmp, ConvertType(E->getType())); |
| } |
| case X86::BI__builtin_ia32_kortestzqi: |
| case X86::BI__builtin_ia32_kortestzhi: |
| case X86::BI__builtin_ia32_kortestzsi: |
| case X86::BI__builtin_ia32_kortestzdi: { |
| Value *Or = EmitX86MaskLogic(*this, Instruction::Or, Ops); |
| Value *C = llvm::Constant::getNullValue(Ops[0]->getType()); |
| Value *Cmp = Builder.CreateICmpEQ(Or, C); |
| return Builder.CreateZExt(Cmp, ConvertType(E->getType())); |
| } |
| |
| case X86::BI__builtin_ia32_ktestcqi: |
| case X86::BI__builtin_ia32_ktestzqi: |
| case X86::BI__builtin_ia32_ktestchi: |
| case X86::BI__builtin_ia32_ktestzhi: |
| case X86::BI__builtin_ia32_ktestcsi: |
| case X86::BI__builtin_ia32_ktestzsi: |
| case X86::BI__builtin_ia32_ktestcdi: |
| case X86::BI__builtin_ia32_ktestzdi: { |
| Intrinsic::ID IID; |
| switch (BuiltinID) { |
| default: llvm_unreachable("Unsupported intrinsic!"); |
| case X86::BI__builtin_ia32_ktestcqi: |
| IID = Intrinsic::x86_avx512_ktestc_b; |
| break; |
| case X86::BI__builtin_ia32_ktestzqi: |
| IID = Intrinsic::x86_avx512_ktestz_b; |
| break; |
| case X86::BI__builtin_ia32_ktestchi: |
| IID = Intrinsic::x86_avx512_ktestc_w; |
| break; |
| case X86::BI__builtin_ia32_ktestzhi: |
| IID = Intrinsic::x86_avx512_ktestz_w; |
| break; |
| case X86::BI__builtin_ia32_ktestcsi: |
| IID = Intrinsic::x86_avx512_ktestc_d; |
| break; |
| case X86::BI__builtin_ia32_ktestzsi: |
| IID = Intrinsic::x86_avx512_ktestz_d; |
| break; |
| case X86::BI__builtin_ia32_ktestcdi: |
| IID = Intrinsic::x86_avx512_ktestc_q; |
| break; |
| case X86::BI__builtin_ia32_ktestzdi: |
| IID = Intrinsic::x86_avx512_ktestz_q; |
| break; |
| } |
| |
| unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth(); |
| Value *LHS = getMaskVecValue(*this, Ops[0], NumElts); |
| Value *RHS = getMaskVecValue(*this, Ops[1], NumElts); |
| Function *Intr = CGM.getIntrinsic(IID); |
| return Builder.CreateCall(Intr, {LHS, RHS}); |
| } |
| |
| case X86::BI__builtin_ia32_kaddqi: |
| case X86::BI__builtin_ia32_kaddhi: |
| case X86::BI__builtin_ia32_kaddsi: |
| case X86::BI__builtin_ia32_kadddi: { |
| Intrinsic::ID IID; |
| switch (BuiltinID) { |
| default: llvm_unreachable("Unsupported intrinsic!"); |
| case X86::BI__builtin_ia32_kaddqi: |
| IID = Intrinsic::x86_avx512_kadd_b; |
| break; |
| case X86::BI__builtin_ia32_kaddhi: |
| IID = Intrinsic::x86_avx512_kadd_w; |
| break; |
| case X86::BI__builtin_ia32_kaddsi: |
| IID = Intrinsic::x86_avx512_kadd_d; |
| break; |
| case X86::BI__builtin_ia32_kadddi: |
| IID = Intrinsic::x86_avx512_kadd_q; |
| break; |
| } |
| |
| unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth(); |
| Value *LHS = getMaskVecValue(*this, Ops[0], NumElts); |
| Value *RHS = getMaskVecValue(*this, Ops[1], NumElts); |
| Function *Intr = CGM.getIntrinsic(IID); |
| Value *Res = Builder.CreateCall(Intr, {LHS, RHS}); |
| return Builder.CreateBitCast(Res, Ops[0]->getType()); |
| } |
| case X86::BI__builtin_ia32_kandqi: |
| case X86::BI__builtin_ia32_kandhi: |
| case X86::BI__builtin_ia32_kandsi: |
| case X86::BI__builtin_ia32_kanddi: |
| return EmitX86MaskLogic(*this, Instruction::And, Ops); |
| case X86::BI__builtin_ia32_kandnqi: |
| case X86::BI__builtin_ia32_kandnhi: |
| case X86::BI__builtin_ia32_kandnsi: |
| case X86::BI__builtin_ia32_kandndi: |
| return EmitX86MaskLogic(*this, Instruction::And, Ops, true); |
| case X86::BI__builtin_ia32_korqi: |
| case X86::BI__builtin_ia32_korhi: |
| case X86::BI__builtin_ia32_korsi: |
| case X86::BI__builtin_ia32_kordi: |
| return EmitX86MaskLogic(*this, Instruction::Or, Ops); |
| case X86::BI__builtin_ia32_kxnorqi: |
| case X86::BI__builtin_ia32_kxnorhi: |
| case X86::BI__builtin_ia32_kxnorsi: |
| case X86::BI__builtin_ia32_kxnordi: |
| return EmitX86MaskLogic(*this, Instruction::Xor, Ops, true); |
| case X86::BI__builtin_ia32_kxorqi: |
| case X86::BI__builtin_ia32_kxorhi: |
| case X86::BI__builtin_ia32_kxorsi: |
| case X86::BI__builtin_ia32_kxordi: |
| return EmitX86MaskLogic(*this, Instruction::Xor, Ops); |
| case X86::BI__builtin_ia32_knotqi: |
| case X86::BI__builtin_ia32_knothi: |
| case X86::BI__builtin_ia32_knotsi: |
| case X86::BI__builtin_ia32_knotdi: { |
| unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth(); |
| Value *Res = getMaskVecValue(*this, Ops[0], NumElts); |
| return Builder.CreateBitCast(Builder.CreateNot(Res), |
| Ops[0]->getType()); |
| } |
| case X86::BI__builtin_ia32_kmovb: |
| case X86::BI__builtin_ia32_kmovw: |
| case X86::BI__builtin_ia32_kmovd: |
| case X86::BI__builtin_ia32_kmovq: { |
| // Bitcast to vXi1 type and then back to integer. This gets the mask |
| // register type into the IR, but might be optimized out depending on |
| // what's around it. |
| unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth(); |
| Value *Res = getMaskVecValue(*this, Ops[0], NumElts); |
| return Builder.CreateBitCast(Res, Ops[0]->getType()); |
| } |
| |
| case X86::BI__builtin_ia32_kunpckdi: |
| case X86::BI__builtin_ia32_kunpcksi: |
| case X86::BI__builtin_ia32_kunpckhi: { |
| unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth(); |
| Value *LHS = getMaskVecValue(*this, Ops[0], NumElts); |
| Value *RHS = getMaskVecValue(*this, Ops[1], NumElts); |
| int Indices[64]; |
| for (unsigned i = 0; i != NumElts; ++i) |
| Indices[i] = i; |
| |
| // First extract half of each vector. This gives better codegen than |
| // doing it in a single shuffle. |
| LHS = Builder.CreateShuffleVector(LHS, LHS, ArrayRef(Indices, NumElts / 2)); |
| RHS = Builder.CreateShuffleVector(RHS, RHS, ArrayRef(Indices, NumElts / 2)); |
| // Concat the vectors. |
| // NOTE: Operands are swapped to match the intrinsic definition. |
| Value *Res = |
| Builder.CreateShuffleVector(RHS, LHS, ArrayRef(Indices, NumElts)); |
| return Builder.CreateBitCast(Res, Ops[0]->getType()); |
| } |
| |
| case X86::BI__builtin_ia32_vplzcntd_128: |
| case X86::BI__builtin_ia32_vplzcntd_256: |
| case X86::BI__builtin_ia32_vplzcntd_512: |
| case X86::BI__builtin_ia32_vplzcntq_128: |
| case X86::BI__builtin_ia32_vplzcntq_256: |
| case X86::BI__builtin_ia32_vplzcntq_512: { |
| Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Ops[0]->getType()); |
| return Builder.CreateCall(F, {Ops[0],Builder.getInt1(false)}); |
| } |
| case X86::BI__builtin_ia32_sqrtss: |
| case X86::BI__builtin_ia32_sqrtsd: { |
| Value *A = Builder.CreateExtractElement(Ops[0], (uint64_t)0); |
| Function *F; |
| if (Builder.getIsFPConstrained()) { |
| CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E); |
| F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt, |
| A->getType()); |
| A = Builder.CreateConstrainedFPCall(F, {A}); |
| } else { |
| F = CGM.getIntrinsic(Intrinsic::sqrt, A->getType()); |
| A = Builder.CreateCall(F, {A}); |
| } |
| return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0); |
| } |
| case X86::BI__builtin_ia32_sqrtsh_round_mask: |
| case X86::BI__builtin_ia32_sqrtsd_round_mask: |
| case X86::BI__builtin_ia32_sqrtss_round_mask: { |
| unsigned CC = cast<llvm::ConstantInt>(Ops[4])->getZExtValue(); |
| // Only lower to a plain sqrt if the rounding mode is 4 (_MM_FROUND_CUR_DIRECTION); |
| // otherwise keep the target-specific intrinsic. |
| if (CC != 4) { |
| Intrinsic::ID IID; |
| |
| switch (BuiltinID) { |
| default: |
| llvm_unreachable("Unsupported intrinsic!"); |
| case X86::BI__builtin_ia32_sqrtsh_round_mask: |
| IID = Intrinsic::x86_avx512fp16_mask_sqrt_sh; |
| break; |
| case X86::BI__builtin_ia32_sqrtsd_round_mask: |
| IID = Intrinsic::x86_avx512_mask_sqrt_sd; |
| break; |
| case X86::BI__builtin_ia32_sqrtss_round_mask: |
| IID = Intrinsic::x86_avx512_mask_sqrt_ss; |
| break; |
| } |
| return Builder.CreateCall(CGM.getIntrinsic(IID), Ops); |
| } |
| Value *A = Builder.CreateExtractElement(Ops[1], (uint64_t)0); |
| Function *F; |
| if (Builder.getIsFPConstrained()) { |
| CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E); |
| F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt, |
| A->getType()); |
| A = Builder.CreateConstrainedFPCall(F, A); |
| } else { |
| F = CGM.getIntrinsic(Intrinsic::sqrt, A->getType()); |
| A = Builder.CreateCall(F, A); |
| } |
| Value *Src = Builder.CreateExtractElement(Ops[2], (uint64_t)0); |
| A = EmitX86ScalarSelect(*this, Ops[3], A, Src); |
| return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0); |
| } |
| case X86::BI__builtin_ia32_sqrtpd256: |
| case X86::BI__builtin_ia32_sqrtpd: |
| case X86::BI__builtin_ia32_sqrtps256: |
| case X86::BI__builtin_ia32_sqrtps: |
| case X86::BI__builtin_ia32_sqrtph256: |
| case X86::BI__builtin_ia32_sqrtph: |
| case X86::BI__builtin_ia32_sqrtph512: |
| case X86::BI__builtin_ia32_vsqrtbf16256: |
| case X86::BI__builtin_ia32_vsqrtbf16: |
| case X86::BI__builtin_ia32_vsqrtbf16512: |
| case X86::BI__builtin_ia32_sqrtps512: |
| case X86::BI__builtin_ia32_sqrtpd512: { |
| if (Ops.size() == 2) { |
| unsigned CC = cast<llvm::ConstantInt>(Ops[1])->getZExtValue(); |
| // Only lower to a plain sqrt if the rounding mode is 4 (_MM_FROUND_CUR_DIRECTION); |
| // otherwise keep the target-specific intrinsic. |
| if (CC != 4) { |
| Intrinsic::ID IID; |
| |
| switch (BuiltinID) { |
| default: |
| llvm_unreachable("Unsupported intrinsic!"); |
| case X86::BI__builtin_ia32_sqrtph512: |
| IID = Intrinsic::x86_avx512fp16_sqrt_ph_512; |
| break; |
| case X86::BI__builtin_ia32_sqrtps512: |
| IID = Intrinsic::x86_avx512_sqrt_ps_512; |
| break; |
| case X86::BI__builtin_ia32_sqrtpd512: |
| IID = Intrinsic::x86_avx512_sqrt_pd_512; |
| break; |
| } |
| return Builder.CreateCall(CGM.getIntrinsic(IID), Ops); |
| } |
| } |
| if (Builder.getIsFPConstrained()) { |
| CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E); |
| Function *F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt, |
| Ops[0]->getType()); |
| return Builder.CreateConstrainedFPCall(F, Ops[0]); |
| } else { |
| Function *F = CGM.getIntrinsic(Intrinsic::sqrt, Ops[0]->getType()); |
| return Builder.CreateCall(F, Ops[0]); |
| } |
| } |
| |
| case X86::BI__builtin_ia32_pmuludq128: |
| case X86::BI__builtin_ia32_pmuludq256: |
| case X86::BI__builtin_ia32_pmuludq512: |
| return EmitX86Muldq(*this, /*IsSigned*/false, Ops); |
| |
| case X86::BI__builtin_ia32_pmuldq128: |
| case X86::BI__builtin_ia32_pmuldq256: |
| case X86::BI__builtin_ia32_pmuldq512: |
| return EmitX86Muldq(*this, /*IsSigned*/true, Ops); |
| |
| case X86::BI__builtin_ia32_pternlogd512_mask: |
| case X86::BI__builtin_ia32_pternlogq512_mask: |
| case X86::BI__builtin_ia32_pternlogd128_mask: |
| case X86::BI__builtin_ia32_pternlogd256_mask: |
| case X86::BI__builtin_ia32_pternlogq128_mask: |
| case X86::BI__builtin_ia32_pternlogq256_mask: |
| return EmitX86Ternlog(*this, /*ZeroMask*/false, Ops); |
| |
| case X86::BI__builtin_ia32_pternlogd512_maskz: |
| case X86::BI__builtin_ia32_pternlogq512_maskz: |
| case X86::BI__builtin_ia32_pternlogd128_maskz: |
| case X86::BI__builtin_ia32_pternlogd256_maskz: |
| case X86::BI__builtin_ia32_pternlogq128_maskz: |
| case X86::BI__builtin_ia32_pternlogq256_maskz: |
| return EmitX86Ternlog(*this, /*ZeroMask*/true, Ops); |
| |
| case X86::BI__builtin_ia32_vpshldd128: |
| case X86::BI__builtin_ia32_vpshldd256: |
| case X86::BI__builtin_ia32_vpshldd512: |
| case X86::BI__builtin_ia32_vpshldq128: |
| case X86::BI__builtin_ia32_vpshldq256: |
| case X86::BI__builtin_ia32_vpshldq512: |
| case X86::BI__builtin_ia32_vpshldw128: |
| case X86::BI__builtin_ia32_vpshldw256: |
| case X86::BI__builtin_ia32_vpshldw512: |
| return EmitX86FunnelShift(*this, Ops[0], Ops[1], Ops[2], false); |
| |
| case X86::BI__builtin_ia32_vpshrdd128: |
| case X86::BI__builtin_ia32_vpshrdd256: |
| case X86::BI__builtin_ia32_vpshrdd512: |
| case X86::BI__builtin_ia32_vpshrdq128: |
| case X86::BI__builtin_ia32_vpshrdq256: |
| case X86::BI__builtin_ia32_vpshrdq512: |
| case X86::BI__builtin_ia32_vpshrdw128: |
| case X86::BI__builtin_ia32_vpshrdw256: |
| case X86::BI__builtin_ia32_vpshrdw512: |
| // Ops 0 and 1 are swapped. |
| return EmitX86FunnelShift(*this, Ops[1], Ops[0], Ops[2], true); |
| |
| case X86::BI__builtin_ia32_vpshldvd128: |
| case X86::BI__builtin_ia32_vpshldvd256: |
| case X86::BI__builtin_ia32_vpshldvd512: |
| case X86::BI__builtin_ia32_vpshldvq128: |
| case X86::BI__builtin_ia32_vpshldvq256: |
| case X86::BI__builtin_ia32_vpshldvq512: |
| case X86::BI__builtin_ia32_vpshldvw128: |
| case X86::BI__builtin_ia32_vpshldvw256: |
| case X86::BI__builtin_ia32_vpshldvw512: |
| return EmitX86FunnelShift(*this, Ops[0], Ops[1], Ops[2], false); |
| |
| case X86::BI__builtin_ia32_vpshrdvd128: |
| case X86::BI__builtin_ia32_vpshrdvd256: |
| case X86::BI__builtin_ia32_vpshrdvd512: |
| case X86::BI__builtin_ia32_vpshrdvq128: |
| case X86::BI__builtin_ia32_vpshrdvq256: |
| case X86::BI__builtin_ia32_vpshrdvq512: |
| case X86::BI__builtin_ia32_vpshrdvw128: |
| case X86::BI__builtin_ia32_vpshrdvw256: |
| case X86::BI__builtin_ia32_vpshrdvw512: |
| // Ops 0 and 1 are swapped. |
| return EmitX86FunnelShift(*this, Ops[1], Ops[0], Ops[2], true); |
| |
| // Reductions |
| case X86::BI__builtin_ia32_reduce_fadd_pd512: |
| case X86::BI__builtin_ia32_reduce_fadd_ps512: |
| case X86::BI__builtin_ia32_reduce_fadd_ph512: |
| case X86::BI__builtin_ia32_reduce_fadd_ph256: |
| case X86::BI__builtin_ia32_reduce_fadd_ph128: { |
| Function *F = |
| CGM.getIntrinsic(Intrinsic::vector_reduce_fadd, Ops[1]->getType()); |
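| // Without the 'reassoc' flag, llvm.vector.reduce.fadd is a strictly ordered |
| // sequential reduction; allowing reassociation lets the backend emit a |
| // tree-wise reduction instead. |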
| IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); |
| Builder.getFastMathFlags().setAllowReassoc(); |
| return Builder.CreateCall(F, {Ops[0], Ops[1]}); |
| } |
| case X86::BI__builtin_ia32_reduce_fmul_pd512: |
| case X86::BI__builtin_ia32_reduce_fmul_ps512: |
| case X86::BI__builtin_ia32_reduce_fmul_ph512: |
| case X86::BI__builtin_ia32_reduce_fmul_ph256: |
| case X86::BI__builtin_ia32_reduce_fmul_ph128: { |
| Function *F = |
| CGM.getIntrinsic(Intrinsic::vector_reduce_fmul, Ops[1]->getType()); |
| IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); |
| Builder.getFastMathFlags().setAllowReassoc(); |
| return Builder.CreateCall(F, {Ops[0], Ops[1]}); |
| } |
| case X86::BI__builtin_ia32_reduce_fmax_pd512: |
| case X86::BI__builtin_ia32_reduce_fmax_ps512: |
| case X86::BI__builtin_ia32_reduce_fmax_ph512: |
| case X86::BI__builtin_ia32_reduce_fmax_ph256: |
| case X86::BI__builtin_ia32_reduce_fmax_ph128: { |
| Function *F = |
| CGM.getIntrinsic(Intrinsic::vector_reduce_fmax, Ops[0]->getType()); |
| IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); |
| Builder.getFastMathFlags().setNoNaNs(); |
| return Builder.CreateCall(F, {Ops[0]}); |
| } |
| case X86::BI__builtin_ia32_reduce_fmin_pd512: |
| case X86::BI__builtin_ia32_reduce_fmin_ps512: |
| case X86::BI__builtin_ia32_reduce_fmin_ph512: |
| case X86::BI__builtin_ia32_reduce_fmin_ph256: |
| case X86::BI__builtin_ia32_reduce_fmin_ph128: { |
| Function *F = |
| CGM.getIntrinsic(Intrinsic::vector_reduce_fmin, Ops[0]->getType()); |
| IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); |
| Builder.getFastMathFlags().setNoNaNs(); |
| return Builder.CreateCall(F, {Ops[0]}); |
| } |
| |
| case X86::BI__builtin_ia32_rdrand16_step: |
| case X86::BI__builtin_ia32_rdrand32_step: |
| case X86::BI__builtin_ia32_rdrand64_step: |
| case X86::BI__builtin_ia32_rdseed16_step: |
| case X86::BI__builtin_ia32_rdseed32_step: |
| case X86::BI__builtin_ia32_rdseed64_step: { |
| Intrinsic::ID ID; |
| switch (BuiltinID) { |
| default: llvm_unreachable("Unsupported intrinsic!"); |
| case X86::BI__builtin_ia32_rdrand16_step: |
| ID = Intrinsic::x86_rdrand_16; |
| break; |
| case X86::BI__builtin_ia32_rdrand32_step: |
| ID = Intrinsic::x86_rdrand_32; |
| break; |
| case X86::BI__builtin_ia32_rdrand64_step: |
| ID = Intrinsic::x86_rdrand_64; |
| break; |
| case X86::BI__builtin_ia32_rdseed16_step: |
| ID = Intrinsic::x86_rdseed_16; |
| break; |
| case X86::BI__builtin_ia32_rdseed32_step: |
| ID = Intrinsic::x86_rdseed_32; |
| break; |
| case X86::BI__builtin_ia32_rdseed64_step: |
| ID = Intrinsic::x86_rdseed_64; |
| break; |
| } |
| |
| Value *Call = Builder.CreateCall(CGM.getIntrinsic(ID)); |
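| // The intrinsic returns { random value, success flag }; store the value |
| // through Ops[0] and return the flag. |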
| Builder.CreateDefaultAlignedStore(Builder.CreateExtractValue(Call, 0), |
| Ops[0]); |
| return Builder.CreateExtractValue(Call, 1); |
| } |
| case X86::BI__builtin_ia32_addcarryx_u32: |
| case X86::BI__builtin_ia32_addcarryx_u64: |
| case X86::BI__builtin_ia32_subborrow_u32: |
| case X86::BI__builtin_ia32_subborrow_u64: { |
| Intrinsic::ID IID; |
| switch (BuiltinID) { |
| default: llvm_unreachable("Unsupported intrinsic!"); |
| case X86::BI__builtin_ia32_addcarryx_u32: |
| IID = Intrinsic::x86_addcarry_32; |
| break; |
| case X86::BI__builtin_ia32_addcarryx_u64: |
| IID = Intrinsic::x86_addcarry_64; |
| break; |
| case X86::BI__builtin_ia32_subborrow_u32: |
| IID = Intrinsic::x86_subborrow_32; |
| break; |
| case X86::BI__builtin_ia32_subborrow_u64: |
| IID = Intrinsic::x86_subborrow_64; |
| break; |
| } |
| |
| Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), |
| { Ops[0], Ops[1], Ops[2] }); |
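| // The intrinsic returns { carry-out, result }; store the result through the |
| // out pointer and return the carry flag. |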
| Builder.CreateDefaultAlignedStore(Builder.CreateExtractValue(Call, 1), |
| Ops[3]); |
| return Builder.CreateExtractValue(Call, 0); |
| } |
| |
| case X86::BI__builtin_ia32_fpclassps128_mask: |
| case X86::BI__builtin_ia32_fpclassps256_mask: |
| case X86::BI__builtin_ia32_fpclassps512_mask: |
| case X86::BI__builtin_ia32_vfpclassbf16128_mask: |
| case X86::BI__builtin_ia32_vfpclassbf16256_mask: |
| case X86::BI__builtin_ia32_vfpclassbf16512_mask: |
| case X86::BI__builtin_ia32_fpclassph128_mask: |
| case X86::BI__builtin_ia32_fpclassph256_mask: |
| case X86::BI__builtin_ia32_fpclassph512_mask: |
| case X86::BI__builtin_ia32_fpclasspd128_mask: |
| case X86::BI__builtin_ia32_fpclasspd256_mask: |
| case X86::BI__builtin_ia32_fpclasspd512_mask: { |
| unsigned NumElts = |
| cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(); |
| Value *MaskIn = Ops[2]; |
| Ops.erase(&Ops[2]); |
| |
| Intrinsic::ID ID; |
| switch (BuiltinID) { |
| default: llvm_unreachable("Unsupported intrinsic!"); |
| case X86::BI__builtin_ia32_vfpclassbf16128_mask: |
| ID = Intrinsic::x86_avx10_fpclass_bf16_128; |
| break; |
| case X86::BI__builtin_ia32_vfpclassbf16256_mask: |
| ID = Intrinsic::x86_avx10_fpclass_bf16_256; |
| break; |
| case X86::BI__builtin_ia32_vfpclassbf16512_mask: |
| ID = Intrinsic::x86_avx10_fpclass_bf16_512; |
| break; |
| case X86::BI__builtin_ia32_fpclassph128_mask: |
| ID = Intrinsic::x86_avx512fp16_fpclass_ph_128; |
| break; |
| case X86::BI__builtin_ia32_fpclassph256_mask: |
| ID = Intrinsic::x86_avx512fp16_fpclass_ph_256; |
| break; |
| case X86::BI__builtin_ia32_fpclassph512_mask: |
| ID = Intrinsic::x86_avx512fp16_fpclass_ph_512; |
| break; |
| case X86::BI__builtin_ia32_fpclassps128_mask: |
| ID = Intrinsic::x86_avx512_fpclass_ps_128; |
| break; |
| case X86::BI__builtin_ia32_fpclassps256_mask: |
| ID = Intrinsic::x86_avx512_fpclass_ps_256; |
| break; |
| case X86::BI__builtin_ia32_fpclassps512_mask: |
| ID = Intrinsic::x86_avx512_fpclass_ps_512; |
| break; |
| case X86::BI__builtin_ia32_fpclasspd128_mask: |
| ID = Intrinsic::x86_avx512_fpclass_pd_128; |
| break; |
| case X86::BI__builtin_ia32_fpclasspd256_mask: |
| ID = Intrinsic::x86_avx512_fpclass_pd_256; |
| break; |
| case X86::BI__builtin_ia32_fpclasspd512_mask: |
| ID = Intrinsic::x86_avx512_fpclass_pd_512; |
| break; |
| } |
| |
| Value *Fpclass = Builder.CreateCall(CGM.getIntrinsic(ID), Ops); |
| return EmitX86MaskedCompareResult(*this, Fpclass, NumElts, MaskIn); |
| } |
| |
| case X86::BI__builtin_ia32_vp2intersect_q_512: |
| case X86::BI__builtin_ia32_vp2intersect_q_256: |
| case X86::BI__builtin_ia32_vp2intersect_q_128: |
| case X86::BI__builtin_ia32_vp2intersect_d_512: |
| case X86::BI__builtin_ia32_vp2intersect_d_256: |
| case X86::BI__builtin_ia32_vp2intersect_d_128: { |
| unsigned NumElts = |
| cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(); |
| Intrinsic::ID ID; |
| |
| switch (BuiltinID) { |
| default: llvm_unreachable("Unsupported intrinsic!"); |
| case X86::BI__builtin_ia32_vp2intersect_q_512: |
| ID = Intrinsic::x86_avx512_vp2intersect_q_512; |
| break; |
| case X86::BI__builtin_ia32_vp2intersect_q_256: |
| ID = Intrinsic::x86_avx512_vp2intersect_q_256; |
| break; |
| case X86::BI__builtin_ia32_vp2intersect_q_128: |
| ID = Intrinsic::x86_avx512_vp2intersect_q_128; |
| break; |
| case X86::BI__builtin_ia32_vp2intersect_d_512: |
| ID = Intrinsic::x86_avx512_vp2intersect_d_512; |
| break; |
| case X86::BI__builtin_ia32_vp2intersect_d_256: |
| ID = Intrinsic::x86_avx512_vp2intersect_d_256; |
| break; |
| case X86::BI__builtin_ia32_vp2intersect_d_128: |
| ID = Intrinsic::x86_avx512_vp2intersect_d_128; |
| break; |
| } |
| |
| Value *Call = Builder.CreateCall(CGM.getIntrinsic(ID), {Ops[0], Ops[1]}); |
| Value *Result = Builder.CreateExtractValue(Call, 0); |
| Result = EmitX86MaskedCompareResult(*this, Result, NumElts, nullptr); |
| Builder.CreateDefaultAlignedStore(Result, Ops[2]); |
| |
| Result = Builder.CreateExtractValue(Call, 1); |
| Result = EmitX86MaskedCompareResult(*this, Result, NumElts, nullptr); |
| return Builder.CreateDefaultAlignedStore(Result, Ops[3]); |
| } |
| |
| case X86::BI__builtin_ia32_vpmultishiftqb128: |
| case X86::BI__builtin_ia32_vpmultishiftqb256: |
| case X86::BI__builtin_ia32_vpmultishiftqb512: { |
| Intrinsic::ID ID; |
| switch (BuiltinID) { |
| default: llvm_unreachable("Unsupported intrinsic!"); |
| case X86::BI__builtin_ia32_vpmultishiftqb128: |
| ID = Intrinsic::x86_avx512_pmultishift_qb_128; |
| break; |
| case X86::BI__builtin_ia32_vpmultishiftqb256: |
| ID = Intrinsic::x86_avx512_pmultishift_qb_256; |
| break; |
| case X86::BI__builtin_ia32_vpmultishiftqb512: |
| ID = Intrinsic::x86_avx512_pmultishift_qb_512; |
| break; |
| } |
| |
| return Builder.CreateCall(CGM.getIntrinsic(ID), Ops); |
| } |
| |
| case X86::BI__builtin_ia32_vpshufbitqmb128_mask: |
| case X86::BI__builtin_ia32_vpshufbitqmb256_mask: |
| case X86::BI__builtin_ia32_vpshufbitqmb512_mask: { |
| unsigned NumElts = |
| cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(); |
| Value *MaskIn = Ops[2]; |
| Ops.erase(&Ops[2]); |
| |
| Intrinsic::ID ID; |
| switch (BuiltinID) { |
| default: llvm_unreachable("Unsupported intrinsic!"); |
| case X86::BI__builtin_ia32_vpshufbitqmb128_mask: |
| ID = Intrinsic::x86_avx512_vpshufbitqmb_128; |
| break; |
| case X86::BI__builtin_ia32_vpshufbitqmb256_mask: |
| ID = Intrinsic::x86_avx512_vpshufbitqmb_256; |
| break; |
| case X86::BI__builtin_ia32_vpshufbitqmb512_mask: |
| ID = Intrinsic::x86_avx512_vpshufbitqmb_512; |
| break; |
| } |
| |
| Value *Shufbit = Builder.CreateCall(CGM.getIntrinsic(ID), Ops); |
| return EmitX86MaskedCompareResult(*this, Shufbit, NumElts, MaskIn); |
| } |
| |
| // Packed comparison intrinsics |
| case X86::BI__builtin_ia32_cmpeqps: |
| case X86::BI__builtin_ia32_cmpeqpd: |
| return getVectorFCmpIR(CmpInst::FCMP_OEQ, /*IsSignaling*/false); |
| case X86::BI__builtin_ia32_cmpltps: |
| case X86::BI__builtin_ia32_cmpltpd: |
| return getVectorFCmpIR(CmpInst::FCMP_OLT, /*IsSignaling*/true); |
| case X86::BI__builtin_ia32_cmpleps: |
| case X86::BI__builtin_ia32_cmplepd: |
| return getVectorFCmpIR(CmpInst::FCMP_OLE, /*IsSignaling*/true); |
| case X86::BI__builtin_ia32_cmpunordps: |
| case X86::BI__builtin_ia32_cmpunordpd: |
| return getVectorFCmpIR(CmpInst::FCMP_UNO, /*IsSignaling*/false); |
| case X86::BI__builtin_ia32_cmpneqps: |
| case X86::BI__builtin_ia32_cmpneqpd: |
| return getVectorFCmpIR(CmpInst::FCMP_UNE, /*IsSignaling*/false); |
| case X86::BI__builtin_ia32_cmpnltps: |
| case X86::BI__builtin_ia32_cmpnltpd: |
| return getVectorFCmpIR(CmpInst::FCMP_UGE, /*IsSignaling*/true); |
| case X86::BI__builtin_ia32_cmpnleps: |
| case X86::BI__builtin_ia32_cmpnlepd: |
| return getVectorFCmpIR(CmpInst::FCMP_UGT, /*IsSignaling*/true); |
| case X86::BI__builtin_ia32_cmpordps: |
| case X86::BI__builtin_ia32_cmpordpd: |
| return getVectorFCmpIR(CmpInst::FCMP_ORD, /*IsSignaling*/false); |
| case X86::BI__builtin_ia32_cmpph128_mask: |
| case X86::BI__builtin_ia32_cmpph256_mask: |
| case X86::BI__builtin_ia32_cmpph512_mask: |
| case X86::BI__builtin_ia32_cmpps128_mask: |
| case X86::BI__builtin_ia32_cmpps256_mask: |
| case X86::BI__builtin_ia32_cmpps512_mask: |
| case X86::BI__builtin_ia32_cmppd128_mask: |
| case X86::BI__builtin_ia32_cmppd256_mask: |
| case X86::BI__builtin_ia32_cmppd512_mask: |
| case X86::BI__builtin_ia32_vcmpbf16512_mask: |
| case X86::BI__builtin_ia32_vcmpbf16256_mask: |
| case X86::BI__builtin_ia32_vcmpbf16128_mask: |
| IsMaskFCmp = true; |
| [[fallthrough]]; |
| case X86::BI__builtin_ia32_cmpps: |
| case X86::BI__builtin_ia32_cmpps256: |
| case X86::BI__builtin_ia32_cmppd: |
| case X86::BI__builtin_ia32_cmppd256: { |
| // Lower vector comparisons to fcmp instructions, ignoring the requested |
| // signaling behavior and rounding mode. This is only possible if the |
| // FP model is not strict and FENV_ACCESS is off. |
| |
| // The third argument is the comparison condition, an integer in the |
| // range [0, 31]. |
| unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x1f; |
| |
| // Lower to an IR fcmp instruction, ignoring the requested signaling behavior; |
| // e.g. both _CMP_GT_OS and _CMP_GT_OQ are translated to FCMP_OGT. |
| FCmpInst::Predicate Pred; |
| bool IsSignaling; |
| // Predicates for 16-31 repeat the 0-15 predicates. Only the signaling |
| // behavior is inverted. We'll handle that after the switch. |
| switch (CC & 0xf) { |
| case 0x00: Pred = FCmpInst::FCMP_OEQ; IsSignaling = false; break; |
| case 0x01: Pred = FCmpInst::FCMP_OLT; IsSignaling = true; break; |
| case 0x02: Pred = FCmpInst::FCMP_OLE; IsSignaling = true; break; |
| case 0x03: Pred = FCmpInst::FCMP_UNO; IsSignaling = false; break; |
| case 0x04: Pred = FCmpInst::FCMP_UNE; IsSignaling = false; break; |
| case 0x05: Pred = FCmpInst::FCMP_UGE; IsSignaling = true; break; |
| case 0x06: Pred = FCmpInst::FCMP_UGT; IsSignaling = true; break; |
| case 0x07: Pred = FCmpInst::FCMP_ORD; IsSignaling = false; break; |
| case 0x08: Pred = FCmpInst::FCMP_UEQ; IsSignaling = false; break; |
| case 0x09: Pred = FCmpInst::FCMP_ULT; IsSignaling = true; break; |
| case 0x0a: Pred = FCmpInst::FCMP_ULE; IsSignaling = true; break; |
| case 0x0b: Pred = FCmpInst::FCMP_FALSE; IsSignaling = false; break; |
| case 0x0c: Pred = FCmpInst::FCMP_ONE; IsSignaling = false; break; |
| case 0x0d: Pred = FCmpInst::FCMP_OGE; IsSignaling = true; break; |
| case 0x0e: Pred = FCmpInst::FCMP_OGT; IsSignaling = true; break; |
| case 0x0f: Pred = FCmpInst::FCMP_TRUE; IsSignaling = false; break; |
| default: llvm_unreachable("Unhandled CC"); |
| } |
| |
| // Invert the signaling behavior for 16-31. |
| if (CC & 0x10) |
| IsSignaling = !IsSignaling; |
| |
| // If the predicate is true or false, or the builtin is mask-enabled, and |
| // we're using constrained intrinsics, there is no generic compare intrinsic |
| // we can use; fall back to the legacy X86-specific intrinsic. |
| if (Builder.getIsFPConstrained() && |
| (Pred == FCmpInst::FCMP_TRUE || Pred == FCmpInst::FCMP_FALSE || |
| IsMaskFCmp)) { |
| |
| Intrinsic::ID IID; |
| switch (BuiltinID) { |
| default: llvm_unreachable("Unexpected builtin"); |
| case X86::BI__builtin_ia32_cmpps: |
| IID = Intrinsic::x86_sse_cmp_ps; |
| break; |
| case X86::BI__builtin_ia32_cmpps256: |
| IID = Intrinsic::x86_avx_cmp_ps_256; |
| break; |
| case X86::BI__builtin_ia32_cmppd: |
| IID = Intrinsic::x86_sse2_cmp_pd; |
| break; |
| case X86::BI__builtin_ia32_cmppd256: |
| IID = Intrinsic::x86_avx_cmp_pd_256; |
| break; |
| case X86::BI__builtin_ia32_cmpph128_mask: |
| IID = Intrinsic::x86_avx512fp16_mask_cmp_ph_128; |
| break; |
| case X86::BI__builtin_ia32_cmpph256_mask: |
| IID = Intrinsic::x86_avx512fp16_mask_cmp_ph_256; |
| break; |
| case X86::BI__builtin_ia32_cmpph512_mask: |
| IID = Intrinsic::x86_avx512fp16_mask_cmp_ph_512; |
| break; |
| case X86::BI__builtin_ia32_cmpps512_mask: |
| IID = Intrinsic::x86_avx512_mask_cmp_ps_512; |
| break; |
| case X86::BI__builtin_ia32_cmppd512_mask: |
| IID = Intrinsic::x86_avx512_mask_cmp_pd_512; |
| break; |
| case X86::BI__builtin_ia32_cmpps128_mask: |
| IID = Intrinsic::x86_avx512_mask_cmp_ps_128; |
| break; |
| case X86::BI__builtin_ia32_cmpps256_mask: |
| IID = Intrinsic::x86_avx512_mask_cmp_ps_256; |
| break; |
| case X86::BI__builtin_ia32_cmppd128_mask: |
| IID = Intrinsic::x86_avx512_mask_cmp_pd_128; |
| break; |
| case X86::BI__builtin_ia32_cmppd256_mask: |
| IID = Intrinsic::x86_avx512_mask_cmp_pd_256; |
| break; |
| } |
| |
| Function *Intr = CGM.getIntrinsic(IID); |
| if (IsMaskFCmp) { |
| unsigned NumElts = |
| cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(); |
| Ops[3] = getMaskVecValue(*this, Ops[3], NumElts); |
| Value *Cmp = Builder.CreateCall(Intr, Ops); |
| return EmitX86MaskedCompareResult(*this, Cmp, NumElts, nullptr); |
| } |
| |
| return Builder.CreateCall(Intr, Ops); |
| } |
| |
| // Builtins without the _mask suffix return a vector of integers of the same |
| // width as the input vectors; the _mask builtins return an integer bitmask. |
| if (IsMaskFCmp) { |
| // We ignore SAE if strict FP is disabled. We only keep precise |
| // exception behavior under strict FP. |
| // NOTE: If strict FP ever goes through here, a CGFPOptionsRAII object |
| // will be required. |
| unsigned NumElts = |
| cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(); |
| Value *Cmp; |
| if (IsSignaling) |
| Cmp = Builder.CreateFCmpS(Pred, Ops[0], Ops[1]); |
| else |
| Cmp = Builder.CreateFCmp(Pred, Ops[0], Ops[1]); |
| return EmitX86MaskedCompareResult(*this, Cmp, NumElts, Ops[3]); |
| } |
| |
| return getVectorFCmpIR(Pred, IsSignaling); |
| } |
| |
| // SSE scalar comparison intrinsics |
| case X86::BI__builtin_ia32_cmpeqss: |
| return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 0); |
| case X86::BI__builtin_ia32_cmpltss: |
| return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 1); |
| case X86::BI__builtin_ia32_cmpless: |
| return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 2); |
| case X86::BI__builtin_ia32_cmpunordss: |
| return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 3); |
| case X86::BI__builtin_ia32_cmpneqss: |
| return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 4); |
| case X86::BI__builtin_ia32_cmpnltss: |
| return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 5); |
| case X86::BI__builtin_ia32_cmpnless: |
| return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 6); |
| case X86::BI__builtin_ia32_cmpordss: |
| return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 7); |
| case X86::BI__builtin_ia32_cmpeqsd: |
| return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 0); |
| case X86::BI__builtin_ia32_cmpltsd: |
| return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 1); |
| case X86::BI__builtin_ia32_cmplesd: |
| return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 2); |
| case X86::BI__builtin_ia32_cmpunordsd: |
| return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 3); |
| case X86::BI__builtin_ia32_cmpneqsd: |
| return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 4); |
| case X86::BI__builtin_ia32_cmpnltsd: |
| return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 5); |
| case X86::BI__builtin_ia32_cmpnlesd: |
| return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 6); |
| case X86::BI__builtin_ia32_cmpordsd: |
| return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 7); |
| |
| // f16c half2float intrinsics |
| case X86::BI__builtin_ia32_vcvtph2ps: |
| case X86::BI__builtin_ia32_vcvtph2ps256: |
| case X86::BI__builtin_ia32_vcvtph2ps_mask: |
| case X86::BI__builtin_ia32_vcvtph2ps256_mask: |
| case X86::BI__builtin_ia32_vcvtph2ps512_mask: { |
| CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E); |
| return EmitX86CvtF16ToFloatExpr(*this, Ops, ConvertType(E->getType())); |
| } |
| |
| // AVX512 bf16 intrinsics |
| case X86::BI__builtin_ia32_cvtneps2bf16_128_mask: { |
| Ops[2] = getMaskVecValue( |
| *this, Ops[2], |
| cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements()); |
| Intrinsic::ID IID = Intrinsic::x86_avx512bf16_mask_cvtneps2bf16_128; |
| return Builder.CreateCall(CGM.getIntrinsic(IID), Ops); |
| } |
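| // __bf16 -> float conversion is exact, so a plain fpext suffices. |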
| case X86::BI__builtin_ia32_cvtsbf162ss_32: |
| return Builder.CreateFPExt(Ops[0], Builder.getFloatTy()); |
| |
| case X86::BI__builtin_ia32_cvtneps2bf16_256_mask: |
| case X86::BI__builtin_ia32_cvtneps2bf16_512_mask: { |
| Intrinsic::ID IID; |
| switch (BuiltinID) { |
| default: llvm_unreachable("Unsupported intrinsic!"); |
| case X86::BI__builtin_ia32_cvtneps2bf16_256_mask: |
| IID = Intrinsic::x86_avx512bf16_cvtneps2bf16_256; |
| break; |
| case X86::BI__builtin_ia32_cvtneps2bf16_512_mask: |
| IID = Intrinsic::x86_avx512bf16_cvtneps2bf16_512; |
| break; |
| } |
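| // Call the unmasked intrinsic, then apply the mask with a select. |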
| Value *Res = Builder.CreateCall(CGM.getIntrinsic(IID), Ops[0]); |
| return EmitX86Select(*this, Ops[2], Res, Ops[1]); |
| } |
| |
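| // __cpuid/__cpuidex: run CPUID for the requested leaf (and subleaf) and |
| // store EAX, EBX, ECX and EDX into the caller-provided int[4]. |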
| case X86::BI__cpuid: |
| case X86::BI__cpuidex: { |
| Value *FuncId = EmitScalarExpr(E->getArg(1)); |
| Value *SubFuncId = BuiltinID == X86::BI__cpuidex |
| ? EmitScalarExpr(E->getArg(2)) |
| : llvm::ConstantInt::get(Int32Ty, 0); |
| |
| llvm::StructType *CpuidRetTy = |
| llvm::StructType::get(Int32Ty, Int32Ty, Int32Ty, Int32Ty); |
| llvm::FunctionType *FTy = |
| llvm::FunctionType::get(CpuidRetTy, {Int32Ty, Int32Ty}, false); |
| |
| StringRef Asm, Constraints; |
| if (getTarget().getTriple().getArch() == llvm::Triple::x86) { |
| Asm = "cpuid"; |
| Constraints = "={ax},={bx},={cx},={dx},{ax},{cx}"; |
| } else { |
| // x86-64 uses %rbx as the base register, so preserve it. |
| Asm = "xchgq %rbx, ${1:q}\n" |
| "cpuid\n" |
| "xchgq %rbx, ${1:q}"; |
| Constraints = "={ax},=r,={cx},={dx},0,2"; |
| } |
| |
| llvm::InlineAsm *IA = llvm::InlineAsm::get(FTy, Asm, Constraints, |
| /*hasSideEffects=*/false); |
| Value *IACall = Builder.CreateCall(IA, {FuncId, SubFuncId}); |
| Value *BasePtr = EmitScalarExpr(E->getArg(0)); |
| Value *Store = nullptr; |
| for (unsigned i = 0; i < 4; i++) { |
| Value *Extracted = Builder.CreateExtractValue(IACall, i); |
| Value *StorePtr = Builder.CreateConstInBoundsGEP1_32(Int32Ty, BasePtr, i); |
| Store = Builder.CreateAlignedStore(Extracted, StorePtr, getIntAlign()); |
| } |
| |
| // Return the last store instruction to signal that we have emitted |
| // the intrinsic. |
| return Store; |
| } |
| |
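| // __emul/__emulu widen both 32-bit operands to 64 bits (sign- or |
| // zero-extended) and multiply, e.g. __emul(a, b) == (long long)a * b. |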
| case X86::BI__emul: |
| case X86::BI__emulu: { |
| llvm::Type *Int64Ty = llvm::IntegerType::get(getLLVMContext(), 64); |
| bool isSigned = (BuiltinID == X86::BI__emul); |
| Value *LHS = Builder.CreateIntCast(Ops[0], Int64Ty, isSigned); |
| Value *RHS = Builder.CreateIntCast(Ops[1], Int64Ty, isSigned); |
| return Builder.CreateMul(LHS, RHS, "", !isSigned, isSigned); |
| } |
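| // __mulh/__umulh return the high 64 bits of the 64x64->128-bit product; |
| // _mul128/_umul128 also store the high half through the third argument |
| // and return the low half. |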
| case X86::BI__mulh: |
| case X86::BI__umulh: |
| case X86::BI_mul128: |
| case X86::BI_umul128: { |
| llvm::Type *ResType = ConvertType(E->getType()); |
| llvm::Type *Int128Ty = llvm::IntegerType::get(getLLVMContext(), 128); |
| |
| bool IsSigned = (BuiltinID == X86::BI__mulh || BuiltinID == X86::BI_mul128); |
| Value *LHS = Builder.CreateIntCast(Ops[0], Int128Ty, IsSigned); |
| Value *RHS = Builder.CreateIntCast(Ops[1], Int128Ty, IsSigned); |
| |
| Value *MulResult, *HigherBits; |
| if (IsSigned) { |
| MulResult = Builder.CreateNSWMul(LHS, RHS); |
| HigherBits = Builder.CreateAShr(MulResult, 64); |
| } else { |
| MulResult = Builder.CreateNUWMul(LHS, RHS); |
| HigherBits = Builder.CreateLShr(MulResult, 64); |
| } |
| HigherBits = Builder.CreateIntCast(HigherBits, ResType, IsSigned); |
| |
| if (BuiltinID == X86::BI__mulh || BuiltinID == X86::BI__umulh) |
| return HigherBits; |
| |
| Address HighBitsAddress = EmitPointerWithAlignment(E->getArg(2)); |
| Builder.CreateStore(HigherBits, HighBitsAddress); |
| return Builder.CreateIntCast(MulResult, ResType, IsSigned); |
| } |
| |
| case X86::BI__faststorefence: { |
| return Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent, |
| llvm::SyncScope::System); |
| } |
| case X86::BI__shiftleft128: |
| case X86::BI__shiftright128: { |
| llvm::Function *F = CGM.getIntrinsic( |
| BuiltinID == X86::BI__shiftleft128 ? Intrinsic::fshl : Intrinsic::fshr, |
| Int64Ty); |
| // Flip low/high ops and zero-extend amount to matching type. |
| // shiftleft128(Low, High, Amt) -> fshl(High, Low, Amt) |
| // shiftright128(Low, High, Amt) -> fshr(High, Low, Amt) |
| std::swap(Ops[0], Ops[1]); |
| Ops[2] = Builder.CreateZExt(Ops[2], Int64Ty); |
| return Builder.CreateCall(F, Ops); |
| } |
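| // These MSVC barriers only constrain compiler reordering within the |
| // current thread, so a single-thread fence is sufficient. |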
| case X86::BI_ReadWriteBarrier: |
| case X86::BI_ReadBarrier: |
| case X86::BI_WriteBarrier: { |
| return Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent, |
| llvm::SyncScope::SingleThread); |
| } |
| |
| case X86::BI_AddressOfReturnAddress: { |
| Function *F = |
| CGM.getIntrinsic(Intrinsic::addressofreturnaddress, AllocaInt8PtrTy); |
| return Builder.CreateCall(F); |
| } |
| case X86::BI__stosb: { |
| // We treat __stosb as a volatile memset - it may not generate a "rep stosb" |
| // instruction, but it will create a memset that won't be optimized away. |
| return Builder.CreateMemSet(Ops[0], Ops[1], Ops[2], Align(1), true); |
| } |
| // These correspond to intrinsics that return 2 tiles (tile0_tile1). |
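| // Each loads a pair of tiles and writes them out through DstPtr0/DstPtr1 |
| // as plain vector stores. |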
| case X86::BI__builtin_ia32_t2rpntlvwz0_internal: |
| case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal: |
| case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal: |
| case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal: |
| case X86::BI__builtin_ia32_t2rpntlvwz1_internal: |
| case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal: |
| case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: |
| case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: { |
| Intrinsic::ID IID; |
| switch (BuiltinID) { |
| default: |
| llvm_unreachable("Unsupported intrinsic!"); |
| case X86::BI__builtin_ia32_t2rpntlvwz0_internal: |
| IID = Intrinsic::x86_t2rpntlvwz0_internal; |
| break; |
| case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal: |
| IID = Intrinsic::x86_t2rpntlvwz0rs_internal; |
| break; |
| case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal: |
| IID = Intrinsic::x86_t2rpntlvwz0t1_internal; |
| break; |
| case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal: |
| IID = Intrinsic::x86_t2rpntlvwz0rst1_internal; |
| break; |
| case X86::BI__builtin_ia32_t2rpntlvwz1_internal: |
| IID = Intrinsic::x86_t2rpntlvwz1_internal; |
| break; |
| case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal: |
| IID = Intrinsic::x86_t2rpntlvwz1rs_internal; |
| break; |
| case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: |
| IID = Intrinsic::x86_t2rpntlvwz1t1_internal; |
| break; |
| case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: |
| IID = Intrinsic::x86_t2rpntlvwz1rst1_internal; |
| break; |
| } |
| |
| // Ops = (Row0, Col0, Col1, DstPtr0, DstPtr1, SrcPtr, Stride) |
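| // Each of these intrinsics returns a pair of x86_amx tiles. |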
| Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), |
| {Ops[0], Ops[1], Ops[2], Ops[5], Ops[6]}); |
| |
| auto *PtrTy = E->getArg(3)->getType()->getAs<PointerType>(); |
| assert(PtrTy && "arg3 must be of pointer type"); |
| QualType PtreeTy = PtrTy->getPointeeType(); |
| llvm::Type *TyPtee = ConvertType(PtreeTy); |
| |
| // Bitcast the AMX type (x86_amx) to a vector type (256 x i32), |
| // then store tile0 into DstPtr0. |
| Value *T0 = Builder.CreateExtractValue(Call, 0); |
| Value *VecT0 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector, |
| {TyPtee}, {T0}); |
| Builder.CreateDefaultAlignedStore(VecT0, Ops[3]); |
| |
| // Likewise, store tile1 into DstPtr1. |
| Value *T1 = Builder.CreateExtractValue(Call, 1); |
| Value *VecT1 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector, |
| {TyPtee}, {T1}); |
| Value *Store = Builder.CreateDefaultAlignedStore(VecT1, Ops[4]); |
| |
| // Note: We deliberately avoid using x86_tilestored64_internal to store the |
| // results here, because it cannot guarantee the scope of the memory it |
| // writes. That could cause shape reloads after the first AMX intrinsic, |
| // which the current AMX register allocation is unable to handle. |
| |
| return Store; |
| } |
| case X86::BI__ud2: |
| // llvm.trap lowers to a ud2a instruction on x86. |
| return EmitTrapCall(Intrinsic::trap); |
| case X86::BI__int2c: { |
| // This software interrupt signals a driver assertion failure in x86 NT |
| // kernels. |
| llvm::FunctionType *FTy = llvm::FunctionType::get(VoidTy, false); |
| llvm::InlineAsm *IA = |
| llvm::InlineAsm::get(FTy, "int $$0x2c", "", /*hasSideEffects=*/true); |
| llvm::AttributeList NoReturnAttr = llvm::AttributeList::get( |
| getLLVMContext(), llvm::AttributeList::FunctionIndex, |
| llvm::Attribute::NoReturn); |
| llvm::CallInst *CI = Builder.CreateCall(IA); |
| CI->setAttributes(NoReturnAttr); |
| return CI; |
| } |
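| // The __readfs*/__readgs* intrinsics read from the FS/GS segments. On x86, |
| // LLVM address space 257 is the FS segment and 256 is the GS segment, so |
| // emit a volatile load through a pointer in that address space. |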
| case X86::BI__readfsbyte: |
| case X86::BI__readfsword: |
| case X86::BI__readfsdword: |
| case X86::BI__readfsqword: { |
| llvm::Type *IntTy = ConvertType(E->getType()); |
| Value *Ptr = Builder.CreateIntToPtr( |
| Ops[0], llvm::PointerType::get(getLLVMContext(), 257)); |
| LoadInst *Load = Builder.CreateAlignedLoad( |
| IntTy, Ptr, getContext().getTypeAlignInChars(E->getType())); |
| Load->setVolatile(true); |
| return Load; |
| } |
| case X86::BI__readgsbyte: |
| case X86::BI__readgsword: |
| case X86::BI__readgsdword: |
| case X86::BI__readgsqword: { |
| llvm::Type *IntTy = ConvertType(E->getType()); |
| Value *Ptr = Builder.CreateIntToPtr( |
| Ops[0], llvm::PointerType::get(getLLVMContext(), 256)); |
| LoadInst *Load = Builder.CreateAlignedLoad( |
| IntTy, Ptr, getContext().getTypeAlignInChars(E->getType())); |
| Load->setVolatile(true); |
| return Load; |
| } |
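| // __builtin_ia32_encodekey128_u32: result 0 is the intrinsic's 32-bit |
| // return value; results 1-3 form the 384-bit key handle and are stored as |
| // three 16-byte chunks through the output pointer. |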
| case X86::BI__builtin_ia32_encodekey128_u32: { |
| Intrinsic::ID IID = Intrinsic::x86_encodekey128; |
| |
| Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), {Ops[0], Ops[1]}); |
| |
| for (int i = 0; i < 3; ++i) { |
| Value *Extract = Builder.CreateExtractValue(Call, i + 1); |
| Value *Ptr = Builder.CreateConstGEP1_32(Int8Ty, Ops[2], i * 16); |
| Builder.CreateAlignedStore(Extract, Ptr, Align(1)); |
| } |
| |
| return Builder.CreateExtractValue(Call, 0); |
| } |
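| // __builtin_ia32_encodekey256_u32: as above, but the 512-bit key handle |
| // occupies results 1-4 and is stored as four 16-byte chunks. |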
| case X86::BI__builtin_ia32_encodekey256_u32: { |
| Intrinsic::ID IID = Intrinsic::x86_encodekey256; |
| |
| Value *Call = |
| Builder.CreateCall(CGM.getIntrinsic(IID), {Ops[0], Ops[1], Ops[2]}); |
| |
| for (int i = 0; i < 4; ++i) { |
| Value *Extract = Builder.CreateExtractValue(Call, i + 1); |
| Value *Ptr = Builder.CreateConstGEP1_32(Int8Ty, Ops[3], i * 16); |
| Builder.CreateAlignedStore(Extract, Ptr, Align(1)); |
| } |
| |
| return Builder.CreateExtractValue(Call, 0); |
| } |
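| // AES Key Locker encrypt/decrypt: result 0 is a success flag and result 1 |
| // is the processed block. On success the block is stored to the |
| // destination; on failure a zero block is stored instead. |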
| case X86::BI__builtin_ia32_aesenc128kl_u8: |
| case X86::BI__builtin_ia32_aesdec128kl_u8: |
| case X86::BI__builtin_ia32_aesenc256kl_u8: |
| case X86::BI__builtin_ia32_aesdec256kl_u8: { |
| Intrinsic::ID IID; |
| StringRef BlockName; |
| switch (BuiltinID) { |
| default: |
| llvm_unreachable("Unexpected builtin"); |
| case X86::BI__builtin_ia32_aesenc128kl_u8: |
| IID = Intrinsic::x86_aesenc128kl; |
| BlockName = "aesenc128kl"; |
| break; |
| case X86::BI__builtin_ia32_aesdec128kl_u8: |
| IID = Intrinsic::x86_aesdec128kl; |
| BlockName = "aesdec128kl"; |
| break; |
| case X86::BI__builtin_ia32_aesenc256kl_u8: |
| IID = Intrinsic::x86_aesenc256kl; |
| BlockName = "aesenc256kl"; |
| break; |
| case X86::BI__builtin_ia32_aesdec256kl_u8: |
| IID = Intrinsic::x86_aesdec256kl; |
| BlockName = "aesdec256kl"; |
| break; |
| } |
| |
| Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), {Ops[1], Ops[2]}); |
| |
| BasicBlock *NoError = |
| createBasicBlock(BlockName + "_no_error", this->CurFn); |
| BasicBlock *Error = createBasicBlock(BlockName + "_error", this->CurFn); |
| BasicBlock *End = createBasicBlock(BlockName + "_end", this->CurFn); |
| |
| Value *Ret = Builder.CreateExtractValue(Call, 0); |
| Value *Succ = Builder.CreateTrunc(Ret, Builder.getInt1Ty()); |
| Value *Out = Builder.CreateExtractValue(Call, 1); |
| Builder.CreateCondBr(Succ, NoError, Error); |
| |
| Builder.SetInsertPoint(NoError); |
| Builder.CreateDefaultAlignedStore(Out, Ops[0]); |
| Builder.CreateBr(End); |
| |
| Builder.SetInsertPoint(Error); |
| Constant *Zero = llvm::Constant::getNullValue(Out->getType()); |
| Builder.CreateDefaultAlignedStore(Zero, Ops[0]); |
| Builder.CreateBr(End); |
| |
| Builder.SetInsertPoint(End); |
| return Builder.CreateExtractValue(Call, 0); |
| } |
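| // The wide Key Locker variants process eight 128-bit blocks at once. |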
| case X86::BI__builtin_ia32_aesencwide128kl_u8: |
| case X86::BI__builtin_ia32_aesdecwide128kl_u8: |
| case X86::BI__builtin_ia32_aesencwide256kl_u8: |
| case X86::BI__builtin_ia32_aesdecwide256kl_u8: { |
| Intrinsic::ID IID; |
| StringRef BlockName; |
| switch (BuiltinID) { |
| default: |
| llvm_unreachable("Unexpected builtin"); |
| case X86::BI__builtin_ia32_aesencwide128kl_u8: |
| IID = Intrinsic::x86_aesencwide128kl; |
| BlockName = "aesencwide128kl"; |
| break; |
| case X86::BI__builtin_ia32_aesdecwide128kl_u8: |
| IID = Intrinsic::x86_aesdecwide128kl; |
| BlockName = "aesdecwide128kl"; |
| break; |
| case X86::BI__builtin_ia32_aesencwide256kl_u8: |
| IID = Intrinsic::x86_aesencwide256kl; |
| BlockName = "aesencwide256kl"; |
| break; |
| case X86::BI__builtin_ia32_aesdecwide256kl_u8: |
| IID = Intrinsic::x86_aesdecwide256kl; |
| BlockName = "aesdecwide256kl"; |
| break; |
| } |
| |
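| // Build the intrinsic arguments: the handle pointer followed by the eight |
| // 128-bit input blocks loaded from Ops[1]. |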
| llvm::Type *Ty = FixedVectorType::get(Builder.getInt64Ty(), 2); |
| Value *InOps[9]; |
| InOps[0] = Ops[2]; |
| for (int i = 0; i != 8; ++i) { |
| Value *Ptr = Builder.CreateConstGEP1_32(Ty, Ops[1], i); |
| InOps[i + 1] = Builder.CreateAlignedLoad(Ty, Ptr, Align(16)); |
| } |
| |
| Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), InOps); |
| |
| BasicBlock *NoError = |
| createBasicBlock(BlockName + "_no_error", this->CurFn); |
| BasicBlock *Error = createBasicBlock(BlockName + "_error", this->CurFn); |
| BasicBlock *End = createBasicBlock(BlockName + "_end", this->CurFn); |
| |
| Value *Ret = Builder.CreateExtractValue(Call, 0); |
| Value *Succ = Builder.CreateTrunc(Ret, Builder.getInt1Ty()); |
| Builder.CreateCondBr(Succ, NoError, Error); |
| |
| Builder.SetInsertPoint(NoError); |
| for (int i = 0; i != 8; ++i) { |
| Value *Extract = Builder.CreateExtractValue(Call, i + 1); |
| Value *Ptr = Builder.CreateConstGEP1_32(Extract->getType(), Ops[0], i); |
| Builder.CreateAlignedStore(Extract, Ptr, Align(16)); |
| } |
| Builder.CreateBr(End); |
| |
| Builder.SetInsertPoint(Error); |
| for (int i = 0; i != 8; ++i) { |
| Value *Out = Builder.CreateExtractValue(Call, i + 1); |
| Constant *Zero = llvm::Constant::getNullValue(Out->getType()); |
| Value *Ptr = Builder.CreateConstGEP1_32(Out->getType(), Ops[0], i); |
| Builder.CreateAlignedStore(Zero, Ptr, Align(16)); |
| } |
| Builder.CreateBr(End); |
| |
| Builder.SetInsertPoint(End); |
| return Builder.CreateExtractValue(Call, 0); |
| } |
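| // Complex FP16 multiply-add builtins. IsConjFMA selects the conjugating |
| // (vfcmadd) intrinsic variant; the mask is applied to the result below. |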
| case X86::BI__builtin_ia32_vfcmaddcph512_mask: |
| IsConjFMA = true; |
| [[fallthrough]]; |
| case X86::BI__builtin_ia32_vfmaddcph512_mask: { |
| Intrinsic::ID IID = IsConjFMA |
| ? Intrinsic::x86_avx512fp16_mask_vfcmadd_cph_512 |
| : Intrinsic::x86_avx512fp16_mask_vfmadd_cph_512; |
| Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), Ops); |
| return EmitX86Select(*this, Ops[3], Call, Ops[0]); |
| } |
| case X86::BI__builtin_ia32_vfcmaddcsh_round_mask: |
| IsConjFMA = true; |
| [[fallthrough]]; |
| case X86::BI__builtin_ia32_vfmaddcsh_round_mask: { |
| Intrinsic::ID IID = IsConjFMA ? Intrinsic::x86_avx512fp16_mask_vfcmadd_csh |
| : Intrinsic::x86_avx512fp16_mask_vfmadd_csh; |
| Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), Ops); |
| Value *And = Builder.CreateAnd(Ops[3], llvm::ConstantInt::get(Int8Ty, 1)); |
| return EmitX86Select(*this, And, Call, Ops[0]); |
| } |
| case X86::BI__builtin_ia32_vfcmaddcsh_round_mask3: |
| IsConjFMA = true; |
| [[fallthrough]]; |
| case X86::BI__builtin_ia32_vfmaddcsh_round_mask3: { |
| Intrinsic::ID IID = IsConjFMA ? Intrinsic::x86_avx512fp16_mask_vfcmadd_csh |
| : Intrinsic::x86_avx512fp16_mask_vfmadd_csh; |
| Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), Ops); |
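| // Blend: take element 0 from the intrinsic result and elements 1-3 from |
| // Ops[2]. |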
| static constexpr int Mask[] = {0, 5, 6, 7}; |
| return Builder.CreateShuffleVector(Call, Ops[2], Mask); |
| } |
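| // Lower to llvm.prefetch with rw=0 (read) and cache type 0 (instruction |
| // cache); Ops[1] supplies the locality hint. |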
| case X86::BI__builtin_ia32_prefetchi: |
| return Builder.CreateCall( |
| CGM.getIntrinsic(Intrinsic::prefetch, Ops[0]->getType()), |
| {Ops[0], llvm::ConstantInt::get(Int32Ty, 0), Ops[1], |
| llvm::ConstantInt::get(Int32Ty, 0)}); |
| } |
| } |