blob: bb5637a319477dbcd7edf07a9cae762fff1a6dc3 [file] [log] [blame]
//===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the X86 implementation of the TargetInstrInfo class.
//
//===----------------------------------------------------------------------===//
#include "X86InstrInfo.h"
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86InstrFoldTables.h"
#include "X86MachineFunctionInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
#define DEBUG_TYPE "x86-instr-info"
#define GET_INSTRINFO_CTOR_DTOR
#include "X86GenInstrInfo.inc"
// Hidden boolean command-line option "disable-spill-fusing".
// Per its description, it disables fusing of spill code into instructions.
static cl::opt<bool>
NoFusing("disable-spill-fusing",
cl::desc("Disable fusing of spill code into instructions"),
cl::Hidden);
// Hidden boolean command-line option "print-failed-fuse-candidates".
// Per its description, it prints instructions the allocator wanted to fuse
// but which the X86 backend currently cannot.
static cl::opt<bool>
PrintFailedFusing("print-failed-fuse-candidates",
cl::desc("Print instructions that the allocator wants to"
" fuse, but the X86 backend currently can't"),
cl::Hidden);
// Hidden boolean command-line option "remat-pic-stub-load", initialised to
// false. It is consulted when deciding whether a load whose displacement is a
// global may be rematerialized off the PIC base.
static cl::opt<bool>
ReMatPICStubLoad("remat-pic-stub-load",
cl::desc("Re-materialize load from stub in PIC mode"),
cl::init(false), cl::Hidden);
// Hidden unsigned command-line option "partial-reg-update-clearance",
// initialised to 64: the clearance between two register writes used when
// deciding to insert an XOR to avoid a partial register update.
static cl::opt<unsigned>
PartialRegUpdateClearance("partial-reg-update-clearance",
cl::desc("Clearance between two register writes "
"for inserting XOR to avoid partial "
"register update"),
cl::init(64), cl::Hidden);
// Hidden unsigned command-line option "undef-reg-clearance", initialised to
// 128: how many idle instructions are wanted before certain undef register
// reads.
static cl::opt<unsigned>
UndefRegClearance("undef-reg-clearance",
cl::desc("How many idle instructions we would like before "
"certain undef register reads"),
cl::init(128), cl::Hidden);
// Pin the vtable to this file. The body is intentionally empty; the function
// exists only so that it has an out-of-line definition here.
void X86InstrInfo::anchor() {}
/// Construct the X86 instruction info for subtarget \p STI.
///
/// The generated base class receives, in order: the ADJCALLSTACKDOWN and
/// ADJCALLSTACKUP pseudo opcodes (the 64-bit variants when
/// STI.isTarget64BitLP64() is true, the 32-bit variants otherwise), the
/// CATCHRET opcode, and the return opcode (RET64 when STI.is64Bit(), else
/// RET32). The register info member is built from the subtarget's triple.
X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
: X86GenInstrInfo((STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64
: X86::ADJCALLSTACKDOWN32),
(STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64
: X86::ADJCALLSTACKUP32),
X86::CATCHRET,
(STI.is64Bit() ? X86::RET64 : X86::RET32)),
Subtarget(STI), RI(STI.getTargetTriple()) {
}
bool
X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
Register &SrcReg, Register &DstReg,
unsigned &SubIdx) const {
switch (MI.getOpcode()) {
default: break;
case X86::MOVSX16rr8:
case X86::MOVZX16rr8:
case X86::MOVSX32rr8:
case X86::MOVZX32rr8:
case X86::MOVSX64rr8:
if (!Subtarget.is64Bit())
// It's not always legal to reference the low 8-bit of the larger
// register in 32-bit mode.
return false;
LLVM_FALLTHROUGH;
case X86::MOVSX32rr16:
case X86::MOVZX32rr16:
case X86::MOVSX64rr16:
case X86::MOVSX64rr32: {
if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
// Be conservative.
return false;
SrcReg = MI.getOperand(1).getReg();
DstReg = MI.getOperand(0).getReg();
switch (MI.getOpcode()) {
default: llvm_unreachable("Unreachable!");
case X86::MOVSX16rr8:
case X86::MOVZX16rr8:
case X86::MOVSX32rr8:
case X86::MOVZX32rr8:
case X86::MOVSX64rr8:
SubIdx = X86::sub_8bit;
break;
case X86::MOVSX32rr16:
case X86::MOVZX32rr16:
case X86::MOVSX64rr16:
SubIdx = X86::sub_16bit;
break;
case X86::MOVSX64rr32:
SubIdx = X86::sub_32bit;
break;
}
return true;
}
}
return false;
}
/// Return true if \p MI's opcode is one of the register-form instructions
/// listed below, which this file treats as data invariant (believed to take
/// the same time regardless of the values operated on).
///
/// Only the opcode is inspected. Any opcode that is not explicitly listed is
/// conservatively reported as not data invariant.
bool X86InstrInfo::isDataInvariant(MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
// By default, assume that the instruction is not data invariant.
return false;
// Some target-independent operations that trivially lower to data-invariant
// instructions.
case TargetOpcode::COPY:
case TargetOpcode::INSERT_SUBREG:
case TargetOpcode::SUBREG_TO_REG:
return true;
// On x86 it is believed that imul is constant time w.r.t. the loaded data.
// However, they set flags and are perhaps the most surprisingly constant
// time operations so we call them out here separately.
case X86::IMUL16rr:
case X86::IMUL16rri8:
case X86::IMUL16rri:
case X86::IMUL32rr:
case X86::IMUL32rri8:
case X86::IMUL32rri:
case X86::IMUL64rr:
case X86::IMUL64rri32:
case X86::IMUL64rri8:
// Bit scanning and counting instructions that are somewhat surprisingly
// constant time as they scan across bits and do other fairly complex
// operations like popcnt, but are believed to be constant time on x86.
// However, these set flags.
case X86::BSF16rr:
case X86::BSF32rr:
case X86::BSF64rr:
case X86::BSR16rr:
case X86::BSR32rr:
case X86::BSR64rr:
case X86::LZCNT16rr:
case X86::LZCNT32rr:
case X86::LZCNT64rr:
case X86::POPCNT16rr:
case X86::POPCNT32rr:
case X86::POPCNT64rr:
case X86::TZCNT16rr:
case X86::TZCNT32rr:
case X86::TZCNT64rr:
// Bit manipulation instructions are effectively combinations of basic
// arithmetic ops, and should still execute in constant time. These also
// set flags.
case X86::BLCFILL32rr:
case X86::BLCFILL64rr:
case X86::BLCI32rr:
case X86::BLCI64rr:
case X86::BLCIC32rr:
case X86::BLCIC64rr:
case X86::BLCMSK32rr:
case X86::BLCMSK64rr:
case X86::BLCS32rr:
case X86::BLCS64rr:
case X86::BLSFILL32rr:
case X86::BLSFILL64rr:
case X86::BLSI32rr:
case X86::BLSI64rr:
case X86::BLSIC32rr:
case X86::BLSIC64rr:
case X86::BLSMSK32rr:
case X86::BLSMSK64rr:
case X86::BLSR32rr:
case X86::BLSR64rr:
case X86::TZMSK32rr:
case X86::TZMSK64rr:
// Bit extracting and clearing instructions should execute in constant time,
// and set flags.
case X86::BEXTR32rr:
case X86::BEXTR64rr:
case X86::BEXTRI32ri:
case X86::BEXTRI64ri:
case X86::BZHI32rr:
case X86::BZHI64rr:
// Shift and rotate.
case X86::ROL8r1:
case X86::ROL16r1:
case X86::ROL32r1:
case X86::ROL64r1:
case X86::ROL8rCL:
case X86::ROL16rCL:
case X86::ROL32rCL:
case X86::ROL64rCL:
case X86::ROL8ri:
case X86::ROL16ri:
case X86::ROL32ri:
case X86::ROL64ri:
case X86::ROR8r1:
case X86::ROR16r1:
case X86::ROR32r1:
case X86::ROR64r1:
case X86::ROR8rCL:
case X86::ROR16rCL:
case X86::ROR32rCL:
case X86::ROR64rCL:
case X86::ROR8ri:
case X86::ROR16ri:
case X86::ROR32ri:
case X86::ROR64ri:
case X86::SAR8r1:
case X86::SAR16r1:
case X86::SAR32r1:
case X86::SAR64r1:
case X86::SAR8rCL:
case X86::SAR16rCL:
case X86::SAR32rCL:
case X86::SAR64rCL:
case X86::SAR8ri:
case X86::SAR16ri:
case X86::SAR32ri:
case X86::SAR64ri:
case X86::SHL8r1:
case X86::SHL16r1:
case X86::SHL32r1:
case X86::SHL64r1:
case X86::SHL8rCL:
case X86::SHL16rCL:
case X86::SHL32rCL:
case X86::SHL64rCL:
case X86::SHL8ri:
case X86::SHL16ri:
case X86::SHL32ri:
case X86::SHL64ri:
case X86::SHR8r1:
case X86::SHR16r1:
case X86::SHR32r1:
case X86::SHR64r1:
case X86::SHR8rCL:
case X86::SHR16rCL:
case X86::SHR32rCL:
case X86::SHR64rCL:
case X86::SHR8ri:
case X86::SHR16ri:
case X86::SHR32ri:
case X86::SHR64ri:
case X86::SHLD16rrCL:
case X86::SHLD32rrCL:
case X86::SHLD64rrCL:
case X86::SHLD16rri8:
case X86::SHLD32rri8:
case X86::SHLD64rri8:
case X86::SHRD16rrCL:
case X86::SHRD32rrCL:
case X86::SHRD64rrCL:
case X86::SHRD16rri8:
case X86::SHRD32rri8:
case X86::SHRD64rri8:
// Basic arithmetic is constant time on the input but does set flags.
case X86::ADC8rr:
case X86::ADC8ri:
case X86::ADC16rr:
case X86::ADC16ri:
case X86::ADC16ri8:
case X86::ADC32rr:
case X86::ADC32ri:
case X86::ADC32ri8:
case X86::ADC64rr:
case X86::ADC64ri8:
case X86::ADC64ri32:
case X86::ADD8rr:
case X86::ADD8ri:
case X86::ADD16rr:
case X86::ADD16ri:
case X86::ADD16ri8:
case X86::ADD32rr:
case X86::ADD32ri:
case X86::ADD32ri8:
case X86::ADD64rr:
case X86::ADD64ri8:
case X86::ADD64ri32:
case X86::AND8rr:
case X86::AND8ri:
case X86::AND16rr:
case X86::AND16ri:
case X86::AND16ri8:
case X86::AND32rr:
case X86::AND32ri:
case X86::AND32ri8:
case X86::AND64rr:
case X86::AND64ri8:
case X86::AND64ri32:
case X86::OR8rr:
case X86::OR8ri:
case X86::OR16rr:
case X86::OR16ri:
case X86::OR16ri8:
case X86::OR32rr:
case X86::OR32ri:
case X86::OR32ri8:
case X86::OR64rr:
case X86::OR64ri8:
case X86::OR64ri32:
case X86::SBB8rr:
case X86::SBB8ri:
case X86::SBB16rr:
case X86::SBB16ri:
case X86::SBB16ri8:
case X86::SBB32rr:
case X86::SBB32ri:
case X86::SBB32ri8:
case X86::SBB64rr:
case X86::SBB64ri8:
case X86::SBB64ri32:
case X86::SUB8rr:
case X86::SUB8ri:
case X86::SUB16rr:
case X86::SUB16ri:
case X86::SUB16ri8:
case X86::SUB32rr:
case X86::SUB32ri:
case X86::SUB32ri8:
case X86::SUB64rr:
case X86::SUB64ri8:
case X86::SUB64ri32:
case X86::XOR8rr:
case X86::XOR8ri:
case X86::XOR16rr:
case X86::XOR16ri:
case X86::XOR16ri8:
case X86::XOR32rr:
case X86::XOR32ri:
case X86::XOR32ri8:
case X86::XOR64rr:
case X86::XOR64ri8:
case X86::XOR64ri32:
// Arithmetic with just 32-bit and 64-bit variants and no immediates.
case X86::ADCX32rr:
case X86::ADCX64rr:
case X86::ADOX32rr:
case X86::ADOX64rr:
case X86::ANDN32rr:
case X86::ANDN64rr:
// Unary arithmetic operations.
case X86::DEC8r:
case X86::DEC16r:
case X86::DEC32r:
case X86::DEC64r:
case X86::INC8r:
case X86::INC16r:
case X86::INC32r:
case X86::INC64r:
case X86::NEG8r:
case X86::NEG16r:
case X86::NEG32r:
case X86::NEG64r:
// Unlike other arithmetic, NOT doesn't set EFLAGS.
case X86::NOT8r:
case X86::NOT16r:
case X86::NOT32r:
case X86::NOT64r:
// Various move instructions used to zero or sign extend things. Note that we
// intentionally don't support the _NOREX variants as we can't handle that
// register constraint anyways.
case X86::MOVSX16rr8:
case X86::MOVSX32rr8:
case X86::MOVSX32rr16:
case X86::MOVSX64rr8:
case X86::MOVSX64rr16:
case X86::MOVSX64rr32:
case X86::MOVZX16rr8:
case X86::MOVZX32rr8:
case X86::MOVZX32rr16:
case X86::MOVZX64rr8:
case X86::MOVZX64rr16:
case X86::MOV32rr:
// Arithmetic instructions that are both constant time and don't set flags.
case X86::RORX32ri:
case X86::RORX64ri:
case X86::SARX32rr:
case X86::SARX64rr:
case X86::SHLX32rr:
case X86::SHLX64rr:
case X86::SHRX32rr:
case X86::SHRX64rr:
// LEA doesn't actually access memory, and its arithmetic is constant time.
case X86::LEA16r:
case X86::LEA32r:
case X86::LEA64_32r:
case X86::LEA64r:
return true;
}
}
/// Return true if \p MI's opcode is one of the memory-operand (load-folding)
/// instructions listed below, which this file treats as data invariant with
/// respect to the loaded value.
///
/// Only the opcode is inspected. Any opcode that is not explicitly listed is
/// conservatively assumed to leak the loaded data immediately.
bool X86InstrInfo::isDataInvariantLoad(MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
// By default, assume that the load will immediately leak.
return false;
// On x86 it is believed that imul is constant time w.r.t. the loaded data.
// However, they set flags and are perhaps the most surprisingly constant
// time operations so we call them out here separately.
case X86::IMUL16rm:
case X86::IMUL16rmi8:
case X86::IMUL16rmi:
case X86::IMUL32rm:
case X86::IMUL32rmi8:
case X86::IMUL32rmi:
case X86::IMUL64rm:
case X86::IMUL64rmi32:
case X86::IMUL64rmi8:
// Bit scanning and counting instructions that are somewhat surprisingly
// constant time as they scan across bits and do other fairly complex
// operations like popcnt, but are believed to be constant time on x86.
// However, these set flags.
case X86::BSF16rm:
case X86::BSF32rm:
case X86::BSF64rm:
case X86::BSR16rm:
case X86::BSR32rm:
case X86::BSR64rm:
case X86::LZCNT16rm:
case X86::LZCNT32rm:
case X86::LZCNT64rm:
case X86::POPCNT16rm:
case X86::POPCNT32rm:
case X86::POPCNT64rm:
case X86::TZCNT16rm:
case X86::TZCNT32rm:
case X86::TZCNT64rm:
// Bit manipulation instructions are effectively combinations of basic
// arithmetic ops, and should still execute in constant time. These also
// set flags.
case X86::BLCFILL32rm:
case X86::BLCFILL64rm:
case X86::BLCI32rm:
case X86::BLCI64rm:
case X86::BLCIC32rm:
case X86::BLCIC64rm:
case X86::BLCMSK32rm:
case X86::BLCMSK64rm:
case X86::BLCS32rm:
case X86::BLCS64rm:
case X86::BLSFILL32rm:
case X86::BLSFILL64rm:
case X86::BLSI32rm:
case X86::BLSI64rm:
case X86::BLSIC32rm:
case X86::BLSIC64rm:
case X86::BLSMSK32rm:
case X86::BLSMSK64rm:
case X86::BLSR32rm:
case X86::BLSR64rm:
case X86::TZMSK32rm:
case X86::TZMSK64rm:
// Bit extracting and clearing instructions should execute in constant time,
// and set flags.
case X86::BEXTR32rm:
case X86::BEXTR64rm:
case X86::BEXTRI32mi:
case X86::BEXTRI64mi:
case X86::BZHI32rm:
case X86::BZHI64rm:
// Basic arithmetic is constant time on the input but does set flags.
case X86::ADC8rm:
case X86::ADC16rm:
case X86::ADC32rm:
case X86::ADC64rm:
case X86::ADCX32rm:
case X86::ADCX64rm:
case X86::ADD8rm:
case X86::ADD16rm:
case X86::ADD32rm:
case X86::ADD64rm:
case X86::ADOX32rm:
case X86::ADOX64rm:
case X86::AND8rm:
case X86::AND16rm:
case X86::AND32rm:
case X86::AND64rm:
case X86::ANDN32rm:
case X86::ANDN64rm:
case X86::OR8rm:
case X86::OR16rm:
case X86::OR32rm:
case X86::OR64rm:
case X86::SBB8rm:
case X86::SBB16rm:
case X86::SBB32rm:
case X86::SBB64rm:
case X86::SUB8rm:
case X86::SUB16rm:
case X86::SUB32rm:
case X86::SUB64rm:
case X86::XOR8rm:
case X86::XOR16rm:
case X86::XOR32rm:
case X86::XOR64rm:
// Integer multiply w/o affecting flags is still believed to be constant
// time on x86. Called out separately as this is among the most surprising
// instructions to exhibit that behavior.
case X86::MULX32rm:
case X86::MULX64rm:
// Arithmetic instructions that are both constant time and don't set flags.
case X86::RORX32mi:
case X86::RORX64mi:
case X86::SARX32rm:
case X86::SARX64rm:
case X86::SHLX32rm:
case X86::SHLX64rm:
case X86::SHRX32rm:
case X86::SHRX64rm:
// Conversions are believed to be constant time and don't set flags.
case X86::CVTTSD2SI64rm:
case X86::VCVTTSD2SI64rm:
case X86::VCVTTSD2SI64Zrm:
case X86::CVTTSD2SIrm:
case X86::VCVTTSD2SIrm:
case X86::VCVTTSD2SIZrm:
case X86::CVTTSS2SI64rm:
case X86::VCVTTSS2SI64rm:
case X86::VCVTTSS2SI64Zrm:
case X86::CVTTSS2SIrm:
case X86::VCVTTSS2SIrm:
case X86::VCVTTSS2SIZrm:
case X86::CVTSI2SDrm:
case X86::VCVTSI2SDrm:
case X86::VCVTSI2SDZrm:
case X86::CVTSI2SSrm:
case X86::VCVTSI2SSrm:
case X86::VCVTSI2SSZrm:
case X86::CVTSI642SDrm:
case X86::VCVTSI642SDrm:
case X86::VCVTSI642SDZrm:
case X86::CVTSI642SSrm:
case X86::VCVTSI642SSrm:
case X86::VCVTSI642SSZrm:
case X86::CVTSS2SDrm:
case X86::VCVTSS2SDrm:
case X86::VCVTSS2SDZrm:
case X86::CVTSD2SSrm:
case X86::VCVTSD2SSrm:
case X86::VCVTSD2SSZrm:
// AVX512 added unsigned integer conversions.
case X86::VCVTTSD2USI64Zrm:
case X86::VCVTTSD2USIZrm:
case X86::VCVTTSS2USI64Zrm:
case X86::VCVTTSS2USIZrm:
case X86::VCVTUSI2SDZrm:
case X86::VCVTUSI642SDZrm:
case X86::VCVTUSI2SSZrm:
case X86::VCVTUSI642SSZrm:
// Loads to register don't set flags.
case X86::MOV8rm:
case X86::MOV8rm_NOREX:
case X86::MOV16rm:
case X86::MOV32rm:
case X86::MOV64rm:
case X86::MOVSX16rm8:
case X86::MOVSX32rm16:
case X86::MOVSX32rm8:
case X86::MOVSX32rm8_NOREX:
case X86::MOVSX64rm16:
case X86::MOVSX64rm32:
case X86::MOVSX64rm8:
case X86::MOVZX16rm8:
case X86::MOVZX32rm16:
case X86::MOVZX32rm8:
case X86::MOVZX32rm8_NOREX:
case X86::MOVZX64rm16:
case X86::MOVZX64rm8:
return true;
}
}
/// Return the stack-pointer adjustment attributed to \p MI.
///
/// - For call-frame setup/destroy pseudos: the frame size rounded up to the
///   stack alignment, minus the recorded frame adjustment, negated for a
///   frame-destroy instruction.
/// - For calls: the negated immediate in operand 1 of the next call-frame
///   destroy pseudo in the same block, or 0 if another call or the end of the
///   block is reached first.
/// - For the PUSH forms expected in call sequences: 4 or 8 bytes.
/// - Otherwise 0.
int X86InstrInfo::getSPAdjust(const MachineInstr &MI) const {
  const MachineFunction *MF = MI.getParent()->getParent();
  const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();

  if (isFrameInstr(MI)) {
    int SPAdj = alignTo(getFrameSize(MI), TFI->getStackAlign());
    SPAdj -= getFrameAdjustment(MI);
    if (!isFrameSetup(MI))
      SPAdj = -SPAdj;
    return SPAdj;
  }

  // To know whether a call adjusts the stack, we need information
  // that is bound to the following ADJCALLSTACKUP pseudo.
  // Look for the next ADJCALLSTACKUP that follows the call.
  if (MI.isCall()) {
    const MachineBasicBlock *MBB = MI.getParent();
    auto I = ++MachineBasicBlock::const_iterator(MI);
    const auto E = MBB->end();
    for (; I != E; ++I) {
      if (I->getOpcode() == getCallFrameDestroyOpcode() ||
          I->isCall())
        break;
    }

    // If we could not find a frame destroy opcode, then it has already
    // been simplified, so we don't care. The scan may also have run off the
    // end of the block, in which case I must not be dereferenced.
    if (I == E || I->getOpcode() != getCallFrameDestroyOpcode())
      return 0;

    return -(I->getOperand(1).getImm());
  }

  // Currently handle only PUSHes we can reasonably expect to see
  // in call sequences
  switch (MI.getOpcode()) {
  default:
    return 0;
  case X86::PUSH32i8:
  case X86::PUSH32r:
  case X86::PUSH32rmm:
  case X86::PUSH32rmr:
  case X86::PUSHi32:
    return 4;
  case X86::PUSH64i8:
  case X86::PUSH64r:
  case X86::PUSH64rmm:
  case X86::PUSH64rmr:
  case X86::PUSH64i32:
    return 8;
  }
}
/// Return true and the FrameIndex if the specified
/// operand and follow operands form a reference to the stack frame.
bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op,
int &FrameIndex) const {
if (MI.getOperand(Op + X86::AddrBaseReg).isFI() &&
MI.getOperand(Op + X86::AddrScaleAmt).isImm() &&
MI.getOperand(Op + X86::AddrIndexReg).isReg() &&
MI.getOperand(Op + X86::AddrDisp).isImm() &&
MI.getOperand(Op + X86::AddrScaleAmt).getImm() == 1 &&
MI.getOperand(Op + X86::AddrIndexReg).getReg() == 0 &&
MI.getOperand(Op + X86::AddrDisp).getImm() == 0) {
FrameIndex = MI.getOperand(Op + X86::AddrBaseReg).getIndex();
return true;
}
return false;
}
static bool isFrameLoadOpcode(int Opcode, unsigned &MemBytes) {
switch (Opcode) {
default:
return false;
case X86::MOV8rm:
case X86::KMOVBkm:
MemBytes = 1;
return true;
case X86::MOV16rm:
case X86::KMOVWkm:
case X86::VMOVSHZrm:
case X86::VMOVSHZrm_alt:
MemBytes = 2;
return true;
case X86::MOV32rm:
case X86::MOVSSrm:
case X86::MOVSSrm_alt:
case X86::VMOVSSrm:
case X86::VMOVSSrm_alt:
case X86::VMOVSSZrm:
case X86::VMOVSSZrm_alt:
case X86::KMOVDkm:
MemBytes = 4;
return true;
case X86::MOV64rm:
case X86::LD_Fp64m:
case X86::MOVSDrm:
case X86::MOVSDrm_alt:
case X86::VMOVSDrm:
case X86::VMOVSDrm_alt:
case X86::VMOVSDZrm:
case X86::VMOVSDZrm_alt:
case X86::MMX_MOVD64rm:
case X86::MMX_MOVQ64rm:
case X86::KMOVQkm:
MemBytes = 8;
return true;
case X86::MOVAPSrm:
case X86::MOVUPSrm:
case X86::MOVAPDrm:
case X86::MOVUPDrm:
case X86::MOVDQArm:
case X86::MOVDQUrm:
case X86::VMOVAPSrm:
case X86::VMOVUPSrm:
case X86::VMOVAPDrm:
case X86::VMOVUPDrm:
case X86::VMOVDQArm:
case X86::VMOVDQUrm:
case X86::VMOVAPSZ128rm:
case X86::VMOVUPSZ128rm:
case X86::VMOVAPSZ128rm_NOVLX:
case X86::VMOVUPSZ128rm_NOVLX:
case X86::VMOVAPDZ128rm:
case X86::VMOVUPDZ128rm:
case X86::VMOVDQU8Z128rm:
case X86::VMOVDQU16Z128rm:
case X86::VMOVDQA32Z128rm:
case X86::VMOVDQU32Z128rm:
case X86::VMOVDQA64Z128rm:
case X86::VMOVDQU64Z128rm:
MemBytes = 16;
return true;
case X86::VMOVAPSYrm:
case X86::VMOVUPSYrm:
case X86::VMOVAPDYrm:
case X86::VMOVUPDYrm:
case X86::VMOVDQAYrm:
case X86::VMOVDQUYrm:
case X86::VMOVAPSZ256rm:
case X86::VMOVUPSZ256rm:
case X86::VMOVAPSZ256rm_NOVLX:
case X86::VMOVUPSZ256rm_NOVLX:
case X86::VMOVAPDZ256rm:
case X86::VMOVUPDZ256rm:
case X86::VMOVDQU8Z256rm:
case X86::VMOVDQU16Z256rm:
case X86::VMOVDQA32Z256rm:
case X86::VMOVDQU32Z256rm:
case X86::VMOVDQA64Z256rm:
case X86::VMOVDQU64Z256rm:
MemBytes = 32;
return true;
case X86::VMOVAPSZrm:
case X86::VMOVUPSZrm:
case X86::VMOVAPDZrm:
case X86::VMOVUPDZrm:
case X86::VMOVDQU8Zrm:
case X86::VMOVDQU16Zrm:
case X86::VMOVDQA32Zrm:
case X86::VMOVDQU32Zrm:
case X86::VMOVDQA64Zrm:
case X86::VMOVDQU64Zrm:
MemBytes = 64;
return true;
}
}
/// If \p Opcode is one of the plain store opcodes recognised here, set
/// \p MemBytes to the number of bytes it writes and return true. Otherwise
/// return false without writing \p MemBytes.
static bool isFrameStoreOpcode(int Opcode, unsigned &MemBytes) {
  switch (Opcode) {
  default:
    return false;

  // 1-byte stores.
  case X86::MOV8mr:
  case X86::KMOVBmk:
    MemBytes = 1;
    break;

  // 2-byte stores.
  case X86::MOV16mr:
  case X86::KMOVWmk:
  case X86::VMOVSHZmr:
    MemBytes = 2;
    break;

  // 4-byte stores.
  case X86::MOV32mr:
  case X86::MOVSSmr:
  case X86::VMOVSSmr:
  case X86::VMOVSSZmr:
  case X86::KMOVDmk:
    MemBytes = 4;
    break;

  // 8-byte stores.
  case X86::MOV64mr:
  case X86::ST_FpP64m:
  case X86::MOVSDmr:
  case X86::VMOVSDmr:
  case X86::VMOVSDZmr:
  case X86::MMX_MOVD64mr:
  case X86::MMX_MOVQ64mr:
  case X86::MMX_MOVNTQmr:
  case X86::KMOVQmk:
    MemBytes = 8;
    break;

  // 16-byte stores.
  case X86::MOVAPSmr:
  case X86::MOVUPSmr:
  case X86::MOVAPDmr:
  case X86::MOVUPDmr:
  case X86::MOVDQAmr:
  case X86::MOVDQUmr:
  case X86::VMOVAPSmr:
  case X86::VMOVUPSmr:
  case X86::VMOVAPDmr:
  case X86::VMOVUPDmr:
  case X86::VMOVDQAmr:
  case X86::VMOVDQUmr:
  case X86::VMOVUPSZ128mr:
  case X86::VMOVAPSZ128mr:
  case X86::VMOVUPSZ128mr_NOVLX:
  case X86::VMOVAPSZ128mr_NOVLX:
  case X86::VMOVUPDZ128mr:
  case X86::VMOVAPDZ128mr:
  case X86::VMOVDQA32Z128mr:
  case X86::VMOVDQU32Z128mr:
  case X86::VMOVDQA64Z128mr:
  case X86::VMOVDQU64Z128mr:
  case X86::VMOVDQU8Z128mr:
  case X86::VMOVDQU16Z128mr:
    MemBytes = 16;
    break;

  // 32-byte stores.
  case X86::VMOVUPSYmr:
  case X86::VMOVAPSYmr:
  case X86::VMOVUPDYmr:
  case X86::VMOVAPDYmr:
  case X86::VMOVDQUYmr:
  case X86::VMOVDQAYmr:
  case X86::VMOVUPSZ256mr:
  case X86::VMOVAPSZ256mr:
  case X86::VMOVUPSZ256mr_NOVLX:
  case X86::VMOVAPSZ256mr_NOVLX:
  case X86::VMOVUPDZ256mr:
  case X86::VMOVAPDZ256mr:
  case X86::VMOVDQU8Z256mr:
  case X86::VMOVDQU16Z256mr:
  case X86::VMOVDQA32Z256mr:
  case X86::VMOVDQU32Z256mr:
  case X86::VMOVDQA64Z256mr:
  case X86::VMOVDQU64Z256mr:
    MemBytes = 32;
    break;

  // 64-byte stores.
  case X86::VMOVUPSZmr:
  case X86::VMOVAPSZmr:
  case X86::VMOVUPDZmr:
  case X86::VMOVAPDZmr:
  case X86::VMOVDQU8Zmr:
  case X86::VMOVDQU16Zmr:
  case X86::VMOVDQA32Zmr:
  case X86::VMOVDQU32Zmr:
  case X86::VMOVDQA64Zmr:
  case X86::VMOVDQU64Zmr:
    MemBytes = 64;
    break;
  }

  // Every recognised opcode falls out of the switch with MemBytes set; the
  // default case has already returned false, so no further fallback is needed.
  return true;
}
/// Convenience overload for callers that do not need the access size: it
/// forwards to the three-argument X86 implementation and discards the byte
/// count.
unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
                                           int &FrameIndex) const {
  unsigned IgnoredMemBytes;
  return X86InstrInfo::isLoadFromStackSlot(MI, FrameIndex, IgnoredMemBytes);
}
unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex,
unsigned &MemBytes) const {
if (isFrameLoadOpcode(MI.getOpcode(), MemBytes))
if (MI.getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
return MI.getOperand(0).getReg();
return 0;
}
/// Like isLoadFromStackSlot, but also handles instructions whose frame index
/// has already been eliminated: in that case the frame index is taken from
/// the first stack-slot memory operand attached to \p MI. Returns the
/// register in operand 0 on success and 0 otherwise.
unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
                                                 int &FrameIndex) const {
  unsigned IgnoredMemBytes;
  if (!isFrameLoadOpcode(MI.getOpcode(), IgnoredMemBytes))
    return 0;

  // Direct frame-index form.
  if (unsigned Reg = isLoadFromStackSlot(MI, FrameIndex))
    return Reg;

  // Check for post-frame index elimination operations
  SmallVector<const MachineMemOperand *, 1> Accesses;
  if (!hasLoadFromStackSlot(MI, Accesses))
    return 0;

  const auto *StackValue =
      cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue());
  FrameIndex = StackValue->getFrameIndex();
  return MI.getOperand(0).getReg();
}
/// Convenience overload for callers that do not need the access size: it
/// forwards to the three-argument X86 implementation and discards the byte
/// count.
unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
                                          int &FrameIndex) const {
  unsigned IgnoredMemBytes;
  return X86InstrInfo::isStoreToStackSlot(MI, FrameIndex, IgnoredMemBytes);
}
unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex,
unsigned &MemBytes) const {
if (isFrameStoreOpcode(MI.getOpcode(), MemBytes))
if (MI.getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
isFrameOperand(MI, 0, FrameIndex))
return MI.getOperand(X86::AddrNumOperands).getReg();
return 0;
}
/// Like isStoreToStackSlot, but also handles instructions whose frame index
/// has already been eliminated: in that case the frame index is taken from
/// the first stack-slot memory operand attached to \p MI. Returns the stored
/// register on success and 0 otherwise.
unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
                                                int &FrameIndex) const {
  unsigned IgnoredMemBytes;
  if (!isFrameStoreOpcode(MI.getOpcode(), IgnoredMemBytes))
    return 0;

  // Direct frame-index form.
  if (unsigned Reg = isStoreToStackSlot(MI, FrameIndex))
    return Reg;

  // Check for post-frame index elimination operations
  SmallVector<const MachineMemOperand *, 1> Accesses;
  if (!hasStoreToStackSlot(MI, Accesses))
    return 0;

  const auto *StackValue =
      cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue());
  FrameIndex = StackValue->getFrameIndex();
  return MI.getOperand(X86::AddrNumOperands).getReg();
}
/// Return true if \p BaseReg is the PIC base, i.e. a virtual register that
/// has at least one definition and whose every definition is X86::MOVPC32r.
static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) {
  // Don't waste compile time scanning use-def chains of physregs.
  if (!BaseReg.isVirtual())
    return false;

  bool SawPICBaseDef = false;
  for (const MachineInstr &DefMI : MRI.def_instructions(BaseReg)) {
    // Any other kind of definition disqualifies the register.
    if (DefMI.getOpcode() != X86::MOVPC32r)
      return false;
    assert(!SawPICBaseDef && "More than one PIC base?");
    SawPICBaseDef = true;
  }
  return SawPICBaseDef;
}
/// Decide whether \p MI can be trivially rematerialized.
///
/// Three groups of opcodes are handled:
///  - constant/zero/all-ones materializing pseudos, immediate moves and the
///    other opcodes in the first list: always rematerializable;
///  - plain loads: rematerializable only when the address has no index
///    register, the load is a dereferenceable invariant load (checked using
///    \p AA), and the base register is absent, RIP, or the PIC base;
///  - LEA32r/LEA64r: rematerializable when there is no index register, the
///    displacement is not a register, and the base is a non-register operand,
///    absent, or the PIC base.
/// Any other opcode hits llvm_unreachable.
bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
AAResults *AA) const {
switch (MI.getOpcode()) {
default:
// This function should only be called for opcodes with the ReMaterializable
// flag set.
llvm_unreachable("Unknown rematerializable operation!");
break;
case X86::LOAD_STACK_GUARD:
case X86::AVX1_SETALLONES:
case X86::AVX2_SETALLONES:
case X86::AVX512_128_SET0:
case X86::AVX512_256_SET0:
case X86::AVX512_512_SET0:
case X86::AVX512_512_SETALLONES:
case X86::AVX512_FsFLD0SD:
case X86::AVX512_FsFLD0SH:
case X86::AVX512_FsFLD0SS:
case X86::AVX512_FsFLD0F128:
case X86::AVX_SET0:
case X86::FsFLD0SD:
case X86::FsFLD0SS:
case X86::FsFLD0F128:
case X86::KSET0D:
case X86::KSET0Q:
case X86::KSET0W:
case X86::KSET1D:
case X86::KSET1Q:
case X86::KSET1W:
case X86::MMX_SET0:
case X86::MOV32ImmSExti8:
case X86::MOV32r0:
case X86::MOV32r1:
case X86::MOV32r_1:
case X86::MOV32ri64:
case X86::MOV64ImmSExti8:
case X86::V_SET0:
case X86::V_SETALLONES:
case X86::MOV16ri:
case X86::MOV32ri:
case X86::MOV64ri:
case X86::MOV64ri32:
case X86::MOV8ri:
case X86::PTILEZEROV:
return true;
case X86::MOV8rm:
case X86::MOV8rm_NOREX:
case X86::MOV16rm:
case X86::MOV32rm:
case X86::MOV64rm:
case X86::MOVSSrm:
case X86::MOVSSrm_alt:
case X86::MOVSDrm:
case X86::MOVSDrm_alt:
case X86::MOVAPSrm:
case X86::MOVUPSrm:
case X86::MOVAPDrm:
case X86::MOVUPDrm:
case X86::MOVDQArm:
case X86::MOVDQUrm:
case X86::VMOVSSrm:
case X86::VMOVSSrm_alt:
case X86::VMOVSDrm:
case X86::VMOVSDrm_alt:
case X86::VMOVAPSrm:
case X86::VMOVUPSrm:
case X86::VMOVAPDrm:
case X86::VMOVUPDrm:
case X86::VMOVDQArm:
case X86::VMOVDQUrm:
case X86::VMOVAPSYrm:
case X86::VMOVUPSYrm:
case X86::VMOVAPDYrm:
case X86::VMOVUPDYrm:
case X86::VMOVDQAYrm:
case X86::VMOVDQUYrm:
case X86::MMX_MOVD64rm:
case X86::MMX_MOVQ64rm:
// AVX-512
case X86::VMOVSSZrm:
case X86::VMOVSSZrm_alt:
case X86::VMOVSDZrm:
case X86::VMOVSDZrm_alt:
case X86::VMOVSHZrm:
case X86::VMOVSHZrm_alt:
case X86::VMOVAPDZ128rm:
case X86::VMOVAPDZ256rm:
case X86::VMOVAPDZrm:
case X86::VMOVAPSZ128rm:
case X86::VMOVAPSZ256rm:
case X86::VMOVAPSZ128rm_NOVLX:
case X86::VMOVAPSZ256rm_NOVLX:
case X86::VMOVAPSZrm:
case X86::VMOVDQA32Z128rm:
case X86::VMOVDQA32Z256rm:
case X86::VMOVDQA32Zrm:
case X86::VMOVDQA64Z128rm:
case X86::VMOVDQA64Z256rm:
case X86::VMOVDQA64Zrm:
case X86::VMOVDQU16Z128rm:
case X86::VMOVDQU16Z256rm:
case X86::VMOVDQU16Zrm:
case X86::VMOVDQU32Z128rm:
case X86::VMOVDQU32Z256rm:
case X86::VMOVDQU32Zrm:
case X86::VMOVDQU64Z128rm:
case X86::VMOVDQU64Z256rm:
case X86::VMOVDQU64Zrm:
case X86::VMOVDQU8Z128rm:
case X86::VMOVDQU8Z256rm:
case X86::VMOVDQU8Zrm:
case X86::VMOVUPDZ128rm:
case X86::VMOVUPDZ256rm:
case X86::VMOVUPDZrm:
case X86::VMOVUPSZ128rm:
case X86::VMOVUPSZ256rm:
case X86::VMOVUPSZ128rm_NOVLX:
case X86::VMOVUPSZ256rm_NOVLX:
case X86::VMOVUPSZrm: {
// Loads from constant pools are trivially rematerializable.
// The address operands start at operand 1 (operand 0 is the destination).
// Require a register base, an immediate scale, no index register, and a
// dereferenceable invariant load.
if (MI.getOperand(1 + X86::AddrBaseReg).isReg() &&
MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
MI.isDereferenceableInvariantLoad(AA)) {
Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
// No base register, or RIP-relative: nothing else needs to stay live.
if (BaseReg == 0 || BaseReg == X86::RIP)
return true;
// Allow re-materialization of PIC load.
// A load with a global displacement is only allowed when the
// "remat-pic-stub-load" option is enabled.
if (!ReMatPICStubLoad && MI.getOperand(1 + X86::AddrDisp).isGlobal())
return false;
const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
// Any remaining base register must be the PIC base.
return regIsPICBase(BaseReg, MRI);
}
return false;
}
case X86::LEA32r:
case X86::LEA64r: {
// Require an immediate scale, no index register, and a displacement that
// is not a register.
if (MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
!MI.getOperand(1 + X86::AddrDisp).isReg()) {
// lea fi#, lea GV, etc. are all rematerializable.
if (!MI.getOperand(1 + X86::AddrBaseReg).isReg())
return true;
Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
if (BaseReg == 0)
return true;
// Allow re-materialization of lea PICBase + x.
const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
return regIsPICBase(BaseReg, MRI);
}
return false;
}
}
}
/// Re-emit \p Orig immediately before \p I in \p MBB, rewriting the register
/// of its operand 0 to \p DestReg (with sub-register index \p SubIdx).
///
/// Normally the instruction is cloned. If \p Orig modifies EFLAGS and EFLAGS
/// is not known to be dead at \p I, a MOV32ri of the equivalent constant is
/// emitted instead; only MOV32r0, MOV32r1 and MOV32r_1 are expected on that
/// path.
void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator I,
                                 Register DestReg, unsigned SubIdx,
                                 const MachineInstr &Orig,
                                 const TargetRegisterInfo &TRI) const {
  const bool WritesFlags = Orig.modifiesRegister(X86::EFLAGS, &TRI);
  // Liveness is only queried when the original actually writes EFLAGS.
  const bool FlagsMayBeLive =
      WritesFlags && MBB.computeRegisterLiveness(&TRI, X86::EFLAGS, I) !=
                         MachineBasicBlock::LQR_Dead;

  if (!FlagsMayBeLive) {
    // Safe to duplicate the instruction verbatim.
    MBB.insert(I, MBB.getParent()->CloneMachineInstr(&Orig));
  } else {
    // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side
    // effects.
    int Imm;
    switch (Orig.getOpcode()) {
    case X86::MOV32r0:
      Imm = 0;
      break;
    case X86::MOV32r1:
      Imm = 1;
      break;
    case X86::MOV32r_1:
      Imm = -1;
      break;
    default:
      llvm_unreachable("Unexpected instruction!");
    }

    BuildMI(MBB, I, Orig.getDebugLoc(), get(X86::MOV32ri))
        .add(Orig.getOperand(0))
        .addImm(Imm);
  }

  // Whichever path ran, the new instruction now sits just before I.
  MachineInstr &Inserted = *std::prev(I);
  Inserted.substituteRegister(Orig.getOperand(0).getReg(), DestReg, SubIdx,
                              TRI);
}
/// Return true if any operand of \p MI is a register definition of EFLAGS
/// that is not marked dead.
bool X86InstrInfo::hasLiveCondCodeDef(MachineInstr &MI) const {
  return llvm::any_of(MI.operands(), [](const MachineOperand &MO) {
    return MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS &&
           !MO.isDead();
  });
}
/// Return the immediate shift count in operand \p ShiftAmtOperandIdx of
/// \p MI, masked to the bits that are actually used: six bits when the
/// instruction carries the REX.W flag, five bits otherwise.
inline static unsigned getTruncatedShiftCount(const MachineInstr &MI,
                                              unsigned ShiftAmtOperandIdx) {
  // The shift count is six bits with the REX.W prefix and five bits without.
  const bool HasRexW = (MI.getDesc().TSFlags & X86II::REX_W) != 0;
  const unsigned CountMask = HasRexW ? 63 : 31;
  const unsigned RawCount = MI.getOperand(ShiftAmtOperandIdx).getImm();
  return RawCount & CountMask;
}
/// Return true if the shift amount \p ShAmt can be expressed as the scale
/// factor of a LEA instruction, i.e. it is 1, 2 or 3.
inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
  // A left shift can become a LEA when the shift amount fits the scale
  // encoding. LEA encodes its scale in the two-bit SIB.scale field, so only
  // shift amounts below 4 are representable; a shift by 0 is excluded as
  // well.
  return ShAmt >= 1 && ShAmt <= 3;
}
/// Prepare source operand \p Src of \p MI for use as an address register of
/// the LEA with opcode \p Opc.
///
/// Outputs:
///  - \p NewSrc: the register the LEA should use;
///  - \p isKill: whether that register is killed by the LEA;
///  - \p ImplicitOp: set (LEA64_32r with a physical source only) to a copy of
///    \p Src marked implicit.
/// \p AllowSP selects between the plain and the _NOSP register classes.
/// \p LV and \p LIS, when non-null, are updated for the COPY that may be
/// inserted. Returns false only when a virtual source register cannot be
/// constrained to the required register class.
bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
unsigned Opc, bool AllowSP, Register &NewSrc,
bool &isKill, MachineOperand &ImplicitOp,
LiveVariables *LV, LiveIntervals *LIS) const {
MachineFunction &MF = *MI.getParent()->getParent();
// Pick the register class: 32-bit only for LEA32r, 64-bit for every other
// opcode; the _NOSP variants are used when SP is not allowed.
const TargetRegisterClass *RC;
if (AllowSP) {
RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass;
} else {
RC = Opc != X86::LEA32r ?
&X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass;
}
Register SrcReg = Src.getReg();
isKill = MI.killsRegister(SrcReg);
// For both LEA64 and LEA32 the register already has essentially the right
// type (32-bit or 64-bit) we may just need to forbid SP.
if (Opc != X86::LEA64_32r) {
NewSrc = SrcReg;
assert(!Src.isUndef() && "Undef op doesn't need optimization");
// Only virtual registers are constrained; physical ones are used as-is.
if (NewSrc.isVirtual() && !MF.getRegInfo().constrainRegClass(NewSrc, RC))
return false;
return true;
}
// This is for an LEA64_32r and incoming registers are 32-bit. One way or
// another we need to add 64-bit registers to the final MI.
if (SrcReg.isPhysical()) {
// Keep the original 32-bit operand as an implicit operand and address
// through the register returned by getX86SubSuperRegister(SrcReg, 64).
ImplicitOp = Src;
ImplicitOp.setImplicit();
NewSrc = getX86SubSuperRegister(SrcReg, 64);
assert(!Src.isUndef() && "Undef op doesn't need optimization");
} else {
// Virtual register of the wrong class, we have to create a temporary 64-bit
// vreg to feed into the LEA.
NewSrc = MF.getRegInfo().createVirtualRegister(RC);
// The COPY is inserted before MI and defines only the sub_32bit part of
// the new register (the def is flagged Undef).
MachineInstr *Copy =
BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY))
.addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
.addReg(SrcReg, getKillRegState(isKill));
// Which is obviously going to be dead after we're done with it.
isKill = true;
// The kill of SrcReg, if any, now happens at the COPY rather than at MI.
if (LV)
LV->replaceKillInstruction(SrcReg, MI, *Copy);
if (LIS) {
SlotIndex CopyIdx = LIS->InsertMachineInstrInMaps(*Copy);
SlotIndex Idx = LIS->getInstructionIndex(MI);
LiveInterval &LI = LIS->getInterval(SrcReg);
// NOTE(review): S is dereferenced without a null check; this assumes
// SrcReg is live at MI's index - confirm against callers.
LiveRange::Segment *S = LI.getSegmentContaining(Idx);
// If SrcReg's live segment ended at MI, it now ends at the COPY.
if (S->end.getBaseIndex() == Idx)
S->end = CopyIdx.getRegSlot();
}
}
// We've set all the parameters without issue.
return true;
}
/// Convert an 8-bit or 16-bit two-address SHL/INC/DEC/ADD into a sequence
/// built around a LEA64_32r.
///
/// The emitted sequence, inserted before \p MI, is:
///   InRegLEA         = IMPLICIT_DEF
///   InRegLEA.SubReg  = COPY Src
///   [InRegLEA2       = IMPLICIT_DEF          (ADDrr with distinct sources)]
///   [InRegLEA2.SubReg = COPY Src2]
///   OutRegLEA        = LEA64_32r ...
///   Dest             = COPY OutRegLEA.SubReg
///
/// \param MIOpc    opcode of \p MI; must be one of the opcodes handled by the
///                 switch below.
/// \param MI       the instruction being converted.
/// \param LV       if non-null, kill information is updated for the new
///                 instructions.
/// \param LIS      if non-null, slot indexes and the live ranges of Src, Src2
///                 and Dest are updated for the new instructions.
/// \param Is8BitOp selects sub_8bit (true) or sub_16bit (false) as the
///                 sub-register index used for the insert/extract copies.
/// \returns nullptr on non-64-bit subtargets, otherwise the final COPY that
///          defines the original destination register.
MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
                                                         MachineInstr &MI,
                                                         LiveVariables *LV,
                                                         LiveIntervals *LIS,
                                                         bool Is8BitOp) const {
  // We handle 8-bit adds and various 16-bit opcodes in the switch below.
  MachineBasicBlock &MBB = *MI.getParent();
  MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
  assert((Is8BitOp || RegInfo.getTargetRegisterInfo()->getRegSizeInBits(
              *RegInfo.getRegClass(MI.getOperand(0).getReg())) == 16) &&
         "Unexpected type for LEA transform");

  // TODO: For a 32-bit target, we need to adjust the LEA variables with
  // something like this:
  //   Opcode = X86::LEA32r;
  //   InRegLEA = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
  //   OutRegLEA =
  //       Is8BitOp ? RegInfo.createVirtualRegister(&X86::GR32ABCD_RegClass)
  //                : RegInfo.createVirtualRegister(&X86::GR32RegClass);
  if (!Subtarget.is64Bit())
    return nullptr;

  unsigned Opcode = X86::LEA64_32r;
  Register InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
  Register OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass);
  // Only assigned for ADDrr with two distinct source registers.
  Register InRegLEA2;

  // Build and insert into an implicit UNDEF value. This is OK because
  // we will be shifting and then extracting the lower 8/16-bits.
  // This has the potential to cause partial register stall. e.g.
  //   movw    (%rbp,%rcx,2), %dx
  //   leal    -65(%rdx), %esi
  // But testing has shown this *does* help performance in 64-bit mode (at
  // least on modern x86 machines).
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  Register Dest = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  Register Src2;
  bool IsDead = MI.getOperand(0).isDead();
  bool IsKill = MI.getOperand(1).isKill();
  unsigned SubReg = Is8BitOp ? X86::sub_8bit : X86::sub_16bit;
  assert(!MI.getOperand(1).isUndef() && "Undef op doesn't need optimization");
  MachineInstr *ImpDef =
      BuildMI(MBB, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA);
  MachineInstr *InsMI =
      BuildMI(MBB, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
          .addReg(InRegLEA, RegState::Define, SubReg)
          .addReg(Src, getKillRegState(IsKill));
  MachineInstr *ImpDef2 = nullptr;
  MachineInstr *InsMI2 = nullptr;

  // The LEA is created (and inserted) now; its address operands are appended
  // per opcode below.
  MachineInstrBuilder MIB =
      BuildMI(MBB, MBBI, MI.getDebugLoc(), get(Opcode), OutRegLEA);
  switch (MIOpc) {
  default: llvm_unreachable("Unreachable!");
  case X86::SHL8ri:
  case X86::SHL16ri: {
    // No base, index = InRegLEA scaled by 1 << ShAmt, zero displacement.
    unsigned ShAmt = MI.getOperand(2).getImm();
    MIB.addReg(0).addImm(1ULL << ShAmt)
       .addReg(InRegLEA, RegState::Kill).addImm(0).addReg(0);
    break;
  }
  case X86::INC8r:
  case X86::INC16r:
    addRegOffset(MIB, InRegLEA, true, 1);
    break;
  case X86::DEC8r:
  case X86::DEC16r:
    addRegOffset(MIB, InRegLEA, true, -1);
    break;
  case X86::ADD8ri:
  case X86::ADD8ri_DB:
  case X86::ADD16ri:
  case X86::ADD16ri8:
  case X86::ADD16ri_DB:
  case X86::ADD16ri8_DB:
    addRegOffset(MIB, InRegLEA, true, MI.getOperand(2).getImm());
    break;
  case X86::ADD8rr:
  case X86::ADD8rr_DB:
  case X86::ADD16rr:
  case X86::ADD16rr_DB: {
    Src2 = MI.getOperand(2).getReg();
    bool IsKill2 = MI.getOperand(2).isKill();
    assert(!MI.getOperand(2).isUndef() && "Undef op doesn't need optimization");
    if (Src == Src2) {
      // ADD8rr/ADD16rr killed %reg1028, %reg1028
      // just a single insert_subreg.
      addRegReg(MIB, InRegLEA, true, InRegLEA, false);
    } else {
      // NOTE: the early return above means only the 64-bit arm is reachable
      // today; the other arm matches the 32-bit TODO.
      if (Subtarget.is64Bit())
        InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
      else
        InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
      // Build and insert into an implicit UNDEF value. This is OK because
      // we will be shifting and then extracting the lower 8/16-bits.
      // These two are inserted immediately before the LEA built above.
      ImpDef2 = BuildMI(MBB, &*MIB, MI.getDebugLoc(), get(X86::IMPLICIT_DEF),
                        InRegLEA2);
      InsMI2 = BuildMI(MBB, &*MIB, MI.getDebugLoc(), get(TargetOpcode::COPY))
                   .addReg(InRegLEA2, RegState::Define, SubReg)
                   .addReg(Src2, getKillRegState(IsKill2));
      addRegReg(MIB, InRegLEA, true, InRegLEA2, true);
    }
    if (LV && IsKill2 && InsMI2)
      LV->replaceKillInstruction(Src2, MI, *InsMI2);
    break;
  }
  }

  MachineInstr *NewMI = MIB;
  MachineInstr *ExtMI =
      BuildMI(MBB, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
          .addReg(Dest, RegState::Define | getDeadRegState(IsDead))
          .addReg(OutRegLEA, RegState::Kill, SubReg);

  if (LV) {
    // Update live variables.
    LV->getVarInfo(InRegLEA).Kills.push_back(NewMI);
    // The LEA is built with a kill flag on InRegLEA2 as well, so record that
    // kill too.
    if (InRegLEA2)
      LV->getVarInfo(InRegLEA2).Kills.push_back(NewMI);
    LV->getVarInfo(OutRegLEA).Kills.push_back(ExtMI);
    if (IsKill)
      LV->replaceKillInstruction(Src, MI, *InsMI);
    if (IsDead)
      LV->replaceKillInstruction(Dest, MI, *ExtMI);
  }

  if (LIS) {
    // Register the new instructions in program order; the LEA takes over
    // MI's slot.
    LIS->InsertMachineInstrInMaps(*ImpDef);
    SlotIndex InsIdx = LIS->InsertMachineInstrInMaps(*InsMI);
    if (ImpDef2)
      LIS->InsertMachineInstrInMaps(*ImpDef2);
    SlotIndex Ins2Idx;
    if (InsMI2)
      Ins2Idx = LIS->InsertMachineInstrInMaps(*InsMI2);
    SlotIndex NewIdx = LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
    SlotIndex ExtIdx = LIS->InsertMachineInstrInMaps(*ExtMI);
    // NOTE(review): results unused; presumably called so that intervals for
    // the new virtual registers exist - confirm against LiveIntervals.
    LIS->getInterval(InRegLEA);
    LIS->getInterval(OutRegLEA);
    if (InRegLEA2)
      LIS->getInterval(InRegLEA2);

    // Move the use of Src up to InsMI.
    LiveInterval &SrcLI = LIS->getInterval(Src);
    LiveRange::Segment *SrcSeg = SrcLI.getSegmentContaining(NewIdx);
    if (SrcSeg->end == NewIdx.getRegSlot())
      SrcSeg->end = InsIdx.getRegSlot();

    if (InsMI2) {
      // Move the use of Src2 up to InsMI2.
      LiveInterval &Src2LI = LIS->getInterval(Src2);
      LiveRange::Segment *Src2Seg = Src2LI.getSegmentContaining(NewIdx);
      if (Src2Seg->end == NewIdx.getRegSlot())
        Src2Seg->end = Ins2Idx.getRegSlot();
    }

    // Move the definition of Dest down to ExtMI.
    LiveInterval &DestLI = LIS->getInterval(Dest);
    LiveRange::Segment *DestSeg =
        DestLI.getSegmentContaining(NewIdx.getRegSlot());
    assert(DestSeg->start == NewIdx.getRegSlot() &&
           DestSeg->valno->def == NewIdx.getRegSlot());
    DestSeg->start = ExtIdx.getRegSlot();
    DestSeg->valno->def = ExtIdx.getRegSlot();
  }

  return ExtMI;
}
/// This method must be implemented by targets that
/// set the M_CONVERTIBLE_TO_3_ADDR flag.  When this flag is set, the target
/// may be able to convert a two-address instruction into a true
/// three-address instruction on demand.  This allows the X86 target (for
/// example) to convert ADD and SHL instructions into LEA instructions if they
/// would require register copies due to two-addressness.
///
/// Besides SHL/INC/DEC/ADD/SUB-immediate to LEA, merge-masked AVX-512 moves
/// and broadcasts are rewritten to the matching blend opcode.
///
/// \param MI  the two-address instruction to convert.
/// \param LV  if non-null, kill information is moved onto the new instruction.
/// \param LIS if non-null, the slot index maps are updated for the new
///            instruction.
///
/// This method returns a null pointer if the transformation cannot be
/// performed, otherwise it returns the new instruction.
///
MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI,
                                                  LiveVariables *LV,
                                                  LiveIntervals *LIS) const {
  // The following opcodes also sets the condition code register(s). Only
  // convert them to equivalent lea if the condition code register def's
  // are dead!
  if (hasLiveCondCodeDef(MI))
    return nullptr;

  MachineFunction &MF = *MI.getParent()->getParent();
  // All instructions input are two-addr instructions.  Get the known operands.
  const MachineOperand &Dest = MI.getOperand(0);
  const MachineOperand &Src = MI.getOperand(1);

  // Ideally, operations with undef should be folded before we get here, but we
  // can't guarantee it. Bail out because optimizing undefs is a waste of time.
  // Without this, we have to forward undef state to new register operands to
  // avoid machine verifier errors.
  if (Src.isUndef())
    return nullptr;
  if (MI.getNumOperands() > 2)
    if (MI.getOperand(2).isReg() && MI.getOperand(2).isUndef())
      return nullptr;

  MachineInstr *NewMI = nullptr;
  // Registers returned by classifyLEAReg; left null by the cases that do not
  // call it.
  Register SrcReg, SrcReg2;
  bool Is64Bit = Subtarget.is64Bit();

  bool Is8BitOp = false;
  unsigned MIOpc = MI.getOpcode();
  switch (MIOpc) {
  default: llvm_unreachable("Unreachable!");
  case X86::SHL64ri: {
    assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
    if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;

    // LEA can't handle RSP.
    if (Src.getReg().isVirtual() && !MF.getRegInfo().constrainRegClass(
                                        Src.getReg(), &X86::GR64_NOSPRegClass))
      return nullptr;

    // No base, index = Src scaled by 1 << ShAmt, zero displacement.
    NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
                .add(Dest)
                .addReg(0)
                .addImm(1ULL << ShAmt)
                .add(Src)
                .addImm(0)
                .addReg(0);
    break;
  }
  case X86::SHL32ri: {
    assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
    if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;

    unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;

    // LEA can't handle ESP.
    bool isKill;
    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, isKill,
                        ImplicitOp, LV, LIS))
      return nullptr;

    MachineInstrBuilder MIB =
        BuildMI(MF, MI.getDebugLoc(), get(Opc))
            .add(Dest)
            .addReg(0)
            .addImm(1ULL << ShAmt)
            .addReg(SrcReg, getKillRegState(isKill))
            .addImm(0)
            .addReg(0);
    if (ImplicitOp.getReg() != 0)
      MIB.add(ImplicitOp);
    NewMI = MIB;

    break;
  }
  case X86::SHL8ri:
    Is8BitOp = true;
    LLVM_FALLTHROUGH;
  case X86::SHL16ri: {
    assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
    if (!isTruncatedShiftCountForLEA(ShAmt))
      return nullptr;
    return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
  }
  case X86::INC64r:
  case X86::INC32r: {
    assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
    unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r :
        (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
    bool isKill;
    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, isKill,
                        ImplicitOp, LV, LIS))
      return nullptr;

    MachineInstrBuilder MIB =
        BuildMI(MF, MI.getDebugLoc(), get(Opc))
            .add(Dest)
            .addReg(SrcReg, getKillRegState(isKill));
    if (ImplicitOp.getReg() != 0)
      MIB.add(ImplicitOp);

    NewMI = addOffset(MIB, 1);
    break;
  }
  case X86::DEC64r:
  case X86::DEC32r: {
    assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
    unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
        : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);

    bool isKill;
    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, isKill,
                        ImplicitOp, LV, LIS))
      return nullptr;

    MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
                                  .add(Dest)
                                  .addReg(SrcReg, getKillRegState(isKill));
    if (ImplicitOp.getReg() != 0)
      MIB.add(ImplicitOp);

    NewMI = addOffset(MIB, -1);

    break;
  }
  case X86::DEC8r:
  case X86::INC8r:
    Is8BitOp = true;
    LLVM_FALLTHROUGH;
  case X86::DEC16r:
  case X86::INC16r:
    return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
  case X86::ADD64rr:
  case X86::ADD64rr_DB:
  case X86::ADD32rr:
  case X86::ADD32rr_DB: {
    assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
    unsigned Opc;
    if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB)
      Opc = X86::LEA64r;
    else
      Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;

    const MachineOperand &Src2 = MI.getOperand(2);
    bool isKill2;
    MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
    if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/false, SrcReg2, isKill2,
                        ImplicitOp2, LV, LIS))
      return nullptr;

    bool isKill;
    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
    if (Src.getReg() == Src2.getReg()) {
      // Don't call classify LEAReg a second time on the same register, in case
      // the first call inserted a COPY from Src2 and marked it as killed.
      isKill = isKill2;
      SrcReg = SrcReg2;
    } else {
      if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, isKill,
                          ImplicitOp, LV, LIS))
        return nullptr;
    }

    MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)).add(Dest);
    if (ImplicitOp.getReg() != 0)
      MIB.add(ImplicitOp);
    if (ImplicitOp2.getReg() != 0)
      MIB.add(ImplicitOp2);

    NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2);
    if (LV && Src2.isKill())
      LV->replaceKillInstruction(SrcReg2, MI, *NewMI);
    break;
  }
  case X86::ADD8rr:
  case X86::ADD8rr_DB:
    Is8BitOp = true;
    LLVM_FALLTHROUGH;
  case X86::ADD16rr:
  case X86::ADD16rr_DB:
    return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
  case X86::ADD64ri32:
  case X86::ADD64ri8:
  case X86::ADD64ri32_DB:
  case X86::ADD64ri8_DB:
    assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
    NewMI = addOffset(
        BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src),
        MI.getOperand(2));
    break;
  case X86::ADD32ri:
  case X86::ADD32ri8:
  case X86::ADD32ri_DB:
  case X86::ADD32ri8_DB: {
    assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
    unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;

    bool isKill;
    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, isKill,
                        ImplicitOp, LV, LIS))
      return nullptr;

    MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
                                  .add(Dest)
                                  .addReg(SrcReg, getKillRegState(isKill));
    if (ImplicitOp.getReg() != 0)
      MIB.add(ImplicitOp);

    NewMI = addOffset(MIB, MI.getOperand(2));
    break;
  }
  case X86::ADD8ri:
  case X86::ADD8ri_DB:
    Is8BitOp = true;
    LLVM_FALLTHROUGH;
  case X86::ADD16ri:
  case X86::ADD16ri8:
  case X86::ADD16ri_DB:
  case X86::ADD16ri8_DB:
    return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
  case X86::SUB8ri:
  case X86::SUB16ri8:
  case X86::SUB16ri:
    /// FIXME: Support these similar to ADD8ri/ADD16ri*.
    return nullptr;
  case X86::SUB32ri8:
  case X86::SUB32ri: {
    // Check the operand count before operand 2 is touched.
    assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!");
    if (!MI.getOperand(2).isImm())
      return nullptr;
    // A subtraction becomes an LEA with the negated immediate as
    // displacement, so the negated value has to fit in 32 bits.
    int64_t Imm = MI.getOperand(2).getImm();
    if (!isInt<32>(-Imm))
      return nullptr;

    unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;

    bool isKill;
    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, isKill,
                        ImplicitOp, LV, LIS))
      return nullptr;

    MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
                                  .add(Dest)
                                  .addReg(SrcReg, getKillRegState(isKill));
    if (ImplicitOp.getReg() != 0)
      MIB.add(ImplicitOp);

    NewMI = addOffset(MIB, -Imm);
    break;
  }

  case X86::SUB64ri8:
  case X86::SUB64ri32: {
    // Check the operand count before operand 2 is touched.
    assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!");
    if (!MI.getOperand(2).isImm())
      return nullptr;
    int64_t Imm = MI.getOperand(2).getImm();
    if (!isInt<32>(-Imm))
      return nullptr;

    MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(),
                                      get(X86::LEA64r)).add(Dest).add(Src);
    NewMI = addOffset(MIB, -Imm);
    break;
  }

  case X86::VMOVDQU8Z128rmk:
  case X86::VMOVDQU8Z256rmk:
  case X86::VMOVDQU8Zrmk:
  case X86::VMOVDQU16Z128rmk:
  case X86::VMOVDQU16Z256rmk:
  case X86::VMOVDQU16Zrmk:
  case X86::VMOVDQU32Z128rmk: case X86::VMOVDQA32Z128rmk:
  case X86::VMOVDQU32Z256rmk: case X86::VMOVDQA32Z256rmk:
  case X86::VMOVDQU32Zrmk:    case X86::VMOVDQA32Zrmk:
  case X86::VMOVDQU64Z128rmk: case X86::VMOVDQA64Z128rmk:
  case X86::VMOVDQU64Z256rmk: case X86::VMOVDQA64Z256rmk:
  case X86::VMOVDQU64Zrmk:    case X86::VMOVDQA64Zrmk:
  case X86::VMOVUPDZ128rmk:   case X86::VMOVAPDZ128rmk:
  case X86::VMOVUPDZ256rmk:   case X86::VMOVAPDZ256rmk:
  case X86::VMOVUPDZrmk:      case X86::VMOVAPDZrmk:
  case X86::VMOVUPSZ128rmk:   case X86::VMOVAPSZ128rmk:
  case X86::VMOVUPSZ256rmk:   case X86::VMOVAPSZ256rmk:
  case X86::VMOVUPSZrmk:      case X86::VMOVAPSZrmk:
  case X86::VBROADCASTSDZ256rmk:
  case X86::VBROADCASTSDZrmk:
  case X86::VBROADCASTSSZ128rmk:
  case X86::VBROADCASTSSZ256rmk:
  case X86::VBROADCASTSSZrmk:
  case X86::VPBROADCASTDZ128rmk:
  case X86::VPBROADCASTDZ256rmk:
  case X86::VPBROADCASTDZrmk:
  case X86::VPBROADCASTQZ128rmk:
  case X86::VPBROADCASTQZ256rmk:
  case X86::VPBROADCASTQZrmk: {
    // Masked load / broadcast-from-memory -> masked blend with a memory (or
    // broadcast memory) operand.
    unsigned Opc;
    switch (MIOpc) {
    default: llvm_unreachable("Unreachable!");
    case X86::VMOVDQU8Z128rmk:     Opc = X86::VPBLENDMBZ128rmk; break;
    case X86::VMOVDQU8Z256rmk:     Opc = X86::VPBLENDMBZ256rmk; break;
    case X86::VMOVDQU8Zrmk:        Opc = X86::VPBLENDMBZrmk;    break;
    case X86::VMOVDQU16Z128rmk:    Opc = X86::VPBLENDMWZ128rmk; break;
    case X86::VMOVDQU16Z256rmk:    Opc = X86::VPBLENDMWZ256rmk; break;
    case X86::VMOVDQU16Zrmk:       Opc = X86::VPBLENDMWZrmk;    break;
    case X86::VMOVDQU32Z128rmk:    Opc = X86::VPBLENDMDZ128rmk; break;
    case X86::VMOVDQU32Z256rmk:    Opc = X86::VPBLENDMDZ256rmk; break;
    case X86::VMOVDQU32Zrmk:       Opc = X86::VPBLENDMDZrmk;    break;
    case X86::VMOVDQU64Z128rmk:    Opc = X86::VPBLENDMQZ128rmk; break;
    case X86::VMOVDQU64Z256rmk:    Opc = X86::VPBLENDMQZ256rmk; break;
    case X86::VMOVDQU64Zrmk:       Opc = X86::VPBLENDMQZrmk;    break;
    case X86::VMOVUPDZ128rmk:      Opc = X86::VBLENDMPDZ128rmk; break;
    case X86::VMOVUPDZ256rmk:      Opc = X86::VBLENDMPDZ256rmk; break;
    case X86::VMOVUPDZrmk:         Opc = X86::VBLENDMPDZrmk;    break;
    case X86::VMOVUPSZ128rmk:      Opc = X86::VBLENDMPSZ128rmk; break;
    case X86::VMOVUPSZ256rmk:      Opc = X86::VBLENDMPSZ256rmk; break;
    case X86::VMOVUPSZrmk:         Opc = X86::VBLENDMPSZrmk;    break;
    case X86::VMOVDQA32Z128rmk:    Opc = X86::VPBLENDMDZ128rmk; break;
    case X86::VMOVDQA32Z256rmk:    Opc = X86::VPBLENDMDZ256rmk; break;
    case X86::VMOVDQA32Zrmk:       Opc = X86::VPBLENDMDZrmk;    break;
    case X86::VMOVDQA64Z128rmk:    Opc = X86::VPBLENDMQZ128rmk; break;
    case X86::VMOVDQA64Z256rmk:    Opc = X86::VPBLENDMQZ256rmk; break;
    case X86::VMOVDQA64Zrmk:       Opc = X86::VPBLENDMQZrmk;    break;
    case X86::VMOVAPDZ128rmk:      Opc = X86::VBLENDMPDZ128rmk; break;
    case X86::VMOVAPDZ256rmk:      Opc = X86::VBLENDMPDZ256rmk; break;
    case X86::VMOVAPDZrmk:         Opc = X86::VBLENDMPDZrmk;    break;
    case X86::VMOVAPSZ128rmk:      Opc = X86::VBLENDMPSZ128rmk; break;
    case X86::VMOVAPSZ256rmk:      Opc = X86::VBLENDMPSZ256rmk; break;
    case X86::VMOVAPSZrmk:         Opc = X86::VBLENDMPSZrmk;    break;
    case X86::VBROADCASTSDZ256rmk: Opc = X86::VBLENDMPDZ256rmbk; break;
    case X86::VBROADCASTSDZrmk:    Opc = X86::VBLENDMPDZrmbk;    break;
    case X86::VBROADCASTSSZ128rmk: Opc = X86::VBLENDMPSZ128rmbk; break;
    case X86::VBROADCASTSSZ256rmk: Opc = X86::VBLENDMPSZ256rmbk; break;
    case X86::VBROADCASTSSZrmk:    Opc = X86::VBLENDMPSZrmbk;    break;
    case X86::VPBROADCASTDZ128rmk: Opc = X86::VPBLENDMDZ128rmbk; break;
    case X86::VPBROADCASTDZ256rmk: Opc = X86::VPBLENDMDZ256rmbk; break;
    case X86::VPBROADCASTDZrmk:    Opc = X86::VPBLENDMDZrmbk;    break;
    case X86::VPBROADCASTQZ128rmk: Opc = X86::VPBLENDMQZ128rmbk; break;
    case X86::VPBROADCASTQZ256rmk: Opc = X86::VPBLENDMQZ256rmbk; break;
    case X86::VPBROADCASTQZrmk:    Opc = X86::VPBLENDMQZrmbk;    break;
    }

    // Operand 2 moves ahead of Src; operands 3-7 are forwarded in order
    // (presumably the mask followed by the five memory-reference operands).
    NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
                .add(Dest)
                .add(MI.getOperand(2))
                .add(Src)
                .add(MI.getOperand(3))
                .add(MI.getOperand(4))
                .add(MI.getOperand(5))
                .add(MI.getOperand(6))
                .add(MI.getOperand(7));
    break;
  }

  case X86::VMOVDQU8Z128rrk:
  case X86::VMOVDQU8Z256rrk:
  case X86::VMOVDQU8Zrrk:
  case X86::VMOVDQU16Z128rrk:
  case X86::VMOVDQU16Z256rrk:
  case X86::VMOVDQU16Zrrk:
  case X86::VMOVDQU32Z128rrk: case X86::VMOVDQA32Z128rrk:
  case X86::VMOVDQU32Z256rrk: case X86::VMOVDQA32Z256rrk:
  case X86::VMOVDQU32Zrrk:    case X86::VMOVDQA32Zrrk:
  case X86::VMOVDQU64Z128rrk: case X86::VMOVDQA64Z128rrk:
  case X86::VMOVDQU64Z256rrk: case X86::VMOVDQA64Z256rrk:
  case X86::VMOVDQU64Zrrk:    case X86::VMOVDQA64Zrrk:
  case X86::VMOVUPDZ128rrk:   case X86::VMOVAPDZ128rrk:
  case X86::VMOVUPDZ256rrk:   case X86::VMOVAPDZ256rrk:
  case X86::VMOVUPDZrrk:      case X86::VMOVAPDZrrk:
  case X86::VMOVUPSZ128rrk:   case X86::VMOVAPSZ128rrk:
  case X86::VMOVUPSZ256rrk:   case X86::VMOVAPSZ256rrk:
  case X86::VMOVUPSZrrk:      case X86::VMOVAPSZrrk: {
    // Masked register move -> masked register blend.
    unsigned Opc;
    switch (MIOpc) {
    default: llvm_unreachable("Unreachable!");
    case X86::VMOVDQU8Z128rrk:  Opc = X86::VPBLENDMBZ128rrk; break;
    case X86::VMOVDQU8Z256rrk:  Opc = X86::VPBLENDMBZ256rrk; break;
    case X86::VMOVDQU8Zrrk:     Opc = X86::VPBLENDMBZrrk;    break;
    case X86::VMOVDQU16Z128rrk: Opc = X86::VPBLENDMWZ128rrk; break;
    case X86::VMOVDQU16Z256rrk: Opc = X86::VPBLENDMWZ256rrk; break;
    case X86::VMOVDQU16Zrrk:    Opc = X86::VPBLENDMWZrrk;    break;
    case X86::VMOVDQU32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break;
    case X86::VMOVDQU32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break;
    case X86::VMOVDQU32Zrrk:    Opc = X86::VPBLENDMDZrrk;    break;
    case X86::VMOVDQU64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break;
    case X86::VMOVDQU64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break;
    case X86::VMOVDQU64Zrrk:    Opc = X86::VPBLENDMQZrrk;    break;
    case X86::VMOVUPDZ128rrk:   Opc = X86::VBLENDMPDZ128rrk; break;
    case X86::VMOVUPDZ256rrk:   Opc = X86::VBLENDMPDZ256rrk; break;
    case X86::VMOVUPDZrrk:      Opc = X86::VBLENDMPDZrrk;    break;
    case X86::VMOVUPSZ128rrk:   Opc = X86::VBLENDMPSZ128rrk; break;
    case X86::VMOVUPSZ256rrk:   Opc = X86::VBLENDMPSZ256rrk; break;
    case X86::VMOVUPSZrrk:      Opc = X86::VBLENDMPSZrrk;    break;
    case X86::VMOVDQA32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break;
    case X86::VMOVDQA32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break;
    case X86::VMOVDQA32Zrrk:    Opc = X86::VPBLENDMDZrrk;    break;
    case X86::VMOVDQA64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break;
    case X86::VMOVDQA64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break;
    case X86::VMOVDQA64Zrrk:    Opc = X86::VPBLENDMQZrrk;    break;
    case X86::VMOVAPDZ128rrk:   Opc = X86::VBLENDMPDZ128rrk; break;
    case X86::VMOVAPDZ256rrk:   Opc = X86::VBLENDMPDZ256rrk; break;
    case X86::VMOVAPDZrrk:      Opc = X86::VBLENDMPDZrrk;    break;
    case X86::VMOVAPSZ128rrk:   Opc = X86::VBLENDMPSZ128rrk; break;
    case X86::VMOVAPSZ256rrk:   Opc = X86::VBLENDMPSZ256rrk; break;
    case X86::VMOVAPSZrrk:      Opc = X86::VBLENDMPSZrrk;    break;
    }

    NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
                .add(Dest)
                .add(MI.getOperand(2))
                .add(Src)
                .add(MI.getOperand(3));
    break;
  }
  }

  if (!NewMI) return nullptr;

  if (LV) {  // Update live variables
    if (Src.isKill())
      LV->replaceKillInstruction(Src.getReg(), MI, *NewMI);
    if (Dest.isDead())
      LV->replaceKillInstruction(Dest.getReg(), MI, *NewMI);
  }

  // NewMI was built detached from any block (BuildMI on the function), so it
  // is inserted here, immediately before MI.
  MachineBasicBlock &MBB = *MI.getParent();
  MBB.insert(MI.getIterator(), NewMI);  // Insert the new inst

  if (LIS) {
    LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
    // NOTE(review): results unused; presumably called so that intervals exist
    // for registers classifyLEAReg may have created - confirm.
    if (SrcReg)
      LIS->getInterval(SrcReg);
    if (SrcReg2)
      LIS->getInterval(SrcReg2);
  }

  return NewMI;
}
/// Classify a commute of two of the three source operands of an instruction.
/// The operand indexes may be given in either order. When the instruction is
/// k-masked (per \p TSFlags) the second and third source positions are each
/// one higher, so the mask operand is skipped; the passthru operand is never
/// a commute candidate.
/// Case 0 - commute the first and second source operands.
/// Case 1 - commute the first and third source operands.
/// Case 2 - commute the second and third source operands.
/// Any other pair of indexes is treated as unreachable.
static unsigned getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1,
                                       unsigned SrcOpIdx2) {
  // Order the two indexes so that only one arrangement has to be checked.
  const bool NeedsSwap = SrcOpIdx1 > SrcOpIdx2;
  const unsigned LowIdx = NeedsSwap ? SrcOpIdx2 : SrcOpIdx1;
  const unsigned HighIdx = NeedsSwap ? SrcOpIdx1 : SrcOpIdx2;

  const unsigned MaskAdjust = X86II::isKMasked(TSFlags) ? 1 : 0;
  const unsigned FirstSrc = 1;
  const unsigned SecondSrc = 2 + MaskAdjust;
  const unsigned ThirdSrc = 3 + MaskAdjust;

  if (LowIdx == FirstSrc) {
    if (HighIdx == SecondSrc)
      return 0;
    if (HighIdx == ThirdSrc)
      return 1;
  } else if (LowIdx == SecondSrc && HighIdx == ThirdSrc) {
    return 2;
  }

  llvm_unreachable("Unknown three src commute case.");
}
unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2,
const X86InstrFMA3Group &FMA3Group) const {
unsigned Opc = MI.getOpcode();
// TODO: Commuting the 1st operand of FMA*_Int requires some additional
// analysis. The commute optimization is legal only if all users of FMA*_Int
// use only the lowest element of the FMA*_Int instruction. Such analysis are
// not implemented yet. So, just return 0 in that case.
// When such analysis are available this place will be the right place for
// calling it.
assert(!(FMA3Group.isIntrinsic() && (SrcOpIdx1 == 1 || SrcOpIdx2 == 1)) &&
"Intrinsic instructions can't commute operand 1");
// Determine which case this commute is or if it can't be done.
unsigned Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1,
SrcOpIdx2);
assert(Case < 3 && "Unexpected case number!");
// Define the FMA forms mapping array that helps to map input FMA form
// to output FMA form to preserve the operation semantics after
// commuting the operands.
const unsigned Form132Index = 0;
const unsigned Form213Index = 1;
const unsigned Form231Index = 2;
static const unsigned FormMapping[][3] = {
// 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2;
// FMA132 A, C, b; ==> FMA231 C, A, b;
// FMA213 B, A, c; ==> FMA213 A, B, c;
// FMA231 C, A, b; ==> FMA132 A, C, b;
{ Form231Index, Form213Index, Form132Index },
// 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3;
// FMA132 A, c, B; ==> FMA132 B, c, A;
// FMA213 B, a, C; ==> FMA231 C, a, B;
// FMA231 C, a, B; ==> FMA213 B, a, C;
{ Form132Index, Form231Index, Form213Index },
// 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3;
// FMA132 a, C, B; ==> FMA213 a, B, C;
// FMA213 b, A, C; ==> FMA132 b, C, A;
// FMA231 c, A, B; ==> FMA231 c, B, A;
{ Form213Index, Form132Index, Form231Index }
};
unsigned FMAForms[3];
FMAForms[0] = FMA3Group.get132Opcode();
FMAForms[1] = FMA3Group.get213Opcode();
FMAForms[2] = FMA3Group.get231Opcode();
unsigned FormIndex;
for (FormIndex = 0; FormIndex < 3; FormIndex++)
if (Opc == FMAForms[FormIndex])
break;
// Everything is ready, just adjust the FMA opcode and return it.
FormIndex = FormMapping[Case][FormIndex];
return FMAForms[FormIndex];
}
/// Rewrite the immediate of a VPTERNLOG instruction, held in the last operand
/// of \p MI, to match a commute of the source operands at \p SrcOpIdx1 and
/// \p SrcOpIdx2. Only the immediate is modified here.
static void commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
                             unsigned SrcOpIdx2) {
  // Determine which case this commute is or if it can't be done.
  unsigned Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1,
                                         SrcOpIdx2);
  assert(Case < 3 && "Unexpected case value!");

  // Each commute case exchanges two pairs of bits in the immediate; entries
  // [0]/[1] form the first pair and [2]/[3] the second.
  static const uint8_t SwapMasks[3][4] = {
    { 0x04, 0x10, 0x08, 0x20 }, // Swap bits 2/4 and 3/5.
    { 0x02, 0x10, 0x08, 0x40 }, // Swap bits 1/4 and 3/6.
    { 0x02, 0x04, 0x20, 0x40 }, // Swap bits 1/2 and 5/6.
  };
  const uint8_t *Pairs = SwapMasks[Case];

  auto &ImmOperand = MI.getOperand(MI.getNumOperands() - 1);
  const uint8_t OldImm = ImmOperand.getImm();
  uint8_t SwappedImm = OldImm;

  for (unsigned P = 0; P != 4; P += 2) {
    const uint8_t BitA = Pairs[P];
    const uint8_t BitB = Pairs[P + 1];
    // Drop both bits of the pair, then set each one from its partner's old
    // value.
    SwappedImm &= ~(BitA | BitB);
    if (OldImm & BitA)
      SwappedImm |= BitB;
    if (OldImm & BitB)
      SwappedImm |= BitA;
  }

  ImmOperand.setImm(SwappedImm);
}
// Returns true if Opcode is one of the VPERMI2* / VPERMT2* opcodes listed
// below, i.e. one that can be commuted.
static bool isCommutableVPERMV3Instruction(unsigned Opcode) {
// Case labels for every listed form of a single instruction name: rr, rm,
// rrkz and rmkz, each in the 128, 256 and unsuffixed variants.
#define VPERMV3_FORMS(Inst)                                                    \
  case X86::Inst##128rr:   case X86::Inst##256rr:   case X86::Inst##rr:       \
  case X86::Inst##128rm:   case X86::Inst##256rm:   case X86::Inst##rm:       \
  case X86::Inst##128rrkz: case X86::Inst##256rrkz: case X86::Inst##rrkz:     \
  case X86::Inst##128rmkz: case X86::Inst##256rmkz: case X86::Inst##rmkz:

// The same, plus the rmb and rmbkz forms.
#define VPERMV3_BROADCAST_FORMS(Inst)                                          \
  VPERMV3_FORMS(Inst)                                                          \
  case X86::Inst##128rmb:   case X86::Inst##256rmb:   case X86::Inst##rmb:    \
  case X86::Inst##128rmbkz: case X86::Inst##256rmbkz: case X86::Inst##rmbkz:

// Both the VPERMI2 and the VPERMT2 flavour for one element-type suffix.
#define VPERMV3_BOTH(Suffix)                                                   \
  VPERMV3_FORMS(VPERMI2##Suffix)                                               \
  VPERMV3_FORMS(VPERMT2##Suffix)
#define VPERMV3_BROADCAST_BOTH(Suffix)                                         \
  VPERMV3_BROADCAST_FORMS(VPERMI2##Suffix)                                     \
  VPERMV3_BROADCAST_FORMS(VPERMT2##Suffix)

  switch (Opcode) {
  VPERMV3_BOTH(B)
  VPERMV3_BROADCAST_BOTH(D)
  VPERMV3_BROADCAST_BOTH(PD)
  VPERMV3_BROADCAST_BOTH(PS)
  VPERMV3_BROADCAST_BOTH(Q)
  VPERMV3_BOTH(W)
    return true;
  default:
    return false;
  }

#undef VPERMV3_BROADCAST_BOTH
#undef VPERMV3_BOTH
#undef VPERMV3_BROADCAST_FORMS
#undef VPERMV3_FORMS
}
// Maps a VPERMI2* opcode to the VPERMT2* opcode with the same suffix and
// operand form, and vice versa. Any opcode not listed is unreachable.
static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) {
// Both directions of the I <-> T exchange for one suffix and operand form.
#define VPERM_SWAP(Suffix, Form)                                               \
  case X86::VPERMI2##Suffix##Form: return X86::VPERMT2##Suffix##Form;         \
  case X86::VPERMT2##Suffix##Form: return X86::VPERMI2##Suffix##Form;

// rr, rrkz, rm and rmkz in the 128, 256 and unsuffixed variants.
#define VPERM_CASES(Suffix)                                                    \
  VPERM_SWAP(Suffix, 128rr)                                                    \
  VPERM_SWAP(Suffix, 128rrkz)                                                  \
  VPERM_SWAP(Suffix, 128rm)                                                    \
  VPERM_SWAP(Suffix, 128rmkz)                                                  \
  VPERM_SWAP(Suffix, 256rr)                                                    \
  VPERM_SWAP(Suffix, 256rrkz)                                                  \
  VPERM_SWAP(Suffix, 256rm)                                                    \
  VPERM_SWAP(Suffix, 256rmkz)                                                  \
  VPERM_SWAP(Suffix, rr)                                                       \
  VPERM_SWAP(Suffix, rrkz)                                                     \
  VPERM_SWAP(Suffix, rm)                                                       \
  VPERM_SWAP(Suffix, rmkz)

// The above plus the rmb and rmbkz forms.
#define VPERM_CASES_BROADCAST(Suffix)                                          \
  VPERM_CASES(Suffix)                                                          \
  VPERM_SWAP(Suffix, 128rmb)                                                   \
  VPERM_SWAP(Suffix, 128rmbkz)                                                 \
  VPERM_SWAP(Suffix, 256rmb)                                                   \
  VPERM_SWAP(Suffix, 256rmbkz)                                                 \
  VPERM_SWAP(Suffix, rmb)                                                      \
  VPERM_SWAP(Suffix, rmbkz)

  switch (Opcode) {
  VPERM_CASES(B)
  VPERM_CASES_BROADCAST(D)
  VPERM_CASES_BROADCAST(PD)
  VPERM_CASES_BROADCAST(PS)
  VPERM_CASES_BROADCAST(Q)
  VPERM_CASES(W)
  }

  llvm_unreachable("Unreachable!");
#undef VPERM_CASES_BROADCAST
#undef VPERM_CASES
#undef VPERM_SWAP
}
MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
unsigned OpIdx1,
unsigned OpIdx2) const {
auto cloneIfNew = [NewMI](MachineInstr &MI) -> MachineInstr & {
if (NewMI)
return *MI.getParent()->getParent()->CloneMachineInstr(&MI);
return MI;
};
switch (MI.getOpcode()) {
case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)
case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)
case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I)
case X86::SHLD32rri8: // A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I)
case X86::SHRD64rri8: // A = SHRD64rri8 B, C, I -> A = SHLD64rri8 C, B, (64-I)
case X86::SHLD64rri8:{// A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I)
unsigned Opc;
unsigned Size;
switch (MI.getOpcode()) {
default: llvm_unreachable("Unreachable!");
case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break;
case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break;
case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break;
case X86::SHLD32rri8: Size = 32; Opc = X86::SHRD32rri8; break;
case X86::SHRD64rri8: Size = 64; Opc = X86::SHLD64rri8; break;
case X86::SHLD64rri8: Size = 64; Opc = X86::SHRD64rri8; break;
}
unsigned Amt = MI.getOperand(3).getImm();
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.setDesc(get(Opc));
WorkingMI.getOperand(3).setImm(Size - Amt);
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
case X86::PFSUBrr:
case X86::PFSUBRrr: {
// PFSUB x, y: x = x - y
// PFSUBR x, y: x = y - x
unsigned Opc =
(X86::PFSUBRrr == MI.getOpcode() ? X86::PFSUBrr : X86::PFSUBRrr);
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.setDesc(get(Opc));
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
case X86::BLENDPDrri:
case X86::BLENDPSrri:
case X86::VBLENDPDrri:
case X86::VBLENDPSrri:
// If we're optimizing for size, try to use MOVSD/MOVSS.
if (MI.getParent()->getParent()->getFunction().hasOptSize()) {
unsigned Mask, Opc;
switch (MI.getOpcode()) {
default: llvm_unreachable("Unreachable!");
case X86::BLENDPDrri: Opc = X86::MOVSDrr; Mask = 0x03; break;
case X86::BLENDPSrri: Opc = X86::MOVSSrr; Mask = 0x0F; break;
case X86::VBLENDPDrri: Opc = X86::VMOVSDrr; Mask = 0x03; break;
case X86::VBLENDPSrri: Opc = X86::VMOVSSrr; Mask = 0x0F; break;
}
if ((MI.getOperand(3).getImm() ^ Mask) == 1) {
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.setDesc(get(Opc));
WorkingMI.RemoveOperand(3);
return TargetInstrInfo::commuteInstructionImpl(WorkingMI,
/*NewMI=*/false,
OpIdx1, OpIdx2);
}
}
LLVM_FALLTHROUGH;
case X86::PBLENDWrri:
case X86::VBLENDPDYrri:
case X86::VBLENDPSYrri:
case X86::VPBLENDDrri:
case X86::VPBLENDWrri:
case X86::VPBLENDDYrri:
case X86::VPBLENDWYrri:{
int8_t Mask;
switch (MI.getOpcode()) {
default: llvm_unreachable("Unreachable!");
case X86::BLENDPDrri: Mask = (int8_t)0x03; break;
case X86::BLENDPSrri: Mask = (int8_t)0x0F; break;
case X86::PBLENDWrri: Mask = (int8_t)0xFF; break;
case X86::VBLENDPDrri: Mask = (int8_t)0x03; break;
case X86::VBLENDPSrri: Mask = (int8_t)0x0F; break;
case X86::VBLENDPDYrri: Mask = (int8_t)0x0F; break;
case X86::VBLENDPSYrri: Mask = (int8_t)0xFF; break;
case X86::VPBLENDDrri: Mask = (int8_t)0x0F; break;
case X86::VPBLENDWrri: Mask = (int8_t)0xFF; break;
case X86::VPBLENDDYrri: Mask = (int8_t)0xFF; break;
case X86::VPBLENDWYrri: Mask = (int8_t)0xFF; break;
}
// Only the least significant bits of Imm are used.
// Using int8_t to ensure it will be sign extended to the int64_t that
// setImm takes in order to match isel behavior.
int8_t Imm = MI.getOperand(3).getImm() & Mask;
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.getOperand(3).setImm(Mask ^ Imm);
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
case X86::INSERTPSrr:
case X86::VINSERTPSrr:
case X86::VINSERTPSZrr: {
unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
unsigned ZMask = Imm & 15;
unsigned DstIdx = (Imm >> 4) & 3;
unsigned SrcIdx = (Imm >> 6) & 3;
// We can commute insertps if we zero 2 of the elements, the insertion is
// "inline" and we don't override the insertion with a zero.
if (DstIdx == SrcIdx && (ZMask & (1 << DstIdx)) == 0 &&
countPopulation(ZMask) == 2) {
unsigned AltIdx = findFirstSet((ZMask | (1 << DstIdx)) ^ 15);
assert(AltIdx < 4 && "Illegal insertion index");
unsigned AltImm = (AltIdx << 6) | (AltIdx << 4) | ZMask;
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(AltImm);
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
return nullptr;
}
case X86::MOVSDrr:
case X86::MOVSSrr:
case X86::VMOVSDrr:
case X86::VMOVSSrr:{
// On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD.
if (Subtarget.hasSSE41()) {
unsigned Mask, Opc;
switch (MI.getOpcode()) {
default: llvm_unreachable("Unreachable!");
case X86::MOVSDrr: Opc = X86::BLENDPDrri; Mask = 0x02; break;
case X86::MOVSSrr: Opc = X86::BLENDPSrri; Mask = 0x0E; break;
case X86::VMOVSDrr: Opc = X86::VBLENDPDrri; Mask = 0x02; break;
case X86::VMOVSSrr: Opc = X86::VBLENDPSrri; Mask = 0x0E; break;
}
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.setDesc(get(Opc));
WorkingMI.addOperand(MachineOperand::CreateImm(Mask));
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
// Convert to SHUFPD.
assert(MI.getOpcode() == X86::MOVSDrr &&
"Can only commute MOVSDrr without SSE4.1");
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.setDesc(get(X86::SHUFPDrri));
WorkingMI.addOperand(MachineOperand::CreateImm(0x02));
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
case X86::SHUFPDrri: {
// Commute to MOVSD.
assert(MI.getOperand(3).getImm() == 0x02 && "Unexpected immediate!");
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.setDesc(get(X86::MOVSDrr));
WorkingMI.RemoveOperand(3);
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
case X86::PCLMULQDQrr:
case X86::VPCLMULQDQrr:
case X86::VPCLMULQDQYrr:
case X86::VPCLMULQDQZrr:
case X86::VPCLMULQDQZ128rr:
case X86::VPCLMULQDQZ256rr: {
// SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0]
// SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0]
unsigned Imm = MI.getOperand(3).getImm();
unsigned Src1Hi = Imm & 0x01;
unsigned Src2Hi = Imm & 0x10;
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4));
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
case X86::VPCMPBZ128rri: case X86::VPCMPUBZ128rri:
case X86::VPCMPBZ256rri: case X86::VPCMPUBZ256rri:
case X86::VPCMPBZrri: case X86::VPCMPUBZrri:
case X86::VPCMPDZ128rri: case X86::VPCMPUDZ128rri:
case X86::VPCMPDZ256rri: case X86::VPCMPUDZ256rri:
case X86::VPCMPDZrri: case X86::VPCMPUDZrri:
case X86::VPCMPQZ128rri: case X86::VPCMPUQZ128rri:
case X86::VPCMPQZ256rri: case X86::VPCMPUQZ256rri:
case X86::VPCMPQZrri: case X86::VPCMPUQZrri:
case X86::VPCMPWZ128rri: case X86::VPCMPUWZ128rri:
case X86::VPCMPWZ256rri: case X86::VPCMPUWZ256rri:
case X86::VPCMPWZrri: case X86::VPCMPUWZrri:
case X86::VPCMPBZ128rrik: case X86::VPCMPUBZ128rrik:
case X86::VPCMPBZ256rrik: case X86::VPCMPUBZ256rrik:
case X86::VPCMPBZrrik: case X86::VPCMPUBZrrik:
case X86::VPCMPDZ128rrik: case X86::VPCMPUDZ128rrik:
case X86::VPCMPDZ256rrik: case X86::VPCMPUDZ256rrik:
case X86::VPCMPDZrrik: case X86::VPCMPUDZrrik:
case X86::VPCMPQZ128rrik: case X86::VPCMPUQZ128rrik:
case X86::VPCMPQZ256rrik: case X86::VPCMPUQZ256rrik:
case X86::VPCMPQZrrik: case X86::VPCMPUQZrrik:
case X86::VPCMPWZ128rrik: case X86::VPCMPUWZ128rrik:
case X86::VPCMPWZ256rrik: case X86::VPCMPUWZ256rrik:
case X86::VPCMPWZrrik: case X86::VPCMPUWZrrik: {
// Flip comparison mode immediate (if necessary).
unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x7;
Imm = X86::getSwappedVPCMPImm(Imm);
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(Imm);
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
case X86::VPCOMBri: case X86::VPCOMUBri:
case X86::VPCOMDri: case X86::VPCOMUDri:
case X86::VPCOMQri: case X86::VPCOMUQri:
case X86::VPCOMWri: case X86::VPCOMUWri: {
// Flip comparison mode immediate (if necessary).
unsigned Imm = MI.getOperand(3).getImm() & 0x7;
Imm = X86::getSwappedVPCOMImm(Imm);
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.getOperand(3).setImm(Imm);
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
case X86::VCMPSDZrr:
case X86::VCMPSSZrr:
case X86::VCMPPDZrri:
case X86::VCMPPSZrri:
case X86::VCMPSHZrr:
case X86::VCMPPHZrri:
case X86::VCMPPHZ128rri:
case X86::VCMPPHZ256rri:
case X86::VCMPPDZ128rri:
case X86::VCMPPSZ128rri:
case X86::VCMPPDZ256rri:
case X86::VCMPPSZ256rri:
case X86::VCMPPDZrrik:
case X86::VCMPPSZrrik:
case X86::VCMPPDZ128rrik:
case X86::VCMPPSZ128rrik:
case X86::VCMPPDZ256rrik:
case X86::VCMPPSZ256rrik: {
unsigned Imm =
MI.getOperand(MI.getNumExplicitOperands() - 1).getImm() & 0x1f;
Imm = X86::getSwappedVCMPImm(Imm);
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.getOperand(MI.getNumExplicitOperands() - 1).setImm(Imm);
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
case X86::VPERM2F128rr:
case X86::VPERM2I128rr: {
// Flip permute source immediate.
// Imm & 0x02: lo = if set, select Op1.lo/hi else Op0.lo/hi.
// Imm & 0x20: hi = if set, select Op1.lo/hi else Op0.lo/hi.
int8_t Imm = MI.getOperand(3).getImm() & 0xFF;
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.getOperand(3).setImm(Imm ^ 0x22);
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
case X86::MOVHLPSrr:
case X86::UNPCKHPDrr:
case X86::VMOVHLPSrr:
case X86::VUNPCKHPDrr:
case X86::VMOVHLPSZrr:
case X86::VUNPCKHPDZ128rr: {
assert(Subtarget.hasSSE2() && "Commuting MOVHLP/UNPCKHPD requires SSE2!");
unsigned Opc = MI.getOpcode();
switch (Opc) {
default: llvm_unreachable("Unreachable!");
case X86::MOVHLPSrr: Opc = X86::UNPCKHPDrr; break;
case X86::UNPCKHPDrr: Opc = X86::MOVHLPSrr; break;
case X86::VMOVHLPSrr: Opc = X86::VUNPCKHPDrr; break;
case X86::VUNPCKHPDrr: Opc = X86::VMOVHLPSrr; break;
case X86::VMOVHLPSZrr: Opc = X86::VUNPCKHPDZ128rr; break;
case X86::VUNPCKHPDZ128rr: Opc = X86::VMOVHLPSZrr; break;
}
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.setDesc(get(Opc));
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
case X86::CMOV16rr: case X86::CMOV32rr: case X86::CMOV64rr: {
auto &WorkingMI = cloneIfNew(MI);
unsigned OpNo = MI.getDesc().getNumOperands() - 1;
X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm());
WorkingMI.getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC));
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi:
case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi:
case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi:
case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi:
case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi:
case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi:
case X86::VPTERNLOGDZrrik:
case X86::VPTERNLOGDZ128rrik:
case X86::VPTERNLOGDZ256rrik:
case X86::VPTERNLOGQZrrik:
case X86::VPTERNLOGQZ128rrik:
case X86::VPTERNLOGQZ256rrik:
case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz:
case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz:
case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz:
case X86::VPTERNLOGDZ128rmbi:
case X86::VPTERNLOGDZ256rmbi:
case X86::VPTERNLOGDZrmbi:
case X86::VPTERNLOGQZ128rmbi:
case X86::VPTERNLOGQZ256rmbi:
case X86::VPTERNLOGQZrmbi:
case X86::VPTERNLOGDZ128rmbikz:
case X86::VPTERNLOGDZ256rmbikz:
case X86::VPTERNLOGDZrmbikz:
case X86::VPTERNLOGQZ128rmbikz:
case X86::VPTERNLOGQZ256rmbikz:
case X86::VPTERNLOGQZrmbikz: {
auto &WorkingMI = cloneIfNew(MI);
commuteVPTERNLOG(WorkingMI, OpIdx1, OpIdx2);
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
default: {
if (isCommutableVPERMV3Instruction(MI.getOpcode())) {
unsigned Opc = getCommutedVPERMV3Opcode(MI.getOpcode());
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.setDesc(get(Opc));
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
const X86InstrFMA3Group *FMA3Group = getFMA3Group(MI.getOpcode(),
MI.getDesc().TSFlags);
if (FMA3Group) {
unsigned Opc =
getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group);
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.setDesc(get(Opc));
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
}
}
}
bool
X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2,
bool IsIntrinsic) const {
uint64_t TSFlags = MI.getDesc().TSFlags;
unsigned FirstCommutableVecOp = 1;
unsigned LastCommutableVecOp = 3;
unsigned KMaskOp = -1U;
if (X86II::isKMasked(TSFlags)) {
// For k-zero-masked operations it is Ok to commute the first vector
// operand. Unless this is an intrinsic instruction.
// For regular k-masked operations a conservative choice is done as the
// elements of the first vector operand, for which the corresponding bit
// in the k-mask operand is set to 0, are copied to the result of the
// instruction.
// TODO/FIXME: The commute still may be legal if it is known that the
// k-mask operand is set to either all ones or all zeroes.
// It is also Ok to commute the 1st operand if all users of MI use only
// the elements enabled by the k-mask operand. For example,
// v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i]
// : v1[i];
// VMOVAPSZmrk <mem_addr>, k, v4; // this is the ONLY user of v4 ->
// // Ok, to commute v1 in FMADD213PSZrk.
// The k-mask operand has index = 2 for masked and zero-masked operations.
KMaskOp = 2;
// The operand with index = 1 is used as a source for those elements for
// which the corresponding bit in the k-mask is set to 0.
if (X86II::isKMergeMasked(TSFlags) || IsIntrinsic)
FirstCommutableVecOp = 3;
LastCommutableVecOp++;
} else if (IsIntrinsic) {
// Commuting the first operand of an intrinsic instruction isn't possible
// unless we can prove that only the lowest element of the result is used.
FirstCommutableVecOp = 2;
}
if (isMem(MI, LastCommutableVecOp))
LastCommutableVecOp--;
// Only the first RegOpsNum operands are commutable.
// Also, the value 'CommuteAnyOperandIndex' is valid here as it means
// that the operand is not specified/fixed.
if (SrcOpIdx1 != CommuteAnyOperandIndex &&
(SrcOpIdx1 < FirstCommutableVecOp || SrcOpIdx1 > LastCommutableVecOp ||
SrcOpIdx1 == KMaskOp))
return false;
if (SrcOpIdx2 != CommuteAnyOperandIndex &&
(SrcOpIdx2 < FirstCommutableVecOp || SrcOpIdx2 > LastCommutableVecOp ||
SrcOpIdx2 == KMaskOp))
return false;
// Look for two different register operands assumed to be commutable
// regardless of the FMA opcode. The FMA opcode is adjusted later.
if (SrcOpIdx1 == CommuteAnyOperandIndex ||
SrcOpIdx2 == CommuteAnyOperandIndex) {
unsigned CommutableOpIdx2 = SrcOpIdx2;
// At least one of operands to be commuted is not specified and
// this method is free to choose appropriate commutable operands.
if (SrcOpIdx1 == SrcOpIdx2)
// Both of operands are not fixed. By default set one of commutable
// operands to the last register operand of the instruction.
CommutableOpIdx2 = LastCommutableVecOp;
else if (SrcOpIdx2 == CommuteAnyOperandIndex)
// Only one of operands is not fixed.
CommutableOpIdx2 = SrcOpIdx1;
// CommutableOpIdx2 is well defined now. Let's choose another commutable
// operand and assign its index to CommutableOpIdx1.
Register Op2Reg = MI.getOperand(CommutableOpIdx2).getReg();
unsigned CommutableOpIdx1;
for (CommutableOpIdx1 = LastCommutableVecOp;
CommutableOpIdx1 >= FirstCommutableVecOp; CommutableOpIdx1--) {
// Just ignore and skip the k-mask operand.
if (CommutableOpIdx1 == KMaskOp)
continue;
// The commuted operands must have different registers.
// Otherwise, the commute transformation does not change anything and
// is useless then.
if (Op2Reg != MI.getOperand(CommutableOpIdx1).getReg())
break;
}
// No appropriate commutable operands were found.
if (CommutableOpIdx1 < FirstCommutableVecOp)
return false;
// Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpidx2
// to return those values.
if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
CommutableOpIdx1, CommutableOpIdx2))
return false;
}
return true;
}
bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI,
unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const {
const MCInstrDesc &Desc = MI.getDesc();
if (!Desc.isCommutable())
return false;
switch (MI.getOpcode()) {
case X86::CMPSDrr:
case X86::CMPSSrr:
case X86::CMPPDrri:
case X86::CMPPSrri:
case X86::VCMPSDrr:
case X86::VCMPSSrr:
case X86::VCMPPDrri:
case X86::VCMPPSrri:
case X86::VCMPPDYrri:
case X86::VCMPPSYrri:
case X86::VCMPSDZrr:
case X86::VCMPSSZrr:
case X86::VCMPPDZrri:
case X86::VCMPPSZrri:
case X86::VCMPSHZrr:
case X86::VCMPPHZrri:
case X86::VCMPPHZ128rri:
case X86::VCMPPHZ256rri:
case X86::VCMPPDZ128rri:
case X86::VCMPPSZ128rri:
case X86::VCMPPDZ256rri:
case X86::VCMPPSZ256rri:
case X86::VCMPPDZrrik:
case X86::VCMPPSZrrik:
case X86::VCMPPDZ128rrik:
case X86::VCMPPSZ128rrik:
case X86::VCMPPDZ256rrik:
case X86::VCMPPSZ256rrik: {
unsigned OpOffset = X86II::isKMasked(Desc.TSFlags) ? 1 : 0;
// Float comparison can be safely commuted for
// Ordered/Unordered/Equal/NotEqual tests
unsigned Imm = MI.getOperand(3 + OpOffset).getImm() & 0x7;
switch (Imm) {
default:
// EVEX versions can be commuted.
if ((Desc.TSFlags & X86II::EncodingMask) == X86II::EVEX)
break;
return false;
case 0x00: // EQUAL
case 0x03: // UNORDERED
case 0x04: // NOT EQUAL
case 0x07: // ORDERED
break;
}
// The indices of the commutable operands are 1 and 2 (or 2 and 3
// when masked).
// Assign them to the returned operand indices here.
return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset,
2 + OpOffset);
}
case X86::MOVSSrr:
// X86::MOVSDrr is always commutable. MOVSS is only commutable if we can
// form sse4.1 blend. We assume VMOVSSrr/VMOVSDrr is always commutable since
// AVX implies sse4.1.
if (Subtarget.hasSSE41())
return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
return false;
case X86::SHUFPDrri: