| //===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file contains the X86 implementation of the TargetInstrInfo class. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "X86InstrInfo.h" |
| #include "X86.h" |
| #include "X86InstrBuilder.h" |
| #include "X86InstrFoldTables.h" |
| #include "X86MachineFunctionInfo.h" |
| #include "X86Subtarget.h" |
| #include "X86TargetMachine.h" |
| #include "llvm/ADT/STLExtras.h" |
| #include "llvm/ADT/Sequence.h" |
| #include "llvm/CodeGen/LiveIntervals.h" |
| #include "llvm/CodeGen/LivePhysRegs.h" |
| #include "llvm/CodeGen/LiveVariables.h" |
| #include "llvm/CodeGen/MachineConstantPool.h" |
| #include "llvm/CodeGen/MachineDominators.h" |
| #include "llvm/CodeGen/MachineFrameInfo.h" |
| #include "llvm/CodeGen/MachineInstrBuilder.h" |
| #include "llvm/CodeGen/MachineModuleInfo.h" |
| #include "llvm/CodeGen/MachineRegisterInfo.h" |
| #include "llvm/CodeGen/StackMaps.h" |
| #include "llvm/IR/DebugInfoMetadata.h" |
| #include "llvm/IR/DerivedTypes.h" |
| #include "llvm/IR/Function.h" |
| #include "llvm/MC/MCAsmInfo.h" |
| #include "llvm/MC/MCExpr.h" |
| #include "llvm/MC/MCInst.h" |
| #include "llvm/Support/CommandLine.h" |
| #include "llvm/Support/Debug.h" |
| #include "llvm/Support/ErrorHandling.h" |
| #include "llvm/Support/raw_ostream.h" |
| #include "llvm/Target/TargetOptions.h" |
| |
| using namespace llvm; |
| |
| #define DEBUG_TYPE "x86-instr-info" |
| |
| #define GET_INSTRINFO_CTOR_DTOR |
| #include "X86GenInstrInfo.inc" |
| |
| static cl::opt<bool> |
| NoFusing("disable-spill-fusing", |
| cl::desc("Disable fusing of spill code into instructions"), |
| cl::Hidden); |
| static cl::opt<bool> |
| PrintFailedFusing("print-failed-fuse-candidates", |
| cl::desc("Print instructions that the allocator wants to" |
| " fuse, but the X86 backend currently can't"), |
| cl::Hidden); |
| static cl::opt<bool> |
| ReMatPICStubLoad("remat-pic-stub-load", |
| cl::desc("Re-materialize load from stub in PIC mode"), |
| cl::init(false), cl::Hidden); |
| static cl::opt<unsigned> |
| PartialRegUpdateClearance("partial-reg-update-clearance", |
| cl::desc("Clearance between two register writes " |
| "for inserting XOR to avoid partial " |
| "register update"), |
| cl::init(64), cl::Hidden); |
| static cl::opt<unsigned> |
| UndefRegClearance("undef-reg-clearance", |
| cl::desc("How many idle instructions we would like before " |
| "certain undef register reads"), |
| cl::init(128), cl::Hidden); |
| |
| |
| // Pin the vtable to this file. |
| void X86InstrInfo::anchor() {} |
| |
| X86InstrInfo::X86InstrInfo(X86Subtarget &STI) |
| : X86GenInstrInfo((STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 |
| : X86::ADJCALLSTACKDOWN32), |
| (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64 |
| : X86::ADJCALLSTACKUP32), |
| X86::CATCHRET, |
| (STI.is64Bit() ? X86::RET64 : X86::RET32)), |
| Subtarget(STI), RI(STI.getTargetTriple()) { |
| } |
| |
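/// Identify sign- and zero-extending moves that behave like a copy from a
/// sub-register: on success, report the source and destination registers and
/// the sub-register index the source value occupies within the destination.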
| bool |
| X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, |
| Register &SrcReg, Register &DstReg, |
| unsigned &SubIdx) const { |
| switch (MI.getOpcode()) { |
| default: break; |
| case X86::MOVSX16rr8: |
| case X86::MOVZX16rr8: |
| case X86::MOVSX32rr8: |
| case X86::MOVZX32rr8: |
| case X86::MOVSX64rr8: |
| if (!Subtarget.is64Bit()) |
      // It's not always legal to reference the low 8-bit sub-register of the
      // larger register in 32-bit mode.
| return false; |
| LLVM_FALLTHROUGH; |
| case X86::MOVSX32rr16: |
| case X86::MOVZX32rr16: |
| case X86::MOVSX64rr16: |
| case X86::MOVSX64rr32: { |
| if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg()) |
| // Be conservative. |
| return false; |
| SrcReg = MI.getOperand(1).getReg(); |
| DstReg = MI.getOperand(0).getReg(); |
| switch (MI.getOpcode()) { |
| default: llvm_unreachable("Unreachable!"); |
| case X86::MOVSX16rr8: |
| case X86::MOVZX16rr8: |
| case X86::MOVSX32rr8: |
| case X86::MOVZX32rr8: |
| case X86::MOVSX64rr8: |
| SubIdx = X86::sub_8bit; |
| break; |
| case X86::MOVSX32rr16: |
| case X86::MOVZX32rr16: |
| case X86::MOVSX64rr16: |
| SubIdx = X86::sub_16bit; |
| break; |
| case X86::MOVSX64rr32: |
| SubIdx = X86::sub_32bit; |
| break; |
| } |
| return true; |
| } |
| } |
| return false; |
| } |
| |
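/// Return true if the instruction is known to execute in a data-independent
/// manner: its timing and behavior depend only on which registers it uses,
/// not on the values they hold. This is queried by speculative execution
/// hardening to decide which instructions are safe to leave unhardened.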
| bool X86InstrInfo::isDataInvariant(MachineInstr &MI) { |
| switch (MI.getOpcode()) { |
| default: |
| // By default, assume that the instruction is not data invariant. |
| return false; |
| |
| // Some target-independent operations that trivially lower to data-invariant |
| // instructions. |
| case TargetOpcode::COPY: |
| case TargetOpcode::INSERT_SUBREG: |
| case TargetOpcode::SUBREG_TO_REG: |
| return true; |
| |
  // On x86 it is believed that imul is constant time w.r.t. its register
  // operands. However, it sets flags and is perhaps the most surprisingly
  // constant-time operation, so we call it out here separately.
| case X86::IMUL16rr: |
| case X86::IMUL16rri8: |
| case X86::IMUL16rri: |
| case X86::IMUL32rr: |
| case X86::IMUL32rri8: |
| case X86::IMUL32rri: |
| case X86::IMUL64rr: |
| case X86::IMUL64rri32: |
| case X86::IMUL64rri8: |
| |
  // Bit scanning and counting instructions are somewhat surprising: they scan
  // across bits and do other fairly complex work (e.g. popcnt), yet are
  // believed to be constant time on x86. However, these set flags.
| case X86::BSF16rr: |
| case X86::BSF32rr: |
| case X86::BSF64rr: |
| case X86::BSR16rr: |
| case X86::BSR32rr: |
| case X86::BSR64rr: |
| case X86::LZCNT16rr: |
| case X86::LZCNT32rr: |
| case X86::LZCNT64rr: |
| case X86::POPCNT16rr: |
| case X86::POPCNT32rr: |
| case X86::POPCNT64rr: |
| case X86::TZCNT16rr: |
| case X86::TZCNT32rr: |
| case X86::TZCNT64rr: |
| |
| // Bit manipulation instructions are effectively combinations of basic |
| // arithmetic ops, and should still execute in constant time. These also |
| // set flags. |
| case X86::BLCFILL32rr: |
| case X86::BLCFILL64rr: |
| case X86::BLCI32rr: |
| case X86::BLCI64rr: |
| case X86::BLCIC32rr: |
| case X86::BLCIC64rr: |
| case X86::BLCMSK32rr: |
| case X86::BLCMSK64rr: |
| case X86::BLCS32rr: |
| case X86::BLCS64rr: |
| case X86::BLSFILL32rr: |
| case X86::BLSFILL64rr: |
| case X86::BLSI32rr: |
| case X86::BLSI64rr: |
| case X86::BLSIC32rr: |
| case X86::BLSIC64rr: |
| case X86::BLSMSK32rr: |
| case X86::BLSMSK64rr: |
| case X86::BLSR32rr: |
| case X86::BLSR64rr: |
| case X86::TZMSK32rr: |
| case X86::TZMSK64rr: |
| |
| // Bit extracting and clearing instructions should execute in constant time, |
| // and set flags. |
| case X86::BEXTR32rr: |
| case X86::BEXTR64rr: |
| case X86::BEXTRI32ri: |
| case X86::BEXTRI64ri: |
| case X86::BZHI32rr: |
| case X86::BZHI64rr: |
| |
| // Shift and rotate. |
| case X86::ROL8r1: |
| case X86::ROL16r1: |
| case X86::ROL32r1: |
| case X86::ROL64r1: |
| case X86::ROL8rCL: |
| case X86::ROL16rCL: |
| case X86::ROL32rCL: |
| case X86::ROL64rCL: |
| case X86::ROL8ri: |
| case X86::ROL16ri: |
| case X86::ROL32ri: |
| case X86::ROL64ri: |
| case X86::ROR8r1: |
| case X86::ROR16r1: |
| case X86::ROR32r1: |
| case X86::ROR64r1: |
| case X86::ROR8rCL: |
| case X86::ROR16rCL: |
| case X86::ROR32rCL: |
| case X86::ROR64rCL: |
| case X86::ROR8ri: |
| case X86::ROR16ri: |
| case X86::ROR32ri: |
| case X86::ROR64ri: |
| case X86::SAR8r1: |
| case X86::SAR16r1: |
| case X86::SAR32r1: |
| case X86::SAR64r1: |
| case X86::SAR8rCL: |
| case X86::SAR16rCL: |
| case X86::SAR32rCL: |
| case X86::SAR64rCL: |
| case X86::SAR8ri: |
| case X86::SAR16ri: |
| case X86::SAR32ri: |
| case X86::SAR64ri: |
| case X86::SHL8r1: |
| case X86::SHL16r1: |
| case X86::SHL32r1: |
| case X86::SHL64r1: |
| case X86::SHL8rCL: |
| case X86::SHL16rCL: |
| case X86::SHL32rCL: |
| case X86::SHL64rCL: |
| case X86::SHL8ri: |
| case X86::SHL16ri: |
| case X86::SHL32ri: |
| case X86::SHL64ri: |
| case X86::SHR8r1: |
| case X86::SHR16r1: |
| case X86::SHR32r1: |
| case X86::SHR64r1: |
| case X86::SHR8rCL: |
| case X86::SHR16rCL: |
| case X86::SHR32rCL: |
| case X86::SHR64rCL: |
| case X86::SHR8ri: |
| case X86::SHR16ri: |
| case X86::SHR32ri: |
| case X86::SHR64ri: |
| case X86::SHLD16rrCL: |
| case X86::SHLD32rrCL: |
| case X86::SHLD64rrCL: |
| case X86::SHLD16rri8: |
| case X86::SHLD32rri8: |
| case X86::SHLD64rri8: |
| case X86::SHRD16rrCL: |
| case X86::SHRD32rrCL: |
| case X86::SHRD64rrCL: |
| case X86::SHRD16rri8: |
| case X86::SHRD32rri8: |
| case X86::SHRD64rri8: |
| |
| // Basic arithmetic is constant time on the input but does set flags. |
| case X86::ADC8rr: |
| case X86::ADC8ri: |
| case X86::ADC16rr: |
| case X86::ADC16ri: |
| case X86::ADC16ri8: |
| case X86::ADC32rr: |
| case X86::ADC32ri: |
| case X86::ADC32ri8: |
| case X86::ADC64rr: |
| case X86::ADC64ri8: |
| case X86::ADC64ri32: |
| case X86::ADD8rr: |
| case X86::ADD8ri: |
| case X86::ADD16rr: |
| case X86::ADD16ri: |
| case X86::ADD16ri8: |
| case X86::ADD32rr: |
| case X86::ADD32ri: |
| case X86::ADD32ri8: |
| case X86::ADD64rr: |
| case X86::ADD64ri8: |
| case X86::ADD64ri32: |
| case X86::AND8rr: |
| case X86::AND8ri: |
| case X86::AND16rr: |
| case X86::AND16ri: |
| case X86::AND16ri8: |
| case X86::AND32rr: |
| case X86::AND32ri: |
| case X86::AND32ri8: |
| case X86::AND64rr: |
| case X86::AND64ri8: |
| case X86::AND64ri32: |
| case X86::OR8rr: |
| case X86::OR8ri: |
| case X86::OR16rr: |
| case X86::OR16ri: |
| case X86::OR16ri8: |
| case X86::OR32rr: |
| case X86::OR32ri: |
| case X86::OR32ri8: |
| case X86::OR64rr: |
| case X86::OR64ri8: |
| case X86::OR64ri32: |
| case X86::SBB8rr: |
| case X86::SBB8ri: |
| case X86::SBB16rr: |
| case X86::SBB16ri: |
| case X86::SBB16ri8: |
| case X86::SBB32rr: |
| case X86::SBB32ri: |
| case X86::SBB32ri8: |
| case X86::SBB64rr: |
| case X86::SBB64ri8: |
| case X86::SBB64ri32: |
| case X86::SUB8rr: |
| case X86::SUB8ri: |
| case X86::SUB16rr: |
| case X86::SUB16ri: |
| case X86::SUB16ri8: |
| case X86::SUB32rr: |
| case X86::SUB32ri: |
| case X86::SUB32ri8: |
| case X86::SUB64rr: |
| case X86::SUB64ri8: |
| case X86::SUB64ri32: |
| case X86::XOR8rr: |
| case X86::XOR8ri: |
| case X86::XOR16rr: |
| case X86::XOR16ri: |
| case X86::XOR16ri8: |
| case X86::XOR32rr: |
| case X86::XOR32ri: |
| case X86::XOR32ri8: |
| case X86::XOR64rr: |
| case X86::XOR64ri8: |
| case X86::XOR64ri32: |
| // Arithmetic with just 32-bit and 64-bit variants and no immediates. |
| case X86::ADCX32rr: |
| case X86::ADCX64rr: |
| case X86::ADOX32rr: |
| case X86::ADOX64rr: |
| case X86::ANDN32rr: |
| case X86::ANDN64rr: |
| // Unary arithmetic operations. |
| case X86::DEC8r: |
| case X86::DEC16r: |
| case X86::DEC32r: |
| case X86::DEC64r: |
| case X86::INC8r: |
| case X86::INC16r: |
| case X86::INC32r: |
| case X86::INC64r: |
| case X86::NEG8r: |
| case X86::NEG16r: |
| case X86::NEG32r: |
| case X86::NEG64r: |
| |
| // Unlike other arithmetic, NOT doesn't set EFLAGS. |
| case X86::NOT8r: |
| case X86::NOT16r: |
| case X86::NOT32r: |
| case X86::NOT64r: |
| |
  // Various move instructions used to zero or sign extend things. Note that we
  // intentionally don't support the _NOREX variants as we can't handle that
  // register constraint anyway.
| case X86::MOVSX16rr8: |
| case X86::MOVSX32rr8: |
| case X86::MOVSX32rr16: |
| case X86::MOVSX64rr8: |
| case X86::MOVSX64rr16: |
| case X86::MOVSX64rr32: |
| case X86::MOVZX16rr8: |
| case X86::MOVZX32rr8: |
| case X86::MOVZX32rr16: |
| case X86::MOVZX64rr8: |
| case X86::MOVZX64rr16: |
| case X86::MOV32rr: |
| |
| // Arithmetic instructions that are both constant time and don't set flags. |
| case X86::RORX32ri: |
| case X86::RORX64ri: |
| case X86::SARX32rr: |
| case X86::SARX64rr: |
| case X86::SHLX32rr: |
| case X86::SHLX64rr: |
| case X86::SHRX32rr: |
| case X86::SHRX64rr: |
| |
| // LEA doesn't actually access memory, and its arithmetic is constant time. |
| case X86::LEA16r: |
| case X86::LEA32r: |
| case X86::LEA64_32r: |
| case X86::LEA64r: |
| return true; |
| } |
| } |
| |
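/// Like isDataInvariant, but for instructions that load from memory: return
/// true if the instruction's behavior does not depend on the value loaded
/// (the address computation itself may still be observable).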
| bool X86InstrInfo::isDataInvariantLoad(MachineInstr &MI) { |
| switch (MI.getOpcode()) { |
| default: |
| // By default, assume that the load will immediately leak. |
| return false; |
| |
  // On x86 it is believed that imul is constant time w.r.t. the loaded data.
  // However, it sets flags and is perhaps the most surprisingly constant-time
  // operation, so we call it out here separately.
| case X86::IMUL16rm: |
| case X86::IMUL16rmi8: |
| case X86::IMUL16rmi: |
| case X86::IMUL32rm: |
| case X86::IMUL32rmi8: |
| case X86::IMUL32rmi: |
| case X86::IMUL64rm: |
| case X86::IMUL64rmi32: |
| case X86::IMUL64rmi8: |
| |
  // Bit scanning and counting instructions are somewhat surprising: they scan
  // across bits and do other fairly complex work (e.g. popcnt), yet are
  // believed to be constant time on x86. However, these set flags.
| case X86::BSF16rm: |
| case X86::BSF32rm: |
| case X86::BSF64rm: |
| case X86::BSR16rm: |
| case X86::BSR32rm: |
| case X86::BSR64rm: |
| case X86::LZCNT16rm: |
| case X86::LZCNT32rm: |
| case X86::LZCNT64rm: |
| case X86::POPCNT16rm: |
| case X86::POPCNT32rm: |
| case X86::POPCNT64rm: |
| case X86::TZCNT16rm: |
| case X86::TZCNT32rm: |
| case X86::TZCNT64rm: |
| |
| // Bit manipulation instructions are effectively combinations of basic |
| // arithmetic ops, and should still execute in constant time. These also |
| // set flags. |
| case X86::BLCFILL32rm: |
| case X86::BLCFILL64rm: |
| case X86::BLCI32rm: |
| case X86::BLCI64rm: |
| case X86::BLCIC32rm: |
| case X86::BLCIC64rm: |
| case X86::BLCMSK32rm: |
| case X86::BLCMSK64rm: |
| case X86::BLCS32rm: |
| case X86::BLCS64rm: |
| case X86::BLSFILL32rm: |
| case X86::BLSFILL64rm: |
| case X86::BLSI32rm: |
| case X86::BLSI64rm: |
| case X86::BLSIC32rm: |
| case X86::BLSIC64rm: |
| case X86::BLSMSK32rm: |
| case X86::BLSMSK64rm: |
| case X86::BLSR32rm: |
| case X86::BLSR64rm: |
| case X86::TZMSK32rm: |
| case X86::TZMSK64rm: |
| |
| // Bit extracting and clearing instructions should execute in constant time, |
| // and set flags. |
| case X86::BEXTR32rm: |
| case X86::BEXTR64rm: |
| case X86::BEXTRI32mi: |
| case X86::BEXTRI64mi: |
| case X86::BZHI32rm: |
| case X86::BZHI64rm: |
| |
| // Basic arithmetic is constant time on the input but does set flags. |
| case X86::ADC8rm: |
| case X86::ADC16rm: |
| case X86::ADC32rm: |
| case X86::ADC64rm: |
| case X86::ADCX32rm: |
| case X86::ADCX64rm: |
| case X86::ADD8rm: |
| case X86::ADD16rm: |
| case X86::ADD32rm: |
| case X86::ADD64rm: |
| case X86::ADOX32rm: |
| case X86::ADOX64rm: |
| case X86::AND8rm: |
| case X86::AND16rm: |
| case X86::AND32rm: |
| case X86::AND64rm: |
| case X86::ANDN32rm: |
| case X86::ANDN64rm: |
| case X86::OR8rm: |
| case X86::OR16rm: |
| case X86::OR32rm: |
| case X86::OR64rm: |
| case X86::SBB8rm: |
| case X86::SBB16rm: |
| case X86::SBB32rm: |
| case X86::SBB64rm: |
| case X86::SUB8rm: |
| case X86::SUB16rm: |
| case X86::SUB32rm: |
| case X86::SUB64rm: |
| case X86::XOR8rm: |
| case X86::XOR16rm: |
| case X86::XOR32rm: |
| case X86::XOR64rm: |
| |
| // Integer multiply w/o affecting flags is still believed to be constant |
| // time on x86. Called out separately as this is among the most surprising |
| // instructions to exhibit that behavior. |
| case X86::MULX32rm: |
| case X86::MULX64rm: |
| |
| // Arithmetic instructions that are both constant time and don't set flags. |
| case X86::RORX32mi: |
| case X86::RORX64mi: |
| case X86::SARX32rm: |
| case X86::SARX64rm: |
| case X86::SHLX32rm: |
| case X86::SHLX64rm: |
| case X86::SHRX32rm: |
| case X86::SHRX64rm: |
| |
| // Conversions are believed to be constant time and don't set flags. |
| case X86::CVTTSD2SI64rm: |
| case X86::VCVTTSD2SI64rm: |
| case X86::VCVTTSD2SI64Zrm: |
| case X86::CVTTSD2SIrm: |
| case X86::VCVTTSD2SIrm: |
| case X86::VCVTTSD2SIZrm: |
| case X86::CVTTSS2SI64rm: |
| case X86::VCVTTSS2SI64rm: |
| case X86::VCVTTSS2SI64Zrm: |
| case X86::CVTTSS2SIrm: |
| case X86::VCVTTSS2SIrm: |
| case X86::VCVTTSS2SIZrm: |
| case X86::CVTSI2SDrm: |
| case X86::VCVTSI2SDrm: |
| case X86::VCVTSI2SDZrm: |
| case X86::CVTSI2SSrm: |
| case X86::VCVTSI2SSrm: |
| case X86::VCVTSI2SSZrm: |
| case X86::CVTSI642SDrm: |
| case X86::VCVTSI642SDrm: |
| case X86::VCVTSI642SDZrm: |
| case X86::CVTSI642SSrm: |
| case X86::VCVTSI642SSrm: |
| case X86::VCVTSI642SSZrm: |
| case X86::CVTSS2SDrm: |
| case X86::VCVTSS2SDrm: |
| case X86::VCVTSS2SDZrm: |
| case X86::CVTSD2SSrm: |
| case X86::VCVTSD2SSrm: |
| case X86::VCVTSD2SSZrm: |
| // AVX512 added unsigned integer conversions. |
| case X86::VCVTTSD2USI64Zrm: |
| case X86::VCVTTSD2USIZrm: |
| case X86::VCVTTSS2USI64Zrm: |
| case X86::VCVTTSS2USIZrm: |
| case X86::VCVTUSI2SDZrm: |
| case X86::VCVTUSI642SDZrm: |
| case X86::VCVTUSI2SSZrm: |
| case X86::VCVTUSI642SSZrm: |
| |
| // Loads to register don't set flags. |
| case X86::MOV8rm: |
| case X86::MOV8rm_NOREX: |
| case X86::MOV16rm: |
| case X86::MOV32rm: |
| case X86::MOV64rm: |
| case X86::MOVSX16rm8: |
| case X86::MOVSX32rm16: |
| case X86::MOVSX32rm8: |
| case X86::MOVSX32rm8_NOREX: |
| case X86::MOVSX64rm16: |
| case X86::MOVSX64rm32: |
| case X86::MOVSX64rm8: |
| case X86::MOVZX16rm8: |
| case X86::MOVZX32rm16: |
| case X86::MOVZX32rm8: |
| case X86::MOVZX32rm8_NOREX: |
| case X86::MOVZX64rm16: |
| case X86::MOVZX64rm8: |
| return true; |
| } |
| } |
| |
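/// Compute the net stack-pointer adjustment made by this instruction: frame
/// setup/destroy pseudos adjust by their (stack-aligned) frame size, a call's
/// adjustment is read from the ADJCALLSTACKUP pseudo that follows it, and
/// PUSHes adjust by the size of the pushed operand.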
| int X86InstrInfo::getSPAdjust(const MachineInstr &MI) const { |
| const MachineFunction *MF = MI.getParent()->getParent(); |
| const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering(); |
| |
| if (isFrameInstr(MI)) { |
| int SPAdj = alignTo(getFrameSize(MI), TFI->getStackAlign()); |
| SPAdj -= getFrameAdjustment(MI); |
| if (!isFrameSetup(MI)) |
| SPAdj = -SPAdj; |
| return SPAdj; |
| } |
| |
| // To know whether a call adjusts the stack, we need information |
| // that is bound to the following ADJCALLSTACKUP pseudo. |
| // Look for the next ADJCALLSTACKUP that follows the call. |
| if (MI.isCall()) { |
| const MachineBasicBlock *MBB = MI.getParent(); |
| auto I = ++MachineBasicBlock::const_iterator(MI); |
| for (auto E = MBB->end(); I != E; ++I) { |
| if (I->getOpcode() == getCallFrameDestroyOpcode() || |
| I->isCall()) |
| break; |
| } |
| |
| // If we could not find a frame destroy opcode, then it has already |
| // been simplified, so we don't care. |
| if (I->getOpcode() != getCallFrameDestroyOpcode()) |
| return 0; |
| |
| return -(I->getOperand(1).getImm()); |
| } |
| |
  // Currently, handle only PUSHes we can reasonably expect to see in call
  // sequences.
| switch (MI.getOpcode()) { |
| default: |
| return 0; |
| case X86::PUSH32i8: |
| case X86::PUSH32r: |
| case X86::PUSH32rmm: |
| case X86::PUSH32rmr: |
| case X86::PUSHi32: |
| return 4; |
| case X86::PUSH64i8: |
| case X86::PUSH64r: |
| case X86::PUSH64rmm: |
| case X86::PUSH64rmr: |
| case X86::PUSH64i32: |
| return 8; |
| } |
| } |
| |
/// Return true and the FrameIndex if the specified operand and the following
/// operands form a reference to the stack frame.
| bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op, |
| int &FrameIndex) const { |
| if (MI.getOperand(Op + X86::AddrBaseReg).isFI() && |
| MI.getOperand(Op + X86::AddrScaleAmt).isImm() && |
| MI.getOperand(Op + X86::AddrIndexReg).isReg() && |
| MI.getOperand(Op + X86::AddrDisp).isImm() && |
| MI.getOperand(Op + X86::AddrScaleAmt).getImm() == 1 && |
| MI.getOperand(Op + X86::AddrIndexReg).getReg() == 0 && |
| MI.getOperand(Op + X86::AddrDisp).getImm() == 0) { |
| FrameIndex = MI.getOperand(Op + X86::AddrBaseReg).getIndex(); |
| return true; |
| } |
| return false; |
| } |
| |
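/// Return true if Opcode names a plain load of a register from memory whose
/// size is known, and set MemBytes to the width of the access in bytes.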
| static bool isFrameLoadOpcode(int Opcode, unsigned &MemBytes) { |
| switch (Opcode) { |
| default: |
| return false; |
| case X86::MOV8rm: |
| case X86::KMOVBkm: |
| MemBytes = 1; |
| return true; |
| case X86::MOV16rm: |
| case X86::KMOVWkm: |
| case X86::VMOVSHZrm: |
| case X86::VMOVSHZrm_alt: |
| MemBytes = 2; |
| return true; |
| case X86::MOV32rm: |
| case X86::MOVSSrm: |
| case X86::MOVSSrm_alt: |
| case X86::VMOVSSrm: |
| case X86::VMOVSSrm_alt: |
| case X86::VMOVSSZrm: |
| case X86::VMOVSSZrm_alt: |
| case X86::KMOVDkm: |
| MemBytes = 4; |
| return true; |
| case X86::MOV64rm: |
| case X86::LD_Fp64m: |
| case X86::MOVSDrm: |
| case X86::MOVSDrm_alt: |
| case X86::VMOVSDrm: |
| case X86::VMOVSDrm_alt: |
| case X86::VMOVSDZrm: |
| case X86::VMOVSDZrm_alt: |
| case X86::MMX_MOVD64rm: |
| case X86::MMX_MOVQ64rm: |
| case X86::KMOVQkm: |
| MemBytes = 8; |
| return true; |
| case X86::MOVAPSrm: |
| case X86::MOVUPSrm: |
| case X86::MOVAPDrm: |
| case X86::MOVUPDrm: |
| case X86::MOVDQArm: |
| case X86::MOVDQUrm: |
| case X86::VMOVAPSrm: |
| case X86::VMOVUPSrm: |
| case X86::VMOVAPDrm: |
| case X86::VMOVUPDrm: |
| case X86::VMOVDQArm: |
| case X86::VMOVDQUrm: |
| case X86::VMOVAPSZ128rm: |
| case X86::VMOVUPSZ128rm: |
| case X86::VMOVAPSZ128rm_NOVLX: |
| case X86::VMOVUPSZ128rm_NOVLX: |
| case X86::VMOVAPDZ128rm: |
| case X86::VMOVUPDZ128rm: |
| case X86::VMOVDQU8Z128rm: |
| case X86::VMOVDQU16Z128rm: |
| case X86::VMOVDQA32Z128rm: |
| case X86::VMOVDQU32Z128rm: |
| case X86::VMOVDQA64Z128rm: |
| case X86::VMOVDQU64Z128rm: |
| MemBytes = 16; |
| return true; |
| case X86::VMOVAPSYrm: |
| case X86::VMOVUPSYrm: |
| case X86::VMOVAPDYrm: |
| case X86::VMOVUPDYrm: |
| case X86::VMOVDQAYrm: |
| case X86::VMOVDQUYrm: |
| case X86::VMOVAPSZ256rm: |
| case X86::VMOVUPSZ256rm: |
| case X86::VMOVAPSZ256rm_NOVLX: |
| case X86::VMOVUPSZ256rm_NOVLX: |
| case X86::VMOVAPDZ256rm: |
| case X86::VMOVUPDZ256rm: |
| case X86::VMOVDQU8Z256rm: |
| case X86::VMOVDQU16Z256rm: |
| case X86::VMOVDQA32Z256rm: |
| case X86::VMOVDQU32Z256rm: |
| case X86::VMOVDQA64Z256rm: |
| case X86::VMOVDQU64Z256rm: |
| MemBytes = 32; |
| return true; |
| case X86::VMOVAPSZrm: |
| case X86::VMOVUPSZrm: |
| case X86::VMOVAPDZrm: |
| case X86::VMOVUPDZrm: |
| case X86::VMOVDQU8Zrm: |
| case X86::VMOVDQU16Zrm: |
| case X86::VMOVDQA32Zrm: |
| case X86::VMOVDQU32Zrm: |
| case X86::VMOVDQA64Zrm: |
| case X86::VMOVDQU64Zrm: |
| MemBytes = 64; |
| return true; |
| } |
| } |
| |
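/// Return true if Opcode names a plain store of a register to memory whose
/// size is known, and set MemBytes to the width of the access in bytes.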
| static bool isFrameStoreOpcode(int Opcode, unsigned &MemBytes) { |
| switch (Opcode) { |
| default: |
| return false; |
| case X86::MOV8mr: |
| case X86::KMOVBmk: |
| MemBytes = 1; |
| return true; |
| case X86::MOV16mr: |
| case X86::KMOVWmk: |
| case X86::VMOVSHZmr: |
| MemBytes = 2; |
| return true; |
| case X86::MOV32mr: |
| case X86::MOVSSmr: |
| case X86::VMOVSSmr: |
| case X86::VMOVSSZmr: |
| case X86::KMOVDmk: |
| MemBytes = 4; |
| return true; |
| case X86::MOV64mr: |
| case X86::ST_FpP64m: |
| case X86::MOVSDmr: |
| case X86::VMOVSDmr: |
| case X86::VMOVSDZmr: |
| case X86::MMX_MOVD64mr: |
| case X86::MMX_MOVQ64mr: |
| case X86::MMX_MOVNTQmr: |
| case X86::KMOVQmk: |
| MemBytes = 8; |
| return true; |
| case X86::MOVAPSmr: |
| case X86::MOVUPSmr: |
| case X86::MOVAPDmr: |
| case X86::MOVUPDmr: |
| case X86::MOVDQAmr: |
| case X86::MOVDQUmr: |
| case X86::VMOVAPSmr: |
| case X86::VMOVUPSmr: |
| case X86::VMOVAPDmr: |
| case X86::VMOVUPDmr: |
| case X86::VMOVDQAmr: |
| case X86::VMOVDQUmr: |
| case X86::VMOVUPSZ128mr: |
| case X86::VMOVAPSZ128mr: |
| case X86::VMOVUPSZ128mr_NOVLX: |
| case X86::VMOVAPSZ128mr_NOVLX: |
| case X86::VMOVUPDZ128mr: |
| case X86::VMOVAPDZ128mr: |
| case X86::VMOVDQA32Z128mr: |
| case X86::VMOVDQU32Z128mr: |
| case X86::VMOVDQA64Z128mr: |
| case X86::VMOVDQU64Z128mr: |
| case X86::VMOVDQU8Z128mr: |
| case X86::VMOVDQU16Z128mr: |
| MemBytes = 16; |
| return true; |
| case X86::VMOVUPSYmr: |
| case X86::VMOVAPSYmr: |
| case X86::VMOVUPDYmr: |
| case X86::VMOVAPDYmr: |
| case X86::VMOVDQUYmr: |
| case X86::VMOVDQAYmr: |
| case X86::VMOVUPSZ256mr: |
| case X86::VMOVAPSZ256mr: |
| case X86::VMOVUPSZ256mr_NOVLX: |
| case X86::VMOVAPSZ256mr_NOVLX: |
| case X86::VMOVUPDZ256mr: |
| case X86::VMOVAPDZ256mr: |
| case X86::VMOVDQU8Z256mr: |
| case X86::VMOVDQU16Z256mr: |
| case X86::VMOVDQA32Z256mr: |
| case X86::VMOVDQU32Z256mr: |
| case X86::VMOVDQA64Z256mr: |
| case X86::VMOVDQU64Z256mr: |
| MemBytes = 32; |
| return true; |
| case X86::VMOVUPSZmr: |
| case X86::VMOVAPSZmr: |
| case X86::VMOVUPDZmr: |
| case X86::VMOVAPDZmr: |
| case X86::VMOVDQU8Zmr: |
| case X86::VMOVDQU16Zmr: |
| case X86::VMOVDQA32Zmr: |
| case X86::VMOVDQU32Zmr: |
| case X86::VMOVDQA64Zmr: |
| case X86::VMOVDQU64Zmr: |
| MemBytes = 64; |
| return true; |
| } |
| return false; |
| } |
| |
| unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI, |
| int &FrameIndex) const { |
| unsigned Dummy; |
| return X86InstrInfo::isLoadFromStackSlot(MI, FrameIndex, Dummy); |
| } |
| |
| unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI, |
| int &FrameIndex, |
| unsigned &MemBytes) const { |
| if (isFrameLoadOpcode(MI.getOpcode(), MemBytes)) |
| if (MI.getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex)) |
| return MI.getOperand(0).getReg(); |
| return 0; |
| } |
| |
| unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI, |
| int &FrameIndex) const { |
| unsigned Dummy; |
| if (isFrameLoadOpcode(MI.getOpcode(), Dummy)) { |
| unsigned Reg; |
| if ((Reg = isLoadFromStackSlot(MI, FrameIndex))) |
| return Reg; |
| // Check for post-frame index elimination operations |
| SmallVector<const MachineMemOperand *, 1> Accesses; |
| if (hasLoadFromStackSlot(MI, Accesses)) { |
| FrameIndex = |
| cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue()) |
| ->getFrameIndex(); |
| return MI.getOperand(0).getReg(); |
| } |
| } |
| return 0; |
| } |
| |
| unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI, |
| int &FrameIndex) const { |
| unsigned Dummy; |
| return X86InstrInfo::isStoreToStackSlot(MI, FrameIndex, Dummy); |
| } |
| |
| unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI, |
| int &FrameIndex, |
| unsigned &MemBytes) const { |
| if (isFrameStoreOpcode(MI.getOpcode(), MemBytes)) |
| if (MI.getOperand(X86::AddrNumOperands).getSubReg() == 0 && |
| isFrameOperand(MI, 0, FrameIndex)) |
| return MI.getOperand(X86::AddrNumOperands).getReg(); |
| return 0; |
| } |
| |
| unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI, |
| int &FrameIndex) const { |
| unsigned Dummy; |
| if (isFrameStoreOpcode(MI.getOpcode(), Dummy)) { |
| unsigned Reg; |
| if ((Reg = isStoreToStackSlot(MI, FrameIndex))) |
| return Reg; |
| // Check for post-frame index elimination operations |
| SmallVector<const MachineMemOperand *, 1> Accesses; |
| if (hasStoreToStackSlot(MI, Accesses)) { |
| FrameIndex = |
| cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue()) |
| ->getFrameIndex(); |
| return MI.getOperand(X86::AddrNumOperands).getReg(); |
| } |
| } |
| return 0; |
| } |
| |
/// Return true if the register is a PIC base, i.e. defined by X86::MOVPC32r.
| static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) { |
| // Don't waste compile time scanning use-def chains of physregs. |
| if (!BaseReg.isVirtual()) |
| return false; |
| bool isPICBase = false; |
| for (MachineRegisterInfo::def_instr_iterator I = MRI.def_instr_begin(BaseReg), |
| E = MRI.def_instr_end(); I != E; ++I) { |
| MachineInstr *DefMI = &*I; |
| if (DefMI->getOpcode() != X86::MOVPC32r) |
| return false; |
| assert(!isPICBase && "More than one PIC base?"); |
| isPICBase = true; |
| } |
| return isPICBase; |
| } |
| |
| bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, |
| AAResults *AA) const { |
| switch (MI.getOpcode()) { |
| default: |
| // This function should only be called for opcodes with the ReMaterializable |
| // flag set. |
| llvm_unreachable("Unknown rematerializable operation!"); |
| break; |
| |
| case X86::LOAD_STACK_GUARD: |
| case X86::AVX1_SETALLONES: |
| case X86::AVX2_SETALLONES: |
| case X86::AVX512_128_SET0: |
| case X86::AVX512_256_SET0: |
| case X86::AVX512_512_SET0: |
| case X86::AVX512_512_SETALLONES: |
| case X86::AVX512_FsFLD0SD: |
| case X86::AVX512_FsFLD0SH: |
| case X86::AVX512_FsFLD0SS: |
| case X86::AVX512_FsFLD0F128: |
| case X86::AVX_SET0: |
| case X86::FsFLD0SD: |
| case X86::FsFLD0SS: |
| case X86::FsFLD0F128: |
| case X86::KSET0D: |
| case X86::KSET0Q: |
| case X86::KSET0W: |
| case X86::KSET1D: |
| case X86::KSET1Q: |
| case X86::KSET1W: |
| case X86::MMX_SET0: |
| case X86::MOV32ImmSExti8: |
| case X86::MOV32r0: |
| case X86::MOV32r1: |
| case X86::MOV32r_1: |
| case X86::MOV32ri64: |
| case X86::MOV64ImmSExti8: |
| case X86::V_SET0: |
| case X86::V_SETALLONES: |
| case X86::MOV16ri: |
| case X86::MOV32ri: |
| case X86::MOV64ri: |
| case X86::MOV64ri32: |
| case X86::MOV8ri: |
| case X86::PTILEZEROV: |
| return true; |
| |
| case X86::MOV8rm: |
| case X86::MOV8rm_NOREX: |
| case X86::MOV16rm: |
| case X86::MOV32rm: |
| case X86::MOV64rm: |
| case X86::MOVSSrm: |
| case X86::MOVSSrm_alt: |
| case X86::MOVSDrm: |
| case X86::MOVSDrm_alt: |
| case X86::MOVAPSrm: |
| case X86::MOVUPSrm: |
| case X86::MOVAPDrm: |
| case X86::MOVUPDrm: |
| case X86::MOVDQArm: |
| case X86::MOVDQUrm: |
| case X86::VMOVSSrm: |
| case X86::VMOVSSrm_alt: |
| case X86::VMOVSDrm: |
| case X86::VMOVSDrm_alt: |
| case X86::VMOVAPSrm: |
| case X86::VMOVUPSrm: |
| case X86::VMOVAPDrm: |
| case X86::VMOVUPDrm: |
| case X86::VMOVDQArm: |
| case X86::VMOVDQUrm: |
| case X86::VMOVAPSYrm: |
| case X86::VMOVUPSYrm: |
| case X86::VMOVAPDYrm: |
| case X86::VMOVUPDYrm: |
| case X86::VMOVDQAYrm: |
| case X86::VMOVDQUYrm: |
| case X86::MMX_MOVD64rm: |
| case X86::MMX_MOVQ64rm: |
| // AVX-512 |
| case X86::VMOVSSZrm: |
| case X86::VMOVSSZrm_alt: |
| case X86::VMOVSDZrm: |
| case X86::VMOVSDZrm_alt: |
| case X86::VMOVSHZrm: |
| case X86::VMOVSHZrm_alt: |
| case X86::VMOVAPDZ128rm: |
| case X86::VMOVAPDZ256rm: |
| case X86::VMOVAPDZrm: |
| case X86::VMOVAPSZ128rm: |
| case X86::VMOVAPSZ256rm: |
| case X86::VMOVAPSZ128rm_NOVLX: |
| case X86::VMOVAPSZ256rm_NOVLX: |
| case X86::VMOVAPSZrm: |
| case X86::VMOVDQA32Z128rm: |
| case X86::VMOVDQA32Z256rm: |
| case X86::VMOVDQA32Zrm: |
| case X86::VMOVDQA64Z128rm: |
| case X86::VMOVDQA64Z256rm: |
| case X86::VMOVDQA64Zrm: |
| case X86::VMOVDQU16Z128rm: |
| case X86::VMOVDQU16Z256rm: |
| case X86::VMOVDQU16Zrm: |
| case X86::VMOVDQU32Z128rm: |
| case X86::VMOVDQU32Z256rm: |
| case X86::VMOVDQU32Zrm: |
| case X86::VMOVDQU64Z128rm: |
| case X86::VMOVDQU64Z256rm: |
| case X86::VMOVDQU64Zrm: |
| case X86::VMOVDQU8Z128rm: |
| case X86::VMOVDQU8Z256rm: |
| case X86::VMOVDQU8Zrm: |
| case X86::VMOVUPDZ128rm: |
| case X86::VMOVUPDZ256rm: |
| case X86::VMOVUPDZrm: |
| case X86::VMOVUPSZ128rm: |
| case X86::VMOVUPSZ256rm: |
| case X86::VMOVUPSZ128rm_NOVLX: |
| case X86::VMOVUPSZ256rm_NOVLX: |
| case X86::VMOVUPSZrm: { |
| // Loads from constant pools are trivially rematerializable. |
| if (MI.getOperand(1 + X86::AddrBaseReg).isReg() && |
| MI.getOperand(1 + X86::AddrScaleAmt).isImm() && |
| MI.getOperand(1 + X86::AddrIndexReg).isReg() && |
| MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 && |
| MI.isDereferenceableInvariantLoad(AA)) { |
| Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg(); |
| if (BaseReg == 0 || BaseReg == X86::RIP) |
| return true; |
      // Only re-materialize a PIC stub load (one whose displacement is a
      // global) when ReMatPICStubLoad is enabled.
| if (!ReMatPICStubLoad && MI.getOperand(1 + X86::AddrDisp).isGlobal()) |
| return false; |
| const MachineFunction &MF = *MI.getParent()->getParent(); |
| const MachineRegisterInfo &MRI = MF.getRegInfo(); |
| return regIsPICBase(BaseReg, MRI); |
| } |
| return false; |
| } |
| |
| case X86::LEA32r: |
| case X86::LEA64r: { |
| if (MI.getOperand(1 + X86::AddrScaleAmt).isImm() && |
| MI.getOperand(1 + X86::AddrIndexReg).isReg() && |
| MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 && |
| !MI.getOperand(1 + X86::AddrDisp).isReg()) { |
| // lea fi#, lea GV, etc. are all rematerializable. |
| if (!MI.getOperand(1 + X86::AddrBaseReg).isReg()) |
| return true; |
| Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg(); |
| if (BaseReg == 0) |
| return true; |
| // Allow re-materialization of lea PICBase + x. |
| const MachineFunction &MF = *MI.getParent()->getParent(); |
| const MachineRegisterInfo &MRI = MF.getRegInfo(); |
| return regIsPICBase(BaseReg, MRI); |
| } |
| return false; |
| } |
| } |
| } |
| |
| void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, |
| MachineBasicBlock::iterator I, |
| Register DestReg, unsigned SubIdx, |
| const MachineInstr &Orig, |
| const TargetRegisterInfo &TRI) const { |
| bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI); |
| if (ClobbersEFLAGS && MBB.computeRegisterLiveness(&TRI, X86::EFLAGS, I) != |
| MachineBasicBlock::LQR_Dead) { |
| // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side |
| // effects. |
| int Value; |
| switch (Orig.getOpcode()) { |
| case X86::MOV32r0: Value = 0; break; |
| case X86::MOV32r1: Value = 1; break; |
| case X86::MOV32r_1: Value = -1; break; |
| default: |
| llvm_unreachable("Unexpected instruction!"); |
| } |
| |
| const DebugLoc &DL = Orig.getDebugLoc(); |
| BuildMI(MBB, I, DL, get(X86::MOV32ri)) |
| .add(Orig.getOperand(0)) |
| .addImm(Value); |
| } else { |
| MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig); |
| MBB.insert(I, MI); |
| } |
| |
| MachineInstr &NewMI = *std::prev(I); |
| NewMI.substituteRegister(Orig.getOperand(0).getReg(), DestReg, SubIdx, TRI); |
| } |
| |
| /// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead. |
| bool X86InstrInfo::hasLiveCondCodeDef(MachineInstr &MI) const { |
| for (const MachineOperand &MO : MI.operands()) { |
| if (MO.isReg() && MO.isDef() && |
| MO.getReg() == X86::EFLAGS && !MO.isDead()) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
/// Return the shift count of a machine operand, truncated to the number of
/// bits the hardware actually uses.
| inline static unsigned getTruncatedShiftCount(const MachineInstr &MI, |
| unsigned ShiftAmtOperandIdx) { |
| // The shift count is six bits with the REX.W prefix and five bits without. |
| unsigned ShiftCountMask = (MI.getDesc().TSFlags & X86II::REX_W) ? 63 : 31; |
| unsigned Imm = MI.getOperand(ShiftAmtOperandIdx).getImm(); |
| return Imm & ShiftCountMask; |
| } |
| |
/// Check whether the given shift count can be represented by a LEA
/// instruction.
| inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) { |
| // Left shift instructions can be transformed into load-effective-address |
| // instructions if we can encode them appropriately. |
| // A LEA instruction utilizes a SIB byte to encode its scale factor. |
| // The SIB.scale field is two bits wide which means that we can encode any |
| // shift amount less than 4. |
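  // For example, "shl $3, %reg" can become "lea (,%reg,8), %reg" because the
  // required scale of 8 is encodable, while a shift by 4 would need an
  // unencodable scale of 16.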
| return ShAmt < 4 && ShAmt > 0; |
| } |
| |
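/// Given the source operand of a two-address instruction being turned into an
/// LEA, pick a register (NewSrc) that is legal as an LEA address component,
/// optionally allowing the stack pointer. For LEA64_32r this may insert a COPY
/// that widens a 32-bit virtual register to 64 bits; isKill reports whether
/// NewSrc is killed at MI, and ImplicitOp receives any implicit operand that
/// must be added to the new instruction.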
| bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, |
| unsigned Opc, bool AllowSP, Register &NewSrc, |
| bool &isKill, MachineOperand &ImplicitOp, |
| LiveVariables *LV, LiveIntervals *LIS) const { |
| MachineFunction &MF = *MI.getParent()->getParent(); |
| const TargetRegisterClass *RC; |
| if (AllowSP) { |
| RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass; |
| } else { |
| RC = Opc != X86::LEA32r ? |
| &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass; |
| } |
| Register SrcReg = Src.getReg(); |
| isKill = MI.killsRegister(SrcReg); |
| |
  // For both LEA64 and LEA32 the register already has essentially the right
  // type (32-bit or 64-bit); we may just need to forbid SP.
| if (Opc != X86::LEA64_32r) { |
| NewSrc = SrcReg; |
| assert(!Src.isUndef() && "Undef op doesn't need optimization"); |
| |
| if (NewSrc.isVirtual() && !MF.getRegInfo().constrainRegClass(NewSrc, RC)) |
| return false; |
| |
| return true; |
| } |
| |
  // This is for an LEA64_32r: the incoming registers are 32-bit, but one way
  // or another we need to add 64-bit registers to the final MI.
| if (SrcReg.isPhysical()) { |
| ImplicitOp = Src; |
| ImplicitOp.setImplicit(); |
| |
| NewSrc = getX86SubSuperRegister(SrcReg, 64); |
| assert(!Src.isUndef() && "Undef op doesn't need optimization"); |
| } else { |
    // Virtual register of the wrong class; we have to create a temporary
    // 64-bit vreg to feed into the LEA.
| NewSrc = MF.getRegInfo().createVirtualRegister(RC); |
| MachineInstr *Copy = |
| BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY)) |
| .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit) |
| .addReg(SrcReg, getKillRegState(isKill)); |
| |
| // Which is obviously going to be dead after we're done with it. |
| isKill = true; |
| |
| if (LV) |
| LV->replaceKillInstruction(SrcReg, MI, *Copy); |
| |
| if (LIS) { |
| SlotIndex CopyIdx = LIS->InsertMachineInstrInMaps(*Copy); |
| SlotIndex Idx = LIS->getInstructionIndex(MI); |
| LiveInterval &LI = LIS->getInterval(SrcReg); |
| LiveRange::Segment *S = LI.getSegmentContaining(Idx); |
| if (S->end.getBaseIndex() == Idx) |
| S->end = CopyIdx.getRegSlot(); |
| } |
| } |
| |
| // We've set all the parameters without issue. |
| return true; |
| } |
| |
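/// Convert an 8- or 16-bit two-address arithmetic instruction into an LEA:
/// widen the operands into fresh LEA-compatible virtual registers, do the
/// arithmetic with an LEA, and copy the low bits back out. This is only done
/// in 64-bit mode (see the TODO below for 32-bit targets).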
| MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, |
| MachineInstr &MI, |
| LiveVariables *LV, |
| LiveIntervals *LIS, |
| bool Is8BitOp) const { |
| // We handle 8-bit adds and various 16-bit opcodes in the switch below. |
| MachineBasicBlock &MBB = *MI.getParent(); |
| MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); |
| assert((Is8BitOp || RegInfo.getTargetRegisterInfo()->getRegSizeInBits( |
| *RegInfo.getRegClass(MI.getOperand(0).getReg())) == 16) && |
| "Unexpected type for LEA transform"); |
| |
| // TODO: For a 32-bit target, we need to adjust the LEA variables with |
| // something like this: |
| // Opcode = X86::LEA32r; |
| // InRegLEA = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass); |
| // OutRegLEA = |
| // Is8BitOp ? RegInfo.createVirtualRegister(&X86::GR32ABCD_RegClass) |
| // : RegInfo.createVirtualRegister(&X86::GR32RegClass); |
| if (!Subtarget.is64Bit()) |
| return nullptr; |
| |
| unsigned Opcode = X86::LEA64_32r; |
| Register InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); |
| Register OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass); |
| Register InRegLEA2; |
| |
| // Build and insert into an implicit UNDEF value. This is OK because |
| // we will be shifting and then extracting the lower 8/16-bits. |
  // This has the potential to cause a partial register stall, e.g.
| // movw (%rbp,%rcx,2), %dx |
| // leal -65(%rdx), %esi |
| // But testing has shown this *does* help performance in 64-bit mode (at |
| // least on modern x86 machines). |
| MachineBasicBlock::iterator MBBI = MI.getIterator(); |
| Register Dest = MI.getOperand(0).getReg(); |
| Register Src = MI.getOperand(1).getReg(); |
| Register Src2; |
| bool IsDead = MI.getOperand(0).isDead(); |
| bool IsKill = MI.getOperand(1).isKill(); |
| unsigned SubReg = Is8BitOp ? X86::sub_8bit : X86::sub_16bit; |
| assert(!MI.getOperand(1).isUndef() && "Undef op doesn't need optimization"); |
| MachineInstr *ImpDef = |
| BuildMI(MBB, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA); |
| MachineInstr *InsMI = |
| BuildMI(MBB, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY)) |
| .addReg(InRegLEA, RegState::Define, SubReg) |
| .addReg(Src, getKillRegState(IsKill)); |
| MachineInstr *ImpDef2 = nullptr; |
| MachineInstr *InsMI2 = nullptr; |
| |
| MachineInstrBuilder MIB = |
| BuildMI(MBB, MBBI, MI.getDebugLoc(), get(Opcode), OutRegLEA); |
| switch (MIOpc) { |
| default: llvm_unreachable("Unreachable!"); |
| case X86::SHL8ri: |
| case X86::SHL16ri: { |
| unsigned ShAmt = MI.getOperand(2).getImm(); |
| MIB.addReg(0).addImm(1ULL << ShAmt) |
| .addReg(InRegLEA, RegState::Kill).addImm(0).addReg(0); |
| break; |
| } |
| case X86::INC8r: |
| case X86::INC16r: |
| addRegOffset(MIB, InRegLEA, true, 1); |
| break; |
| case X86::DEC8r: |
| case X86::DEC16r: |
| addRegOffset(MIB, InRegLEA, true, -1); |
| break; |
| case X86::ADD8ri: |
| case X86::ADD8ri_DB: |
| case X86::ADD16ri: |
| case X86::ADD16ri8: |
| case X86::ADD16ri_DB: |
| case X86::ADD16ri8_DB: |
| addRegOffset(MIB, InRegLEA, true, MI.getOperand(2).getImm()); |
| break; |
| case X86::ADD8rr: |
| case X86::ADD8rr_DB: |
| case X86::ADD16rr: |
| case X86::ADD16rr_DB: { |
| Src2 = MI.getOperand(2).getReg(); |
| bool IsKill2 = MI.getOperand(2).isKill(); |
| assert(!MI.getOperand(2).isUndef() && "Undef op doesn't need optimization"); |
| if (Src == Src2) { |
      // ADD8rr/ADD16rr killed %reg1028, %reg1028:
      // both sources are the same register, so a single insert_subreg is
      // enough.
| addRegReg(MIB, InRegLEA, true, InRegLEA, false); |
| } else { |
| if (Subtarget.is64Bit()) |
| InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); |
| else |
| InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass); |
| // Build and insert into an implicit UNDEF value. This is OK because |
| // we will be shifting and then extracting the lower 8/16-bits. |
| ImpDef2 = BuildMI(MBB, &*MIB, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), |
| InRegLEA2); |
| InsMI2 = BuildMI(MBB, &*MIB, MI.getDebugLoc(), get(TargetOpcode::COPY)) |
| .addReg(InRegLEA2, RegState::Define, SubReg) |
| .addReg(Src2, getKillRegState(IsKill2)); |
| addRegReg(MIB, InRegLEA, true, InRegLEA2, true); |
| } |
| if (LV && IsKill2 && InsMI2) |
| LV->replaceKillInstruction(Src2, MI, *InsMI2); |
| break; |
| } |
| } |
| |
| MachineInstr *NewMI = MIB; |
| MachineInstr *ExtMI = |
| BuildMI(MBB, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY)) |
| .addReg(Dest, RegState::Define | getDeadRegState(IsDead)) |
| .addReg(OutRegLEA, RegState::Kill, SubReg); |
| |
| if (LV) { |
| // Update live variables. |
| LV->getVarInfo(InRegLEA).Kills.push_back(NewMI); |
| LV->getVarInfo(OutRegLEA).Kills.push_back(ExtMI); |
| if (IsKill) |
| LV->replaceKillInstruction(Src, MI, *InsMI); |
| if (IsDead) |
| LV->replaceKillInstruction(Dest, MI, *ExtMI); |
| } |
| |
| if (LIS) { |
| LIS->InsertMachineInstrInMaps(*ImpDef); |
| SlotIndex InsIdx = LIS->InsertMachineInstrInMaps(*InsMI); |
| if (ImpDef2) |
| LIS->InsertMachineInstrInMaps(*ImpDef2); |
| SlotIndex Ins2Idx; |
| if (InsMI2) |
| Ins2Idx = LIS->InsertMachineInstrInMaps(*InsMI2); |
| SlotIndex NewIdx = LIS->ReplaceMachineInstrInMaps(MI, *NewMI); |
| SlotIndex ExtIdx = LIS->InsertMachineInstrInMaps(*ExtMI); |
| LIS->getInterval(InRegLEA); |
| LIS->getInterval(OutRegLEA); |
| if (InRegLEA2) |
| LIS->getInterval(InRegLEA2); |
| |
| // Move the use of Src up to InsMI. |
| LiveInterval &SrcLI = LIS->getInterval(Src); |
| LiveRange::Segment *SrcSeg = SrcLI.getSegmentContaining(NewIdx); |
| if (SrcSeg->end == NewIdx.getRegSlot()) |
| SrcSeg->end = InsIdx.getRegSlot(); |
| |
| if (InsMI2) { |
| // Move the use of Src2 up to InsMI2. |
| LiveInterval &Src2LI = LIS->getInterval(Src2); |
| LiveRange::Segment *Src2Seg = Src2LI.getSegmentContaining(NewIdx); |
| if (Src2Seg->end == NewIdx.getRegSlot()) |
| Src2Seg->end = Ins2Idx.getRegSlot(); |
| } |
| |
| // Move the definition of Dest down to ExtMI. |
| LiveInterval &DestLI = LIS->getInterval(Dest); |
| LiveRange::Segment *DestSeg = |
| DestLI.getSegmentContaining(NewIdx.getRegSlot()); |
| assert(DestSeg->start == NewIdx.getRegSlot() && |
| DestSeg->valno->def == NewIdx.getRegSlot()); |
| DestSeg->start = ExtIdx.getRegSlot(); |
| DestSeg->valno->def = ExtIdx.getRegSlot(); |
| } |
| |
| return ExtMI; |
| } |
| |
| /// This method must be implemented by targets that |
| /// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target |
| /// may be able to convert a two-address instruction into a true |
| /// three-address instruction on demand. This allows the X86 target (for |
| /// example) to convert ADD and SHL instructions into LEA instructions if they |
| /// would require register copies due to two-addressness. |
| /// |
| /// This method returns a null pointer if the transformation cannot be |
| /// performed, otherwise it returns the new instruction. |
| /// |
| MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI, |
| LiveVariables *LV, |
| LiveIntervals *LIS) const { |
  // The following opcodes also set the condition code register(s). Only
  // convert them to an equivalent LEA if the condition code register defs are
  // dead!
| if (hasLiveCondCodeDef(MI)) |
| return nullptr; |
| |
| MachineFunction &MF = *MI.getParent()->getParent(); |
  // All input instructions are two-address. Get the known operands.
| const MachineOperand &Dest = MI.getOperand(0); |
| const MachineOperand &Src = MI.getOperand(1); |
| |
| // Ideally, operations with undef should be folded before we get here, but we |
| // can't guarantee it. Bail out because optimizing undefs is a waste of time. |
| // Without this, we have to forward undef state to new register operands to |
| // avoid machine verifier errors. |
| if (Src.isUndef()) |
| return nullptr; |
| if (MI.getNumOperands() > 2) |
| if (MI.getOperand(2).isReg() && MI.getOperand(2).isUndef()) |
| return nullptr; |
| |
| MachineInstr *NewMI = nullptr; |
| Register SrcReg, SrcReg2; |
| bool Is64Bit = Subtarget.is64Bit(); |
| |
| bool Is8BitOp = false; |
| unsigned MIOpc = MI.getOpcode(); |
| switch (MIOpc) { |
| default: llvm_unreachable("Unreachable!"); |
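  // A small constant left shift becomes a scaled LEA, e.g.
  // "shlq $3, %rax" -> "leaq (,%rax,8), %rax".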
| case X86::SHL64ri: { |
| assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!"); |
| unsigned ShAmt = getTruncatedShiftCount(MI, 2); |
| if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr; |
| |
| // LEA can't handle RSP. |
| if (Src.getReg().isVirtual() && !MF.getRegInfo().constrainRegClass( |
| Src.getReg(), &X86::GR64_NOSPRegClass)) |
| return nullptr; |
| |
| NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)) |
| .add(Dest) |
| .addReg(0) |
| .addImm(1ULL << ShAmt) |
| .add(Src) |
| .addImm(0) |
| .addReg(0); |
| break; |
| } |
| case X86::SHL32ri: { |
| assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!"); |
| unsigned ShAmt = getTruncatedShiftCount(MI, 2); |
| if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr; |
| |
| unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r; |
| |
| // LEA can't handle ESP. |
| bool isKill; |
| MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); |
| if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, isKill, |
| ImplicitOp, LV, LIS)) |
| return nullptr; |
| |
| MachineInstrBuilder MIB = |
| BuildMI(MF, MI.getDebugLoc(), get(Opc)) |
| .add(Dest) |
| .addReg(0) |
| .addImm(1ULL << ShAmt) |
| .addReg(SrcReg, getKillRegState(isKill)) |
| .addImm(0) |
| .addReg(0); |
| if (ImplicitOp.getReg() != 0) |
| MIB.add(ImplicitOp); |
| NewMI = MIB; |
| |
| break; |
| } |
| case X86::SHL8ri: |
| Is8BitOp = true; |
| LLVM_FALLTHROUGH; |
| case X86::SHL16ri: { |
| assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!"); |
| unsigned ShAmt = getTruncatedShiftCount(MI, 2); |
| if (!isTruncatedShiftCountForLEA(ShAmt)) |
| return nullptr; |
| return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp); |
| } |
| case X86::INC64r: |
| case X86::INC32r: { |
| assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!"); |
| unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r : |
| (Is64Bit ? X86::LEA64_32r : X86::LEA32r); |
| bool isKill; |
| MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); |
| if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, isKill, |
| ImplicitOp, LV, LIS)) |
| return nullptr; |
| |
| MachineInstrBuilder MIB = |
| BuildMI(MF, MI.getDebugLoc(), get(Opc)) |
| .add(Dest) |
| .addReg(SrcReg, getKillRegState(isKill)); |
| if (ImplicitOp.getReg() != 0) |
| MIB.add(ImplicitOp); |
| |
| NewMI = addOffset(MIB, 1); |
| break; |
| } |
| case X86::DEC64r: |
| case X86::DEC32r: { |
| assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!"); |
| unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r |
| : (Is64Bit ? X86::LEA64_32r : X86::LEA32r); |
| |
| bool isKill; |
| MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); |
| if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, isKill, |
| ImplicitOp, LV, LIS)) |
| return nullptr; |
| |
| MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) |
| .add(Dest) |
| .addReg(SrcReg, getKillRegState(isKill)); |
| if (ImplicitOp.getReg() != 0) |
| MIB.add(ImplicitOp); |
| |
| NewMI = addOffset(MIB, -1); |
| |
| break; |
| } |
| case X86::DEC8r: |
| case X86::INC8r: |
| Is8BitOp = true; |
| LLVM_FALLTHROUGH; |
| case X86::DEC16r: |
| case X86::INC16r: |
| return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp); |
| case X86::ADD64rr: |
| case X86::ADD64rr_DB: |
| case X86::ADD32rr: |
| case X86::ADD32rr_DB: { |
| assert(MI.getNumOperands() >= 3 && "Unknown add instruction!"); |
| unsigned Opc; |
| if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB) |
| Opc = X86::LEA64r; |
| else |
| Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r; |
| |
| const MachineOperand &Src2 = MI.getOperand(2); |
| bool isKill2; |
| MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false); |
| if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/false, SrcReg2, isKill2, |
| ImplicitOp2, LV, LIS)) |
| return nullptr; |
| |
| bool isKill; |
| MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); |
| if (Src.getReg() == Src2.getReg()) { |
      // Don't call classifyLEAReg a second time on the same register, in case
      // the first call inserted a COPY from Src2 and marked it as killed.
| isKill = isKill2; |
| SrcReg = SrcReg2; |
| } else { |
| if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, isKill, |
| ImplicitOp, LV, LIS)) |
| return nullptr; |
| } |
| |
| MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)).add(Dest); |
| if (ImplicitOp.getReg() != 0) |
| MIB.add(ImplicitOp); |
| if (ImplicitOp2.getReg() != 0) |
| MIB.add(ImplicitOp2); |
| |
| NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2); |
| if (LV && Src2.isKill()) |
| LV->replaceKillInstruction(SrcReg2, MI, *NewMI); |
| break; |
| } |
| case X86::ADD8rr: |
| case X86::ADD8rr_DB: |
| Is8BitOp = true; |
| LLVM_FALLTHROUGH; |
| case X86::ADD16rr: |
| case X86::ADD16rr_DB: |
| return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp); |
| case X86::ADD64ri32: |
| case X86::ADD64ri8: |
| case X86::ADD64ri32_DB: |
| case X86::ADD64ri8_DB: |
| assert(MI.getNumOperands() >= 3 && "Unknown add instruction!"); |
| NewMI = addOffset( |
| BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src), |
| MI.getOperand(2)); |
| break; |
| case X86::ADD32ri: |
| case X86::ADD32ri8: |
| case X86::ADD32ri_DB: |
| case X86::ADD32ri8_DB: { |
| assert(MI.getNumOperands() >= 3 && "Unknown add instruction!"); |
| unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r; |
| |
| bool isKill; |
| MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); |
| if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, isKill, |
| ImplicitOp, LV, LIS)) |
| return nullptr; |
| |
| MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) |
| .add(Dest) |
| .addReg(SrcReg, getKillRegState(isKill)); |
| if (ImplicitOp.getReg() != 0) |
| MIB.add(ImplicitOp); |
| |
| NewMI = addOffset(MIB, MI.getOperand(2)); |
| break; |
| } |
| case X86::ADD8ri: |
| case X86::ADD8ri_DB: |
| Is8BitOp = true; |
| LLVM_FALLTHROUGH; |
| case X86::ADD16ri: |
| case X86::ADD16ri8: |
| case X86::ADD16ri_DB: |
| case X86::ADD16ri8_DB: |
| return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp); |
| case X86::SUB8ri: |
| case X86::SUB16ri8: |
| case X86::SUB16ri: |
    // FIXME: Support these similarly to ADD8ri/ADD16ri*.
| return nullptr; |
| case X86::SUB32ri8: |
| case X86::SUB32ri: { |
| if (!MI.getOperand(2).isImm()) |
| return nullptr; |
| int64_t Imm = MI.getOperand(2).getImm(); |
| if (!isInt<32>(-Imm)) |
| return nullptr; |
| |
    assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!");
| unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r; |
| |
| bool isKill; |
| MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); |
| if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, isKill, |
| ImplicitOp, LV, LIS)) |
| return nullptr; |
| |
| MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) |
| .add(Dest) |
| .addReg(SrcReg, getKillRegState(isKill)); |
| if (ImplicitOp.getReg() != 0) |
| MIB.add(ImplicitOp); |
| |
| NewMI = addOffset(MIB, -Imm); |
| break; |
| } |
| |
| case X86::SUB64ri8: |
| case X86::SUB64ri32: { |
| if (!MI.getOperand(2).isImm()) |
| return nullptr; |
| int64_t Imm = MI.getOperand(2).getImm(); |
| if (!isInt<32>(-Imm)) |
| return nullptr; |
| |
| assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!"); |
| |
| MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), |
| get(X86::LEA64r)).add(Dest).add(Src); |
| NewMI = addOffset(MIB, -Imm); |
| break; |
| } |
| |
| case X86::VMOVDQU8Z128rmk: |
| case X86::VMOVDQU8Z256rmk: |
| case X86::VMOVDQU8Zrmk: |
| case X86::VMOVDQU16Z128rmk: |
| case X86::VMOVDQU16Z256rmk: |
| case X86::VMOVDQU16Zrmk: |
| case X86::VMOVDQU32Z128rmk: case X86::VMOVDQA32Z128rmk: |
| case X86::VMOVDQU32Z256rmk: case X86::VMOVDQA32Z256rmk: |
| case X86::VMOVDQU32Zrmk: case X86::VMOVDQA32Zrmk: |
| case X86::VMOVDQU64Z128rmk: case X86::VMOVDQA64Z128rmk: |
| case X86::VMOVDQU64Z256rmk: case X86::VMOVDQA64Z256rmk: |
| case X86::VMOVDQU64Zrmk: case X86::VMOVDQA64Zrmk: |
| case X86::VMOVUPDZ128rmk: case X86::VMOVAPDZ128rmk: |
| case X86::VMOVUPDZ256rmk: case X86::VMOVAPDZ256rmk: |
| case X86::VMOVUPDZrmk: case X86::VMOVAPDZrmk: |
| case X86::VMOVUPSZ128rmk: case X86::VMOVAPSZ128rmk: |
| case X86::VMOVUPSZ256rmk: case X86::VMOVAPSZ256rmk: |
| case X86::VMOVUPSZrmk: case X86::VMOVAPSZrmk: |
| case X86::VBROADCASTSDZ256rmk: |
| case X86::VBROADCASTSDZrmk: |
| case X86::VBROADCASTSSZ128rmk: |
| case X86::VBROADCASTSSZ256rmk: |
| case X86::VBROADCASTSSZrmk: |
| case X86::VPBROADCASTDZ128rmk: |
| case X86::VPBROADCASTDZ256rmk: |
| case X86::VPBROADCASTDZrmk: |
| case X86::VPBROADCASTQZ128rmk: |
| case X86::VPBROADCASTQZ256rmk: |
| case X86::VPBROADCASTQZrmk: { |
| unsigned Opc; |
| switch (MIOpc) { |
| default: llvm_unreachable("Unreachable!"); |
| case X86::VMOVDQU8Z128rmk: Opc = X86::VPBLENDMBZ128rmk; break; |
| case X86::VMOVDQU8Z256rmk: Opc = X86::VPBLENDMBZ256rmk; break; |
| case X86::VMOVDQU8Zrmk: Opc = X86::VPBLENDMBZrmk; break; |
| case X86::VMOVDQU16Z128rmk: Opc = X86::VPBLENDMWZ128rmk; break; |
| case X86::VMOVDQU16Z256rmk: Opc = X86::VPBLENDMWZ256rmk; break; |
| case X86::VMOVDQU16Zrmk: Opc = X86::VPBLENDMWZrmk; break; |
| case X86::VMOVDQU32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break; |
| case X86::VMOVDQU32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break; |
| case X86::VMOVDQU32Zrmk: Opc = X86::VPBLENDMDZrmk; break; |
| case X86::VMOVDQU64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break; |
| case X86::VMOVDQU64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break; |
| case X86::VMOVDQU64Zrmk: Opc = X86::VPBLENDMQZrmk; break; |
| case X86::VMOVUPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break; |
| case X86::VMOVUPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break; |
| case X86::VMOVUPDZrmk: Opc = X86::VBLENDMPDZrmk; break; |
| case X86::VMOVUPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break; |
| case X86::VMOVUPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break; |
| case X86::VMOVUPSZrmk: Opc = X86::VBLENDMPSZrmk; break; |
| case X86::VMOVDQA32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break; |
| case X86::VMOVDQA32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break; |
| case X86::VMOVDQA32Zrmk: Opc = X86::VPBLENDMDZrmk; break; |
| case X86::VMOVDQA64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break; |
| case X86::VMOVDQA64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break; |
| case X86::VMOVDQA64Zrmk: Opc = X86::VPBLENDMQZrmk; break; |
| case X86::VMOVAPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break; |
| case X86::VMOVAPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break; |
| case X86::VMOVAPDZrmk: Opc = X86::VBLENDMPDZrmk; break; |
| case X86::VMOVAPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break; |
| case X86::VMOVAPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break; |
| case X86::VMOVAPSZrmk: Opc = X86::VBLENDMPSZrmk; break; |
| case X86::VBROADCASTSDZ256rmk: Opc = X86::VBLENDMPDZ256rmbk; break; |
| case X86::VBROADCASTSDZrmk: Opc = X86::VBLENDMPDZrmbk; break; |
| case X86::VBROADCASTSSZ128rmk: Opc = X86::VBLENDMPSZ128rmbk; break; |
| case X86::VBROADCASTSSZ256rmk: Opc = X86::VBLENDMPSZ256rmbk; break; |
| case X86::VBROADCASTSSZrmk: Opc = X86::VBLENDMPSZrmbk; break; |
| case X86::VPBROADCASTDZ128rmk: Opc = X86::VPBLENDMDZ128rmbk; break; |
| case X86::VPBROADCASTDZ256rmk: Opc = X86::VPBLENDMDZ256rmbk; break; |
| case X86::VPBROADCASTDZrmk: Opc = X86::VPBLENDMDZrmbk; break; |
| case X86::VPBROADCASTQZ128rmk: Opc = X86::VPBLENDMQZ128rmbk; break; |
| case X86::VPBROADCASTQZ256rmk: Opc = X86::VPBLENDMQZ256rmbk; break; |
| case X86::VPBROADCASTQZrmk: Opc = X86::VPBLENDMQZrmbk; break; |
| } |
| |
| NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc)) |
| .add(Dest) |
| .add(MI.getOperand(2)) |
| .add(Src) |
| .add(MI.getOperand(3)) |
| .add(MI.getOperand(4)) |
| .add(MI.getOperand(5)) |
| .add(MI.getOperand(6)) |
| .add(MI.getOperand(7)); |
| break; |
| } |
| |
| case X86::VMOVDQU8Z128rrk: |
| case X86::VMOVDQU8Z256rrk: |
| case X86::VMOVDQU8Zrrk: |
| case X86::VMOVDQU16Z128rrk: |
| case X86::VMOVDQU16Z256rrk: |
| case X86::VMOVDQU16Zrrk: |
| case X86::VMOVDQU32Z128rrk: case X86::VMOVDQA32Z128rrk: |
| case X86::VMOVDQU32Z256rrk: case X86::VMOVDQA32Z256rrk: |
| case X86::VMOVDQU32Zrrk: case X86::VMOVDQA32Zrrk: |
| case X86::VMOVDQU64Z128rrk: case X86::VMOVDQA64Z128rrk: |
| case X86::VMOVDQU64Z256rrk: case X86::VMOVDQA64Z256rrk: |
| case X86::VMOVDQU64Zrrk: case X86::VMOVDQA64Zrrk: |
| case X86::VMOVUPDZ128rrk: case X86::VMOVAPDZ128rrk: |
| case X86::VMOVUPDZ256rrk: case X86::VMOVAPDZ256rrk: |
| case X86::VMOVUPDZrrk: case X86::VMOVAPDZrrk: |
| case X86::VMOVUPSZ128rrk: case X86::VMOVAPSZ128rrk: |
| case X86::VMOVUPSZ256rrk: case X86::VMOVAPSZ256rrk: |
| case X86::VMOVUPSZrrk: case X86::VMOVAPSZrrk: { |
| unsigned Opc; |
| switch (MIOpc) { |
| default: llvm_unreachable("Unreachable!"); |
| case X86::VMOVDQU8Z128rrk: Opc = X86::VPBLENDMBZ128rrk; break; |
| case X86::VMOVDQU8Z256rrk: Opc = X86::VPBLENDMBZ256rrk; break; |
| case X86::VMOVDQU8Zrrk: Opc = X86::VPBLENDMBZrrk; break; |
| case X86::VMOVDQU16Z128rrk: Opc = X86::VPBLENDMWZ128rrk; break; |
| case X86::VMOVDQU16Z256rrk: Opc = X86::VPBLENDMWZ256rrk; break; |
| case X86::VMOVDQU16Zrrk: Opc = X86::VPBLENDMWZrrk; break; |
| case X86::VMOVDQU32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break; |
| case X86::VMOVDQU32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break; |
| case X86::VMOVDQU32Zrrk: Opc = X86::VPBLENDMDZrrk; break; |
| case X86::VMOVDQU64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break; |
| case X86::VMOVDQU64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break; |
| case X86::VMOVDQU64Zrrk: Opc = X86::VPBLENDMQZrrk; break; |
| case X86::VMOVUPDZ128rrk: Opc = X86::VBLENDMPDZ128rrk; break; |
| case X86::VMOVUPDZ256rrk: Opc = X86::VBLENDMPDZ256rrk; break; |
| case X86::VMOVUPDZrrk: Opc = X86::VBLENDMPDZrrk; break; |
| case X86::VMOVUPSZ128rrk: Opc = X86::VBLENDMPSZ128rrk; break; |
| case X86::VMOVUPSZ256rrk: Opc = X86::VBLENDMPSZ256rrk; break; |
| case X86::VMOVUPSZrrk: Opc = X86::VBLENDMPSZrrk; break; |
| case X86::VMOVDQA32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break; |
| case X86::VMOVDQA32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break; |
| case X86::VMOVDQA32Zrrk: Opc = X86::VPBLENDMDZrrk; break; |
| case X86::VMOVDQA64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break; |
| case X86::VMOVDQA64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break; |
| case X86::VMOVDQA64Zrrk: Opc = X86::VPBLENDMQZrrk; break; |
| case X86::VMOVAPDZ128rrk: Opc = X86::VBLENDMPDZ128rrk; break; |
| case X86::VMOVAPDZ256rrk: Opc = X86::VBLENDMPDZ256rrk; break; |
| case X86::VMOVAPDZrrk: Opc = X86::VBLENDMPDZrrk; break; |
| case X86::VMOVAPSZ128rrk: Opc = X86::VBLENDMPSZ128rrk; break; |
| case X86::VMOVAPSZ256rrk: Opc = X86::VBLENDMPSZ256rrk; break; |
| case X86::VMOVAPSZrrk: Opc = X86::VBLENDMPSZrrk; break; |
| } |
| |
| NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc)) |
| .add(Dest) |
| .add(MI.getOperand(2)) |
| .add(Src) |
| .add(MI.getOperand(3)); |
| break; |
| } |
| } |
| |
| if (!NewMI) return nullptr; |
| |
| if (LV) { // Update live variables |
| if (Src.isKill()) |
| LV->replaceKillInstruction(Src.getReg(), MI, *NewMI); |
| if (Dest.isDead()) |
| LV->replaceKillInstruction(Dest.getReg(), MI, *NewMI); |
| } |
| |
| MachineBasicBlock &MBB = *MI.getParent(); |
| MBB.insert(MI.getIterator(), NewMI); // Insert the new inst |
| |
| if (LIS) { |
| LIS->ReplaceMachineInstrInMaps(MI, *NewMI); |
| if (SrcReg) |
| LIS->getInterval(SrcReg); |
| if (SrcReg2) |
| LIS->getInterval(SrcReg2); |
| } |
| |
| return NewMI; |
| } |
| |
| /// This determines which of the three possible cases of a three source |
| /// commute the source indexes correspond to, taking into account any mask |
| /// operands. None of the cases allows commuting a passthru operand. Index |
| /// pairs that match no case are unexpected and hit llvm_unreachable. |
| /// Case 0 - Possible to commute the first and second operands. |
| /// Case 1 - Possible to commute the first and third operands. |
| /// Case 2 - Possible to commute the second and third operands. |
| static unsigned getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1, |
| unsigned SrcOpIdx2) { |
| // Put the lowest index to SrcOpIdx1 to simplify the checks below. |
| if (SrcOpIdx1 > SrcOpIdx2) |
| std::swap(SrcOpIdx1, SrcOpIdx2); |
| |
| unsigned Op1 = 1, Op2 = 2, Op3 = 3; |
| if (X86II::isKMasked(TSFlags)) { |
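| // The k-mask register is operand 2 for masked operations, so the second |
| // and third vector sources sit one operand further along. |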
| Op2++; |
| Op3++; |
| } |
| |
| if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op2) |
| return 0; |
| if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op3) |
| return 1; |
| if (SrcOpIdx1 == Op2 && SrcOpIdx2 == Op3) |
| return 2; |
| llvm_unreachable("Unknown three src commute case."); |
| } |
| |
| unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands( |
| const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2, |
| const X86InstrFMA3Group &FMA3Group) const { |
| |
| unsigned Opc = MI.getOpcode(); |
| |
| // TODO: Commuting the 1st operand of FMA*_Int requires some additional |
| // analysis. The commute optimization is legal only if all users of FMA*_Int |
| // use only the lowest element of the FMA*_Int instruction. Such an analysis |
| // is not implemented yet, so commuting operand 1 of an intrinsic form must |
| // not be requested; the assert below enforces that. When the analysis |
| // becomes available, this is the right place to call it. |
| assert(!(FMA3Group.isIntrinsic() && (SrcOpIdx1 == 1 || SrcOpIdx2 == 1)) && |
| "Intrinsic instructions can't commute operand 1"); |
| |
| // Determine which case this commute is or if it can't be done. |
| unsigned Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, |
| SrcOpIdx2); |
| assert(Case < 3 && "Unexpected case number!"); |
| |
| // Define the FMA forms mapping array that helps to map input FMA form |
| // to output FMA form to preserve the operation semantics after |
| // commuting the operands. |
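| // FormMapping[Case][FormIndex] is the index of the form whose opcode, once |
| // the two chosen operands are physically swapped, computes the same |
| // multiply-add as the original instruction (see the per-row examples below). |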
| const unsigned Form132Index = 0; |
| const unsigned Form213Index = 1; |
| const unsigned Form231Index = 2; |
| static const unsigned FormMapping[][3] = { |
| // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2; |
| // FMA132 A, C, b; ==> FMA231 C, A, b; |
| // FMA213 B, A, c; ==> FMA213 A, B, c; |
| // FMA231 C, A, b; ==> FMA132 A, C, b; |
| { Form231Index, Form213Index, Form132Index }, |
| // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3; |
| // FMA132 A, c, B; ==> FMA132 B, c, A; |
| // FMA213 B, a, C; ==> FMA231 C, a, B; |
| // FMA231 C, a, B; ==> FMA213 B, a, C; |
| { Form132Index, Form231Index, Form213Index }, |
| // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3; |
| // FMA132 a, C, B; ==> FMA213 a, B, C; |
| // FMA213 b, A, C; ==> FMA132 b, C, A; |
| // FMA231 c, A, B; ==> FMA231 c, B, A; |
| { Form213Index, Form132Index, Form231Index } |
| }; |
| |
| unsigned FMAForms[3]; |
| FMAForms[0] = FMA3Group.get132Opcode(); |
| FMAForms[1] = FMA3Group.get213Opcode(); |
| FMAForms[2] = FMA3Group.get231Opcode(); |
| unsigned FormIndex; |
| for (FormIndex = 0; FormIndex < 3; FormIndex++) |
| if (Opc == FMAForms[FormIndex]) |
| break; |
| |
| // Everything is ready, just adjust the FMA opcode and return it. |
| FormIndex = FormMapping[Case][FormIndex]; |
| return FMAForms[FormIndex]; |
| } |
| |
| static void commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1, |
| unsigned SrcOpIdx2) { |
| // Determine which case this commute is or if it can't be done. |
| unsigned Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, |
| SrcOpIdx2); |
| assert(Case < 3 && "Unexpected case value!"); |
| |
| // For each case we need to swap two pairs of bits in the final immediate. |
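| // The ternlog immediate is a truth table indexed by the bits of the three |
| // sources (index = src1*4 + src2*2 + src3), so exchanging two sources |
| // exchanges the table entries whose indices differ only in those two bit |
| // positions; e.g. swapping src1/src2 exchanges entries 2<->4 and 3<->5, |
| // which is the first row of masks below. |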
| static const uint8_t SwapMasks[3][4] = { |
| { 0x04, 0x10, 0x08, 0x20 }, // Swap bits 2/4 and 3/5. |
| { 0x02, 0x10, 0x08, 0x40 }, // Swap bits 1/4 and 3/6. |
| { 0x02, 0x04, 0x20, 0x40 }, // Swap bits 1/2 and 5/6. |
| }; |
| |
| uint8_t Imm = MI.getOperand(MI.getNumOperands()-1).getImm(); |
| // Clear out the bits we are swapping. |
| uint8_t NewImm = Imm & ~(SwapMasks[Case][0] | SwapMasks[Case][1] | |
| SwapMasks[Case][2] | SwapMasks[Case][3]); |
| // If the immediate had a bit of the pair set, then set the opposite bit. |
| if (Imm & SwapMasks[Case][0]) NewImm |= SwapMasks[Case][1]; |
| if (Imm & SwapMasks[Case][1]) NewImm |= SwapMasks[Case][0]; |
| if (Imm & SwapMasks[Case][2]) NewImm |= SwapMasks[Case][3]; |
| if (Imm & SwapMasks[Case][3]) NewImm |= SwapMasks[Case][2]; |
| MI.getOperand(MI.getNumOperands()-1).setImm(NewImm); |
| } |
| |
| // Returns true if this is a VPERMI2 or VPERMT2 instruction that can be |
| // commuted. |
| static bool isCommutableVPERMV3Instruction(unsigned Opcode) { |
| #define VPERM_CASES(Suffix) \ |
| case X86::VPERMI2##Suffix##128rr: case X86::VPERMT2##Suffix##128rr: \ |
| case X86::VPERMI2##Suffix##256rr: case X86::VPERMT2##Suffix##256rr: \ |
| case X86::VPERMI2##Suffix##rr: case X86::VPERMT2##Suffix##rr: \ |
| case X86::VPERMI2##Suffix##128rm: case X86::VPERMT2##Suffix##128rm: \ |
| case X86::VPERMI2##Suffix##256rm: case X86::VPERMT2##Suffix##256rm: \ |
| case X86::VPERMI2##Suffix##rm: case X86::VPERMT2##Suffix##rm: \ |
| case X86::VPERMI2##Suffix##128rrkz: case X86::VPERMT2##Suffix##128rrkz: \ |
| case X86::VPERMI2##Suffix##256rrkz: case X86::VPERMT2##Suffix##256rrkz: \ |
| case X86::VPERMI2##Suffix##rrkz: case X86::VPERMT2##Suffix##rrkz: \ |
| case X86::VPERMI2##Suffix##128rmkz: case X86::VPERMT2##Suffix##128rmkz: \ |
| case X86::VPERMI2##Suffix##256rmkz: case X86::VPERMT2##Suffix##256rmkz: \ |
| case X86::VPERMI2##Suffix##rmkz: case X86::VPERMT2##Suffix##rmkz: |
| |
| #define VPERM_CASES_BROADCAST(Suffix) \ |
| VPERM_CASES(Suffix) \ |
| case X86::VPERMI2##Suffix##128rmb: case X86::VPERMT2##Suffix##128rmb: \ |
| case X86::VPERMI2##Suffix##256rmb: case X86::VPERMT2##Suffix##256rmb: \ |
| case X86::VPERMI2##Suffix##rmb: case X86::VPERMT2##Suffix##rmb: \ |
| case X86::VPERMI2##Suffix##128rmbkz: case X86::VPERMT2##Suffix##128rmbkz: \ |
| case X86::VPERMI2##Suffix##256rmbkz: case X86::VPERMT2##Suffix##256rmbkz: \ |
| case X86::VPERMI2##Suffix##rmbkz: case X86::VPERMT2##Suffix##rmbkz: |
| |
| switch (Opcode) { |
| default: return false; |
| VPERM_CASES(B) |
| VPERM_CASES_BROADCAST(D) |
| VPERM_CASES_BROADCAST(PD) |
| VPERM_CASES_BROADCAST(PS) |
| VPERM_CASES_BROADCAST(Q) |
| VPERM_CASES(W) |
| return true; |
| } |
| #undef VPERM_CASES_BROADCAST |
| #undef VPERM_CASES |
| } |
| |
| // Returns commuted opcode for VPERMI2 and VPERMT2 instructions by switching |
| // from the I opcode to the T opcode and vice versa. |
| static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) { |
| #define VPERM_CASES(Orig, New) \ |
| case X86::Orig##128rr: return X86::New##128rr; \ |
| case X86::Orig##128rrkz: return X86::New##128rrkz; \ |
| case X86::Orig##128rm: return X86::New##128rm; \ |
| case X86::Orig##128rmkz: return X86::New##128rmkz; \ |
| case X86::Orig##256rr: return X86::New##256rr; \ |
| case X86::Orig##256rrkz: return X86::New##256rrkz; \ |
| case X86::Orig##256rm: return X86::New##256rm; \ |
| case X86::Orig##256rmkz: return X86::New##256rmkz; \ |
| case X86::Orig##rr: return X86::New##rr; \ |
| case X86::Orig##rrkz: return X86::New##rrkz; \ |
| case X86::Orig##rm: return X86::New##rm; \ |
| case X86::Orig##rmkz: return X86::New##rmkz; |
| |
| #define VPERM_CASES_BROADCAST(Orig, New) \ |
| VPERM_CASES(Orig, New) \ |
| case X86::Orig##128rmb: return X86::New##128rmb; \ |
| case X86::Orig##128rmbkz: return X86::New##128rmbkz; \ |
| case X86::Orig##256rmb: return X86::New##256rmb; \ |
| case X86::Orig##256rmbkz: return X86::New##256rmbkz; \ |
| case X86::Orig##rmb: return X86::New##rmb; \ |
| case X86::Orig##rmbkz: return X86::New##rmbkz; |
| |
| switch (Opcode) { |
| VPERM_CASES(VPERMI2B, VPERMT2B) |
| VPERM_CASES_BROADCAST(VPERMI2D, VPERMT2D) |
| VPERM_CASES_BROADCAST(VPERMI2PD, VPERMT2PD) |
| VPERM_CASES_BROADCAST(VPERMI2PS, VPERMT2PS) |
| VPERM_CASES_BROADCAST(VPERMI2Q, VPERMT2Q) |
| VPERM_CASES(VPERMI2W, VPERMT2W) |
| VPERM_CASES(VPERMT2B, VPERMI2B) |
| VPERM_CASES_BROADCAST(VPERMT2D, VPERMI2D) |
| VPERM_CASES_BROADCAST(VPERMT2PD, VPERMI2PD) |
| VPERM_CASES_BROADCAST(VPERMT2PS, VPERMI2PS) |
| VPERM_CASES_BROADCAST(VPERMT2Q, VPERMI2Q) |
| VPERM_CASES(VPERMT2W, VPERMI2W) |
| } |
| |
| llvm_unreachable("Unreachable!"); |
| #undef VPERM_CASES_BROADCAST |
| #undef VPERM_CASES |
| } |
| |
| MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, |
| unsigned OpIdx1, |
| unsigned OpIdx2) const { |
| auto cloneIfNew = [NewMI](MachineInstr &MI) -> MachineInstr & { |
| if (NewMI) |
| return *MI.getParent()->getParent()->CloneMachineInstr(&MI); |
| return MI; |
| }; |
| |
| switch (MI.getOpcode()) { |
| case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I) |
| case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I) |
| case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I) |
| case X86::SHLD32rri8: // A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I) |
| case X86::SHRD64rri8: // A = SHRD64rri8 B, C, I -> A = SHLD64rri8 C, B, (64-I) |
| case X86::SHLD64rri8:{// A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I) |
| unsigned Opc; |
| unsigned Size; |
| switch (MI.getOpcode()) { |
| default: llvm_unreachable("Unreachable!"); |
| case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break; |
| case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break; |
| case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break; |
| case X86::SHLD32rri8: Size = 32; Opc = X86::SHRD32rri8; break; |
| case X86::SHRD64rri8: Size = 64; Opc = X86::SHLD64rri8; break; |
| case X86::SHLD64rri8: Size = 64; Opc = X86::SHRD64rri8; break; |
| } |
| unsigned Amt = MI.getOperand(3).getImm(); |
| auto &WorkingMI = cloneIfNew(MI); |
| WorkingMI.setDesc(get(Opc)); |
| WorkingMI.getOperand(3).setImm(Size - Amt); |
| return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, |
| OpIdx1, OpIdx2); |
| } |
| case X86::PFSUBrr: |
| case X86::PFSUBRrr: { |
| // PFSUB x, y: x = x - y |
| // PFSUBR x, y: x = y - x |
| unsigned Opc = |
| (X86::PFSUBRrr == MI.getOpcode() ? X86::PFSUBrr : X86::PFSUBRrr); |
| auto &WorkingMI = cloneIfNew(MI); |
| WorkingMI.setDesc(get(Opc)); |
| return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, |
| OpIdx1, OpIdx2); |
| } |
| case X86::BLENDPDrri: |
| case X86::BLENDPSrri: |
| case X86::VBLENDPDrri: |
| case X86::VBLENDPSrri: |
| // If we're optimizing for size, try to use MOVSD/MOVSS. |
| if (MI.getParent()->getParent()->getFunction().hasOptSize()) { |
| unsigned Mask, Opc; |
| switch (MI.getOpcode()) { |
| default: llvm_unreachable("Unreachable!"); |
| case X86::BLENDPDrri: Opc = X86::MOVSDrr; Mask = 0x03; break; |
| case X86::BLENDPSrri: Opc = X86::MOVSSrr; Mask = 0x0F; break; |
| case X86::VBLENDPDrri: Opc = X86::VMOVSDrr; Mask = 0x03; break; |
| case X86::VBLENDPSrri: Opc = X86::VMOVSSrr; Mask = 0x0F; break; |
| } |
| if ((MI.getOperand(3).getImm() ^ Mask) == 1) { |
| auto &WorkingMI = cloneIfNew(MI); |
| WorkingMI.setDesc(get(Opc)); |
| WorkingMI.RemoveOperand(3); |
| return TargetInstrInfo::commuteInstructionImpl(WorkingMI, |
| /*NewMI=*/false, |
| OpIdx1, OpIdx2); |
| } |
| } |
| LLVM_FALLTHROUGH; |
| case X86::PBLENDWrri: |
| case X86::VBLENDPDYrri: |
| case X86::VBLENDPSYrri: |
| case X86::VPBLENDDrri: |
| case X86::VPBLENDWrri: |
| case X86::VPBLENDDYrri: |
| case X86::VPBLENDWYrri:{ |
| int8_t Mask; |
| switch (MI.getOpcode()) { |
| default: llvm_unreachable("Unreachable!"); |
| case X86::BLENDPDrri: Mask = (int8_t)0x03; break; |
| case X86::BLENDPSrri: Mask = (int8_t)0x0F; break; |
| case X86::PBLENDWrri: Mask = (int8_t)0xFF; break; |
| case X86::VBLENDPDrri: Mask = (int8_t)0x03; break; |
| case X86::VBLENDPSrri: Mask = (int8_t)0x0F; break; |
| case X86::VBLENDPDYrri: Mask = (int8_t)0x0F; break; |
| case X86::VBLENDPSYrri: Mask = (int8_t)0xFF; break; |
| case X86::VPBLENDDrri: Mask = (int8_t)0x0F; break; |
| case X86::VPBLENDWrri: Mask = (int8_t)0xFF; break; |
| case X86::VPBLENDDYrri: Mask = (int8_t)0xFF; break; |
| case X86::VPBLENDWYrri: Mask = (int8_t)0xFF; break; |
| } |
| // Only the least significant bits of Imm are used. |
| // Using int8_t to ensure it will be sign extended to the int64_t that |
| // setImm takes in order to match isel behavior. |
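| // Each set bit of the blend immediate selects the corresponding element |
| // from the second source, so commuting the sources means flipping every |
| // bit of the immediate that the instruction actually uses. |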
| int8_t Imm = MI.getOperand(3).getImm() & Mask; |
| auto &WorkingMI = cloneIfNew(MI); |
| WorkingMI.getOperand(3).setImm(Mask ^ Imm); |
| return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, |
| OpIdx1, OpIdx2); |
| } |
| case X86::INSERTPSrr: |
| case X86::VINSERTPSrr: |
| case X86::VINSERTPSZrr: { |
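| // The insertps immediate encodes the source element in bits [7:6], the |
| // destination element in bits [5:4] and the zero mask in bits [3:0]. |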
| unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm(); |
| unsigned ZMask = Imm & 15; |
| unsigned DstIdx = (Imm >> 4) & 3; |
| unsigned SrcIdx = (Imm >> 6) & 3; |
| |
| // We can commute insertps if we zero 2 of the elements, the insertion is |
| // "inline", and we don't override the insertion with a zero. |
| if (DstIdx == SrcIdx && (ZMask & (1 << DstIdx)) == 0 && |
| countPopulation(ZMask) == 2) { |
| unsigned AltIdx = findFirstSet((ZMask | (1 << DstIdx)) ^ 15); |
| assert(AltIdx < 4 && "Illegal insertion index"); |
| unsigned AltImm = (AltIdx << 6) | (AltIdx << 4) | ZMask; |
| auto &WorkingMI = cloneIfNew(MI); |
| WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(AltImm); |
| return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, |
| OpIdx1, OpIdx2); |
| } |
| return nullptr; |
| } |
| case X86::MOVSDrr: |
| case X86::MOVSSrr: |
| case X86::VMOVSDrr: |
| case X86::VMOVSSrr:{ |
| // On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD. |
| if (Subtarget.hasSSE41()) { |
| unsigned Mask, Opc; |
| switch (MI.getOpcode()) { |
| default: llvm_unreachable("Unreachable!"); |
| case X86::MOVSDrr: Opc = X86::BLENDPDrri; Mask = 0x02; break; |
| case X86::MOVSSrr: Opc = X86::BLENDPSrri; Mask = 0x0E; break; |
| case X86::VMOVSDrr: Opc = X86::VBLENDPDrri; Mask = 0x02; break; |
| case X86::VMOVSSrr: Opc = X86::VBLENDPSrri; Mask = 0x0E; break; |
| } |
| |
| auto &WorkingMI = cloneIfNew(MI); |
| WorkingMI.setDesc(get(Opc)); |
| WorkingMI.addOperand(MachineOperand::CreateImm(Mask)); |
| return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, |
| OpIdx1, OpIdx2); |
| } |
| |
| // Convert to SHUFPD. |
| assert(MI.getOpcode() == X86::MOVSDrr && |
| "Can only commute MOVSDrr without SSE4.1"); |
| |
| auto &WorkingMI = cloneIfNew(MI); |
| WorkingMI.setDesc(get(X86::SHUFPDrri)); |
| WorkingMI.addOperand(MachineOperand::CreateImm(0x02)); |
| return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, |
| OpIdx1, OpIdx2); |
| } |
| case X86::SHUFPDrri: { |
| // Commute to MOVSD. |
| assert(MI.getOperand(3).getImm() == 0x02 && "Unexpected immediate!"); |
| auto &WorkingMI = cloneIfNew(MI); |
| WorkingMI.setDesc(get(X86::MOVSDrr)); |
| WorkingMI.RemoveOperand(3); |
| return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, |
| OpIdx1, OpIdx2); |
| } |
| case X86::PCLMULQDQrr: |
| case X86::VPCLMULQDQrr: |
| case X86::VPCLMULQDQYrr: |
| case X86::VPCLMULQDQZrr: |
| case X86::VPCLMULQDQZ128rr: |
| case X86::VPCLMULQDQZ256rr: { |
| // SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0] |
| // SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0] |
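| // Commuting SRC1 and SRC2 means the old SRC1 selector (bit 0) must now |
| // drive the second source and the old SRC2 selector (bit 4) the first, |
| // so the two selector bits are simply exchanged. |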
| unsigned Imm = MI.getOperand(3).getImm(); |
| unsigned Src1Hi = Imm & 0x01; |
| unsigned Src2Hi = Imm & 0x10; |
| auto &WorkingMI = cloneIfNew(MI); |
| WorkingMI.getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4)); |
| return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, |
| OpIdx1, OpIdx2); |
| } |
| case X86::VPCMPBZ128rri: case X86::VPCMPUBZ128rri: |
| case X86::VPCMPBZ256rri: case X86::VPCMPUBZ256rri: |
| case X86::VPCMPBZrri: case X86::VPCMPUBZrri: |
| case X86::VPCMPDZ128rri: case X86::VPCMPUDZ128rri: |
| case X86::VPCMPDZ256rri: case X86::VPCMPUDZ256rri: |
| case X86::VPCMPDZrri: case X86::VPCMPUDZrri: |
| case X86::VPCMPQZ128rri: case X86::VPCMPUQZ128rri: |
| case X86::VPCMPQZ256rri: case X86::VPCMPUQZ256rri: |
| case X86::VPCMPQZrri: case X86::VPCMPUQZrri: |
| case X86::VPCMPWZ128rri: case X86::VPCMPUWZ128rri: |
| case X86::VPCMPWZ256rri: case X86::VPCMPUWZ256rri: |
| case X86::VPCMPWZrri: case X86::VPCMPUWZrri: |
| case X86::VPCMPBZ128rrik: case X86::VPCMPUBZ128rrik: |
| case X86::VPCMPBZ256rrik: case X86::VPCMPUBZ256rrik: |
| case X86::VPCMPBZrrik: case X86::VPCMPUBZrrik: |
| case X86::VPCMPDZ128rrik: case X86::VPCMPUDZ128rrik: |
| case X86::VPCMPDZ256rrik: case X86::VPCMPUDZ256rrik: |
| case X86::VPCMPDZrrik: case X86::VPCMPUDZrrik: |
| case X86::VPCMPQZ128rrik: case X86::VPCMPUQZ128rrik: |
| case X86::VPCMPQZ256rrik: case X86::VPCMPUQZ256rrik: |
| case X86::VPCMPQZrrik: case X86::VPCMPUQZrrik: |
| case X86::VPCMPWZ128rrik: case X86::VPCMPUWZ128rrik: |
| case X86::VPCMPWZ256rrik: case X86::VPCMPUWZ256rrik: |
| case X86::VPCMPWZrrik: case X86::VPCMPUWZrrik: { |
| // Flip comparison mode immediate (if necessary). |
| unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x7; |
| Imm = X86::getSwappedVPCMPImm(Imm); |
| auto &WorkingMI = cloneIfNew(MI); |
| WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(Imm); |
| return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, |
| OpIdx1, OpIdx2); |
| } |
| case X86::VPCOMBri: case X86::VPCOMUBri: |
| case X86::VPCOMDri: case X86::VPCOMUDri: |
| case X86::VPCOMQri: case X86::VPCOMUQri: |
| case X86::VPCOMWri: case X86::VPCOMUWri: { |
| // Flip comparison mode immediate (if necessary). |
| unsigned Imm = MI.getOperand(3).getImm() & 0x7; |
| Imm = X86::getSwappedVPCOMImm(Imm); |
| auto &WorkingMI = cloneIfNew(MI); |
| WorkingMI.getOperand(3).setImm(Imm); |
| return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, |
| OpIdx1, OpIdx2); |
| } |
| case X86::VCMPSDZrr: |
| case X86::VCMPSSZrr: |
| case X86::VCMPPDZrri: |
| case X86::VCMPPSZrri: |
| case X86::VCMPSHZrr: |
| case X86::VCMPPHZrri: |
| case X86::VCMPPHZ128rri: |
| case X86::VCMPPHZ256rri: |
| case X86::VCMPPDZ128rri: |
| case X86::VCMPPSZ128rri: |
| case X86::VCMPPDZ256rri: |
| case X86::VCMPPSZ256rri: |
| case X86::VCMPPDZrrik: |
| case X86::VCMPPSZrrik: |
| case X86::VCMPPDZ128rrik: |
| case X86::VCMPPSZ128rrik: |
| case X86::VCMPPDZ256rrik: |
| case X86::VCMPPSZ256rrik: { |
| unsigned Imm = |
| MI.getOperand(MI.getNumExplicitOperands() - 1).getImm() & 0x1f; |
| Imm = X86::getSwappedVCMPImm(Imm); |
| auto &WorkingMI = cloneIfNew(MI); |
| WorkingMI.getOperand(MI.getNumExplicitOperands() - 1).setImm(Imm); |
| return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, |
| OpIdx1, OpIdx2); |
| } |
| case X86::VPERM2F128rr: |
| case X86::VPERM2I128rr: { |
| // Flip permute source immediate. |
| // Imm & 0x02: lo = if set, select Op1.lo/hi else Op0.lo/hi. |
| // Imm & 0x20: hi = if set, select Op1.lo/hi else Op0.lo/hi. |
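| // Toggling those two select bits (XOR with 0x22) makes each destination |
| // lane read from the other source; the lane-within-source and zeroing |
| // bits are left untouched. |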
| int8_t Imm = MI.getOperand(3).getImm() & 0xFF; |
| auto &WorkingMI = cloneIfNew(MI); |
| WorkingMI.getOperand(3).setImm(Imm ^ 0x22); |
| return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, |
| OpIdx1, OpIdx2); |
| } |
| case X86::MOVHLPSrr: |
| case X86::UNPCKHPDrr: |
| case X86::VMOVHLPSrr: |
| case X86::VUNPCKHPDrr: |
| case X86::VMOVHLPSZrr: |
| case X86::VUNPCKHPDZ128rr: { |
| assert(Subtarget.hasSSE2() && "Commuting MOVHLP/UNPCKHPD requires SSE2!"); |
| |
| unsigned Opc = MI.getOpcode(); |
| switch (Opc) { |
| default: llvm_unreachable("Unreachable!"); |
| case X86::MOVHLPSrr: Opc = X86::UNPCKHPDrr; break; |
| case X86::UNPCKHPDrr: Opc = X86::MOVHLPSrr; break; |
| case X86::VMOVHLPSrr: Opc = X86::VUNPCKHPDrr; break; |
| case X86::VUNPCKHPDrr: Opc = X86::VMOVHLPSrr; break; |
| case X86::VMOVHLPSZrr: Opc = X86::VUNPCKHPDZ128rr; break; |
| case X86::VUNPCKHPDZ128rr: Opc = X86::VMOVHLPSZrr; break; |
| } |
| auto &WorkingMI = cloneIfNew(MI); |
| WorkingMI.setDesc(get(Opc)); |
| return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, |
| OpIdx1, OpIdx2); |
| } |
| case X86::CMOV16rr: case X86::CMOV32rr: case X86::CMOV64rr: { |
| auto &WorkingMI = cloneIfNew(MI); |
| unsigned OpNo = MI.getDesc().getNumOperands() - 1; |
| X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm()); |
| WorkingMI.getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC)); |
| return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, |
| OpIdx1, OpIdx2); |
| } |
| case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi: |
| case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi: |
| case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi: |
| case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi: |
| case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi: |
| case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi: |
| case X86::VPTERNLOGDZrrik: |
| case X86::VPTERNLOGDZ128rrik: |
| case X86::VPTERNLOGDZ256rrik: |
| case X86::VPTERNLOGQZrrik: |
| case X86::VPTERNLOGQZ128rrik: |
| case X86::VPTERNLOGQZ256rrik: |
| case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz: |
| case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz: |
| case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz: |
| case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz: |
| case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz: |
| case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz: |
| case X86::VPTERNLOGDZ128rmbi: |
| case X86::VPTERNLOGDZ256rmbi: |
| case X86::VPTERNLOGDZrmbi: |
| case X86::VPTERNLOGQZ128rmbi: |
| case X86::VPTERNLOGQZ256rmbi: |
| case X86::VPTERNLOGQZrmbi: |
| case X86::VPTERNLOGDZ128rmbikz: |
| case X86::VPTERNLOGDZ256rmbikz: |
| case X86::VPTERNLOGDZrmbikz: |
| case X86::VPTERNLOGQZ128rmbikz: |
| case X86::VPTERNLOGQZ256rmbikz: |
| case X86::VPTERNLOGQZrmbikz: { |
| auto &WorkingMI = cloneIfNew(MI); |
| commuteVPTERNLOG(WorkingMI, OpIdx1, OpIdx2); |
| return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, |
| OpIdx1, OpIdx2); |
| } |
| default: { |
| if (isCommutableVPERMV3Instruction(MI.getOpcode())) { |
| unsigned Opc = getCommutedVPERMV3Opcode(MI.getOpcode()); |
| auto &WorkingMI = cloneIfNew(MI); |
| WorkingMI.setDesc(get(Opc)); |
| return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, |
| OpIdx1, OpIdx2); |
| } |
| |
| const X86InstrFMA3Group *FMA3Group = getFMA3Group(MI.getOpcode(), |
| MI.getDesc().TSFlags); |
| if (FMA3Group) { |
| unsigned Opc = |
| getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group); |
| auto &WorkingMI = cloneIfNew(MI); |
| WorkingMI.setDesc(get(Opc)); |
| return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, |
| OpIdx1, OpIdx2); |
| } |
| |
| return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); |
| } |
| } |
| } |
| |
| bool |
| X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI, |
| unsigned &SrcOpIdx1, |
| unsigned &SrcOpIdx2, |
| bool IsIntrinsic) const { |
| uint64_t TSFlags = MI.getDesc().TSFlags; |
| |
| unsigned FirstCommutableVecOp = 1; |
| unsigned LastCommutableVecOp = 3; |
| unsigned KMaskOp = -1U; |
| if (X86II::isKMasked(TSFlags)) { |
| // For k-zero-masked operations it is OK to commute the first vector |
| // operand, unless this is an intrinsic instruction. |
| // For regular k-masked operations a conservative choice is made: the |
| // elements of the first vector operand for which the corresponding bit in |
| // the k-mask operand is set to 0 are copied to the result of the |
| // instruction, so commuting that operand would change the result. |
| // TODO/FIXME: The commute still may be legal if it is known that the |
| // k-mask operand is set to either all ones or all zeroes. |
| // It is also Ok to commute the 1st operand if all users of MI use only |
| // the elements enabled by the k-mask operand. For example, |
| // v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i] |
| // : v1[i]; |
| // VMOVAPSZmrk <mem_addr>, k, v4; // this is the ONLY user of v4 -> |
| // // Ok, to commute v1 in FMADD213PSZrk. |
| |
| // The k-mask operand has index = 2 for masked and zero-masked operations. |
| KMaskOp = 2; |
| |
| // The operand with index = 1 is used as a source for those elements for |
| // which the corresponding bit in the k-mask is set to 0. |
| if (X86II::isKMergeMasked(TSFlags) || IsIntrinsic) |
| FirstCommutableVecOp = 3; |
| |
| LastCommutableVecOp++; |
| } else if (IsIntrinsic) { |
| // Commuting the first operand of an intrinsic instruction isn't possible |
| // unless we can prove that only the lowest element of the result is used. |
| FirstCommutableVecOp = 2; |
| } |
| |
| if (isMem(MI, LastCommutableVecOp)) |
| LastCommutableVecOp--; |
| |
| // Only operands in the range [FirstCommutableVecOp, LastCommutableVecOp], |
| // excluding the k-mask operand, are commutable. |
| // Also, the value 'CommuteAnyOperandIndex' is valid here as it means |
| // that the operand is not specified/fixed. |
| if (SrcOpIdx1 != CommuteAnyOperandIndex && |
| (SrcOpIdx1 < FirstCommutableVecOp || SrcOpIdx1 > LastCommutableVecOp || |
| SrcOpIdx1 == KMaskOp)) |
| return false; |
| if (SrcOpIdx2 != CommuteAnyOperandIndex && |
| (SrcOpIdx2 < FirstCommutableVecOp || SrcOpIdx2 > LastCommutableVecOp || |
| SrcOpIdx2 == KMaskOp)) |
| return false; |
| |
| // Look for two different register operands assumed to be commutable |
| // regardless of the FMA opcode. The FMA opcode is adjusted later. |
| if (SrcOpIdx1 == CommuteAnyOperandIndex || |
| SrcOpIdx2 == CommuteAnyOperandIndex) { |
| unsigned CommutableOpIdx2 = SrcOpIdx2; |
| |
| // At least one of the operands to be commuted is not specified and |
| // this method is free to choose appropriate commutable operands. |
| if (SrcOpIdx1 == SrcOpIdx2) |
| // Neither operand is fixed. By default set one of the commutable |
| // operands to the last register operand of the instruction. |
| CommutableOpIdx2 = LastCommutableVecOp; |
| else if (SrcOpIdx2 == CommuteAnyOperandIndex) |
| // Only one of operands is not fixed. |
| CommutableOpIdx2 = SrcOpIdx1; |
| |
| // CommutableOpIdx2 is well defined now. Let's choose another commutable |
| // operand and assign its index to CommutableOpIdx1. |
| Register Op2Reg = MI.getOperand(CommutableOpIdx2).getReg(); |
| |
| unsigned CommutableOpIdx1; |
| for (CommutableOpIdx1 = LastCommutableVecOp; |
| CommutableOpIdx1 >= FirstCommutableVecOp; CommutableOpIdx1--) { |
| // Just ignore and skip the k-mask operand. |
| if (CommutableOpIdx1 == KMaskOp) |
| continue; |
| |
| // The commuted operands must have different registers. |
| // Otherwise, the commute transformation does not change anything and is |
| // therefore useless. |
| if (Op2Reg != MI.getOperand(CommutableOpIdx1).getReg()) |
| break; |
| } |
| |
| // No appropriate commutable operands were found. |
| if (CommutableOpIdx1 < FirstCommutableVecOp) |
| return false; |
| |
| // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpIdx2 |
| // to return those values. |
| if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, |
| CommutableOpIdx1, CommutableOpIdx2)) |
| return false; |
| } |
| |
| return true; |
| } |
| |
| bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI, |
| unsigned &SrcOpIdx1, |
| unsigned &SrcOpIdx2) const { |
| const MCInstrDesc &Desc = MI.getDesc(); |
| if (!Desc.isCommutable()) |
| return false; |
| |
| switch (MI.getOpcode()) { |
| case X86::CMPSDrr: |
| case X86::CMPSSrr: |
| case X86::CMPPDrri: |
| case X86::CMPPSrri: |
| case X86::VCMPSDrr: |
| case X86::VCMPSSrr: |
| case X86::VCMPPDrri: |
| case X86::VCMPPSrri: |
| case X86::VCMPPDYrri: |
| case X86::VCMPPSYrri: |
| case X86::VCMPSDZrr: |
| case X86::VCMPSSZrr: |
| case X86::VCMPPDZrri: |
| case X86::VCMPPSZrri: |
| case X86::VCMPSHZrr: |
| case X86::VCMPPHZrri: |
| case X86::VCMPPHZ128rri: |
| case X86::VCMPPHZ256rri: |
| case X86::VCMPPDZ128rri: |
| case X86::VCMPPSZ128rri: |
| case X86::VCMPPDZ256rri: |
| case X86::VCMPPSZ256rri: |
| case X86::VCMPPDZrrik: |
| case X86::VCMPPSZrrik: |
| case X86::VCMPPDZ128rrik: |
| case X86::VCMPPSZ128rrik: |
| case X86::VCMPPDZ256rrik: |
| case X86::VCMPPSZ256rrik: { |
| unsigned OpOffset = X86II::isKMasked(Desc.TSFlags) ? 1 : 0; |
| |
| // Float comparisons can be safely commuted for |
| // Ordered/Unordered/Equal/NotEqual tests. |
| unsigned Imm = MI.getOperand(3 + OpOffset).getImm() & 0x7; |
| switch (Imm) { |
| default: |
| // EVEX versions can be commuted. |
| if ((Desc.TSFlags & X86II::EncodingMask) == X86II::EVEX) |
| break; |
| return false; |
| case 0x00: // EQUAL |
| case 0x03: // UNORDERED |
| case 0x04: // NOT EQUAL |
| case 0x07: // ORDERED |
| break; |
| } |
| |
| // The indices of the commutable operands are 1 and 2 (or 2 and 3 |
| // when masked). |
| // Assign them to the returned operand indices here. |
| return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset, |
| 2 + OpOffset); |
| } |
| case X86::MOVSSrr: |
| // X86::MOVSDrr is always commutable. MOVSS is only commutable if we can |
| // form an SSE4.1 blend. We assume VMOVSSrr/VMOVSDrr are always commutable |
| // since AVX implies SSE4.1. |
| if (Subtarget.hasSSE41()) |
| return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); |
| return false; |
| case X86::SHUFPDrri: |
| // We can commute this to MOVSD. |