blob: 65e3c1e19fa90dd422300a5c9f407feab81926d5 [file] [log] [blame]
//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file describes the X86 SSE instruction set, defining the instructions,
// and properties of the instructions which are needed for code generation,
// machine code emission, and analysis.
//
//===----------------------------------------------------------------------===//
class OpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm> {
InstrItinClass rr = arg_rr;
InstrItinClass rm = arg_rm;
}
class SizeItins<OpndItins arg_s, OpndItins arg_d> {
OpndItins s = arg_s;
OpndItins d = arg_d;
}
class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
InstrItinClass arg_ri> {
InstrItinClass rr = arg_rr;
InstrItinClass rm = arg_rm;
InstrItinClass ri = arg_ri;
}
// scalar
def SSE_ALU_F32S : OpndItins<
IIC_SSE_ALU_F32S_RR, IIC_SSE_ALU_F32S_RM
>;
def SSE_ALU_F64S : OpndItins<
IIC_SSE_ALU_F64S_RR, IIC_SSE_ALU_F64S_RM
>;
def SSE_ALU_ITINS_S : SizeItins<
SSE_ALU_F32S, SSE_ALU_F64S
>;
def SSE_MUL_F32S : OpndItins<
IIC_SSE_MUL_F32S_RR, IIC_SSE_MUL_F64S_RM
>;
def SSE_MUL_F64S : OpndItins<
IIC_SSE_MUL_F64S_RR, IIC_SSE_MUL_F64S_RM
>;
def SSE_MUL_ITINS_S : SizeItins<
SSE_MUL_F32S, SSE_MUL_F64S
>;
def SSE_DIV_F32S : OpndItins<
IIC_SSE_DIV_F32S_RR, IIC_SSE_DIV_F64S_RM
>;
def SSE_DIV_F64S : OpndItins<
IIC_SSE_DIV_F64S_RR, IIC_SSE_DIV_F64S_RM
>;
def SSE_DIV_ITINS_S : SizeItins<
SSE_DIV_F32S, SSE_DIV_F64S
>;
// parallel
def SSE_ALU_F32P : OpndItins<
IIC_SSE_ALU_F32P_RR, IIC_SSE_ALU_F32P_RM
>;
def SSE_ALU_F64P : OpndItins<
IIC_SSE_ALU_F64P_RR, IIC_SSE_ALU_F64P_RM
>;
def SSE_ALU_ITINS_P : SizeItins<
SSE_ALU_F32P, SSE_ALU_F64P
>;
def SSE_MUL_F32P : OpndItins<
IIC_SSE_MUL_F32P_RR, IIC_SSE_MUL_F64P_RM
>;
def SSE_MUL_F64P : OpndItins<
IIC_SSE_MUL_F64P_RR, IIC_SSE_MUL_F64P_RM
>;
def SSE_MUL_ITINS_P : SizeItins<
SSE_MUL_F32P, SSE_MUL_F64P
>;
def SSE_DIV_F32P : OpndItins<
IIC_SSE_DIV_F32P_RR, IIC_SSE_DIV_F64P_RM
>;
def SSE_DIV_F64P : OpndItins<
IIC_SSE_DIV_F64P_RR, IIC_SSE_DIV_F64P_RM
>;
def SSE_DIV_ITINS_P : SizeItins<
SSE_DIV_F32P, SSE_DIV_F64P
>;
def SSE_BIT_ITINS_P : OpndItins<
IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
>;
def SSE_INTALU_ITINS_P : OpndItins<
IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
>;
def SSE_INTALUQ_ITINS_P : OpndItins<
IIC_SSE_INTALUQ_P_RR, IIC_SSE_INTALUQ_P_RM
>;
def SSE_INTMUL_ITINS_P : OpndItins<
IIC_SSE_INTMUL_P_RR, IIC_SSE_INTMUL_P_RM
>;
def SSE_INTSHIFT_ITINS_P : ShiftOpndItins<
IIC_SSE_INTSH_P_RR, IIC_SSE_INTSH_P_RM, IIC_SSE_INTSH_P_RI
>;
def SSE_MOVA_ITINS : OpndItins<
IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM
>;
def SSE_MOVU_ITINS : OpndItins<
IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM
>;
//===----------------------------------------------------------------------===//
// SSE 1 & 2 Instructions Classes
//===----------------------------------------------------------------------===//
/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
RegisterClass RC, X86MemOperand x86memop,
OpndItins itins,
bit Is2Addr = 1> {
let isCommutable = 1 in {
def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (OpNode RC:$src1, RC:$src2))], itins.rr>;
}
def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], itins.rm>;
}
/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
string asm, string SSEVer, string FPSizeStr,
Operand memopr, ComplexPattern mem_cpat,
OpndItins itins,
bit Is2Addr = 1> {
def rr_Int : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (!cast<Intrinsic>(
!strconcat("int_x86_sse", SSEVer, "_", OpcodeStr, FPSizeStr))
RC:$src1, RC:$src2))], itins.rr>;
def rm_Int : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (!cast<Intrinsic>(!strconcat("int_x86_sse",
SSEVer, "_", OpcodeStr, FPSizeStr))
RC:$src1, mem_cpat:$src2))], itins.rm>;
}
/// sse12_fp_packed - SSE 1 & 2 packed instructions class
multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
RegisterClass RC, ValueType vt,
X86MemOperand x86memop, PatFrag mem_frag,
Domain d, OpndItins itins, bit Is2Addr = 1> {
let isCommutable = 1 in
def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>;
let mayLoad = 1 in
def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
itins.rm, d>;
}
/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
string OpcodeStr, X86MemOperand x86memop,
list<dag> pat_rr, list<dag> pat_rm,
bit Is2Addr = 1,
bit rr_hasSideEffects = 0> {
let isCommutable = 1, neverHasSideEffects = rr_hasSideEffects in
def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
pat_rr, IIC_DEFAULT, d>;
def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
pat_rm, IIC_DEFAULT, d>;
}
/// sse12_fp_packed_int - SSE 1 & 2 packed instructions intrinsics class
multiclass sse12_fp_packed_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
string asm, string SSEVer, string FPSizeStr,
X86MemOperand x86memop, PatFrag mem_frag,
Domain d, OpndItins itins, bit Is2Addr = 1> {
def rr_Int : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (!cast<Intrinsic>(
!strconcat("int_x86_", SSEVer, "_", OpcodeStr, FPSizeStr))
RC:$src1, RC:$src2))], IIC_DEFAULT, d>;
def rm_Int : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1,x86memop:$src2),
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (!cast<Intrinsic>(
!strconcat("int_x86_", SSEVer, "_", OpcodeStr, FPSizeStr))
RC:$src1, (mem_frag addr:$src2)))], IIC_DEFAULT, d>;
}
//===----------------------------------------------------------------------===//
// Non-instruction patterns
//===----------------------------------------------------------------------===//
// A vector extract of the first f32/f64 position is a subregister copy
def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
(f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
(f64 (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>;
// A 128-bit subvector extract from the first 256-bit vector position
// is a subregister copy that needs no instruction.
def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (i32 0))),
(v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>;
def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (i32 0))),
(v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>;
def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (i32 0))),
(v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>;
def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (i32 0))),
(v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>;
def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (i32 0))),
(v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>;
def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (i32 0))),
(v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>;
// A 128-bit subvector insert to the first 256-bit vector position
// is a subregister copy that needs no instruction.
def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (i32 0)),
(INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (i32 0)),
(INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (i32 0)),
(INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (i32 0)),
(INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (i32 0)),
(INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (i32 0)),
(INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
// Implicitly promote a 32-bit scalar to a vector.
def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
(INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss)>;
def : Pat<(v8f32 (scalar_to_vector FR32:$src)),
(INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), FR32:$src, sub_ss)>;
// Implicitly promote a 64-bit scalar to a vector.
def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
(INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd)>;
def : Pat<(v4f64 (scalar_to_vector FR64:$src)),
(INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), FR64:$src, sub_sd)>;
// Bitcasts between 128-bit vector types. Return the original type since
// no instruction is needed for the conversion
let Predicates = [HasSSE2] in {
def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
}
// Bitcasts between 256-bit vector types. Return the original type since
// no instruction is needed for the conversion
let Predicates = [HasAVX] in {
def : Pat<(v4f64 (bitconvert (v8f32 VR256:$src))), (v4f64 VR256:$src)>;
def : Pat<(v4f64 (bitconvert (v8i32 VR256:$src))), (v4f64 VR256:$src)>;
def : Pat<(v4f64 (bitconvert (v4i64 VR256:$src))), (v4f64 VR256:$src)>;
def : Pat<(v4f64 (bitconvert (v16i16 VR256:$src))), (v4f64 VR256:$src)>;
def : Pat<(v4f64 (bitconvert (v32i8 VR256:$src))), (v4f64 VR256:$src)>;
def : Pat<(v8f32 (bitconvert (v8i32 VR256:$src))), (v8f32 VR256:$src)>;
def : Pat<(v8f32 (bitconvert (v4i64 VR256:$src))), (v8f32 VR256:$src)>;
def : Pat<(v8f32 (bitconvert (v4f64 VR256:$src))), (v8f32 VR256:$src)>;
def : Pat<(v8f32 (bitconvert (v32i8 VR256:$src))), (v8f32 VR256:$src)>;
def : Pat<(v8f32 (bitconvert (v16i16 VR256:$src))), (v8f32 VR256:$src)>;
def : Pat<(v4i64 (bitconvert (v8f32 VR256:$src))), (v4i64 VR256:$src)>;
def : Pat<(v4i64 (bitconvert (v8i32 VR256:$src))), (v4i64 VR256:$src)>;
def : Pat<(v4i64 (bitconvert (v4f64 VR256:$src))), (v4i64 VR256:$src)>;
def : Pat<(v4i64 (bitconvert (v32i8 VR256:$src))), (v4i64 VR256:$src)>;
def : Pat<(v4i64 (bitconvert (v16i16 VR256:$src))), (v4i64 VR256:$src)>;
def : Pat<(v32i8 (bitconvert (v4f64 VR256:$src))), (v32i8 VR256:$src)>;
def : Pat<(v32i8 (bitconvert (v4i64 VR256:$src))), (v32i8 VR256:$src)>;
def : Pat<(v32i8 (bitconvert (v8f32 VR256:$src))), (v32i8 VR256:$src)>;
def : Pat<(v32i8 (bitconvert (v8i32 VR256:$src))), (v32i8 VR256:$src)>;
def : Pat<(v32i8 (bitconvert (v16i16 VR256:$src))), (v32i8 VR256:$src)>;
def : Pat<(v8i32 (bitconvert (v32i8 VR256:$src))), (v8i32 VR256:$src)>;
def : Pat<(v8i32 (bitconvert (v16i16 VR256:$src))), (v8i32 VR256:$src)>;
def : Pat<(v8i32 (bitconvert (v8f32 VR256:$src))), (v8i32 VR256:$src)>;
def : Pat<(v8i32 (bitconvert (v4i64 VR256:$src))), (v8i32 VR256:$src)>;
def : Pat<(v8i32 (bitconvert (v4f64 VR256:$src))), (v8i32 VR256:$src)>;
def : Pat<(v16i16 (bitconvert (v8f32 VR256:$src))), (v16i16 VR256:$src)>;
def : Pat<(v16i16 (bitconvert (v8i32 VR256:$src))), (v16i16 VR256:$src)>;
def : Pat<(v16i16 (bitconvert (v4i64 VR256:$src))), (v16i16 VR256:$src)>;
def : Pat<(v16i16 (bitconvert (v4f64 VR256:$src))), (v16i16 VR256:$src)>;
def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))), (v16i16 VR256:$src)>;
}
// Alias instructions that map fld0 to pxor for sse.
// This is expanded by ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
isPseudo = 1 in {
def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
[(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1]>;
def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
[(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2]>;
}
//===----------------------------------------------------------------------===//
// AVX & SSE - Zero/One Vectors
//===----------------------------------------------------------------------===//
// Alias instruction that maps zero vector to pxor / xorp* for sse.
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
// swizzled by ExecutionDepsFix to pxor.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
isPseudo = 1, neverHasSideEffects = 1 in {
def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "", []>;
}
def : Pat<(v4f32 immAllZerosV), (V_SET0)>;
def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
// The same as done above but for AVX. The 256-bit ISA does not support PI,
// and doesn't need it because on sandy bridge the register is set to zero
// at the rename stage without using any execution unit, so SET0PSY
// and SET0PDY can be used for vector int instructions without penalty
// FIXME: Change encoding to pseudo! This is blocked right now by the x86
// JIT implementatioan, it does not expand the instructions below like
// X86MCInstLower does.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
isCodeGenOnly = 1 in {
let Predicates = [HasAVX] in {
def AVX_SET0PSY : PSI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "",
[(set VR256:$dst, (v8f32 immAllZerosV))]>, VEX_4V;
def AVX_SET0PDY : PDI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "",
[(set VR256:$dst, (v4f64 immAllZerosV))]>, VEX_4V;
}
let Predicates = [HasAVX2], neverHasSideEffects = 1 in
def AVX2_SET0 : PDI<0xef, MRMInitReg, (outs VR256:$dst), (ins), "",
[]>, VEX_4V;
}
let Predicates = [HasAVX2], AddedComplexity = 5 in {
def : Pat<(v4i64 immAllZerosV), (AVX2_SET0)>;
def : Pat<(v8i32 immAllZerosV), (AVX2_SET0)>;
def : Pat<(v16i16 immAllZerosV), (AVX2_SET0)>;
def : Pat<(v32i8 immAllZerosV), (AVX2_SET0)>;
}
// AVX has no support for 256-bit integer instructions, but since the 128-bit
// VPXOR instruction writes zero to its upper part, it's safe build zeros.
def : Pat<(v32i8 immAllZerosV), (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v32i8 (v8f32 immAllZerosV)),
(SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;
def : Pat<(v16i16 immAllZerosV), (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v16i16 (v8f32 immAllZerosV)),
(SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>;
def : Pat<(v8i32 immAllZerosV), (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v8i32 (v8f32 immAllZerosV)),
(SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;
def : Pat<(v4i64 immAllZerosV), (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v4i64 (v8f32 immAllZerosV)),
(SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-ones value if folding it would be beneficial.
// FIXME: Change encoding to pseudo! This is blocked right now by the x86
// JIT implementation, it does not expand the instructions below like
// X86MCInstLower does.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
isCodeGenOnly = 1, ExeDomain = SSEPackedInt in {
let Predicates = [HasAVX] in
def AVX_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
[(set VR128:$dst, (v4i32 immAllOnesV))]>, VEX_4V;
def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
[(set VR128:$dst, (v4i32 immAllOnesV))]>;
let Predicates = [HasAVX2] in
def AVX2_SETALLONES : PDI<0x76, MRMInitReg, (outs VR256:$dst), (ins), "",
[(set VR256:$dst, (v8i32 immAllOnesV))]>, VEX_4V;
}
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move FP Scalar Instructions
//
// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
// register copies because it's a partial register update; FsMOVAPSrr/FsMOVAPDrr
// is used instead. Register-to-register movss/movsd is not modeled as an
// INSERT_SUBREG because INSERT_SUBREG requires that the insert be implementable
// in terms of a copy, and just mentioned, we don't use movss/movsd for copies.
//===----------------------------------------------------------------------===//
class sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt, string asm> :
SI<0x10, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, RC:$src2), asm,
[(set VR128:$dst, (vt (OpNode VR128:$src1,
(scalar_to_vector RC:$src2))))],
IIC_SSE_MOV_S_RR>;
// Loading from memory automatically zeroing upper bits.
class sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
PatFrag mem_pat, string OpcodeStr> :
SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set RC:$dst, (mem_pat addr:$src))],
IIC_SSE_MOV_S_RM>;
// AVX
def VMOVSSrr : sse12_move_rr<FR32, X86Movss, v4f32,
"movss\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XS, VEX_4V,
VEX_LIG;
def VMOVSDrr : sse12_move_rr<FR64, X86Movsd, v2f64,
"movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XD, VEX_4V,
VEX_LIG;
// For the disassembler
let isCodeGenOnly = 1 in {
def VMOVSSrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src1, FR32:$src2),
"movss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
IIC_SSE_MOV_S_RR>,
XS, VEX_4V, VEX_LIG;
def VMOVSDrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src1, FR64:$src2),
"movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
IIC_SSE_MOV_S_RR>,
XD, VEX_4V, VEX_LIG;
}
let canFoldAsLoad = 1, isReMaterializable = 1 in {
def VMOVSSrm : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS, VEX,
VEX_LIG;
let AddedComplexity = 20 in
def VMOVSDrm : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD, VEX,
VEX_LIG;
}
def VMOVSSmr : SI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src),
"movss\t{$src, $dst|$dst, $src}",
[(store FR32:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
XS, VEX, VEX_LIG;
def VMOVSDmr : SI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src),
"movsd\t{$src, $dst|$dst, $src}",
[(store FR64:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
XD, VEX, VEX_LIG;
// SSE1 & 2
let Constraints = "$src1 = $dst" in {
def MOVSSrr : sse12_move_rr<FR32, X86Movss, v4f32,
"movss\t{$src2, $dst|$dst, $src2}">, XS;
def MOVSDrr : sse12_move_rr<FR64, X86Movsd, v2f64,
"movsd\t{$src2, $dst|$dst, $src2}">, XD;
// For the disassembler
let isCodeGenOnly = 1 in {
def MOVSSrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src1, FR32:$src2),
"movss\t{$src2, $dst|$dst, $src2}", [],
IIC_SSE_MOV_S_RR>, XS;
def MOVSDrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src1, FR64:$src2),
"movsd\t{$src2, $dst|$dst, $src2}", [],
IIC_SSE_MOV_S_RR>, XD;
}
}
let canFoldAsLoad = 1, isReMaterializable = 1 in {
def MOVSSrm : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS;
let AddedComplexity = 20 in
def MOVSDrm : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD;
}
def MOVSSmr : SSI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src),
"movss\t{$src, $dst|$dst, $src}",
[(store FR32:$src, addr:$dst)], IIC_SSE_MOV_S_MR>;
def MOVSDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src),
"movsd\t{$src, $dst|$dst, $src}",
[(store FR64:$src, addr:$dst)], IIC_SSE_MOV_S_MR>;
// Patterns
let Predicates = [HasAVX] in {
let AddedComplexity = 15 in {
// Move scalar to XMM zero-extended, zeroing a VR128 then do a
// MOVS{S,D} to the lower bits.
def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
(VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
(VMOVSSrr (v4f32 (V_SET0)),
(f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)))>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
(VMOVSSrr (v4i32 (V_SET0)),
(EXTRACT_SUBREG (v4i32 VR128:$src), sub_ss))>;
def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
(VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
// Move low f32 and clear high bits.
def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
(SUBREG_TO_REG (i32 0),
(VMOVSSrr (v4f32 (V_SET0)),
(EXTRACT_SUBREG (v8f32 VR256:$src), sub_ss)), sub_xmm)>;
def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
(SUBREG_TO_REG (i32 0),
(VMOVSSrr (v4i32 (V_SET0)),
(EXTRACT_SUBREG (v8i32 VR256:$src), sub_ss)), sub_xmm)>;
}
let AddedComplexity = 20 in {
// MOVSSrm zeros the high parts of the register; represent this
// with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
(SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>;
def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
(SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>;
def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
(SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>;
// MOVSDrm zeros the high parts of the register; represent this
// with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
(SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
(SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
(SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
(SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
def : Pat<(v2f64 (X86vzload addr:$src)),
(SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
// Represent the same patterns above but in the form they appear for
// 256-bit types
def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
(v4i32 (scalar_to_vector (loadi32 addr:$src))), (i32 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>;
def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
(v4f32 (scalar_to_vector (loadf32 addr:$src))), (i32 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>;
def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
(v2f64 (scalar_to_vector (loadf64 addr:$src))), (i32 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_sd)>;
}
def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
(v4f32 (scalar_to_vector FR32:$src)), (i32 0)))),
(SUBREG_TO_REG (i32 0),
(v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)),
sub_xmm)>;
def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
(v2f64 (scalar_to_vector FR64:$src)), (i32 0)))),
(SUBREG_TO_REG (i64 0),
(v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),
sub_xmm)>;
def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
(v2i64 (scalar_to_vector (loadi64 addr:$src))), (i32 0)))),
(SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
// Move low f64 and clear high bits.
def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
(SUBREG_TO_REG (i32 0),
(VMOVSDrr (v2f64 (V_SET0)),
(EXTRACT_SUBREG (v4f64 VR256:$src), sub_sd)), sub_xmm)>;
def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
(SUBREG_TO_REG (i32 0),
(VMOVSDrr (v2i64 (V_SET0)),
(EXTRACT_SUBREG (v4i64 VR256:$src), sub_sd)), sub_xmm)>;
// Extract and store.
def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
addr:$dst),
(VMOVSSmr addr:$dst,
(EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
addr:$dst),
(VMOVSDmr addr:$dst,
(EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>;
// Shuffle with VMOVSS
def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
(VMOVSSrr (v4i32 VR128:$src1),
(EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;
def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
(VMOVSSrr (v4f32 VR128:$src1),
(EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>;
// 256-bit variants
def : Pat<(v8i32 (X86Movss VR256:$src1, VR256:$src2)),
(SUBREG_TO_REG (i32 0),
(VMOVSSrr (EXTRACT_SUBREG (v8i32 VR256:$src1), sub_ss),
(EXTRACT_SUBREG (v8i32 VR256:$src2), sub_ss)), sub_xmm)>;
def : Pat<(v8f32 (X86Movss VR256:$src1, VR256:$src2)),
(SUBREG_TO_REG (i32 0),
(VMOVSSrr (EXTRACT_SUBREG (v8f32 VR256:$src1), sub_ss),
(EXTRACT_SUBREG (v8f32 VR256:$src2), sub_ss)), sub_xmm)>;
// Shuffle with VMOVSD
def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
(VMOVSDrr (v2i64 VR128:$src1),
(EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;
def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
(VMOVSDrr (v2f64 VR128:$src1),
(EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>;
def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
(VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),
sub_sd))>;
def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
(VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),
sub_sd))>;
// 256-bit variants
def : Pat<(v4i64 (X86Movsd VR256:$src1, VR256:$src2)),
(SUBREG_TO_REG (i32 0),
(VMOVSDrr (EXTRACT_SUBREG (v4i64 VR256:$src1), sub_sd),
(EXTRACT_SUBREG (v4i64 VR256:$src2), sub_sd)), sub_xmm)>;
def : Pat<(v4f64 (X86Movsd VR256:$src1, VR256:$src2)),
(SUBREG_TO_REG (i32 0),
(VMOVSDrr (EXTRACT_SUBREG (v4f64 VR256:$src1), sub_sd),
(EXTRACT_SUBREG (v4f64 VR256:$src2), sub_sd)), sub_xmm)>;
// FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
// is during lowering, where it's not possible to recognize the fold cause
// it has two uses through a bitcast. One use disappears at isel time and the
// fold opportunity reappears.
def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
(VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v2f64 VR128:$src2),
sub_sd))>;
def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
(VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v2i64 VR128:$src2),
sub_sd))>;
def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
(VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),
sub_sd))>;
def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
(VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),
sub_sd))>;
}
let Predicates = [HasSSE1] in {
let AddedComplexity = 15 in {
// Move scalar to XMM zero-extended, zeroing a VR128 then do a
// MOVSS to the lower bits.
def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
(MOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
(MOVSSrr (v4f32 (V_SET0)),
(f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)))>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
(MOVSSrr (v4i32 (V_SET0)),
(EXTRACT_SUBREG (v4i32 VR128:$src), sub_ss))>;
}
let AddedComplexity = 20 in {
// MOVSSrm zeros the high parts of the register; represent this
// with SUBREG_TO_REG.
def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
(SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
(SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
(SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
}
// Extract and store.
def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
addr:$dst),
(MOVSSmr addr:$dst,
(EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
// Shuffle with MOVSS
def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
(MOVSSrr (v4i32 VR128:$src1),
(EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;
def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
(MOVSSrr (v4f32 VR128:$src1),
(EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>;
}
let Predicates = [HasSSE2] in {
let AddedComplexity = 15 in {
// Move scalar to XMM zero-extended, zeroing a VR128 then do a
// MOVSD to the lower bits.
def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
(MOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
}
let AddedComplexity = 20 in {
// MOVSDrm zeros the high parts of the register; represent this
// with SUBREG_TO_REG.
def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
(SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
(SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
(SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
(SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
def : Pat<(v2f64 (X86vzload addr:$src)),
(SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
}
// Extract and store.
def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
addr:$dst),
(MOVSDmr addr:$dst,
(EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>;
// Shuffle with MOVSD
def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
(MOVSDrr (v2i64 VR128:$src1),
(EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;
def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
(MOVSDrr (v2f64 VR128:$src1),
(EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>;
def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
(MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),sub_sd))>;
def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
(MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),sub_sd))>;
// FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
// is during lowering, where it's not possible to recognize the fold cause
// it has two uses through a bitcast. One use disappears at isel time and the
// fold opportunity reappears.
def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
(MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v2f64 VR128:$src2),sub_sd))>;
def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
(MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v2i64 VR128:$src2),sub_sd))>;
def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
(MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),sub_sd))>;
def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
(MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),sub_sd))>;
}
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
//===----------------------------------------------------------------------===//
multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
X86MemOperand x86memop, PatFrag ld_frag,
string asm, Domain d,
OpndItins itins,
bit IsReMaterializable = 1> {
let neverHasSideEffects = 1 in
def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>;
let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in
def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
[(set RC:$dst, (ld_frag addr:$src))], itins.rm, d>;
}
defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
"movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
TB, VEX;
defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
"movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
TB, OpSize, VEX;
defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
"movups", SSEPackedSingle, SSE_MOVU_ITINS>,
TB, VEX;
defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
"movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
TB, OpSize, VEX;
defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32,
"movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
TB, VEX;
defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64,
"movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
TB, OpSize, VEX;
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
"movups", SSEPackedSingle, SSE_MOVU_ITINS>,
TB, VEX;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
"movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
TB, OpSize, VEX;
defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
"movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
TB;
defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
"movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
TB, OpSize;
defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
"movups", SSEPackedSingle, SSE_MOVU_ITINS>,
TB;
defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
"movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
TB, OpSize;
def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movaps\t{$src, $dst|$dst, $src}",
[(alignedstore (v4f32 VR128:$src), addr:$dst)],
IIC_SSE_MOVA_P_MR>, VEX;
def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movapd\t{$src, $dst|$dst, $src}",
[(alignedstore (v2f64 VR128:$src), addr:$dst)],
IIC_SSE_MOVA_P_MR>, VEX;
def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movups\t{$src, $dst|$dst, $src}",
[(store (v4f32 VR128:$src), addr:$dst)],
IIC_SSE_MOVU_P_MR>, VEX;
def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movupd\t{$src, $dst|$dst, $src}",
[(store (v2f64 VR128:$src), addr:$dst)],
IIC_SSE_MOVU_P_MR>, VEX;
def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movaps\t{$src, $dst|$dst, $src}",
[(alignedstore256 (v8f32 VR256:$src), addr:$dst)],
IIC_SSE_MOVA_P_MR>, VEX;
def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movapd\t{$src, $dst|$dst, $src}",
[(alignedstore256 (v4f64 VR256:$src), addr:$dst)],
IIC_SSE_MOVA_P_MR>, VEX;
def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movups\t{$src, $dst|$dst, $src}",
[(store (v8f32 VR256:$src), addr:$dst)],
IIC_SSE_MOVU_P_MR>, VEX;
def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movupd\t{$src, $dst|$dst, $src}",
[(store (v4f64 VR256:$src), addr:$dst)],
IIC_SSE_MOVU_P_MR>, VEX;
// For disassembler
let isCodeGenOnly = 1 in {
def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src),
"movaps\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVA_P_RR>, VEX;
def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src),
"movapd\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVA_P_RR>, VEX;
def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src),
"movups\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVU_P_RR>, VEX;
def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src),
"movupd\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVU_P_RR>, VEX;
def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
"movaps\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVA_P_RR>, VEX;
def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
"movapd\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVA_P_RR>, VEX;
def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
"movups\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVU_P_RR>, VEX;
def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
"movupd\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVU_P_RR>, VEX;
}
let Predicates = [HasAVX] in {
def : Pat<(v8i32 (X86vzmovl
(insert_subvector undef, (v4i32 VR128:$src), (i32 0)))),
(SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
def : Pat<(v4i64 (X86vzmovl
(insert_subvector undef, (v2i64 VR128:$src), (i32 0)))),
(SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
def : Pat<(v8f32 (X86vzmovl
(insert_subvector undef, (v4f32 VR128:$src), (i32 0)))),
(SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
def : Pat<(v4f64 (X86vzmovl
(insert_subvector undef, (v2f64 VR128:$src), (i32 0)))),
(SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
}
def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src),
(VMOVUPSYmr addr:$dst, VR256:$src)>;
def : Pat<(int_x86_avx_storeu_pd_256 addr:$dst, VR256:$src),
(VMOVUPDYmr addr:$dst, VR256:$src)>;
def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movaps\t{$src, $dst|$dst, $src}",
[(alignedstore (v4f32 VR128:$src), addr:$dst)],
IIC_SSE_MOVA_P_MR>;
def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movapd\t{$src, $dst|$dst, $src}",
[(alignedstore (v2f64 VR128:$src), addr:$dst)],
IIC_SSE_MOVA_P_MR>;
def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movups\t{$src, $dst|$dst, $src}",
[(store (v4f32 VR128:$src), addr:$dst)],
IIC_SSE_MOVU_P_MR>;
def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movupd\t{$src, $dst|$dst, $src}",
[(store (v2f64 VR128:$src), addr:$dst)],
IIC_SSE_MOVU_P_MR>;
// For disassembler
let isCodeGenOnly = 1 in {
def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movaps\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVA_P_RR>;
def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movapd\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVA_P_RR>;
def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movups\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVU_P_RR>;
def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movupd\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVU_P_RR>;
}
let Predicates = [HasAVX] in {
def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
(VMOVUPSmr addr:$dst, VR128:$src)>;
def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
(VMOVUPDmr addr:$dst, VR128:$src)>;
}
let Predicates = [HasSSE1] in
def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
(MOVUPSmr addr:$dst, VR128:$src)>;
let Predicates = [HasSSE2] in
def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
(MOVUPDmr addr:$dst, VR128:$src)>;
// Use vmovaps/vmovups for AVX integer load/store.
let Predicates = [HasAVX] in {
// 128-bit load/store
def : Pat<(alignedloadv2i64 addr:$src),
(VMOVAPSrm addr:$src)>;
def : Pat<(loadv2i64 addr:$src),
(VMOVUPSrm addr:$src)>;
def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
(VMOVAPSmr addr:$dst, VR128:$src)>;
def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
(VMOVAPSmr addr:$dst, VR128:$src)>;
def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
(VMOVAPSmr addr:$dst, VR128:$src)>;
def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
(VMOVAPSmr addr:$dst, VR128:$src)>;
def : Pat<(store (v2i64 VR128:$src), addr:$dst),
(VMOVUPSmr addr:$dst, VR128:$src)>;
def : Pat<(store (v4i32 VR128:$src), addr:$dst),
(VMOVUPSmr addr:$dst, VR128:$src)>;
def : Pat<(store (v8i16 VR128:$src), addr:$dst),
(VMOVUPSmr addr:$dst, VR128:$src)>;
def : Pat<(store (v16i8 VR128:$src), addr:$dst),
(VMOVUPSmr addr:$dst, VR128:$src)>;
// 256-bit load/store
def : Pat<(alignedloadv4i64 addr:$src),
(VMOVAPSYrm addr:$src)>;
def : Pat<(loadv4i64 addr:$src),
(VMOVUPSYrm addr:$src)>;
def : Pat<(alignedstore256 (v4i64 VR256:$src), addr:$dst),
(VMOVAPSYmr addr:$dst, VR256:$src)>;
def : Pat<(alignedstore256 (v8i32 VR256:$src), addr:$dst),
(VMOVAPSYmr addr:$dst, VR256:$src)>;
def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst),
(VMOVAPSYmr addr:$dst, VR256:$src)>;
def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst),
(VMOVAPSYmr addr:$dst, VR256:$src)>;
def : Pat<(store (v4i64 VR256:$src), addr:$dst),
(VMOVUPSYmr addr:$dst, VR256:$src)>;
def : Pat<(store (v8i32 VR256:$src), addr:$dst),
(VMOVUPSYmr addr:$dst, VR256:$src)>;
def : Pat<(store (v16i16 VR256:$src), addr:$dst),
(VMOVUPSYmr addr:$dst, VR256:$src)>;
def : Pat<(store (v32i8 VR256:$src), addr:$dst),
(VMOVUPSYmr addr:$dst, VR256:$src)>;
}
// Use movaps / movups for SSE integer load / store (one byte shorter).
// The instructions selected below are then converted to MOVDQA/MOVDQU
// during the SSE domain pass.
let Predicates = [HasSSE1] in {
def : Pat<(alignedloadv2i64 addr:$src),
(MOVAPSrm addr:$src)>;
def : Pat<(loadv2i64 addr:$src),
(MOVUPSrm addr:$src)>;
def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
(MOVAPSmr addr:$dst, VR128:$src)>;
def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
(MOVAPSmr addr:$dst, VR128:$src)>;
def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
(MOVAPSmr addr:$dst, VR128:$src)>;
def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
(MOVAPSmr addr:$dst, VR128:$src)>;
def : Pat<(store (v2i64 VR128:$src), addr:$dst),
(MOVUPSmr addr:$dst, VR128:$src)>;
def : Pat<(store (v4i32 VR128:$src), addr:$dst),
(MOVUPSmr addr:$dst, VR128:$src)>;
def : Pat<(store (v8i16 VR128:$src), addr:$dst),
(MOVUPSmr addr:$dst, VR128:$src)>;
def : Pat<(store (v16i8 VR128:$src), addr:$dst),
(MOVUPSmr addr:$dst, VR128:$src)>;
}
// Alias instruction to do FR32 or FR64 reg-to-reg copy using movaps. Upper
// bits are disregarded. FIXME: Set encoding to pseudo!
let neverHasSideEffects = 1 in {
def FsVMOVAPSrr : VPSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
"movaps\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVA_P_RR>, VEX;
def FsVMOVAPDrr : VPDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
"movapd\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVA_P_RR>, VEX;
def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
"movaps\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVA_P_RR>;
def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
"movapd\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVA_P_RR>;
}
// Alias instruction to load FR32 or FR64 from f128mem using movaps. Upper
// bits are disregarded. FIXME: Set encoding to pseudo!
let canFoldAsLoad = 1, isReMaterializable = 1 in {
let isCodeGenOnly = 1 in {
def FsVMOVAPSrm : VPSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
"movaps\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (alignedloadfsf32 addr:$src))],
IIC_SSE_MOVA_P_RM>, VEX;
def FsVMOVAPDrm : VPDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
"movapd\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (alignedloadfsf64 addr:$src))],
IIC_SSE_MOVA_P_RM>, VEX;
}
def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
"movaps\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (alignedloadfsf32 addr:$src))],
IIC_SSE_MOVA_P_RM>;
def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
"movapd\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (alignedloadfsf64 addr:$src))],
IIC_SSE_MOVA_P_RM>;
}
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low packed FP Instructions
//===----------------------------------------------------------------------===//
multiclass sse12_mov_hilo_packed<bits<8>opc, RegisterClass RC,
SDNode psnode, SDNode pdnode, string base_opc,
string asm_opr, InstrItinClass itin> {
def PSrm : PI<opc, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
!strconcat(base_opc, "s", asm_opr),
[(set RC:$dst,
(psnode RC:$src1,
(bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))],
itin, SSEPackedSingle>, TB;
def PDrm : PI<opc, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, f64mem:$src2),
!strconcat(base_opc, "d", asm_opr),
[(set RC:$dst, (v2f64 (pdnode RC:$src1,
(scalar_to_vector (loadf64 addr:$src2)))))],
itin, SSEPackedDouble>, TB, OpSize;
}
let AddedComplexity = 20 in {
defm VMOVL : sse12_mov_hilo_packed<0x12, VR128, X86Movlps, X86Movlpd, "movlp",
"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
IIC_SSE_MOV_LH>, VEX_4V;
}
let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
defm MOVL : sse12_mov_hilo_packed<0x12, VR128, X86Movlps, X86Movlpd, "movlp",
"\t{$src2, $dst|$dst, $src2}",
IIC_SSE_MOV_LH>;
}
def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlps\t{$src, $dst|$dst, $src}",
[(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
(iPTR 0))), addr:$dst)],
IIC_SSE_MOV_LH>, VEX;
def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlpd\t{$src, $dst|$dst, $src}",
[(store (f64 (vector_extract (v2f64 VR128:$src),
(iPTR 0))), addr:$dst)],
IIC_SSE_MOV_LH>, VEX;
def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlps\t{$src, $dst|$dst, $src}",
[(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
(iPTR 0))), addr:$dst)],
IIC_SSE_MOV_LH>;
def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlpd\t{$src, $dst|$dst, $src}",
[(store (f64 (vector_extract (v2f64 VR128:$src),
(iPTR 0))), addr:$dst)],
IIC_SSE_MOV_LH>;
let Predicates = [HasAVX] in {
// Shuffle with VMOVLPS
def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
(VMOVLPSrm VR128:$src1, addr:$src2)>;
def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
(VMOVLPSrm VR128:$src1, addr:$src2)>;
// Shuffle with VMOVLPD
def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
(VMOVLPDrm VR128:$src1, addr:$src2)>;
def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
(VMOVLPDrm VR128:$src1, addr:$src2)>;
// Store patterns
def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
addr:$src1),
(VMOVLPSmr addr:$src1, VR128:$src2)>;
def : Pat<(store (v4i32 (X86Movlps
(bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), addr:$src1),
(VMOVLPSmr addr:$src1, VR128:$src2)>;
def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
addr:$src1),
(VMOVLPDmr addr:$src1, VR128:$src2)>;
def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
addr:$src1),
(VMOVLPDmr addr:$src1, VR128:$src2)>;
}
let Predicates = [HasSSE1] in {
// (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
def : Pat<(store (i64 (vector_extract (bc_v2i64 (v4f32 VR128:$src2)),
(iPTR 0))), addr:$src1),
(MOVLPSmr addr:$src1, VR128:$src2)>;
// Shuffle with MOVLPS
def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
(MOVLPSrm VR128:$src1, addr:$src2)>;
def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
(MOVLPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86Movlps VR128:$src1,
(bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
(MOVLPSrm VR128:$src1, addr:$src2)>;
// Store patterns
def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
addr:$src1),
(MOVLPSmr addr:$src1, VR128:$src2)>;
def : Pat<(store (v4i32 (X86Movlps
(bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)),
addr:$src1),
(MOVLPSmr addr:$src1, VR128:$src2)>;
}
let Predicates = [HasSSE2] in {
// Shuffle with MOVLPD
def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
(MOVLPDrm VR128:$src1, addr:$src2)>;
def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
(MOVLPDrm VR128:$src1, addr:$src2)>;
// Store patterns
def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
addr:$src1),
(MOVLPDmr addr:$src1, VR128:$src2)>;
def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
addr:$src1),
(MOVLPDmr addr:$src1, VR128:$src2)>;
}
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Hi packed FP Instructions
//===----------------------------------------------------------------------===//
let AddedComplexity = 20 in {
defm VMOVH : sse12_mov_hilo_packed<0x16, VR128, X86Movlhps, X86Movlhpd, "movhp",
"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
IIC_SSE_MOV_LH>, VEX_4V;
}
let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
defm MOVH : sse12_mov_hilo_packed<0x16, VR128, X86Movlhps, X86Movlhpd, "movhp",
"\t{$src2, $dst|$dst, $src2}",
IIC_SSE_MOV_LH>;
}
// v2f64 extract element 1 is always custom lowered to unpack high to low
// and extract element 0 so the non-store version isn't too horrible.
def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhps\t{$src, $dst|$dst, $src}",
[(store (f64 (vector_extract
(X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
(bc_v2f64 (v4f32 VR128:$src))),
(iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhpd\t{$src, $dst|$dst, $src}",
[(store (f64 (vector_extract
(v2f64 (X86Unpckh VR128:$src, VR128:$src)),
(iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhps\t{$src, $dst|$dst, $src}",
[(store (f64 (vector_extract
(X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
(bc_v2f64 (v4f32 VR128:$src))),
(iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhpd\t{$src, $dst|$dst, $src}",
[(store (f64 (vector_extract
(v2f64 (X86Unpckh VR128:$src, VR128:$src)),
(iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
let Predicates = [HasAVX] in {
// VMOVHPS patterns
def : Pat<(X86Movlhps VR128:$src1,
(bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
(VMOVHPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86Movlhps VR128:$src1,
(bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
(VMOVHPSrm VR128:$src1, addr:$src2)>;
// FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
// is during lowering, where it's not possible to recognize the load fold
// cause it has two uses through a bitcast. One use disappears at isel time
// and the fold opportunity reappears.
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
(scalar_to_vector (loadf64 addr:$src2)))),
(VMOVHPDrm VR128:$src1, addr:$src2)>;
}
let Predicates = [HasSSE1] in {
// MOVHPS patterns
def : Pat<(X86Movlhps VR128:$src1,
(bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
(MOVHPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86Movlhps VR128:$src1,
(bc_v4f32 (v2i64 (X86vzload addr:$src2)))),
(MOVHPSrm VR128:$src1, addr:$src2)>;
}
let Predicates = [HasSSE2] in {
// FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
// is during lowering, where it's not possible to recognize the load fold
// cause it has two uses through a bitcast. One use disappears at isel time
// and the fold opportunity reappears.
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
(scalar_to_vector (loadf64 addr:$src2)))),
(MOVHPDrm VR128:$src1, addr:$src2)>;
}
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
//===----------------------------------------------------------------------===//
let AddedComplexity = 20 in {
def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
"movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
IIC_SSE_MOV_LH>,
VEX_4V;
def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
"movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
IIC_SSE_MOV_LH>,
VEX_4V;
}
let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
"movlhps\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
IIC_SSE_MOV_LH>;
def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
"movhlps\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
IIC_SSE_MOV_LH>;
}
let Predicates = [HasAVX] in {
// MOVLHPS patterns
def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
(VMOVLHPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
(VMOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
// MOVHLPS patterns
def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
(VMOVHLPSrr VR128:$src1, VR128:$src2)>;
}
let Predicates = [HasSSE1] in {
// MOVLHPS patterns
def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
(MOVLHPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
(MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
// MOVHLPS patterns
def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
(MOVHLPSrr VR128:$src1, VR128:$src2)>;
}
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Conversion Instructions
//===----------------------------------------------------------------------===//
def SSE_CVT_PD : OpndItins<
IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM
>;
def SSE_CVT_PS : OpndItins<
IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM
>;
def SSE_CVT_Scalar : OpndItins<
IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM
>;
def SSE_CVT_SS2SI_32 : OpndItins<
IIC_SSE_CVT_SS2SI32_RR, IIC_SSE_CVT_SS2SI32_RM
>;
def SSE_CVT_SS2SI_64 : OpndItins<
IIC_SSE_CVT_SS2SI64_RR, IIC_SSE_CVT_SS2SI64_RM
>;
def SSE_CVT_SD2SI : OpndItins<
IIC_SSE_CVT_SD2SI_RR, IIC_SSE_CVT_SD2SI_RM
>;
multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
string asm, OpndItins itins> {
def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
[(set DstRC:$dst, (OpNode SrcRC:$src))],
itins.rr>;
def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
[(set DstRC:$dst, (OpNode (ld_frag addr:$src)))],
itins.rm>;
}
multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
string asm, Domain d, OpndItins itins> {
def rr : PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
[(set DstRC:$dst, (OpNode SrcRC:$src))],
itins.rr, d>;
def rm : PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
[(set DstRC:$dst, (OpNode (ld_frag addr:$src)))],
itins.rm, d>;
}
multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
X86MemOperand x86memop, string asm> {
let neverHasSideEffects = 1 in {
def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
!strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>;
let mayLoad = 1 in
def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
(ins DstRC:$src1, x86memop:$src),
!strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>;
} // neverHasSideEffects = 1
}
defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
"cvttss2si\t{$src, $dst|$dst, $src}",
SSE_CVT_SS2SI_32>,
XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
"cvttss2si\t{$src, $dst|$dst, $src}",
SSE_CVT_SS2SI_64>,
XS, VEX, VEX_W, VEX_LIG;
defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
"cvttsd2si\t{$src, $dst|$dst, $src}",
SSE_CVT_SD2SI>,
XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
"cvttsd2si\t{$src, $dst|$dst, $src}",
SSE_CVT_SD2SI>,
XD, VEX, VEX_W, VEX_LIG;
// The assembler can recognize rr 64-bit instructions by seeing a rxx
// register, but the same isn't true when only using memory operands,
// provide other assembly "l" and "q" forms to address this explicitly
// where appropriate to do so.
defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss">,
XS, VEX_4V, VEX_LIG;
defm VCVTSI2SS64 : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}">,
XS, VEX_4V, VEX_W, VEX_LIG;
defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd">,
XD, VEX_4V, VEX_LIG;
defm VCVTSI2SDL : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">,
XD, VEX_4V, VEX_LIG;
defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">,
XD, VEX_4V, VEX_W, VEX_LIG;
let Predicates = [HasAVX], AddedComplexity = 1 in {
def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
(VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
(VCVTSI2SS64rm (f32 (IMPLICIT_DEF)), addr:$src)>;
def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
(VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
(VCVTSI2SD64rm (f64 (IMPLICIT_DEF)), addr:$src)>;
def : Pat<(f32 (sint_to_fp GR32:$src)),
(VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
def : Pat<(f32 (sint_to_fp GR64:$src)),
(VCVTSI2SS64rr (f32 (IMPLICIT_DEF)), GR64:$src)>;
def : Pat<(f64 (sint_to_fp GR32:$src)),
(VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
def : Pat<(f64 (sint_to_fp GR64:$src)),
(VCVTSI2SD64rr (f64 (IMPLICIT_DEF)), GR64:$src)>;
}
defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
"cvttss2si\t{$src, $dst|$dst, $src}",
SSE_CVT_SS2SI_32>, XS;
defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
"cvttss2si{q}\t{$src, $dst|$dst, $src}",
SSE_CVT_SS2SI_64>, XS, REX_W;
defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
"cvttsd2si\t{$src, $dst|$dst, $src}",
SSE_CVT_SD2SI>, XD;
defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
SSE_CVT_SD2SI>, XD, REX_W;
defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
"cvtsi2ss\t{$src, $dst|$dst, $src}",
SSE_CVT_Scalar>, XS;
defm CVTSI2SS64 : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
"cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
SSE_CVT_Scalar>, XS, REX_W;
defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
"cvtsi2sd\t{$src, $dst|$dst, $src}",
SSE_CVT_Scalar>, XD;
defm CVTSI2SD64 : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
"cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
SSE_CVT_Scalar>, XD, REX_W;
// Conversion Instructions Intrinsics - Match intrinsics which expect MM
// and/or XMM operand(s).
multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag,
string asm, OpndItins itins> {
def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
[(set DstRC:$dst, (Int SrcRC:$src))], itins.rr>;
def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
[(set DstRC:$dst, (Int (ld_frag addr:$src)))], itins.rm>;
}
multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop,
PatFrag ld_frag, string asm, OpndItins itins,
bit Is2Addr = 1> {
def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
itins.rr>;
def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
(ins DstRC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
itins.rm>;
}
defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
f128mem, load, "cvtsd2si", SSE_CVT_SD2SI>, XD, VEX, VEX_LIG;
defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
int_x86_sse2_cvtsd2si64, f128mem, load, "cvtsd2si",
SSE_CVT_SD2SI>, XD, VEX, VEX_W, VEX_LIG;
defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
f128mem, load, "cvtsd2si{l}", SSE_CVT_SD2SI>, XD;
defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64,
f128mem, load, "cvtsd2si{q}", SSE_CVT_SD2SI>, XD, REX_W;
defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss",
SSE_CVT_Scalar, 0>, XS, VEX_4V;
defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss",
SSE_CVT_Scalar, 0>, XS, VEX_4V,
VEX_W;
defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd",
SSE_CVT_Scalar, 0>, XD, VEX_4V;
defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd",
SSE_CVT_Scalar, 0>, XD,
VEX_4V, VEX_W;
let Constraints = "$src1 = $dst" in {
defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
int_x86_sse_cvtsi2ss, i32mem, loadi32,
"cvtsi2ss", SSE_CVT_Scalar>, XS;
defm Int_CVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
int_x86_sse_cvtsi642ss, i64mem, loadi64,
"cvtsi2ss{q}", SSE_CVT_Scalar>, XS, REX_W;
defm Int_CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
int_x86_sse2_cvtsi2sd, i32mem, loadi32,
"cvtsi2sd", SSE_CVT_Scalar>, XD;
defm Int_CVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
int_x86_sse2_cvtsi642sd, i64mem, loadi64,
"cvtsi2sd", SSE_CVT_Scalar>, XD, REX_W;
}
/// SSE 1 Only
// Aliases for intrinsics
defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
f32mem, load, "cvttss2si",
SSE_CVT_SS2SI_32>, XS, VEX;
defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
int_x86_sse_cvttss2si64, f32mem, load,
"cvttss2si", SSE_CVT_SS2SI_64>,
XS, VEX, VEX_W;
defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
f128mem, load, "cvttsd2si", SSE_CVT_SD2SI>,
XD, VEX;
defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
int_x86_sse2_cvttsd2si64, f128mem, load,
"cvttsd2si", SSE_CVT_SD2SI>,
XD, VEX, VEX_W;
defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
f32mem, load, "cvttss2si",
SSE_CVT_SS2SI_32>, XS;
defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
int_x86_sse_cvttss2si64, f32mem, load,
"cvttss2si{q}", SSE_CVT_SS2SI_64>,
XS, REX_W;
defm Int_CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
f128mem, load, "cvttsd2si", SSE_CVT_SD2SI>,
XD;
defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
int_x86_sse2_cvttsd2si64, f128mem, load,
"cvttsd2si{q}", SSE_CVT_SD2SI>,
XD, REX_W;
let Pattern = []<dag> in {
defm VCVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, undef, f32mem, load,
"cvtss2si{l}\t{$src, $dst|$dst, $src}",
SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, undef, f32mem, load,
"cvtss2si\t{$src, $dst|$dst, $src}",
SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG;
defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, undef, i128mem, load,
"cvtdq2ps\t{$src, $dst|$dst, $src}",
SSEPackedSingle, SSE_CVT_PS>, TB, VEX;
defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, VR256, undef, i256mem, load,
"cvtdq2ps\t{$src, $dst|$dst, $src}",
SSEPackedSingle, SSE_CVT_PS>, TB, VEX;
}
let Pattern = []<dag> in {
defm CVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, undef, f32mem, load /*dummy*/,
"cvtss2si{l}\t{$src, $dst|$dst, $src}",
SSE_CVT_SS2SI_32>, XS;
defm CVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, undef, f32mem, load /*dummy*/,
"cvtss2si{q}\t{$src, $dst|$dst, $src}",
SSE_CVT_SS2SI_64>, XS, REX_W;
defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, undef, i128mem, load /*dummy*/,
"cvtdq2ps\t{$src, $dst|$dst, $src}",
SSEPackedSingle, SSE_CVT_PS>,
TB; /* PD SSE3 form is avaiable */
}
let Predicates = [HasAVX] in {
def : Pat<(int_x86_sse_cvtss2si VR128:$src),
(VCVTSS2SIrr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
def : Pat<(int_x86_sse_cvtss2si (load addr:$src)),
(VCVTSS2SIrm addr:$src)>;
def : Pat<(int_x86_sse_cvtss2si64 VR128:$src),
(VCVTSS2SI64rr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
def : Pat<(int_x86_sse_cvtss2si64 (load addr:$src)),
(VCVTSS2SI64rm addr:$src)>;
}
let Predicates = [HasSSE1] in {
def : Pat<(int_x86_sse_cvtss2si VR128:$src),
(CVTSS2SIrr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
def : Pat<(int_x86_sse_cvtss2si (load addr:$src)),
(CVTSS2SIrm addr:$src)>;
def : Pat<(int_x86_sse_cvtss2si64 VR128:$src),
(CVTSS2SI64rr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
def : Pat<(int_x86_sse_cvtss2si64 (load addr:$src)),
(CVTSS2SI64rm addr:$src)>;
}
/// SSE 2 Only
// Convert scalar double to scalar single
def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
(ins FR64:$src1, FR64:$src2),
"cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG;
let mayLoad = 1 in
def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
(ins FR64:$src1, f64mem:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[], IIC_SSE_CVT_Scalar_RM>,
XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG;
def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
Requires<[HasAVX]>;
def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
"cvtsd2ss\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (fround FR64:$src))],
IIC_SSE_CVT_Scalar_RR>;
def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
"cvtsd2ss\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (fround (loadf64 addr:$src)))],
IIC_SSE_CVT_Scalar_RM>,
XD,
Requires<[HasSSE2, OptForSize]>;
defm Int_VCVTSD2SS: sse12_cvt_sint_3addr<0x5A, VR128, VR128,
int_x86_sse2_cvtsd2ss, f64mem, load, "cvtsd2ss",
SSE_CVT_Scalar, 0>,
XS, VEX_4V;
let Constraints = "$src1 = $dst" in
defm Int_CVTSD2SS: sse12_cvt_sint_3addr<0x5A, VR128, VR128,
int_x86_sse2_cvtsd2ss, f64mem, load, "cvtsd2ss",
SSE_CVT_Scalar>, XS;
// Convert scalar single to scalar double
// SSE2 instructions with XS prefix
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
(ins FR32:$src1, FR32:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[], IIC_SSE_CVT_Scalar_RR>,
XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG;
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
(ins FR32:$src1, f32mem:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[], IIC_SSE_CVT_Scalar_RM>,
XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>;
let Predicates = [HasAVX] in {
def : Pat<(f64 (fextend FR32:$src)),
(VCVTSS2SDrr FR32:$src, FR32:$src)>;
def : Pat<(fextend (loadf32 addr:$src)),
(VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>;
def : Pat<(extloadf32 addr:$src),
(VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>;
}
def : Pat<(extloadf32 addr:$src),
(VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (MOVSSrm addr:$src))>,
Requires<[HasAVX, OptForSpeed]>;
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
"cvtss2sd\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (fextend FR32:$src))],
IIC_SSE_CVT_Scalar_RR>, XS,
Requires<[HasSSE2]>;
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
"cvtss2sd\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (extloadf32 addr:$src))],
IIC_SSE_CVT_Scalar_RM>, XS,
Requires<[HasSSE2, OptForSize]>;
// extload f32 -> f64. This matches load+fextend because we have a hack in
// the isel (PreprocessForFPConvert) that can introduce loads after dag
// combine.
// Since these loads aren't folded into the fextend, we have to match it
// explicitly here.
def : Pat<(fextend (loadf32 addr:$src)),
(CVTSS2SDrm addr:$src)>, Requires<[HasSSE2]>;
def : Pat<(extloadf32 addr:$src),
(CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[HasSSE2, OptForSpeed]>;
def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1,
VR128:$src2))],
IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V,
Requires<[HasAVX]>;
def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, f32mem:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1,
(load addr:$src2)))],
IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V,
Requires<[HasAVX]>;
let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"cvtss2sd\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1,
VR128:$src2))],
IIC_SSE_CVT_Scalar_RR>, XS,
Requires<[HasSSE2]>;
def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, f32mem:$src2),
"cvtss2sd\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1,
(load addr:$src2)))],
IIC_SSE_CVT_Scalar_RM>, XS,
Requires<[HasSSE2]>;
}
// Convert doubleword to packed single/double fp
// SSE2 instructions without OpSize prefix
def Int_VCVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtdq2ps\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))],
IIC_SSE_CVT_PS_RR>,
TB, VEX, Requires<[HasAVX]>;
def Int_VCVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"vcvtdq2ps\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtdq2ps
(bitconvert (memopv2i64 addr:$src))))],
IIC_SSE_CVT_PS_RM>,
TB, VEX, Requires<[HasAVX]>;
def Int_CVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtdq2ps\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))],
IIC_SSE_CVT_PS_RR>,
TB, Requires<[HasSSE2]>;
def Int_CVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"cvtdq2ps\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtdq2ps
(bitconvert (memopv2i64 addr:$src))))],
IIC_SSE_CVT_PS_RM>,
TB, Requires<[HasSSE2]>;
// FIXME: why the non-intrinsic version is described as SSE3?
// SSE2 instructions with XS prefix
def Int_VCVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))],
IIC_SSE_CVT_PD_RR>,
XS, VEX, Requires<[HasAVX]>;
def Int_VCVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtdq2pd
(bitconvert (memopv2i64 addr:$src))))],
IIC_SSE_CVT_PD_RM>,
XS, VEX, Requires<[HasAVX]>;
def Int_CVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))],
IIC_SSE_CVT_PD_RR>,
XS, Requires<[HasSSE2]>;
def Int_CVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"cvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtdq2pd
(bitconvert (memopv2i64 addr:$src))))],
IIC_SSE_CVT_PD_RM>,
XS, Requires<[HasSSE2]>;
// Convert packed single/double fp to doubleword
def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PS_RR>, VEX;
def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PS_RM>, VEX;
def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PS_RR>, VEX;
def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PS_RM>, VEX;
def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PS_RR>;
def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PS_RM>;
def Int_VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
IIC_SSE_CVT_PS_RR>,
VEX;
def Int_VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst),
(ins f128mem:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtps2dq
(memop addr:$src)))],
IIC_SSE_CVT_PS_RM>, VEX;
def Int_CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
IIC_SSE_CVT_PS_RR>;
def Int_CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtps2dq
(memop addr:$src)))],
IIC_SSE_CVT_PS_RM>;
// SSE2 packed instructions with XD prefix
def Int_VCVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))],
IIC_SSE_CVT_PD_RR>,
XD, VEX, Requires<[HasAVX]>;
def Int_VCVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"vcvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtpd2dq
(memop addr:$src)))],
IIC_SSE_CVT_PD_RM>,
XD, VEX, Requires<[HasAVX]>;
def Int_CVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))],
IIC_SSE_CVT_PD_RR>,
XD, Requires<[HasSSE2]>;
def Int_CVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtpd2dq
(memop addr:$src)))],
IIC_SSE_CVT_PD_RM>,
XD, Requires<[HasSSE2]>;
// Convert with truncation packed single/double fp to doubleword
// SSE2 packed instructions with XS prefix
def VCVTTPS2DQrr : VSSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(int_x86_sse2_cvttps2dq VR128:$src))],
IIC_SSE_CVT_PS_RR>, VEX;
def VCVTTPS2DQrm : VSSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvttps2dq
(memop addr:$src)))],
IIC_SSE_CVT_PS_RM>, VEX;
def VCVTTPS2DQYrr : VSSI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(int_x86_avx_cvtt_ps2dq_256 VR256:$src))],
IIC_SSE_CVT_PS_RR>, VEX;
def VCVTTPS2DQYrm : VSSI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256
(memopv8f32 addr:$src)))],
IIC_SSE_CVT_PS_RM>, VEX;
def CVTTPS2DQrr : SSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(int_x86_sse2_cvttps2dq VR128:$src))],
IIC_SSE_CVT_PS_RR>;
def CVTTPS2DQrm : SSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(int_x86_sse2_cvttps2dq (memop addr:$src)))],
IIC_SSE_CVT_PS_RM>;
let Predicates = [HasAVX] in {
def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
(Int_VCVTDQ2PSrr VR128:$src)>;
def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
(Int_VCVTDQ2PSrm addr:$src)>;
def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
(VCVTTPS2DQrr VR128:$src)>;
def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))),
(VCVTTPS2DQrm addr:$src)>;
def : Pat<(v8f32 (sint_to_fp (v8i32 VR256:$src))),
(VCVTDQ2PSYrr VR256:$src)>;
def : Pat<(v8f32 (sint_to_fp (bc_v8i32 (memopv4i64 addr:$src)))),
(VCVTDQ2PSYrm addr:$src)>;
def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))),
(VCVTTPS2DQYrr VR256:$src)>;
def : Pat<(v8i32 (fp_to_sint (memopv8f32 addr:$src))),
(VCVTTPS2DQYrm addr:$src)>;
}
let Predicates = [HasSSE2] in {
def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
(Int_CVTDQ2PSrr VR128:$src)>;
def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
(Int_CVTDQ2PSrm addr:$src)>;
def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
(CVTTPS2DQrr VR128:$src)>;
def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))),
(CVTTPS2DQrm addr:$src)>;
}
def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(int_x86_sse2_cvttpd2dq VR128:$src))],
IIC_SSE_CVT_PD_RR>, VEX;
let isCodeGenOnly = 1 in
def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvttpd2dq
(memop addr:$src)))],
IIC_SSE_CVT_PD_RM>, VEX;
def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))],
IIC_SSE_CVT_PD_RR>;
def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvttpd2dq
(memop addr:$src)))],
IIC_SSE_CVT_PD_RM>;
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
def VCVTTPD2DQXrYr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PD_RR>, VEX;
// XMM only
def VCVTTPD2DQXrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttpd2dqx\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PD_RR>, VEX;
def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttpd2dqx\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PD_RM>, VEX;
// YMM only
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"cvttpd2dqy\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PD_RR>, VEX;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"cvttpd2dqy\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PD_RM>, VEX, VEX_L;
// Convert packed single to packed double
let Predicates = [HasAVX] in {
// SSE2 instructions without OpSize prefix
def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PD_RR>, TB, VEX;
def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PD_RM>, TB, VEX;
def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PD_RR>, TB, VEX;
def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PD_RM>, TB, VEX;
}
def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2pd\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PD_RR>, TB;
def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
"cvtps2pd\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PD_RM>, TB;
def Int_VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
IIC_SSE_CVT_PD_RR>,
TB, VEX, Requires<[HasAVX]>;
def Int_VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtps2pd
(load addr:$src)))],
IIC_SSE_CVT_PD_RM>,
TB, VEX, Requires<[HasAVX]>;
def Int_CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
IIC_SSE_CVT_PD_RR>,
TB, Requires<[HasSSE2]>;
def Int_CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
"cvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtps2pd
(load addr:$src)))],
IIC_SSE_CVT_PD_RM>,
TB, Requires<[HasSSE2]>;
// Convert packed double to packed single
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PD_RR>, VEX;
def VCVTPD2PSXrYr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PD_RR>, VEX;
// XMM only
def VCVTPD2PSXrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2psx\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PD_RR>, VEX;
def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtpd2psx\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PD_RM>, VEX;
// YMM only
def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"cvtpd2psy\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PD_RR>, VEX;
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"cvtpd2psy\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PD_RM>, VEX, VEX_L;
def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PD_RR>;
def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}", [],
IIC_SSE_CVT_PD_RM>;
def Int_VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
IIC_SSE_CVT_PD_RR>;
def Int_VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst),
(ins f128mem:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtpd2ps
(memop addr:$src)))],
IIC_SSE_CVT_PD_RM>;
def Int_CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
IIC_SSE_CVT_PD_RR>;
def Int_CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtpd2ps
(memop addr:$src)))],
IIC_SSE_CVT_PD_RM>;
// AVX 256-bit register conversion intrinsics
// FIXME: Migrate SSE conversion intrinsics matching to use patterns as below
// whenever possible to avoid declaring two versions of each one.
def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src),
(VCVTDQ2PSYrr VR256:$src)>;
def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (memopv4i64 addr:$src))),
(VCVTDQ2PSYrm addr:$src)>;
def : Pat<(int_x86_avx_cvt_pd2_ps_256 VR256:$src),
(VCVTPD2PSYrr VR256:$src)>;
def : Pat<(int_x86_avx_cvt_pd2_ps_256 (memopv4f64 addr:$src)),
(VCVTPD2PSYrm addr:$src)>;
def : Pat<(int_x86_avx_cvt_ps2dq_256 VR256:$src),
(VCVTPS2DQYrr VR256:$src)>;
def : Pat<(int_x86_avx_cvt_ps2dq_256 (memopv8f32 addr:$src)),
(VCVTPS2DQYrm addr:$src)>;
def : Pat<(int_x86_avx_cvt_ps2_pd_256 VR128:$src),
(VCVTPS2PDYrr VR128:$src)>;
def : Pat<(int_x86_avx_cvt_ps2_pd_256 (memopv4f32 addr:$src)),
(VCVTPS2PDYrm addr:$src)>;
def : Pat<(int_x86_avx_cvtt_pd2dq_256 VR256:$src),
(VCVTTPD2DQYrr VR256:$src)>;
def : Pat<(int_x86_avx_cvtt_pd2dq_256 (memopv4f64 addr:$src)),
(VCVTTPD2DQYrm addr:$src)>;
// Match fround and fextend for 128/256-bit conversions
def : Pat<(v4f32 (fround (v4f64 VR256:$src))),
(VCVTPD2PSYrr VR256:$src)>;
def : Pat<(v4f32 (fround (loadv4f64 addr:$src))),
(VCVTPD2PSYrm addr:$src)>;
def : Pat<(v4f64 (fextend (v4f32 VR128:$src))),
(VCVTPS2PDYrr VR128:$src)>;
def : Pat<(v4f64 (fextend (loadv4f32 addr:$src))),
(VCVTPS2PDYrm addr:$src)>;
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Compare Instructions
//===----------------------------------------------------------------------===//
// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
Operand CC, SDNode OpNode, ValueType VT,
PatFrag ld_frag, string asm, string asm_alt,
OpndItins itins> {
def rr : SIi8<0xC2, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
[(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))],
itins.rr>;
def rm : SIi8<0xC2, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
[(set RC:$dst, (OpNode (VT RC:$src1),
(ld_frag addr:$src2), imm:$cc))],
itins.rm>;
// Accept explicit immediate argument form instead of comparison code.
let neverHasSideEffects = 1 in {
def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, i8imm:$cc), asm_alt, [],
IIC_SSE_ALU_F32S_RR>;
let mayLoad = 1 in
def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, i8imm:$cc), asm_alt, [],
IIC_SSE_ALU_F32S_RM>;
}
}
defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmpss, f32, loadf32,
"cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
SSE_ALU_F32S>,
XS, VEX_4V, VEX_LIG;
defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmpsd, f64, loadf64,
"cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
SSE_ALU_F32S>, // same latency as 32 bit compare
XD, VEX_4V, VEX_LIG;
let Constraints = "$src1 = $dst" in {
defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmpss, f32, loadf32,
"cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
"cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S>,
XS;
defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmpsd, f64, loadf64,
"cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
"cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
SSE_ALU_F32S>, // same latency as 32 bit compare
XD;
}
multiclass sse12_cmp_scalar_int<X86MemOperand x86memop, Operand CC,
Intrinsic Int, string asm, OpndItins itins> {
def rr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src, CC:$cc), asm,
[(set VR128:$dst, (Int VR128:$src1,
VR128:$src, imm:$cc))],
itins.rr>;
def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, x86memop:$src, CC:$cc), asm,
[(set VR128:$dst, (Int VR128:$src1,
(load addr:$src), imm:$cc))],
itins.rm>;
}
// Aliases to match intrinsics which expect XMM operand(s).
defm Int_VCMPSS : sse12_cmp_scalar_int<f32mem, AVXCC, int_x86_sse_cmp_ss,
"cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
SSE_ALU_F32S>,
XS, VEX_4V;
defm Int_VCMPSD : sse12_cmp_scalar_int<f64mem, AVXCC, int_x86_sse2_cmp_sd,
"cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
SSE_ALU_F32S>, // same latency as f32
XD, VEX_4V;
let Constraints = "$src1 = $dst" in {
defm Int_CMPSS : sse12_cmp_scalar_int<f32mem, SSECC, int_x86_sse_cmp_ss,
"cmp${cc}ss\t{$src, $dst|$dst, $src}",
SSE_ALU_F32S>, XS;
defm Int_CMPSD : sse12_cmp_scalar_int<f64mem, SSECC, int_x86_sse2_cmp_sd,
"cmp${cc}sd\t{$src, $dst|$dst, $src}",
SSE_ALU_F32S>, // same latency as f32
XD;
}
// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
ValueType vt, X86MemOperand x86memop,
PatFrag ld_frag, string OpcodeStr, Domain d> {
def rr: PI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
IIC_SSE_COMIS_RR, d>;
def rm: PI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode (vt RC:$src1),
(ld_frag addr:$src2)))],
IIC_SSE_COMIS_RM, d>;
}
let Defs = [EFLAGS] in {
defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
"ucomiss", SSEPackedSingle>, TB, VEX, VEX_LIG;
defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
"ucomisd", SSEPackedDouble>, TB, OpSize, VEX,
VEX_LIG;
let Pattern = []<dag> in {
defm VCOMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load,
"comiss", SSEPackedSingle>, TB, VEX,
VEX_LIG;
defm VCOMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load,
"comisd", SSEPackedDouble>, TB, OpSize, VEX,
VEX_LIG;
}
defm Int_VUCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
load, "ucomiss", SSEPackedSingle>, TB, VEX;
defm Int_VUCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
load, "ucomisd", SSEPackedDouble>, TB, OpSize, VEX;
defm Int_VCOMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem,
load, "comiss", SSEPackedSingle>, TB, VEX;
defm Int_VCOMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem,
load, "comisd", SSEPackedDouble>, TB, OpSize, VEX;
defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
"ucomiss", SSEPackedSingle>, TB;
defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
"ucomisd", SSEPackedDouble>, TB, OpSize;
let Pattern = []<dag> in {
defm COMISS : sse12