[X86] Filter out tuning feature flags and a few ISA feature flags when checking for function inline compatibility.

Tuning flags don't have any effect on the available instructions so aren't a good reason to prevent inlining.

There are also some ISA flags that don't have any intrinsics or ABI requirements that we can exclude. I've included only the most basic ones, like cmpxchg16b and lahfsahf. These are interesting because they aren't present in all 64-bit CPUs, but we have codegen workarounds when they aren't present.

Loosening these checks can help with scenarios where a caller has a more specific CPU than a callee. The default tuning flags on our generic 'x86-64' CPU can currently make it inline incompatible with other CPUs. I've also added an example test for 'nocona' and 'prescott', where 'nocona' is just a 64-bit capable version of 'prescott', but in 32-bit mode they should be completely compatible.

I've based the implementation here on the similar code in AMDGPU.

Differential Revision: https://reviews.llvm.org/D58371

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@354355 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 9ae4a92..a7ecfc2 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -3065,10 +3065,9 @@
   const FeatureBitset &CalleeBits =
       TM.getSubtargetImpl(*Callee)->getFeatureBits();
 
-  // FIXME: This is likely too limiting as it will include subtarget features
-  // that we might not care about for inlining, but it is conservatively
-  // correct.
-  return (CallerBits & CalleeBits) == CalleeBits;
+  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
+  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
+  return (RealCallerBits & RealCalleeBits) == RealCalleeBits;
 }
 
 const X86TTIImpl::TTI::MemCmpExpansionOptions *
diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h
index b2e3e05..5035818 100644
--- a/lib/Target/X86/X86TargetTransformInfo.h
+++ b/lib/Target/X86/X86TargetTransformInfo.h
@@ -35,6 +35,60 @@
   const X86Subtarget *getST() const { return ST; }
   const X86TargetLowering *getTLI() const { return TLI; }
 
+  const FeatureBitset InlineFeatureIgnoreList = {
+    // This indicates the CPU is 64-bit capable, not that we are in 64-bit mode.
+    X86::Feature64Bit,
+
+    // These features don't have any intrinsics or ABI effect.
+    X86::FeatureNOPL,
+    X86::FeatureCMPXCHG16B,
+    X86::FeatureLAHFSAHF,
+
+    // Codegen control options.
+    X86::FeatureFast11ByteNOP,
+    X86::FeatureFast15ByteNOP,
+    X86::FeatureFastBEXTR,
+    X86::FeatureFastHorizontalOps,
+    X86::FeatureFastLZCNT,
+    X86::FeatureFastPartialYMMorZMMWrite,
+    X86::FeatureFastScalarFSQRT,
+    X86::FeatureFastSHLDRotate,
+    X86::FeatureFastVariableShuffle,
+    X86::FeatureFastVectorFSQRT,
+    X86::FeatureLEAForSP,
+    X86::FeatureLEAUsesAG,
+    X86::FeatureLZCNTFalseDeps,
+    X86::FeatureMacroFusion,
+    X86::FeatureMergeToThreeWayBranch,
+    X86::FeaturePadShortFunctions,
+    X86::FeaturePOPCNTFalseDeps,
+    X86::FeatureSSEUnalignedMem,
+    X86::FeatureSlow3OpsLEA,
+    X86::FeatureSlowDivide32,
+    X86::FeatureSlowDivide64,
+    X86::FeatureSlowIncDec,
+    X86::FeatureSlowLEA,
+    X86::FeatureSlowPMADDWD,
+    X86::FeatureSlowPMULLD,
+    X86::FeatureSlowSHLD,
+    X86::FeatureSlowTwoMemOps,
+    X86::FeatureSlowUAMem16,
+
+    // Perf-tuning flags.
+    X86::FeatureHasFastGather,
+    X86::FeatureSlowUAMem32,
+
+    // Based on whether the user set -mprefer-vector-width on the command line.
+    X86::FeaturePrefer256Bit,
+
+    // CPU name enums. These just follow the CPU string.
+    X86::ProcIntelAtom,
+    X86::ProcIntelGLM,
+    X86::ProcIntelGLP,
+    X86::ProcIntelSLM,
+    X86::ProcIntelTRM,
+  };
+
 public:
   explicit X86TTIImpl(const X86TargetMachine *TM, const Function &F)
       : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
diff --git a/test/Transforms/Inline/X86/inline-target-cpu-i686.ll b/test/Transforms/Inline/X86/inline-target-cpu-i686.ll
new file mode 100644
index 0000000..a032544
--- /dev/null
+++ b/test/Transforms/Inline/X86/inline-target-cpu-i686.ll
@@ -0,0 +1,15 @@
+; RUN: opt < %s -mtriple=i686-unknown-unknown -S -inline | FileCheck %s
+
+define i32 @func_target_cpu_nocona() #0 {
+  ret i32 0
+}
+
+; CHECK-LABEL: @target_cpu_prescott_call_target_cpu_nocona(
+; CHECK-NEXT: ret i32 0
+define i32 @target_cpu_prescott_call_target_cpu_nocona() #1 {
+  %call = call i32 @func_target_cpu_nocona()
+  ret i32 %call
+}
+
+attributes #0 = { nounwind "target-cpu"="nocona" }
+attributes #1 = { nounwind "target-cpu"="prescott" }
diff --git a/test/Transforms/Inline/X86/inline-target-cpu-x86_64.ll b/test/Transforms/Inline/X86/inline-target-cpu-x86_64.ll
new file mode 100644
index 0000000..fa04a77
--- /dev/null
+++ b/test/Transforms/Inline/X86/inline-target-cpu-x86_64.ll
@@ -0,0 +1,43 @@
+; RUN: opt < %s -mtriple=x86_64-unknown-unknown -S -inline | FileCheck %s
+
+define i32 @func_target_cpu_base() #0 {
+  ret i32 0
+}
+
+; CHECK-LABEL: @target_cpu_k8_call_target_cpu_base(
+; CHECK-NEXT: ret i32 0
+define i32 @target_cpu_k8_call_target_cpu_base() #1 {
+  %call = call i32 @func_target_cpu_base()
+  ret i32 %call
+}
+
+; CHECK-LABEL: @target_cpu_target_nehalem_call_target_cpu_base(
+; CHECK-NEXT: ret i32 0
+define i32 @target_cpu_target_nehalem_call_target_cpu_base() #2 {
+  %call = call i32 @func_target_cpu_base()
+  ret i32 %call
+}
+
+; CHECK-LABEL: @target_cpu_target_goldmont_call_target_cpu_base(
+; CHECK-NEXT: ret i32 0
+define i32 @target_cpu_target_goldmont_call_target_cpu_base() #3 {
+  %call = call i32 @func_target_cpu_base()
+  ret i32 %call
+}
+
+define i32 @func_target_cpu_nocona() #4 {
+  ret i32 0
+}
+
+; CHECK-LABEL: @target_cpu_target_base_call_target_cpu_nocona(
+; CHECK-NEXT: ret i32 0
+define i32 @target_cpu_target_base_call_target_cpu_nocona() #0 {
+  %call = call i32 @func_target_cpu_nocona()
+  ret i32 %call
+}
+
+attributes #0 = { nounwind "target-cpu"="x86-64" }
+attributes #1 = { nounwind "target-cpu"="k8" }
+attributes #2 = { nounwind "target-cpu"="nehalem" }
+attributes #3 = { nounwind "target-cpu"="goldmont" }
+attributes #4 = { nounwind "target-cpu"="nocona" "target-features"="-sse3" }