AMDGPU: Add Vega12 and Vega20

Changes by:
  Matt Arsenault
  Konstantin Zhuravlyov
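
The new subtargets are selected via -mcpu; gfx906 (Vega20) is exercised
by the updated RUN lines below, and gfx904 is the corresponding Vega12
target id. An illustrative invocation, mirroring those RUN lines:

  llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < llvm/test/CodeGen/AMDGPU/fma.ll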

llvm-svn: 331215
diff --git a/llvm/test/CodeGen/AMDGPU/fma.ll b/llvm/test/CodeGen/AMDGPU/fma.ll
index 8e51f82..68e25ee 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.ll
@@ -1,4 +1,5 @@
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX906 -check-prefix=FUNC %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 ; RUN:  not llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=cedar -verify-machineinstrs < %s
 ; RUN:  not llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=juniper -verify-machineinstrs < %s
@@ -16,6 +17,7 @@
 
 ; FUNC-LABEL: {{^}}fma_f32:
 ; SI: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
+; GFX906: v_fmac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
 
 ; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}},
 ; EG: FMA {{\*? *}}[[RES]]
@@ -29,10 +31,20 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}fmac_to_3addr_f32:
+; GCN: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
+define float @fmac_to_3addr_f32(float %r0, float %r1, float %r2) {
+  %r3 = tail call float @llvm.fma.f32(float %r0, float %r1, float %r2)
+  ret float %r3
+}
+
 ; FUNC-LABEL: {{^}}fma_v2f32:
 ; SI: v_fma_f32
 ; SI: v_fma_f32
 
+; GFX906: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
+; GFX906: v_fmac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+
 ; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].[[CHLO:[XYZW]]][[CHHI:[XYZW]]], {{T[0-9]\.[XYZW]}},
 ; EG-DAG: FMA {{\*? *}}[[RES]].[[CHLO]]
 ; EG-DAG: FMA {{\*? *}}[[RES]].[[CHHI]]
@@ -51,6 +63,10 @@
 ; SI: v_fma_f32
 ; SI: v_fma_f32
 ; SI: v_fma_f32
+; GFX906: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
+; GFX906: v_fmac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+$}}
+; GFX906: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
+; GFX906: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
 
 ; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].{{[XYZW][XYZW][XYZW][XYZW]}}, {{T[0-9]\.[XYZW]}},
 ; EG-DAG: FMA {{\*? *}}[[RES]].X
@@ -97,3 +113,34 @@
   store float %fma, float addrspace(1)* %out.gep, align 4
   ret void
 }
+
+; Without special-casing the inline constant check for v_fmac_f32's
+; src2, this fails to fold the 1.0 into an fma.
+
+; FUNC-LABEL: {{^}}fold_inline_imm_into_fmac_src2_f32:
+; GFX906: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GFX906: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
+
+; GFX906: v_add_f32_e32 [[TMP2:v[0-9]+]], [[A]], [[A]]
+; GFX906: v_fma_f32 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0
+define amdgpu_kernel void @fold_inline_imm_into_fmac_src2_f32(float addrspace(1)* %out, float addrspace(1)* %a, float addrspace(1)* %b) nounwind {
+bb:
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep.a = getelementptr inbounds float, float addrspace(1)* %a, i64 %tid.ext
+  %gep.b = getelementptr inbounds float, float addrspace(1)* %b, i64 %tid.ext
+  %gep.out = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %tmp = load volatile float, float addrspace(1)* %gep.a
+  %tmp1 = load volatile float, float addrspace(1)* %gep.b
+  %tmp2 = fadd contract float %tmp, %tmp
+  %tmp3 = fmul contract float %tmp2, 4.0
+  %tmp4 = fsub contract float 1.0, %tmp3
+  %tmp5 = fadd contract float %tmp4, %tmp1
+  %tmp6 = fadd contract float %tmp1, %tmp1
+  %tmp7 = fmul contract float %tmp6, %tmp
+  %tmp8 = fsub contract float 1.0, %tmp7
+  %tmp9 = fmul contract float %tmp8, 8.0
+  %tmp10 = fadd contract float %tmp5, %tmp9
+  store float %tmp10, float addrspace(1)* %gep.out
+  ret void
+}