[AMDLIBM] - Add new vector call from amdlibm library (#194793)

Add vector calls from AMDLIBM

erfinv
Vector - vrd2, vrd4, vrd8

erfcinv
Vector - vrd2, vrd4, vrd8

cdfnorminv
Vector - vrd2, vrd4, vrd8

As per the latest externally supported calls listed in

[amdlibm_vec.h](https://github.com/amd/aocl-libm-ose/blob/master/include/external/amdlibm_vec.h)
diff --git a/llvm/include/llvm/Analysis/VecFuncs.def b/llvm/include/llvm/Analysis/VecFuncs.def
index 4addfbf..e7b949b 100644
--- a/llvm/include/llvm/Analysis/VecFuncs.def
+++ b/llvm/include/llvm/Analysis/VecFuncs.def
@@ -1912,6 +1912,14 @@
 TLI_DEFINE_VECFUNC("erfc", "amd_vrd4_erfc", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("erfc", "amd_vrd8_erfc", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
 
+TLI_DEFINE_VECFUNC("erfinv", "amd_vrd2_erfinv", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("erfinv", "amd_vrd4_erfinv", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("erfinv", "amd_vrd8_erfinv", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+
+TLI_DEFINE_VECFUNC("erfcinv", "amd_vrd2_erfcinv", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("erfcinv", "amd_vrd4_erfcinv", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("erfcinv", "amd_vrd8_erfcinv", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+
 TLI_DEFINE_VECFUNC("erfcf", "amd_vrs4_erfcf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("erfcf", "amd_vrs8_erfcf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
 TLI_DEFINE_VECFUNC("erfcf", "amd_vrs16_erfcf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
@@ -1920,6 +1928,10 @@
 TLI_DEFINE_VECFUNC("cdfnorm", "amd_vrd4_cdfnorm", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("cdfnorm", "amd_vrd8_cdfnorm", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
 
+TLI_DEFINE_VECFUNC("cdfnorminv", "amd_vrd2_cdfnorminv", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("cdfnorminv", "amd_vrd4_cdfnorminv", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("cdfnorminv", "amd_vrd8_cdfnorminv", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+
 TLI_DEFINE_VECFUNC("roundf", "amd_vrs16_roundf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
 TLI_DEFINE_VECFUNC("roundf", "amd_vrs8_roundf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
 TLI_DEFINE_VECFUNC("roundf", "amd_vrs4_roundf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td b/llvm/include/llvm/IR/RuntimeLibcalls.td
index a09825d..cbdc48a 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.td
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.td
@@ -218,7 +218,9 @@
   def COSH_#S : RuntimeLibcall;
   def COSPI_#S : RuntimeLibcall;
   def ERFC_#S : RuntimeLibcall;
+  def ERFCINV_#S : RuntimeLibcall;
   def ERF_#S : RuntimeLibcall;
+  def ERFINV_#S : RuntimeLibcall;
   def EXP_#S : RuntimeLibcall;
   def EXP_FINITE_#S : RuntimeLibcall;
   def EXP10_#S : RuntimeLibcall;
@@ -253,6 +255,7 @@
   def TANH_#S : RuntimeLibcall;
   def TGAMMA_#S : RuntimeLibcall;
   def CDFNORM_#S : RuntimeLibcall;
+  def CDFNORMINV_#S : RuntimeLibcall;
   def ROUND_#S : RuntimeLibcall;
 }
 
@@ -4456,6 +4459,12 @@
   def amd_vrd8_sin : RuntimeLibcallImpl<SIN_V8F64>;
   def amd_vrd8_sincos : RuntimeLibcallImpl<SINCOS_V8F64>;
   def amd_vrd8_tan : RuntimeLibcallImpl<TAN_V8F64>;
+  def amd_vrd2_erfinv : RuntimeLibcallImpl<ERFINV_V2F64>;
+  def amd_vrd4_erfinv : RuntimeLibcallImpl<ERFINV_V4F64>;
+  def amd_vrd8_erfinv : RuntimeLibcallImpl<ERFINV_V8F64>;
+  def amd_vrd2_erfcinv : RuntimeLibcallImpl<ERFCINV_V2F64>;
+  def amd_vrd4_erfcinv : RuntimeLibcallImpl<ERFCINV_V4F64>;
+  def amd_vrd8_erfcinv : RuntimeLibcallImpl<ERFCINV_V8F64>;
   def amd_vrs16_acosf : RuntimeLibcallImpl<ACOS_V16F32>;
   def amd_vrs16_asinf : RuntimeLibcallImpl<ASIN_V16F32>;
   def amd_vrs16_atanf : RuntimeLibcallImpl<ATAN_V16F32>;
@@ -4510,6 +4519,9 @@
   def amd_vrd2_cdfnorm : RuntimeLibcallImpl<CDFNORM_V2F64>;
   def amd_vrd4_cdfnorm : RuntimeLibcallImpl<CDFNORM_V4F64>;
   def amd_vrd8_cdfnorm : RuntimeLibcallImpl<CDFNORM_V8F64>;
+  def amd_vrd2_cdfnorminv : RuntimeLibcallImpl<CDFNORMINV_V2F64>;
+  def amd_vrd4_cdfnorminv : RuntimeLibcallImpl<CDFNORMINV_V4F64>;
+  def amd_vrd8_cdfnorminv : RuntimeLibcallImpl<CDFNORMINV_V8F64>;
   def amd_vrs4_roundf : RuntimeLibcallImpl<ROUND_V4F32>;
   def amd_vrs8_roundf : RuntimeLibcallImpl<ROUND_V8F32>;
   def amd_vrs16_roundf : RuntimeLibcallImpl<ROUND_V16F32>;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll
index e80ee60..4625131 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll
@@ -1779,6 +1779,54 @@
   ret void
 }
 
+; ======================= erfinv ============================
+define void @erfinv_f64(ptr nocapture %varray) {
+; CHECK-LABEL: @erfinv_f64(
+; CHECK-VF2:    [[TMP5:%.*]] = call <2 x double> @amd_vrd2_erfinv(<2 x double> [[TMP4:%.*]])
+; CHECK-VF4:    [[TMP5:%.*]] = call <4 x double> @amd_vrd4_erfinv(<4 x double> [[TMP4:%.*]])
+; CHECK-VF8:    [[TMP5:%.*]] = call <8 x double> @amd_vrd8_erfinv(<8 x double> [[TMP4:%.*]])
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @erfinv(double %conv)
+  %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv
+  store double %call, ptr %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; ======================= erfcinv ============================
+define void @erfcinv_f64(ptr nocapture %varray) {
+; CHECK-LABEL: @erfcinv_f64(
+; CHECK-VF2:    [[TMP5:%.*]] = call <2 x double> @amd_vrd2_erfcinv(<2 x double> [[TMP4:%.*]])
+; CHECK-VF4:    [[TMP5:%.*]] = call <4 x double> @amd_vrd4_erfcinv(<4 x double> [[TMP4:%.*]])
+; CHECK-VF8:    [[TMP5:%.*]] = call <8 x double> @amd_vrd8_erfcinv(<8 x double> [[TMP4:%.*]])
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @erfcinv(double %conv)
+  %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv
+  store double %call, ptr %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
 
 ; ======================= erfc ============================
 define void @erfc_f64(ptr nocapture %varray) {
@@ -1805,6 +1853,33 @@
   ret void
 }
 
+; ======================= cdfnorminv ============================
+define void @cdfnorminv_f64(ptr nocapture %varray) {
+; CHECK-LABEL: @cdfnorminv_f64(
+; CHECK-VF2:    [[TMP5:%.*]] = call <2 x double> @amd_vrd2_cdfnorminv(<2 x double> [[TMP4:%.*]])
+; CHECK-VF4:    [[TMP5:%.*]] = call <4 x double> @amd_vrd4_cdfnorminv(<4 x double> [[TMP4:%.*]])
+; CHECK-VF8:    [[TMP5:%.*]] = call <8 x double> @amd_vrd8_cdfnorminv(<8 x double> [[TMP4:%.*]])
+; CHECK-VF16:   {{.*}} = tail call double @cdfnorminv(double {{.*}})
+; CHECK:        ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @cdfnorminv(double %conv)
+  %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv
+  store double %call, ptr %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
 ; ======================= erfcf ============================
 define void @erfcf_f32(ptr nocapture %varray) {
 ; CHECK-LABEL: @erfcf_f32(
@@ -2024,9 +2099,12 @@
 declare float @exp10f(float) #0
 declare void @sincos(double, ptr, ptr)
 declare void @sincosf(float, ptr, ptr)
+declare double @erfinv(double) #0
+declare double @erfcinv(double) #0
 declare double @erfc(double) #0
 declare float @erfcf(float) #0
 declare double @cdfnorm(double) #0
+declare double @cdfnorminv(double) #0
 declare double @round(double) #0
 declare float @roundf(float) #0
 declare double @expm1(double) #0
diff --git a/llvm/test/Transforms/Util/add-TLI-mappings.ll b/llvm/test/Transforms/Util/add-TLI-mappings.ll
index 8969528..28d3edf 100644
--- a/llvm/test/Transforms/Util/add-TLI-mappings.ll
+++ b/llvm/test/Transforms/Util/add-TLI-mappings.ll
@@ -16,7 +16,7 @@
 ; SVML-SAME:          ptr @__svml_log10f4,
 ; SVML-SAME:          ptr @__svml_log10f8,
 ; SVML-SAME:          ptr @__svml_log10f16
-; AMDLIBM-SAME:     [12 x ptr] [
+; AMDLIBM-SAME:     [21 x ptr] [
 ; AMDLIBM-SAME:       ptr @amd_vrd2_sin,
 ; AMDLIBM-SAME:       ptr @amd_vrd4_sin,
 ; AMDLIBM-SAME:       ptr @amd_vrd8_sin,
@@ -26,6 +26,15 @@
 ; AMDLIBM-SAME:       ptr @amd_vrs4_sincosf,
 ; AMDLIBM-SAME:       ptr @amd_vrs8_sincosf,
 ; AMDLIBM-SAME:       ptr @amd_vrs16_sincosf,
+; AMDLIBM-SAME:       ptr @amd_vrd2_erfinv,
+; AMDLIBM-SAME:       ptr @amd_vrd4_erfinv,
+; AMDLIBM-SAME:       ptr @amd_vrd8_erfinv,
+; AMDLIBM-SAME:       ptr @amd_vrd2_erfcinv,
+; AMDLIBM-SAME:       ptr @amd_vrd4_erfcinv,
+; AMDLIBM-SAME:       ptr @amd_vrd8_erfcinv,
+; AMDLIBM-SAME:       ptr @amd_vrd2_cdfnorminv,
+; AMDLIBM-SAME:       ptr @amd_vrd4_cdfnorminv,
+; AMDLIBM-SAME:       ptr @amd_vrd8_cdfnorminv,
 ; AMDLIBM-SAME:       ptr @amd_vrs4_log10f,
 ; AMDLIBM-SAME:       ptr @amd_vrs8_log10f,
 ; AMDLIBM-SAME:       ptr @amd_vrs16_log10f
@@ -164,6 +173,33 @@
 
 declare void @sincospif(float, ptr, ptr) #0
 
+define double @erfinv_f64(double %in) {
+; COMMON-LABEL:       @erfinv_f64(
+; AMDLIBM:            call double @erfinv(double %{{.*}}) #[[ERFINV:[0-9]+]]
+  %call = tail call double @erfinv(double %in)
+  ret double %call
+}
+
+declare double @erfinv(double) #0
+
+define double @erfcinv_f64(double %in) {
+; COMMON-LABEL:       @erfcinv_f64(
+; AMDLIBM:            call double @erfcinv(double %{{.*}}) #[[ERFCINV:[0-9]+]]
+  %call = tail call double @erfcinv(double %in)
+  ret double %call
+}
+
+declare double @erfcinv(double) #0
+
+define double @cdfnorminv_f64(double %in) {
+; COMMON-LABEL:       @cdfnorminv_f64(
+; AMDLIBM:            call double @cdfnorminv(double %{{.*}}) #[[CDFNORMINV:[0-9]+]]
+  %call = tail call double @cdfnorminv(double %in)
+  ret double %call
+}
+
+declare double @cdfnorminv(double) #0
+
 define float @call_llvm.log10.f32(float %in) {
 ; COMMON-LABEL:       @call_llvm.log10.f32(
 ; SVML:               call float @llvm.log10.f32(float %{{.*}})
@@ -287,6 +323,18 @@
 ; AMDLIBM-SAME:   "_ZGV_LLVM_N4vl4l4_sincosf(amd_vrs4_sincosf),
 ; AMDLIBM-SAME:   _ZGV_LLVM_N8vl4l4_sincosf(amd_vrs8_sincosf),
 ; AMDLIBM-SAME:   _ZGV_LLVM_N16vl4l4_sincosf(amd_vrs16_sincosf)" }
+; AMDLIBM:      attributes #[[ERFINV]] = { "vector-function-abi-variant"=
+; AMDLIBM-SAME:   "_ZGV_LLVM_N2v_erfinv(amd_vrd2_erfinv),
+; AMDLIBM-SAME:   _ZGV_LLVM_N4v_erfinv(amd_vrd4_erfinv),
+; AMDLIBM-SAME:   _ZGV_LLVM_N8v_erfinv(amd_vrd8_erfinv)" }
+; AMDLIBM:      attributes #[[ERFCINV]] = { "vector-function-abi-variant"=
+; AMDLIBM-SAME:   "_ZGV_LLVM_N2v_erfcinv(amd_vrd2_erfcinv),
+; AMDLIBM-SAME:   _ZGV_LLVM_N4v_erfcinv(amd_vrd4_erfcinv),
+; AMDLIBM-SAME:   _ZGV_LLVM_N8v_erfcinv(amd_vrd8_erfcinv)" }
+; AMDLIBM:      attributes #[[CDFNORMINV]] = { "vector-function-abi-variant"=
+; AMDLIBM-SAME:   "_ZGV_LLVM_N2v_cdfnorminv(amd_vrd2_cdfnorminv),
+; AMDLIBM-SAME:   _ZGV_LLVM_N4v_cdfnorminv(amd_vrd4_cdfnorminv),
+; AMDLIBM-SAME:   _ZGV_LLVM_N8v_cdfnorminv(amd_vrd8_cdfnorminv)" }
 ; AMDLIBM:      attributes #[[LOG10]] = { "vector-function-abi-variant"=
 ; AMDLIBM-SAME:   "_ZGV_LLVM_N4v_llvm.log10.f32(amd_vrs4_log10f),
 ; AMDLIBM-SAME:   _ZGV_LLVM_N8v_llvm.log10.f32(amd_vrs8_log10f),