| ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py |
| ; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 |
| ; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE4 |
| ; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX |
| ; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX |
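
; Fold a scalar binop and a matching vector binop joined by an insertelement:
;   %s = binop %s0, %s1
;   %v = binop %v0, %v1
;   %r = insertelement %v, %s, Idx
;     -->
;   %a = insertelement %v0, %s0, Idx
;   %b = insertelement %v1, %s1, Idx
;   %r = binop %a, %b
; This trades the scalar binop plus one insertelement for two insertelements,
; so it only fires when the target's cost model considers the insertions
; cheap enough - hence the per-target differences below.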
| |
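; Folds on every tested target - the CHECK lines below are shared by all RUN configurations.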
| define <2 x double> @insert1_v2f64_f64_fdiv(<2 x double> %v0, <2 x double> %v1, double %s0, double %s1) { |
| ; CHECK-LABEL: @insert1_v2f64_f64_fdiv( |
| ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[V0:%.*]], double [[S0:%.*]], i64 1 |
| ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[V1:%.*]], double [[S1:%.*]], i64 1 |
| ; CHECK-NEXT: [[R:%.*]] = fdiv <2 x double> [[TMP1]], [[TMP2]] |
| ; CHECK-NEXT: ret <2 x double> [[R]] |
| ; |
| %s = fdiv double %s0, %s1 |
| %v = fdiv <2 x double> %v0, %v1 |
| %r = insertelement <2 x double> %v, double %s, i32 1 |
| ret <2 x double> %r |
| } |
| |
| ; SSE2 has no fast v4i32 insertelement |
| define <4 x i32> @insert0_v4i32_i32_add(<4 x i32> %v0, <4 x i32> %v1, i32 %s0, i32 %s1) { |
| ; SSE2-LABEL: @insert0_v4i32_i32_add( |
| ; SSE2-NEXT: [[S:%.*]] = add i32 [[S0:%.*]], [[S1:%.*]] |
| ; SSE2-NEXT: [[V:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]] |
| ; SSE2-NEXT: [[R:%.*]] = insertelement <4 x i32> [[V]], i32 [[S]], i32 0 |
| ; SSE2-NEXT: ret <4 x i32> [[R]] |
| ; |
| ; SSE4-LABEL: @insert0_v4i32_i32_add( |
| ; SSE4-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[V0:%.*]], i32 [[S0:%.*]], i64 0 |
| ; SSE4-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[V1:%.*]], i32 [[S1:%.*]], i64 0 |
| ; SSE4-NEXT: [[R:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] |
| ; SSE4-NEXT: ret <4 x i32> [[R]] |
| ; |
| ; AVX-LABEL: @insert0_v4i32_i32_add( |
| ; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[V0:%.*]], i32 [[S0:%.*]], i64 0 |
| ; AVX-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[V1:%.*]], i32 [[S1:%.*]], i64 0 |
| ; AVX-NEXT: [[R:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] |
| ; AVX-NEXT: ret <4 x i32> [[R]] |
| ; |
| %s = add i32 %s0, %s1 |
| %v = add <4 x i32> %v0, %v1 |
| %r = insertelement <4 x i32> %v, i32 %s, i32 0 |
| ret <4 x i32> %r |
| } |
| |
; AVX - insertion into the upper 128-bit half is expensive, so the fold is skipped; SSE still folds.
| define <16 x i16> @insert9_v16i16_i16_add(<16 x i16> %v0, <16 x i16> %v1, i16 %s0, i16 %s1) { |
| ; SSE-LABEL: @insert9_v16i16_i16_add( |
| ; SSE-NEXT: [[TMP1:%.*]] = insertelement <16 x i16> [[V0:%.*]], i16 [[S0:%.*]], i64 9 |
| ; SSE-NEXT: [[TMP2:%.*]] = insertelement <16 x i16> [[V1:%.*]], i16 [[S1:%.*]], i64 9 |
| ; SSE-NEXT: [[R:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]] |
| ; SSE-NEXT: ret <16 x i16> [[R]] |
| ; |
| ; AVX-LABEL: @insert9_v16i16_i16_add( |
| ; AVX-NEXT: [[S:%.*]] = add i16 [[S0:%.*]], [[S1:%.*]] |
| ; AVX-NEXT: [[V:%.*]] = add <16 x i16> [[V0:%.*]], [[V1:%.*]] |
| ; AVX-NEXT: [[R:%.*]] = insertelement <16 x i16> [[V]], i16 [[S]], i32 9 |
| ; AVX-NEXT: ret <16 x i16> [[R]] |
| ; |
| %s = add i16 %s0, %s1 |
| %v = add <16 x i16> %v0, %v1 |
| %r = insertelement <16 x i16> %v, i16 %s, i32 9 |
| ret <16 x i16> %r |
| } |
| |
; Merge flags - both ops carry 'fast', so the folded binop keeps it.
| define <4 x float> @insert0_v4f32_f32_fadd_common_flags(<4 x float> %v0, <4 x float> %v1, float %s0, float %s1) { |
| ; CHECK-LABEL: @insert0_v4f32_f32_fadd_common_flags( |
| ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[V0:%.*]], float [[S0:%.*]], i64 0 |
| ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[V1:%.*]], float [[S1:%.*]], i64 0 |
| ; CHECK-NEXT: [[R:%.*]] = fadd fast <4 x float> [[TMP1]], [[TMP2]] |
| ; CHECK-NEXT: ret <4 x float> [[R]] |
| ; |
| %s = fadd fast float %s0, %s1 |
| %v = fadd fast <4 x float> %v0, %v1 |
| %r = insertelement <4 x float> %v, float %s, i32 0 |
| ret <4 x float> %r |
| } |
| |
; Merge (shared) flags - only FMF present on both ops (nnan) survives the fold.
| define <4 x float> @insert1_v4f32_f32_fsub_mixed_flags(<4 x float> %v0, <4 x float> %v1, float %s0, float %s1) { |
| ; CHECK-LABEL: @insert1_v4f32_f32_fsub_mixed_flags( |
| ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[V0:%.*]], float [[S0:%.*]], i64 1 |
| ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[V1:%.*]], float [[S1:%.*]], i64 1 |
| ; CHECK-NEXT: [[R:%.*]] = fsub nnan <4 x float> [[TMP1]], [[TMP2]] |
| ; CHECK-NEXT: ret <4 x float> [[R]] |
| ; |
| %s = fsub nnan nsz float %s0, %s1 |
| %v = fsub nnan ninf <4 x float> %v0, %v1 |
| %r = insertelement <4 x float> %v, float %s, i32 1 |
| ret <4 x float> %r |
| } |
| |
; TODO: Fold equivalent opcodes ('or disjoint' is equivalent to 'add').
| define <4 x i32> @insert0_v4i32_i32_or_disjoint_add(<4 x i32> %v0, <4 x i32> %v1, i32 %s0, i32 %s1) { |
| ; CHECK-LABEL: @insert0_v4i32_i32_or_disjoint_add( |
| ; CHECK-NEXT: [[S:%.*]] = add i32 [[S0:%.*]], [[S1:%.*]] |
| ; CHECK-NEXT: [[V:%.*]] = or disjoint <4 x i32> [[V0:%.*]], [[V1:%.*]] |
| ; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> [[V]], i32 [[S]], i32 0 |
| ; CHECK-NEXT: ret <4 x i32> [[R]] |
| ; |
| %s = add i32 %s0, %s1 |
| %v = or disjoint <4 x i32> %v0, %v1 |
| %r = insertelement <4 x i32> %v, i32 %s, i32 0 |
| ret <4 x i32> %r |
| } |
| |
; Negative - multi use (the scalar binop result is also passed to @use_f64).
| define <2 x double> @insert0_v2f64_f64_fmul_multiuse(<2 x double> %v0, <2 x double> %v1, double %s0, double %s1) { |
| ; CHECK-LABEL: @insert0_v2f64_f64_fmul_multiuse( |
| ; CHECK-NEXT: [[S:%.*]] = fmul double [[S0:%.*]], [[S1:%.*]] |
| ; CHECK-NEXT: [[V:%.*]] = fmul <2 x double> [[V0:%.*]], [[V1:%.*]] |
| ; CHECK-NEXT: [[R:%.*]] = insertelement <2 x double> [[V]], double [[S]], i32 0 |
| ; CHECK-NEXT: call void @use_f64(double [[S]]) |
| ; CHECK-NEXT: ret <2 x double> [[R]] |
| ; |
| %s = fmul double %s0, %s1 |
| %v = fmul <2 x double> %v0, %v1 |
| %r = insertelement <2 x double> %v, double %s, i32 0 |
| call void @use_f64(double %s) |
| ret <2 x double> %r |
| } |
declare void @use_f64(double)
| |
; Negative - multi use (the vector binop result is also passed to @use_v2i64).
| define <2 x i64> @insert0_v2i64_i64_add_multiuse(<2 x i64> %v0, <2 x i64> %v1, i64 %s0, i64 %s1) { |
| ; CHECK-LABEL: @insert0_v2i64_i64_add_multiuse( |
| ; CHECK-NEXT: [[S:%.*]] = add i64 [[S0:%.*]], [[S1:%.*]] |
| ; CHECK-NEXT: [[V:%.*]] = add <2 x i64> [[V0:%.*]], [[V1:%.*]] |
| ; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i64> [[V]], i64 [[S]], i32 0 |
| ; CHECK-NEXT: call void @use_v2i64(<2 x i64> [[V]]) |
| ; CHECK-NEXT: ret <2 x i64> [[R]] |
| ; |
| %s = add i64 %s0, %s1 |
| %v = add <2 x i64> %v0, %v1 |
| %r = insertelement <2 x i64> %v, i64 %s, i32 0 |
| call void @use_v2i64(<2 x i64> %v) |
| ret <2 x i64> %r |
| } |
| declare void @use_v2i64(<2 x i64>) |
| |
; Negative - binop mismatch (scalar fsub vs. vector fadd).
| define <2 x double> @insert0_v2f64_f64_fadd_fsub(<2 x double> %v0, <2 x double> %v1, double %s0, double %s1) { |
| ; CHECK-LABEL: @insert0_v2f64_f64_fadd_fsub( |
| ; CHECK-NEXT: [[S:%.*]] = fsub double [[S0:%.*]], [[S1:%.*]] |
| ; CHECK-NEXT: [[V:%.*]] = fadd <2 x double> [[V0:%.*]], [[V1:%.*]] |
| ; CHECK-NEXT: [[R:%.*]] = insertelement <2 x double> [[V]], double [[S]], i32 0 |
| ; CHECK-NEXT: ret <2 x double> [[R]] |
| ; |
| %s = fsub double %s0, %s1 |
| %v = fadd <2 x double> %v0, %v1 |
| %r = insertelement <2 x double> %v, double %s, i32 0 |
| ret <2 x double> %r |
| } |
| |