; llvm/test/CodeGen/AArch64/faddv.ll - llvm-project.git - Git at Google

 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64 < %s | FileCheck %s

 ; Overwrites lane 0 of a <2 x float> with +0.0 and reduces it with an nsz
 ; faddv; the expected output is a plain extract of lane 1 with no add.
 define float @test_v2f32_element_0_zero(<2 x float> %vec) {
 ; CHECK-LABEL: test_v2f32_element_0_zero:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    mov s0, v0.s[1]
 ; CHECK-NEXT:    ret
 entry:
   %with_zero = insertelement <2 x float> %vec, float 0.0, i64 0
   %sum = call nsz float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> %with_zero)
   ret float %sum
 }

 ; Overwrites lane 3 of a <4 x float> with +0.0 and reduces it with an nsz
 ; faddv; the expected output sums lanes 0 and 1 with faddp, then adds lane 2,
 ; and never touches lane 3.
 define float @test_v4f32_element_3_zero(<4 x float> %vec) {
 ; CHECK-LABEL: test_v4f32_element_3_zero:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mov s1, v0.s[2]
 ; CHECK-NEXT:    faddp s0, v0.2s
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
 entry:
   %with_zero = insertelement <4 x float> %vec, float 0.0, i64 3
   %sum = call nsz float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %with_zero)
   ret float %sum
 }

 ; Overwrites lanes 0 and 2 of a <4 x float> with +0.0 and reduces it with an
 ; nsz faddv; the expected output extracts lanes 3 and 1 and adds them once.
 define float @test_v4f32_elements_0_2_zero(<4 x float> %vec) {
 ; CHECK-LABEL: test_v4f32_elements_0_2_zero:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mov s1, v0.s[3]
 ; CHECK-NEXT:    mov s0, v0.s[1]
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
 entry:
   %zero1 = insertelement <4 x float> %vec, float 0.0, i64 0
   %zero2 = insertelement <4 x float> %zero1, float 0.0, i64 2
   %sum = call nsz float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %zero2)
   ret float %sum
 }

 ; Overwrites all four lanes with +0.0 before the faddv. Note that this call
 ; carries no fast-math flags; the expected output is a zero constant
 ; materialized with movi and no floating-point add.
 define float @test_v4f32_all_zero(<4 x float> %vec) {
 ; CHECK-LABEL: test_v4f32_all_zero:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi d0, #0000000000000000
 ; CHECK-NEXT:    ret
 entry:
   %zero1 = insertelement <4 x float> %vec, float 0.0, i64 0
   %zero2 = insertelement <4 x float> %zero1, float 0.0, i64 1
   %zero3 = insertelement <4 x float> %zero2, float 0.0, i64 2
   %zero4 = insertelement <4 x float> %zero3, float 0.0, i64 3
   %sum = call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %zero4)
   ret float %sum
 }

 ; Double-precision case: lane 0 of a <2 x double> is overwritten with +0.0
 ; before an nsz faddv; the expected output is a plain extract of lane 1.
 define double @test_v2f64_element_0_zero(<2 x double> %vec) {
 ; CHECK-LABEL: test_v2f64_element_0_zero:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mov d0, v0.d[1]
 ; CHECK-NEXT:    ret
 entry:
   %with_zero = insertelement <2 x double> %vec, double 0.0, i64 0
   %sum = call nsz double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> %with_zero)
   ret double %sum
 }

 ; Negative test: lane 3 is overwritten with +0.0 but the faddv call has no nsz
 ; flag, so the expected output keeps the zero insert into lane 3 and the full
 ; two-step faddp reduction.
 define float @negative_test(<4 x float> %vec) {
 ; CHECK-LABEL: negative_test:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi d1, #0000000000000000
 ; CHECK-NEXT:    mov v0.s[3], v1.s[0]
 ; CHECK-NEXT:    faddp v0.4s, v0.4s, v0.4s
 ; CHECK-NEXT:    faddp s0, v0.2s
 ; CHECK-NEXT:    ret
   %with_zero = insertelement <4 x float> %vec, float 0.0, i64 3
   %sum = call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %with_zero)
   ret float %sum
 }

 ; Generic-intrinsic form: lane 0 of a <2 x float> is overwritten with +0.0 and
 ; reduced by a reassoc nsz llvm.vector.reduce.fadd with a -0.0 start value;
 ; the expected output is a plain extract of lane 1.
 define float @test_reduce_v2f32_element_0_zero(<2 x float> %vec) {
 ; CHECK-LABEL: test_reduce_v2f32_element_0_zero:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    mov s0, v0.s[1]
 ; CHECK-NEXT:    ret
 entry:
   %with_zero = insertelement <2 x float> %vec, float 0.0, i64 0
   %sum = call reassoc nsz float @llvm.vector.reduce.fadd.v2f32(float -0.0, <2 x float> %with_zero)
   ret float %sum
 }

 ; Lane 3 of a <4 x float> is overwritten with +0.0 and reduced by a reassoc
 ; nsz llvm.vector.reduce.fadd with a -0.0 start value; the expected output
 ; sums lanes 0 and 1 with faddp, then adds lane 2.
 define float @test_reduce_v4f32_element_3_zero(<4 x float> %vec) {
 ; CHECK-LABEL: test_reduce_v4f32_element_3_zero:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mov s1, v0.s[2]
 ; CHECK-NEXT:    faddp s0, v0.2s
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
 entry:
   %with_zero = insertelement <4 x float> %vec, float 0.0, i64 3
   %sum = call reassoc nsz float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %with_zero)
   ret float %sum
 }

 ; Lanes 0 and 2 of a <4 x float> are overwritten with +0.0 and reduced by a
 ; reassoc nsz llvm.vector.reduce.fadd with a -0.0 start value; the expected
 ; output extracts lanes 3 and 1 and adds them once.
 define float @test_reduce_v4f32_elements_0_2_zero(<4 x float> %vec) {
 ; CHECK-LABEL: test_reduce_v4f32_elements_0_2_zero:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mov s1, v0.s[3]
 ; CHECK-NEXT:    mov s0, v0.s[1]
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
 entry:
   %zero1 = insertelement <4 x float> %vec, float 0.0, i64 0
   %zero2 = insertelement <4 x float> %zero1, float 0.0, i64 2
   %sum = call reassoc nsz float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %zero2)
   ret float %sum
 }

 ; All four lanes are overwritten with +0.0 and reduced by a reassoc nsz
 ; llvm.vector.reduce.fadd with a -0.0 start value; the expected output is a
 ; zero constant materialized with movi and no floating-point add.
 define float @test_reduce_v4f32_all_zero(<4 x float> %vec) {
 ; CHECK-LABEL: test_reduce_v4f32_all_zero:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi d0, #0000000000000000
 ; CHECK-NEXT:    ret
 entry:
   %zero1 = insertelement <4 x float> %vec, float 0.0, i64 0
   %zero2 = insertelement <4 x float> %zero1, float 0.0, i64 1
   %zero3 = insertelement <4 x float> %zero2, float 0.0, i64 2
   %zero4 = insertelement <4 x float> %zero3, float 0.0, i64 3
   %sum = call reassoc nsz float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %zero4)
   ret float %sum
 }

 ; Double-precision case: lane 0 of a <2 x double> is overwritten with +0.0 and
 ; reduced by a reassoc nsz llvm.vector.reduce.fadd with a -0.0 start value;
 ; the expected output is a plain extract of lane 1.
 define double @test_reduce_v2f64_element_0_zero(<2 x double> %vec) {
 ; CHECK-LABEL: test_reduce_v2f64_element_0_zero:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mov d0, v0.d[1]
 ; CHECK-NEXT:    ret
 entry:
   %with_zero = insertelement <2 x double> %vec, double 0.0, i64 0
   %sum = call reassoc nsz double @llvm.vector.reduce.fadd.v2f64(double -0.0, <2 x double> %with_zero)
   ret double %sum
 }

 ; Negative test: the reduction is reassoc but not nsz, so the expected output
 ; keeps the zero insert into lane 3 and the full two-step faddp reduction.
 define float @negative_test_reduce(<4 x float> %vec) {
 ; CHECK-LABEL: negative_test_reduce:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi d1, #0000000000000000
 ; CHECK-NEXT:    mov v0.s[3], v1.s[0]
 ; CHECK-NEXT:    faddp v0.4s, v0.4s, v0.4s
 ; CHECK-NEXT:    faddp s0, v0.2s
 ; CHECK-NEXT:    ret
   %with_zero = insertelement <4 x float> %vec, float 0.0, i64 3
   %sum = call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %with_zero)
   ret float %sum
 }

 ; Reduction without the reassoc flag (nsz only) and a -0.0 start value: lane 0
 ; of a <2 x float> is overwritten with +0.0; the expected output is a plain
 ; extract of lane 1.
 define float @test_seq_reduce_v2f32_element_0_zero(<2 x float> %vec) {
 ; CHECK-LABEL: test_seq_reduce_v2f32_element_0_zero:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    mov s0, v0.s[1]
 ; CHECK-NEXT:    ret
 entry:
   %with_zero = insertelement <2 x float> %vec, float 0.0, i64 0
   %sum = call nsz float @llvm.vector.reduce.fadd.v2f32(float -0.0, <2 x float> %with_zero)
   ret float %sum
 }

 ; Reduction without the reassoc flag (nsz only) and a -0.0 start value: lane 3
 ; of a <4 x float> is overwritten with +0.0; the expected output sums lanes 0
 ; and 1 with faddp, then adds lane 2, with no add for lane 3.
 define float @test_seq_reduce_v4f32_element_3_zero(<4 x float> %vec) {
 ; CHECK-LABEL: test_seq_reduce_v4f32_element_3_zero:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mov s1, v0.s[2]
 ; CHECK-NEXT:    faddp s0, v0.2s
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
 entry:
   %with_zero = insertelement <4 x float> %vec, float 0.0, i64 3
   %sum = call nsz float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %with_zero)
   ret float %sum
 }

 ; Reduction without the reassoc flag (nsz only) and a -0.0 start value: lanes
 ; 0 and 2 of a <4 x float> are overwritten with +0.0; the expected output
 ; extracts lanes 3 and 1 and adds them once.
 define float @test_seq_reduce_v4f32_elements_0_2_zero(<4 x float> %vec) {
 ; CHECK-LABEL: test_seq_reduce_v4f32_elements_0_2_zero:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mov s1, v0.s[3]
 ; CHECK-NEXT:    mov s0, v0.s[1]
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
 entry:
   %zero1 = insertelement <4 x float> %vec, float 0.0, i64 0
   %zero2 = insertelement <4 x float> %zero1, float 0.0, i64 2
   %sum = call nsz float @llvm.vector.reduce.fadd.v4f32(float %start_placeholder_do_not_use, <4 x float> %zero2)
   ret float %sum
 }

 ; Reduction without the reassoc flag (nsz only) and a variable %start: all
 ; four lanes are overwritten with +0.0. The expected output is a bare ret, so
 ; the incoming %start value is returned unchanged.
 define float @test_seq_reduce_v4f32_all_zero(float %start, <4 x float> %vec) {
 ; CHECK-LABEL: test_seq_reduce_v4f32_all_zero:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ret
 entry:
   %zero1 = insertelement <4 x float> %vec, float 0.0, i64 0
   %zero2 = insertelement <4 x float> %zero1, float 0.0, i64 1
   %zero3 = insertelement <4 x float> %zero2, float 0.0, i64 2
   %zero4 = insertelement <4 x float> %zero3, float 0.0, i64 3
   %sum = call nsz float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %zero4)
   ret float %sum
 }

 ; Reduction without the reassoc flag (nsz only) and a variable %start: lane 3
 ; is overwritten with +0.0. The expected output chains three scalar fadds
 ; (%start with lanes 0, 1 and 2 in that order) and emits nothing for lane 3.
 define float @test_seq_reduce_with_start(float %start, <4 x float> %vec) {
 ; CHECK-LABEL: test_seq_reduce_with_start:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    mov s2, v1.s[1]
 ; CHECK-NEXT:    mov s1, v1.s[2]
 ; CHECK-NEXT:    fadd s0, s0, s2
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
 entry:
   %with_zero = insertelement <4 x float> %vec, float 0.0, i64 3
   %sum = call nsz float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %with_zero)
   ret float %sum
 }

 ; Double-precision reduction without the reassoc flag (nsz only) and a -0.0
 ; start value: lane 0 is overwritten with +0.0; the expected output is a
 ; plain extract of lane 1.
 define double @test_seq_reduce_v2f64_element_0_zero(<2 x double> %vec) {
 ; CHECK-LABEL: test_seq_reduce_v2f64_element_0_zero:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mov d0, v0.d[1]
 ; CHECK-NEXT:    ret
 entry:
   %with_zero = insertelement <2 x double> %vec, double 0.0, i64 0
   %sum = call nsz double @llvm.vector.reduce.fadd.v2f64(double -0.0, <2 x double> %with_zero)
   ret double %sum
 }

 ; Negative test: the reduction has no fast-math flags at all, so the expected
 ; output still performs the final fadd with the zero constant for lane 3.
 define float @negative_test_seq_reduce(<4 x float> %vec) {
 ; CHECK-LABEL: negative_test_seq_reduce:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov s2, v0.s[2]
 ; CHECK-NEXT:    faddp s0, v0.2s
 ; CHECK-NEXT:    movi d1, #0000000000000000
 ; CHECK-NEXT:    fadd s0, s0, s2
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
   %with_zero = insertelement <4 x float> %vec, float 0.0, i64 3
   %sum = call float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %with_zero)
   ret float %sum
 }

 ; Negative test: sequential all-zero without nsz should not return Start
 ; directly because fadd(-0.0, +0.0) = +0.0, not -0.0.
 ; The expected output therefore keeps one fadd of %start with a zero constant.
 define float @negative_test_seq_reduce_all_zero_no_nsz(float %start, <4 x float> %vec) {
 ; CHECK-LABEL: negative_test_seq_reduce_all_zero_no_nsz:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi d1, #0000000000000000
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
 entry:
   %zero1 = insertelement <4 x float> %vec, float 0.0, i64 0
   %zero2 = insertelement <4 x float> %zero1, float 0.0, i64 1
   %zero3 = insertelement <4 x float> %zero2, float 0.0, i64 2
   %zero4 = insertelement <4 x float> %zero3, float 0.0, i64 3
   %sum = call float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %zero4)
   ret float %sum
 }

 ; Test with -0.0 elements (negative zero). With nsz, these should be
 ; treated as zero elements.
 ; Here lane 3 is overwritten with -0.0 before an nsz faddv; the expected
 ; output sums lanes 0 and 1 with faddp, then adds lane 2.
 define float @test_v4f32_neg_zero_element(<4 x float> %vec) {
 ; CHECK-LABEL: test_v4f32_neg_zero_element:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mov s1, v0.s[2]
 ; CHECK-NEXT:    faddp s0, v0.2s
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
 entry:
   %with_neg_zero = insertelement <4 x float> %vec, float -0.0, i64 3
   %sum = call nsz float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %with_neg_zero)
   ret float %sum
 }

 ; Lane 3 is overwritten with -0.0 and reduced by a reassoc nsz
 ; llvm.vector.reduce.fadd with a -0.0 start value; the expected output sums
 ; lanes 0 and 1 with faddp, then adds lane 2, with no add for lane 3.
 define float @test_reduce_v4f32_neg_zero_element(<4 x float> %vec) {
 ; CHECK-LABEL: test_reduce_v4f32_neg_zero_element:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mov s1, v0.s[2]
 ; CHECK-NEXT:    faddp s0, v0.2s
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
 entry:
   %with_neg_zero = insertelement <4 x float> %vec, float -0.0, i64 3
   %sum = call reassoc nsz float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %with_neg_zero)
   ret float %sum
 }

 declare float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float>)
 declare float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float>)
 declare double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double>)

 declare float @llvm.vector.reduce.fadd.v2f32(float, <2 x float>)
 declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)
 declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>)
	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc -mtriple=aarch64 < %s | FileCheck %s

	; Overwrites lane 0 of a <2 x float> with +0.0 and reduces it with an nsz
	; faddv; the expected output is a plain extract of lane 1 with no add.
	define float @test_v2f32_element_0_zero(<2 x float> %vec) {
	; CHECK-LABEL: test_v2f32_element_0_zero:
	; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
	; CHECK-NEXT: mov s0, v0.s[1]
	; CHECK-NEXT: ret
	entry:
	%with_zero = insertelement <2 x float> %vec, float 0.0, i64 0
	%sum = call nsz float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> %with_zero)
	ret float %sum
	}

	; Overwrites lane 3 of a <4 x float> with +0.0 and reduces it with an nsz
	; faddv; the expected output sums lanes 0 and 1 with faddp, then adds lane 2,
	; and never touches lane 3.
	define float @test_v4f32_element_3_zero(<4 x float> %vec) {
	; CHECK-LABEL: test_v4f32_element_3_zero:
	; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: mov s1, v0.s[2]
	; CHECK-NEXT: faddp s0, v0.2s
	; CHECK-NEXT: fadd s0, s0, s1
	; CHECK-NEXT: ret
	entry:
	%with_zero = insertelement <4 x float> %vec, float 0.0, i64 3
	%sum = call nsz float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %with_zero)
	ret float %sum
	}

	; Overwrites lanes 0 and 2 of a <4 x float> with +0.0 and reduces it with an
	; nsz faddv; the expected output extracts lanes 3 and 1 and adds them once.
	define float @test_v4f32_elements_0_2_zero(<4 x float> %vec) {
	; CHECK-LABEL: test_v4f32_elements_0_2_zero:
	; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: mov s1, v0.s[3]
	; CHECK-NEXT: mov s0, v0.s[1]
	; CHECK-NEXT: fadd s0, s0, s1
	; CHECK-NEXT: ret
	entry:
	%zero1 = insertelement <4 x float> %vec, float 0.0, i64 0
	%zero2 = insertelement <4 x float> %zero1, float 0.0, i64 2
	%sum = call nsz float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %zero2)
	ret float %sum
	}

	; Overwrites all four lanes with +0.0 before the faddv. Note that this call
	; carries no fast-math flags; the expected output is a zero constant
	; materialized with movi and no floating-point add.
	define float @test_v4f32_all_zero(<4 x float> %vec) {
	; CHECK-LABEL: test_v4f32_all_zero:
	; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: movi d0, #0000000000000000
	; CHECK-NEXT: ret
	entry:
	%zero1 = insertelement <4 x float> %vec, float 0.0, i64 0
	%zero2 = insertelement <4 x float> %zero1, float 0.0, i64 1
	%zero3 = insertelement <4 x float> %zero2, float 0.0, i64 2
	%zero4 = insertelement <4 x float> %zero3, float 0.0, i64 3
	%sum = call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %zero4)
	ret float %sum
	}

	; Double-precision case: lane 0 of a <2 x double> is overwritten with +0.0
	; before an nsz faddv; the expected output is a plain extract of lane 1.
	define double @test_v2f64_element_0_zero(<2 x double> %vec) {
	; CHECK-LABEL: test_v2f64_element_0_zero:
	; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: mov d0, v0.d[1]
	; CHECK-NEXT: ret
	entry:
	%with_zero = insertelement <2 x double> %vec, double 0.0, i64 0
	%sum = call nsz double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> %with_zero)
	ret double %sum
	}

	; Negative test: lane 3 is overwritten with +0.0 but the faddv call has no nsz
	; flag, so the expected output keeps the zero insert into lane 3 and the full
	; two-step faddp reduction.
	define float @negative_test(<4 x float> %vec) {
	; CHECK-LABEL: negative_test:
	; CHECK: // %bb.0:
	; CHECK-NEXT: movi d1, #0000000000000000
	; CHECK-NEXT: mov v0.s[3], v1.s[0]
	; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s
	; CHECK-NEXT: faddp s0, v0.2s
	; CHECK-NEXT: ret
	%with_zero = insertelement <4 x float> %vec, float 0.0, i64 3
	%sum = call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %with_zero)
	ret float %sum
	}

	; Generic-intrinsic form: lane 0 of a <2 x float> is overwritten with +0.0 and
	; reduced by a reassoc nsz llvm.vector.reduce.fadd with a -0.0 start value;
	; the expected output is a plain extract of lane 1.
	define float @test_reduce_v2f32_element_0_zero(<2 x float> %vec) {
	; CHECK-LABEL: test_reduce_v2f32_element_0_zero:
	; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
	; CHECK-NEXT: mov s0, v0.s[1]
	; CHECK-NEXT: ret
	entry:
	%with_zero = insertelement <2 x float> %vec, float 0.0, i64 0
	%sum = call reassoc nsz float @llvm.vector.reduce.fadd.v2f32(float -0.0, <2 x float> %with_zero)
	ret float %sum
	}

	; Lane 3 of a <4 x float> is overwritten with +0.0 and reduced by a reassoc
	; nsz llvm.vector.reduce.fadd with a -0.0 start value; the expected output
	; sums lanes 0 and 1 with faddp, then adds lane 2.
	define float @test_reduce_v4f32_element_3_zero(<4 x float> %vec) {
	; CHECK-LABEL: test_reduce_v4f32_element_3_zero:
	; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: mov s1, v0.s[2]
	; CHECK-NEXT: faddp s0, v0.2s
	; CHECK-NEXT: fadd s0, s0, s1
	; CHECK-NEXT: ret
	entry:
	%with_zero = insertelement <4 x float> %vec, float 0.0, i64 3
	%sum = call reassoc nsz float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %with_zero)
	ret float %sum
	}

	; Lanes 0 and 2 of a <4 x float> are overwritten with +0.0 and reduced by a
	; reassoc nsz llvm.vector.reduce.fadd with a -0.0 start value; the expected
	; output extracts lanes 3 and 1 and adds them once.
	define float @test_reduce_v4f32_elements_0_2_zero(<4 x float> %vec) {
	; CHECK-LABEL: test_reduce_v4f32_elements_0_2_zero:
	; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: mov s1, v0.s[3]
	; CHECK-NEXT: mov s0, v0.s[1]
	; CHECK-NEXT: fadd s0, s0, s1
	; CHECK-NEXT: ret
	entry:
	%zero1 = insertelement <4 x float> %vec, float 0.0, i64 0
	%zero2 = insertelement <4 x float> %zero1, float 0.0, i64 2
	%sum = call reassoc nsz float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %zero2)
	ret float %sum
	}

	; All four lanes are overwritten with +0.0 and reduced by a reassoc nsz
	; llvm.vector.reduce.fadd with a -0.0 start value; the expected output is a
	; zero constant materialized with movi and no floating-point add.
	define float @test_reduce_v4f32_all_zero(<4 x float> %vec) {
	; CHECK-LABEL: test_reduce_v4f32_all_zero:
	; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: movi d0, #0000000000000000
	; CHECK-NEXT: ret
	entry:
	%zero1 = insertelement <4 x float> %vec, float 0.0, i64 0
	%zero2 = insertelement <4 x float> %zero1, float 0.0, i64 1
	%zero3 = insertelement <4 x float> %zero2, float 0.0, i64 2
	%zero4 = insertelement <4 x float> %zero3, float 0.0, i64 3
	%sum = call reassoc nsz float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %zero4)
	ret float %sum
	}

	; Double-precision case: lane 0 of a <2 x double> is overwritten with +0.0 and
	; reduced by a reassoc nsz llvm.vector.reduce.fadd with a -0.0 start value;
	; the expected output is a plain extract of lane 1.
	define double @test_reduce_v2f64_element_0_zero(<2 x double> %vec) {
	; CHECK-LABEL: test_reduce_v2f64_element_0_zero:
	; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: mov d0, v0.d[1]
	; CHECK-NEXT: ret
	entry:
	%with_zero = insertelement <2 x double> %vec, double 0.0, i64 0
	%sum = call reassoc nsz double @llvm.vector.reduce.fadd.v2f64(double -0.0, <2 x double> %with_zero)
	ret double %sum
	}

	; Negative test: the reduction is reassoc but not nsz, so the expected output
	; keeps the zero insert into lane 3 and the full two-step faddp reduction.
	define float @negative_test_reduce(<4 x float> %vec) {
	; CHECK-LABEL: negative_test_reduce:
	; CHECK: // %bb.0:
	; CHECK-NEXT: movi d1, #0000000000000000
	; CHECK-NEXT: mov v0.s[3], v1.s[0]
	; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s
	; CHECK-NEXT: faddp s0, v0.2s
	; CHECK-NEXT: ret
	%with_zero = insertelement <4 x float> %vec, float 0.0, i64 3
	%sum = call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %with_zero)
	ret float %sum
	}

	; Reduction without the reassoc flag (nsz only) and a -0.0 start value: lane 0
	; of a <2 x float> is overwritten with +0.0; the expected output is a plain
	; extract of lane 1.
	define float @test_seq_reduce_v2f32_element_0_zero(<2 x float> %vec) {
	; CHECK-LABEL: test_seq_reduce_v2f32_element_0_zero:
	; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
	; CHECK-NEXT: mov s0, v0.s[1]
	; CHECK-NEXT: ret
	entry:
	%with_zero = insertelement <2 x float> %vec, float 0.0, i64 0
	%sum = call nsz float @llvm.vector.reduce.fadd.v2f32(float -0.0, <2 x float> %with_zero)
	ret float %sum
	}

	; Reduction without the reassoc flag (nsz only) and a -0.0 start value: lane 3
	; of a <4 x float> is overwritten with +0.0; the expected output sums lanes 0
	; and 1 with faddp, then adds lane 2, with no add for lane 3.
	define float @test_seq_reduce_v4f32_element_3_zero(<4 x float> %vec) {
	; CHECK-LABEL: test_seq_reduce_v4f32_element_3_zero:
	; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: mov s1, v0.s[2]
	; CHECK-NEXT: faddp s0, v0.2s
	; CHECK-NEXT: fadd s0, s0, s1
	; CHECK-NEXT: ret
	entry:
	%with_zero = insertelement <4 x float> %vec, float 0.0, i64 3
	%sum = call nsz float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %with_zero)
	ret float %sum
	}

	; Reduction without the reassoc flag (nsz only) and a -0.0 start value: lanes
	; 0 and 2 of a <4 x float> are overwritten with +0.0; the expected output
	; extracts lanes 3 and 1 and adds them once.
	define float @test_seq_reduce_v4f32_elements_0_2_zero(<4 x float> %vec) {
	; CHECK-LABEL: test_seq_reduce_v4f32_elements_0_2_zero:
	; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: mov s1, v0.s[3]
	; CHECK-NEXT: mov s0, v0.s[1]
	; CHECK-NEXT: fadd s0, s0, s1
	; CHECK-NEXT: ret
	entry:
	%zero1 = insertelement <4 x float> %vec, float 0.0, i64 0
	%zero2 = insertelement <4 x float> %zero1, float 0.0, i64 2
	%sum = call nsz float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %zero2)
	ret float %sum
	}

	; Reduction without the reassoc flag (nsz only) and a variable %start: all
	; four lanes are overwritten with +0.0. The expected output is a bare ret, so
	; the incoming %start value is returned unchanged.
	define float @test_seq_reduce_v4f32_all_zero(float %start, <4 x float> %vec) {
	; CHECK-LABEL: test_seq_reduce_v4f32_all_zero:
	; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: ret
	entry:
	%zero1 = insertelement <4 x float> %vec, float 0.0, i64 0
	%zero2 = insertelement <4 x float> %zero1, float 0.0, i64 1
	%zero3 = insertelement <4 x float> %zero2, float 0.0, i64 2
	%zero4 = insertelement <4 x float> %zero3, float 0.0, i64 3
	%sum = call nsz float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %zero4)
	ret float %sum
	}

	; Reduction without the reassoc flag (nsz only) and a variable %start: lane 3
	; is overwritten with +0.0. The expected output chains three scalar fadds
	; (%start with lanes 0, 1 and 2 in that order) and emits nothing for lane 3.
	define float @test_seq_reduce_with_start(float %start, <4 x float> %vec) {
	; CHECK-LABEL: test_seq_reduce_with_start:
	; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: fadd s0, s0, s1
	; CHECK-NEXT: mov s2, v1.s[1]
	; CHECK-NEXT: mov s1, v1.s[2]
	; CHECK-NEXT: fadd s0, s0, s2
	; CHECK-NEXT: fadd s0, s0, s1
	; CHECK-NEXT: ret
	entry:
	%with_zero = insertelement <4 x float> %vec, float 0.0, i64 3
	%sum = call nsz float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %with_zero)
	ret float %sum
	}

	; Double-precision reduction without the reassoc flag (nsz only) and a -0.0
	; start value: lane 0 is overwritten with +0.0; the expected output is a
	; plain extract of lane 1.
	define double @test_seq_reduce_v2f64_element_0_zero(<2 x double> %vec) {
	; CHECK-LABEL: test_seq_reduce_v2f64_element_0_zero:
	; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: mov d0, v0.d[1]
	; CHECK-NEXT: ret
	entry:
	%with_zero = insertelement <2 x double> %vec, double 0.0, i64 0
	%sum = call nsz double @llvm.vector.reduce.fadd.v2f64(double -0.0, <2 x double> %with_zero)
	ret double %sum
	}

	; Negative test: the reduction has no fast-math flags at all, so the expected
	; output still performs the final fadd with the zero constant for lane 3.
	define float @negative_test_seq_reduce(<4 x float> %vec) {
	; CHECK-LABEL: negative_test_seq_reduce:
	; CHECK: // %bb.0:
	; CHECK-NEXT: mov s2, v0.s[2]
	; CHECK-NEXT: faddp s0, v0.2s
	; CHECK-NEXT: movi d1, #0000000000000000
	; CHECK-NEXT: fadd s0, s0, s2
	; CHECK-NEXT: fadd s0, s0, s1
	; CHECK-NEXT: ret
	%with_zero = insertelement <4 x float> %vec, float 0.0, i64 3
	%sum = call float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %with_zero)
	ret float %sum
	}

	; Negative test: sequential all-zero without nsz should not return Start
	; directly because fadd(-0.0, +0.0) = +0.0, not -0.0.
	; The expected output therefore keeps one fadd of %start with a zero constant.
	define float @negative_test_seq_reduce_all_zero_no_nsz(float %start, <4 x float> %vec) {
	; CHECK-LABEL: negative_test_seq_reduce_all_zero_no_nsz:
	; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: movi d1, #0000000000000000
	; CHECK-NEXT: fadd s0, s0, s1
	; CHECK-NEXT: ret
	entry:
	%zero1 = insertelement <4 x float> %vec, float 0.0, i64 0
	%zero2 = insertelement <4 x float> %zero1, float 0.0, i64 1
	%zero3 = insertelement <4 x float> %zero2, float 0.0, i64 2
	%zero4 = insertelement <4 x float> %zero3, float 0.0, i64 3
	%sum = call float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %zero4)
	ret float %sum
	}

	; Test with -0.0 elements (negative zero). With nsz, these should be
	; treated as zero elements.
	; Here lane 3 is overwritten with -0.0 before an nsz faddv; the expected
	; output sums lanes 0 and 1 with faddp, then adds lane 2.
	define float @test_v4f32_neg_zero_element(<4 x float> %vec) {
	; CHECK-LABEL: test_v4f32_neg_zero_element:
	; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: mov s1, v0.s[2]
	; CHECK-NEXT: faddp s0, v0.2s
	; CHECK-NEXT: fadd s0, s0, s1
	; CHECK-NEXT: ret
	entry:
	%with_neg_zero = insertelement <4 x float> %vec, float -0.0, i64 3
	%sum = call nsz float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %with_neg_zero)
	ret float %sum
	}

	; Lane 3 is overwritten with -0.0 and reduced by a reassoc nsz
	; llvm.vector.reduce.fadd with a -0.0 start value; the expected output sums
	; lanes 0 and 1 with faddp, then adds lane 2, with no add for lane 3.
	define float @test_reduce_v4f32_neg_zero_element(<4 x float> %vec) {
	; CHECK-LABEL: test_reduce_v4f32_neg_zero_element:
	; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: mov s1, v0.s[2]
	; CHECK-NEXT: faddp s0, v0.2s
	; CHECK-NEXT: fadd s0, s0, s1
	; CHECK-NEXT: ret
	entry:
	%with_neg_zero = insertelement <4 x float> %vec, float -0.0, i64 3
	%sum = call reassoc nsz float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %with_neg_zero)
	ret float %sum
	}

	declare float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float>)
	declare float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float>)
	declare double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double>)

	declare float @llvm.vector.reduce.fadd.v2f32(float, <2 x float>)
	declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)
	declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>)