| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvl256b | FileCheck %s --check-prefixes=CHECK,V |
| ; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+f,+zve32f,+zvl256b | FileCheck %s --check-prefixes=CHECK,ZVE32F |
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+optimized-zero-stride-load,+zvl256b | FileCheck %s --check-prefixes=CHECK,OPTIMIZED,OPTV
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+f,+zve32f,+optimized-zero-stride-load,+zvl256b | FileCheck %s --check-prefixes=CHECK,OPTIMIZED,OPTZVE32F
| |
| %struct.foo = type { i32, i32, i32, i32 } |
| |
| ; void gather(signed char * __restrict A, signed char * __restrict B) { |
| ; for (int i = 0; i != 1024; ++i) |
| ; A[i] += B[i * 5]; |
| ; } |
| define void @gather(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) { |
| ; CHECK-LABEL: gather: |
| ; CHECK: # %bb.0: # %entry |
| ; CHECK-NEXT: addi a2, a0, 1024 |
| ; CHECK-NEXT: li a4, 32 |
| ; CHECK-NEXT: li a3, 5 |
| ; CHECK-NEXT: vsetvli zero, a4, e8, m1, ta, ma |
| ; CHECK-NEXT: .LBB0_1: # %vector.body |
| ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vlse8.v v8, (a1), a3 |
| ; CHECK-NEXT: vle8.v v9, (a0) |
| ; CHECK-NEXT: vadd.vv v8, v9, v8 |
| ; CHECK-NEXT: vse8.v v8, (a0) |
| ; CHECK-NEXT: addi a0, a0, 32 |
| ; CHECK-NEXT: addi a1, a1, 160 |
| ; CHECK-NEXT: bne a0, a2, .LBB0_1 |
| ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup |
| ; CHECK-NEXT: ret |
| entry: |
| br label %vector.body |
| |
| vector.body: ; preds = %vector.body, %entry |
| %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] |
| %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ] |
| %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5) |
| %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i |
| %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef) |
| %i2 = getelementptr inbounds i8, ptr %A, i64 %index |
| %wide.load = load <32 x i8>, ptr %i2, align 1 |
| %i4 = add <32 x i8> %wide.load, %wide.masked.gather |
| store <32 x i8> %i4, ptr %i2, align 1 |
| %index.next = add nuw i64 %index, 32 |
| %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32) |
| %i6 = icmp eq i64 %index.next, 1024 |
| br i1 %i6, label %for.cond.cleanup, label %vector.body |
| |
| for.cond.cleanup: ; preds = %vector.body |
| ret void |
| } |
| |
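; Same as @gather, but with a non-trivial mask and passthru: the mask constant
; is materialized into v0 and the strided load becomes a masked vlse8.v under
; the mask-undisturbed (mu) policy, with %maskedoff copied in for the disabled
; lanes.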
| define void @gather_masked(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, <32 x i8> %maskedoff) { |
| ; CHECK-LABEL: gather_masked: |
| ; CHECK: # %bb.0: # %entry |
| ; CHECK-NEXT: addi a2, a0, 1024 |
| ; CHECK-NEXT: lui a4, 983765 |
| ; CHECK-NEXT: li a3, 32 |
| ; CHECK-NEXT: addi a4, a4, 873 |
| ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma |
| ; CHECK-NEXT: vmv.s.x v0, a4 |
| ; CHECK-NEXT: li a4, 5 |
| ; CHECK-NEXT: .LBB1_1: # %vector.body |
| ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, mu |
| ; CHECK-NEXT: vmv1r.v v9, v8 |
| ; CHECK-NEXT: vlse8.v v9, (a1), a4, v0.t |
| ; CHECK-NEXT: vle8.v v10, (a0) |
| ; CHECK-NEXT: vadd.vv v9, v10, v9 |
| ; CHECK-NEXT: vse8.v v9, (a0) |
| ; CHECK-NEXT: addi a0, a0, 32 |
| ; CHECK-NEXT: addi a1, a1, 160 |
| ; CHECK-NEXT: bne a0, a2, .LBB1_1 |
| ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup |
| ; CHECK-NEXT: ret |
| entry: |
| br label %vector.body |
| |
| vector.body: ; preds = %vector.body, %entry |
| %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] |
| %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ] |
| %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5) |
| %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i |
| %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <32 x i8> %maskedoff) |
| %i2 = getelementptr inbounds i8, ptr %A, i64 %index |
| %wide.load = load <32 x i8>, ptr %i2, align 1 |
| %i4 = add <32 x i8> %wide.load, %wide.masked.gather |
| store <32 x i8> %i4, ptr %i2, align 1 |
| %index.next = add nuw i64 %index, 32 |
| %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32) |
| %i6 = icmp eq i64 %index.next, 1024 |
| br i1 %i6, label %for.cond.cleanup, label %vector.body |
| |
| for.cond.cleanup: ; preds = %vector.body |
| ret void |
| } |
| |
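; A reversed induction vector gives a negative stride: the base is advanced to
; the last element (B + 31 * 5 = B + 155) and the load uses a stride of -5.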
| define void @gather_negative_stride(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) { |
| ; CHECK-LABEL: gather_negative_stride: |
| ; CHECK: # %bb.0: # %entry |
| ; CHECK-NEXT: addi a1, a1, 155 |
| ; CHECK-NEXT: addi a2, a0, 1024 |
| ; CHECK-NEXT: li a4, 32 |
| ; CHECK-NEXT: li a3, -5 |
| ; CHECK-NEXT: vsetvli zero, a4, e8, m1, ta, ma |
| ; CHECK-NEXT: .LBB2_1: # %vector.body |
| ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vlse8.v v8, (a1), a3 |
| ; CHECK-NEXT: vle8.v v9, (a0) |
| ; CHECK-NEXT: vadd.vv v8, v9, v8 |
| ; CHECK-NEXT: vse8.v v8, (a0) |
| ; CHECK-NEXT: addi a0, a0, 32 |
| ; CHECK-NEXT: addi a1, a1, 160 |
| ; CHECK-NEXT: bne a0, a2, .LBB2_1 |
| ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup |
| ; CHECK-NEXT: ret |
| entry: |
| br label %vector.body |
| |
| vector.body: ; preds = %vector.body, %entry |
| %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] |
| %vec.ind = phi <32 x i64> [ <i64 31, i64 30, i64 29, i64 28, i64 27, i64 26, i64 25, i64 24, i64 23, i64 22, i64 21, i64 20, i64 19, i64 18, i64 17, i64 16, i64 15, i64 14, i64 13, i64 12, i64 11, i64 10, i64 9, i64 8, i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, %entry ], [ %vec.ind.next, %vector.body ] |
| %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5) |
| %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i |
| %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef) |
| %i2 = getelementptr inbounds i8, ptr %A, i64 %index |
| %wide.load = load <32 x i8>, ptr %i2, align 1 |
| %i4 = add <32 x i8> %wide.load, %wide.masked.gather |
| store <32 x i8> %i4, ptr %i2, align 1 |
| %index.next = add nuw i64 %index, 32 |
| %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32) |
| %i6 = icmp eq i64 %index.next, 1024 |
| br i1 %i6, label %for.cond.cleanup, label %vector.body |
| |
| for.cond.cleanup: ; preds = %vector.body |
| ret void |
| } |
| |
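; With a zero stride every lane reads the same address, so the gather becomes
; a scalar lbu whose value is folded into vadd.vx.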
| define void @gather_zero_stride(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) { |
| ; CHECK-LABEL: gather_zero_stride: |
| ; CHECK: # %bb.0: # %entry |
| ; CHECK-NEXT: addi a2, a0, 1024 |
| ; CHECK-NEXT: li a3, 32 |
| ; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, ma |
| ; CHECK-NEXT: .LBB3_1: # %vector.body |
| ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: lbu a3, 0(a1) |
| ; CHECK-NEXT: vle8.v v8, (a0) |
| ; CHECK-NEXT: vadd.vx v8, v8, a3 |
| ; CHECK-NEXT: vse8.v v8, (a0) |
| ; CHECK-NEXT: addi a0, a0, 32 |
| ; CHECK-NEXT: addi a1, a1, 160 |
| ; CHECK-NEXT: bne a0, a2, .LBB3_1 |
| ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup |
| ; CHECK-NEXT: ret |
| entry: |
| br label %vector.body |
| |
| vector.body: ; preds = %vector.body, %entry |
| %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] |
| %vec.ind = phi <32 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ] |
| %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5) |
| %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i |
| %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef) |
| %i2 = getelementptr inbounds i8, ptr %A, i64 %index |
| %wide.load = load <32 x i8>, ptr %i2, align 1 |
| %i4 = add <32 x i8> %wide.load, %wide.masked.gather |
| store <32 x i8> %i4, ptr %i2, align 1 |
| %index.next = add nuw i64 %index, 32 |
| %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32) |
| %i6 = icmp eq i64 %index.next, 1024 |
| br i1 %i6, label %for.cond.cleanup, label %vector.body |
| |
| for.cond.cleanup: ; preds = %vector.body |
| ret void |
| } |
| |
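; Same zero-stride pattern with i32 elements: a scalar lw feeding vadd.vx.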
| define void @gather_zero_stride_i32(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) { |
| ; CHECK-LABEL: gather_zero_stride_i32: |
| ; CHECK: # %bb.0: # %entry |
| ; CHECK-NEXT: addi a2, a0, 1024 |
| ; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma |
| ; CHECK-NEXT: .LBB4_1: # %vector.body |
| ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: lw a3, 0(a1) |
| ; CHECK-NEXT: vle32.v v8, (a0) |
| ; CHECK-NEXT: vadd.vx v8, v8, a3 |
| ; CHECK-NEXT: vse32.v v8, (a0) |
| ; CHECK-NEXT: addi a0, a0, 8 |
| ; CHECK-NEXT: addi a1, a1, 160 |
| ; CHECK-NEXT: bne a0, a2, .LBB4_1 |
| ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup |
| ; CHECK-NEXT: ret |
| entry: |
| br label %vector.body |
| |
| vector.body: ; preds = %vector.body, %entry |
| %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] |
| %vec.ind = phi <8 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ] |
| %i = mul nuw nsw <8 x i64> %vec.ind, splat (i64 5) |
| %i1 = getelementptr inbounds i8, ptr %B, <8 x i64> %i |
| %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef) |
| %i2 = getelementptr inbounds i8, ptr %A, i64 %index |
| %wide.load = load <8 x i32>, ptr %i2, align 4 |
| %i4 = add <8 x i32> %wide.load, %wide.masked.gather |
| store <8 x i32> %i4, ptr %i2, align 4 |
| %index.next = add nuw i64 %index, 8 |
| %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 32) |
| %i6 = icmp eq i64 %index.next, 1024 |
| br i1 %i6, label %for.cond.cleanup, label %vector.body |
| |
| for.cond.cleanup: ; preds = %vector.body |
| ret void |
| } |
| |
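; Here the splatted value is the dividend of a udiv, and there is no
; scalar-dividend (.vx) form of vdivu, so the splat cannot be folded away:
; without optimized-zero-stride-load it is materialized with vmv.v.x, with it
; the value is loaded directly by a zero-strided vlse8.v.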
| define void @gather_zero_stride_unfold(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) { |
| ; V-LABEL: gather_zero_stride_unfold: |
| ; V: # %bb.0: # %entry |
| ; V-NEXT: addi a2, a0, 1024 |
| ; V-NEXT: li a3, 32 |
| ; V-NEXT: vsetvli zero, a3, e8, m1, ta, ma |
| ; V-NEXT: .LBB5_1: # %vector.body |
| ; V-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; V-NEXT: lbu a3, 0(a1) |
| ; V-NEXT: vle8.v v8, (a0) |
| ; V-NEXT: vmv.v.x v9, a3 |
| ; V-NEXT: vdivu.vv v8, v9, v8 |
| ; V-NEXT: vse8.v v8, (a0) |
| ; V-NEXT: addi a0, a0, 32 |
| ; V-NEXT: addi a1, a1, 160 |
| ; V-NEXT: bne a0, a2, .LBB5_1 |
| ; V-NEXT: # %bb.2: # %for.cond.cleanup |
| ; V-NEXT: ret |
| ; |
| ; ZVE32F-LABEL: gather_zero_stride_unfold: |
| ; ZVE32F: # %bb.0: # %entry |
| ; ZVE32F-NEXT: addi a2, a0, 1024 |
| ; ZVE32F-NEXT: li a3, 32 |
| ; ZVE32F-NEXT: vsetvli zero, a3, e8, m1, ta, ma |
| ; ZVE32F-NEXT: .LBB5_1: # %vector.body |
| ; ZVE32F-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; ZVE32F-NEXT: lbu a3, 0(a1) |
| ; ZVE32F-NEXT: vle8.v v8, (a0) |
| ; ZVE32F-NEXT: vmv.v.x v9, a3 |
| ; ZVE32F-NEXT: vdivu.vv v8, v9, v8 |
| ; ZVE32F-NEXT: vse8.v v8, (a0) |
| ; ZVE32F-NEXT: addi a0, a0, 32 |
| ; ZVE32F-NEXT: addi a1, a1, 160 |
| ; ZVE32F-NEXT: bne a0, a2, .LBB5_1 |
| ; ZVE32F-NEXT: # %bb.2: # %for.cond.cleanup |
| ; ZVE32F-NEXT: ret |
| ; |
| ; OPTIMIZED-LABEL: gather_zero_stride_unfold: |
| ; OPTIMIZED: # %bb.0: # %entry |
| ; OPTIMIZED-NEXT: addi a2, a0, 1024 |
| ; OPTIMIZED-NEXT: li a3, 32 |
| ; OPTIMIZED-NEXT: vsetvli zero, a3, e8, m1, ta, ma |
| ; OPTIMIZED-NEXT: .LBB5_1: # %vector.body |
| ; OPTIMIZED-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; OPTIMIZED-NEXT: vlse8.v v8, (a1), zero |
| ; OPTIMIZED-NEXT: vle8.v v9, (a0) |
| ; OPTIMIZED-NEXT: vdivu.vv v8, v8, v9 |
| ; OPTIMIZED-NEXT: vse8.v v8, (a0) |
| ; OPTIMIZED-NEXT: addi a0, a0, 32 |
| ; OPTIMIZED-NEXT: addi a1, a1, 160 |
| ; OPTIMIZED-NEXT: bne a0, a2, .LBB5_1 |
| ; OPTIMIZED-NEXT: # %bb.2: # %for.cond.cleanup |
| ; OPTIMIZED-NEXT: ret |
| entry: |
| br label %vector.body |
| |
| vector.body: ; preds = %vector.body, %entry |
| %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] |
| %vec.ind = phi <32 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ] |
| %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5) |
| %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i |
| %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef) |
| %i2 = getelementptr inbounds i8, ptr %A, i64 %index |
| %wide.load = load <32 x i8>, ptr %i2, align 1 |
| %i4 = udiv <32 x i8> %wide.masked.gather, %wide.load |
| store <32 x i8> %i4, ptr %i2, align 1 |
| %index.next = add nuw i64 %index, 32 |
| %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32) |
| %i6 = icmp eq i64 %index.next, 1024 |
| br i1 %i6, label %for.cond.cleanup, label %vector.body |
| |
| for.cond.cleanup: ; preds = %vector.body |
| ret void |
| } |
| |
; void scatter(signed char * __restrict A, signed char * __restrict B) {
; for (int i = 0; i < 1024; ++i)
;   A[i * 5] += B[i];
; }
| define void @scatter(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) { |
| ; CHECK-LABEL: scatter: |
| ; CHECK: # %bb.0: # %entry |
| ; CHECK-NEXT: addi a2, a1, 1024 |
| ; CHECK-NEXT: li a4, 32 |
| ; CHECK-NEXT: li a3, 5 |
| ; CHECK-NEXT: vsetvli zero, a4, e8, m1, ta, ma |
| ; CHECK-NEXT: .LBB6_1: # %vector.body |
| ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vle8.v v8, (a1) |
| ; CHECK-NEXT: vlse8.v v9, (a0), a3 |
| ; CHECK-NEXT: addi a1, a1, 32 |
| ; CHECK-NEXT: vadd.vv v8, v9, v8 |
| ; CHECK-NEXT: vsse8.v v8, (a0), a3 |
| ; CHECK-NEXT: addi a0, a0, 160 |
| ; CHECK-NEXT: bne a1, a2, .LBB6_1 |
| ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup |
| ; CHECK-NEXT: ret |
| entry: |
| br label %vector.body |
| |
| vector.body: ; preds = %vector.body, %entry |
| %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] |
| %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ] |
| %i = getelementptr inbounds i8, ptr %B, i64 %index |
| %wide.load = load <32 x i8>, ptr %i, align 1 |
| %i2 = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5) |
| %i3 = getelementptr inbounds i8, ptr %A, <32 x i64> %i2 |
| %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i3, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef) |
| %i4 = add <32 x i8> %wide.masked.gather, %wide.load |
| call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> %i4, <32 x ptr> %i3, i32 1, <32 x i1> splat (i1 true)) |
| %index.next = add nuw i64 %index, 32 |
| %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32) |
| %i5 = icmp eq i64 %index.next, 1024 |
| br i1 %i5, label %for.cond.cleanup, label %vector.body |
| |
| for.cond.cleanup: ; preds = %vector.body |
| ret void |
| } |
| |
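; Masked variant of @scatter: the strided load and the strided store are both
; predicated on v0, while the contiguous load of B stays unmasked.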
| define void @scatter_masked(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, <32 x i8> %maskedoff) { |
| ; CHECK-LABEL: scatter_masked: |
| ; CHECK: # %bb.0: # %entry |
| ; CHECK-NEXT: addi a2, a1, 1024 |
| ; CHECK-NEXT: li a3, 32 |
| ; CHECK-NEXT: lui a4, 983765 |
| ; CHECK-NEXT: addi a4, a4, 873 |
| ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma |
| ; CHECK-NEXT: vmv.s.x v0, a4 |
| ; CHECK-NEXT: li a4, 5 |
| ; CHECK-NEXT: .LBB7_1: # %vector.body |
| ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, mu |
| ; CHECK-NEXT: vle8.v v9, (a1) |
| ; CHECK-NEXT: vmv1r.v v10, v8 |
| ; CHECK-NEXT: vlse8.v v10, (a0), a4, v0.t |
| ; CHECK-NEXT: addi a1, a1, 32 |
| ; CHECK-NEXT: vadd.vv v9, v10, v9 |
| ; CHECK-NEXT: vsse8.v v9, (a0), a4, v0.t |
| ; CHECK-NEXT: addi a0, a0, 160 |
| ; CHECK-NEXT: bne a1, a2, .LBB7_1 |
| ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup |
| ; CHECK-NEXT: ret |
| entry: |
| br label %vector.body |
| |
| vector.body: ; preds = %vector.body, %entry |
| %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] |
| %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ] |
| %i = getelementptr inbounds i8, ptr %B, i64 %index |
| %wide.load = load <32 x i8>, ptr %i, align 1 |
| %i2 = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5) |
| %i3 = getelementptr inbounds i8, ptr %A, <32 x i64> %i2 |
| %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i3, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <32 x i8> %maskedoff) |
| %i4 = add <32 x i8> %wide.masked.gather, %wide.load |
| call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> %i4, <32 x ptr> %i3, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>) |
| %index.next = add nuw i64 %index, 32 |
| %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32) |
| %i5 = icmp eq i64 %index.next, 1024 |
| br i1 %i5, label %for.cond.cleanup, label %vector.body |
| |
| for.cond.cleanup: ; preds = %vector.body |
| ret void |
| } |
| |
; void gather_pow2(int * __restrict A, int * __restrict B) {
; for (int i = 0; i != 1024; ++i)
;   A[i] += B[i * 4];
; }
| define void @gather_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) { |
| ; CHECK-LABEL: gather_pow2: |
| ; CHECK: # %bb.0: # %entry |
| ; CHECK-NEXT: lui a3, 1 |
| ; CHECK-NEXT: li a2, 16 |
| ; CHECK-NEXT: add a3, a0, a3 |
| ; CHECK-NEXT: li a4, 32 |
| ; CHECK-NEXT: .LBB8_1: # %vector.body |
| ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma |
| ; CHECK-NEXT: vlse32.v v8, (a1), a2 |
| ; CHECK-NEXT: vsetvli zero, a4, e8, m1, ta, ma |
| ; CHECK-NEXT: vle8.v v9, (a0) |
| ; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma |
| ; CHECK-NEXT: vadd.vv v8, v9, v8 |
| ; CHECK-NEXT: vsetvli zero, a4, e8, m1, ta, ma |
| ; CHECK-NEXT: vse8.v v8, (a0) |
| ; CHECK-NEXT: addi a0, a0, 32 |
| ; CHECK-NEXT: addi a1, a1, 128 |
| ; CHECK-NEXT: bne a0, a3, .LBB8_1 |
| ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup |
| ; CHECK-NEXT: ret |
| entry: |
| br label %vector.body |
| |
| vector.body: ; preds = %vector.body, %entry |
| %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] |
| %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ] |
| %i = shl nsw <8 x i64> %vec.ind, splat (i64 2) |
| %i1 = getelementptr inbounds i32, ptr %B, <8 x i64> %i |
| %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef) |
| %i2 = getelementptr inbounds i32, ptr %A, i64 %index |
| %wide.load = load <8 x i32>, ptr %i2, align 1 |
| %i4 = add <8 x i32> %wide.load, %wide.masked.gather |
| store <8 x i32> %i4, ptr %i2, align 1 |
| %index.next = add nuw i64 %index, 8 |
| %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 8) |
| %i6 = icmp eq i64 %index.next, 1024 |
| br i1 %i6, label %for.cond.cleanup, label %vector.body |
| |
| for.cond.cleanup: ; preds = %vector.body |
| ret void |
| } |
| |
; void scatter_pow2(int * __restrict A, int * __restrict B) {
; for (int i = 0; i < 1024; ++i)
;   A[i * 4] += B[i];
; }
| define void @scatter_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) { |
| ; CHECK-LABEL: scatter_pow2: |
| ; CHECK: # %bb.0: # %entry |
| ; CHECK-NEXT: lui a3, 1 |
| ; CHECK-NEXT: li a2, 32 |
| ; CHECK-NEXT: add a3, a1, a3 |
| ; CHECK-NEXT: li a4, 16 |
| ; CHECK-NEXT: .LBB9_1: # %vector.body |
| ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vsetvli zero, a2, e8, m1, ta, ma |
| ; CHECK-NEXT: vle8.v v8, (a1) |
| ; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma |
| ; CHECK-NEXT: vlse32.v v9, (a0), a4 |
| ; CHECK-NEXT: addi a1, a1, 32 |
| ; CHECK-NEXT: vadd.vv v8, v9, v8 |
| ; CHECK-NEXT: vsse32.v v8, (a0), a4 |
| ; CHECK-NEXT: addi a0, a0, 128 |
| ; CHECK-NEXT: bne a1, a3, .LBB9_1 |
| ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup |
| ; CHECK-NEXT: ret |
| entry: |
| br label %vector.body |
| |
| vector.body: ; preds = %vector.body, %entry |
| %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] |
| %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ] |
| %i = getelementptr inbounds i32, ptr %B, i64 %index |
| %wide.load = load <8 x i32>, ptr %i, align 1 |
| %i2 = shl nuw nsw <8 x i64> %vec.ind, splat (i64 2) |
| %i3 = getelementptr inbounds i32, ptr %A, <8 x i64> %i2 |
| %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i3, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef) |
| %i4 = add <8 x i32> %wide.masked.gather, %wide.load |
| call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i4, <8 x ptr> %i3, i32 4, <8 x i1> splat (i1 true)) |
| %index.next = add nuw i64 %index, 8 |
| %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 8) |
| %i5 = icmp eq i64 %index.next, 1024 |
| br i1 %i5, label %for.cond.cleanup, label %vector.body |
| |
| for.cond.cleanup: ; preds = %vector.body |
| ret void |
| } |
| |
; struct foo {
;   int a, b, c, d;
; };
;
; void struct_gather(int * __restrict A, struct foo * __restrict B) {
; for (int i = 0; i < 1024; ++i)
;   A[i] += B[i].b;
; }
| define void @struct_gather(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) { |
| ; CHECK-LABEL: struct_gather: |
| ; CHECK: # %bb.0: # %entry |
| ; CHECK-NEXT: addi a1, a1, 132 |
| ; CHECK-NEXT: lui a2, 1 |
| ; CHECK-NEXT: add a2, a0, a2 |
| ; CHECK-NEXT: li a3, 16 |
| ; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma |
| ; CHECK-NEXT: .LBB10_1: # %vector.body |
| ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: addi a4, a0, 32 |
| ; CHECK-NEXT: addi a5, a1, -128 |
| ; CHECK-NEXT: vlse32.v v8, (a1), a3 |
| ; CHECK-NEXT: vle32.v v9, (a0) |
| ; CHECK-NEXT: vlse32.v v10, (a5), a3 |
| ; CHECK-NEXT: vle32.v v11, (a4) |
| ; CHECK-NEXT: vadd.vv v9, v9, v10 |
| ; CHECK-NEXT: vadd.vv v8, v11, v8 |
| ; CHECK-NEXT: vse32.v v9, (a0) |
| ; CHECK-NEXT: vse32.v v8, (a4) |
| ; CHECK-NEXT: addi a0, a0, 64 |
| ; CHECK-NEXT: addi a1, a1, 256 |
| ; CHECK-NEXT: bne a0, a2, .LBB10_1 |
| ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup |
| ; CHECK-NEXT: ret |
| entry: |
| br label %vector.body |
| |
| vector.body: ; preds = %vector.body, %entry |
| %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] |
| %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ] |
| %step.add = add <8 x i64> %vec.ind, splat (i64 8) |
| %i = getelementptr inbounds %struct.foo, ptr %B, <8 x i64> %vec.ind, i32 1 |
| %i1 = getelementptr inbounds %struct.foo, ptr %B, <8 x i64> %step.add, i32 1 |
| %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef) |
| %wide.masked.gather9 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef) |
| %i2 = getelementptr inbounds i32, ptr %A, i64 %index |
| %wide.load = load <8 x i32>, ptr %i2, align 4 |
| %i4 = getelementptr inbounds i32, ptr %i2, i64 8 |
| %wide.load10 = load <8 x i32>, ptr %i4, align 4 |
| %i6 = add nsw <8 x i32> %wide.load, %wide.masked.gather |
| %i7 = add nsw <8 x i32> %wide.load10, %wide.masked.gather9 |
| store <8 x i32> %i6, ptr %i2, align 4 |
| store <8 x i32> %i7, ptr %i4, align 4 |
| %index.next = add nuw i64 %index, 16 |
| %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 16) |
| %i10 = icmp eq i64 %index.next, 1024 |
| br i1 %i10, label %for.cond.cleanup, label %vector.body |
| |
| for.cond.cleanup: ; preds = %vector.body |
| ret void |
| } |
| |
; void gather_unroll(int * __restrict A, int * __restrict B) {
; for (int i = 0; i < 1024; i += 4) {
;   A[i] += B[i * 4];
;   A[i+1] += B[(i+1) * 4];
;   A[i+2] += B[(i+2) * 4];
;   A[i+3] += B[(i+3) * 4];
; }
; }
| define void @gather_unroll(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) { |
| ; CHECK-LABEL: gather_unroll: |
| ; CHECK: # %bb.0: # %entry |
| ; CHECK-NEXT: li a2, 256 |
| ; CHECK-NEXT: li a3, 64 |
| ; CHECK-NEXT: li a4, 16 |
| ; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma |
| ; CHECK-NEXT: .LBB11_1: # %vector.body |
| ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vlse32.v v8, (a1), a3 |
| ; CHECK-NEXT: vlse32.v v9, (a0), a4 |
| ; CHECK-NEXT: addi a5, a1, 16 |
| ; CHECK-NEXT: vadd.vv v8, v9, v8 |
| ; CHECK-NEXT: vsse32.v v8, (a0), a4 |
| ; CHECK-NEXT: vlse32.v v8, (a5), a3 |
| ; CHECK-NEXT: addi a5, a0, 4 |
| ; CHECK-NEXT: vlse32.v v9, (a5), a4 |
| ; CHECK-NEXT: vadd.vv v8, v9, v8 |
| ; CHECK-NEXT: vsse32.v v8, (a5), a4 |
| ; CHECK-NEXT: addi a5, a1, 32 |
| ; CHECK-NEXT: vlse32.v v8, (a5), a3 |
| ; CHECK-NEXT: addi a5, a0, 8 |
| ; CHECK-NEXT: vlse32.v v9, (a5), a4 |
| ; CHECK-NEXT: vadd.vv v8, v9, v8 |
| ; CHECK-NEXT: vsse32.v v8, (a5), a4 |
| ; CHECK-NEXT: addi a5, a1, 48 |
| ; CHECK-NEXT: vlse32.v v8, (a5), a3 |
| ; CHECK-NEXT: addi a5, a0, 12 |
| ; CHECK-NEXT: vlse32.v v9, (a5), a4 |
| ; CHECK-NEXT: addi a2, a2, -8 |
| ; CHECK-NEXT: addi a1, a1, 512 |
| ; CHECK-NEXT: vadd.vv v8, v9, v8 |
| ; CHECK-NEXT: vsse32.v v8, (a5), a4 |
| ; CHECK-NEXT: addi a0, a0, 128 |
| ; CHECK-NEXT: bnez a2, .LBB11_1 |
| ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup |
| ; CHECK-NEXT: ret |
| entry: |
| br label %vector.body |
| |
| vector.body: ; preds = %vector.body, %entry |
| %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] |
| %vec.ind = phi <8 x i64> [ <i64 0, i64 4, i64 8, i64 12, i64 16, i64 20, i64 24, i64 28>, %entry ], [ %vec.ind.next, %vector.body ] |
| %i = shl nuw nsw <8 x i64> %vec.ind, splat (i64 2) |
| %i1 = getelementptr inbounds i32, ptr %B, <8 x i64> %i |
| %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef) |
| %i2 = getelementptr inbounds i32, ptr %A, <8 x i64> %vec.ind |
| %wide.masked.gather52 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i2, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef) |
| %i3 = add nsw <8 x i32> %wide.masked.gather52, %wide.masked.gather |
| call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i3, <8 x ptr> %i2, i32 4, <8 x i1> splat (i1 true)) |
| %i4 = or disjoint <8 x i64> %vec.ind, splat (i64 1) |
| %i5 = shl nsw <8 x i64> %i4, splat (i64 2) |
| %i6 = getelementptr inbounds i32, ptr %B, <8 x i64> %i5 |
| %wide.masked.gather53 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i6, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef) |
| %i7 = getelementptr inbounds i32, ptr %A, <8 x i64> %i4 |
| %wide.masked.gather54 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i7, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef) |
| %i8 = add nsw <8 x i32> %wide.masked.gather54, %wide.masked.gather53 |
| call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i8, <8 x ptr> %i7, i32 4, <8 x i1> splat (i1 true)) |
| %i9 = or disjoint <8 x i64> %vec.ind, splat (i64 2) |
| %i10 = shl nsw <8 x i64> %i9, splat (i64 2) |
| %i11 = getelementptr inbounds i32, ptr %B, <8 x i64> %i10 |
| %wide.masked.gather55 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i11, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef) |
| %i12 = getelementptr inbounds i32, ptr %A, <8 x i64> %i9 |
| %wide.masked.gather56 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i12, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef) |
| %i13 = add nsw <8 x i32> %wide.masked.gather56, %wide.masked.gather55 |
| call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i13, <8 x ptr> %i12, i32 4, <8 x i1> splat (i1 true)) |
%i14 = or disjoint <8 x i64> %vec.ind, splat (i64 3)
%i15 = shl nsw <8 x i64> %i14, splat (i64 2)
| %i16 = getelementptr inbounds i32, ptr %B, <8 x i64> %i15 |
| %wide.masked.gather57 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i16, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef) |
| %i17 = getelementptr inbounds i32, ptr %A, <8 x i64> %i14 |
| %wide.masked.gather58 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i17, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef) |
| %i18 = add nsw <8 x i32> %wide.masked.gather58, %wide.masked.gather57 |
| call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i18, <8 x ptr> %i17, i32 4, <8 x i1> splat (i1 true)) |
| %index.next = add nuw i64 %index, 8 |
| %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 32) |
| %i19 = icmp eq i64 %index.next, 256 |
| br i1 %i19, label %for.cond.cleanup, label %vector.body |
| |
| for.cond.cleanup: ; preds = %vector.body |
| ret void |
| } |
| |
| declare <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr>, i32 immarg, <32 x i1>, <32 x i8>) |
| declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr>, i32 immarg, <8 x i1>, <8 x i32>) |
| declare void @llvm.masked.scatter.v32i8.v32p0(<32 x i8>, <32 x ptr>, i32 immarg, <32 x i1>) |
| declare void @llvm.masked.scatter.v8i32.v8p0(<8 x i32>, <8 x ptr>, i32 immarg, <8 x i1>) |
| |
| ; Make sure we don't crash in getTgtMemIntrinsic for a vector of pointers. |
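; With +v the <2 x ptr> gathers become vlse64.v with a 40-byte stride; Zve32f
; has no 64-bit elements, so the same IR is scalarized into ld/sd.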
| define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocapture readonly %arg1) { |
| ; V-LABEL: gather_of_pointers: |
| ; V: # %bb.0: # %bb |
| ; V-NEXT: lui a2, 2 |
| ; V-NEXT: add a2, a0, a2 |
| ; V-NEXT: li a3, 40 |
| ; V-NEXT: vsetivli zero, 2, e64, m1, ta, ma |
| ; V-NEXT: .LBB12_1: # %bb2 |
| ; V-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; V-NEXT: vlse64.v v8, (a1), a3 |
| ; V-NEXT: addi a4, a1, 80 |
| ; V-NEXT: vlse64.v v9, (a4), a3 |
| ; V-NEXT: addi a4, a0, 16 |
| ; V-NEXT: vse64.v v8, (a0) |
| ; V-NEXT: addi a0, a0, 32 |
| ; V-NEXT: vse64.v v9, (a4) |
| ; V-NEXT: addi a1, a1, 160 |
| ; V-NEXT: bne a0, a2, .LBB12_1 |
| ; V-NEXT: # %bb.2: # %bb18 |
| ; V-NEXT: ret |
| ; |
| ; ZVE32F-LABEL: gather_of_pointers: |
| ; ZVE32F: # %bb.0: # %bb |
| ; ZVE32F-NEXT: li a2, 0 |
| ; ZVE32F-NEXT: lui a4, 2 |
| ; ZVE32F-NEXT: li a3, 1 |
| ; ZVE32F-NEXT: add a4, a0, a4 |
| ; ZVE32F-NEXT: li a5, 40 |
| ; ZVE32F-NEXT: .LBB12_1: # %bb2 |
| ; ZVE32F-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; ZVE32F-NEXT: mul a6, a3, a5 |
| ; ZVE32F-NEXT: mul a7, a2, a5 |
| ; ZVE32F-NEXT: addi a2, a2, 4 |
| ; ZVE32F-NEXT: add a6, a1, a6 |
| ; ZVE32F-NEXT: add a7, a1, a7 |
| ; ZVE32F-NEXT: ld t0, 0(a7) |
| ; ZVE32F-NEXT: ld t1, 0(a6) |
| ; ZVE32F-NEXT: ld a7, 80(a7) |
| ; ZVE32F-NEXT: ld a6, 80(a6) |
| ; ZVE32F-NEXT: sd t0, 0(a0) |
| ; ZVE32F-NEXT: sd t1, 8(a0) |
| ; ZVE32F-NEXT: sd a7, 16(a0) |
| ; ZVE32F-NEXT: sd a6, 24(a0) |
| ; ZVE32F-NEXT: addi a0, a0, 32 |
| ; ZVE32F-NEXT: addi a3, a3, 4 |
| ; ZVE32F-NEXT: bne a0, a4, .LBB12_1 |
| ; ZVE32F-NEXT: # %bb.2: # %bb18 |
| ; ZVE32F-NEXT: ret |
| ; |
; OPTV-LABEL: gather_of_pointers:
; OPTV: # %bb.0: # %bb
; OPTV-NEXT: lui a2, 2
; OPTV-NEXT: add a2, a0, a2
; OPTV-NEXT: li a3, 40
; OPTV-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; OPTV-NEXT: .LBB12_1: # %bb2
; OPTV-NEXT: # =>This Inner Loop Header: Depth=1
; OPTV-NEXT: vlse64.v v8, (a1), a3
; OPTV-NEXT: addi a4, a1, 80
; OPTV-NEXT: vlse64.v v9, (a4), a3
; OPTV-NEXT: addi a4, a0, 16
; OPTV-NEXT: vse64.v v8, (a0)
; OPTV-NEXT: addi a0, a0, 32
; OPTV-NEXT: vse64.v v9, (a4)
; OPTV-NEXT: addi a1, a1, 160
; OPTV-NEXT: bne a0, a2, .LBB12_1
; OPTV-NEXT: # %bb.2: # %bb18
; OPTV-NEXT: ret
| ; |
; OPTZVE32F-LABEL: gather_of_pointers:
; OPTZVE32F: # %bb.0: # %bb
; OPTZVE32F-NEXT: li a2, 0
; OPTZVE32F-NEXT: lui a4, 2
; OPTZVE32F-NEXT: li a3, 1
; OPTZVE32F-NEXT: add a4, a0, a4
; OPTZVE32F-NEXT: li a5, 40
; OPTZVE32F-NEXT: .LBB12_1: # %bb2
; OPTZVE32F-NEXT: # =>This Inner Loop Header: Depth=1
; OPTZVE32F-NEXT: mul a6, a3, a5
; OPTZVE32F-NEXT: mul a7, a2, a5
; OPTZVE32F-NEXT: addi a2, a2, 4
; OPTZVE32F-NEXT: add a6, a1, a6
; OPTZVE32F-NEXT: add a7, a1, a7
; OPTZVE32F-NEXT: ld t0, 0(a7)
; OPTZVE32F-NEXT: ld t1, 0(a6)
; OPTZVE32F-NEXT: ld a7, 80(a7)
; OPTZVE32F-NEXT: ld a6, 80(a6)
; OPTZVE32F-NEXT: sd t0, 0(a0)
; OPTZVE32F-NEXT: sd t1, 8(a0)
; OPTZVE32F-NEXT: sd a7, 16(a0)
; OPTZVE32F-NEXT: sd a6, 24(a0)
; OPTZVE32F-NEXT: addi a0, a0, 32
; OPTZVE32F-NEXT: addi a3, a3, 4
; OPTZVE32F-NEXT: bne a0, a4, .LBB12_1
; OPTZVE32F-NEXT: # %bb.2: # %bb18
; OPTZVE32F-NEXT: ret
| bb: |
| br label %bb2 |
| |
| bb2: ; preds = %bb2, %bb |
| %i = phi i64 [ 0, %bb ], [ %i15, %bb2 ] |
| %i3 = phi <2 x i64> [ <i64 0, i64 1>, %bb ], [ %i16, %bb2 ] |
| %i4 = mul nuw nsw <2 x i64> %i3, splat (i64 5) |
| %i5 = mul <2 x i64> %i3, splat (i64 5) |
%i6 = add <2 x i64> %i5, splat (i64 10)
| %i7 = getelementptr inbounds ptr, ptr %arg1, <2 x i64> %i4 |
| %i8 = getelementptr inbounds ptr, ptr %arg1, <2 x i64> %i6 |
| %i9 = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> %i7, i32 8, <2 x i1> splat (i1 true), <2 x ptr> undef) |
| %i10 = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> %i8, i32 8, <2 x i1> splat (i1 true), <2 x ptr> undef) |
| %i11 = getelementptr inbounds ptr, ptr %arg, i64 %i |
| store <2 x ptr> %i9, ptr %i11, align 8 |
| %i13 = getelementptr inbounds ptr, ptr %i11, i64 2 |
| store <2 x ptr> %i10, ptr %i13, align 8 |
| %i15 = add nuw i64 %i, 4 |
%i16 = add <2 x i64> %i3, splat (i64 4)
| %i17 = icmp eq i64 %i15, 1024 |
| br i1 %i17, label %bb18, label %bb2 |
| |
| bb18: ; preds = %bb2 |
| ret void |
| } |
| |
| declare <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr>, i32 immarg, <2 x i1>, <2 x ptr>) |
| |
| ; Make sure we don't crash in getTgtMemIntrinsic for a vector of pointers. |
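; As above, but for scatters: vsse64.v under +v, scalar stores under Zve32f.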
| define void @scatter_of_pointers(ptr noalias nocapture %arg, ptr noalias nocapture readonly %arg1) { |
| ; V-LABEL: scatter_of_pointers: |
| ; V: # %bb.0: # %bb |
| ; V-NEXT: lui a2, 2 |
| ; V-NEXT: add a2, a1, a2 |
| ; V-NEXT: li a3, 40 |
| ; V-NEXT: vsetivli zero, 2, e64, m1, ta, ma |
| ; V-NEXT: .LBB13_1: # %bb2 |
| ; V-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; V-NEXT: addi a4, a1, 16 |
| ; V-NEXT: vle64.v v8, (a1) |
| ; V-NEXT: vle64.v v9, (a4) |
| ; V-NEXT: addi a4, a0, 80 |
| ; V-NEXT: addi a1, a1, 32 |
| ; V-NEXT: vsse64.v v8, (a0), a3 |
| ; V-NEXT: vsse64.v v9, (a4), a3 |
| ; V-NEXT: addi a0, a0, 160 |
| ; V-NEXT: bne a1, a2, .LBB13_1 |
| ; V-NEXT: # %bb.2: # %bb18 |
| ; V-NEXT: ret |
| ; |
| ; ZVE32F-LABEL: scatter_of_pointers: |
| ; ZVE32F: # %bb.0: # %bb |
| ; ZVE32F-NEXT: li a2, 0 |
| ; ZVE32F-NEXT: lui a4, 2 |
| ; ZVE32F-NEXT: li a3, 1 |
| ; ZVE32F-NEXT: add a4, a1, a4 |
| ; ZVE32F-NEXT: li a5, 40 |
| ; ZVE32F-NEXT: .LBB13_1: # %bb2 |
| ; ZVE32F-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; ZVE32F-NEXT: ld a6, 0(a1) |
| ; ZVE32F-NEXT: ld a7, 8(a1) |
| ; ZVE32F-NEXT: ld t0, 16(a1) |
| ; ZVE32F-NEXT: ld t1, 24(a1) |
| ; ZVE32F-NEXT: mul t2, a3, a5 |
| ; ZVE32F-NEXT: mul t3, a2, a5 |
| ; ZVE32F-NEXT: addi a2, a2, 4 |
| ; ZVE32F-NEXT: addi a1, a1, 32 |
| ; ZVE32F-NEXT: add t2, a0, t2 |
| ; ZVE32F-NEXT: add t3, a0, t3 |
| ; ZVE32F-NEXT: sd a6, 0(t3) |
| ; ZVE32F-NEXT: sd a7, 0(t2) |
| ; ZVE32F-NEXT: sd t0, 80(t3) |
| ; ZVE32F-NEXT: sd t1, 80(t2) |
| ; ZVE32F-NEXT: addi a3, a3, 4 |
| ; ZVE32F-NEXT: bne a1, a4, .LBB13_1 |
| ; ZVE32F-NEXT: # %bb.2: # %bb18 |
| ; ZVE32F-NEXT: ret |
| ; |
; OPTV-LABEL: scatter_of_pointers:
; OPTV: # %bb.0: # %bb
; OPTV-NEXT: lui a2, 2
; OPTV-NEXT: add a2, a1, a2
; OPTV-NEXT: li a3, 40
; OPTV-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; OPTV-NEXT: .LBB13_1: # %bb2
; OPTV-NEXT: # =>This Inner Loop Header: Depth=1
; OPTV-NEXT: addi a4, a1, 16
; OPTV-NEXT: vle64.v v8, (a1)
; OPTV-NEXT: vle64.v v9, (a4)
; OPTV-NEXT: addi a4, a0, 80
; OPTV-NEXT: addi a1, a1, 32
; OPTV-NEXT: vsse64.v v8, (a0), a3
; OPTV-NEXT: vsse64.v v9, (a4), a3
; OPTV-NEXT: addi a0, a0, 160
; OPTV-NEXT: bne a1, a2, .LBB13_1
; OPTV-NEXT: # %bb.2: # %bb18
; OPTV-NEXT: ret
| ; |
; OPTZVE32F-LABEL: scatter_of_pointers:
; OPTZVE32F: # %bb.0: # %bb
; OPTZVE32F-NEXT: li a2, 0
; OPTZVE32F-NEXT: lui a4, 2
; OPTZVE32F-NEXT: li a3, 1
; OPTZVE32F-NEXT: add a4, a1, a4
; OPTZVE32F-NEXT: li a5, 40
; OPTZVE32F-NEXT: .LBB13_1: # %bb2
; OPTZVE32F-NEXT: # =>This Inner Loop Header: Depth=1
; OPTZVE32F-NEXT: ld a6, 0(a1)
; OPTZVE32F-NEXT: ld a7, 8(a1)
; OPTZVE32F-NEXT: ld t0, 16(a1)
; OPTZVE32F-NEXT: ld t1, 24(a1)
; OPTZVE32F-NEXT: mul t2, a3, a5
; OPTZVE32F-NEXT: mul t3, a2, a5
; OPTZVE32F-NEXT: addi a2, a2, 4
; OPTZVE32F-NEXT: addi a1, a1, 32
; OPTZVE32F-NEXT: add t2, a0, t2
; OPTZVE32F-NEXT: add t3, a0, t3
; OPTZVE32F-NEXT: sd a6, 0(t3)
; OPTZVE32F-NEXT: sd a7, 0(t2)
; OPTZVE32F-NEXT: sd t0, 80(t3)
; OPTZVE32F-NEXT: sd t1, 80(t2)
; OPTZVE32F-NEXT: addi a3, a3, 4
; OPTZVE32F-NEXT: bne a1, a4, .LBB13_1
; OPTZVE32F-NEXT: # %bb.2: # %bb18
; OPTZVE32F-NEXT: ret
| bb: |
| br label %bb2 |
| |
| bb2: ; preds = %bb2, %bb |
| %i = phi i64 [ 0, %bb ], [ %i15, %bb2 ] |
| %i3 = phi <2 x i64> [ <i64 0, i64 1>, %bb ], [ %i16, %bb2 ] |
| %i4 = getelementptr inbounds ptr, ptr %arg1, i64 %i |
| %i6 = load <2 x ptr>, ptr %i4, align 8 |
| %i7 = getelementptr inbounds ptr, ptr %i4, i64 2 |
| %i9 = load <2 x ptr>, ptr %i7, align 8 |
| %i10 = mul nuw nsw <2 x i64> %i3, splat (i64 5) |
| %i11 = mul <2 x i64> %i3, splat (i64 5) |
%i12 = add <2 x i64> %i11, splat (i64 10)
| %i13 = getelementptr inbounds ptr, ptr %arg, <2 x i64> %i10 |
| %i14 = getelementptr inbounds ptr, ptr %arg, <2 x i64> %i12 |
| call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> %i6, <2 x ptr> %i13, i32 8, <2 x i1> splat (i1 true)) |
| call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> %i9, <2 x ptr> %i14, i32 8, <2 x i1> splat (i1 true)) |
| %i15 = add nuw i64 %i, 4 |
%i16 = add <2 x i64> %i3, splat (i64 4)
| %i17 = icmp eq i64 %i15, 1024 |
| br i1 %i17, label %bb18, label %bb2 |
| |
| bb18: ; preds = %bb2 |
| ret void |
| } |
| |
| declare void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr>, <2 x ptr>, i32 immarg, <2 x i1>) |
| |
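; The vector loop starts at the runtime index %arg2 and falls back to a scalar
; epilogue for the remainder; the vector body still uses vlse8.v with stride 5.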
| define void @strided_load_startval_add_with_splat(ptr noalias nocapture %arg, ptr noalias nocapture readonly %arg1, i32 signext %arg2) { |
| ; CHECK-LABEL: strided_load_startval_add_with_splat: |
| ; CHECK: # %bb.0: # %bb |
| ; CHECK-NEXT: li a3, 1024 |
| ; CHECK-NEXT: beq a2, a3, .LBB14_7 |
| ; CHECK-NEXT: # %bb.1: # %bb3 |
| ; CHECK-NEXT: li a3, 1023 |
| ; CHECK-NEXT: subw a5, a3, a2 |
| ; CHECK-NEXT: li a6, 31 |
| ; CHECK-NEXT: mv a4, a2 |
| ; CHECK-NEXT: bltu a5, a6, .LBB14_5 |
| ; CHECK-NEXT: # %bb.2: # %bb9 |
| ; CHECK-NEXT: slli a4, a5, 32 |
| ; CHECK-NEXT: slli t0, a2, 2 |
| ; CHECK-NEXT: add a5, a0, a2 |
| ; CHECK-NEXT: add a6, a1, a2 |
| ; CHECK-NEXT: li t2, 32 |
| ; CHECK-NEXT: srli a4, a4, 32 |
| ; CHECK-NEXT: add t0, a6, t0 |
| ; CHECK-NEXT: addi a6, a4, 1 |
| ; CHECK-NEXT: andi a7, a6, -32 |
| ; CHECK-NEXT: add a4, a7, a2 |
| ; CHECK-NEXT: add a2, a4, a0 |
| ; CHECK-NEXT: li t1, 5 |
| ; CHECK-NEXT: vsetvli zero, t2, e8, m1, ta, ma |
| ; CHECK-NEXT: .LBB14_3: # %bb15 |
| ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vlse8.v v8, (t0), t1 |
| ; CHECK-NEXT: vle8.v v9, (a5) |
| ; CHECK-NEXT: vadd.vv v8, v9, v8 |
| ; CHECK-NEXT: vse8.v v8, (a5) |
| ; CHECK-NEXT: addi a5, a5, 32 |
| ; CHECK-NEXT: addi t0, t0, 160 |
| ; CHECK-NEXT: bne a5, a2, .LBB14_3 |
| ; CHECK-NEXT: # %bb.4: # %bb30 |
| ; CHECK-NEXT: beq a6, a7, .LBB14_7 |
| ; CHECK-NEXT: .LBB14_5: # %bb32 |
| ; CHECK-NEXT: add a2, a0, a4 |
| ; CHECK-NEXT: slli a5, a4, 2 |
| ; CHECK-NEXT: add a1, a1, a4 |
| ; CHECK-NEXT: subw a3, a3, a4 |
| ; CHECK-NEXT: add a1, a1, a5 |
| ; CHECK-NEXT: slli a3, a3, 32 |
| ; CHECK-NEXT: srli a3, a3, 32 |
| ; CHECK-NEXT: add a0, a4, a0 |
| ; CHECK-NEXT: add a0, a0, a3 |
| ; CHECK-NEXT: addi a0, a0, 1 |
| ; CHECK-NEXT: .LBB14_6: # %bb35 |
| ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: lbu a3, 0(a1) |
| ; CHECK-NEXT: lbu a4, 0(a2) |
| ; CHECK-NEXT: add a3, a4, a3 |
| ; CHECK-NEXT: sb a3, 0(a2) |
| ; CHECK-NEXT: addi a2, a2, 1 |
| ; CHECK-NEXT: addi a1, a1, 5 |
| ; CHECK-NEXT: bne a2, a0, .LBB14_6 |
| ; CHECK-NEXT: .LBB14_7: # %bb34 |
| ; CHECK-NEXT: ret |
| bb: |
| %i = icmp eq i32 %arg2, 1024 |
| br i1 %i, label %bb34, label %bb3 |
| |
| bb3: ; preds = %bb |
| %i4 = sext i32 %arg2 to i64 |
| %i5 = sub i32 1023, %arg2 |
| %i6 = zext i32 %i5 to i64 |
| %i7 = add nuw nsw i64 %i6, 1 |
| %i8 = icmp ult i32 %i5, 31 |
| br i1 %i8, label %bb32, label %bb9 |
| |
| bb9: ; preds = %bb3 |
| %i10 = and i64 %i7, 8589934560 |
| %i11 = add nsw i64 %i10, %i4 |
| %i12 = insertelement <32 x i64> poison, i64 %i4, i64 0 |
| %i13 = shufflevector <32 x i64> %i12, <32 x i64> poison, <32 x i32> zeroinitializer |
| %i14 = add <32 x i64> %i13, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31> |
| br label %bb15 |
| |
| bb15: ; preds = %bb15, %bb9 |
| %i16 = phi i64 [ 0, %bb9 ], [ %i27, %bb15 ] |
| %i17 = phi <32 x i64> [ %i14, %bb9 ], [ %i28, %bb15 ] |
| %i18 = add i64 %i16, %i4 |
| %i19 = mul nsw <32 x i64> %i17, splat (i64 5) |
| %i20 = getelementptr inbounds i8, ptr %arg1, <32 x i64> %i19 |
| %i21 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i20, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef) |
| %i22 = getelementptr inbounds i8, ptr %arg, i64 %i18 |
| %i24 = load <32 x i8>, ptr %i22, align 1 |
| %i25 = add <32 x i8> %i24, %i21 |
| store <32 x i8> %i25, ptr %i22, align 1 |
| %i27 = add nuw i64 %i16, 32 |
| %i28 = add <32 x i64> %i17, splat (i64 32) |
| %i29 = icmp eq i64 %i27, %i10 |
| br i1 %i29, label %bb30, label %bb15 |
| |
| bb30: ; preds = %bb15 |
| %i31 = icmp eq i64 %i7, %i10 |
| br i1 %i31, label %bb34, label %bb32 |
| |
| bb32: ; preds = %bb30, %bb3 |
| %i33 = phi i64 [ %i4, %bb3 ], [ %i11, %bb30 ] |
| br label %bb35 |
| |
| bb34: ; preds = %bb35, %bb30, %bb |
| ret void |
| |
| bb35: ; preds = %bb35, %bb32 |
| %i36 = phi i64 [ %i43, %bb35 ], [ %i33, %bb32 ] |
| %i37 = mul nsw i64 %i36, 5 |
| %i38 = getelementptr inbounds i8, ptr %arg1, i64 %i37 |
| %i39 = load i8, ptr %i38, align 1 |
| %i40 = getelementptr inbounds i8, ptr %arg, i64 %i36 |
| %i41 = load i8, ptr %i40, align 1 |
| %i42 = add i8 %i41, %i39 |
| store i8 %i42, ptr %i40, align 1 |
| %i43 = add nsw i64 %i36, 1 |
| %i44 = trunc i64 %i43 to i32 |
| %i45 = icmp eq i32 %i44, 1024 |
| br i1 %i45, label %bb34, label %bb35 |
| } |
| |
declare <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr>, i32 immarg, <16 x i1>, <16 x i8>)
declare void @llvm.masked.scatter.v16i8.v16p0(<16 x i8>, <16 x ptr>, i32 immarg, <16 x i1>)
declare <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr>, i32 immarg, <8 x i1>, <8 x float>)
| |
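; The trip count is a multiple of 16, so no scalar remainder loop is needed;
; the 16-element gather becomes a vlse8.v at mf2.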
| define void @gather_no_scalar_remainder(ptr noalias nocapture noundef %arg, ptr noalias nocapture noundef readonly %arg1, i64 noundef %arg2) { |
| ; CHECK-LABEL: gather_no_scalar_remainder: |
| ; CHECK: # %bb.0: # %bb |
| ; CHECK-NEXT: slli a2, a2, 4 |
| ; CHECK-NEXT: beqz a2, .LBB15_3 |
| ; CHECK-NEXT: # %bb.1: # %bb2 |
| ; CHECK-NEXT: addi a2, a2, -16 |
| ; CHECK-NEXT: andi a2, a2, -16 |
| ; CHECK-NEXT: add a2, a2, a0 |
| ; CHECK-NEXT: addi a2, a2, 16 |
| ; CHECK-NEXT: li a3, 5 |
| ; CHECK-NEXT: vsetivli zero, 16, e8, mf2, ta, ma |
| ; CHECK-NEXT: .LBB15_2: # %bb4 |
| ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vlse8.v v8, (a1), a3 |
| ; CHECK-NEXT: vle8.v v9, (a0) |
| ; CHECK-NEXT: vadd.vv v8, v9, v8 |
| ; CHECK-NEXT: vse8.v v8, (a0) |
| ; CHECK-NEXT: addi a0, a0, 16 |
| ; CHECK-NEXT: addi a1, a1, 80 |
| ; CHECK-NEXT: bne a0, a2, .LBB15_2 |
| ; CHECK-NEXT: .LBB15_3: # %bb16 |
| ; CHECK-NEXT: ret |
| bb: |
| %i = shl i64 %arg2, 4 |
| %i3 = icmp eq i64 %i, 0 |
| br i1 %i3, label %bb16, label %bb2 |
| |
| bb2: ; preds = %bb |
| br label %bb4 |
| |
| bb4: ; preds = %bb4, %bb2 |
| %i5 = phi i64 [ %i13, %bb4 ], [ 0, %bb2 ] |
| %i6 = phi <16 x i64> [ %i14, %bb4 ], [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, %bb2 ] |
| %i7 = mul <16 x i64> %i6, splat (i64 5) |
| %i8 = getelementptr inbounds i8, ptr %arg1, <16 x i64> %i7 |
| %i9 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %i8, i32 1, <16 x i1> splat (i1 true), <16 x i8> undef) |
| %i10 = getelementptr inbounds i8, ptr %arg, i64 %i5 |
| %i11 = load <16 x i8>, ptr %i10, align 1 |
| %i12 = add <16 x i8> %i11, %i9 |
| store <16 x i8> %i12, ptr %i10, align 1 |
| %i13 = add nuw i64 %i5, 16 |
| %i14 = add <16 x i64> %i6, splat (i64 16) |
| %i15 = icmp eq i64 %i13, %i |
| br i1 %i15, label %bb16, label %bb4 |
| |
| bb16: ; preds = %bb4, %bb |
| ret void |
| } |
| |
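; Zero-stride gather of float: lowered to a scalar flw folded into vfadd.vf.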
| define void @gather_zero_stride_fp(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) { |
| ; CHECK-LABEL: gather_zero_stride_fp: |
| ; CHECK: # %bb.0: # %entry |
| ; CHECK-NEXT: lui a2, 1 |
| ; CHECK-NEXT: add a2, a0, a2 |
| ; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma |
| ; CHECK-NEXT: .LBB16_1: # %vector.body |
| ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: flw fa5, 0(a1) |
| ; CHECK-NEXT: vle32.v v8, (a0) |
| ; CHECK-NEXT: vfadd.vf v8, v8, fa5 |
| ; CHECK-NEXT: vse32.v v8, (a0) |
| ; CHECK-NEXT: addi a0, a0, 128 |
| ; CHECK-NEXT: addi a1, a1, 640 |
| ; CHECK-NEXT: bne a0, a2, .LBB16_1 |
| ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup |
| ; CHECK-NEXT: ret |
| entry: |
| br label %vector.body |
| |
| vector.body: ; preds = %vector.body, %entry |
| %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] |
| %vec.ind = phi <8 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ] |
| %i = mul nuw nsw <8 x i64> %vec.ind, splat (i64 5) |
| %i1 = getelementptr inbounds float, ptr %B, <8 x i64> %i |
%wide.masked.gather = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x float> undef)
| %i2 = getelementptr inbounds float, ptr %A, i64 %index |
| %wide.load = load <8 x float>, ptr %i2, align 4 |
| %i4 = fadd <8 x float> %wide.load, %wide.masked.gather |
| store <8 x float> %i4, ptr %i2, align 4 |
| %index.next = add nuw i64 %index, 32 |
| %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 32) |
| %i6 = icmp eq i64 %index.next, 1024 |
| br i1 %i6, label %for.cond.cleanup, label %vector.body |
| |
| for.cond.cleanup: ; preds = %vector.body |
| ret void |
| } |