| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc -mtriple=hexagon -hexagon-hvx-widen=32 < %s -verify-machineinstrs | FileCheck %s |
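; These tests exercise vector sitofp/uitofp lowering for HVX. Inputs and
; results that do not fill a whole HVX register are widened to one (the
; "Widen ..." cases below); -hexagon-hvx-widen=32 sets the size threshold,
; in bytes, at which widening is attempted.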
| |
| target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" |
| target triple = "hexagon" |
| |
| ; s8 -> f16 |
| ; No widening |
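; The expansion below converts manually: vunpack sign-extends the bytes,
; vabs takes the magnitude, vcl0 counts leading zeros to find the
; normalization shift, vasl normalizes, and the vlsr/vand/vadd/vmux
; sequence rounds the mantissa to nearest-even. The exponent is computed
; from the shift count and moved into place with vasl(...,#10); the sign
; is restored by or-ing in a ##32768 splat for negative lanes, and lanes
; that were zero are forced to +0.0 by the final vmux.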
| define void @s8f16_0(ptr %a0, ptr %a1) #0 { |
| ; CHECK-LABEL: s8f16_0: |
| ; CHECK: .cfi_startproc |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0 = vmem(r0+#0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r7 = #1 |
| ; CHECK-NEXT: r6 = #64 |
| ; CHECK-NEXT: v1:0.h = vunpack(v0.b) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2.h = vsplat(r7) |
| ; CHECK-NEXT: r3:2 = combine(#31,#5) |
| ; CHECK-NEXT: v3.h = vabs(v0.h) |
| ; CHECK-NEXT: v4.h = vabs(v1.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v8.h = vsplat(r6) |
| ; CHECK-NEXT: v7.h = vsplat(r3) |
| ; CHECK-NEXT: v9 = vxor(v9,v9) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r5 = ##32768 |
| ; CHECK-NEXT: v5.uh = vcl0(v3.uh) |
| ; CHECK-NEXT: q0 = vcmp.gt(v9.h,v0.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v10.h = vsplat(r5) |
| ; CHECK-NEXT: r4 = #10 |
| ; CHECK-NEXT: v6.uh = vcl0(v4.uh) |
| ; CHECK-NEXT: v5.h = vadd(v5.h,v2.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v27 = vmux(q0,v10,v9) |
| ; CHECK-NEXT: v6.h = vadd(v6.h,v2.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v3.h = vasl(v3.h,v5.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v4.h = vasl(v4.h,v6.h) |
| ; CHECK-NEXT: v13 = vand(v3,v8) |
| ; CHECK-NEXT: v11.h = vadd(v3.h,v7.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v14.h = vadd(v4.h,v7.h) |
| ; CHECK-NEXT: q2 = vcmp.eq(v13.h,v9.h) |
| ; CHECK-NEXT: v8 = vand(v4,v8) |
| ; CHECK-NEXT: q1 = vcmp.gt(v3.uh,v11.uh) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v11.uh = vlsr(v11.uh,r2) |
| ; CHECK-NEXT: v13 = vmux(q2,v9,v2) |
| ; CHECK-NEXT: q2 = vcmp.eq(v8.h,v9.h) |
| ; CHECK-NEXT: q3 = vcmp.gt(v4.uh,v14.uh) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v20.uh = vlsr(v14.uh,r2) |
| ; CHECK-NEXT: v22 = vmux(q2,v9,v2) |
| ; CHECK-NEXT: v21 = vmux(q1,v2,v9) |
| ; CHECK-NEXT: v2 = vmux(q3,v2,v9) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v19.uh = vlsr(v4.uh,r2) |
| ; CHECK-NEXT: v13.h = vadd(v11.h,v13.h) |
| ; CHECK-NEXT: v24.h = vadd(v20.h,v22.h) |
| ; CHECK-NEXT: v2.h = vadd(v2.h,v7.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v12.uh = vlsr(v3.uh,r2) |
| ; CHECK-NEXT: v23.h = vadd(v21.h,v7.h) |
| ; CHECK-NEXT: v2.h = vsub(v2.h,v6.h) |
| ; CHECK-NEXT: q3 = vcmp.gt(v9.h,v1.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v11.uh = vlsr(v11.uh,r7) |
| ; CHECK-NEXT: v3.h = vsub(v23.h,v5.h) |
| ; CHECK-NEXT: q2 = vcmp.eq(v12.h,v11.h) |
| ; CHECK-NEXT: q1 = vcmp.eq(v19.h,v20.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v25.uh = vlsr(v13.uh,r7) |
| ; CHECK-NEXT: v28 = vmux(q3,v10,v9) |
| ; CHECK-NEXT: q3 = vcmp.eq(v0.h,v9.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v26.uh = vlsr(v24.uh,r7) |
| ; CHECK-NEXT: v5 = vmux(q2,v25,v11) |
| ; CHECK-NEXT: q2 = vcmp.eq(v1.h,v9.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v4.uh = vlsr(v20.uh,r7) |
| ; CHECK-NEXT: v5 = vor(v27,v5) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v3.h = vasl(v3.h,r4) |
| ; CHECK-NEXT: v4 = vmux(q1,v26,v4) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2.h = vasl(v2.h,r4) |
| ; CHECK-NEXT: v4 = vor(v28,v4) |
| ; CHECK-NEXT: v29 = vor(v5,v3) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2 = vor(v4,v2) |
| ; CHECK-NEXT: v31 = vmux(q3,v9,v29) |
| ; CHECK-NEXT: vmem(r1+#0) = v31.new |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v30 = vmux(q2,v9,v2) |
| ; CHECK-NEXT: jumpr r31 |
| ; CHECK-NEXT: vmem(r1+#1) = v30.new |
| ; CHECK-NEXT: } |
| %v0 = load <128 x i8>, ptr %a0, align 128 |
| %v1 = sitofp <128 x i8> %v0 to <128 x half> |
| store <128 x half> %v1, ptr %a1, align 128 |
| ret void |
| } |
| |
| ; Widen input |
| define void @s8f16_1(ptr %a0, ptr %a1) #0 { |
| ; CHECK-LABEL: s8f16_1: |
| ; CHECK: .cfi_startproc |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0 = vmem(r0+#0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r6 = #1 |
| ; CHECK-NEXT: r3:2 = combine(#64,#31) |
| ; CHECK-NEXT: v1:0.h = vunpack(v0.b) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v3.h = vsplat(r6) |
| ; CHECK-NEXT: v4.h = vsplat(r2) |
| ; CHECK-NEXT: v2.h = vabs(v0.h) |
| ; CHECK-NEXT: v1 = vxor(v1,v1) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.h = vsplat(r3) |
| ; CHECK-NEXT: r5:4 = combine(##32768,#5) |
| ; CHECK-NEXT: r2 = #10 |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v8.h = vsplat(r5) |
| ; CHECK-NEXT: v5.uh = vcl0(v2.uh) |
| ; CHECK-NEXT: q3 = vcmp.eq(v0.h,v1.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v5.h = vadd(v5.h,v3.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2.h = vasl(v2.h,v5.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v7.h = vadd(v2.h,v4.h) |
| ; CHECK-NEXT: v6 = vand(v2,v6) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2.uh = vlsr(v2.uh,r4) |
| ; CHECK-NEXT: q0 = vcmp.eq(v6.h,v1.h) |
| ; CHECK-NEXT: q1 = vcmp.gt(v2.uh,v7.uh) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v25.uh = vlsr(v7.uh,r4) |
| ; CHECK-NEXT: v26 = vmux(q0,v1,v3) |
| ; CHECK-NEXT: v3 = vmux(q1,v3,v1) |
| ; CHECK-NEXT: q1 = vcmp.gt(v1.h,v0.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v7.h = vadd(v25.h,v26.h) |
| ; CHECK-NEXT: v3.h = vadd(v3.h,v4.h) |
| ; CHECK-NEXT: q2 = vcmp.eq(v2.h,v25.h) |
| ; CHECK-NEXT: v30 = vmux(q1,v8,v1) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v27.uh = vlsr(v25.uh,r6) |
| ; CHECK-NEXT: v28.h = vsub(v3.h,v5.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v29.uh = vlsr(v7.uh,r6) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2.h = vasl(v28.h,r2) |
| ; CHECK-NEXT: v3 = vmux(q2,v29,v27) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v3 = vor(v30,v3) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v31 = vor(v3,v2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0 = vmux(q3,v1,v31) |
| ; CHECK-NEXT: jumpr r31 |
| ; CHECK-NEXT: vmem(r1+#0) = v0.new |
| ; CHECK-NEXT: } |
| %v0 = load <64 x i8>, ptr %a0, align 128 |
| %v1 = sitofp <64 x i8> %v0 to <64 x half> |
| store <64 x half> %v1, ptr %a1, align 128 |
| ret void |
| } |
| |
| |
| ; s8 -> f32 |
| ; No widening |
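; The f32 variants use the same recipe with 32-bit constants: ##255 and
; #512 drive the round-to-nearest-even adjustment, vlsr by #8 extracts the
; mantissa, vasl by #23 positions the exponent, the #159 splat applies the
; exponent bias, and ##-2147483648 is the sign mask.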
| define void @s8f32_0(ptr %a0, ptr %a1) #0 { |
| ; CHECK-LABEL: s8f32_0: |
| ; CHECK: .cfi_startproc |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r7 = #64 |
| ; CHECK-NEXT: r0 = #1 |
| ; CHECK-NEXT: v0 = vmem(r0+#0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2 = vsplat(r0) |
| ; CHECK-NEXT: r3:2 = combine(##255,#8) |
| ; CHECK-NEXT: v1 = valign(v0,v0,r7) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v3 = vsplat(r3) |
| ; CHECK-NEXT: r7 = #512 |
| ; CHECK-NEXT: v9:8.h = vunpack(v0.b) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v4 = vsplat(r7) |
| ; CHECK-NEXT: r6 = ##-2147483648 |
| ; CHECK-NEXT: r5 = #159 |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r4 = #23 |
| ; CHECK-NEXT: v7:6.h = vunpack(v1.b) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v8 = vsplat(r6) |
| ; CHECK-NEXT: v1:0.w = vunpack(v8.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v7:6.w = vunpack(v6.h) |
| ; CHECK-NEXT: v5.w = vabs(v0.w) |
| ; CHECK-NEXT: v10.w = vabs(v1.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v26.w = vabs(v6.w) |
| ; CHECK-NEXT: v13.w = vabs(v7.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v9.uw = vcl0(v5.uw) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v12.uw = vcl0(v26.uw) |
| ; CHECK-NEXT: v9.w = vadd(v9.w,v2.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v14.uw = vcl0(v13.uw) |
| ; CHECK-NEXT: v15.w = vadd(v12.w,v2.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v11.uw = vcl0(v10.uw) |
| ; CHECK-NEXT: v12.w = vadd(v14.w,v2.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v27.w = vasl(v26.w,v15.w) |
| ; CHECK-NEXT: v11.w = vadd(v11.w,v2.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v13.w = vasl(v13.w,v12.w) |
| ; CHECK-NEXT: v20 = vand(v27,v4) |
| ; CHECK-NEXT: v19.w = vadd(v27.w,v3.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v16.w = vasl(v5.w,v9.w) |
| ; CHECK-NEXT: v5 = vxor(v5,v5) |
| ; CHECK-NEXT: v23.w = vadd(v13.w,v3.w) |
| ; CHECK-NEXT: v28 = vand(v13,v4) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v17.w = vasl(v10.w,v11.w) |
| ; CHECK-NEXT: q3 = vcmp.eq(v20.w,v5.w) |
| ; CHECK-NEXT: q2 = vcmp.gt(v27.uw,v19.uw) |
| ; CHECK-NEXT: q0 = vcmp.gt(v5.w,v6.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v21.uw = vlsr(v27.uw,r2) |
| ; CHECK-NEXT: v30 = vmux(q3,v5,v2) |
| ; CHECK-NEXT: q3 = vcmp.eq(v28.w,v5.w) |
| ; CHECK-NEXT: v22 = vand(v17,v4) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v14.uw = vlsr(v19.uw,r2) |
| ; CHECK-NEXT: v27 = vmux(q3,v5,v2) |
| ; CHECK-NEXT: q1 = vcmp.eq(v22.w,v5.w) |
| ; CHECK-NEXT: v24 = vmux(q2,v2,v5) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v31.uw = vlsr(v23.uw,r2) |
| ; CHECK-NEXT: v22.w = vadd(v14.w,v30.w) |
| ; CHECK-NEXT: v30.w = vadd(v17.w,v3.w) |
| ; CHECK-NEXT: q2 = vcmp.eq(v21.w,v14.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v29.uw = vlsr(v13.uw,r2) |
| ; CHECK-NEXT: v28.w = vadd(v31.w,v27.w) |
| ; CHECK-NEXT: v3.w = vadd(v16.w,v3.w) |
| ; CHECK-NEXT: v4 = vand(v16,v4) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v14.uw = vlsr(v14.uw,r0) |
| ; CHECK-NEXT: q3 = vcmp.eq(v29.w,v31.w) |
| ; CHECK-NEXT: v18 = vmux(q0,v8,v5) |
| ; CHECK-NEXT: q0 = vcmp.gt(v5.w,v7.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v19.uw = vlsr(v31.uw,r0) |
| ; CHECK-NEXT: v26 = vmux(q1,v5,v2) |
| ; CHECK-NEXT: v31 = vmux(q0,v8,v5) |
| ; CHECK-NEXT: q0 = vcmp.gt(v16.uw,v3.uw) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v10 = vsplat(r5) |
| ; CHECK-NEXT: v29.uw = vlsr(v22.uw,r0) |
| ; CHECK-NEXT: v15.w = vsub(v24.w,v15.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v20.uw = vlsr(v28.uw,r0) |
| ; CHECK-NEXT: v14 = vmux(q2,v29,v14) |
| ; CHECK-NEXT: q2 = vcmp.gt(v13.uw,v23.uw) |
| ; CHECK-NEXT: v15.w = vadd(v15.w,v10.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v25.uw = vlsr(v30.uw,r2) |
| ; CHECK-NEXT: v19 = vmux(q3,v20,v19) |
| ; CHECK-NEXT: q3 = vcmp.eq(v4.w,v5.w) |
| ; CHECK-NEXT: v27 = vmux(q2,v2,v5) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v3.uw = vlsr(v3.uw,r2) |
| ; CHECK-NEXT: q2 = vcmp.gt(v17.uw,v30.uw) |
| ; CHECK-NEXT: v28.w = vadd(v25.w,v26.w) |
| ; CHECK-NEXT: v29 = vmux(q3,v5,v2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v17.uw = vlsr(v17.uw,r2) |
| ; CHECK-NEXT: v19 = vor(v31,v19) |
| ; CHECK-NEXT: v31 = vmux(q2,v2,v5) |
| ; CHECK-NEXT: v2 = vmux(q0,v2,v5) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v24.uw = vlsr(v16.uw,r2) |
| ; CHECK-NEXT: v30.w = vadd(v3.w,v29.w) |
| ; CHECK-NEXT: v2.w = vsub(v2.w,v9.w) |
| ; CHECK-NEXT: v11.w = vsub(v31.w,v11.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v16.uw = vlsr(v28.uw,r0) |
| ; CHECK-NEXT: q3 = vcmp.eq(v17.w,v25.w) |
| ; CHECK-NEXT: v4.w = vsub(v27.w,v12.w) |
| ; CHECK-NEXT: v2.w = vadd(v2.w,v10.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v13.uw = vlsr(v25.uw,r0) |
| ; CHECK-NEXT: q0 = vcmp.eq(v24.w,v3.w) |
| ; CHECK-NEXT: v21.w = vadd(v11.w,v10.w) |
| ; CHECK-NEXT: q2 = vcmp.gt(v5.w,v1.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v22.uw = vlsr(v30.uw,r0) |
| ; CHECK-NEXT: v23 = vmux(q3,v16,v13) |
| ; CHECK-NEXT: q3 = vcmp.gt(v5.w,v0.w) |
| ; CHECK-NEXT: v24 = vmux(q2,v8,v5) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v3.uw = vlsr(v3.uw,r0) |
| ; CHECK-NEXT: v4.w = vadd(v4.w,v10.w) |
| ; CHECK-NEXT: v8 = vmux(q3,v8,v5) |
| ; CHECK-NEXT: v10 = vor(v24,v23) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v9.w = vasl(v21.w,r4) |
| ; CHECK-NEXT: v3 = vmux(q0,v22,v3) |
| ; CHECK-NEXT: v14 = vor(v18,v14) |
| ; CHECK-NEXT: q2 = vcmp.eq(v1.w,v5.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2.w = vasl(v2.w,r4) |
| ; CHECK-NEXT: v3 = vor(v8,v3) |
| ; CHECK-NEXT: v25 = vor(v10,v9) |
| ; CHECK-NEXT: q3 = vcmp.eq(v0.w,v5.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v15.w = vasl(v15.w,r4) |
| ; CHECK-NEXT: v2 = vor(v3,v2) |
| ; CHECK-NEXT: v27 = vmux(q2,v5,v25) |
| ; CHECK-NEXT: vmem(r1+#1) = v27.new |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v26.w = vasl(v4.w,r4) |
| ; CHECK-NEXT: v29 = vmux(q3,v5,v2) |
| ; CHECK-NEXT: q2 = vcmp.eq(v7.w,v5.w) |
| ; CHECK-NEXT: vmem(r1+#0) = v29.new |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v28 = vor(v19,v26) |
| ; CHECK-NEXT: v30 = vor(v14,v15) |
| ; CHECK-NEXT: q3 = vcmp.eq(v6.w,v5.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0 = vmux(q2,v5,v28) |
| ; CHECK-NEXT: v31 = vmux(q3,v5,v30) |
| ; CHECK-NEXT: vmem(r1+#3) = v0.new |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: jumpr r31 |
| ; CHECK-NEXT: vmem(r1+#2) = v31 |
| ; CHECK-NEXT: } |
| %v0 = load <128 x i8>, ptr %a0, align 128 |
| %v1 = sitofp <128 x i8> %v0 to <128 x float> |
| store <128 x float> %v1, ptr %a1, align 128 |
| ret void |
| } |
| |
; Widen input #1 (<64 x i8>)
| define void @s8f32_1(ptr %a0, ptr %a1) #0 { |
| ; CHECK-LABEL: s8f32_1: |
| ; CHECK: .cfi_startproc |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r0 = #1 |
| ; CHECK-NEXT: v3:2.h = vunpack(v0.b) |
| ; CHECK-NEXT: v0.cur = vmem(r0+#0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1 = vsplat(r0) |
| ; CHECK-NEXT: r3:2 = combine(##255,#8) |
| ; CHECK-NEXT: r6 = #512 |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v7 = vsplat(r3) |
| ; CHECK-NEXT: v3:2.w = vunpack(v2.h) |
| ; CHECK-NEXT: v22 = vxor(v22,v22) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v10 = vsplat(r6) |
| ; CHECK-NEXT: r7 = ##-2147483648 |
| ; CHECK-NEXT: r5 = #159 |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v9 = vsplat(r7) |
| ; CHECK-NEXT: v4.w = vabs(v2.w) |
| ; CHECK-NEXT: v5.w = vabs(v3.w) |
| ; CHECK-NEXT: q0 = vcmp.gt(v22.w,v2.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v12 = vsplat(r5) |
| ; CHECK-NEXT: r4 = #23 |
| ; CHECK-NEXT: v11 = vmux(q0,v9,v22) |
| ; CHECK-NEXT: q0 = vcmp.gt(v22.w,v3.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.uw = vcl0(v4.uw) |
| ; CHECK-NEXT: v30 = vmux(q0,v9,v22) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v8.uw = vcl0(v5.uw) |
| ; CHECK-NEXT: v6.w = vadd(v6.w,v1.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v8.w = vadd(v8.w,v1.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v4.w = vasl(v4.w,v6.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v5.w = vasl(v5.w,v8.w) |
| ; CHECK-NEXT: v13 = vand(v4,v10) |
| ; CHECK-NEXT: v14.w = vadd(v4.w,v7.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v10 = vand(v5,v10) |
| ; CHECK-NEXT: v7.w = vadd(v5.w,v7.w) |
| ; CHECK-NEXT: q2 = vcmp.gt(v4.uw,v14.uw) |
| ; CHECK-NEXT: q1 = vcmp.eq(v13.w,v22.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v14.uw = vlsr(v14.uw,r2) |
| ; CHECK-NEXT: q3 = vcmp.eq(v10.w,v22.w) |
| ; CHECK-NEXT: v25 = vmux(q2,v1,v22) |
| ; CHECK-NEXT: q2 = vcmp.gt(v5.uw,v7.uw) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v7.uw = vlsr(v7.uw,r2) |
| ; CHECK-NEXT: v26 = vmux(q1,v22,v1) |
| ; CHECK-NEXT: v27 = vmux(q3,v22,v1) |
| ; CHECK-NEXT: v1 = vmux(q2,v1,v22) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v24.uw = vlsr(v5.uw,r2) |
| ; CHECK-NEXT: v5.w = vadd(v14.w,v26.w) |
| ; CHECK-NEXT: v29.w = vadd(v7.w,v27.w) |
| ; CHECK-NEXT: v6.w = vsub(v25.w,v6.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v23.uw = vlsr(v4.uw,r2) |
| ; CHECK-NEXT: v1.w = vsub(v1.w,v8.w) |
| ; CHECK-NEXT: v6.w = vadd(v6.w,v12.w) |
| ; CHECK-NEXT: q3 = vcmp.eq(v24.w,v7.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v28.uw = vlsr(v14.uw,r0) |
| ; CHECK-NEXT: v1.w = vadd(v1.w,v12.w) |
| ; CHECK-NEXT: q1 = vcmp.eq(v23.w,v14.w) |
| ; CHECK-NEXT: q2 = vcmp.eq(v3.w,v22.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v5.uw = vlsr(v5.uw,r0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v7.uw = vlsr(v7.uw,r0) |
| ; CHECK-NEXT: v5 = vmux(q1,v5,v28) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v4.uw = vlsr(v29.uw,r0) |
| ; CHECK-NEXT: v5 = vor(v11,v5) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.w = vasl(v6.w,r4) |
| ; CHECK-NEXT: v4 = vmux(q3,v4,v7) |
| ; CHECK-NEXT: q3 = vcmp.eq(v2.w,v22.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1.w = vasl(v1.w,r4) |
| ; CHECK-NEXT: v4 = vor(v30,v4) |
| ; CHECK-NEXT: v31 = vor(v5,v6) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1 = vor(v4,v1) |
| ; CHECK-NEXT: v0 = vmux(q3,v22,v31) |
| ; CHECK-NEXT: vmem(r1+#0) = v0.new |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1 = vmux(q2,v22,v1) |
| ; CHECK-NEXT: jumpr r31 |
| ; CHECK-NEXT: vmem(r1+#1) = v1.new |
| ; CHECK-NEXT: } |
| %v0 = load <64 x i8>, ptr %a0, align 128 |
| %v1 = sitofp <64 x i8> %v0 to <64 x float> |
| store <64 x float> %v1, ptr %a1, align 128 |
| ret void |
| } |
| |
; Widen input #2 (<32 x i8>)
| define void @s8f32_2(ptr %a0, ptr %a1) #0 { |
| ; CHECK-LABEL: s8f32_2: |
| ; CHECK: .cfi_startproc |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0 = vmem(r0+#0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r0 = #1 |
| ; CHECK-NEXT: r3 = #512 |
| ; CHECK-NEXT: v1:0.h = vunpack(v0.b) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2 = vsplat(r0) |
| ; CHECK-NEXT: v4 = vsplat(r3) |
| ; CHECK-NEXT: r2 = #255 |
| ; CHECK-NEXT: v3 = vxor(v3,v3) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r7:6 = combine(##-2147483648,#8) |
| ; CHECK-NEXT: r4 = #159 |
| ; CHECK-NEXT: v1:0.w = vunpack(v0.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1 = vsplat(r2) |
| ; CHECK-NEXT: v8 = vsplat(r4) |
| ; CHECK-NEXT: v5.w = vabs(v0.w) |
| ; CHECK-NEXT: q2 = vcmp.gt(v3.w,v0.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v7 = vsplat(r7) |
| ; CHECK-NEXT: r2 = #23 |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.uw = vcl0(v5.uw) |
| ; CHECK-NEXT: v30 = vmux(q2,v7,v3) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.w = vadd(v6.w,v2.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v5.w = vasl(v5.w,v6.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1.w = vadd(v5.w,v1.w) |
| ; CHECK-NEXT: v4 = vand(v5,v4) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v5.uw = vlsr(v5.uw,r6) |
| ; CHECK-NEXT: q0 = vcmp.eq(v4.w,v3.w) |
| ; CHECK-NEXT: q1 = vcmp.gt(v5.uw,v1.uw) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1.uw = vlsr(v1.uw,r6) |
| ; CHECK-NEXT: v4 = vmux(q0,v3,v2) |
| ; CHECK-NEXT: v2 = vmux(q1,v2,v3) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v4.w = vadd(v1.w,v4.w) |
| ; CHECK-NEXT: v2.w = vsub(v2.w,v6.w) |
| ; CHECK-NEXT: q3 = vcmp.eq(v5.w,v1.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v28.uw = vlsr(v1.uw,r0) |
| ; CHECK-NEXT: v2.w = vadd(v2.w,v8.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v29.uw = vlsr(v4.uw,r0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2.w = vasl(v2.w,r2) |
| ; CHECK-NEXT: v1 = vmux(q3,v29,v28) |
| ; CHECK-NEXT: q3 = vcmp.eq(v0.w,v3.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1 = vor(v30,v1) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v31 = vor(v1,v2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0 = vmux(q3,v3,v31) |
| ; CHECK-NEXT: jumpr r31 |
| ; CHECK-NEXT: vmem(r1+#0) = v0.new |
| ; CHECK-NEXT: } |
| %v0 = load <32 x i8>, ptr %a0, align 128 |
| %v1 = sitofp <32 x i8> %v0 to <32 x float> |
| store <32 x float> %v1, ptr %a1, align 128 |
| ret void |
| } |
| |
| |
| ; s16 -> f16 |
| ; No widening |
| define void @s16f16_0(ptr %a0, ptr %a1) #0 { |
| ; CHECK-LABEL: s16f16_0: |
| ; CHECK: .cfi_startproc |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r6 = #1 |
| ; CHECK-NEXT: r3:2 = combine(#64,#31) |
| ; CHECK-NEXT: v1.h = vabs(v0.h) |
| ; CHECK-NEXT: v0.cur = vmem(r0+#0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v3.h = vsplat(r6) |
| ; CHECK-NEXT: v5.h = vsplat(r2) |
| ; CHECK-NEXT: v2 = vxor(v2,v2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.h = vsplat(r3) |
| ; CHECK-NEXT: r5:4 = combine(##32768,#5) |
| ; CHECK-NEXT: v4.uh = vcl0(v1.uh) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v8.h = vsplat(r5) |
| ; CHECK-NEXT: r2 = #10 |
| ; CHECK-NEXT: v4.h = vadd(v4.h,v3.h) |
| ; CHECK-NEXT: q3 = vcmp.eq(v0.h,v2.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1.h = vasl(v1.h,v4.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v7.h = vadd(v1.h,v5.h) |
| ; CHECK-NEXT: v6 = vand(v1,v6) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1.uh = vlsr(v1.uh,r4) |
| ; CHECK-NEXT: q0 = vcmp.eq(v6.h,v2.h) |
| ; CHECK-NEXT: q1 = vcmp.gt(v1.uh,v7.uh) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v25.uh = vlsr(v7.uh,r4) |
| ; CHECK-NEXT: v26 = vmux(q0,v2,v3) |
| ; CHECK-NEXT: v3 = vmux(q1,v3,v2) |
| ; CHECK-NEXT: q1 = vcmp.gt(v2.h,v0.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v7.h = vadd(v25.h,v26.h) |
| ; CHECK-NEXT: v3.h = vadd(v3.h,v5.h) |
| ; CHECK-NEXT: q2 = vcmp.eq(v1.h,v25.h) |
| ; CHECK-NEXT: v30 = vmux(q1,v8,v2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v27.uh = vlsr(v25.uh,r6) |
| ; CHECK-NEXT: v28.h = vsub(v3.h,v4.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v29.uh = vlsr(v7.uh,r6) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1.h = vasl(v28.h,r2) |
| ; CHECK-NEXT: v3 = vmux(q2,v29,v27) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v3 = vor(v30,v3) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v31 = vor(v3,v1) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0 = vmux(q3,v2,v31) |
| ; CHECK-NEXT: jumpr r31 |
| ; CHECK-NEXT: vmem(r1+#0) = v0.new |
| ; CHECK-NEXT: } |
| %v0 = load <64 x i16>, ptr %a0, align 128 |
| %v1 = sitofp <64 x i16> %v0 to <64 x half> |
| store <64 x half> %v1, ptr %a1, align 128 |
| ret void |
| } |
| |
| ; Widen input and result |
| define void @s16f16_1(ptr %a0, ptr %a1) #0 { |
| ; CHECK-LABEL: s16f16_1: |
| ; CHECK: .cfi_startproc |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r3:2 = combine(#31,#1) |
| ; CHECK-NEXT: r7 = #64 |
| ; CHECK-NEXT: v1.h = vabs(v0.h) |
| ; CHECK-NEXT: v0.cur = vmem(r0+#0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2.h = vsplat(r2) |
| ; CHECK-NEXT: v5.h = vsplat(r3) |
| ; CHECK-NEXT: r6 = #5 |
| ; CHECK-NEXT: v3 = vxor(v3,v3) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.h = vsplat(r7) |
| ; CHECK-NEXT: r4 = ##32768 |
| ; CHECK-NEXT: v4.uh = vcl0(v1.uh) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v8.h = vsplat(r4) |
| ; CHECK-NEXT: r4 = #10 |
| ; CHECK-NEXT: q2 = vcmp.gt(v3.h,v0.h) |
| ; CHECK-NEXT: v4.h = vadd(v4.h,v2.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v30 = vmux(q2,v8,v3) |
| ; CHECK-NEXT: q2 = vcmp.eq(v0.h,v3.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1.h = vasl(v1.h,v4.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v7.h = vadd(v1.h,v5.h) |
| ; CHECK-NEXT: v6 = vand(v1,v6) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1.uh = vlsr(v1.uh,r6) |
| ; CHECK-NEXT: q1 = vcmp.eq(v6.h,v3.h) |
| ; CHECK-NEXT: q0 = vcmp.gt(v1.uh,v7.uh) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v25.uh = vlsr(v7.uh,r6) |
| ; CHECK-NEXT: v26 = vmux(q1,v3,v2) |
| ; CHECK-NEXT: v2 = vmux(q0,v2,v3) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v7.h = vadd(v25.h,v26.h) |
| ; CHECK-NEXT: v2.h = vadd(v2.h,v5.h) |
| ; CHECK-NEXT: q3 = vcmp.eq(v1.h,v25.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v27.uh = vlsr(v25.uh,r2) |
| ; CHECK-NEXT: v28.h = vsub(v2.h,v4.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v29.uh = vlsr(v7.uh,r2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1.h = vasl(v28.h,r4) |
| ; CHECK-NEXT: q3 = vsetq(r7) |
| ; CHECK-NEXT: v2 = vmux(q3,v29,v27) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2 = vor(v30,v2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v31 = vor(v2,v1) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0 = vmux(q2,v3,v31) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: jumpr r31 |
| ; CHECK-NEXT: if (q3) vmem(r1+#0) = v0 |
| ; CHECK-NEXT: } |
| %v0 = load <32 x i16>, ptr %a0, align 128 |
| %v1 = sitofp <32 x i16> %v0 to <32 x half> |
| store <32 x half> %v1, ptr %a1, align 128 |
| ret void |
| } |
| |
| |
| ; s16 -> f32 |
| ; No widening |
| define void @s16f32_0(ptr %a0, ptr %a1) #0 { |
| ; CHECK-LABEL: s16f32_0: |
| ; CHECK: .cfi_startproc |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0 = vmem(r0+#0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r0 = #1 |
| ; CHECK-NEXT: r3:2 = combine(##255,#8) |
| ; CHECK-NEXT: v1:0.w = vunpack(v0.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v3 = vsplat(r0) |
| ; CHECK-NEXT: r7 = #512 |
| ; CHECK-NEXT: v4.w = vabs(v0.w) |
| ; CHECK-NEXT: v6.w = vabs(v1.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v5 = vsplat(r3) |
| ; CHECK-NEXT: v9 = vsplat(r7) |
| ; CHECK-NEXT: r5 = #159 |
| ; CHECK-NEXT: v2 = vxor(v2,v2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v13 = vsplat(r5) |
| ; CHECK-NEXT: r6 = ##-2147483648 |
| ; CHECK-NEXT: v7.uw = vcl0(v4.uw) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v10 = vsplat(r6) |
| ; CHECK-NEXT: v8.uw = vcl0(v6.uw) |
| ; CHECK-NEXT: q0 = vcmp.gt(v2.w,v0.w) |
| ; CHECK-NEXT: v7.w = vadd(v7.w,v3.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r4 = #23 |
| ; CHECK-NEXT: v8.w = vadd(v8.w,v3.w) |
| ; CHECK-NEXT: v27 = vmux(q0,v10,v2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v4.w = vasl(v4.w,v7.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.w = vasl(v6.w,v8.w) |
| ; CHECK-NEXT: v11.w = vadd(v4.w,v5.w) |
| ; CHECK-NEXT: v12 = vand(v4,v9) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v5.w = vadd(v6.w,v5.w) |
| ; CHECK-NEXT: v9 = vand(v6,v9) |
| ; CHECK-NEXT: q1 = vcmp.eq(v12.w,v2.w) |
| ; CHECK-NEXT: q2 = vcmp.gt(v4.uw,v11.uw) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v22.uw = vlsr(v11.uw,r2) |
| ; CHECK-NEXT: q3 = vcmp.eq(v9.w,v2.w) |
| ; CHECK-NEXT: v23 = vmux(q1,v2,v3) |
| ; CHECK-NEXT: v14 = vmux(q2,v3,v2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v24.uw = vlsr(v5.uw,r2) |
| ; CHECK-NEXT: v11.w = vadd(v22.w,v23.w) |
| ; CHECK-NEXT: q2 = vcmp.gt(v6.uw,v5.uw) |
| ; CHECK-NEXT: v25 = vmux(q3,v2,v3) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v21.uw = vlsr(v4.uw,r2) |
| ; CHECK-NEXT: v5.w = vadd(v24.w,v25.w) |
| ; CHECK-NEXT: v3 = vmux(q2,v3,v2) |
| ; CHECK-NEXT: v7.w = vsub(v14.w,v7.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.uw = vlsr(v6.uw,r2) |
| ; CHECK-NEXT: v3.w = vsub(v3.w,v8.w) |
| ; CHECK-NEXT: q3 = vcmp.eq(v21.w,v22.w) |
| ; CHECK-NEXT: v7.w = vadd(v7.w,v13.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v4.uw = vlsr(v22.uw,r0) |
| ; CHECK-NEXT: v3.w = vadd(v3.w,v13.w) |
| ; CHECK-NEXT: q2 = vcmp.eq(v6.w,v24.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v11.uw = vlsr(v11.uw,r0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v5.uw = vlsr(v5.uw,r0) |
| ; CHECK-NEXT: v4 = vmux(q3,v11,v4) |
| ; CHECK-NEXT: q3 = vcmp.gt(v2.w,v1.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v26.uw = vlsr(v24.uw,r0) |
| ; CHECK-NEXT: v28 = vmux(q3,v10,v2) |
| ; CHECK-NEXT: v4 = vor(v27,v4) |
| ; CHECK-NEXT: q3 = vcmp.eq(v0.w,v2.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v7.w = vasl(v7.w,r4) |
| ; CHECK-NEXT: v5 = vmux(q2,v5,v26) |
| ; CHECK-NEXT: q2 = vcmp.eq(v1.w,v2.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v3.w = vasl(v3.w,r4) |
| ; CHECK-NEXT: v5 = vor(v28,v5) |
| ; CHECK-NEXT: v29 = vor(v4,v7) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v3 = vor(v5,v3) |
| ; CHECK-NEXT: v31 = vmux(q3,v2,v29) |
| ; CHECK-NEXT: vmem(r1+#0) = v31.new |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v30 = vmux(q2,v2,v3) |
| ; CHECK-NEXT: jumpr r31 |
| ; CHECK-NEXT: vmem(r1+#1) = v30.new |
| ; CHECK-NEXT: } |
| %v0 = load <64 x i16>, ptr %a0, align 128 |
| %v1 = sitofp <64 x i16> %v0 to <64 x float> |
| store <64 x float> %v1, ptr %a1, align 128 |
| ret void |
| } |
| |
| ; Widen input |
| define void @s16f32_1(ptr %a0, ptr %a1) #0 { |
| ; CHECK-LABEL: s16f32_1: |
| ; CHECK: .cfi_startproc |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0 = vmem(r0+#0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r0 = #1 |
| ; CHECK-NEXT: r2 = #255 |
| ; CHECK-NEXT: v1:0.w = vunpack(v0.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v3 = vsplat(r0) |
| ; CHECK-NEXT: v4 = vsplat(r2) |
| ; CHECK-NEXT: r3 = #512 |
| ; CHECK-NEXT: v2.w = vabs(v0.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6 = vsplat(r3) |
| ; CHECK-NEXT: r7:6 = combine(##-2147483648,#8) |
| ; CHECK-NEXT: v1 = vxor(v1,v1) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r4 = #159 |
| ; CHECK-NEXT: v5.uw = vcl0(v2.uw) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v7 = vsplat(r4) |
| ; CHECK-NEXT: v29 = vsplat(r7) |
| ; CHECK-NEXT: q2 = vcmp.gt(v1.w,v0.w) |
| ; CHECK-NEXT: v5.w = vadd(v5.w,v3.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r2 = #23 |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2.w = vasl(v2.w,v5.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v4.w = vadd(v2.w,v4.w) |
| ; CHECK-NEXT: v6 = vand(v2,v6) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2.uw = vlsr(v2.uw,r6) |
| ; CHECK-NEXT: q0 = vcmp.eq(v6.w,v1.w) |
| ; CHECK-NEXT: q1 = vcmp.gt(v2.uw,v4.uw) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v4.uw = vlsr(v4.uw,r6) |
| ; CHECK-NEXT: v6 = vmux(q0,v1,v3) |
| ; CHECK-NEXT: v3 = vmux(q1,v3,v1) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.w = vadd(v4.w,v6.w) |
| ; CHECK-NEXT: v27.w = vsub(v3.w,v5.w) |
| ; CHECK-NEXT: q3 = vcmp.eq(v2.w,v4.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v28.uw = vlsr(v4.uw,r0) |
| ; CHECK-NEXT: v2.w = vadd(v27.w,v7.w) |
| ; CHECK-NEXT: v4 = vmux(q2,v29,v1) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v30.uw = vlsr(v6.uw,r0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2.w = vasl(v2.w,r2) |
| ; CHECK-NEXT: v3 = vmux(q3,v30,v28) |
| ; CHECK-NEXT: q3 = vcmp.eq(v0.w,v1.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v3 = vor(v4,v3) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v31 = vor(v3,v2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0 = vmux(q3,v1,v31) |
| ; CHECK-NEXT: jumpr r31 |
| ; CHECK-NEXT: vmem(r1+#0) = v0.new |
| ; CHECK-NEXT: } |
| %v0 = load <32 x i16>, ptr %a0, align 128 |
| %v1 = sitofp <32 x i16> %v0 to <32 x float> |
| store <32 x float> %v1, ptr %a1, align 128 |
| ret void |
| } |
| |
| |
| ; s32 -> f16 |
| ; No widening |
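; f16 results from 32-bit lanes are built as f32 bit patterns first, then
; narrowed through Hexagon's qf32 format: vadd(v.sf,v3.sf) with v3 = 0
; converts sf to qf32, v0.hf = v3:2.qf32 packs the pair to half, and vdeal
; restores element order.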
| define void @s32f16_0(ptr %a0, ptr %a1) #0 { |
| ; CHECK-LABEL: s32f16_0: |
| ; CHECK: .cfi_startproc |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r3:2 = combine(#8,#1) |
| ; CHECK-NEXT: r6 = #255 |
| ; CHECK-NEXT: v4.w = vabs(v1.w) |
| ; CHECK-NEXT: v1.cur = vmem(r0+#0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2 = vsplat(r2) |
| ; CHECK-NEXT: r4 = #512 |
| ; CHECK-NEXT: v5.w = vabs(v0.w) |
| ; CHECK-NEXT: v0.cur = vmem(r0+#1) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v9 = vsplat(r4) |
| ; CHECK-NEXT: v8 = vsplat(r6) |
| ; CHECK-NEXT: r4 = #159 |
| ; CHECK-NEXT: v3.uw = vcl0(v4.uw) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.uw = vcl0(v5.uw) |
| ; CHECK-NEXT: v7.w = vadd(v3.w,v2.w) |
| ; CHECK-NEXT: v3 = vxor(v3,v3) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v26 = vsplat(r4) |
| ; CHECK-NEXT: r5 = ##-2147483648 |
| ; CHECK-NEXT: v6.w = vadd(v6.w,v2.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v13 = vsplat(r5) |
| ; CHECK-NEXT: v4.w = vasl(v4.w,v7.w) |
| ; CHECK-NEXT: q0 = vcmp.gt(v3.w,v1.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v5.w = vasl(v5.w,v6.w) |
| ; CHECK-NEXT: v25 = vmux(q0,v13,v3) |
| ; CHECK-NEXT: v10.w = vadd(v4.w,v8.w) |
| ; CHECK-NEXT: v11 = vand(v4,v9) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v9 = vand(v5,v9) |
| ; CHECK-NEXT: q3 = vcmp.eq(v11.w,v3.w) |
| ; CHECK-NEXT: v8.w = vadd(v5.w,v8.w) |
| ; CHECK-NEXT: q1 = vcmp.gt(v4.uw,v10.uw) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v20.uw = vlsr(v10.uw,r3) |
| ; CHECK-NEXT: q2 = vcmp.eq(v9.w,v3.w) |
| ; CHECK-NEXT: v21 = vmux(q3,v3,v2) |
| ; CHECK-NEXT: q3 = vcmp.gt(v5.uw,v8.uw) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v8.uw = vlsr(v8.uw,r3) |
| ; CHECK-NEXT: v9.w = vadd(v20.w,v21.w) |
| ; CHECK-NEXT: v23 = vmux(q2,v3,v2) |
| ; CHECK-NEXT: v22 = vmux(q1,v2,v3) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v12.uw = vlsr(v4.uw,r3) |
| ; CHECK-NEXT: v2 = vmux(q3,v2,v3) |
| ; CHECK-NEXT: v24.w = vadd(v8.w,v23.w) |
| ; CHECK-NEXT: v7.w = vsub(v22.w,v7.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v5.uw = vlsr(v5.uw,r3) |
| ; CHECK-NEXT: v2.w = vsub(v2.w,v6.w) |
| ; CHECK-NEXT: q3 = vcmp.eq(v12.w,v20.w) |
| ; CHECK-NEXT: v7.w = vadd(v7.w,v26.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r3 = #23 |
| ; CHECK-NEXT: v4.uw = vlsr(v20.uw,r2) |
| ; CHECK-NEXT: q2 = vcmp.eq(v5.w,v8.w) |
| ; CHECK-NEXT: v2.w = vadd(v2.w,v26.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v9.uw = vlsr(v9.uw,r2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v27.uw = vlsr(v24.uw,r2) |
| ; CHECK-NEXT: v4 = vmux(q3,v9,v4) |
| ; CHECK-NEXT: q3 = vcmp.gt(v3.w,v0.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v28.uw = vlsr(v8.uw,r2) |
| ; CHECK-NEXT: v30 = vmux(q3,v13,v3) |
| ; CHECK-NEXT: v4 = vor(v25,v4) |
| ; CHECK-NEXT: q3 = vcmp.eq(v0.w,v3.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v29.w = vasl(v7.w,r3) |
| ; CHECK-NEXT: v5 = vmux(q2,v27,v28) |
| ; CHECK-NEXT: q2 = vcmp.eq(v1.w,v3.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2.w = vasl(v2.w,r3) |
| ; CHECK-NEXT: v31 = vor(v30,v5) |
| ; CHECK-NEXT: v4 = vor(v4,v29) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1 = vor(v31,v2) |
| ; CHECK-NEXT: v4 = vmux(q2,v3,v4) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0 = vmux(q3,v3,v1) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2.qf32 = vadd(v4.sf,v3.sf) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v3.qf32 = vadd(v0.sf,v3.sf) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0.hf = v3:2.qf32 |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0.h = vdeal(v0.h) |
| ; CHECK-NEXT: jumpr r31 |
| ; CHECK-NEXT: vmem(r1+#0) = v0.new |
| ; CHECK-NEXT: } |
| %v0 = load <64 x i32>, ptr %a0, align 128 |
| %v1 = sitofp <64 x i32> %v0 to <64 x half> |
| store <64 x half> %v1, ptr %a1, align 128 |
| ret void |
| } |
| |
| ; Widen result |
| define void @s32f16_1(ptr %a0, ptr %a1) #0 { |
| ; CHECK-LABEL: s32f16_1: |
| ; CHECK: .cfi_startproc |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r6 = #1 |
| ; CHECK-NEXT: v1.w = vabs(v0.w) |
| ; CHECK-NEXT: v0.cur = vmem(r0+#0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2 = vsplat(r6) |
| ; CHECK-NEXT: r3:2 = combine(##255,#8) |
| ; CHECK-NEXT: r4 = #512 |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v5 = vsplat(r3) |
| ; CHECK-NEXT: v6 = vsplat(r4) |
| ; CHECK-NEXT: v4.uw = vcl0(v1.uw) |
| ; CHECK-NEXT: v3 = vxor(v3,v3) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r5 = #159 |
| ; CHECK-NEXT: r4 = ##-2147483648 |
| ; CHECK-NEXT: v4.w = vadd(v4.w,v2.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v28 = vsplat(r5) |
| ; CHECK-NEXT: v29 = vsplat(r4) |
| ; CHECK-NEXT: q3 = vcmp.gt(v3.w,v0.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r3 = #23 |
| ; CHECK-NEXT: v1.w = vasl(v1.w,v4.w) |
| ; CHECK-NEXT: v31 = vmux(q3,v29,v3) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v5.w = vadd(v1.w,v5.w) |
| ; CHECK-NEXT: v6 = vand(v1,v6) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v7.uw = vlsr(v1.uw,r2) |
| ; CHECK-NEXT: q0 = vcmp.eq(v6.w,v3.w) |
| ; CHECK-NEXT: q1 = vcmp.gt(v1.uw,v5.uw) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r2 = #64 |
| ; CHECK-NEXT: v1.uw = vlsr(v5.uw,r2) |
| ; CHECK-NEXT: v27 = vmux(q0,v3,v2) |
| ; CHECK-NEXT: v2 = vmux(q1,v2,v3) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: q3 = vsetq(r2) |
| ; CHECK-NEXT: v5.w = vadd(v1.w,v27.w) |
| ; CHECK-NEXT: v2.w = vsub(v2.w,v4.w) |
| ; CHECK-NEXT: q2 = vcmp.eq(v7.w,v1.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1.uw = vlsr(v1.uw,r6) |
| ; CHECK-NEXT: v2.w = vadd(v2.w,v28.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v30.uw = vlsr(v5.uw,r6) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2.w = vasl(v2.w,r3) |
| ; CHECK-NEXT: v1 = vmux(q2,v30,v1) |
| ; CHECK-NEXT: q2 = vcmp.eq(v0.w,v3.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1 = vor(v31,v1) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1.qf32 = vadd(v3.sf,v3.sf) |
| ; CHECK-NEXT: v0 = vor(v1,v2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0 = vmux(q2,v3,v0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0.qf32 = vadd(v0.sf,v3.sf) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0.hf = v1:0.qf32 |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0.h = vdeal(v0.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: jumpr r31 |
| ; CHECK-NEXT: if (q3) vmem(r1+#0) = v0 |
| ; CHECK-NEXT: } |
| %v0 = load <32 x i32>, ptr %a0, align 128 |
| %v1 = sitofp <32 x i32> %v0 to <32 x half> |
| store <32 x half> %v1, ptr %a1, align 128 |
| ret void |
| } |
| |

; s32 -> f32
| ; No widening |
| define void @s32f32_0(ptr %a0, ptr %a1) #0 { |
| ; CHECK-LABEL: s32f32_0: |
| ; CHECK: .cfi_startproc |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r0 = #1 |
| ; CHECK-NEXT: r2 = #255 |
| ; CHECK-NEXT: v1.w = vabs(v0.w) |
| ; CHECK-NEXT: v0.cur = vmem(r0+#0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v3 = vsplat(r0) |
| ; CHECK-NEXT: v5 = vsplat(r2) |
| ; CHECK-NEXT: r3 = #512 |
| ; CHECK-NEXT: v2 = vxor(v2,v2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6 = vsplat(r3) |
| ; CHECK-NEXT: r7:6 = combine(##-2147483648,#8) |
| ; CHECK-NEXT: v4.uw = vcl0(v1.uw) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r4 = #159 |
| ; CHECK-NEXT: v4.w = vadd(v4.w,v3.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v7 = vsplat(r4) |
| ; CHECK-NEXT: v29 = vsplat(r7) |
| ; CHECK-NEXT: r2 = #23 |
| ; CHECK-NEXT: q2 = vcmp.gt(v2.w,v0.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1.w = vasl(v1.w,v4.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v5.w = vadd(v1.w,v5.w) |
| ; CHECK-NEXT: v6 = vand(v1,v6) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1.uw = vlsr(v1.uw,r6) |
| ; CHECK-NEXT: q0 = vcmp.eq(v6.w,v2.w) |
| ; CHECK-NEXT: q1 = vcmp.gt(v1.uw,v5.uw) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v5.uw = vlsr(v5.uw,r6) |
| ; CHECK-NEXT: v6 = vmux(q0,v2,v3) |
| ; CHECK-NEXT: v3 = vmux(q1,v3,v2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.w = vadd(v5.w,v6.w) |
| ; CHECK-NEXT: v27.w = vsub(v3.w,v4.w) |
| ; CHECK-NEXT: q3 = vcmp.eq(v1.w,v5.w) |
| ; CHECK-NEXT: v4 = vmux(q2,v29,v2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v28.uw = vlsr(v5.uw,r0) |
| ; CHECK-NEXT: v1.w = vadd(v27.w,v7.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v30.uw = vlsr(v6.uw,r0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1.w = vasl(v1.w,r2) |
| ; CHECK-NEXT: v3 = vmux(q3,v30,v28) |
| ; CHECK-NEXT: q3 = vcmp.eq(v0.w,v2.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v3 = vor(v4,v3) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v31 = vor(v3,v1) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0 = vmux(q3,v2,v31) |
| ; CHECK-NEXT: jumpr r31 |
| ; CHECK-NEXT: vmem(r1+#0) = v0.new |
| ; CHECK-NEXT: } |
| %v0 = load <32 x i32>, ptr %a0, align 128 |
| %v1 = sitofp <32 x i32> %v0 to <32 x float> |
| store <32 x float> %v1, ptr %a1, align 128 |
| ret void |
| } |
| |
| ; Widen input and result |
| define void @s32f32_1(ptr %a0, ptr %a1) #0 { |
| ; CHECK-LABEL: s32f32_1: |
| ; CHECK: .cfi_startproc |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r0 = #1 |
| ; CHECK-NEXT: r2 = #255 |
| ; CHECK-NEXT: v1.w = vabs(v0.w) |
| ; CHECK-NEXT: v0.cur = vmem(r0+#0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2 = vsplat(r0) |
| ; CHECK-NEXT: v5 = vsplat(r2) |
| ; CHECK-NEXT: r3 = #512 |
| ; CHECK-NEXT: v3 = vxor(v3,v3) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6 = vsplat(r3) |
| ; CHECK-NEXT: r7:6 = combine(##-2147483648,#8) |
| ; CHECK-NEXT: v4.uw = vcl0(v1.uw) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r4 = #159 |
| ; CHECK-NEXT: v4.w = vadd(v4.w,v2.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v7 = vsplat(r4) |
| ; CHECK-NEXT: v29 = vsplat(r7) |
| ; CHECK-NEXT: r3 = #23 |
| ; CHECK-NEXT: q3 = vcmp.gt(v3.w,v0.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r2 = #64 |
| ; CHECK-NEXT: v1.w = vasl(v1.w,v4.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v5.w = vadd(v1.w,v5.w) |
| ; CHECK-NEXT: v6 = vand(v1,v6) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1.uw = vlsr(v1.uw,r6) |
| ; CHECK-NEXT: q0 = vcmp.eq(v6.w,v3.w) |
| ; CHECK-NEXT: q1 = vcmp.gt(v1.uw,v5.uw) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v5.uw = vlsr(v5.uw,r6) |
| ; CHECK-NEXT: v6 = vmux(q0,v3,v2) |
| ; CHECK-NEXT: v2 = vmux(q1,v2,v3) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.w = vadd(v5.w,v6.w) |
| ; CHECK-NEXT: v27.w = vsub(v2.w,v4.w) |
| ; CHECK-NEXT: q2 = vcmp.eq(v1.w,v5.w) |
| ; CHECK-NEXT: v4 = vmux(q3,v29,v3) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v28.uw = vlsr(v5.uw,r0) |
| ; CHECK-NEXT: q3 = vsetq(r2) |
| ; CHECK-NEXT: v1.w = vadd(v27.w,v7.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v30.uw = vlsr(v6.uw,r0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1.w = vasl(v1.w,r3) |
| ; CHECK-NEXT: v2 = vmux(q2,v30,v28) |
| ; CHECK-NEXT: q2 = vcmp.eq(v0.w,v3.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2 = vor(v4,v2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v31 = vor(v2,v1) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0 = vmux(q2,v3,v31) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: jumpr r31 |
| ; CHECK-NEXT: if (q3) vmem(r1+#0) = v0 |
| ; CHECK-NEXT: } |
| %v0 = load <16 x i32>, ptr %a0, align 128 |
| %v1 = sitofp <16 x i32> %v0 to <16 x float> |
| store <16 x float> %v1, ptr %a1, align 128 |
| ret void |
| } |
| |
| |
| ; u8 -> f16 |
| ; No widening |
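; The unsigned variants follow the same expansion but skip the vabs and
; the sign-mask mux.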
| define void @u8f16_0(ptr %a0, ptr %a1) #0 { |
| ; CHECK-LABEL: u8f16_0: |
| ; CHECK: .cfi_startproc |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0 = vmem(r0+#0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r6 = #1 |
| ; CHECK-NEXT: r3:2 = combine(#31,#5) |
| ; CHECK-NEXT: v1:0.uh = vunpack(v0.ub) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v3.h = vsplat(r6) |
| ; CHECK-NEXT: v4.h = vsplat(r3) |
| ; CHECK-NEXT: r5 = #64 |
| ; CHECK-NEXT: v2 = vxor(v2,v2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.h = vsplat(r5) |
| ; CHECK-NEXT: r4 = #10 |
| ; CHECK-NEXT: v5.uh = vcl0(v0.uh) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v7.uh = vcl0(v1.uh) |
| ; CHECK-NEXT: v5.h = vadd(v5.h,v3.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v7.h = vadd(v7.h,v3.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v8.h = vasl(v0.h,v5.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v11.h = vasl(v1.h,v7.h) |
| ; CHECK-NEXT: v10 = vand(v8,v6) |
| ; CHECK-NEXT: v9.h = vadd(v8.h,v4.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v22.h = vadd(v11.h,v4.h) |
| ; CHECK-NEXT: v6 = vand(v11,v6) |
| ; CHECK-NEXT: q0 = vcmp.gt(v8.uh,v9.uh) |
| ; CHECK-NEXT: q1 = vcmp.eq(v10.h,v2.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v21.uh = vlsr(v8.uh,r2) |
| ; CHECK-NEXT: q2 = vcmp.eq(v6.h,v2.h) |
| ; CHECK-NEXT: q3 = vcmp.gt(v11.uh,v22.uh) |
| ; CHECK-NEXT: v12 = vmux(q1,v2,v3) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v9.uh = vlsr(v9.uh,r2) |
| ; CHECK-NEXT: v13 = vmux(q2,v2,v3) |
| ; CHECK-NEXT: v25 = vmux(q0,v3,v2) |
| ; CHECK-NEXT: v3 = vmux(q3,v3,v2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v8.uh = vlsr(v22.uh,r2) |
| ; CHECK-NEXT: v24.h = vadd(v9.h,v12.h) |
| ; CHECK-NEXT: v3.h = vadd(v3.h,v4.h) |
| ; CHECK-NEXT: v12.h = vadd(v25.h,v4.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v23.uh = vlsr(v11.uh,r2) |
| ; CHECK-NEXT: v13.h = vadd(v8.h,v13.h) |
| ; CHECK-NEXT: v5.h = vsub(v12.h,v5.h) |
| ; CHECK-NEXT: v3.h = vsub(v3.h,v7.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v14.uh = vlsr(v9.uh,r6) |
| ; CHECK-NEXT: q2 = vcmp.eq(v21.h,v9.h) |
| ; CHECK-NEXT: q3 = vcmp.eq(v23.h,v8.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v26.uh = vlsr(v24.uh,r6) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v27.uh = vlsr(v13.uh,r6) |
| ; CHECK-NEXT: v4 = vmux(q2,v26,v14) |
| ; CHECK-NEXT: q2 = vcmp.eq(v1.h,v2.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v28.uh = vlsr(v8.uh,r6) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v5.h = vasl(v5.h,r4) |
| ; CHECK-NEXT: v6 = vmux(q3,v27,v28) |
| ; CHECK-NEXT: q3 = vcmp.eq(v0.h,v2.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v3.h = vasl(v3.h,r4) |
| ; CHECK-NEXT: v29 = vor(v4,v5) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v3 = vor(v6,v3) |
| ; CHECK-NEXT: v31 = vmux(q3,v2,v29) |
| ; CHECK-NEXT: vmem(r1+#0) = v31.new |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v30 = vmux(q2,v2,v3) |
| ; CHECK-NEXT: jumpr r31 |
| ; CHECK-NEXT: vmem(r1+#1) = v30.new |
| ; CHECK-NEXT: } |
| %v0 = load <128 x i8>, ptr %a0, align 128 |
| %v1 = uitofp <128 x i8> %v0 to <128 x half> |
| store <128 x half> %v1, ptr %a1, align 128 |
| ret void |
| } |
| |
| ; Widen input |
| define void @u8f16_1(ptr %a0, ptr %a1) #0 { |
| ; CHECK-LABEL: u8f16_1: |
| ; CHECK: .cfi_startproc |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0 = vmem(r0+#0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r6 = #1 |
| ; CHECK-NEXT: r3:2 = combine(#64,#31) |
| ; CHECK-NEXT: v1:0.uh = vunpack(v0.ub) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1.h = vsplat(r6) |
| ; CHECK-NEXT: v4.h = vsplat(r2) |
| ; CHECK-NEXT: r5 = #5 |
| ; CHECK-NEXT: v2 = vxor(v2,v2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v5.h = vsplat(r3) |
| ; CHECK-NEXT: r4 = #10 |
| ; CHECK-NEXT: v3.uh = vcl0(v0.uh) |
| ; CHECK-NEXT: q3 = vcmp.eq(v0.h,v2.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v3.h = vadd(v3.h,v1.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.h = vasl(v0.h,v3.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v7.h = vadd(v6.h,v4.h) |
| ; CHECK-NEXT: v5 = vand(v6,v5) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.uh = vlsr(v6.uh,r5) |
| ; CHECK-NEXT: q0 = vcmp.gt(v6.uh,v7.uh) |
| ; CHECK-NEXT: q1 = vcmp.eq(v5.h,v2.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v26.uh = vlsr(v7.uh,r5) |
| ; CHECK-NEXT: v27 = vmux(q1,v2,v1) |
| ; CHECK-NEXT: v1 = vmux(q0,v1,v2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1.h = vadd(v1.h,v4.h) |
| ; CHECK-NEXT: v28.h = vadd(v26.h,v27.h) |
| ; CHECK-NEXT: q2 = vcmp.eq(v6.h,v26.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v29.uh = vlsr(v26.uh,r6) |
| ; CHECK-NEXT: v1.h = vsub(v1.h,v3.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v30.uh = vlsr(v28.uh,r6) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1.h = vasl(v1.h,r4) |
| ; CHECK-NEXT: v3 = vmux(q2,v30,v29) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v31 = vor(v3,v1) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0 = vmux(q3,v2,v31) |
| ; CHECK-NEXT: jumpr r31 |
| ; CHECK-NEXT: vmem(r1+#0) = v0.new |
| ; CHECK-NEXT: } |
| %v0 = load <64 x i8>, ptr %a0, align 128 |
| %v1 = uitofp <64 x i8> %v0 to <64 x half> |
| store <64 x half> %v1, ptr %a1, align 128 |
| ret void |
| } |
| |
| |
| ; u8 -> f32 |
| ; No widening |
| define void @u8f32_0(ptr %a0, ptr %a1) #0 { |
| ; CHECK-LABEL: u8f32_0: |
| ; CHECK: .cfi_startproc |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r7 = #64 |
| ; CHECK-NEXT: r0 = #1 |
| ; CHECK-NEXT: r6 = #512 |
| ; CHECK-NEXT: v0 = vmem(r0+#0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v4 = vsplat(r0) |
| ; CHECK-NEXT: r3:2 = combine(##255,#8) |
| ; CHECK-NEXT: v1 = valign(v0,v0,r7) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v15 = vsplat(r6) |
| ; CHECK-NEXT: v6 = vsplat(r3) |
| ; CHECK-NEXT: r5 = #159 |
| ; CHECK-NEXT: v3:2.uh = vunpack(v0.ub) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r4 = #23 |
| ; CHECK-NEXT: v31:30.uh = vunpack(v1.ub) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v3:2.uw = vunpack(v2.uh) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1:0.uw = vunpack(v30.uh) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v5.uw = vcl0(v2.uw) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v7.uw = vcl0(v0.uw) |
| ; CHECK-NEXT: v5.w = vadd(v5.w,v4.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v8.uw = vcl0(v3.uw) |
| ; CHECK-NEXT: v11.w = vadd(v7.w,v4.w) |
| ; CHECK-NEXT: v7 = vxor(v7,v7) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v9.uw = vcl0(v1.uw) |
| ; CHECK-NEXT: v10.w = vadd(v8.w,v4.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v9 = vsplat(r5) |
| ; CHECK-NEXT: v14.w = vasl(v0.w,v11.w) |
| ; CHECK-NEXT: v8.w = vadd(v9.w,v4.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v12.w = vasl(v2.w,v5.w) |
| ; CHECK-NEXT: v24 = vand(v14,v15) |
| ; CHECK-NEXT: v20.w = vadd(v14.w,v6.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v13.w = vasl(v3.w,v10.w) |
| ; CHECK-NEXT: v19 = vand(v12,v15) |
| ; CHECK-NEXT: q3 = vcmp.eq(v24.w,v7.w) |
| ; CHECK-NEXT: v18.w = vadd(v12.w,v6.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v16.w = vasl(v1.w,v8.w) |
| ; CHECK-NEXT: v23 = vand(v13,v15) |
| ; CHECK-NEXT: v22.w = vadd(v13.w,v6.w) |
| ; CHECK-NEXT: q0 = vcmp.gt(v14.uw,v20.uw) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.w = vadd(v16.w,v6.w) |
| ; CHECK-NEXT: v15 = vand(v16,v15) |
| ; CHECK-NEXT: v30 = vmux(q3,v7,v4) |
| ; CHECK-NEXT: q2 = vcmp.eq(v19.w,v7.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v21.uw = vlsr(v14.uw,r2) |
| ; CHECK-NEXT: q3 = vcmp.eq(v15.w,v7.w) |
| ; CHECK-NEXT: v28 = vmux(q0,v4,v7) |
| ; CHECK-NEXT: q1 = vcmp.eq(v23.w,v7.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v14.uw = vlsr(v20.uw,r2) |
| ; CHECK-NEXT: v26 = vmux(q3,v7,v4) |
| ; CHECK-NEXT: v11.w = vsub(v28.w,v11.w) |
| ; CHECK-NEXT: q3 = vcmp.gt(v13.uw,v22.uw) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v15.uw = vlsr(v6.uw,r2) |
| ; CHECK-NEXT: v20.w = vadd(v14.w,v30.w) |
| ; CHECK-NEXT: v30 = vmux(q1,v7,v4) |
| ; CHECK-NEXT: v31 = vmux(q2,v7,v4) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v19.uw = vlsr(v18.uw,r2) |
| ; CHECK-NEXT: v29.w = vadd(v15.w,v26.w) |
| ; CHECK-NEXT: q1 = vcmp.gt(v12.uw,v18.uw) |
| ; CHECK-NEXT: v11.w = vadd(v11.w,v9.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v28.uw = vlsr(v22.uw,r2) |
| ; CHECK-NEXT: v23.w = vadd(v19.w,v31.w) |
| ; CHECK-NEXT: v22 = vmux(q3,v4,v7) |
| ; CHECK-NEXT: q3 = vcmp.gt(v16.uw,v6.uw) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v24.uw = vlsr(v29.uw,r0) |
| ; CHECK-NEXT: v31.w = vadd(v28.w,v30.w) |
| ; CHECK-NEXT: v30 = vmux(q1,v4,v7) |
| ; CHECK-NEXT: v4 = vmux(q3,v4,v7) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v17.uw = vlsr(v12.uw,r2) |
| ; CHECK-NEXT: v5.w = vsub(v30.w,v5.w) |
| ; CHECK-NEXT: v29.w = vsub(v22.w,v10.w) |
| ; CHECK-NEXT: v4.w = vsub(v4.w,v8.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v13.uw = vlsr(v13.uw,r2) |
| ; CHECK-NEXT: v6.w = vadd(v29.w,v9.w) |
| ; CHECK-NEXT: v5.w = vadd(v5.w,v9.w) |
| ; CHECK-NEXT: q0 = vcmp.eq(v21.w,v14.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v25.uw = vlsr(v16.uw,r2) |
| ; CHECK-NEXT: q2 = vcmp.eq(v17.w,v19.w) |
| ; CHECK-NEXT: q3 = vcmp.eq(v13.w,v28.w) |
| ; CHECK-NEXT: v4.w = vadd(v4.w,v9.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v21.uw = vlsr(v23.uw,r0) |
| ; CHECK-NEXT: q1 = vcmp.eq(v25.w,v15.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v23.uw = vlsr(v19.uw,r0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v31.uw = vlsr(v31.uw,r0) |
| ; CHECK-NEXT: v23 = vmux(q2,v21,v23) |
| ; CHECK-NEXT: q2 = vcmp.eq(v3.w,v7.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v16.uw = vlsr(v28.uw,r0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v26.uw = vlsr(v15.uw,r0) |
| ; CHECK-NEXT: v8 = vmux(q3,v31,v16) |
| ; CHECK-NEXT: q3 = vcmp.eq(v2.w,v7.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.w = vasl(v6.w,r4) |
| ; CHECK-NEXT: v22 = vmux(q1,v24,v26) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v5.w = vasl(v5.w,r4) |
| ; CHECK-NEXT: v6 = vor(v8,v6) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v27.uw = vlsr(v14.uw,r0) |
| ; CHECK-NEXT: v25 = vor(v23,v5) |
| ; CHECK-NEXT: v26 = vmux(q2,v7,v6) |
| ; CHECK-NEXT: vmem(r1+#1) = v26.new |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v20.uw = vlsr(v20.uw,r0) |
| ; CHECK-NEXT: v28 = vmux(q3,v7,v25) |
| ; CHECK-NEXT: q2 = vcmp.eq(v1.w,v7.w) |
| ; CHECK-NEXT: vmem(r1+#0) = v28.new |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v11.w = vasl(v11.w,r4) |
| ; CHECK-NEXT: v20 = vmux(q0,v20,v27) |
| ; CHECK-NEXT: q3 = vcmp.eq(v0.w,v7.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v24.w = vasl(v4.w,r4) |
| ; CHECK-NEXT: v29 = vor(v20,v11) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v27 = vor(v22,v24) |
| ; CHECK-NEXT: v31 = vmux(q3,v7,v29) |
| ; CHECK-NEXT: vmem(r1+#2) = v31.new |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v30 = vmux(q2,v7,v27) |
| ; CHECK-NEXT: jumpr r31 |
| ; CHECK-NEXT: vmem(r1+#3) = v30.new |
| ; CHECK-NEXT: } |
| %v0 = load <128 x i8>, ptr %a0, align 128 |
| %v1 = uitofp <128 x i8> %v0 to <128 x float> |
| store <128 x float> %v1, ptr %a1, align 128 |
| ret void |
| } |
| |
; Widen input #1 (<64 x i8>)
| define void @u8f32_1(ptr %a0, ptr %a1) #0 { |
| ; CHECK-LABEL: u8f32_1: |
| ; CHECK: .cfi_startproc |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r7 = #1 |
| ; CHECK-NEXT: r6 = #512 |
| ; CHECK-NEXT: v3:2.uh = vunpack(v0.ub) |
| ; CHECK-NEXT: v0.cur = vmem(r0+#0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1 = vsplat(r7) |
| ; CHECK-NEXT: v8 = vsplat(r6) |
| ; CHECK-NEXT: r3:2 = combine(##255,#8) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6 = vsplat(r3) |
| ; CHECK-NEXT: r5 = #159 |
| ; CHECK-NEXT: v3:2.uw = vunpack(v2.uh) |
| ; CHECK-NEXT: v21 = vxor(v21,v21) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v13 = vsplat(r5) |
| ; CHECK-NEXT: r4 = #23 |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v4.uw = vcl0(v2.uw) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v5.uw = vcl0(v3.uw) |
| ; CHECK-NEXT: v4.w = vadd(v4.w,v1.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v5.w = vadd(v5.w,v1.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v7.w = vasl(v2.w,v4.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v9.w = vasl(v3.w,v5.w) |
| ; CHECK-NEXT: v11 = vand(v7,v8) |
| ; CHECK-NEXT: v10.w = vadd(v7.w,v6.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.w = vadd(v9.w,v6.w) |
| ; CHECK-NEXT: q1 = vcmp.eq(v11.w,v21.w) |
| ; CHECK-NEXT: v8 = vand(v9,v8) |
| ; CHECK-NEXT: q0 = vcmp.gt(v7.uw,v10.uw) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v22.uw = vlsr(v10.uw,r2) |
| ; CHECK-NEXT: v24 = vmux(q1,v21,v1) |
| ; CHECK-NEXT: q3 = vcmp.eq(v8.w,v21.w) |
| ; CHECK-NEXT: q1 = vcmp.gt(v9.uw,v6.uw) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v23.uw = vlsr(v6.uw,r2) |
| ; CHECK-NEXT: v25 = vmux(q0,v1,v21) |
| ; CHECK-NEXT: v27 = vmux(q3,v21,v1) |
| ; CHECK-NEXT: v1 = vmux(q1,v1,v21) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v4.w = vsub(v25.w,v4.w) |
| ; CHECK-NEXT: v1.w = vsub(v1.w,v5.w) |
| ; CHECK-NEXT: v10.w = vadd(v22.w,v24.w) |
| ; CHECK-NEXT: v28.w = vadd(v23.w,v27.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v12.uw = vlsr(v7.uw,r2) |
| ; CHECK-NEXT: v4.w = vadd(v4.w,v13.w) |
| ; CHECK-NEXT: v1.w = vadd(v1.w,v13.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v26.uw = vlsr(v9.uw,r2) |
| ; CHECK-NEXT: q2 = vcmp.eq(v12.w,v22.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v11.uw = vlsr(v22.uw,r7) |
| ; CHECK-NEXT: q3 = vcmp.eq(v26.w,v23.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v30.uw = vlsr(v10.uw,r7) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v29.uw = vlsr(v23.uw,r7) |
| ; CHECK-NEXT: v5 = vmux(q2,v30,v11) |
| ; CHECK-NEXT: q2 = vcmp.eq(v3.w,v21.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.uw = vlsr(v28.uw,r7) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v4.w = vasl(v4.w,r4) |
| ; CHECK-NEXT: v6 = vmux(q3,v6,v29) |
| ; CHECK-NEXT: q3 = vcmp.eq(v2.w,v21.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1.w = vasl(v1.w,r4) |
| ; CHECK-NEXT: v31 = vor(v5,v4) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1 = vor(v6,v1) |
| ; CHECK-NEXT: v0 = vmux(q3,v21,v31) |
| ; CHECK-NEXT: vmem(r1+#0) = v0.new |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1 = vmux(q2,v21,v1) |
| ; CHECK-NEXT: jumpr r31 |
| ; CHECK-NEXT: vmem(r1+#1) = v1.new |
| ; CHECK-NEXT: } |
| %v0 = load <64 x i8>, ptr %a0, align 128 |
| %v1 = uitofp <64 x i8> %v0 to <64 x float> |
| store <64 x float> %v1, ptr %a1, align 128 |
| ret void |
| } |
| |
| ; Widen input #2 |
| define void @u8f32_2(ptr %a0, ptr %a1) #0 { |
| ; CHECK-LABEL: u8f32_2: |
| ; CHECK: .cfi_startproc |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0 = vmem(r0+#0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r6 = #1 |
| ; CHECK-NEXT: r3 = #512 |
| ; CHECK-NEXT: v1:0.uh = vunpack(v0.ub) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2 = vsplat(r6) |
| ; CHECK-NEXT: v4 = vsplat(r3) |
| ; CHECK-NEXT: r2 = #255 |
| ; CHECK-NEXT: v3 = vxor(v3,v3) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r5:4 = combine(##159,#8) |
| ; CHECK-NEXT: v1:0.uw = vunpack(v0.uh) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1 = vsplat(r2) |
| ; CHECK-NEXT: v7 = vsplat(r5) |
| ; CHECK-NEXT: q3 = vcmp.eq(v0.w,v3.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v5.uw = vcl0(v0.uw) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v5.w = vadd(v5.w,v2.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.w = vasl(v0.w,v5.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1.w = vadd(v6.w,v1.w) |
| ; CHECK-NEXT: v4 = vand(v6,v4) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.uw = vlsr(v6.uw,r4) |
| ; CHECK-NEXT: q0 = vcmp.gt(v6.uw,v1.uw) |
| ; CHECK-NEXT: q1 = vcmp.eq(v4.w,v3.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r4 = #23 |
| ; CHECK-NEXT: v1.uw = vlsr(v1.uw,r4) |
| ; CHECK-NEXT: v4 = vmux(q1,v3,v2) |
| ; CHECK-NEXT: v2 = vmux(q0,v2,v3) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2.w = vsub(v2.w,v5.w) |
| ; CHECK-NEXT: v4.w = vadd(v1.w,v4.w) |
| ; CHECK-NEXT: q2 = vcmp.eq(v6.w,v1.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v29.uw = vlsr(v1.uw,r6) |
| ; CHECK-NEXT: v2.w = vadd(v2.w,v7.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v30.uw = vlsr(v4.uw,r6) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2.w = vasl(v2.w,r4) |
| ; CHECK-NEXT: v1 = vmux(q2,v30,v29) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v31 = vor(v1,v2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0 = vmux(q3,v3,v31) |
| ; CHECK-NEXT: jumpr r31 |
| ; CHECK-NEXT: vmem(r1+#0) = v0.new |
| ; CHECK-NEXT: } |
| %v0 = load <32 x i8>, ptr %a0, align 128 |
| %v1 = uitofp <32 x i8> %v0 to <32 x float> |
| store <32 x float> %v1, ptr %a1, align 128 |
| ret void |
| } |
| |
| |
| ; u16 -> f16 |
| ; No widening |
| define void @u16f16_0(ptr %a0, ptr %a1) #0 { |
| ; CHECK-LABEL: u16f16_0: |
| ; CHECK: .cfi_startproc |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r3:2 = combine(#64,#1) |
| ; CHECK-NEXT: r5 = #31 |
| ; CHECK-NEXT: v1.uh = vcl0(v0.uh) |
| ; CHECK-NEXT: v0.cur = vmem(r0+#0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2.h = vsplat(r2) |
| ; CHECK-NEXT: v5.h = vsplat(r3) |
| ; CHECK-NEXT: r4 = #5 |
| ; CHECK-NEXT: v3 = vxor(v3,v3) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v4.h = vsplat(r5) |
| ; CHECK-NEXT: r3 = #10 |
| ; CHECK-NEXT: v1.h = vadd(v1.h,v2.h) |
| ; CHECK-NEXT: q3 = vcmp.eq(v0.h,v3.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.h = vasl(v0.h,v1.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v7.h = vadd(v6.h,v4.h) |
| ; CHECK-NEXT: v5 = vand(v6,v5) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.uh = vlsr(v6.uh,r4) |
| ; CHECK-NEXT: q0 = vcmp.eq(v5.h,v3.h) |
| ; CHECK-NEXT: q1 = vcmp.gt(v6.uh,v7.uh) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v26.uh = vlsr(v7.uh,r4) |
| ; CHECK-NEXT: v27 = vmux(q0,v3,v2) |
| ; CHECK-NEXT: v2 = vmux(q1,v2,v3) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2.h = vadd(v2.h,v4.h) |
| ; CHECK-NEXT: v28.h = vadd(v26.h,v27.h) |
| ; CHECK-NEXT: q2 = vcmp.eq(v6.h,v26.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v29.uh = vlsr(v26.uh,r2) |
| ; CHECK-NEXT: v1.h = vsub(v2.h,v1.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v30.uh = vlsr(v28.uh,r2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1.h = vasl(v1.h,r3) |
| ; CHECK-NEXT: v2 = vmux(q2,v30,v29) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v31 = vor(v2,v1) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0 = vmux(q3,v3,v31) |
| ; CHECK-NEXT: jumpr r31 |
| ; CHECK-NEXT: vmem(r1+#0) = v0.new |
| ; CHECK-NEXT: } |
| %v0 = load <64 x i16>, ptr %a0, align 128 |
| %v1 = uitofp <64 x i16> %v0 to <64 x half> |
| store <64 x half> %v1, ptr %a1, align 128 |
| ret void |
| } |
| |
| ; Widen input and result |
| define void @u16f16_1(ptr %a0, ptr %a1) #0 { |
| ; CHECK-LABEL: u16f16_1: |
| ; CHECK: .cfi_startproc |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r3:2 = combine(#31,#1) |
| ; CHECK-NEXT: r6 = #64 |
| ; CHECK-NEXT: v1.uh = vcl0(v0.uh) |
| ; CHECK-NEXT: v0.cur = vmem(r0+#0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2.h = vsplat(r2) |
| ; CHECK-NEXT: v4.h = vsplat(r3) |
| ; CHECK-NEXT: r5 = #5 |
| ; CHECK-NEXT: v3 = vxor(v3,v3) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v5.h = vsplat(r6) |
| ; CHECK-NEXT: r4 = #10 |
| ; CHECK-NEXT: v1.h = vadd(v1.h,v2.h) |
| ; CHECK-NEXT: q2 = vcmp.eq(v0.h,v3.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: q3 = vsetq(r6) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.h = vasl(v0.h,v1.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v7.h = vadd(v6.h,v4.h) |
| ; CHECK-NEXT: v5 = vand(v6,v5) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.uh = vlsr(v6.uh,r5) |
| ; CHECK-NEXT: q1 = vcmp.eq(v5.h,v3.h) |
| ; CHECK-NEXT: q0 = vcmp.gt(v6.uh,v7.uh) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v7.uh = vlsr(v7.uh,r5) |
| ; CHECK-NEXT: v5 = vmux(q1,v3,v2) |
| ; CHECK-NEXT: v2 = vmux(q0,v2,v3) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2.h = vadd(v2.h,v4.h) |
| ; CHECK-NEXT: v28.h = vadd(v7.h,v5.h) |
| ; CHECK-NEXT: q1 = vcmp.eq(v6.h,v7.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v29.uh = vlsr(v7.uh,r2) |
| ; CHECK-NEXT: v1.h = vsub(v2.h,v1.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v30.uh = vlsr(v28.uh,r2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1.h = vasl(v1.h,r4) |
| ; CHECK-NEXT: v2 = vmux(q1,v30,v29) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v31 = vor(v2,v1) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0 = vmux(q2,v3,v31) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: jumpr r31 |
| ; CHECK-NEXT: if (q3) vmem(r1+#0) = v0 |
| ; CHECK-NEXT: } |
| %v0 = load <32 x i16>, ptr %a0, align 128 |
| %v1 = uitofp <32 x i16> %v0 to <32 x half> |
| store <32 x half> %v1, ptr %a1, align 128 |
| ret void |
| } |
| |
| |
| ; u16 -> f32 |
| ; No widening |
| define void @u16f32_0(ptr %a0, ptr %a1) #0 { |
| ; CHECK-LABEL: u16f32_0: |
| ; CHECK: .cfi_startproc |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0 = vmem(r0+#0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r7 = #1 |
| ; CHECK-NEXT: r3:2 = combine(##255,#8) |
| ; CHECK-NEXT: v1:0.uw = vunpack(v0.uh) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v3 = vsplat(r7) |
| ; CHECK-NEXT: v6 = vsplat(r3) |
| ; CHECK-NEXT: r6 = #512 |
| ; CHECK-NEXT: v2 = vxor(v2,v2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v8 = vsplat(r6) |
| ; CHECK-NEXT: r5 = #159 |
| ; CHECK-NEXT: r4 = #23 |
| ; CHECK-NEXT: v4.uw = vcl0(v0.uw) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v14 = vsplat(r5) |
| ; CHECK-NEXT: v5.uw = vcl0(v1.uw) |
| ; CHECK-NEXT: v4.w = vadd(v4.w,v3.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v5.w = vadd(v5.w,v3.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v7.w = vasl(v0.w,v4.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v9.w = vasl(v1.w,v5.w) |
| ; CHECK-NEXT: v10.w = vadd(v7.w,v6.w) |
| ; CHECK-NEXT: v11 = vand(v7,v8) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.w = vadd(v9.w,v6.w) |
| ; CHECK-NEXT: v8 = vand(v9,v8) |
| ; CHECK-NEXT: q1 = vcmp.eq(v11.w,v2.w) |
| ; CHECK-NEXT: q0 = vcmp.gt(v7.uw,v10.uw) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v19.uw = vlsr(v10.uw,r2) |
| ; CHECK-NEXT: q2 = vcmp.eq(v8.w,v2.w) |
| ; CHECK-NEXT: q3 = vcmp.gt(v9.uw,v6.uw) |
| ; CHECK-NEXT: v20 = vmux(q1,v2,v3) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v21.uw = vlsr(v6.uw,r2) |
| ; CHECK-NEXT: v22 = vmux(q2,v2,v3) |
| ; CHECK-NEXT: v25 = vmux(q0,v3,v2) |
| ; CHECK-NEXT: v3 = vmux(q3,v3,v2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v4.w = vsub(v25.w,v4.w) |
| ; CHECK-NEXT: v3.w = vsub(v3.w,v5.w) |
| ; CHECK-NEXT: v23.w = vadd(v19.w,v20.w) |
| ; CHECK-NEXT: v10.w = vadd(v21.w,v22.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v12.uw = vlsr(v7.uw,r2) |
| ; CHECK-NEXT: v4.w = vadd(v4.w,v14.w) |
| ; CHECK-NEXT: v3.w = vadd(v3.w,v14.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v24.uw = vlsr(v9.uw,r2) |
| ; CHECK-NEXT: q2 = vcmp.eq(v12.w,v19.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v13.uw = vlsr(v19.uw,r7) |
| ; CHECK-NEXT: q3 = vcmp.eq(v24.w,v21.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v26.uw = vlsr(v23.uw,r7) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v27.uw = vlsr(v10.uw,r7) |
| ; CHECK-NEXT: v5 = vmux(q2,v26,v13) |
| ; CHECK-NEXT: q2 = vcmp.eq(v1.w,v2.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v28.uw = vlsr(v21.uw,r7) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v4.w = vasl(v4.w,r4) |
| ; CHECK-NEXT: v6 = vmux(q3,v27,v28) |
| ; CHECK-NEXT: q3 = vcmp.eq(v0.w,v2.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v3.w = vasl(v3.w,r4) |
| ; CHECK-NEXT: v29 = vor(v5,v4) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v3 = vor(v6,v3) |
| ; CHECK-NEXT: v31 = vmux(q3,v2,v29) |
| ; CHECK-NEXT: vmem(r1+#0) = v31.new |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v30 = vmux(q2,v2,v3) |
| ; CHECK-NEXT: jumpr r31 |
| ; CHECK-NEXT: vmem(r1+#1) = v30.new |
| ; CHECK-NEXT: } |
| %v0 = load <64 x i16>, ptr %a0, align 128 |
| %v1 = uitofp <64 x i16> %v0 to <64 x float> |
| store <64 x float> %v1, ptr %a1, align 128 |
| ret void |
| } |
| |
| ; Widen input |
| define void @u16f32_1(ptr %a0, ptr %a1) #0 { |
| ; CHECK-LABEL: u16f32_1: |
| ; CHECK: .cfi_startproc |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0 = vmem(r0+#0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r6 = #1 |
| ; CHECK-NEXT: r2 = #255 |
| ; CHECK-NEXT: v1:0.uw = vunpack(v0.uh) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1 = vsplat(r6) |
| ; CHECK-NEXT: v4 = vsplat(r2) |
| ; CHECK-NEXT: r3 = #512 |
| ; CHECK-NEXT: v2 = vxor(v2,v2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v5 = vsplat(r3) |
| ; CHECK-NEXT: r5:4 = combine(##159,#8) |
| ; CHECK-NEXT: v3.uw = vcl0(v0.uw) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v7 = vsplat(r5) |
| ; CHECK-NEXT: q3 = vcmp.eq(v0.w,v2.w) |
| ; CHECK-NEXT: v3.w = vadd(v3.w,v1.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.w = vasl(v0.w,v3.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v4.w = vadd(v6.w,v4.w) |
| ; CHECK-NEXT: v5 = vand(v6,v5) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.uw = vlsr(v6.uw,r4) |
| ; CHECK-NEXT: q0 = vcmp.gt(v6.uw,v4.uw) |
| ; CHECK-NEXT: q1 = vcmp.eq(v5.w,v2.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r4 = #23 |
| ; CHECK-NEXT: v4.uw = vlsr(v4.uw,r4) |
| ; CHECK-NEXT: v5 = vmux(q1,v2,v1) |
| ; CHECK-NEXT: v1 = vmux(q0,v1,v2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1.w = vsub(v1.w,v3.w) |
| ; CHECK-NEXT: v29.w = vadd(v4.w,v5.w) |
| ; CHECK-NEXT: q2 = vcmp.eq(v6.w,v4.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v30.uw = vlsr(v4.uw,r6) |
| ; CHECK-NEXT: v1.w = vadd(v1.w,v7.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v3.uw = vlsr(v29.uw,r6) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1.w = vasl(v1.w,r4) |
| ; CHECK-NEXT: v3 = vmux(q2,v3,v30) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v31 = vor(v3,v1) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0 = vmux(q3,v2,v31) |
| ; CHECK-NEXT: jumpr r31 |
| ; CHECK-NEXT: vmem(r1+#0) = v0.new |
| ; CHECK-NEXT: } |
| %v0 = load <32 x i16>, ptr %a0, align 128 |
| %v1 = uitofp <32 x i16> %v0 to <32 x float> |
| store <32 x float> %v1, ptr %a1, align 128 |
| ret void |
| } |
| |
| |
| ; u32 -> f16 |
| ; No widening |
| define void @u32f16_0(ptr %a0, ptr %a1) #0 { |
| ; CHECK-LABEL: u32f16_0: |
| ; CHECK: .cfi_startproc |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r3:2 = combine(#8,#1) |
| ; CHECK-NEXT: r6 = #255 |
| ; CHECK-NEXT: v3.uw = vcl0(v1.uw) |
| ; CHECK-NEXT: v1.cur = vmem(r0+#0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2 = vsplat(r2) |
| ; CHECK-NEXT: r4 = #512 |
| ; CHECK-NEXT: v4.uw = vcl0(v0.uw) |
| ; CHECK-NEXT: v0.cur = vmem(r0+#1) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v7 = vsplat(r4) |
| ; CHECK-NEXT: v6 = vsplat(r6) |
| ; CHECK-NEXT: v3.w = vadd(v3.w,v2.w) |
| ; CHECK-NEXT: v4.w = vadd(v4.w,v2.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r4 = #159 |
| ; CHECK-NEXT: v9 = vxor(v9,v9) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v10 = vsplat(r4) |
| ; CHECK-NEXT: v5.w = vasl(v1.w,v3.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v8.w = vasl(v0.w,v4.w) |
| ; CHECK-NEXT: v11.w = vadd(v5.w,v6.w) |
| ; CHECK-NEXT: v13 = vand(v5,v7) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.w = vadd(v8.w,v6.w) |
| ; CHECK-NEXT: v7 = vand(v8,v7) |
| ; CHECK-NEXT: q1 = vcmp.gt(v5.uw,v11.uw) |
| ; CHECK-NEXT: q2 = vcmp.eq(v13.w,v9.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v27.uw = vlsr(v11.uw,r3) |
| ; CHECK-NEXT: q3 = vcmp.gt(v8.uw,v6.uw) |
| ; CHECK-NEXT: q0 = vcmp.eq(v7.w,v9.w) |
| ; CHECK-NEXT: v28 = vmux(q2,v9,v2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.uw = vlsr(v6.uw,r3) |
| ; CHECK-NEXT: v29 = vmux(q1,v2,v9) |
| ; CHECK-NEXT: v30 = vmux(q3,v2,v9) |
| ; CHECK-NEXT: v2 = vmux(q0,v9,v2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v3.w = vsub(v29.w,v3.w) |
| ; CHECK-NEXT: v7.w = vadd(v27.w,v28.w) |
| ; CHECK-NEXT: v4.w = vsub(v30.w,v4.w) |
| ; CHECK-NEXT: v2.w = vadd(v6.w,v2.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v12.uw = vlsr(v5.uw,r3) |
| ; CHECK-NEXT: v3.w = vadd(v3.w,v10.w) |
| ; CHECK-NEXT: v4.w = vadd(v4.w,v10.w) |
| ; CHECK-NEXT: q2 = vcmp.eq(v1.w,v9.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r3 = #23 |
| ; CHECK-NEXT: v14.uw = vlsr(v8.uw,r3) |
| ; CHECK-NEXT: q3 = vcmp.eq(v12.w,v27.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v5.uw = vlsr(v27.uw,r2) |
| ; CHECK-NEXT: q1 = vcmp.eq(v14.w,v6.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v7.uw = vlsr(v7.uw,r2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2.uw = vlsr(v2.uw,r2) |
| ; CHECK-NEXT: v5 = vmux(q3,v7,v5) |
| ; CHECK-NEXT: q3 = vcmp.eq(v0.w,v9.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.uw = vlsr(v6.uw,r2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v3.w = vasl(v3.w,r3) |
| ; CHECK-NEXT: v31 = vmux(q1,v2,v6) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2.w = vasl(v4.w,r3) |
| ; CHECK-NEXT: v3 = vor(v5,v3) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1 = vor(v31,v2) |
| ; CHECK-NEXT: v3 = vmux(q2,v9,v3) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0 = vmux(q3,v9,v1) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2.qf32 = vadd(v3.sf,v9.sf) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v3.qf32 = vadd(v0.sf,v9.sf) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0.hf = v3:2.qf32 |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0.h = vdeal(v0.h) |
| ; CHECK-NEXT: jumpr r31 |
| ; CHECK-NEXT: vmem(r1+#0) = v0.new |
| ; CHECK-NEXT: } |
| %v0 = load <64 x i32>, ptr %a0, align 128 |
| %v1 = uitofp <64 x i32> %v0 to <64 x half> |
| store <64 x half> %v1, ptr %a1, align 128 |
| ret void |
| } |
| |
| ; Widen result |
| define void @u32f16_1(ptr %a0, ptr %a1) #0 { |
| ; CHECK-LABEL: u32f16_1: |
| ; CHECK: .cfi_startproc |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r3:2 = combine(##512,#1) |
| ; CHECK-NEXT: v1.uw = vcl0(v0.uw) |
| ; CHECK-NEXT: v0.cur = vmem(r0+#0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v3 = vsplat(r2) |
| ; CHECK-NEXT: v5 = vsplat(r3) |
| ; CHECK-NEXT: r6 = #255 |
| ; CHECK-NEXT: v2 = vxor(v2,v2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v4 = vsplat(r6) |
| ; CHECK-NEXT: r5 = #8 |
| ; CHECK-NEXT: r4 = #159 |
| ; CHECK-NEXT: v1.w = vadd(v1.w,v3.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v7 = vsplat(r4) |
| ; CHECK-NEXT: r3 = #23 |
| ; CHECK-NEXT: q2 = vcmp.eq(v0.w,v2.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.w = vasl(v0.w,v1.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v4.w = vadd(v6.w,v4.w) |
| ; CHECK-NEXT: v5 = vand(v6,v5) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.uw = vlsr(v6.uw,r5) |
| ; CHECK-NEXT: q0 = vcmp.eq(v5.w,v2.w) |
| ; CHECK-NEXT: q1 = vcmp.gt(v6.uw,v4.uw) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v4.uw = vlsr(v4.uw,r5) |
| ; CHECK-NEXT: v5 = vmux(q0,v2,v3) |
| ; CHECK-NEXT: v3 = vmux(q1,v3,v2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1.w = vsub(v3.w,v1.w) |
| ; CHECK-NEXT: v30.w = vadd(v4.w,v5.w) |
| ; CHECK-NEXT: q1 = vcmp.eq(v6.w,v4.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v31.uw = vlsr(v4.uw,r2) |
| ; CHECK-NEXT: v1.w = vadd(v1.w,v7.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r2 = #64 |
| ; CHECK-NEXT: v3.uw = vlsr(v30.uw,r2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1.w = vasl(v1.w,r3) |
| ; CHECK-NEXT: q3 = vsetq(r2) |
| ; CHECK-NEXT: v3 = vmux(q1,v3,v31) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1.qf32 = vadd(v2.sf,v2.sf) |
| ; CHECK-NEXT: v0 = vor(v3,v1) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0 = vmux(q2,v2,v0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0.qf32 = vadd(v0.sf,v2.sf) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0.hf = v1:0.qf32 |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0.h = vdeal(v0.h) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: jumpr r31 |
| ; CHECK-NEXT: if (q3) vmem(r1+#0) = v0 |
| ; CHECK-NEXT: } |
| %v0 = load <32 x i32>, ptr %a0, align 128 |
| %v1 = uitofp <32 x i32> %v0 to <32 x half> |
| store <32 x half> %v1, ptr %a1, align 128 |
| ret void |
| } |
| |
| |
| ; u32 -> f32 |
| ; No widening |
| define void @u32f32_0(ptr %a0, ptr %a1) #0 { |
| ; CHECK-LABEL: u32f32_0: |
| ; CHECK: .cfi_startproc |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r3:2 = combine(##512,#1) |
| ; CHECK-NEXT: v1.uw = vcl0(v0.uw) |
| ; CHECK-NEXT: v0.cur = vmem(r0+#0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2 = vsplat(r2) |
| ; CHECK-NEXT: v5 = vsplat(r3) |
| ; CHECK-NEXT: r6 = #255 |
| ; CHECK-NEXT: v3 = vxor(v3,v3) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v4 = vsplat(r6) |
| ; CHECK-NEXT: r5 = #8 |
| ; CHECK-NEXT: r4 = #159 |
| ; CHECK-NEXT: v1.w = vadd(v1.w,v2.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v7 = vsplat(r4) |
| ; CHECK-NEXT: r3 = #23 |
| ; CHECK-NEXT: q3 = vcmp.eq(v0.w,v3.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.w = vasl(v0.w,v1.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v4.w = vadd(v6.w,v4.w) |
| ; CHECK-NEXT: v5 = vand(v6,v5) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.uw = vlsr(v6.uw,r5) |
| ; CHECK-NEXT: q0 = vcmp.eq(v5.w,v3.w) |
| ; CHECK-NEXT: q1 = vcmp.gt(v6.uw,v4.uw) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v4.uw = vlsr(v4.uw,r5) |
| ; CHECK-NEXT: v5 = vmux(q0,v3,v2) |
| ; CHECK-NEXT: v2 = vmux(q1,v2,v3) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1.w = vsub(v2.w,v1.w) |
| ; CHECK-NEXT: v29.w = vadd(v4.w,v5.w) |
| ; CHECK-NEXT: q2 = vcmp.eq(v6.w,v4.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v30.uw = vlsr(v4.uw,r2) |
| ; CHECK-NEXT: v1.w = vadd(v1.w,v7.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2.uw = vlsr(v29.uw,r2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1.w = vasl(v1.w,r3) |
| ; CHECK-NEXT: v2 = vmux(q2,v2,v30) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v31 = vor(v2,v1) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0 = vmux(q3,v3,v31) |
| ; CHECK-NEXT: jumpr r31 |
| ; CHECK-NEXT: vmem(r1+#0) = v0.new |
| ; CHECK-NEXT: } |
| %v0 = load <32 x i32>, ptr %a0, align 128 |
| %v1 = uitofp <32 x i32> %v0 to <32 x float> |
| store <32 x float> %v1, ptr %a1, align 128 |
| ret void |
| } |
| |
| ; Widen input and result |
| define void @u32f32_1(ptr %a0, ptr %a1) #0 { |
| ; CHECK-LABEL: u32f32_1: |
| ; CHECK: .cfi_startproc |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r3:2 = combine(##512,#1) |
| ; CHECK-NEXT: v1.uw = vcl0(v0.uw) |
| ; CHECK-NEXT: v0.cur = vmem(r0+#0) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v2 = vsplat(r2) |
| ; CHECK-NEXT: v5 = vsplat(r3) |
| ; CHECK-NEXT: r6 = #255 |
| ; CHECK-NEXT: v3 = vxor(v3,v3) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v4 = vsplat(r6) |
| ; CHECK-NEXT: r5 = #8 |
| ; CHECK-NEXT: r4 = #159 |
| ; CHECK-NEXT: v1.w = vadd(v1.w,v2.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v7 = vsplat(r4) |
| ; CHECK-NEXT: r3 = #23 |
| ; CHECK-NEXT: q2 = vcmp.eq(v0.w,v3.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.w = vasl(v0.w,v1.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v4.w = vadd(v6.w,v4.w) |
| ; CHECK-NEXT: v5 = vand(v6,v5) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v6.uw = vlsr(v6.uw,r5) |
| ; CHECK-NEXT: q0 = vcmp.eq(v5.w,v3.w) |
| ; CHECK-NEXT: q1 = vcmp.gt(v6.uw,v4.uw) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v4.uw = vlsr(v4.uw,r5) |
| ; CHECK-NEXT: v5 = vmux(q0,v3,v2) |
| ; CHECK-NEXT: v2 = vmux(q1,v2,v3) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1.w = vsub(v2.w,v1.w) |
| ; CHECK-NEXT: v29.w = vadd(v4.w,v5.w) |
| ; CHECK-NEXT: q1 = vcmp.eq(v6.w,v4.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v30.uw = vlsr(v4.uw,r2) |
| ; CHECK-NEXT: v1.w = vadd(v1.w,v7.w) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: r2 = #64 |
| ; CHECK-NEXT: v2.uw = vlsr(v29.uw,r2) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v1.w = vasl(v1.w,r3) |
| ; CHECK-NEXT: q3 = vsetq(r2) |
| ; CHECK-NEXT: v2 = vmux(q1,v2,v30) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v31 = vor(v2,v1) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: v0 = vmux(q2,v3,v31) |
| ; CHECK-NEXT: } |
| ; CHECK-NEXT: { |
| ; CHECK-NEXT: jumpr r31 |
| ; CHECK-NEXT: if (q3) vmem(r1+#0) = v0 |
| ; CHECK-NEXT: } |
| %v0 = load <16 x i32>, ptr %a0, align 128 |
| %v1 = uitofp <16 x i32> %v0 to <16 x float> |
| store <16 x float> %v1, ptr %a1, align 128 |
| ret void |
| } |
| |
| |
| attributes #0 = { "target-features"="+v68,+hvxv68,+hvx-length128b,+hvx-qfloat" } |
| |