| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 < %s | FileCheck %s |
| |
| target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" |
| target triple = "x86_64-unknown-unknown" |
| |
| ; Stack reload folding tests. |
| ; |
| ; By including a nop call with sideeffects we can force a partial register spill of the |
| ; relevant registers and check that the reload is correctly folded into the instruction. |
| |
| define <32 x half> @stack_fold_addph_zmm(<32 x half> %a0, <32 x half> %a1) { |
| ; CHECK-LABEL: stack_fold_addph_zmm: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vaddph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = fadd <32 x half> %a0, %a1 |
| ret <32 x half> %2 |
| } |
| |
| define <32 x half> @stack_fold_addph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) { |
| ; CHECK-LABEL: stack_fold_addph_zmm_k: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovd %edi, %k1 |
| ; CHECK-NEXT: vmovaps (%rsi), %zmm2 |
| ; CHECK-NEXT: vaddph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %zmm2, %zmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = fadd <32 x half> %a0, %a1 |
| %3 = bitcast i32 %mask to <32 x i1> |
| %4 = load <32 x half>, ptr %passthru |
| %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 |
| ret <32 x half> %5 |
| } |
| |
| define <32 x half> @stack_fold_addph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) { |
| ; CHECK-LABEL: stack_fold_addph_zmm_k_commuted: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovd %edi, %k1 |
| ; CHECK-NEXT: vmovaps (%rsi), %zmm2 |
| ; CHECK-NEXT: vaddph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %zmm2, %zmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = fadd <32 x half> %a1, %a0 |
| %3 = bitcast i32 %mask to <32 x i1> |
| %4 = load <32 x half>, ptr %passthru |
| %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 |
| ret <32 x half> %5 |
| } |
| |
| define <32 x half> @stack_fold_addph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) { |
| ; CHECK-LABEL: stack_fold_addph_zmm_kz: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovd %edi, %k1 |
| ; CHECK-NEXT: vaddph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = fadd <32 x half> %a1, %a0 |
| %3 = bitcast i32 %mask to <32 x i1> |
| %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer |
| ret <32 x half> %4 |
| } |
| |
| define half @stack_fold_addsh(half %a0, half %a1) { |
| ; CHECK-LABEL: stack_fold_addsh: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vaddsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = fadd half %a0, %a1 |
| ret half %2 |
| } |
| |
| define <8 x half> @stack_fold_addsh_int(<8 x half> %a0, <8 x half> %a1) { |
| ; CHECK-LABEL: stack_fold_addsh_int: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vaddsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = extractelement <8 x half> %a0, i32 0 |
| %3 = extractelement <8 x half> %a1, i32 0 |
| %4 = fadd half %2, %3 |
| %5 = insertelement <8 x half> %a0, half %4, i32 0 |
| ret <8 x half> %5 |
| } |
| |
| define i32 @stack_fold_cmpph(<32 x half> %a0, <32 x half> %a1) { |
| ; CHECK-LABEL: stack_fold_cmpph: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vcmpeqph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload |
| ; CHECK-NEXT: kmovd %k0, %eax |
| ; CHECK-NEXT: vzeroupper |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %res = call <32 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.512(<32 x half> %a0, <32 x half> %a1, i32 0, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4) |
| %2 = bitcast <32 x i1> %res to i32 |
| ret i32 %2 |
| } |
| declare <32 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.512(<32 x half>, <32 x half>, i32, <32 x i1>, i32) |
| |
| define <32 x half> @stack_fold_cmpph_mask(<32 x half> %a0, <32 x half> %a1, ptr %a2, i32 %mask, <32 x half> %b0, <32 x half> %b1) { |
| ; CHECK-LABEL: stack_fold_cmpph_mask: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: subq $136, %rsp |
| ; CHECK-NEXT: .cfi_def_cfa_offset 144 |
| ; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload |
| ; CHECK-NEXT: vaddph (%rdi), %zmm0, %zmm0 |
| ; CHECK-NEXT: vcmpeqph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload |
| ; CHECK-NEXT: kmovd %esi, %k1 |
| ; CHECK-NEXT: kandd %k0, %k1, %k1 |
| ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload |
| ; CHECK-NEXT: vmovdqu16 (%rsp), %zmm0 {%k1} # 64-byte Folded Reload |
| ; CHECK-NEXT: addq $136, %rsp |
| ; CHECK-NEXT: .cfi_def_cfa_offset 8 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| ; load and fadd are here to keep the operations below the side effecting block and to avoid folding the wrong load |
| %2 = load <32 x half>, ptr %a2 |
| %3 = fadd <32 x half> %a1, %2 |
| %4 = bitcast i32 %mask to <32 x i1> |
| %5 = call <32 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.512(<32 x half> %3, <32 x half> %a0, i32 0, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4) |
| %6 = and <32 x i1> %4, %5 |
| %7 = select <32 x i1> %6, <32 x half> %b0, <32 x half> %b1 |
| ret <32 x half> %7 |
| } |
| |
| define <32 x half> @stack_fold_cmpph_mask_commuted(<32 x half> %a0, <32 x half> %a1, ptr %a2, i32 %mask, <32 x half> %b0, <32 x half> %b1) { |
| ; CHECK-LABEL: stack_fold_cmpph_mask_commuted: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: subq $136, %rsp |
| ; CHECK-NEXT: .cfi_def_cfa_offset 144 |
| ; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload |
| ; CHECK-NEXT: vaddph (%rdi), %zmm0, %zmm0 |
| ; CHECK-NEXT: vcmpeqph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload |
| ; CHECK-NEXT: kmovd %esi, %k1 |
| ; CHECK-NEXT: kandd %k0, %k1, %k1 |
| ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload |
| ; CHECK-NEXT: vmovdqu16 (%rsp), %zmm0 {%k1} # 64-byte Folded Reload |
| ; CHECK-NEXT: addq $136, %rsp |
| ; CHECK-NEXT: .cfi_def_cfa_offset 8 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| ; load and fadd are here to keep the operations below the side effecting block and to avoid folding the wrong load |
| %2 = load <32 x half>, ptr %a2 |
| %3 = fadd <32 x half> %a1, %2 |
| %4 = bitcast i32 %mask to <32 x i1> |
| %5 = call <32 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.512(<32 x half> %a0, <32 x half> %3, i32 0, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4) |
| %6 = and <32 x i1> %4, %5 |
| %7 = select <32 x i1> %6, <32 x half> %b0, <32 x half> %b1 |
| ret <32 x half> %7 |
| } |
| |
| define half @stack_fold_divsh(half %a0, half %a1) { |
| ; CHECK-LABEL: stack_fold_divsh: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vdivsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = fdiv half %a0, %a1 |
| ret half %2 |
| } |
| |
| define <8 x half> @stack_fold_divsh_int(<8 x half> %a0, <8 x half> %a1) { |
| ; CHECK-LABEL: stack_fold_divsh_int: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vdivsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = extractelement <8 x half> %a0, i32 0 |
| %3 = extractelement <8 x half> %a1, i32 0 |
| %4 = fdiv half %2, %3 |
| %5 = insertelement <8 x half> %a0, half %4, i32 0 |
| ret <8 x half> %5 |
| } |
| |
| define i32 @stack_fold_fpclassph(<32 x half> %a0) { |
| ; CHECK-LABEL: stack_fold_fpclassph: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vfpclassphz $4, {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 64-byte Folded Reload |
| ; CHECK-NEXT: kmovd %k0, %eax |
| ; CHECK-NEXT: vzeroupper |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <32 x i1> @llvm.x86.avx512fp16.fpclass.ph.512(<32 x half> %a0, i32 4) |
| %3 = bitcast <32 x i1> %2 to i32 |
| ret i32 %3 |
| } |
| declare <32 x i1> @llvm.x86.avx512fp16.fpclass.ph.512(<32 x half>, i32) |
| |
| define i32 @stack_fold_fpclassph_mask(<32 x half> %a0, ptr %p) { |
| ; CHECK-LABEL: stack_fold_fpclassph_mask: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovd (%rdi), %k1 |
| ; CHECK-NEXT: vfpclassphz $4, {{[-0-9]+}}(%r{{[sb]}}p), %k0 {%k1} # 64-byte Folded Reload |
| ; CHECK-NEXT: kmovd %k0, %eax |
| ; CHECK-NEXT: vzeroupper |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <32 x i1> @llvm.x86.avx512fp16.fpclass.ph.512(<32 x half> %a0, i32 4) |
| %mask = load <32 x i1>, ptr %p |
| %3 = and <32 x i1> %2, %mask |
| %4 = bitcast <32 x i1> %3 to i32 |
| ret i32 %4 |
| } |
| |
| define i8 @stack_fold_fpclasssh(<8 x half> %a0) { |
| ;CHECK-LABEl: stack_fold_fpclasssh: |
| ; CHECK-LABEL: stack_fold_fpclasssh: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vfpclasssh $4, {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 16-byte Folded Reload |
| ; CHECK-NEXT: kmovd %k0, %eax |
| ; CHECK-NEXT: # kill: def $al killed $al killed $eax |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call i8 @llvm.x86.avx512fp16.mask.fpclass.sh(<8 x half> %a0, i32 4, i8 -1) |
| ret i8 %2 |
| } |
| declare i8 @llvm.x86.avx512fp16.mask.fpclass.sh(<8 x half>, i32, i8) |
| |
| define i8 @stack_fold_fpclasssh_mask(<8 x half> %a0, ptr %p) { |
| ; CHECK-LABEL: stack_fold_fpclasssh_mask: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovb (%rdi), %k1 |
| ; CHECK-NEXT: vfpclasssh $4, {{[-0-9]+}}(%r{{[sb]}}p), %k0 {%k1} # 16-byte Folded Reload |
| ; CHECK-NEXT: kmovd %k0, %eax |
| ; CHECK-NEXT: # kill: def $al killed $al killed $eax |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %mask = load i8, ptr %p |
| %2 = call i8 @llvm.x86.avx512fp16.mask.fpclass.sh(<8 x half> %a0, i32 4, i8 %mask) |
| ret i8 %2 |
| } |
| |
| define <32 x half> @stack_fold_getexpph(<32 x half> %a0) { |
| ; CHECK-LABEL: stack_fold_getexpph: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vgetexpph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half> %a0, <32 x half> undef, i32 -1, i32 4) |
| ret <32 x half> %2 |
| } |
| declare <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half>, <32 x half>, i32, i32) |
| |
| define <32 x half> @stack_fold_getexpph_mask(<32 x half> %a0, ptr %passthru, i32 %mask) { |
| ; CHECK-LABEL: stack_fold_getexpph_mask: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: kmovd %esi, %k1 |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmovaps (%rdi), %zmm1 |
| ; CHECK-NEXT: vgetexpph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %zmm1, %zmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load <32 x half>, ptr %passthru |
| %3 = call <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half> %a0, <32 x half> %2, i32 %mask, i32 4) |
| ret <32 x half> %3 |
| } |
| |
| define <32 x half> @stack_fold_getexpph_maskz(<32 x half> %a0, ptr %mask) { |
| ; CHECK-LABEL: stack_fold_getexpph_maskz: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovd (%rdi), %k1 |
| ; CHECK-NEXT: vgetexpph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load i32, ptr %mask |
| %3 = call <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half> %a0, <32 x half> zeroinitializer, i32 %2, i32 4) |
| ret <32 x half> %3 |
| } |
| |
| define <8 x half> @stack_fold_getexpsh(<8 x half> %a0, <8 x half> %a1) { |
| ; CHECK-LABEL: stack_fold_getexpsh: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vgetexpsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 4) |
| ret <8 x half> %2 |
| } |
| declare <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half>, <8 x half>, <8 x half>, i8, i32) |
| |
| define <8 x half> @stack_fold_getexpsh_mask(<8 x half> %a0, <8 x half> %a1, ptr %passthru, i8 %mask) { |
| ; CHECK-LABEL: stack_fold_getexpsh_mask: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: kmovd %esi, %k1 |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmovaps (%rdi), %xmm2 |
| ; CHECK-NEXT: vgetexpsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %xmm2, %xmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load <8 x half>, ptr %passthru |
| %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 4) |
| ret <8 x half> %3 |
| } |
| |
| define <8 x half> @stack_fold_getexpsh_maskz(<8 x half> %a0, <8 x half> %a1, ptr %mask) { |
| ; CHECK-LABEL: stack_fold_getexpsh_maskz: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovb (%rdi), %k1 |
| ; CHECK-NEXT: vgetexpsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load i8, ptr %mask |
| %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2, i32 4) |
| ret <8 x half> %3 |
| } |
| |
| define <32 x half> @stack_fold_getmantph(<32 x half> %a0) { |
| ; CHECK-LABEL: stack_fold_getmantph: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vgetmantph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <32 x half> @llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half> %a0, i32 8, <32 x half> undef, i32 -1, i32 4) |
| ret <32 x half> %2 |
| } |
| declare <32 x half> @llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half>, i32, <32 x half>, i32, i32) |
| |
| define <32 x half> @stack_fold_getmantph_mask(<32 x half> %a0, ptr %passthru, i32 %mask) { |
| ; CHECK-LABEL: stack_fold_getmantph_mask: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: kmovd %esi, %k1 |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmovaps (%rdi), %zmm1 |
| ; CHECK-NEXT: vgetmantph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %zmm1, %zmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load <32 x half>, ptr %passthru |
| %3 = call <32 x half> @llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half> %a0, i32 8, <32 x half> %2, i32 %mask, i32 4) |
| ret <32 x half> %3 |
| } |
| |
| define <32 x half> @stack_fold_getmantph_maskz(<32 x half> %a0, ptr %mask) { |
| ; CHECK-LABEL: stack_fold_getmantph_maskz: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovd (%rdi), %k1 |
| ; CHECK-NEXT: vgetmantph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load i32, ptr %mask |
| %3 = call <32 x half> @llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half> %a0, i32 8, <32 x half> zeroinitializer, i32 %2, i32 4) |
| ret <32 x half> %3 |
| } |
| |
| define <8 x half> @stack_fold_getmantsh(<8 x half> %a0, <8 x half> %a1) { |
| ; CHECK-LABEL: stack_fold_getmantsh: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vgetmantsh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> %a0, <8 x half> %a1, i32 8, <8 x half> undef, i8 -1, i32 4) |
| ret <8 x half> %2 |
| } |
| declare <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half>, <8 x half>, i32, <8 x half>, i8, i32) |
| |
| define <8 x half> @stack_fold_getmantsh_mask(<8 x half> %a0, <8 x half> %a1, ptr %passthru, i8 %mask) { |
| ; CHECK-LABEL: stack_fold_getmantsh_mask: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: kmovd %esi, %k1 |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmovaps (%rdi), %xmm2 |
| ; CHECK-NEXT: vgetmantsh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %xmm2, %xmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load <8 x half>, ptr %passthru |
| %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> %a0, <8 x half> %a1, i32 8, <8 x half> %2, i8 %mask, i32 4) |
| ret <8 x half> %3 |
| } |
| |
| define <8 x half> @stack_fold_getmantsh_maskz(<8 x half> %a0, <8 x half> %a1, ptr %mask) { |
| ; CHECK-LABEL: stack_fold_getmantsh_maskz: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovb (%rdi), %k1 |
| ; CHECK-NEXT: vgetmantsh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load i8, ptr %mask |
| %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> %a0, <8 x half> %a1, i32 8, <8 x half> zeroinitializer, i8 %2, i32 4) |
| ret <8 x half> %3 |
| } |
| |
| define <32 x half> @stack_fold_maxph_zmm(<32 x half> %a0, <32 x half> %a1) #0 { |
| ; CHECK-LABEL: stack_fold_maxph_zmm: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) |
| ret <32 x half> %2 |
| } |
| declare <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half>, <32 x half>, i32) nounwind readnone |
| |
| define <32 x half> @stack_fold_maxph_zmm_commuted(<32 x half> %a0, <32 x half> %a1) #0 { |
| ; CHECK-LABEL: stack_fold_maxph_zmm_commuted: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload |
| ; CHECK-NEXT: vmaxph %zmm0, %zmm1, %zmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4) |
| ret <32 x half> %2 |
| } |
| |
| define <32 x half> @stack_fold_maxph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #0 { |
| ; CHECK-LABEL: stack_fold_maxph_zmm_k: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovd %edi, %k1 |
| ; CHECK-NEXT: vmovaps (%rsi), %zmm2 |
| ; CHECK-NEXT: vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %zmm2, %zmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) |
| %3 = bitcast i32 %mask to <32 x i1> |
| %4 = load <32 x half>, ptr %passthru |
| %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 |
| ret <32 x half> %5 |
| } |
| |
| define <32 x half> @stack_fold_maxph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #0 { |
| ; CHECK-LABEL: stack_fold_maxph_zmm_k_commuted: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovd %edi, %k1 |
| ; CHECK-NEXT: vmovaps (%rsi), %zmm2 |
| ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload |
| ; CHECK-NEXT: vmaxph %zmm0, %zmm1, %zmm2 {%k1} |
| ; CHECK-NEXT: vmovaps %zmm2, %zmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4) |
| %3 = bitcast i32 %mask to <32 x i1> |
| %4 = load <32 x half>, ptr %passthru |
| %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 |
| ret <32 x half> %5 |
| } |
| |
| define <32 x half> @stack_fold_maxph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 { |
| ; CHECK-LABEL: stack_fold_maxph_zmm_kz: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovd %edi, %k1 |
| ; CHECK-NEXT: vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) |
| %3 = bitcast i32 %mask to <32 x i1> |
| %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer |
| ret <32 x half> %4 |
| } |
| |
| define <32 x half> @stack_fold_maxph_zmm_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 { |
| ; CHECK-LABEL: stack_fold_maxph_zmm_kz_commuted: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovd %edi, %k1 |
| ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload |
| ; CHECK-NEXT: vmaxph %zmm0, %zmm1, %zmm0 {%k1} {z} |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4) |
| %3 = bitcast i32 %mask to <32 x i1> |
| %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer |
| ret <32 x half> %4 |
| } |
| |
| define <32 x half> @stack_fold_maxph_zmm_commutable(<32 x half> %a0, <32 x half> %a1) #1 { |
| ; CHECK-LABEL: stack_fold_maxph_zmm_commutable: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) |
| ret <32 x half> %2 |
| } |
| |
| define <32 x half> @stack_fold_maxph_zmm_commutable_commuted(<32 x half> %a0, <32 x half> %a1) #1 { |
| ; CHECK-LABEL: stack_fold_maxph_zmm_commutable_commuted: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4) |
| ret <32 x half> %2 |
| } |
| |
| define <32 x half> @stack_fold_maxph_zmm_commutable_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #1 { |
| ; CHECK-LABEL: stack_fold_maxph_zmm_commutable_k: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovd %edi, %k1 |
| ; CHECK-NEXT: vmovaps (%rsi), %zmm2 |
| ; CHECK-NEXT: vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %zmm2, %zmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) |
| %3 = bitcast i32 %mask to <32 x i1> |
| %4 = load <32 x half>, ptr %passthru |
| %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 |
| ret <32 x half> %5 |
| } |
| |
| define <32 x half> @stack_fold_maxph_zmm_commutable_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #1 { |
| ; CHECK-LABEL: stack_fold_maxph_zmm_commutable_k_commuted: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovd %edi, %k1 |
| ; CHECK-NEXT: vmovaps (%rsi), %zmm2 |
| ; CHECK-NEXT: vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %zmm2, %zmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4) |
| %3 = bitcast i32 %mask to <32 x i1> |
| %4 = load <32 x half>, ptr %passthru |
| %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 |
| ret <32 x half> %5 |
| } |
| |
| define <32 x half> @stack_fold_maxph_zmm_commutable_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) #1 { |
| ; CHECK-LABEL: stack_fold_maxph_zmm_commutable_kz: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovd %edi, %k1 |
| ; CHECK-NEXT: vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) |
| %3 = bitcast i32 %mask to <32 x i1> |
| %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer |
| ret <32 x half> %4 |
| } |
| |
| define <32 x half> @stack_fold_maxph_zmm_commutable_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) #1 { |
| ; CHECK-LABEL: stack_fold_maxph_zmm_commutable_kz_commuted: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovd %edi, %k1 |
| ; CHECK-NEXT: vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4) |
| %3 = bitcast i32 %mask to <32 x i1> |
| %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer |
| ret <32 x half> %4 |
| } |
| |
| define half @stack_fold_maxsh(half %a0, half %a1) #0 { |
| ; CHECK-LABEL: stack_fold_maxsh: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmaxsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = fcmp ogt half %a0, %a1 |
| %3 = select i1 %2, half %a0, half %a1 |
| ret half %3 |
| } |
| |
| define half @stack_fold_maxsh_commuted(half %a0, half %a1) #0 { |
| ; CHECK-LABEL: stack_fold_maxsh_commuted: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmovsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload |
| ; CHECK-NEXT: vmaxsh %xmm0, %xmm1, %xmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = fcmp ogt half %a1, %a0 |
| %3 = select i1 %2, half %a1, half %a0 |
| ret half %3 |
| } |
| |
| define half @stack_fold_maxsh_commutable(half %a0, half %a1) #1 { |
| ; CHECK-LABEL: stack_fold_maxsh_commutable: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmaxsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = fcmp ogt half %a0, %a1 |
| %3 = select i1 %2, half %a0, half %a1 |
| ret half %3 |
| } |
| |
| define half @stack_fold_maxsh_commutable_commuted(half %a0, half %a1) #1 { |
| ; CHECK-LABEL: stack_fold_maxsh_commutable_commuted: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmaxsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = fcmp ogt half %a1, %a0 |
| %3 = select i1 %2, half %a1, half %a0 |
| ret half %3 |
| } |
| |
| define <8 x half> @stack_fold_maxsh_int(<8 x half> %a0, <8 x half> %a1) #0 { |
| ; CHECK-LABEL: stack_fold_maxsh_int: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmaxsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 4) |
| ret <8 x half> %2 |
| } |
| declare <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half>, <8 x half>, <8 x half>, i8, i32) |
| |
| define <8 x half> @stack_fold_maxsh_mask(<8 x half> %a0, <8 x half> %a1, i8 %mask, ptr %passthru) { |
| ; CHECK-LABEL: stack_fold_maxsh_mask: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: kmovd %edi, %k1 |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmovaps (%rsi), %xmm2 |
| ; CHECK-NEXT: vmaxsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %xmm2, %xmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load <8 x half>, ptr %passthru |
| %3 = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 4) |
| ret <8 x half> %3 |
| } |
| |
| define <8 x half> @stack_fold_maxsh_maskz(<8 x half> %a0, <8 x half> %a1, i8 %mask) { |
| ; CHECK-LABEL: stack_fold_maxsh_maskz: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: kmovd %edi, %k1 |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmaxsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %mask, i32 4) |
| ret <8 x half> %2 |
| } |
| |
| define <32 x half> @stack_fold_minph_zmm(<32 x half> %a0, <32 x half> %a1) #0 { |
| ; CHECK-LABEL: stack_fold_minph_zmm: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) |
| ret <32 x half> %2 |
| } |
| declare <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half>, <32 x half>, i32) nounwind readnone |
| |
| define <32 x half> @stack_fold_minph_zmm_commuted(<32 x half> %a0, <32 x half> %a1) #0 { |
| ; CHECK-LABEL: stack_fold_minph_zmm_commuted: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload |
| ; CHECK-NEXT: vminph %zmm0, %zmm1, %zmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4) |
| ret <32 x half> %2 |
| } |
| |
| define <32 x half> @stack_fold_minph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #0 { |
| ; CHECK-LABEL: stack_fold_minph_zmm_k: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovd %edi, %k1 |
| ; CHECK-NEXT: vmovaps (%rsi), %zmm2 |
| ; CHECK-NEXT: vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %zmm2, %zmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) |
| %3 = bitcast i32 %mask to <32 x i1> |
| %4 = load <32 x half>, ptr %passthru |
| %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 |
| ret <32 x half> %5 |
| } |
| |
| define <32 x half> @stack_fold_minph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #0 { |
| ; CHECK-LABEL: stack_fold_minph_zmm_k_commuted: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovd %edi, %k1 |
| ; CHECK-NEXT: vmovaps (%rsi), %zmm2 |
| ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload |
| ; CHECK-NEXT: vminph %zmm0, %zmm1, %zmm2 {%k1} |
| ; CHECK-NEXT: vmovaps %zmm2, %zmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4) |
| %3 = bitcast i32 %mask to <32 x i1> |
| %4 = load <32 x half>, ptr %passthru |
| %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 |
| ret <32 x half> %5 |
| } |
| |
| define <32 x half> @stack_fold_minph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 { |
| ; CHECK-LABEL: stack_fold_minph_zmm_kz: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovd %edi, %k1 |
| ; CHECK-NEXT: vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) |
| %3 = bitcast i32 %mask to <32 x i1> |
| %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer |
| ret <32 x half> %4 |
| } |
| |
| define <32 x half> @stack_fold_minph_zmm_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 { |
| ; CHECK-LABEL: stack_fold_minph_zmm_kz_commuted: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovd %edi, %k1 |
| ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload |
| ; CHECK-NEXT: vminph %zmm0, %zmm1, %zmm0 {%k1} {z} |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4) |
| %3 = bitcast i32 %mask to <32 x i1> |
| %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer |
| ret <32 x half> %4 |
| } |
| |
| define <32 x half> @stack_fold_minph_zmm_commutable(<32 x half> %a0, <32 x half> %a1) #1 { |
| ; CHECK-LABEL: stack_fold_minph_zmm_commutable: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) |
| ret <32 x half> %2 |
| } |
| |
| define <32 x half> @stack_fold_minph_zmm_commutable_commuted(<32 x half> %a0, <32 x half> %a1) #1 { |
| ; CHECK-LABEL: stack_fold_minph_zmm_commutable_commuted: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4) |
| ret <32 x half> %2 |
| } |
| |
| define <32 x half> @stack_fold_minph_zmm_commutable_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #1 { |
| ; CHECK-LABEL: stack_fold_minph_zmm_commutable_k: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovd %edi, %k1 |
| ; CHECK-NEXT: vmovaps (%rsi), %zmm2 |
| ; CHECK-NEXT: vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %zmm2, %zmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) |
| %3 = bitcast i32 %mask to <32 x i1> |
| %4 = load <32 x half>, ptr %passthru |
| %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 |
| ret <32 x half> %5 |
| } |
| |
| define <32 x half> @stack_fold_minph_zmm_commutable_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #1 { |
| ; CHECK-LABEL: stack_fold_minph_zmm_commutable_k_commuted: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovd %edi, %k1 |
| ; CHECK-NEXT: vmovaps (%rsi), %zmm2 |
| ; CHECK-NEXT: vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %zmm2, %zmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4) |
| %3 = bitcast i32 %mask to <32 x i1> |
| %4 = load <32 x half>, ptr %passthru |
| %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 |
| ret <32 x half> %5 |
| } |
| |
| define <32 x half> @stack_fold_minph_zmm_commutable_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) #1 { |
| ; CHECK-LABEL: stack_fold_minph_zmm_commutable_kz: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovd %edi, %k1 |
| ; CHECK-NEXT: vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) |
| %3 = bitcast i32 %mask to <32 x i1> |
| %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer |
| ret <32 x half> %4 |
| } |
| |
| define <32 x half> @stack_fold_minph_zmm_commutable_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) #1 { |
| ; CHECK-LABEL: stack_fold_minph_zmm_commutable_kz_commuted: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovd %edi, %k1 |
| ; CHECK-NEXT: vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4) |
| %3 = bitcast i32 %mask to <32 x i1> |
| %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer |
| ret <32 x half> %4 |
| } |
| |
| define half @stack_fold_minsh(half %a0, half %a1) #0 { |
| ; CHECK-LABEL: stack_fold_minsh: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vminsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = fcmp olt half %a0, %a1 |
| %3 = select i1 %2, half %a0, half %a1 |
| ret half %3 |
| } |
| |
| define half @stack_fold_minsh_commuted(half %a0, half %a1) #0 { |
| ; CHECK-LABEL: stack_fold_minsh_commuted: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmovsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload |
| ; CHECK-NEXT: vminsh %xmm0, %xmm1, %xmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = fcmp olt half %a1, %a0 |
| %3 = select i1 %2, half %a1, half %a0 |
| ret half %3 |
| } |
| |
| define half @stack_fold_minsh_commutable(half %a0, half %a1) #1 { |
| ; CHECK-LABEL: stack_fold_minsh_commutable: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vminsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = fcmp olt half %a0, %a1 |
| %3 = select i1 %2, half %a0, half %a1 |
| ret half %3 |
| } |
| |
| define half @stack_fold_minsh_commutable_commuted(half %a0, half %a1) #1 { |
| ; CHECK-LABEL: stack_fold_minsh_commutable_commuted: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vminsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = fcmp olt half %a1, %a0 |
| %3 = select i1 %2, half %a1, half %a0 |
| ret half %3 |
| } |
| |
| define <8 x half> @stack_fold_minsh_int(<8 x half> %a0, <8 x half> %a1) #0 { |
| ; CHECK-LABEL: stack_fold_minsh_int: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vminsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 4) |
| ret <8 x half> %2 |
| } |
| declare <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half>, <8 x half>, <8 x half>, i8, i32) |
| |
| define <8 x half> @stack_fold_minsh_mask(<8 x half> %a0, <8 x half> %a1, i8 %mask, ptr %passthru) { |
| ; CHECK-LABEL: stack_fold_minsh_mask: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: kmovd %edi, %k1 |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmovaps (%rsi), %xmm2 |
| ; CHECK-NEXT: vminsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %xmm2, %xmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load <8 x half>, ptr %passthru |
| %3 = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 4) |
| ret <8 x half> %3 |
| } |
| |
| define <8 x half> @stack_fold_minsh_maskz(<8 x half> %a0, <8 x half> %a1, i8 %mask) { |
| ; CHECK-LABEL: stack_fold_minsh_maskz: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: kmovd %edi, %k1 |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vminsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %mask, i32 4) |
| ret <8 x half> %2 |
| } |
| |
| define <32 x half> @stack_fold_mulph_zmm(<32 x half> %a0, <32 x half> %a1) { |
| ; CHECK-LABEL: stack_fold_mulph_zmm: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmulph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = fmul <32 x half> %a0, %a1 |
| ret <32 x half> %2 |
| } |
| |
| define <32 x half> @stack_fold_mulph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) { |
| ; CHECK-LABEL: stack_fold_mulph_zmm_k: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovd %edi, %k1 |
| ; CHECK-NEXT: vmovaps (%rsi), %zmm2 |
| ; CHECK-NEXT: vmulph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %zmm2, %zmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = fmul <32 x half> %a0, %a1 |
| %3 = bitcast i32 %mask to <32 x i1> |
| %4 = load <32 x half>, ptr %passthru |
| %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 |
| ret <32 x half> %5 |
| } |
| |
| define <32 x half> @stack_fold_mulph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) { |
| ; CHECK-LABEL: stack_fold_mulph_zmm_k_commuted: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovd %edi, %k1 |
| ; CHECK-NEXT: vmovaps (%rsi), %zmm2 |
| ; CHECK-NEXT: vmulph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %zmm2, %zmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = fmul <32 x half> %a1, %a0 |
| %3 = bitcast i32 %mask to <32 x i1> |
| %4 = load <32 x half>, ptr %passthru |
| %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 |
| ret <32 x half> %5 |
| } |
| |
| define <32 x half> @stack_fold_mulph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) { |
| ; CHECK-LABEL: stack_fold_mulph_zmm_kz: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovd %edi, %k1 |
| ; CHECK-NEXT: vmulph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = fmul <32 x half> %a1, %a0 |
| %3 = bitcast i32 %mask to <32 x i1> |
| %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer |
| ret <32 x half> %4 |
| } |
| |
| define half @stack_fold_mulsh(half %a0, half %a1) { |
| ; CHECK-LABEL: stack_fold_mulsh: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmulsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = fmul half %a0, %a1 |
| ret half %2 |
| } |
| |
| define <8 x half> @stack_fold_mulsh_int(<8 x half> %a0, <8 x half> %a1) { |
| ; CHECK-LABEL: stack_fold_mulsh_int: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmulsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = extractelement <8 x half> %a0, i32 0 |
| %3 = extractelement <8 x half> %a1, i32 0 |
| %4 = fmul half %2, %3 |
| %5 = insertelement <8 x half> %a0, half %4, i32 0 |
| ret <8 x half> %5 |
| } |
| |
| define <32 x half> @stack_fold_rcpph(<32 x half> %a0) { |
| ; CHECK-LABEL: stack_fold_rcpph: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vrcpph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half> %a0, <32 x half> undef, i32 -1) |
| ret <32 x half> %2 |
| } |
| declare <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half>, <32 x half>, i32) |
| |
| define <32 x half> @stack_fold_rcpph_mask(<32 x half> %a0, ptr %passthru, i32 %mask) { |
| ; CHECK-LABEL: stack_fold_rcpph_mask: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: kmovd %esi, %k1 |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmovaps (%rdi), %zmm1 |
| ; CHECK-NEXT: vrcpph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %zmm1, %zmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load <32 x half>, ptr %passthru |
| %3 = call <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half> %a0, <32 x half> %2, i32 %mask) |
| ret <32 x half> %3 |
| } |
| |
| define <32 x half> @stack_fold_rcpph_maskz(<32 x half> %a0, ptr %mask) { |
| ; CHECK-LABEL: stack_fold_rcpph_maskz: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovd (%rdi), %k1 |
| ; CHECK-NEXT: vrcpph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load i32, ptr %mask |
| %3 = call <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half> %a0, <32 x half> zeroinitializer, i32 %2) |
| ret <32 x half> %3 |
| } |
| |
| define <8 x half> @stack_fold_rcpsh(<8 x half> %a0, <8 x half> %a1) { |
| ; CHECK-LABEL: stack_fold_rcpsh: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vrcpsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1) |
| ret <8 x half> %2 |
| } |
| declare <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh(<8 x half>, <8 x half>, <8 x half>, i8) |
| |
| define <8 x half> @stack_fold_rcpsh_mask(<8 x half> %a0, <8 x half> %a1, ptr %passthru, i8 %mask) { |
| ; CHECK-LABEL: stack_fold_rcpsh_mask: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: kmovd %esi, %k1 |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmovaps (%rdi), %xmm2 |
| ; CHECK-NEXT: vrcpsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %xmm2, %xmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load <8 x half>, ptr %passthru |
| %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask) |
| ret <8 x half> %3 |
| } |
| |
| define <8 x half> @stack_fold_rcpsh_maskz(<8 x half> %a0, <8 x half> %a1, ptr %mask) { |
| ; CHECK-LABEL: stack_fold_rcpsh_maskz: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovb (%rdi), %k1 |
| ; CHECK-NEXT: vrcpsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load i8, ptr %mask |
| %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2) |
| ret <8 x half> %3 |
| } |
| |
| define <32 x half> @stack_fold_reduceph(<32 x half> %a0) { |
| ; CHECK-LABEL: stack_fold_reduceph: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vreduceph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <32 x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half> %a0, i32 8, <32 x half> undef, i32 -1, i32 4) |
| ret <32 x half> %2 |
| } |
| declare <32 x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half>, i32, <32 x half>, i32, i32) |
| |
| define <32 x half> @stack_fold_reduceph_mask(<32 x half> %a0, ptr %passthru, i32 %mask) { |
| ; CHECK-LABEL: stack_fold_reduceph_mask: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: kmovd %esi, %k1 |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmovaps (%rdi), %zmm1 |
| ; CHECK-NEXT: vreduceph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %zmm1, %zmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load <32 x half>, ptr %passthru |
| %3 = call <32 x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half> %a0, i32 8, <32 x half> %2, i32 %mask, i32 4) |
| ret <32 x half> %3 |
| } |
| |
| define <32 x half> @stack_fold_reduceph_maskz(<32 x half> %a0, ptr %mask) { |
| ; CHECK-LABEL: stack_fold_reduceph_maskz: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovd (%rdi), %k1 |
| ; CHECK-NEXT: vreduceph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load i32, ptr %mask |
| %3 = call <32 x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half> %a0, i32 8, <32 x half> zeroinitializer, i32 %2, i32 4) |
| ret <32 x half> %3 |
| } |
| |
| define <8 x half> @stack_fold_reducesh(<8 x half> %a0, <8 x half> %a1) { |
| ; CHECK-LABEL: stack_fold_reducesh: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vreducesh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 8, i32 4) |
| ret <8 x half> %2 |
| } |
| declare <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half>, <8 x half>, <8 x half>, i8, i32, i32) |
| |
| define <8 x half> @stack_fold_reducesh_mask(<8 x half> %a0, <8 x half> %a1, ptr %passthru, i8 %mask) { |
| ; CHECK-LABEL: stack_fold_reducesh_mask: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: kmovd %esi, %k1 |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmovaps (%rdi), %xmm2 |
| ; CHECK-NEXT: vreducesh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %xmm2, %xmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load <8 x half>, ptr %passthru |
| %3 = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 8, i32 4) |
| ret <8 x half> %3 |
| } |
| |
| define <8 x half> @stack_fold_reducesh_maskz(<8 x half> %a0, <8 x half> %a1, ptr %mask) { |
| ; CHECK-LABEL: stack_fold_reducesh_maskz: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovb (%rdi), %k1 |
| ; CHECK-NEXT: vreducesh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load i8, ptr %mask |
| %3 = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2, i32 8, i32 4) |
| ret <8 x half> %3 |
| } |
| |
| define <32 x half> @stack_fold_rndscaleph(<32 x half> %a0) { |
| ; CHECK-LABEL: stack_fold_rndscaleph: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vrndscaleph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half> %a0, i32 8, <32 x half> undef, i32 -1, i32 4) |
| ret <32 x half> %2 |
| } |
| declare <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half>, i32, <32 x half>, i32, i32) |
| |
| define <32 x half> @stack_fold_rndscaleph_mask(<32 x half> %a0, ptr %passthru, i32 %mask) { |
| ; CHECK-LABEL: stack_fold_rndscaleph_mask: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: kmovd %esi, %k1 |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmovaps (%rdi), %zmm1 |
| ; CHECK-NEXT: vrndscaleph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %zmm1, %zmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load <32 x half>, ptr %passthru |
| %3 = call <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half> %a0, i32 8, <32 x half> %2, i32 %mask, i32 4) |
| ret <32 x half> %3 |
| } |
| |
| define <32 x half> @stack_fold_rndscaleph_maskz(<32 x half> %a0, ptr %mask) { |
| ; CHECK-LABEL: stack_fold_rndscaleph_maskz: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovd (%rdi), %k1 |
| ; CHECK-NEXT: vrndscaleph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load i32, ptr %mask |
| %3 = call <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half> %a0, i32 8, <32 x half> zeroinitializer, i32 %2, i32 4) |
| ret <32 x half> %3 |
| } |
| |
| define <8 x half> @stack_fold_rndscalesh(<8 x half> %a0, <8 x half> %a1) { |
| ; CHECK-LABEL: stack_fold_rndscalesh: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vrndscalesh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 8, i32 4) |
| ret <8 x half> %2 |
| } |
| declare <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half>, <8 x half>, <8 x half>, i8, i32, i32) |
| |
| define <8 x half> @stack_fold_rndscalesh_mask(<8 x half> %a0, <8 x half> %a1, ptr %passthru, i8 %mask) { |
| ; CHECK-LABEL: stack_fold_rndscalesh_mask: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: kmovd %esi, %k1 |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmovaps (%rdi), %xmm2 |
| ; CHECK-NEXT: vrndscalesh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %xmm2, %xmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load <8 x half>, ptr %passthru |
| %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 8, i32 4) |
| ret <8 x half> %3 |
| } |
| |
| define <8 x half> @stack_fold_rndscalesh_maskz(<8 x half> %a0, <8 x half> %a1, ptr %mask) { |
| ; CHECK-LABEL: stack_fold_rndscalesh_maskz: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovb (%rdi), %k1 |
| ; CHECK-NEXT: vrndscalesh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load i8, ptr %mask |
| %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2, i32 8, i32 4) |
| ret <8 x half> %3 |
| } |
| |
| define <32 x half> @stack_fold_rsqrtph(<32 x half> %a0) { |
| ; CHECK-LABEL: stack_fold_rsqrtph: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vrsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512(<32 x half> %a0, <32 x half> undef, i32 -1) |
| ret <32 x half> %2 |
| } |
| declare <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512(<32 x half>, <32 x half>, i32) |
| |
| define <32 x half> @stack_fold_rsqrtph_mask(<32 x half> %a0, ptr %passthru, i32 %mask) { |
| ; CHECK-LABEL: stack_fold_rsqrtph_mask: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: kmovd %esi, %k1 |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmovaps (%rdi), %zmm1 |
| ; CHECK-NEXT: vrsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %zmm1, %zmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load <32 x half>, ptr %passthru |
| %3 = call <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512(<32 x half> %a0, <32 x half> %2, i32 %mask) |
| ret <32 x half> %3 |
| } |
| |
| define <32 x half> @stack_fold_rsqrtph_maskz(<32 x half> %a0, ptr %mask) { |
| ; CHECK-LABEL: stack_fold_rsqrtph_maskz: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovd (%rdi), %k1 |
| ; CHECK-NEXT: vrsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load i32, ptr %mask |
| %3 = call <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512(<32 x half> %a0, <32 x half> zeroinitializer, i32 %2) |
| ret <32 x half> %3 |
| } |
| |
| define <8 x half> @stack_fold_rsqrtsh(<8 x half> %a0, <8 x half> %a1) { |
| ; CHECK-LABEL: stack_fold_rsqrtsh: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vrsqrtsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1) |
| ret <8 x half> %2 |
| } |
| declare <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half>, <8 x half>, <8 x half>, i8) |
| |
| define <8 x half> @stack_fold_rsqrtsh_mask(<8 x half> %a0, <8 x half> %a1, ptr %passthru, i8 %mask) { |
| ; CHECK-LABEL: stack_fold_rsqrtsh_mask: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: kmovd %esi, %k1 |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmovaps (%rdi), %xmm2 |
| ; CHECK-NEXT: vrsqrtsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %xmm2, %xmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load <8 x half>, ptr %passthru |
| %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask) |
| ret <8 x half> %3 |
| } |
| |
| define <8 x half> @stack_fold_rsqrtsh_maskz(<8 x half> %a0, <8 x half> %a1, ptr %mask) { |
| ; CHECK-LABEL: stack_fold_rsqrtsh_maskz: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovb (%rdi), %k1 |
| ; CHECK-NEXT: vrsqrtsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load i8, ptr %mask |
| %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2) |
| ret <8 x half> %3 |
| } |
| |
| define <32 x half> @stack_fold_sqrtph(<32 x half> %a0) { |
| ; CHECK-LABEL: stack_fold_sqrtph: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %a0) |
| ret <32 x half> %2 |
| } |
| declare <32 x half> @llvm.sqrt.v32f16(<32 x half>) |
| |
| define <32 x half> @stack_fold_sqrtph_mask(<32 x half> %a0, ptr %passthru, i32 %mask) { |
| ; CHECK-LABEL: stack_fold_sqrtph_mask: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmovaps (%rdi), %zmm1 |
| ; CHECK-NEXT: kmovd %esi, %k1 |
| ; CHECK-NEXT: vsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %zmm1, %zmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load <32 x half>, ptr %passthru |
| %3 = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %a0) |
| %4 = bitcast i32 %mask to <32 x i1> |
| %5 = select <32 x i1> %4, <32 x half> %3, <32 x half> %2 |
| ret <32 x half> %5 |
| } |
| |
| define <32 x half> @stack_fold_sqrtph_maskz(<32 x half> %a0, ptr %mask) { |
| ; CHECK-LABEL: stack_fold_sqrtph_maskz: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovd (%rdi), %k1 |
| ; CHECK-NEXT: vsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load i32, ptr %mask |
| %3 = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %a0) |
| %4 = bitcast i32 %2 to <32 x i1> |
| %5 = select <32 x i1> %4, <32 x half> %3, <32 x half> zeroinitializer |
| ret <32 x half> %5 |
| } |
| |
| define <8 x half> @stack_fold_sqrtsh(<8 x half> %a0, <8 x half> %a1) { |
| ; CHECK-LABEL: stack_fold_sqrtsh: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vsqrtsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 4) |
| ret <8 x half> %2 |
| } |
| declare <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half>, <8 x half>, <8 x half>, i8, i32) |
| |
| define <8 x half> @stack_fold_sqrtsh_mask(<8 x half> %a0, <8 x half> %a1, ptr %passthru, i8 %mask) { |
| ; CHECK-LABEL: stack_fold_sqrtsh_mask: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: kmovd %esi, %k1 |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmovaps (%rdi), %xmm2 |
| ; CHECK-NEXT: vsqrtsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %xmm2, %xmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load <8 x half>, ptr %passthru |
| %3 = call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 4) |
| ret <8 x half> %3 |
| } |
| |
| define <8 x half> @stack_fold_sqrtsh_maskz(<8 x half> %a0, <8 x half> %a1, ptr %mask) { |
| ; CHECK-LABEL: stack_fold_sqrtsh_maskz: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovb (%rdi), %k1 |
| ; CHECK-NEXT: vsqrtsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load i8, ptr %mask |
| %3 = call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2, i32 4) |
| ret <8 x half> %3 |
| } |
| |
| define <32 x half> @stack_fold_subph_zmm(<32 x half> %a0, <32 x half> %a1) { |
| ; CHECK-LABEL: stack_fold_subph_zmm: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vsubph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = fsub <32 x half> %a0, %a1 |
| ret <32 x half> %2 |
| } |
| |
| define half @stack_fold_subsh(half %a0, half %a1) { |
| ; CHECK-LABEL: stack_fold_subsh: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vsubsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = fsub half %a0, %a1 |
| ret half %2 |
| } |
| |
| define <8 x half> @stack_fold_subsh_int(<8 x half> %a0, <8 x half> %a1) { |
| ; CHECK-LABEL: stack_fold_subsh_int: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vsubsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = extractelement <8 x half> %a0, i32 0 |
| %3 = extractelement <8 x half> %a1, i32 0 |
| %4 = fsub half %2, %3 |
| %5 = insertelement <8 x half> %a0, half %4, i32 0 |
| ret <8 x half> %5 |
| } |
| |
| define <16 x float> @stack_fold_fmulcph(<16 x float> %a0, <16 x float> %a1) { |
| ; CHECK-LABEL: stack_fold_fmulcph: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %zmm2, %zmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4) |
| ret <16 x float> %2 |
| } |
| |
| define <16 x float> @stack_fold_fmulcph_commute(<16 x float> %a0, <16 x float> %a1) { |
| ; CHECK-LABEL: stack_fold_fmulcph_commute: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %zmm2, %zmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a1, <16 x float> %a0, <16 x float> undef, i16 -1, i32 4) |
| ret <16 x float> %2 |
| } |
| declare <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) |
| |
| define <16 x float> @stack_fold_fmulcph_mask(<16 x float> %a0, <16 x float> %a1, ptr %passthru, i16 %mask) { |
| ; CHECK-LABEL: stack_fold_fmulcph_mask: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: kmovd %esi, %k1 |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmovaps (%rdi), %zmm2 |
| ; CHECK-NEXT: vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %zmm2, %zmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load <16 x float>, ptr %passthru |
| %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %2, i16 %mask, i32 4) |
| ret <16 x float> %3 |
| } |
| |
| define <16 x float> @stack_fold_fmulcph_maskz(<16 x float> %a0, <16 x float> %a1, ptr %mask) { |
| ; CHECK-LABEL: stack_fold_fmulcph_maskz: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovw (%rdi), %k1 |
| ; CHECK-NEXT: vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} {z} # 64-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %zmm2, %zmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load i16, ptr %mask |
| %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> zeroinitializer, i16 %2, i32 4) |
| ret <16 x float> %3 |
| } |
| |
| define <16 x float> @stack_fold_fcmulcph(<16 x float> %a0, <16 x float> %a1) { |
| ; CHECK-LABEL: stack_fold_fcmulcph: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %zmm2, %zmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4) |
| ret <16 x float> %2 |
| } |
| |
| define <16 x float> @stack_fold_fcmulcph_commute(<16 x float> %a0, <16 x float> %a1) { |
| ; CHECK-LABEL: stack_fold_fcmulcph_commute: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload |
| ; CHECK-NEXT: vfcmulcph %zmm0, %zmm1, %zmm2 |
| ; CHECK-NEXT: vmovaps %zmm2, %zmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a1, <16 x float> %a0, <16 x float> undef, i16 -1, i32 4) |
| ret <16 x float> %2 |
| } |
| declare <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) |
| |
| define <16 x float> @stack_fold_fcmulcph_mask(<16 x float> %a0, <16 x float> %a1, ptr %passthru, i16 %mask) { |
| ; CHECK-LABEL: stack_fold_fcmulcph_mask: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: kmovd %esi, %k1 |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmovaps (%rdi), %zmm2 |
| ; CHECK-NEXT: vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %zmm2, %zmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load <16 x float>, ptr %passthru |
| %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %2, i16 %mask, i32 4) |
| ret <16 x float> %3 |
| } |
| |
| define <16 x float> @stack_fold_fcmulcph_maskz(<16 x float> %a0, <16 x float> %a1, ptr %mask) { |
| ; CHECK-LABEL: stack_fold_fcmulcph_maskz: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovw (%rdi), %k1 |
| ; CHECK-NEXT: vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} {z} # 64-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %zmm2, %zmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load i16, ptr %mask |
| %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> zeroinitializer, i16 %2, i32 4) |
| ret <16 x float> %3 |
| } |
| |
| define <16 x float> @stack_fold_fmaddcph(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { |
| ; CHECK-LABEL: stack_fold_fmaddcph: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vfmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %a1, <16 x float> %a2, <16 x float> %a0, i16 -1, i32 4) |
| ret <16 x float> %2 |
| } |
| |
| define <16 x float> @stack_fold_fmaddcph_commute(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { |
| ; CHECK-LABEL: stack_fold_fmaddcph_commute: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vfmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %a2, <16 x float> %a1, <16 x float> %a0, i16 -1, i32 4) |
| ret <16 x float> %2 |
| } |
| declare <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) |
| |
| define <16 x float> @stack_fold_fmaddcph_mask(ptr %p, <16 x float> %a1, <16 x float> %a2, i16 %mask) { |
| ; CHECK-LABEL: stack_fold_fmaddcph_mask: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: kmovd %esi, %k1 |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmovaps (%rdi), %zmm2 |
| ; CHECK-NEXT: vfmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %zmm2, %zmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %a0 = load <16 x float>, ptr %p |
| %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %a1, <16 x float> %a2, <16 x float> %a0, i16 %mask, i32 4) |
| ret <16 x float> %2 |
| } |
| |
| define <16 x float> @stack_fold_fmaddcph_maskz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, ptr %mask) { |
| ; CHECK-LABEL: stack_fold_fmaddcph_maskz: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovw (%rdi), %k1 |
| ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 |
| ; CHECK-NEXT: vfmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load i16, ptr %mask |
| %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %a1, <16 x float> %a2, <16 x float> zeroinitializer, i16 %2, i32 4) |
| ret <16 x float> %3 |
| } |
| declare <16 x float> @llvm.x86.avx512fp16.maskz.vfmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) |
| |
| define <16 x float> @stack_fold_fcmaddcph(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { |
| ; CHECK-LABEL: stack_fold_fcmaddcph: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vfcmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %a1, <16 x float> %a2, <16 x float> %a0, i16 -1, i32 4) |
| ret <16 x float> %2 |
| } |
| |
| define <16 x float> @stack_fold_fcmaddcph_commute(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { |
| ; CHECK-LABEL: stack_fold_fcmaddcph_commute: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload |
| ; CHECK-NEXT: vfcmaddcph %zmm1, %zmm2, %zmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %a2, <16 x float> %a1, <16 x float> %a0, i16 -1, i32 4) |
| ret <16 x float> %2 |
| } |
| declare <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) |
| |
| define <16 x float> @stack_fold_fcmaddcph_mask(ptr %p, <16 x float> %a1, <16 x float> %a2, i16 %mask) { |
| ; CHECK-LABEL: stack_fold_fcmaddcph_mask: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: kmovd %esi, %k1 |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmovaps (%rdi), %zmm2 |
| ; CHECK-NEXT: vfcmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %zmm2, %zmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %a0 = load <16 x float>, ptr %p |
| %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %a1, <16 x float> %a2, <16 x float> %a0, i16 %mask, i32 4) |
| ret <16 x float> %2 |
| } |
| |
| define <16 x float> @stack_fold_fcmaddcph_maskz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, ptr %mask) { |
| ; CHECK-LABEL: stack_fold_fcmaddcph_maskz: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovw (%rdi), %k1 |
| ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 |
| ; CHECK-NEXT: vfcmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load i16, ptr %mask |
| %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %a1, <16 x float> %a2, <16 x float> zeroinitializer, i16 %2, i32 4) |
| ret <16 x float> %3 |
| } |
| declare <16 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) |
| |
| define <4 x float> @stack_fold_fmulcsh(<4 x float> %a0, <4 x float> %a1) { |
| ; CHECK-LABEL: stack_fold_fmulcsh: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vfmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %xmm2, %xmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4) |
| ret <4 x float> %2 |
| } |
| |
| define <4 x float> @stack_fold_fmulcsh_commute(<4 x float> %a0, <4 x float> %a1) { |
| ; CHECK-LABEL: stack_fold_fmulcsh_commute: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vfmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %xmm2, %xmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %a1, <4 x float> %a0, <4 x float> undef, i8 -1, i32 4) |
| ret <4 x float> %2 |
| } |
| declare <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32) |
| |
| define <4 x float> @stack_fold_fmulcsh_mask(<4 x float> %a0, <4 x float> %a1, ptr %passthru, i8 %mask) { |
| ; CHECK-LABEL: stack_fold_fmulcsh_mask: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: kmovd %esi, %k1 |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmovaps (%rdi), %xmm2 |
| ; CHECK-NEXT: vfmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %xmm2, %xmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load <4 x float>, ptr %passthru |
| %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> %2, i8 %mask, i32 4) |
| ret <4 x float> %3 |
| } |
| |
| define <4 x float> @stack_fold_fmulcsh_maskz(<4 x float> %a0, <4 x float> %a1, ptr %mask) { |
| ; CHECK-LABEL: stack_fold_fmulcsh_maskz: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovb (%rdi), %k1 |
| ; CHECK-NEXT: vfmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} {z} # 16-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %xmm2, %xmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load i8, ptr %mask |
| %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2, i32 4) |
| ret <4 x float> %3 |
| } |
| |
| define <4 x float> @stack_fold_fcmulcsh(<4 x float> %a0, <4 x float> %a1) { |
| ; CHECK-LABEL: stack_fold_fcmulcsh: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vfcmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %xmm2, %xmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4) |
| ret <4 x float> %2 |
| } |
| |
| define <4 x float> @stack_fold_fcmulcsh_commute(<4 x float> %a0, <4 x float> %a1) { |
| ; CHECK-LABEL: stack_fold_fcmulcsh_commute: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload |
| ; CHECK-NEXT: vfcmulcsh %xmm0, %xmm1, %xmm2 |
| ; CHECK-NEXT: vmovaps %xmm2, %xmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %a1, <4 x float> %a0, <4 x float> undef, i8 -1, i32 4) |
| ret <4 x float> %2 |
| } |
| declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32) |
| |
| define <4 x float> @stack_fold_fcmulcsh_mask(<4 x float> %a0, <4 x float> %a1, ptr %passthru, i8 %mask) { |
| ; CHECK-LABEL: stack_fold_fcmulcsh_mask: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: kmovd %esi, %k1 |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmovaps (%rdi), %xmm2 |
| ; CHECK-NEXT: vfcmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %xmm2, %xmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load <4 x float>, ptr %passthru |
| %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> %2, i8 %mask, i32 4) |
| ret <4 x float> %3 |
| } |
| |
| define <4 x float> @stack_fold_fcmulcsh_maskz(<4 x float> %a0, <4 x float> %a1, ptr %mask) { |
| ; CHECK-LABEL: stack_fold_fcmulcsh_maskz: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovb (%rdi), %k1 |
| ; CHECK-NEXT: vfcmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} {z} # 16-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %xmm2, %xmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load i8, ptr %mask |
| %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2, i32 4) |
| ret <4 x float> %3 |
| } |
| |
| define <4 x float> @stack_fold_fmaddcsh(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { |
| ; CHECK-LABEL: stack_fold_fmaddcsh: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vfmaddcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %a1, <4 x float> %a2, <4 x float> %a0, i8 -1, i32 4) |
| ret <4 x float> %2 |
| } |
| |
| define <4 x float> @stack_fold_fmaddcsh_commute(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { |
| ; CHECK-LABEL: stack_fold_fmaddcsh_commute: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vfmaddcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %a2, <4 x float> %a1, <4 x float> %a0, i8 -1, i32 4) |
| ret <4 x float> %2 |
| } |
| declare <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32) |
| |
| define <4 x float> @stack_fold_fmaddcsh_mask(ptr %p, <4 x float> %a1, <4 x float> %a2, i8 %mask) { |
| ; CHECK-LABEL: stack_fold_fmaddcsh_mask: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: kmovd %esi, %k1 |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmovaps (%rdi), %xmm2 |
| ; CHECK-NEXT: vfmaddcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %xmm2, %xmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %a0 = load <4 x float>, ptr %p |
| %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %a1, <4 x float> %a2, <4 x float> %a0, i8 %mask, i32 4) |
| ret <4 x float> %2 |
| } |
| |
| define <4 x float> @stack_fold_fmaddcsh_maskz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, ptr %mask) { |
| ; CHECK-LABEL: stack_fold_fmaddcsh_maskz: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovb (%rdi), %k1 |
| ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 |
| ; CHECK-NEXT: vfmaddcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load i8, ptr %mask |
| %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %a1, <4 x float> %a2, <4 x float> zeroinitializer, i8 %2, i32 4) |
| ret <4 x float> %3 |
| } |
| declare <4 x float> @llvm.x86.avx512fp16.maskz.vfmadd.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32) |
| |
| define <4 x float> @stack_fold_fcmaddcsh(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { |
| ; CHECK-LABEL: stack_fold_fcmaddcsh: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vfcmaddcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %a1, <4 x float> %a2, <4 x float> %a0, i8 -1, i32 4) |
| ret <4 x float> %2 |
| } |
| |
| define <4 x float> @stack_fold_fcmaddcsh_commute(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { |
| ; CHECK-LABEL: stack_fold_fcmaddcsh_commute: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload |
| ; CHECK-NEXT: vfcmaddcsh %xmm1, %xmm2, %xmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %a2, <4 x float> %a1, <4 x float> %a0, i8 -1, i32 4) |
| ret <4 x float> %2 |
| } |
| declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32) |
| |
| define <4 x float> @stack_fold_fcmaddcsh_mask(ptr %p, <4 x float> %a1, <4 x float> %a2, i8 %mask) { |
| ; CHECK-LABEL: stack_fold_fcmaddcsh_mask: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: kmovd %esi, %k1 |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: vmovaps (%rdi), %xmm2 |
| ; CHECK-NEXT: vfcmaddcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload |
| ; CHECK-NEXT: vmovaps %xmm2, %xmm0 |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %a0 = load <4 x float>, ptr %p |
| %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %a1, <4 x float> %a2, <4 x float> %a0, i8 %mask, i32 4) |
| ret <4 x float> %2 |
| } |
| |
| define <4 x float> @stack_fold_fcmaddcsh_maskz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, ptr %mask) { |
| ; CHECK-LABEL: stack_fold_fcmaddcsh_maskz: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill |
| ; CHECK-NEXT: #APP |
| ; CHECK-NEXT: nop |
| ; CHECK-NEXT: #NO_APP |
| ; CHECK-NEXT: kmovb (%rdi), %k1 |
| ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 |
| ; CHECK-NEXT: vfcmaddcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload |
| ; CHECK-NEXT: retq |
| %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() |
| %2 = load i8, ptr %mask |
| %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %a1, <4 x float> %a2, <4 x float> zeroinitializer, i8 %2, i32 4) |
| ret <4 x float> %3 |
| } |
| declare <4 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32) |
| |
| attributes #0 = { "unsafe-fp-math"="false" } |
| attributes #1 = { "unsafe-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" } |