| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc < %s -mtriple=x86_64-- -mattr=+gfni,+avx | FileCheck %s --check-prefixes=AVX |
| ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+gfni,+avx512bw | FileCheck %s --check-prefixes=AVX512 |
| |
| declare <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8>, <16 x i8>, i8) |
| declare <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8>, <32 x i8>, i8) |
| |
| ;; xor of the affine result with a splat constant folds into the affine |
| ;; immediate (the imm8 is the xor-ed constant term of the GF(2) affine map): |
| ;; new imm = 0 ^ 0xFF = 255, so no separate xor instruction is emitted. |
| define <16 x i8> @test_affine_xor_fold_128(<16 x i8> %src1, <16 x i8> %src2) nounwind { |
| ; |
| ; AVX-LABEL: test_affine_xor_fold_128: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vgf2p8affineqb $255, %xmm1, %xmm0, %xmm0 |
| ; AVX-NEXT: retq |
| ; |
| ; AVX512-LABEL: test_affine_xor_fold_128: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vgf2p8affineqb $255, %xmm1, %xmm0, %xmm0 |
| ; AVX512-NEXT: retq |
| %gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 0) |
| %xor = xor <16 x i8> %gfni, splat(i8 -1) |
| ret <16 x i8> %xor |
| } |
| |
| ;; Same fold with a non-zero starting immediate: 5 ^ 0xAA (-86) = 175. |
| define <16 x i8> @test_affine_xor_fold_nonzero_imm(<16 x i8> %src1, <16 x i8> %src2) nounwind { |
| ; |
| ; AVX-LABEL: test_affine_xor_fold_nonzero_imm: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vgf2p8affineqb $175, %xmm1, %xmm0, %xmm0 |
| ; AVX-NEXT: retq |
| ; |
| ; AVX512-LABEL: test_affine_xor_fold_nonzero_imm: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vgf2p8affineqb $175, %xmm1, %xmm0, %xmm0 |
| ; AVX512-NEXT: retq |
| %gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 5) |
| %xor = xor <16 x i8> %gfni, splat(i8 -86) |
| ret <16 x i8> %xor |
| } |
| |
| ;; Another immediate combination: 17 (0x11) ^ 66 (0x42) = 83 (0x53). |
| define <16 x i8> @test_affine_xor_fold_hex(<16 x i8> %src1, <16 x i8> %src2) nounwind { |
| ; |
| ; AVX-LABEL: test_affine_xor_fold_hex: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vgf2p8affineqb $83, %xmm1, %xmm0, %xmm0 |
| ; AVX-NEXT: retq |
| ; |
| ; AVX512-LABEL: test_affine_xor_fold_hex: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vgf2p8affineqb $83, %xmm1, %xmm0, %xmm0 |
| ; AVX512-NEXT: retq |
| %gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 17) |
| %xor = xor <16 x i8> %gfni, splat(i8 66) |
| ret <16 x i8> %xor |
| } |
| |
| ;; 256-bit variant of the fold: 0 ^ 0xFF = 255, single ymm instruction. |
| define <32 x i8> @test_affine_xor_fold_256(<32 x i8> %src1, <32 x i8> %src2) nounwind { |
| ; |
| ; AVX-LABEL: test_affine_xor_fold_256: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vgf2p8affineqb $255, %ymm1, %ymm0, %ymm0 |
| ; AVX-NEXT: retq |
| ; |
| ; AVX512-LABEL: test_affine_xor_fold_256: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vgf2p8affineqb $255, %ymm1, %ymm0, %ymm0 |
| ; AVX512-NEXT: retq |
| %gfni = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> %src1, <32 x i8> %src2, i8 0) |
| %xor = xor <32 x i8> %gfni, splat(i8 -1) |
| ret <32 x i8> %xor |
| } |
| |
| ;; The splat constant appears as the first xor operand here; the fold must |
| ;; handle both operand orders identically (0 ^ 0xFF = 255). |
| define <16 x i8> @test_affine_xor_fold_commutative(<16 x i8> %src1, <16 x i8> %src2) nounwind { |
| ; |
| ; AVX-LABEL: test_affine_xor_fold_commutative: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vgf2p8affineqb $255, %xmm1, %xmm0, %xmm0 |
| ; AVX-NEXT: retq |
| ; |
| ; AVX512-LABEL: test_affine_xor_fold_commutative: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vgf2p8affineqb $255, %xmm1, %xmm0, %xmm0 |
| ; AVX512-NEXT: retq |
| %gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 0) |
| %xor = xor <16 x i8> splat(i8 -1), %gfni |
| ret <16 x i8> %xor |
| } |
| |
| ;; Negative test: the affine result is also stored, so folding the xor into |
| ;; the immediate would change the stored value. The NOT stays a separate op |
| ;; (vpcmpeqd+vpxor with plain AVX, vpternlogq with AVX512). |
| define <16 x i8> @test_affine_xor_no_fold_multi_use(<16 x i8> %src1, <16 x i8> %src2, ptr %out) nounwind { |
| ; |
| ; AVX-LABEL: test_affine_xor_no_fold_multi_use: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0 |
| ; AVX-NEXT: vmovdqa %xmm0, (%rdi) |
| ; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 |
| ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX-NEXT: retq |
| ; |
| ; AVX512-LABEL: test_affine_xor_no_fold_multi_use: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0 |
| ; AVX512-NEXT: vmovdqa %xmm0, (%rdi) |
| ; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 |
| ; AVX512-NEXT: retq |
| %gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 0) |
| store <16 x i8> %gfni, ptr %out |
| %xor = xor <16 x i8> %gfni, splat(i8 -1) |
| ret <16 x i8> %xor |
| } |
| |
| ;; Negative test: a non-splat xor constant cannot be expressed in the single |
| ;; scalar imm8 of the affine instruction, so the xor stays as a constant-pool |
| ;; vpxor on both targets. |
| define <16 x i8> @test_affine_xor_no_fold_non_splat(<16 x i8> %src1, <16 x i8> %src2) nounwind { |
| ; |
| ; AVX-LABEL: test_affine_xor_no_fold_non_splat: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0 |
| ; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 |
| ; AVX-NEXT: retq |
| ; |
| ; AVX512-LABEL: test_affine_xor_no_fold_non_splat: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0 |
| ; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 |
| ; AVX512-NEXT: retq |
| %gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 0) |
| %xor = xor <16 x i8> %gfni, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16> |
| ret <16 x i8> %xor |
| } |
| |
| ;; Negative test: xor with a non-constant vector — nothing to fold into the |
| ;; immediate, so a register vpxor remains. |
| define <16 x i8> @test_affine_xor_no_fold_variable(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %var) nounwind { |
| ; |
| ; AVX-LABEL: test_affine_xor_no_fold_variable: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0 |
| ; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0 |
| ; AVX-NEXT: retq |
| ; |
| ; AVX512-LABEL: test_affine_xor_no_fold_variable: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0 |
| ; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0 |
| ; AVX512-NEXT: retq |
| %gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 0) |
| %xor = xor <16 x i8> %gfni, %var |
| ret <16 x i8> %xor |
| } |
| |
| ;; Test folding XOR of two vgf2p8affineqb with same input - 128-bit |
| ;; With a shared source, the two affine ops combine by xor-ing the matrices |
| ;; and the immediates: one vpxor of the masks plus one affine with |
| ;; imm = 42 ^ 115 = 89. |
| define <16 x i8> @test_affine_affine_xor_fold_128(<16 x i8> %src, <16 x i8> %m1, <16 x i8> %m2) nounwind { |
| ; |
| ; AVX-LABEL: test_affine_affine_xor_fold_128: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 |
| ; AVX-NEXT: vgf2p8affineqb $89, %xmm1, %xmm0, %xmm0 |
| ; AVX-NEXT: retq |
| ; |
| ; AVX512-LABEL: test_affine_affine_xor_fold_128: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vpxor %xmm2, %xmm1, %xmm1 |
| ; AVX512-NEXT: vgf2p8affineqb $89, %xmm1, %xmm0, %xmm0 |
| ; AVX512-NEXT: retq |
| %gfni1 = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src, <16 x i8> %m1, i8 42) |
| %gfni2 = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src, <16 x i8> %m2, i8 115) |
| %xor = xor <16 x i8> %gfni1, %gfni2 |
| ret <16 x i8> %xor |
| } |
| |
| ;; Test with non-zero immediates - 128-bit |
| ;; Combined immediate is the xor of the two: 5 ^ 10 = 15. |
| define <16 x i8> @test_affine_affine_xor_fold_128_nonzero(<16 x i8> %src, <16 x i8> %m1, <16 x i8> %m2) nounwind { |
| ; |
| ; AVX-LABEL: test_affine_affine_xor_fold_128_nonzero: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 |
| ; AVX-NEXT: vgf2p8affineqb $15, %xmm1, %xmm0, %xmm0 |
| ; AVX-NEXT: retq |
| ; |
| ; AVX512-LABEL: test_affine_affine_xor_fold_128_nonzero: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vpxor %xmm2, %xmm1, %xmm1 |
| ; AVX512-NEXT: vgf2p8affineqb $15, %xmm1, %xmm0, %xmm0 |
| ; AVX512-NEXT: retq |
| %gfni1 = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src, <16 x i8> %m1, i8 5) |
| %gfni2 = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src, <16 x i8> %m2, i8 10) |
| %xor = xor <16 x i8> %gfni1, %gfni2 |
| ret <16 x i8> %xor |
| } |
| |
| ;; Test commutative XOR - 128-bit |
| ;; Same fold with the IR xor operands swapped (gfni2 first): the mask vpxor |
| ;; operand order differs but the immediate is still 199 ^ 97 = 166. |
| define <16 x i8> @test_affine_affine_xor_fold_128_commutative(<16 x i8> %src, <16 x i8> %m1, <16 x i8> %m2) nounwind { |
| ; |
| ; AVX-LABEL: test_affine_affine_xor_fold_128_commutative: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vpxor %xmm1, %xmm2, %xmm1 |
| ; AVX-NEXT: vgf2p8affineqb $166, %xmm1, %xmm0, %xmm0 |
| ; AVX-NEXT: retq |
| ; |
| ; AVX512-LABEL: test_affine_affine_xor_fold_128_commutative: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vpxor %xmm1, %xmm2, %xmm1 |
| ; AVX512-NEXT: vgf2p8affineqb $166, %xmm1, %xmm0, %xmm0 |
| ; AVX512-NEXT: retq |
| %gfni1 = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src, <16 x i8> %m1, i8 199) |
| %gfni2 = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src, <16 x i8> %m2, i8 97) |
| %xor = xor <16 x i8> %gfni2, %gfni1 |
| ret <16 x i8> %xor |
| } |
| |
| ;; Negative test: multi-use should not fold - 128-bit |
| ;; Negative test: gfni1 is also stored, so combining the two affine ops would |
| ;; change the stored value — both instructions and the vpxor must remain. |
| define <16 x i8> @test_affine_affine_xor_no_fold_multi_use(<16 x i8> %src, <16 x i8> %m1, <16 x i8> %m2, ptr %out) nounwind { |
| ; |
| ; AVX-LABEL: test_affine_affine_xor_no_fold_multi_use: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vgf2p8affineqb $23, %xmm1, %xmm0, %xmm1 |
| ; AVX-NEXT: vgf2p8affineqb $200, %xmm2, %xmm0, %xmm0 |
| ; AVX-NEXT: vmovdqa %xmm1, (%rdi) |
| ; AVX-NEXT: vpxor %xmm0, %xmm1, %xmm0 |
| ; AVX-NEXT: retq |
| ; |
| ; AVX512-LABEL: test_affine_affine_xor_no_fold_multi_use: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vgf2p8affineqb $23, %xmm1, %xmm0, %xmm1 |
| ; AVX512-NEXT: vgf2p8affineqb $200, %xmm2, %xmm0, %xmm0 |
| ; AVX512-NEXT: vmovdqa %xmm1, (%rdi) |
| ; AVX512-NEXT: vpxor %xmm0, %xmm1, %xmm0 |
| ; AVX512-NEXT: retq |
| %gfni1 = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src, <16 x i8> %m1, i8 23) |
| %gfni2 = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src, <16 x i8> %m2, i8 200) |
| store <16 x i8> %gfni1, ptr %out |
| %xor = xor <16 x i8> %gfni1, %gfni2 |
| ret <16 x i8> %xor |
| } |
| |
| ;; Negative test: different inputs should not fold - 128-bit |
| ;; Negative test: the two affine ops have different source vectors, so their |
| ;; matrices cannot be combined; both instructions and the vpxor remain. |
| define <16 x i8> @test_affine_affine_xor_no_fold_different_inputs(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %m1, <16 x i8> %m2) nounwind { |
| ; |
| ; AVX-LABEL: test_affine_affine_xor_no_fold_different_inputs: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vgf2p8affineqb $55, %xmm2, %xmm0, %xmm0 |
| ; AVX-NEXT: vgf2p8affineqb $77, %xmm3, %xmm1, %xmm1 |
| ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX-NEXT: retq |
| ; |
| ; AVX512-LABEL: test_affine_affine_xor_no_fold_different_inputs: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vgf2p8affineqb $55, %xmm2, %xmm0, %xmm0 |
| ; AVX512-NEXT: vgf2p8affineqb $77, %xmm3, %xmm1, %xmm1 |
| ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 |
| ; AVX512-NEXT: retq |
| %gfni1 = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %m1, i8 55) |
| %gfni2 = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src2, <16 x i8> %m2, i8 77) |
| %xor = xor <16 x i8> %gfni1, %gfni2 |
| ret <16 x i8> %xor |
| } |
| |
| ;; Test 256-bit vectors |
| ;; 256-bit version of the combine; imm = 42 ^ 115 = 89. The AVX run line has |
| ;; only +avx (no AVX2), so the 256-bit mask xor lowers to float-domain vxorps, |
| ;; while the AVX512 run uses integer vpxor. |
| define <32 x i8> @test_affine_affine_xor_fold_256(<32 x i8> %src, <32 x i8> %m1, <32 x i8> %m2) nounwind { |
| ; |
| ; AVX-LABEL: test_affine_affine_xor_fold_256: |
| ; AVX: # %bb.0: |
| ; AVX-NEXT: vxorps %ymm2, %ymm1, %ymm1 |
| ; AVX-NEXT: vgf2p8affineqb $89, %ymm1, %ymm0, %ymm0 |
| ; AVX-NEXT: retq |
| ; |
| ; AVX512-LABEL: test_affine_affine_xor_fold_256: |
| ; AVX512: # %bb.0: |
| ; AVX512-NEXT: vpxor %ymm2, %ymm1, %ymm1 |
| ; AVX512-NEXT: vgf2p8affineqb $89, %ymm1, %ymm0, %ymm0 |
| ; AVX512-NEXT: retq |
| %gfni1 = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> %src, <32 x i8> %m1, i8 42) |
| %gfni2 = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> %src, <32 x i8> %m2, i8 115) |
| %xor = xor <32 x i8> %gfni1, %gfni2 |
| ret <32 x i8> %xor |
| } |