; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+gfni,+avx | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+gfni,+avx512bw | FileCheck %s --check-prefixes=AVX512
declare <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8>, <16 x i8>, i8)
declare <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8>, <32 x i8>, i8)
;; XOR with splat(-1) (vector NOT) folds into the affine immediate:
;; since vgf2p8affineqb XORs its broadcast imm8 into the result,
;; imm8 0x00 ^ 0xFF = 0xFF (255) — a single instruction, no separate vpxor.
define <16 x i8> @test_affine_xor_fold_128(<16 x i8> %src1, <16 x i8> %src2) nounwind {
;
; AVX-LABEL: test_affine_xor_fold_128:
; AVX: # %bb.0:
; AVX-NEXT: vgf2p8affineqb $255, %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_affine_xor_fold_128:
; AVX512: # %bb.0:
; AVX512-NEXT: vgf2p8affineqb $255, %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 0)
%xor = xor <16 x i8> %gfni, splat(i8 -1)
ret <16 x i8> %xor
}
;; Same fold with a non-zero starting immediate: the splat XOR constant is
;; merged into the imm8 — 0x05 ^ 0xAA (splat -86) = 0xAF (175).
define <16 x i8> @test_affine_xor_fold_nonzero_imm(<16 x i8> %src1, <16 x i8> %src2) nounwind {
;
; AVX-LABEL: test_affine_xor_fold_nonzero_imm:
; AVX: # %bb.0:
; AVX-NEXT: vgf2p8affineqb $175, %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_affine_xor_fold_nonzero_imm:
; AVX512: # %bb.0:
; AVX512-NEXT: vgf2p8affineqb $175, %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 5)
%xor = xor <16 x i8> %gfni, splat(i8 -86)
ret <16 x i8> %xor
}
;; Fold with arbitrary non-trivial values: imm8 0x11 (17) ^ 0x42 (splat 66)
;; = 0x53 (83).
define <16 x i8> @test_affine_xor_fold_hex(<16 x i8> %src1, <16 x i8> %src2) nounwind {
;
; AVX-LABEL: test_affine_xor_fold_hex:
; AVX: # %bb.0:
; AVX-NEXT: vgf2p8affineqb $83, %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_affine_xor_fold_hex:
; AVX512: # %bb.0:
; AVX512-NEXT: vgf2p8affineqb $83, %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 17)
%xor = xor <16 x i8> %gfni, splat(i8 66)
ret <16 x i8> %xor
}
;; 256-bit variant of the NOT fold: imm8 0x00 ^ 0xFF = 255 on ymm registers.
define <32 x i8> @test_affine_xor_fold_256(<32 x i8> %src1, <32 x i8> %src2) nounwind {
;
; AVX-LABEL: test_affine_xor_fold_256:
; AVX: # %bb.0:
; AVX-NEXT: vgf2p8affineqb $255, %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_affine_xor_fold_256:
; AVX512: # %bb.0:
; AVX512-NEXT: vgf2p8affineqb $255, %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%gfni = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> %src1, <32 x i8> %src2, i8 0)
%xor = xor <32 x i8> %gfni, splat(i8 -1)
ret <32 x i8> %xor
}
;; Same as test_affine_xor_fold_128 but with the xor operands swapped
;; (constant first) — the fold must handle the commuted form too.
define <16 x i8> @test_affine_xor_fold_commutative(<16 x i8> %src1, <16 x i8> %src2) nounwind {
;
; AVX-LABEL: test_affine_xor_fold_commutative:
; AVX: # %bb.0:
; AVX-NEXT: vgf2p8affineqb $255, %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_affine_xor_fold_commutative:
; AVX512: # %bb.0:
; AVX512-NEXT: vgf2p8affineqb $255, %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 0)
%xor = xor <16 x i8> splat(i8 -1), %gfni
ret <16 x i8> %xor
}
;; Negative test: the intrinsic result is also stored to %out, so merging the
;; XOR into the immediate would change the stored value — the affine op must
;; keep imm $0 and the NOT stays separate (vpxor on AVX, vpternlogq on AVX512).
define <16 x i8> @test_affine_xor_no_fold_multi_use(<16 x i8> %src1, <16 x i8> %src2, ptr %out) nounwind {
;
; AVX-LABEL: test_affine_xor_no_fold_multi_use:
; AVX: # %bb.0:
; AVX-NEXT: vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rdi)
; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_affine_xor_no_fold_multi_use:
; AVX512: # %bb.0:
; AVX512-NEXT: vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa %xmm0, (%rdi)
; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0
; AVX512-NEXT: retq
%gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 0)
store <16 x i8> %gfni, ptr %out
%xor = xor <16 x i8> %gfni, splat(i8 -1)
ret <16 x i8> %xor
}
;; Negative test: a non-splat XOR constant cannot be represented by the scalar
;; imm8 (which is broadcast to every byte), so the vpxor with a constant-pool
;; vector must remain.
define <16 x i8> @test_affine_xor_no_fold_non_splat(<16 x i8> %src1, <16 x i8> %src2) nounwind {
;
; AVX-LABEL: test_affine_xor_no_fold_non_splat:
; AVX: # %bb.0:
; AVX-NEXT: vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_affine_xor_no_fold_non_splat:
; AVX512: # %bb.0:
; AVX512-NEXT: vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
%gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 0)
%xor = xor <16 x i8> %gfni, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16>
ret <16 x i8> %xor
}
;; Negative test: XOR with a non-constant operand (%var) cannot be folded into
;; the immediate — the separate vpxor must survive.
define <16 x i8> @test_affine_xor_no_fold_variable(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %var) nounwind {
;
; AVX-LABEL: test_affine_xor_no_fold_variable:
; AVX: # %bb.0:
; AVX-NEXT: vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_affine_xor_no_fold_variable:
; AVX512: # %bb.0:
; AVX512-NEXT: vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX512-NEXT: retq
%gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 0)
%xor = xor <16 x i8> %gfni, %var
ret <16 x i8> %xor
}
;; Test folding XOR of two vgf2p8affineqb with same input - 128-bit
;; The affine transform is linear, so (A1*x ^ b1) ^ (A2*x ^ b2) becomes a
;; single op with the matrices XORed (the vpxor on %xmm1/%xmm2) and the
;; immediates XORed: 42 ^ 115 = 89.
define <16 x i8> @test_affine_affine_xor_fold_128(<16 x i8> %src, <16 x i8> %m1, <16 x i8> %m2) nounwind {
;
; AVX-LABEL: test_affine_affine_xor_fold_128:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX-NEXT: vgf2p8affineqb $89, %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_affine_affine_xor_fold_128:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vgf2p8affineqb $89, %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%gfni1 = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src, <16 x i8> %m1, i8 42)
%gfni2 = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src, <16 x i8> %m2, i8 115)
%xor = xor <16 x i8> %gfni1, %gfni2
ret <16 x i8> %xor
}
;; Test with non-zero immediates - 128-bit
;; Combined immediate: 5 ^ 10 = 15.
define <16 x i8> @test_affine_affine_xor_fold_128_nonzero(<16 x i8> %src, <16 x i8> %m1, <16 x i8> %m2) nounwind {
;
; AVX-LABEL: test_affine_affine_xor_fold_128_nonzero:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX-NEXT: vgf2p8affineqb $15, %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_affine_affine_xor_fold_128_nonzero:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vgf2p8affineqb $15, %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%gfni1 = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src, <16 x i8> %m1, i8 5)
%gfni2 = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src, <16 x i8> %m2, i8 10)
%xor = xor <16 x i8> %gfni1, %gfni2
ret <16 x i8> %xor
}
;; Test commutative XOR - 128-bit
;; xor operands are reversed (%gfni2, %gfni1); fold still applies with
;; combined immediate 199 ^ 97 = 166.
define <16 x i8> @test_affine_affine_xor_fold_128_commutative(<16 x i8> %src, <16 x i8> %m1, <16 x i8> %m2) nounwind {
;
; AVX-LABEL: test_affine_affine_xor_fold_128_commutative:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm2, %xmm1
; AVX-NEXT: vgf2p8affineqb $166, %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_affine_affine_xor_fold_128_commutative:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm1, %xmm2, %xmm1
; AVX512-NEXT: vgf2p8affineqb $166, %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%gfni1 = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src, <16 x i8> %m1, i8 199)
%gfni2 = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src, <16 x i8> %m2, i8 97)
%xor = xor <16 x i8> %gfni2, %gfni1
ret <16 x i8> %xor
}
;; Negative test: multi-use should not fold - 128-bit
;; %gfni1 is stored to %out, so both affine ops must be emitted with their
;; original immediates ($23, $200) and the XOR stays separate.
define <16 x i8> @test_affine_affine_xor_no_fold_multi_use(<16 x i8> %src, <16 x i8> %m1, <16 x i8> %m2, ptr %out) nounwind {
;
; AVX-LABEL: test_affine_affine_xor_no_fold_multi_use:
; AVX: # %bb.0:
; AVX-NEXT: vgf2p8affineqb $23, %xmm1, %xmm0, %xmm1
; AVX-NEXT: vgf2p8affineqb $200, %xmm2, %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm1, (%rdi)
; AVX-NEXT: vpxor %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_affine_affine_xor_no_fold_multi_use:
; AVX512: # %bb.0:
; AVX512-NEXT: vgf2p8affineqb $23, %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vgf2p8affineqb $200, %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
; AVX512-NEXT: vpxor %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
%gfni1 = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src, <16 x i8> %m1, i8 23)
%gfni2 = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src, <16 x i8> %m2, i8 200)
store <16 x i8> %gfni1, ptr %out
%xor = xor <16 x i8> %gfni1, %gfni2
ret <16 x i8> %xor
}
;; Negative test: different inputs should not fold - 128-bit
;; The two affine ops transform different sources (%src1 vs %src2), so the
;; linearity-based combine does not apply; both ops and the vpxor remain.
define <16 x i8> @test_affine_affine_xor_no_fold_different_inputs(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %m1, <16 x i8> %m2) nounwind {
;
; AVX-LABEL: test_affine_affine_xor_no_fold_different_inputs:
; AVX: # %bb.0:
; AVX-NEXT: vgf2p8affineqb $55, %xmm2, %xmm0, %xmm0
; AVX-NEXT: vgf2p8affineqb $77, %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_affine_affine_xor_no_fold_different_inputs:
; AVX512: # %bb.0:
; AVX512-NEXT: vgf2p8affineqb $55, %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vgf2p8affineqb $77, %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%gfni1 = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %m1, i8 55)
%gfni2 = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src2, <16 x i8> %m2, i8 77)
%xor = xor <16 x i8> %gfni1, %gfni2
ret <16 x i8> %xor
}
;; Test 256-bit vectors
;; Same matrix-XOR fold as the 128-bit case (imm 42 ^ 115 = 89) on ymm.
;; Note the AVX target selects vxorps for the 256-bit matrix XOR while
;; AVX512 selects vpxor.
define <32 x i8> @test_affine_affine_xor_fold_256(<32 x i8> %src, <32 x i8> %m1, <32 x i8> %m2) nounwind {
;
; AVX-LABEL: test_affine_affine_xor_fold_256:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %ymm2, %ymm1, %ymm1
; AVX-NEXT: vgf2p8affineqb $89, %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_affine_affine_xor_fold_256:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %ymm2, %ymm1, %ymm1
; AVX512-NEXT: vgf2p8affineqb $89, %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%gfni1 = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> %src, <32 x i8> %m1, i8 42)
%gfni2 = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> %src, <32 x i8> %m2, i8 115)
%xor = xor <32 x i8> %gfni1, %gfni2
ret <32 x i8> %xor
}