[X86] Improve variable 8-bit shifts on AVX512BW (#164136)

Previously, `clang -march=znver5 -O3` would emit the following for
`shl`, `lshr` and `ashr` on `<64 x i8>` operands:
```asm
.LCPI0_2:
	.byte	8
	.byte	4
	.byte	2
	.byte	1
	.byte	0
	.byte	0
	.byte	0
	.byte	0
.LCPI0_3:
	.byte	32
	.byte	16
	.byte	8
	.byte	4
	.byte	2
	.byte	1
	.byte	0
	.byte	0
shl:
	vpsllw	zmm1, zmm1, 5
	vpmovb2m	k1, zmm1
	vpaddb	zmm1, zmm1, zmm1
	vgf2p8affineqb	zmm0 {k1}, zmm0, qword ptr [rip + .LCPI0_2]{1to8}, 0
	vpmovb2m	k1, zmm1
	vpaddb	zmm1, zmm1, zmm1
	vgf2p8affineqb	zmm0 {k1}, zmm0, qword ptr [rip + .LCPI0_3]{1to8}, 0
	vpmovb2m	k1, zmm1
	vpaddb	zmm0 {k1}, zmm0, zmm0
	ret

.LCPI1_3:
	.byte	0
	.byte	0
	.byte	0
	.byte	0
	.byte	128
	.byte	64
	.byte	32
	.byte	16
.LCPI1_4:
	.byte	0
	.byte	0
	.byte	128
	.byte	64
	.byte	32
	.byte	16
	.byte	8
	.byte	4
.LCPI1_5:
	.byte	0
	.byte	128
	.byte	64
	.byte	32
	.byte	16
	.byte	8
	.byte	4
	.byte	2
lshr:
	vpsllw	zmm1, zmm1, 5
	vpmovb2m	k1, zmm1
	vpaddb	zmm1, zmm1, zmm1
	vgf2p8affineqb	zmm0 {k1}, zmm0, qword ptr [rip + .LCPI1_3]{1to8}, 0
	vpmovb2m	k1, zmm1
	vpaddb	zmm1, zmm1, zmm1
	vgf2p8affineqb	zmm0 {k1}, zmm0, qword ptr [rip + .LCPI1_4]{1to8}, 0
	vpmovb2m	k1, zmm1
	vgf2p8affineqb	zmm0 {k1}, zmm0, qword ptr [rip + .LCPI1_5]{1to8}, 0
	ret

ashr:
	vpsllw	zmm1, zmm1, 5
	vpunpckhbw	zmm2, zmm0, zmm0
	vpunpckhbw	zmm4, zmm1, zmm1
	vpsraw	zmm3, zmm2, 4
	vpunpcklbw	zmm0, zmm0, zmm0
	vpmovb2m	k1, zmm4
	vpaddw	zmm4, zmm4, zmm4
	vpunpcklbw	zmm1, zmm1, zmm1
	vmovdqu8	zmm2 {k1}, zmm3
	vpmovb2m	k1, zmm4
	vpsraw	zmm3, zmm2, 2
	vpaddw	zmm4, zmm4, zmm4
	vmovdqu8	zmm2 {k1}, zmm3
	vpsraw	zmm3, zmm2, 1
	vpmovb2m	k1, zmm4
	vmovdqu8	zmm2 {k1}, zmm3
	vpmovb2m	k1, zmm1
	vpsraw	zmm3, zmm0, 4
	vpaddw	zmm1, zmm1, zmm1
	vpsrlw	zmm2, zmm2, 8
	vmovdqu8	zmm0 {k1}, zmm3
	vpmovb2m	k1, zmm1
	vpsraw	zmm3, zmm0, 2
	vpaddw	zmm1, zmm1, zmm1
	vmovdqu8	zmm0 {k1}, zmm3
	vpsraw	zmm3, zmm0, 1
	vpmovb2m	k1, zmm1
	vmovdqu8	zmm0 {k1}, zmm3
	vpsrlw	zmm0, zmm0, 8
	vpackuswb	zmm0, zmm0, zmm2
	ret
```

With this commit, the generated assembly becomes this:

```asm
.LCPI0_2:
	.byte	0
	.byte	255
	.byte	0
	.byte	255
.LCPI0_3:
	.byte	255
	.byte	0
	.byte	255
	.byte	0
shl:
	vpsrlw	zmm2, zmm1, 8
	vpandd	zmm3, zmm0, dword ptr [rip + .LCPI0_2]{1to16}
	vpandd	zmm1, zmm1, dword ptr [rip + .LCPI0_3]{1to16}
	movabs	rax, -6148914691236517206
	kmovq	k1, rax
	vpsllvw	zmm2, zmm3, zmm2
	vpsllvw	zmm0, zmm0, zmm1
	vmovdqu8	zmm0 {k1}, zmm2
	ret

.LCPI1_0:
	.byte	255
	.byte	0
lshr:
	vpbroadcastw	zmm2, word ptr [rip + .LCPI1_0]
	movabs	rax, -6148914691236517206
	kmovq	k1, rax
	vpandq	zmm3, zmm1, zmm2
	vpandq	zmm2, zmm0, zmm2
	vpsrlw	zmm1, zmm1, 8
	vpsrlvw	zmm2, zmm2, zmm3
	vpsrlvw	zmm0, zmm0, zmm1
	vmovdqu8	zmm2 {k1}, zmm0
	vmovdqa64	zmm0, zmm2
	ret

.LCPI2_1:
	.byte	255
	.byte	0
	.byte	255
	.byte	0
ashr:
	vpsrlw	zmm2, zmm1, 8
	vpandd	zmm1, zmm1, dword ptr [rip + .LCPI2_1]{1to16}
	movabs	rax, -6148914691236517206
	vpsravw	zmm2, zmm0, zmm2
	vpsllw	zmm0, zmm0, 8
	kmovq	k1, rax
	vpsraw	zmm0, zmm0, 8
	vpsravw	zmm0, zmm0, zmm1
	vmovdqu8	zmm0 {k1}, zmm2
	ret
```

While I don't have AVX512 hardware, llvm-mca suggests significant
speedups, and I've done some simple correctness tests on random inputs
using the Intel Software Development Emulator.
5 files changed
tree: 89b5e04a89eb404b248f6f3a02e85464846b0200
  1. .ci/
  2. .github/
  3. bolt/
  4. clang/
  5. clang-tools-extra/
  6. cmake/
  7. compiler-rt/
  8. cross-project-tests/
  9. flang/
  10. flang-rt/
  11. libc/
  12. libclc/
  13. libcxx/
  14. libcxxabi/
  15. libsycl/
  16. libunwind/
  17. lld/
  18. lldb/
  19. llvm/
  20. llvm-libgcc/
  21. mlir/
  22. offload/
  23. openmp/
  24. orc-rt/
  25. polly/
  26. runtimes/
  27. third-party/
  28. utils/
  29. .clang-format
  30. .clang-format-ignore
  31. .clang-tidy
  32. .git-blame-ignore-revs
  33. .gitattributes
  34. .gitignore
  35. .mailmap
  36. CODE_OF_CONDUCT.md
  37. CONTRIBUTING.md
  38. LICENSE.TXT
  39. pyproject.toml
  40. README.md
  41. SECURITY.md
README.md

The LLVM Compiler Infrastructure

Badges: OpenSSF Scorecard, OpenSSF Best Practices, libc++

Welcome to the LLVM project!

This repository contains the source code for LLVM, a toolkit for the construction of highly optimized compilers, optimizers, and run-time environments.

The LLVM project has multiple components. The core of the project is itself called “LLVM”. This contains all of the tools, libraries, and header files needed to process intermediate representations and convert them into object files. Tools include an assembler, disassembler, bitcode analyzer, and bitcode optimizer.

C-like languages use the Clang frontend. This component compiles C, C++, Objective-C, and Objective-C++ code into LLVM bitcode -- and from there into object files, using LLVM.

Other components include: the libc++ C++ standard library, the LLD linker, and more.

Getting the Source Code and Building LLVM

Consult the Getting Started with LLVM page for information on building and running LLVM.

For information on how to contribute to the LLVM project, please take a look at the Contributing to LLVM guide.

Getting in touch

Join the LLVM Discourse forums, Discord chat, LLVM Office Hours, or regular sync-ups.

The LLVM project has adopted a code of conduct for participants to all modes of communication within the project.