| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 |
| ; Codegen test for chains of insertelement that use runtime (non-constant) |
| ; indices, compiled for the nvptx64 target. The first invocation below pipes |
| ; the llc output for sm_20 into FileCheck; the second one, guarded by the |
| ; ptxas feature, pipes the same llc output into the ptxas verification step. |
| ; RUN: llc < %s -mcpu=sm_20 | FileCheck %s |
| ; RUN: %if ptxas %{ llc < %s -mcpu=sm_20 | %ptxas-verify %} |
| target triple = "nvptx64-nvidia-cuda" |
| |
| ; Test dynamic insertelt at the beginning of a chain. |
| ; Only the first insert uses the runtime index %idx; the inserts into lanes 1 |
| ; and 2 use constant indices. The expected PTX masks the index with 3, scales |
| ; it by 4 (shl 2), stores 10 through the resulting stack address, reloads only |
| ; lanes 0 and 3, and returns lanes 1 and 2 as the immediates 20 and 30. |
| define <4 x i32> @dynamic_at_beginning(i32 %idx) { |
| ; CHECK-LABEL: dynamic_at_beginning( |
| ; CHECK: { |
| ; CHECK-NEXT: .local .align 4 .b8 __local_depot0[16]; |
| ; CHECK-NEXT: .reg .b64 %SP; |
| ; CHECK-NEXT: .reg .b64 %SPL; |
| ; CHECK-NEXT: .reg .b32 %r<3>; |
| ; CHECK-NEXT: .reg .b64 %rd<6>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: mov.b64 %SPL, __local_depot0; |
| ; CHECK-NEXT: cvta.local.u64 %SP, %SPL; |
| ; CHECK-NEXT: ld.param.b32 %rd1, [dynamic_at_beginning_param_0]; |
| ; CHECK-NEXT: and.b64 %rd2, %rd1, 3; |
| ; CHECK-NEXT: shl.b64 %rd3, %rd2, 2; |
| ; CHECK-NEXT: add.u64 %rd4, %SP, 0; |
| ; CHECK-NEXT: add.s64 %rd5, %rd4, %rd3; |
| ; CHECK-NEXT: st.b32 [%rd5], 10; |
| ; CHECK-NEXT: ld.b32 %r1, [%SP+12]; |
| ; CHECK-NEXT: ld.b32 %r2, [%SP]; |
| ; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r2, 20, 30, %r1}; |
| ; CHECK-NEXT: ret; |
| ; The runtime-indexed insert into a poison vector starts the chain. |
| %v0 = insertelement <4 x i32> poison, i32 10, i32 %idx |
| ; Constant-index inserts layered on top of the dynamic one. |
| %v1 = insertelement <4 x i32> %v0, i32 20, i32 1 |
| %v2 = insertelement <4 x i32> %v1, i32 30, i32 2 |
| ret <4 x i32> %v2 |
| } |
| |
| ; Test dynamic insertelt as the last link of a chain. |
| ; Lanes 0 and 1 are written with constant indices first, then 30 is inserted |
| ; at the runtime index %idx. The expected PTX stores 20 to [%SP+4] and 10 to |
| ; [%SP], then stores 30 through the address computed from the masked index, |
| ; and finally reloads all four lanes from the stack slot for the return value. |
| define <4 x i32> @dynamic_at_end(i32 %idx) { |
| ; CHECK-LABEL: dynamic_at_end( |
| ; CHECK: { |
| ; CHECK-NEXT: .local .align 4 .b8 __local_depot1[16]; |
| ; CHECK-NEXT: .reg .b64 %SP; |
| ; CHECK-NEXT: .reg .b64 %SPL; |
| ; CHECK-NEXT: .reg .b32 %r<5>; |
| ; CHECK-NEXT: .reg .b64 %rd<6>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: mov.b64 %SPL, __local_depot1; |
| ; CHECK-NEXT: cvta.local.u64 %SP, %SPL; |
| ; CHECK-NEXT: ld.param.b32 %rd1, [dynamic_at_end_param_0]; |
| ; CHECK-NEXT: and.b64 %rd2, %rd1, 3; |
| ; CHECK-NEXT: shl.b64 %rd3, %rd2, 2; |
| ; CHECK-NEXT: add.u64 %rd4, %SP, 0; |
| ; CHECK-NEXT: add.s64 %rd5, %rd4, %rd3; |
| ; CHECK-NEXT: st.b32 [%SP+4], 20; |
| ; CHECK-NEXT: st.b32 [%SP], 10; |
| ; CHECK-NEXT: st.b32 [%rd5], 30; |
| ; CHECK-NEXT: ld.b32 %r1, [%SP+12]; |
| ; CHECK-NEXT: ld.b32 %r2, [%SP+8]; |
| ; CHECK-NEXT: ld.b32 %r3, [%SP+4]; |
| ; CHECK-NEXT: ld.b32 %r4, [%SP]; |
| ; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r4, %r3, %r2, %r1}; |
| ; CHECK-NEXT: ret; |
| ; Constant-index inserts into lanes 0 and 1. |
| %v0 = insertelement <4 x i32> poison, i32 10, i32 0 |
| %v1 = insertelement <4 x i32> %v0, i32 20, i32 1 |
| ; The runtime-indexed insert comes last and may overwrite either lane above. |
| %v2 = insertelement <4 x i32> %v1, i32 30, i32 %idx |
| ret <4 x i32> %v2 |
| } |
| |
| ; Test dynamic insertelt in the middle of a chain. |
| ; A constant-index insert into lane 0 precedes the runtime-indexed insert of |
| ; 20, and a constant-index insert into lane 2 follows it. The expected PTX |
| ; stores 10 to [%SP], stores 20 through the computed address, reloads lanes |
| ; 0, 1 and 3, and returns lane 2 as the immediate 30. |
| define <4 x i32> @dynamic_in_middle(i32 %idx) { |
| ; CHECK-LABEL: dynamic_in_middle( |
| ; CHECK: { |
| ; CHECK-NEXT: .local .align 4 .b8 __local_depot2[16]; |
| ; CHECK-NEXT: .reg .b64 %SP; |
| ; CHECK-NEXT: .reg .b64 %SPL; |
| ; CHECK-NEXT: .reg .b32 %r<4>; |
| ; CHECK-NEXT: .reg .b64 %rd<6>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: mov.b64 %SPL, __local_depot2; |
| ; CHECK-NEXT: cvta.local.u64 %SP, %SPL; |
| ; CHECK-NEXT: ld.param.b32 %rd1, [dynamic_in_middle_param_0]; |
| ; CHECK-NEXT: and.b64 %rd2, %rd1, 3; |
| ; CHECK-NEXT: shl.b64 %rd3, %rd2, 2; |
| ; CHECK-NEXT: add.u64 %rd4, %SP, 0; |
| ; CHECK-NEXT: add.s64 %rd5, %rd4, %rd3; |
| ; CHECK-NEXT: st.b32 [%SP], 10; |
| ; CHECK-NEXT: st.b32 [%rd5], 20; |
| ; CHECK-NEXT: ld.b32 %r1, [%SP+12]; |
| ; CHECK-NEXT: ld.b32 %r2, [%SP+4]; |
| ; CHECK-NEXT: ld.b32 %r3, [%SP]; |
| ; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r3, %r2, 30, %r1}; |
| ; CHECK-NEXT: ret; |
| ; Constant-index insert before the dynamic one. |
| %v0 = insertelement <4 x i32> poison, i32 10, i32 0 |
| ; Runtime-indexed insert sandwiched between two constant-index inserts. |
| %v1 = insertelement <4 x i32> %v0, i32 20, i32 %idx |
| ; Constant-index insert after the dynamic one. |
| %v2 = insertelement <4 x i32> %v1, i32 30, i32 2 |
| ret <4 x i32> %v2 |
| } |
| |
| ; Test repeated dynamic insertelt with the same index. |
| ; Both inserts target the same runtime index %idx, so the second value (20) |
| ; replaces the first (10). The expected PTX contains a single store of 20 |
| ; through the computed address and no store of 10, followed by reloads of all |
| ; four lanes. |
| define <4 x i32> @repeated_same_index(i32 %idx) { |
| ; CHECK-LABEL: repeated_same_index( |
| ; CHECK: { |
| ; CHECK-NEXT: .local .align 4 .b8 __local_depot3[16]; |
| ; CHECK-NEXT: .reg .b64 %SP; |
| ; CHECK-NEXT: .reg .b64 %SPL; |
| ; CHECK-NEXT: .reg .b32 %r<5>; |
| ; CHECK-NEXT: .reg .b64 %rd<6>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: mov.b64 %SPL, __local_depot3; |
| ; CHECK-NEXT: cvta.local.u64 %SP, %SPL; |
| ; CHECK-NEXT: ld.param.b32 %rd1, [repeated_same_index_param_0]; |
| ; CHECK-NEXT: and.b64 %rd2, %rd1, 3; |
| ; CHECK-NEXT: shl.b64 %rd3, %rd2, 2; |
| ; CHECK-NEXT: add.u64 %rd4, %SP, 0; |
| ; CHECK-NEXT: add.s64 %rd5, %rd4, %rd3; |
| ; CHECK-NEXT: st.b32 [%rd5], 20; |
| ; CHECK-NEXT: ld.b32 %r1, [%SP+12]; |
| ; CHECK-NEXT: ld.b32 %r2, [%SP+8]; |
| ; CHECK-NEXT: ld.b32 %r3, [%SP+4]; |
| ; CHECK-NEXT: ld.b32 %r4, [%SP]; |
| ; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r4, %r3, %r2, %r1}; |
| ; CHECK-NEXT: ret; |
| ; Two inserts at the identical runtime index; the later one wins. |
| %v0 = insertelement <4 x i32> poison, i32 10, i32 %idx |
| %v1 = insertelement <4 x i32> %v0, i32 20, i32 %idx |
| ret <4 x i32> %v1 |
| } |
| |
| ; Test multiple dynamic insertelts. |
| ; The two inserts use different runtime indices (%idx0, %idx1). The expected |
| ; PTX computes one address per index from the shared stack base %rd4, stores |
| ; 10 and then 20 in IR order, and reloads all four lanes for the return value. |
| define <4 x i32> @multiple_dynamic(i32 %idx0, i32 %idx1) { |
| ; CHECK-LABEL: multiple_dynamic( |
| ; CHECK: { |
| ; CHECK-NEXT: .local .align 4 .b8 __local_depot4[16]; |
| ; CHECK-NEXT: .reg .b64 %SP; |
| ; CHECK-NEXT: .reg .b64 %SPL; |
| ; CHECK-NEXT: .reg .b32 %r<5>; |
| ; CHECK-NEXT: .reg .b64 %rd<10>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: mov.b64 %SPL, __local_depot4; |
| ; CHECK-NEXT: cvta.local.u64 %SP, %SPL; |
| ; CHECK-NEXT: ld.param.b32 %rd1, [multiple_dynamic_param_0]; |
| ; CHECK-NEXT: and.b64 %rd2, %rd1, 3; |
| ; CHECK-NEXT: shl.b64 %rd3, %rd2, 2; |
| ; CHECK-NEXT: add.u64 %rd4, %SP, 0; |
| ; CHECK-NEXT: add.s64 %rd5, %rd4, %rd3; |
| ; CHECK-NEXT: st.b32 [%rd5], 10; |
| ; CHECK-NEXT: ld.param.b32 %rd6, [multiple_dynamic_param_1]; |
| ; CHECK-NEXT: and.b64 %rd7, %rd6, 3; |
| ; CHECK-NEXT: shl.b64 %rd8, %rd7, 2; |
| ; CHECK-NEXT: add.s64 %rd9, %rd4, %rd8; |
| ; CHECK-NEXT: st.b32 [%rd9], 20; |
| ; CHECK-NEXT: ld.b32 %r1, [%SP+12]; |
| ; CHECK-NEXT: ld.b32 %r2, [%SP+8]; |
| ; CHECK-NEXT: ld.b32 %r3, [%SP+4]; |
| ; CHECK-NEXT: ld.b32 %r4, [%SP]; |
| ; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r4, %r3, %r2, %r1}; |
| ; CHECK-NEXT: ret; |
| ; Two runtime-indexed inserts with independent indices. |
| %v0 = insertelement <4 x i32> poison, i32 10, i32 %idx0 |
| %v1 = insertelement <4 x i32> %v0, i32 20, i32 %idx1 |
| ret <4 x i32> %v1 |
| } |
| |
| ; Test chain with all dynamic insertelts. |
| ; Every one of the four inserts uses its own runtime index. The expected PTX |
| ; derives four addresses from the shared stack base %rd4, issues the stores of |
| ; 10, 20, 30 and 40 in IR order, and reloads all four lanes afterwards. |
| define <4 x i32> @all_dynamic(i32 %idx0, i32 %idx1, i32 %idx2, i32 %idx3) { |
| ; CHECK-LABEL: all_dynamic( |
| ; CHECK: { |
| ; CHECK-NEXT: .local .align 4 .b8 __local_depot5[16]; |
| ; CHECK-NEXT: .reg .b64 %SP; |
| ; CHECK-NEXT: .reg .b64 %SPL; |
| ; CHECK-NEXT: .reg .b32 %r<5>; |
| ; CHECK-NEXT: .reg .b64 %rd<18>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: mov.b64 %SPL, __local_depot5; |
| ; CHECK-NEXT: cvta.local.u64 %SP, %SPL; |
| ; CHECK-NEXT: ld.param.b32 %rd1, [all_dynamic_param_0]; |
| ; CHECK-NEXT: and.b64 %rd2, %rd1, 3; |
| ; CHECK-NEXT: shl.b64 %rd3, %rd2, 2; |
| ; CHECK-NEXT: add.u64 %rd4, %SP, 0; |
| ; CHECK-NEXT: add.s64 %rd5, %rd4, %rd3; |
| ; CHECK-NEXT: ld.param.b32 %rd6, [all_dynamic_param_1]; |
| ; CHECK-NEXT: and.b64 %rd7, %rd6, 3; |
| ; CHECK-NEXT: shl.b64 %rd8, %rd7, 2; |
| ; CHECK-NEXT: add.s64 %rd9, %rd4, %rd8; |
| ; CHECK-NEXT: ld.param.b32 %rd10, [all_dynamic_param_2]; |
| ; CHECK-NEXT: and.b64 %rd11, %rd10, 3; |
| ; CHECK-NEXT: shl.b64 %rd12, %rd11, 2; |
| ; CHECK-NEXT: add.s64 %rd13, %rd4, %rd12; |
| ; CHECK-NEXT: st.b32 [%rd5], 10; |
| ; CHECK-NEXT: st.b32 [%rd9], 20; |
| ; CHECK-NEXT: st.b32 [%rd13], 30; |
| ; CHECK-NEXT: ld.param.b32 %rd14, [all_dynamic_param_3]; |
| ; CHECK-NEXT: and.b64 %rd15, %rd14, 3; |
| ; CHECK-NEXT: shl.b64 %rd16, %rd15, 2; |
| ; CHECK-NEXT: add.s64 %rd17, %rd4, %rd16; |
| ; CHECK-NEXT: st.b32 [%rd17], 40; |
| ; CHECK-NEXT: ld.b32 %r1, [%SP+12]; |
| ; CHECK-NEXT: ld.b32 %r2, [%SP+8]; |
| ; CHECK-NEXT: ld.b32 %r3, [%SP+4]; |
| ; CHECK-NEXT: ld.b32 %r4, [%SP]; |
| ; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r4, %r3, %r2, %r1}; |
| ; CHECK-NEXT: ret; |
| ; Four runtime-indexed inserts; no lane position is known at compile time. |
| %v0 = insertelement <4 x i32> poison, i32 10, i32 %idx0 |
| %v1 = insertelement <4 x i32> %v0, i32 20, i32 %idx1 |
| %v2 = insertelement <4 x i32> %v1, i32 30, i32 %idx2 |
| %v3 = insertelement <4 x i32> %v2, i32 40, i32 %idx3 |
| ret <4 x i32> %v3 |
| } |
| |
| ; Test mixed constant and dynamic insertelts with a high ratio of dynamic ones. |
| ; Should lower all insertelts to stores: the expected PTX stores 10 through |
| ; the %idx0 address, stores 20 to the fixed slot [%SP+4], stores 30 through |
| ; the %idx1 address, and then reloads all four lanes. |
| define <4 x i32> @mix_dynamic_constant(i32 %idx0, i32 %idx1) { |
| ; CHECK-LABEL: mix_dynamic_constant( |
| ; CHECK: { |
| ; CHECK-NEXT: .local .align 4 .b8 __local_depot6[16]; |
| ; CHECK-NEXT: .reg .b64 %SP; |
| ; CHECK-NEXT: .reg .b64 %SPL; |
| ; CHECK-NEXT: .reg .b32 %r<5>; |
| ; CHECK-NEXT: .reg .b64 %rd<10>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: mov.b64 %SPL, __local_depot6; |
| ; CHECK-NEXT: cvta.local.u64 %SP, %SPL; |
| ; CHECK-NEXT: ld.param.b32 %rd1, [mix_dynamic_constant_param_0]; |
| ; CHECK-NEXT: and.b64 %rd2, %rd1, 3; |
| ; CHECK-NEXT: shl.b64 %rd3, %rd2, 2; |
| ; CHECK-NEXT: add.u64 %rd4, %SP, 0; |
| ; CHECK-NEXT: add.s64 %rd5, %rd4, %rd3; |
| ; CHECK-NEXT: st.b32 [%rd5], 10; |
| ; CHECK-NEXT: ld.param.b32 %rd6, [mix_dynamic_constant_param_1]; |
| ; CHECK-NEXT: and.b64 %rd7, %rd6, 3; |
| ; CHECK-NEXT: shl.b64 %rd8, %rd7, 2; |
| ; CHECK-NEXT: add.s64 %rd9, %rd4, %rd8; |
| ; CHECK-NEXT: st.b32 [%SP+4], 20; |
| ; CHECK-NEXT: st.b32 [%rd9], 30; |
| ; CHECK-NEXT: ld.b32 %r1, [%SP+12]; |
| ; CHECK-NEXT: ld.b32 %r2, [%SP+8]; |
| ; CHECK-NEXT: ld.b32 %r3, [%SP+4]; |
| ; CHECK-NEXT: ld.b32 %r4, [%SP]; |
| ; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r4, %r3, %r2, %r1}; |
| ; CHECK-NEXT: ret; |
| ; Dynamic, constant, dynamic: the constant-index insert sits between two |
| ; runtime-indexed ones. |
| %v0 = insertelement <4 x i32> poison, i32 10, i32 %idx0 |
| %v1 = insertelement <4 x i32> %v0, i32 20, i32 1 |
| %v2 = insertelement <4 x i32> %v1, i32 30, i32 %idx1 |
| ret <4 x i32> %v2 |
| } |
| |
| ; Test two separate chains that don't interfere. |
| ; Each chain starts from its own poison vector and is stored to its own output |
| ; pointer. The expected PTX reserves a 32-byte local area, uses the half at |
| ; %SP+16 for chain 1 and the half at %SP+0 for chain 2, and emits the |
| ; constant-index values 20 and 40 as immediates in the final vector stores. |
| define void @two_separate_chains(i32 %idx0, i32 %idx1, ptr %out0, ptr %out1) { |
| ; CHECK-LABEL: two_separate_chains( |
| ; CHECK: { |
| ; CHECK-NEXT: .local .align 4 .b8 __local_depot7[32]; |
| ; CHECK-NEXT: .reg .b64 %SP; |
| ; CHECK-NEXT: .reg .b64 %SPL; |
| ; CHECK-NEXT: .reg .b32 %r<7>; |
| ; CHECK-NEXT: .reg .b64 %rd<13>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: mov.b64 %SPL, __local_depot7; |
| ; CHECK-NEXT: cvta.local.u64 %SP, %SPL; |
| ; CHECK-NEXT: ld.param.b32 %rd1, [two_separate_chains_param_0]; |
| ; CHECK-NEXT: and.b64 %rd2, %rd1, 3; |
| ; CHECK-NEXT: shl.b64 %rd3, %rd2, 2; |
| ; CHECK-NEXT: add.u64 %rd4, %SP, 16; |
| ; CHECK-NEXT: add.s64 %rd5, %rd4, %rd3; |
| ; CHECK-NEXT: st.b32 [%rd5], 10; |
| ; CHECK-NEXT: ld.param.b32 %rd6, [two_separate_chains_param_1]; |
| ; CHECK-NEXT: and.b64 %rd7, %rd6, 3; |
| ; CHECK-NEXT: shl.b64 %rd8, %rd7, 2; |
| ; CHECK-NEXT: add.u64 %rd9, %SP, 0; |
| ; CHECK-NEXT: add.s64 %rd10, %rd9, %rd8; |
| ; CHECK-NEXT: ld.b32 %r1, [%SP+28]; |
| ; CHECK-NEXT: ld.b32 %r2, [%SP+24]; |
| ; CHECK-NEXT: ld.b32 %r3, [%SP+16]; |
| ; CHECK-NEXT: ld.param.b64 %rd11, [two_separate_chains_param_2]; |
| ; CHECK-NEXT: st.b32 [%rd10], 30; |
| ; CHECK-NEXT: ld.param.b64 %rd12, [two_separate_chains_param_3]; |
| ; CHECK-NEXT: ld.b32 %r4, [%SP+12]; |
| ; CHECK-NEXT: ld.b32 %r5, [%SP+4]; |
| ; CHECK-NEXT: ld.b32 %r6, [%SP]; |
| ; CHECK-NEXT: st.v4.b32 [%rd11], {%r3, 20, %r2, %r1}; |
| ; CHECK-NEXT: st.v4.b32 [%rd12], {%r6, %r5, 40, %r4}; |
| ; CHECK-NEXT: ret; |
| ; Chain 1: dynamic insert at %idx0, then a constant-index insert into lane 1. |
| %v0 = insertelement <4 x i32> poison, i32 10, i32 %idx0 |
| %v1 = insertelement <4 x i32> %v0, i32 20, i32 1 |
| |
| ; Chain 2: dynamic insert at %idx1, then a constant-index insert into lane 2. |
| %w0 = insertelement <4 x i32> poison, i32 30, i32 %idx1 |
| %w1 = insertelement <4 x i32> %w0, i32 40, i32 2 |
| |
| ; Each chain result goes to its own output pointer. |
| store <4 x i32> %v1, ptr %out0 |
| store <4 x i32> %w1, ptr %out1 |
| ret void |
| } |
| |
| ; Test overlapping chains (chain 2 starts from the middle of chain 1). |
| ; %v1 feeds both the runtime-indexed insert %w0 and the constant-index insert |
| ; %v2. The expected PTX reserves a 32-byte local area and stores 10 at the |
| ; %idx0 offset (%rd3) into both halves (%SP+16 and %SP+0). The value stored to |
| ; %out0 carries 20 and 40 as immediates; the value stored to %out1 is reloaded |
| ; lane by lane from the half at %SP+0. |
| define void @overlapping_chains(i32 %idx0, i32 %idx1, ptr %out0, ptr %out1) { |
| ; CHECK-LABEL: overlapping_chains( |
| ; CHECK: { |
| ; CHECK-NEXT: .local .align 4 .b8 __local_depot8[32]; |
| ; CHECK-NEXT: .reg .b64 %SP; |
| ; CHECK-NEXT: .reg .b64 %SPL; |
| ; CHECK-NEXT: .reg .b32 %r<7>; |
| ; CHECK-NEXT: .reg .b64 %rd<14>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: mov.b64 %SPL, __local_depot8; |
| ; CHECK-NEXT: cvta.local.u64 %SP, %SPL; |
| ; CHECK-NEXT: ld.param.b32 %rd1, [overlapping_chains_param_0]; |
| ; CHECK-NEXT: and.b64 %rd2, %rd1, 3; |
| ; CHECK-NEXT: shl.b64 %rd3, %rd2, 2; |
| ; CHECK-NEXT: add.u64 %rd4, %SP, 16; |
| ; CHECK-NEXT: add.s64 %rd5, %rd4, %rd3; |
| ; CHECK-NEXT: st.b32 [%rd5], 10; |
| ; CHECK-NEXT: add.u64 %rd6, %SP, 0; |
| ; CHECK-NEXT: add.s64 %rd7, %rd6, %rd3; |
| ; CHECK-NEXT: ld.b32 %r1, [%SP+28]; |
| ; CHECK-NEXT: ld.b32 %r2, [%SP+16]; |
| ; CHECK-NEXT: ld.param.b64 %rd8, [overlapping_chains_param_2]; |
| ; CHECK-NEXT: st.b32 [%rd7], 10; |
| ; CHECK-NEXT: ld.param.b32 %rd9, [overlapping_chains_param_1]; |
| ; CHECK-NEXT: and.b64 %rd10, %rd9, 3; |
| ; CHECK-NEXT: shl.b64 %rd11, %rd10, 2; |
| ; CHECK-NEXT: add.s64 %rd12, %rd6, %rd11; |
| ; CHECK-NEXT: st.b32 [%SP+4], 20; |
| ; CHECK-NEXT: st.b32 [%rd12], 30; |
| ; CHECK-NEXT: ld.param.b64 %rd13, [overlapping_chains_param_3]; |
| ; CHECK-NEXT: ld.b32 %r3, [%SP+12]; |
| ; CHECK-NEXT: ld.b32 %r4, [%SP+8]; |
| ; CHECK-NEXT: ld.b32 %r5, [%SP+4]; |
| ; CHECK-NEXT: ld.b32 %r6, [%SP]; |
| ; CHECK-NEXT: st.v4.b32 [%rd8], {%r2, 20, 40, %r1}; |
| ; CHECK-NEXT: st.v4.b32 [%rd13], {%r6, %r5, %r4, %r3}; |
| ; CHECK-NEXT: ret; |
| ; Shared prefix of both chains: dynamic insert, then constant-index insert. |
| %v0 = insertelement <4 x i32> poison, i32 10, i32 %idx0 |
| %v1 = insertelement <4 x i32> %v0, i32 20, i32 1 |
| |
| ; Chain 2 starts from v1 |
| %w0 = insertelement <4 x i32> %v1, i32 30, i32 %idx1 |
| |
| ; Continue chain 1 |
| %v2 = insertelement <4 x i32> %v1, i32 40, i32 2 |
| |
| store <4 x i32> %v2, ptr %out0 |
| store <4 x i32> %w0, ptr %out1 |
| ret void |
| } |
| |
| ; Test with i1 elements (1-bit, non-byte-aligned). |
| ; One runtime-indexed insert is followed by constant-index inserts into lanes |
| ; 1 and 2. The expected PTX uses an 8-byte local area, masks the index with 7 |
| ; without scaling it, and writes the element with a byte store (st.b8). The |
| ; return value is emitted one byte per lane, with lanes 1 and 2 as the |
| ; immediates 0 and 1. |
| define <8 x i1> @dynamic_i1(i32 %idx) { |
| ; CHECK-LABEL: dynamic_i1( |
| ; CHECK: { |
| ; CHECK-NEXT: .local .align 8 .b8 __local_depot9[8]; |
| ; CHECK-NEXT: .reg .b64 %SP; |
| ; CHECK-NEXT: .reg .b64 %SPL; |
| ; CHECK-NEXT: .reg .b32 %r<9>; |
| ; CHECK-NEXT: .reg .b64 %rd<5>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: mov.b64 %SPL, __local_depot9; |
| ; CHECK-NEXT: cvta.local.u64 %SP, %SPL; |
| ; CHECK-NEXT: ld.param.b32 %rd1, [dynamic_i1_param_0]; |
| ; CHECK-NEXT: and.b64 %rd2, %rd1, 7; |
| ; CHECK-NEXT: add.u64 %rd3, %SP, 0; |
| ; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2; |
| ; CHECK-NEXT: st.v2.b32 [%SP], {%r1, %r2}; |
| ; CHECK-NEXT: st.b8 [%rd4], 1; |
| ; CHECK-NEXT: ld.b32 %r3, [%SP]; |
| ; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 0x7773U; |
| ; CHECK-NEXT: ld.b32 %r5, [%SP+4]; |
| ; CHECK-NEXT: prmt.b32 %r6, %r5, 0, 0x7771U; |
| ; CHECK-NEXT: prmt.b32 %r7, %r5, 0, 0x7772U; |
| ; CHECK-NEXT: prmt.b32 %r8, %r5, 0, 0x7773U; |
| ; CHECK-NEXT: st.param.b8 [func_retval0+4], %r5; |
| ; CHECK-NEXT: st.param.b8 [func_retval0], %r3; |
| ; CHECK-NEXT: st.param.b8 [func_retval0+7], %r8; |
| ; CHECK-NEXT: st.param.b8 [func_retval0+6], %r7; |
| ; CHECK-NEXT: st.param.b8 [func_retval0+5], %r6; |
| ; CHECK-NEXT: st.param.b8 [func_retval0+3], %r4; |
| ; CHECK-NEXT: st.param.b8 [func_retval0+2], 1; |
| ; CHECK-NEXT: st.param.b8 [func_retval0+1], 0; |
| ; CHECK-NEXT: ret; |
| ; Runtime-indexed insert first, then constant-index inserts into lanes 1 and 2. |
| %v0 = insertelement <8 x i1> poison, i1 1, i32 %idx |
| %v1 = insertelement <8 x i1> %v0, i1 0, i32 1 |
| %v2 = insertelement <8 x i1> %v1, i1 1, i32 2 |
| ret <8 x i1> %v2 |
| } |
| |
| ; Test with i2 elements (2-bit, non-byte-aligned). |
| ; One runtime-indexed insert is followed by constant-index inserts into lanes |
| ; 1 and 2. The expected PTX masks the index with 7 without scaling it and |
| ; writes the element with a byte store (st.b8). The return value is emitted as |
| ; one 16-bit store per lane, with lanes 1 and 2 as the immediates 2 and 3. |
| ; TODO confirm: the additional byte packing through [%SP+8] in the expected |
| ; output presumably comes from legalizing the <8 x i2> return value. |
| define <8 x i2> @dynamic_i2(i32 %idx) { |
| ; CHECK-LABEL: dynamic_i2( |
| ; CHECK: { |
| ; CHECK-NEXT: .local .align 8 .b8 __local_depot10[16]; |
| ; CHECK-NEXT: .reg .b64 %SP; |
| ; CHECK-NEXT: .reg .b64 %SPL; |
| ; CHECK-NEXT: .reg .b16 %rs<24>; |
| ; CHECK-NEXT: .reg .b32 %r<10>; |
| ; CHECK-NEXT: .reg .b64 %rd<5>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: mov.b64 %SPL, __local_depot10; |
| ; CHECK-NEXT: cvta.local.u64 %SP, %SPL; |
| ; CHECK-NEXT: ld.param.b32 %rd1, [dynamic_i2_param_0]; |
| ; CHECK-NEXT: and.b64 %rd2, %rd1, 7; |
| ; CHECK-NEXT: add.u64 %rd3, %SP, 0; |
| ; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2; |
| ; CHECK-NEXT: st.v2.b32 [%SP], {%r1, %r2}; |
| ; CHECK-NEXT: st.b8 [%rd4], 1; |
| ; CHECK-NEXT: ld.b32 %r3, [%SP+4]; |
| ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; |
| ; CHECK-NEXT: and.b16 %rs2, %rs1, 3; |
| ; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 0x7771U; |
| ; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; |
| ; CHECK-NEXT: and.b16 %rs4, %rs3, 3; |
| ; CHECK-NEXT: shl.b16 %rs5, %rs4, 2; |
| ; CHECK-NEXT: or.b16 %rs6, %rs2, %rs5; |
| ; CHECK-NEXT: prmt.b32 %r5, %r3, 0, 0x7772U; |
| ; CHECK-NEXT: cvt.u16.u32 %rs7, %r5; |
| ; CHECK-NEXT: and.b16 %rs8, %rs7, 3; |
| ; CHECK-NEXT: shl.b16 %rs9, %rs8, 4; |
| ; CHECK-NEXT: or.b16 %rs10, %rs6, %rs9; |
| ; CHECK-NEXT: prmt.b32 %r6, %r3, 0, 0x7773U; |
| ; CHECK-NEXT: cvt.u16.u32 %rs11, %r6; |
| ; CHECK-NEXT: shl.b16 %rs12, %rs11, 6; |
| ; CHECK-NEXT: or.b16 %rs13, %rs10, %rs12; |
| ; CHECK-NEXT: st.b8 [%SP+8], %rs13; |
| ; CHECK-NEXT: ld.b32 %r7, [%SP]; |
| ; CHECK-NEXT: prmt.b32 %r8, %r7, 0, 0x7773U; |
| ; CHECK-NEXT: cvt.u16.u32 %rs14, %r8; |
| ; CHECK-NEXT: shl.b16 %rs15, %rs14, 6; |
| ; CHECK-NEXT: and.b16 %rs16, %rs15, 192; |
| ; CHECK-NEXT: ld.s8 %rs17, [%SP+8]; |
| ; CHECK-NEXT: shl.b16 %rs18, %rs17, 8; |
| ; CHECK-NEXT: or.b16 %rs19, %rs16, %rs18; |
| ; CHECK-NEXT: prmt.b32 %r9, %r7, 0, 0x7770U; |
| ; CHECK-NEXT: st.param.b16 [func_retval0], %r9; |
| ; CHECK-NEXT: st.param.b16 [func_retval0+8], %rs17; |
| ; CHECK-NEXT: shr.s16 %rs20, %rs18, 14; |
| ; CHECK-NEXT: st.param.b16 [func_retval0+14], %rs20; |
| ; CHECK-NEXT: shr.s16 %rs21, %rs18, 12; |
| ; CHECK-NEXT: st.param.b16 [func_retval0+12], %rs21; |
| ; CHECK-NEXT: shr.s16 %rs22, %rs18, 10; |
| ; CHECK-NEXT: st.param.b16 [func_retval0+10], %rs22; |
| ; CHECK-NEXT: shr.s16 %rs23, %rs19, 6; |
| ; CHECK-NEXT: st.param.b16 [func_retval0+6], %rs23; |
| ; CHECK-NEXT: st.param.b16 [func_retval0+4], 3; |
| ; CHECK-NEXT: st.param.b16 [func_retval0+2], 2; |
| ; CHECK-NEXT: ret; |
| ; Runtime-indexed insert first, then constant-index inserts into lanes 1 and 2. |
| %v0 = insertelement <8 x i2> poison, i2 1, i32 %idx |
| %v1 = insertelement <8 x i2> %v0, i2 2, i32 1 |
| %v2 = insertelement <8 x i2> %v1, i2 3, i32 2 |
| ret <8 x i2> %v2 |
| } |
| |
| ; Test with i3 elements (3-bit, non-byte-aligned). |
| ; One runtime-indexed insert is followed by constant-index inserts into lanes |
| ; 1 and 2. The expected PTX uses an 8-byte local area, masks the index with 7 |
| ; without scaling it, and writes the element with a byte store (st.b8). The |
| ; return value is emitted as four 32-bit stores, each packing two lanes, with |
| ; the constants 2 and 3 materialized through mov.b16. |
| define <8 x i3> @dynamic_i3(i32 %idx) { |
| ; CHECK-LABEL: dynamic_i3( |
| ; CHECK: { |
| ; CHECK-NEXT: .local .align 8 .b8 __local_depot11[8]; |
| ; CHECK-NEXT: .reg .b64 %SP; |
| ; CHECK-NEXT: .reg .b64 %SPL; |
| ; CHECK-NEXT: .reg .b16 %rs<5>; |
| ; CHECK-NEXT: .reg .b32 %r<15>; |
| ; CHECK-NEXT: .reg .b64 %rd<5>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: mov.b64 %SPL, __local_depot11; |
| ; CHECK-NEXT: cvta.local.u64 %SP, %SPL; |
| ; CHECK-NEXT: ld.param.b32 %rd1, [dynamic_i3_param_0]; |
| ; CHECK-NEXT: and.b64 %rd2, %rd1, 7; |
| ; CHECK-NEXT: add.u64 %rd3, %SP, 0; |
| ; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2; |
| ; CHECK-NEXT: st.v2.b32 [%SP], {%r1, %r2}; |
| ; CHECK-NEXT: st.b8 [%rd4], 1; |
| ; CHECK-NEXT: ld.b32 %r3, [%SP]; |
| ; CHECK-NEXT: ld.b32 %r4, [%SP+4]; |
| ; CHECK-NEXT: prmt.b32 %r5, %r4, 0, 0x7773U; |
| ; CHECK-NEXT: prmt.b32 %r6, %r4, 0, 0x7772U; |
| ; CHECK-NEXT: prmt.b32 %r7, %r6, %r5, 0x5410U; |
| ; CHECK-NEXT: st.param.b32 [func_retval0+12], %r7; |
| ; CHECK-NEXT: prmt.b32 %r8, %r4, 0, 0x7771U; |
| ; CHECK-NEXT: prmt.b32 %r9, %r4, 0, 0x7770U; |
| ; CHECK-NEXT: prmt.b32 %r10, %r9, %r8, 0x5410U; |
| ; CHECK-NEXT: st.param.b32 [func_retval0+8], %r10; |
| ; CHECK-NEXT: prmt.b32 %r11, %r3, 0, 0x7773U; |
| ; CHECK-NEXT: cvt.u16.u32 %rs1, %r11; |
| ; CHECK-NEXT: mov.b16 %rs2, 3; |
| ; CHECK-NEXT: mov.b32 %r12, {%rs2, %rs1}; |
| ; CHECK-NEXT: st.param.b32 [func_retval0+4], %r12; |
| ; CHECK-NEXT: prmt.b32 %r13, %r3, 0, 0x7770U; |
| ; CHECK-NEXT: cvt.u16.u32 %rs3, %r13; |
| ; CHECK-NEXT: mov.b16 %rs4, 2; |
| ; CHECK-NEXT: mov.b32 %r14, {%rs3, %rs4}; |
| ; CHECK-NEXT: st.param.b32 [func_retval0], %r14; |
| ; CHECK-NEXT: ret; |
| ; Runtime-indexed insert first, then constant-index inserts into lanes 1 and 2. |
| %v0 = insertelement <8 x i3> poison, i3 1, i32 %idx |
| %v1 = insertelement <8 x i3> %v0, i3 2, i32 1 |
| %v2 = insertelement <8 x i3> %v1, i3 3, i32 2 |
| ret <8 x i3> %v2 |
| } |
| |
| ; Test with i4 elements (4-bit, non-byte-aligned). |
| ; One runtime-indexed insert is followed by constant-index inserts into lanes |
| ; 1 and 2. The expected PTX masks the index with 7 without scaling it and |
| ; writes the element with a byte store (st.b8). The return value is emitted as |
| ; four 32-bit stores, each packing two lanes, with the constants 2 and 3 |
| ; materialized through mov.b16. |
| define <8 x i4> @dynamic_i4(i32 %idx) { |
| ; CHECK-LABEL: dynamic_i4( |
| ; CHECK: { |
| ; CHECK-NEXT: .local .align 8 .b8 __local_depot12[16]; |
| ; CHECK-NEXT: .reg .b64 %SP; |
| ; CHECK-NEXT: .reg .b64 %SPL; |
| ; CHECK-NEXT: .reg .b16 %rs<30>; |
| ; CHECK-NEXT: .reg .b32 %r<22>; |
| ; CHECK-NEXT: .reg .b64 %rd<5>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: mov.b64 %SPL, __local_depot12; |
| ; CHECK-NEXT: cvta.local.u64 %SP, %SPL; |
| ; CHECK-NEXT: ld.param.b32 %rd1, [dynamic_i4_param_0]; |
| ; CHECK-NEXT: and.b64 %rd2, %rd1, 7; |
| ; CHECK-NEXT: add.u64 %rd3, %SP, 0; |
| ; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2; |
| ; CHECK-NEXT: st.v2.b32 [%SP], {%r1, %r2}; |
| ; CHECK-NEXT: st.b8 [%rd4], 1; |
| ; CHECK-NEXT: ld.b32 %r3, [%SP]; |
| ; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 0x7770U; |
| ; CHECK-NEXT: cvt.u16.u32 %rs1, %r4; |
| ; CHECK-NEXT: and.b16 %rs2, %rs1, 15; |
| ; CHECK-NEXT: prmt.b32 %r5, %r3, 0, 0x7771U; |
| ; CHECK-NEXT: cvt.u16.u32 %rs3, %r5; |
| ; CHECK-NEXT: and.b16 %rs4, %rs3, 15; |
| ; CHECK-NEXT: shl.b16 %rs5, %rs4, 4; |
| ; CHECK-NEXT: or.b16 %rs6, %rs2, %rs5; |
| ; CHECK-NEXT: prmt.b32 %r6, %r3, 0, 0x7772U; |
| ; CHECK-NEXT: cvt.u16.u32 %rs7, %r6; |
| ; CHECK-NEXT: and.b16 %rs8, %rs7, 15; |
| ; CHECK-NEXT: shl.b16 %rs9, %rs8, 8; |
| ; CHECK-NEXT: or.b16 %rs10, %rs6, %rs9; |
| ; CHECK-NEXT: prmt.b32 %r7, %r3, 0, 0x7773U; |
| ; CHECK-NEXT: cvt.u16.u32 %rs11, %r7; |
| ; CHECK-NEXT: shl.b16 %rs12, %rs11, 12; |
| ; CHECK-NEXT: or.b16 %rs13, %rs10, %rs12; |
| ; CHECK-NEXT: cvt.u32.u16 %r8, %rs13; |
| ; CHECK-NEXT: ld.b32 %r9, [%SP+4]; |
| ; CHECK-NEXT: prmt.b32 %r10, %r9, 0, 0x7770U; |
| ; CHECK-NEXT: cvt.u16.u32 %rs14, %r10; |
| ; CHECK-NEXT: and.b16 %rs15, %rs14, 15; |
| ; CHECK-NEXT: prmt.b32 %r11, %r9, 0, 0x7771U; |
| ; CHECK-NEXT: cvt.u16.u32 %rs16, %r11; |
| ; CHECK-NEXT: and.b16 %rs17, %rs16, 15; |
| ; CHECK-NEXT: shl.b16 %rs18, %rs17, 4; |
| ; CHECK-NEXT: or.b16 %rs19, %rs15, %rs18; |
| ; CHECK-NEXT: prmt.b32 %r12, %r9, 0, 0x7772U; |
| ; CHECK-NEXT: cvt.u16.u32 %rs20, %r12; |
| ; CHECK-NEXT: and.b16 %rs21, %rs20, 15; |
| ; CHECK-NEXT: shl.b16 %rs22, %rs21, 8; |
| ; CHECK-NEXT: or.b16 %rs23, %rs19, %rs22; |
| ; CHECK-NEXT: prmt.b32 %r13, %r9, 0, 0x7773U; |
| ; CHECK-NEXT: cvt.u16.u32 %rs24, %r13; |
| ; CHECK-NEXT: shl.b16 %rs25, %rs24, 12; |
| ; CHECK-NEXT: or.b16 %rs26, %rs23, %rs25; |
| ; CHECK-NEXT: cvt.u32.u16 %r14, %rs26; |
| ; CHECK-NEXT: shl.b32 %r15, %r14, 16; |
| ; CHECK-NEXT: or.b32 %r16, %r8, %r15; |
| ; CHECK-NEXT: mov.b32 %r17, {%rs20, %rs24}; |
| ; CHECK-NEXT: st.param.b32 [func_retval0+12], %r17; |
| ; CHECK-NEXT: mov.b32 %r18, {%rs14, %rs16}; |
| ; CHECK-NEXT: st.param.b32 [func_retval0+8], %r18; |
| ; CHECK-NEXT: mov.b16 %rs27, 2; |
| ; CHECK-NEXT: mov.b32 %r19, {%rs1, %rs27}; |
| ; CHECK-NEXT: st.param.b32 [func_retval0], %r19; |
| ; CHECK-NEXT: shr.u32 %r20, %r16, 12; |
| ; CHECK-NEXT: cvt.u16.u32 %rs28, %r20; |
| ; CHECK-NEXT: mov.b16 %rs29, 3; |
| ; CHECK-NEXT: mov.b32 %r21, {%rs29, %rs28}; |
| ; CHECK-NEXT: st.param.b32 [func_retval0+4], %r21; |
| ; CHECK-NEXT: ret; |
| ; Runtime-indexed insert first, then constant-index inserts into lanes 1 and 2. |
| %v0 = insertelement <8 x i4> poison, i4 1, i32 %idx |
| %v1 = insertelement <8 x i4> %v0, i4 2, i32 1 |
| %v2 = insertelement <8 x i4> %v1, i4 3, i32 2 |
| ret <8 x i4> %v2 |
| } |
| |
| ; Test with i5 elements (5-bit, non-byte-aligned). |
| ; One runtime-indexed insert is followed by constant-index inserts into lanes |
| ; 1 and 2. The expected PTX uses an 8-byte local area, masks the index with 7 |
| ; without scaling it, and writes the element with a byte store (st.b8). The |
| ; return value is emitted as two v2.b32 stores, with the constants 2 and 3 |
| ; materialized through mov.b16. |
| define <8 x i5> @dynamic_i5(i32 %idx) { |
| ; CHECK-LABEL: dynamic_i5( |
| ; CHECK: { |
| ; CHECK-NEXT: .local .align 8 .b8 __local_depot13[8]; |
| ; CHECK-NEXT: .reg .b64 %SP; |
| ; CHECK-NEXT: .reg .b64 %SPL; |
| ; CHECK-NEXT: .reg .b16 %rs<5>; |
| ; CHECK-NEXT: .reg .b32 %r<15>; |
| ; CHECK-NEXT: .reg .b64 %rd<5>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: mov.b64 %SPL, __local_depot13; |
| ; CHECK-NEXT: cvta.local.u64 %SP, %SPL; |
| ; CHECK-NEXT: ld.param.b32 %rd1, [dynamic_i5_param_0]; |
| ; CHECK-NEXT: and.b64 %rd2, %rd1, 7; |
| ; CHECK-NEXT: add.u64 %rd3, %SP, 0; |
| ; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2; |
| ; CHECK-NEXT: st.v2.b32 [%SP], {%r1, %r2}; |
| ; CHECK-NEXT: st.b8 [%rd4], 1; |
| ; CHECK-NEXT: ld.b32 %r3, [%SP]; |
| ; CHECK-NEXT: ld.b32 %r4, [%SP+4]; |
| ; CHECK-NEXT: prmt.b32 %r5, %r4, 0, 0x7773U; |
| ; CHECK-NEXT: prmt.b32 %r6, %r4, 0, 0x7772U; |
| ; CHECK-NEXT: prmt.b32 %r7, %r6, %r5, 0x5410U; |
| ; CHECK-NEXT: prmt.b32 %r8, %r4, 0, 0x7771U; |
| ; CHECK-NEXT: prmt.b32 %r9, %r4, 0, 0x7770U; |
| ; CHECK-NEXT: prmt.b32 %r10, %r9, %r8, 0x5410U; |
| ; CHECK-NEXT: st.param.v2.b32 [func_retval0+8], {%r10, %r7}; |
| ; CHECK-NEXT: prmt.b32 %r11, %r3, 0, 0x7773U; |
| ; CHECK-NEXT: cvt.u16.u32 %rs1, %r11; |
| ; CHECK-NEXT: mov.b16 %rs2, 3; |
| ; CHECK-NEXT: mov.b32 %r12, {%rs2, %rs1}; |
| ; CHECK-NEXT: prmt.b32 %r13, %r3, 0, 0x7770U; |
| ; CHECK-NEXT: cvt.u16.u32 %rs3, %r13; |
| ; CHECK-NEXT: mov.b16 %rs4, 2; |
| ; CHECK-NEXT: mov.b32 %r14, {%rs3, %rs4}; |
| ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r12}; |
| ; CHECK-NEXT: ret; |
| ; Runtime-indexed insert first, then constant-index inserts into lanes 1 and 2. |
| %v0 = insertelement <8 x i5> poison, i5 1, i32 %idx |
| %v1 = insertelement <8 x i5> %v0, i5 2, i32 1 |
| %v2 = insertelement <8 x i5> %v1, i5 3, i32 2 |
| ret <8 x i5> %v2 |
| } |
| |
| ; Test with i7 elements (7-bit, non-byte-aligned). |
| ; One runtime-indexed insert is followed by constant-index inserts into lanes |
| ; 1 and 2. The expected PTX uses an 8-byte local area, masks the index with 7 |
| ; without scaling it, and writes the element with a byte store (st.b8). The |
| ; return value is emitted as two v2.b32 stores, with the constants 2 and 3 |
| ; materialized through mov.b16. |
| define <8 x i7> @dynamic_i7(i32 %idx) { |
| ; CHECK-LABEL: dynamic_i7( |
| ; CHECK: { |
| ; CHECK-NEXT: .local .align 8 .b8 __local_depot14[8]; |
| ; CHECK-NEXT: .reg .b64 %SP; |
| ; CHECK-NEXT: .reg .b64 %SPL; |
| ; CHECK-NEXT: .reg .b16 %rs<5>; |
| ; CHECK-NEXT: .reg .b32 %r<15>; |
| ; CHECK-NEXT: .reg .b64 %rd<5>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: mov.b64 %SPL, __local_depot14; |
| ; CHECK-NEXT: cvta.local.u64 %SP, %SPL; |
| ; CHECK-NEXT: ld.param.b32 %rd1, [dynamic_i7_param_0]; |
| ; CHECK-NEXT: and.b64 %rd2, %rd1, 7; |
| ; CHECK-NEXT: add.u64 %rd3, %SP, 0; |
| ; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2; |
| ; CHECK-NEXT: st.v2.b32 [%SP], {%r1, %r2}; |
| ; CHECK-NEXT: st.b8 [%rd4], 1; |
| ; CHECK-NEXT: ld.b32 %r3, [%SP]; |
| ; CHECK-NEXT: ld.b32 %r4, [%SP+4]; |
| ; CHECK-NEXT: prmt.b32 %r5, %r4, 0, 0x7773U; |
| ; CHECK-NEXT: prmt.b32 %r6, %r4, 0, 0x7772U; |
| ; CHECK-NEXT: prmt.b32 %r7, %r6, %r5, 0x5410U; |
| ; CHECK-NEXT: prmt.b32 %r8, %r4, 0, 0x7771U; |
| ; CHECK-NEXT: prmt.b32 %r9, %r4, 0, 0x7770U; |
| ; CHECK-NEXT: prmt.b32 %r10, %r9, %r8, 0x5410U; |
| ; CHECK-NEXT: st.param.v2.b32 [func_retval0+8], {%r10, %r7}; |
| ; CHECK-NEXT: prmt.b32 %r11, %r3, 0, 0x7773U; |
| ; CHECK-NEXT: cvt.u16.u32 %rs1, %r11; |
| ; CHECK-NEXT: mov.b16 %rs2, 3; |
| ; CHECK-NEXT: mov.b32 %r12, {%rs2, %rs1}; |
| ; CHECK-NEXT: prmt.b32 %r13, %r3, 0, 0x7770U; |
| ; CHECK-NEXT: cvt.u16.u32 %rs3, %r13; |
| ; CHECK-NEXT: mov.b16 %rs4, 2; |
| ; CHECK-NEXT: mov.b32 %r14, {%rs3, %rs4}; |
| ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r12}; |
| ; CHECK-NEXT: ret; |
| ; Runtime-indexed insert first, then constant-index inserts into lanes 1 and 2. |
| %v0 = insertelement <8 x i7> poison, i7 1, i32 %idx |
| %v1 = insertelement <8 x i7> %v0, i7 2, i32 1 |
| %v2 = insertelement <8 x i7> %v1, i7 3, i32 2 |
| ret <8 x i7> %v2 |
| } |
| |
| ; Test with i6 elements (6-bit, non-byte-aligned). |
| ; One runtime-indexed insert is followed by constant-index inserts into lanes |
| ; 1 and 2. The expected PTX uses an 8-byte local area, masks the index with 7 |
| ; without scaling it, and writes the element with a byte store (st.b8). The |
| ; return value is emitted as two v2.b32 stores, with the constants 2 and 3 |
| ; materialized through mov.b16. |
| define <8 x i6> @dynamic_i6(i32 %idx) { |
| ; CHECK-LABEL: dynamic_i6( |
| ; CHECK: { |
| ; CHECK-NEXT: .local .align 8 .b8 __local_depot15[8]; |
| ; CHECK-NEXT: .reg .b64 %SP; |
| ; CHECK-NEXT: .reg .b64 %SPL; |
| ; CHECK-NEXT: .reg .b16 %rs<5>; |
| ; CHECK-NEXT: .reg .b32 %r<15>; |
| ; CHECK-NEXT: .reg .b64 %rd<5>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: mov.b64 %SPL, __local_depot15; |
| ; CHECK-NEXT: cvta.local.u64 %SP, %SPL; |
| ; CHECK-NEXT: ld.param.b32 %rd1, [dynamic_i6_param_0]; |
| ; CHECK-NEXT: and.b64 %rd2, %rd1, 7; |
| ; CHECK-NEXT: add.u64 %rd3, %SP, 0; |
| ; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2; |
| ; CHECK-NEXT: st.v2.b32 [%SP], {%r1, %r2}; |
| ; CHECK-NEXT: st.b8 [%rd4], 1; |
| ; CHECK-NEXT: ld.b32 %r3, [%SP]; |
| ; CHECK-NEXT: ld.b32 %r4, [%SP+4]; |
| ; CHECK-NEXT: prmt.b32 %r5, %r4, 0, 0x7773U; |
| ; CHECK-NEXT: prmt.b32 %r6, %r4, 0, 0x7772U; |
| ; CHECK-NEXT: prmt.b32 %r7, %r6, %r5, 0x5410U; |
| ; CHECK-NEXT: prmt.b32 %r8, %r4, 0, 0x7771U; |
| ; CHECK-NEXT: prmt.b32 %r9, %r4, 0, 0x7770U; |
| ; CHECK-NEXT: prmt.b32 %r10, %r9, %r8, 0x5410U; |
| ; CHECK-NEXT: st.param.v2.b32 [func_retval0+8], {%r10, %r7}; |
| ; CHECK-NEXT: prmt.b32 %r11, %r3, 0, 0x7773U; |
| ; CHECK-NEXT: cvt.u16.u32 %rs1, %r11; |
| ; CHECK-NEXT: mov.b16 %rs2, 3; |
| ; CHECK-NEXT: mov.b32 %r12, {%rs2, %rs1}; |
| ; CHECK-NEXT: prmt.b32 %r13, %r3, 0, 0x7770U; |
| ; CHECK-NEXT: cvt.u16.u32 %rs3, %r13; |
| ; CHECK-NEXT: mov.b16 %rs4, 2; |
| ; CHECK-NEXT: mov.b32 %r14, {%rs3, %rs4}; |
| ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r12}; |
| ; CHECK-NEXT: ret; |
| ; Runtime-indexed insert first, then constant-index inserts into lanes 1 and 2. |
| %v0 = insertelement <8 x i6> poison, i6 1, i32 %idx |
| %v1 = insertelement <8 x i6> %v0, i6 2, i32 1 |
| %v2 = insertelement <8 x i6> %v1, i6 3, i32 2 |
| ret <8 x i6> %v2 |
| } |
| |
| ; Test with multiple dynamic insertions on i3 elements. |
| ; Both inserts into the <4 x i3> vector use runtime indices. Unlike the other |
| ; tests in this file, the expected PTX declares no local stack area: each |
| ; insert becomes a bfi.b32 of width 8 at bit position (index << 3), and the |
| ; 32-bit result is returned as two 16-bit stores. |
| define <4 x i3> @multiple_dynamic_i3(i32 %idx0, i32 %idx1) { |
| ; CHECK-LABEL: multiple_dynamic_i3( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b32 %r<9>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.b32 %r1, [multiple_dynamic_i3_param_0]; |
| ; CHECK-NEXT: shl.b32 %r2, %r1, 3; |
| ; CHECK-NEXT: bfi.b32 %r3, 1, %r4, %r2, 8; |
| ; CHECK-NEXT: ld.param.b32 %r5, [multiple_dynamic_i3_param_1]; |
| ; CHECK-NEXT: shl.b32 %r6, %r5, 3; |
| ; CHECK-NEXT: bfi.b32 %r7, 2, %r3, %r6, 8; |
| ; CHECK-NEXT: st.param.b16 [func_retval0], %r7; |
| ; CHECK-NEXT: shr.u32 %r8, %r7, 16; |
| ; CHECK-NEXT: st.param.b16 [func_retval0+2], %r8; |
| ; CHECK-NEXT: ret; |
| ; Two runtime-indexed inserts with independent indices. |
| %v0 = insertelement <4 x i3> poison, i3 1, i32 %idx0 |
| %v1 = insertelement <4 x i3> %v0, i3 2, i32 %idx1 |
| ret <4 x i3> %v1 |
| } |