| ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 |
| ; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -mcpu=sm_90 -mattr=+ptx87 -S < %s | FileCheck %s -check-prefixes=CHECK,SM90 |
| ; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -mcpu=sm_100 -mattr=+ptx88 -S < %s | FileCheck %s -check-prefixes=CHECK,SM100 |
| |
| ; 256-bit loads/stores are currently supported only for: |
| ; - global |
| ; - blackwell (sm_100) |
| ; - ptx 8.8 |
| ; - 32/64-bit types |
| |
| ; Currently, the LSV produces 256-bit loads/stores whenever the first three |
| ; conditions are satisfied, regardless of element type, because the backend |
| ; will either upsize or split vectors of smaller elements in the Type |
| ; Legalization stage. |
| |
| ; In this file, we test i8, i16, i32, i64, f32, f64. |
| ; The other floating point types are omitted for simplicity. |
| ; We also test the negative case for non-global i32. |
| |
| ; int8x32: 32 scalar i8 loads from byte offsets 0-31 of a global |
| ; (addrspace(1)) pointer, followed by 32 scalar i8 stores that write the |
| ; loaded values back to the same 32 bytes in permuted order. Offset 0 is |
| ; accessed with align 32 and offset 16 with align 16. |
| ; Expected output per the assertions below: the sm_90 run forms two |
| ; <16 x i8> loads and two <16 x i8> stores, while the sm_100 run forms a |
| ; single <32 x i8> load and a single <32 x i8> store. |
| define void @int8x32(ptr addrspace(1) %ptr) { |
| ; SM90-LABEL: define void @int8x32( |
| ; SM90-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { |
| ; SM90-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr addrspace(1) [[PTR]], i64 0 |
| ; SM90-NEXT: [[PTR10:%.*]] = getelementptr i8, ptr addrspace(1) [[PTR]], i64 16 |
| ; SM90-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr addrspace(1) [[PTR0]], align 32 |
| ; SM90-NEXT: [[L01:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0 |
| ; SM90-NEXT: [[L110:%.*]] = extractelement <16 x i8> [[TMP1]], i32 1 |
| ; SM90-NEXT: [[L211:%.*]] = extractelement <16 x i8> [[TMP1]], i32 2 |
| ; SM90-NEXT: [[L312:%.*]] = extractelement <16 x i8> [[TMP1]], i32 3 |
| ; SM90-NEXT: [[L413:%.*]] = extractelement <16 x i8> [[TMP1]], i32 4 |
| ; SM90-NEXT: [[L514:%.*]] = extractelement <16 x i8> [[TMP1]], i32 5 |
| ; SM90-NEXT: [[L615:%.*]] = extractelement <16 x i8> [[TMP1]], i32 6 |
| ; SM90-NEXT: [[L716:%.*]] = extractelement <16 x i8> [[TMP1]], i32 7 |
| ; SM90-NEXT: [[L817:%.*]] = extractelement <16 x i8> [[TMP1]], i32 8 |
| ; SM90-NEXT: [[L918:%.*]] = extractelement <16 x i8> [[TMP1]], i32 9 |
| ; SM90-NEXT: [[LA19:%.*]] = extractelement <16 x i8> [[TMP1]], i32 10 |
| ; SM90-NEXT: [[LB20:%.*]] = extractelement <16 x i8> [[TMP1]], i32 11 |
| ; SM90-NEXT: [[LC21:%.*]] = extractelement <16 x i8> [[TMP1]], i32 12 |
| ; SM90-NEXT: [[LD22:%.*]] = extractelement <16 x i8> [[TMP1]], i32 13 |
| ; SM90-NEXT: [[LE23:%.*]] = extractelement <16 x i8> [[TMP1]], i32 14 |
| ; SM90-NEXT: [[LF24:%.*]] = extractelement <16 x i8> [[TMP1]], i32 15 |
| ; SM90-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr addrspace(1) [[PTR10]], align 16 |
| ; SM90-NEXT: [[L1025:%.*]] = extractelement <16 x i8> [[TMP2]], i32 0 |
| ; SM90-NEXT: [[L1126:%.*]] = extractelement <16 x i8> [[TMP2]], i32 1 |
| ; SM90-NEXT: [[L1227:%.*]] = extractelement <16 x i8> [[TMP2]], i32 2 |
| ; SM90-NEXT: [[L1328:%.*]] = extractelement <16 x i8> [[TMP2]], i32 3 |
| ; SM90-NEXT: [[L1429:%.*]] = extractelement <16 x i8> [[TMP2]], i32 4 |
| ; SM90-NEXT: [[L1530:%.*]] = extractelement <16 x i8> [[TMP2]], i32 5 |
| ; SM90-NEXT: [[L1631:%.*]] = extractelement <16 x i8> [[TMP2]], i32 6 |
| ; SM90-NEXT: [[L1732:%.*]] = extractelement <16 x i8> [[TMP2]], i32 7 |
| ; SM90-NEXT: [[L1833:%.*]] = extractelement <16 x i8> [[TMP2]], i32 8 |
| ; SM90-NEXT: [[L1934:%.*]] = extractelement <16 x i8> [[TMP2]], i32 9 |
| ; SM90-NEXT: [[L1A35:%.*]] = extractelement <16 x i8> [[TMP2]], i32 10 |
| ; SM90-NEXT: [[L1B36:%.*]] = extractelement <16 x i8> [[TMP2]], i32 11 |
| ; SM90-NEXT: [[L1C37:%.*]] = extractelement <16 x i8> [[TMP2]], i32 12 |
| ; SM90-NEXT: [[L1D38:%.*]] = extractelement <16 x i8> [[TMP2]], i32 13 |
| ; SM90-NEXT: [[L1E39:%.*]] = extractelement <16 x i8> [[TMP2]], i32 14 |
| ; SM90-NEXT: [[L1F40:%.*]] = extractelement <16 x i8> [[TMP2]], i32 15 |
| ; SM90-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> poison, i8 [[LB20]], i32 0 |
| ; SM90-NEXT: [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[LA19]], i32 1 |
| ; SM90-NEXT: [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[L918]], i32 2 |
| ; SM90-NEXT: [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[L817]], i32 3 |
| ; SM90-NEXT: [[TMP7:%.*]] = insertelement <16 x i8> [[TMP6]], i8 [[L716]], i32 4 |
| ; SM90-NEXT: [[TMP8:%.*]] = insertelement <16 x i8> [[TMP7]], i8 [[L615]], i32 5 |
| ; SM90-NEXT: [[TMP9:%.*]] = insertelement <16 x i8> [[TMP8]], i8 [[L514]], i32 6 |
| ; SM90-NEXT: [[TMP10:%.*]] = insertelement <16 x i8> [[TMP9]], i8 [[L413]], i32 7 |
| ; SM90-NEXT: [[TMP11:%.*]] = insertelement <16 x i8> [[TMP10]], i8 [[L312]], i32 8 |
| ; SM90-NEXT: [[TMP12:%.*]] = insertelement <16 x i8> [[TMP11]], i8 [[L211]], i32 9 |
| ; SM90-NEXT: [[TMP13:%.*]] = insertelement <16 x i8> [[TMP12]], i8 [[L110]], i32 10 |
| ; SM90-NEXT: [[TMP14:%.*]] = insertelement <16 x i8> [[TMP13]], i8 [[L01]], i32 11 |
| ; SM90-NEXT: [[TMP15:%.*]] = insertelement <16 x i8> [[TMP14]], i8 [[LF24]], i32 12 |
| ; SM90-NEXT: [[TMP16:%.*]] = insertelement <16 x i8> [[TMP15]], i8 [[LE23]], i32 13 |
| ; SM90-NEXT: [[TMP17:%.*]] = insertelement <16 x i8> [[TMP16]], i8 [[LD22]], i32 14 |
| ; SM90-NEXT: [[TMP18:%.*]] = insertelement <16 x i8> [[TMP17]], i8 [[LC21]], i32 15 |
| ; SM90-NEXT: store <16 x i8> [[TMP18]], ptr addrspace(1) [[PTR0]], align 32 |
| ; SM90-NEXT: [[TMP19:%.*]] = insertelement <16 x i8> poison, i8 [[L1B36]], i32 0 |
| ; SM90-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> [[TMP19]], i8 [[L1A35]], i32 1 |
| ; SM90-NEXT: [[TMP21:%.*]] = insertelement <16 x i8> [[TMP20]], i8 [[L1934]], i32 2 |
| ; SM90-NEXT: [[TMP22:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[L1833]], i32 3 |
| ; SM90-NEXT: [[TMP23:%.*]] = insertelement <16 x i8> [[TMP22]], i8 [[L1732]], i32 4 |
| ; SM90-NEXT: [[TMP24:%.*]] = insertelement <16 x i8> [[TMP23]], i8 [[L1631]], i32 5 |
| ; SM90-NEXT: [[TMP25:%.*]] = insertelement <16 x i8> [[TMP24]], i8 [[L1530]], i32 6 |
| ; SM90-NEXT: [[TMP26:%.*]] = insertelement <16 x i8> [[TMP25]], i8 [[L1429]], i32 7 |
| ; SM90-NEXT: [[TMP27:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[L1328]], i32 8 |
| ; SM90-NEXT: [[TMP28:%.*]] = insertelement <16 x i8> [[TMP27]], i8 [[L1227]], i32 9 |
| ; SM90-NEXT: [[TMP29:%.*]] = insertelement <16 x i8> [[TMP28]], i8 [[L1126]], i32 10 |
| ; SM90-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP29]], i8 [[L1025]], i32 11 |
| ; SM90-NEXT: [[TMP31:%.*]] = insertelement <16 x i8> [[TMP30]], i8 [[L1F40]], i32 12 |
| ; SM90-NEXT: [[TMP32:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[L1E39]], i32 13 |
| ; SM90-NEXT: [[TMP33:%.*]] = insertelement <16 x i8> [[TMP32]], i8 [[L1D38]], i32 14 |
| ; SM90-NEXT: [[TMP34:%.*]] = insertelement <16 x i8> [[TMP33]], i8 [[L1C37]], i32 15 |
| ; SM90-NEXT: store <16 x i8> [[TMP34]], ptr addrspace(1) [[PTR10]], align 16 |
| ; SM90-NEXT: ret void |
| ; |
| ; SM100-LABEL: define void @int8x32( |
| ; SM100-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { |
| ; SM100-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr addrspace(1) [[PTR]], i64 0 |
| ; SM100-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr addrspace(1) [[PTR0]], align 32 |
| ; SM100-NEXT: [[L01:%.*]] = extractelement <32 x i8> [[TMP1]], i32 0 |
| ; SM100-NEXT: [[L110:%.*]] = extractelement <32 x i8> [[TMP1]], i32 1 |
| ; SM100-NEXT: [[L211:%.*]] = extractelement <32 x i8> [[TMP1]], i32 2 |
| ; SM100-NEXT: [[L312:%.*]] = extractelement <32 x i8> [[TMP1]], i32 3 |
| ; SM100-NEXT: [[L413:%.*]] = extractelement <32 x i8> [[TMP1]], i32 4 |
| ; SM100-NEXT: [[L514:%.*]] = extractelement <32 x i8> [[TMP1]], i32 5 |
| ; SM100-NEXT: [[L615:%.*]] = extractelement <32 x i8> [[TMP1]], i32 6 |
| ; SM100-NEXT: [[L716:%.*]] = extractelement <32 x i8> [[TMP1]], i32 7 |
| ; SM100-NEXT: [[L817:%.*]] = extractelement <32 x i8> [[TMP1]], i32 8 |
| ; SM100-NEXT: [[L918:%.*]] = extractelement <32 x i8> [[TMP1]], i32 9 |
| ; SM100-NEXT: [[LA19:%.*]] = extractelement <32 x i8> [[TMP1]], i32 10 |
| ; SM100-NEXT: [[LB20:%.*]] = extractelement <32 x i8> [[TMP1]], i32 11 |
| ; SM100-NEXT: [[LC21:%.*]] = extractelement <32 x i8> [[TMP1]], i32 12 |
| ; SM100-NEXT: [[LD22:%.*]] = extractelement <32 x i8> [[TMP1]], i32 13 |
| ; SM100-NEXT: [[LE23:%.*]] = extractelement <32 x i8> [[TMP1]], i32 14 |
| ; SM100-NEXT: [[LF24:%.*]] = extractelement <32 x i8> [[TMP1]], i32 15 |
| ; SM100-NEXT: [[L1025:%.*]] = extractelement <32 x i8> [[TMP1]], i32 16 |
| ; SM100-NEXT: [[L1126:%.*]] = extractelement <32 x i8> [[TMP1]], i32 17 |
| ; SM100-NEXT: [[L1227:%.*]] = extractelement <32 x i8> [[TMP1]], i32 18 |
| ; SM100-NEXT: [[L1328:%.*]] = extractelement <32 x i8> [[TMP1]], i32 19 |
| ; SM100-NEXT: [[L1429:%.*]] = extractelement <32 x i8> [[TMP1]], i32 20 |
| ; SM100-NEXT: [[L1530:%.*]] = extractelement <32 x i8> [[TMP1]], i32 21 |
| ; SM100-NEXT: [[L1631:%.*]] = extractelement <32 x i8> [[TMP1]], i32 22 |
| ; SM100-NEXT: [[L1732:%.*]] = extractelement <32 x i8> [[TMP1]], i32 23 |
| ; SM100-NEXT: [[L1833:%.*]] = extractelement <32 x i8> [[TMP1]], i32 24 |
| ; SM100-NEXT: [[L1934:%.*]] = extractelement <32 x i8> [[TMP1]], i32 25 |
| ; SM100-NEXT: [[L1A35:%.*]] = extractelement <32 x i8> [[TMP1]], i32 26 |
| ; SM100-NEXT: [[L1B36:%.*]] = extractelement <32 x i8> [[TMP1]], i32 27 |
| ; SM100-NEXT: [[L1C37:%.*]] = extractelement <32 x i8> [[TMP1]], i32 28 |
| ; SM100-NEXT: [[L1D38:%.*]] = extractelement <32 x i8> [[TMP1]], i32 29 |
| ; SM100-NEXT: [[L1E39:%.*]] = extractelement <32 x i8> [[TMP1]], i32 30 |
| ; SM100-NEXT: [[L1F40:%.*]] = extractelement <32 x i8> [[TMP1]], i32 31 |
| ; SM100-NEXT: [[TMP2:%.*]] = insertelement <32 x i8> poison, i8 [[LB20]], i32 0 |
| ; SM100-NEXT: [[TMP3:%.*]] = insertelement <32 x i8> [[TMP2]], i8 [[LA19]], i32 1 |
| ; SM100-NEXT: [[TMP4:%.*]] = insertelement <32 x i8> [[TMP3]], i8 [[L918]], i32 2 |
| ; SM100-NEXT: [[TMP5:%.*]] = insertelement <32 x i8> [[TMP4]], i8 [[L817]], i32 3 |
| ; SM100-NEXT: [[TMP6:%.*]] = insertelement <32 x i8> [[TMP5]], i8 [[L716]], i32 4 |
| ; SM100-NEXT: [[TMP7:%.*]] = insertelement <32 x i8> [[TMP6]], i8 [[L615]], i32 5 |
| ; SM100-NEXT: [[TMP8:%.*]] = insertelement <32 x i8> [[TMP7]], i8 [[L514]], i32 6 |
| ; SM100-NEXT: [[TMP9:%.*]] = insertelement <32 x i8> [[TMP8]], i8 [[L413]], i32 7 |
| ; SM100-NEXT: [[TMP10:%.*]] = insertelement <32 x i8> [[TMP9]], i8 [[L312]], i32 8 |
| ; SM100-NEXT: [[TMP11:%.*]] = insertelement <32 x i8> [[TMP10]], i8 [[L211]], i32 9 |
| ; SM100-NEXT: [[TMP12:%.*]] = insertelement <32 x i8> [[TMP11]], i8 [[L110]], i32 10 |
| ; SM100-NEXT: [[TMP13:%.*]] = insertelement <32 x i8> [[TMP12]], i8 [[L01]], i32 11 |
| ; SM100-NEXT: [[TMP14:%.*]] = insertelement <32 x i8> [[TMP13]], i8 [[LF24]], i32 12 |
| ; SM100-NEXT: [[TMP15:%.*]] = insertelement <32 x i8> [[TMP14]], i8 [[LE23]], i32 13 |
| ; SM100-NEXT: [[TMP16:%.*]] = insertelement <32 x i8> [[TMP15]], i8 [[LD22]], i32 14 |
| ; SM100-NEXT: [[TMP17:%.*]] = insertelement <32 x i8> [[TMP16]], i8 [[LC21]], i32 15 |
| ; SM100-NEXT: [[TMP18:%.*]] = insertelement <32 x i8> [[TMP17]], i8 [[L1B36]], i32 16 |
| ; SM100-NEXT: [[TMP19:%.*]] = insertelement <32 x i8> [[TMP18]], i8 [[L1A35]], i32 17 |
| ; SM100-NEXT: [[TMP20:%.*]] = insertelement <32 x i8> [[TMP19]], i8 [[L1934]], i32 18 |
| ; SM100-NEXT: [[TMP21:%.*]] = insertelement <32 x i8> [[TMP20]], i8 [[L1833]], i32 19 |
| ; SM100-NEXT: [[TMP22:%.*]] = insertelement <32 x i8> [[TMP21]], i8 [[L1732]], i32 20 |
| ; SM100-NEXT: [[TMP23:%.*]] = insertelement <32 x i8> [[TMP22]], i8 [[L1631]], i32 21 |
| ; SM100-NEXT: [[TMP24:%.*]] = insertelement <32 x i8> [[TMP23]], i8 [[L1530]], i32 22 |
| ; SM100-NEXT: [[TMP25:%.*]] = insertelement <32 x i8> [[TMP24]], i8 [[L1429]], i32 23 |
| ; SM100-NEXT: [[TMP26:%.*]] = insertelement <32 x i8> [[TMP25]], i8 [[L1328]], i32 24 |
| ; SM100-NEXT: [[TMP27:%.*]] = insertelement <32 x i8> [[TMP26]], i8 [[L1227]], i32 25 |
| ; SM100-NEXT: [[TMP28:%.*]] = insertelement <32 x i8> [[TMP27]], i8 [[L1126]], i32 26 |
| ; SM100-NEXT: [[TMP29:%.*]] = insertelement <32 x i8> [[TMP28]], i8 [[L1025]], i32 27 |
| ; SM100-NEXT: [[TMP30:%.*]] = insertelement <32 x i8> [[TMP29]], i8 [[L1F40]], i32 28 |
| ; SM100-NEXT: [[TMP31:%.*]] = insertelement <32 x i8> [[TMP30]], i8 [[L1E39]], i32 29 |
| ; SM100-NEXT: [[TMP32:%.*]] = insertelement <32 x i8> [[TMP31]], i8 [[L1D38]], i32 30 |
| ; SM100-NEXT: [[TMP33:%.*]] = insertelement <32 x i8> [[TMP32]], i8 [[L1C37]], i32 31 |
| ; SM100-NEXT: store <32 x i8> [[TMP33]], ptr addrspace(1) [[PTR0]], align 32 |
| ; SM100-NEXT: ret void |
| ; |
| |
| ; One pointer per byte: %ptr0..%ptr1f address byte offsets 0..31 from %ptr. |
| %ptr0 = getelementptr i8, ptr addrspace(1) %ptr, i64 0 |
| %ptr1 = getelementptr i8, ptr addrspace(1) %ptr, i64 1 |
| %ptr2 = getelementptr i8, ptr addrspace(1) %ptr, i64 2 |
| %ptr3 = getelementptr i8, ptr addrspace(1) %ptr, i64 3 |
| %ptr4 = getelementptr i8, ptr addrspace(1) %ptr, i64 4 |
| %ptr5 = getelementptr i8, ptr addrspace(1) %ptr, i64 5 |
| %ptr6 = getelementptr i8, ptr addrspace(1) %ptr, i64 6 |
| %ptr7 = getelementptr i8, ptr addrspace(1) %ptr, i64 7 |
| %ptr8 = getelementptr i8, ptr addrspace(1) %ptr, i64 8 |
| %ptr9 = getelementptr i8, ptr addrspace(1) %ptr, i64 9 |
| %ptra = getelementptr i8, ptr addrspace(1) %ptr, i64 10 |
| %ptrb = getelementptr i8, ptr addrspace(1) %ptr, i64 11 |
| %ptrc = getelementptr i8, ptr addrspace(1) %ptr, i64 12 |
| %ptrd = getelementptr i8, ptr addrspace(1) %ptr, i64 13 |
| %ptre = getelementptr i8, ptr addrspace(1) %ptr, i64 14 |
| %ptrf = getelementptr i8, ptr addrspace(1) %ptr, i64 15 |
| %ptr10 = getelementptr i8, ptr addrspace(1) %ptr, i64 16 |
| %ptr11 = getelementptr i8, ptr addrspace(1) %ptr, i64 17 |
| %ptr12 = getelementptr i8, ptr addrspace(1) %ptr, i64 18 |
| %ptr13 = getelementptr i8, ptr addrspace(1) %ptr, i64 19 |
| %ptr14 = getelementptr i8, ptr addrspace(1) %ptr, i64 20 |
| %ptr15 = getelementptr i8, ptr addrspace(1) %ptr, i64 21 |
| %ptr16 = getelementptr i8, ptr addrspace(1) %ptr, i64 22 |
| %ptr17 = getelementptr i8, ptr addrspace(1) %ptr, i64 23 |
| %ptr18 = getelementptr i8, ptr addrspace(1) %ptr, i64 24 |
| %ptr19 = getelementptr i8, ptr addrspace(1) %ptr, i64 25 |
| %ptr1a = getelementptr i8, ptr addrspace(1) %ptr, i64 26 |
| %ptr1b = getelementptr i8, ptr addrspace(1) %ptr, i64 27 |
| %ptr1c = getelementptr i8, ptr addrspace(1) %ptr, i64 28 |
| %ptr1d = getelementptr i8, ptr addrspace(1) %ptr, i64 29 |
| %ptr1e = getelementptr i8, ptr addrspace(1) %ptr, i64 30 |
| %ptr1f = getelementptr i8, ptr addrspace(1) %ptr, i64 31 |
| |
| ; Scalar loads in ascending address order. Only the loads at offsets 0 and 16 |
| ; carry an explicit alignment (32 and 16 respectively). |
| %l0 = load i8, ptr addrspace(1) %ptr0, align 32 |
| %l1 = load i8, ptr addrspace(1) %ptr1 |
| %l2 = load i8, ptr addrspace(1) %ptr2 |
| %l3 = load i8, ptr addrspace(1) %ptr3 |
| %l4 = load i8, ptr addrspace(1) %ptr4 |
| %l5 = load i8, ptr addrspace(1) %ptr5 |
| %l6 = load i8, ptr addrspace(1) %ptr6 |
| %l7 = load i8, ptr addrspace(1) %ptr7 |
| %l8 = load i8, ptr addrspace(1) %ptr8 |
| %l9 = load i8, ptr addrspace(1) %ptr9 |
| %la = load i8, ptr addrspace(1) %ptra |
| %lb = load i8, ptr addrspace(1) %ptrb |
| %lc = load i8, ptr addrspace(1) %ptrc |
| %ld = load i8, ptr addrspace(1) %ptrd |
| %le = load i8, ptr addrspace(1) %ptre |
| %lf = load i8, ptr addrspace(1) %ptrf |
| %l10 = load i8, ptr addrspace(1) %ptr10, align 16 |
| %l11 = load i8, ptr addrspace(1) %ptr11 |
| %l12 = load i8, ptr addrspace(1) %ptr12 |
| %l13 = load i8, ptr addrspace(1) %ptr13 |
| %l14 = load i8, ptr addrspace(1) %ptr14 |
| %l15 = load i8, ptr addrspace(1) %ptr15 |
| %l16 = load i8, ptr addrspace(1) %ptr16 |
| %l17 = load i8, ptr addrspace(1) %ptr17 |
| %l18 = load i8, ptr addrspace(1) %ptr18 |
| %l19 = load i8, ptr addrspace(1) %ptr19 |
| %l1a = load i8, ptr addrspace(1) %ptr1a |
| %l1b = load i8, ptr addrspace(1) %ptr1b |
| %l1c = load i8, ptr addrspace(1) %ptr1c |
| %l1d = load i8, ptr addrspace(1) %ptr1d |
| %l1e = load i8, ptr addrspace(1) %ptr1e |
| %l1f = load i8, ptr addrspace(1) %ptr1f |
| |
| ; Scalar stores, deliberately not in address order: within each 16-byte half |
| ; the last four bytes (offsets 12-15, then 28-31) are stored before the first |
| ; twelve. Each half writes back only its own loaded values: in the lower half |
| ; offsets 0-11 receive %lb..%l0 and offsets 12-15 receive %lf..%lc; the upper |
| ; half follows the same pattern with %l1b..%l10 and %l1f..%l1c. |
| store i8 %lf, ptr addrspace(1) %ptrc |
| store i8 %le, ptr addrspace(1) %ptrd |
| store i8 %ld, ptr addrspace(1) %ptre |
| store i8 %lc, ptr addrspace(1) %ptrf |
| store i8 %lb, ptr addrspace(1) %ptr0, align 32 |
| store i8 %la, ptr addrspace(1) %ptr1 |
| store i8 %l9, ptr addrspace(1) %ptr2 |
| store i8 %l8, ptr addrspace(1) %ptr3 |
| store i8 %l7, ptr addrspace(1) %ptr4 |
| store i8 %l6, ptr addrspace(1) %ptr5 |
| store i8 %l5, ptr addrspace(1) %ptr6 |
| store i8 %l4, ptr addrspace(1) %ptr7 |
| store i8 %l3, ptr addrspace(1) %ptr8 |
| store i8 %l2, ptr addrspace(1) %ptr9 |
| store i8 %l1, ptr addrspace(1) %ptra |
| store i8 %l0, ptr addrspace(1) %ptrb |
| store i8 %l1f, ptr addrspace(1) %ptr1c |
| store i8 %l1e, ptr addrspace(1) %ptr1d |
| store i8 %l1d, ptr addrspace(1) %ptr1e |
| store i8 %l1c, ptr addrspace(1) %ptr1f |
| store i8 %l1b, ptr addrspace(1) %ptr10, align 16 |
| store i8 %l1a, ptr addrspace(1) %ptr11 |
| store i8 %l19, ptr addrspace(1) %ptr12 |
| store i8 %l18, ptr addrspace(1) %ptr13 |
| store i8 %l17, ptr addrspace(1) %ptr14 |
| store i8 %l16, ptr addrspace(1) %ptr15 |
| store i8 %l15, ptr addrspace(1) %ptr16 |
| store i8 %l14, ptr addrspace(1) %ptr17 |
| store i8 %l13, ptr addrspace(1) %ptr18 |
| store i8 %l12, ptr addrspace(1) %ptr19 |
| store i8 %l11, ptr addrspace(1) %ptr1a |
| store i8 %l10, ptr addrspace(1) %ptr1b |
| |
| ret void |
| } |
| |
| ; int16x16: 16 scalar i16 loads from element indices 0-15 of a global |
| ; (addrspace(1)) pointer, followed by 16 scalar i16 stores that write the |
| ; loaded values back to the same 16 elements in permuted order. Element 0 is |
| ; accessed with align 32 and element 8 with align 16. |
| ; Expected output per the assertions below: the sm_90 run forms two |
| ; <8 x i16> loads and two <8 x i16> stores, while the sm_100 run forms a |
| ; single <16 x i16> load and a single <16 x i16> store. |
| define void @int16x16(ptr addrspace(1) %ptr) { |
| ; SM90-LABEL: define void @int16x16( |
| ; SM90-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { |
| ; SM90-NEXT: [[PTR0:%.*]] = getelementptr i16, ptr addrspace(1) [[PTR]], i64 0 |
| ; SM90-NEXT: [[PTR8:%.*]] = getelementptr i16, ptr addrspace(1) [[PTR]], i64 8 |
| ; SM90-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr addrspace(1) [[PTR0]], align 32 |
| ; SM90-NEXT: [[L01:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0 |
| ; SM90-NEXT: [[L12:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1 |
| ; SM90-NEXT: [[L23:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2 |
| ; SM90-NEXT: [[L34:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3 |
| ; SM90-NEXT: [[L45:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4 |
| ; SM90-NEXT: [[L56:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5 |
| ; SM90-NEXT: [[L67:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6 |
| ; SM90-NEXT: [[L78:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 |
| ; SM90-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr addrspace(1) [[PTR8]], align 16 |
| ; SM90-NEXT: [[L89:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0 |
| ; SM90-NEXT: [[L910:%.*]] = extractelement <8 x i16> [[TMP2]], i32 1 |
| ; SM90-NEXT: [[LA11:%.*]] = extractelement <8 x i16> [[TMP2]], i32 2 |
| ; SM90-NEXT: [[LB12:%.*]] = extractelement <8 x i16> [[TMP2]], i32 3 |
| ; SM90-NEXT: [[LC13:%.*]] = extractelement <8 x i16> [[TMP2]], i32 4 |
| ; SM90-NEXT: [[LD14:%.*]] = extractelement <8 x i16> [[TMP2]], i32 5 |
| ; SM90-NEXT: [[LE15:%.*]] = extractelement <8 x i16> [[TMP2]], i32 6 |
| ; SM90-NEXT: [[LF16:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 |
| ; SM90-NEXT: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[LB12]], i32 0 |
| ; SM90-NEXT: [[TMP4:%.*]] = insertelement <8 x i16> [[TMP3]], i16 [[LA11]], i32 1 |
| ; SM90-NEXT: [[TMP5:%.*]] = insertelement <8 x i16> [[TMP4]], i16 [[L910]], i32 2 |
| ; SM90-NEXT: [[TMP6:%.*]] = insertelement <8 x i16> [[TMP5]], i16 [[L89]], i32 3 |
| ; SM90-NEXT: [[TMP7:%.*]] = insertelement <8 x i16> [[TMP6]], i16 [[L78]], i32 4 |
| ; SM90-NEXT: [[TMP8:%.*]] = insertelement <8 x i16> [[TMP7]], i16 [[L67]], i32 5 |
| ; SM90-NEXT: [[TMP9:%.*]] = insertelement <8 x i16> [[TMP8]], i16 [[L56]], i32 6 |
| ; SM90-NEXT: [[TMP10:%.*]] = insertelement <8 x i16> [[TMP9]], i16 [[L45]], i32 7 |
| ; SM90-NEXT: store <8 x i16> [[TMP10]], ptr addrspace(1) [[PTR0]], align 32 |
| ; SM90-NEXT: [[TMP11:%.*]] = insertelement <8 x i16> poison, i16 [[L34]], i32 0 |
| ; SM90-NEXT: [[TMP12:%.*]] = insertelement <8 x i16> [[TMP11]], i16 [[L23]], i32 1 |
| ; SM90-NEXT: [[TMP13:%.*]] = insertelement <8 x i16> [[TMP12]], i16 [[L12]], i32 2 |
| ; SM90-NEXT: [[TMP14:%.*]] = insertelement <8 x i16> [[TMP13]], i16 [[L01]], i32 3 |
| ; SM90-NEXT: [[TMP15:%.*]] = insertelement <8 x i16> [[TMP14]], i16 [[LF16]], i32 4 |
| ; SM90-NEXT: [[TMP16:%.*]] = insertelement <8 x i16> [[TMP15]], i16 [[LE15]], i32 5 |
| ; SM90-NEXT: [[TMP17:%.*]] = insertelement <8 x i16> [[TMP16]], i16 [[LD14]], i32 6 |
| ; SM90-NEXT: [[TMP18:%.*]] = insertelement <8 x i16> [[TMP17]], i16 [[LC13]], i32 7 |
| ; SM90-NEXT: store <8 x i16> [[TMP18]], ptr addrspace(1) [[PTR8]], align 16 |
| ; SM90-NEXT: ret void |
| ; |
| ; SM100-LABEL: define void @int16x16( |
| ; SM100-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { |
| ; SM100-NEXT: [[PTR0:%.*]] = getelementptr i16, ptr addrspace(1) [[PTR]], i64 0 |
| ; SM100-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr addrspace(1) [[PTR0]], align 32 |
| ; SM100-NEXT: [[L01:%.*]] = extractelement <16 x i16> [[TMP1]], i32 0 |
| ; SM100-NEXT: [[L12:%.*]] = extractelement <16 x i16> [[TMP1]], i32 1 |
| ; SM100-NEXT: [[L23:%.*]] = extractelement <16 x i16> [[TMP1]], i32 2 |
| ; SM100-NEXT: [[L34:%.*]] = extractelement <16 x i16> [[TMP1]], i32 3 |
| ; SM100-NEXT: [[L45:%.*]] = extractelement <16 x i16> [[TMP1]], i32 4 |
| ; SM100-NEXT: [[L56:%.*]] = extractelement <16 x i16> [[TMP1]], i32 5 |
| ; SM100-NEXT: [[L67:%.*]] = extractelement <16 x i16> [[TMP1]], i32 6 |
| ; SM100-NEXT: [[L78:%.*]] = extractelement <16 x i16> [[TMP1]], i32 7 |
| ; SM100-NEXT: [[L89:%.*]] = extractelement <16 x i16> [[TMP1]], i32 8 |
| ; SM100-NEXT: [[L910:%.*]] = extractelement <16 x i16> [[TMP1]], i32 9 |
| ; SM100-NEXT: [[LA11:%.*]] = extractelement <16 x i16> [[TMP1]], i32 10 |
| ; SM100-NEXT: [[LB12:%.*]] = extractelement <16 x i16> [[TMP1]], i32 11 |
| ; SM100-NEXT: [[LC13:%.*]] = extractelement <16 x i16> [[TMP1]], i32 12 |
| ; SM100-NEXT: [[LD14:%.*]] = extractelement <16 x i16> [[TMP1]], i32 13 |
| ; SM100-NEXT: [[LE15:%.*]] = extractelement <16 x i16> [[TMP1]], i32 14 |
| ; SM100-NEXT: [[LF16:%.*]] = extractelement <16 x i16> [[TMP1]], i32 15 |
| ; SM100-NEXT: [[TMP2:%.*]] = insertelement <16 x i16> poison, i16 [[LB12]], i32 0 |
| ; SM100-NEXT: [[TMP3:%.*]] = insertelement <16 x i16> [[TMP2]], i16 [[LA11]], i32 1 |
| ; SM100-NEXT: [[TMP4:%.*]] = insertelement <16 x i16> [[TMP3]], i16 [[L910]], i32 2 |
| ; SM100-NEXT: [[TMP5:%.*]] = insertelement <16 x i16> [[TMP4]], i16 [[L89]], i32 3 |
| ; SM100-NEXT: [[TMP6:%.*]] = insertelement <16 x i16> [[TMP5]], i16 [[L78]], i32 4 |
| ; SM100-NEXT: [[TMP7:%.*]] = insertelement <16 x i16> [[TMP6]], i16 [[L67]], i32 5 |
| ; SM100-NEXT: [[TMP8:%.*]] = insertelement <16 x i16> [[TMP7]], i16 [[L56]], i32 6 |
| ; SM100-NEXT: [[TMP9:%.*]] = insertelement <16 x i16> [[TMP8]], i16 [[L45]], i32 7 |
| ; SM100-NEXT: [[TMP10:%.*]] = insertelement <16 x i16> [[TMP9]], i16 [[L34]], i32 8 |
| ; SM100-NEXT: [[TMP11:%.*]] = insertelement <16 x i16> [[TMP10]], i16 [[L23]], i32 9 |
| ; SM100-NEXT: [[TMP12:%.*]] = insertelement <16 x i16> [[TMP11]], i16 [[L12]], i32 10 |
| ; SM100-NEXT: [[TMP13:%.*]] = insertelement <16 x i16> [[TMP12]], i16 [[L01]], i32 11 |
| ; SM100-NEXT: [[TMP14:%.*]] = insertelement <16 x i16> [[TMP13]], i16 [[LF16]], i32 12 |
| ; SM100-NEXT: [[TMP15:%.*]] = insertelement <16 x i16> [[TMP14]], i16 [[LE15]], i32 13 |
| ; SM100-NEXT: [[TMP16:%.*]] = insertelement <16 x i16> [[TMP15]], i16 [[LD14]], i32 14 |
| ; SM100-NEXT: [[TMP17:%.*]] = insertelement <16 x i16> [[TMP16]], i16 [[LC13]], i32 15 |
| ; SM100-NEXT: store <16 x i16> [[TMP17]], ptr addrspace(1) [[PTR0]], align 32 |
| ; SM100-NEXT: ret void |
| ; |
| |
| ; One pointer per element: %ptr0..%ptrf address i16 indices 0..15 from %ptr. |
| %ptr0 = getelementptr i16, ptr addrspace(1) %ptr, i64 0 |
| %ptr1 = getelementptr i16, ptr addrspace(1) %ptr, i64 1 |
| %ptr2 = getelementptr i16, ptr addrspace(1) %ptr, i64 2 |
| %ptr3 = getelementptr i16, ptr addrspace(1) %ptr, i64 3 |
| %ptr4 = getelementptr i16, ptr addrspace(1) %ptr, i64 4 |
| %ptr5 = getelementptr i16, ptr addrspace(1) %ptr, i64 5 |
| %ptr6 = getelementptr i16, ptr addrspace(1) %ptr, i64 6 |
| %ptr7 = getelementptr i16, ptr addrspace(1) %ptr, i64 7 |
| %ptr8 = getelementptr i16, ptr addrspace(1) %ptr, i64 8 |
| %ptr9 = getelementptr i16, ptr addrspace(1) %ptr, i64 9 |
| %ptra = getelementptr i16, ptr addrspace(1) %ptr, i64 10 |
| %ptrb = getelementptr i16, ptr addrspace(1) %ptr, i64 11 |
| %ptrc = getelementptr i16, ptr addrspace(1) %ptr, i64 12 |
| %ptrd = getelementptr i16, ptr addrspace(1) %ptr, i64 13 |
| %ptre = getelementptr i16, ptr addrspace(1) %ptr, i64 14 |
| %ptrf = getelementptr i16, ptr addrspace(1) %ptr, i64 15 |
| |
| ; Scalar loads in ascending address order. Only the loads at elements 0 and 8 |
| ; carry an explicit alignment (32 and 16 respectively). |
| %l0 = load i16, ptr addrspace(1) %ptr0, align 32 |
| %l1 = load i16, ptr addrspace(1) %ptr1 |
| %l2 = load i16, ptr addrspace(1) %ptr2 |
| %l3 = load i16, ptr addrspace(1) %ptr3 |
| %l4 = load i16, ptr addrspace(1) %ptr4 |
| %l5 = load i16, ptr addrspace(1) %ptr5 |
| %l6 = load i16, ptr addrspace(1) %ptr6 |
| %l7 = load i16, ptr addrspace(1) %ptr7 |
| %l8 = load i16, ptr addrspace(1) %ptr8, align 16 |
| %l9 = load i16, ptr addrspace(1) %ptr9 |
| %la = load i16, ptr addrspace(1) %ptra |
| %lb = load i16, ptr addrspace(1) %ptrb |
| %lc = load i16, ptr addrspace(1) %ptrc |
| %ld = load i16, ptr addrspace(1) %ptrd |
| %le = load i16, ptr addrspace(1) %ptre |
| %lf = load i16, ptr addrspace(1) %ptrf |
| |
| ; Scalar stores, deliberately not in address order: elements 12-15 are stored |
| ; first (receiving %lf..%lc), then elements 0-11 (receiving %lb..%l0). |
| store i16 %lf, ptr addrspace(1) %ptrc |
| store i16 %le, ptr addrspace(1) %ptrd |
| store i16 %ld, ptr addrspace(1) %ptre |
| store i16 %lc, ptr addrspace(1) %ptrf |
| store i16 %lb, ptr addrspace(1) %ptr0, align 32 |
| store i16 %la, ptr addrspace(1) %ptr1 |
| store i16 %l9, ptr addrspace(1) %ptr2 |
| store i16 %l8, ptr addrspace(1) %ptr3 |
| store i16 %l7, ptr addrspace(1) %ptr4 |
| store i16 %l6, ptr addrspace(1) %ptr5 |
| store i16 %l5, ptr addrspace(1) %ptr6 |
| store i16 %l4, ptr addrspace(1) %ptr7 |
| store i16 %l3, ptr addrspace(1) %ptr8, align 16 |
| store i16 %l2, ptr addrspace(1) %ptr9 |
| store i16 %l1, ptr addrspace(1) %ptra |
| store i16 %l0, ptr addrspace(1) %ptrb |
| |
| ret void |
| } |
| |
| ; int32x8: 8 scalar i32 loads from element indices 0-7 of a global |
| ; (addrspace(1)) pointer, followed by 8 scalar i32 stores that write the |
| ; loaded values back in fully reversed order. Element 0 is accessed with |
| ; align 32 and element 4 with align 16. |
| ; Expected output per the assertions below: the sm_90 run forms two |
| ; <4 x i32> loads and two <4 x i32> stores, while the sm_100 run forms a |
| ; single <8 x i32> load and a single <8 x i32> store. |
| define void @int32x8(ptr addrspace(1) %ptr) { |
| ; SM90-LABEL: define void @int32x8( |
| ; SM90-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { |
| ; SM90-NEXT: [[PTR0:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 0 |
| ; SM90-NEXT: [[PTR4:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 |
| ; SM90-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(1) [[PTR0]], align 32 |
| ; SM90-NEXT: [[L01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 |
| ; SM90-NEXT: [[L12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 |
| ; SM90-NEXT: [[L23:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 |
| ; SM90-NEXT: [[L34:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 |
| ; SM90-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[PTR4]], align 16 |
| ; SM90-NEXT: [[L45:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 |
| ; SM90-NEXT: [[L56:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 |
| ; SM90-NEXT: [[L67:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 |
| ; SM90-NEXT: [[L78:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 |
| ; SM90-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[L78]], i32 0 |
| ; SM90-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[L67]], i32 1 |
| ; SM90-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[L56]], i32 2 |
| ; SM90-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[L45]], i32 3 |
| ; SM90-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(1) [[PTR0]], align 32 |
| ; SM90-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[L34]], i32 0 |
| ; SM90-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[L23]], i32 1 |
| ; SM90-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[L12]], i32 2 |
| ; SM90-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[L01]], i32 3 |
| ; SM90-NEXT: store <4 x i32> [[TMP10]], ptr addrspace(1) [[PTR4]], align 16 |
| ; SM90-NEXT: ret void |
| ; |
| ; SM100-LABEL: define void @int32x8( |
| ; SM100-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { |
| ; SM100-NEXT: [[PTR0:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 0 |
| ; SM100-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr addrspace(1) [[PTR0]], align 32 |
| ; SM100-NEXT: [[L01:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 |
| ; SM100-NEXT: [[L12:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1 |
| ; SM100-NEXT: [[L23:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 |
| ; SM100-NEXT: [[L34:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 |
| ; SM100-NEXT: [[L45:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 |
| ; SM100-NEXT: [[L56:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 |
| ; SM100-NEXT: [[L67:%.*]] = extractelement <8 x i32> [[TMP1]], i32 6 |
| ; SM100-NEXT: [[L78:%.*]] = extractelement <8 x i32> [[TMP1]], i32 7 |
| ; SM100-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[L78]], i32 0 |
| ; SM100-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[L67]], i32 1 |
| ; SM100-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[L56]], i32 2 |
| ; SM100-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[L45]], i32 3 |
| ; SM100-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[L34]], i32 4 |
| ; SM100-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[L23]], i32 5 |
| ; SM100-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[L12]], i32 6 |
| ; SM100-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[L01]], i32 7 |
| ; SM100-NEXT: store <8 x i32> [[TMP9]], ptr addrspace(1) [[PTR0]], align 32 |
| ; SM100-NEXT: ret void |
| ; |
| |
| ; One pointer per element: %ptr0..%ptr7 address i32 indices 0..7 from %ptr. |
| %ptr0 = getelementptr i32, ptr addrspace(1) %ptr, i64 0 |
| %ptr1 = getelementptr i32, ptr addrspace(1) %ptr, i64 1 |
| %ptr2 = getelementptr i32, ptr addrspace(1) %ptr, i64 2 |
| %ptr3 = getelementptr i32, ptr addrspace(1) %ptr, i64 3 |
| %ptr4 = getelementptr i32, ptr addrspace(1) %ptr, i64 4 |
| %ptr5 = getelementptr i32, ptr addrspace(1) %ptr, i64 5 |
| %ptr6 = getelementptr i32, ptr addrspace(1) %ptr, i64 6 |
| %ptr7 = getelementptr i32, ptr addrspace(1) %ptr, i64 7 |
| |
| ; Scalar loads in ascending address order. Only the loads at elements 0 and 4 |
| ; carry an explicit alignment (32 and 16 respectively). |
| %l0 = load i32, ptr addrspace(1) %ptr0, align 32 |
| %l1 = load i32, ptr addrspace(1) %ptr1 |
| %l2 = load i32, ptr addrspace(1) %ptr2 |
| %l3 = load i32, ptr addrspace(1) %ptr3 |
| %l4 = load i32, ptr addrspace(1) %ptr4, align 16 |
| %l5 = load i32, ptr addrspace(1) %ptr5 |
| %l6 = load i32, ptr addrspace(1) %ptr6 |
| %l7 = load i32, ptr addrspace(1) %ptr7 |
| |
| ; Scalar stores in ascending address order; the values are reversed, so |
| ; element i receives the value loaded from element 7 - i. |
| store i32 %l7, ptr addrspace(1) %ptr0, align 32 |
| store i32 %l6, ptr addrspace(1) %ptr1 |
| store i32 %l5, ptr addrspace(1) %ptr2 |
| store i32 %l4, ptr addrspace(1) %ptr3 |
| store i32 %l3, ptr addrspace(1) %ptr4, align 16 |
| store i32 %l2, ptr addrspace(1) %ptr5 |
| store i32 %l1, ptr addrspace(1) %ptr6 |
| store i32 %l0, ptr addrspace(1) %ptr7 |
| |
| ret void |
| } |
| |
| ; int64x4: 4 scalar i64 loads from element indices 0-3 of a global |
| ; (addrspace(1)) pointer, followed by 4 scalar i64 stores that write the |
| ; loaded values back in fully reversed order. Element 0 is accessed with |
| ; align 32 and element 2 with align 16. |
| ; Expected output per the assertions below: the sm_90 run forms two |
| ; <2 x i64> loads and two <2 x i64> stores, while the sm_100 run forms a |
| ; single <4 x i64> load and a single <4 x i64> store. |
| define void @int64x4(ptr addrspace(1) %ptr) { |
| ; SM90-LABEL: define void @int64x4( |
| ; SM90-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { |
| ; SM90-NEXT: [[PTR0:%.*]] = getelementptr i64, ptr addrspace(1) [[PTR]], i64 0 |
| ; SM90-NEXT: [[PTR2:%.*]] = getelementptr i64, ptr addrspace(1) [[PTR]], i64 2 |
| ; SM90-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr addrspace(1) [[PTR0]], align 32 |
| ; SM90-NEXT: [[L01:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 |
| ; SM90-NEXT: [[L12:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 |
| ; SM90-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr addrspace(1) [[PTR2]], align 16 |
| ; SM90-NEXT: [[L23:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 |
| ; SM90-NEXT: [[L34:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 |
| ; SM90-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[L34]], i32 0 |
| ; SM90-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[L23]], i32 1 |
| ; SM90-NEXT: store <2 x i64> [[TMP4]], ptr addrspace(1) [[PTR0]], align 32 |
| ; SM90-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> poison, i64 [[L12]], i32 0 |
| ; SM90-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[L01]], i32 1 |
| ; SM90-NEXT: store <2 x i64> [[TMP6]], ptr addrspace(1) [[PTR2]], align 16 |
| ; SM90-NEXT: ret void |
| ; |
| ; SM100-LABEL: define void @int64x4( |
| ; SM100-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { |
| ; SM100-NEXT: [[PTR0:%.*]] = getelementptr i64, ptr addrspace(1) [[PTR]], i64 0 |
| ; SM100-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr addrspace(1) [[PTR0]], align 32 |
| ; SM100-NEXT: [[L01:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0 |
| ; SM100-NEXT: [[L12:%.*]] = extractelement <4 x i64> [[TMP1]], i32 1 |
| ; SM100-NEXT: [[L23:%.*]] = extractelement <4 x i64> [[TMP1]], i32 2 |
| ; SM100-NEXT: [[L34:%.*]] = extractelement <4 x i64> [[TMP1]], i32 3 |
| ; SM100-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> poison, i64 [[L34]], i32 0 |
| ; SM100-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[L23]], i32 1 |
| ; SM100-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[L12]], i32 2 |
| ; SM100-NEXT: [[TMP5:%.*]] = insertelement <4 x i64> [[TMP4]], i64 [[L01]], i32 3 |
| ; SM100-NEXT: store <4 x i64> [[TMP5]], ptr addrspace(1) [[PTR0]], align 32 |
| ; SM100-NEXT: ret void |
| ; |
| |
| ; One pointer per element: %ptr0..%ptr3 address i64 indices 0..3 from %ptr. |
| %ptr0 = getelementptr i64, ptr addrspace(1) %ptr, i64 0 |
| %ptr1 = getelementptr i64, ptr addrspace(1) %ptr, i64 1 |
| %ptr2 = getelementptr i64, ptr addrspace(1) %ptr, i64 2 |
| %ptr3 = getelementptr i64, ptr addrspace(1) %ptr, i64 3 |
| |
| ; Scalar loads in ascending address order. Only the loads at elements 0 and 2 |
| ; carry an explicit alignment (32 and 16 respectively). |
| %l0 = load i64, ptr addrspace(1) %ptr0, align 32 |
| %l1 = load i64, ptr addrspace(1) %ptr1 |
| %l2 = load i64, ptr addrspace(1) %ptr2, align 16 |
| %l3 = load i64, ptr addrspace(1) %ptr3 |
| |
| ; Scalar stores in ascending address order; the values are reversed, so |
| ; element i receives the value loaded from element 3 - i. |
| store i64 %l3, ptr addrspace(1) %ptr0, align 32 |
| store i64 %l2, ptr addrspace(1) %ptr1 |
| store i64 %l1, ptr addrspace(1) %ptr2, align 16 |
| store i64 %l0, ptr addrspace(1) %ptr3 |
| |
| ret void |
| } |
| |
; @float32x8: eight contiguous float values (256 bits in total) are read
; through a global (addrspace(1)) pointer and written back to the same eight
; slots in reverse order. The assertions below expect:
;   - sm_90 / ptx87 run:  two <4 x float> loads and two <4 x float> stores
;   - sm_100 / ptx88 run: one <8 x float> load and one <8 x float> store
| define void @float32x8(ptr addrspace(1) %ptr) { |
| ; SM90-LABEL: define void @float32x8( |
| ; SM90-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { |
| ; SM90-NEXT: [[PTR0:%.*]] = getelementptr float, ptr addrspace(1) [[PTR]], i64 0 |
| ; SM90-NEXT: [[PTR4:%.*]] = getelementptr float, ptr addrspace(1) [[PTR]], i64 4 |
| ; SM90-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr addrspace(1) [[PTR0]], align 32 |
| ; SM90-NEXT: [[L01:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 |
| ; SM90-NEXT: [[L12:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 |
| ; SM90-NEXT: [[L23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 |
| ; SM90-NEXT: [[L34:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 |
| ; SM90-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr addrspace(1) [[PTR4]], align 16 |
| ; SM90-NEXT: [[L45:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 |
| ; SM90-NEXT: [[L56:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 |
| ; SM90-NEXT: [[L67:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 |
| ; SM90-NEXT: [[L78:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 |
| ; SM90-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[L78]], i32 0 |
| ; SM90-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[L67]], i32 1 |
| ; SM90-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[L56]], i32 2 |
| ; SM90-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[L45]], i32 3 |
| ; SM90-NEXT: store <4 x float> [[TMP6]], ptr addrspace(1) [[PTR0]], align 32 |
| ; SM90-NEXT: [[TMP7:%.*]] = insertelement <4 x float> poison, float [[L34]], i32 0 |
| ; SM90-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[L23]], i32 1 |
| ; SM90-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[L12]], i32 2 |
| ; SM90-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[L01]], i32 3 |
| ; SM90-NEXT: store <4 x float> [[TMP10]], ptr addrspace(1) [[PTR4]], align 16 |
| ; SM90-NEXT: ret void |
| ; |
| ; SM100-LABEL: define void @float32x8( |
| ; SM100-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { |
| ; SM100-NEXT: [[PTR0:%.*]] = getelementptr float, ptr addrspace(1) [[PTR]], i64 0 |
| ; SM100-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr addrspace(1) [[PTR0]], align 32 |
| ; SM100-NEXT: [[L01:%.*]] = extractelement <8 x float> [[TMP1]], i32 0 |
| ; SM100-NEXT: [[L12:%.*]] = extractelement <8 x float> [[TMP1]], i32 1 |
| ; SM100-NEXT: [[L23:%.*]] = extractelement <8 x float> [[TMP1]], i32 2 |
| ; SM100-NEXT: [[L34:%.*]] = extractelement <8 x float> [[TMP1]], i32 3 |
| ; SM100-NEXT: [[L45:%.*]] = extractelement <8 x float> [[TMP1]], i32 4 |
| ; SM100-NEXT: [[L56:%.*]] = extractelement <8 x float> [[TMP1]], i32 5 |
| ; SM100-NEXT: [[L67:%.*]] = extractelement <8 x float> [[TMP1]], i32 6 |
| ; SM100-NEXT: [[L78:%.*]] = extractelement <8 x float> [[TMP1]], i32 7 |
| ; SM100-NEXT: [[TMP2:%.*]] = insertelement <8 x float> poison, float [[L78]], i32 0 |
| ; SM100-NEXT: [[TMP3:%.*]] = insertelement <8 x float> [[TMP2]], float [[L67]], i32 1 |
| ; SM100-NEXT: [[TMP4:%.*]] = insertelement <8 x float> [[TMP3]], float [[L56]], i32 2 |
| ; SM100-NEXT: [[TMP5:%.*]] = insertelement <8 x float> [[TMP4]], float [[L45]], i32 3 |
| ; SM100-NEXT: [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[L34]], i32 4 |
| ; SM100-NEXT: [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[L23]], i32 5 |
| ; SM100-NEXT: [[TMP8:%.*]] = insertelement <8 x float> [[TMP7]], float [[L12]], i32 6 |
| ; SM100-NEXT: [[TMP9:%.*]] = insertelement <8 x float> [[TMP8]], float [[L01]], i32 7 |
| ; SM100-NEXT: store <8 x float> [[TMP9]], ptr addrspace(1) [[PTR0]], align 32 |
| ; SM100-NEXT: ret void |
| ; |
| |
; Addresses of float elements 0..7 relative to %ptr.
| %ptr0 = getelementptr float, ptr addrspace(1) %ptr, i64 0 |
| %ptr1 = getelementptr float, ptr addrspace(1) %ptr, i64 1 |
| %ptr2 = getelementptr float, ptr addrspace(1) %ptr, i64 2 |
| %ptr3 = getelementptr float, ptr addrspace(1) %ptr, i64 3 |
| %ptr4 = getelementptr float, ptr addrspace(1) %ptr, i64 4 |
| %ptr5 = getelementptr float, ptr addrspace(1) %ptr, i64 5 |
| %ptr6 = getelementptr float, ptr addrspace(1) %ptr, i64 6 |
| %ptr7 = getelementptr float, ptr addrspace(1) %ptr, i64 7 |
| |
; Scalar loads in ascending address order. Only element 0 (align 32) and
; element 4 (align 16) carry an explicit alignment; these are the alignments
; that appear on the vector loads in the expected output.
| %l0 = load float, ptr addrspace(1) %ptr0, align 32 |
| %l1 = load float, ptr addrspace(1) %ptr1 |
| %l2 = load float, ptr addrspace(1) %ptr2 |
| %l3 = load float, ptr addrspace(1) %ptr3 |
| %l4 = load float, ptr addrspace(1) %ptr4, align 16 |
| %l5 = load float, ptr addrspace(1) %ptr5 |
| %l6 = load float, ptr addrspace(1) %ptr6 |
| %l7 = load float, ptr addrspace(1) %ptr7 |
| |
; Scalar stores to the same slots with the loaded values reversed
; (%l7 -> slot 0 ... %l0 -> slot 7), using the same explicit alignments.
| store float %l7, ptr addrspace(1) %ptr0, align 32 |
| store float %l6, ptr addrspace(1) %ptr1 |
| store float %l5, ptr addrspace(1) %ptr2 |
| store float %l4, ptr addrspace(1) %ptr3 |
| store float %l3, ptr addrspace(1) %ptr4, align 16 |
| store float %l2, ptr addrspace(1) %ptr5 |
| store float %l1, ptr addrspace(1) %ptr6 |
| store float %l0, ptr addrspace(1) %ptr7 |
| |
| ret void |
| } |
| |
; @float64x4: four contiguous double values (256 bits in total) are read
; through a global (addrspace(1)) pointer and written back to the same four
; slots in reverse order. The assertions below expect:
;   - sm_90 / ptx87 run:  two <2 x double> loads and two <2 x double> stores
;   - sm_100 / ptx88 run: one <4 x double> load and one <4 x double> store
| define void @float64x4(ptr addrspace(1) %ptr) { |
| ; SM90-LABEL: define void @float64x4( |
| ; SM90-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { |
| ; SM90-NEXT: [[PTR0:%.*]] = getelementptr double, ptr addrspace(1) [[PTR]], i64 0 |
| ; SM90-NEXT: [[PTR2:%.*]] = getelementptr double, ptr addrspace(1) [[PTR]], i64 2 |
| ; SM90-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr addrspace(1) [[PTR0]], align 32 |
| ; SM90-NEXT: [[L01:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 |
| ; SM90-NEXT: [[L12:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 |
| ; SM90-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr addrspace(1) [[PTR2]], align 16 |
| ; SM90-NEXT: [[L23:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 |
| ; SM90-NEXT: [[L34:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 |
| ; SM90-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[L34]], i32 0 |
| ; SM90-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[L23]], i32 1 |
| ; SM90-NEXT: store <2 x double> [[TMP4]], ptr addrspace(1) [[PTR0]], align 32 |
| ; SM90-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[L12]], i32 0 |
| ; SM90-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[L01]], i32 1 |
| ; SM90-NEXT: store <2 x double> [[TMP6]], ptr addrspace(1) [[PTR2]], align 16 |
| ; SM90-NEXT: ret void |
| ; |
| ; SM100-LABEL: define void @float64x4( |
| ; SM100-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { |
| ; SM100-NEXT: [[PTR0:%.*]] = getelementptr double, ptr addrspace(1) [[PTR]], i64 0 |
| ; SM100-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr addrspace(1) [[PTR0]], align 32 |
| ; SM100-NEXT: [[L01:%.*]] = extractelement <4 x double> [[TMP1]], i32 0 |
| ; SM100-NEXT: [[L12:%.*]] = extractelement <4 x double> [[TMP1]], i32 1 |
| ; SM100-NEXT: [[L23:%.*]] = extractelement <4 x double> [[TMP1]], i32 2 |
| ; SM100-NEXT: [[L34:%.*]] = extractelement <4 x double> [[TMP1]], i32 3 |
| ; SM100-NEXT: [[TMP2:%.*]] = insertelement <4 x double> poison, double [[L34]], i32 0 |
| ; SM100-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[L23]], i32 1 |
| ; SM100-NEXT: [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[L12]], i32 2 |
| ; SM100-NEXT: [[TMP5:%.*]] = insertelement <4 x double> [[TMP4]], double [[L01]], i32 3 |
| ; SM100-NEXT: store <4 x double> [[TMP5]], ptr addrspace(1) [[PTR0]], align 32 |
| ; SM100-NEXT: ret void |
| ; |
| |
; Addresses of double elements 0..3 relative to %ptr.
| %ptr0 = getelementptr double, ptr addrspace(1) %ptr, i64 0 |
| %ptr1 = getelementptr double, ptr addrspace(1) %ptr, i64 1 |
| %ptr2 = getelementptr double, ptr addrspace(1) %ptr, i64 2 |
| %ptr3 = getelementptr double, ptr addrspace(1) %ptr, i64 3 |
| |
; Scalar loads in ascending address order. Only element 0 (align 32) and
; element 2 (align 16) carry an explicit alignment; these are the alignments
; that appear on the vector loads in the expected output.
| %l0 = load double, ptr addrspace(1) %ptr0, align 32 |
| %l1 = load double, ptr addrspace(1) %ptr1 |
| %l2 = load double, ptr addrspace(1) %ptr2, align 16 |
| %l3 = load double, ptr addrspace(1) %ptr3 |
| |
; Scalar stores to the same slots with the loaded values reversed
; (%l3 -> slot 0 ... %l0 -> slot 3), using the same explicit alignments.
| store double %l3, ptr addrspace(1) %ptr0, align 32 |
| store double %l2, ptr addrspace(1) %ptr1 |
| store double %l1, ptr addrspace(1) %ptr2, align 16 |
| store double %l0, ptr addrspace(1) %ptr3 |
| |
| ret void |
| } |
| |
; @int32x8_non_global: negative case. Eight contiguous i32 values (256 bits in
; total) are read and written back in reverse order, but through a plain `ptr`
; with no address-space qualifier rather than a global (addrspace(1)) pointer.
; The assertions below use the prefix shared by both RUN lines, so both the
; sm_90 / ptx87 and sm_100 / ptx88 runs are expected to produce two <4 x i32>
; loads and two <4 x i32> stores, with no <8 x i32> access.
| define void @int32x8_non_global(ptr %ptr) { |
| ; CHECK-LABEL: define void @int32x8_non_global( |
| ; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { |
| ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i32, ptr [[PTR]], i64 0 |
| ; CHECK-NEXT: [[PTR4:%.*]] = getelementptr i32, ptr [[PTR]], i64 4 |
| ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[PTR0]], align 32 |
| ; CHECK-NEXT: [[L01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 |
| ; CHECK-NEXT: [[L12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 |
| ; CHECK-NEXT: [[L23:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 |
| ; CHECK-NEXT: [[L34:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 |
| ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[PTR4]], align 16 |
| ; CHECK-NEXT: [[L45:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 |
| ; CHECK-NEXT: [[L56:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 |
| ; CHECK-NEXT: [[L67:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 |
| ; CHECK-NEXT: [[L78:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 |
| ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[L78]], i32 0 |
| ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[L67]], i32 1 |
| ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[L56]], i32 2 |
| ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[L45]], i32 3 |
| ; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[PTR0]], align 32 |
| ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[L34]], i32 0 |
| ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[L23]], i32 1 |
| ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[L12]], i32 2 |
| ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[L01]], i32 3 |
| ; CHECK-NEXT: store <4 x i32> [[TMP10]], ptr [[PTR4]], align 16 |
| ; CHECK-NEXT: ret void |
| |
; Addresses of i32 elements 0..7 relative to %ptr.
| %ptr0 = getelementptr i32, ptr %ptr, i64 0 |
| %ptr1 = getelementptr i32, ptr %ptr, i64 1 |
| %ptr2 = getelementptr i32, ptr %ptr, i64 2 |
| %ptr3 = getelementptr i32, ptr %ptr, i64 3 |
| %ptr4 = getelementptr i32, ptr %ptr, i64 4 |
| %ptr5 = getelementptr i32, ptr %ptr, i64 5 |
| %ptr6 = getelementptr i32, ptr %ptr, i64 6 |
| %ptr7 = getelementptr i32, ptr %ptr, i64 7 |
| |
; Scalar loads in ascending address order. Only element 0 (align 32) and
; element 4 (align 16) carry an explicit alignment; these are the alignments
; that appear on the vector loads in the expected output.
| %l0 = load i32, ptr %ptr0, align 32 |
| %l1 = load i32, ptr %ptr1 |
| %l2 = load i32, ptr %ptr2 |
| %l3 = load i32, ptr %ptr3 |
| %l4 = load i32, ptr %ptr4, align 16 |
| %l5 = load i32, ptr %ptr5 |
| %l6 = load i32, ptr %ptr6 |
| %l7 = load i32, ptr %ptr7 |
| |
; Scalar stores to the same slots with the loaded values reversed
; (%l7 -> slot 0 ... %l0 -> slot 7), using the same explicit alignments.
| store i32 %l7, ptr %ptr0, align 32 |
| store i32 %l6, ptr %ptr1 |
| store i32 %l5, ptr %ptr2 |
| store i32 %l4, ptr %ptr3 |
| store i32 %l3, ptr %ptr4, align 16 |
| store i32 %l2, ptr %ptr5 |
| store i32 %l1, ptr %ptr6 |
| store i32 %l0, ptr %ptr7 |
| |
| ret void |
| } |