| ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --prefix-filecheck-ir-name I --version 6 |
| ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX8 %s |
| ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX9 %s |
| ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX9 %s |
| ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX9 %s |
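
; Each function loads adjacent f16 elements from the same base pointer, applies the
; same operation to every element, and stores the results back, so the SLP vectorizer
; can combine the scalar operations into <2 x half> operations where the target's
; cost model considers it profitable.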
| |
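; fadd with a constant operand: vectorized to a single <2 x half> fadd on all tested targets.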
; FIXME: Should not vectorize on gfx8
| define void @fadd_combine_v2f16(ptr addrspace(1) %arg) { |
| ; GCN-LABEL: define void @fadd_combine_v2f16( |
| ; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { |
| ; GCN-NEXT: [[BB:.*:]] |
| ; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() |
| ; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64 |
| ; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]] |
| ; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2 |
| ; GCN-NEXT: [[TMP1:%.*]] = fadd <2 x half> [[TMP0]], splat (half 0xH3C00) |
| ; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2 |
| ; GCN-NEXT: ret void |
| ; |
| bb: |
| %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %tmp1 = zext i32 %tmp to i64 |
| %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 |
| %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 |
| %tmp4 = fadd half %tmp3, 1.000000e+00 |
| store half %tmp4, ptr addrspace(1) %tmp2, align 2 |
| %tmp5 = add nuw nsw i64 %tmp1, 1 |
| %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 |
| %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 |
| %tmp8 = fadd half %tmp7, 1.000000e+00 |
| store half %tmp8, ptr addrspace(1) %tmp6, align 2 |
| ret void |
| } |
| |
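; fsub: vectorized to a <2 x half> fsub on all tested targets.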
| ; FIXME: Should not vectorize on gfx8 |
| define void @fsub_combine_v2f16(ptr addrspace(1) %arg) { |
| ; GCN-LABEL: define void @fsub_combine_v2f16( |
| ; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] { |
| ; GCN-NEXT: [[BB:.*:]] |
| ; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() |
| ; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64 |
| ; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]] |
| ; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2 |
| ; GCN-NEXT: [[TMP1:%.*]] = fsub <2 x half> [[TMP0]], splat (half 0xH3C00) |
| ; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2 |
| ; GCN-NEXT: ret void |
| ; |
| bb: |
| %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %tmp1 = zext i32 %tmp to i64 |
| %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 |
| %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 |
| %tmp4 = fsub half %tmp3, 1.000000e+00 |
| store half %tmp4, ptr addrspace(1) %tmp2, align 2 |
| %tmp5 = add nuw nsw i64 %tmp1, 1 |
| %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 |
| %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 |
| %tmp8 = fsub half %tmp7, 1.000000e+00 |
| store half %tmp8, ptr addrspace(1) %tmp6, align 2 |
| ret void |
| } |
| |
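; fmul: vectorized to a <2 x half> fmul on all tested targets.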
| ; FIXME: Should not vectorize on gfx8 |
| define void @fmul_combine_v2f16(ptr addrspace(1) %arg) { |
| ; GCN-LABEL: define void @fmul_combine_v2f16( |
| ; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] { |
| ; GCN-NEXT: [[BB:.*:]] |
| ; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() |
| ; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64 |
| ; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]] |
| ; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2 |
| ; GCN-NEXT: [[TMP1:%.*]] = fmul <2 x half> [[TMP0]], splat (half 0xH3C00) |
| ; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2 |
| ; GCN-NEXT: ret void |
| ; |
| bb: |
| %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %tmp1 = zext i32 %tmp to i64 |
| %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 |
| %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 |
| %tmp4 = fmul half %tmp3, 1.000000e+00 |
| store half %tmp4, ptr addrspace(1) %tmp2, align 2 |
| %tmp5 = add nuw nsw i64 %tmp1, 1 |
| %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 |
| %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 |
| %tmp8 = fmul half %tmp7, 1.000000e+00 |
| store half %tmp8, ptr addrspace(1) %tmp6, align 2 |
| ret void |
| } |
| |
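; fdiv: vectorized to a <2 x half> fdiv on all tested targets.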
| define void @fdiv_combine_v2f16(ptr addrspace(1) %arg) { |
| ; GCN-LABEL: define void @fdiv_combine_v2f16( |
| ; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] { |
| ; GCN-NEXT: [[BB:.*:]] |
| ; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() |
| ; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64 |
| ; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]] |
| ; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2 |
| ; GCN-NEXT: [[TMP1:%.*]] = fdiv <2 x half> [[TMP0]], splat (half 0xH3C00) |
| ; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2 |
| ; GCN-NEXT: ret void |
| ; |
| bb: |
| %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %tmp1 = zext i32 %tmp to i64 |
| %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 |
| %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 |
| %tmp4 = fdiv half %tmp3, 1.000000e+00 |
| store half %tmp4, ptr addrspace(1) %tmp2, align 2 |
| %tmp5 = add nuw nsw i64 %tmp1, 1 |
| %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 |
| %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 |
| %tmp8 = fdiv half %tmp7, 1.000000e+00 |
| store half %tmp8, ptr addrspace(1) %tmp6, align 2 |
| ret void |
| } |
| |
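; frem: vectorized to a <2 x half> frem on all tested targets.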
| define void @frem_combine_v2f16(ptr addrspace(1) %arg) { |
| ; GCN-LABEL: define void @frem_combine_v2f16( |
| ; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] { |
| ; GCN-NEXT: [[BB:.*:]] |
| ; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() |
| ; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64 |
| ; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]] |
| ; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2 |
| ; GCN-NEXT: [[TMP1:%.*]] = frem <2 x half> [[TMP0]], splat (half 0xH3C00) |
| ; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2 |
| ; GCN-NEXT: ret void |
| ; |
| bb: |
| %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %tmp1 = zext i32 %tmp to i64 |
| %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 |
| %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 |
| %tmp4 = frem half %tmp3, 1.000000e+00 |
| store half %tmp4, ptr addrspace(1) %tmp2, align 2 |
| %tmp5 = add nuw nsw i64 %tmp1, 1 |
| %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 |
| %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 |
| %tmp8 = frem half %tmp7, 1.000000e+00 |
| store half %tmp8, ptr addrspace(1) %tmp6, align 2 |
| ret void |
| } |
| |
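; llvm.fma.f16: vectorized to @llvm.fma.v2f16 on all tested targets.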
| ; FIXME: Should not vectorize on gfx8 |
| define amdgpu_kernel void @fma_combine_v2f16(ptr addrspace(1) %arg) { |
| ; GCN-LABEL: define amdgpu_kernel void @fma_combine_v2f16( |
| ; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] { |
| ; GCN-NEXT: [[BB:.*:]] |
| ; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() |
| ; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64 |
| ; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]] |
| ; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2 |
| ; GCN-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP0]], <2 x half> splat (half 0xH3C00), <2 x half> splat (half 0xH3C00)) |
| ; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2 |
| ; GCN-NEXT: ret void |
| ; |
| bb: |
| %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %tmp1 = zext i32 %tmp to i64 |
| %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 |
| %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 |
| %tmp4 = tail call half @llvm.fma.f16(half %tmp3, half 1.000000e+00, half 1.000000e+00) |
| store half %tmp4, ptr addrspace(1) %tmp2, align 2 |
| %tmp5 = add nuw nsw i64 %tmp1, 1 |
| %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 |
| %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 |
| %tmp8 = tail call half @llvm.fma.f16(half %tmp7, half 1.000000e+00, half 1.000000e+00) |
| store half %tmp8, ptr addrspace(1) %tmp6, align 2 |
| ret void |
| } |
| |
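; llvm.fmuladd.f16: vectorized to @llvm.fmuladd.v2f16 on all tested targets.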
| ; FIXME: Should not vectorize on gfx8 |
| define amdgpu_kernel void @fmuladd_combine_v2f16(ptr addrspace(1) %arg) { |
| ; GCN-LABEL: define amdgpu_kernel void @fmuladd_combine_v2f16( |
| ; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] { |
| ; GCN-NEXT: [[BB:.*:]] |
| ; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() |
| ; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64 |
| ; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]] |
| ; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2 |
| ; GCN-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> [[TMP0]], <2 x half> splat (half 0xH3C00), <2 x half> splat (half 0xH3C00)) |
| ; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2 |
| ; GCN-NEXT: ret void |
| ; |
| bb: |
| %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %tmp1 = zext i32 %tmp to i64 |
| %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 |
| %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 |
| %tmp4 = tail call half @llvm.fmuladd.f16(half %tmp3, half 1.000000e+00, half 1.000000e+00) |
| store half %tmp4, ptr addrspace(1) %tmp2, align 2 |
| %tmp5 = add nuw nsw i64 %tmp1, 1 |
| %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 |
| %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 |
| %tmp8 = tail call half @llvm.fmuladd.f16(half %tmp7, half 1.000000e+00, half 1.000000e+00) |
| store half %tmp8, ptr addrspace(1) %tmp6, align 2 |
| ret void |
| } |
| |
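; llvm.minnum.f16: stays scalar on gfx8, vectorized to @llvm.minnum.v2f16 on the newer tested targets.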
| define void @minnum_combine_v2f16(ptr addrspace(1) %arg) { |
| ; GFX8-LABEL: define void @minnum_combine_v2f16( |
| ; GFX8-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] { |
| ; GFX8-NEXT: [[BB:.*:]] |
| ; GFX8-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() |
| ; GFX8-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64 |
| ; GFX8-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]] |
| ; GFX8-NEXT: [[ITMP3:%.*]] = load half, ptr addrspace(1) [[ITMP2]], align 2 |
| ; GFX8-NEXT: [[ITMP4:%.*]] = call half @llvm.minnum.f16(half [[ITMP3]], half 0xH3C00) |
| ; GFX8-NEXT: store half [[ITMP4]], ptr addrspace(1) [[ITMP2]], align 2 |
| ; GFX8-NEXT: [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1 |
| ; GFX8-NEXT: [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]] |
| ; GFX8-NEXT: [[ITMP7:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2 |
| ; GFX8-NEXT: [[ITMP8:%.*]] = call half @llvm.minnum.f16(half [[ITMP7]], half 0xH3C00) |
| ; GFX8-NEXT: store half [[ITMP8]], ptr addrspace(1) [[ITMP6]], align 2 |
| ; GFX8-NEXT: ret void |
| ; |
| ; GFX9-LABEL: define void @minnum_combine_v2f16( |
| ; GFX9-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] { |
| ; GFX9-NEXT: [[BB:.*:]] |
| ; GFX9-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() |
| ; GFX9-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64 |
| ; GFX9-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]] |
| ; GFX9-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2 |
| ; GFX9-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[TMP0]], <2 x half> splat (half 0xH3C00)) |
| ; GFX9-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2 |
| ; GFX9-NEXT: ret void |
| ; |
| bb: |
| %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %tmp1 = zext i32 %tmp to i64 |
| %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 |
| %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 |
| %tmp4 = call half @llvm.minnum.f16(half %tmp3, half 1.000000e+00) |
| store half %tmp4, ptr addrspace(1) %tmp2, align 2 |
| %tmp5 = add nuw nsw i64 %tmp1, 1 |
| %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 |
| %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 |
| %tmp8 = call half @llvm.minnum.f16(half %tmp7, half 1.000000e+00) |
| store half %tmp8, ptr addrspace(1) %tmp6, align 2 |
| ret void |
| } |
| |
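; llvm.maxnum.f16: stays scalar on gfx8, vectorized to @llvm.maxnum.v2f16 on the newer tested targets.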
| define void @maxnum_combine_v2f16(ptr addrspace(1) %arg) { |
| ; GFX8-LABEL: define void @maxnum_combine_v2f16( |
| ; GFX8-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] { |
| ; GFX8-NEXT: [[BB:.*:]] |
| ; GFX8-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() |
| ; GFX8-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64 |
| ; GFX8-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]] |
| ; GFX8-NEXT: [[ITMP3:%.*]] = load half, ptr addrspace(1) [[ITMP2]], align 2 |
| ; GFX8-NEXT: [[ITMP4:%.*]] = call half @llvm.maxnum.f16(half [[ITMP3]], half 0xH3C00) |
| ; GFX8-NEXT: store half [[ITMP4]], ptr addrspace(1) [[ITMP2]], align 2 |
| ; GFX8-NEXT: [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1 |
| ; GFX8-NEXT: [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]] |
| ; GFX8-NEXT: [[ITMP7:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2 |
| ; GFX8-NEXT: [[ITMP8:%.*]] = call half @llvm.maxnum.f16(half [[ITMP7]], half 0xH3C00) |
| ; GFX8-NEXT: store half [[ITMP8]], ptr addrspace(1) [[ITMP6]], align 2 |
| ; GFX8-NEXT: ret void |
| ; |
| ; GFX9-LABEL: define void @maxnum_combine_v2f16( |
| ; GFX9-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] { |
| ; GFX9-NEXT: [[BB:.*:]] |
| ; GFX9-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() |
| ; GFX9-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64 |
| ; GFX9-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]] |
| ; GFX9-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2 |
| ; GFX9-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[TMP0]], <2 x half> splat (half 0xH3C00)) |
| ; GFX9-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2 |
| ; GFX9-NEXT: ret void |
| ; |
| bb: |
| %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %tmp1 = zext i32 %tmp to i64 |
| %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 |
| %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 |
| %tmp4 = call half @llvm.maxnum.f16(half %tmp3, half 1.000000e+00) |
| store half %tmp4, ptr addrspace(1) %tmp2, align 2 |
| %tmp5 = add nuw nsw i64 %tmp1, 1 |
| %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 |
| %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 |
| %tmp8 = call half @llvm.maxnum.f16(half %tmp7, half 1.000000e+00) |
| store half %tmp8, ptr addrspace(1) %tmp6, align 2 |
| ret void |
| } |
| |
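; llvm.minimum.f16: currently stays scalar on all tested targets.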
| ; FIXME: Should vectorize |
| define void @minimum_combine_v2f16(ptr addrspace(1) %arg) { |
| ; GCN-LABEL: define void @minimum_combine_v2f16( |
| ; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] { |
| ; GCN-NEXT: [[BB:.*:]] |
| ; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() |
| ; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64 |
| ; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]] |
| ; GCN-NEXT: [[ITMP3:%.*]] = load half, ptr addrspace(1) [[ITMP2]], align 2 |
| ; GCN-NEXT: [[ITMP4:%.*]] = call half @llvm.minimum.f16(half [[ITMP3]], half 0xH3C00) |
| ; GCN-NEXT: store half [[ITMP4]], ptr addrspace(1) [[ITMP2]], align 2 |
| ; GCN-NEXT: [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1 |
| ; GCN-NEXT: [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]] |
| ; GCN-NEXT: [[ITMP7:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2 |
| ; GCN-NEXT: [[ITMP8:%.*]] = call half @llvm.minimum.f16(half [[ITMP7]], half 0xH3C00) |
| ; GCN-NEXT: store half [[ITMP8]], ptr addrspace(1) [[ITMP6]], align 2 |
| ; GCN-NEXT: ret void |
| ; |
| bb: |
| %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %tmp1 = zext i32 %tmp to i64 |
| %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 |
| %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 |
| %tmp4 = call half @llvm.minimum.f16(half %tmp3, half 1.000000e+00) |
| store half %tmp4, ptr addrspace(1) %tmp2, align 2 |
| %tmp5 = add nuw nsw i64 %tmp1, 1 |
| %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 |
| %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 |
| %tmp8 = call half @llvm.minimum.f16(half %tmp7, half 1.000000e+00) |
| store half %tmp8, ptr addrspace(1) %tmp6, align 2 |
| ret void |
| } |
| |
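; llvm.maximum.f16: currently stays scalar on all tested targets.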
| define void @maximum_combine_v2f16(ptr addrspace(1) %arg) { |
| ; GCN-LABEL: define void @maximum_combine_v2f16( |
| ; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] { |
| ; GCN-NEXT: [[BB:.*:]] |
| ; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() |
| ; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64 |
| ; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]] |
| ; GCN-NEXT: [[ITMP3:%.*]] = load half, ptr addrspace(1) [[ITMP2]], align 2 |
| ; GCN-NEXT: [[ITMP4:%.*]] = call half @llvm.maximum.f16(half [[ITMP3]], half 0xH3C00) |
| ; GCN-NEXT: store half [[ITMP4]], ptr addrspace(1) [[ITMP2]], align 2 |
| ; GCN-NEXT: [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1 |
| ; GCN-NEXT: [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]] |
| ; GCN-NEXT: [[ITMP7:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2 |
| ; GCN-NEXT: [[ITMP8:%.*]] = call half @llvm.maximum.f16(half [[ITMP7]], half 0xH3C00) |
| ; GCN-NEXT: store half [[ITMP8]], ptr addrspace(1) [[ITMP6]], align 2 |
| ; GCN-NEXT: ret void |
| ; |
| bb: |
| %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %tmp1 = zext i32 %tmp to i64 |
| %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 |
| %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 |
| %tmp4 = call half @llvm.maximum.f16(half %tmp3, half 1.000000e+00) |
| store half %tmp4, ptr addrspace(1) %tmp2, align 2 |
| %tmp5 = add nuw nsw i64 %tmp1, 1 |
| %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 |
| %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 |
| %tmp8 = call half @llvm.maximum.f16(half %tmp7, half 1.000000e+00) |
| store half %tmp8, ptr addrspace(1) %tmp6, align 2 |
| ret void |
| } |
| |
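; llvm.canonicalize.f16: vectorized to @llvm.canonicalize.v2f16 on all tested targets.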
| define void @canonicalize_combine_v2f16(ptr addrspace(1) %arg) { |
| ; GCN-LABEL: define void @canonicalize_combine_v2f16( |
| ; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] { |
| ; GCN-NEXT: [[BB:.*:]] |
| ; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() |
| ; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64 |
| ; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]] |
| ; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2 |
| ; GCN-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> [[TMP0]]) |
| ; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2 |
| ; GCN-NEXT: ret void |
| ; |
| bb: |
| %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %tmp1 = zext i32 %tmp to i64 |
| %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 |
| %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 |
| %tmp4 = call half @llvm.canonicalize.f16(half %tmp3) |
| store half %tmp4, ptr addrspace(1) %tmp2, align 2 |
| %tmp5 = add nuw nsw i64 %tmp1, 1 |
| %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 |
| %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 |
| %tmp8 = call half @llvm.canonicalize.f16(half %tmp7) |
| store half %tmp8, ptr addrspace(1) %tmp6, align 2 |
| ret void |
| } |
| |
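; llvm.fabs.f16: vectorized to @llvm.fabs.v2f16 on all tested targets.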
| define void @fabs_combine_v2f16(ptr addrspace(1) %arg) { |
| ; GCN-LABEL: define void @fabs_combine_v2f16( |
| ; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] { |
| ; GCN-NEXT: [[BB:.*:]] |
| ; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() |
| ; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64 |
| ; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]] |
| ; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2 |
| ; GCN-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.fabs.v2f16(<2 x half> [[TMP0]]) |
| ; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2 |
| ; GCN-NEXT: ret void |
| ; |
| bb: |
| %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %tmp1 = zext i32 %tmp to i64 |
| %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 |
| %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 |
| %tmp4 = call half @llvm.fabs.f16(half %tmp3) |
| store half %tmp4, ptr addrspace(1) %tmp2, align 2 |
| %tmp5 = add nuw nsw i64 %tmp1, 1 |
| %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 |
| %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 |
| %tmp8 = call half @llvm.fabs.f16(half %tmp7) |
| store half %tmp8, ptr addrspace(1) %tmp6, align 2 |
| ret void |
| } |
| |
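; fneg: vectorized to a <2 x half> fneg on all tested targets.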
| define void @fneg_combine_v2f16(ptr addrspace(1) %arg) { |
| ; GCN-LABEL: define void @fneg_combine_v2f16( |
| ; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] { |
| ; GCN-NEXT: [[BB:.*:]] |
| ; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() |
| ; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64 |
| ; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]] |
| ; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2 |
| ; GCN-NEXT: [[TMP1:%.*]] = fneg <2 x half> [[TMP0]] |
| ; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2 |
| ; GCN-NEXT: ret void |
| ; |
| bb: |
| %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %tmp1 = zext i32 %tmp to i64 |
| %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 |
| %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 |
| %tmp4 = fneg half %tmp3 |
| store half %tmp4, ptr addrspace(1) %tmp2, align 2 |
| %tmp5 = add nuw nsw i64 %tmp1, 1 |
| %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 |
| %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 |
| %tmp8 = fneg half %tmp7 |
| store half %tmp8, ptr addrspace(1) %tmp6, align 2 |
| ret void |
| } |
| |
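; llvm.copysign.f16 with a uniform scalar sign operand: the sign is splatted and the
; pair is vectorized to @llvm.copysign.v2f16 on all tested targets.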
| define void @copysign_combine_v2f16(ptr addrspace(1) %arg, half %sign) { |
| ; GCN-LABEL: define void @copysign_combine_v2f16( |
| ; GCN-SAME: ptr addrspace(1) [[ARG:%.*]], half [[SIGN:%.*]]) #[[ATTR0]] { |
| ; GCN-NEXT: [[BB:.*:]] |
| ; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() |
| ; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64 |
| ; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]] |
| ; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2 |
| ; GCN-NEXT: [[TMP1:%.*]] = insertelement <2 x half> poison, half [[SIGN]], i32 0 |
| ; GCN-NEXT: [[TMP2:%.*]] = shufflevector <2 x half> [[TMP1]], <2 x half> poison, <2 x i32> zeroinitializer |
| ; GCN-NEXT: [[TMP3:%.*]] = call <2 x half> @llvm.copysign.v2f16(<2 x half> [[TMP0]], <2 x half> [[TMP2]]) |
| ; GCN-NEXT: store <2 x half> [[TMP3]], ptr addrspace(1) [[ITMP2]], align 2 |
| ; GCN-NEXT: ret void |
| ; |
| bb: |
| %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %tmp1 = zext i32 %tmp to i64 |
| %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 |
| %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 |
| %tmp4 = call half @llvm.copysign.f16(half %tmp3, half %sign) |
| store half %tmp4, ptr addrspace(1) %tmp2, align 2 |
| %tmp5 = add nuw nsw i64 %tmp1, 1 |
| %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 |
| %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 |
| %tmp8 = call half @llvm.copysign.f16(half %tmp7, half %sign) |
| store half %tmp8, ptr addrspace(1) %tmp6, align 2 |
| ret void |
| } |
| |
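; llvm.copysign.f16 over four elements: gfx8 vectorizes only the first pair, the newer
; tested targets vectorize both pairs.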
; FIXME: Should always vectorize
| define void @copysign_combine_v4f16(ptr addrspace(1) %arg, half %sign) { |
| ; GFX8-LABEL: define void @copysign_combine_v4f16( |
| ; GFX8-SAME: ptr addrspace(1) [[ARG:%.*]], half [[SIGN:%.*]]) #[[ATTR0]] { |
| ; GFX8-NEXT: [[BB:.*:]] |
| ; GFX8-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() |
| ; GFX8-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64 |
| ; GFX8-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]] |
| ; GFX8-NEXT: [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1 |
| ; GFX8-NEXT: [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]] |
| ; GFX8-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2 |
| ; GFX8-NEXT: [[TMP1:%.*]] = insertelement <2 x half> poison, half [[SIGN]], i32 0 |
| ; GFX8-NEXT: [[TMP2:%.*]] = shufflevector <2 x half> [[TMP1]], <2 x half> poison, <2 x i32> zeroinitializer |
| ; GFX8-NEXT: [[TMP3:%.*]] = call <2 x half> @llvm.copysign.v2f16(<2 x half> [[TMP0]], <2 x half> [[TMP2]]) |
| ; GFX8-NEXT: store <2 x half> [[TMP3]], ptr addrspace(1) [[ITMP2]], align 2 |
| ; GFX8-NEXT: [[ITMP9:%.*]] = add nuw nsw i64 [[ITMP1]], 2 |
| ; GFX8-NEXT: [[ITMP10:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP9]] |
| ; GFX8-NEXT: [[ITMP11:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2 |
| ; GFX8-NEXT: [[ITMP12:%.*]] = call half @llvm.copysign.f16(half [[ITMP11]], half [[SIGN]]) |
| ; GFX8-NEXT: store half [[ITMP12]], ptr addrspace(1) [[ITMP10]], align 2 |
| ; GFX8-NEXT: [[ITMP13:%.*]] = add nuw nsw i64 [[ITMP1]], 3 |
| ; GFX8-NEXT: [[ITMP14:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP13]] |
| ; GFX8-NEXT: [[ITMP15:%.*]] = load half, ptr addrspace(1) [[ITMP14]], align 2 |
| ; GFX8-NEXT: [[ITMP16:%.*]] = call half @llvm.copysign.f16(half [[ITMP15]], half [[SIGN]]) |
| ; GFX8-NEXT: store half [[ITMP16]], ptr addrspace(1) [[ITMP14]], align 2 |
| ; GFX8-NEXT: ret void |
| ; |
| ; GFX9-LABEL: define void @copysign_combine_v4f16( |
| ; GFX9-SAME: ptr addrspace(1) [[ARG:%.*]], half [[SIGN:%.*]]) #[[ATTR0]] { |
| ; GFX9-NEXT: [[BB:.*:]] |
| ; GFX9-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() |
| ; GFX9-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64 |
| ; GFX9-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]] |
| ; GFX9-NEXT: [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1 |
| ; GFX9-NEXT: [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]] |
| ; GFX9-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2 |
| ; GFX9-NEXT: [[TMP1:%.*]] = insertelement <2 x half> poison, half [[SIGN]], i32 0 |
| ; GFX9-NEXT: [[TMP2:%.*]] = shufflevector <2 x half> [[TMP1]], <2 x half> poison, <2 x i32> zeroinitializer |
| ; GFX9-NEXT: [[TMP3:%.*]] = call <2 x half> @llvm.copysign.v2f16(<2 x half> [[TMP0]], <2 x half> [[TMP2]]) |
| ; GFX9-NEXT: store <2 x half> [[TMP3]], ptr addrspace(1) [[ITMP2]], align 2 |
| ; GFX9-NEXT: [[ITMP9:%.*]] = add nuw nsw i64 [[ITMP1]], 2 |
| ; GFX9-NEXT: [[ITMP10:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP9]] |
| ; GFX9-NEXT: [[ITMP11:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2 |
| ; GFX9-NEXT: [[ITMP13:%.*]] = add nuw nsw i64 [[ITMP1]], 3 |
| ; GFX9-NEXT: [[ITMP14:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP13]] |
| ; GFX9-NEXT: [[ITMP15:%.*]] = load half, ptr addrspace(1) [[ITMP14]], align 2 |
| ; GFX9-NEXT: [[TMP4:%.*]] = insertelement <2 x half> poison, half [[ITMP11]], i32 0 |
| ; GFX9-NEXT: [[TMP5:%.*]] = insertelement <2 x half> [[TMP4]], half [[ITMP15]], i32 1 |
| ; GFX9-NEXT: [[TMP6:%.*]] = call <2 x half> @llvm.copysign.v2f16(<2 x half> [[TMP5]], <2 x half> [[TMP2]]) |
| ; GFX9-NEXT: store <2 x half> [[TMP6]], ptr addrspace(1) [[ITMP10]], align 2 |
| ; GFX9-NEXT: ret void |
| ; |
| bb: |
| %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %tmp1 = zext i32 %tmp to i64 |
| |
| %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 |
| %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 |
| %tmp4 = call half @llvm.copysign.f16(half %tmp3, half %sign) |
| store half %tmp4, ptr addrspace(1) %tmp2, align 2 |
| |
| %tmp5 = add nuw nsw i64 %tmp1, 1 |
| %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 |
| %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 |
| %tmp8 = call half @llvm.copysign.f16(half %tmp7, half %sign) |
| store half %tmp8, ptr addrspace(1) %tmp6, align 2 |
| |
| %tmp9 = add nuw nsw i64 %tmp1, 2 |
| %tmp10 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp9 |
| %tmp11 = load half, ptr addrspace(1) %tmp6, align 2 |
| %tmp12 = call half @llvm.copysign.f16(half %tmp11, half %sign) |
| store half %tmp12, ptr addrspace(1) %tmp10, align 2 |
| |
| %tmp13 = add nuw nsw i64 %tmp1, 3 |
| %tmp14 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp13 |
| %tmp15 = load half, ptr addrspace(1) %tmp14, align 2 |
| %tmp16 = call half @llvm.copysign.f16(half %tmp15, half %sign) |
| store half %tmp16, ptr addrspace(1) %tmp14, align 2 |
| ret void |
| } |
| |
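; llvm.canonicalize.f16 over four elements: gfx8 vectorizes only the first pair, the
; newer tested targets vectorize both pairs.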
| define void @canonicalize_combine_v4f16(ptr addrspace(1) %arg) { |
| ; GFX8-LABEL: define void @canonicalize_combine_v4f16( |
| ; GFX8-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] { |
| ; GFX8-NEXT: [[BB:.*:]] |
| ; GFX8-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() |
| ; GFX8-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64 |
| ; GFX8-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]] |
| ; GFX8-NEXT: [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1 |
| ; GFX8-NEXT: [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]] |
| ; GFX8-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2 |
| ; GFX8-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> [[TMP0]]) |
| ; GFX8-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2 |
| ; GFX8-NEXT: [[ITMP9:%.*]] = add nuw nsw i64 [[ITMP1]], 2 |
| ; GFX8-NEXT: [[ITMP10:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP9]] |
| ; GFX8-NEXT: [[ITMP11:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2 |
| ; GFX8-NEXT: [[ITMP12:%.*]] = call half @llvm.canonicalize.f16(half [[ITMP11]]) |
| ; GFX8-NEXT: store half [[ITMP12]], ptr addrspace(1) [[ITMP10]], align 2 |
| ; GFX8-NEXT: [[ITMP13:%.*]] = add nuw nsw i64 [[ITMP1]], 3 |
| ; GFX8-NEXT: [[ITMP14:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP13]] |
| ; GFX8-NEXT: [[ITMP15:%.*]] = load half, ptr addrspace(1) [[ITMP14]], align 2 |
| ; GFX8-NEXT: [[ITMP16:%.*]] = call half @llvm.canonicalize.f16(half [[ITMP15]]) |
| ; GFX8-NEXT: store half [[ITMP16]], ptr addrspace(1) [[ITMP14]], align 2 |
| ; GFX8-NEXT: ret void |
| ; |
| ; GFX9-LABEL: define void @canonicalize_combine_v4f16( |
| ; GFX9-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] { |
| ; GFX9-NEXT: [[BB:.*:]] |
| ; GFX9-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() |
| ; GFX9-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64 |
| ; GFX9-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]] |
| ; GFX9-NEXT: [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1 |
| ; GFX9-NEXT: [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]] |
| ; GFX9-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2 |
| ; GFX9-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> [[TMP0]]) |
| ; GFX9-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2 |
| ; GFX9-NEXT: [[ITMP9:%.*]] = add nuw nsw i64 [[ITMP1]], 2 |
| ; GFX9-NEXT: [[ITMP10:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP9]] |
| ; GFX9-NEXT: [[ITMP11:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2 |
| ; GFX9-NEXT: [[ITMP13:%.*]] = add nuw nsw i64 [[ITMP1]], 3 |
| ; GFX9-NEXT: [[ITMP14:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP13]] |
| ; GFX9-NEXT: [[ITMP15:%.*]] = load half, ptr addrspace(1) [[ITMP14]], align 2 |
| ; GFX9-NEXT: [[TMP2:%.*]] = insertelement <2 x half> poison, half [[ITMP11]], i32 0 |
| ; GFX9-NEXT: [[TMP3:%.*]] = insertelement <2 x half> [[TMP2]], half [[ITMP15]], i32 1 |
| ; GFX9-NEXT: [[TMP4:%.*]] = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> [[TMP3]]) |
| ; GFX9-NEXT: store <2 x half> [[TMP4]], ptr addrspace(1) [[ITMP10]], align 2 |
| ; GFX9-NEXT: ret void |
| ; |
| bb: |
| %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %tmp1 = zext i32 %tmp to i64 |
| |
| %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 |
| %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 |
| %tmp4 = call half @llvm.canonicalize.f16(half %tmp3) |
| store half %tmp4, ptr addrspace(1) %tmp2, align 2 |
| |
| %tmp5 = add nuw nsw i64 %tmp1, 1 |
| %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 |
| %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 |
| %tmp8 = call half @llvm.canonicalize.f16(half %tmp7) |
| store half %tmp8, ptr addrspace(1) %tmp6, align 2 |
| |
| %tmp9 = add nuw nsw i64 %tmp1, 2 |
| %tmp10 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp9 |
| %tmp11 = load half, ptr addrspace(1) %tmp6, align 2 |
| %tmp12 = call half @llvm.canonicalize.f16(half %tmp11) |
| store half %tmp12, ptr addrspace(1) %tmp10, align 2 |
| |
| %tmp13 = add nuw nsw i64 %tmp1, 3 |
| %tmp14 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp13 |
| %tmp15 = load half, ptr addrspace(1) %tmp14, align 2 |
| %tmp16 = call half @llvm.canonicalize.f16(half %tmp15) |
| store half %tmp16, ptr addrspace(1) %tmp14, align 2 |
| ret void |
| } |
| |
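; llvm.minimumnum.f16: vectorized to @llvm.minimumnum.v2f16 on all tested targets.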
| ; FIXME: Should not vectorize on gfx8 |
| define void @minimumnum_combine_v2f16(ptr addrspace(1) %arg) { |
| ; GCN-LABEL: define void @minimumnum_combine_v2f16( |
| ; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] { |
| ; GCN-NEXT: [[BB:.*:]] |
| ; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() |
| ; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64 |
| ; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]] |
| ; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2 |
| ; GCN-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> [[TMP0]], <2 x half> splat (half 0xH3C00)) |
| ; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2 |
| ; GCN-NEXT: ret void |
| ; |
| bb: |
| %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %tmp1 = zext i32 %tmp to i64 |
| %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 |
| %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 |
| %tmp4 = call half @llvm.minimumnum.f16(half %tmp3, half 1.000000e+00) |
| store half %tmp4, ptr addrspace(1) %tmp2, align 2 |
| %tmp5 = add nuw nsw i64 %tmp1, 1 |
| %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 |
| %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 |
| %tmp8 = call half @llvm.minimumnum.f16(half %tmp7, half 1.000000e+00) |
| store half %tmp8, ptr addrspace(1) %tmp6, align 2 |
| ret void |
| } |
| |
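; llvm.maximumnum.f16: vectorized to @llvm.maximumnum.v2f16 on all tested targets.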
| ; FIXME: Should not vectorize on gfx8 |
| define void @maximumnum_combine_v2f16(ptr addrspace(1) %arg) { |
| ; GCN-LABEL: define void @maximumnum_combine_v2f16( |
| ; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] { |
| ; GCN-NEXT: [[BB:.*:]] |
| ; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() |
| ; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64 |
| ; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]] |
| ; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2 |
| ; GCN-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> [[TMP0]], <2 x half> splat (half 0xH3C00)) |
| ; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2 |
| ; GCN-NEXT: ret void |
| ; |
| bb: |
| %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %tmp1 = zext i32 %tmp to i64 |
| %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 |
| %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 |
| %tmp4 = call half @llvm.maximumnum.f16(half %tmp3, half 1.000000e+00) |
| store half %tmp4, ptr addrspace(1) %tmp2, align 2 |
| %tmp5 = add nuw nsw i64 %tmp1, 1 |
| %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 |
| %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 |
| %tmp8 = call half @llvm.maximumnum.f16(half %tmp7, half 1.000000e+00) |
| store half %tmp8, ptr addrspace(1) %tmp6, align 2 |
| ret void |
| } |