; blob: ee15babdd3896f4a48405b2493ab7c259364da6a [file] [log] [blame] [edit]
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
; RUN: opt < %s -passes=memcpyopt -verify-memoryssa -S | FileCheck %s
; Test that stack-move optimization works when src is a GEP into an alloca.
; For the optimization to trigger:
; - The copy must cover the entire dest alloca (Size == DestSize, DestOffset == 0)
; - SrcOffset must be a multiple of DestAlloca's alignment
; - SrcOffset must be non-negative
; External declarations shared by every test function in this file.
; The memcpy intrinsic is the copy that the tests expect to be removed or kept.
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg)
; Lifetime markers wrapped around the allocas in each test.
declare void @llvm.lifetime.start.p0(ptr nocapture)
declare void @llvm.lifetime.end.p0(ptr nocapture)
; Body-less callee used to create a use of a pointer; its parameter is marked
; nocapture.
declare void @use_nocapture(ptr nocapture)
; Basic test: memcpy from GEP(src) to dest alloca
; src = [16 x i8], dest = [8 x i8] align 8, copy 8 bytes
; SrcOffset(8) is a multiple of DestAlign(8), so optimization applies.
; After optimization: dest uses become src+8
; In the expected output below, %dest, the memcpy and all lifetime markers are
; gone, and the one remaining alloca has align 8 (the input %src had align 4,
; %dest had align 8). Two identical GEPs at offset 8 appear: one feeds the
; store and the first call, the other feeds the call that used %dest.
define void @memcpy_src_gep_to_dest_alloca() {
; CHECK-LABEL: define void @memcpy_src_gep_to_dest_alloca() {
; CHECK-NEXT: [[SRC:%.*]] = alloca [16 x i8], align 8
; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 8
; CHECK-NEXT: [[SRC_GEP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 8
; CHECK-NEXT: store i64 42, ptr [[SRC_GEP1]], align 4
; CHECK-NEXT: call void @use_nocapture(ptr captures(none) [[SRC_GEP1]])
; CHECK-NEXT: call void @use_nocapture(ptr captures(none) [[SRC_GEP]])
; CHECK-NEXT: ret void
;
%src = alloca [16 x i8], align 4
%dest = alloca [8 x i8], align 8
call void @llvm.lifetime.start.p0(ptr %src)
call void @llvm.lifetime.start.p0(ptr %dest)
; Source pointer is %src + 8, i.e. the second half of the 16-byte alloca.
%src.gep = getelementptr inbounds i8, ptr %src, i64 8
store i64 42, ptr %src.gep
call void @use_nocapture(ptr nocapture %src.gep)
; Copies 8 bytes into %dest at offset 0, which is all of the 8-byte %dest.
call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src.gep, i64 8, i1 false)
; This use of %dest is the one expected to be redirected to %src + 8.
call void @use_nocapture(ptr nocapture %dest)
call void @llvm.lifetime.end.p0(ptr %src)
call void @llvm.lifetime.end.p0(ptr %dest)
ret void
}
; Test: memcpy from GEP(src) to dest alloca with different offset
; src = [12 x i8], dest = [8 x i8] align 4, copy 8 bytes from src+4
; SrcOffset(4) is a multiple of DestAlign(4), so optimization applies.
; After optimization: dest uses become src+4
; In the expected output below, only the [12 x i8] alloca (still align 4)
; remains; the memcpy, %dest and the lifetime markers are gone, and the call
; that used %dest takes a GEP at offset 4 instead.
define void @memcpy_src_gep_offset4_to_dest_alloca() {
; CHECK-LABEL: define void @memcpy_src_gep_offset4_to_dest_alloca() {
; CHECK-NEXT: [[SRC:%.*]] = alloca [12 x i8], align 4
; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 4
; CHECK-NEXT: [[SRC_GEP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 4
; CHECK-NEXT: store i64 42, ptr [[SRC_GEP1]], align 4
; CHECK-NEXT: call void @use_nocapture(ptr captures(none) [[SRC_GEP1]])
; CHECK-NEXT: call void @use_nocapture(ptr captures(none) [[SRC_GEP]])
; CHECK-NEXT: ret void
;
%src = alloca [12 x i8], align 4
%dest = alloca [8 x i8], align 4
call void @llvm.lifetime.start.p0(ptr %src)
call void @llvm.lifetime.start.p0(ptr %dest)
; Source pointer is %src + 4, so the copied range is the last 8 of %src's
; 12 bytes.
%src.gep = getelementptr inbounds i8, ptr %src, i64 4
store i64 42, ptr %src.gep
call void @use_nocapture(ptr nocapture %src.gep)
; Copies 8 bytes into %dest at offset 0, which is all of the 8-byte %dest.
call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src.gep, i64 8, i1 false)
; This use of %dest is the one expected to be redirected to %src + 4.
call void @use_nocapture(ptr nocapture %dest)
call void @llvm.lifetime.end.p0(ptr %src)
call void @llvm.lifetime.end.p0(ptr %dest)
ret void
}
; Test: load/store from GEP(src) to dest alloca
; src = [8 x i8], dest = [4 x i8] align 4, load/store 4 bytes from src+4
; SrcOffset(4) is a multiple of DestAlign(4), so optimization applies.
; Same shape as the memcpy tests, but the copy is written as an i32 load
; followed by an i32 store. In the expected output below, that load/store pair,
; %dest and the lifetime markers are gone, and the call that used %dest takes a
; GEP at offset 4 instead.
define void @load_store_src_gep_to_dest_alloca() {
; CHECK-LABEL: define void @load_store_src_gep_to_dest_alloca() {
; CHECK-NEXT: [[SRC:%.*]] = alloca [8 x i8], align 4
; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 4
; CHECK-NEXT: [[SRC_GEP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 4
; CHECK-NEXT: store i32 42, ptr [[SRC_GEP1]], align 4
; CHECK-NEXT: call void @use_nocapture(ptr captures(none) [[SRC_GEP1]])
; CHECK-NEXT: call void @use_nocapture(ptr captures(none) [[SRC_GEP]])
; CHECK-NEXT: ret void
;
%src = alloca [8 x i8], align 4
%dest = alloca [4 x i8], align 4
call void @llvm.lifetime.start.p0(ptr %src)
call void @llvm.lifetime.start.p0(ptr %dest)
; Source pointer is %src + 4, i.e. the second half of the 8-byte alloca.
%src.gep = getelementptr inbounds i8, ptr %src, i64 4
store i32 42, ptr %src.gep
call void @use_nocapture(ptr nocapture %src.gep)
; The copy: a 4-byte load from %src + 4 stored to %dest at offset 0, covering
; all of the 4-byte %dest.
%val = load i32, ptr %src.gep
store i32 %val, ptr %dest
; This use of %dest is the one expected to be redirected to %src + 4.
call void @use_nocapture(ptr nocapture %dest)
call void @llvm.lifetime.end.p0(ptr %src)
call void @llvm.lifetime.end.p0(ptr %dest)
ret void
}
; Test: both src and dest are direct allocas (no offset), same size
; This is the basic stack-move case, included here for completeness.
; In the expected output below, a single [8 x i8] alloca remains with align 8
; (the input %src had align 4, %dest had align 8), no GEP is introduced, and
; both calls take that alloca directly.
define void @memcpy_both_direct_allocas() {
; CHECK-LABEL: define void @memcpy_both_direct_allocas() {
; CHECK-NEXT: [[SRC:%.*]] = alloca [8 x i8], align 8
; CHECK-NEXT: store i64 42, ptr [[SRC]], align 4
; CHECK-NEXT: call void @use_nocapture(ptr captures(none) [[SRC]])
; CHECK-NEXT: call void @use_nocapture(ptr captures(none) [[SRC]])
; CHECK-NEXT: ret void
;
%src = alloca [8 x i8], align 4
%dest = alloca [8 x i8], align 8
call void @llvm.lifetime.start.p0(ptr %src)
call void @llvm.lifetime.start.p0(ptr %dest)
store i64 42, ptr %src
call void @use_nocapture(ptr nocapture %src)
; Whole-object copy: 8 bytes from %src (offset 0) to %dest (offset 0).
call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 4 %src, i64 8, i1 false)
; This use of %dest is the one expected to be redirected to %src.
call void @use_nocapture(ptr nocapture %dest)
call void @llvm.lifetime.end.p0(ptr %src)
call void @llvm.lifetime.end.p0(ptr %dest)
ret void
}
; Negative test: dest has offset (dest is GEP, not direct alloca)
; The optimization requires DestOffset == 0.
; The expected output below reproduces every input instruction, including both
; allocas, the memcpy and the lifetime markers, i.e. nothing is transformed.
; NOTE(review): the copy is also only 8 bytes of the 16-byte %dest, so this
; test may not isolate the DestOffset condition from the Size == DestSize
; condition -- confirm whether that is intended.
define void @no_optimize_dest_has_offset() {
; CHECK-LABEL: define void @no_optimize_dest_has_offset() {
; CHECK-NEXT: [[SRC:%.*]] = alloca [16 x i8], align 4
; CHECK-NEXT: [[DEST:%.*]] = alloca [16 x i8], align 8
; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[SRC]])
; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[DEST]])
; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 8
; CHECK-NEXT: [[DEST_GEP:%.*]] = getelementptr inbounds i8, ptr [[DEST]], i64 8
; CHECK-NEXT: store i64 42, ptr [[SRC_GEP]], align 4
; CHECK-NEXT: call void @use_nocapture(ptr captures(none) [[SRC_GEP]])
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST_GEP]], ptr align 4 [[SRC_GEP]], i64 8, i1 false)
; CHECK-NEXT: call void @use_nocapture(ptr captures(none) [[DEST_GEP]])
; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[SRC]])
; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[DEST]])
; CHECK-NEXT: ret void
;
%src = alloca [16 x i8], align 4
%dest = alloca [16 x i8], align 8
call void @llvm.lifetime.start.p0(ptr %src)
call void @llvm.lifetime.start.p0(ptr %dest)
%src.gep = getelementptr inbounds i8, ptr %src, i64 8
; The destination pointer is %dest + 8 rather than %dest itself.
%dest.gep = getelementptr inbounds i8, ptr %dest, i64 8
store i64 42, ptr %src.gep
call void @use_nocapture(ptr nocapture %src.gep)
; Copies 8 bytes from %src + 8 to %dest + 8.
call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest.gep, ptr align 4 %src.gep, i64 8, i1 false)
call void @use_nocapture(ptr nocapture %dest.gep)
call void @llvm.lifetime.end.p0(ptr %src)
call void @llvm.lifetime.end.p0(ptr %dest)
ret void
}
; Negative test: copy doesn't cover entire dest alloca (Size != DestSize)
; src = [12 x i8], dest = [16 x i8], copy only 8 bytes
; The destination offset is 0 and the source offset (4) equals the dest
; alignment (4), so the copy size is the only listed condition that fails.
; The expected output below reproduces every input instruction unchanged.
define void @no_optimize_partial_dest_copy() {
; CHECK-LABEL: define void @no_optimize_partial_dest_copy() {
; CHECK-NEXT: [[SRC:%.*]] = alloca [12 x i8], align 4
; CHECK-NEXT: [[DEST:%.*]] = alloca [16 x i8], align 4
; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[SRC]])
; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[DEST]])
; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 4
; CHECK-NEXT: store i64 42, ptr [[SRC_GEP]], align 4
; CHECK-NEXT: call void @use_nocapture(ptr captures(none) [[SRC_GEP]])
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC_GEP]], i64 8, i1 false)
; CHECK-NEXT: call void @use_nocapture(ptr captures(none) [[DEST]])
; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[SRC]])
; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[DEST]])
; CHECK-NEXT: ret void
;
%src = alloca [12 x i8], align 4
%dest = alloca [16 x i8], align 4
call void @llvm.lifetime.start.p0(ptr %src)
call void @llvm.lifetime.start.p0(ptr %dest)
%src.gep = getelementptr inbounds i8, ptr %src, i64 4
store i64 42, ptr %src.gep
call void @use_nocapture(ptr nocapture %src.gep)
; Only the first 8 of %dest's 16 bytes are written by this copy.
call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src.gep, i64 8, i1 false)
call void @use_nocapture(ptr nocapture %dest)
call void @llvm.lifetime.end.p0(ptr %src)
call void @llvm.lifetime.end.p0(ptr %dest)
ret void
}
; Negative test: SrcOffset not a multiple of DestAlign
; src = [12 x i8] with offset 4, dest = [8 x i8] align 8
; SrcOffset(4) % DestAlign(8) = 4 != 0 -> rejected
; The copy is 8 bytes into the 8-byte %dest at offset 0, so the offset/alignment
; relationship is the only listed condition that fails.
; The expected output below reproduces every input instruction unchanged.
define void @no_optimize_alignment_mismatch() {
; CHECK-LABEL: define void @no_optimize_alignment_mismatch() {
; CHECK-NEXT: [[SRC:%.*]] = alloca [12 x i8], align 4
; CHECK-NEXT: [[DEST:%.*]] = alloca [8 x i8], align 8
; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[SRC]])
; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[DEST]])
; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 4
; CHECK-NEXT: store i64 42, ptr [[SRC_GEP]], align 4
; CHECK-NEXT: call void @use_nocapture(ptr captures(none) [[SRC_GEP]])
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DEST]], ptr align 4 [[SRC_GEP]], i64 8, i1 false)
; CHECK-NEXT: call void @use_nocapture(ptr captures(none) [[DEST]])
; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[SRC]])
; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[DEST]])
; CHECK-NEXT: ret void
;
%src = alloca [12 x i8], align 4
%dest = alloca [8 x i8], align 8
call void @llvm.lifetime.start.p0(ptr %src)
call void @llvm.lifetime.start.p0(ptr %dest)
; Source pointer is %src + 4, which is not a multiple of %dest's align 8.
%src.gep = getelementptr inbounds i8, ptr %src, i64 4
store i64 42, ptr %src.gep
call void @use_nocapture(ptr nocapture %src.gep)
call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 4 %src.gep, i64 8, i1 false)
call void @use_nocapture(ptr nocapture %dest)
call void @llvm.lifetime.end.p0(ptr %src)
call void @llvm.lifetime.end.p0(ptr %dest)
ret void
}
; Negative test: a store clobbers the source offset between the memcpy that
; reads from the source and the memcpy that writes to the final destination.
; This is a minimal reproducer for the bug in LLVM PR #176436.
; The function performs two 16-byte "moves" through temporaries:
;   1. dst+48 -> %temp1 -> local+48
;   2. local+48 -> %temp2 -> dst+48
; In the expected output below, the first move collapses into a single memcpy
; from dst+48 to local+48, while the second move keeps both of its memcpys and
; the %temp2 alloca.
define void @no_optimize_clobbering_store_to_src_offset(ptr noalias %dst) {
; CHECK-LABEL: define void @no_optimize_clobbering_store_to_src_offset
; CHECK-SAME: (ptr noalias [[DST:%.*]]) {
; CHECK-NEXT: [[TEMP2:%.*]] = alloca { i64, i64 }, align 8
; CHECK-NEXT: [[TEMP1:%.*]] = alloca { i64, i64 }, align 8
; CHECK-NEXT: [[LOCAL:%.*]] = alloca { [48 x i8], { i64, i64 }, ptr }, align 8
; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[TEMP1]])
; CHECK-NEXT: [[DST_BUF:%.*]] = getelementptr i8, ptr [[DST]], i64 48
; CHECK-NEXT: [[LOCAL_BUF:%.*]] = getelementptr inbounds i8, ptr [[LOCAL]], i64 48
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[LOCAL_BUF]], ptr align 8 [[DST_BUF]], i64 16, i1 false)
; CHECK-NEXT: store i8 0, ptr [[DST_BUF]], align 1
; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[TEMP1]])
; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[TEMP2]])
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TEMP2]], ptr align 8 [[LOCAL_BUF]], i64 16, i1 false)
; CHECK-NEXT: store i8 0, ptr [[LOCAL_BUF]], align 1
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DST_BUF]], ptr align 8 [[TEMP2]], i64 16, i1 false)
; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[TEMP2]])
; CHECK-NEXT: ret void
;
%temp2 = alloca { i64, i64 }, align 8
%temp1 = alloca { i64, i64 }, align 8
%local = alloca { [48 x i8], { i64, i64 }, ptr }, align 8
; First move: copy 16 bytes from dst+48 to local+48 via temp1.
call void @llvm.lifetime.start.p0(ptr %temp1)
%dst_buf = getelementptr i8, ptr %dst, i64 48
call void @llvm.memcpy.p0.p0.i64(ptr align 8 %temp1, ptr align 8 %dst_buf, i64 16, i1 false)
; Overwrites the first byte of dst+48 after it has been copied into temp1.
store i8 0, ptr %dst_buf, align 1
%local_buf = getelementptr inbounds i8, ptr %local, i64 48
call void @llvm.memcpy.p0.p0.i64(ptr align 8 %local_buf, ptr align 8 %temp1, i64 16, i1 false)
call void @llvm.lifetime.end.p0(ptr %temp1)
; Second move: copy 16 bytes from local+48 back to dst+48 via temp2.
; BUG: the PR incorrectly eliminated temp2 even though the store below
; clobbers part of local_buf before temp2 is copied out to dst_buf.
; The fix ensures we check the right portion of SrcAlloca for any clobbering.
call void @llvm.lifetime.start.p0(ptr %temp2)
call void @llvm.memcpy.p0.p0.i64(ptr align 8 %temp2, ptr align 8 %local_buf, i64 16, i1 false)
store i8 0, ptr %local_buf, align 1 ; <-- clobbers byte 48 of %local
; temp2 still holds the value local_buf had before the store above; reading
; from local_buf here instead would observe the clobbered byte.
call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst_buf, ptr align 8 %temp2, i64 16, i1 false)
call void @llvm.lifetime.end.p0(ptr %temp2)
ret void
}