// Source path: mlir/test/Dialect/Arith/int-range-opts-bug-119045.mlir (llvm-project)

 // RUN: mlir-opt -int-range-optimizations %s | FileCheck %s

 // Regression test named after issue 119045 (see the file name). The run
 // line pipes the pass output into FileCheck so that the directives below are
 // actually enforced; without the pipe the test would only verify that
 // mlir-opt exits successfully.
 //
 // Note: I wish I had a simpler example than this, but getting rid of a
 // bunch of the arithmetic made the issue go away.
 //
 // Expectations: no `arith.constant true` shows up ahead of the first
 // conditional branch, and all four cf.cond_br ops (outer-loop test,
 // inner-loop test, mask test, thread-0 test) are still present before the
 // return.
 // CHECK-LABEL: @blocks_prematurely_declared_dead_bug
 // CHECK-NOT: arith.constant true
 // CHECK-COUNT-4: cf.cond_br
 // CHECK: return
 func.func @blocks_prematurely_declared_dead_bug(%mem: memref<?xf16>) {
   // Constants shared by the loops and the mask computation.
   %cst = arith.constant dense<false> : vector<1xi1>
   %c1 = arith.constant 1 : index
   %cst_0 = arith.constant dense<0.000000e+00> : vector<1xf16>
   %cst_1 = arith.constant 0.000000e+00 : f16
   %c16 = arith.constant 16 : index
   %c0 = arith.constant 0 : index
   %c64 = arith.constant 64 : index
   // Thread id carries a declared upper bound of 64.
   %thread_id_x = gpu.thread_id x upper_bound 64
   // %6 is an opaque value annotated with the range [16, 112].
   %6 = test.with_bounds { smin = 16 : index, smax = 112 : index, umin = 16 : index, umax = 112 : index } : index
   // %9 = %6 rounded down to a multiple of 16 (divide by 16, multiply by 16).
   %8 = arith.divui %6, %c16 : index
   %9 = arith.muli %8, %c16 : index
   cf.br ^bb1(%c0 : index)
 // Outer loop header: %12 starts at 0 and advances by 64 (in ^bb7) while
 // %12 < %9.
 ^bb1(%12: index):  // 2 preds: ^bb0, ^bb7
   %13 = arith.cmpi slt, %12, %9 : index
   cf.cond_br %13, ^bb2, ^bb8
 // Outer loop body: build the one-element mask %21 for this iteration.
 ^bb2:  // pred: ^bb1
   // %16 = min(%9 - %12, 64) - thread id; the mask bit is set when %16 > 0.
   %14 = arith.subi %9, %12 : index
   %15 = arith.minsi %14, %c64 : index
   %16 = arith.subi %15, %thread_id_x : index
   %17 = vector.constant_mask [1] : vector<1xi1>
   %18 = arith.cmpi sgt, %16, %c0 : index
   %19 = arith.select %18, %17, %cst : vector<1xi1>
   %20 = vector.extract %19[0] : i1 from vector<1xi1>
   %21 = vector.insert %20, %cst [0] : i1 into vector<1xi1>
   // %22 = base load index for this thread in this outer iteration.
   %22 = arith.addi %12, %thread_id_x : index
   cf.br ^bb3(%c0, %cst_0 : index, vector<1xf16>)
 // Inner loop header: %23 starts at 0 and advances by 1 (in ^bb6) while
 // %23 < 1; %24 is the vector accumulated so far.
 ^bb3(%23: index, %24: vector<1xf16>):  // 2 preds: ^bb2, ^bb6
   %25 = arith.cmpi slt, %23, %c1 : index
   cf.cond_br %25, ^bb4, ^bb7
 // Branch on the mask bit at position %23: load only when it is set.
 ^bb4:  // pred: ^bb3
   %26 = vector.extract %21[%23] : i1 from vector<1xi1>
   cf.cond_br %26, ^bb5, ^bb6(%24 : vector<1xf16>)
 // Masked-on path: load %mem[%22 + %23] and insert it into the vector.
 ^bb5:  // pred: ^bb4
   %27 = arith.addi %22, %23 : index
   %28 = memref.load %mem[%27] : memref<?xf16>
   %29 = vector.insert %28, %24[%23] : f16 into vector<1xf16>
   cf.br ^bb6(%29 : vector<1xf16>)
 // Inner loop latch: bump %23 by 1 and carry the (possibly updated) vector.
 ^bb6(%30: vector<1xf16>):  // 2 preds: ^bb4, ^bb5
   %31 = arith.addi %23, %c1 : index
   cf.br ^bb3(%31, %30 : index, vector<1xf16>)
 // Outer loop latch: advance %12 by 64.
 ^bb7:  // pred: ^bb3
   %37 = arith.addi %12, %c64 : index
   cf.br ^bb1(%37 : index)
 // Outer loop exit: only the thread whose id equals 0 performs the store.
 ^bb8:  // pred: ^bb1
   %70 = arith.cmpi eq, %thread_id_x, %c0 : index
   cf.cond_br %70, ^bb9, ^bb10
 // Store 0.0 to %mem[0].
 ^bb9:  // pred: ^bb8
   memref.store %cst_1, %mem[%c0] : memref<?xf16>
   cf.br ^bb10
 ^bb10:  // 2 preds: ^bb8, ^bb9
   return
 }
	// RUN: mlir-opt -int-range-optimizations %s | FileCheck %s

	// NOTE(review): this is a second, whitespace-flattened copy of the
	// @blocks_prematurely_declared_dead_bug test that already appears earlier
	// in this file. Two definitions of the same symbol in one module are
	// expected to be rejected as a redefinition - confirm whether this copy
	// should be dropped.
	//
	// The run line pipes the pass output into FileCheck so that the
	// directives below are actually enforced; without the pipe the test would
	// only verify that mlir-opt exits successfully.
	//
	// Note: I wish I had a simpler example than this, but getting rid of a
	// bunch of the arithmetic made the issue go away.
	//
	// Expectations: no `arith.constant true` shows up ahead of the first
	// conditional branch, and all four cf.cond_br ops are still present
	// before the return.
	// CHECK-LABEL: @blocks_prematurely_declared_dead_bug
	// CHECK-NOT: arith.constant true
	// CHECK-COUNT-4: cf.cond_br
	// CHECK: return
	func.func @blocks_prematurely_declared_dead_bug(%mem: memref<?xf16>) {
	  // Constants shared by the loops and the mask computation.
	  %cst = arith.constant dense<false> : vector<1xi1>
	  %c1 = arith.constant 1 : index
	  %cst_0 = arith.constant dense<0.000000e+00> : vector<1xf16>
	  %cst_1 = arith.constant 0.000000e+00 : f16
	  %c16 = arith.constant 16 : index
	  %c0 = arith.constant 0 : index
	  %c64 = arith.constant 64 : index
	  // Thread id carries a declared upper bound of 64.
	  %thread_id_x = gpu.thread_id x upper_bound 64
	  // %6 is an opaque value annotated with the range [16, 112].
	  %6 = test.with_bounds { smin = 16 : index, smax = 112 : index, umin = 16 : index, umax = 112 : index } : index
	  // %9 = %6 rounded down to a multiple of 16 (divide by 16, multiply by 16).
	  %8 = arith.divui %6, %c16 : index
	  %9 = arith.muli %8, %c16 : index
	  cf.br ^bb1(%c0 : index)
	// Outer loop header: %12 starts at 0 and advances by 64 (in ^bb7) while
	// %12 < %9.
	^bb1(%12: index): // 2 preds: ^bb0, ^bb7
	  %13 = arith.cmpi slt, %12, %9 : index
	  cf.cond_br %13, ^bb2, ^bb8
	// Outer loop body: build the one-element mask %21 for this iteration.
	^bb2: // pred: ^bb1
	  // %16 = min(%9 - %12, 64) - thread id; the mask bit is set when %16 > 0.
	  %14 = arith.subi %9, %12 : index
	  %15 = arith.minsi %14, %c64 : index
	  %16 = arith.subi %15, %thread_id_x : index
	  %17 = vector.constant_mask [1] : vector<1xi1>
	  %18 = arith.cmpi sgt, %16, %c0 : index
	  %19 = arith.select %18, %17, %cst : vector<1xi1>
	  %20 = vector.extract %19[0] : i1 from vector<1xi1>
	  %21 = vector.insert %20, %cst [0] : i1 into vector<1xi1>
	  // %22 = base load index for this thread in this outer iteration.
	  %22 = arith.addi %12, %thread_id_x : index
	  cf.br ^bb3(%c0, %cst_0 : index, vector<1xf16>)
	// Inner loop header: %23 starts at 0 and advances by 1 (in ^bb6) while
	// %23 < 1; %24 is the vector accumulated so far.
	^bb3(%23: index, %24: vector<1xf16>): // 2 preds: ^bb2, ^bb6
	  %25 = arith.cmpi slt, %23, %c1 : index
	  cf.cond_br %25, ^bb4, ^bb7
	// Branch on the mask bit at position %23: load only when it is set.
	^bb4: // pred: ^bb3
	  %26 = vector.extract %21[%23] : i1 from vector<1xi1>
	  cf.cond_br %26, ^bb5, ^bb6(%24 : vector<1xf16>)
	// Masked-on path: load %mem[%22 + %23] and insert it into the vector.
	^bb5: // pred: ^bb4
	  %27 = arith.addi %22, %23 : index
	  %28 = memref.load %mem[%27] : memref<?xf16>
	  %29 = vector.insert %28, %24[%23] : f16 into vector<1xf16>
	  cf.br ^bb6(%29 : vector<1xf16>)
	// Inner loop latch: bump %23 by 1 and carry the (possibly updated) vector.
	^bb6(%30: vector<1xf16>): // 2 preds: ^bb4, ^bb5
	  %31 = arith.addi %23, %c1 : index
	  cf.br ^bb3(%31, %30 : index, vector<1xf16>)
	// Outer loop latch: advance %12 by 64.
	^bb7: // pred: ^bb3
	  %37 = arith.addi %12, %c64 : index
	  cf.br ^bb1(%37 : index)
	// Outer loop exit: only the thread whose id equals 0 performs the store.
	^bb8: // pred: ^bb1
	  %70 = arith.cmpi eq, %thread_id_x, %c0 : index
	  cf.cond_br %70, ^bb9, ^bb10
	// Store 0.0 to %mem[0].
	^bb9: // pred: ^bb8
	  memref.store %cst_1, %mem[%c0] : memref<?xf16>
	  cf.br ^bb10
	^bb10: // 2 preds: ^bb8, ^bb9
	  return
	}