[HardwareLoops] Optimisation remarks

This adds the initial plumbing to support optimisation remarks in
the IR hardware-loop pass.

I have left a TODO comment where the reporting can be improved, and will
iterate on that now that this initial support is in.

Differential Revision: https://reviews.llvm.org/D68579

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@374980 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/CodeGen/HardwareLoops.cpp b/lib/CodeGen/HardwareLoops.cpp
index 6a0f98d..968177c 100644
--- a/lib/CodeGen/HardwareLoops.cpp
+++ b/lib/CodeGen/HardwareLoops.cpp
@@ -21,6 +21,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
@@ -75,8 +76,44 @@
 
 STATISTIC(NumHWLoops, "Number of loops converted to hardware loops");
 
+#ifndef NDEBUG
+// Debug-only: print the failure reason, then the instruction or a period.
+static void debugHWLoopFailure(const StringRef DebugMsg, Instruction *I) {
+  auto &OS = dbgs();
+  OS << "HWLoops: " << DebugMsg;
+  if (!I)
+    OS << '.' << '\n';
+  else
+    OS << ' ' << *I << '\n';
+}
+#endif
+
+// Builds an analysis remark prefixed with "hardware-loop not created: ". The
+// code region is I's block when I is given, else the loop header; the debug
+// location is I's when it has one, else the loop's start location.
+static OptimizationRemarkAnalysis
+createHWLoopAnalysis(StringRef RemarkName, Loop *L, Instruction *I) {
+  // Region: the instruction's block if there is one, else the loop header.
+  Value *Region = I ? I->getParent() : L->getHeader();
+
+  // Location: prefer the instruction's, falling back to the loop's.
+  DebugLoc Loc = L->getStartLoc();
+  if (I && I->getDebugLoc())
+    Loc = I->getDebugLoc();
+
+  OptimizationRemarkAnalysis Remark(DEBUG_TYPE, RemarkName, Loc, Region);
+  Remark << "hardware-loop not created: ";
+  return Remark;
+}
+
 namespace {
 
+  void reportHWLoopFailure(const StringRef Msg, const StringRef ORETag,
+      OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I = nullptr) {
+    LLVM_DEBUG(debugHWLoopFailure(Msg, I));
+    ORE->emit(createHWLoopAnalysis(ORETag, TheLoop, I) << Msg);
+  }
+
   using TTI = TargetTransformInfo;
 
   class HardwareLoops : public FunctionPass {
@@ -97,6 +134,7 @@
       AU.addRequired<ScalarEvolutionWrapperPass>();
       AU.addRequired<AssumptionCacheTracker>();
       AU.addRequired<TargetTransformInfoWrapperPass>();
+      AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
     }
 
     // Try to convert the given Loop into a hardware loop.
@@ -110,6 +148,7 @@
     ScalarEvolution *SE = nullptr;
     LoopInfo *LI = nullptr;
     const DataLayout *DL = nullptr;
+    OptimizationRemarkEmitter *ORE = nullptr;
     const TargetTransformInfo *TTI = nullptr;
     DominatorTree *DT = nullptr;
     bool PreserveLCSSA = false;
@@ -143,8 +182,9 @@
 
   public:
     HardwareLoop(HardwareLoopInfo &Info, ScalarEvolution &SE,
-                 const DataLayout &DL) :
-      SE(SE), DL(DL), L(Info.L), M(L->getHeader()->getModule()),
+                 const DataLayout &DL,
+                 OptimizationRemarkEmitter *ORE) :
+      SE(SE), DL(DL), ORE(ORE), L(Info.L), M(L->getHeader()->getModule()),
       ExitCount(Info.ExitCount),
       CountType(Info.CountType),
       ExitBranch(Info.ExitBranch),
@@ -157,6 +197,7 @@
   private:
     ScalarEvolution &SE;
     const DataLayout &DL;
+    OptimizationRemarkEmitter *ORE = nullptr;
     Loop *L                 = nullptr;
     Module *M               = nullptr;
     const SCEV *ExitCount   = nullptr;
@@ -182,6 +223,7 @@
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
   DL = &F.getParent()->getDataLayout();
+  ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
   auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
   LibInfo = TLIP ? &TLIP->getTLI(F) : nullptr;
   PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
@@ -201,31 +243,39 @@
 // converted and the parent loop doesn't support containing a hardware loop.
 bool HardwareLoops::TryConvertLoop(Loop *L) {
   // Process nested loops first.
-  for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
-    if (TryConvertLoop(*I))
+  for (Loop *SubLoop : *L) {
+    if (TryConvertLoop(SubLoop)) {
+      reportHWLoopFailure("nested hardware-loops not supported", "HWLoopNested",
+                          ORE, L);
       return true; // Stop search.
-
-  HardwareLoopInfo HWLoopInfo(L);
-  if (!HWLoopInfo.canAnalyze(*LI))
-    return false;
-
-  if (TTI->isHardwareLoopProfitable(L, *SE, *AC, LibInfo, HWLoopInfo) ||
-      ForceHardwareLoops) {
-
-    // Allow overriding of the counter width and loop decrement value.
-    if (CounterBitWidth.getNumOccurrences())
-      HWLoopInfo.CountType =
-        IntegerType::get(M->getContext(), CounterBitWidth);
-
-    if (LoopDecrement.getNumOccurrences())
-      HWLoopInfo.LoopDecrement =
-        ConstantInt::get(HWLoopInfo.CountType, LoopDecrement);
-
-    MadeChange |= TryConvertLoop(HWLoopInfo);
-    return MadeChange && (!HWLoopInfo.IsNestingLegal && !ForceNestedLoop);
+    }
   }
 
-  return false;
+  HardwareLoopInfo HWLoopInfo(L);
+  if (!HWLoopInfo.canAnalyze(*LI)) {
+    reportHWLoopFailure("cannot analyze loop, irreducible control flow",
+                        "HWLoopCannotAnalyze", ORE, L);
+    return false;
+  }
+
+  // Always query the target, even when forced: HWLoopInfo is passed to the
+  // hook and read below, and the code this replaces evaluated the hook first.
+  if (!TTI->isHardwareLoopProfitable(L, *SE, *AC, LibInfo, HWLoopInfo) &&
+      !ForceHardwareLoops) {
+    reportHWLoopFailure("it's not profitable to create a hardware-loop",
+                        "HWLoopNotProfitable", ORE, L);
+    return false;
+  }
+
+  // Allow overriding of the counter width and loop decrement value.
+  if (CounterBitWidth.getNumOccurrences())
+    HWLoopInfo.CountType = IntegerType::get(M->getContext(), CounterBitWidth);
+  if (LoopDecrement.getNumOccurrences())
+    HWLoopInfo.LoopDecrement =
+      ConstantInt::get(HWLoopInfo.CountType, LoopDecrement);
+
+  MadeChange |= TryConvertLoop(HWLoopInfo);
+  return MadeChange && (!HWLoopInfo.IsNestingLegal && !ForceNestedLoop);
 }
 
 bool HardwareLoops::TryConvertLoop(HardwareLoopInfo &HWLoopInfo) {
@@ -234,8 +284,13 @@
   LLVM_DEBUG(dbgs() << "HWLoops: Try to convert profitable loop: " << *L);
 
   if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT, ForceNestedLoop,
-                                          ForceHardwareLoopPHI))
+                                          ForceHardwareLoopPHI)) {
+    // TODO: there can be many reasons a loop is not considered a
+    // candidate, so we should let isHardwareLoopCandidate fill in the
+    // reason and then report a better message here.
+    reportHWLoopFailure("loop is not a candidate", "HWLoopNoCandidate", ORE, L);
     return false;
+  }
 
   assert(
       (HWLoopInfo.ExitBlock && HWLoopInfo.ExitBranch && HWLoopInfo.ExitCount) &&
@@ -249,7 +304,7 @@
   if (!Preheader)
     return false;
 
-  HardwareLoop HWLoop(HWLoopInfo, *SE, *DL);
+  HardwareLoop HWLoop(HWLoopInfo, *SE, *DL, ORE);
   HWLoop.Create();
   ++NumHWLoops;
   return true;
@@ -257,10 +312,13 @@
 
 void HardwareLoop::Create() {
   LLVM_DEBUG(dbgs() << "HWLoops: Converting loop..\n");
- 
+
   Value *LoopCountInit = InitLoopCount();
-  if (!LoopCountInit)
+  if (!LoopCountInit) {
+    reportHWLoopFailure("could not safely create a loop count expression",
+                        "HWLoopNotSafe", ORE, L);
     return;
+  }
 
   InsertIterationSetup(LoopCountInit);
 
diff --git a/test/CodeGen/ARM/O3-pipeline.ll b/test/CodeGen/ARM/O3-pipeline.ll
index 6cc7e53..cb6a005 100644
--- a/test/CodeGen/ARM/O3-pipeline.ll
+++ b/test/CodeGen/ARM/O3-pipeline.ll
@@ -52,6 +52,9 @@
 ; CHECK-NEXT:      Dominator Tree Construction
 ; CHECK-NEXT:      Natural Loop Information
 ; CHECK-NEXT:      Scalar Evolution Analysis
+; CHECK-NEXT:      Lazy Branch Probability Analysis
+; CHECK-NEXT:      Lazy Block Frequency Analysis
+; CHECK-NEXT:      Optimization Remark Emitter
 ; CHECK-NEXT:      Hardware Loop Insertion
 ; CHECK-NEXT:      Scalar Evolution Analysis
 ; CHECK-NEXT:      Loop Pass Manager
diff --git a/test/Transforms/HardwareLoops/ARM/structure.ll b/test/Transforms/HardwareLoops/ARM/structure.ll
index d413e2b..37af5c3 100644
--- a/test/Transforms/HardwareLoops/ARM/structure.ll
+++ b/test/Transforms/HardwareLoops/ARM/structure.ll
@@ -1,7 +1,8 @@
 ; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -hardware-loops %s -S -o - | FileCheck %s
-; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi %s -o - | FileCheck %s --check-prefix=CHECK-LLC
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi %s -o - -pass-remarks-analysis=hardware-loops  2>&1 | FileCheck %s --check-prefix=CHECK-LLC
 ; RUN: opt -mtriple=thumbv8.1m.main -loop-unroll -unroll-remainder=false -S < %s | llc -mtriple=thumbv8.1m.main | FileCheck %s --check-prefix=CHECK-UNROLL
 
+; CHECK-LLC: remark: <unknown>:0:0: hardware-loop not created: it's not profitable to create a hardware-loop
 ; CHECK-LABEL: early_exit
 ; CHECK-NOT: llvm.set.loop.iterations
 ; CHECK-NOT: llvm.loop.decrement
@@ -46,6 +47,7 @@
 ; CHECK-NOT: [[LOOP_DEC1:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1)
 ; CHECK-NOT: br i1 [[LOOP_DEC1]], label %while.cond1.preheader.us, label %while.end7
 
+; CHECK-LLC: remark: <unknown>:0:0: hardware-loop not created: nested hardware-loops not supported
 ; CHECK-LLC:      nested:
 ; CHECK-LLC-NOT:    mov lr, r1
 ; CHECK-LLC:        dls lr, r1
@@ -176,6 +178,9 @@
   ret void
 }
 
+
+; CHECK-LLC: remark: <unknown>:0:0: hardware-loop not created: loop is not a candidate
+; CHECK-LLC: remark: <unknown>:0:0: hardware-loop not created: nested hardware-loops not supported
 ; CHECK-LABEL: not_rotated
 ; CHECK-NOT: call void @llvm.set.loop.iterations
 ; CHECK-NOT: call i32 @llvm.loop.decrement.i32
diff --git a/test/Transforms/HardwareLoops/unconditional-latch.ll b/test/Transforms/HardwareLoops/unconditional-latch.ll
index 4a3cd98..9d02e1c 100644
--- a/test/Transforms/HardwareLoops/unconditional-latch.ll
+++ b/test/Transforms/HardwareLoops/unconditional-latch.ll
@@ -1,6 +1,12 @@
 ; RUN: opt -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -hardware-loops -S %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ALLOW
 ; RUN: opt -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -hardware-loops -force-hardware-loop-guard=true -S %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ALLOW
-; RUN: opt -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-hardware-loop-phi=true -hardware-loops -S %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LATCH
+;
+; RUN: opt -force-hardware-loops=true -hardware-loop-decrement=1 \
+; RUN:     -hardware-loop-counter-bitwidth=32 -force-hardware-loop-phi=true \
+; RUN:     -hardware-loops -S -pass-remarks-analysis=hardware-loops %s -o - \
+; RUN:     2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-LATCH
+
+; CHECK-LATCH: remark: <unknown>:0:0: hardware-loop not created: loop is not a candidate
 
 ; CHECK-LABEL: not_rotated
 ; CHECK-LATCH-NOT: call void @llvm.set.loop.iterations