[Polly] Add greedy fusion algorithm.

When the option -polly-loopfusion-greedy is set, the ScheduleOptimizer
aggressively tries to fuse any bands it can, as long as doing so does not
violate any dependences.

As part of the implementation, the functionality for copying a band
into a new schedule was extracted out of the ScheduleTreeRewriter.

GitOrigin-RevId: 64489255be4903dc8683aff8dade315461a0a397
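
For illustration, this is the kind of transformation the new option performs
(mirroring test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll below); the C
functions are a hypothetical source-level sketch, not part of this patch:

    // Two adjacent loops over the same domain; fusion is legal because within
    // each fused iteration the first store still executes before the second.
    void before(int n, double *A) {
      for (int i = 0; i < n; ++i)
        A[i] = 21.0;
      for (int i = 0; i < n; ++i)
        A[i] = 42.0;
    }

    // Roughly what the schedule computed with -polly-loopfusion-greedy expresses.
    void after(int n, double *A) {
      for (int i = 0; i < n; ++i) {
        A[i] = 21.0;
        A[i] = 42.0;
      }
    }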
diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst
index 1dbbc11..3d550e2 100644
--- a/docs/ReleaseNotes.rst
+++ b/docs/ReleaseNotes.rst
@@ -11,3 +11,15 @@
   branch.
 
 - Change ...
+
+ * The command line option -polly-opt-fusion has been removed. What the
+   flag did was frequently misunderstood and it was rarely useful. However,
+   the functionality is still accessible using::
+
+       -polly-isl-arg=--no-schedule-serialize-sccs
+
+ * The command line option -polly-loopfusion-greedy has been added.
+   This will aggressively try to fuse any loops regardless of
+   profitability. This is what users might have expected
+   -polly-opt-fusion=max to do.
diff --git a/include/polly/ScheduleTreeTransform.h b/include/polly/ScheduleTreeTransform.h
index e868531..993245c 100644
--- a/include/polly/ScheduleTreeTransform.h
+++ b/include/polly/ScheduleTreeTransform.h
@@ -240,6 +240,14 @@
                                        llvm::ArrayRef<int> TileSizes,
                                        int DefaultTileSize);
 
+/// Apply greedy fusion. That is, fuse any loops that can be fused, processing
+/// the schedule tree top-down.
+///
+/// @param Sched  Schedule tree to fuse all the loops in.
+/// @param Deps   Validity constraints that must be preserved.
+isl::schedule applyGreedyFusion(isl::schedule Sched,
+                                const isl::union_map &Deps);
+
 } // namespace polly
 
 #endif // POLLY_SCHEDULETREETRANSFORM_H
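
A minimal sketch of how a caller might use the new entry point, mirroring the
hookup in ScheduleOptimizer.cpp further down in this patch; the wrapper
function name is illustrative only:

    #include "polly/DependenceInfo.h"
    #include "polly/ScheduleTreeTransform.h"

    // Fuse all loops of a schedule while respecting the validity dependences
    // (RAW, WAR, WAW) computed by Polly's dependence analysis.
    static isl::schedule fuseAllLoops(isl::schedule Sched,
                                      const polly::Dependences &D) {
      isl::union_map Validity = D.getDependences(
          polly::Dependences::TYPE_RAW | polly::Dependences::TYPE_WAR |
          polly::Dependences::TYPE_WAW);
      return polly::applyGreedyFusion(Sched, Validity);
    }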
diff --git a/include/polly/Support/GICHelper.h b/include/polly/Support/GICHelper.h
index 5c6a325..5a106d7 100644
--- a/include/polly/Support/GICHelper.h
+++ b/include/polly/Support/GICHelper.h
@@ -231,6 +231,12 @@
 ISL_DUMP_OBJECT(val_list)
 //@}
 
+/// Emit the equivalent of the isl_*_dump output into a raw_ostream.
+/// @{
+void dumpIslObj(const isl::schedule_node &Node, llvm::raw_ostream &OS);
+void dumpIslObj(__isl_keep isl_schedule_node *node, llvm::raw_ostream &OS);
+/// @}
+
 inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
                                      __isl_keep isl_union_map *Map) {
   OS << polly::stringFromIslObj(Map, "null");
diff --git a/lib/Support/GICHelper.cpp b/lib/Support/GICHelper.cpp
index 409dbf4..359b170 100644
--- a/lib/Support/GICHelper.cpp
+++ b/lib/Support/GICHelper.cpp
@@ -238,4 +238,25 @@
   polly::dumpIslObj(isl::val());
   polly::dumpIslObj(isl::val_list());
 }
+
+void polly::dumpIslObj(__isl_keep isl_schedule_node *node, raw_ostream &OS) {
+  if (!node)
+    return;
+
+  isl_ctx *ctx = isl_schedule_node_get_ctx(node);
+  isl_printer *p = isl_printer_to_str(ctx);
+  p = isl_printer_set_yaml_style(p, ISL_YAML_STYLE_BLOCK);
+  p = isl_printer_print_schedule_node(p, node);
+
+  char *char_str = isl_printer_get_str(p);
+  OS << char_str;
+
+  free(char_str);
+  isl_printer_free(p);
+}
+
+void polly::dumpIslObj(const isl::schedule_node &Node, raw_ostream &OS) {
+  dumpIslObj(Node.get(), OS);
+}
+
 #endif
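
A small sketch of how the new overloads could be used while debugging, similar
to the LLVM_DEBUG blocks in the fusion rewriters added below; the helper name
is illustrative:

    #include "polly/Support/GICHelper.h"
    #include "llvm/Support/Debug.h"

    #define DEBUG_TYPE "polly-opt-isl"

    // Print a schedule (sub)tree to the debug stream in isl's block YAML style.
    static void debugDumpScheduleNode(const isl::schedule_node &Node) {
      LLVM_DEBUG({
        llvm::dbgs() << "Schedule node:\n";
        polly::dumpIslObj(Node, llvm::dbgs());
      });
    }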
diff --git a/lib/Transform/ScheduleOptimizer.cpp b/lib/Transform/ScheduleOptimizer.cpp
index 932f69e..02d4685 100644
--- a/lib/Transform/ScheduleOptimizer.cpp
+++ b/lib/Transform/ScheduleOptimizer.cpp
@@ -97,6 +97,11 @@
                       cl::desc("Maximize the band depth (yes/no)"), cl::Hidden,
                       cl::init("yes"), cl::ZeroOrMore, cl::cat(PollyCategory));
 
+static cl::opt<bool>
+    GreedyFusion("polly-loopfusion-greedy",
+                 cl::desc("Aggressively try to fuse everything"), cl::Hidden,
+                 cl::ZeroOrMore, cl::cat(PollyCategory));
+
 static cl::opt<std::string> OuterCoincidence(
     "polly-opt-outer-coincidence",
     cl::desc("Try to construct schedules where the outer member of each band "
@@ -835,6 +840,13 @@
   if (Schedule.is_null())
     return false;
 
+  if (GreedyFusion) {
+    isl::union_map Validity = D.getDependences(
+        Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW);
+    Schedule = applyGreedyFusion(Schedule, Validity);
+    assert(!Schedule.is_null());
+  }
+
   // Apply post-rescheduling optimizations (if enabled) and/or prevectorization.
   const OptimizerAdditionalInfoTy OAI = {
       TTI, const_cast<Dependences *>(&D),
diff --git a/lib/Transform/ScheduleTreeTransform.cpp b/lib/Transform/ScheduleTreeTransform.cpp
index ce4e6ae..08b377c 100644
--- a/lib/Transform/ScheduleTreeTransform.cpp
+++ b/lib/Transform/ScheduleTreeTransform.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "polly/ScheduleTreeTransform.h"
+#include "polly/Support/GICHelper.h"
 #include "polly/Support/ISLTools.h"
 #include "polly/Support/ScopHelper.h"
 #include "llvm/ADT/ArrayRef.h"
@@ -20,10 +21,103 @@
 #include "llvm/IR/Metadata.h"
 #include "llvm/Transforms/Utils/UnrollLoop.h"
 
+#define DEBUG_TYPE "polly-opt-isl"
+
 using namespace polly;
 using namespace llvm;
 
 namespace {
+
+/// Copy the band member attributes (coincidence, loop type, isolate ast loop
+/// type) from one band to another.
+static isl::schedule_node_band
+applyBandMemberAttributes(isl::schedule_node_band Target, int TargetIdx,
+                          const isl::schedule_node_band &Source,
+                          int SourceIdx) {
+  bool Coincident = Source.member_get_coincident(SourceIdx).release();
+  Target = Target.member_set_coincident(TargetIdx, Coincident);
+
+  isl_ast_loop_type LoopType =
+      isl_schedule_node_band_member_get_ast_loop_type(Source.get(), SourceIdx);
+  Target = isl::manage(isl_schedule_node_band_member_set_ast_loop_type(
+                           Target.release(), TargetIdx, LoopType))
+               .as<isl::schedule_node_band>();
+
+  isl_ast_loop_type IsolateType =
+      isl_schedule_node_band_member_get_isolate_ast_loop_type(Source.get(),
+                                                              SourceIdx);
+  Target = isl::manage(isl_schedule_node_band_member_set_isolate_ast_loop_type(
+                           Target.release(), TargetIdx, IsolateType))
+               .as<isl::schedule_node_band>();
+
+  return Target;
+}
+
+/// Create a new band by copying members from the old band @p OldBand.
+/// @p IncludeCb decides which of its members are copied into the result.
+template <typename CbTy>
+static isl::schedule rebuildBand(isl::schedule_node_band OldBand,
+                                 isl::schedule Body, CbTy IncludeCb) {
+  int NumBandDims = OldBand.n_member().release();
+
+  bool ExcludeAny = false;
+  bool IncludeAny = false;
+  for (auto OldIdx : seq<int>(0, NumBandDims)) {
+    if (IncludeCb(OldIdx))
+      IncludeAny = true;
+    else
+      ExcludeAny = true;
+  }
+
+  // Instead of creating a zero-member band, don't create a band at all.
+  if (!IncludeAny)
+    return Body;
+
+  isl::multi_union_pw_aff PartialSched = OldBand.get_partial_schedule();
+  isl::multi_union_pw_aff NewPartialSched;
+  if (ExcludeAny) {
+    // Select the included partial scatter functions.
+    isl::union_pw_aff_list List = PartialSched.list();
+    int NewIdx = 0;
+    for (auto OldIdx : seq<int>(0, NumBandDims)) {
+      if (IncludeCb(OldIdx))
+        NewIdx += 1;
+      else
+        List = List.drop(NewIdx, 1);
+    }
+    isl::space ParamSpace = PartialSched.get_space().params();
+    isl::space NewScatterSpace = ParamSpace.add_unnamed_tuple(NewIdx);
+    NewPartialSched = isl::multi_union_pw_aff(NewScatterSpace, List);
+  } else {
+    // Just reuse the original scatter functions instead of copying them.
+    NewPartialSched = PartialSched;
+  }
+
+  // Create the new band node.
+  isl::schedule_node_band NewBand =
+      Body.insert_partial_schedule(NewPartialSched)
+          .get_root()
+          .child(0)
+          .as<isl::schedule_node_band>();
+
+  // If OldBand was permutable, so is the new one, even if some dimensions are
+  // missing.
+  bool IsPermutable = OldBand.permutable().release();
+  NewBand = NewBand.set_permutable(IsPermutable);
+
+  // Reapply member attributes.
+  int NewIdx = 0;
+  for (auto OldIdx : seq<int>(0, NumBandDims)) {
+    if (!IncludeCb(OldIdx))
+      continue;
+    NewBand =
+        applyBandMemberAttributes(std::move(NewBand), NewIdx, OldBand, OldIdx);
+    NewIdx += 1;
+  }
+
+  return NewBand.get_schedule();
+}
+
 /// Recursively visit all nodes of a schedule tree while allowing changes.
 ///
 /// The visit methods return an isl::schedule_node that is used to continue
@@ -75,23 +169,9 @@
   }
 
   isl::schedule visitBand(isl::schedule_node_band Band, Args... args) {
-    isl::multi_union_pw_aff PartialSched =
-        isl::manage(isl_schedule_node_band_get_partial_schedule(Band.get()));
     isl::schedule NewChild =
         getDerived().visit(Band.child(0), std::forward<Args>(args)...);
-    isl::schedule_node NewNode =
-        NewChild.insert_partial_schedule(PartialSched).get_root().child(0);
-
-    // Reapply permutability and coincidence attributes.
-    NewNode = isl::manage(isl_schedule_node_band_set_permutable(
-        NewNode.release(), isl_schedule_node_band_get_permutable(Band.get())));
-    unsigned BandDims = isl_schedule_node_band_n_member(Band.get());
-    for (unsigned i = 0; i < BandDims; i += 1)
-      NewNode = isl::manage(isl_schedule_node_band_member_set_coincident(
-          NewNode.release(), i,
-          isl_schedule_node_band_member_get_coincident(Band.get(), i)));
-
-    return NewNode.get_schedule();
+    return rebuildBand(Band, NewChild, [](int) { return true; });
   }
 
   isl::schedule visitSequence(isl::schedule_node_sequence Sequence,
@@ -159,6 +239,10 @@
   }
 };
 
+/// Rewrite the schedule tree without any changes. Useful to copy a subtree into
+/// a new schedule, discarding everything outside the subtree.
+struct IdentityRewriter : public ScheduleTreeRewriter<IdentityRewriter> {};
+
 /// Rewrite a schedule tree to an equivalent one without extension nodes.
 ///
 /// Each visit method takes two additional arguments:
@@ -265,11 +349,9 @@
     NewNode = isl::manage(isl_schedule_node_band_set_permutable(
         NewNode.release(),
         isl_schedule_node_band_get_permutable(OldNode.get())));
-    for (unsigned i = 0; i < BandDims; i += 1) {
-      NewNode = isl::manage(isl_schedule_node_band_member_set_coincident(
-          NewNode.release(), i,
-          isl_schedule_node_band_member_get_coincident(OldNode.get(), i)));
-    }
+    for (unsigned i = 0; i < BandDims; i += 1)
+      NewNode = applyBandMemberAttributes(NewNode.as<isl::schedule_node_band>(),
+                                          i, OldNode, i);
 
     return NewNode.get_schedule();
   }
@@ -504,6 +586,404 @@
   ExtConstr = ExtConstr.set_coefficient_si(isl::dim::set, Dims - 1, -1);
   return Set.add_constraint(ExtConstr);
 }
+
+/// Collapse perfectly nested bands into a single band.
+class BandCollapseRewriter : public ScheduleTreeRewriter<BandCollapseRewriter> {
+private:
+  using BaseTy = ScheduleTreeRewriter<BandCollapseRewriter>;
+  BaseTy &getBase() { return *this; }
+  const BaseTy &getBase() const { return *this; }
+
+public:
+  isl::schedule visitBand(isl::schedule_node_band RootBand) {
+    isl::schedule_node_band Band = RootBand;
+    isl::ctx Ctx = Band.ctx();
+
+    // Do not merge a permutable band to avoid losing its permutability
+    // property. Even two permutable bands cannot be collapsed: they might be
+    // permutable individually, but not necessarily across each other.
+    if (Band.n_member().release() > 1 && Band.permutable())
+      return getBase().visitBand(Band);
+
+    // Find collapsable bands.
+    SmallVector<isl::schedule_node_band> Nest;
+    int NumTotalLoops = 0;
+    isl::schedule_node Body;
+    while (true) {
+      Nest.push_back(Band);
+      NumTotalLoops += Band.n_member().release();
+      Body = Band.first_child();
+      if (!Body.isa<isl::schedule_node_band>())
+        break;
+      Band = Body.as<isl::schedule_node_band>();
+
+      // Do not include the next band if it is permutable, to avoid losing its
+      // permutability property.
+      if (Band.n_member().release() > 1 && Band.permutable())
+        break;
+    }
+
+    // Nothing to collapse, preserve permutability.
+    if (Nest.size() <= 1)
+      return getBase().visitBand(Band);
+
+    LLVM_DEBUG({
+      dbgs() << "Found loops to collapse between\n";
+      dumpIslObj(RootBand, dbgs());
+      dbgs() << "and\n";
+      dumpIslObj(Body, dbgs());
+      dbgs() << "\n";
+    });
+
+    isl::schedule NewBody = visit(Body);
+
+    // Collect partial schedules from all members.
+    isl::union_pw_aff_list PartScheds{Ctx, NumTotalLoops};
+    for (isl::schedule_node_band Band : Nest) {
+      int NumLoops = Band.n_member().release();
+      isl::multi_union_pw_aff BandScheds = Band.get_partial_schedule();
+      for (auto j : seq<int>(0, NumLoops))
+        PartScheds = PartScheds.add(BandScheds.at(j));
+    }
+    isl::space ScatterSpace = isl::space(Ctx, 0, NumTotalLoops);
+    isl::multi_union_pw_aff PartSchedsMulti{ScatterSpace, PartScheds};
+
+    isl::schedule_node_band CollapsedBand =
+        NewBody.insert_partial_schedule(PartSchedsMulti)
+            .get_root()
+            .first_child()
+            .as<isl::schedule_node_band>();
+
+    // Copy over loop attributes from the original bands.
+    int LoopIdx = 0;
+    for (isl::schedule_node_band Band : Nest) {
+      int NumLoops = Band.n_member().release();
+      for (int i : seq<int>(0, NumLoops)) {
+        CollapsedBand = applyBandMemberAttributes(std::move(CollapsedBand),
+                                                  LoopIdx, Band, i);
+        LoopIdx += 1;
+      }
+    }
+    assert(LoopIdx == NumTotalLoops &&
+           "Expect the same number of loops to add up again");
+
+    return CollapsedBand.get_schedule();
+  }
+};
+
+static isl::schedule collapseBands(isl::schedule Sched) {
+  LLVM_DEBUG(dbgs() << "Collapse bands in schedule\n");
+  BandCollapseRewriter Rewriter;
+  return Rewriter.visit(Sched);
+}
+
+/// Collect sequentially executed bands (or anything else), even if nested in a
+/// mark or other node whose child is executed just once. If the bands can be
+/// fused successfully, the wrapping nodes are allowed to be removed.
+static void collectPotentiallyFusableBands(
+    isl::schedule_node Node,
+    SmallVectorImpl<std::pair<isl::schedule_node, isl::schedule_node>>
+        &ScheduleBands,
+    const isl::schedule_node &DirectChild) {
+  switch (isl_schedule_node_get_type(Node.get())) {
+  case isl_schedule_node_sequence:
+  case isl_schedule_node_set:
+  case isl_schedule_node_mark:
+  case isl_schedule_node_domain:
+  case isl_schedule_node_filter:
+    if (Node.has_children()) {
+      isl::schedule_node C = Node.first_child();
+      while (true) {
+        collectPotentiallyFusableBands(C, ScheduleBands, DirectChild);
+        if (!C.has_next_sibling())
+          break;
+        C = C.next_sibling();
+      }
+    }
+    break;
+
+  default:
+    // Something that does not execute sequentially (e.g. a band)
+    ScheduleBands.push_back({Node, DirectChild});
+    break;
+  }
+}
+
+/// Remove dependencies that are resolved by @p PartSched. That is, remove
+/// everything that we already know is executed in-order.
+static isl::union_map remainingDepsFromPartialSchedule(isl::union_map PartSched,
+                                                       isl::union_map Deps) {
+  int NumDims = getNumScatterDims(PartSched);
+  auto ParamSpace = PartSched.get_space().params();
+
+  // { Scatter[] }
+  isl::space ScatterSpace =
+      ParamSpace.set_from_params().add_dims(isl::dim::set, NumDims);
+
+  // { Scatter[] -> Domain[] }
+  isl::union_map PartSchedRev = PartSched.reverse();
+
+  // { Scatter[] -> Scatter[] }
+  isl::map MaybeBefore = isl::map::lex_le(ScatterSpace);
+
+  // { Domain[] -> Domain[] }
+  isl::union_map DomMaybeBefore =
+      MaybeBefore.apply_domain(PartSchedRev).apply_range(PartSchedRev);
+
+  // { Domain[] -> Domain[] }
+  isl::union_map ChildRemainingDeps = Deps.intersect(DomMaybeBefore);
+
+  return ChildRemainingDeps;
+}
+
+/// Remove dependencies that are resolved by executing them in the order
+/// specified by @p Domains.
+static isl::union_map remainigDepsFromSequence(ArrayRef<isl::union_set> Domains,
+                                               isl::union_map Deps) {
+  isl::ctx Ctx = Deps.ctx();
+  isl::space ParamSpace = Deps.get_space().params();
+
+  // Create a partial schedule mapping to constants that reflect the execution
+  // order.
+  isl::union_map PartialSchedules = isl::union_map::empty(Ctx);
+  for (auto P : enumerate(Domains)) {
+    isl::val ExecTime = isl::val(Ctx, P.index());
+    isl::union_pw_aff DomSched{P.value(), ExecTime};
+    PartialSchedules = PartialSchedules.unite(DomSched.as_union_map());
+  }
+
+  return remainingDepsFromPartialSchedule(PartialSchedules, Deps);
+}
+
+/// Determine whether the outermost loops of two bands can be fused while
+/// respecting validity dependencies.
+static bool canFuseOutermost(const isl::schedule_node_band &LHS,
+                             const isl::schedule_node_band &RHS,
+                             const isl::union_map &Deps) {
+  // { LHSDomain[] -> Scatter[] }
+  isl::union_map LHSPartSched =
+      LHS.get_partial_schedule().get_at(0).as_union_map();
+
+  // { Domain[] -> Scatter[] }
+  isl::union_map RHSPartSched =
+      RHS.get_partial_schedule().get_at(0).as_union_map();
+
+  // Dependencies that are currently resolved because LHS executes before RHS,
+  // but are not necessarily resolved anymore after fusion.
+  // { DefDomain[] -> UseDomain[] }
+  isl::union_map OrderedBySequence =
+      Deps.intersect_domain(LHSPartSched.domain())
+          .intersect_range(RHSPartSched.domain());
+
+  isl::space ParamSpace = OrderedBySequence.get_space().params();
+  isl::space NewScatterSpace = ParamSpace.add_unnamed_tuple(1);
+
+  // { Scatter[] -> Scatter[] }
+  isl::map After = isl::map::lex_gt(NewScatterSpace);
+
+  // After fusion, instances with smaller (or equal, which means they will be
+  // executed in the same iteration, but the LHS instance is still sequenced
+  // before RHS) scatter value will still be executed before. These are the
+  // orderings where this is not necessarily the case.
+  // { LHSDomain[] -> RHSDomain[] }
+  isl::union_map MightBeAfterDoms = After.apply_domain(LHSPartSched.reverse())
+                                        .apply_range(RHSPartSched.reverse());
+
+  // Dependencies that are not resolved by the new execution order.
+  isl::union_map WithBefore = OrderedBySequence.intersect(MightBeAfterDoms);
+
+  return WithBefore.is_empty();
+}
+
+/// Fuse @p LHS and @p RHS if possible while preserving validity dependencies.
+static isl::schedule tryGreedyFuse(isl::schedule_node_band LHS,
+                                   isl::schedule_node_band RHS,
+                                   const isl::union_map &Deps) {
+  if (!canFuseOutermost(LHS, RHS, Deps))
+    return {};
+
+  LLVM_DEBUG({
+    dbgs() << "Found loops for greedy fusion:\n";
+    dumpIslObj(LHS, dbgs());
+    dbgs() << "and\n";
+    dumpIslObj(RHS, dbgs());
+    dbgs() << "\n";
+  });
+
+  // The partial schedules of the bands' outermost loops that we need to combine
+  // for the fusion.
+  isl::union_pw_aff LHSPartOuterSched = LHS.get_partial_schedule().get_at(0);
+  isl::union_pw_aff RHSPartOuterSched = RHS.get_partial_schedule().get_at(0);
+
+  // Isolate band bodies as roots of their own schedule trees.
+  IdentityRewriter Rewriter;
+  isl::schedule LHSBody = Rewriter.visit(LHS.first_child());
+  isl::schedule RHSBody = Rewriter.visit(RHS.first_child());
+
+  // Reconstruct the non-outermost (not going to be fused) loops from both
+  // bands.
+  // TODO: Maybe it is possible to transfer the 'permutability' property from
+  // LHS+RHS. At minimum we would need to merge multiple band members at once,
+  // otherwise permutability has no meaning.
+  isl::schedule LHSNewBody =
+      rebuildBand(LHS, LHSBody, [](int i) { return i > 0; });
+  isl::schedule RHSNewBody =
+      rebuildBand(RHS, RHSBody, [](int i) { return i > 0; });
+
+  // The loop body of the fused loop.
+  isl::schedule NewCommonBody = LHSNewBody.sequence(RHSNewBody);
+
+  // Combine the partial schedules of both loops to a new one. Instances with
+  // the same scatter value are put together.
+  isl::union_map NewCommonPartialSched =
+      LHSPartOuterSched.as_union_map().unite(RHSPartOuterSched.as_union_map());
+  isl::schedule NewCommonSchedule = NewCommonBody.insert_partial_schedule(
+      NewCommonPartialSched.as_multi_union_pw_aff());
+
+  return NewCommonSchedule;
+}
+
+static isl::schedule tryGreedyFuse(isl::schedule_node LHS,
+                                   isl::schedule_node RHS,
+                                   const isl::union_map &Deps) {
+  // TODO: Non-bands could be interpreted as a band with just a single
+  // iteration. However, this is only useful if both ends of a fused loop were
+  // originally loops themselves.
+  if (!LHS.isa<isl::schedule_node_band>())
+    return {};
+  if (!RHS.isa<isl::schedule_node_band>())
+    return {};
+
+  return tryGreedyFuse(LHS.as<isl::schedule_node_band>(),
+                       RHS.as<isl::schedule_node_band>(), Deps);
+}
+
+/// Fuse all fusable loops top-down in a schedule tree.
+///
+/// The isl::union_map parameter is the set of validity dependencies that have
+/// not been resolved/carried by a parent schedule node.
+class GreedyFusionRewriter
+    : public ScheduleTreeRewriter<GreedyFusionRewriter, isl::union_map> {
+private:
+  using BaseTy = ScheduleTreeRewriter<GreedyFusionRewriter, isl::union_map>;
+  BaseTy &getBase() { return *this; }
+  const BaseTy &getBase() const { return *this; }
+
+public:
+  /// Is set to true if anything has been fused.
+  bool AnyChange = false;
+
+  isl::schedule visitBand(isl::schedule_node_band Band, isl::union_map Deps) {
+    int NumLoops = Band.n_member().release();
+
+    // { Domain[] -> Scatter[] }
+    isl::union_map PartSched =
+        isl::union_map::from(Band.get_partial_schedule());
+    assert(getNumScatterDims(PartSched) == NumLoops);
+    isl::space ParamSpace = PartSched.get_space().params();
+
+    // { Scatter[] -> Domain[] }
+    isl::union_map PartSchedRev = PartSched.reverse();
+
+    // Instance pairs that are possibly executed within the same iteration.
+    // Dependencies to larger scatter values are carried by this loop and have
+    // therefore been resolved by the in-order execution of the loop
+    // iterations. A dependency to a smaller scatter value would be a
+    // dependency violation that we assume did not happen.
+    // { Domain[] -> Domain[] }
+    isl::union_map Unsequenced = PartSchedRev.apply_domain(PartSchedRev);
+
+    // Actual dependencies within the same iteration.
+    // { DefDomain[] -> UseDomain[] }
+    isl::union_map RemDeps = Deps.intersect(Unsequenced);
+
+    return getBase().visitBand(Band, RemDeps);
+  }
+
+  isl::schedule visitSequence(isl::schedule_node_sequence Sequence,
+                              isl::union_map Deps) {
+    int NumChildren = isl_schedule_node_n_children(Sequence.get());
+
+    // List of fusion candidates. The first element is the fusion candidate, the
+    // second is the candidate's ancestor that is the sequence's direct child.
+    // It is preferable to reuse the direct child if none of its descendants has
+    // been fused, to preserve its structure such as mark nodes.
+    SmallVector<std::pair<isl::schedule_node, isl::schedule_node>> Bands;
+    for (auto i : seq<int>(0, NumChildren)) {
+      isl::schedule_node Child = Sequence.child(i);
+      collectPotentiallyFusableBands(Child, Bands, Child);
+    }
+
+    // Direct children that had at least one of their descendants fused.
+    SmallDenseSet<isl_schedule_node *, 4> ChangedDirectChildren;
+
+    // Fuse neighboring bands until reaching the end of the candidate list.
+    int i = 0;
+    while (i + 1 < (int)Bands.size()) {
+      isl::schedule Fused =
+          tryGreedyFuse(Bands[i].first, Bands[i + 1].first, Deps);
+      if (Fused.is_null()) {
+        // Cannot merge this node with the next; look at next pair.
+        i += 1;
+        continue;
+      }
+
+      // Mark the direct children as (partially) fused.
+      if (!Bands[i].second.is_null())
+        ChangedDirectChildren.insert(Bands[i].second.get());
+      if (!Bands[i + 1].second.is_null())
+        ChangedDirectChildren.insert(Bands[i + 1].second.get());
+
+      // Collapse the neighbors into a single new candidate that could be fused
+      // with the next candidate.
+      Bands[i] = {Fused.get_root(), {}};
+      Bands.erase(Bands.begin() + i + 1);
+
+      AnyChange = true;
+    }
+
+    // By construction, these are equal to the domains that would result from
+    // collectPotentiallyFusableBands's output.
+    SmallVector<isl::union_set> SubDomains;
+    SubDomains.reserve(NumChildren);
+    for (int i = 0; i < NumChildren; i += 1)
+      SubDomains.push_back(Sequence.child(i).domain());
+    auto SubRemainingDeps = remainigDepsFromSequence(SubDomains, Deps);
+
+    // We may iterate over direct children multiple times, be sure to add each
+    // at most once.
+    SmallDenseSet<isl_schedule_node *, 4> AlreadyAdded;
+
+    isl::schedule Result;
+    for (auto &P : Bands) {
+      isl::schedule_node MaybeFused = P.first;
+      isl::schedule_node DirectChild = P.second;
+
+      // If not modified, use the direct child.
+      if (!DirectChild.is_null() &&
+          !ChangedDirectChildren.count(DirectChild.get())) {
+        if (AlreadyAdded.count(DirectChild.get()))
+          continue;
+        AlreadyAdded.insert(DirectChild.get());
+        MaybeFused = DirectChild;
+      } else {
+        assert(AnyChange &&
+               "Need changed flag to be consistent with the actual change");
+      }
+
+      // Top-down recursion: If the outermost loops have been fused, their nested
+      // bands might be fusable now as well.
+      isl::schedule InnerFused = visit(MaybeFused, SubRemainingDeps);
+
+      // Reconstruct the sequence, with some of the children fused.
+      if (Result.is_null())
+        Result = InnerFused;
+      else
+        Result = Result.sequence(InnerFused);
+    }
+
+    return Result;
+  }
+};
+
 } // namespace
 
 bool polly::isBandMark(const isl::schedule_node &Node) {
@@ -774,3 +1254,19 @@
 
   return Fissioned.get_schedule();
 }
+
+isl::schedule polly::applyGreedyFusion(isl::schedule Sched,
+                                       const isl::union_map &Deps) {
+  LLVM_DEBUG(dbgs() << "Greedy loop fusion\n");
+
+  GreedyFusionRewriter Rewriter;
+  isl::schedule Result = Rewriter.visit(Sched, Deps);
+  if (!Rewriter.AnyChange) {
+    LLVM_DEBUG(dbgs() << "Found nothing to fuse\n");
+    return Sched;
+  }
+
+  // Because the GreedyFusionRewriter works loop-by-loop, bands with multiple
+  // loops may have been split into multiple bands.
+  return collapseBands(Result);
+}
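
To illustrate why the final collapseBands step matters: greedy fusion of two
2-deep loop nests (as in test/ScheduleOptimizer/GreedyFuse/fuse-double.ll
below) proceeds one loop dimension at a time, leaving perfectly nested
single-member bands that the collapse pass merges back into one
two-dimensional band. The C functions are a hypothetical source-level sketch
of the overall effect, not part of this patch:

    // Before: two separate loop nests writing to the same array.
    void before(int n, double A[][1024]) {
      for (int k = 0; k < n; ++k)
        for (int j = 0; j < n; ++j)
          A[k][j] = 21.0;
      for (int k = 0; k < n; ++k)
        for (int j = 0; j < n; ++j)
          A[k][j] = 42.0;
    }

    // After greedy fusion and band collapsing: a single two-dimensional band
    // whose body is a sequence of the two statements (cf. the test's CHECK lines).
    void after(int n, double A[][1024]) {
      for (int k = 0; k < n; ++k)
        for (int j = 0; j < n; ++j) {
          A[k][j] = 21.0;
          A[k][j] = 42.0;
        }
    }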
diff --git a/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll b/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll
new file mode 100644
index 0000000..807abe3
--- /dev/null
+++ b/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll
@@ -0,0 +1,78 @@
+; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s
+
+define void @func(i32 %n, [1024 x double]*  noalias nonnull %A,  [1024 x double]*  noalias nonnull %B) {
+entry:
+  br label %outer.for1
+
+outer.for1:
+  %k1 = phi i32 [0, %entry], [%k1.inc, %outer.inc1]
+  %k1.cmp = icmp slt i32 %k1, %n
+  br i1 %k1.cmp, label %for1, label %outer.exit1
+
+  for1:
+    %j1 = phi i32 [0, %outer.for1], [%j1.inc, %inc1]
+    %j1.cmp = icmp slt i32 %j1, %n
+    br i1 %j1.cmp, label %body1, label %exit1
+
+      body1:
+        %arrayidx1 = getelementptr inbounds [1024 x double], [1024 x double]* %A, i32 %k1, i32 %j1
+        store double 21.0, double* %arrayidx1
+        br label %inc1
+
+  inc1:
+    %j1.inc = add nuw nsw i32 %j1, 1
+    br label %for1
+
+  exit1:
+    br label %outer.inc1
+
+outer.inc1:
+  %k1.inc = add nuw nsw i32 %k1, 1
+  br label %outer.for1
+
+outer.exit1:
+  br label %outer.for2
+
+outer.for2:
+  %k2 = phi i32 [0, %outer.exit1], [%k2.inc, %outer.inc2]
+  %k2.cmp = icmp slt i32 %k2, %n
+  br i1 %k2.cmp, label %for2, label %outer.exit2
+
+  for2:
+    %j2 = phi i32 [0, %outer.for2], [%j2.inc, %inc2]
+    %j2.cmp = icmp slt i32 %j2, %n
+    br i1 %j2.cmp, label %body2, label %exit2
+
+      body2:
+        %arrayidx2 = getelementptr inbounds [1024 x double], [1024 x double]* %A, i32 %k2, i32 %j2
+        store double 42.0, double* %arrayidx2
+        br label %inc2
+
+  inc2:
+    %j2.inc = add nuw nsw i32 %j2, 1
+    br label %for2
+
+  exit2:
+    br label %outer.inc2
+
+outer.inc2:
+  %k2.inc = add nuw nsw i32 %k2, 1
+  br label %outer.for2
+
+outer.exit2:
+  br label %return
+
+return:
+  ret void
+}
+
+
+; CHECK:      Calculated schedule:
+; CHECK-NEXT: domain: "[n] -> { Stmt_body2[i0, i1] : 0 <= i0 < n and 0 <= i1 < n; Stmt_body1[i0, i1] : 0 <= i0 < n and 0 <= i1 < n }"
+; CHECK-NEXT: child:
+; CHECK-NEXT:   schedule: "[n] -> [{ Stmt_body2[i0, i1] -> [(i0)]; Stmt_body1[i0, i1] -> [(i0)] }, { Stmt_body2[i0, i1] -> [(i1)]; Stmt_body1[i0, i1] -> [(i1)] }]"
+; CHECK-NEXT:   child:
+; CHECK-NEXT:     sequence:
+; CHECK-NEXT:     - filter: "[n] -> { Stmt_body1[i0, i1] }"
+; CHECK-NEXT:     - filter: "[n] -> { Stmt_body2[i0, i1] }"
diff --git a/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll b/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll
new file mode 100644
index 0000000..601d381
--- /dev/null
+++ b/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll
@@ -0,0 +1,90 @@
+; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s --check-prefixes=CHECK,RAW
+; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s --check-prefixes=CHECK,OPT
+
+define void @func(i32 %n, double* noalias nonnull %A, double* noalias nonnull %B, i32 %k) {
+entry:
+  br label %for1
+
+
+for1:
+  %j1 = phi i32 [0, %entry], [%j1.inc, %inc1]
+  %j1.cmp = icmp slt i32 %j1, %n
+  br i1 %j1.cmp, label %body1, label %exit1
+
+    body1:
+      %idx1 = add i32 %j1, %k
+      %arrayidx1 = getelementptr inbounds double, double* %B, i32 %idx1
+      store double 21.0, double* %arrayidx1
+      br label %inc1
+
+inc1:
+  %j1.inc = add nuw nsw i32 %j1, 1
+  br label %for1, !llvm.loop !1
+
+exit1:
+  br label %for2
+
+
+for2:
+  %j2 = phi i32 [0, %exit1], [%j2.inc, %inc2]
+  %j2.cmp = icmp slt i32 %j2, %n
+  br i1 %j2.cmp, label %body2, label %exit2
+
+    body2:
+      %arrayidx2 = getelementptr inbounds double, double* %B, i32 %j2
+      store double 42.0, double* %arrayidx2
+      br label %inc2
+
+inc2:
+  %j2.inc = add nuw nsw i32 %j2, 1
+  br label %for2
+
+exit2:
+  br label %for3
+
+
+for3:
+  %j3 = phi i32 [0, %exit2], [%j3.inc, %inc3]
+  %j3.cmp = icmp slt i32 %j3, %n
+  br i1 %j3.cmp, label %body3, label %exit3
+
+    body3:
+      %arrayidx3 = getelementptr inbounds double, double* %A, i32 %j3
+      store double 84.0, double* %arrayidx3
+      br label %inc3
+
+inc3:
+  %j3.inc = add nuw nsw i32 %j3, 1
+  br label %for3
+
+exit3:
+  br label %return
+
+
+return:
+  ret void
+}
+
+
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.id", !"Hello World!"}
+
+
+; CHECK:      Calculated schedule:
+; CHECK-NEXT: domain: "[n, k] -> { Stmt_body2[i0] : 0 <= i0 < n; Stmt_body1[i0] : 0 <= i0 < n; Stmt_body3[i0] : 0 <= i0 < n }"
+; CHECK-NEXT: child:
+; CHECK-NEXT:   sequence:
+; CHECK-NEXT:   - filter: "[n, k] -> { Stmt_body1[i0] }"
+; CHECK-NEXT:     child:
+; RAW-NEXT:         mark: "Loop with Metadata"
+; RAW-NEXT:         child:
+; CHECK-NEXT:         schedule: "[n, k] -> [{ Stmt_body1[i0] -> [(i0)] }]"
+; OPT-NEXT:           permutable: 1
+; OPT-NEXT:           coincident: [ 1 ]
+; CHECK-NEXT:   - filter: "[n, k] -> { Stmt_body2[i0]; Stmt_body3[i0] }"
+; CHECK-NEXT:     child:
+; CHECK-NEXT:       schedule: "[n, k] -> [{ Stmt_body2[i0] -> [(i0)]; Stmt_body3[i0] -> [(i0)] }]"
+; CHECK-NEXT:       child:
+; CHECK-NEXT:         sequence:
+; CHECK-NEXT:         - filter: "[n, k] -> { Stmt_body2[i0] }"
+; CHECK-NEXT:         - filter: "[n, k] -> { Stmt_body3[i0] }"
diff --git a/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll b/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll
new file mode 100644
index 0000000..eeb20e6
--- /dev/null
+++ b/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll
@@ -0,0 +1,88 @@
+; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s --check-prefixes=CHECK,RAW
+; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s --check-prefixes=CHECK
+
+define void @func(i32 %n, double* noalias nonnull %A, double* noalias nonnull %B, i32 %k) {
+entry:
+  br label %for1
+
+
+for1:
+  %j1 = phi i32 [0, %entry], [%j1.inc, %inc1]
+  %j1.cmp = icmp slt i32 %j1, %n
+  br i1 %j1.cmp, label %body1, label %exit1
+
+    body1:
+      %arrayidx1 = getelementptr inbounds double, double* %A, i32 %j1
+      store double 21.0, double* %arrayidx1
+      br label %inc1
+
+inc1:
+  %j1.inc = add nuw nsw i32 %j1, 1
+  br label %for1
+
+exit1:
+  br label %for2
+
+
+for2:
+  %j2 = phi i32 [0, %exit1], [%j2.inc, %inc2]
+  %j2.cmp = icmp slt i32 %j2, %n
+  br i1 %j2.cmp, label %body2, label %exit2
+
+    body2:
+      %arrayidx2 = getelementptr inbounds double, double* %B, i32 %j1
+      store double 42.0, double* %arrayidx2
+      br label %inc2
+
+inc2:
+  %j2.inc = add nuw nsw i32 %j2, 1
+  br label %for2
+
+exit2:
+  br label %for3
+
+
+for3:
+  %j3 = phi i32 [0, %exit2], [%j3.inc, %inc3]
+  %j3.cmp = icmp slt i32 %j3, %n
+  br i1 %j3.cmp, label %body3, label %exit3
+
+    body3:
+      %idx3 = add i32 %j3, %k
+      %arrayidx3 = getelementptr inbounds double, double* %B, i32 %idx3
+      store double 84.0, double* %arrayidx3
+      br label %inc3
+
+inc3:
+  %j3.inc = add nuw nsw i32 %j3, 1
+  br label %for3,  !llvm.loop !1
+
+exit3:
+  br label %return
+
+
+return:
+  ret void
+}
+
+
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.id", !"Hello World!"}
+
+
+; CHECK:      Calculated schedule:
+; CHECK-NEXT: domain: "[n, k] -> { Stmt_body2[i0] : 0 <= i0 < n; Stmt_body1[i0] : 0 <= i0 < n; Stmt_body3[i0] : 0 <= i0 < n }"
+; CHECK-NEXT: child:
+; CHECK-NEXT:   sequence:
+; CHECK-NEXT:   - filter: "[n, k] -> { Stmt_body2[i0]; Stmt_body1[i0] }"
+; CHECK-NEXT:     child:
+; CHECK-NEXT:       schedule: "[n, k] -> [{ Stmt_body2[i0] -> [(i0)]; Stmt_body1[i0] -> [(i0)] }]"
+; CHECK-NEXT:       child:
+; CHECK-NEXT:         sequence:
+; CHECK-NEXT:         - filter: "[n, k] -> { Stmt_body1[i0] }"
+; CHECK-NEXT:         - filter: "[n, k] -> { Stmt_body2[i0] }"
+; CHECK-NEXT:   - filter: "[n, k] -> { Stmt_body3[i0] }"
+; CHECK-NEXT:     child:
+; RAW-NEXT:       mark: "Loop with Metadata"
+; RAW-NEXT:       child:
+; CHECK-NEXT:         schedule: "[n, k] -> [{ Stmt_body3[i0] -> [(i0)] }]"
diff --git a/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll b/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll
new file mode 100644
index 0000000..387c77c
--- /dev/null
+++ b/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll
@@ -0,0 +1,69 @@
+; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s --check-prefixes=CHECK,RAW
+; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s --check-prefixes=CHECK,OPT
+
+define void @func(i32 %n, double* noalias nonnull %A) {
+entry:
+  br label %outer.for
+
+outer.for:
+  %k = phi i32 [0, %entry], [%k.inc, %outer.inc]
+  %k.cmp = icmp slt i32 %k, %n
+  br i1 %k.cmp, label %for1, label %outer.exit
+
+  for1:
+    %j1 = phi i32 [0, %outer.for], [%j1.inc, %inc1]
+    %j1.cmp = icmp slt i32 %j1, %n
+    br i1 %j1.cmp, label %body1, label %exit1
+
+      body1:
+        %arrayidx1 = getelementptr inbounds double, double* %A, i32 %j1
+        store double 21.0, double* %arrayidx1
+        br label %inc1
+
+  inc1:
+    %j1.inc = add nuw nsw i32 %j1, 1
+    br label %for1
+
+  exit1:
+    br label %for2
+
+  for2:
+    %j2 = phi i32 [0, %exit1], [%j2.inc, %inc2]
+    %j2.cmp = icmp slt i32 %j2, %n
+    br i1 %j2.cmp, label %body2, label %exit2
+
+      body2:
+        %arrayidx2 = getelementptr inbounds double, double* %A, i32 %j2
+        store double 42.0, double* %arrayidx2
+        br label %inc2
+
+  inc2:
+    %j2.inc = add nuw nsw i32 %j2, 1
+    br label %for2
+
+  exit2:
+    br label %outer.inc
+
+outer.inc:
+  %k.inc = add nuw nsw i32 %k, 1
+  br label %outer.for
+
+outer.exit:
+    br label %return
+
+return:
+  ret void
+}
+
+
+; CHECK:      Calculated schedule:
+; CHECK-NEXT: domain: "[n] -> { Stmt_body2[i0, i1] : 0 <= i0 < n and 0 <= i1 < n; Stmt_body1[i0, i1] : 0 <= i0 < n and 0 <= i1 < n }"
+; CHECK-NEXT: child:
+; RAW-NEXT:     schedule: "[n] -> [{ Stmt_body2[i0, i1] -> [(i0)]; Stmt_body1[i0, i1] -> [(i0)] }, { Stmt_body2[i0, i1] -> [(i1)]; Stmt_body1[i0, i1] -> [(i1)] }]"
+; OPT-NEXT:     schedule: "[n] -> [{ Stmt_body2[i0, i1] -> [(i1)]; Stmt_body1[i0, i1] -> [(i1)] }, { Stmt_body2[i0, i1] -> [(i0)]; Stmt_body1[i0, i1] -> [(i0)] }]"
+; OPT-NEXT:     permutable: 1
+; OPT-NEXT:     coincident: [ 1, 0 ]
+; CHECK-NEXT:   child:
+; CHECK-NEXT:     sequence:
+; CHECK-NEXT:     - filter: "[n] -> { Stmt_body1[i0, i1] }"
+; CHECK-NEXT:     - filter: "[n] -> { Stmt_body2[i0, i1] }"
diff --git a/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll b/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll
new file mode 100644
index 0000000..00eb041
--- /dev/null
+++ b/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll
@@ -0,0 +1,88 @@
+; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s --check-prefixes=CHECK,RAW
+; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s --check-prefixes=CHECK
+
+define void @func(i32 %n, double* noalias nonnull %A, double* noalias nonnull %B, i32 %k) {
+entry:
+  br label %for1
+
+
+for1:
+  %j1 = phi i32 [0, %entry], [%j1.inc, %inc1]
+  %j1.cmp = icmp slt i32 %j1, %n
+  br i1 %j1.cmp, label %body1, label %exit1
+
+    body1:
+      %arrayidx1 = getelementptr inbounds double, double* %A, i32 %j1
+      store double 21.0, double* %arrayidx1
+      br label %inc1
+
+inc1:
+  %j1.inc = add nuw nsw i32 %j1, 1
+  br label %for1
+
+exit1:
+  br label %for2
+
+
+for2:
+  %j2 = phi i32 [0, %exit1], [%j2.inc, %inc2]
+  %j2.cmp = icmp slt i32 %j2, %n
+  br i1 %j2.cmp, label %body2, label %exit2
+
+    body2:
+      %arrayidx2 = getelementptr inbounds double, double* %B, i32 %j2
+      store double 42.0, double* %arrayidx2
+      br label %inc2
+
+inc2:
+  %j2.inc = add nuw nsw i32 %j2, 1
+  br label %for2
+
+exit2:
+  br label %for3
+
+
+for3:
+  %j3 = phi i32 [0, %exit2], [%j3.inc, %inc3]
+  %j3.cmp = icmp slt i32 %j3, %n
+  br i1 %j3.cmp, label %body3, label %exit3
+
+    body3:
+      %idx3 = add i32 %j3, %k
+      %arrayidx3 = getelementptr inbounds double, double* %B, i32 %idx3
+      store double 84.0, double* %arrayidx3
+      br label %inc3
+
+inc3:
+  %j3.inc = add nuw nsw i32 %j3, 1
+  br label %for3,  !llvm.loop !1
+
+exit3:
+  br label %return
+
+
+return:
+  ret void
+}
+
+
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.id", !"Hello World!"}
+
+
+; CHECK:      Calculated schedule:
+; CHECK-NEXT: domain: "[n, k] -> { Stmt_body2[i0] : 0 <= i0 < n; Stmt_body1[i0] : 0 <= i0 < n; Stmt_body3[i0] : 0 <= i0 < n }"
+; CHECK-NEXT: child:
+; CHECK-NEXT:   sequence:
+; CHECK-NEXT:   - filter: "[n, k] -> { Stmt_body2[i0]; Stmt_body1[i0] }"
+; CHECK-NEXT:     child:
+; CHECK-NEXT:       schedule: "[n, k] -> [{ Stmt_body2[i0] -> [(i0)]; Stmt_body1[i0] -> [(i0)] }]"
+; CHECK-NEXT:       child:
+; CHECK-NEXT:         sequence:
+; CHECK-NEXT:         - filter: "[n, k] -> { Stmt_body1[i0] }"
+; CHECK-NEXT:         - filter: "[n, k] -> { Stmt_body2[i0] }"
+; CHECK-NEXT:   - filter: "[n, k] -> { Stmt_body3[i0] }"
+; CHECK-NEXT:     child:
+; RAW-NEXT:       mark: "Loop with Metadata"
+; RAW-NEXT:       child:
+; CHECK-NEXT:         schedule: "[n, k] -> [{ Stmt_body3[i0] -> [(i0)] }]"
diff --git a/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll b/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll
new file mode 100644
index 0000000..ada3934
--- /dev/null
+++ b/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll
@@ -0,0 +1,66 @@
+; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s
+
+define void @func(i32 %n, [1024 x double]* noalias nonnull %A) {
+entry:
+  br label %outer.for
+
+outer.for:
+  %k = phi i32 [0, %entry], [%k.inc, %outer.inc]
+  %k.cmp = icmp slt i32 %k, %n
+  br i1 %k.cmp, label %for1, label %outer.exit
+
+  for1:
+    %j1 = phi i32 [0, %outer.for], [%j1.inc, %inc1]
+    %j1.cmp = icmp slt i32 %j1, %n
+    br i1 %j1.cmp, label %body1, label %exit1
+
+      body1:
+        %arrayidx1 = getelementptr inbounds  [1024 x double], [1024 x double]* %A, i32 %k, i32 %j1
+        store double 21.0, double* %arrayidx1
+        br label %inc1
+
+  inc1:
+    %j1.inc = add nuw nsw i32 %j1, 1
+    br label %for1
+
+  exit1:
+    br label %for2
+
+  for2:
+    %j2 = phi i32 [0, %exit1], [%j2.inc, %inc2]
+    %j2.cmp = icmp slt i32 %j2, %n
+    br i1 %j2.cmp, label %body2, label %exit2
+
+      body2:
+        %arrayidx2 = getelementptr inbounds  [1024 x double], [1024 x double]* %A, i32 %k, i32 %j2
+        store double 42.0, double* %arrayidx2
+        br label %inc2
+
+  inc2:
+    %j2.inc = add nuw nsw i32 %j2, 1
+    br label %for2
+
+  exit2:
+    br label %outer.inc
+
+outer.inc:
+  %k.inc = add nuw nsw i32 %k, 1
+  br label %outer.for
+
+outer.exit:
+    br label %return
+
+return:
+  ret void
+}
+
+
+; CHECK:      Calculated schedule:
+; CHECK-NEXT: domain: "[n] -> { Stmt_body2[i0, i1] : 0 <= i0 < n and 0 <= i1 < n; Stmt_body1[i0, i1] : 0 <= i0 < n and 0 <= i1 < n }"
+; CHECK-NEXT: child:
+; CHECK-NEXT:   schedule: "[n] -> [{ Stmt_body2[i0, i1] -> [(i0)]; Stmt_body1[i0, i1] -> [(i0)] }, { Stmt_body2[i0, i1] -> [(i1)]; Stmt_body1[i0, i1] -> [(i1)] }]"
+; CHECK-NEXT:   child:
+; CHECK-NEXT:     sequence:
+; CHECK-NEXT:     - filter: "[n] -> { Stmt_body1[i0, i1] }"
+; CHECK-NEXT:     - filter: "[n] -> { Stmt_body2[i0, i1] }"
diff --git a/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll b/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll
new file mode 100644
index 0000000..ba083ac
--- /dev/null
+++ b/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll
@@ -0,0 +1,54 @@
+; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s
+
+define void @func(i32 %n, double* noalias nonnull %A) {
+entry:
+  br label %for1
+
+for1:
+  %j1 = phi i32 [0, %entry], [%j1.inc, %inc1]
+  %j1.cmp = icmp slt i32 %j1, %n
+  br i1 %j1.cmp, label %body1, label %exit1
+
+    body1:
+      %arrayidx1 = getelementptr inbounds double, double* %A, i32 %j1
+      store double 21.0, double* %arrayidx1
+      br label %inc1
+
+inc1:
+  %j1.inc = add nuw nsw i32 %j1, 1
+  br label %for1
+
+exit1:
+  br label %for2
+
+for2:
+  %j2 = phi i32 [0, %exit1], [%j2.inc, %inc2]
+  %j2.cmp = icmp slt i32 %j2, %n
+  br i1 %j2.cmp, label %body2, label %exit2
+
+    body2:
+      %arrayidx2 = getelementptr inbounds double, double* %A, i32 %j2
+      store double 42.0, double* %arrayidx2
+      br label %inc2
+
+inc2:
+  %j2.inc = add nuw nsw i32 %j2, 1
+  br label %for2
+
+exit2:
+  br label %return
+
+return:
+  ret void
+}
+
+
+; CHECK:      Calculated schedule:
+; CHECK-NEXT: domain: "[n] -> { Stmt_body2[i0] : 0 <= i0 < n; Stmt_body1[i0] : 0 <= i0 < n }"
+; CHECK-NEXT: child:
+; CHECK-NEXT:   schedule: "[n] -> [{ Stmt_body2[i0] -> [(i0)]; Stmt_body1[i0] -> [(i0)] }]"
+; CHECK-NEXT:   child:
+; CHECK-NEXT:     sequence:
+; CHECK-NEXT:     - filter: "[n] -> { Stmt_body1[i0] }"
+; CHECK-NEXT:     - filter: "[n] -> { Stmt_body2[i0] }"
diff --git a/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll b/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll
new file mode 100644
index 0000000..1509aa9
--- /dev/null
+++ b/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll
@@ -0,0 +1,51 @@
+; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s
+
+; This could theoretically be fused by adjusting the offset of the second loop by %k (instead of relying on schedule dimensions).
+
+define void @func(i32 %n, double* noalias nonnull %A, i32 %k) {
+entry:
+  br label %for1
+
+for1:
+  %j1 = phi i32 [0, %entry], [%j1.inc, %inc1]
+  %j1.cmp = icmp slt i32 %j1, %n
+  br i1 %j1.cmp, label %body1, label %exit1
+
+    body1:
+      %arrayidx1 = getelementptr inbounds double, double* %A, i32 %j1
+      store double 21.0, double* %arrayidx1
+      br label %inc1
+
+inc1:
+  %j1.inc = add nuw nsw i32 %j1, 1
+  br label %for1
+
+exit1:
+  br label %for2
+
+for2:
+  %j2 = phi i32 [0, %exit1], [%j2.inc, %inc2]
+  %j2.cmp = icmp slt i32 %j2, %n
+  br i1 %j2.cmp, label %body2, label %exit2
+
+    body2:
+      %idx2 = add i32 %j2, %k
+      %arrayidx2 = getelementptr inbounds double, double* %A, i32 %idx2
+      store double 42.0, double* %arrayidx2
+      br label %inc2
+
+inc2:
+  %j2.inc = add nuw nsw i32 %j2, 1
+  br label %for2
+
+exit2:
+  br label %return
+
+return:
+  ret void
+}
+
+
+; CHECK:      Calculated schedule:
+; CHECK-NEXT: n/a
diff --git a/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll b/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll
new file mode 100644
index 0000000..e52b89b
--- /dev/null
+++ b/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll
@@ -0,0 +1,57 @@
+; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s
+
+define void @func(i32 %n, double* noalias nonnull %A, double* noalias nonnull %B, i32 %k) {
+entry:
+  br label %for1
+
+
+for1:
+  %j1 = phi i32 [0, %entry], [%j1.inc, %inc1]
+  %j1.cmp = icmp slt i32 %j1, %n
+  br i1 %j1.cmp, label %body1, label %exit1
+
+    body1:
+      %idx1 = add i32 %j1, %k
+      %arrayidx1 = getelementptr inbounds double, double* %A, i32 %j1
+      store double 21.0, double* %arrayidx1
+      br label %inc1
+
+inc1:
+  %j1.inc = add nuw nsw i32 %j1, 1
+  br label %for1
+
+exit1:
+  br label %middle2
+
+
+middle2:
+  store double 52.0, double* %A
+  br label %for3
+
+
+for3:
+  %j3 = phi i32 [0, %middle2], [%j3.inc, %inc3]
+  %j3.cmp = icmp slt i32 %j3, %n
+  br i1 %j3.cmp, label %body3, label %exit3
+
+    body3:
+      %arrayidx3 = getelementptr inbounds double, double* %B, i32 %j3
+      store double 84.0, double* %arrayidx3
+      br label %inc3
+
+inc3:
+  %j3.inc = add nuw nsw i32 %j3, 1
+  br label %for3
+
+exit3:
+  br label %return
+
+
+return:
+  ret void
+}
+
+
+; CHECK:      Calculated schedule:
+; CHECK-NEXT:   n/a