[OpenMP] Add OpenMP 5.0 nonmonotonic code

This patch adds:
* New omp_sched_monotonic flag to omp_sched_t which is handled within the runtime
* Parsing of monotonic/nonmonotonic in OMP_SCHEDULE
* Tests for the monotonic flag and envirable parsing
* Logic to force monotonic when hierarchical scheduling is used

Differential Revision: https://reviews.llvm.org/D60979


git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@359601 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/runtime/src/include/50/omp.h.var b/runtime/src/include/50/omp.h.var
index 5f68908..05dc266 100644
--- a/runtime/src/include/50/omp.h.var
+++ b/runtime/src/include/50/omp.h.var
@@ -43,10 +43,11 @@
 
     /* schedule kind constants */
     typedef enum omp_sched_t {
-	omp_sched_static  = 1,
-	omp_sched_dynamic = 2,
-	omp_sched_guided  = 3,
-	omp_sched_auto    = 4
+        omp_sched_static  = 1,
+        omp_sched_dynamic = 2,
+        omp_sched_guided  = 3,
+        omp_sched_auto    = 4,
+        omp_sched_monotonic = 0x80000000
     } omp_sched_t;
 
     /* set API functions */
diff --git a/runtime/src/include/50/omp_lib.f.var b/runtime/src/include/50/omp_lib.f.var
index 8b7cffe..17d0a2a 100644
--- a/runtime/src/include/50/omp_lib.f.var
+++ b/runtime/src/include/50/omp_lib.f.var
@@ -61,6 +61,7 @@
         integer(kind=omp_sched_kind), parameter :: omp_sched_dynamic = 2
         integer(kind=omp_sched_kind), parameter :: omp_sched_guided  = 3
         integer(kind=omp_sched_kind), parameter :: omp_sched_auto    = 4
+        integer(kind=omp_sched_kind), parameter :: omp_sched_monotonic = Z'80000000'
 
         integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_false = 0
         integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_true = 1
diff --git a/runtime/src/include/50/omp_lib.f90.var b/runtime/src/include/50/omp_lib.f90.var
index af12927..cef6f33 100644
--- a/runtime/src/include/50/omp_lib.f90.var
+++ b/runtime/src/include/50/omp_lib.f90.var
@@ -59,7 +59,7 @@
         integer(kind=omp_sched_kind), parameter :: omp_sched_dynamic = 2
         integer(kind=omp_sched_kind), parameter :: omp_sched_guided  = 3
         integer(kind=omp_sched_kind), parameter :: omp_sched_auto    = 4
-
+        integer(kind=omp_sched_kind), parameter :: omp_sched_monotonic = Z'80000000'
 
         integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_false = 0
         integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_true = 1
diff --git a/runtime/src/include/50/omp_lib.h.var b/runtime/src/include/50/omp_lib.h.var
index 94612b0..cf86183 100644
--- a/runtime/src/include/50/omp_lib.h.var
+++ b/runtime/src/include/50/omp_lib.h.var
@@ -68,6 +68,8 @@
       parameter(omp_sched_guided=3)
       integer(kind=omp_sched_kind)omp_sched_auto
       parameter(omp_sched_auto=4)
+      integer(kind=omp_sched_kind)omp_sched_monotonic
+      parameter(omp_sched_monotonic=Z'80000000')
 
       integer(kind=omp_proc_bind_kind)omp_proc_bind_false
       parameter(omp_proc_bind_false=0)
diff --git a/runtime/src/kmp.h b/runtime/src/kmp.h
index 26077bd..68d1ccd 100644
--- a/runtime/src/kmp.h
+++ b/runtime/src/kmp.h
@@ -325,7 +325,8 @@
   kmp_sched_static_steal = 102, // mapped to kmp_sch_static_steal (44)
 #endif
   kmp_sched_upper,
-  kmp_sched_default = kmp_sched_static // default scheduling
+  kmp_sched_default = kmp_sched_static, // default scheduling
+  kmp_sched_monotonic = 0x80000000
 } kmp_sched_t;
 #endif
 
@@ -438,6 +439,11 @@
 #define SCHEDULE_HAS_NONMONOTONIC(s) (((s)&kmp_sch_modifier_nonmonotonic) != 0)
 #define SCHEDULE_HAS_NO_MODIFIERS(s)                                           \
   (((s) & (kmp_sch_modifier_nonmonotonic | kmp_sch_modifier_monotonic)) == 0)
+#define SCHEDULE_GET_MODIFIERS(s)                                              \
+  ((enum sched_type)(                                                          \
+      (s) & (kmp_sch_modifier_nonmonotonic | kmp_sch_modifier_monotonic)))
+#define SCHEDULE_SET_MODIFIERS(s, m)                                           \
+  (s = (enum sched_type)((kmp_int32)s | (kmp_int32)m))
 #else
 /* By doing this we hope to avoid multiple tests on OMP_45_ENABLED. Compilers
    can now eliminate tests on compile time constants and dead code that results
@@ -446,11 +452,47 @@
 #define SCHEDULE_HAS_MONOTONIC(s) false
 #define SCHEDULE_HAS_NONMONOTONIC(s) false
 #define SCHEDULE_HAS_NO_MODIFIERS(s) true
+#define SCHEDULE_GET_MODIFIERS(s) ((enum sched_type)0)
+#define SCHEDULE_SET_MODIFIERS(s, m) /* Nothing */
 #endif
+#define SCHEDULE_NONMONOTONIC 0
+#define SCHEDULE_MONOTONIC 1
 
   kmp_sch_default = kmp_sch_static /**< default scheduling algorithm */
 };
 
+// Apply modifiers on internal kind to standard kind
+static inline void
+__kmp_sched_apply_mods_stdkind(kmp_sched_t *kind,
+                               enum sched_type internal_kind) {
+#if OMP_50_ENABLED
+  if (SCHEDULE_HAS_MONOTONIC(internal_kind)) {
+    *kind = (kmp_sched_t)((int)*kind | (int)kmp_sched_monotonic);
+  }
+#endif
+}
+
+// Apply modifiers on standard kind to internal kind
+static inline void
+__kmp_sched_apply_mods_intkind(kmp_sched_t kind,
+                               enum sched_type *internal_kind) {
+#if OMP_50_ENABLED
+  if ((int)kind & (int)kmp_sched_monotonic) {
+    *internal_kind = (enum sched_type)((int)*internal_kind |
+                                       (int)kmp_sch_modifier_monotonic);
+  }
+#endif
+}
+
+// Get standard schedule without modifiers
+static inline kmp_sched_t __kmp_sched_without_mods(kmp_sched_t kind) {
+#if OMP_50_ENABLED
+  return (kmp_sched_t)((int)kind & ~((int)kmp_sched_monotonic));
+#else
+  return kind;
+#endif
+}
+
 /* Type to keep runtime schedule set via OMP_SCHEDULE or omp_set_schedule() */
 typedef union kmp_r_sched {
   struct {
diff --git a/runtime/src/kmp_dispatch.cpp b/runtime/src/kmp_dispatch.cpp
index ee786ae..6ae8234 100644
--- a/runtime/src/kmp_dispatch.cpp
+++ b/runtime/src/kmp_dispatch.cpp
@@ -68,6 +68,20 @@
   }
 }
 
+// Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
+static inline int __kmp_get_monotonicity(enum sched_type schedule,
+                                         bool use_hier = false) {
+  // Pick up the nonmonotonic/monotonic bits from the scheduling type
+  int monotonicity;
+  // default to monotonic
+  monotonicity = SCHEDULE_MONOTONIC;
+  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
+    monotonicity = SCHEDULE_NONMONOTONIC;
+  else if (SCHEDULE_HAS_MONOTONIC(schedule))
+    monotonicity = SCHEDULE_MONOTONIC;
+  return monotonicity;
+}
+
 // Initialize a dispatch_private_info_template<T> buffer for a particular
 // type of schedule,chunk.  The loop description is found in lb (lower bound),
 // ub (upper bound), and st (stride).  nproc is the number of threads relevant
@@ -95,6 +109,8 @@
   T tc;
   kmp_info_t *th;
   kmp_team_t *team;
+  int monotonicity;
+  bool use_hier;
 
 #ifdef KMP_DEBUG
   typedef typename traits_t<T>::signed_t ST;
@@ -125,13 +141,16 @@
 #endif
                                     team->t.t_active_level == 1;
 #endif
-#if (KMP_STATIC_STEAL_ENABLED)
-  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
-    // AC: we now have only one implementation of stealing, so use it
-    schedule = kmp_sch_static_steal;
-  else
+
+#if KMP_USE_HIER_SCHED
+  use_hier = pr->flags.use_hier;
+#else
+  use_hier = false;
 #endif
-    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
+
+  /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
+  monotonicity = __kmp_get_monotonicity(schedule, use_hier);
+  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
 
   /* Pick up the nomerge/ordered bits from the scheduling type */
   if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
@@ -149,6 +168,10 @@
   } else {
     pr->flags.ordered = FALSE;
   }
+  // Ordered overrides nonmonotonic
+  if (pr->flags.ordered) {
+    monotonicity = SCHEDULE_MONOTONIC;
+  }
 
   if (schedule == kmp_sch_static) {
     schedule = __kmp_static;
@@ -157,6 +180,8 @@
       // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
       // not specified)
       schedule = team->t.t_sched.r_sched_type;
+      monotonicity = __kmp_get_monotonicity(schedule, use_hier);
+      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
       // Detail the schedule if needed (global controls are differentiated
       // appropriately)
       if (schedule == kmp_sch_guided_chunked) {
@@ -207,7 +232,13 @@
       }
 #endif
     }
-
+#if KMP_STATIC_STEAL_ENABLED
+    // map nonmonotonic:dynamic to static steal
+    if (schedule == kmp_sch_dynamic_chunked) {
+      if (monotonicity == SCHEDULE_NONMONOTONIC)
+        schedule = kmp_sch_static_steal;
+    }
+#endif
     /* guided analytical not safe for too many threads */
     if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
       schedule = kmp_sch_guided_iterative_chunked;
@@ -217,6 +248,8 @@
     if (schedule == kmp_sch_runtime_simd) {
       // compiler provides simd_width in the chunk parameter
       schedule = team->t.t_sched.r_sched_type;
+      monotonicity = __kmp_get_monotonicity(schedule, use_hier);
+      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
       // Detail the schedule if needed (global controls are differentiated
       // appropriately)
       if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
@@ -236,9 +269,10 @@
       {
         char *buff;
         // create format specifiers before the debug output
-        buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d"
-                                " chunk:%%%s\n",
-                                traits_t<ST>::spec);
+        buff = __kmp_str_format(
+            "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
+            " chunk:%%%s\n",
+            traits_t<ST>::spec);
         KD_TRACE(10, (buff, gtid, schedule, chunk));
         __kmp_str_free(&buff);
       }
@@ -331,7 +365,10 @@
       pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
 
       pr->u.p.parm2 = lb;
-      // pr->pfields.parm3 = 0; // it's not used in static_steal
+      // parm3 is the number of times to attempt stealing which is
+      // proportional to the number of chunks per thread up until
+      // the maximum value of nproc.
+      pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc);
       pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
       pr->u.p.st = st;
       if (traits_t<T>::type_size > 4) {
@@ -1184,7 +1221,7 @@
       }
       if (!status) { // try to steal
         kmp_info_t **other_threads = team->t.t_threads;
-        int while_limit = nproc; // nproc attempts to find a victim
+        int while_limit = pr->u.p.parm3;
         int while_index = 0;
         // TODO: algorithm of searching for a victim
         // should be cleaned up and measured
@@ -1282,7 +1319,7 @@
 
       if (!status) {
         kmp_info_t **other_threads = team->t.t_threads;
-        int while_limit = nproc; // nproc attempts to find a victim
+        int while_limit = pr->u.p.parm3;
         int while_index = 0;
 
         // TODO: algorithm of searching for a victim
diff --git a/runtime/src/kmp_dispatch_hier.h b/runtime/src/kmp_dispatch_hier.h
index 3fde9bb..dfcee94 100644
--- a/runtime/src/kmp_dispatch_hier.h
+++ b/runtime/src/kmp_dispatch_hier.h
@@ -691,6 +691,7 @@
           sizeof(kmp_hier_top_unit_t<T>) * max);
       for (int j = 0; j < max; ++j) {
         layers[i][j].active = 0;
+        layers[i][j].hier_pr.flags.use_hier = TRUE;
       }
     }
     valid = true;
@@ -949,26 +950,23 @@
   active = !team->t.t_serialized;
   th->th.th_ident = loc;
   num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1];
-  if (!active) {
-    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d not active parallel. "
-                  "Using normal dispatch functions.\n",
-                  gtid));
-    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
-        th->th.th_dispatch->th_disp_buffer);
-    KMP_DEBUG_ASSERT(pr);
-    pr->flags.use_hier = FALSE;
-    pr->flags.contains_last = FALSE;
-    return;
-  }
   KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                    &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
-
   my_buffer_index = th->th.th_dispatch->th_disp_index;
   pr = reinterpret_cast<dispatch_private_info_template<T> *>(
       &th->th.th_dispatch
            ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
   sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
       &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
+  if (!active) {
+    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d not active parallel. "
+                  "Using normal dispatch functions.\n",
+                  gtid));
+    KMP_DEBUG_ASSERT(pr);
+    pr->flags.use_hier = FALSE;
+    pr->flags.contains_last = FALSE;
+    return;
+  }
   KMP_DEBUG_ASSERT(pr);
   KMP_DEBUG_ASSERT(sh);
   pr->flags.use_hier = TRUE;
diff --git a/runtime/src/kmp_runtime.cpp b/runtime/src/kmp_runtime.cpp
index cd5bdba..231d168 100644
--- a/runtime/src/kmp_runtime.cpp
+++ b/runtime/src/kmp_runtime.cpp
@@ -2801,9 +2801,13 @@
   return thread->th.th_current_task->td_icvs.max_active_levels;
 }
 
+KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
+KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
+
 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
   kmp_info_t *thread;
+  kmp_sched_t orig_kind;
   //    kmp_team_t *team;
 
   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
@@ -2814,6 +2818,9 @@
   // Valid parameters should fit in one of two intervals - standard or extended:
   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
+  orig_kind = kind;
+  kind = __kmp_sched_without_mods(kind);
+
   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
     // TODO: Hint needs attention in case we change the default schedule.
@@ -2844,6 +2851,8 @@
         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
                       kmp_sched_lower - 2];
   }
+  __kmp_sched_apply_mods_intkind(
+      orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
   if (kind == kmp_sched_auto || chunk < 1) {
     // ignore parameter chunk for schedule auto
     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
@@ -2863,12 +2872,12 @@
   thread = __kmp_threads[gtid];
 
   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
-
-  switch (th_type) {
+  switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
   case kmp_sch_static:
   case kmp_sch_static_greedy:
   case kmp_sch_static_balanced:
     *kind = kmp_sched_static;
+    __kmp_sched_apply_mods_stdkind(kind, th_type);
     *chunk = 0; // chunk was not set, try to show this fact via zero value
     return;
   case kmp_sch_static_chunked:
@@ -2897,6 +2906,7 @@
     KMP_FATAL(UnknownSchedulingType, th_type);
   }
 
+  __kmp_sched_apply_mods_stdkind(kind, th_type);
   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
 }
 
@@ -3025,15 +3035,22 @@
   // __kmp_guided. __kmp_sched should keep original value, so that user can set
   // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
   // different roots (even in OMP 2.5)
-  if (__kmp_sched == kmp_sch_static) {
+  enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
+#if OMP_45_ENABLED
+  enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
+#endif
+  if (s == kmp_sch_static) {
     // replace STATIC with more detailed schedule (balanced or greedy)
     r_sched.r_sched_type = __kmp_static;
-  } else if (__kmp_sched == kmp_sch_guided_chunked) {
+  } else if (s == kmp_sch_guided_chunked) {
     // replace GUIDED with more detailed schedule (iterative or analytical)
     r_sched.r_sched_type = __kmp_guided;
   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
     r_sched.r_sched_type = __kmp_sched;
   }
+#if OMP_45_ENABLED
+  SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
+#endif
 
   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
     // __kmp_chunk may be wrong here (if it was not ever set)
diff --git a/runtime/src/kmp_settings.cpp b/runtime/src/kmp_settings.cpp
index 70817d9..3f38bec 100644
--- a/runtime/src/kmp_settings.cpp
+++ b/runtime/src/kmp_settings.cpp
@@ -3662,104 +3662,139 @@
   __kmp_sched = kmp_sch_default;
 }
 
+// if parse_hier = true:
+//    Parse [HW,][modifier:]kind[,chunk]
+// else:
+//    Parse [modifier:]kind[,chunk]
 static const char *__kmp_parse_single_omp_schedule(const char *name,
                                                    const char *value,
                                                    bool parse_hier = false) {
   /* get the specified scheduling style */
   const char *ptr = value;
-  const char *comma = strchr(ptr, ',');
   const char *delim;
   int chunk = 0;
   enum sched_type sched = kmp_sch_default;
   if (*ptr == '\0')
     return NULL;
+  delim = ptr;
+  while (*delim != ',' && *delim != ':' && *delim != '\0')
+    delim++;
 #if KMP_USE_HIER_SCHED
   kmp_hier_layer_e layer = kmp_hier_layer_e::LAYER_THREAD;
   if (parse_hier) {
-    if (!__kmp_strcasecmp_with_sentinel("L1", ptr, ',')) {
-      layer = kmp_hier_layer_e::LAYER_L1;
-    } else if (!__kmp_strcasecmp_with_sentinel("L2", ptr, ',')) {
-      layer = kmp_hier_layer_e::LAYER_L2;
-    } else if (!__kmp_strcasecmp_with_sentinel("L3", ptr, ',')) {
-      layer = kmp_hier_layer_e::LAYER_L3;
-    } else if (!__kmp_strcasecmp_with_sentinel("NUMA", ptr, ',')) {
-      layer = kmp_hier_layer_e::LAYER_NUMA;
+    if (*delim == ',') {
+      if (!__kmp_strcasecmp_with_sentinel("L1", ptr, ',')) {
+        layer = kmp_hier_layer_e::LAYER_L1;
+      } else if (!__kmp_strcasecmp_with_sentinel("L2", ptr, ',')) {
+        layer = kmp_hier_layer_e::LAYER_L2;
+      } else if (!__kmp_strcasecmp_with_sentinel("L3", ptr, ',')) {
+        layer = kmp_hier_layer_e::LAYER_L3;
+      } else if (!__kmp_strcasecmp_with_sentinel("NUMA", ptr, ',')) {
+        layer = kmp_hier_layer_e::LAYER_NUMA;
+      }
     }
-    if (layer != kmp_hier_layer_e::LAYER_THREAD && !comma) {
+    if (layer != kmp_hier_layer_e::LAYER_THREAD && *delim != ',') {
       // If there is no comma after the layer, then this schedule is invalid
       KMP_WARNING(StgInvalidValue, name, value);
       __kmp_omp_schedule_restore();
       return NULL;
     } else if (layer != kmp_hier_layer_e::LAYER_THREAD) {
-      ptr = ++comma;
-      comma = strchr(ptr, ',');
+      ptr = ++delim;
+      while (*delim != ',' && *delim != ':' && *delim != '\0')
+        delim++;
     }
   }
-  delim = ptr;
-  while (*delim != ',' && *delim != ':' && *delim != '\0')
-    delim++;
-#else // KMP_USE_HIER_SCHED
-  delim = ptr;
-  while (*delim != ',' && *delim != '\0')
-    delim++;
 #endif // KMP_USE_HIER_SCHED
-  if (!__kmp_strcasecmp_with_sentinel("dynamic", ptr, *delim)) /* DYNAMIC */
-    sched = kmp_sch_dynamic_chunked;
-  else if (!__kmp_strcasecmp_with_sentinel("guided", ptr, *delim)) /* GUIDED */
-    sched = kmp_sch_guided_chunked;
-  // AC: TODO: add AUTO schedule, and probably remove TRAPEZOIDAL (OMP 3.0 does
-  // not allow it)
-  else if (!__kmp_strcasecmp_with_sentinel("auto", ptr, *delim)) { /* AUTO */
-    sched = kmp_sch_auto;
-    if (comma) {
-      __kmp_msg(kmp_ms_warning, KMP_MSG(IgnoreChunk, name, comma),
-                __kmp_msg_null);
-      comma = NULL;
+#if OMP_45_ENABLED
+  // Read in schedule modifier if specified
+  enum sched_type sched_modifier = (enum sched_type)0;
+  if (*delim == ':') {
+    if (!__kmp_strcasecmp_with_sentinel("monotonic", ptr, *delim)) {
+      sched_modifier = sched_type::kmp_sch_modifier_monotonic;
+      ptr = ++delim;
+      while (*delim != ',' && *delim != ':' && *delim != '\0')
+        delim++;
+    } else if (!__kmp_strcasecmp_with_sentinel("nonmonotonic", ptr, *delim)) {
+      sched_modifier = sched_type::kmp_sch_modifier_nonmonotonic;
+      ptr = ++delim;
+      while (*delim != ',' && *delim != ':' && *delim != '\0')
+        delim++;
+    } else if (!parse_hier) {
+      // If there is no proper schedule modifier, then this schedule is invalid
+      KMP_WARNING(StgInvalidValue, name, value);
+      __kmp_omp_schedule_restore();
+      return NULL;
     }
-  } else if (!__kmp_strcasecmp_with_sentinel("trapezoidal", ptr,
-                                             *delim)) /* TRAPEZOIDAL */
+  }
+#endif
+  // Read in schedule kind (required)
+  if (!__kmp_strcasecmp_with_sentinel("dynamic", ptr, *delim))
+    sched = kmp_sch_dynamic_chunked;
+  else if (!__kmp_strcasecmp_with_sentinel("guided", ptr, *delim))
+    sched = kmp_sch_guided_chunked;
+  // AC: TODO: probably remove TRAPEZOIDAL (OMP 3.0 does not allow it)
+  else if (!__kmp_strcasecmp_with_sentinel("auto", ptr, *delim))
+    sched = kmp_sch_auto;
+  else if (!__kmp_strcasecmp_with_sentinel("trapezoidal", ptr, *delim))
     sched = kmp_sch_trapezoidal;
-  else if (!__kmp_strcasecmp_with_sentinel("static", ptr, *delim)) /* STATIC */
+  else if (!__kmp_strcasecmp_with_sentinel("static", ptr, *delim))
     sched = kmp_sch_static;
 #if KMP_STATIC_STEAL_ENABLED
   else if (!__kmp_strcasecmp_with_sentinel("static_steal", ptr, *delim))
     sched = kmp_sch_static_steal;
 #endif
   else {
+    // If there is no proper schedule kind, then this schedule is invalid
     KMP_WARNING(StgInvalidValue, name, value);
     __kmp_omp_schedule_restore();
     return NULL;
   }
-  if (ptr && comma && *comma == *delim) {
-    ptr = comma + 1;
-    SKIP_DIGITS(ptr);
 
-    if (sched == kmp_sch_static)
-      sched = kmp_sch_static_chunked;
-    ++comma;
-    chunk = __kmp_str_to_int(comma, *ptr);
-    if (chunk < 1) {
-      chunk = KMP_DEFAULT_CHUNK;
-      __kmp_msg(kmp_ms_warning, KMP_MSG(InvalidChunk, name, comma),
-                __kmp_msg_null);
-      KMP_INFORM(Using_int_Value, name, __kmp_chunk);
-      // AC: next block commented out until KMP_DEFAULT_CHUNK != KMP_MIN_CHUNK
-      // (to improve code coverage :)
-      //     The default chunk size is 1 according to standard, thus making
-      //     KMP_MIN_CHUNK not 1 we would introduce mess:
-      //     wrong chunk becomes 1, but it will be impossible to explicitely set
-      //     1, because it becomes KMP_MIN_CHUNK...
-      //                } else if ( chunk < KMP_MIN_CHUNK ) {
-      //                    chunk = KMP_MIN_CHUNK;
-    } else if (chunk > KMP_MAX_CHUNK) {
-      chunk = KMP_MAX_CHUNK;
-      __kmp_msg(kmp_ms_warning, KMP_MSG(LargeChunk, name, comma),
-                __kmp_msg_null);
-      KMP_INFORM(Using_int_Value, name, chunk);
+  // Read in schedule chunk size if specified
+  if (*delim == ',') {
+    ptr = delim + 1;
+    SKIP_WS(ptr);
+    if (!isdigit(*ptr)) {
+      // If there is no chunk after comma, then this schedule is invalid
+      KMP_WARNING(StgInvalidValue, name, value);
+      __kmp_omp_schedule_restore();
+      return NULL;
     }
-  } else if (ptr) {
-    SKIP_TOKEN(ptr);
+    SKIP_DIGITS(ptr);
+    // auto schedule should not specify chunk size
+    if (sched == kmp_sch_auto) {
+      __kmp_msg(kmp_ms_warning, KMP_MSG(IgnoreChunk, name, delim),
+                __kmp_msg_null);
+    } else {
+      if (sched == kmp_sch_static)
+        sched = kmp_sch_static_chunked;
+      chunk = __kmp_str_to_int(delim + 1, *ptr);
+      if (chunk < 1) {
+        chunk = KMP_DEFAULT_CHUNK;
+        __kmp_msg(kmp_ms_warning, KMP_MSG(InvalidChunk, name, delim),
+                  __kmp_msg_null);
+        KMP_INFORM(Using_int_Value, name, __kmp_chunk);
+        // AC: next block commented out until KMP_DEFAULT_CHUNK != KMP_MIN_CHUNK
+        // (to improve code coverage :)
+        // The default chunk size is 1 according to standard, thus making
+        // KMP_MIN_CHUNK not 1 we would introduce mess:
+        // wrong chunk becomes 1, but it will be impossible to explicitly set
+        // to 1 because it becomes KMP_MIN_CHUNK...
+        // } else if ( chunk < KMP_MIN_CHUNK ) {
+        //   chunk = KMP_MIN_CHUNK;
+      } else if (chunk > KMP_MAX_CHUNK) {
+        chunk = KMP_MAX_CHUNK;
+        __kmp_msg(kmp_ms_warning, KMP_MSG(LargeChunk, name, delim),
+                  __kmp_msg_null);
+        KMP_INFORM(Using_int_Value, name, chunk);
+      }
+    }
+  } else {
+    ptr = delim;
   }
+
+  SCHEDULE_SET_MODIFIERS(sched, sched_modifier);
+
 #if KMP_USE_HIER_SCHED
   if (layer != kmp_hier_layer_e::LAYER_THREAD) {
     __kmp_hier_scheds.append(sched, chunk, layer);
@@ -3790,6 +3825,8 @@
         while ((ptr = __kmp_parse_single_omp_schedule(name, ptr, true))) {
           while (*ptr == ' ' || *ptr == '\t' || *ptr == ':')
             ptr++;
+          if (*ptr == '\0')
+            break;
         }
       } else
 #endif
@@ -3813,8 +3850,14 @@
   } else {
     __kmp_str_buf_print(buffer, "   %s='", name);
   }
+  enum sched_type sched = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
+  if (SCHEDULE_HAS_MONOTONIC(__kmp_sched)) {
+    __kmp_str_buf_print(buffer, "monotonic:");
+  } else if (SCHEDULE_HAS_NONMONOTONIC(__kmp_sched)) {
+    __kmp_str_buf_print(buffer, "nonmonotonic:");
+  }
   if (__kmp_chunk) {
-    switch (__kmp_sched) {
+    switch (sched) {
     case kmp_sch_dynamic_chunked:
       __kmp_str_buf_print(buffer, "%s,%d'\n", "dynamic", __kmp_chunk);
       break;
@@ -3839,7 +3882,7 @@
       break;
     }
   } else {
-    switch (__kmp_sched) {
+    switch (sched) {
     case kmp_sch_dynamic_chunked:
       __kmp_str_buf_print(buffer, "%s'\n", "dynamic");
       break;
diff --git a/runtime/test/worksharing/for/omp_monotonic_env.c b/runtime/test/worksharing/for/omp_monotonic_env.c
new file mode 100644
index 0000000..c8cfd2a
--- /dev/null
+++ b/runtime/test/worksharing/for/omp_monotonic_env.c
@@ -0,0 +1,87 @@
+// RUN: %libomp-compile
+// RUN: env OMP_SCHEDULE=monotonic:dynamic,50 %libomp-run monotonic dynamic 50
+// RUN: env OMP_SCHEDULE=monotonic:guided,51 %libomp-run monotonic guided 51
+// RUN: env OMP_SCHEDULE=monotonic:static,52 %libomp-run monotonic static 52
+// RUN: env OMP_SCHEDULE=nonmonotonic:dynamic,53 %libomp-run nonmonotonic dynamic 53
+// RUN: env OMP_SCHEDULE=nonmonotonic:guided,54 %libomp-run nonmonotonic guided 54
+
+// The test checks OpenMP 5.0 monotonic/nonmonotonic OMP_SCHEDULE parsing.
+// The nonmonotonic tests check whether the parser accepts "nonmonotonic";
+// if it does not, a static schedule is assumed.
+
+#include <stdio.h>
+#include <stdlib.h> // exit(), atoi()
+#include <string.h>
+#include <omp.h>
+
+int err = 0;
+
+omp_sched_t sched_without_modifiers(omp_sched_t sched) {
+  return (omp_sched_t)((int)sched & ~((int)omp_sched_monotonic));
+}
+
+int sched_has_modifiers(omp_sched_t sched, omp_sched_t modifiers) {
+  return (int)sched & (int)modifiers;
+}
+
+// check that (sched, chunk) is equal to the expected (hope_sched, hope_chunk)
+void check_schedule(const char *extra, const omp_sched_t sched, int chunk,
+                    omp_sched_t hope_sched, int hope_chunk) {
+
+  if (sched != hope_sched || chunk != hope_chunk) {
+    ++err;
+    printf("Error: %s: schedule: (%d, %d) is not equal to (%d, %d)\n", extra,
+           (int)hope_sched, hope_chunk, (int)sched, chunk);
+  }
+}
+
+omp_sched_t str2omp_sched(const char *str) {
+  if (!strcmp(str, "dynamic"))
+    return omp_sched_dynamic;
+  if (!strcmp(str, "static"))
+    return omp_sched_static;
+  if (!strcmp(str, "guided"))
+    return omp_sched_guided;
+  printf("Error: Unknown schedule type: %s\n", str);
+  exit(1);
+}
+
+int is_monotonic(const char *str) { return !strcmp(str, "monotonic"); }
+
+int main(int argc, char **argv) {
+  int i, monotonic, chunk, ref_chunk;
+  omp_sched_t sched, ref_sched;
+
+  if (argc != 4) {
+    printf("Error: usage: <executable> monotonic|nonmonotonic <schedule> "
+           "<chunk-size>\n");
+    exit(1);
+  }
+
+  monotonic = is_monotonic(argv[1]);
+  ref_sched = str2omp_sched(argv[2]);
+  ref_chunk = atoi(argv[3]);
+
+  omp_get_schedule(&sched, &chunk);
+
+  if (monotonic && !sched_has_modifiers(sched, omp_sched_monotonic)) {
+    printf("Error: sched (0x%x) does not have monotonic modifier\n",
+           (int)sched);
+    ++err;
+  }
+  sched = sched_without_modifiers(sched);
+  if (sched != ref_sched) {
+    printf("Error: sched (0x%x) is not 0x%x\n", (int)sched, (int)ref_sched);
+    ++err;
+  }
+  if (chunk != ref_chunk) {
+    printf("Error: chunk is not %d\n", ref_chunk);
+    ++err;
+  }
+  if (err > 0) {
+    printf("Failed\n");
+    return 1;
+  }
+  printf("Passed\n");
+  return 0;
+}
diff --git a/runtime/test/worksharing/for/omp_monotonic_schedule_set_get.c b/runtime/test/worksharing/for/omp_monotonic_schedule_set_get.c
new file mode 100644
index 0000000..94896eb
--- /dev/null
+++ b/runtime/test/worksharing/for/omp_monotonic_schedule_set_get.c
@@ -0,0 +1,134 @@
+// RUN: %libomp-compile-and-run
+
+// The test checks OMP 5.0 monotonic/nonmonotonic scheduling API
+//   1. initial schedule should be (static,0)
+//   2. omp_get_schedule() should return the schedule set by omp_set_schedule()
+//   3. schedules set inside parallel should not impact outer tasks' schedules
+
+#include <stdio.h>
+#ifndef __INTEL_COMPILER
+#define _OMPIMP
+#endif
+
+#define NO_MODIFIERS ((omp_sched_t)0)
+
+#include "omp.h"
+
+int global = 0;
+int err = 0;
+
+omp_sched_t sched_append_modifiers(omp_sched_t sched, omp_sched_t modifiers) {
+  return (omp_sched_t)((int)sched | (int)modifiers);
+}
+
+omp_sched_t sched_without_modifiers(omp_sched_t sched) {
+  return (omp_sched_t)((int)sched & ~((int)omp_sched_monotonic));
+}
+
+int sched_has_modifiers(omp_sched_t sched, omp_sched_t modifiers) {
+  return (((int)sched & ((int)omp_sched_monotonic)) > 0);
+}
+
+// check that (sched, chunk) is equal to the expected (hope_sched, hope_chunk)
+void check_schedule(const char *extra, const omp_sched_t sched, int chunk,
+                    omp_sched_t hope_sched, int hope_chunk) {
+
+  if (sched != hope_sched || chunk != hope_chunk) {
+#pragma omp atomic
+    ++err;
+    printf("Error: %s: schedule: (%d, %d) is not equal to (%d, %d)\n", extra,
+           (int)hope_sched, hope_chunk, (int)sched, chunk);
+  }
+}
+
+int main() {
+  int i;
+  int chunk;
+  omp_sched_t sched0;
+
+  omp_set_dynamic(0);
+  omp_set_nested(1);
+
+  // check serial region
+  omp_get_schedule(&sched0, &chunk);
+#ifdef DEBUG
+  printf("initial: (%d, %d)\n", sched0, chunk);
+#endif
+  check_schedule("initial", omp_sched_static, 0, sched0, chunk);
+  // set schedule before the parallel, check it after the parallel
+  omp_set_schedule(
+      sched_append_modifiers(omp_sched_dynamic, omp_sched_monotonic), 3);
+
+#pragma omp parallel num_threads(3) private(i)
+  {
+    omp_sched_t n_outer_set, n_outer_get;
+    int c_outer;
+    int tid = omp_get_thread_num();
+
+    n_outer_set = sched_append_modifiers((omp_sched_t)(tid + 1),
+                                         omp_sched_monotonic); // 1, 2, 3
+
+    // check outer parallel region: each thread sets its own monotonic
+    // schedule (tid+1, chunk = tid) and verifies it after the inner parallel
+    // set schedule before inner parallel, check it after the parallel
+    omp_set_schedule(n_outer_set, tid);
+
+// Make sure this schedule doesn't crash the runtime
+#pragma omp for
+    for (i = 0; i < 100; ++i) {
+#pragma omp atomic
+      global++;
+    }
+
+#pragma omp parallel num_threads(3) private(i) shared(n_outer_set)
+    {
+      omp_sched_t n_inner_set, n_inner_get;
+      int c_inner_set, c_inner_get;
+      int tid = omp_get_thread_num();
+
+      n_inner_set = (omp_sched_t)(tid + 1); // 1, 2, 3
+      c_inner_set = (int)(n_outer_set)*10 +
+                    (int)n_inner_set; // 11, 12, 13, 21, 22, 23, 31, 32, 33
+      n_inner_set = sched_append_modifiers(n_inner_set, omp_sched_monotonic);
+      // schedules set inside parallel should not impact outer schedules
+      omp_set_schedule(n_inner_set, c_inner_set);
+
+// Make sure this schedule doesn't crash the runtime
+#pragma omp for
+      for (i = 0; i < 100; ++i) {
+#pragma omp atomic
+        global++;
+      }
+
+#pragma omp barrier
+      omp_get_schedule(&n_inner_get, &c_inner_get);
+#ifdef DEBUG
+      printf("inner parallel: o_th %d, i_th %d, (%d, %d)\n", n_outer_set - 1,
+             tid, n_inner_get, c_inner_get);
+#endif
+      check_schedule("inner", n_inner_set, c_inner_set, n_inner_get,
+                     c_inner_get);
+    }
+
+    omp_get_schedule(&n_outer_get, &c_outer);
+#ifdef DEBUG
+    printf("outer parallel: thread %d, (%d, %d)\n", tid, n_outer_get, c_outer);
+#endif
+    check_schedule("outer", n_outer_set, tid, n_outer_get, c_outer);
+  }
+
+  omp_get_schedule(&sched0, &chunk);
+#ifdef DEBUG
+  printf("after parallels: (%d, %d)\n", sched0, chunk);
+#endif
+  check_schedule("after parallels",
+                 sched_append_modifiers(omp_sched_dynamic, omp_sched_monotonic),
+                 3, sched0, chunk);
+
+  if (err > 0) {
+    printf("Failed\n");
+    return 1;
+  }
+  printf("Passed\n");
+  return 0;
+}