[pstl] Support Threading Building Blocks 2020 (oneTBB) for "tbb" parallel backend.

After the changes the "tbb" parallel backend will work with old TBB versions(TBB_INTERFACE_VERSION <= 12000) and new ones (TBB 2020 and greater)

More about oneTBB:
https://github.com/oneapi-src/oneTBB

Phabricator Review:
https://reviews.llvm.org/D87380

GitOrigin-RevId: 0b2e0e80d963f3498705a38e8d02eafe541ca9d6
diff --git a/include/pstl/internal/parallel_backend_tbb.h b/include/pstl/internal/parallel_backend_tbb.h
index a9ea0c7..f1836aa 100644
--- a/include/pstl/internal/parallel_backend_tbb.h
+++ b/include/pstl/internal/parallel_backend_tbb.h
@@ -25,6 +25,7 @@
 #include <tbb/parallel_invoke.h>
 #include <tbb/task_arena.h>
 #include <tbb/tbb_allocator.h>
+#include <tbb/task.h>
 
 #if TBB_INTERFACE_VERSION < 10000
 #    error Intel(R) Threading Building Blocks 2018 is required; older versions are not supported.
@@ -71,7 +72,11 @@
 inline void
 __cancel_execution()
 {
+#if TBB_INTERFACE_VERSION <= 12000
     tbb::task::self().group()->cancel_group_execution();
+#else
+    tbb::task::current_context()->cancel_group_execution();
+#endif
 }
 
 //------------------------------------------------------------------------
@@ -413,17 +418,308 @@
 //------------------------------------------------------------------------
 #define _PSTL_MERGE_CUT_OFF 2000
 
+template <typename _Func>
+class __func_task;
+template <typename _Func>
+class __root_task;
+
+#if TBB_INTERFACE_VERSION <= 12000
+class __task : public tbb::task
+{
+  public:
+    template <typename _Fn>
+    __task*
+    make_continuation(_Fn&& __f)
+    {
+        return new (allocate_continuation()) __func_task<typename std::decay<_Fn>::type>(std::forward<_Fn>(__f));
+    }
+
+    template <typename _Fn>
+    __task*
+    make_child_of(__task* parent, _Fn&& __f)
+    {
+        return new (parent->allocate_child()) __func_task<typename std::decay<_Fn>::type>(std::forward<_Fn>(__f));
+    }
+
+    template <typename _Fn>
+    __task*
+    make_additional_child_of(tbb::task* parent, _Fn&& __f)
+    {
+        return new (tbb::task::allocate_additional_child_of(*parent))
+            __func_task<typename std::decay<_Fn>::type>(std::forward<_Fn>(__f));
+    }
+
+    inline void
+    recycle_as_continuation()
+    {
+        tbb::task::recycle_as_continuation();
+    }
+
+    inline void
+    recycle_as_child_of(__task* parent)
+    {
+        tbb::task::recycle_as_child_of(*parent);
+    }
+
+    inline void
+    spawn(__task* __t)
+    {
+        tbb::task::spawn(*__t);
+    }
+
+    template <typename _Fn>
+    static inline void
+    spawn_root_and_wait(__root_task<_Fn>& __root)
+    {
+        tbb::task::spawn_root_and_wait(*__root._M_task);
+    }
+};
+
+template <typename _Func>
+class __func_task : public __task
+{
+    _Func _M_func;
+
+    tbb::task*
+    execute()
+    {
+        return _M_func(this);
+    };
+
+  public:
+    template <typename _Fn>
+    __func_task(_Fn&& __f) : _M_func{std::forward<_Fn>(__f)}
+    {
+    }
+
+    _Func&
+    body()
+    {
+        return _M_func;
+    }
+};
+
+template <typename _Func>
+class __root_task
+{
+    tbb::task* _M_task;
+
+  public:
+    template <typename... Args>
+    __root_task(Args&&... args)
+        : _M_task{new (tbb::task::allocate_root()) __func_task<_Func>{_Func(std::forward<Args>(args)...)}}
+    {
+    }
+
+    friend class __task;
+    friend class __func_task<_Func>;
+};
+
+#else  // TBB_INTERFACE_VERSION <= 12000
+class __task : public tbb::detail::d1::task
+{
+  protected:
+    tbb::detail::d1::small_object_allocator _M_allocator{};
+    tbb::detail::d1::execution_data* _M_execute_data{};
+    __task* _M_parent{};
+    std::atomic<int> _M_refcount{};
+    bool _M_recycle{};
+
+    template <typename _Fn>
+    __task*
+    allocate_func_task(_Fn&& __f)
+    {
+        assert(_M_execute_data != nullptr);
+        tbb::detail::d1::small_object_allocator __alloc{};
+        auto __t =
+            __alloc.new_object<__func_task<typename std::decay<_Fn>::type>>(*_M_execute_data, std::forward<_Fn>(__f));
+        __t->_M_allocator = __alloc;
+        return __t;
+    }
+
+  public:
+    __task*
+    parent()
+    {
+        return _M_parent;
+    }
+
+    void
+    set_ref_count(int __n)
+    {
+        _M_refcount.store(__n, std::memory_order_release);
+    }
+
+    template <typename _Fn>
+    __task*
+    make_continuation(_Fn&& __f)
+    {
+        auto __t = allocate_func_task(std::forward<_Fn&&>(__f));
+        __t->_M_parent = _M_parent;
+        _M_parent = nullptr;
+        return __t;
+    }
+
+    template <typename _Fn>
+    __task*
+    make_child_of(__task* __parent, _Fn&& __f)
+    {
+        auto __t = allocate_func_task(std::forward<_Fn&&>(__f));
+        __t->_M_parent = __parent;
+        return __t;
+    }
+
+    template <typename _Fn>
+    __task*
+    make_additional_child_of(__task* __parent, _Fn&& __f)
+    {
+        auto __t = make_child_of(__parent, std::forward<_Fn>(__f));
+        assert(__parent->_M_refcount.load(std::memory_order_relaxed) > 0);
+        ++__parent->_M_refcount;
+        return __t;
+    }
+
+    inline void
+    recycle_as_continuation()
+    {
+        _M_recycle = true;
+    }
+
+    inline void
+    recycle_as_child_of(__task* parent)
+    {
+        _M_recycle = true;
+        _M_parent = parent;
+    }
+
+    inline void
+    spawn(__task* __t)
+    {
+        assert(_M_execute_data != nullptr);
+        tbb::detail::d1::spawn(*__t, *_M_execute_data->context);
+    }
+
+    template <typename _Fn>
+    static inline void
+    spawn_root_and_wait(__root_task<_Fn>& __root)
+    {
+        tbb::detail::d1::execute_and_wait(*__root._M_func_task, __root._M_context, __root._M_wait_object,
+                                          __root._M_context);
+    }
+
+    template <typename _Func>
+    friend class __func_task;
+};
+
+template <typename _Func>
+class __func_task : public __task
+{
+    _Func _M_func;
+
+    __task*
+    execute(tbb::detail::d1::execution_data& __ed) override
+    {
+        _M_execute_data = &__ed;
+        _M_recycle = false;
+        __task* __next = _M_func(this);
+        return finalize(__next);
+    };
+
+    __task*
+    cancel(tbb::detail::d1::execution_data& __ed) override
+    {
+        return finalize(nullptr);
+    }
+
+    __task*
+    finalize(__task* __next)
+    {
+        bool __recycle = _M_recycle;
+        _M_recycle = false;
+
+        if (__recycle)
+        {
+            return __next;
+        }
+
+        auto __parent = _M_parent;
+        auto __alloc = _M_allocator;
+        auto __ed = _M_execute_data;
+
+        this->~__func_task();
+
+        assert(__parent != nullptr);
+        assert(__parent->_M_refcount.load(std::memory_order_relaxed) > 0);
+        if (--__parent->_M_refcount == 0)
+        {
+            assert(__next == nullptr);
+            __alloc.deallocate(this, *__ed);
+            return __parent;
+        }
+
+        return __next;
+    }
+
+    friend class __root_task<_Func>;
+
+  public:
+    template <typename _Fn>
+    __func_task(_Fn&& __f) : _M_func(std::forward<_Fn>(__f))
+    {
+    }
+
+    _Func&
+    body()
+    {
+        return _M_func;
+    }
+};
+
+template <typename _Func>
+class __root_task : public __task
+{
+    __task*
+    execute(tbb::detail::d1::execution_data& __ed) override
+    {
+        _M_wait_object.release();
+        return nullptr;
+    };
+
+    __task*
+    cancel(tbb::detail::d1::execution_data& __ed) override
+    {
+        _M_wait_object.release();
+        return nullptr;
+    }
+
+    __func_task<_Func>* _M_func_task{};
+    tbb::detail::d1::wait_context _M_wait_object{0};
+    tbb::task_group_context _M_context{};
+
+  public:
+    template <typename... Args>
+    __root_task(Args&&... args) : _M_wait_object{1}
+    {
+        tbb::detail::d1::small_object_allocator __alloc{};
+        _M_func_task = __alloc.new_object<__func_task<_Func>>(_Func(std::forward<Args>(args)...));
+        _M_func_task->_M_allocator = __alloc;
+        _M_func_task->_M_parent = this;
+        _M_refcount.store(1, std::memory_order_relaxed);
+    }
+
+    friend class __task;
+};
+#endif // TBB_INTERFACE_VERSION <= 12000
+
 template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _Compare, typename _Cleanup,
           typename _LeafMerge>
-class __merge_task : public tbb::task
+class __merge_func
 {
     typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1;
     typedef typename std::iterator_traits<_RandomAccessIterator2>::difference_type _DifferenceType2;
     typedef typename std::common_type<_DifferenceType1, _DifferenceType2>::type _SizeType;
     typedef typename std::iterator_traits<_RandomAccessIterator1>::value_type _ValueType;
 
-    /*override*/ tbb::task*
-    execute();
     _RandomAccessIterator1 _M_x_beg;
     _RandomAccessIterator2 _M_z_beg;
 
@@ -529,7 +825,7 @@
     };
 
   public:
-    __merge_task(_SizeType __xs, _SizeType __xe, _SizeType __ys, _SizeType __ye, _SizeType __zs, _Compare __comp,
+    __merge_func(_SizeType __xs, _SizeType __xe, _SizeType __ys, _SizeType __ye, _SizeType __zs, _Compare __comp,
                  _Cleanup, _LeafMerge __leaf_merge, _SizeType __nsort, _RandomAccessIterator1 __x_beg,
                  _RandomAccessIterator2 __z_beg, bool __x_orig, bool __y_orig, bool __root)
         : _M_xs(__xs), _M_xe(__xe), _M_ys(__ys), _M_ye(__ye), _M_zs(__zs), _M_x_beg(__x_beg), _M_z_beg(__z_beg),
@@ -554,12 +850,14 @@
             _y_orig = __on_off;
     }
 
+    __task*
+    operator()(__task* __self);
+
   private:
-    __merge_task*
-    parent_merge() const
+    __merge_func*
+    parent_merge(__task* __self) const
     {
-        tbb::task* p = (_root ? nullptr : parent());
-        return static_cast<__merge_task*>(p);
+        return _root ? nullptr : &static_cast<__func_task<__merge_func>*>(__self->parent())->body();
     }
     bool
     x_less_y()
@@ -615,8 +913,8 @@
 
         _y_orig = !_y_orig;
     }
-    tbb::task*
-    merge_ranges()
+    __task*
+    merge_ranges(__task* __self)
     {
         assert(_x_orig == _y_orig); //two merged subrange must be lie into the same buffer
 
@@ -626,7 +924,7 @@
 
         // need to merge {x} and {y}
         if (__n > __merge_cut_off)
-            return split_merging();
+            return split_merging(__self);
 
         //merge to buffer
         if (_x_orig)
@@ -634,7 +932,7 @@
             _M_leaf_merge(_M_x_beg + _M_xs, _M_x_beg + _M_xe, _M_x_beg + _M_ys, _M_x_beg + _M_ye, _M_z_beg + _M_zs,
                           _M_comp, __move_value_construct(), __move_value_construct(), __move_range_construct(),
                           __move_range_construct());
-            assert(parent_merge()); //not root merging task
+            assert(parent_merge(__self)); //not root merging task
         }
         //merge to "origin"
         else
@@ -656,13 +954,13 @@
         return nullptr;
     }
 
-    tbb::task*
-    process_ranges()
+    __task*
+    process_ranges(__task* __self)
     {
         assert(_x_orig == _y_orig);
         assert(!_split);
 
-        auto p = parent_merge();
+        auto p = parent_merge(__self);
 
         if (!p)
         { //root merging task
@@ -685,7 +983,7 @@
                 move_y_range(); //parallel moving
             }
             // need to merge {x} and {y}.
-            return merge_ranges();
+            return merge_ranges(__self);
         }
         //else: not root merging task (parent_merge() == NULL)
         //optimization, just for sort algorithm, //{x} <= {y}
@@ -699,12 +997,12 @@
         const auto id_range = _M_zs;
         p->set_odd(id_range, !_x_orig);
 
-        return merge_ranges();
+        return merge_ranges(__self);
     }
 
     //splitting as merge task into 2 of the same level
-    tbb::task*
-    split_merging()
+    __task*
+    split_merging(__task* __self)
     {
         assert(_x_orig == _y_orig);
         const auto __nx = (_M_xe - _M_xs);
@@ -732,43 +1030,42 @@
         }
 
         auto __zm = _M_zs + ((__xm - _M_xs) + (__ym - _M_ys));
+        __merge_func __right_func(__xm, _M_xe, __ym, _M_ye, __zm, _M_comp, _Cleanup(), _M_leaf_merge, _M_nsort,
+                                  _M_x_beg, _M_z_beg, _x_orig, _y_orig, _root);
+        __right_func._split = true;
+        auto __merge_task = __self->make_additional_child_of(__self->parent(), std::move(__right_func));
+        __self->spawn(__merge_task);
+        __self->recycle_as_continuation();
 
-        __merge_task* __right = new (tbb::task::allocate_additional_child_of(*parent()))
-            __merge_task(__xm, _M_xe, __ym, _M_ye, __zm, _M_comp, _Cleanup(), _M_leaf_merge, _M_nsort, _M_x_beg,
-                         _M_z_beg, _x_orig, _y_orig, _root);
-
-        __right->_split = true;
-
-        tbb::task::spawn(*__right);
-        tbb::task::recycle_as_continuation();
         _M_xe = __xm;
         _M_ye = __ym;
         _split = true;
 
-        return this;
+        return __self;
     }
 };
 
 template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename __M_Compare, typename _Cleanup,
           typename _LeafMerge>
-tbb::task*
-__merge_task<_RandomAccessIterator1, _RandomAccessIterator2, __M_Compare, _Cleanup, _LeafMerge>::execute()
+__task*
+__merge_func<_RandomAccessIterator1, _RandomAccessIterator2, __M_Compare, _Cleanup, _LeafMerge>::
+operator()(__task* __self)
 {
     //a. split merge task into 2 of the same level; the special logic,
     //without processing(process_ranges) adjacent sub-ranges x and y
     if (_split)
-        return merge_ranges();
+        return merge_ranges(__self);
 
     //b. General merging of adjacent sub-ranges x and y (with optimization in case of {x} <= {y} )
 
     //1. x and y are in the even buffer
     //2. x and y are in the odd buffer
     if (_x_orig == _y_orig)
-        return process_ranges();
+        return process_ranges(__self);
 
     //3. x is in even buffer, y is in the odd buffer
     //4. x is in odd buffer, y is in the even buffer
-    if (!parent_merge())
+    if (!parent_merge(__self))
     { //root merge task
         if (_x_orig)
             move_x_range();
@@ -788,11 +1085,11 @@
             move_y_range();
     }
 
-    return process_ranges();
+    return process_ranges(__self);
 }
 
 template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _Compare, typename _LeafSort>
-class __stable_sort_task : public tbb::task
+class __stable_sort_func
 {
   public:
     typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1;
@@ -800,8 +1097,6 @@
     typedef typename std::common_type<_DifferenceType1, _DifferenceType2>::type _SizeType;
 
   private:
-    /*override*/ tbb::task*
-    execute();
     _RandomAccessIterator1 _M_xs, _M_xe, _M_x_beg;
     _RandomAccessIterator2 _M_zs, _M_z_beg;
     _Compare _M_comp;
@@ -810,22 +1105,25 @@
     _SizeType _M_nsort; //zero or number of elements to be sorted for partial_sort alforithm
 
   public:
-    __stable_sort_task(_RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe, _RandomAccessIterator2 __zs,
+    __stable_sort_func(_RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe, _RandomAccessIterator2 __zs,
                        bool __root, _Compare __comp, _LeafSort __leaf_sort, _SizeType __nsort,
                        _RandomAccessIterator1 __x_beg, _RandomAccessIterator2 __z_beg)
         : _M_xs(__xs), _M_xe(__xe), _M_x_beg(__x_beg), _M_zs(__zs), _M_z_beg(__z_beg), _M_comp(__comp),
           _M_leaf_sort(__leaf_sort), _M_root(__root), _M_nsort(__nsort)
     {
     }
+
+    __task*
+    operator()(__task* __self);
 };
 
 #define _PSTL_STABLE_SORT_CUT_OFF 500
 
 template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _Compare, typename _LeafSort>
-tbb::task*
-__stable_sort_task<_RandomAccessIterator1, _RandomAccessIterator2, _Compare, _LeafSort>::execute()
+__task*
+__stable_sort_func<_RandomAccessIterator1, _RandomAccessIterator2, _Compare, _LeafSort>::operator()(__task* __self)
 {
-    typedef __merge_task<_RandomAccessIterator1, _RandomAccessIterator2, _Compare, __utils::__serial_destroy,
+    typedef __merge_func<_RandomAccessIterator1, _RandomAccessIterator2, _Compare, __utils::__serial_destroy,
                          __utils::__serial_move_merge>
         _MergeTaskType;
 
@@ -835,34 +1133,27 @@
     if (__n <= __sort_cut_off)
     {
         _M_leaf_sort(_M_xs, _M_xe, _M_comp);
-
         assert(!_M_root);
-
-        tbb::task* p = parent();
-        const auto id_range = _M_xs - _M_x_beg;
-
         return nullptr;
     }
 
     const _RandomAccessIterator1 __xm = _M_xs + __n / 2;
     const _RandomAccessIterator2 __zm = _M_zs + (__xm - _M_xs);
     const _RandomAccessIterator2 __ze = _M_zs + __n;
-    _MergeTaskType* __m = new (allocate_continuation()) _MergeTaskType(
-        _M_xs - _M_x_beg, __xm - _M_x_beg, __xm - _M_x_beg, _M_xe - _M_x_beg, _M_zs - _M_z_beg, _M_comp,
-        __utils::__serial_destroy(), __utils::__serial_move_merge(__nmerge), _M_nsort, _M_x_beg, _M_z_beg,
-        /*x_orig*/ true, /*y_orig*/ true, /*root*/ _M_root);
-
+    _MergeTaskType __m(_MergeTaskType(_M_xs - _M_x_beg, __xm - _M_x_beg, __xm - _M_x_beg, _M_xe - _M_x_beg,
+                                      _M_zs - _M_z_beg, _M_comp, __utils::__serial_destroy(),
+                                      __utils::__serial_move_merge(__nmerge), _M_nsort, _M_x_beg, _M_z_beg,
+                                      /*x_orig*/ true, /*y_orig*/ true, /*root*/ _M_root));
+    auto __parent = __self->make_continuation(std::move(__m));
+    __parent->set_ref_count(2);
+    auto __right = __self->make_child_of(
+        __parent, __stable_sort_func(__xm, _M_xe, __zm, false, _M_comp, _M_leaf_sort, _M_nsort, _M_x_beg, _M_z_beg));
+    __self->spawn(__right);
+    __self->recycle_as_child_of(__parent);
     _M_root = false;
-
-    __m->set_ref_count(2);
-    auto __right = new (__m->allocate_child())
-        __stable_sort_task(__xm, _M_xe, __zm, _M_root, _M_comp, _M_leaf_sort, _M_nsort, _M_x_beg, _M_z_beg);
-
-    spawn(*__right);
-    recycle_as_child_of(*__m);
     _M_xe = __xm;
 
-    return this;
+    return __self;
 }
 
 template <class _ExecutionPolicy, typename _RandomAccessIterator, typename _Compare, typename _LeafSort>
@@ -882,11 +1173,9 @@
         if (__n > __sort_cut_off)
         {
             __buffer<_ValueType> __buf(__n);
-            tbb::task* root = new (tbb::task::allocate_root())
-                __stable_sort_task<_RandomAccessIterator, _ValueType*, _Compare, _LeafSort>(
-                    __xs, __xe, __buf.get(), true, __comp, __leaf_sort, __nsort, __xs, __buf.get());
-            tbb::task::spawn_root_and_wait(*root);
-
+            __root_task<__stable_sort_func<_RandomAccessIterator, _ValueType*, _Compare, _LeafSort>> __root{
+                __xs, __xe, __buf.get(), true, __comp, __leaf_sort, __nsort, __xs, __buf.get()};
+            __task::spawn_root_and_wait(__root);
             return;
         }
         //serial sort
@@ -899,10 +1188,8 @@
 //------------------------------------------------------------------------
 template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _RandomAccessIterator3,
           typename _Compare, typename _LeafMerge>
-class __merge_task_static : public tbb::task
+class __merge_func_static
 {
-    /*override*/ tbb::task*
-    execute();
     _RandomAccessIterator1 _M_xs, _M_xe;
     _RandomAccessIterator2 _M_ys, _M_ye;
     _RandomAccessIterator3 _M_zs;
@@ -910,20 +1197,23 @@
     _LeafMerge _M_leaf_merge;
 
   public:
-    __merge_task_static(_RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe, _RandomAccessIterator2 __ys,
+    __merge_func_static(_RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe, _RandomAccessIterator2 __ys,
                         _RandomAccessIterator2 __ye, _RandomAccessIterator3 __zs, _Compare __comp,
                         _LeafMerge __leaf_merge)
         : _M_xs(__xs), _M_xe(__xe), _M_ys(__ys), _M_ye(__ye), _M_zs(__zs), _M_comp(__comp), _M_leaf_merge(__leaf_merge)
     {
     }
+
+    __task*
+    operator()(__task* __self);
 };
 
 //TODO: consider usage of parallel_for with a custom blocked_range
 template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _RandomAccessIterator3,
           typename __M_Compare, typename _LeafMerge>
-tbb::task*
-__merge_task_static<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3, __M_Compare,
-                    _LeafMerge>::execute()
+__task*
+__merge_func_static<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3, __M_Compare, _LeafMerge>::
+operator()(__task* __self)
 {
     typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1;
     typedef typename std::iterator_traits<_RandomAccessIterator2>::difference_type _DifferenceType2;
@@ -949,14 +1239,14 @@
         __ym = std::lower_bound(_M_ys, _M_ye, *__xm, _M_comp);
     }
     const _RandomAccessIterator3 __zm = _M_zs + ((__xm - _M_xs) + (__ym - _M_ys));
-    tbb::task* __right = new (tbb::task::allocate_additional_child_of(*parent()))
-        __merge_task_static(__xm, _M_xe, __ym, _M_ye, __zm, _M_comp, _M_leaf_merge);
-    tbb::task::spawn(*__right);
-    tbb::task::recycle_as_continuation();
+    auto __right = __self->make_additional_child_of(
+        __self->parent(), __merge_func_static(__xm, _M_xe, __ym, _M_ye, __zm, _M_comp, _M_leaf_merge));
+    __self->spawn(__right);
+    __self->recycle_as_continuation();
     _M_xe = __xm;
     _M_ye = __ym;
 
-    return this;
+    return __self;
 }
 
 template <class _ExecutionPolicy, typename _RandomAccessIterator1, typename _RandomAccessIterator2,
@@ -979,11 +1269,11 @@
     else
     {
         tbb::this_task_arena::isolate([=]() {
-            typedef __merge_task_static<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3,
+            typedef __merge_func_static<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3,
                                         _Compare, _LeafMerge>
                 _TaskType;
-            tbb::task::spawn_root_and_wait(*new (tbb::task::allocate_root())
-                                               _TaskType(__xs, __xe, __ys, __ye, __zs, __comp, __leaf_merge));
+            __root_task<_TaskType> __root{__xs, __xe, __ys, __ye, __zs, __comp, __leaf_merge};
+            __task::spawn_root_and_wait(__root);
         });
     }
 }