final/runtime/test/tasking/kmp_task_reduction_nest.cpp - openmp - Git at Google

 // RUN: %libomp-cxx-compile-and-run
 // RUN: %libomp-cxx-compile -DFLG=1 && %libomp-run
 // GCC-5 is needed for OpenMP 4.0 support (taskgroup)
 // XFAIL: gcc-4
 #include <cassert>
 #include <cmath>
 #include <cstdio>
 #include <cstdlib>
 #include <omp.h>

 // Total number of loop iterations; should be a multiple of the number of
 // iterations handled per task (2 in this test)
 #define N 10000

 // Flag to request lazy (1) or eager (0) allocation of reduction objects
 #ifndef FLG
 #define FLG 0
 #endif

 /*
   // initial user's code that corresponds to pseudo code of the test
   #pragma omp taskgroup task_reduction(+:i,j) task_reduction(*:x)
   {
     for( int l = 0; l < N; ++l ) {
       #pragma omp task firstprivate(l) in_reduction(+:i) in_reduction(*:x)
       {
         i += l;
         if( l%2 )
           x *= 1.0 / (l + 1);
         else
           x *= (l + 1);
       }
     }

     #pragma omp taskgroup task_reduction(-:i,k) task_reduction(+:y)
     {
       for( int l = 0; l < N; ++l ) {
         #pragma omp task firstprivate(l) in_reduction(+:j,y) \
             in_reduction(*:x) in_reduction(-:k)
         {
           j += l;
           k -= l;
           y += (double)l;
           if( l%2 )
             x *= 1.0 / (l + 1);
           else
             x *= (l + 1);
         }
         #pragma omp task firstprivate(l) in_reduction(+:y) in_reduction(-:i,k)
         {
           i -= l;
           k -= l;
           y += (double)l;
         }
         #pragma omp task firstprivate(l) in_reduction(+:j) in_reduction(*:x)
         {
           j += l;
           if( l%2 )
             x *= 1.0 / (l + 1);
           else
             x *= (l + 1);
         }
       }
     } // inner reduction

     for( int l = 0; l < N; ++l ) {
       #pragma omp task firstprivate(l) in_reduction(+:j)
         j += l;
     }
   } // outer reduction
 */

 //------------------------------------------------
 // OpenMP runtime library routines
 #ifdef __cplusplus
 extern "C" {
 #endif
 extern void* __kmpc_task_reduction_get_th_data(int gtid, void* tg, void* item);
 extern void* __kmpc_task_reduction_init(int gtid, int num, void* data);
 extern int __kmpc_global_thread_num(void*);
 #ifdef __cplusplus
 }
 #endif

 //------------------------------------------------
 // Compiler-generated code

 // Descriptor of a single reduction item. The test fills arrays of these and
 // passes them to __kmpc_task_reduction_init().
 // NOTE(review): field order and types presumably have to mirror the layout
 // the OpenMP runtime expects -- confirm against libomp before changing them.
 typedef struct _task_red_item {
     void       *shar; // shared reduction item
     size_t      size; // size of data item
     void       *f_init; // data initialization routine
     void       *f_fini; // data finalization routine
     void       *f_comb; // data combiner routine
     unsigned    flags; // set from FLG in this test (1 = lazy, 0 = eager allocation)
 } _task_red_item_t;

 // Combiner for "int" sum reductions: folds the value at rhs into the value
 // at lhs. No init/fini callbacks are required for this type, and the same
 // routine is also valid for subtraction reductions.
 void __red_int_add_comb(void *lhs, void *rhs)
 {
     int *acc = (int *)lhs;
     const int *part = (const int *)rhs;
     *acc += *part;
 }

 // Combiner for "long long" sum reductions: folds the value at rhs into the
 // value at lhs. No init/fini callbacks are required for this type, and the
 // same routine is also valid for subtraction reductions.
 void __red_llong_add_comb(void *lhs, void *rhs)
 {
     long long *acc = (long long *)lhs;
     const long long *part = (const long long *)rhs;
     *acc += *part;
 }

 // Initializer for "double" product reductions: stores 1.0 (the
 // multiplicative identity) into the object at data. No fini callback is
 // needed for this type.
 void __red_dbl_mul_init(void *data)
 {
     double *slot = (double *)data;
     *slot = 1.0;
 }
 // Combiner for "double" product reductions: multiplies the value at lhs by
 // the value at rhs, leaving the result in lhs.
 void __red_dbl_mul_comb(void *lhs, void *rhs)
 {
     double *acc = (double *)lhs;
     const double *part = (const double *)rhs;
     *acc *= *part;
 }

 // Combiner for "double" sum reductions: adds the value at rhs to the value
 // at lhs. No init/fini callbacks are required for this type.
 void __red_dbl_add_comb(void *lhs, void *rhs)
 {
     double *acc = (double *)lhs;
     const double *part = (const double *)rhs;
     *acc += *part;
 }

 // ==============================

 // Serial reference computation. Applies, one iteration at a time, the same
 // updates that the task bodies of the pseudo code perform, so that main()
 // can compare the reduced results against these values.
 //   pi, pj, pk : integer accumulators updated by +l / -l
 //   px         : product accumulator, scaled by (l+1) on even l and by
 //                1/(l+1) on odd l
 //   py         : floating-point sum accumulator
 void calc_serial(int *pi, long long *pj, double *px, long long *pk, double *py)
 {
     // first task loop of the outer taskgroup: i and x
     for (int it = 0; it < N; it++) {
         *pi += it;
         *px *= (it % 2) ? 1.0 / (it + 1) : (double)(it + 1);
     }

     // three tasks per iteration of the inner taskgroup
     for (int it = 0; it < N; it++) {
         // task 1: j, k, y, x
         *pj += it;
         *pk -= it;
         *py += (double)it;
         *px *= (it % 2) ? 1.0 / (it + 1) : (double)(it + 1);

         // task 2: i, k, y
         *pi -= it;
         *pk -= it;
         *py += (double)it;

         // task 3: j, x
         *pj += it;
         *px *= (it % 2) ? 1.0 / (it + 1) : (double)(it + 1);
     }

     // trailing task loop of the outer taskgroup: j only
     for (int it = 0; it < N; it++)
         *pj += it;
 }

 //------------------------------------------------
 // Test case
 // Test driver. Hand-expands the nested task_reduction / in_reduction pseudo
 // code into direct calls to the runtime entry points declared above, runs it
 // inside a parallel region, and compares the reduced values with the ones
 // produced by calc_serial().
 // Returns 0 when all values match and every thread-specific address check
 // succeeded; returns 1 otherwise, so that a failure is visible in the exit
 // status and not only in the printed text.
 int main()
 {
   int nthreads = omp_get_max_threads();
   int err = 0; // number of failed thread-specific address checks
   // One slot per thread: remembers a thread-specific address of xp obtained
   // in the outer taskgroup; used below for the "neighbour" lookup check.
   // NOTE(review): slots are written by global thread id and read by
   // omp_get_thread_num(); this assumes both coincide and stay below
   // omp_get_max_threads() -- confirm for the runtime configuration in use.
   void** ptrs = (void**)malloc(nthreads*sizeof(void*));
   if (ptrs == NULL) {
     printf("failed, cannot allocate %d pointer slots\n", nthreads);
     return 1;
   }

   // user's code ======================================
   // variables for serial calculations:
   int is = 3;
   long long js = -9999999;
   double xs = 99999.0;
   long long ks = 99999999;
   double ys = -99999999.0;
   // variables for parallel calculations:
   int ip = 3;
   long long jp = -9999999;
   double xp = 99999.0;
   long long kp = 99999999;
   double yp = -99999999.0;

   calc_serial(&is, &js, &xs, &ks, &ys);
   // ==================================================
   for (int i = 0; i < nthreads; ++i)
     ptrs[i] = NULL;
   #pragma omp parallel
   {
     #pragma omp single nowait
     {
       // outer taskgroup reduces (i,j,x)
       #pragma omp taskgroup // task_reduction(+:i,j) task_reduction(*:x)
       {
         _task_red_item_t red_data[3];
         red_data[0].shar = &ip;
         red_data[0].size = sizeof(ip);
         red_data[0].f_init = NULL; // RTL will zero thread-specific objects
         red_data[0].f_fini = NULL; // no destructors needed
         red_data[0].f_comb = (void*)&__red_int_add_comb;
         red_data[0].flags = FLG;
         red_data[1].shar = &jp;
         red_data[1].size = sizeof(jp);
         red_data[1].f_init = NULL; // RTL will zero thread-specific objects
         red_data[1].f_fini = NULL; // no destructors needed
         red_data[1].f_comb = (void*)&__red_llong_add_comb;
         red_data[1].flags = FLG;
         red_data[2].shar = &xp;
         red_data[2].size = sizeof(xp);
         red_data[2].f_init = (void*)&__red_dbl_mul_init;
         red_data[2].f_fini = NULL; // no destructors needed
         red_data[2].f_comb = (void*)&__red_dbl_mul_comb;
         red_data[2].flags = FLG;
         int gtid = __kmpc_global_thread_num(NULL);
         void* tg1 = __kmpc_task_reduction_init(gtid, 3, red_data);

         for( int l = 0; l < N; l += 2 ) {
           // 2 iterations per task to get correct x value; actually any even
           // number of iters per task will work, otherwise x loses precision
           #pragma omp task firstprivate(l) //in_reduction(+:i) in_reduction(*:x)
           {
             int gtid = __kmpc_global_thread_num(NULL);
             int *p_ip = (int*)__kmpc_task_reduction_get_th_data(gtid, tg1, &ip);
             double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
                                         gtid, tg1, &xp);
             // remember this thread's private xp for the neighbour check below
             if (!ptrs[gtid]) ptrs[gtid] = p_xp;

             // user's pseudo-code ==============================
             *p_ip += l;
             *p_xp *= (l + 1);

             *p_ip += l + 1;
             *p_xp *= 1.0 / (l + 2);
             // ==================================================
           }
         }
         // inner taskgroup reduces (i,k,y), i is same object as in outer one
         #pragma omp taskgroup // task_reduction(-:i,k) task_reduction(+:y)
         {
           _task_red_item_t red_data[3];
           red_data[0].shar = &ip;
           red_data[0].size = sizeof(ip);
           red_data[0].f_init = NULL; // RTL will zero thread-specific objects
           red_data[0].f_fini = NULL; // no destructors needed
           red_data[0].f_comb = (void*)&__red_int_add_comb;
           red_data[0].flags = FLG;
           red_data[1].shar = &kp;
           red_data[1].size = sizeof(kp);
           red_data[1].f_init = NULL; // RTL will zero thread-specific objects
           red_data[1].f_fini = NULL; // no destructors needed
           red_data[1].f_comb = (void*)&__red_llong_add_comb; // same for + and -
           red_data[1].flags = FLG;
           red_data[2].shar = &yp;
           red_data[2].size = sizeof(yp);
           red_data[2].f_init = NULL; // RTL will zero thread-specific objects
           red_data[2].f_fini = NULL; // no destructors needed
           red_data[2].f_comb = (void*)&__red_dbl_add_comb;
           red_data[2].flags = FLG;
           int gtid = __kmpc_global_thread_num(NULL);
           void* tg2 = __kmpc_task_reduction_init(gtid, 3, red_data);

           for( int l = 0; l < N; l += 2 ) {
             #pragma omp task firstprivate(l)
             // in_reduction(+:j,y) in_reduction(*:x) in_reduction(-:k)
             {
               int gtid = __kmpc_global_thread_num(NULL);
               long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
                                                 gtid, tg1, &jp);
               long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data(
                                                 gtid, tg2, &kp);
               double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
                                           gtid, tg1, &xp);
               double *p_yp = (double*)__kmpc_task_reduction_get_th_data(
                                           gtid, tg2, &yp);
               // user's pseudo-code ==============================
               *p_jp += l;
               *p_kp -= l;
               *p_yp += (double)l;
               *p_xp *= (l + 1);

               *p_jp += l + 1;
               *p_kp -= l + 1;
               *p_yp += (double)(l + 1);
               *p_xp *= 1.0 / (l + 2);
               // =================================================
               {
                 // the following code is here just to check
                 // __kmpc_task_reduction_get_th_data:
                 int tid = omp_get_thread_num();
                 void *addr1;
                 void *addr2;
                 // lookup from the shared address, then from the private one
                 addr1 = __kmpc_task_reduction_get_th_data(gtid, tg1, &xp);
                 addr2 = __kmpc_task_reduction_get_th_data(gtid, tg1, addr1);
                 if (addr1 != addr2) {
                   #pragma omp atomic
                     ++err;
                   printf("Wrong thread-specific addresses %d s:%p p:%p\n", tid, addr1, addr2);
                 }
                 // from neighbour w/o taskgroup (should start lookup from
                 // current tg2); thread 0 wraps around to the last slot
                 int nb = (tid > 0) ? tid - 1 : nthreads - 1;
                 if (ptrs[nb]) {
                   addr2 = __kmpc_task_reduction_get_th_data(gtid, NULL, ptrs[nb]);
                   if (addr1 != addr2) {
                     #pragma omp atomic
                       ++err;
                     printf("Wrong thread-specific addresses %d s:%p n:%p\n",
                            tid, addr1, addr2);
                   }
                 }
                 // ----------------------------------------------
               }
             }
             #pragma omp task firstprivate(l)
             // in_reduction(+:y) in_reduction(-:i,k)
             {
               int gtid = __kmpc_global_thread_num(NULL);
               int *p_ip = (int*)__kmpc_task_reduction_get_th_data(
                                     gtid, tg2, &ip);
               long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data(
                                                 gtid, tg2, &kp);
               double *p_yp = (double*)__kmpc_task_reduction_get_th_data(
                                           gtid, tg2, &yp);

               // user's pseudo-code ==============================
               *p_ip -= l;
               *p_kp -= l;
               *p_yp += (double)l;

               *p_ip -= l + 1;
               *p_kp -= l + 1;
               *p_yp += (double)(l + 1);
               // =================================================
             }
             #pragma omp task firstprivate(l)
             // in_reduction(+:j) in_reduction(*:x)
             {
               int gtid = __kmpc_global_thread_num(NULL);
               long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
                                                 gtid, tg1, &jp);
               double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
                                           gtid, tg1, &xp);
               // user's pseudo-code ==============================
               *p_jp += l;
               *p_xp *= (l + 1);

               *p_jp += l + 1;
               *p_xp *= 1.0 / (l + 2);
               // =================================================
             }
           }
         } // inner reduction

         for( int l = 0; l < N; l += 2 ) {
           #pragma omp task firstprivate(l) // in_reduction(+:j)
           {
             int gtid = __kmpc_global_thread_num(NULL);
             long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
                                               gtid, tg1, &jp);
             // user's pseudo-code ==============================
             *p_jp += l;
             *p_jp += l + 1;
             // =================================================
           }
         }
       } // outer reduction
     } // end single
   } // end parallel
   free(ptrs);
   // check results
 #if _DEBUG
   printf("reduction flags = %u\n", (unsigned)FLG);
 #endif
   int failed = 0;
   if (ip == is && jp == js && ks == kp &&
       fabs(xp - xs) < 0.01 && fabs(yp - ys) < 0.01 && err == 0) {
     printf("passed\n");
   } else {
     failed = 1;
     printf("failed,\n ser:(%d %lld %f %lld %f)\n par:(%d %lld %f %lld %f)\n",
       is, js, xs, ks, ys,
       ip, jp, xp, kp, yp);
     if (err)
       printf(" %d thread-specific address check(s) failed\n", err);
   }
   return failed;
 }
	// RUN: %libomp-cxx-compile-and-run
	// RUN: %libomp-cxx-compile -DFLG=1 && %libomp-run
	// GCC-5 is needed for OpenMP 4.0 support (taskgroup)
	// XFAIL: gcc-4
	#include <cstdio>
	#include <cmath>
	#include <cassert>
	#include <omp.h>

	// Total number of loop iterations; should be a multiple of the number of
	// iterations handled per task (2 in this test)
	#define N 10000

	// Flag to request lazy (1) or eager (0) allocation of reduction objects
	#ifndef FLG
	#define FLG 0
	#endif

	/*
	// initial user's code that corresponds to pseudo code of the test
	#pragma omp taskgroup task_reduction(+:i,j) task_reduction(*:x)
	{
	for( int l = 0; l < N; ++l ) {
	#pragma omp task firstprivate(l) in_reduction(+:i) in_reduction(*:x)
	{
	i += l;
	if( l%2 )
	x *= 1.0 / (l + 1);
	else
	x *= (l + 1);
	}
	}

	#pragma omp taskgroup task_reduction(-:i,k) task_reduction(+:y)
	{
	for( int l = 0; l < N; ++l ) {
	#pragma omp task firstprivate(l) in_reduction(+:j,y) \
	in_reduction(*:x) in_reduction(-:k)
	{
	j += l;
	k -= l;
	y += (double)l;
	if( l%2 )
	x *= 1.0 / (l + 1);
	else
	x *= (l + 1);
	}
	#pragma omp task firstprivate(l) in_reduction(+:y) in_reduction(-:i,k)
	{
	i -= l;
	k -= l;
	y += (double)l;
	}
	#pragma omp task firstprivate(l) in_reduction(+:j) in_reduction(*:x)
	{
	j += l;
	if( l%2 )
	x *= 1.0 / (l + 1);
	else
	x *= (l + 1);
	}
	}
	} // inner reduction

	for( int l = 0; l < N; ++l ) {
	#pragma omp task firstprivate(l) in_reduction(+:j)
	j += l;
	}
	} // outer reduction
	*/

	//------------------------------------------------
	// OpenMP runtime library routines
	#ifdef __cplusplus
	extern "C" {
	#endif
	extern void* __kmpc_task_reduction_get_th_data(int gtid, void* tg, void* item);
	extern void* __kmpc_task_reduction_init(int gtid, int num, void* data);
	extern int __kmpc_global_thread_num(void*);
	#ifdef __cplusplus
	}
	#endif

	//------------------------------------------------
	// Compiler-generated code

	// Descriptor of a single reduction item. The test fills arrays of these and
	// passes them to __kmpc_task_reduction_init().
	// NOTE(review): field order and types presumably have to mirror the layout
	// the OpenMP runtime expects -- confirm against libomp before changing them.
	typedef struct _task_red_item {
	void *shar; // shared reduction item
	size_t size; // size of data item
	void *f_init; // data initialization routine
	void *f_fini; // data finalization routine
	void *f_comb; // data combiner routine
	unsigned flags; // set from FLG in this test (1 = lazy, 0 = eager allocation)
	} _task_red_item_t;

	// Combiner for "int" sum reductions: folds the value at rhs into the value
	// at lhs. No init/fini callbacks are required for this type, and the same
	// routine is also valid for subtraction reductions.
	// (Pointer parameters and dereferences restored; they had been stripped.)
	void __red_int_add_comb(void *lhs, void *rhs) // combiner
	{ *(int*)lhs += *(int*)rhs; }

	// Combiner for "long long" sum reductions: folds the value at rhs into the
	// value at lhs. No init/fini callbacks are required for this type, and the
	// same routine is also valid for subtraction reductions.
	// (Pointer parameters and dereferences restored; they had been stripped.)
	void __red_llong_add_comb(void *lhs, void *rhs) // combiner
	{ *(long long*)lhs += *(long long*)rhs; }

	// Initializer for "double" product reductions: stores 1.0 (the
	// multiplicative identity) into the object at data. No fini callback is
	// needed for this type.
	// (Dereference restored; the stripped form assigned to a cast expression.)
	void __red_dbl_mul_init(void *data) // initializer
	{ *(double*)data = 1.0; }
	// Combiner for "double" product reductions: multiplies the value at lhs by
	// the value at rhs, leaving the result in lhs.
	// (Pointer parameters and the "*=" operator restored; they had been
	// stripped, leaving a plain assignment of a pointer.)
	void __red_dbl_mul_comb(void *lhs, void *rhs) // combiner
	{ *(double*)lhs *= *(double*)rhs; }

	// Combiner for "double" sum reductions: adds the value at rhs to the value
	// at lhs. No init/fini callbacks are required for this type.
	// (Pointer parameters and dereferences restored; they had been stripped.)
	void __red_dbl_add_comb(void *lhs, void *rhs) // combiner
	{ *(double*)lhs += *(double*)rhs; }

	// ==============================

	// Serial reference computation. Applies, one iteration at a time, the same
	// updates that the task bodies of the pseudo code perform, so that main()
	// can compare the reduced results against these values.
	//   pi, pj, pk : integer accumulators updated by +l / -l
	//   px         : product accumulator, scaled by (l+1) on even l and by
	//                1/(l+1) on odd l
	//   py         : floating-point sum accumulator
	// (All five parameters are pointers again and x is updated with "*px *=";
	// both had been lost, which made the function ill-formed.)
	void calc_serial(int *pi, long long *pj, double *px, long long *pk, double *py)
	{
	    for( int l = 0; l < N; ++l ) {
	        *pi += l;
	        if( l%2 )
	            *px *= 1.0 / (l + 1);
	        else
	            *px *= (l + 1);
	    }
	    for( int l = 0; l < N; ++l ) {
	        *pj += l;
	        *pk -= l;
	        *py += (double)l;
	        if( l%2 )
	            *px *= 1.0 / (l + 1);
	        else
	            *px *= (l + 1);

	        *pi -= l;
	        *pk -= l;
	        *py += (double)l;

	        *pj += l;
	        if( l%2 )
	            *px *= 1.0 / (l + 1);
	        else
	            *px *= (l + 1);
	    }
	    for( int l = 0; l < N; ++l ) {
	        *pj += l;
	    }
	}

	//------------------------------------------------
	// Test case
	// Test driver. Hand-expands the nested task_reduction / in_reduction pseudo
	// code into direct calls to the runtime entry points declared above, runs it
	// inside a parallel region, and compares the reduced values with the ones
	// produced by calc_serial().
	// Returns 0 when all values match and every thread-specific address check
	// succeeded; returns 1 otherwise, so that a failure is visible in the exit
	// status and not only in the printed text.
	// (Pointer declarators, dereferences and "*=" operators are restored; they
	// had been stripped from this copy of the code.)
	int main()
	{
	  int nthreads = omp_get_max_threads();
	  int err = 0; // number of failed thread-specific address checks
	  // One slot per thread: remembers a thread-specific address of xp obtained
	  // in the outer taskgroup; used below for the "neighbour" lookup check.
	  // NOTE(review): slots are written by global thread id and read by
	  // omp_get_thread_num(); this assumes both coincide and stay below
	  // omp_get_max_threads() -- confirm for the runtime configuration in use.
	  void** ptrs = (void**)malloc(nthreads*sizeof(void*));
	  if (ptrs == NULL) {
	    printf("failed, cannot allocate %d pointer slots\n", nthreads);
	    return 1;
	  }

	  // user's code ======================================
	  // variables for serial calculations:
	  int is = 3;
	  long long js = -9999999;
	  double xs = 99999.0;
	  long long ks = 99999999;
	  double ys = -99999999.0;
	  // variables for parallel calculations:
	  int ip = 3;
	  long long jp = -9999999;
	  double xp = 99999.0;
	  long long kp = 99999999;
	  double yp = -99999999.0;

	  calc_serial(&is, &js, &xs, &ks, &ys);
	  // ==================================================
	  for (int i = 0; i < nthreads; ++i)
	    ptrs[i] = NULL;
	  #pragma omp parallel
	  {
	    #pragma omp single nowait
	    {
	      // outer taskgroup reduces (i,j,x)
	      #pragma omp taskgroup // task_reduction(+:i,j) task_reduction(*:x)
	      {
	        _task_red_item_t red_data[3];
	        red_data[0].shar = &ip;
	        red_data[0].size = sizeof(ip);
	        red_data[0].f_init = NULL; // RTL will zero thread-specific objects
	        red_data[0].f_fini = NULL; // no destructors needed
	        red_data[0].f_comb = (void*)&__red_int_add_comb;
	        red_data[0].flags = FLG;
	        red_data[1].shar = &jp;
	        red_data[1].size = sizeof(jp);
	        red_data[1].f_init = NULL; // RTL will zero thread-specific objects
	        red_data[1].f_fini = NULL; // no destructors needed
	        red_data[1].f_comb = (void*)&__red_llong_add_comb;
	        red_data[1].flags = FLG;
	        red_data[2].shar = &xp;
	        red_data[2].size = sizeof(xp);
	        red_data[2].f_init = (void*)&__red_dbl_mul_init;
	        red_data[2].f_fini = NULL; // no destructors needed
	        red_data[2].f_comb = (void*)&__red_dbl_mul_comb;
	        red_data[2].flags = FLG;
	        int gtid = __kmpc_global_thread_num(NULL);
	        void* tg1 = __kmpc_task_reduction_init(gtid, 3, red_data);

	        for( int l = 0; l < N; l += 2 ) {
	          // 2 iterations per task to get correct x value; actually any even
	          // number of iters per task will work, otherwise x loses precision
	          #pragma omp task firstprivate(l) //in_reduction(+:i) in_reduction(*:x)
	          {
	            int gtid = __kmpc_global_thread_num(NULL);
	            int *p_ip = (int*)__kmpc_task_reduction_get_th_data(gtid, tg1, &ip);
	            double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
	                                        gtid, tg1, &xp);
	            // remember this thread's private xp for the neighbour check below
	            if (!ptrs[gtid]) ptrs[gtid] = p_xp;

	            // user's pseudo-code ==============================
	            *p_ip += l;
	            *p_xp *= (l + 1);

	            *p_ip += l + 1;
	            *p_xp *= 1.0 / (l + 2);
	            // ==================================================
	          }
	        }
	        // inner taskgroup reduces (i,k,y), i is same object as in outer one
	        #pragma omp taskgroup // task_reduction(-:i,k) task_reduction(+:y)
	        {
	          _task_red_item_t red_data[3];
	          red_data[0].shar = &ip;
	          red_data[0].size = sizeof(ip);
	          red_data[0].f_init = NULL; // RTL will zero thread-specific objects
	          red_data[0].f_fini = NULL; // no destructors needed
	          red_data[0].f_comb = (void*)&__red_int_add_comb;
	          red_data[0].flags = FLG;
	          red_data[1].shar = &kp;
	          red_data[1].size = sizeof(kp);
	          red_data[1].f_init = NULL; // RTL will zero thread-specific objects
	          red_data[1].f_fini = NULL; // no destructors needed
	          red_data[1].f_comb = (void*)&__red_llong_add_comb; // same for + and -
	          red_data[1].flags = FLG;
	          red_data[2].shar = &yp;
	          red_data[2].size = sizeof(yp);
	          red_data[2].f_init = NULL; // RTL will zero thread-specific objects
	          red_data[2].f_fini = NULL; // no destructors needed
	          red_data[2].f_comb = (void*)&__red_dbl_add_comb;
	          red_data[2].flags = FLG;
	          int gtid = __kmpc_global_thread_num(NULL);
	          void* tg2 = __kmpc_task_reduction_init(gtid, 3, red_data);

	          for( int l = 0; l < N; l += 2 ) {
	            #pragma omp task firstprivate(l)
	            // in_reduction(+:j,y) in_reduction(*:x) in_reduction(-:k)
	            {
	              int gtid = __kmpc_global_thread_num(NULL);
	              long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
	                                                gtid, tg1, &jp);
	              long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data(
	                                                gtid, tg2, &kp);
	              double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
	                                          gtid, tg1, &xp);
	              double *p_yp = (double*)__kmpc_task_reduction_get_th_data(
	                                          gtid, tg2, &yp);
	              // user's pseudo-code ==============================
	              *p_jp += l;
	              *p_kp -= l;
	              *p_yp += (double)l;
	              *p_xp *= (l + 1);

	              *p_jp += l + 1;
	              *p_kp -= l + 1;
	              *p_yp += (double)(l + 1);
	              *p_xp *= 1.0 / (l + 2);
	              // =================================================
	              {
	                // the following code is here just to check
	                // __kmpc_task_reduction_get_th_data:
	                int tid = omp_get_thread_num();
	                void *addr1;
	                void *addr2;
	                // lookup from the shared address, then from the private one
	                addr1 = __kmpc_task_reduction_get_th_data(gtid, tg1, &xp);
	                addr2 = __kmpc_task_reduction_get_th_data(gtid, tg1, addr1);
	                if (addr1 != addr2) {
	                  #pragma omp atomic
	                    ++err;
	                  printf("Wrong thread-specific addresses %d s:%p p:%p\n", tid, addr1, addr2);
	                }
	                // from neighbour w/o taskgroup (should start lookup from
	                // current tg2); thread 0 wraps around to the last slot
	                int nb = (tid > 0) ? tid - 1 : nthreads - 1;
	                if (ptrs[nb]) {
	                  addr2 = __kmpc_task_reduction_get_th_data(gtid, NULL, ptrs[nb]);
	                  if (addr1 != addr2) {
	                    #pragma omp atomic
	                      ++err;
	                    printf("Wrong thread-specific addresses %d s:%p n:%p\n",
	                           tid, addr1, addr2);
	                  }
	                }
	                // ----------------------------------------------
	              }
	            }
	            #pragma omp task firstprivate(l)
	            // in_reduction(+:y) in_reduction(-:i,k)
	            {
	              int gtid = __kmpc_global_thread_num(NULL);
	              int *p_ip = (int*)__kmpc_task_reduction_get_th_data(
	                                    gtid, tg2, &ip);
	              long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data(
	                                                gtid, tg2, &kp);
	              double *p_yp = (double*)__kmpc_task_reduction_get_th_data(
	                                          gtid, tg2, &yp);

	              // user's pseudo-code ==============================
	              *p_ip -= l;
	              *p_kp -= l;
	              *p_yp += (double)l;

	              *p_ip -= l + 1;
	              *p_kp -= l + 1;
	              *p_yp += (double)(l + 1);
	              // =================================================
	            }
	            #pragma omp task firstprivate(l)
	            // in_reduction(+:j) in_reduction(*:x)
	            {
	              int gtid = __kmpc_global_thread_num(NULL);
	              long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
	                                                gtid, tg1, &jp);
	              double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
	                                          gtid, tg1, &xp);
	              // user's pseudo-code ==============================
	              *p_jp += l;
	              *p_xp *= (l + 1);

	              *p_jp += l + 1;
	              *p_xp *= 1.0 / (l + 2);
	              // =================================================
	            }
	          }
	        } // inner reduction

	        for( int l = 0; l < N; l += 2 ) {
	          #pragma omp task firstprivate(l) // in_reduction(+:j)
	          {
	            int gtid = __kmpc_global_thread_num(NULL);
	            long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
	                                              gtid, tg1, &jp);
	            // user's pseudo-code ==============================
	            *p_jp += l;
	            *p_jp += l + 1;
	            // =================================================
	          }
	        }
	      } // outer reduction
	    } // end single
	  } // end parallel
	  free(ptrs);
	  // check results
	#if _DEBUG
	  printf("reduction flags = %u\n", (unsigned)FLG);
	#endif
	  int failed = 0;
	  if (ip == is && jp == js && ks == kp &&
	      fabs(xp - xs) < 0.01 && fabs(yp - ys) < 0.01 && err == 0) {
	    printf("passed\n");
	  } else {
	    failed = 1;
	    printf("failed,\n ser:(%d %lld %f %lld %f)\n par:(%d %lld %f %lld %f)\n",
	      is, js, xs, ks, ys,
	      ip, jp, xp, kp, yp);
	    if (err)
	      printf(" %d thread-specific address check(s) failed\n", err);
	  }
	  return failed;
	}