| // |
| // See README-LCALS_license.txt for access and distribution restrictions |
| // |
| |
| // |
| // Source file containing LCALS "A" subset forall lambda loops using |
| // the google benchmark library. |
| // |
| |
| #include <benchmark/benchmark.h> |
| #include "../LCALSSuite.hxx" |
| #include "../SubsetDataA.hxx" |
| #include "../LCALSTraversalMethods.hxx" |
| |
| static void BM_PRESSURE_CALC_LAMBDA(benchmark::State& state) { |
| |
| LoopData& loop_data = getLoopData(); |
| |
| loopInit(PRESSURE_CALC); |
| |
| Real_ptr compression = loop_data.array_1D_Real[0]; |
| Real_ptr bvc = loop_data.array_1D_Real[1]; |
| Real_ptr p_new = loop_data.array_1D_Real[2]; |
| Real_ptr e_old = loop_data.array_1D_Real[3]; |
| Real_ptr vnewc = loop_data.array_1D_Real[4]; |
| |
| const Real_type cls = loop_data.scalar_Real[0]; |
| const Real_type p_cut = loop_data.scalar_Real[1]; |
| const Real_type pmin = loop_data.scalar_Real[2]; |
| const Real_type eosvmax = loop_data.scalar_Real[3]; |
| |
| for( auto _ : state) { |
| |
| forall<exec_policy>(0, state.range(0), |
| [&] (Index_type i) { |
| bvc[i] = cls * (compression[i] + 1.0); |
| } ); |
| |
| forall<exec_policy>(0, state.range(0), |
| [&] (Index_type i) { |
| p_new[i] = bvc[i] * e_old[i] ; |
| |
| if ( fabs(p_new[i]) < p_cut ) p_new[i] = 0.0 ; |
| |
| if ( vnewc[i] >= eosvmax ) p_new[i] = 0.0 ; |
| |
| if ( p_new[i] < pmin ) p_new[i] = pmin ; |
| } ); |
| |
| } |
| } |
| |
| BENCHMARK(BM_PRESSURE_CALC_LAMBDA)->Arg(171)->Arg(5001)-> |
| Arg(44217)->Unit(benchmark::kMicrosecond); |
| |
| |
| static void BM_ENERGY_CALC_LAMBDA(benchmark::State& state) { |
| |
| LoopData& loop_data = getLoopData(); |
| |
| loopInit(ENERGY_CALC); |
| |
| Real_ptr e_new = loop_data.array_1D_Real[0]; |
| Real_ptr e_old = loop_data.array_1D_Real[1]; |
| Real_ptr delvc = loop_data.array_1D_Real[2]; |
| Real_ptr p_new = loop_data.array_1D_Real[3]; |
| Real_ptr p_old = loop_data.array_1D_Real[4]; |
| Real_ptr q_new = loop_data.array_1D_Real[5]; |
| Real_ptr q_old = loop_data.array_1D_Real[6]; |
| Real_ptr work = loop_data.array_1D_Real[7]; |
| Real_ptr compHalfStep = loop_data.array_1D_Real[8]; |
| Real_ptr pHalfStep = loop_data.array_1D_Real[9]; |
| Real_ptr bvc = loop_data.array_1D_Real[10]; |
| Real_ptr pbvc = loop_data.array_1D_Real[11]; |
| Real_ptr ql_old = loop_data.array_1D_Real[12]; |
| Real_ptr qq_old = loop_data.array_1D_Real[13]; |
| Real_ptr vnewc = loop_data.array_1D_Real[14]; |
| |
| const Real_type rho0 = loop_data.scalar_Real[0]; |
| const Real_type e_cut = loop_data.scalar_Real[1]; |
| const Real_type emin = loop_data.scalar_Real[2]; |
| const Real_type q_cut = loop_data.scalar_Real[3]; |
| |
| for( auto _ : state) { |
| |
| forall<exec_policy>(0, state.range(0), |
| [&] (Index_type i) { |
| e_new[i] = e_old[i] - 0.5 * delvc[i] * |
| (p_old[i] + q_old[i]) + 0.5 * work[i]; |
| } ); |
| |
| forall<exec_policy>(0, state.range(0), |
| [&] (Index_type i) { |
| if ( delvc[i] > 0.0 ) { |
| q_new[i] = 0.0 ; |
| } |
| else { |
| Real_type vhalf = 1.0 / (1.0 + compHalfStep[i]) ; |
| Real_type ssc = ( pbvc[i] * e_new[i] |
| + vhalf * vhalf * bvc[i] * pHalfStep[i] ) / rho0 ; |
| |
| if ( ssc <= 0.1111111e-36 ) { |
| ssc = 0.3333333e-18 ; |
| } else { |
| ssc = sqrt(ssc) ; |
| } |
| |
| q_new[i] = (ssc*ql_old[i] + qq_old[i]) ; |
| } |
| } ); |
| |
| forall<exec_policy>(0, state.range(0), |
| [&] (Index_type i) { |
| e_new[i] = e_new[i] + 0.5 * delvc[i] |
| * ( 3.0*(p_old[i] + q_old[i]) |
| - 4.0*(pHalfStep[i] + q_new[i])) ; |
| } ); |
| |
| forall<exec_policy>(0, state.range(0), |
| [&] (Index_type i) { |
| e_new[i] += 0.5 * work[i]; |
| |
| if ( fabs(e_new[i]) < e_cut ) { e_new[i] = 0.0 ; } |
| |
| if ( e_new[i] < emin ) { e_new[i] = emin ; } |
| } ); |
| |
| forall<exec_policy>(0, state.range(0), |
| [&] (Index_type i) { |
| Real_type q_tilde ; |
| |
| if (delvc[i] > 0.0) { |
| q_tilde = 0. ; |
| } |
| else { |
| Real_type ssc = ( pbvc[i] * e_new[i] |
| + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ; |
| |
| if ( ssc <= 0.1111111e-36 ) { |
| ssc = 0.3333333e-18 ; |
| } else { |
| ssc = sqrt(ssc) ; |
| } |
| |
| q_tilde = (ssc*ql_old[i] + qq_old[i]) ; |
| } |
| |
| e_new[i] = e_new[i] - ( 7.0*(p_old[i] + q_old[i]) |
| - 8.0*(pHalfStep[i] + q_new[i]) |
| + (p_new[i] + q_tilde)) * delvc[i] / 6.0 ; |
| |
| if ( fabs(e_new[i]) < e_cut ) { |
| e_new[i] = 0.0 ; |
| } |
| if ( e_new[i] < emin ) { |
| e_new[i] = emin ; |
| } |
| } ); |
| |
| forall<exec_policy>(0, state.range(0), |
| [&] (Index_type i) { |
| if ( delvc[i] <= 0.0 ) { |
| Real_type ssc = ( pbvc[i] * e_new[i] |
| + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ; |
| |
| if ( ssc <= 0.1111111e-36 ) { |
| ssc = 0.3333333e-18 ; |
| } else { |
| ssc = sqrt(ssc) ; |
| } |
| |
| q_new[i] = (ssc*ql_old[i] + qq_old[i]) ; |
| |
| if (fabs(q_new[i]) < q_cut) q_new[i] = 0.0 ; |
| } |
| } ); |
| |
| } |
| } |
| |
| BENCHMARK(BM_ENERGY_CALC_LAMBDA)->Arg(171)->Arg(5001)-> |
| Arg(44217)->Unit(benchmark::kMicrosecond); |
| |
| static void BM_VOL3D_CALC_LAMBDA(benchmark::State& state) { |
| |
| LoopData& loop_data = getLoopData(); |
| |
| loopInit(VOL3D_CALC); |
| |
| Real_ptr x = loop_data.array_1D_Real[0]; |
| Real_ptr y = loop_data.array_1D_Real[1]; |
| Real_ptr z = loop_data.array_1D_Real[2]; |
| Real_ptr vol = loop_data.array_1D_Real[3]; |
| |
| ADomain domain(state.range(0), /* ndims = */ 3); |
| |
| UnalignedReal_ptr x0,x1,x2,x3,x4,x5,x6,x7 ; |
| UnalignedReal_ptr y0,y1,y2,y3,y4,y5,y6,y7 ; |
| UnalignedReal_ptr z0,z1,z2,z3,z4,z5,z6,z7 ; |
| |
| NDPTRSET(x,x0,x1,x2,x3,x4,x5,x6,x7) ; |
| NDPTRSET(y,y0,y1,y2,y3,y4,y5,y6,y7) ; |
| NDPTRSET(z,z0,z1,z2,z3,z4,z5,z6,z7) ; |
| |
| const Real_type vnormq = 0.083333333333333333; /* vnormq = 1/12 */ |
| |
| for (auto _ : state) { |
| |
| forall<exec_policy>(domain.fpz, domain.lpz + 1, |
| [&] (Index_type i) { |
| |
| Real_type x71 = x7[i] - x1[i] ; |
| Real_type x72 = x7[i] - x2[i] ; |
| Real_type x74 = x7[i] - x4[i] ; |
| Real_type x30 = x3[i] - x0[i] ; |
| Real_type x50 = x5[i] - x0[i] ; |
| Real_type x60 = x6[i] - x0[i] ; |
| |
| Real_type y71 = y7[i] - y1[i] ; |
| Real_type y72 = y7[i] - y2[i] ; |
| Real_type y74 = y7[i] - y4[i] ; |
| Real_type y30 = y3[i] - y0[i] ; |
| Real_type y50 = y5[i] - y0[i] ; |
| Real_type y60 = y6[i] - y0[i] ; |
| |
| Real_type z71 = z7[i] - z1[i] ; |
| Real_type z72 = z7[i] - z2[i] ; |
| Real_type z74 = z7[i] - z4[i] ; |
| Real_type z30 = z3[i] - z0[i] ; |
| Real_type z50 = z5[i] - z0[i] ; |
| Real_type z60 = z6[i] - z0[i] ; |
| |
| Real_type xps = x71 + x60 ; |
| Real_type yps = y71 + y60 ; |
| Real_type zps = z71 + z60 ; |
| |
| Real_type cyz = y72 * z30 - z72 * y30 ; |
| Real_type czx = z72 * x30 - x72 * z30 ; |
| Real_type cxy = x72 * y30 - y72 * x30 ; |
| vol[i] = xps * cyz + yps * czx + zps * cxy ; |
| |
| xps = x72 + x50 ; |
| yps = y72 + y50 ; |
| zps = z72 + z50 ; |
| |
| cyz = y74 * z60 - z74 * y60 ; |
| czx = z74 * x60 - x74 * z60 ; |
| cxy = x74 * y60 - y74 * x60 ; |
| vol[i] += xps * cyz + yps * czx + zps * cxy ; |
| |
| xps = x74 + x30 ; |
| yps = y74 + y30 ; |
| zps = z74 + z30 ; |
| |
| cyz = y71 * z50 - z71 * y50 ; |
| czx = z71 * x50 - x71 * z50 ; |
| cxy = x71 * y50 - y71 * x50 ; |
| vol[i] += xps * cyz + yps * czx + zps * cxy ; |
| |
| vol[i] *= vnormq ; |
| } ); |
| |
| } |
| } |
| |
| BENCHMARK(BM_VOL3D_CALC_LAMBDA)->Arg(SHORT)->Arg(MEDIUM)-> |
| Arg(LONG)->Unit(benchmark::kMicrosecond); |
| |
| static void BM_DEL_DOT_VEC_2D_LAMBDA(benchmark::State& state) { |
| |
| LoopData& loop_data = getLoopData(); |
| |
| loopInit(DEL_DOT_VEC_2D); |
| |
| Real_ptr x = loop_data.array_1D_Real[0]; |
| Real_ptr y = loop_data.array_1D_Real[1]; |
| Real_ptr xdot = loop_data.array_1D_Real[2]; |
| Real_ptr ydot = loop_data.array_1D_Real[3]; |
| Real_ptr div = loop_data.array_1D_Real[4]; |
| |
| ADomain domain(state.range(0), /* ndims = */ 2); |
| |
| UnalignedReal_ptr x1,x2,x3,x4 ; |
| UnalignedReal_ptr y1,y2,y3,y4 ; |
| UnalignedReal_ptr fx1,fx2,fx3,fx4 ; |
| UnalignedReal_ptr fy1,fy2,fy3,fy4 ; |
| |
| NDSET2D(x,x1,x2,x3,x4) ; |
| NDSET2D(y,y1,y2,y3,y4) ; |
| NDSET2D(xdot,fx1,fx2,fx3,fx4) ; |
| NDSET2D(ydot,fy1,fy2,fy3,fy4) ; |
| |
| const Real_type ptiny = 1.0e-20; |
| const Real_type half = 0.5; |
| |
| for ( auto _ : state ) { |
| |
| forall<exec_policy>(0, domain.n_real_zones, |
| [&] (Index_type ii) { |
| |
| Index_type i = domain.real_zones[ii] ; |
| |
| Real_type xi = half * ( x1[i] + x2[i] - x3[i] - x4[i] ) ; |
| Real_type xj = half * ( x2[i] + x3[i] - x4[i] - x1[i] ) ; |
| |
| Real_type yi = half * ( y1[i] + y2[i] - y3[i] - y4[i] ) ; |
| Real_type yj = half * ( y2[i] + y3[i] - y4[i] - y1[i] ) ; |
| |
| Real_type fxi = half * ( fx1[i] + fx2[i] - fx3[i] - fx4[i] ) ; |
| Real_type fxj = half * ( fx2[i] + fx3[i] - fx4[i] - fx1[i] ) ; |
| |
| Real_type fyi = half * ( fy1[i] + fy2[i] - fy3[i] - fy4[i] ) ; |
| Real_type fyj = half * ( fy2[i] + fy3[i] - fy4[i] - fy1[i] ) ; |
| |
| Real_type rarea = 1.0 / ( xi * yj - xj * yi + ptiny ) ; |
| |
| Real_type dfxdx = rarea * ( fxi * yj - fxj * yi ) ; |
| |
| Real_type dfydy = rarea * ( fyj * xi - fyi * xj ) ; |
| |
| Real_type affine = ( fy1[i] + fy2[i] + fy3[i] + fy4[i] ) / |
| ( y1[i] + y2[i] + y3[i] + y4[i] ) ; |
| |
| div[i] = dfxdx + dfydy + affine ; |
| } ); |
| |
| } |
| } |
| |
| BENCHMARK(BM_DEL_DOT_VEC_2D_LAMBDA)->Arg(SHORT)->Arg(MEDIUM)-> |
| Arg(LONG)->Unit(benchmark::kMicrosecond); |
| |
| static void BM_COUPLE_LAMBDA(benchmark::State& state) { |
| |
| LoopData& loop_data = getLoopData(); |
| |
| loopInit(COUPLE); |
| |
| Complex_ptr t0 = loop_data.array_1D_Complex[0]; |
| Complex_ptr t1 = loop_data.array_1D_Complex[1]; |
| Complex_ptr t2 = loop_data.array_1D_Complex[2]; |
| Complex_ptr denac = loop_data.array_1D_Complex[3]; |
| Complex_ptr denlw = loop_data.array_1D_Complex[4]; |
| |
| |
| ADomain domain(state.range(0), /* ndims = */ 3); |
| |
| Index_type imin = domain.imin; |
| Index_type imax = domain.imax; |
| Index_type jmin = domain.jmin; |
| Index_type jmax = domain.jmax; |
| Index_type kmin = domain.kmin; |
| Index_type kmax = domain.kmax; |
| |
| const Real_type clight=3.e+10; |
| const Real_type csound=3.09e+7; |
| const Real_type omega0= 0.9; |
| const Real_type omegar= 0.9; |
| const Real_type dt= 0.208; |
| const Real_type c10 = 0.25 * (clight / csound); |
| const Real_type fratio = sqrt(omegar / omega0); |
| const Real_type r_fratio = 1.0/fratio; |
| const Real_type c20 = 0.25 * (clight / csound) * r_fratio; |
| const Complex_type ireal(0.0, 1.0); |
| |
| for ( auto _ : state ) { |
| |
| forall<exec_policy>(kmin, kmax, |
| [&] (Index_type k) { |
| |
| for (Index_type j = jmin; j < jmax; j++) { |
| |
| Index_type it0= ((k)*(jmax+1) + (j))*(imax+1) ; |
| Index_type idenac= ((k)*(jmax+2) + (j))*(imax+2) ; |
| |
| for (Index_type i = imin; i < imax; i++) { |
| |
| Complex_type c1 = c10 * denac[idenac+i]; |
| Complex_type c2 = c20 * denlw[it0+i]; |
| |
| /* promote to doubles to avoid possible divide by zero |
| errors later on. */ |
| Real_type c1re = real(c1); Real_type c1im = imag(c1); |
| Real_type c2re = real(c2); Real_type c2im = imag(c2); |
| |
| /* compute lamda = sqrt(|c1|^2 + |c2|^2) using doubles |
| to avoid underflow. */ |
| Real_type zlam = c1re*c1re + c1im*c1im + |
| c2re*c2re + c2im*c2im + 1.0e-34; |
| zlam = sqrt(zlam); |
| Real_type snlamt = sin(zlam * dt * 0.5); |
| Real_type cslamt = cos(zlam * dt * 0.5); |
| |
| Complex_type a0t = t0[it0+i]; |
| Complex_type a1t = t1[it0+i]; |
| Complex_type a2t = t2[it0+i] * fratio; |
| |
| Real_type r_zlam= 1.0/zlam; |
| c1 *= r_zlam; |
| c2 *= r_zlam; |
| Real_type zac1 = zabs2(c1); |
| Real_type zac2 = zabs2(c2); |
| |
| /* compute new A0 */ |
| Complex_type z3 = ( c1 * a1t + c2 * a2t ) * snlamt ; |
| t0[it0+i] = a0t * cslamt - ireal * z3; |
| |
| /* compute new A1 */ |
| Real_type r = zac1 * cslamt + zac2; |
| Complex_type z5 = c2 * a2t; |
| Complex_type z4 = conj(c1) * z5 * (cslamt-1); |
| z3 = conj(c1) * a0t * snlamt; |
| t1[it0+i] = a1t * r + z4 - ireal * z3; |
| |
| /* compute new A2 */ |
| r = zac1 + zac2 * cslamt; |
| z5 = c1 * a1t; |
| z4 = conj(c2) * z5 * (cslamt-1); |
| z3 = conj(c2) * a0t * snlamt; |
| t2[it0+i] = ( a2t * r + z4 - ireal * z3 ) * r_fratio; |
| |
| } // i loop |
| |
| } // j loop |
| |
| } ); // k loop |
| |
| } // google benchmark loop |
| } |
| |
| BENCHMARK(BM_COUPLE_LAMBDA)->Arg(SHORT)->Arg(MEDIUM)-> |
| Arg(LONG)->Unit(benchmark::kMicrosecond); |
| |
| static void BM_FIR_LAMBDA(benchmark::State& state) { |
| |
| LoopData& loop_data = getLoopData(); |
| |
| loopInit(FIR); |
| |
| Real_ptr out = loop_data.array_1D_Real[0]; |
| Real_ptr in = loop_data.array_1D_Real[1]; |
| |
| const Index_type coefflen = 16; |
| Real_type coeff[coefflen] = { 3.0, -1.0, -1.0, -1.0, |
| -1.0, 3.0, -1.0, -1.0, |
| -1.0, -1.0, 3.0, -1.0, |
| -1.0, -1.0, -1.0, 3.0 }; |
| const Index_type len_minus_coeff = state.range(0) - coefflen; |
| |
| Index_type val = 0; |
| |
| for ( auto _ : state ) { |
| |
| forall<exec_policy>(0, len_minus_coeff, |
| [&] (Index_type i) { |
| Real_type sum = 0.0; |
| for (Index_type j = 0; j < coefflen; ++j ) { |
| sum += coeff[j]*in[i+j]; |
| } |
| out[i] = sum; |
| } ); |
| |
| } |
| } |
| |
| BENCHMARK(BM_FIR_LAMBDA)->Arg(171)->Arg(5001)-> |
| Arg(44217)->Unit(benchmark::kMicrosecond); |