MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/time_kernels.hpp - llvm-test-suite - Git at Google

 #ifndef _time_kernels_hpp_
 #define _time_kernels_hpp_

 //@HEADER
 // ************************************************************************
 //
 // MiniFE: Simple Finite Element Assembly and Solve
 // Copyright (2006-2013) Sandia Corporation
 //
 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
 // license for use of this work by or on behalf of the U.S. Government.
 //
 // This library is free software; you can redistribute it and/or modify
 // it under the terms of the GNU Lesser General Public License as
 // published by the Free Software Foundation; either version 2.1 of the
 // License, or (at your option) any later version.
 //
 // This library is distributed in the hope that it will be useful, but
 // WITHOUT ANY WARRANTY; without even the implied warranty of
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 // Lesser General Public License for more details.
 //
 // You should have received a copy of the GNU Lesser General Public
 // License along with this library; if not, write to the Free Software
 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 // USA
 //
 // ************************************************************************
 //@HEADER

 #include <cmath>

 #include <Vector_functions.hpp>
 #include <mytimer.hpp>

 #ifdef MINIFE_HAVE_CUDA
 #include <cuda.h>
 #endif

 namespace miniFE {

 template<typename OperatorType,
          typename VectorType,
          typename Matvec>
 void
 time_kernels(OperatorType& A,
              const VectorType& b,
              VectorType& x,
              Matvec matvec,
              typename OperatorType::LocalOrdinalType max_iter,
              typename OperatorType::ScalarType& xdotp,
              timer_type* my_kern_times)
 {
   typedef typename OperatorType::ScalarType ScalarType;
   typedef typename OperatorType::LocalOrdinalType OrdinalType;
   typedef typename TypeTraits<ScalarType>::magnitude_type magnitude_type;

   timer_type t0 = 0, tWAXPY = 0, tDOT = 0, tMATVEC = 0;

   int myproc = 0;
 #ifdef HAVE_MPI
   MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
 #endif

   if (!A.has_local_indices) {
     std::cerr << "miniFE::time_kernels ERROR, A.has_local_indices is false, needs to be true. This probably means "
        << "miniFE::make_local_matrix(A) was not called prior to calling miniFE::time_kernels."
        << std::endl;
     return;
   }

   OrdinalType nrows = A.rows.size();
   OrdinalType ncols = A.num_cols;

   VectorType p(0, ncols, b.compute_node);

   ScalarType one = 1.0;
   ScalarType zero = 0.0;

   typedef typename VectorType::ComputeNodeType ComputeNodeType;
   ComputeNodeType& compute_node = x.compute_node;

   //The following lines that create and initialize buffers are no-ops in many
   //cases, but perform actual allocations and copies if a off-cpu device such as
   //a GPU is being used by compute_node.

   //Do any required allocations for buffers that will be needed during CG:
   ScalarType* d_x = compute_node.get_buffer(&x.coefs[0], x.coefs.size());
   ScalarType* d_p = compute_node.get_buffer(&p.coefs[0], p.coefs.size());
   ScalarType* d_b = compute_node.get_buffer(&b.coefs[0], b.coefs.size());
   OrdinalType* d_Arowoff = compute_node.get_buffer(&A.row_offsets[0], A.row_offsets.size());
   OrdinalType* d_Acols   = compute_node.get_buffer(&A.packed_cols[0], A.packed_cols.size());
   ScalarType* d_Acoefs  = compute_node.get_buffer(&A.packed_coefs[0], A.packed_coefs.size());

   //Copy data to buffers that need to be initialized from input data:
   compute_node.copy_to_buffer(&x.coefs[0], x.coefs.size(), d_x);
   compute_node.copy_to_buffer(&b.coefs[0], b.coefs.size(), d_b);
   compute_node.copy_to_buffer(&A.row_offsets[0], A.row_offsets.size(), d_Arowoff);
   compute_node.copy_to_buffer(&A.packed_cols[0], A.packed_cols.size(), d_Acols);
   compute_node.copy_to_buffer(&A.packed_coefs[0], A.packed_coefs.size(), d_Acoefs);

   TICK();
   for(OrdinalType i=0; i<max_iter; ++i) {
     waxpby(one, x, zero, x, p);
   }
 #ifdef MINIFE_HAVE_CUDA
   cudaThreadSynchronize();
 #endif
   TOCK(tWAXPY);

   TICK();
   for(OrdinalType i=0; i<max_iter; ++i) {
     matvec(A, p, x);
   }
 #ifdef MINIFE_HAVE_CUDA
   cudaThreadSynchronize();
 #endif
   TOCK(tMATVEC);

   TICK();
   xdotp = 0;
   for(OrdinalType i=0; i<max_iter; ++i) {
     xdotp += dot(x, p);
   }
 #ifdef MINIFE_HAVE_CUDA
   cudaThreadSynchronize();
 #endif
   TOCK(tDOT);

   my_kern_times[WAXPY] = tWAXPY;
   my_kern_times[DOT] = tDOT;
   my_kern_times[MATVEC] = tMATVEC;
   my_kern_times[TOTAL] = 0;
 }

 }//namespace miniFE

 #endif
	#ifndef _time_kernels_hpp_
	#define _time_kernels_hpp_

	//@HEADER
	// ************************************************************************
	//
	// MiniFE: Simple Finite Element Assembly and Solve
	// Copyright (2006-2013) Sandia Corporation
	//
	// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
	// license for use of this work by or on behalf of the U.S. Government.
	//
	// This library is free software; you can redistribute it and/or modify
	// it under the terms of the GNU Lesser General Public License as
	// published by the Free Software Foundation; either version 2.1 of the
	// License, or (at your option) any later version.
	//
	// This library is distributed in the hope that it will be useful, but
	// WITHOUT ANY WARRANTY; without even the implied warranty of
	// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	// Lesser General Public License for more details.
	//
	// You should have received a copy of the GNU Lesser General Public
	// License along with this library; if not, write to the Free Software
	// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
	// USA
	//
	// ************************************************************************
	//@HEADER

	#include <cmath>

	#include <Vector_functions.hpp>
	#include <mytimer.hpp>

	#ifdef MINIFE_HAVE_CUDA
	#include <cuda.h>
	#endif

	namespace miniFE {

	template<typename OperatorType,
	typename VectorType,
	typename Matvec>
	void
	time_kernels(OperatorType& A,
	const VectorType& b,
	VectorType& x,
	Matvec matvec,
	typename OperatorType::LocalOrdinalType max_iter,
	typename OperatorType::ScalarType& xdotp,
	timer_type* my_kern_times)
	{
	typedef typename OperatorType::ScalarType ScalarType;
	typedef typename OperatorType::LocalOrdinalType OrdinalType;
	typedef typename TypeTraits<ScalarType>::magnitude_type magnitude_type;

	timer_type t0 = 0, tWAXPY = 0, tDOT = 0, tMATVEC = 0;

	int myproc = 0;
	#ifdef HAVE_MPI
	MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
	#endif

	if (!A.has_local_indices) {
	std::cerr << "miniFE::time_kernels ERROR, A.has_local_indices is false, needs to be true. This probably means "
	<< "miniFE::make_local_matrix(A) was not called prior to calling miniFE::time_kernels."
	<< std::endl;
	return;
	}

	OrdinalType nrows = A.rows.size();
	OrdinalType ncols = A.num_cols;

	VectorType p(0, ncols, b.compute_node);

	ScalarType one = 1.0;
	ScalarType zero = 0.0;

	typedef typename VectorType::ComputeNodeType ComputeNodeType;
	ComputeNodeType& compute_node = x.compute_node;

	//The following lines that create and initialize buffers are no-ops in many
	//cases, but perform actual allocations and copies if a off-cpu device such as
	//a GPU is being used by compute_node.

	//Do any required allocations for buffers that will be needed during CG:
	ScalarType* d_x = compute_node.get_buffer(&x.coefs[0], x.coefs.size());
	ScalarType* d_p = compute_node.get_buffer(&p.coefs[0], p.coefs.size());
	ScalarType* d_b = compute_node.get_buffer(&b.coefs[0], b.coefs.size());
	OrdinalType* d_Arowoff = compute_node.get_buffer(&A.row_offsets[0], A.row_offsets.size());
	OrdinalType* d_Acols = compute_node.get_buffer(&A.packed_cols[0], A.packed_cols.size());
	ScalarType* d_Acoefs = compute_node.get_buffer(&A.packed_coefs[0], A.packed_coefs.size());

	//Copy data to buffers that need to be initialized from input data:
	compute_node.copy_to_buffer(&x.coefs[0], x.coefs.size(), d_x);
	compute_node.copy_to_buffer(&b.coefs[0], b.coefs.size(), d_b);
	compute_node.copy_to_buffer(&A.row_offsets[0], A.row_offsets.size(), d_Arowoff);
	compute_node.copy_to_buffer(&A.packed_cols[0], A.packed_cols.size(), d_Acols);
	compute_node.copy_to_buffer(&A.packed_coefs[0], A.packed_coefs.size(), d_Acoefs);

	TICK();
	for(OrdinalType i=0; i<max_iter; ++i) {
	waxpby(one, x, zero, x, p);
	}
	#ifdef MINIFE_HAVE_CUDA
	cudaThreadSynchronize();
	#endif
	TOCK(tWAXPY);

	TICK();
	for(OrdinalType i=0; i<max_iter; ++i) {
	matvec(A, p, x);
	}
	#ifdef MINIFE_HAVE_CUDA
	cudaThreadSynchronize();
	#endif
	TOCK(tMATVEC);

	TICK();
	xdotp = 0;
	for(OrdinalType i=0; i<max_iter; ++i) {
	xdotp += dot(x, p);
	}
	#ifdef MINIFE_HAVE_CUDA
	cudaThreadSynchronize();
	#endif
	TOCK(tDOT);

	my_kern_times[WAXPY] = tWAXPY;
	my_kern_times[DOT] = tDOT;
	my_kern_times[MATVEC] = tMATVEC;
	my_kern_times[TOTAL] = 0;
	}

	}//namespace miniFE

	#endif