// External/HIP/saxpy.hip - llvm-test-suite - Git at Google

 #include <iostream>

 #include <hip/hip_runtime.h>

 // Number of float elements in each vector processed by saxpy
 // (1024 * 500 = 512000). Used both for buffer sizes and as the kernel bound.
 #define N  (1024 * 500)

 // Single-precision a*x + y, written back into y.
 //
 // Each thread derives one flat index from its block and thread coordinates
 // (x dimension only) and updates at most one element. Threads whose index is
 // at or beyond N return without touching memory, so the launch may supply
 // more threads than there are elements.
 //
 //   a - scalar multiplier
 //   x - input vector, read at indices below N
 //   y - input/output vector, updated in place at indices below N
 __global__ void saxpy(float a, float* x, float* y) {
     const size_t idx = threadIdx.x + blockDim.x * blockIdx.x;
     if (idx >= N) return;
     y[idx] = a * x[idx] + y[idx];
 }

 int main() {

   const float a = 100.0f;
   float* x = (float*)malloc(N * sizeof(float));
   float* y = (float*)malloc(N * sizeof(float));

   // Initialize the input data.
   for (size_t i = 0; i < N; ++i) {
     x[i] = static_cast<float>(i);
     y[i] = static_cast<float>(i * 2);
   }

   // Make a copy for the GPU implementation.
   float* d_x;
   float* d_y;
   hipMalloc((void**)&d_x, N * sizeof(float));
   hipMalloc((void**)&d_y, N * sizeof(float));
   hipMemcpy(d_x, x, N * sizeof(float), hipMemcpyHostToDevice);
   hipMemcpy(d_y, y, N * sizeof(float), hipMemcpyHostToDevice);

   // CPU implementation of saxpy.
   for (int i = 0; i < N; i++) {
     y[i] = a * x[i] + y[i];
   }

   // Launch a GPU kernel to compute the saxpy.
   saxpy<<<(N+255)/256, 256>>>(a, d_x, d_y);

   // Copy the device results to host.
   float* h_y = (float*)malloc(N * sizeof(float));
   hipDeviceSynchronize();
   hipMemcpy(h_y, d_y, N * sizeof(float), hipMemcpyDeviceToHost);

   // Verify the results match CPU.
   int errors = 0;
   for (int i = 0; i < N; i++) {
     if (fabs(y[i] - h_y[i]) > fabs(y[i] * 0.0001f))
       errors++;
   }
   if (errors != 0)
     std::cout << errors << " errors" << std::endl;
   else
     std::cout << "PASSED!" << std::endl;

   free(h_y);
   free(x);
   free(y);
   hipFree(d_x);
   hipFree(d_y);
   return errors;
 }
	#include <iostream>

	#include <hip/hip_runtime.h>

	// Number of float elements in each vector processed by saxpy
	// (1024 * 500 = 512000). Used both for buffer sizes and as the kernel bound.
	#define N (1024 * 500)

	// GPU kernel: y[i] = a * x[i] + y[i] for each index i below N.
	//
	// The flat index is built from the x components of the block and thread
	// coordinates. Each thread handles at most one element; indices at or past
	// N are skipped, so surplus threads in the final block are harmless.
	//
	//   a - scalar multiplier
	//   x - input vector, read at indices below N
	//   y - input/output vector, updated in place at indices below N
	__global__ void saxpy(float a, float* x, float* y) {
	    const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
	    if (i < N) {
	        float* out = y + i;
	        *out = a * x[i] + *out;
	    }
	}

	int main() {

	const float a = 100.0f;
	float* x = (float)malloc(N sizeof(float));
	float* y = (float)malloc(N sizeof(float));

	// Initialize the input data.
	for (size_t i = 0; i < N; ++i) {
	x[i] = static_cast<float>(i);
	y[i] = static_cast<float>(i * 2);
	}

	// Make a copy for the GPU implementation.
	float* d_x;
	float* d_y;
	hipMalloc((void*)&d_x, N sizeof(float));
	hipMalloc((void*)&d_y, N sizeof(float));
	hipMemcpy(d_x, x, N * sizeof(float), hipMemcpyHostToDevice);
	hipMemcpy(d_y, y, N * sizeof(float), hipMemcpyHostToDevice);

	// CPU implementation of saxpy.
	for (int i = 0; i < N; i++) {
	y[i] = a * x[i] + y[i];
	}

	// Launch a GPU kernel to compute the saxpy.
	saxpy<<<(N+255)/256, 256>>>(a, d_x, d_y);

	// Copy the device results to host.
	float* h_y = (float)malloc(N sizeof(float));
	hipDeviceSynchronize();
	hipMemcpy(h_y, d_y, N * sizeof(float), hipMemcpyDeviceToHost);

	// Verify the results match CPU.
	int errors = 0;
	for (int i = 0; i < N; i++) {
	if (fabs(y[i] - h_y[i]) > fabs(y[i] * 0.0001f))
	errors++;
	}
	if (errors != 0)
	std::cout << errors << " errors" << std::endl;
	else
	std::cout << "PASSED!" << std::endl;

	free(h_y);
	free(x);
	free(y);
	hipFree(d_x);
	hipFree(d_y);
	return errors;
	}