| #include <iostream> |
| |
| #include <hip/hip_runtime.h> |
| |
| #define N (1024 * 500) |
| |
| __global__ void saxpy(float a, float* x, float* y) { |
| size_t tid = blockIdx.x * blockDim.x + threadIdx.x; |
| if (tid < N) y[tid] = a * x[tid] + y[tid]; |
| } |
| |
| int main() { |
| |
| const float a = 100.0f; |
| float* x = (float*)malloc(N * sizeof(float)); |
| float* y = (float*)malloc(N * sizeof(float)); |
| |
| // Initialize the input data. |
| for (size_t i = 0; i < N; ++i) { |
| x[i] = static_cast<float>(i); |
| y[i] = static_cast<float>(i * 2); |
| } |
| |
| // Make a copy for the GPU implementation. |
| float* d_x; |
| float* d_y; |
| hipMalloc((void**)&d_x, N * sizeof(float)); |
| hipMalloc((void**)&d_y, N * sizeof(float)); |
| hipMemcpy(d_x, x, N * sizeof(float), hipMemcpyHostToDevice); |
| hipMemcpy(d_y, y, N * sizeof(float), hipMemcpyHostToDevice); |
| |
| // CPU implementation of saxpy. |
| for (int i = 0; i < N; i++) { |
| y[i] = a * x[i] + y[i]; |
| } |
| |
| // Launch a GPU kernel to compute the saxpy. |
| saxpy<<<(N+255)/256, 256>>>(a, d_x, d_y); |
| |
| // Copy the device results to host. |
| float* h_y = (float*)malloc(N * sizeof(float)); |
| hipDeviceSynchronize(); |
| hipMemcpy(h_y, d_y, N * sizeof(float), hipMemcpyDeviceToHost); |
| |
| // Verify the results match CPU. |
| int errors = 0; |
| for (int i = 0; i < N; i++) { |
| if (fabs(y[i] - h_y[i]) > fabs(y[i] * 0.0001f)) |
| errors++; |
| } |
| if (errors != 0) |
| std::cout << errors << " errors" << std::endl; |
| else |
| std::cout << "PASSED!" << std::endl; |
| |
| free(h_y); |
| free(x); |
| free(y); |
| hipFree(d_x); |
| hipFree(d_y); |
| return errors; |
| } |