Thank you for the advice. Yes, I can give you both programs and the commands I use to compile them.
OpenMP version: (called omp_kernel.cpp)
#include <stdio.h>
#include <math.h>
#include <algorithm>
#include <chrono>
#include <iostream>
#include <cuda_runtime.h>
#include "omp.h"
// https://leimao.github.io/blog/Proper-CUDA-Error-Checking/
// Wrap a CUDA runtime call: forwards the returned status together with the
// stringified expression and the call site to check() for diagnostics.
// https://leimao.github.io/blog/Proper-CUDA-Error-Checking/
#define CHECK_CUDA_ERROR(val) check((val), #val, __FILE__, __LINE__)

// Terminate the program with a diagnostic when a CUDA runtime call fails.
//   err  - status returned by the call (templated so the macro accepts any
//          status-like type)
//   func - stringified source expression, supplied by the macro
//   file, line - call site, injected via __FILE__ / __LINE__
template <typename T>
void check(T err, const char *const func, const char *const file,
           const int line)
{
    if (err == cudaSuccess)
    {
        return;
    }
    std::cerr << "CUDA Runtime Error at: " << file << ":" << line
              << std::endl;
    std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
    std::exit(EXIT_FAILURE);
}
/* SAXPY (y = r*x + y) offloaded to the GPU with OpenMP target directives,
 * using CUDA runtime calls for allocation and transfers. Prints transfer and
 * kernel timings, then verifies the result against the closed-form sum. */
int main(void)
{
    constexpr size_t N = 1000000; // 10**6
    constexpr float r = 2.0f;     // float literals avoid silent double math
    constexpr float yval = 5.0f;
    constexpr float xval = 10.0f;

    /* Allocate host vectors and stage both operands on the device. */
    std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
    float *x = new float[N];
    float *y = new float[N];
    std::fill(x, x + N, xval);
    std::fill(y, y + N, yval);
    float *d_x;
    float *d_y;
    CHECK_CUDA_ERROR(cudaMalloc((void **)&d_x, sizeof(float) * N));
    CHECK_CUDA_ERROR(cudaMalloc((void **)&d_y, sizeof(float) * N));
    CHECK_CUDA_ERROR(cudaMemcpy((void *)d_x, (void *)x, sizeof(float) * N, cudaMemcpyHostToDevice));
    CHECK_CUDA_ERROR(cudaMemcpy((void *)d_y, (void *)y, sizeof(float) * N, cudaMemcpyHostToDevice));
    std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
    // NOTE: this interval includes the two cudaMalloc calls as well as the copies.
    std::cout << "Cuda memcpy: " << std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() << " [μs]" << std::endl;

    begin = std::chrono::steady_clock::now();
    /* Dot-product-style SAXPY loop. The launch configuration lives entirely on
     * the pragma: num_teams/num_threads clauses override omp_set_num_teams /
     * omp_set_num_threads, so the previous conflicting runtime calls
     * (omp_set_num_threads(512) vs num_threads(128)) were removed. 4096 teams
     * of 128 threads matches the CUDA version's <<<4096, 128>>> launch.
     * schedule(static, 1) gives consecutive iterations to consecutive threads,
     * i.e. coalesced global-memory access. */
#pragma omp target teams distribute parallel for schedule(static, 1) is_device_ptr(d_x, d_y) num_teams(4096) num_threads(128)
    for (size_t i = 0; i < N; i++)
    {
        d_y[i] = (r * d_x[i]) + d_y[i];
    }
    end = std::chrono::steady_clock::now();
    std::cout << "Saxpy: " << std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() << " [μs]" << std::endl;

    /* Copy the result back and verify against the closed-form expectation. */
    CHECK_CUDA_ERROR(cudaMemcpy((void *)y, (void *)d_y, sizeof(float) * N, cudaMemcpyDeviceToHost));
    double sum = 0.0;
    for (size_t i = 0; i < N; i++)
    {
        sum += y[i];
    }
    double expected = (((r * xval) * N) + yval * N);
    printf("The sum is: %2.2f (expected: %2.2f)\n", sum, expected);

    /* Release device and host buffers (previously leaked). */
    CHECK_CUDA_ERROR(cudaFree(d_x));
    CHECK_CUDA_ERROR(cudaFree(d_y));
    delete[] x;
    delete[] y;
    return 0;
}
CUDA version (called cuda_kernel.cu):
#include <algorithm>
#include <chrono>
#include <iostream>
#include <stdio.h>
#include <cuda.h>
// https://leimao.github.io/blog/Proper-CUDA-Error-Checking/
// Expands to check(status, "expression text", file, line) so a failure
// reports the exact source expression that produced the error.
#define CHECK_CUDA_ERROR(val) check((val), #val, __FILE__, __LINE__)
// Abort the program with a diagnostic when a CUDA runtime call fails.
//   err  - status returned by the call (templated so the macro accepts any
//          status-like type)
//   func - stringified source expression, supplied by the macro
//   file, line - call site, injected via __FILE__ / __LINE__
template <typename T>
void check(T err, const char *const func, const char *const file,
const int line)
{
if (err != cudaSuccess)
{
// Location first, then the human-readable error string and the expression.
std::cerr << "CUDA Runtime Error at: " << file << ":" << line
<< std::endl;
std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
// Exit immediately: after a sticky error, later CUDA calls fail anyway.
std::exit(EXIT_FAILURE);
}
}
// SAXPY kernel: y[i] = a*x[i] + y[i] for i in [0, n).
// Grid-stride loop: correct for any launch configuration (including a single
// block when debugging) and any n that fits device memory.
// Changes vs. the previous version:
//   - n and the loop index are size_t, so the index arithmetic no longer
//     overflows int for n >= 2^31 (the caller's N is size_t).
//   - x is const __restrict__ (read-only input), which lets the compiler use
//     the read-only data cache and reorder loads.
__global__ void saxpy(size_t n, float a, const float *__restrict__ x,
                      float *__restrict__ y)
{
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         i < n;
         i += stride)
    {
        y[i] = (a * x[i]) + y[i];
    }
}
/* SAXPY (y = r*x + y) on the GPU via a hand-written CUDA kernel. Prints
 * transfer and kernel timings, then verifies the result against the
 * closed-form sum. */
int main()
{
    constexpr size_t N = 1000000; // 10**6
    constexpr float r = 2.0f;     // float literals avoid silent double math
    constexpr float yval = 5.0f;
    constexpr float xval = 10.0f;

    /* Allocate host vectors and stage both operands on the device. */
    std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
    float *x = new float[N];
    float *y = new float[N];
    std::fill(x, x + N, xval);
    std::fill(y, y + N, yval);
    float *d_x;
    float *d_y;
    CHECK_CUDA_ERROR(cudaMalloc((void **)&d_x, sizeof(float) * N));
    CHECK_CUDA_ERROR(cudaMalloc((void **)&d_y, sizeof(float) * N));
    CHECK_CUDA_ERROR(cudaMemcpy((void *)d_x, (void *)x, sizeof(float) * N, cudaMemcpyHostToDevice));
    CHECK_CUDA_ERROR(cudaMemcpy((void *)d_y, (void *)y, sizeof(float) * N, cudaMemcpyHostToDevice));
    std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
    // NOTE: this interval includes the two cudaMalloc calls as well as the copies.
    std::cout << "Cuda memcpy: " << std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() << " [μs]" << std::endl;

    begin = std::chrono::steady_clock::now();
    saxpy<<<4096, 128>>>(N, r, d_x, d_y);
    // Launches return no status themselves: cudaGetLastError catches bad
    // launch configurations, the checked synchronize catches in-kernel faults
    // (and is also what makes the timing below measure the kernel, not just
    // the asynchronous launch).
    CHECK_CUDA_ERROR(cudaGetLastError());
    CHECK_CUDA_ERROR(cudaDeviceSynchronize());
    end = std::chrono::steady_clock::now();
    std::cout << "Saxpy: " << std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() << " [μs]" << std::endl;

    /* Copy the result back and verify against the closed-form expectation. */
    CHECK_CUDA_ERROR(cudaMemcpy((void *)y, (void *)d_y, sizeof(float) * N, cudaMemcpyDeviceToHost));
    double sum = 0.0;
    for (size_t i = 0; i < N; i++)
    {
        sum += y[i];
    }
    double expected = (((r * xval) * N) + yval * N);
    printf("The sum is: %2.2f (expected: %2.2f)\n", sum, expected);

    /* Release device and host buffers (previously leaked). */
    CHECK_CUDA_ERROR(cudaFree(d_x));
    CHECK_CUDA_ERROR(cudaFree(d_y));
    delete[] x;
    delete[] y;
    return 0;
}
This is how I compile them:
nvc++ -cuda -mp=gpu -Minfo -gpu=cc86 -o omp omp_kernel.cpp
nvc++ -cuda -mp=gpu -Minfo -gpu=cc86 -o cuda cuda_kernel.cu
So far I have only tested the runtime on my local laptop with NVIDIA GeForce RTX 3050.
One note: when I try to raise `constexpr size_t N = 1000000; // 10^6` to, for example, 10^9, I get a segmentation fault, so I have left N at 10^6 so far.
I was so focused on getting the CUDA representation that I forgot to profile the code. I will try looping and profiling next — thank you.