When using OpenMP with the NVIDIA HPC compilers, the num_threads(X) clause in the OpenMP pragma is ignored for values larger than 128, and the thread count is capped at 128 instead. Could this be a bug? For values such as 32 and 64, I receive the correct value from omp_get_num_threads().
I have a small program that reproduces this; the comment on line 55 notes the observed output. Here is my program:
#include <math.h>
#include <stdio.h>

#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <iostream>

#include "omp.h"
#include <cuda_runtime.h>
// CUDA error-checking helper, adapted from:
// https://leimao.github.io/blog/Proper-CUDA-Error-Checking/
// Wrap every CUDA runtime call: on failure, report the stringified call
// plus the source location to stderr and abort the process.
#define CHECK_CUDA_ERROR(val) check((val), #val, __FILE__, __LINE__)
template <typename T>
void check(T err, const char *const func, const char *const file,
const int line)
{
    // Success is the common path — bail out early.
    if (err == cudaSuccess)
    {
        return;
    }
    std::cerr << "CUDA Runtime Error at: " << file << ":" << line
              << std::endl;
    std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
    std::exit(EXIT_FAILURE);
}
int main(void)
{
constexpr size_t N = 1000000; // 10**6
constexpr float r = 2.0;
constexpr float yval = 5.0;
constexpr float xval = 10.0;
/* Initialization of vectors */
std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
float *x = new float[N];
float *y = new float[N];
std::fill(x, x + N, xval);
std::fill(y, y + N, yval);
float *d_x;
float *d_y;
CHECK_CUDA_ERROR(cudaMalloc((void **)&d_x, sizeof(float) * N));
CHECK_CUDA_ERROR(cudaMalloc((void **)&d_y, sizeof(float) * N));
CHECK_CUDA_ERROR(cudaMemcpy((void *)d_x, (void *)x, sizeof(float) * N, cudaMemcpyHostToDevice));
CHECK_CUDA_ERROR(cudaMemcpy((void *)d_y, (void *)y, sizeof(float) * N, cudaMemcpyHostToDevice));
std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
std::cout << "Cuda mempcy: " << std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() << " [μs]" << std::endl;
begin = std::chrono::steady_clock::now();
/* Dot product of two vectors */
omp_set_num_teams(4096);
omp_set_num_threads(512);
#pragma omp target teams distribute parallel for schedule(static) is_device_ptr(d_x, d_y) num_teams(4096) num_threads(512)
for (size_t i = 0; i < N; i++)
{
int num_teams = omp_get_num_teams();
int num_threads = omp_get_num_threads();
printf("%d, %d\n", num_teams, num_threads); // It prints 128 instead of 512, when num_threads si set to 64 it prints 64!!!
d_y[i] = (r * d_x[i]) + d_y[i];
}
end = std::chrono::steady_clock::now();
std::cout << "Saxpy: " << std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() << " [μs]" << std::endl;
CHECK_CUDA_ERROR(cudaMemcpy((void *)y, (void *)d_y, sizeof(float) * N, cudaMemcpyDeviceToHost));
double sum = 0.0;
for (size_t i = 0; i < N; i++)
{
sum += y[i];
}
double expected = (((r * xval) * N) + yval * N);
printf("The sum is: %2.2f (expected: %2.2f)\n", sum, expected);
return 0;
}
I have compiled the program with the following command:
nvc++ -cuda -mp=gpu -Minfo -gpu=cc86 -o omp_kernel omp_kernel.cpp