Hi there,
I’m using UVA and OpenMP together in my algorithm to make it more powerful.
The thing is that when I launch kernels in parallel — for example, 3 CPU threads each launching a kernel at the same time — one of the threads gets NaN values.
It seems that GPU X cannot read a variable from GPU0.
That is weird taking into account that I grant access to every GPU to 0 (In this case 1 and 2).
Is there a problem with using UVA and OpenMP together? Or is it a problem in my code?
Here is the code and the results.
UVA Access:
// Enable bidirectional Peer-to-Peer (P2P) access between GPU0 and every other
// GPU, then verify both ends report UVA support. Exits the process when any
// required capability is missing.
if (num_gpus > 1) {
    for (int i = 1; i < num_gpus; i++) {  // same range as i <= num_gpus-1
        cudaDeviceProp dprop0, dpropX;
        cudaGetDeviceProperties(&dprop0, 0);
        cudaGetDeviceProperties(&dpropX, i);

        int canAccessPeer0_x, canAccessPeerx_0;
        cudaDeviceCanAccessPeer(&canAccessPeer0_x, 0, i);
        cudaDeviceCanAccessPeer(&canAccessPeerx_0, i, 0);
        printf("> Peer-to-Peer (P2P) access from %s (GPU%d) -> %s (GPU%d) : %s\n", dprop0.name, 0, dpropX.name, i, canAccessPeer0_x ? "Yes" : "No");
        printf("> Peer-to-Peer (P2P) access from %s (GPU%d) -> %s (GPU%d) : %s\n", dpropX.name, i, dprop0.name, 0, canAccessPeerx_0 ? "Yes" : "No");

        if (canAccessPeer0_x == 0 || canAccessPeerx_0 == 0) {
            printf("Two or more SM 2.0 class GPUs are required for %s to run.\n", argv[0]);
            printf("Support for UVA requires a GPU with SM 2.0 capabilities.\n");
            printf("Peer to Peer access is not available between GPU%d <-> GPU%d, waiving test.\n", 0, i);
            exit(EXIT_SUCCESS);
        } else {
            // Peer access is enabled on whichever device is CURRENT, so each
            // direction needs its own cudaSetDevice() first.
            cudaSetDevice(0);
            cudaDeviceEnablePeerAccess(i, 0);
            printf("Granting access from 0 to %d\n", i);
            cudaSetDevice(i);
            cudaDeviceEnablePeerAccess(0, 0);
            printf("Granting access from %d to 0\n", i);

            // BUG FIX: the second GPU index was hard-coded to 1 — the log
            // printed "Checking GPU0 and GPU1" even while checking GPU2.
            printf("Checking GPU%d and GPU%d for UVA capabilities...\n", 0, i);
            const bool has_uva = (dprop0.unifiedAddressing && dpropX.unifiedAddressing);
            printf("> %s (GPU%d) supports UVA: %s\n", dprop0.name, 0, (dprop0.unifiedAddressing ? "Yes" : "No"));
            printf("> %s (GPU%d) supports UVA: %s\n", dpropX.name, i, (dpropX.unifiedAddressing ? "Yes" : "No"));
            if (has_uva) {
                printf("Both GPUs can support UVA, enabling...\n");
            } else {
                printf("At least one of the two GPUs does NOT support UVA, waiving test.\n");
                exit(EXIT_SUCCESS);
            }
            // NOTE(review): P2P is enabled only between GPU0 and GPU i; the
            // non-zero GPUs cannot read each other's memory. That is fine as
            // long as only GPU0's allocations are shared.
        }
    }
}
// All shared buffers live on GPU0; after the peer-access setup above, the
// other GPUs read them through UVA.
cudaSetDevice(0);

// Image-sized (M*N) working buffers, allocated on GPU0.
gpuErrchk(cudaMalloc((void**)&device_I, sizeof(cufftComplex)*M*N));
gpuErrchk(cudaMalloc((void**)&device_total_atten_image, sizeof(cufftComplex)*M*N));
gpuErrchk(cudaMalloc((void**)&device_noise_image, sizeof(cufftComplex)*M*N));
gpuErrchk(cudaMalloc((void**)&device_fg_image, sizeof(cufftComplex)*M*N));
gpuErrchk(cudaMalloc((void**)&device_dphi, sizeof(float)*M*N));
gpuErrchk(cudaMalloc((void**)&device_dchi2_total, sizeof(float)*M*N));
gpuErrchk(cudaMalloc((void**)&device_dH, sizeof(float)*M*N));
gpuErrchk(cudaMalloc((void**)&device_gridUV, sizeof(float2)*M*N));
gpuErrchk(cudaMalloc((void**)&device_H, sizeof(float)*M*N));

// Zero everything before use (cudaMemset is byte-wise, so 0 is fine).
gpuErrchk(cudaMemset(device_I, 0, sizeof(cufftComplex)*M*N));
gpuErrchk(cudaMemset(device_total_atten_image, 0, sizeof(cufftComplex)*M*N));
gpuErrchk(cudaMemset(device_noise_image, 0, sizeof(cufftComplex)*M*N));
gpuErrchk(cudaMemset(device_fg_image, 0, sizeof(cufftComplex)*M*N));
gpuErrchk(cudaMemset(device_H, 0, sizeof(float)*M*N));
gpuErrchk(cudaMemset(device_dH, 0, sizeof(float)*M*N));
gpuErrchk(cudaMemset(device_dchi2_total, 0, sizeof(float)*M*N));
gpuErrchk(cudaMemset(device_dphi, 0, sizeof(float)*M*N));
gpuErrchk(cudaMemset(device_gridUV, 0, sizeof(float2)*M*N));

// device_gridUV is the buffer one GPU later reads as NaN in getcoeff.
// FIX: the original cudaMemcpy2D used pitch = width = one element and
// height = M*N, i.e. it copied M*N one-element "rows". Source and
// destination are both fully contiguous, so a plain cudaMemcpy transfers
// the exact same bytes in a single, much cheaper copy.
gpuErrchk(cudaMemcpy(device_gridUV, host_griduv, sizeof(float2)*M*N, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(device_I, host_I, sizeof(cufftComplex)*M*N, cudaMemcpyHostToDevice));
// Launch one getcoeff kernel per GPU: each OpenMP thread binds to a device
// (thread id modulo num_gpus) and processes its own frequency's visibilities,
// reading device_gridUV (resident on GPU0) through UVA/peer access.
//
// NOTE(review): start, stop, time and global_time are not declared inside
// this parallel region, so they appear to be shared across the OpenMP
// threads — TODO confirm against their declarations. If shared, every thread
// calls cudaEventCreate on the SAME event variables (a CUDA event is bound
// to the device that is current when it is created), and the time /
// global_time updates are a data race. They should be thread-private, with
// global_time accumulated under an atomic/critical section.
#pragma omp parallel
{
unsigned int i = omp_get_thread_num();
unsigned int num_cpu_threads = omp_get_num_threads();
// set and check the CUDA device for this CPU thread
int gpu_id = -1;
cudaSetDevice(i % num_gpus); // "% num_gpus" allows more CPU threads than GPU devices
//printf("GPU %d\n", i%num_gpus);
cudaGetDevice(&gpu_id);
printf("CPU thread %d (of %d) uses CUDA device %d\n", i, num_cpu_threads, gpu_id);
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
// NOTE(review): this launch passes 9 arguments, but the getcoeff definition
// shown below takes 10 (a trailing int tid) — confirm the snippet and the
// compiled code agree on the signature.
getcoeff<<<visibilities[i].numBlocksUV, visibilities[i].threadsPerBlockUV>>>(device_gridUV, device_visibilities[i].u, device_visibilities[i].v, device_vars[i].X, device_vars[i].coeff, data.numVisibilitiesPerFreq[i], N, deltau, deltav);
// Synchronize so in-kernel faults surface here (gpuErrchk checks the status).
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
//printf("CUDA getcoeff execution time = %f ms\n",time);
global_time = global_time + time;
}
Kernel
// Bilinear-interpolation coefficient kernel.
//
// For each visibility i, maps its (u,v) coordinate onto the N x N grid,
// stores the pixel indices in X[2*i], X[2*i+1] and the four bilinear weights
// in coeff[4*i .. 4*i+3].
//
// nuv             : M*N grid of reference (u,v) positions; may live on a
//                   peer GPU and be read through UVA
// u, v            : per-visibility coordinates (length numVisibilities)
// X               : output pixel indices, 2 ints per visibility
// coeff           : output weights, 4 floats per visibility
// numVisibilities : number of valid entries; excess threads exit
// N               : grid dimension (elements per row of nuv)
// deltau, deltav  : grid cell sizes
// tid             : host thread id, used only in the debug printout
__global__ void getcoeff(float2 *nuv, float *u, float *v, int *X, float *coeff, long numVisibilities, long N, float deltau, float deltav, int tid)
{
    // Global thread ID — one thread per visibility, with a bounds guard.
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i < numVisibilities){
        // Grid cell containing (u,v), relative to the grid center N/2.
        // NOTE(review): x and y are not clamped to [0, N); out-of-range
        // u/v values make nuv[N*y+x] read out of bounds.
        int x = floorf(N/2 + u[i]/deltau);
        int y = floorf(N/2 + v[i]/deltav);
        X[2*i]   = x;
        X[2*i+1] = y;

        float nuv1 = nuv[N*y+x].x;
        float nuv2 = nuv[N*y+x].y;

        // Fractional offsets inside the cell and the 4 bilinear weights.
        float alphau1 = (u[i]-nuv1)/deltau;
        float alphau2 = 1 - alphau1;
        float alphav1 = (v[i]-nuv2)/deltav;
        float alphav2 = 1 - alphav1;

        coeff[4*i]   = alphav2*alphau2;
        coeff[4*i+1] = alphav2*alphau1;
        coeff[4*i+2] = alphav1*alphau2;
        coeff[4*i+3] = alphav1*alphau1;

        // Debug dump from the first visibility only.
        if(i==0){
            printf("Tid %d\n", tid);
            printf("X: %d, Y: %d\n", x, y);
            printf("deltau = %f, deltav = %f\n", deltau, deltav);
            // BUG FIX: N is long; %d consumes only 32 bits on 64-bit
            // targets and misreads the argument — use %ld.
            printf("N = %ld\n", N);
            printf("u = %f, v= %f\n", u[i], v[i]);
            // BUG FIX: this line printed nuv[0] while labelling the values
            // nuv1/nuv2, which were actually read from nuv[N*y+x]. Print
            // the values the computation really used, so a NaN here points
            // at the exact element that failed to be read.
            printf("nuv1 = %f, nuv2 = %f\n", nuv1, nuv2);
            printf("Coef 1: %f, Coef 2: %f, Coef 3: %f, Coef 4: %f\n", coeff[4*i], coeff[4*i+1], coeff[4*i+2], coeff[4*i+3]);
            printf("\n");
        }
    }
}
Results
> Peer-to-Peer (P2P) access from Tesla K80 (GPU0) -> Tesla K80 (GPU1) : Yes
> Peer-to-Peer (P2P) access from Tesla K80 (GPU1) -> Tesla K80 (GPU0) : Yes
Granting access from 0 to 1
Granting access from 1 to 0
Checking GPU0 and GPU1 for UVA capabilities...
> Tesla K80 (GPU0) supports UVA: Yes
> Tesla K80 (GPU1) supports UVA: Yes
Both GPUs can support UVA, enabling...
> Peer-to-Peer (P2P) access from Tesla K80 (GPU0) -> Tesla K80 (GPU2) : Yes
> Peer-to-Peer (P2P) access from Tesla K80 (GPU2) -> Tesla K80 (GPU0) : Yes
Granting access from 0 to 2
Granting access from 2 to 0
Checking GPU0 and GPU1 for UVA capabilities...
> Tesla K80 (GPU0) supports UVA: Yes
> Tesla K80 (GPU2) supports UVA: Yes
Both GPUs can support UVA, enabling...
Reading visibilities and FITS input files...
FITS Files READ
Database connection okay again!
NumVisibilities per frequency 0 = 15034 = 708889018368.000000
NumVisibilities per frequency 1 = 23808 = 693874393088.000000
NumVisibilities per frequency 2 = 30250 = 691457359872.000000
MS: Ra: -2.10880231857299804687500, dec: -0.73867833614349365234375
FITS: Ra: 4.17438316345214843750000, dec: -0.73867833614349365234375
Image Center: 256.33322143554687500000000, 256.00000000000000000000000
fg_scale = 0.000012
CPU thread 1 (of 3) uses CUDA device 1
CPU thread 0 (of 3) uses CUDA device 0
CPU thread 2 (of 3) uses CUDA device 2
Tid 0
X: 275, Y: 258
deltau = 20143.046875, deltav = 20143.046875
N = 512
u = 396961.093750, v= 40705.386719
nuv1 = -5136477.000000, nuv2 = -5136477.000000
Coef 1: 2.557445, Coef 2: -0.578703, Coef 3: -1.264985, Coef 4: 0.286243
Tid 2
X: 266, Y: 237
deltau = 20143.046875, deltav = 20143.046875
N = 512
u = 205166.968750, v= -378639.687500
nuv1 = -5136477.000000, nuv2 = -5136477.000000
Coef 1: 3.260033, Coef 2: -1.462941, Coef 3: -1.445974, Coef 4: 0.648881
Tid 1
X: 268, Y: 244
deltau = 20143.046875, deltav = 20143.046875
N = 512
u = 249470.984375, v= -226985.406250
nuv1 = -nan, nuv2 = -nan
Coef 1: nan, Coef 2: nan, Coef 3: nan, Coef 4: nan
Minimal Viable Code:
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <math.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "math_constants.h"
#include <omp.h>
#include <cufft.h>
// True when the device can take part in Peer-to-Peer transfers: on Windows
// this requires the TCC driver; elsewhere it requires SM 2.0 or newer.
inline bool IsGPUCapableP2P(cudaDeviceProp *pProp)
{
#ifdef _WIN32
    return pProp->tccDriver != 0;
#else
    return pProp->major >= 2;
#endif
}
// Returns true when the application is built as a 64-bit target
// (required for UVA).
//
// FIX: the original macro test (__x86_64 / AMD64 / _M_AMD64) reported
// false on other 64-bit architectures such as aarch64 and ppc64le.
// Checking the pointer width covers every 64-bit target and is a
// compile-time constant.
inline bool IsAppBuiltAs64()
{
    return sizeof(void *) == 8;
}
// Debug kernel: the (0,0) thread prints I[0].x so each host (OpenMP) thread
// can verify it reads GPU0's buffer through UVA.
//
// I : image buffer resident on GPU0, accessed via UVA/peer mapping
// i : host thread id, printed for identification only
// N : row length of I (with j == k == 0 the index reduces to I[0])
__global__ void kernelFunction(cufftComplex *I, int i, int N)
{
    int j = threadIdx.x + blockDim.x * blockIdx.x;
    int k = threadIdx.y + blockDim.y * blockIdx.y;

    // BUG FIX: the condition used bitwise '&' instead of logical '&&'.
    // The result happens to match for 0/1 operands, but '&&' states the
    // intent and short-circuits.
    if (j == 0 && k == 0){
        printf("I'm thread %d and I'm reading device_I[0] = %f\n", i, I[N*j+k].x);
    }
}
// Minimal viable reproduction of the multi-GPU UVA + OpenMP issue:
// enable bidirectional P2P between GPU0 and every other GPU, allocate an
// image on GPU0, then let one OpenMP thread per GPU launch a kernel that
// reads that GPU0 buffer through UVA.
__host__ int main(int argc, char **argv) {
    int num_gpus;
    cudaGetDeviceCount(&num_gpus);
    if(num_gpus < 1){
        printf("No CUDA capable devices were detected\n");
        return 1;
    }

    // UVA requires a 64-bit process.
    if (!IsAppBuiltAs64()){
        printf("%s is only supported with on 64-bit OSs and the application must be built as a 64-bit target. Test is being waived.\n", argv[0]);
        exit(EXIT_SUCCESS);
    }

    printf("Number of host CPUs:\t%d\n", omp_get_num_procs());
    printf("Number of CUDA devices:\t%d\n", num_gpus);

    // Report each device's P2P capability.
    for(int i = 0; i < num_gpus; i++){
        cudaDeviceProp dprop;
        cudaGetDeviceProperties(&dprop, i);
        printf("> GPU%d = \"%15s\" %s capable of Peer-to-Peer (P2P)\n", i, dprop.name, (IsGPUCapableP2P(&dprop) ? "IS " : "NOT"));
        //printf(" %d: %s\n", i, dprop.name);
    }
    printf("---------------------------\n");

    num_gpus = 3; //The case that fails
    omp_set_num_threads(num_gpus);

    // Enable P2P in both directions between GPU0 and each other GPU and
    // check UVA support. (Non-zero GPUs are not peered with each other;
    // only GPU0's memory is shared.)
    if(num_gpus > 1){
        for(int i=1; i<num_gpus; i++){
            cudaDeviceProp dprop0, dpropX;
            cudaGetDeviceProperties(&dprop0, 0);
            cudaGetDeviceProperties(&dpropX, i);
            int canAccessPeer0_x, canAccessPeerx_0;
            cudaDeviceCanAccessPeer(&canAccessPeer0_x, 0, i);
            cudaDeviceCanAccessPeer(&canAccessPeerx_0 , i, 0);
            printf("> Peer-to-Peer (P2P) access from %s (GPU%d) -> %s (GPU%d) : %s\n", dprop0.name, 0, dpropX.name, i, canAccessPeer0_x ? "Yes" : "No");
            printf("> Peer-to-Peer (P2P) access from %s (GPU%d) -> %s (GPU%d) : %s\n", dpropX.name, i, dprop0.name, 0, canAccessPeerx_0 ? "Yes" : "No");
            if(canAccessPeer0_x == 0 || canAccessPeerx_0 == 0){
                printf("Two or more SM 2.0 class GPUs are required for %s to run.\n", argv[0]);
                printf("Support for UVA requires a GPU with SM 2.0 capabilities.\n");
                printf("Peer to Peer access is not available between GPU%d <-> GPU%d, waiving test.\n", 0, i);
                exit(EXIT_SUCCESS);
            }else{
                // Peer access is enabled on the CURRENT device, hence the
                // cudaSetDevice() before each enable call.
                cudaSetDevice(0);
                printf("Granting access from 0 to %d...\n", i);
                cudaDeviceEnablePeerAccess(i,0);
                cudaSetDevice(i);
                printf("Granting access from %d to 0...\n", i);
                cudaDeviceEnablePeerAccess(0,0);
                // BUG FIX: second GPU index was hard-coded to 1, producing
                // the misleading "Checking GPU0 and GPU1" line for GPU2.
                printf("Checking GPU%d and GPU%d for UVA capabilities...\n", 0, i);
                const bool has_uva = (dprop0.unifiedAddressing && dpropX.unifiedAddressing);
                printf("> %s (GPU%d) supports UVA: %s\n", dprop0.name, 0, (dprop0.unifiedAddressing ? "Yes" : "No"));
                printf("> %s (GPU%d) supports UVA: %s\n", dpropX.name, i, (dpropX.unifiedAddressing ? "Yes" : "No"));
                if (has_uva){
                    printf("Both GPUs can support UVA, enabling...\n");
                }
                else{
                    printf("At least one of the two GPUs does NOT support UVA, waiving test.\n");
                    exit(EXIT_SUCCESS);
                }
            }
        }
    }

    // Host image: constant 0.001 real part, so any correct device read is
    // visibly nonzero.
    int M = 512;
    int N = 512;
    cufftComplex *host_I = (cufftComplex*)malloc(M*N*sizeof(cufftComplex));
    for(int i=0;i<M;i++){
        for(int j=0;j<N;j++){
            host_I[N*i+j].x = 0.001;
            host_I[N*i+j].y = 0;
        }
    }

    // Allocate the shared image on GPU0; other GPUs read it through UVA.
    cufftComplex *device_I;
    cudaSetDevice(0);
    cudaMalloc((void**)&device_I, sizeof(cufftComplex)*M*N);
    cudaMemset(device_I, 0, sizeof(cufftComplex)*M*N);
    // FIX: the original cudaMemcpy2D copied M*N one-element "rows"; both
    // sides are contiguous, so a single flat cudaMemcpy moves the same
    // bytes in one transfer.
    cudaMemcpy(device_I, host_I, sizeof(cufftComplex)*M*N, cudaMemcpyHostToDevice);

    dim3 threads(32,32);
    dim3 blocks(M/threads.x, N/threads.y);
    dim3 threadsPerBlockNN = threads;
    dim3 numBlocksNN = blocks;

    // One OpenMP thread per GPU; each binds a device and reads device_I[0].
    #pragma omp parallel
    {
        unsigned int i = omp_get_thread_num();
        unsigned int num_cpu_threads = omp_get_num_threads();
        // set and check the CUDA device for this CPU thread
        int gpu_id = -1;
        cudaSetDevice(i % num_gpus); // "% num_gpus" allows more CPU threads than GPU devices
        cudaGetDevice(&gpu_id);
        //printf("CPU thread %d (of %d) uses CUDA device %d\n", cpu_thread_id, num_cpu_threads, gpu_id);
        kernelFunction<<<numBlocksNN, threadsPerBlockNN>>>(device_I, i, N);
        cudaDeviceSynchronize();
    }

    cudaFree(device_I);

    // Tear down peer access before resetting the devices.
    for(int i=1; i<num_gpus; i++){
        cudaSetDevice(0);
        cudaDeviceDisablePeerAccess(i);
        cudaSetDevice(i);
        cudaDeviceDisablePeerAccess(0);
    }

    for(int i=0; i<num_gpus; i++ ){
        cudaSetDevice(i);
        cudaDeviceReset();
    }

    free(host_I);
}
If you need more information about this, just ask me. I really want to resolve this bug in my program.
Thanks!