Getting nan results with CUDA UVA and OpenMP (Tesla K80 Cluster)

Hi there,

I’m using UVA (peer-to-peer access) together with OpenMP in my algorithm to make it faster by using several GPUs at once.

The problem is that when I launch kernels in parallel, for example 3 CPU threads each launching a kernel at the same time, one of the threads gets NaN values.

It seems that GPU X cannot read a variable allocated on GPU 0.

That is odd, considering that I enable peer access between GPU 0 and every other GPU (in this case GPUs 1 and 2).

Is there a problem with using UVA and OpenMP together, or is it a problem in my code?

Here is the code and the results.

UVA Access:

if(num_gpus > 1){
	  for(int i=1; i<=num_gpus-1; i++){
			cudaDeviceProp dprop0, dpropX;
			cudaGetDeviceProperties(&dprop0, 0);
			cudaGetDeviceProperties(&dpropX, i);
			int canAccessPeer0_x, canAccessPeerx_0;
			cudaDeviceCanAccessPeer(&canAccessPeer0_x, 0, i);
			cudaDeviceCanAccessPeer(&canAccessPeerx_0 , i, 0);
			printf("> Peer-to-Peer (P2P) access from %s (GPU%d) -> %s (GPU%d) : %s\n", dprop0.name, 0, dpropX.name, i, canAccessPeer0_x ? "Yes" : "No");
    	printf("> Peer-to-Peer (P2P) access from %s (GPU%d) -> %s (GPU%d) : %s\n", dpropX.name, i, dprop0.name, 0, canAccessPeerx_0 ? "Yes" : "No");
			if(canAccessPeer0_x == 0 || canAccessPeerx_0 == 0){
				printf("Two or more SM 2.0 class GPUs are required for %s to run.\n", argv[0]);
        printf("Support for UVA requires a GPU with SM 2.0 capabilities.\n");
        printf("Peer to Peer access is not available between GPU%d <-> GPU%d, waiving test.\n", 0, i);
        exit(EXIT_SUCCESS);
			}else{
				cudaSetDevice(0);
				cudaDeviceEnablePeerAccess(i,0);
				printf("Granting access from 0 to %d\n", i);
				cudaSetDevice(i);
				cudaDeviceEnablePeerAccess(0,0);
				printf("Granting access from %d to 0\n", i);

				printf("Checking GPU%d and GPU%d for UVA capabilities...\n", 0, 1);
				const bool has_uva = (dprop0.unifiedAddressing && dpropX.unifiedAddressing);
				printf("> %s (GPU%d) supports UVA: %s\n", dprop0.name, 0, (dprop0.unifiedAddressing ? "Yes" : "No"));
    		printf("> %s (GPU%d) supports UVA: %s\n", dpropX.name, i, (dpropX.unifiedAddressing ? "Yes" : "No"));
				if (has_uva){
        	printf("Both GPUs can support UVA, enabling...\n");
    		}
    		else{
        	printf("At least one of the two GPUs does NOT support UVA, waiving test.\n");
        	exit(EXIT_SUCCESS);
    		}
			}
	 	}
  }
cudaSetDevice(0);
	gpuErrchk(cudaMalloc((void**)&device_I, sizeof(cufftComplex)*M*N));
	gpuErrchk(cudaMalloc((void**)&device_total_atten_image, sizeof(cufftComplex)*M*N));
	gpuErrchk(cudaMalloc((void**)&device_noise_image, sizeof(cufftComplex)*M*N));
	gpuErrchk(cudaMalloc((void**)&device_fg_image, sizeof(cufftComplex)*M*N));

	gpuErrchk(cudaMalloc((void**)&device_dphi, sizeof(float)*M*N));
	gpuErrchk(cudaMalloc((void**)&device_dchi2_total, sizeof(float)*M*N));
	gpuErrchk(cudaMalloc((void**)&device_dH, sizeof(float)*M*N));

	gpuErrchk(cudaMalloc((void**)&device_gridUV, sizeof(float2)*M*N));

	gpuErrchk(cudaMalloc((void**)&device_H, sizeof(float)*M*N));

	gpuErrchk(cudaMemset(device_I, 0, sizeof(cufftComplex)*M*N));
	gpuErrchk(cudaMemset(device_total_atten_image, 0, sizeof(cufftComplex)*M*N));
	gpuErrchk(cudaMemset(device_noise_image, 0, sizeof(cufftComplex)*M*N));
	gpuErrchk(cudaMemset(device_fg_image, 0, sizeof(cufftComplex)*M*N));

gpuErrchk(cudaMemset(device_H, 0, sizeof(float)*M*N));
  gpuErrchk(cudaMemset(device_dH, 0, sizeof(float)*M*N));
  gpuErrchk(cudaMemset(device_dchi2_total, 0, sizeof(float)*M*N));
  gpuErrchk(cudaMemset(device_dphi, 0, sizeof(float)*M*N));
  gpuErrchk(cudaMemset(device_gridUV, 0, sizeof(float2)*M*N));

  //ERROR HERE, A GPU CANNOT READ device_gridUV INFO
  gpuErrchk(cudaMemcpy2D(device_gridUV, sizeof(float2), host_griduv, sizeof(float2), sizeof(float2), M*N, cudaMemcpyHostToDevice));
  gpuErrchk(cudaMemcpy2D(device_I, sizeof(cufftComplex), host_I, sizeof(cufftComplex), sizeof(cufftComplex), M*N, cudaMemcpyHostToDevice));
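  // note: since dpitch == spitch == width here, these 2D copies are just flat copies;
  // an equivalent (sketch) would be:
  //   gpuErrchk(cudaMemcpy(device_gridUV, host_griduv, sizeof(float2)*M*N, cudaMemcpyHostToDevice));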
#pragma omp parallel
		{
			unsigned int i = omp_get_thread_num();
			unsigned int num_cpu_threads = omp_get_num_threads();

			// set and check the CUDA device for this CPU thread
			int gpu_id = -1;
			cudaSetDevice(i % num_gpus);   // "% num_gpus" allows more CPU threads than GPU devices
      //printf("GPU %d\n", i%num_gpus);
			cudaGetDevice(&gpu_id);
      printf("CPU thread %d (of %d) uses CUDA device %d\n", i, num_cpu_threads, gpu_id);
			cudaEventCreate(&start);
			cudaEventCreate(&stop);
			cudaEventRecord(start, 0);
			getcoeff<<<visibilities[i].numBlocksUV, visibilities[i].threadsPerBlockUV>>>(device_gridUV, device_visibilities[i].u, device_visibilities[i].v, device_vars[i].X, device_vars[i].coeff, data.numVisibilitiesPerFreq[i], N, deltau, deltav);
			gpuErrchk(cudaDeviceSynchronize());
			cudaEventRecord(stop, 0);
			cudaEventSynchronize(stop);
			cudaEventElapsedTime(&time, start, stop);
			//printf("CUDA getcoeff execution time = %f ms\n",time);
			global_time = global_time + time;
		}
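
(For reference, gpuErrchk above is the usual CUDA error-checking wrapper; roughly:)

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
	if (code != cudaSuccess){
		fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
		if (abort) exit(code);
	}
}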

Kernel:

__global__ void getcoeff(float2 *nuv, float *u, float *v, int *X, float *coeff, long numVisibilities, long N, float deltau, float deltav, int tid)
{
    // Get our global thread ID
    int i = threadIdx.x + blockDim.x * blockIdx.x;

    if (i < numVisibilities){
			int x = floorf(N/2 + u[i]/deltau);
			int y = floorf(N/2 + v[i]/deltav);

			X[2*i] = x;
			X[2*i+1] = y;

			float nuv1 = nuv[N*y+x].x;
			float nuv2 = nuv[N*y+x].y;

			float alphau1 = (u[i]-nuv1)/deltau;
			float alphau2 = 1 - alphau1;

			float alphav1 = (v[i]-nuv2)/deltav;
			float alphav2 = 1 - alphav1;

			coeff[4*i]   = alphav2*alphau2;
			coeff[4*i+1] = alphav2*alphau1;
			coeff[4*i+2] = alphav1*alphau2;
			coeff[4*i+3] = alphav1*alphau1;
      if(i==0){
        printf("Tid %d\n", tid);
        printf("X: %d, Y: %d\n", x, y);
        printf("deltau = %f, deltav = %f\n", deltau, deltav);
        printf("N = %d\n", N);
        printf("u = %f, v= %f\n", u[i], v[i]);
        printf("nuv1 = %f, nuv2 = %f\n", nuv[0].x, nuv[0].y);
        printf("Coef 1: %f, Coef 2: %f, Coef 3: %f, Coef 4: %f\n", coeff[4*i], coeff[4*i+1], coeff[4*i+2], coeff[4*i+3]);
        printf("\n");
      }
		}

}

Results:

> Peer-to-Peer (P2P) access from Tesla K80 (GPU0) -> Tesla K80 (GPU1) : Yes
> Peer-to-Peer (P2P) access from Tesla K80 (GPU1) -> Tesla K80 (GPU0) : Yes
Granting access from 0 to 1
Granting access from 1 to 0
Checking GPU0 and GPU1 for UVA capabilities...
> Tesla K80 (GPU0) supports UVA: Yes
> Tesla K80 (GPU1) supports UVA: Yes
Both GPUs can support UVA, enabling...
> Peer-to-Peer (P2P) access from Tesla K80 (GPU0) -> Tesla K80 (GPU2) : Yes
> Peer-to-Peer (P2P) access from Tesla K80 (GPU2) -> Tesla K80 (GPU0) : Yes
Granting access from 0 to 2
Granting access from 2 to 0
Checking GPU0 and GPU1 for UVA capabilities...
> Tesla K80 (GPU0) supports UVA: Yes
> Tesla K80 (GPU2) supports UVA: Yes
Both GPUs can support UVA, enabling...
Reading visibilities and FITS input files...
FITS Files READ
Database connection okay again!
NumVisibilities per frequency 0 = 15034 = 708889018368.000000
NumVisibilities per frequency 1 = 23808 = 693874393088.000000
NumVisibilities per frequency 2 = 30250 = 691457359872.000000
MS: Ra: -2.10880231857299804687500, dec: -0.73867833614349365234375
FITS: Ra: 4.17438316345214843750000, dec: -0.73867833614349365234375
Image Center: 256.33322143554687500000000, 256.00000000000000000000000
fg_scale = 0.000012
CPU thread 1 (of 3) uses CUDA device 1
CPU thread 0 (of 3) uses CUDA device 0
CPU thread 2 (of 3) uses CUDA device 2
Tid 0
X: 275, Y: 258
deltau = 20143.046875, deltav = 20143.046875
N = 512
u = 396961.093750, v= 40705.386719
nuv1 = -5136477.000000, nuv2 = -5136477.000000
Coef 1: 2.557445, Coef 2: -0.578703, Coef 3: -1.264985, Coef 4: 0.286243

Tid 2
X: 266, Y: 237
deltau = 20143.046875, deltav = 20143.046875
N = 512
u = 205166.968750, v= -378639.687500
nuv1 = -5136477.000000, nuv2 = -5136477.000000
Coef 1: 3.260033, Coef 2: -1.462941, Coef 3: -1.445974, Coef 4: 0.648881

Tid 1
X: 268, Y: 244
deltau = 20143.046875, deltav = 20143.046875
N = 512
u = 249470.984375, v= -226985.406250
nuv1 = -nan, nuv2 = -nan
Coef 1: nan, Coef 2: nan, Coef 3: nan, Coef 4: nan


If you need more information about this, just ask me. I really want to resolve this bug in my program.

Thanks!

normally when reading an array results in an ‘unexpected’ value, it is because a) the reading itself was done correctly, and the array value itself is an ‘unexpected’ value, or b) the reading itself was done incorrectly - the pointer or index is wrong, for instance

you can ensure the reading part via the debugger - that both the array pointer and the index are proper
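
for instance, a minimal host-side sketch using cudaPointerGetAttributes, with the variable names assumed from the code above, to confirm the runtime really maps the pointer to GPU 0:

// sketch: verify device_gridUV is a valid UVA device pointer, and on which device it lives
cudaPointerAttributes attr;
cudaError_t err = cudaPointerGetAttributes(&attr, device_gridUV);
if (err != cudaSuccess)
	printf("cudaPointerGetAttributes failed: %s\n", cudaGetErrorString(err));
else
	printf("device_gridUV resolves to device %d\n", attr.device);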

if the ‘unexpected value’ persists, i would point out that OpenMP and UVA do not guarantee synchronization per se; the value may be nan simply because that is the value at that point in time
the associated memory transaction may have been issued, but still be in flight, such that it is not yet completed

you show only pieces of the code; this makes it difficult to follow what happens, and what is ready, when

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <math.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "math_constants.h"
#include <omp.h>
#include <cufft.h>

inline bool IsGPUCapableP2P(cudaDeviceProp *pProp)
{
  #ifdef _WIN32
      return (bool)(pProp->tccDriver ? true : false);
  #else
      return (bool)(pProp->major >= 2);
  #endif
}

inline bool IsAppBuiltAs64()
{
  #if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
      return 1;
  #else
      return 0;
  #endif
}

__global__ void kernelFunction(cufftComplex *I, int i, int N)
{
	int j = threadIdx.x + blockDim.x * blockIdx.x;
	int k = threadIdx.y + blockDim.y * blockIdx.y;
	
	if(j==0 & k==0){
		printf("I'm thread %d and I'm reading device_I[0] = %f\n", i, I[N*j+k].x);
	}
}

__host__ int main(int argc, char **argv) {
	int num_gpus;
	cudaGetDeviceCount(&num_gpus);

	if(num_gpus < 1){
		printf("No CUDA capable devices were detected\n");
    		return 1;
	}

	if (!IsAppBuiltAs64()){
		printf("%s is only supported with on 64-bit OSs and the application must be built as a 64-bit target. Test is being waived.\n", argv[0]);
		exit(EXIT_SUCCESS);
  	}



	printf("Number of host CPUs:\t%d\n", omp_get_num_procs());
  	printf("Number of CUDA devices:\t%d\n", num_gpus);


	for(int i = 0; i < num_gpus; i++){
  		cudaDeviceProp dprop;
    		cudaGetDeviceProperties(&dprop, i);

    		printf("> GPU%d = \"%15s\" %s capable of Peer-to-Peer (P2P)\n", i, dprop.name, (IsGPUCapableP2P(&dprop) ? "IS " : "NOT"));

    		//printf("   %d: %s\n", i, dprop.name);
  	}
  	printf("---------------------------\n");


	num_gpus = 3; //The case that fails
	omp_set_num_threads(num_gpus);

	if(num_gpus > 1){
	  for(int i=1; i<num_gpus; i++){
			cudaDeviceProp dprop0, dpropX;
			cudaGetDeviceProperties(&dprop0, 0);
			cudaGetDeviceProperties(&dpropX, i);
			int canAccessPeer0_x, canAccessPeerx_0;
			cudaDeviceCanAccessPeer(&canAccessPeer0_x, 0, i);
			cudaDeviceCanAccessPeer(&canAccessPeerx_0 , i, 0);
			printf("> Peer-to-Peer (P2P) access from %s (GPU%d) -> %s (GPU%d) : %s\n", dprop0.name, 0, dpropX.name, i, canAccessPeer0_x ? "Yes" : "No");
    			printf("> Peer-to-Peer (P2P) access from %s (GPU%d) -> %s (GPU%d) : %s\n", dpropX.name, i, dprop0.name, 0, canAccessPeerx_0 ? "Yes" : "No");
			if(canAccessPeer0_x == 0 || canAccessPeerx_0 == 0){
				printf("Two or more SM 2.0 class GPUs are required for %s to run.\n", argv[0]);
				printf("Support for UVA requires a GPU with SM 2.0 capabilities.\n");
				printf("Peer to Peer access is not available between GPU%d <-> GPU%d, waiving test.\n", 0, i);
				exit(EXIT_SUCCESS);
			}else{
				cudaSetDevice(0);
        			printf("Granting access from 0 to %d...\n", i);
				cudaDeviceEnablePeerAccess(i,0);
				cudaSetDevice(i);
        			printf("Granting access from %d to 0...\n", i);
				cudaDeviceEnablePeerAccess(0,0);

				printf("Checking GPU%d and GPU%d for UVA capabilities...\n", 0, 1);
				const bool has_uva = (dprop0.unifiedAddressing && dpropX.unifiedAddressing);
				printf("> %s (GPU%d) supports UVA: %s\n", dprop0.name, 0, (dprop0.unifiedAddressing ? "Yes" : "No"));
    				printf("> %s (GPU%d) supports UVA: %s\n", dpropX.name, i, (dpropX.unifiedAddressing ? "Yes" : "No"));
				if (has_uva){
        				printf("Both GPUs can support UVA, enabling...\n");
    				}
    				else{
        				printf("At least one of the two GPUs does NOT support UVA, waiving test.\n");
        				exit(EXIT_SUCCESS);
    				}
			}
	 	}
 	}

	int M = 512;
	int N = 512;

	cufftComplex *host_I = (cufftComplex*)malloc(M*N*sizeof(cufftComplex));
	for(int i=0;i<M;i++){
		for(int j=0;j<N;j++){
			host_I[N*i+j].x = 0.001;
			host_I[N*i+j].y = 0;
		}
	}

	cufftComplex *device_I;
	cudaSetDevice(0);
	cudaMalloc((void**)&device_I, sizeof(cufftComplex)*M*N);
  	cudaMemset(device_I, 0, sizeof(cufftComplex)*M*N);
	cudaMemcpy2D(device_I, sizeof(cufftComplex), host_I, sizeof(cufftComplex), sizeof(cufftComplex), M*N, cudaMemcpyHostToDevice);
	
	dim3 threads(32,32);
	dim3 blocks(M/threads.x, N/threads.y);
	dim3 threadsPerBlockNN = threads;
	dim3 numBlocksNN = blocks;
	#pragma omp parallel
	{
		unsigned int i = omp_get_thread_num();
		unsigned int num_cpu_threads = omp_get_num_threads();

		// set and check the CUDA device for this CPU thread
		int gpu_id = -1;
		cudaSetDevice(i % num_gpus);   // "% num_gpus" allows more CPU threads than GPU devices
		cudaGetDevice(&gpu_id);
		//printf("CPU thread %d (of %d) uses CUDA device %d\n", cpu_thread_id, num_cpu_threads, gpu_id);
		kernelFunction<<<numBlocksNN, threadsPerBlockNN>>>(device_I, i, N);
		cudaDeviceSynchronize();
	}

	cudaFree(device_I);

	for(int i=1; i<num_gpus; i++){
		cudaSetDevice(0);
		cudaDeviceDisablePeerAccess(i);
		cudaSetDevice(i);
		cudaDeviceDisablePeerAccess(0);
  	}
  
	for(int i=0; i<num_gpus; i++ ){
		cudaSetDevice(i);
		cudaDeviceReset();
	}
	
	free(host_I);

	


}

Here I have added compilable MVC code. It builds with:

nvcc -Xcompiler -fopenmp -lgomp -arch=sm_37 main.cu -lcufft

The results are the same.

Cross posting:

c - Getting nan results using Peer-to-Peer in Tesla K80 Cluster - Stack Overflow

Results of ./simpleP2P

./simpleP2P 
[./simpleP2P] - Starting...
Checking for multiple GPUs...
CUDA-capable device count: 8
> GPU0 = "      Tesla K80" IS  capable of Peer-to-Peer (P2P)
> GPU1 = "      Tesla K80" IS  capable of Peer-to-Peer (P2P)
> GPU2 = "      Tesla K80" IS  capable of Peer-to-Peer (P2P)
> GPU3 = "      Tesla K80" IS  capable of Peer-to-Peer (P2P)
> GPU4 = "      Tesla K80" IS  capable of Peer-to-Peer (P2P)
> GPU5 = "      Tesla K80" IS  capable of Peer-to-Peer (P2P)
> GPU6 = "      Tesla K80" IS  capable of Peer-to-Peer (P2P)
> GPU7 = "      Tesla K80" IS  capable of Peer-to-Peer (P2P)

Checking GPU(s) for support of peer to peer memory access...
> Peer-to-Peer (P2P) access from Tesla K80 (GPU0) -> Tesla K80 (GPU1) : Yes
> Peer-to-Peer (P2P) access from Tesla K80 (GPU1) -> Tesla K80 (GPU0) : Yes
Enabling peer access between GPU0 and GPU1...
Checking GPU0 and GPU1 for UVA capabilities...
> Tesla K80 (GPU0) supports UVA: Yes
> Tesla K80 (GPU1) supports UVA: Yes
Both GPUs can support UVA, enabling...
Allocating buffers (64MB on GPU0, GPU1 and CPU Host)...
Creating event handles...
cudaMemcpyPeer / cudaMemcpy between GPU0 and GPU1: 0.79GB/s
Preparing host buffer and memcpy to GPU0...
Run kernel on GPU1, taking source data from GPU0 and writing to GPU1...
Run kernel on GPU0, taking source data from GPU1 and writing to GPU0...
Copy data back to host from GPU0 and verify results...
Verification error @ element 0: val = nan, ref = 0.000000
Verification error @ element 1: val = nan, ref = 4.000000
Verification error @ element 2: val = nan, ref = 8.000000
Verification error @ element 3: val = nan, ref = 12.000000
Verification error @ element 4: val = nan, ref = 16.000000
Verification error @ element 5: val = nan, ref = 20.000000
Verification error @ element 6: val = nan, ref = 24.000000
Verification error @ element 7: val = nan, ref = 28.000000
Verification error @ element 8: val = nan, ref = 32.000000
Verification error @ element 9: val = nan, ref = 36.000000
Verification error @ element 10: val = nan, ref = 40.000000
Verification error @ element 11: val = nan, ref = 44.000000
Enabling peer access...
Shutting down...
Test failed!

‘fresh’ memory normally reads nan - memory freshly allocated, and not yet written to
thus, back to my original point - maybe the reading is valid, it is simply a case of a race - an unsynchronized read

you could initialize your arrays to initial values (0 or 1), synchronize on that first, and then commence as you do currently; if you then read the initial values, it is likely a case of poor synchronization
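
a minimal sketch of that idea, reusing the buffer names from the posted code: force a known value first, synchronize, and only then copy the real data; if the kernel later prints 0.0 instead of real coordinates, the copy had not completed when that GPU read the buffer, while a persisting nan points elsewhere

// sketch: known initial value, fully synchronized, before the real copy
cudaSetDevice(0);
gpuErrchk(cudaMemset(device_gridUV, 0, sizeof(float2)*M*N));
gpuErrchk(cudaDeviceSynchronize());          // the zeros are definitely in place now
gpuErrchk(cudaMemcpy(device_gridUV, host_griduv, sizeof(float2)*M*N, cudaMemcpyHostToDevice));
gpuErrchk(cudaDeviceSynchronize());          // and so is the real data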

perhaps i am reading it wrong, but i do not see a synchronization primitive/ command/ directive between the initial memory allocation and memory copy, and the subsequent switching to multiple threads
thus, i do not see how it is possible to assume the transfer is complete, by the time the device starts reading, for all devices
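
concretely, that would be something like a cudaDeviceSynchronize() on device 0 after the setup copies, plus a barrier inside the parallel region, along these lines (a sketch built on the MVC names, not tested on your cluster):

// sketch: make sure everything queued on GPU 0 has finished before any
// worker thread (possibly on another GPU) starts reading its memory
cudaSetDevice(0);
gpuErrchk(cudaDeviceSynchronize());

#pragma omp parallel
{
	unsigned int i = omp_get_thread_num();
	cudaSetDevice(i % num_gpus);
	#pragma omp barrier            // every thread has selected its device before any launch
	kernelFunction<<<numBlocksNN, threadsPerBlockNN>>>(device_I, i, N);
	gpuErrchk(cudaDeviceSynchronize());
}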

you could also change the physical device to host-thread mappings around
i think you are simply going to see the nan values move to another device accordingly, if the issue is indeed synchronization/ race related
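
for that remapping experiment a one-line change is enough (offset here is a hypothetical knob, not in the original code):

// sketch: rotate the thread-to-GPU mapping; if the nan follows the mapping
// (with offset = 1 it shows up on a different device), it points to a
// synchronization/race problem rather than one broken device
int offset = 1;                          // hypothetical: try 0, 1, 2, ...
cudaSetDevice((i + offset) % num_gpus);  // instead of cudaSetDevice(i % num_gpus)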