Hi,
I have a piece of code which runs on multi-gpu configuration where, memory is present in device 1 and kernel is running on device 0. But the output of my kernel is wrong and printf is not working.
#include<iostream>
#include<stdio.h>
__global__ void vAdd(float* Ad, float* Bd, float* Cd)
{
int tx = threadIdx.x;
Cd[tx] = Ad[tx] + Bd[tx];
printf("%f\n",Cd[tx]);
}
int main(){
int st;
cudaDeviceCanAccessPeer(&st, 2, 1);
std::cout<<st<<std::endl;
cudaDeviceCanAccessPeer(&st, 1, 2);
std::cout<<st<<std::endl;
cudaSetDevice(1);
cudaDeviceEnablePeerAccess(2, 0);
cudaSetDevice(2);
cudaDeviceEnablePeerAccess(1, 0);
cudaDeviceCanAccessPeer(&st, 1, 2);
std::cout<<st<<std::endl;
cudaDeviceCanAccessPeer(&st, 2, 1);
std::cout<<st<<std::endl;
cudaSetDevice(2);
float *A = new float[64];
float *B = new float[64];
float *C = new float[64];
for(int i=0;i<64; i++) {
A[i] = 1;
B[i] = 2;
C[i] = 0;
}
float *Ad, *Bd, *Cd;
cudaMalloc(&Ad, 64*4);
cudaMalloc(&Bd, 64*4);
cudaMalloc(&Cd, 64*4);
cudaMemcpy(Ad, A, 64*4, cudaMemcpyHostToDevice);
cudaMemcpy(Bd, B, 64*4, cudaMemcpyHostToDevice);
cudaMemcpy(Cd, C, 64*4, cudaMemcpyHostToDevice);
cudaSetDevice(1);
vAdd<<<dim3(1,1,1), dim3(64,1,1)>>>(Ad, Bd, Cd);
cudaSetDevice(2);
cudaMemcpy(C, Cd, 64*4, cudaMemcpyDeviceToHost);
std::cout<<C[10]<<std::endl;
}
Building with
nvcc -arch=compute_50 test.cu
I have devices on 0, 1, 2. Where, only 1 and 2 are p2p capable.