Unable to run kernel on device 1 with memory in device 2

Hi,
I have a piece of code which runs on multi-gpu configuration where, memory is present in device 1 and kernel is running on device 0. But the output of my kernel is wrong and printf is not working.

#include<iostream>
#include<stdio.h>
__global__ void vAdd(float* Ad, float* Bd, float* Cd)
{
  int tx = threadIdx.x;
  Cd[tx] = Ad[tx] + Bd[tx];
  printf("%f\n",Cd[tx]);
}


int main(){
  int st;
  cudaDeviceCanAccessPeer(&st, 2, 1);
  std::cout<<st<<std::endl;
  cudaDeviceCanAccessPeer(&st, 1, 2);
  std::cout<<st<<std::endl;
  cudaSetDevice(1);
  cudaDeviceEnablePeerAccess(2, 0);
  cudaSetDevice(2);
  cudaDeviceEnablePeerAccess(1, 0);

  cudaDeviceCanAccessPeer(&st, 1, 2);
  std::cout<<st<<std::endl;
  cudaDeviceCanAccessPeer(&st, 2, 1);
  std::cout<<st<<std::endl;

  cudaSetDevice(2);
  float *A = new float[64];
  float *B = new float[64];
  float *C = new float[64];
  for(int i=0;i<64; i++) {
    A[i] = 1;
    B[i] = 2;
    C[i] = 0;
  }
  float *Ad, *Bd, *Cd;
  cudaMalloc(&Ad, 64*4);
  cudaMalloc(&Bd, 64*4);
  cudaMalloc(&Cd, 64*4);

  cudaMemcpy(Ad, A, 64*4, cudaMemcpyHostToDevice);
  cudaMemcpy(Bd, B, 64*4, cudaMemcpyHostToDevice);
  cudaMemcpy(Cd, C, 64*4, cudaMemcpyHostToDevice);

  cudaSetDevice(1);
  vAdd<<<dim3(1,1,1), dim3(64,1,1)>>>(Ad, Bd, Cd);
  cudaSetDevice(2);

  cudaMemcpy(C, Cd, 64*4, cudaMemcpyDeviceToHost);
  std::cout<<C[10]<<std::endl;
}

Building with

nvcc -arch=compute_50 test.cu

I have devices on 0, 1, 2. Where, only 1 and 2 are p2p capable.

no error checking?

I removed it to post here. Here is the full source and output:

#include<iostream>
#include<stdio.h>
__global__ void vAdd(float* Ad, float* Bd, float* Cd)
{
  int tx = threadIdx.x;
  Cd[tx] = Ad[tx] + Bd[tx];
  printf("%f\n",Cd[tx]);
}

#define CUDA_CHECK(status) \
  std::cout<<cudaGetErrorName(status)<<" at line: "<<__LINE__<<std::endl;

int main(){
  int st;
  CUDA_CHECK(cudaDeviceCanAccessPeer(&st, 2, 1));
  std::cout<<st<<std::endl;
  CUDA_CHECK(cudaDeviceCanAccessPeer(&st, 1, 2));
  std::cout<<st<<std::endl;
  CUDA_CHECK(cudaSetDevice(1));
  CUDA_CHECK(cudaDeviceEnablePeerAccess(2, 0));
  CUDA_CHECK(cudaSetDevice(2));
  CUDA_CHECK(cudaDeviceEnablePeerAccess(1, 0));

  CUDA_CHECK(cudaDeviceCanAccessPeer(&st, 1, 2));
  std::cout<<st<<std::endl;
  CUDA_CHECK(cudaDeviceCanAccessPeer(&st, 2, 1));
  std::cout<<st<<std::endl;

  CUDA_CHECK(cudaSetDevice(2));
  float *A = new float[64];
  float *B = new float[64];
  float *C = new float[64];
  for(int i=0;i<64; i++) {
    A[i] = 1;
    B[i] = 2;
    C[i] = 0;
  }
  float *Ad, *Bd, *Cd;
  CUDA_CHECK(cudaMalloc(&Ad, 64*4));
  CUDA_CHECK(cudaMalloc(&Bd, 64*4));
  CUDA_CHECK(cudaMalloc(&Cd, 64*4));

  CUDA_CHECK(cudaMemcpy(Ad, A, 64*4, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(Bd, B, 64*4, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(Cd, C, 64*4, cudaMemcpyHostToDevice));

  CUDA_CHECK(cudaSetDevice(1));
  vAdd<<<dim3(1,1,1), dim3(64,1,1)>>>(Ad, Bd, Cd);
  CUDA_CHECK(cudaSetDevice(2));

  CUDA_CHECK(cudaMemcpy(C, Cd, 64*4, cudaMemcpyDeviceToHost));
  std::cout<<C[10]<<std::endl;
}
cudaSuccess at line: 15
1
cudaSuccess at line: 17
1
cudaSuccess at line: 19
cudaSuccess at line: 20
cudaSuccess at line: 21
cudaSuccess at line: 22
cudaSuccess at line: 24
1
cudaSuccess at line: 26
1
cudaSuccess at line: 29
cudaSuccess at line: 39
cudaSuccess at line: 40
cudaSuccess at line: 41
cudaSuccess at line: 43
cudaSuccess at line: 44
cudaSuccess at line: 45
cudaSuccess at line: 47
cudaSuccess at line: 49
cudaSuccess at line: 51
0

Your code actually runs correctly for me (that is the last printout is 3, not 0).

However your code has a logical error in it. Independent devices have independent default streams*. Since your cudaMemcpy operations are issued to one device and the kernel is issued to the other, these operations are actually issued to independent streams and may run concurrently.

Try adding a cudaDeviceSynchronize(); immediately after the kernel call, to see if it resolves the issue.

(*CUDA streams and events have implicit device association:

http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-selection)

Oh. Totally forgot cudaDeviceSynchronize existed. I called it for each device.

cudaDeviceSynchronize();
  CUDA_CHECK(cudaSetDevice(2));
  cudaDeviceSynchronize();

It didn’t work.

without the cudaDeviceSynchronize(), you don’t see the actual printf output from the kernel (as is evident in the printout you have posted).

After you add the cudaDevicSynchronize() call, do you see the output from the in-kernel printf ?

Also, wrap those cudaDeviceSynchronize calls in your CUDA_CHECK macro.

Hi,
Updated source

#include<iostream>
#include<stdio.h>
__global__ void vAdd(float* Ad, float* Bd, float* Cd)
{
  int tx = threadIdx.x;
  Cd[tx] = Ad[tx] + Bd[tx];
  printf("%f\n",Cd[tx]);
}

#define CUDA_CHECK(status) \
  std::cout<<cudaGetErrorName(status)<<" at line: "<<__LINE__<<std::endl;

int main(){
  int st;
  CUDA_CHECK(cudaDeviceCanAccessPeer(&st, 2, 1));
  std::cout<<st<<std::endl;
  CUDA_CHECK(cudaDeviceCanAccessPeer(&st, 1, 2));
  std::cout<<st<<std::endl;
  CUDA_CHECK(cudaSetDevice(1));
  CUDA_CHECK(cudaDeviceEnablePeerAccess(2, 0));
  CUDA_CHECK(cudaSetDevice(2));
  CUDA_CHECK(cudaDeviceEnablePeerAccess(1, 0));

  CUDA_CHECK(cudaDeviceCanAccessPeer(&st, 1, 2));
  std::cout<<st<<std::endl;
  CUDA_CHECK(cudaDeviceCanAccessPeer(&st, 2, 1));
  std::cout<<st<<std::endl;
  cudaStream_t stream1, stream2;
  CUDA_CHECK(cudaSetDevice(2));
  cudaStreamCreate(&stream2);
  CUDA_CHECK(cudaSetDevice(1));
  cudaStreamCreate(&stream1);
  CUDA_CHECK(cudaSetDevice(2));
  float *A = new float[64];
  float *B = new float[64];
  float *C = new float[64];
  for(int i=0;i<64; i++) {
    A[i] = 1;
    B[i] = 2;
    C[i] = 0;
  }
  float *Ad, *Bd, *Cd;
  CUDA_CHECK(cudaMalloc(&Ad, 64*4));
  CUDA_CHECK(cudaMalloc(&Bd, 64*4));
  CUDA_CHECK(cudaMalloc(&Cd, 64*4));

  CUDA_CHECK(cudaMemcpyAsync(Ad, A, 64*4, cudaMemcpyHostToDevice, stream2));
  CUDA_CHECK(cudaMemcpyAsync(Bd, B, 64*4, cudaMemcpyHostToDevice, stream2));
  CUDA_CHECK(cudaMemcpyAsync(Cd, C, 64*4, cudaMemcpyHostToDevice, stream2));

  CUDA_CHECK(cudaSetDevice(1));
  vAdd<<<dim3(1,1,1), dim3(64,1,1),0,stream1>>>(Ad, Bd, Cd);
  CUDA_CHECK(cudaDeviceSynchronize());
  CUDA_CHECK(cudaSetDevice(2));
  CUDA_CHECK(cudaDeviceSynchronize());
  CUDA_CHECK(cudaMemcpy(C, Cd, 64*4, cudaMemcpyDeviceToHost));
  std::cout<<C[10]<<std::endl;
}

what does the output look like when running this on your machine?

txbob identified the problem: you have a stream-level synchronization race. Device 2 may not have finished its memory copy before device 1 accesses it. Add a cudaDeviceSynchronize() at line 50,after the memcopies to device 2, but before the kernel on device 1 tries to access that memory. Alternatively you could use cudaMemcpy() instead of cudaMemcpyAsync().

I missed that!

In your code, you converted from cudaMemcpy (in your original posting) to cudaMemcpyAsync in your second code posting, immediately before the kernel call. As a result, you now need a cudaDeviceSynchronize after those operations, on that device, before you switch to the device where you launch the kernel.

Hi,
I tried by using streams to make things simpler.
Here is my changed code.

#include<iostream>
#include<stdio.h>
__global__ void vAdd(float* Ad, float* Bd, float* Cd)
{
  int tx = threadIdx.x;
  Cd[tx] = Ad[tx] + Bd[tx];
  printf("%f\n",Cd[tx]);
}

#define CUDA_CHECK(status) \
  std::cout<<cudaGetErrorName(status)<<" at line: "<<__LINE__<<std::endl;

int main(){
  int st;
  CUDA_CHECK(cudaDeviceCanAccessPeer(&st, 2, 1));
  std::cout<<st<<std::endl;
  CUDA_CHECK(cudaDeviceCanAccessPeer(&st, 1, 2));
  std::cout<<st<<std::endl;
  CUDA_CHECK(cudaSetDevice(1));
  CUDA_CHECK(cudaDeviceEnablePeerAccess(2, 0));
  CUDA_CHECK(cudaSetDevice(2));
  CUDA_CHECK(cudaDeviceEnablePeerAccess(1, 0));

  CUDA_CHECK(cudaDeviceCanAccessPeer(&st, 1, 2));
  std::cout<<st<<std::endl;
  CUDA_CHECK(cudaDeviceCanAccessPeer(&st, 2, 1));
  std::cout<<st<<std::endl;
  CUDA_CHECK(cudaSetDevice(2));
  float *A = new float[64];
  float *B = new float[64];
  float *C = new float[64];
  for(int i=0;i<64; i++) {
    A[i] = 1;
    B[i] = 2;
    C[i] = 0;
  }
  float *Ad, *Bd, *Cd;
  CUDA_CHECK(cudaMalloc(&Ad, 64*4));
  CUDA_CHECK(cudaMalloc(&Bd, 64*4));
  CUDA_CHECK(cudaMalloc(&Cd, 64*4));

  CUDA_CHECK(cudaMemcpy(Ad, A, 64*4, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(Bd, B, 64*4, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(Cd, C, 64*4, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaDeviceSynchronize());
  CUDA_CHECK(cudaSetDevice(1));
  vAdd<<<dim3(1,1,1), dim3(64,1,1)>>>(Ad, Bd, Cd);
  CUDA_CHECK(cudaDeviceSynchronize());
  CUDA_CHECK(cudaSetDevice(2));
  CUDA_CHECK(cudaDeviceSynchronize());
  CUDA_CHECK(cudaMemcpy(C, Cd, 64*4, cudaMemcpyDeviceToHost));
  CUDA_CHECK(cudaDeviceSynchronize());
  std::cout<<C[10]<<std::endl;
}