Unable to run kernel on device 1 with memory in device 2

Aditya_Avinash · January 23, 2017, 4:05pm

Hi,
I have a piece of code which runs on multi-gpu configuration where, memory is present in device 1 and kernel is running on device 0. But the output of my kernel is wrong and printf is not working.

#include<iostream>
#include<stdio.h>
__global__ void vAdd(float* Ad, float* Bd, float* Cd)
{
  int tx = threadIdx.x;
  Cd[tx] = Ad[tx] + Bd[tx];
  printf("%f\n",Cd[tx]);
}


int main(){
  int st;
  cudaDeviceCanAccessPeer(&st, 2, 1);
  std::cout<<st<<std::endl;
  cudaDeviceCanAccessPeer(&st, 1, 2);
  std::cout<<st<<std::endl;
  cudaSetDevice(1);
  cudaDeviceEnablePeerAccess(2, 0);
  cudaSetDevice(2);
  cudaDeviceEnablePeerAccess(1, 0);

  cudaDeviceCanAccessPeer(&st, 1, 2);
  std::cout<<st<<std::endl;
  cudaDeviceCanAccessPeer(&st, 2, 1);
  std::cout<<st<<std::endl;

  cudaSetDevice(2);
  float *A = new float[64];
  float *B = new float[64];
  float *C = new float[64];
  for(int i=0;i<64; i++) {
    A[i] = 1;
    B[i] = 2;
    C[i] = 0;
  }
  float *Ad, *Bd, *Cd;
  cudaMalloc(&Ad, 64*4);
  cudaMalloc(&Bd, 64*4);
  cudaMalloc(&Cd, 64*4);

  cudaMemcpy(Ad, A, 64*4, cudaMemcpyHostToDevice);
  cudaMemcpy(Bd, B, 64*4, cudaMemcpyHostToDevice);
  cudaMemcpy(Cd, C, 64*4, cudaMemcpyHostToDevice);

  cudaSetDevice(1);
  vAdd<<<dim3(1,1,1), dim3(64,1,1)>>>(Ad, Bd, Cd);
  cudaSetDevice(2);

  cudaMemcpy(C, Cd, 64*4, cudaMemcpyDeviceToHost);
  std::cout<<C[10]<<std::endl;
}

Building with

nvcc -arch=compute_50 test.cu

I have devices on 0, 1, 2. Where, only 1 and 2 are p2p capable.

Robert_Crovella · January 23, 2017, 5:22pm

no error checking?

Aditya_Avinash · January 23, 2017, 5:45pm

I removed it to post here. Here is the full source and output:

#include<iostream>
#include<stdio.h>
__global__ void vAdd(float* Ad, float* Bd, float* Cd)
{
  int tx = threadIdx.x;
  Cd[tx] = Ad[tx] + Bd[tx];
  printf("%f\n",Cd[tx]);
}

#define CUDA_CHECK(status) \
  std::cout<<cudaGetErrorName(status)<<" at line: "<<__LINE__<<std::endl;

int main(){
  int st;
  CUDA_CHECK(cudaDeviceCanAccessPeer(&st, 2, 1));
  std::cout<<st<<std::endl;
  CUDA_CHECK(cudaDeviceCanAccessPeer(&st, 1, 2));
  std::cout<<st<<std::endl;
  CUDA_CHECK(cudaSetDevice(1));
  CUDA_CHECK(cudaDeviceEnablePeerAccess(2, 0));
  CUDA_CHECK(cudaSetDevice(2));
  CUDA_CHECK(cudaDeviceEnablePeerAccess(1, 0));

  CUDA_CHECK(cudaDeviceCanAccessPeer(&st, 1, 2));
  std::cout<<st<<std::endl;
  CUDA_CHECK(cudaDeviceCanAccessPeer(&st, 2, 1));
  std::cout<<st<<std::endl;

  CUDA_CHECK(cudaSetDevice(2));
  float *A = new float[64];
  float *B = new float[64];
  float *C = new float[64];
  for(int i=0;i<64; i++) {
    A[i] = 1;
    B[i] = 2;
    C[i] = 0;
  }
  float *Ad, *Bd, *Cd;
  CUDA_CHECK(cudaMalloc(&Ad, 64*4));
  CUDA_CHECK(cudaMalloc(&Bd, 64*4));
  CUDA_CHECK(cudaMalloc(&Cd, 64*4));

  CUDA_CHECK(cudaMemcpy(Ad, A, 64*4, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(Bd, B, 64*4, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(Cd, C, 64*4, cudaMemcpyHostToDevice));

  CUDA_CHECK(cudaSetDevice(1));
  vAdd<<<dim3(1,1,1), dim3(64,1,1)>>>(Ad, Bd, Cd);
  CUDA_CHECK(cudaSetDevice(2));

  CUDA_CHECK(cudaMemcpy(C, Cd, 64*4, cudaMemcpyDeviceToHost));
  std::cout<<C[10]<<std::endl;
}

cudaSuccess at line: 15
1
cudaSuccess at line: 17
1
cudaSuccess at line: 19
cudaSuccess at line: 20
cudaSuccess at line: 21
cudaSuccess at line: 22
cudaSuccess at line: 24
1
cudaSuccess at line: 26
1
cudaSuccess at line: 29
cudaSuccess at line: 39
cudaSuccess at line: 40
cudaSuccess at line: 41
cudaSuccess at line: 43
cudaSuccess at line: 44
cudaSuccess at line: 45
cudaSuccess at line: 47
cudaSuccess at line: 49
cudaSuccess at line: 51
0

Robert_Crovella · January 23, 2017, 6:10pm

Your code actually runs correctly for me (that is the last printout is 3, not 0).

However your code has a logical error in it. Independent devices have independent default streams*. Since your cudaMemcpy operations are issued to one device and the kernel is issued to the other, these operations are actually issued to independent streams and may run concurrently.

Try adding a cudaDeviceSynchronize(); immediately after the kernel call, to see if it resolves the issue.

(*CUDA streams and events have implicit device association:

[url]http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-selection[/url])

Aditya_Avinash · January 23, 2017, 7:13pm

Oh. Totally forgot cudaDeviceSynchronize existed. I called it for each device.

cudaDeviceSynchronize();
  CUDA_CHECK(cudaSetDevice(2));
  cudaDeviceSynchronize();

It didn’t work.

Robert_Crovella · January 23, 2017, 7:24pm

without the cudaDeviceSynchronize(), you don’t see the actual printf output from the kernel (as is evident in the printout you have posted).

After you add the cudaDevicSynchronize() call, do you see the output from the in-kernel printf ?

Also, wrap those cudaDeviceSynchronize calls in your CUDA_CHECK macro.

Aditya_Avinash · January 23, 2017, 8:37pm

Hi,
Updated source

#include<iostream>
#include<stdio.h>
__global__ void vAdd(float* Ad, float* Bd, float* Cd)
{
  int tx = threadIdx.x;
  Cd[tx] = Ad[tx] + Bd[tx];
  printf("%f\n",Cd[tx]);
}

#define CUDA_CHECK(status) \
  std::cout<<cudaGetErrorName(status)<<" at line: "<<__LINE__<<std::endl;

int main(){
  int st;
  CUDA_CHECK(cudaDeviceCanAccessPeer(&st, 2, 1));
  std::cout<<st<<std::endl;
  CUDA_CHECK(cudaDeviceCanAccessPeer(&st, 1, 2));
  std::cout<<st<<std::endl;
  CUDA_CHECK(cudaSetDevice(1));
  CUDA_CHECK(cudaDeviceEnablePeerAccess(2, 0));
  CUDA_CHECK(cudaSetDevice(2));
  CUDA_CHECK(cudaDeviceEnablePeerAccess(1, 0));

  CUDA_CHECK(cudaDeviceCanAccessPeer(&st, 1, 2));
  std::cout<<st<<std::endl;
  CUDA_CHECK(cudaDeviceCanAccessPeer(&st, 2, 1));
  std::cout<<st<<std::endl;
  cudaStream_t stream1, stream2;
  CUDA_CHECK(cudaSetDevice(2));
  cudaStreamCreate(&stream2);
  CUDA_CHECK(cudaSetDevice(1));
  cudaStreamCreate(&stream1);
  CUDA_CHECK(cudaSetDevice(2));
  float *A = new float[64];
  float *B = new float[64];
  float *C = new float[64];
  for(int i=0;i<64; i++) {
    A[i] = 1;
    B[i] = 2;
    C[i] = 0;
  }
  float *Ad, *Bd, *Cd;
  CUDA_CHECK(cudaMalloc(&Ad, 64*4));
  CUDA_CHECK(cudaMalloc(&Bd, 64*4));
  CUDA_CHECK(cudaMalloc(&Cd, 64*4));

  CUDA_CHECK(cudaMemcpyAsync(Ad, A, 64*4, cudaMemcpyHostToDevice, stream2));
  CUDA_CHECK(cudaMemcpyAsync(Bd, B, 64*4, cudaMemcpyHostToDevice, stream2));
  CUDA_CHECK(cudaMemcpyAsync(Cd, C, 64*4, cudaMemcpyHostToDevice, stream2));

  CUDA_CHECK(cudaSetDevice(1));
  vAdd<<<dim3(1,1,1), dim3(64,1,1),0,stream1>>>(Ad, Bd, Cd);
  CUDA_CHECK(cudaDeviceSynchronize());
  CUDA_CHECK(cudaSetDevice(2));
  CUDA_CHECK(cudaDeviceSynchronize());
  CUDA_CHECK(cudaMemcpy(C, Cd, 64*4, cudaMemcpyDeviceToHost));
  std::cout<<C[10]<<std::endl;
}

Robert_Crovella · January 23, 2017, 10:06pm

what does the output look like when running this on your machine?

SPWorley · January 23, 2017, 10:36pm

txbob identified the problem: you have a stream-level synchronization race. Device 2 may not have finished its memory copy before device 1 accesses it. Add a cudaDeviceSynchronize() at line 50,after the memcopies to device 2, but before the kernel on device 1 tries to access that memory. Alternatively you could use cudaMemcpy() instead of cudaMemcpyAsync().

Robert_Crovella · January 24, 2017, 12:03am

I missed that!

In your code, you converted from cudaMemcpy (in your original posting) to cudaMemcpyAsync in your second code posting, immediately before the kernel call. As a result, you now need a cudaDeviceSynchronize after those operations, on that device, before you switch to the device where you launch the kernel.

Aditya_Avinash · January 24, 2017, 3:20pm

Hi,
I tried by using streams to make things simpler.
Here is my changed code.

#include<iostream>
#include<stdio.h>
__global__ void vAdd(float* Ad, float* Bd, float* Cd)
{
  int tx = threadIdx.x;
  Cd[tx] = Ad[tx] + Bd[tx];
  printf("%f\n",Cd[tx]);
}

#define CUDA_CHECK(status) \
  std::cout<<cudaGetErrorName(status)<<" at line: "<<__LINE__<<std::endl;

int main(){
  int st;
  CUDA_CHECK(cudaDeviceCanAccessPeer(&st, 2, 1));
  std::cout<<st<<std::endl;
  CUDA_CHECK(cudaDeviceCanAccessPeer(&st, 1, 2));
  std::cout<<st<<std::endl;
  CUDA_CHECK(cudaSetDevice(1));
  CUDA_CHECK(cudaDeviceEnablePeerAccess(2, 0));
  CUDA_CHECK(cudaSetDevice(2));
  CUDA_CHECK(cudaDeviceEnablePeerAccess(1, 0));

  CUDA_CHECK(cudaDeviceCanAccessPeer(&st, 1, 2));
  std::cout<<st<<std::endl;
  CUDA_CHECK(cudaDeviceCanAccessPeer(&st, 2, 1));
  std::cout<<st<<std::endl;
  CUDA_CHECK(cudaSetDevice(2));
  float *A = new float[64];
  float *B = new float[64];
  float *C = new float[64];
  for(int i=0;i<64; i++) {
    A[i] = 1;
    B[i] = 2;
    C[i] = 0;
  }
  float *Ad, *Bd, *Cd;
  CUDA_CHECK(cudaMalloc(&Ad, 64*4));
  CUDA_CHECK(cudaMalloc(&Bd, 64*4));
  CUDA_CHECK(cudaMalloc(&Cd, 64*4));

  CUDA_CHECK(cudaMemcpy(Ad, A, 64*4, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(Bd, B, 64*4, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(Cd, C, 64*4, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaDeviceSynchronize());
  CUDA_CHECK(cudaSetDevice(1));
  vAdd<<<dim3(1,1,1), dim3(64,1,1)>>>(Ad, Bd, Cd);
  CUDA_CHECK(cudaDeviceSynchronize());
  CUDA_CHECK(cudaSetDevice(2));
  CUDA_CHECK(cudaDeviceSynchronize());
  CUDA_CHECK(cudaMemcpy(C, Cd, 64*4, cudaMemcpyDeviceToHost));
  CUDA_CHECK(cudaDeviceSynchronize());
  std::cout<<C[10]<<std::endl;
}

Topic		Replies	Views
cudaDeviceSynchronize needed between kernel launch and cudaMemcpy ? CUDA Programming and Performance	15	16070	September 29, 2017
cudaSynchronizeDevice() returns error code 6 CUDA Programming and Performance	7	8590	June 16, 2011
Asynchronous multi streaming: not working... CUDA Programming and Performance	2	513	May 13, 2018
Cannot force kernels to concurrent execution CUDA Programming and Performance	8	5541	April 28, 2012
incomprehensible behaviour limitations on kernel calls for host function? CUDA Programming and Performance	12	7031	April 28, 2011
My streams are not running concurrently CUDA Programming and Performance	7	1740	March 6, 2018
MultiGPU start help CUDA Programming and Performance	8	10521	August 10, 2010
11.2 > cudaMemPool_t and Peer2Peer CUDA Programming and Performance	4	1020	January 14, 2021
Can't get any concurrency on simple vector add across multi-GPU and streams CUDA Programming and Performance	17	5752	April 28, 2012
Does cudaStreamWaitEvent(stream2, event1, 0) also block the stream to record event1? CUDA Programming and Performance	3	197	May 24, 2024

Unable to run kernel on device 1 with memory in device 2

Related topics