P2P broken on CUDA 5.0 driver

Hi all,

our regression tests using CUDA 5.0 are failing when testing the P2P copy.

I was able to create a simple test case that shows the problem:

#include <cuda_runtime_api.h>
#include <stdlib.h>
#include <iostream>

int main() {
  float* myVectorDevice0;
  float* myVectorDevice1;
  float* myInputHost;
  float* myOutputHost;

  const size_t myVectorSize = (1<<19);

  cudaSetDevice(0);
  cudaMalloc((void**)&myVectorDevice0, myVectorSize*sizeof(float));

  cudaSetDevice(1);
  cudaMalloc((void**)&myVectorDevice1, myVectorSize*sizeof(float));

  myInputHost = (float*)malloc(myVectorSize*sizeof(float));
  myOutputHost = (float*)malloc(myVectorSize*sizeof(float));

  cudaSetDevice(0);
  cudaDeviceEnablePeerAccess(1, 0); // Commenting out this line or the following one makes the test pass
  cudaSetDevice(1);

  for (int i = 0; i < myVectorSize; ++i) {
    myInputHost[i] = 1.0f*rand() / RAND_MAX;
    myOutputHost[i] = 1.0f*rand() / RAND_MAX;
  }

  cudaMemcpy(myVectorDevice0, myInputHost, myVectorSize*sizeof(float), cudaMemcpyDefault);
  cudaMemcpy(myVectorDevice1, myVectorDevice0, myVectorSize*sizeof(float), cudaMemcpyDefault);
  cudaMemcpy(myOutputHost, myVectorDevice1, myVectorSize*sizeof(float), cudaMemcpyDefault);

  for (int i = 0; i < myVectorSize; ++i) {
    if (myInputHost[i] != myOutputHost[i]) {
      std::cout << "i:" << i << " SRC: " << myInputHost[i] << " DST: " << myOutputHost[i] << std::endl;
      abort();
    }
  }
}

I compile the code above in this way:

g++ main_copy_error.cpp -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcudart

Note that if I comment out the line mentioned in the code, everything works. I also tried CUDA 4.2 with the CUDA 5.0 driver and the test still fails, so I guess it is just a driver issue.
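For reference, here is a small standalone check (not part of the failing test above; device IDs 0 and 1 are just the ones on my box) to confirm that the driver actually reports P2P support between the two cards:

#include <cuda_runtime_api.h>
#include <stdio.h>

int main() {
  int canAccess01 = 0, canAccess10 = 0;
  // Ask the runtime whether each device can map the other one's memory.
  cudaDeviceCanAccessPeer(&canAccess01, 0, 1);
  cudaDeviceCanAccessPeer(&canAccess10, 1, 0);
  printf("peer access 0 -> 1: %d, 1 -> 0: %d\n", canAccess01, canAccess10);
  return 0;
}

On my machine it reports 1 in both directions, so P2P between devices 0 and 1 is supposed to be supported.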

Can you post the output of deviceQuery, nvidia-smi, and lspci -t? I've looked at this on a couple of different machines now and it works on every machine.

Also, can you use my slightly updated code? It's a little better about checking errors around cudaDeviceEnablePeerAccess:

#include <cuda_runtime_api.h>
#include <stdlib.h>
#include <iostream>

int main() {
  float* myVectorDevice0;
  float* myVectorDevice1;
  float* myInputHost;
  float* myOutputHost;
  cudaError_t status;

  const size_t myVectorSize = (1<<19);

  cudaSetDevice(0);
  cudaMalloc((void**)&myVectorDevice0, myVectorSize*sizeof(float));

  cudaSetDevice(1);
  cudaMalloc((void**)&myVectorDevice1, myVectorSize*sizeof(float));

  myInputHost = (float*)malloc(myVectorSize*sizeof(float));
  myOutputHost = (float*)malloc(myVectorSize*sizeof(float));

  cudaSetDevice(0);
  cudaDeviceEnablePeerAccess(1, 0); // Commenting out this line or the following one makes the test pass
  cudaSetDevice(1);

  status = cudaGetLastError();
  if (status != cudaSuccess)
    printf("there was an error! it was %d\n", status);
  else
    printf("no error\n");

  for (int i = 0; i < myVectorSize; ++i) {
    myInputHost[i] = 1.0f*rand() / RAND_MAX;
    myOutputHost[i] = 1.0f*rand() / RAND_MAX;
  }

  cudaMemcpy(myVectorDevice0, myInputHost, myVectorSize*sizeof(float), cudaMemcpyDefault);
  cudaMemcpy(myVectorDevice1, myVectorDevice0, myVectorSize*sizeof(float), cudaMemcpyDefault);
  cudaMemcpy(myOutputHost, myVectorDevice1, myVectorSize*sizeof(float), cudaMemcpyDefault);

  for (int i = 0; i < myVectorSize; ++i) {
    if (myInputHost[i] != myOutputHost[i]) {
      std::cout << "i:" << i << " SRC: " << myInputHost[i] << " DST: " << myOutputHost[i] << std::endl;
      abort();
    }
  }

  printf("done!\n");
}

Something strange is going on; look at what I did this morning (shell log):

$ su -
Password:
lsmod | grep nvidia
nvidia 12319475 0
./devdriver_5.0_linux_64_302.06.03.run
Verifying archive integrity… OK
Uncompressing NVIDIA Accelerated Graphics Driver for Linux-x86_64 302.06.03…
logout
$ g++ main_copy_error.cpp -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcudart
./a.out
i:0 SRC: 0.840188 DST: 0.285239
Aborted (core dumped)
$ vi main_copy_error_2.cpp    << This is the code you posted
$ g++ main_copy_error_2.cpp -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcudart
main_copy_error_2.cpp: In function ‘int main()’:
main_copy_error_2.cpp:30: error: ‘printf’ was not declared in this scope
main_copy_error_2.cpp:32: error: ‘printf’ was not declared in this scope
main_copy_error_2.cpp:49: error: ‘printf’ was not declared in this scope
$ vi main_copy_error_2.cpp    << I added the #include <stdio.h>
$ g++ main_copy_error_2.cpp -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcudart
$ ./a.out
no error
done!

From this moment on, even mine doesn't fail anymore:

$ g++ main_copy_error.cpp -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcudart -o main_copy_error
$ g++ main_copy_error_2.cpp -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcudart -o main_copy_error_2
$ ./main_copy_error
$ ./main_copy_error_2
no error
done!

And this is very strange! I guess there is a race condition somewhere.

I'm still able to make the test fail if, in another shell, I run the following code:

// interfere code
#include <cuda_runtime_api.h>

int main() {
  float* myVectorDevice0;
  float* myVectorDevice1;
  const size_t myVectorSize = (1<<19);

  while (true) {
    cudaSetDevice(0);
    cudaMalloc((void**)&myVectorDevice0, myVectorSize*sizeof(float));

    cudaSetDevice(1);
    cudaMalloc((void**)&myVectorDevice1, myVectorSize*sizeof(float));

    cudaFree(myVectorDevice0);
    cudaFree(myVectorDevice1);
  }
}

Executing your test case in a loop, I get random failures:

$ for ((;;)) ; do ./main_copy_error_2; done
no error
done!
no error
done!
no error
done!
no error
done!
no error
done!
no error
done!
no error
i:141824 SRC: 0.0912551 DST: 0
Aborted (core dumped)
no error
i:0 SRC: 0.840188 DST: 0.688742
Aborted (core dumped)
no error
done!
no error
done!
no error
done!
no error
i:0 SRC: 0.840188 DST: 0.688742
Aborted (core dumped)
no error
i:262144 SRC: 0.688742 DST: 0.840188
Aborted (core dumped)
no error
done!
no error
done!
no error
done!
no error
done!
no error
i:403968 SRC: 0.510556 DST: 0
Aborted (core dumped)
no error
done!
no error
done!
no error
i:0 SRC: 0.840188 DST: 0.688742
Aborted (core dumped)
no error
i:141824 SRC: 0.0912551 DST: 0
Aborted (core dumped)
no error
i:262144 SRC: 0.688742 DST: 0.840188
Aborted (core dumped)
no error
done!

So, as you can see, if another process is allocating/freeing memory, this simple test case randomly fails.

And here are the nvidia-smi and lspci -t outputs:

$ nvidia-smi
Fri Jun 29 10:32:02 2012
+------------------------------------------------------+
| NVIDIA-SMI 3.302...   Driver Version: 302.06.03      |
|-------------------------------+----------------------+----------------------+
| GPU  Name                     | Bus-Id        Disp.  | Volatile ECC SB / DB |
| Fan  Temp  Perf  Pwr:Usage/Cap| Memory-Usage         | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  Tesla C2050              | 0000:02:00.0     Off |       Off            |
| 30%   78C    P0    N/A /  N/A |   0%    6MB / 3071MB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla C2050              | 0000:03:00.0     Off |       Off            |
| 30%   67C    P0    N/A /  N/A |   0%    6MB / 3071MB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  Tesla C2050              | 0000:83:00.0     Off |       Off            |
| 30%   79C    P0    N/A /  N/A |   0%    6MB / 3071MB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   3  Tesla C2050              | 0000:84:00.0     Off |       Off            |
| 30%   77C    P0    N/A /  N/A |   0%    6MB / 3071MB |      0%      Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Compute processes:                                               GPU Memory |
|  GPU       PID  Process name                                     Usage      |
|=============================================================================|
|  No running compute processes found                                         |
+-----------------------------------------------------------------------------+

-+-[0000:80]-+-00.0-[0000:81]--
 |           +-01.0-[0000:82]--
 |           +-03.0-[0000:83]--+-00.0
 |           |                 \-00.1
 |           +-07.0-[0000:84]--+-00.0
 |           |                 \-00.1
 |           +-13.0
 |           +-14.0
 |           +-14.1
 |           +-14.2
 |           +-14.3
 |           +-16.0
 |           +-16.1
 |           +-16.2
 |           +-16.3
 |           +-16.4
 |           +-16.5
 |           +-16.6
 |           \-16.7
 \-[0000:00]-+-00.0
             +-01.0-[0000:01]----00.0
             +-03.0-[0000:02]--+-00.0
             |                 \-00.1
             +-07.0-[0000:03]--+-00.0
             |                 \-00.1
             +-13.0
             +-14.0
             +-16.0
             +-16.1
             +-16.2
             +-16.3
             +-16.4
             +-16.5
             +-16.6
             +-16.7
             +-1a.0
             +-1a.1
             +-1a.2
             +-1a.7
             +-1b.0
             +-1c.0-[0000:04]----00.0
             +-1c.4-[0000:05]----00.0
             +-1c.5-[0000:06]----00.0
             +-1d.0
             +-1d.1
             +-1d.2
             +-1d.7
             +-1e.0-[0000:07]----01.0
             +-1f.0
             +-1f.2
             \-1f.3

I tried CUDA 4.2 with driver 295.41, running the allocating/freeing code in one shell and the test in a loop in another, and I hit what I believe is a bug in CUDA 4.2: the allocating/freeing code leaks memory, and the test loop fails like this:

$ for ((;;)) ; do ./copy2; done
no error
done!
no error
done!
there was an error! it was 2:out of memory
Segmentation fault (core dumped)
there was an error! it was 2:out of memory
Segmentation fault (core dumped)

And indeed, as you can see, memory usage on device 0 is at 100%:

$ nvidia-smi
Fri Jun 29 11:39:04 2012
+------------------------------------------------------+
| NVIDIA-SMI 3.295.41   Driver Version: 295.41         |
|-------------------------------+----------------------+----------------------+
| Nb.  Name                     | Bus Id        Disp.  | Volatile ECC SB / DB |
| Fan   Temp   Power Usage /Cap | Memory Usage         | GPU Util. Compute M. |
|===============================+======================+======================|
| 0.  Tesla C2050               | 0000:02:00.0  Off    |       Off            |
|  30%   81 C  P0    N/A /  N/A | 100% 3071MB / 3071MB |    0%     Default    |
|-------------------------------+----------------------+----------------------|
| 1.  Tesla C2050               | 0000:03:00.0  Off    |       Off            |
|  30%   73 C  P0    N/A /  N/A |   2%   75MB / 3071MB |    0%     Default    |
|-------------------------------+----------------------+----------------------|
| 2.  Tesla C2050               | 0000:83:00.0  Off    |       Off            |
|  30%   74 C  P0    N/A /  N/A |   0%    6MB / 3071MB |    0%     Default    |
|-------------------------------+----------------------+----------------------|
| 3.  Tesla C2050               | 0000:84:00.0  Off    |       Off            |
|  30%   76 C  P0    N/A /  N/A |   0%    6MB / 3071MB |    0%     Default    |
|-------------------------------+----------------------+----------------------|
| Compute processes:                                               GPU Memory |
|  GPU  PID     Process name                                       Usage      |
|=============================================================================|
|  0.  12318    ./interfere                                           3110MB  |
|  1.  12318    ./interfere                                           3112MB  |
+-----------------------------------------------------------------------------+

So it seems that in CUDA 4.2, cudaFree isn't able to figure out which device a pointer belongs to (without an explicit cudaSetDevice), so the memory is never released. I also don't like the fact that nvidia-smi reports the "./interfere" executable using 3 GB of memory on each card, when "interfere" is only "leaking" on device 0.

Let's try again with CUDA 5.0:

$ ./interfere

and nvidia-smi:

Fri Jun 29 11:54:14 2012
+------------------------------------------------------+
| NVIDIA-SMI 3.302...   Driver Version: 302.06.03      |
|-------------------------------+----------------------+----------------------+
| GPU  Name                     | Bus-Id        Disp.  | Volatile ECC SB / DB |
| Fan  Temp  Perf  Pwr:Usage/Cap| Memory-Usage         | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  Tesla C2050              | 0000:02:00.0     Off |       Off            |
| 30%   78C    P0    N/A /  N/A |   2%   67MB / 3071MB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla C2050              | 0000:03:00.0     Off |       Off            |
| 30%   69C    P0    N/A /  N/A |   2%   69MB / 3071MB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  Tesla C2050              | 0000:83:00.0     Off |       Off            |
| 30%   69C   P12    N/A /  N/A |   0%    6MB / 3071MB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   3  Tesla C2050              | 0000:84:00.0     Off |       Off            |
| 30%   69C   P12    N/A /  N/A |   0%    6MB / 3071MB |      0%      Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Compute processes:                                               GPU Memory |
|  GPU       PID  Process name                                     Usage      |
|=============================================================================|
|    0     13702  ./interfere                                          120MB  |
|    1     13702  ./interfere                                          120MB  |
+-----------------------------------------------------------------------------+

So now, with CUDA 5.0, cudaFree is able to detect which card is involved and there is no need to perform a cudaSetDevice. But, as said before, while "interfere" is running, the test doing the P2P copy keeps failing:

$ for ((;;)) ; do ./copy2; done
no error
i:0 SRC: 0.840188 DST: 0.285239
Aborted (core dumped)
no error
i:0 SRC: 0.840188 DST: 0.966038
Aborted (core dumped)
no error
i:0 SRC: 0.840188 DST: 0.688742
Aborted (core dumped)
no error
i:262144 SRC: 0.688742 DST: 0.840188
Aborted (core dumped)
no error
done!
no error
i:0 SRC: 0.840188 DST: 0.688742
Aborted (core dumped)
no error
i:262144 SRC: 0.688742 DST: 0.840188
Aborted (core dumped)

Let's now fix the "leak" (by putting an explicit cudaSetDevice before each cudaFree) in the interfere source code and try again with CUDA 4.2, to see whether the P2P copy works (just to verify that it is not a hardware issue).
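The only change is the explicit cudaSetDevice before each cudaFree; roughly, the fixed interfere code looks like this (same buffers and sizes as before, just a sketch):

// interfere code, fixed for CUDA 4.2: select the owning device before freeing
#include <cuda_runtime_api.h>

int main() {
  float* myVectorDevice0;
  float* myVectorDevice1;
  const size_t myVectorSize = (1<<19);

  while (true) {
    cudaSetDevice(0);
    cudaMalloc((void**)&myVectorDevice0, myVectorSize*sizeof(float));

    cudaSetDevice(1);
    cudaMalloc((void**)&myVectorDevice1, myVectorSize*sizeof(float));

    // Free each buffer while its own device is current, so CUDA 4.2 releases it correctly.
    cudaSetDevice(0);
    cudaFree(myVectorDevice0);

    cudaSetDevice(1);
    cudaFree(myVectorDevice1);
  }
}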

(while interfere is running)

$ for ((;;)) ; do ./copy2; done
no error
done!
no error
done!
no error
done!
no error
done!
no error
done!
no error
done!
no error
done!
.....

The test has now been running on CUDA 4.2 for 20 minutes without problems.

So, to sum up:

CUDA 4.2:
Needs a cudaSetDevice before each cudaFree, or it will leak. BUG.
Allocating/freeing memory in one process while performing a cudaMemcpy (with P2P enabled) in another process works without problems.

CUDA 5.0:
No need to perform a cudaSetDevice before a cudaFree.
Allocating/freeing memory in one process while performing a cudaMemcpy (with P2P enabled) in another process doesn't work. BUG.

With 5.0, you're almost certainly hitting the known issue of context creation sometimes failing at random.
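In the meantime, checking the return status of every runtime call (rather than calling cudaGetLastError once) will at least turn the problem into an explicit error instead of silent data corruption. A rough sketch of that pattern, using a hypothetical CHECK macro, would be:

#include <cuda_runtime_api.h>
#include <stdio.h>
#include <stdlib.h>

// Hypothetical helper: print the API error string and abort on any failure.
#define CHECK(call)                                                          \
  do {                                                                       \
    cudaError_t err_ = (call);                                               \
    if (err_ != cudaSuccess) {                                               \
      fprintf(stderr, "%s failed: %s\n", #call, cudaGetErrorString(err_));   \
      abort();                                                               \
    }                                                                        \
  } while (0)

int main() {
  int canAccess = 0;
  CHECK(cudaSetDevice(0));
  CHECK(cudaDeviceCanAccessPeer(&canAccess, 0, 1));
  if (!canAccess) {
    fprintf(stderr, "device 0 cannot access device 1\n");
    return 1;
  }
  CHECK(cudaDeviceEnablePeerAccess(1, 0));
  printf("peer access 0 -> 1 enabled\n");
  return 0;
}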

I guess I have to wait, then.

What about the cudaSetDevice that needs to be performed before a cudaFree in CUDA 4.2?