As the CUDA documentation (*1) says, we need to call cudaSetDevice(i) before calling cudaMalloc() so that the device memory is allocated on the specific device (id=i).
However, what about cudaFree()? The documentation does not mention cudaFree() at all.
In practice, I know that cudaFree(p) deallocates the memory without a matching cudaSetDevice() call, even when p resides on a different device than the current one.
// nvcc main.cu && ./a.out
#include <iostream>
#include <cstdlib>
#include <unistd.h>
// Report a failing CUDA runtime status on stderr; stays silent on cudaSuccess.
void checkResult(cudaError_t e){
    if(e == cudaSuccess){
        return;
    }
    std::cerr << cudaGetErrorName(e) << std::endl;
}
// Repro: allocate 1 GiB on each of two devices, then free both allocations
// while device 0 is current, then retry the frees with device 1 current.
// Assumes at least 2 GPUs are present — TODO confirm before running.
int main(){
// One device-memory handle per device (index i = device id).
void *ptr[2];
for(int i = 0;i <2;i++){
checkResult(cudaSetDevice(i));
// Allocates 1 GiB on the device selected by cudaSetDevice(i) just above.
checkResult(cudaMalloc(ptr+i, 1024*1024*1024));
}
std::system("nvidia-smi"); // Memory-Usage +1024MiB on both devices
// Current device is 0, yet we free the allocations from BOTH devices here.
checkResult(cudaSetDevice(0));
checkResult(cudaFree(ptr[0]));
checkResult(cudaFree(ptr[1]));
// Delay so the subsequent nvidia-smi snapshot reflects the frees.
sleep(10);
std::system("nvidia-smi"); // Memory-Usage -1024MiB on both devices (not only in device 0)
checkResult(cudaSetDevice(1));
// NOTE(review): both pointers were already freed above, so the errors below
// may reflect the double-free rather than the device switch — worth isolating
// these two effects in a separate experiment.
checkResult(cudaFree(ptr[0])); // cudaErrorInvalidDevicePointer
checkResult(cudaFree(ptr[1])); // cudaErrorInvalidDevicePointer
sleep(10);
std::system("nvidia-smi"); // says Memory-Usage is not changed on both devices
}
I want to know whether this behavior is implementation-dependent (i.e., may vary across CUDA SDK versions) or is guaranteed by the CUDA specification.
I also know that the official NCCL examples use both styles: one that calls cudaSetDevice before freeing (*2) and one that does not (*3).
I need to know the rule so I can review our production code.
- *1: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html#group__CUDART__DEVICE_1g159587909ffa0791bbe4b40187a4c6bb
- *2: https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/examples.html#example-1-single-process-single-thread-multiple-devices
- *3: https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/examples.html#example-3-multiple-devices-per-thread