This program is just a test to see whether I can malloc an array on the device inside a kernel, have that new array outlive the kernel, and transfer it back to the host.
I tried it with both malloc and cudaMalloc on the device. With malloc, at least, I can successfully read on the device a value that was assigned to the malloc'd array, whereas cudaMalloc on the device doesn't seem to work at all - I can't even successfully read on the device a value that I assigned to it. cuda-memcheck reports that the 2nd cudaMemcpyAsync fails, even though the address of the source device-malloc'd array (printed as retVal[0] on the device and as matrix_dev on the host) matches.
#include <stdio.h>
// Test kernel: allocates a 5-float array on the device heap, stores its
// address in retVal[0], fills it with 0..4 and prints element 4 and the
// array address.
//
// retVal: device pointer to at least one writable float* slot. On return the
//         slot holds the address of the new array, or NULL if the device
//         heap allocation failed.
//
// Launch layout: intended for a single thread (<<<1,1>>>). The kernel does
// not use thread indices, so every launched thread would allocate its own
// array and overwrite retVal[0].
//
// The array is deliberately not freed here so that it outlives the kernel.
// NOTE: per the CUDA programming guide, memory obtained from in-kernel
// malloc() can only be accessed from device code; it cannot be passed to
// host-side runtime calls such as cudaMemcpy.
__global__ void testRetVal(float **retVal) {
    const int kCount = 5;

    float *values = (float*)malloc(sizeof(float) * kCount);
    // cudaMalloc((void**)(&(retVal[0])), 5*sizeof(float));

    // Publish the pointer even when it is NULL so the caller can detect failure.
    retVal[0] = values;
    if (values == NULL) {
        // Device malloc returns NULL when the device heap is exhausted.
        printf("testRetVal: device malloc failed\n");
        return;
    }

    for (int i = 0; i < kCount; ++i) {
        values[i] = (float)i;
    }

    printf("retVal[0][4] = %f\n", values[4]);
    // %llx expects an unsigned long long, so convert the pointer explicitly.
    printf("retVal[0] = %llx\n", (unsigned long long)values);
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when err is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char *what) {
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Copies count floats from src to dst on the device.
// Used to move data out of a device-heap array (which host-side copy calls
// cannot read) into a cudaMalloc'd buffer (which they can).
// Launch layout: a single thread (<<<1,1>>>); the copy is a serial loop.
__global__ void copyHeapArrayKernel(float *dst, const float *src, int count) {
    for (int i = 0; i < count; ++i) {
        dst[i] = src[i];
    }
}

// Runs testRetVal, fetches the address of the array it allocated on the
// device heap, and transfers the array contents back to the host.
//
// The contents are first copied by a kernel into a cudaMalloc'd staging
// buffer: memory from in-kernel malloc cannot be used as the source of
// cudaMemcpy/cudaMemcpyAsync, which is why copying straight from the heap
// pointer fails.
//
// Returns 0 on success and 1 if any CUDA call or the in-kernel allocation
// failed.
int main(int argc, char **argv) {
    (void)argc;
    (void)argv;

    const int kCount = 5;

    // Device slot that receives the address of the in-kernel allocation.
    float **retVal = NULL;
    if (!cudaOk(cudaMalloc(&retVal, sizeof(float*)), "cudaMalloc(retVal)")) {
        return 1;
    }

    // Host-API-visible device buffer the heap array is staged through.
    float *staging = NULL;
    if (!cudaOk(cudaMalloc(&staging, sizeof(float) * kCount), "cudaMalloc(staging)")) {
        return 1;
    }

    cudaStream_t strm;
    if (!cudaOk(cudaStreamCreate(&strm), "cudaStreamCreate")) {
        return 1;
    }

    // Start from a NULL slot so a kernel that never ran is distinguishable
    // from one that allocated successfully.
    if (!cudaOk(cudaMemsetAsync(retVal, 0, sizeof(float*), strm), "cudaMemsetAsync(retVal)")) {
        return 1;
    }

    testRetVal<<<1,1,0,strm>>>(retVal);
    if (!cudaOk(cudaGetLastError(), "testRetVal launch")) {
        return 1;
    }

    // Fetch the pointer VALUE stored in retVal[0]; retVal itself is ordinary
    // cudaMalloc'd memory, so this copy is legal.
    float *matrix_dev = NULL;
    if (!cudaOk(cudaMemcpyAsync(&matrix_dev, retVal, sizeof(float*),
                                cudaMemcpyDeviceToHost, strm),
                "cudaMemcpyAsync(matrix_dev)")) {
        return 1;
    }
    // matrix_dev is read on the host below, so the copy must have completed.
    if (!cudaOk(cudaStreamSynchronize(strm), "cudaStreamSynchronize")) {
        return 1;
    }
    printf("matrix_dev = %llx\n", (unsigned long long)matrix_dev);
    if (matrix_dev == NULL) {
        fprintf(stderr, "testRetVal did not allocate an array\n");
        return 1;
    }

    // The heap pointer is valid inside kernels, so let the device move the
    // data into the staging buffer.
    copyHeapArrayKernel<<<1,1,0,strm>>>(staging, matrix_dev, kCount);
    if (!cudaOk(cudaGetLastError(), "copyHeapArrayKernel launch")) {
        return 1;
    }

    float matrix[kCount];
    if (!cudaOk(cudaMemcpyAsync(matrix, staging, sizeof(float) * kCount,
                                cudaMemcpyDeviceToHost, strm),
                "cudaMemcpyAsync(matrix)")) {
        return 1;
    }
    if (!cudaOk(cudaStreamSynchronize(strm), "cudaStreamSynchronize")) {
        return 1;
    }
    printf("matrix[4] = %f\n", matrix[4]);

    // Release host-allocated resources. The device-heap array is left alone:
    // it can only be released from device code.
    cudaFree(staging);
    cudaFree(retVal);
    cudaStreamDestroy(strm);
    return 0;
}