Hi to everybody,
I have a problem using malloc inside device code and copy data from host memory.
I start with code
#include <cuda.h>
#include <cuda_runtime_api.h>
class CuMat {
public:
typedef CuMat* Ptr;
char* data;
__device__ CuMat() {
unsigned int sizeByte = 10;
data = (unsigned char *) malloc(sizeByte);
printf("allocated %d bytes at position %x\n", sizeByte, (int)data);
memset(data, 1, sizeByte);
}
}
// from Host
void testFuction {
char* test = (char*) malloc(10); // allocate data on host
CuMat::Ptr cuSrc;
cudaError_t error = cudaMalloc(&cuSrc, sizeof(CuMat)); // allocate data on device
initCudaMat<<< 1 , 1 >>>(cuSrc); // initialize device
cudaDeviceSynchronize();
CuMat::Ptr cuSrcHost = (CuMat::Ptr) malloc(sizeof(CuMat));
error = cudaMemcpy(cuSrcHost, cuSrc, sizeof(CuMat), cudaMemcpyDeviceToHost); // OK
error = cudaMemcpy(cuSrcHost->data, test, 10, cudaMemcpyDeviceToHost); // FAIL
}
__global__ void initCudaMat(CuMat* mat) {
new (mat) CuMat();
}
The problem seems to be the code:
error = cudaMemcpy(cuSrcHost->data, prova, 10, cudaMemcpyDeviceToHost);
but I don’t understand the cause of the problem.
It’s true that cuSrcHost is on host, but cuSrcHost->data contains a pointer that reside on device heap memory (as written in B.18. Dynamic Global Memory Allocation and Operations of http://docs.nvidia.com/cuda/pdf/CUDA_C_Programming_Guide.pdf).
Or am I wrong?
Which is my mistake?
Thanks to all!