CUDA class - allocate memory using malloc (Dynamic Global Memory Allocation and Operations)

Hi to everybody,

I have a problem using malloc inside device code and then copying data to it from host memory.
I'll start with the code:

#include <cuda.h>
#include <cuda_runtime_api.h>

// Small device-side container: the object itself lives in cudaMalloc'd global
// memory, while `data` is allocated from the DEVICE HEAP via in-kernel malloc.
// NOTE: per the CUDA Programming Guide ("Dynamic Global Memory Allocation and
// Operations"), device-heap pointers are NOT usable with host runtime calls
// such as cudaMemcpy/cudaFree — only device code can dereference/free them.
class CuMat {
public:                                   // must be public: host code reads cuSrcHost->data
    typedef CuMat* Ptr;
    char* data;

    // Runs on the device only (constructed via placement-new from a kernel).
    __device__ CuMat() {
        unsigned int sizeByte = 10;
        data = (char*) malloc(sizeByte);  // cast matches the member type (was unsigned char*)
        // %u for unsigned, %p for a pointer — casting a 64-bit pointer to int truncates it
        printf("allocated %u bytes at position %p\n", sizeByte, (void*)data);
        memset(data, 1, sizeByte);
    }
};

// from Host

// Host-side test: constructs a CuMat on the device, copies the object back,
// then tries to fill its device-heap buffer from the host (which fails).
void testFuction() {

    char* test = (char*) malloc(10); // host scratch buffer

    CuMat::Ptr cuSrc;
    cudaError_t error = cudaMalloc(&cuSrc, sizeof(CuMat)); // object storage on device
    initCudaMat<<< 1 , 1 >>>(cuSrc);           // placement-new construct on the device
    error = cudaGetLastError();                // catch launch-configuration errors
    error = cudaDeviceSynchronize();           // kernel launch is async: wait before copying

    CuMat::Ptr cuSrcHost = (CuMat::Ptr) malloc(sizeof(CuMat));
    // OK: cuSrc came from cudaMalloc, so the runtime can copy the object
    // (including the value of its `data` pointer) back to the host.
    error = cudaMemcpy(cuSrcHost, cuSrc, sizeof(CuMat), cudaMemcpyDeviceToHost);

    // FAILS by design: cuSrcHost->data points into the device heap created by
    // in-kernel malloc(), and that heap is not accessible through cudaMemcpy
    // (CUDA Programming Guide, "Dynamic Global Memory Allocation and Operations").
    // Also note the direction: dst is on the device and src on the host, so the
    // kind must be cudaMemcpyHostToDevice (the original used DeviceToHost).
    // Workaround: allocate `data` with cudaMalloc from the host, or perform the
    // copy inside a kernel.
    error = cudaMemcpy(cuSrcHost->data, test, 10, cudaMemcpyHostToDevice);
    (void)error;

    // release everything we allocated (the original leaked all three)
    free(test);
    free(cuSrcHost);
    cudaFree(cuSrc);
}


// Kernel that placement-new constructs a CuMat in pre-allocated device storage,
// running the __device__ constructor (which allocates from the device heap).
// Intended to be launched as <<<1, 1>>>; with more threads every thread would
// re-construct the same object and leak the previous heap allocation.
__global__ void initCudaMat(CuMat* mat) {
    new (mat) CuMat();
}

The problem seems to be this line:
error = cudaMemcpy(cuSrcHost->data, test, 10, cudaMemcpyDeviceToHost);

but I don’t understand the cause of the problem.
It’s true that cuSrcHost is on the host, but cuSrcHost->data contains a pointer that resides in device heap memory (as described in section B.18, "Dynamic Global Memory Allocation and Operations", of the CUDA C Programming Guide).

Or am I wrong?

Which is my mistake?

Thanks to all!

Maybe this is my problem?
So heap memory allocated in device code is not accessible through the cudaMemcpy(…) API?
Why isn’t it compatible?


Thanks for the answer!

But I don’t understand: why does this limit exist?