CUDA class - allocate memory using malloc (Dynamic Global Memory Allocation and Operations)

Hi to everybody,

I have a problem using malloc inside device code and copy data from host memory.
I start with code

#include <cuda.h>
#include <cuda_runtime_api.h>

// A tiny device-side matrix-like object whose buffer is allocated from the
// in-kernel malloc heap. NOTE: pointers returned by device-side malloc live
// in the device heap, which the host cudaMemcpy*/cudaMemset APIs CANNOT
// address — such buffers are only reachable from device code.
class CuMat {
public:
    typedef CuMat* Ptr;
    char* data;   // device-heap buffer, set by the __device__ constructor

    // Device-side constructor: allocates sizeByte bytes from the in-kernel
    // malloc heap and fills them with 1s. Must run in device code (e.g. via
    // placement new inside a kernel).
    __device__ CuMat() {
        const unsigned int sizeByte = 10;
        // Cast to char* (was unsigned char*, which does not convert to the
        // char* member and fails to compile).
        data = (char*) malloc(sizeByte);
        if (data != NULL) {
            // %p is the portable pointer format; the original (int) cast
            // truncates 64-bit device pointers. %u matches unsigned int.
            printf("allocated %u bytes at position %p\n", sizeByte, (void*)data);
            memset(data, 1, sizeByte);
        } else {
            // Device malloc can fail if the device heap
            // (cudaLimitMallocHeapSize) is exhausted.
            printf("device malloc of %u bytes failed\n", sizeByte);
        }
    }

    // Release the device-heap buffer. Memory obtained from in-kernel malloc
    // must be released with in-kernel free() (the original leaked it).
    __device__ ~CuMat() {
        free(data);
    }
};   // class definitions require a terminating semicolon (missing in original)

// from Host

// Demonstrates constructing a CuMat on the device and exchanging data with
// the host.
//
// Key point: a pointer obtained from malloc() *inside a kernel* lives in the
// device-side malloc heap, which is NOT addressable by cudaMemcpy — that is
// why the original "FAIL" line returned an error. To move bytes between the
// host and a device buffer, the buffer must come from cudaMalloc (or the
// data must be staged by a kernel).
//
// NOTE(review): initCudaMat is defined below this function in the original
// file; a forward declaration of the kernel is needed above this point for
// the file to compile.
void testFuction() {                       // was 'void testFuction {' — missing ()

    char* test = (char*) malloc(10);       // host staging buffer
    memset(test, 0, 10);

    CuMat::Ptr cuSrc = NULL;
    cudaError_t error = cudaMalloc(&cuSrc, sizeof(CuMat)); // raw device storage for the object
    if (error != cudaSuccess) {
        fprintf(stderr, "cudaMalloc: %s\n", cudaGetErrorString(error));
        free(test);
        return;
    }

    initCudaMat<<< 1 , 1 >>>(cuSrc);       // placement-new the object on the device
    error = cudaGetLastError();            // catches launch-configuration errors
    if (error != cudaSuccess)
        fprintf(stderr, "launch: %s\n", cudaGetErrorString(error));

    cudaDeviceSynchronize();               // wait for the kernel; surfaces execution errors

    // Copying the *object itself* works: it is plain bytes in global memory.
    CuMat::Ptr cuSrcHost = (CuMat::Ptr) malloc(sizeof(CuMat));
    error = cudaMemcpy(cuSrcHost, cuSrc, sizeof(CuMat), cudaMemcpyDeviceToHost); // OK

    // The original code then tried:
    //     cudaMemcpy(cuSrcHost->data, test, 10, cudaMemcpyDeviceToHost); // FAIL
    // It fails because cuSrcHost->data points into the device malloc heap,
    // which the cudaMemcpy API cannot access (see "Dynamic Global Memory
    // Allocation and Operations" in the CUDA C Programming Guide).
    //
    // Working pattern: allocate the buffer with cudaMalloc and patch the
    // object's pointer field so device code sees the new buffer too.
    char* devBuf = NULL;
    error = cudaMalloc(&devBuf, 10);
    if (error == cudaSuccess) {
        // host -> device buffer: legal, devBuf came from cudaMalloc
        error = cudaMemcpy(devBuf, test, 10, cudaMemcpyHostToDevice);
        // overwrite the object's 'data' field on the device with devBuf
        error = cudaMemcpy(&cuSrc->data, &devBuf, sizeof(char*), cudaMemcpyHostToDevice);
    }

    // Release everything the original leaked. (The 10-byte buffer the
    // constructor allocated from the device heap can only be freed by
    // device-side free(), e.g. a kernel invoking the destructor.)
    cudaFree(devBuf);
    cudaFree(cuSrc);
    free(cuSrcHost);
    free(test);
}

// Constructs a CuMat in pre-allocated device global memory via placement
// new, so the __device__ constructor (which calls in-kernel malloc) runs on
// the device. 'mat' must point to at least sizeof(CuMat) bytes obtained from
// cudaMalloc. Intended to be launched as <<<1,1>>> — with more threads every
// thread would re-construct the same object and leak device-heap buffers.
// NOTE(review): this kernel is called by testFuction, which appears earlier
// in the file; a forward declaration is needed before that call site.
__global__ void initCudaMat(CuMat* mat) {
    new (mat) CuMat();
}

The problem seems to be this line:
error = cudaMemcpy(cuSrcHost->data, test, 10, cudaMemcpyDeviceToHost);

but I don’t understand the cause of the problem.
It’s true that cuSrcHost is on the host, but cuSrcHost->data contains a pointer that resides in device heap memory (as described in B.18, “Dynamic Global Memory Allocation and Operations”, of http://docs.nvidia.com/cuda/pdf/CUDA_C_Programming_Guide.pdf).

Or am I wrong?

Which is my mistake?

Thanks to all!

Maybe this is my problem? http://stackoverflow.com/a/13043240
So, heap memory allocated in device code is not accessible through the cudaMemcpy(…) API?
Why isn’t it compatible?

Correct.

Thanks for the answer!

But I don’t understand: why does this limitation exist?