cudaMemcpy to device-allocated memory (via malloc) fails with "invalid argument"

Hello,
I am allocating a chunk of memory on the device by using malloc from inside a kernel/device function.
But when I copy to that memory using cudaMemcpy, it fails.
To ease porting of existing code, it would be nicer if I could allocate memory via malloc and still copy to it via cudaMemcpy.
Here is the reproducer:
#include <stdio.h>

// Evaluates a CUDA runtime call exactly once. If the returned status is not
// cudaSuccess, prints "<file>#<line>: <error string>" to stderr and terminates
// the process, using the status value as the exit code.
// Wrapped in do { ... } while (0) so it behaves as a single statement.
#define dugCudaCheck(result)                                                  \
    do {                                                                      \
        cudaError_t status = (result);                                        \
        if (status != cudaSuccess) {                                          \
            fprintf(stderr, "%s#%d: %s\n", __FILE__, __LINE__,                \
                    cudaGetErrorString(status));                              \
            exit(status);                                                     \
        }                                                                     \
    } while (0)

// Kernel: every launched thread prints a greeting with device-side printf.
// Takes no arguments and writes no memory.
__global__ void cuda_hello() {
    printf("Hello World from GPU\n");
}

// Kernel: allocates a single char with in-kernel malloc, stores 'd' in it, and
// publishes the resulting pointer value through addr[0] as an unsigned long.
//
// addr: device-accessible buffer with room for at least one unsigned long.
//       Every executing thread writes addr[0], so this is meant to be
//       launched with a single thread.
//
// If the in-kernel malloc fails, addr[0] is set to 0 and nothing is
// dereferenced. The allocation is never freed by this kernel.
// NOTE(review): storing the pointer in an unsigned long assumes that type is
// as wide as a device pointer on the target platform — confirm.
__global__ void device_alloc(unsigned long* addr) {
    char* a = (char*) malloc(1 * sizeof(char));
    if (a != NULL) {
        *a = 'd';
    }
    addr[0] = (unsigned long) a;
}

// Kernel: reinterprets the integer `addr` as a pointer to char and prints the
// character stored there with device-side printf.
//
// addr: a device-dereferenceable address carried as an unsigned long; the
//       caller is responsible for it pointing at a readable char.
__global__ void print_data(unsigned long addr) {
    const char* c = (const char*) addr;
    printf("print_data: char is %c\n", *c);
}

// Kernel: prints the character that `c` points to with device-side printf.
//
// c: pointer that must be dereferenceable from device code.
__global__ void print_data_by_pointer(char* c) {
    printf("print_data_by_pointer: char is %c\n", *c);
}

// Reproducer driver:
//   1. launches a hello-world kernel,
//   2. has a kernel allocate one char with in-kernel malloc and hand its
//      address back through a cudaMalloc'ed unsigned long,
//   3. shows that the address is readable from subsequent kernels,
//   4. compares cudaMemcpy from a cudaMalloc'ed pointer with cudaMemcpy from
//      the in-kernel malloc'ed pointer.
// Every runtime call and kernel launch is checked with dugCudaCheck, which
// exits the process on the first failure.
int main() {
    dugCudaCheck(cudaSetDevice(0));

    cuda_hello<<<1,1>>>();
    dugCudaCheck(cudaGetLastError());        // launch-configuration errors
    dugCudaCheck(cudaDeviceSynchronize());   // errors raised while the kernel ran

    unsigned long* addr;
    dugCudaCheck(cudaMalloc((void**)&addr, sizeof(unsigned long)));

    device_alloc<<<1,1>>>(addr);
    dugCudaCheck(cudaGetLastError());

    // Blocking copy on the default stream; it is issued after device_alloc,
    // so the address written by the kernel is what gets copied back.
    unsigned long addr_host[1];
    dugCudaCheck(cudaMemcpy(addr_host, addr,  sizeof(unsigned long), cudaMemcpyDeviceToHost));
    // %p requires a void* argument; passing the unsigned long directly is undefined.
    printf("CUDA addr = %p\n", (void*) addr_host[0]);
    if (addr_host[0] == 0) {
        // A zero address cannot be used by the kernels below.
        fprintf(stderr, "device-side malloc returned a null pointer\n");
        dugCudaCheck(cudaFree(addr));
        return 1;
    }

    printf("Lets check we can access this address on device and print data\n");
    print_data<<<1,1>>>(addr_host[0]);
    dugCudaCheck(cudaGetLastError());
    char* c_dev = (char*) addr_host[0];
    print_data_by_pointer<<<1,1>>>(c_dev);
    dugCudaCheck(cudaGetLastError());
    dugCudaCheck(cudaDeviceSynchronize());

    char c[1], c2[1];
    char* c_dev2;
    dugCudaCheck(cudaMalloc((void**)&c_dev2, sizeof(char)));
    printf("Trying to cudaMemcpy from pointer allocated from cudaMalloc\n");
    dugCudaCheck(cudaMemcpy(c2, c_dev2, sizeof(char), cudaMemcpyDeviceToHost));
    printf("SUCCESS\n");
    printf("Trying to cudaMemcpy from pointer allocated form malloc\n");
    // This is the call the reproducer exists to demonstrate: the post reports
    // it failing with "invalid argument", in which case dugCudaCheck exits
    // here and the cleanup below is not reached.
    // NOTE(review): presumably the runtime copy API does not accept pointers
    // from the device heap — confirm against the CUDA Programming Guide.
    dugCudaCheck(cudaMemcpy(c, c_dev, sizeof(char), cudaMemcpyDeviceToHost));
    printf("SUCCESS\n");

    dugCudaCheck(cudaFree(c_dev2));
    dugCudaCheck(cudaFree(addr));
    dugCudaCheck(cudaDeviceSynchronize());

    return 0;
}
After compiling it with `nvcc mmcp.cu` and running it, I get: `mmcp.cu#61: invalid argument`

I understand that this might be because cudaMemcpy can only operate on memory that is known to it (i.e. allocated by cudaMalloc). But is there a way to register the device memory range after the fact?
Regards.
Jacek Tomaka

Right, so this is the same issue as the Stack Overflow question "c++ - cudaMemcpy to host for device-allocated memory still not possible?".

Any workarounds?