cudaMemcpy returns "invalid argument" for in-kernel malloc-ed memory

I’m trying to cudaMemcpy into memory that was allocated on the device via in-kernel malloc,
but cudaMemcpy fails with “invalid argument”. Any ideas why this is happening?
Using CUDA 5.0, Tesla 2075, driver 310.19, nvcc --gpu-architecture=sm_21

Code sample:

// Device-global pointer: kernel() stores the result of its in-kernel malloc
// here, and main() reads the stored value back to the host via
// cudaGetSymbolAddress + cudaMemcpy.
__device__ void* ptr;

// Requests 1024 bytes from in-kernel malloc, publishes the returned address in
// the device global `ptr`, and asserts on the device that it is non-null.
// main() launches this as <<<1,1>>>, i.e. with a single thread.
__global__ void kernel()
{
        void* const heap_block = malloc(1024);

        ptr = heap_block;
        assert(ptr);
}

// Evaluates the CUDA runtime call `x` exactly once. If the returned status is
// not cudaSuccess, prints the source location, the failing expression and the
// runtime's error string to stderr, then trips assert(0).
//
// The do { ... } while (0) wrapper makes `CUDA_CALL(x);` a single statement, so
// it is safe inside an unbraced if/else (the previous bare `if (...) assert(0);`
// form expanded to two statements when followed by a semicolon).
//
// NOTE: assert(0) is compiled out when NDEBUG is defined; in such builds the
// failure is still reported on stderr but execution continues.
#define CUDA_CALL(x)                                                          \
        do {                                                                  \
                cudaError_t cuda_call_status_ = (x);                          \
                if (cuda_call_status_ != cudaSuccess) {                       \
                        fprintf(stderr, "%s:%d: %s failed: %s\n",             \
                                __FILE__, __LINE__, #x,                       \
                                cudaGetErrorString(cuda_call_status_));       \
                        assert(0);                                            \
                }                                                             \
        } while (0)
// Reproduction driver:
//   1. sets the device malloc heap limit to 1 MiB and launches kernel(), which
//      allocates with in-kernel malloc and stores the address in `ptr`;
//   2. reads the stored address back to the host;
//   3. queries the pointer's attributes and prints whether it is host or device;
//   4. attempts a 1-byte host-to-device cudaMemcpy to that address and prints
//      the error string if the copy fails.
int main(){

        CUDA_CALL(cudaDeviceSetLimit(cudaLimitMallocHeapSize,1024*1024));
        kernel<<<1,1>>>();
        // A kernel launch returns no status of its own: check for launch errors
        // explicitly, then synchronize to surface errors raised while it ran
        // (including the device-side assert).
        CUDA_CALL(cudaGetLastError());
        CUDA_CALL(cudaDeviceSynchronize());
        void* d_ptr;          // device address of the symbol `ptr` itself
        void* d_ptr_content;  // value stored in `ptr`, i.e. the malloc-ed address

        CUDA_CALL(cudaGetSymbolAddress(&d_ptr,ptr));
        CUDA_CALL(cudaMemcpy(&d_ptr_content,d_ptr,sizeof(void*),cudaMemcpyDeviceToHost));

        char data[]="asdfwertasdf";

        cudaPointerAttributes attributes;
        CUDA_CALL(cudaPointerGetAttributes(&attributes, d_ptr_content));
        fprintf(stderr,"attr: %s\n", attributes.memoryType==cudaMemoryTypeHost?"host":"device");
        // this call prints "device"

        // Deliberately not wrapped in CUDA_CALL: the status is inspected below so
        // that the error string can be printed.
        cudaError_t t=cudaMemcpy(d_ptr_content,data,1,cudaMemcpyHostToDevice);
        // this call always fails
        // NOTE(review): the CUDA C Programming Guide states that memory obtained
        // from device-side malloc() cannot be used in runtime API calls such as
        // cudaMemcpy, which would explain the "invalid argument" result. The
        // usual workaround is to stage the data in a cudaMalloc-ed buffer and
        // copy it into the heap block from inside a kernel -- confirm against
        // the guide for the CUDA version in use.
        if (t!=cudaSuccess) fprintf(stderr,"error: %s\n", cudaGetErrorString(t));

}