Invalid Argument after calling cudaMalloc on device but not host

I am getting an invalid argument from cudaMemcpy after calling cudaMalloc on the device side. What I can’t understand is why it works when I move cudaMalloc from device to host. It probably makes more sense to move the initialization up to host anyway, but at the moment I’m only trying to get the ported code working.

The essence of the code is below. Note that I’m checking every return code and cuda-memcheck does not report an error until I hit cudaMemcpy. The contents of the struct appear to be correct after returning from kernel (which I can easily verify in Visual Studio).

struct R
    float *v;
    // additional members

__global__ void kernel(struct R *r, int len)
    // initialize members of r and allocate memory
    cudaMalloc((void**)&r->v, sizeof(float)*len);

void init(float *vin, int len)
    struct *d_r, *h_r;
    size_t available, total;
    cudaMemGetInfo(&available, &total);
    // so device can allocate more than 8MB
    cudaDeviceSetLimit(cudaLimitMallocHeapSize, available-200000000);
    cudaHostAlloc((void **)&h_r, sizeof(struct R), cudaHostAllocMapped);
    cudaHostGetDevicePointer(&d_r, h_r, 0);  // not really necessary due to UVA
    kernel<< <1, 1 >> >(d_r,len);

    // uncomment following line to make this work
    //cudaMalloc((void**)&r->v, sizeof(float)*len);

    // now d_r->v == h_r->v == r->v (in kernel) but the next line fails???
    cudaMemcpy((void *)h_r->v, vin, sizeof(float)*len, cudaMemcpyHostToDevice);

pointers allocated via in-kernel new, malloc, or cudaMalloc cannot participate in the host API calls like cudaMemcpy or cudaFree.