I am getting an invalid argument error from cudaMemcpy after calling cudaMalloc on the device side. What I can't understand is why it works when I move the cudaMalloc from device to host. It probably makes more sense to move the initialization up to the host anyway, but at the moment I'm only trying to get the ported code working.
The essence of the code is below. Note that I'm checking every return code, and cuda-memcheck does not report an error until I hit cudaMemcpy. The contents of the struct appear to be correct after returning from the kernel (which I can easily verify in Visual Studio).
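For reference, the per-call checks are just a thin wrapper along these lines (CHECK is my own macro; this is a simplified sketch of it, not the full version):

#include <cstdio>
#include <cuda_runtime.h>

// simplified sketch of the check wrapped around every CUDA runtime call
#define CHECK(call)                                                   \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess)                                      \
            printf("CUDA error %s at %s:%d\n",                        \
                   cudaGetErrorString(err_), __FILE__, __LINE__);     \
    } while (0)

// usage: CHECK(cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice));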
struct R
{
float *v;
// additional members
};
__global__ void kernel(struct R *r, int len)
{
// initialize members of r and allocate memory
cudaMalloc((void**)&r->v, sizeof(float)*len);
}
void init(float *vin, int len)
{
struct R *d_r, *h_r;
size_t available, total;
cudaMemGetInfo(&available, &total);
// so device can allocate more than 8MB
cudaDeviceSetLimit(cudaLimitMallocHeapSize, available-200000000);
cudaHostAlloc((void **)&h_r, sizeof(struct R), cudaHostAllocMapped);
cudaHostGetDevicePointer(&d_r, h_r, 0); // not really necessary due to UVA
kernel<<<1, 1>>>(d_r, len);
cudaDeviceSynchronize();
// uncomment following line to make this work
//cudaMalloc((void**)&h_r->v, sizeof(float)*len);
// now d_r->v == h_r->v == r->v (in kernel) but the next line fails???
cudaMemcpy((void *)h_r->v, vin, sizeof(float)*len, cudaMemcpyHostToDevice);
}
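For completeness, this is roughly the host-side variant I expect to move to anyway (the kernel would then only initialize the non-pointer members; names match the listing above, and init_host is just a placeholder name):

// hypothetical host-side variant of init (sketch only)
void init_host(float *vin, int len)
{
    struct R *d_r, *h_r;
    CHECK(cudaHostAlloc((void **)&h_r, sizeof(struct R), cudaHostAllocMapped));
    CHECK(cudaHostGetDevicePointer((void **)&d_r, h_r, 0));
    // allocate v from the host runtime instead of inside the kernel
    CHECK(cudaMalloc((void **)&h_r->v, sizeof(float) * len));
    kernel<<<1, 1>>>(d_r, len);   // kernel would no longer allocate r->v
    CHECK(cudaDeviceSynchronize());
    // this copy works for me when v was allocated on the host side
    CHECK(cudaMemcpy(h_r->v, vin, sizeof(float) * len, cudaMemcpyHostToDevice));
}

In that arrangement the cudaMemcpy destination is an ordinary device pointer from the host runtime, which is the case that works, so I'd still like to understand why the device-allocated pointer is rejected.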