cudaMalloc and cudaMemcpy for pointer in struct

I am working on creating a ray tracer that supports multiple triangular meshes and I am having issues with getting some of my data structures moved over to the GPU. Right now I am using tinyobjloader to load OBJ files with multiple meshes into my rendering program. Everything looks good CPU side, but when I try to move everything over to the GPU and run my kernel, NSight shows that I have quite a few memory access violations. Here is the offending data structure and variables of interest (sorry for being over verbose. I figure more details up front makes things clearer):

__device__ Sphere* dev_spheres;
__device__ Plane* dev_planes;
__device__ BoundingBox* dev_boundingVolumes;
__device__ Triangle** dev_triangles; //A sparse 2d array containing all the triangles that make up each mesh in the OBJ scene {ex. {Array of triangles for mesh 1}, {Array of triangles for mesh 2}, {Array of triangles for mesh 3}, ...}

__device__ RenderData* dev_renderInfo;

__device__ Camera dev_cam;
__device__ LightSource dev_light;

struct RenderData
	int sphereCount; 
	int planeCount;  
	int triangularMeshes; //How many triangular meshes are in the scene as described in the OBJ file
	int* trianglesPerMesh; //An array of size 'triangularMeshes' listing how many triangles are in each mesh (ex. {996, 1024, 673...})

Here are some of the memory allocations that I am performing:

//----These work and I am able to view the struct vars with NSight when a breakpoint is triggered:----
HANDLE_ERROR( cudaMemcpyToSymbol( dev_light, temp_l, sizeof(LightSource) ) );
HANDLE_ERROR( cudaMemcpyToSymbol( dev_cam, temp_c, sizeof(Camera) ) );

//----These are the ones giving me problems:----
//Malloc and memcpy spheres
HANDLE_ERROR( cudaMalloc((void**)&dev_spheres, sizeof(Sphere) * SPHERE_COUNT) );
HANDLE_ERROR( cudaMemcpy(dev_spheres, temp_s, sizeof(Sphere) * SPHERE_COUNT, cudaMemcpyHostToDevice) );
free( temp_s );

//Malloc and memcpy planes
HANDLE_ERROR( cudaMalloc((void**)&dev_planes, sizeof(Plane) * PLANE_COUNT) );
HANDLE_ERROR( cudaMemcpy(dev_planes, temp_p, sizeof(Plane) * PLANE_COUNT, cudaMemcpyHostToDevice) );
free( temp_p );

//Malloc and memcpy triangle
HANDLE_ERROR( cudaMalloc((void**) &dev_triangles, sizeof(Triangle*) * temp_data->triangularMeshes) );
for(int i = 0; i < temp_data->triangularMeshes; i++)
	char* offset = (((char*)dev_triangles) + (sizeof(Triangle*) * i));
	HANDLE_ERROR( cudaMalloc((void**) &offset, sizeof(Triangle) * temp_data->trianglesPerMesh[i]) );
	HANDLE_ERROR( cudaMemcpy(offset, temp_t[i], sizeof(Triangle) * temp_data->trianglesPerMesh[i], cudaMemcpyHostToDevice) );


//Malloc and memcpy render data
HANDLE_ERROR( cudaMalloc((void**) &dev_renderInfo, sizeof(RenderData)) );
char* planeOffset = (((char*)dev_renderInfo) + offsetof(RenderData, planeCount));
HANDLE_ERROR( cudaMemcpy(planeOffset, &temp_data->planeCount, sizeof(int), cudaMemcpyHostToDevice) );

char* sphereOffset = (((char*)dev_renderInfo) + offsetof(RenderData, sphereCount));
HANDLE_ERROR( cudaMemcpy(sphereOffset, &temp_data->sphereCount, sizeof(int), cudaMemcpyHostToDevice) );

char* triOffset = (((char*)dev_renderInfo) + offsetof(RenderData, triangularMeshes));
HANDLE_ERROR( cudaMemcpy(triOffset, &temp_data->triangularMeshes, sizeof(int), cudaMemcpyHostToDevice) );
char* triCountOffset = (((char*)dev_renderInfo) + offsetof(RenderData, trianglesPerMesh));
HANDLE_ERROR( cudaMalloc((void**) &(triCountOffset), sizeof(int) * temp_data->triangularMeshes) );
HANDLE_ERROR( cudaMemcpy(triCountOffset, temp_data->trianglesPerMesh, sizeof(int) * temp_data->triangularMeshes, cudaMemcpyHostToDevice) );


Some questions about my code:

I think my main problem is a lack of fulling understanding the cudaMemcpy and cudaMalloc calls and misusing them. I don’t think offseting the malloc call for the pointer in the struct is the right way of doing it, but I am not really sure the best way to do this. Also is device really needed for specifying device allocated variables? As a side note I am on using CUDA runtime 6.0. Should i be using unified memory instead of explicitly making all the memory allocations myself?

Thank you in advance for the help. It is much appreciated :).

Yes, you are misusing cudaMalloc. You cannot use cudaMalloc directly on a device variable:

HANDLE_ERROR( cudaMalloc((void**)&dev_spheres, sizeof(Sphere) * SPHERE_COUNT) );

It is illegal to take the address of a device variable (&dev_spheres) in host code. Furthermore, the pointer that you pass to cudaMalloc must point to a location on the host. You could do something like this if you really wanted to:

Sphere *h_dev_spheres;
HANDLE_ERROR( cudaMalloc((void**)&h_dev_spheres, sizeof(Sphere) * SPHERE_COUNT) );
HANDLE_ERROR( cudaMemcpyToSymbo(dev_spheres, &h_dev_spheres, sizeof(Sphere *) );

but I wouldn’t recommend it for multiple reasons relating to unnecessary complexity. For example, I cannot conveniently use cudaMemcpy or even cudaMemcpyToSymbol to copy to the intended allocation at that point. Instead I would have to use cudaMemcpy to h_dev_spheres, knowing that the data will end up in the same place as is pointed to by dev_spheres.

Use of device variables as storage for pointers is therefore in my opinion not that sensible. Instead I would convert all of those cases to ordinary dynamically allocated variables using cudaMalloc and cudaMemcpy, and pass those variables to kernels that need to refer to them as kernel parameters.

Thanks for the response! That definitely makes things quite a bit clearer. I’ll change up my mem allocations and see if that fixes things.

So I was able to fix all the other allocations, but I am still having issues creating the sparse 2d array.

Currently I have:

Triangle** h_temp_tri;
	HANDLE_ERROR( cudaMalloc((void**) &h_temp_tri, sizeof(Triangle*) * temp_data->triangularMeshes) );
	HANDLE_ERROR( cudaMemcpyToSymbol(dev_triangles, &h_temp_tri, sizeof(Triangle*) * temp_data->triangularMeshes) );

I have not yet allocated space for each individual mesh, but shouldn’t that allocate space for the given number of Triangle* pointers?

I am currently getting an invalid argument error at the memCpyToSymbol line.

Yes, it’s observably incorrect because this:


produces a triple level of indirection:

but you are casting it to a double pointer:

(void **)

So although that isn’t the crux of the issue, it indicates that the methodology is flawed because there should never be a need to do this with cudaMalloc.

In general, 2D arrays (i.e. arrays that could be accessed via double C subscripts on the host) can present some challenges. There are a variety of ways to handle it, but the simplest approach is usually to “flatten” the array to 1D and then, if desired, simulate 2D access where needed via pointer or index arithmetic.

I ended up flattening the array which made the whole thing much easier to work with. I only have one more issue which I can’t seem to work out. I am currently passing in the cudaMalloced triangles pointer, but would prefer to copy it to the global symbol. It makes it easier seeing as I use it in several places and would rather not pass it from function to function.

When uncommented, the cudaMemcpyToSymbol fails and the application crashes. Does the source arg have to be a double pointer? I figured it would just copy the address to the symbol.

__device__ Triangle* dev_triangles;

Triangle* d_temp_tri;
HANDLE_ERROR( cudaMalloc((void**) &d_temp_tri, sizeof(Triangle) * temp_data->totalTriangles) );	
//HANDLE_ERROR( cudaMemcpyToSymbol(dev_triangles, d_temp_tri, sizeof(Triangle*)) );
HANDLE_ERROR( cudaMemcpy(d_temp_tri, temp_t, sizeof(Triangle) * temp_data->totalTriangles, cudaMemcpyHostToDevice) );

Thanks for helping me with getting this up and running :)!

Change this:

HANDLE_ERROR( cudaMemcpyToSymbol(dev_triangles, d_temp_tri, sizeof(Triangle*)) );

to this:

HANDLE_ERROR( cudaMemcpyToSymbol(dev_triangles, &d_temp_tri, sizeof(Triangle*)) );

Your intention in this case is to pass the pointer value itself, instead of whatever the pointer points to. So you must pass a pointer-to-pointer to get cudaMemcpy to copy the proper thing.

Worked! Thanks :)!