I am working on creating a ray tracer that supports multiple triangular meshes and I am having issues with getting some of my data structures moved over to the GPU. Right now I am using tinyobjloader to load OBJ files with multiple meshes into my rendering program. Everything looks good CPU side, but when I try to move everything over to the GPU and run my kernel, NSight shows that I have quite a few memory access violations. Here is the offending data structure and variables of interest (sorry for being over verbose. I figure more details up front makes things clearer):
__device__ Sphere* dev_spheres;
__device__ Plane* dev_planes;
__device__ BoundingBox* dev_boundingVolumes;
__device__ Triangle** dev_triangles; //A sparse 2d array containing all the triangles that make up each mesh in the OBJ scene {ex. {Array of triangles for mesh 1}, {Array of triangles for mesh 2}, {Array of triangles for mesh 3}, ...}
__device__ RenderData* dev_renderInfo;
__device__ Camera dev_cam;
__device__ LightSource dev_light;
struct RenderData
{
int sphereCount;
int planeCount;
int triangularMeshes; //How many triangular meshes are in the scene as described in the OBJ file
int* trianglesPerMesh; //An array of size 'triangularMeshes' listing how many triangles are in each mesh (ex. {996, 1024, 673...})
};
Here are some of the memory allocations that I am performing:
//----These work and I am able to view the struct vars with NSight when a breakpoint is triggered:----
HANDLE_ERROR( cudaMemcpyToSymbol( dev_light, temp_l, sizeof(LightSource) ) );
HANDLE_ERROR( cudaMemcpyToSymbol( dev_cam, temp_c, sizeof(Camera) ) );
//----These are the ones giving me problems:----
//Malloc and memcpy spheres
HANDLE_ERROR( cudaMalloc((void**)&dev_spheres, sizeof(Sphere) * SPHERE_COUNT) );
HANDLE_ERROR( cudaMemcpy(dev_spheres, temp_s, sizeof(Sphere) * SPHERE_COUNT, cudaMemcpyHostToDevice) );
free( temp_s );
//Malloc and memcpy planes
HANDLE_ERROR( cudaMalloc((void**)&dev_planes, sizeof(Plane) * PLANE_COUNT) );
HANDLE_ERROR( cudaMemcpy(dev_planes, temp_p, sizeof(Plane) * PLANE_COUNT, cudaMemcpyHostToDevice) );
free( temp_p );
//Malloc and memcpy triangle
HANDLE_ERROR( cudaMalloc((void**) &dev_triangles, sizeof(Triangle*) * temp_data->triangularMeshes) );
for(int i = 0; i < temp_data->triangularMeshes; i++)
{
char* offset = (((char*)dev_triangles) + (sizeof(Triangle*) * i));
HANDLE_ERROR( cudaMalloc((void**) &offset, sizeof(Triangle) * temp_data->trianglesPerMesh[i]) );
HANDLE_ERROR( cudaMemcpy(offset, temp_t[i], sizeof(Triangle) * temp_data->trianglesPerMesh[i], cudaMemcpyHostToDevice) );
free(temp_t[i]);
}
free(temp_t);
//Malloc and memcpy render data
HANDLE_ERROR( cudaMalloc((void**) &dev_renderInfo, sizeof(RenderData)) );
char* planeOffset = (((char*)dev_renderInfo) + offsetof(RenderData, planeCount));
HANDLE_ERROR( cudaMemcpy(planeOffset, &temp_data->planeCount, sizeof(int), cudaMemcpyHostToDevice) );
char* sphereOffset = (((char*)dev_renderInfo) + offsetof(RenderData, sphereCount));
HANDLE_ERROR( cudaMemcpy(sphereOffset, &temp_data->sphereCount, sizeof(int), cudaMemcpyHostToDevice) );
char* triOffset = (((char*)dev_renderInfo) + offsetof(RenderData, triangularMeshes));
HANDLE_ERROR( cudaMemcpy(triOffset, &temp_data->triangularMeshes, sizeof(int), cudaMemcpyHostToDevice) );
char* triCountOffset = (((char*)dev_renderInfo) + offsetof(RenderData, trianglesPerMesh));
HANDLE_ERROR( cudaMalloc((void**) &(triCountOffset), sizeof(int) * temp_data->triangularMeshes) );
HANDLE_ERROR( cudaMemcpy(triCountOffset, temp_data->trianglesPerMesh, sizeof(int) * temp_data->triangularMeshes, cudaMemcpyHostToDevice) );
free(temp_data->trianglesPerMesh);
free(temp_data);
Some questions about my code:
I think my main problem is a lack of fulling understanding the cudaMemcpy and cudaMalloc calls and misusing them. I don’t think offseting the malloc call for the pointer in the struct is the right way of doing it, but I am not really sure the best way to do this. Also is device really needed for specifying device allocated variables? As a side note I am on using CUDA runtime 6.0. Should i be using unified memory instead of explicitly making all the memory allocations myself?
Thank you in advance for the help. It is much appreciated :).