pMyType[i].var = (uint4*)pPTR_D[i];
pPTR_D[i] will cause an access violation, as it’s a device ptr. I’ve been messing around with this same thing for a little bit now.
struct Object
{
int numberOfPoints;
float3* points;
};
struct Group
{
int numberOfObjects;
Object* objects;
};
__global__ void
kernel(Group* group, float3* points_0, float3* points_1)
{
int index;
for(index = 0; index < group->objects[0].numberOfPoints; index++)
points_0[index] = group->objects[0].points[index];
for(index = 0; index < group->objects[1].numberOfPoints; index++)
points_1[index] = group->objects[1].points[index];
}
void
runTest( int argc, char** argv)
{
int ret, index;
CUT_DEVICE_INIT();
// sample case
Group group;
group.numberOfObjects = 2;
group.objects = (Object*)malloc(sizeof(Object) * group.numberOfObjects);
group.objects[0].numberOfPoints = 2;
group.objects[0].points = (float3*)malloc(sizeof(float3) * group.objects[0].numberOfPoints);
group.objects[0].points[0].x = 0; group.objects[0].points[0].y = 1; group.objects[0].points[0].z = 0;
group.objects[0].points[1].x = 1; group.objects[0].points[1].y = 1; group.objects[0].points[1].z = 0;
group.objects[1].numberOfPoints = 1;
group.objects[1].points = (float3*)malloc(sizeof(float3) * group.objects[1].numberOfPoints);
group.objects[1].points[0].x = 0; group.objects[1].points[0].y = 0; group.objects[1].points[0].z = 1;
// start the transfers
Group* p_device_group;
ret = cudaMalloc((void**) &p_device_group, sizeof(Group));
printf("p_device_group cudaMalloc: %i\n", ret);
printf("p_devuce_group ptr: device: %p | host: %p\n", p_device_group, &p_device_group);
// copy Group to device
ret = cudaMemcpy(p_device_group, &group, sizeof(Group), cudaMemcpyHostToDevice);
printf("p_device_group cudaMemcpy: %i\n", ret);
float3* d_pnt_0, *d_pnt_1, *h_pnt_0, *h_pnt_1;
ret = cudaMalloc((void**) &d_pnt_0, sizeof(float3) * group.objects[0].numberOfPoints);
printf("d_pnt0 cudaMalloc: %i\n", ret);
ret = cudaMalloc((void**) &d_pnt_1, sizeof(float3) * group.objects[1].numberOfPoints);
printf("d_pnt1 cudaMalloc: %i\n", ret);
h_pnt_0 = (float3*)malloc(sizeof(float3) * group.objects[0].numberOfPoints);
h_pnt_1 = (float3*)malloc(sizeof(float3) * group.objects[1].numberOfPoints);
dim3 grid(1,0,0);
dim3 threads(1,0,0);
kernel<<< grid, threads >>>(p_device_group, d_pnt_0, d_pnt_1);
// copy from device to host (illustrates the host pointers aren't copied)
ret = cudaMemcpy(h_pnt_0, d_pnt_0, sizeof(float3) * group.objects[0].numberOfPoints, cudaMemcpyDeviceToHost);
printf("h_pnt_0 cudaMemcpy: %i\n", ret);
ret = cudaMemcpy(h_pnt_1, d_pnt_1, sizeof(float3) * group.objects[0].numberOfPoints, cudaMemcpyDeviceToHost);
printf("h_pnt_1 cudaMemcpy: %i\n", ret);
for(index = 0; index < group.objects[0].numberOfPoints; index++)
printf("h_pnt_0: %f, %f, %f\n", h_pnt_0[index].x, h_pnt_0[index].y, h_pnt_0[index].z);
for(index = 0; index < group.objects[1].numberOfPoints; index++)
printf("h_pnt_1: %f, %f, %f\n", h_pnt_1[index].x, h_pnt_1[index].y, h_pnt_1[index].z);
// attempt to replace the host ptrs with device ptrs
// Group::objects is +0x04
void* offset = p_device_group + 0x04;
ret = cudaMalloc((void**)&offset, sizeof(Object) * group.numberOfObjects);
printf("p_device_group->objects cudaMalloc: %i\n", ret);
// now, get the device ptr for p_device_group->objects
Object* p_device_objects = 0;
ret = cudaMemcpy(&p_device_objects, offset, sizeof(Object*), cudaMemcpyDeviceToHost);
printf("p_device_objets cudaMemcpy: %i\n", ret);
printf("p_device_objects ptr: device: %p | host: %p\n\n", p_device_objects, &p_device_objects);
// stopping here, because device ptr is null...
cudaFree(d_pnt_0);
cudaFree(d_pnt_1);
cudaFree(p_device_objects);
cudaFree(p_device_group);
}
output:
p_device_group cudaMalloc: 0
p_device_group ptr: device: 11000E00 | host: 0012FDD0
p_device_group cudaMemcpy: 0
d_pnt0 cudaMalloc: 0
d_pnt1 cudaMalloc: 0
h_pnt_0 cudaMemcpy: 0
h_pnt_1 cudaMemcpy: 0
h_pnt_0: -1.#QNAN0, -1.999985, -5192289121409582100000000000000000.000000
h_pnt_0: -0.000030, -562949248778240.000000, -0.000000
h_pnt_1: -2492281892774463600000000000000000000.000000, -37154775710159392000000
000000000000000.000000, -7377763840.000000
p_device_group->objects cudaMalloc: 0
p_device_objets cudaMemcpy: 0
p_device_objects ptr: device: 00000000 | host: 0012FDDC
I can’t remember it fully at the moment, but I think I had a non-null ptr and tried the following:
Object* p_object = (Object*)(p_device_objects + sizeof(Object) * index);
which returns invalid device pointer (17) from cudaMemcpy and cudaMalloc…
Hopefully, I’m missing something simple that would solve all this.