Complex data structures

Hi, I would like to know if it is possible to use a struct with a pointer in one of its members in a kernel. For example:

struct my_type_t {
uint4 * var;
global void kernel(my_type_t * input) {

void invokeKernel() {
//I can create the structure my_type_t in CPU with:
my_type_t * big_structure = new my_type_t[10];

//populate the structure
big_structure[0].var = make_uint4( values ... );

// But how would I cudaMalloc() this structure into the GPU ?


I didn’t find any examples with this kind of structure in the samples, so I’m thinking that I should split my structure into simpler types. It would be so much easier with my complex struct, so I’m still looking for solution (if any) for this.

Any suggestions?

Hmm. Don’t know whether I understood that right.

But I think you can do it maybe that way:

#include <stdio.h>


struct my_type_t {

   uint4 *var;


__global__ void kernel(my_type_t * input);

__host__ int main(void)


	void *pMem_D;

	cudaMalloc(&pMem_D, NUMBER_OF_STRUCTURES*sizeof(my_type_t));


	struct my_type_t *pMyType = (struct my_type_t*) malloc(NUMBER_OF_STRUCTURES*sizeof(my_type_t));

	for(int i=0; i<NUMBER_OF_STRUCTURES; i++) {

  cudaMalloc(&(pPTR_D[i]), sizeof(uint4));

  pMyType[i].var = (uint4*)pPTR_D[i];


	cudaMemcpy((void*)pMem_D, (const void*)pMyType, NUMBER_OF_STRUCTURES*sizeof(my_type_t), cudaMemcpyHostToDevice);




	for(int i=0; i<NUMBER_OF_STRUCTURES; i++)



	return 0;


__global__ void kernel(my_type_t * input) {

	my_type_t *big_structure = input;

	*(big_structure[0].var) = make_uint4(1,2,3,4);

 pMyType[i].var = (uint4*)pPTR_D[i];

pPTR_D[i] will cause an access violation, as it’s a device ptr. I’ve been messing around with this same thing for a little bit now.

struct Object


	int numberOfPoints;

	float3* points;


struct Group


	int numberOfObjects;

	Object* objects;


__global__ void

kernel(Group* group, float3* points_0, float3* points_1)


	int index;

	for(index = 0; index < group->objects[0].numberOfPoints; index++)

  points_0[index] = group->objects[0].points[index];

	for(index = 0; index < group->objects[1].numberOfPoints; index++)

  points_1[index] = group->objects[1].points[index];



runTest( int argc, char** argv) 


	int ret, index;


	// sample case

	Group group;

	group.numberOfObjects = 2;

	group.objects = (Object*)malloc(sizeof(Object) * group.numberOfObjects);

	group.objects[0].numberOfPoints = 2;

	group.objects[0].points = (float3*)malloc(sizeof(float3) * group.objects[0].numberOfPoints);

	group.objects[0].points[0].x = 0; group.objects[0].points[0].y = 1; group.objects[0].points[0].z = 0;

	group.objects[0].points[1].x = 1; group.objects[0].points[1].y = 1; group.objects[0].points[1].z = 0;

	group.objects[1].numberOfPoints = 1;

	group.objects[1].points = (float3*)malloc(sizeof(float3) * group.objects[1].numberOfPoints);

	group.objects[1].points[0].x = 0; group.objects[1].points[0].y = 0; group.objects[1].points[0].z = 1;

	// start the transfers

	Group* p_device_group;

	ret = cudaMalloc((void**) &p_device_group, sizeof(Group));

	printf("p_device_group cudaMalloc: %i\n", ret);

	printf("p_devuce_group ptr: device: %p | host: %p\n", p_device_group, &p_device_group);

	// copy Group to device

	ret = cudaMemcpy(p_device_group, &group, sizeof(Group), cudaMemcpyHostToDevice);

	printf("p_device_group cudaMemcpy: %i\n", ret);

	float3* d_pnt_0, *d_pnt_1, *h_pnt_0, *h_pnt_1;

	ret = cudaMalloc((void**) &d_pnt_0, sizeof(float3) * group.objects[0].numberOfPoints);

	printf("d_pnt0 cudaMalloc: %i\n", ret);

	ret = cudaMalloc((void**) &d_pnt_1, sizeof(float3) * group.objects[1].numberOfPoints);

	printf("d_pnt1 cudaMalloc: %i\n", ret);

	h_pnt_0 = (float3*)malloc(sizeof(float3) * group.objects[0].numberOfPoints);

	h_pnt_1 = (float3*)malloc(sizeof(float3) * group.objects[1].numberOfPoints);

	dim3 grid(1,0,0);

	dim3 threads(1,0,0);

	kernel<<< grid, threads >>>(p_device_group, d_pnt_0, d_pnt_1);

	// copy from device to host (illustrates the host pointers aren't copied)

	ret = cudaMemcpy(h_pnt_0, d_pnt_0, sizeof(float3) * group.objects[0].numberOfPoints, cudaMemcpyDeviceToHost);

	printf("h_pnt_0 cudaMemcpy: %i\n", ret);

	ret = cudaMemcpy(h_pnt_1, d_pnt_1, sizeof(float3) * group.objects[0].numberOfPoints, cudaMemcpyDeviceToHost);

	printf("h_pnt_1 cudaMemcpy: %i\n", ret);

	for(index = 0; index < group.objects[0].numberOfPoints; index++)

  printf("h_pnt_0: %f, %f, %f\n", h_pnt_0[index].x, h_pnt_0[index].y, h_pnt_0[index].z);

	for(index = 0; index < group.objects[1].numberOfPoints; index++)

  printf("h_pnt_1: %f, %f, %f\n", h_pnt_1[index].x, h_pnt_1[index].y, h_pnt_1[index].z);

	// attempt to replace the host ptrs with device ptrs

	// Group::objects is +0x04

	void* offset = p_device_group + 0x04;

	ret = cudaMalloc((void**)&offset, sizeof(Object) * group.numberOfObjects);

	printf("p_device_group->objects cudaMalloc: %i\n", ret);

	// now, get the device ptr for p_device_group->objects

	Object* p_device_objects = 0;

	ret = cudaMemcpy(&p_device_objects, offset, sizeof(Object*), cudaMemcpyDeviceToHost);

	printf("p_device_objets cudaMemcpy: %i\n", ret);

	printf("p_device_objects ptr: device: %p | host: %p\n\n", p_device_objects, &p_device_objects);

// stopping here, because device ptr is null...







p_device_group cudaMalloc: 0

p_device_group ptr: device: 11000E00 | host: 0012FDD0

p_device_group cudaMemcpy: 0

d_pnt0 cudaMalloc: 0

d_pnt1 cudaMalloc: 0

h_pnt_0 cudaMemcpy: 0

h_pnt_1 cudaMemcpy: 0

h_pnt_0: -1.#QNAN0, -1.999985, -5192289121409582100000000000000000.000000

h_pnt_0: -0.000030, -562949248778240.000000, -0.000000

h_pnt_1: -2492281892774463600000000000000000000.000000, -37154775710159392000000

000000000000000.000000, -7377763840.000000

p_device_group->objects cudaMalloc: 0

p_device_objets cudaMemcpy: 0

p_device_objects ptr: device: 00000000 | host: 0012FDDC

I can’t remember it fully at the moment, but I think I had a non-null ptr and tried the following:

Object* p_object = (Object*)(p_device_objects + sizeof(Object) * index);

which returns invalid device pointer (17) from cudaMemcpy and cudaMalloc…

Hopefully, I’m missing something simple that would solve all this.

This code creates a local array of void* pointers on the host side.
This line of code should not cause an access violation because pPtr_D[i] is not being deferenced.
pMyType[i].var = (uint4