Complex data structures

edisongustavo · April 8, 2008, 2:11pm

Hi, I would like to know if it is possible to use a struct with a pointer in one of its members in a kernel. For example:

struct my_type_t {
uint4 * var;
}
global void kernel(my_type_t * input) {
}

void invokeKernel() {
//I can create the structure my_type_t in CPU with:
my_type_t * big_structure = new my_type_t[10];

//populate the structure
big_structure[0].var = make_uint4( values ... );

// But how would I cudaMalloc() this structure into the GPU ?

}

I didn’t find any examples with this kind of structure in the samples, so I’m thinking that I should split my structure into simpler types. It would be so much easier with my complex struct, so I’m still looking for solution (if any) for this.

Any suggestions?

chubaca · April 8, 2008, 4:09pm

Hmm. Don’t know whether I understood that right.

But I think you can do it maybe that way:

#include <stdio.h>

#define NUMBER_OF_STRUCTURES 10

struct my_type_t {

   uint4 *var;

};

__global__ void kernel(my_type_t * input);

__host__ int main(void)

{

	void *pMem_D;

	cudaMalloc(&pMem_D, NUMBER_OF_STRUCTURES*sizeof(my_type_t));

	void *pPTR_D[NUMBER_OF_STRUCTURES];

	struct my_type_t *pMyType = (struct my_type_t*) malloc(NUMBER_OF_STRUCTURES*sizeof(my_type_t));

	for(int i=0; i<NUMBER_OF_STRUCTURES; i++) {

  cudaMalloc(&(pPTR_D[i]), sizeof(uint4));

  pMyType[i].var = (uint4*)pPTR_D[i];

	};

	cudaMemcpy((void*)pMem_D, (const void*)pMyType, NUMBER_OF_STRUCTURES*sizeof(my_type_t), cudaMemcpyHostToDevice);

	free(pMyType);

	kernel<<<1,1>>>((my_type_t*)pMem_D);

	cudaFree(pMem_D);

	for(int i=0; i<NUMBER_OF_STRUCTURES; i++)

  cudaFree(pPTR_D[i]);

  

	return 0;

};

__global__ void kernel(my_type_t * input) {

	my_type_t *big_structure = input;

	*(big_structure[0].var) = make_uint4(1,2,3,4);

};

MikeAnderson25 · April 22, 2008, 7:18pm

 pMyType[i].var = (uint4*)pPTR_D[i];

pPTR_D[i] will cause an access violation, as it’s a device ptr. I’ve been messing around with this same thing for a little bit now.

struct Object

{

	int numberOfPoints;

	float3* points;

};

struct Group

{

	int numberOfObjects;

	Object* objects;

};

__global__ void

kernel(Group* group, float3* points_0, float3* points_1)

{

	int index;

	for(index = 0; index < group->objects[0].numberOfPoints; index++)

  points_0[index] = group->objects[0].points[index];

	for(index = 0; index < group->objects[1].numberOfPoints; index++)

  points_1[index] = group->objects[1].points[index];

}

void

runTest( int argc, char** argv) 

{

	int ret, index;

	CUT_DEVICE_INIT();

	// sample case

	Group group;

	group.numberOfObjects = 2;

	group.objects = (Object*)malloc(sizeof(Object) * group.numberOfObjects);

	group.objects[0].numberOfPoints = 2;

	group.objects[0].points = (float3*)malloc(sizeof(float3) * group.objects[0].numberOfPoints);

	group.objects[0].points[0].x = 0; group.objects[0].points[0].y = 1; group.objects[0].points[0].z = 0;

	group.objects[0].points[1].x = 1; group.objects[0].points[1].y = 1; group.objects[0].points[1].z = 0;

	group.objects[1].numberOfPoints = 1;

	group.objects[1].points = (float3*)malloc(sizeof(float3) * group.objects[1].numberOfPoints);

	group.objects[1].points[0].x = 0; group.objects[1].points[0].y = 0; group.objects[1].points[0].z = 1;

	// start the transfers

	Group* p_device_group;

	ret = cudaMalloc((void**) &p_device_group, sizeof(Group));

	printf("p_device_group cudaMalloc: %i\n", ret);

	printf("p_devuce_group ptr: device: %p | host: %p\n", p_device_group, &p_device_group);

	// copy Group to device

	ret = cudaMemcpy(p_device_group, &group, sizeof(Group), cudaMemcpyHostToDevice);

	printf("p_device_group cudaMemcpy: %i\n", ret);

	float3* d_pnt_0, *d_pnt_1, *h_pnt_0, *h_pnt_1;

	ret = cudaMalloc((void**) &d_pnt_0, sizeof(float3) * group.objects[0].numberOfPoints);

	printf("d_pnt0 cudaMalloc: %i\n", ret);

	ret = cudaMalloc((void**) &d_pnt_1, sizeof(float3) * group.objects[1].numberOfPoints);

	printf("d_pnt1 cudaMalloc: %i\n", ret);

	h_pnt_0 = (float3*)malloc(sizeof(float3) * group.objects[0].numberOfPoints);

	h_pnt_1 = (float3*)malloc(sizeof(float3) * group.objects[1].numberOfPoints);

	dim3 grid(1,0,0);

	dim3 threads(1,0,0);

	kernel<<< grid, threads >>>(p_device_group, d_pnt_0, d_pnt_1);

	// copy from device to host (illustrates the host pointers aren't copied)

	ret = cudaMemcpy(h_pnt_0, d_pnt_0, sizeof(float3) * group.objects[0].numberOfPoints, cudaMemcpyDeviceToHost);

	printf("h_pnt_0 cudaMemcpy: %i\n", ret);

	ret = cudaMemcpy(h_pnt_1, d_pnt_1, sizeof(float3) * group.objects[0].numberOfPoints, cudaMemcpyDeviceToHost);

	printf("h_pnt_1 cudaMemcpy: %i\n", ret);

	for(index = 0; index < group.objects[0].numberOfPoints; index++)

  printf("h_pnt_0: %f, %f, %f\n", h_pnt_0[index].x, h_pnt_0[index].y, h_pnt_0[index].z);

	for(index = 0; index < group.objects[1].numberOfPoints; index++)

  printf("h_pnt_1: %f, %f, %f\n", h_pnt_1[index].x, h_pnt_1[index].y, h_pnt_1[index].z);

	// attempt to replace the host ptrs with device ptrs

	// Group::objects is +0x04

	void* offset = p_device_group + 0x04;

	ret = cudaMalloc((void**)&offset, sizeof(Object) * group.numberOfObjects);

	printf("p_device_group->objects cudaMalloc: %i\n", ret);

	// now, get the device ptr for p_device_group->objects

	Object* p_device_objects = 0;

	ret = cudaMemcpy(&p_device_objects, offset, sizeof(Object*), cudaMemcpyDeviceToHost);

	printf("p_device_objets cudaMemcpy: %i\n", ret);

	printf("p_device_objects ptr: device: %p | host: %p\n\n", p_device_objects, &p_device_objects);

// stopping here, because device ptr is null...

	cudaFree(d_pnt_0);

	cudaFree(d_pnt_1);

	cudaFree(p_device_objects);

	cudaFree(p_device_group);

}

output:

p_device_group cudaMalloc: 0

p_device_group ptr: device: 11000E00 | host: 0012FDD0

p_device_group cudaMemcpy: 0

d_pnt0 cudaMalloc: 0

d_pnt1 cudaMalloc: 0

h_pnt_0 cudaMemcpy: 0

h_pnt_1 cudaMemcpy: 0

h_pnt_0: -1.#QNAN0, -1.999985, -5192289121409582100000000000000000.000000

h_pnt_0: -0.000030, -562949248778240.000000, -0.000000

h_pnt_1: -2492281892774463600000000000000000000.000000, -37154775710159392000000

000000000000000.000000, -7377763840.000000

p_device_group->objects cudaMalloc: 0

p_device_objets cudaMemcpy: 0

p_device_objects ptr: device: 00000000 | host: 0012FDDC

I can’t remember it fully at the moment, but I think I had a non-null ptr and tried the following:

Object* p_object = (Object*)(p_device_objects + sizeof(Object) * index);

which returns invalid device pointer (17) from cudaMemcpy and cudaMalloc…

Hopefully, I’m missing something simple that would solve all this.

chris22 · April 22, 2008, 9:07pm

This code creates a local array of void* pointers on the host side.
void pPTR_D[NUMBER_OF_STRUCTURES];
This line of code should not cause an access violation because pPtr_D[i] is not being deferenced.
pMyType[i].var = (uint4)pPTR_D[i];

Topic		Replies	Views
How to pass large arguments in CUDA kernels Kernel arguments CUDA Programming and Performance	10	19040	December 18, 2009
Passing a structure with a pointer How do you pass a structure with a pointer in it to a kernel CUDA Programming and Performance	8	1237	March 22, 2011
How do you copy entire struct's with pointers? Problem with copying structures containing pointe CUDA Programming and Performance	6	9795	July 13, 2011
Transfering struct with pointers to device memory Used for variable argument list CUDA Programming and Performance	11	26989	January 19, 2011
Passing structures into CUDA kernels CUDA Programming and Performance	9	20330	November 19, 2020
Passing Structs to kernel CUDA Programming and Performance	4	6112	November 6, 2009
Another Device Memory Question CUDA Programming and Performance	7	2305	February 9, 2010
structs containing pointers CUDA Programming and Performance	1	773	January 14, 2013
Complex structs in CUDA CUDA Programming and Performance	5	880	May 7, 2024
Accessing pointer values inside struct copied to CUDA device CUDA Programming and Performance cuda , kernel	2	299	April 24, 2024

Complex data structures

Related topics