Passing structures into CUDA kernels

KChou · September 17, 2010, 11:41pm

Hello, I’m pretty new to programming, and I’m really new to CUDA

Is it possible to pass structures into CUDA kernels?

for example, I have:

struct matrix{int width; int height; int size; int bitSize; int wstart; int hstart; int *arrayPtr;};

int main(){

	struct matrix h_sample, h_f, h_result;

	struct matrix d_sample, d_f, d_result;

		//then I assign each parameter of h_sample and h_f values, and an array for arrayPtr

		//allocate host memory, etc, etc

		//allocate device memory

	cudaMalloc( (void **) &d_sample.arrayPtr, d_sample.bitSize);

	cudaMalloc( (void **) &d_f.arrayPtr, d_f.bitSize);

	cudaMalloc( (void **) &d_result.arrayPtr, d_result.bitSize);

		//copy the arrays

	cudaMemcpy( d_sample.arrayPtr, h_sample.arrayPtr, h_sample.bitSize, cudaMemcpyHostToDevice );

	cudaMemcpy( d_f.arrayPtr, h_f.arrayPtr, h_f.bitSize, cudaMemcpyHostToDevice );

	cudaMemcpy( d_result.arrayPtr, h_result.arrayPtr, h_result.bitSize, cudaMemcpyHostToDevice );	

		//copy other values of the structures

	d_sample = h_sample;

	d_f = h_f;

	d_result = h_result;

	int numThreadsPerBlock = 4;

	dim3 dimGrid(68/numThreadsPerBlock, 68);

	dim3 dimBlock(numThreadsPerBlock);

	

	//launch kernel on Device

	dilateOnDevice<<< dimGrid, dimBlock >>>(d_sample, d_f, d_result);

		// block until the device has completed

		cudaThreadSynchronize();

	   // Check for any CUDA errors

	   checkCUDAError("kernel invocation");

.....

}

//my program always quits at this point! External Image

my function for checkCUDAerror and my kernel are:

void checkCUDAError(const char *msg)

{

	cudaError_t err = cudaGetLastError();

	if( cudaSuccess != err) 

	{

		fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );

		system("pause");

		exit(EXIT_FAILURE);

	}						 

}

//my kernel:

__global__ void dilateOnDevice(matrix d_sample, matrix d_f, matrix d_result)

{

	int i,j,down,across,max,value,resultIndex, cheight, bheight;

	i = blockDim.x*blockIdx.x+threadIdx.x;

	j = blockDim.y*blockIdx.y+threadIdx.y;

	resultIndex = j+i*d_result.width;

	d_result.arrayPtr[resultIndex]=0;

	for(down=0;down<d_f.height;down++){		//multiplies each column

		max = 0;

		cheight = down*d_result.width;

		bheight = down*d_f.width;

		for(across=0;across<d_f.width;across++){		//multiplies each row

			value = d_f.arrayPtr[across+bheight]*d_sample.arrayPtr[resultIndex + across+cheight];

			if(value>max)

				max = value;

		}

	}

	d_result.arrayPtr[resultIndex]=max;

}

I’m trying to write a simple dilation function here…

since checkCUDAerror print out “unknown error” I’m pretty sure the error is in the kernel, but I don’t know where it is?

or is my kernel just retarded?

my CPU version of the code is working, and I just tried to do the same for my kernel (is this the reason my kernel is being retarded??):

void dilateOnHost(matrix a, matrix b, matrix c,int *result){

	int i,j,down,across,max,value,resultIndex, cheight, bheight;

	for(i=0;i<c.height;i++){	//moves filter down

		for (j=0;j<c.width;j++){	//moves filter across

			resultIndex = j+i*c.width;

			result[resultIndex]=0;

			for(down=0;down<b.height;down++){		//multiplies each column

				max = 0;

				cheight = down*c.width;

				bheight = down*b.width;

				for(across=0;across<b.width;across++){		//multiplies each row

					value = b.arrayPtr[across+bheight]*a.arrayPtr[resultIndex + across+cheight];

					if(value>max)

						max = value;

				}

			}

			result[resultIndex]=max;

		}

	}

}

one suspicion I have right now is that I can’t pass structure into the kernel that way.

so, what is the right way to pass structure into the kernel??

avidday · September 18, 2010, 7:17am

This bit:

//copy other values of the structures

	d_sample = h_sample;

	d_f = h_f;

	d_result = h_result;

looks pretty suspicious. Won’t it overwrite your just allocated and assigned device memory pointers with the host values?

avidday · September 18, 2010, 7:17am

This bit:

//copy other values of the structures

	d_sample = h_sample;

	d_f = h_f;

	d_result = h_result;

looks pretty suspicious. Won’t it overwrite your just allocated and assigned device memory pointers with the host values?

KChou · September 20, 2010, 9:30pm

This bit:
//copy other values of the structures

	d_sample = h_sample;

	d_f = h_f;

	d_result = h_result;
looks pretty suspicious. Won’t it overwrite your just allocated and assigned device memory pointers with the host values?

aren’t they supposed to be the same to begin with? hence the cudaMemCpy…

KChou · September 20, 2010, 9:30pm

This bit:
//copy other values of the structures

	d_sample = h_sample;

	d_f = h_f;

	d_result = h_result;
looks pretty suspicious. Won’t it overwrite your just allocated and assigned device memory pointers with the host values?

aren’t they supposed to be the same to begin with? hence the cudaMemCpy…

betete · September 24, 2010, 1:19pm

I have used structures many times in CUDA. I generally used structure pointers to do this. Here I show you an example. Hope it helps you.

struct ve_s{
int idx;
float real;
}

global void Kernel(…, struct ve_s* sv, …)
{
…
sv->real = 34.5;
…
}

main()
{
struct ve_s vs;
cutilSafeCall( cudaMalloc((void**)&d_vs, sizeof(ve_s)) );
cutilSafeCall( cudaMemcpy(d_vs, &vs, sizeof(ve_s), cudaMemcpyHostToDevice) );

Kernel<<< blocksPerGrid, threadsPerBlock>>>(…, d_vs,…);
}

betete · September 24, 2010, 1:19pm