is there any limit on # of arguments in cuda kernel?

Hi,

Is there any maximum limit to the number of arguments in cuda kernel?

cuda_kernel<<<nblocks, block_size>>>(arg1, arg2, arg3, arg4, …, argn) … how many arguments (arg) can I pass at most? Is there any limit?

My code runs but the result is not right when I pass more than 24 args thru cuda kernel.

I am working on MSVS2008, ver 9.0. Win7 64x.

Thanks
Shadab

Device 0: “GeForce GTX 260”
CUDA Driver Version: 2.30
CUDA Capability Major revision number: 1
CUDA Capability Minor revision number: 3
Total amount of global memory: 1879048192 bytes
Number of multiprocessors: 24
Number of cores: 192
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 16384 bytes
Total number of registers available per block: 16384
Warp size: 32
Maximum number of threads per block: 512
Maximum sizes of each dimension of a block: 512 x 512 x 64
Maximum sizes of each dimension of a grid: 65535 x 65535 x 1
Maximum memory pitch: 262144 bytes
Texture alignment: 256 bytes
Clock rate: 1.08 GHz
Concurrent copy and execution: Yes
Run time limit on kernels: No
Integrated: No
Support host page-locked memory mapping: Yes
Compute mode: Default (multiple host threads
can use this device simultaneously)

There’s a limit on the total size of the arguments passed. Not sure if there’s a limit on the number of arguments - it’s likely there’s one.

Thanks for your reply.

What is the limit on the size of arguments I can pass? I am trying to pass 43 arguments of size N*sizeof(float), where N is 6561.

My code finish successfully , but at the end there is this error message in the debug output window at the bottom of MSVS 2008.

First-chance exception at 0x000007fefd68aa7d in test.exe: Microsoft C++ exception: cudaError_enum at memory location 0x002bdc08…

First-chance exception at 0x000007fefd68aa7d in test.exe: Microsoft C++ exception: cudaError_enum at memory location 0x002bdc08…

The results are not good. This code works fine when I try to pass 40 arg of size N*sizeof(float), where N=6561.

I will appreciate any input to understand and potentially solve this problem.

Thanks

Shadab

256 byte maximum. pass pointers instead

(this is in the documentation)

Maybe I didn’t put it the right way. I have been passing the arguments as pointers; the following is my kernel call:

// One thread per lattice site; 512 threads per block.
int blockSize = 512;

// Ceiling division: enough blocks to cover all N sites even when
// N is not a multiple of blockSize.
int nBlocks = N/blockSize + (N%blockSize == 0?0:1);

// NOTE(review): this launch passes 43 separate arguments. With 41 of them
// being 8-byte device pointers (64-bit build), the parameter block exceeds
// the 256-byte kernel-argument limit on compute-capability 1.x devices,
// which is the cause of the failures discussed in this thread.
// (rho*/ux*/uy*_d, f*_d, ftemp*_d, g*_d, gtemp*_d, is_solid_d are device
// pointers allocated below; N and frame_rate are scalars — declarations
// not shown in this snippet.)
LBM_comp <<< nBlocks, blockSize >>> (rho0_d, ux0_d, uy0_d, rho1_d, ux1_d, uy1_d, N,

f0_d, f1_d, f2_d, f3_d, f4_d, f5_d, f6_d, f7_d, f8_d, ftemp0_d, ftemp1_d, ftemp2_d, ftemp3_d, ftemp4_d, ftemp5_d, ftemp6_d, ftemp7_d, ftemp8_d,

g0_d, g1_d, g2_d, g3_d, g4_d, g5_d, g6_d, g7_d, g8_d, gtemp0_d, gtemp1_d, gtemp2_d, gtemp3_d, gtemp4_d, gtemp5_d, gtemp6_d, gtemp7_d, gtemp8_d,

is_solid_d, frame_rate);

Following is my CUDA memory allocation for all the arguments:

// Device-memory allocation for every kernel argument.
// 81x81 lattice; `size` is defined elsewhere in the caller's code —
// presumably N*sizeof(float), to match the N*sizeof(float) arrays
// described above (TODO confirm).
// NOTE(review): none of these cudaMalloc return codes are checked; a
// failed allocation here would surface only later as a kernel error.
int frame_rate, N=81*81;

// Macroscopic fields (density and velocity components), old and new.
cudaMalloc((void **) &rho0_d, size);

cudaMalloc((void **) &ux0_d, size);

cudaMalloc((void **) &uy0_d, size);

// Solid/fluid mask for the lattice.
cudaMalloc((void **) &is_solid_d, size);

// f0..f8 distribution functions plus their double-buffer temporaries
// (9 directions — D2Q9-style lattice, presumably; TODO confirm).
cudaMalloc((void **) &f0_d, size); cudaMalloc((void **) &ftemp0_d, size);

cudaMalloc((void **) &f1_d, size); cudaMalloc((void **) &ftemp1_d, size);

cudaMalloc((void **) &f2_d, size); cudaMalloc((void **) &ftemp2_d, size);

cudaMalloc((void **) &f3_d, size); cudaMalloc((void **) &ftemp3_d, size);

cudaMalloc((void **) &f4_d, size); cudaMalloc((void **) &ftemp4_d, size);

cudaMalloc((void **) &f5_d, size); cudaMalloc((void **) &ftemp5_d, size);

cudaMalloc((void **) &f6_d, size); cudaMalloc((void **) &ftemp6_d, size);

cudaMalloc((void **) &f7_d, size); cudaMalloc((void **) &ftemp7_d, size);

cudaMalloc((void **) &f8_d, size); cudaMalloc((void **) &ftemp8_d, size);

// Second set of macroscopic fields.
cudaMalloc((void **) &rho1_d, size);

cudaMalloc((void **) &ux1_d, size);

cudaMalloc((void **) &uy1_d, size);

// g0..g8 distribution functions plus temporaries (second lattice component).
cudaMalloc((void **) &g0_d, size); cudaMalloc((void **) &gtemp0_d, size);

cudaMalloc((void **) &g1_d, size); cudaMalloc((void **) &gtemp1_d, size);

cudaMalloc((void **) &g2_d, size); cudaMalloc((void **) &gtemp2_d, size);

cudaMalloc((void **) &g3_d, size); cudaMalloc((void **) &gtemp3_d, size);

cudaMalloc((void **) &g4_d, size); cudaMalloc((void **) &gtemp4_d, size);

cudaMalloc((void **) &g5_d, size); cudaMalloc((void **) &gtemp5_d, size);

cudaMalloc((void **) &g6_d, size); cudaMalloc((void **) &gtemp6_d, size);

cudaMalloc((void **) &g7_d, size); cudaMalloc((void **) &gtemp7_d, size);

cudaMalloc((void **) &g8_d, size); cudaMalloc((void **) &gtemp8_d, size);

43 * 8 = 344 bytes, which is too large. The argument size limit is 256 bytes. You will have to build and pass a structure, or write the addresses of the malloced storage onto some device symbols and avoid passing them as arguments completely.

Thanks for your comments avidday. I dont know how to write address of storage onto device, hence I choose to go with structure definition. are structures in CUDA exactly same as C or is there any difference in accessing structure elements on GPU’s than C?

reply with an example will be much appreciated.

Shadab

This ought to get you started. Remember that you will need more than one initialization kernel with a total of 43 structure members for the same reason you are jumping through these extra burning hoops in the first place. I have no idea whether this will compile and run on any flavour of Windows, but it certainly works fine on Linux.

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#include <cuda_runtime.h>

#ifndef gpuAssert

#include <stdio.h>

/*
 * Checks the status code returned by a CUDA runtime call; on any non-zero
 * status, prints the code with file/line context to stderr and exits.
 *
 * Fixes over the original:
 *  - the argument is evaluated exactly once (the original expanded
 *    `condition` twice, so a side-effecting call such as cudaMalloc()
 *    would be re-executed on the failure path);
 *  - wrapped in do { } while(0) so the macro behaves like a single
 *    statement inside unbraced if/else.
 */
#define gpuAssert( condition ) do {                                           \
    int _gpuAssertStatus = (int)(condition);                                  \
    if( _gpuAssertStatus != 0 ) {                                             \
        fprintf( stderr, "\n FAILURE %d in %s, line %d\n",                    \
                 _gpuAssertStatus, __FILE__, __LINE__ );                      \
        exit( 1 );                                                            \
    }                                                                         \
} while(0)

#endif

/* Number of elements in each test array (also the thread count of the
 * single test block launched in main()). */
#define _DSIZE (32)

/* Bundle of device-array pointers. One instance lives in device memory and
 * is filled by initkernel(), so worker kernels need only a single pointer
 * argument instead of one argument per array — the workaround for the
 * 256-byte kernel-argument limit discussed in this thread. */
typedef struct 

{

	float *a;

	float *b;

	float *c;

	float *d;

} arguments;

/*
 * Copies the four device-array addresses into the device-resident
 * argument structure *out, so subsequent kernels can receive every
 * array through one pointer.
 *
 * Launch configuration: <<<1, 1>>> is sufficient; only thread 0 of the
 * block performs the writes, all other threads return immediately.
 */
__global__ void initkernel(float *a, float *b, float*c, float *d, arguments *out)
{
	/* Guard clause: a single thread is enough to populate the struct. */
	if (threadIdx.x != 0)
		return;

	out->a = a;
	out->b = b;
	out->c = c;
	out->d = d;
}

/*
 * Element-wise accumulate: d[i] += a[i] + b[i] + c[i], with all four
 * arrays reached through the device-resident argument struct *inout.
 *
 * NOTE(review): there is no bounds guard, so the launch must supply
 * exactly one thread per array element (gridDim.x * blockDim.x equal to
 * the array length), as main() does with <<<1, _DSIZE>>>.
 */
__global__ void testkernel(arguments *inout)
{
	const unsigned int gid = blockDim.x*blockIdx.x + threadIdx.x;
	const float contribution = inout->a[gid] + inout->b[gid] + inout->c[gid];
	inout->d[gid] += contribution;
}

/*
 * Demo driver: allocates four _DSIZE-element float arrays on host and
 * device, fills the device-side `arguments` struct via initkernel(),
 * runs testkernel() (d[i] += a[i] + b[i] + c[i]), and prints the result.
 *
 * Fix over the original: the host malloc() calls were performed *inside*
 * assert() expressions, so building with -DNDEBUG would compile the
 * allocations away entirely. Allocation now happens unconditionally and
 * only the NULL check sits in the assert.
 */
int main()
{
	const size_t bytes = _DSIZE * sizeof(float);

	/* Host-side buffers. */
	float *a = (float *)malloc(bytes);
	float *b = (float *)malloc(bytes);
	float *c = (float *)malloc(bytes);
	float *d = (float *)malloc(bytes);
	assert( a != NULL );
	assert( b != NULL );
	assert( c != NULL );
	assert( d != NULL );

	/* Device-side buffers plus the shared argument structure. */
	float *_a, *_b, *_c, *_d;
	arguments *_args;
	gpuAssert( cudaMalloc( (void**)&_a, bytes ) );
	gpuAssert( cudaMalloc( (void**)&_b, bytes ) );
	gpuAssert( cudaMalloc( (void**)&_c, bytes ) );
	gpuAssert( cudaMalloc( (void**)&_d, bytes ) );
	gpuAssert( cudaMalloc( (void**)&_args, sizeof(arguments) ) );

	/* Known inputs: expected output is d[i] = i + 9. */
	for(int i = 0; i < _DSIZE; i++) {
		a[i] = 1.f;
		b[i] = 3.f;
		c[i] = 5.f;
		d[i] = (float)i;
	}

	gpuAssert( cudaMemcpy(_a, a, bytes, cudaMemcpyHostToDevice) );
	gpuAssert( cudaMemcpy(_b, b, bytes, cudaMemcpyHostToDevice) );
	gpuAssert( cudaMemcpy(_c, c, bytes, cudaMemcpyHostToDevice) );
	gpuAssert( cudaMemcpy(_d, d, bytes, cudaMemcpyHostToDevice) );

	/* Publish the device pointers into *_args (single thread suffices),
	 * then run the actual computation with one thread per element. */
	initkernel <<< 1, 1 >>> (_a, _b, _c, _d, _args);
	gpuAssert( cudaThreadSynchronize() );

	testkernel <<< 1, _DSIZE >>> (_args);
	gpuAssert( cudaThreadSynchronize() );

	gpuAssert( cudaMemcpy(d, _d, bytes, cudaMemcpyDeviceToHost) );

	for(int i = 0; i < _DSIZE; i++) {
		fprintf(stdout, "%2d %6.1f\n", i, d[i]);
	}

	cudaFree(_a);
	cudaFree(_b);
	cudaFree(_c);
	cudaFree(_d);
	cudaFree(_args);
	free(a);
	free(b);
	free(c);
	free(d);

	return cudaThreadExit();
}

Thanks avidday, your reply will be very helpful.

Shadab

Thank you very much for this post and please forgive me for my ignorance but why is the initkernel necessary. In other words, why can’t you pass the structure directly? I am new to CUDA and appreciate any details you can share from your journey.

Grateful

-Joshua

There is a limit of 256 bytes for arguments to kernels, so a structure of that size can’t be passed by value, it has to be passed by pointer. There is no way to dereference GPU pointers on the host, so directly allocating memory to members of a device structure can’t be done either. The only two alternatives are to use an init kernel of the type shown in the earlier post, or use the cudaMemcpyToSymbol() function to write addresses onto the values of GPU constant memory or global memory symbols.

True, but surely you could just take the device addresses returned from cudaMalloc() and write them into a host-side structure which you then memcpy to the device? This would avoid the extra kernel invocation (and the fact that you have the same size limitation passing arguments to initKernel as you would to testKernel).

–Cliff

Thanks!