invalid configuration argument

I have the following code:

/*
 * zeros: writes 0 into every channel of every pixel of `out`.
 *
 * Each thread handles one (xIndex, yIndex) pixel, taken from the x and y
 * components of its block/thread indices, and clears the `channels`
 * consecutive ints of that pixel. Pixels are addressed row by row with
 * their channels interleaved: channels * (xIndex + yIndex * width).
 *
 * in       - not read by this kernel
 * out      - buffer to clear; indexed up to width * height * channels ints
 * width    - number of pixels per row
 * height   - number of rows
 * channels - number of ints stored per pixel
 *
 * Threads whose (xIndex, yIndex) falls outside width x height do nothing,
 * so the launch may cover more threads than there are pixels.
 */
__global__ void zeros(int * in, int * out, int width, int height, int channels) {

	int ii;

	int xIndex = threadIdx.x + blockIdx.x*blockDim.x;

	int yIndex = threadIdx.y + blockIdx.y*blockDim.y;

	/* Offset of this pixel's first channel in the interleaved buffer. */
	int index = channels*(xIndex + yIndex*width);

	/* Guard: skip threads that map outside the image. */
	if (xIndex < width && yIndex < height) {

		for (ii = 0; ii < channels; ii++)

			out[index+ii] = 0;

	}

}



I set the block and grid dimensions:

dim3 CUDA_BlockDimensions;

dim3 CUDA_GridDimensions;

CUDA_BlockDimensions.x = 16;

CUDA_BlockDimensions.y = 16;

CUDA_GridDimensions.x = 32;

CUDA_GridDimensions.y = 32;

And launch it using:


zeros<<<CUDA_BlockDimensions, CUDA_GridDimensions>>>(CUDA_input1, CUDA_output1, input2, input3, input4);

CUDALink_Err = cudaGetLastError();

if( CUDA_Err != cudaSuccess)



printf("BlockDimensions x=%d, y=%d, z=%d\n", CUDA_BlockDimensions.x, CUDA_BlockDimensions.y, CUDA_BlockDimensions.z);

printf("GridDimensions x=%d, y=%d, z=%d\n", CUDA_GridDimensions.x, CUDA_GridDimensions.y, CUDA_GridDimensions.z);


printf("ERRROR on line %d in %s:%s\n", __LINE__, __FILE__, __func__);

printf("CUDA Error %s\n", cudaGetErrorString(CUDA_Err));


return dllErr;


Here is the result from the printf:


BlockDimensions x=16, y=16, z=1

GridDimensions x=32, y=32, z=1


ERRROR on line 136 in CUDALink_CUDAZero

CUDA Error invalid configuration argument


The block and grid dimensions seem reasonable to me.


You are supplying the block and grid arguments to the kernel call in the wrong order. Try this:

zeros<<<CUDA_GridDimensions,CUDA_BlockDimensions>>>(CUDA_input1, CUDA_output1, input2, input3, input4);

The reason it doesn't work is that the first launch argument is the grid size and the second is the block size, so your current call asks for 32 x 32 = 1024 threads per block, which exceeds the 512-thread-per-block limit.

Ahh, switching the arguments fixed the issue. Thanks!