Problem launching kernel with driverapi


I get a CUDA_ERROR_LAUNCH_FAILED (code 700) if I launch my kernel with cuLaunchGridAsync() or cuLaunchgrid().

Its just one of my kernels that fails. The other ones launch and are working.

In fact it should work, because if i execute it in the kernelfunc<<<>>()-style there is no problem. (and the output is ok)

the kernels

__global__ void changeGamma_kernel(unsigned char* ImgPtr, int pitch, int gamma)


	// calculate x and y coordinates over all blocks

	int x=blockDim.x * blockIdx.x + threadIdx.x;

	int y=blockDim.y * blockIdx.y + threadIdx.y;

	// get the pixel

	int elem=y*pitch+3*x;

	unsigned char cg=(unsigned char)gamma;

	uchar3 pixel=make_uchar3(((unsigned char)ImgPtr[elem]), ((unsigned char)ImgPtr[elem+1]), ((unsigned char)ImgPtr[elem+2]));

	if ((pixel.x+cg)>255) ImgPtr[elem]=255;

	else ImgPtr[elem]=pixel.x + cg;

	if ((pixel.y+cg)>255) ImgPtr[elem+1]= 255;

	else ImgPtr[elem+1]=pixel.y + cg;

	if ((pixel.z+cg)>255) ImgPtr[elem+2]= 255;

	else ImgPtr[elem+2]=pixel.z + cg;


//__shared__ uchar3 startShared;

__global__ void resizeImage_kernel(char *inputPtr, char *outputPtr,int inputHeight, int inputWidth, int outputHeight, int outputWidth)


	//where is our pixel

	int x=blockDim.x * blockIdx.x + threadIdx.x;

	int y=blockDim.y * blockIdx.y + threadIdx.y;

	float xSrc=(float)x * (float)((float)inputWidth /(float) outputWidth);

	float ySrc=(float)y * (float)((float)inputHeight /(float) outputHeight);

	int index1=((int)xSrc)+((int)ySrc)*inputWidth;

	int index2=((int)xSrc+1)+((int)ySrc)*inputWidth;

	int index3=((int)xSrc)+((int)ySrc + 1)*inputWidth;

	int index4=((int)xSrc+1)+((int)ySrc + 1)*inputWidth;

	int size=inputHeight*inputWidth-1;

	if (index1>size)index1=size;

	if (index2>size)index2=size;

	if (index3>size)index3=size;

	if (index4>size)index4=size;

	uchar3 p1=((uchar3*)inputPtr)[index1 ];

	uchar3 p2=((uchar3*)inputPtr)[index2];

	uchar3 p3=((uchar3*)inputPtr)[ index3];

	uchar3 p4=((uchar3*)inputPtr)[ index4];

	uchar3 pixel=bilinearInt(xSrc-(int)xSrc,ySrc-(int)ySrc, p1,p2,p3,p4);

	//write in new image

	outputPtr[x*3 + y*outputWidth*3]=pixel.x;

	outputPtr[x*3 + y*outputWidth*3 +1]=pixel.y;

	outputPtr[x*3 + y*outputWidth*3 +2]=pixel.z;


__device__ uchar3 bilinearInt(float	x, float	y, uchar3 p1,uchar3 p2,uchar3 p3,uchar3 p4)


	uchar3 result;

	float m0 = (1.0 - x) *(float) p1.x + x *(float) p2.x;

	float m1 = (1.0 - x) *(float) p3.x + x *(float) p4.x;

	float r = (1.0 - y) * m0 + y * m1;

	if (r>255) result.x=255;

	else result.x = ((unsigned char) r);

	m0 = (1.0 - x) * (float)p1.y + x *(float) p2.y;

	m1 = (1.0 - x) * (float)p3.y + x *(float) p4.y;

	float g = (1.0 - y) * m0 + y * m1;

	if (g>255) result.y=255;

	else result.y = ((unsigned char) g);

	m0 = (1.0 - x) * (float) p1.z + x *(float) p2.z;

	m1 = (1.0 - x) * (float) p3.z + x *(float) p4.z;

	float b = (1.0 - y) * m0 + y * m1;

	if (b>255) result.z=255;

		else result.z = ((unsigned char) b);

	//result.z = ((unsigned char) (1.0 - y) * m0 + y * m1);

	return result;


and the culaunchGrid calls

//copy image to device

	std::cout<<"copy "<<p*h/1024<<"kilobytes to device\n";

	m_iReturn=cuMemcpyHtoDAsync(devPtr, pHostMem, p * h,*m_pStream);

	if(m_iReturn!=CUDA_SUCCESS)return 0;


	int gamma=100;





	cuParamSetSize(*m_pChangeGammaFunc, 3 * sizeof(int) );

	cuLaunchGridAsync(*m_pChangeGammaFunc,( w / BLOCK_SIZE),( h/ BLOCK_SIZE),*m_pStream);





	dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);   std::cout<<"blocksize:"<<BLOCK_SIZE<<"*"<<BLOCK_SIZE<<"\n";

	dim3 dimGrid(w/BLOCK_SIZE,h/BLOCK_SIZE);  std::cout<<"gridsize: "<<dimGrid.x<<"*"<<dimGrid.y<<"\n";

	changeGamma_kernel<<<dimGrid,dimBlock,0,*m_pStream>>>((char*) devPtr, p, 48);


	std::cout<<"gamma changed\n";


	//prepare for gridlaunch and launch gamma change


	if(m_iReturn!=CUDA_SUCCESS)return 0;


	if(m_iReturn!=CUDA_SUCCESS)return 0;

	cuParamSeti(*m_pResizeFunc,0,(unsigned int)devPtr);

	cuParamSeti(*m_pResizeFunc,sizeof(int),(unsigned int)newPtr);





	cuFuncSetSharedSize(*m_pResizeFunc, 20);

	cuParamSetSize(*m_pResizeFunc, sizeof(int)*6);

	std::cout<<"params set starting grid ..gridsize: "<<newW / BLOCK_SIZE<<" "<<newH / BLOCK_SIZE<<" "<<std::endl;

	m_iReturn=cuLaunchGridAsync(*m_pResizeFunc, newW / BLOCK_SIZE, newH / BLOCK_SIZE,*m_pStream);

	if(m_iReturn!=CUDA_SUCCESS)return 0;



	std::cout<<"grid started"<<std::endl;



	//resizeImage_kernel<<<dimGrid,dimBlock,0,*m_pStream>>>((char*)devPtr, (char*)newPtr,h,w,newH,newW);

The error code comes a little bit later in code. Because of the cuCtxSynchronize between the to cuLaunch calls, I think that the first kernel

is starting and working. (But if this is not necessarily true PLEASE correct me)

Only the second kernel is calling a device function. As I read those functions are implicitly inline. So I should not worry that the bilinearInt func was not in the cubin?

Before you ask there were no problems loading the cubin as a module and extracting the functions was also sucessful.

The host memory was allocated by cuMemAllocHost() and filled with “standard c” memcpy.

the compiler was not printing any warnings

some other Information:

my hardware: geforce gtx280

software: cuda toolkit 2.1 and sdk 2.1

os: a gento linux 64bit

sorry thats all I can say now because the system seems to be down and I am accessing it through ssh.

(its located somewhere in the university, dont now exactly where and its sunday :( )

i am thankfull for every hint to solve the problem

feel free to ask about more information (I am sure I forgot something)

ps: sorry for scribal errors i am from germany, (i hope you understand me at all ;) )

ok, i solved the problem.
because nobody posted a solution, i think it could be interesting to post “my” solution.
the launch error resulted from the cuParamSeti() where i passed the CUdevicePtr.
they should be passed to the kernel launch by a cuparamSetv() call. (because of the x86_64 system)
i got the idea from this post (thanks ;) ):…rt=#entry497435
now everything works fine. :)
but i would suggest to mention this problem in the programming guide.