Problem launching kernel with driverapi

st3fan82 · March 29, 2009, 11:34am

Hi,

I get a CUDA_ERROR_LAUNCH_FAILED (code 700) if I launch my kernel with cuLaunchGridAsync() or cuLaunchgrid().

Its just one of my kernels that fails. The other ones launch and are working.

In fact it should work, because if i execute it in the kernelfunc<<<>>()-style there is no problem. (and the output is ok)

the kernels

__global__ void changeGamma_kernel(unsigned char* ImgPtr, int pitch, int gamma)

{

	// calculate x and y coordinates over all blocks

	int x=blockDim.x * blockIdx.x + threadIdx.x;

	int y=blockDim.y * blockIdx.y + threadIdx.y;

	// get the pixel

	int elem=y*pitch+3*x;

	unsigned char cg=(unsigned char)gamma;

	uchar3 pixel=make_uchar3(((unsigned char)ImgPtr[elem]), ((unsigned char)ImgPtr[elem+1]), ((unsigned char)ImgPtr[elem+2]));

	if ((pixel.x+cg)>255) ImgPtr[elem]=255;

	else ImgPtr[elem]=pixel.x + cg;

	if ((pixel.y+cg)>255) ImgPtr[elem+1]= 255;

	else ImgPtr[elem+1]=pixel.y + cg;

	if ((pixel.z+cg)>255) ImgPtr[elem+2]= 255;

	else ImgPtr[elem+2]=pixel.z + cg;

}

//__shared__ uchar3 startShared;

__global__ void resizeImage_kernel(char *inputPtr, char *outputPtr,int inputHeight, int inputWidth, int outputHeight, int outputWidth)

{

	//where is our pixel

	int x=blockDim.x * blockIdx.x + threadIdx.x;

	int y=blockDim.y * blockIdx.y + threadIdx.y;

	float xSrc=(float)x * (float)((float)inputWidth /(float) outputWidth);

	float ySrc=(float)y * (float)((float)inputHeight /(float) outputHeight);

	int index1=((int)xSrc)+((int)ySrc)*inputWidth;

	int index2=((int)xSrc+1)+((int)ySrc)*inputWidth;

	int index3=((int)xSrc)+((int)ySrc + 1)*inputWidth;

	int index4=((int)xSrc+1)+((int)ySrc + 1)*inputWidth;

	int size=inputHeight*inputWidth-1;

	if (index1>size)index1=size;

	if (index2>size)index2=size;

	if (index3>size)index3=size;

	if (index4>size)index4=size;

	uchar3 p1=((uchar3*)inputPtr)[index1 ];

	uchar3 p2=((uchar3*)inputPtr)[index2];

	uchar3 p3=((uchar3*)inputPtr)[ index3];

	uchar3 p4=((uchar3*)inputPtr)[ index4];

	uchar3 pixel=bilinearInt(xSrc-(int)xSrc,ySrc-(int)ySrc, p1,p2,p3,p4);

	//write in new image

	outputPtr[x*3 + y*outputWidth*3]=pixel.x;

	outputPtr[x*3 + y*outputWidth*3 +1]=pixel.y;

	outputPtr[x*3 + y*outputWidth*3 +2]=pixel.z;

}

__device__ uchar3 bilinearInt(float	x, float	y, uchar3 p1,uchar3 p2,uchar3 p3,uchar3 p4)

{

	uchar3 result;

	float m0 = (1.0 - x) *(float) p1.x + x *(float) p2.x;

	float m1 = (1.0 - x) *(float) p3.x + x *(float) p4.x;

	float r = (1.0 - y) * m0 + y * m1;

	if (r>255) result.x=255;

	else result.x = ((unsigned char) r);

	m0 = (1.0 - x) * (float)p1.y + x *(float) p2.y;

	m1 = (1.0 - x) * (float)p3.y + x *(float) p4.y;

	float g = (1.0 - y) * m0 + y * m1;

	if (g>255) result.y=255;

	else result.y = ((unsigned char) g);

	m0 = (1.0 - x) * (float) p1.z + x *(float) p2.z;

	m1 = (1.0 - x) * (float) p3.z + x *(float) p4.z;

	float b = (1.0 - y) * m0 + y * m1;

	if (b>255) result.z=255;

		else result.z = ((unsigned char) b);

	//result.z = ((unsigned char) (1.0 - y) * m0 + y * m1);

	return result;

}

and the culaunchGrid calls

//copy image to device

	std::cout<<"copy "<<p*h/1024<<"kilobytes to device\n";

	m_iReturn=cuMemcpyHtoDAsync(devPtr, pHostMem, p * h,*m_pStream);

	if(m_iReturn!=CUDA_SUCCESS)return 0;

	

	int gamma=100;

	cuFuncSetBlockShape(*m_pChangeGammaFunc,BLOCK_SIZE,BLOCK_SIZE,1);

	cuParamSeti(*m_pChangeGammaFunc,0,devPtr);

	cuParamSeti(*m_pChangeGammaFunc,sizeof(int),p);

	cuParamSeti(*m_pChangeGammaFunc,sizeof(int)*2,gamma);

	cuParamSetSize(*m_pChangeGammaFunc, 3 * sizeof(int) );

	cuLaunchGridAsync(*m_pChangeGammaFunc,( w / BLOCK_SIZE),( h/ BLOCK_SIZE),*m_pStream);

	

	//cuStreamSynchronize(*m_pStream);

	

	/*

	dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);   std::cout<<"blocksize:"<<BLOCK_SIZE<<"*"<<BLOCK_SIZE<<"\n";

	dim3 dimGrid(w/BLOCK_SIZE,h/BLOCK_SIZE);  std::cout<<"gridsize: "<<dimGrid.x<<"*"<<dimGrid.y<<"\n";

	changeGamma_kernel<<<dimGrid,dimBlock,0,*m_pStream>>>((char*) devPtr, p, 48);

	*/

	std::cout<<"gamma changed\n";

	

	//prepare for gridlaunch and launch gamma change

	m_iReturn=cuCtxSynchronize();

	if(m_iReturn!=CUDA_SUCCESS)return 0;

	m_iReturn=cuFuncSetBlockShape(*m_pResizeFunc,BLOCK_SIZE,BLOCK_SIZE,1);

	if(m_iReturn!=CUDA_SUCCESS)return 0;

	cuParamSeti(*m_pResizeFunc,0,(unsigned int)devPtr);

	cuParamSeti(*m_pResizeFunc,sizeof(int),(unsigned int)newPtr);

	cuParamSeti(*m_pResizeFunc,sizeof(int)*2,h);

	cuParamSeti(*m_pResizeFunc,sizeof(int)*3,w);

	cuParamSeti(*m_pResizeFunc,sizeof(int)*4,newH);

	cuParamSeti(*m_pResizeFunc,sizeof(int)*5,newW);

	cuFuncSetSharedSize(*m_pResizeFunc, 20);

	cuParamSetSize(*m_pResizeFunc, sizeof(int)*6);

	std::cout<<"params set starting grid ..gridsize: "<<newW / BLOCK_SIZE<<" "<<newH / BLOCK_SIZE<<" "<<std::endl;

	m_iReturn=cuLaunchGridAsync(*m_pResizeFunc, newW / BLOCK_SIZE, newH / BLOCK_SIZE,*m_pStream);

	if(m_iReturn!=CUDA_SUCCESS)return 0;

	

	cuStreamSynchronize(*m_pStream);

	std::cout<<"grid started"<<std::endl;

	//dimGrid.x=newW/BLOCK_SIZE;

	//dimGrid.y=newH/BLOCK_SIZE;

	//resizeImage_kernel<<<dimGrid,dimBlock,0,*m_pStream>>>((char*)devPtr, (char*)newPtr,h,w,newH,newW);

The error code comes a little bit later in code. Because of the cuCtxSynchronize between the to cuLaunch calls, I think that the first kernel

is starting and working. (But if this is not necessarily true PLEASE correct me)

Only the second kernel is calling a device function. As I read those functions are implicitly inline. So I should not worry that the bilinearInt func was not in the cubin?

Before you ask there were no problems loading the cubin as a module and extracting the functions was also sucessful.

The host memory was allocated by cuMemAllocHost() and filled with “standard c” memcpy.

the compiler was not printing any warnings

some other Information:

my hardware: geforce gtx280

software: cuda toolkit 2.1 and sdk 2.1

os: a gento linux 64bit

sorry thats all I can say now because the system seems to be down and I am accessing it through ssh.

(its located somewhere in the university, dont now exactly where and its sunday :( )

i am thankfull for every hint to solve the problem

feel free to ask about more information (I am sure I forgot something)

ps: sorry for scribal errors i am from germany, (i hope you understand me at all ;) )

st3fan82 · April 7, 2009, 10:20am

ok, i solved the problem.
because nobody posted a solution, i think it could be interesting to post “my” solution.
the launch error resulted from the cuParamSeti() where i passed the CUdevicePtr.
they should be passed to the kernel launch by a cuparamSetv() call. (because of the x86_64 system)
i got the idea from this post (thanks ;) ):
[url=“http://forums.nvidia.com/index.php?showtopic=86254&pid=497435&mode=threaded&start=#entry497435”]http://forums.nvidia.com/index.php?showtop...rt=#entry497435[/url]
now everything works fine. :)
but i would suggest to mention this problem in the programming guide.

Topic		Replies	Views
problem launching kernel with cuLaunchGrid CUDA Programming and Performance	2	3860	July 15, 2009
nvrtc execution problems CUDA Programming and Performance	3	1271	June 27, 2016
CudaAPIError: [1] Call to cuLaunchKernel results in CUDA_ERROR_INVALID_VALUE with Numba CUDA Programming and Performance	1	2345	January 21, 2022
cudaSynchronizeDevice() returns error code 6 CUDA Programming and Performance	7	8609	June 16, 2011
CUDA grid launch failed error CUDA Programming and Performance	0	7242	March 11, 2011
Problem with "unspecified launch failure" CUDA Programming and Performance	4	3316	February 27, 2009
cudaDecodeGL example error? CUDA Programming and Performance	7	2118	September 27, 2010
cudaLaunchKernel failed to launch kernel CUDA Programming and Performance cuda	2	1676	April 19, 2022
Kernel fails to launch, when called from Java CUDA Programming and Performance	4	7376	October 31, 2008
cudaMemcpy problem problem with using cudaMemcpy CUDA Programming and Performance	12	5909	January 5, 2010

Problem launching kernel with driverapi

Related topics