Hi,
I get a CUDA_ERROR_LAUNCH_FAILED (code 700) if I launch my kernel with cuLaunchGridAsync() or cuLaunchgrid().
Its just one of my kernels that fails. The other ones launch and are working.
In fact it should work, because if i execute it in the kernelfunc<<<>>()-style there is no problem. (and the output is ok)
the kernels
__global__ void changeGamma_kernel(unsigned char* ImgPtr, int pitch, int gamma)
{
// calculate x and y coordinates over all blocks
int x=blockDim.x * blockIdx.x + threadIdx.x;
int y=blockDim.y * blockIdx.y + threadIdx.y;
// get the pixel
int elem=y*pitch+3*x;
unsigned char cg=(unsigned char)gamma;
uchar3 pixel=make_uchar3(((unsigned char)ImgPtr[elem]), ((unsigned char)ImgPtr[elem+1]), ((unsigned char)ImgPtr[elem+2]));
if ((pixel.x+cg)>255) ImgPtr[elem]=255;
else ImgPtr[elem]=pixel.x + cg;
if ((pixel.y+cg)>255) ImgPtr[elem+1]= 255;
else ImgPtr[elem+1]=pixel.y + cg;
if ((pixel.z+cg)>255) ImgPtr[elem+2]= 255;
else ImgPtr[elem+2]=pixel.z + cg;
}
//__shared__ uchar3 startShared;
__global__ void resizeImage_kernel(char *inputPtr, char *outputPtr,int inputHeight, int inputWidth, int outputHeight, int outputWidth)
{
//where is our pixel
int x=blockDim.x * blockIdx.x + threadIdx.x;
int y=blockDim.y * blockIdx.y + threadIdx.y;
float xSrc=(float)x * (float)((float)inputWidth /(float) outputWidth);
float ySrc=(float)y * (float)((float)inputHeight /(float) outputHeight);
int index1=((int)xSrc)+((int)ySrc)*inputWidth;
int index2=((int)xSrc+1)+((int)ySrc)*inputWidth;
int index3=((int)xSrc)+((int)ySrc + 1)*inputWidth;
int index4=((int)xSrc+1)+((int)ySrc + 1)*inputWidth;
int size=inputHeight*inputWidth-1;
if (index1>size)index1=size;
if (index2>size)index2=size;
if (index3>size)index3=size;
if (index4>size)index4=size;
uchar3 p1=((uchar3*)inputPtr)[index1 ];
uchar3 p2=((uchar3*)inputPtr)[index2];
uchar3 p3=((uchar3*)inputPtr)[ index3];
uchar3 p4=((uchar3*)inputPtr)[ index4];
uchar3 pixel=bilinearInt(xSrc-(int)xSrc,ySrc-(int)ySrc, p1,p2,p3,p4);
//write in new image
outputPtr[x*3 + y*outputWidth*3]=pixel.x;
outputPtr[x*3 + y*outputWidth*3 +1]=pixel.y;
outputPtr[x*3 + y*outputWidth*3 +2]=pixel.z;
}
__device__ uchar3 bilinearInt(float x, float y, uchar3 p1,uchar3 p2,uchar3 p3,uchar3 p4)
{
uchar3 result;
float m0 = (1.0 - x) *(float) p1.x + x *(float) p2.x;
float m1 = (1.0 - x) *(float) p3.x + x *(float) p4.x;
float r = (1.0 - y) * m0 + y * m1;
if (r>255) result.x=255;
else result.x = ((unsigned char) r);
m0 = (1.0 - x) * (float)p1.y + x *(float) p2.y;
m1 = (1.0 - x) * (float)p3.y + x *(float) p4.y;
float g = (1.0 - y) * m0 + y * m1;
if (g>255) result.y=255;
else result.y = ((unsigned char) g);
m0 = (1.0 - x) * (float) p1.z + x *(float) p2.z;
m1 = (1.0 - x) * (float) p3.z + x *(float) p4.z;
float b = (1.0 - y) * m0 + y * m1;
if (b>255) result.z=255;
else result.z = ((unsigned char) b);
//result.z = ((unsigned char) (1.0 - y) * m0 + y * m1);
return result;
}
and the culaunchGrid calls
//copy image to device
std::cout<<"copy "<<p*h/1024<<"kilobytes to device\n";
m_iReturn=cuMemcpyHtoDAsync(devPtr, pHostMem, p * h,*m_pStream);
if(m_iReturn!=CUDA_SUCCESS)return 0;
int gamma=100;
cuFuncSetBlockShape(*m_pChangeGammaFunc,BLOCK_SIZE,BLOCK_SIZE,1);
cuParamSeti(*m_pChangeGammaFunc,0,devPtr);
cuParamSeti(*m_pChangeGammaFunc,sizeof(int),p);
cuParamSeti(*m_pChangeGammaFunc,sizeof(int)*2,gamma);
cuParamSetSize(*m_pChangeGammaFunc, 3 * sizeof(int) );
cuLaunchGridAsync(*m_pChangeGammaFunc,( w / BLOCK_SIZE),( h/ BLOCK_SIZE),*m_pStream);
//cuStreamSynchronize(*m_pStream);
/*
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE); std::cout<<"blocksize:"<<BLOCK_SIZE<<"*"<<BLOCK_SIZE<<"\n";
dim3 dimGrid(w/BLOCK_SIZE,h/BLOCK_SIZE); std::cout<<"gridsize: "<<dimGrid.x<<"*"<<dimGrid.y<<"\n";
changeGamma_kernel<<<dimGrid,dimBlock,0,*m_pStream>>>((char*) devPtr, p, 48);
*/
std::cout<<"gamma changed\n";
//prepare for gridlaunch and launch gamma change
m_iReturn=cuCtxSynchronize();
if(m_iReturn!=CUDA_SUCCESS)return 0;
m_iReturn=cuFuncSetBlockShape(*m_pResizeFunc,BLOCK_SIZE,BLOCK_SIZE,1);
if(m_iReturn!=CUDA_SUCCESS)return 0;
cuParamSeti(*m_pResizeFunc,0,(unsigned int)devPtr);
cuParamSeti(*m_pResizeFunc,sizeof(int),(unsigned int)newPtr);
cuParamSeti(*m_pResizeFunc,sizeof(int)*2,h);
cuParamSeti(*m_pResizeFunc,sizeof(int)*3,w);
cuParamSeti(*m_pResizeFunc,sizeof(int)*4,newH);
cuParamSeti(*m_pResizeFunc,sizeof(int)*5,newW);
cuFuncSetSharedSize(*m_pResizeFunc, 20);
cuParamSetSize(*m_pResizeFunc, sizeof(int)*6);
std::cout<<"params set starting grid ..gridsize: "<<newW / BLOCK_SIZE<<" "<<newH / BLOCK_SIZE<<" "<<std::endl;
m_iReturn=cuLaunchGridAsync(*m_pResizeFunc, newW / BLOCK_SIZE, newH / BLOCK_SIZE,*m_pStream);
if(m_iReturn!=CUDA_SUCCESS)return 0;
cuStreamSynchronize(*m_pStream);
std::cout<<"grid started"<<std::endl;
//dimGrid.x=newW/BLOCK_SIZE;
//dimGrid.y=newH/BLOCK_SIZE;
//resizeImage_kernel<<<dimGrid,dimBlock,0,*m_pStream>>>((char*)devPtr, (char*)newPtr,h,w,newH,newW);
The error code comes a little bit later in code. Because of the cuCtxSynchronize between the to cuLaunch calls, I think that the first kernel
is starting and working. (But if this is not necessarily true PLEASE correct me)
Only the second kernel is calling a device function. As I read those functions are implicitly inline. So I should not worry that the bilinearInt func was not in the cubin?
Before you ask there were no problems loading the cubin as a module and extracting the functions was also sucessful.
The host memory was allocated by cuMemAllocHost() and filled with “standard c” memcpy.
the compiler was not printing any warnings
some other Information:
my hardware: geforce gtx280
software: cuda toolkit 2.1 and sdk 2.1
os: a gento linux 64bit
sorry thats all I can say now because the system seems to be down and I am accessing it through ssh.
(its located somewhere in the university, dont now exactly where and its sunday :( )
i am thankfull for every hint to solve the problem
feel free to ask about more information (I am sure I forgot something)
ps: sorry for scribal errors i am from germany, (i hope you understand me at all ;) )