I need some more help here, i am triying to launch my own kernel over each frame during the H264 decoding in cudaDecodeGL but it seems to be unable to modify anything and, even more, if i create a new variable like an uint32 for example, i am not able even to modify it and it always returns “0” to the host, just like if i try to modify de frame in the &pPBOData pointer to device memory where BGRA resulting frames are being decoded, the result is always the same, zeros as return from the device as if the kernel didnt execute anytime.
I don´t know if this is becouse of some cuContextLock or somthing like this that can be running in this example. I would apreciate it so much if someone can help me to understand why can´t I launch my own kernel inside the cudaDecodeGL example. It is so important to achieve that for my current project.
The variables are being allocated like this:
[codebox]
long int memSize = nWidthnHeight4;
cudaMallocHost( (void **)&g_bFrameData, memSize );
cuMemAlloc( &pPreviousDecodedFrame,memSize);
cuMemAlloc( &pResultFrame,memSize);[/codebox]
My kernel is launching this way:
[codebox]
g_pImageGL->map(&pPBOData, &nPBOPitch);
nPBOPitch = g_nWindowWidth * 4;
cudaPostProcessFrame(&pDecodedFrame, nDecodedPitch, &pPBOData, nPBOPitch, g_pCudaModule->getModule(), g_kernelNV12toARGB, g_KernelSID);
if(g_KernelSID)
{
CUresult error;
error = cudaLaunchMyKernel ( pPBOData ,pPreviousDecodedFrame ,pResultFrame ,g_bFrameData, nWidth, nHeight);[/codebox]
cudaLaunchMyKernel :
[codebox]// CUDA kernel for frame post-processing.
extern “C”
global void MyKernel(uint32 * decoded, unsigned char * previous, unsigned char * result)
{
int tx = threadIdx.x;
int ty = threadIdx.y;
int tz = threadIdx.z;
int bx = blockIdx.x;
int by = blockIdx.y;
// 16 16 x 16 = 256 80 4(bytes por pixel)
uint32 offset = (bx*blockDim.x + by*(blockDim.x*blockDim.y)*gridDim.x) * 4;//Esto ajusta el inicio de cada bloque... o eso creo...LOL
uint32 pixel_bloque = ty*(blockDim.x) + tx;//Esto ajusta cada pixel en su bloque, sumandose al offset del bloque.
result[ offset + pixel_bloque + tz ] = abs(decoded[ offset + pixel_bloque + tz] - previous[ offset + pixel_bloque + tz]);
}
extern “C”
CUresult cudaLaunchMyKernel(CUdeviceptr d_DecodedFrame, CUdeviceptr d_PreviousFrame,CUdeviceptr d_ResultFrame,unsigned char * h_ResultFrame,uint32 width,uint32 * height )
{
unsigned int memSize = width * height * 4;
//CUcontext temp_ctx;
//cuCtxAttach(&temp_ctx,0);
dim3 threadsPerBlock(16, 16, 3);// Bloques de 16x16 hilos, profundidad en z de 3 hilos para ajustar cada uno a una componente de color BGR
dim3 numBlocks(width/16,height/16);
MyKernel<<<numBlocks, threadsPerBlock>>>((unsigned char *)d_DecodedFrame, (unsigned char *) d_PreviousFrame, (unsigned char *) d_ResultFrame);
cudaThreadSynchronize();
//Copy the result form de device to de Host
CUresult cu;
cu = cuMemcpyDtoH( height ,prueba, sizeof(uint32) );
if(cu != CUDA_SUCCESS) cout << "cudaLaunchMyKernel Error : " <<cu << endl;
//Swap actual frame with previous one
error = cudaMemcpy(&d_DecodedFrame,d_PreviousFrame,memSize,cudaMemcpyDeviceT
oDevice);
//cuCtxDetach(temp_ctx);
return cu;
}[/codebox]
Thank you.