cudaDecodeGL example error?

Hi,

I am working on the cudaDecodeGL code example and I am trying to make some asynchronous readbacks from the device to the host.

In the example code it is shown that it should be done this way:

[codebox]// If streams are enabled, we can perform the readback to the host while the kernel is executing

if (g_ReadbackSID)

{

cuMemcpyDtoHAsync(g_bFrameData, pDecodedFrame, (nDecodedPitch * nHeight * 3 / 2), g_ReadbackSID);

}[/codebox]

The problem here is that cuMemcpyDtoHAsync is returning 1 (CUDA_ERROR_INVALID_VALUE), so there are no readbacks from the device. It happens with every frame except the first one.

I have seen that by modifying the code as this:

[codebox]// If streams are enabled, we can perform the readback to the host while the kernel is executing

if (g_ReadbackSID)

{

              CUresult cu = cuMemcpyDtoHAsync(g_bFrameData, pDecodedFrame, (nDecodedPitch * nHeight * 3 / 2), g_ReadbackSID);

              if(cu != CUDA_SUCCESS) std::cout << "Not Working " <<cu << std::endl;

			std::cout << g_bFrameData[0] << " " <<g_bFrameData[10] << " "<< g_bFrameData[100] << std::endl;

}[/codebox]

cout is always showing the same value, together with the "Not Working 1" error code.

I hope some NVIDIA engineers can look into this to see whether it is me who is wrong.

Thank you anyway.

Well, I have found the problem; it may be useful for NVIDIA engineers in order to make the cudaDecodeGL example almost perfect…

cuMemcpyDtoHAsync was returning an invalid-value error because it was receiving pageable memory, and it needs non-pageable (pinned) memory:

[codebox]if (g_bFirstFrame && g_ReadbackSID) {

			long int memSize = 1280*720*4;//My needed size.

			cudaHostAlloc( (void **)&g_bFrameData, memSize, cudaHostAllocWriteCombined  );//Non pageable memory

                            g_bFirstFrame = false;

}[/codebox]

So you can now use:

[codebox]// If streams are enabled, we can perform the readback to the host while the kernel is executing

if (g_ReadbackSID) {

            CUresult cu = cuMemcpyDtoHAsync(g_bFrameData, pDecodedFrame, (nDecodedPitch * nHeight * 3 / 2), g_ReadbackSID);

			if(cu != CUDA_SUCCESS) std::cout << "cuMemcpyDtoHAsync Failure " <<cu << std::endl;

}[/codebox]

Now I need some other help. I am writing each frame to a file on the hard drive, but I am getting twice as many frames as the video has. I do it this way:

[codebox]cudaPostProcessFrame(&pDecodedFrame, nDecodedPitch, &pPBOData, nPBOPitch, g_pCudaModule->getModule(), g_kernelNV12toARGB, g_KernelSID);

if(g_KernelSID)

{

CUresult cu = cuMemcpyDtoHAsync(g_bFrameData, pPBOData, 12807204, g_KernelSID);//I read synchronously with g_KernelSID becouse I need post procesed frames.

if(cu != CUDA_SUCCESS) cout << "ReadBack Stream Error Nº " <<cu << endl;

if(cu == CUDA_SUCCESS) cout << "STREAAAAAAAAAAAMMMMMMMMM!!! " <<cu << endl;

			//Write each pixel RGB and jump over its A component.

			for(long int i=0;i<1280*720;i++)//Para cada pixel

			{

		

			fwrite(&g_bFrameData[4*i], 3, 1, file_pointer_output);//Escribimos para cada pixel solo los valores RGB y saltamos al pixel siguiente i*4

			

			}

}[/codebox]

Everything is working properly except that I am getting twice the number of frames that the console info reports as decoded…

Another thing that should be corrected in the example is that it is said to be post-processing the frames as ARGB, but it is really outputting RGBA format.

Can anyone help me with the doubled frames result problem ??

Thank you!

Well, I have found the problem; it may be useful for NVIDIA engineers in order to make the cudaDecodeGL example almost perfect…

cuMemcpyDtoHAsync was returning an invalid-value error because it was receiving pageable memory, and it needs non-pageable (pinned) memory:

[codebox]if (g_bFirstFrame && g_ReadbackSID) {

			long int memSize = 1280*720*4;//My needed size.

			cudaHostAlloc( (void **)&g_bFrameData, memSize, cudaHostAllocWriteCombined  );//Non pageable memory

                            g_bFirstFrame = false;

}[/codebox]

So you can now use:

[codebox]// If streams are enabled, we can perform the readback to the host while the kernel is executing

if (g_ReadbackSID) {

            CUresult cu = cuMemcpyDtoHAsync(g_bFrameData, pDecodedFrame, (nDecodedPitch * nHeight * 3 / 2), g_ReadbackSID);

			if(cu != CUDA_SUCCESS) std::cout << "cuMemcpyDtoHAsync Failure " <<cu << std::endl;

}[/codebox]

Now I need some other help. I am writing each frame to a file on the hard drive, but I am getting twice as many frames as the video has. I do it this way:

[codebox]cudaPostProcessFrame(&pDecodedFrame, nDecodedPitch, &pPBOData, nPBOPitch, g_pCudaModule->getModule(), g_kernelNV12toARGB, g_KernelSID);

if(g_KernelSID)

{

CUresult cu = cuMemcpyDtoHAsync(g_bFrameData, pPBOData, 12807204, g_KernelSID);//I read synchronously with g_KernelSID becouse I need post procesed frames.

if(cu != CUDA_SUCCESS) cout << "ReadBack Stream Error Nº " <<cu << endl;

if(cu == CUDA_SUCCESS) cout << "STREAAAAAAAAAAAMMMMMMMMM!!! " <<cu << endl;

			//Write each pixel RGB and jump over its A component.

			for(long int i=0;i<1280*720;i++)//Para cada pixel

			{

		

			fwrite(&g_bFrameData[4*i], 3, 1, file_pointer_output);//Escribimos para cada pixel solo los valores RGB y saltamos al pixel siguiente i*4

			

			}

}[/codebox]

Everything is working properly except that I am getting twice the number of frames that the console info reports as decoded…

Another thing that should be corrected in the example is that it is said to be post-processing the frames as ARGB, but it is really outputting RGBA format.

Can anyone help me with the doubled frames result problem ??

Thank you!

Hi, I have a similar problem, but there is nothing in this forum to help me.
Another topic in this forum helped me a lot, but I think there is not much experience with this particular problem.
I have another question: I would like to retrieve only the moving part of the image (using H.264). Is that possible?

Hi, I have a similar problem, but there is nothing in this forum to help me.
Another topic in this forum helped me a lot, but I think there is not much experience with this particular problem.
I have another question: I would like to retrieve only the moving part of the image (using H.264). Is that possible?

I need some more help here. I am trying to launch my own kernel over each frame during the H.264 decoding in cudaDecodeGL, but it seems unable to modify anything. Even more: if I create a new variable, a uint32 for example, I am not able to modify even that — it always returns "0" to the host. Likewise, if I try to modify the frame through the &pPBOData pointer to device memory, where the resulting BGRA frames are being decoded, the result is always the same: zeros are returned from the device, as if the kernel never executed.

I don't know if this is because of some context lock (cuCtxLock) or something like that running in this example. I would appreciate it very much if someone could help me understand why I can't launch my own kernel inside the cudaDecodeGL example. It is very important for my current project to achieve that.

The variables are being allocated like this:

[codebox]

long int memSize = nWidth*nHeight*4;

cudaMallocHost( (void **)&g_bFrameData, memSize );

cuMemAlloc( &pPreviousDecodedFrame,memSize);

cuMemAlloc( &pResultFrame,memSize);[/codebox]

My kernel is launching this way:

[codebox]

g_pImageGL->map(&pPBOData, &nPBOPitch);

nPBOPitch = g_nWindowWidth * 4;

cudaPostProcessFrame(&pDecodedFrame, nDecodedPitch, &pPBOData, nPBOPitch, g_pCudaModule->getModule(), g_kernelNV12toARGB, g_KernelSID);

if(g_KernelSID)

{

CUresult error;

error = cudaLaunchMyKernel ( pPBOData ,pPreviousDecodedFrame ,pResultFrame ,g_bFrameData, nWidth, nHeight);[/codebox]

cudaLaunchMyKernel :

[codebox]// CUDA kernel for frame post-processing.

extern "C"

__global__ void MyKernel(uint32 * decoded, unsigned char * previous, unsigned char * result)

{

int tx = threadIdx.x;

int ty = threadIdx.y;

int tz = threadIdx.z;

int bx = blockIdx.x;

int by = blockIdx.y;

//				16				16  x  16  = 256      80         4(bytes por pixel)

uint32 offset = (bx*blockDim.x + by*(blockDim.x*blockDim.y)*gridDim.x) * 4;//Esto ajusta el inicio de cada bloque... o eso creo...LOL

uint32 pixel_bloque = ty*(blockDim.x)  + tx;//Esto ajusta cada pixel en su bloque, sumandose al offset del bloque.

result[ offset + pixel_bloque + tz ] = abs(decoded[ offset + pixel_bloque + tz] - previous[ offset + pixel_bloque + tz]);

}

extern "C"

CUresult cudaLaunchMyKernel(CUdeviceptr d_DecodedFrame, CUdeviceptr d_PreviousFrame,CUdeviceptr d_ResultFrame,unsigned char * h_ResultFrame,uint32 width,uint32 * height )

{

unsigned int memSize = width * height * 4;

//CUcontext temp_ctx;

//cuCtxAttach(&temp_ctx,0);	

dim3 threadsPerBlock(16, 16, 3);// Bloques de 16x16 hilos, profundidad en z de 3 hilos para ajustar cada uno a una componente de color BGR

dim3 numBlocks(width/16,height/16);

MyKernel<<<numBlocks, threadsPerBlock>>>((unsigned char *)d_DecodedFrame, (unsigned char *) d_PreviousFrame, (unsigned char *) d_ResultFrame);

cudaThreadSynchronize();

//Copy the result form de device to de Host

CUresult cu;

cu = cuMemcpyDtoH(	height ,prueba,	sizeof(uint32)	);

if(cu != CUDA_SUCCESS) cout << "cudaLaunchMyKernel Error : " <<cu << endl;

//Swap actual frame with previous one

error = cudaMemcpy(&d_DecodedFrame,d_PreviousFrame,memSize,cudaMemcpyDeviceToDevice);

 //cuCtxDetach(temp_ctx);

return cu;

}[/codebox]

Thank you.

I need some more help here. I am trying to launch my own kernel over each frame during the H.264 decoding in cudaDecodeGL, but it seems unable to modify anything. Even more: if I create a new variable, a uint32 for example, I am not able to modify even that — it always returns "0" to the host. Likewise, if I try to modify the frame through the &pPBOData pointer to device memory, where the resulting BGRA frames are being decoded, the result is always the same: zeros are returned from the device, as if the kernel never executed.

I don't know if this is because of some context lock (cuCtxLock) or something like that running in this example. I would appreciate it very much if someone could help me understand why I can't launch my own kernel inside the cudaDecodeGL example. It is very important for my current project to achieve that.

The variables are being allocated like this:

[codebox]

long int memSize = nWidth*nHeight*4;

cudaMallocHost( (void **)&g_bFrameData, memSize );

cuMemAlloc( &pPreviousDecodedFrame,memSize);

cuMemAlloc( &pResultFrame,memSize);[/codebox]

My kernel is launching this way:

[codebox]

g_pImageGL->map(&pPBOData, &nPBOPitch);

nPBOPitch = g_nWindowWidth * 4;

cudaPostProcessFrame(&pDecodedFrame, nDecodedPitch, &pPBOData, nPBOPitch, g_pCudaModule->getModule(), g_kernelNV12toARGB, g_KernelSID);

if(g_KernelSID)

{

CUresult error;

error = cudaLaunchMyKernel ( pPBOData ,pPreviousDecodedFrame ,pResultFrame ,g_bFrameData, nWidth, nHeight);[/codebox]

cudaLaunchMyKernel :

[codebox]// CUDA kernel for frame post-processing.

extern "C"

__global__ void MyKernel(uint32 * decoded, unsigned char * previous, unsigned char * result)

{

int tx = threadIdx.x;

int ty = threadIdx.y;

int tz = threadIdx.z;

int bx = blockIdx.x;

int by = blockIdx.y;

//				16				16  x  16  = 256      80         4(bytes por pixel)

uint32 offset = (bx*blockDim.x + by*(blockDim.x*blockDim.y)*gridDim.x) * 4;//Esto ajusta el inicio de cada bloque... o eso creo...LOL

uint32 pixel_bloque = ty*(blockDim.x)  + tx;//Esto ajusta cada pixel en su bloque, sumandose al offset del bloque.

result[ offset + pixel_bloque + tz ] = abs(decoded[ offset + pixel_bloque + tz] - previous[ offset + pixel_bloque + tz]);

}

extern "C"

CUresult cudaLaunchMyKernel(CUdeviceptr d_DecodedFrame, CUdeviceptr d_PreviousFrame,CUdeviceptr d_ResultFrame,unsigned char * h_ResultFrame,uint32 width,uint32 * height )

{

unsigned int memSize = width * height * 4;

//CUcontext temp_ctx;

//cuCtxAttach(&temp_ctx,0);	

dim3 threadsPerBlock(16, 16, 3);// Bloques de 16x16 hilos, profundidad en z de 3 hilos para ajustar cada uno a una componente de color BGR

dim3 numBlocks(width/16,height/16);

MyKernel<<<numBlocks, threadsPerBlock>>>((unsigned char *)d_DecodedFrame, (unsigned char *) d_PreviousFrame, (unsigned char *) d_ResultFrame);

cudaThreadSynchronize();

//Copy the result form de device to de Host

CUresult cu;

cu = cuMemcpyDtoH(	height ,prueba,	sizeof(uint32)	);

if(cu != CUDA_SUCCESS) cout << "cudaLaunchMyKernel Error : " <<cu << endl;

//Swap actual frame with previous one

error = cudaMemcpy(&d_DecodedFrame,d_PreviousFrame,memSize,cudaMemcpyDeviceToDevice);

 //cuCtxDetach(temp_ctx);

return cu;

}[/codebox]

Thank you.

I need some more help here. I am trying to launch my own kernel over each frame during the H.264 decoding in cudaDecodeGL, but it seems unable to modify anything. Even more: if I create a new variable, a uint32 for example, I am not able to modify even that — it always returns "0" to the host. Likewise, if I try to modify the frame through the &pPBOData pointer to device memory, where the resulting BGRA frames are being decoded, the result is always the same: zeros are returned from the device, as if the kernel never executed.

I don't know if this is because of some context lock (cuCtxLock) or something like that running in this example. I would appreciate it very much if someone could help me understand why I can't launch my own kernel inside the cudaDecodeGL example. It is very important for my current project to achieve that.

The variables are being allocated like this:

[codebox]

long int memSize = nWidth*nHeight*4;

cudaMallocHost( (void **)&g_bFrameData, memSize );

cuMemAlloc( &pPreviousDecodedFrame,memSize);

cuMemAlloc( &pResultFrame,memSize);[/codebox]

My kernel is launching this way:

[codebox]

g_pImageGL->map(&pPBOData, &nPBOPitch);

nPBOPitch = g_nWindowWidth * 4;

cudaPostProcessFrame(&pDecodedFrame, nDecodedPitch, &pPBOData, nPBOPitch, g_pCudaModule->getModule(), g_kernelNV12toARGB, g_KernelSID);

if(g_KernelSID)

{

CUresult error;

error = cudaLaunchMyKernel ( pPBOData ,pPreviousDecodedFrame ,pResultFrame ,g_bFrameData, nWidth, nHeight);[/codebox]

cudaLaunchMyKernel :

[codebox]// CUDA kernel for frame post-processing.

extern "C"

__global__ void MyKernel(uint32 * decoded, unsigned char * previous, unsigned char * result)

{

int tx = threadIdx.x;

int ty = threadIdx.y;

int tz = threadIdx.z;

int bx = blockIdx.x;

int by = blockIdx.y;

//				16				16  x  16  = 256      80         4(bytes por pixel)

uint32 offset = (bx*blockDim.x + by*(blockDim.x*blockDim.y)*gridDim.x) * 4;//Esto ajusta el inicio de cada bloque... o eso creo...LOL

uint32 pixel_bloque = ty*(blockDim.x)  + tx;//Esto ajusta cada pixel en su bloque, sumandose al offset del bloque.

result[ offset + pixel_bloque + tz ] = abs(decoded[ offset + pixel_bloque + tz] - previous[ offset + pixel_bloque + tz]);

}

extern "C"

CUresult cudaLaunchMyKernel(CUdeviceptr d_DecodedFrame, CUdeviceptr d_PreviousFrame,CUdeviceptr d_ResultFrame,unsigned char * h_ResultFrame,uint32 width,uint32 * height )

{

unsigned int memSize = width * height * 4;

//CUcontext temp_ctx;

//cuCtxAttach(&temp_ctx,0);	

dim3 threadsPerBlock(16, 16, 3);// Bloques de 16x16 hilos, profundidad en z de 3 hilos para ajustar cada uno a una componente de color BGR

dim3 numBlocks(width/16,height/16);

MyKernel<<<numBlocks, threadsPerBlock>>>((unsigned char *)d_DecodedFrame, (unsigned char *) d_PreviousFrame, (unsigned char *) d_ResultFrame);

cudaThreadSynchronize();

//Copy the result form de device to de Host

CUresult cu;

cu = cuMemcpyDtoH(	height ,prueba,	sizeof(uint32)	);

if(cu != CUDA_SUCCESS) cout << "cudaLaunchMyKernel Error : " <<cu << endl;

//Swap actual frame with previous one

error = cudaMemcpy(&d_DecodedFrame,d_PreviousFrame,memSize,cudaMemcpyDeviceToDevice);

 //cuCtxDetach(temp_ctx);

return cu;

}[/codebox]

Thank you.