Color conversion differs on 8400GS and 9500GT: NV12 to YUV420 (source code included)

I have developed a simple kernel that converts NV12 to YUV420.

It works fine on my GeForce 9500 GT.

However, the same simple code running on a GeForce 8400 GS gives me a texture that the viewer displays as a green field.

First, I checked the picture dimensions against the thread block size and grid size limits.

They are not exceeded at 320x240 resolution.

When I compared the chip capabilities, I noticed that the number of multiprocessors differs: 4 vs. 1, respectively.

But that shouldn't matter, I think.

I feel that I have missed something important that prevents me from getting the same result on both cards.

Here is the kernel code. Is something wrong with it?

[codebox]#include <stdio.h>

#include <stdlib.h>

#include <cuda_runtime.h>

#include <cutil.h>

#include <nvcuvid.h>

/*
 * Repacks one NV12 frame (pitched luma plane followed by a pitched plane of
 * interleaved chroma bytes) into a tightly packed planar YUV420 buffer.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per OUTPUT byte. Threads
 * whose global index is >= TotalThreads do nothing, so the grid may be
 * rounded up to a whole number of blocks.
 *
 * TotalThreads  number of output bytes to produce (the host wrapper passes
 *               width*height + width*height/2)
 * src           device address of the NV12 frame; the chroma plane is read
 *               starting at byte offset pitch*height
 * dst           device address of the output buffer (no row padding)
 * width/height  luma dimensions in pixels; the chroma indexing below assumes
 *               width is even
 * pitch         byte stride between consecutive rows of the source planes
 *
 * Output layout: width*height luma bytes, then width*height/4 bytes taken
 * from the even positions of each source chroma row, then width*height/4
 * bytes taken from the odd positions.
 */
__global__ void NV12toYUV420_gpu(int TotalThreads, CUdeviceptr src, CUdeviceptr dst, int width, int height, int pitch)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx >= TotalThreads) return;

    const char *Src = (const char *)src;
    char *Dst = (char *)dst;

    int inputYsize = pitch * height;  // bytes occupied by the padded source luma plane
    int outYsize = width * height;    // bytes occupied by the packed output luma plane

    // copy intensity
    if (idx < outYsize)
    {
        int lineNo = idx / width;
        int Hposition = idx - lineNo * width;

        Dst[idx] = Src[lineNo * pitch + Hposition];

        // A luma thread must stop here. Falling through would make inI
        // negative, read before the chroma plane and write a stray byte
        // back into the luma plane, racing with the real luma writes.
        return;
    }

    // copy color components
    int inI = idx - outYsize;  // byte index inside the (unpadded) interleaved chroma data

    // Ignore threads past the end of the chroma data when the caller
    // launches more threads than the frame has bytes.
    if (inI >= outYsize / 2) return;

    int outI = inI >> 1;  // offset inside one output chroma plane (YUV420 format)

    int lineN = outI / (width / 2);              // source chroma row
    int Horiz = inI - lineN * width;             // byte position inside that row
    int inColorPos = Horiz + lineN * pitch;      // offset in input field, NV12 format

    // Odd interleaved bytes belong to the second output chroma plane.
    if (inI & 1)
    {
        outI += outYsize / 4;
    }

    Dst[outYsize + outI] = Src[inputYsize + inColorPos];
}




// Source texture lies in GPU memory

/*
 * Converts an NV12 frame that already resides in device memory into a packed
 * planar YUV420 image in host memory.
 *
 * src     device address of the NV12 frame (rows are `pitch` bytes apart)
 * dst     host buffer receiving width*height + width*height/2 bytes
 * width   luma width in pixels
 * height  luma height in pixels
 * pitch   byte stride of the source rows; must be >= width
 *
 * Returns CUDA_SUCCESS on success, CUDA_ERROR_INVALID_VALUE for unusable
 * arguments, CUDA_ERROR_OUT_OF_MEMORY if the temporary device buffer cannot
 * be allocated, CUDA_ERROR_LAUNCH_FAILED if the kernel launch or execution
 * reports an error, or whatever cuMemcpyDtoH returns for the final copy.
 */
CUresult NV12toYUV420(CUdeviceptr src, char * dst, int width, int height, int pitch)
{
    const int threadsPerBlock = 256;

    if (!dst || width <= 0 || height <= 0 || pitch < width)
        return CUDA_ERROR_INVALID_VALUE;

    int Ysize = width * height;
    int Size = Ysize + (Ysize / 2);  // packed luma plane + two quarter-size chroma planes

    CUresult result = CUDA_SUCCESS;

    // Allocate through a genuine void* instead of casting &CUdeviceptr to
    // void**: the two types are not guaranteed to have the same size.
    void *devBuf = 0;
    CUDA_SAFE_CALL( cudaMalloc(&devBuf, Size) );
    if (!devBuf) return CUDA_ERROR_OUT_OF_MEMORY;

    CUdeviceptr b = (CUdeviceptr)(size_t)devBuf;

    dim3 block;
    dim3 grid;
    block.x = threadsPerBlock;
    grid.x = (Size + threadsPerBlock - 1) / threadsPerBlock;  // ceil-div: one thread per output byte

    NV12toYUV420_gpu<<<grid, block>>>(Size, src, b, width, height, pitch);

    // A launch does not return a status itself: ask for the launch error
    // first, then wait for the kernel so execution errors surface here too.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
        status = cudaThreadSynchronize();

    if (status != cudaSuccess)
        result = CUDA_ERROR_LAUNCH_FAILED;
    else
        result = cuMemcpyDtoH(dst, b, Size);

    // Release the temporary buffer on every path that allocated it.
    CUDA_SAFE_CALL( cudaFree(devBuf) );

    return result;
}



Thanks to everybody for the help.