Questions about the interop of cuda and D3D11texture2D

Hi:
Now I mapped the D3D11texture2D to cuda, and then I convert the RGB data to YUV data,but the YUV data I got is wrong,anybody can tell me why? Here is the code.(here is not the complete code,but almost all the major parts have been shown)

global void rgb2yuvKernel(float* dstY, float* dstU, float* dstV)
{
int tid = (blockIdx.ygridDim.x + blockIdx.x)(blockDim.xblockDim.y) + threadIdx.yblockDim.y + threadIdx.x;
dstY[tid] = 0.299src[tid4]+0.587src[tid4+1]+0.114src[tid4+2];
dstU[tid] = -0.1687src[tid4]-0.3313src[tid4+1]+0.5src[tid4+2]+128;
dstV[tid] = 0.5src[tid4]-0.4187src[tid4+1]-0.0813src[tid4+2]+128;
}

extern “C”
void cudaRGB2YUV(cudaArray* cudaSrc, float* SrcY, float* SrcU, float* SrcV)
{
int uint = 16;
dim3 grid((2160 + uint - 1) / uint, (1200 + uint - 1) / uint);
dim3 block(uint, uint);
rgb2yuvKernel << < grid, block >> > (SrcY, SrcU, SrcV);
}

void main();
{
ID3D11Texture2D* pTexture2D;
D3DX11CreateTextureFromFile(
pDevice,
pImgFile,
&ImgLoadInfo,
NULL,
(ID3D11Resource *)&pTexture2D,
NULL);
cudaGraphicsResource
cudaResource;
cudaGraphicsD3D11RegisterResource(&cudaResource, pTexture2D, cudaGraphicsRegisterFlagsNone);
cudaError_t err = cudaGetLastError();
cudaGraphicsResourceSetMapFlags(cudaResource, cudaGraphicsMapFlagsNone);
err = cudaGetLastError();
cudaGraphicsMapResources(1, &cudaResource, 0);
err = cudaGetLastError();

cudaArray* mappedArrary;
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindFloat);
cudaMallocArray(&mappedArrary, &channelDesc, 2160, 1200);
cudaGraphicsSubResourceGetMappedArray(&mappedArrary, cudaResource, 0, 0);
err = cudaGetLastError();

    int* cudaSrcY, *cudaSrcU, *cudaSrcV;
int* cudaSrc;
int* cudaSrcYUV;


size_t sizesrc = 2160 * 1200 * 4;
size_t sizeyuv = 2160 * 1200 * 3;
size_t size = 2160 * 1200;
cudaMalloc((void**)&cudaSrc, sizesrc * sizeof(int));
cudaMalloc((void**)&cudaSrcYUV, sizeyuv * sizeof(int));
cudaMalloc((void**)&cudaSrcY, size * sizeof(int));
cudaMalloc((void**)&cudaSrcU, size * sizeof(int));
cudaMalloc((void**)&cudaSrcV, size * sizeof(int));

    cudaRGB2YUV(mappedArrary, cudaSrcY, cudaSrcU, cudaSrcV);

}