HELP ! ! Depth Component opengl interop

Help Please,I created depth image in opengl and want to compare with another image using cuda

here is how I create texture

glGenTextures(1,&texture);

	glBindTexture(GL_TEXTURE_2D,texture);

	glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);

    glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);

	glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);

    glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);

    glTexParameteri(GL_TEXTURE_2D, GL_GENERATE_MIPMAP, GL_TRUE);

	glTexImage2D(GL_TEXTURE_2D, 0, GL_DEPTH_COMPONENT32F, TEXTURE_WIDTH, TEXTURE_HEIGHT, 0, GL_DEPTH_COMPONENT, GL_FLOAT, 0);

	glBindTexture(GL_TEXTURE_2D,0);

I register texture using cudaGraphicsGLRegisterImage()

however it crash when I want to copy my result back from computation, at line

cudaMemcpyToArray(dstArray,0,0,g_dstBuffer,bufferSize,cudaMemcpyDeviceToDevice);

the problem seem to the size of the array as I don’t know the data type of texture, I try random data type,but not successful Please Help !!

here is the full code

#include <cuda_runtime_api.h>

#include <cuda.h>

#include "cutil.h"

#include "cutil_inline_runtime.h"

#include "math_functions.h"

#define __cplusplus

#define __CUDACC__

#include "texture_fetch_functions.h"

#include "cuda_texture_types.h"

#define BLOCK_SIZE 16

float4* g_dstBuffer = NULL;

float4* g_srcBuffer = NULL;

size_t g_BufferSize =0;

texture<float4, cudaTextureType2D, cudaReadModeElementType> texRef;

__global__ void postProcessKernel(float4* dst, float4* src,unsigned int imgWidth, unsigned int imgHeight)

{

	int x = threadIdx.x + blockIdx.x * blockDim.x;

	int y = threadIdx.y + blockIdx.y * blockDim.y;

	if(x>imgWidth || y >imgHeight) return;

	unsigned int index = y*imgWidth + x;

	float4 tempcolor;

	float4 color = tex2D(texRef,x,y);

	tempcolor = color;

	dst[index] = tempcolor;

	__syncthreads();

	return;

}

void postProcessCUDA(cudaGraphicsResource_t &dst, cudaGraphicsResource_t &src1, cudaGraphicsResource_t &src2, unsigned int width, unsigned int height)

{

	cudaGraphicsResource_t resources[3] = {src1,src2,dst};

	 cudaGraphicsMapResources(3,resources) ;

	cudaArray* src1Array;

	cudaArray* src2Array;

	cudaArray* dstArray;

	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(4,0,0,0,cudaChannelFormatKindFloat);

	const textureReference* texRefPtr;

	cudaGetTextureReference(&texRefPtr,"texRef");

	 cudaGraphicsSubResourceGetMappedArray (&src1Array,src1,0,0);

	 cudaGraphicsSubResourceGetMappedArray (&src2Array,src2,0,0);

	 cudaGraphicsSubResourceGetMappedArray (&dstArray,dst,0,0);

	cudaBindTextureToArray( texRefPtr,src1Array,&channelDesc);

	size_t bufferSize = width*height*sizeof(float4);

	if(g_BufferSize != bufferSize)

	{

		if(g_dstBuffer !=NULL)

		{

			cudaFree(g_dstBuffer);

		}

		if(g_srcBuffer != NULL)

		{

			cudaFree(g_srcBuffer);

		}

		g_BufferSize = bufferSize;

		cudaMalloc((void**) &g_dstBuffer, g_BufferSize);

		cudaMalloc((void**) &g_srcBuffer,g_BufferSize);

	}

	size_t blocksW = (size_t) ceilf (width /(float) BLOCK_SIZE );

	size_t blocksH = (size_t) ceilf( height / (float) BLOCK_SIZE);

	dim3 gridDim( blocksW,blocksH,1);

	dim3 blockDim(BLOCK_SIZE,BLOCK_SIZE,1);

	//cudaMemcpyFromArray(g_srcBuffer,src1Array,0,0,bufferSize,cudaMemcpyDeviceToDevice);

	postProcessKernel<<<gridDim,blockDim>>>(g_dstBuffer,g_srcBuffer,width,height);

	cudaError_t b;

	cudaMemcpyToArray(dstArray,0,0,g_dstBuffer,bufferSize,cudaMemcpyDeviceToDevice);

	cudaUnbindTexture (texRefPtr);

	cudaGraphicsUnmapResources(3,resources);

	return ;

}