Performance problem with cudaMemcpyToArray

Maybe I have a performance problem with cudaMemcpyToArray.

I am trying to do some image processing with CUDA, and I wrote my code into a .dll (this part works fine now).

But I think I have a performance problem when I use cudaMemcpyToArray.

// Pair of image buffers passed by value to CIPDLL_ProcessImageData.
// NOTE(review): the struct's braces are missing in this post, presumably lost
// when the code was pasted; the declaration is not compilable as shown.
struct ImageDataForProcessing


	// Source image. It is the source of a cudaMemcpyHostToDevice copy of
	// 704*576*3 bytes in CIPDLL_ProcessImageData, so it must be a host pointer.
	unsigned char *pInImage;  //  704*576*3 bytes, per the poster

	// Not referenced in the visible part of CIPDLL_ProcessImageData;
	// presumably the result buffer -- TODO confirm against the full source.
	unsigned char *pOutImage;	//  704*576*3 bytes, per the poster


// DLL entry point exported with C linkage. The visible part of the body:
//   1. allocates two device buffers (704*576*3 and 704*576*4 bytes),
//   2. copies data.pInImage from host to device,
//   3. launches CopyFromUChar3ToUchar4 on the two buffers,
//   4. allocates a 704 x 576 uchar4 cudaArray and copies the second buffer into it.
// NOTE(review): the function's braces and the rest of its body are missing in
// this post; gridDim_2D, blockDim_2D and the kernel are defined elsewhere.
// NOTE(review): none of the CUDA return codes are checked, and no
// cudaFree/cudaFreeArray is visible here -- confirm the buffers are released
// (or allocated once and reused) in the full source.
extern "C" _declspec(dllexport) void

	CIPDLL_ProcessImageData( ImageDataForProcessing data )


// Device buffer that receives the raw input image (704*576*3 bytes).
unsigned char  *pcInputImage;

cudaMalloc( (void**) &pcInputImage,    704*576*3 );

// Device buffer written by CopyFromUChar3ToUchar4 (704*576*4 bytes).
unsigned char  *tempUChar4Image;

cudaMalloc( (void**) &tempUChar4Image,    704*576*4 );

// The exec. time of following call is about: ~0.03ms (poster's measurement)

cudaMemcpy(pcInputImage, data.pInImage, 704*576*3, cudaMemcpyHostToDevice);

// The exec. time of following call is also about: ~0.03ms (poster's measurement)
// NOTE(review): a kernel launch returns to the host before the kernel has
// finished, so this figure likely covers only the launch, not the execution.
// To time the kernel itself, synchronize (or use CUDA events) before stopping
// the timer -- the timing code is not shown, so confirm how it was measured.

CopyFromUChar3ToUchar4<<<gridDim_2D, blockDim_2D>>>(pcInputImage, tempUChar4Image);

// Destination array: 704 x 576 elements with a uchar4 channel format.
cudaArray      *pca_NewImage;

cudaChannelFormatDesc channelDescUchar4 = cudaCreateChannelDesc<uchar4>();

cudaMallocArray( &pca_NewImage, &channelDescUchar4, 704, 576 );

//The exec. time of following call is about: ~2.7ms (poster's measurement)
// NOTE(review): this copy reads tempUChar4Image, which the kernel above writes,
// so it cannot start until that kernel has completed. The ~2.7ms may therefore
// include the kernel's run time (and possibly earlier pending work) rather
// than the copy alone -- verify by synchronizing before starting this timer.

cudaMemcpyToArray(pca_NewImage, 0, 0, tempUChar4Image,  704*576*4, cudaMemcpyDeviceToDevice);



Does anyone have an idea why cudaMemcpyToArray needs so much more time compared to cudaMemcpy or the kernel call?

Thanks for your help!