Performance problem with cudaMemcpyToArray

I may have a performance problem with cudaMemcpyToArray.

I am trying to do some image processing with CUDA, and I have put my code into a .dll (this part works fine now).

However, I think I have a performance problem when I use cudaMemcpyToArray.

/* Pair of image buffer pointers passed to CIPDLL_ProcessImageData. */
struct ImageDataForProcessing {
	unsigned char *pInImage;   /* input image buffer:  704*576*3 bytes */
	unsigned char *pOutImage;  /* output image buffer: 704*576*3 bytes */
};

// Exported DLL entry point with C linkage.
//
// Visible steps: upload the input image to the device, run the
// CopyFromUChar3ToUchar4 kernel on it, then copy the kernel's output buffer
// into a newly allocated uchar4 cudaArray. The remainder of the body is elided
// ("...") in this excerpt.
//
// data: passed by value; in the visible code only data.pInImage is read, as
//       the host-side source of a 704*576*3-byte upload.
//
// NOTE(review): none of the CUDA runtime return codes below are checked, and
// the kernel launch is not followed by cudaGetLastError().
// NOTE(review): no cudaFree / cudaFreeArray is visible for the three
// allocations made here; whether the elided part releases them cannot be
// told from this excerpt. If not, every call allocates anew.
extern "C" _declspec(dllexport) void

	CIPDLL_ProcessImageData( ImageDataForProcessing data )

{

// Device buffer receiving the input image (704*576*3 bytes).
unsigned char  *pcInputImage;

cudaMalloc( (void**) &pcInputImage,    704*576*3 );

// Device buffer receiving the kernel output (704*576*4 bytes).
unsigned char  *tempUChar4Image;

cudaMalloc( (void**) &tempUChar4Image,    704*576*4 );

// The exec. time of following call is about: ~0.03ms
// (author's measurement; how it was timed is not shown)

cudaMemcpy(pcInputImage, data.pInImage, 704*576*3, cudaMemcpyHostToDevice);

// The exec. time of following call is also about: ~0.03ms
// NOTE(review): kernel launches are asynchronous with respect to the host, so
// a host-side timer around this line presumably captures only launch overhead,
// not kernel execution, unless a synchronization precedes the timer stop.
// TODO confirm how the time was taken.
// gridDim_2D / blockDim_2D are defined outside this excerpt.

CopyFromUChar3ToUchar4<<<gridDim_2D, blockDim_2D>>>(pcInputImage, tempUChar4Image);

// Destination array: 704 x 576 elements with a uchar4 channel format.
cudaArray      *pca_NewImage;

cudaChannelFormatDesc channelDescUchar4 = cudaCreateChannelDesc<uchar4>();

cudaMallocArray( &pca_NewImage, &channelDescUchar4, 704, 576 );

//The exec. time of following call is about: ~2.7ms
// NOTE(review): this copy reads the buffer the kernel above writes, so it has
// to wait for that kernel; the ~2.7ms likely includes the kernel's execution
// time in addition to the copy itself. Timing with cudaEvent records or after
// an explicit synchronization would separate the two; verify.
// NOTE(review): cudaMemcpyToArray is marked deprecated in recent CUDA
// toolkits (cudaMemcpy2DToArray is the usual replacement); confirm against
// the toolkit version in use.

cudaMemcpyToArray(pca_NewImage, 0, 0, tempUChar4Image,  704*576*4, cudaMemcpyDeviceToDevice);

...

}

Does anyone have an idea why cudaMemcpyToArray takes so much more time than cudaMemcpy or the kernel call?

Thanks for your help!

http://forums.nvidia.com/index.php?showtop…damemcpytoarray