Possible performance problem with cudaMemcpyToArray
I am trying to do some image processing with CUDA, and I have put my code into a .dll (this part works fine now).
However, I think I have a performance problem when I use cudaMemcpyToArray.
// Pair of image buffers passed (by value) to CIPDLL_ProcessImageData.
// Both buffers are annotated by the author as 704*576*3 bytes in size
// (presumably a 704x576 image with 3 bytes per pixel -- TODO confirm layout).
struct ImageDataForProcessing
{
unsigned char *pInImage; // input image, 704*576*3 bytes; read as the host-side source of the upload
unsigned char *pOutImage; // output image, 704*576*3 bytes; not touched in the visible part of the processing code
};
// Exported DLL entry point (C linkage). In the visible part it:
//   1. allocates two device buffers (704*576*3 and 704*576*4 bytes),
//   2. uploads data.pInImage into the first buffer,
//   3. launches CopyFromUChar3ToUchar4 on both buffers (judging by its name,
//      it expands 3-byte pixels to 4-byte pixels -- kernel body not shown),
//   4. allocates a 704x576 uchar4 cudaArray and copies the second buffer into it.
// The remainder of the function is elided ("...").
//
// data: passed by value; only data.pInImage is read in the visible part.
//
// NOTE(review): no CUDA return code is checked here and the kernel launch is
// not followed by cudaGetLastError(), so any failure would go unnoticed.
// NOTE(review): both buffers and the array are allocated on every call and no
// matching cudaFree/cudaFreeArray is visible -- confirm the elided part
// releases them, or hoist the allocations out of the per-image path.
extern "C" _declspec(dllexport) void
CIPDLL_ProcessImageData( ImageDataForProcessing data )
{
// Device buffer for the input image as uploaded (704*576*3 bytes).
unsigned char *pcInputImage;
cudaMalloc( (void**) &pcInputImage, 704*576*3 );
// Device buffer for the kernel output (704*576*4 bytes).
unsigned char *tempUChar4Image;
cudaMalloc( (void**) &tempUChar4Image, 704*576*4 );
// Host -> device upload of the input image.
// Author's measurement for this call: ~0.03 ms.
cudaMemcpy(pcInputImage, data.pInImage, 704*576*3, cudaMemcpyHostToDevice);
// Kernel launch; gridDim_2D and blockDim_2D are defined outside this snippet.
// Author's measurement for this call: ~0.03 ms.
// NOTE(review): kernel launches are asynchronous, so a host-side timer around
// this line most likely measures only the launch overhead, not the kernel's
// actual run time -- confirm with cudaEvent timing or a synchronize before
// stopping the timer.
CopyFromUChar3ToUchar4<<<gridDim_2D, blockDim_2D>>>(pcInputImage, tempUChar4Image);
// Destination array: 704 x 576 elements with a uchar4 channel format.
cudaArray *pca_NewImage;
cudaChannelFormatDesc channelDescUchar4 = cudaCreateChannelDesc<uchar4>();
cudaMallocArray( &pca_NewImage, &channelDescUchar4, 704, 576 );
// Device -> device copy of the 704*576*4-byte kernel output into the array,
// starting at offset (0, 0).
// Author's measurement for this call: ~2.7 ms.
// NOTE(review): this copy consumes the kernel's output, so it presumably has
// to wait for the still-running kernel to finish; the ~2.7 ms would then
// include the kernel's execution time rather than the copy alone -- verify by
// synchronizing (and timing) between the launch and this call.
// NOTE(review): cudaMemcpyToArray is reportedly deprecated in newer CUDA
// toolkits in favour of cudaMemcpy2DToArray -- confirm for the toolkit in use.
cudaMemcpyToArray(pca_NewImage, 0, 0, tempUChar4Image, 704*576*4, cudaMemcpyDeviceToDevice);
...
}
Does anyone have an idea why cudaMemcpyToArray takes so much more time than cudaMemcpy or the kernel call?
Thanks for your help!