CUDA: device-to-host cudaMemcpy much slower than host-to-device

I am learning CUDA programming, so I wrote an image-conversion program. The problem is with cudaMemcpy: I measure the time for allocation, the host-to-device cudaMemcpy, the computation, and the device-to-host cudaMemcpy, and the device-to-host cudaMemcpy takes far longer than everything else. Roughly speaking, if the host-to-device memcpy takes 1 second, the device-to-host memcpy takes 10 seconds.

The program is below:

#include <stdio.h>
#include <windows.h>
#include <cuda.h>
#include <cuda_runtime.h>

// The NV12ToARGB kernel, the uint8 typedef, and the srcY/srcU/srcV and
// dstR/dstG/dstB buffers are declared elsewhere in the project.
CUresult cudaLaunchNV12toARGB(uint8 *d_srcY, uint8 *d_srcU, uint8 *d_srcV,
                              uint8 *d_dstR, uint8 *d_dstG, uint8 *d_dstB,
                              int srcline, int height, int dstline)
{
    cudaError_t error = cudaSuccess;

    // Buffer sizes for the conversion (src: 670x346, dst: 640x346)
    const int srcsize = srcline * height * sizeof(char);
    const int dstsize = dstline * height * sizeof(char);

    // Files for logging the measured times
    FILE *pf1, *pf2, *pf3;
    pf1 = fopen("first time.txt", "a");
    pf2 = fopen("second time.txt", "a");
    pf3 = fopen("third time.txt", "a");

    // Variables for timing
    LARGE_INTEGER counter;
    static LARGE_INTEGER initial;
    static LARGE_INTEGER freq;
    double StartQuerytime, EndQuerytime;

    // host -> device memcpy
    cudaMemcpy(srcY, d_srcY, srcsize,   cudaMemcpyHostToDevice);
    cudaMemcpy(srcU, d_srcU, srcsize/2, cudaMemcpyHostToDevice);
    cudaMemcpy(srcV, d_srcV, srcsize/4, cudaMemcpyHostToDevice);

    dim3 block(16, 16);
    dim3 grid(dstline/16, height/16);

    // Conversion kernel
    NV12ToARGB<<<grid, block>>>(srcY, srcU, srcV, dstR, dstG, dstB, srcline, height, dstline);

    // device -> host memcpy, smallest size
    // Summed over 1500 iterations this takes 7~8 sec.
    // The first device-to-host memcpy is the one that takes so long:
    // even a 1-byte memcpy here takes 7~8 sec.
    // What is the problem??? I don't understand why I'm getting this result.
    QueryPerformanceCounter(&initial);
    QueryPerformanceCounter(&counter);
    QueryPerformanceFrequency(&freq);
    StartQuerytime = ((double)(counter.QuadPart - initial.QuadPart)) / ((double)freq.QuadPart);

    cudaMemcpy(d_dstB, dstB, dstsize/4, cudaMemcpyDeviceToHost);

    QueryPerformanceCounter(&counter);
    QueryPerformanceFrequency(&freq);
    EndQuerytime = ((double)(counter.QuadPart - initial.QuadPart)) / ((double)freq.QuadPart);
    fprintf(pf1, "%lf\n", (double)(EndQuerytime - StartQuerytime));
    fclose(pf1);

    // device -> host memcpy, middle size
    // Summed over 1500 iterations this takes 0.2 sec.
    QueryPerformanceCounter(&initial);
    QueryPerformanceCounter(&counter);
    QueryPerformanceFrequency(&freq);
    StartQuerytime = ((double)(counter.QuadPart - initial.QuadPart)) / ((double)freq.QuadPart);

    cudaMemcpy(d_dstG, dstG, dstsize/2, cudaMemcpyDeviceToHost);

    QueryPerformanceCounter(&counter);
    QueryPerformanceFrequency(&freq);
    EndQuerytime = ((double)(counter.QuadPart - initial.QuadPart)) / ((double)freq.QuadPart);
    fprintf(pf2, "%lf\n", (double)(EndQuerytime - StartQuerytime));
    fclose(pf2);

    // device -> host memcpy, biggest size
    // Summed over 1500 iterations this takes 0.3 sec.
    QueryPerformanceCounter(&initial);
    QueryPerformanceCounter(&counter);
    QueryPerformanceFrequency(&freq);
    StartQuerytime = ((double)(counter.QuadPart - initial.QuadPart)) / ((double)freq.QuadPart);

    cudaMemcpy(d_dstR, dstR, dstsize, cudaMemcpyDeviceToHost);

    QueryPerformanceCounter(&counter);
    QueryPerformanceFrequency(&freq);
    EndQuerytime = ((double)(counter.QuadPart - initial.QuadPart)) / ((double)freq.QuadPart);
    fprintf(pf3, "%lf\n", (double)(EndQuerytime - StartQuerytime));
    fclose(pf3);

    error = cudaGetLastError();
    if (error != cudaSuccess) {
        printf("failed! -> NV12ToARGB grid=(%d,%d), block(%d,%d)\n", grid.x, grid.y, block.x, block.y);
    }

    return ((error == cudaSuccess) ? CUDA_SUCCESS : CUDA_ERROR_UNKNOWN);
}

You're including the time it takes to execute the kernel in the timing of your first device-to-host memcpy. Add a cudaThreadSynchronize() after your kernel call.
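
For example, a minimal sketch of the change, reusing the buffer names, sizes, and QueryPerformanceCounter bookkeeping from your code (on newer CUDA releases cudaDeviceSynchronize() is the non-deprecated equivalent):

// Kernel launch is asynchronous: control returns to the CPU immediately.
NV12ToARGB<<<grid, block>>>(srcY, srcU, srcV, dstR, dstG, dstB, srcline, height, dstline);

// Block until the kernel has actually finished, so its execution time
// is not charged to the first device-to-host copy below.
cudaThreadSynchronize();   // cudaDeviceSynchronize() in newer CUDA versions

// Now the timer measures only the copy itself.
QueryPerformanceCounter(&initial);
QueryPerformanceCounter(&counter);
QueryPerformanceFrequency(&freq);
StartQuerytime = ((double)(counter.QuadPart - initial.QuadPart)) / ((double)freq.QuadPart);

cudaMemcpy(d_dstB, dstB, dstsize/4, cudaMemcpyDeviceToHost);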

(man, it seems like I answer this question at least 5 times a day)

Asynchronous function calls are kind of counter-intuitive at first. :)
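
A cleaner alternative, if you want the kernel time and the copy time reported separately, is to time with CUDA events instead of QueryPerformanceCounter; events are recorded on the GPU, so they are not fooled by the asynchronous launch. A minimal sketch, reusing the buffer names and sizes from your code:

cudaEvent_t start, afterKernel, afterCopy;
cudaEventCreate(&start);
cudaEventCreate(&afterKernel);
cudaEventCreate(&afterCopy);

cudaEventRecord(start, 0);
NV12ToARGB<<<grid, block>>>(srcY, srcU, srcV, dstR, dstG, dstB, srcline, height, dstline);
cudaEventRecord(afterKernel, 0);                            // marks end of the kernel
cudaMemcpy(d_dstB, dstB, dstsize/4, cudaMemcpyDeviceToHost);
cudaEventRecord(afterCopy, 0);                              // marks end of the copy
cudaEventSynchronize(afterCopy);                            // wait for everything above

float kernelMs = 0.0f, copyMs = 0.0f;
cudaEventElapsedTime(&kernelMs, start, afterKernel);        // kernel time only
cudaEventElapsedTime(&copyMs, afterKernel, afterCopy);      // device-to-host copy time only
printf("kernel: %.3f ms, copy: %.3f ms\n", kernelMs, copyMs);

cudaEventDestroy(start);
cudaEventDestroy(afterKernel);
cudaEventDestroy(afterCopy);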