How much time does cudaMemcpy() take?

I wrote a test application.

CPU: Pentium D 2.8
GPU: GF9600GT

source:

#define BLOCK_SIZE 16

__global__ void
My_NaturalNumber_Mult_device(UINT* t1, USHORT* b, USHORT* c)
{
// Block index
int bx = blockIdx.x;
int by = blockIdx.y;

// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;

int idx = bx * BLOCK_SIZE + tx;
int idy = by * BLOCK_SIZE + ty;

t1[idx * 64 + idy] = b[idx] * c[idy];	

}

extern "C" void
My_NaturalNumber_Mult(UINT * a, UINT * b, UINT * c, int argc, char** argv)
{
UINT t1[64 * 64];
USHORT* d_b;
USHORT* d_c;
UINT* d_t1;
int i, j;

CUT_DEVICE_INIT(argc, argv);

// allocate device memory
CUDA_SAFE_CALL(cudaMalloc((void**) &d_b, 64 * sizeof(USHORT)));
CUDA_SAFE_CALL(cudaMalloc((void**) &d_c, 64 * sizeof(USHORT)));	
CUDA_SAFE_CALL(cudaMalloc((void**) &d_t1, 64 * 64 * sizeof(UINT)));

// copy host memory to device
CUDA_SAFE_CALL(cudaMemcpy(d_b, b, 64 * sizeof(USHORT),
                          cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL(cudaMemcpy(d_c, c, 64 * sizeof(USHORT),
                          cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL(cudaMemcpy(d_t1, t1, 64 * 64 * sizeof(UINT),
                          cudaMemcpyHostToDevice) );
                          
unsigned int timer = 0;
CUT_SAFE_CALL( cutCreateTimer( &timer));
CUT_SAFE_CALL( cutStartTimer( timer));

// setup execution parameters
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
dim3 grid(32 / threads.x, 32 / threads.y);

// execute the kernel
My_NaturalNumber_Mult_device<<< grid, threads >>>(d_t1, d_b, d_c);

// check if kernel execution generated an error
CUT_CHECK_ERROR("Kernel execution failed");

// stop and destroy timer
CUT_SAFE_CALL(cutStopTimer(timer));
printf("Processing time: %f (ms) \n", cutGetTimerValue(timer));
CUT_SAFE_CALL(cutDeleteTimer(timer));

// copy result from device to host
CUDA_SAFE_CALL(cudaMemcpy(t1, d_t1, 64 * 64 * sizeof(UINT),
                          cudaMemcpyDeviceToHost) );

CUDA_SAFE_CALL(cudaFree(d_b));
CUDA_SAFE_CALL(cudaFree(d_c));
CUDA_SAFE_CALL(cudaFree(d_t1));

for(i = 0; i < 64; i++)
{
    for(j = 0; j < 64; j++)
    {
        printf("%d ", t1[i * 64 + j]);
    }
    printf("\n");
}

CUT_EXIT(argc, argv);

}

The processing time is 0.8 ms.

Then I modified the source so that the timer also covers the allocations and memory copies:

extern "C" void
My_NaturalNumber_Mult(UINT * a, UINT * b, UINT * c, int argc, char** argv)
{
UINT t1[64 * 64];
USHORT* d_b;
USHORT* d_c;
UINT* d_t1;
int i, j;

CUT_DEVICE_INIT(argc, argv);

unsigned int timer = 0;
CUT_SAFE_CALL( cutCreateTimer( &timer));
CUT_SAFE_CALL( cutStartTimer( timer));

// allocate device memory
CUDA_SAFE_CALL(cudaMalloc((void**) &d_b, 64 * sizeof(USHORT)));
CUDA_SAFE_CALL(cudaMalloc((void**) &d_c, 64 * sizeof(USHORT)));	
CUDA_SAFE_CALL(cudaMalloc((void**) &d_t1, 64 * 64 * sizeof(UINT)));

// copy host memory to device
CUDA_SAFE_CALL(cudaMemcpy(d_b, b, 64 * sizeof(USHORT),
                          cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL(cudaMemcpy(d_c, c, 64 * sizeof(USHORT),
                          cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL(cudaMemcpy(d_t1, t1, 64 * 64 * sizeof(UINT),
                          cudaMemcpyHostToDevice) );
                      
// setup execution parameters
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
dim3 grid(32 / threads.x, 32 / threads.y);

// execute the kernel
My_NaturalNumber_Mult_device<<< grid, threads >>>(d_t1, d_b, d_c);

// check if kernel execution generated an error
CUT_CHECK_ERROR("Kernel execution failed");

// copy result from device to host
CUDA_SAFE_CALL(cudaMemcpy(t1, d_t1, 64 * 64 * sizeof(UINT),
                          cudaMemcpyDeviceToHost) );

CUDA_SAFE_CALL(cudaFree(d_b));
CUDA_SAFE_CALL(cudaFree(d_c));
CUDA_SAFE_CALL(cudaFree(d_t1));

// stop and destroy timer
CUT_SAFE_CALL(cutStopTimer(timer));
printf("Processing time: %f (ms) \n", cutGetTimerValue(timer));
CUT_SAFE_CALL(cutDeleteTimer(timer));

for(i = 0; i < 64; i++)
{
    for(j = 0; j < 64; j++)
    {
        printf("%d ", t1[i * 64 + j]);
    }
    printf("\n");
}

CUT_EXIT(argc, argv);

}

The processing time is 60 ms.

Is there any mistake in the modified source?

Hi,

For the upper code version:

See Programming Guide 4.2.1.4: kernel launches are asynchronous, so control returns to the CPU and the timer stops before the kernel has actually finished. Try calling cudaThreadSynchronize() after your <<<>>> line, before stopping the timer.
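For example, a minimal sketch of how the timed region in the upper version could look with the synchronization added (same variable names as in your code):

// execute the kernel
My_NaturalNumber_Mult_device<<< grid, threads >>>(d_t1, d_b, d_c);

// check if kernel execution generated an error
CUT_CHECK_ERROR("Kernel execution failed");

// wait until the kernel has actually finished before stopping the timer
CUDA_SAFE_CALL(cudaThreadSynchronize());

// stop and destroy timer
CUT_SAFE_CALL(cutStopTimer(timer));
printf("Processing time: %f (ms) \n", cutGetTimerValue(timer));
CUT_SAFE_CALL(cutDeleteTimer(timer));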

For the lower code version:

Since device-to-host memcopies are blocking, there is no need for a cudaThreadSynchronize() there. Note that the 60 ms also includes the cudaMalloc calls, the host-to-device copies, and the copy back, not just the kernel.
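To answer the title question directly: because cudaMemcpy() with cudaMemcpyHostToDevice / cudaMemcpyDeviceToHost is blocking, a CPU-side timer around just the copy calls gives a valid measurement. A minimal sketch using the same cutil timer API (copyTimer is just an assumed name):

unsigned int copyTimer = 0;
CUT_SAFE_CALL(cutCreateTimer(&copyTimer));
CUT_SAFE_CALL(cutStartTimer(copyTimer));

// time only the host-to-device copies; each call returns after the copy is complete
CUDA_SAFE_CALL(cudaMemcpy(d_b, b, 64 * sizeof(USHORT), cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(d_c, c, 64 * sizeof(USHORT), cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(d_t1, t1, 64 * 64 * sizeof(UINT), cudaMemcpyHostToDevice));

CUT_SAFE_CALL(cutStopTimer(copyTimer));
printf("Memcpy time: %f (ms) \n", cutGetTimerValue(copyTimer));
CUT_SAFE_CALL(cutDeleteTimer(copyTimer));

That should show how much of the 60 ms is spent in the copies themselves, as opposed to the cudaMalloc calls and the kernel.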