How much time does cudaMemcpy() take?

I wrote a test application.

CPU: Pentium D 2.8
GPU: GF9600GT

source:

#define BLOCK_SIZE 16

__global__ void
My_NaturalNumber_Mult_device(UINT* t1, USHORT* b, USHORT* c)
{
// Block index
int bx = blockIdx.x;
int by = blockIdx.y;

// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;

int idx = bx * BLOCK_SIZE + tx;
int idy = by * BLOCK_SIZE + ty;

t1[idx * 64 + idy] = b[idx] * c[idy];	

}

extern "C" void
My_NaturalNumber_Mult(UINT * a, UINT * b, UINT * c, int argc, char** argv)
{
UINT t1[64 * 64];
USHORT* d_b;
USHORT* d_c;
UINT* d_t1;
int i, j;

CUT_DEVICE_INIT(argc, argv);

// allocate device memory
CUDA_SAFE_CALL(cudaMalloc((void**) &d_b, 64 * sizeof(USHORT)));
CUDA_SAFE_CALL(cudaMalloc((void**) &d_c, 64 * sizeof(USHORT)));	
CUDA_SAFE_CALL(cudaMalloc((void**) &d_t1, 64 * 64 * sizeof(UINT)));

// copy host memory to device
CUDA_SAFE_CALL(cudaMemcpy(d_b, b, 64 * sizeof(USHORT),
                          cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL(cudaMemcpy(d_c, c, 64 * sizeof(USHORT),
                          cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL(cudaMemcpy(d_t1, t1, 64 * 64 * sizeof(UINT),
                          cudaMemcpyHostToDevice) );
                          
unsigned int timer = 0;
CUT_SAFE_CALL( cutCreateTimer( &timer));
CUT_SAFE_CALL( cutStartTimer( timer));

// setup execution parameters
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
dim3 grid(32 / threads.x, 32 / threads.y);

// execute the kernel
My_NaturalNumber_Mult_device<<< grid, threads >>>(d_t1, d_b, d_c);

// check if kernel execution generated an error
CUT_CHECK_ERROR("Kernel execution failed");

// stop and destroy timer
CUT_SAFE_CALL(cutStopTimer(timer));
printf("Processing time: %f (ms) \n", cutGetTimerValue(timer));
CUT_SAFE_CALL(cutDeleteTimer(timer));

// copy result from device to host
CUDA_SAFE_CALL(cudaMemcpy(t1, d_t1, 64 * 64 * sizeof(UINT),
                          cudaMemcpyDeviceToHost) );

CUDA_SAFE_CALL(cudaFree(d_b));
CUDA_SAFE_CALL(cudaFree(d_c));
CUDA_SAFE_CALL(cudaFree(d_t1));

for(i = 0; i < 64; i++)
{
    for(j = 0; j < 64; j++)
    {
        printf("%d ", t1[i * 64 + j]);
    }
    printf("\n");
}

CUT_EXIT(argc, argv);

}

The processing time is 0.8 ms.

Then I modified the source so that the timer also covers the allocations and memory copies:

extern "C" void
My_NaturalNumber_Mult(UINT * a, UINT * b, UINT * c, int argc, char** argv)
{
UINT t1[64 * 64];
USHORT* d_b;
USHORT* d_c;
UINT* d_t1;
int i, j;

CUT_DEVICE_INIT(argc, argv);

unsigned int timer = 0;
CUT_SAFE_CALL( cutCreateTimer( &timer));
CUT_SAFE_CALL( cutStartTimer( timer));

// allocate device memory
CUDA_SAFE_CALL(cudaMalloc((void**) &d_b, 64 * sizeof(USHORT)));
CUDA_SAFE_CALL(cudaMalloc((void**) &d_c, 64 * sizeof(USHORT)));	
CUDA_SAFE_CALL(cudaMalloc((void**) &d_t1, 64 * 64 * sizeof(UINT)));

// copy host memory to device
CUDA_SAFE_CALL(cudaMemcpy(d_b, b, 64 * sizeof(USHORT),
                          cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL(cudaMemcpy(d_c, c, 64 * sizeof(USHORT),
                          cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL(cudaMemcpy(d_t1, t1, 64 * 64 * sizeof(UINT),
                          cudaMemcpyHostToDevice) );
                      
// setup execution parameters
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
dim3 grid(32 / threads.x, 32 / threads.y);

// execute the kernel
My_NaturalNumber_Mult_device<<< grid, threads >>>(d_t1, d_b, d_c);

// check if kernel execution generated an error
CUT_CHECK_ERROR("Kernel execution failed");

// copy result from device to host
CUDA_SAFE_CALL(cudaMemcpy(t1, d_t1, 64 * 64 * sizeof(UINT),
                          cudaMemcpyDeviceToHost) );

CUDA_SAFE_CALL(cudaFree(d_b));
CUDA_SAFE_CALL(cudaFree(d_c));
CUDA_SAFE_CALL(cudaFree(d_t1));

// stop and destroy timer
CUT_SAFE_CALL(cutStopTimer(timer));
printf("Processing time: %f (ms) \n", cutGetTimerValue(timer));
CUT_SAFE_CALL(cutDeleteTimer(timer));

for(i = 0; i < 64; i++)
{
    for(j = 0; j < 64; j++)
    {
        printf("%d ", t1[i * 64 + j]);
    }
    printf("\n");
}

CUT_EXIT(argc, argv);

}

The processing time is 60 ms.

Is there any mistake in the modified source?

Hi,

For the upper code version:

See Programming Guide 4.2.1.4: kernel launches are asynchronous, so control returns to the CPU and the timer stops before the kernel has actually finished. Try calling cudaThreadSynchronize() after your <<<>>> line, before stopping the timer.
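For example, a minimal sketch of how the timed region in the upper version could look with the synchronization added (same variable names as in your code):

// execute the kernel
My_NaturalNumber_Mult_device<<< grid, threads >>>(d_t1, d_b, d_c);

// check if kernel execution generated an error
CUT_CHECK_ERROR("Kernel execution failed");

// wait until the kernel has actually finished before stopping the timer
CUDA_SAFE_CALL(cudaThreadSynchronize());

// stop and destroy timer
CUT_SAFE_CALL(cutStopTimer(timer));
printf("Processing time: %f (ms) \n", cutGetTimerValue(timer));
CUT_SAFE_CALL(cutDeleteTimer(timer));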

For the lower code version:

Since device-to-host memcopies are blocking, there is no need for a cudaThreadSynchronize() there. Note that the 60 ms also includes the cudaMalloc calls, the host-to-device copies, and the copy back, not just the kernel.
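To answer the title question directly: because cudaMemcpy() with cudaMemcpyHostToDevice / cudaMemcpyDeviceToHost is blocking, a CPU-side timer around just the copy calls gives a valid measurement. A minimal sketch using the same cutil timer API (copyTimer is just an assumed name):

unsigned int copyTimer = 0;
CUT_SAFE_CALL(cutCreateTimer(&copyTimer));
CUT_SAFE_CALL(cutStartTimer(copyTimer));

// time only the host-to-device copies; each call returns after the copy is complete
CUDA_SAFE_CALL(cudaMemcpy(d_b, b, 64 * sizeof(USHORT), cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(d_c, c, 64 * sizeof(USHORT), cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(d_t1, t1, 64 * 64 * sizeof(UINT), cudaMemcpyHostToDevice));

CUT_SAFE_CALL(cutStopTimer(copyTimer));
printf("Memcpy time: %f (ms) \n", cutGetTimerValue(copyTimer));
CUT_SAFE_CALL(cutDeleteTimer(copyTimer));

That should show how much of the 60 ms is spent in the copies themselves, as opposed to the cudaMalloc calls and the kernel.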