I wrote a test application.
CPU: Pentium D 2.8
GPU: GF9600GT
source:
#define BLOCK_SIZE 16
global void
My_NaturalNumber_Mult_device(UINT* t1, USHORT* b, USHORT* c)
{
// Block index
int bx = blockIdx.x;
int by = blockIdx.y;
// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;
int idx = bx * BLOCK_SIZE + tx;
int idy = by * BLOCK_SIZE + ty;
t1[idx * 64 + idy] = b[idx] * c[idy];
}
extern “C” void
My_NaturalNumber_Mult(UINT * a, UINT * b, UINT * c, int argc, char* argv)
{
UINT t1[64 * 64];
USHORT* d_b;
USHORT* d_c;
UINT* d_t1;
int i, j;
CUT_DEVICE_INIT(argc, argv);
// allocate device memory
CUDA_SAFE_CALL(cudaMalloc((void**) &d_b, 64 * sizeof(USHORT)));
CUDA_SAFE_CALL(cudaMalloc((void**) &d_c, 64 * sizeof(USHORT)));
CUDA_SAFE_CALL(cudaMalloc((void**) &d_t1, 64 * 64 * sizeof(UINT)));
// copy host memory to device
CUDA_SAFE_CALL(cudaMemcpy(d_b, b, 64 * sizeof(USHORT),
cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL(cudaMemcpy(d_c, c, 64 * sizeof(USHORT),
cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL(cudaMemcpy(d_t1, t1, 64 * 64 * sizeof(UINT),
cudaMemcpyHostToDevice) );
unsigned int timer = 0;
CUT_SAFE_CALL( cutCreateTimer( &timer));
CUT_SAFE_CALL( cutStartTimer( timer));
// setup execution parameters
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
dim3 grid(32 / threads.x, 32 / threads.y);
// execute the kernel
My_NaturalNumber_Mult_device<<< grid, threads >>>(d_t1, d_b, d_c);
// check if kernel execution generated and error
CUT_CHECK_ERROR("Kernel execution failed");
// stop and destroy timer
CUT_SAFE_CALL(cutStopTimer(timer));
printf("Processing time: %f (ms) \n", cutGetTimerValue(timer));
CUT_SAFE_CALL(cutDeleteTimer(timer));
// copy result from device to host
CUDA_SAFE_CALL(cudaMemcpy(t1, d_t1, 64 * 64 * sizeof(UINT),
cudaMemcpyDeviceToHost) );
CUDA_SAFE_CALL(cudaFree(d_b));
CUDA_SAFE_CALL(cudaFree(d_c));
CUDA_SAFE_CALL(cudaFree(d_t1));
for(i = 0; i < 64; i++)
{
for(j = 0; j < 64; j++)
{
printf("%d ", t1[i * 64 + j]);
}
printf("\n");
}
CUT_EXIT(argc, argv);
}
Processing time is 0.8ms.
And i modify the source:
extern “C” void
My_NaturalNumber_Mult(UINT * a, UINT * b, UINT * c, int argc, char* argv)
{
UINT t1[64 * 64];
USHORT* d_b;
USHORT* d_c;
UINT* d_t1;
int i, j;
CUT_DEVICE_INIT(argc, argv);
unsigned int timer = 0;
CUT_SAFE_CALL( cutCreateTimer( &timer));
CUT_SAFE_CALL( cutStartTimer( timer));
// allocate device memory
CUDA_SAFE_CALL(cudaMalloc((void**) &d_b, 64 * sizeof(USHORT)));
CUDA_SAFE_CALL(cudaMalloc((void**) &d_c, 64 * sizeof(USHORT)));
CUDA_SAFE_CALL(cudaMalloc((void**) &d_t1, 64 * 64 * sizeof(UINT)));
// copy host memory to device
CUDA_SAFE_CALL(cudaMemcpy(d_b, b, 64 * sizeof(USHORT),
cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL(cudaMemcpy(d_c, c, 64 * sizeof(USHORT),
cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL(cudaMemcpy(d_t1, t1, 64 * 64 * sizeof(UINT),
cudaMemcpyHostToDevice) );
// setup execution parameters
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
dim3 grid(32 / threads.x, 32 / threads.y);
// execute the kernel
My_NaturalNumber_Mult_device<<< grid, threads >>>(d_t1, d_b, d_c);
// check if kernel execution generated and error
CUT_CHECK_ERROR("Kernel execution failed");
// copy result from device to host
CUDA_SAFE_CALL(cudaMemcpy(t1, d_t1, 64 * 64 * sizeof(UINT),
cudaMemcpyDeviceToHost) );
CUDA_SAFE_CALL(cudaFree(d_b));
CUDA_SAFE_CALL(cudaFree(d_c));
CUDA_SAFE_CALL(cudaFree(d_t1));
// stop and destroy timer
CUT_SAFE_CALL(cutStopTimer(timer));
printf("Processing time: %f (ms) \n", cutGetTimerValue(timer));
CUT_SAFE_CALL(cutDeleteTimer(timer));
for(i = 0; i < 64; i++)
{
for(j = 0; j < 64; j++)
{
printf("%d ", t1[i * 64 + j]);
}
printf("\n");
}
CUT_EXIT(argc, argv);
}
Processing time is 60ms.
Is there any mistake in the modified source?