Hi all,
In order to use the GPU for a project, I wrote a simple test to compare it with the CPU. The target task is to convert a string into an integer.
Here is the code (please let me know if you can see any errors).
//
// main.cu
//
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Error handling function and defines
// ===================================
//
static void HandleError(const char *cmd, cudaError_t err, const char *file, int line)
{
if(err != cudaSuccess)
{
printf("%s in %s at line %d\n %s\n", cudaGetErrorString(err), file, line, cmd);
exit(EXIT_FAILURE);
}
}
#define HANDLE_ERROR(err) (HandleError(#err, err, __FILE__, __LINE__))
// Timing struct and functions
// ===========================
// Used to measure GPU elapsed time
//
typedef struct cu_timer
{
cudaEvent_t _start;
cudaEvent_t _stop;
cu_timer();
void start();
float stop();
} cu_timer_t;
cu_timer::cu_timer()
{
cudaEventCreate(&_start);
cudaEventCreate(&_stop);
}
void cu_timer::start()
{
cudaEventRecord(_start, 0);
}
float cu_timer::stop()
{
cudaEventRecord(_stop, 0);
cudaEventSynchronize(_stop);
float elapsedTime = 0.0f;
cudaEventElapsedTime(&elapsedTime, _start, _stop);
cudaEventDestroy(_start);
cudaEventDestroy(_stop);
return elapsedTime;
}
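// Note: stop() destroys the events, so each cu_timer can time only one
// start()/stop() interval.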
// CUDA Kernel and device functions
// ================================
//
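// cu_atoi mimics the standard atoi: it skips leading spaces, tabs and
// newlines, handles an optional sign and stops at the first non-digit,
// e.g. "  -123" -> -123 and "42abc" -> 42.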
__host__ __device__ int cu_atoi(const char *str)
{
int i = 0;
int result = 0;
int sign = 1;
// Skip spaces, tabs and newlines
for(; str[i]==' ' || str[i]=='\n' || str[i]=='\t' ; i++);
// Check sign
if(str[i]=='+' || str[i]=='-')
sign = (str[i++]=='+') ? 1 : -1;
// Calculate number
for(; str[i]>='0' && str[i]<='9' ; i++)
result = 10*result + (str[i]-'0');
return sign * result;
}
__global__ void atoi_kernel(char *data, int *results, long data_num, int data_size)
{
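// Grid-stride loop: each thread handles the strings at index tid,
// tid + blockDim.x*gridDim.x, tid + 2*blockDim.x*gridDim.x, ... so any
// data_num is covered regardless of the launch configuration.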
for(long tid=threadIdx.x + blockIdx.x * blockDim.x ; tid<data_num ; tid += blockDim.x * gridDim.x)
{
const char *ptr = data + tid*data_size;
results[tid] = cu_atoi(ptr);
}
}
// Results verification function
// =============================
// Returns 0 if no errors.
//
int verify_results(char *buffer, int data_size, long data_num, int *results)
{
int result = EXIT_SUCCESS;
for(long i=0 ; i<data_num ; i++)
{
char *ptr = buffer + i*data_size;
if(results[i] != (int)atoi(ptr))
{
result = EXIT_FAILURE;
break;
}
}
return result;
}
// Host function equivalent to kernel
// ==================================
//
void atoi_host(char *data, int *results, long data_num, int data_size)
{
for(long tid=0 ; tid<data_num ; tid++)
{
char *ptr = data + tid*data_size;
results[tid] = cu_atoi(ptr);
}
}
#define DEFAULT_DATA_SIZE (12)
#define DEFAULT_DATA_NUM (65535*10)
// Main function
// =============
// Usage: Test_atoi [num]
// num.....: Number of blocks of 65535 strings, default 10.
//
int main(int argc, char **argv)
{
cudaDeviceProp deviceProp;
int device;
HANDLE_ERROR(cudaGetDevice(&device));
HANDLE_ERROR(cudaGetDeviceProperties(&deviceProp, device));
printf("Running on device.................: %s\n\n", deviceProp.name);
const char *str = "-12345";
const long DATA_NUM = argc > 1 ? atoi(argv[1]) * 65535 : DEFAULT_DATA_NUM;
const int DATA_SIZE = strlen(str);
const long maxThreads = 512;
const long maxBlocks = 65535;
const float inputBufferSize = ((float)DATA_NUM*DATA_SIZE)/1024.0/1024.0;
const float outputBufferSize = ((float)sizeof(int)*DATA_NUM)/1024.0/1024.0;
const float ioBufferSize = inputBufferSize + outputBufferSize;
printf("Parse %ld integers\n"
"\tInput buffer size.........: %.1f MByte\n"
"\tOutput buffer size........: %.1f MByte\n"
"\tTotal I/O buffer size.....: %.1f MByte\n\n",
DATA_NUM, inputBufferSize, outputBufferSize, ioBufferSize);
// Input buffer creation
// =====================
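// cudaHostAlloc returns page-locked (pinned) host memory, which the GPU
// can access via DMA, so the host<->device copies below run faster than
// they would from pageable malloc'ed memory.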
char *buffer = NULL;
HANDLE_ERROR(cudaHostAlloc((void **)&buffer, DATA_NUM * DATA_SIZE * sizeof(char), cudaHostAllocDefault));
for(long i=0 ; i<DATA_NUM ; i++)
{
char *ptr = buffer + i*DATA_SIZE;
strncpy(ptr, str, DATA_SIZE-1);
ptr[DATA_SIZE-1] = '\0';
}
// GPU computation
// ===============
char *d_data = NULL;
int *d_results = NULL;
int *results = NULL;
cu_timer_t gpu_allocation_timer;
cu_timer_t gpu_timer;
cu_timer_t gpu_computation_timer;
gpu_allocation_timer.start();
HANDLE_ERROR(cudaMalloc((void**)&d_data, DATA_NUM * DATA_SIZE * sizeof(char)));
HANDLE_ERROR(cudaMalloc((void**)&d_results, DATA_NUM * sizeof(int)));
HANDLE_ERROR(cudaHostAlloc((void **)&results, DATA_NUM * sizeof(int), cudaHostAllocDefault));
float gpu_allocation_time = gpu_allocation_timer.stop();
gpu_timer.start();
HANDLE_ERROR(cudaMemcpy(d_data, buffer, DATA_NUM * DATA_SIZE * sizeof(char), cudaMemcpyHostToDevice));
gpu_computation_timer.start();
atoi_kernel<<<maxBlocks, maxThreads>>>(d_data, d_results, DATA_NUM, DATA_SIZE);
HANDLE_ERROR(cudaGetLastError()); // check that the launch itself succeeded
float gpu_computation_time = gpu_computation_timer.stop();
HANDLE_ERROR(cudaMemcpy(results, d_results, DATA_NUM * sizeof(int), cudaMemcpyDeviceToHost));
float gpu_time = gpu_timer.stop();
if(verify_results(buffer, DATA_SIZE, DATA_NUM, results))
printf("** GPU Error(s)\n");
HANDLE_ERROR(cudaFree(d_data));
HANDLE_ERROR(cudaFree(d_results));
HANDLE_ERROR(cudaFreeHost(results));
HANDLE_ERROR(cudaFreeHost(buffer));
// CPU computation
// ==============
cu_timer_t cpu_timer;
buffer = (char *)calloc(DATA_NUM*DATA_SIZE, sizeof(char));
results = (int *)calloc(DATA_NUM, sizeof(int));
for(long i=0 ; i<DATA_NUM ; i++)
{
char *ptr = buffer + i*DATA_SIZE;
strncpy(ptr, str, DATA_SIZE-1);
ptr[DATA_SIZE-1] = '\0';
}
cpu_timer.start();
atoi_host(buffer, results, DATA_NUM, DATA_SIZE);
float cpu_time = cpu_timer.stop();
if(verify_results(buffer, DATA_SIZE, DATA_NUM, results))
printf("** CPU Error(s)\n");
printf("Total CPU execution time..........: %.2f ms\n", cpu_time);
printf("Total GPU execution time..........: %.2f ms\n", gpu_time);
printf("\tGPU memory allocation time: %.2f ms\n", gpu_allocation_time);
printf("\tGPU computation time......: %.2f ms\n", gpu_computation_time);
printf("\tGPU data transfer time....: %.2f ms\n", (gpu_time - gpu_computation_time));
printf("\tGPU data transfer speed...: %.2f MByte/sec\n", ioBufferSize/(gpu_time - gpu_computation_time)*1000.0);
free(buffer);
free(results);
return 0;
}
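If you want to try it, the file should build with a plain nvcc invocation, something like this (the exact flags may depend on your toolkit version):
nvcc -o Test_atoi main.cu
./Test_atoi        (default: 10 blocks of 65535 strings)
./Test_atoi 20     (parse 20 * 65535 strings)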
I executed this code on an i7 860 at 3.5 GHz with an old NVIDIA 9800 GT; the results are the following:
Running on device.................: GeForce 9800 GT
Parse 6553500 integers
Input buffer size.........: 37.5 MByte
Output buffer size........: 25.0 MByte
Total I/O buffer size.....: 62.5 MByte
Total CPU execution time..........: 161.02 ms
Total GPU execution time..........: 370.46 ms
GPU memory allocation time: 30.82 ms
GPU computation time......: 333.18 ms
GPU data transfer time....: 37.28 ms
GPU data transfer speed...: 1676.47 MByte/sec
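(For reference, the transfer speed printed on the last line is just the total I/O buffer size divided by the measured transfer time: 62.5 MByte / 0.03728 s ≈ 1676 MByte/sec.)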
Can you execute the same code on a different GPU and report the results?
Best regards and happy new year!
–
jboydon