GPU vs CPU performance comparison on string conversion: i7 860 @ 3.5 GHz beats NVIDIA 9800 GT

Hi all,

In order to use the GPU for a project, I wrote a simple test to compare it with the CPU. The task is to convert a string into an integer.

Here is the code (please let me know if you spot any errors).

//

// main.cu

//

#include <stdio.h>

#include <stdlib.h>	// exit(), atoi(), calloc()

#include <string.h>	// strlen(), strncpy()

// Error handling function and defines

// ===================================

//

static void HandleError(const char *cmd, cudaError_t err, const char *file, int line) 

{

    if(err != cudaSuccess) 

	{

        printf("%s in %s at line %d\n    %s\n", cudaGetErrorString(err), file, line, cmd);

        exit(EXIT_FAILURE);

    }

}

#define HANDLE_ERROR(err)		(HandleError(#err, err, __FILE__, __LINE__))

// Timing struct and functions

// ===========================

// Used to measure GPU elapsed time

//

typedef struct cu_timer

{

	cudaEvent_t _start;

	cudaEvent_t _stop;

	

	cu_timer();	

	void start();	

	float stop();

} cu_timer_t;

cu_timer::cu_timer() 

{

	cudaEventCreate(&_start);

	cudaEventCreate(&_stop);

}

void cu_timer::start()

{

	cudaEventRecord(_start, 0);

}

float cu_timer::stop()

{

	cudaEventRecord(_stop, 0);

	cudaEventSynchronize(_stop);

	

	float elapsedTime = 0.0f;

	cudaEventElapsedTime(&elapsedTime, _start, _stop);

	cudaEventDestroy(_start);

	cudaEventDestroy(_stop);

	

	return elapsedTime;

}

// CUDA Kernel and device functions

// ================================

//

__host__ __device__ int cu_atoi(const char *str)

{

	int i = 0;

	int result = 0;

	int sign = 1;

	

	// Skip spaces, tabs and newlines

	for(; str[i]==' ' || str[i]=='\n' || str[i]=='\t' ; i++);

	

	// Check sign

	if(str[i]=='+' || str[i]=='-')

		sign = (str[i++]=='+') ? 1 : -1;

	

	// Calculate number

	for(; str[i]>='0' && str[i]<='9' ; i++)

		result = 10*result + (str[i]-'0');

	

	return sign * result;

}

__global__ void atoi_kernel(char *data, int *results, long data_num, int data_size)

{

	for(long tid=threadIdx.x + blockIdx.x * blockDim.x ; tid<data_num ; tid += blockDim.x * gridDim.x)

	{

		const char *ptr = data + tid*data_size;

		

		results[tid] = cu_atoi(ptr);

	}

}

// Results verification function

// =============================

// Returns 0 if no errors.

//

int verify_results(char*buffer, int data_size, long data_num, int *results)

{

	int result = EXIT_SUCCESS;

	

	for(long i=0 ; i<data_num ; i++)

	{

		char *ptr = buffer + i*data_size;

		if(results[i] != (int)atoi(ptr))

		{

			result = EXIT_FAILURE;

			break;

		}

	}

	

	return result;

}

// Host function equivalent to kernel

// ==================================

//

void atoi_host(char *data, int *results, long data_num, int data_size)

{

	for(long tid=0 ; tid<data_num ; tid++)

	{

		char *ptr = data + tid*data_size;

		

		results[tid] = cu_atoi(ptr);

	}

}

#define DEFAULT_DATA_SIZE		(12)

#define DEFAULT_DATA_NUM		(65535*10)

// Main function

// =============

// Usage: Test_atoi [num]

//		num.....: Number of 65535-string blocks to parse, default 10.

//

int main(int argc, char **argv) 

{

	cudaDeviceProp deviceProp;

	int device;

	HANDLE_ERROR(cudaGetDevice(&device));

	HANDLE_ERROR(cudaGetDeviceProperties(&deviceProp, device));

	

	printf("Running on device.................: %s\n\n", deviceProp.name);

	const char *str = "-12345";

	const long DATA_NUM = argc > 1 ? atoi(argv[1]) * 65535 : DEFAULT_DATA_NUM;

	const int DATA_SIZE = strlen(str);

	const long maxThreads = 512; 

	const long maxBlocks = 65535;

	

	const float inputBufferSize = ((float)DATA_NUM*DATA_SIZE)/1024.0/1024.0;

	const float outputBufferSize = ((float)sizeof(int)*DATA_NUM)/1024.0/1024.0;

	const float ioBufferSize = inputBufferSize + outputBufferSize;

	

	printf("Parse %ld integers\n"

		   "\tInput buffer size.........: %.1f MByte\n"

		   "\tOutput buffer size........: %.1f MByte\n"

		   "\tTotal I/O buffer size.....: %.1f MByte\n\n",

		   DATA_NUM, inputBufferSize, outputBufferSize, ioBufferSize);

	

	// Input buffer creation

	// =====================

	char *buffer = NULL;

	HANDLE_ERROR(cudaHostAlloc((void **)&buffer, DATA_NUM * DATA_SIZE * sizeof(char), cudaHostAllocDefault));

	for(int i=0 ; i<DATA_NUM ; i++)

	{

		char *ptr = buffer + i*DATA_SIZE;

		strncpy(ptr, str, DATA_SIZE-1);

		ptr[DATA_SIZE-1] = '\0';

	}

	// GPU computation

	// ===============

	char *d_data = NULL;

	int *d_results = NULL;

	int *results = NULL;

	cu_timer_t gpu_allocation_timer;

	cu_timer_t gpu_timer;

	cu_timer_t gpu_computation_timer;

	gpu_allocation_timer.start();

	HANDLE_ERROR(cudaMalloc((void**)&d_data, DATA_NUM * DATA_SIZE * sizeof(char)));

	HANDLE_ERROR(cudaMalloc((void**)&d_results, DATA_NUM * sizeof(int)));

	HANDLE_ERROR(cudaHostAlloc((void **)&results, DATA_NUM * sizeof(int), cudaHostAllocDefault));

	float gpu_allocation_time = gpu_allocation_timer.stop();

	

	gpu_timer.start();	

	HANDLE_ERROR(cudaMemcpy(d_data, buffer, DATA_NUM * DATA_SIZE * sizeof(char), cudaMemcpyHostToDevice));

	gpu_computation_timer.start();

	atoi_kernel<<<maxBlocks, maxThreads>>>(d_data, d_results, DATA_NUM, DATA_SIZE);

	float gpu_computation_time = gpu_computation_timer.stop();

	HANDLE_ERROR(cudaMemcpy(results, d_results, DATA_NUM * sizeof(int), cudaMemcpyDeviceToHost));

	float gpu_time = gpu_timer.stop();

	if(verify_results(buffer, DATA_SIZE, DATA_NUM, results))

		printf("** GPU Error(s)\n");

		

	HANDLE_ERROR(cudaFree(d_data));

	HANDLE_ERROR(cudaFree(d_results));

	HANDLE_ERROR(cudaFreeHost(results));

	HANDLE_ERROR(cudaFreeHost(buffer));

	// CPU computation

	// ==============

	cu_timer_t cpu_timer;

	buffer = (char *)calloc(DATA_NUM*DATA_SIZE, sizeof(char));

	results = (int *)calloc(DATA_NUM, sizeof(int));

	for(int i=0 ; i<DATA_NUM ; i++)

	{

		char *ptr = buffer + i*DATA_SIZE;

		strncpy(ptr, str, DATA_SIZE-1);

		ptr[DATA_SIZE-1] = '\0';

	}

	cpu_timer.start();

	atoi_host(buffer, results, DATA_NUM, DATA_SIZE);

	float cpu_time = cpu_timer.stop();

	

	if(verify_results(buffer, DATA_SIZE, DATA_NUM, results))

		printf("** CPU Error(s)\n");

	

	printf("Total CPU execution time..........: %.2f ms\n", cpu_time);

	printf("Total GPU execution time..........: %.2f ms\n", gpu_time);

	printf("\tGPU memory allocation time: %.2f ms\n", gpu_allocation_time);

	printf("\tGPU computation time......: %.2f ms\n", gpu_computation_time);

	printf("\tGPU data transfer time....: %.2f ms\n", (gpu_time - gpu_computation_time));

	printf("\tGPU data transfer speed...: %.2f MByte/sec\n", ioBufferSize/(gpu_time - gpu_computation_time)*1000.0);

	

    return 0;

}

I ran this code on an i7 860 at 3.5 GHz with an old NVIDIA 9800 GT; the results are the following:

Running on device.................: GeForce 9800 GT

Parse 6553500 integers

	Input buffer size.........: 37.5 MByte

	Output buffer size........: 25.0 MByte

	Total I/O buffer size.....: 62.5 MByte

Total CPU execution time..........: 161.02 ms

Total GPU execution time..........: 370.46 ms

	GPU memory allocation time: 30.82 ms

	GPU computation time......: 333.18 ms

	GPU data transfer time....: 37.28 ms

	GPU data transfer speed...: 1676.47 MByte/sec

Can you run the same code on different GPUs and report the results?

best regards and happy new year !!

jboydon

Looking at the code, it really isn’t very surprising that the GPU is slower. The memory access patterns are about as suboptimal as you could possibly make them for a compute 1.1 device. There is a section in the programming guide which discusses the concept of memory coalescing and how to achieve peak memory throughput on the GPU. You might want to review it.
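For illustration only (a rough sketch, not code from this thread): with the current layout each thread walks its own 6-byte string, so neighbouring threads read addresses that are DATA_SIZE bytes apart and the loads cannot be combined. One common workaround is to transpose the input so that character j of string tid sits at data_t[j * data_num + tid]; neighbouring threads then touch neighbouring bytes, which is the pattern the coalescing rules reward (for full coalescing on compute 1.x each thread would additionally want to read 32-bit words, e.g. a char4). The names below (transpose_strings, atoi_kernel_transposed, data_t) are hypothetical:

void transpose_strings(const char *data, char *data_t, long data_num, int data_size)
{
	// Host-side repack: character j of string s goes to data_t[j*data_num + s].
	for(long s = 0; s < data_num; s++)
		for(int j = 0; j < data_size; j++)
			data_t[(long)j * data_num + s] = data[s * data_size + j];
}

__global__ void atoi_kernel_transposed(const char *data_t, int *results, long data_num, int data_size)
{
	for(long tid = threadIdx.x + blockIdx.x * blockDim.x; tid < data_num; tid += blockDim.x * gridDim.x)
	{
		int i = 0, sign = 1, result = 0;
		char c = data_t[(long)i * data_num + tid];

		if(c == '+' || c == '-')				// optional sign
		{
			sign = (c == '+') ? 1 : -1;
			c = (++i < data_size) ? data_t[(long)i * data_num + tid] : '\0';
		}
		while(i < data_size && c >= '0' && c <= '9')		// digits
		{
			result = 10 * result + (c - '0');
			if(++i < data_size)
				c = data_t[(long)i * data_num + tid];
		}
		results[tid] = sign * result;
	}
}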

Hi AvidDay, thanks for the reply.

As you can imagine, I am new to CUDA programming; I started only a couple of weeks ago and have had just a quick look at the programming guide.

Regarding memory access, the only thing that came to mind is string alignment. The string length can easily be changed to align on 8 bytes, but from the test I made nothing changed in the performance, so that is not the right fix.

I will follow your advice and spend more time with the programming guide and on the web to understand how to improve this simple test.

thanks and regards

jboydon

cu_atoi must be rewritten in a branch-free fashion. If all your CUDA threads take different branches, only one of them will execute at a time, essentially transforming a multi-core SM into a single-core one.

Use a lookup table[256] for every character and perform the same computations on every step. Increment i on every step (by 1 on some threads, by 0 on others), initialize your sign to 1 and subtract 2 if you find ‘-’, otherwise subtract 0, then multiply your result by 10 and add a precomputed digit for every char (5 for ‘5’, 0 for everything else). Once you find \0 (EOS) you can return, which will branch those threads without impacting performance since there’s nothing for them to do anymore.

Try to use as many blocks as you have SM’s so if you run this on a GPU with 8 SM’s you’ll always have a minimum of 8 threads running; beyond that you’ll get parallelism for as long as your strings have similar lengths.

Another thing you could do is perform an initial copy of all strings from global memory to shared memory, as global memory likes wide 128-bit read transactions but shared memory responds better to small byte-wise reads. Since you have 16K of shared memory, you may wish to process 16K/#blocks chunks at a time, with your threads initially arranged to perform aligned copies and then unaligned string-wise reads.
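To make that concrete, here is a rough sketch of a branch-reduced parser (hypothetical code, not from this thread). It folds the comparisons into arithmetic instead of using an actual 256-entry table, and it assumes the strings contain at most an optional leading sign followed by digits and a '\0' terminator, with no leading whitespace; max_len is the fixed slot size (DATA_SIZE in the posted code):

__host__ __device__ int cu_atoi_nobranch(const char *str, int max_len)
{
	unsigned char c0 = (unsigned char)str[0];
	int sign = 1 - 2 * (c0 == '-');			// start at 1, subtract 2 on '-'
	int i    = (c0 == '-') | (c0 == '+');		// skip the sign character if any

	int result = 0;
	int done   = 0;					// becomes 1 once a non-digit is seen
	for(int k = i; k < max_len; k++)		// fixed trip count: no divergent exit
	{
		unsigned char c = (unsigned char)str[k];
		int is_digit    = (c >= '0') & (c <= '9') & !done;
		done           |= (c < '0') | (c > '9');
		// multiply by 10 and add the digit while is_digit == 1, otherwise
		// multiply by 1 and add 0 -- the same instructions on every step
		result = result * (1 + 9 * is_digit) + is_digit * (c - '0');
	}
	return sign * result;
}

With the test data posted above ("-12345" truncated to "-1234" and '\0'-terminated) this should return the same values as cu_atoi while keeping every thread on the same instruction stream; whether it actually runs faster on a 9800 GT would have to be measured.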

Try to use as many blocks as you have SM’s so if you run this on a GPU
with 8 SM’s you’ll always have a minimum of 8 threads running; beyond
that you’ll get parallelism for as long as your strings have similar lengths.

Well, straightforward multi-threading of such string conversion procedures on an i7 (4 cores) may easily speed it up by ~4x. The bottom line is that CUDA really is a stellar performer for data parallelism and is totally inefficient for task-parallelism purposes. Another benefit of CUDA is that it paves the “API environment” for a future MIMD-based GPU (probably 10+ years away, if ever ;o( Unfortunately, the “marketing” information around CUDA is quite misleading; the data-parallelism aspects/advantages of GPU/CUDA and its inefficiency for task parallelism should be clearly exposed.
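As a point of reference, a minimal sketch of such a multi-threaded CPU run (using OpenMP purely as an illustration; atoi_host_mt is a hypothetical name, cu_atoi is the parser from the posted code):

#include <omp.h>

// Each string is parsed independently, so the loop from atoi_host
// parallelises trivially across the i7's cores.
// Build with -fopenmp (gcc) or /openmp (MSVC).
void atoi_host_mt(char *data, int *results, long data_num, int data_size)
{
	#pragma omp parallel for
	for(long tid = 0; tid < data_num; tid++)
	{
		const char *ptr = data + tid * data_size;
		results[tid] = cu_atoi(ptr);
	}
}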

Stefan

Strange results, the CPU time shows as 0.00 ms, why? (I copied/pasted the code into the vectorAdd sample of the SDK):

C:\CUDA\NVIDIA GPU Computing SDK\C\bin\win32\Release>vectorAdd 100

Running on device.................: GeForce GTX 470

Parse 6553500 integers

        Input buffer size.........: 37.5 MByte

        Output buffer size........: 25.0 MByte

        Total I/O buffer size.....: 62.5 MByte

Total CPU execution time..........: 0.00 ms

Total GPU execution time..........: 12.78 ms

        GPU memory allocation time: 7.72 ms

        GPU computation time......: 2.00 ms

        GPU data transfer time....: 10.78 ms

        GPU data transfer speed...: 5797.79 MByte/sec

Hi Nighthawk13, maybe your CPU is just very fast. Try changing the line:

printf("Total CPU execution time..........: %.2f ms\n", cpu_time);

with:

printf("Total CPU execution time..........: %f ms\n", cpu_time);

And/or use this cu_timer version, based on clock() instead of CUDA events:

// Timing struct and functions

// ===========================

#include <time.h>	// clock(), CLOCKS_PER_SEC

typedef struct cu_timer

{

	float time;

	

	cu_timer();	

	void start();	

	float stop();

} cu_timer_t;

cu_timer::cu_timer() 

{

}

void cu_timer::start()

{

	time = clock();

}

float cu_timer::stop()

{

	return ((float)clock() - time)/CLOCKS_PER_SEC*1000;

}

regards

boydon

Hi Oxydius, thank you so much for your interesting explanation; I definitely need to study CUDA more.

One point is not clear to me about your statement: in this particular (and admittedly useless) example all the input strings are identical, so the parallelism should be assured. Am I right?

regards

boydon

That’s correct. If you had as many identical strings as you have CUDA cores, you would get optimal parallelism. The only area of improvement would be the transition from global to shared memory (see ‘coalescing’ in the Optimization Guide).

This may be going too far for your needs, but the optimal data layout in shared memory would have each consecutive string starting in a different memory bank, so that each of the 16 threads uses its own bank in parallel. There are 16 banks of 4 bytes each, so you would want string1’s address / 4 == 0, string2’s address / 4 == 1, and so on. After reading 4 bytes, thread1 would move from the first to the second bank, thread2 from the second to the third…

You could create this layout in a pinned memory buffer as you copy the data from the CPU to the GPU, leaving padding between each string (data[0]-data[3] will end up on the same shared memory bank as data[64]-data[67], so you would want the next string to start at data[68]). Then your kernel would move the whole thing to shared memory and start processing (2 steps within every CUDA block). If your strings are much smaller than 64 bytes, you may want the CPU to interleave them instead, so long as each thread reads from a different memory bank on every character input.
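For illustration, a small sketch of the padded layout described above (hypothetical helper, not code from this thread). With 16 banks of 4 bytes, the bank of a byte is (offset / 4) % 16, so giving each string a stride that is 4 bytes more than a multiple of 64 makes consecutive strings start on consecutive banks (a 68-byte stride for 64-byte strings, as in the example above):

#include <string.h>	// memcpy()

// Copy the strings into the (pinned) staging buffer 'dst' with a padded
// stride so that string s starts on shared-memory bank s % 16.
void layout_padded(const char *src, char *dst, long data_num, int data_size)
{
	const int stride = ((data_size + 63) / 64) * 64 + 4;	// 68 for 64-byte strings

	for(long s = 0; s < data_num; s++)
		memcpy(dst + (long)s * stride, src + (long)s * data_size, data_size);
}

The kernel would then copy its block's chunk of dst into shared memory with aligned loads and let each thread parse its own string from there.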

When your input strings aren’t identical, branching will reduce efficiency, as all 16 threads per SM execute the same instruction (SIMT). If only one thread takes a branch, the other 15 execute the instructions but perform no action. If you eliminate the branching using the lookup-table method, make sure you store the table in constant memory or even the 1D texture cache to avoid global memory latency.
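A rough sketch of what the constant-memory lookup table could look like (hypothetical names; filled once on the host, then read by the kernel instead of doing comparisons):

// 256-entry table in constant memory: the digit value for '0'..'9', -1 otherwise.
__constant__ signed char c_digit[256];

void setup_digit_table()
{
	signed char table[256];
	for(int c = 0; c < 256; c++)
		table[c] = (c >= '0' && c <= '9') ? (signed char)(c - '0') : -1;
	HANDLE_ERROR(cudaMemcpyToSymbol(c_digit, table, sizeof(table)));
}

// In the kernel a character lookup then becomes:
//	int d        = c_digit[(unsigned char)str[k]];
//	int is_digit = (d >= 0);
// When all threads of a half-warp read the same entry (identical strings),
// the constant cache broadcasts the value to all of them.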

I also wrote some code a while back. In my scenario I was reading a large file of variable-length floating point data and parsed 12 million floats a second, at least a 5-fold speed-up over the CPU (that includes the delay of reading the file from a rather slow disk; I should try again with the data already in an array).

http://forums.nvidia.com/index.php?showtopic=105782&pid=584431&start=&st=#entry584431

I would love it if someone wanted to speed up the code I wrote; we have very large files that were taking over 5 minutes each on the CPU, so this is something the GPU can really help with.

kbam

Using clock() fixed the timer for the CPU part. The results show a speedup factor of ~5:

Running on device.................: GeForce GTX 470

Parse 6553500 integers

        Input buffer size.........: 37.5 MByte

        Output buffer size........: 25.0 MByte

        Total I/O buffer size.....: 62.5 MByte

Total CPU execution time..........: 71.00000000 ms

Total GPU execution time..........: 13.00000000 ms

        GPU memory allocation time: 9.00000000 ms

        GPU computation time......: 2.00000000 ms

        GPU data transfer time....: 11.00000000 ms

        GPU data transfer speed...: 5681.73148415 MByte/sec

The Fermi cache seems to help here in comparison to the 9800 GT ;-)

The CPU is a Core i7-920.

Here is the full code that works for me:

//
// main.cu
//
#include <stdio.h>
#include <stdlib.h>	// exit(), atoi(), calloc()
#include <string.h>	// strlen(), strncpy()
#include <time.h>	// clock(), CLOCKS_PER_SEC

// Error handling function and defines
// ===================================
//
static void HandleError(const char *cmd, cudaError_t err, const char *file, int line)
{
	if(err != cudaSuccess)
	{
		printf("%s in %s at line %d\n    %s\n", cudaGetErrorString(err), file, line, cmd);
		exit(EXIT_FAILURE);
	}
}

#define HANDLE_ERROR(err)		(HandleError(#err, err, __FILE__, __LINE__))

// Timing struct and functions (clock() based)
// ===========================================
//
typedef struct cu_timer
{
	float time;

	cu_timer();
	void start();
	float stop();
} cu_timer_t;

cu_timer::cu_timer()
{
}

void cu_timer::start()
{
	time = clock();
}

float cu_timer::stop()
{
	return ((float)clock() - time)/CLOCKS_PER_SEC*1000.0f;
}

// CUDA Kernel and device functions
// ================================
//
__host__ __device__ int cu_atoi(const char *str)
{
	int i = 0;
	int result = 0;
	int sign = 1;

	// Skip spaces, tabs and newlines
	for(; str[i]==' ' || str[i]=='\n' || str[i]=='\t' ; i++);

	// Check sign
	if(str[i]=='+' || str[i]=='-')
		sign = (str[i++]=='+') ? 1 : -1;

	// Calculate number
	for(; str[i]>='0' && str[i]<='9' ; i++)
		result = 10*result + (str[i]-'0');

	return sign * result;
}

__global__ void atoi_kernel(char *data, int *results, long data_num, int data_size)
{
	for(long tid=threadIdx.x + blockIdx.x * blockDim.x ; tid<data_num ; tid += blockDim.x * gridDim.x)
	{
		const char *ptr = data + tid*data_size;

		results[tid] = cu_atoi(ptr);
	}
}

// Results verification function
// =============================
// Returns 0 if no errors.
//
int verify_results(char *buffer, int data_size, long data_num, int *results)
{
	int result = EXIT_SUCCESS;

	for(long i=0 ; i<data_num ; i++)
	{
		char *ptr = buffer + i*data_size;

		if(results[i] != (int)atoi(ptr))
		{
			result = EXIT_FAILURE;
			break;
		}
	}

	return result;
}

// Host function equivalent to kernel
// ==================================
//
void atoi_host(char *data, int *results, long data_num, int data_size)
{
	for(long tid=0 ; tid<data_num ; tid++)
	{
		char *ptr = data + tid*data_size;

		results[tid] = cu_atoi(ptr);
	}
}

#define DEFAULT_DATA_SIZE		(12)
#define DEFAULT_DATA_NUM		(65535*100)

// Main function
// =============
// Usage: Test_atoi [num]
//		num.....: Number of 65535-string blocks to parse, default 100.
//

int main(int argc, char **argv)
{
	cudaDeviceProp deviceProp;
	int device;
	HANDLE_ERROR(cudaGetDevice(&device));
	HANDLE_ERROR(cudaGetDeviceProperties(&deviceProp, device));

	printf("Running on device.................: %s\n\n", deviceProp.name);

	const char *str = "-12345";
	const long DATA_NUM = argc > 1 ? atoi(argv[1]) * 65535 : DEFAULT_DATA_NUM;
	const int DATA_SIZE = strlen(str);

	const long maxThreads = 512;
	const long maxBlocks = 65535;

	const float inputBufferSize = ((float)DATA_NUM*DATA_SIZE)/1024.0/1024.0;
	const float outputBufferSize = ((float)sizeof(int)*DATA_NUM)/1024.0/1024.0;
	const float ioBufferSize = inputBufferSize + outputBufferSize;

	printf("Parse %ld integers\n"
		   "\tInput buffer size.........: %.1f MByte\n"
		   "\tOutput buffer size........: %.1f MByte\n"
		   "\tTotal I/O buffer size.....: %.1f MByte\n\n",
		   DATA_NUM, inputBufferSize, outputBufferSize, ioBufferSize);

	// Input buffer creation
	// =====================
	char *buffer = NULL;
	HANDLE_ERROR(cudaHostAlloc((void **)&buffer, DATA_NUM * DATA_SIZE * sizeof(char), cudaHostAllocDefault));

	for(int i=0 ; i<DATA_NUM ; i++)
	{
		char *ptr = buffer + i*DATA_SIZE;

		strncpy(ptr, str, DATA_SIZE-1);
		ptr[DATA_SIZE-1] = '\0';
	}

	// GPU computation
	// ===============
	char *d_data = NULL;
	int *d_results = NULL;
	int *results = NULL;

	cu_timer_t gpu_allocation_timer;
	cu_timer_t gpu_timer;
	cu_timer_t gpu_computation_timer;

	gpu_allocation_timer.start();
	HANDLE_ERROR(cudaMalloc((void**)&d_data, DATA_NUM * DATA_SIZE * sizeof(char)));
	HANDLE_ERROR(cudaMalloc((void**)&d_results, DATA_NUM * sizeof(int)));
	HANDLE_ERROR(cudaHostAlloc((void **)&results, DATA_NUM * sizeof(int), cudaHostAllocDefault));
	float gpu_allocation_time = gpu_allocation_timer.stop();

	gpu_timer.start();
	HANDLE_ERROR(cudaMemcpy(d_data, buffer, DATA_NUM * DATA_SIZE * sizeof(char), cudaMemcpyHostToDevice));
	gpu_computation_timer.start();
	atoi_kernel<<<maxBlocks, maxThreads>>>(d_data, d_results, DATA_NUM, DATA_SIZE);
	cudaThreadSynchronize();
	float gpu_computation_time = gpu_computation_timer.stop();
	HANDLE_ERROR(cudaMemcpy(results, d_results, DATA_NUM * sizeof(int), cudaMemcpyDeviceToHost));
	float gpu_time = gpu_timer.stop();

	if(verify_results(buffer, DATA_SIZE, DATA_NUM, results))
		printf("** GPU Error(s)\n");

	HANDLE_ERROR(cudaFree(d_data));
	HANDLE_ERROR(cudaFree(d_results));
	HANDLE_ERROR(cudaFreeHost(results));
	HANDLE_ERROR(cudaFreeHost(buffer));

	// CPU computation
	// ===============
	cu_timer_t cpu_timer;

	buffer = (char *)calloc(DATA_NUM*DATA_SIZE, sizeof(char));
	results = (int *)calloc(DATA_NUM, sizeof(int));

	for(int i=0 ; i<DATA_NUM ; i++)
	{
		char *ptr = buffer + i*DATA_SIZE;

		strncpy(ptr, str, DATA_SIZE-1);
		ptr[DATA_SIZE-1] = '\0';
	}

	cpu_timer.start();
	atoi_host(buffer, results, DATA_NUM, DATA_SIZE);
	float cpu_time = cpu_timer.stop();

	if(verify_results(buffer, DATA_SIZE, DATA_NUM, results))
		printf("** CPU Error(s)\n");

	printf("Total CPU execution time..........: %.8f ms\n", cpu_time);
	printf("Total GPU execution time..........: %.8f ms\n", gpu_time);
	printf("\tGPU memory allocation time: %.8f ms\n", gpu_allocation_time);
	printf("\tGPU computation time......: %.8f ms\n", gpu_computation_time);
	printf("\tGPU data transfer time....: %.8f ms\n", (gpu_time - gpu_computation_time));
	printf("\tGPU data transfer speed...: %.8f MByte/sec\n", ioBufferSize/(gpu_time - gpu_computation_time)*1000.0);

	return 0;
}

Hi Nighthawk, I definitely need to upgrade my video card!

atoi_kernel and cu_atoi still need to be improved, because with a multi-threaded program the CPU performance can be increased by a factor of ~4 on an i7. Unfortunately my CUDA skills are not yet good enough (I plan to study more) to improve the parsing benchmark. Of course, all comments/suggestions are more than welcome.

regards

jboydon