execution time on CPU and GPU

Dear all
I had developed a serial merge sort and tested it on an Intel Core 2 Duo, then I parallelized it on the GPU using the CUDA C runtime. What was unexpected for me is that the serial program is faster than the parallel program. Why does this happen, and how can I make the parallel version faster than the serial one?

this is serial code

// Merge6Serial.cpp : Defines the entry point for the console application.
//
#include <windows.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
//#include <cmath>

#define PP  1
#define  P (PP*2048)
#define N  (P*8*1024)   // total element count; parenthesized so N is safe inside larger expressions

// Forward declarations.
// NOTE: threadsMerge1 takes FOUR parameters (the definition below and the call
// in main both use a level-count argument `u`); the original 3-parameter
// prototype did not match and would not compile.
void threadsMerge1(int *a, int *temp, int rec_num, int u);
void threadsMerge0(int *a);
// qsort comparator used by main to pre-sort the 8K-element runs.
int compare(const void* left, const void* right);

// Ascending-order qsort comparator for ints.  Comparison-based (no
// subtraction) so it cannot overflow; required because main's qsort call
// references `compare`, which the original serial listing never defined.
int compare(const void* left, const void* right)
{
	int lt = *(const int*)left;
	int rt = *(const int*)right;
	if (lt < rt) return -1;
	if (lt > rt) return +1;
	return 0;
}

// Serial baseline: pre-sorts 8K-element runs with qsort, then performs
// log2(P) levels of pairwise merging, timing only the merge phase.
int main(){

	int *a, *temp;
	FILE *fp = fopen("E:\\data.txt", "w");// file for data (opened but never written to)

	a = (int *)malloc(N*sizeof(int));
	temp = (int *)malloc(N*sizeof(int));
	// guard against allocation failure (N is large: P*8*1024 ints per buffer)
	if (a == NULL || temp == NULL){
		fprintf(stderr, "allocation of two %d-int buffers failed\n", N);
		return 1;
	}

	srand(time(NULL));
	// fill with small random keys in [0, 100)
	for (int i = 0; i < N; i++)
	{
		a[i] = rand() % 100;
	}

	printf("\n ******************** \n");

	int x = log2(P); // number of merge levels: the N/(8*1024) presorted runs pair up log2(P) times

	// pre-sort each run of 8K elements; merging then starts from 16K pairs
	for (int i = 0; i < N; i = i + (8*1024)){
		qsort(a + i, 8*1024, sizeof(int), compare);
	}

	//start execution
	//*********************************************************************************
	// time only the merge phase; the qsort pre-pass above is deliberately excluded
	LARGE_INTEGER freq;
	LARGE_INTEGER t0, tF, tDiff;
	double elapsedTime;
	QueryPerformanceFrequency(&freq);
	QueryPerformanceCounter(&t0);

	// merge pairs of 8K runs (rec_num = 16K) and recurse for x levels total
	threadsMerge1(a, temp, 16*1024, x);

	//*********************************************************************************

	printf("\n ++++++++++++++++++++++++++++++++++++++\n\n\n\n ");
	printf("\n ++++++++++++++++++++++++++++++++++++++\n\n\n\n ");

	QueryPerformanceCounter(&tF);
	tDiff.QuadPart = tF.QuadPart - t0.QuadPart;
	elapsedTime = tDiff.QuadPart / (double)freq.QuadPart;

	printf("\n Code under test took %lf sec\n", elapsedTime);

	printf("\n ******************** \n");

	free(a);
	free(temp);
	if (fp) fclose(fp); // fopen may have failed; fclose(NULL) is undefined behavior
	return 0;
}

// One level of bottom-up merge sort: combines each adjacent pair of sorted
// runs of rec_num/2 elements from `a` into a sorted run of rec_num elements
// in `temp`, then recurses with the buffers swapped and rec_num doubled
// until u levels have been performed.
//   a       - source buffer; every run of rec_num/2 elements is already sorted
//   temp    - destination buffer for this level
//   rec_num - size of the merged run this level produces
//   u       - number of merge levels still to perform
// After the final level the sorted data sits in that level's destination, so
// the caller must track the parity of u to know which buffer holds the result.
//
// Rewrite notes: the original carried a `sortedSize` counter that never
// affected the output, and a first tail-copy branch (index1 < endIndex1 &&
// index2 == endIndex2) that is unreachable — the merge loop only exits once
// one index has passed its end.  Both are replaced by the two standard
// "drain the remaining run" loops below, which copy exactly the leftovers.
void threadsMerge1(int *a, int *temp, int rec_num, int u)
{
	for (int i = 0; i < N / rec_num; i++){
		int index1 = i * rec_num;                   // cursor into the left run
		int endIndex1 = index1 + (rec_num / 2) - 1; // last element of the left run
		int index2 = endIndex1 + 1;                 // cursor into the right run
		int endIndex2 = index2 + (rec_num / 2) - 1; // last element of the right run
		int targetIndex = index1;                   // write cursor into temp

		// standard two-way merge of the pair of runs (stable: <= keeps left first)
		while (index1 <= endIndex1 && index2 <= endIndex2){
			if (a[index1] <= a[index2])
				temp[targetIndex++] = a[index1++];
			else
				temp[targetIndex++] = a[index2++];
		}
		// exactly one of the runs may still have elements; drain it
		while (index1 <= endIndex1)
			temp[targetIndex++] = a[index1++];
		while (index2 <= endIndex2)
			temp[targetIndex++] = a[index2++];
	}

	// next level: swap source/destination and double the run size
	if (u > 1)
		threadsMerge1(temp, a, 2 * rec_num, u - 1);
}

this is the parallel code

#include "cuda.h"
#include "cuda_runtime.h"
#include "device_functions.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cmath>

#define PP  1       // scale factor; must be defined here (the serial listing defines it the same way)
#define  P (PP*2048)
#define N  (P*8*1024) // size of array to be sorted must be power of 2 multipled of 1024. Minimum size of array is 8*1024

// Define this to turn on error checking
#define CUDA_ERROR_CHECK
#define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ )
#define CudaCheckError()    __cudaCheckError( __FILE__, __LINE__ )
 
//****************************************************************************************
// functions for cuda error checking
// Checks the result of a CUDA runtime call; on failure, prints the call site
// and error string, then terminates.  Compiled out unless CUDA_ERROR_CHECK
// is defined.
inline void __cudaSafeCall(cudaError err, const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
	if (err != cudaSuccess)
	{
		fprintf(stderr, "cudaSafeCall() failed at %s:%i : %s\n",
			file, line, cudaGetErrorString(err));
		exit(-1);
	}
#endif
}
// Reports any pending CUDA error for the given call site (useful right after
// a kernel launch, which has no return value), then optionally synchronizes
// to surface asynchronous in-kernel failures.  Terminates on any error.
inline void __cudaCheckError(const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
	cudaError err = cudaGetLastError();
	if (err != cudaSuccess)
	{
		fprintf(stderr, "cudaCheckError() failed at %s:%i : %s\n",
			file, line, cudaGetErrorString(err));
		exit(-1);
	}

	// More careful checking. However, this will affect performance.
	// Comment away if needed.
	err = cudaDeviceSynchronize();
	if (err != cudaSuccess)
	{
		fprintf(stderr, "cudaCheckError() with sync failed at %s:%i : %s\n",
			file, line, cudaGetErrorString(err));
		exit(-1);
	}
#endif
}
//****************************************************************************************

//sort until maximum size of array = 1*1024*1024
//***************************************************************************************
// (kernel) function that merge each 2048 of the subbarys of size 1024 (2048*1024) in the origin array
// Device-recursive merge: each invocation performs one merge level, combining
// adjacent pairs of sorted runs of rec_num/2 elements from `a` into sorted
// runs of rec_num elements in `temp`, then recurses with the buffers swapped
// and rec_num doubled until x levels have run.
//   a, temp : source / destination buffers for this level (roles swap each level)
//   rec_num : size of the merged run this level produces
//   d       : doubled on each recursion; only threads with id < s/(2*d) do work
//   x       : number of merge levels remaining
//   s, b    : sizing parameters passed through unchanged from the launcher (endphase1)
// NOTE(review): each thread merges its pair of runs sequentially with large-
// stride global-memory accesses, and the active thread count halves at every
// level — a likely cause of this kernel being slower than the CPU version.
// Deep device recursion also consumes per-thread stack; confirm the stack
// limit covers the maximum x used.
__device__ void FinalMergePart1(int *a, int *temp, int rec_num, int d, int x,int s,int b)
{

	int threadId = threadIdx.x;

	// `0 <= threadId` is always true (threadIdx.x is unsigned); the effective
	// guard is just threadId < s/(2*d).
	if (0 <= threadId && threadId < (s / (2 * d))){

		int BlockStart = blockIdx.x*(b *8* 1024); // first element owned by this block
		

		int index1 = BlockStart + rec_num * (threadId); // cursor into the left run
		int endIndex1 = index1 + (rec_num / 2) - 1;     // last element of the left run
		int index2 = endIndex1 + 1;                     // cursor into the right run
		int endIndex2 = index2 + (rec_num / 2) - 1;     // last element of the right run
		int targetIndex = index1;                       // write cursor into temp
		int sortedSize = 0;                             // elements written so far
		//printf("th= %d ", threadId);
		// main two-way merge of the pair of runs
		while (index1 <= endIndex1 && index2 <= endIndex2){

			if (a[index1] <= a[index2]){
				temp[targetIndex] = a[index1];
				++sortedSize;
				++index1;
				++targetIndex;
			}
			else{
				temp[targetIndex] = a[index2];
				++index2;
				++sortedSize;
				++targetIndex;
			}
		}

		// NOTE(review): the merge loop above exits only once index1 > endIndex1
		// or index2 > endIndex2, so `index2 == endIndex2` cannot hold together
		// with `index1 < endIndex1` here — this branch appears unreachable.
		if (index1 < endIndex1 &&index2 == endIndex2){

			if (sortedSize < rec_num){
				int bb = 0;
				while (bb < (rec_num / 2)){

					temp[targetIndex] = a[index1];
					++sortedSize;
					++targetIndex;
					++index1;
					++bb;
				}
			}

		}

		// right run exhausted: copy the remainder of the left run
		if (index2 > endIndex2 &&index1 <= endIndex1){
			if (sortedSize < rec_num){
				int bb = 0;
				while (bb < (rec_num / 2)){

					temp[targetIndex] = a[index1];
					++sortedSize;
					++targetIndex;
					++index1;
					++bb;
					if (index1 > endIndex1)
						break;
				}

			}
		}

		// left run exhausted: copy the remainder of the right run
		if (index1 > endIndex1 &&index2 <= endIndex2){
			if (sortedSize < rec_num){
				int bb = 0;
				while (bb < rec_num / 2){

					temp[targetIndex] = a[index2];
					++sortedSize;
					++targetIndex;
					++index2;
					++bb;
					if (index2 > endIndex2)
						break;
				}
			}
		}
	}
	// Barrier is outside the work `if`, so every thread of the block reaches
	// it — required for __syncthreads() correctness.  It synchronizes only
	// this block; nothing orders work across blocks here.
	__syncthreads();
	if (x > 1){
	
		// next level: swap buffers, double the run size, halve the active threads
		FinalMergePart1(temp, a, 2 * rec_num, 2 * d, --x,s,b);
		
	}
	

}

//  (kernel) functions that sort each subarray of the origin array, each sorted subarray is of size 1024

// Kernel entry point: starts the recursive device-side merge from the
// already-sorted runs produced on the host, performing x merge levels.
__global__ void endphase1(int *a, int *temp, int x, int s,int b){

	const int startRunSize = 2048 * 8; // each presorted run holds 16K elements
	FinalMergePart1(a, temp, startRunSize, 1, x, s, b);

}

//*********************************************************************************************
//*********************************************************************************************
// Ascending-order qsort comparator for ints.
// Returns -1, 0, or +1.  Uses comparisons rather than the original
// `*lt - *rt` subtraction, which overflows (undefined behavior) when the
// operands have opposite signs near INT_MIN/INT_MAX.
int compare(const void* left, const void* right)
{
	int lt = *(const int*)left;
	int rt = *(const int*)right;

	if (lt < rt) return -1;
	if (lt > rt) return +1;
	return 0;
}

//****************************************************************************************************************************************
// Host driver: generates random data, presorts 8K-element runs with qsort,
// launches the GPU merge kernel, times it with CUDA events, and verifies
// that the result really is sorted.
int main()
{
	int *a;                 // host copy of the array to be sorted
	int *dev_a, *dev_temp1; // device buffers; each merge level ping-pongs between them

	//*********************************************
	// Use the file's own CudaSafeCall macro (defined above but previously
	// unused) so failed allocations/copies are reported instead of silently
	// corrupting later results.
	CudaSafeCall(cudaMalloc((void **)&dev_a, N*sizeof(int)));     // allocate memory on device
	CudaSafeCall(cudaMalloc((void **)&dev_temp1, N*sizeof(int))); // allocate memory on device
	a = (int *)malloc(N*sizeof(int)); // allocate memory on host
	if (a == NULL){
		fprintf(stderr, "host allocation of %d ints failed\n", N);
		return 1;
	}

	//*******1*************************************

	//--------------------------
	// generate random data in [0, 100)
	for (int i = 0; i < N; i++)
	{
		a[i] = rand() % 100;
	}
	//-------------------------

	// presort each 8K-element run on the host; the kernel only merges runs
	for (int i = 0; i < N; i = i + (8*1024)){
		qsort(a + i, 8*1024, sizeof(int), compare);
	}

	printf("\n 1 way \n");
	printf("P= %d\n", P);

	//*********************************************************
	// transfer data from host memory to device memory, where dev_a is the destination
	CudaSafeCall(cudaMemcpy(dev_a, a, N*sizeof(int), cudaMemcpyHostToDevice));
	//*********************************************************
	int x = log2(P); // number of merge levels the kernel must perform

	//*************************************************************
	// calculating elapsed time, phase1
	// (time initialized so the printf below is defined even if no kernel launches)
	cudaEvent_t start, stop; float time = 0.0f;
	cudaEventCreate(&start); cudaEventCreate(&stop);
	cudaEventRecord(start, 0);

	//**************************************************************

	//start execution
	//*********************************************************************************

	//sort up to N=1024*1024
	if (1 < P && P <= 2048){

		if (P == 2048){
			endphase1 << <1, 1024 >> >(dev_a, dev_temp1, x, P, 2048);
		}
		else {
			endphase1 << <1, P >> >(dev_a, dev_temp1, x, P, 2048);
		}
		// kernel launches return no status; this catches bad launch configs
		// and (because CudaCheckError also syncs) in-kernel faults
		CudaCheckError();
	}

	//***********************************************************************
	// calculating elapsed time, phase2
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	cudaEventElapsedTime(&time, start, stop);
	cudaEventDestroy(start);
	cudaEventDestroy(stop);

	printf("kernel time in ms:\t%.7f\n", time);
	//**********************************************************************
	cudaDeviceSynchronize();
	// Each merge level swaps source and destination, so after x levels the
	// sorted data is in dev_a when x is even and in dev_temp1 when x is odd.
	if (x % 2 == 0){
		CudaSafeCall(cudaMemcpy(a, dev_a, N*sizeof(int), cudaMemcpyDeviceToHost)); // return sorted array to host
	}
	else{
		CudaSafeCall(cudaMemcpy(a, dev_temp1, N*sizeof(int), cudaMemcpyDeviceToHost)); // return sorted array to host
	}

	//***********************************************************************************
	// actually verify sortedness (the original only counted zeros and
	// discarded the count, which observed nothing)
	int sorted = 1;
	for (int i = 0; i + 1 < N; i++)
	{
		if (a[i] > a[i + 1]){
			sorted = 0;
			break;
		}
	}
	printf(sorted ? "result is sorted\n" : "result is NOT sorted\n");

	cudaFree(dev_a); // free memory on device
	cudaFree(dev_temp1);
	free(a); // free memory on host
	return 0;
}

Writing a custom merge sort application in CUDA is not a beginner task.

Instead, compare to thrust::sort() using device pointers. That sort should be about 50-70x faster for large N than std::sort().

Also, your CUDA implementation looks wonky, so take clues from the CUDA SDK mergeSort sample. The code is in there for examination. For N = 4,194,304, this was my output:

Initializing GPU merge sort...
Running GPU merge sort...
Time: 9.452534 ms
Reading back GPU merge sort results...
Inspecting the results...
...inspecting keys array: OK
...inspecting keys and values array: OK
...stability property: stable!
Shutting down...

Also keep in mind the fastest sort implementations on GPUs for primitives are usually Radix sorts.

thank you for your contribution

I think that thrust sort sorts the array as a black box, which means you don’t have to worry about how each thread works.
I’m very interested in which element of the array each thread works on.
what can i do?

After a quick search I came across this paper which in a general sense goes over different approaches for sorting in parallel:

https://docs.google.com/viewer?url=http://nets.cis.fiu.edu/images/gpuSort.pdf

While those authors do not contribute any detailed new techniques, it is a good general overview of others previous work on the topic. Most books on CUDA have sections on sorting, so take a look at those as well.

Also take a look at the CUDA SDK sample code for simple quick sort, but that implementation ( I believe) requires compute capability of 3.5 or higher.

I get the quick sort
what is its benefit for merge sort?

They are both ‘Divide-and-Conquer’ algorithms.

If you are truly interested in this topic, you are going to need to do your own research.

Good luck!

I will do, thank you