GPU sorting code, need help "shell sort" code, please help to modify it.

Dear my helper:

I want to calculate percentile of ~1000 columns together, each columns with ~25000 data points.

First I use sorting method to get my result.

But I now have 2 problems,

(1) Some errors happened when I use DATA size of 2560 x 1024, but okay for small size. Pls help to modify it. (for line 16 //for actually large data test)

(2) The sorting method “shell sort” do not have good performance for GPU. Any better methods ?

Can you help to modify it ?

Thanks in advance.

My hardware is 9800 GT with 1G RAM, 14 multiprocessors.

iamjyj801

[codebox]#include <stdio.h>

#include <stdlib.h>

#include <cuda_runtime.h>

#include <cutil_inline.h>

//for simple data check

#define DATA_SIZE 102 // 51250*1024 //data 25600 x 1024 (row x column)

#define BLOCK_NUM 2 //1024 //equal to column number

#define THREAD_NUM 10 //512 //each column with THREAD_NUM threrads

/*

//for test too

#define DATA_SIZE 1011024 //data 25600 x 1024 (row x column)

#define BLOCK_NUM 1024 //equal to column number

#define THREAD_NUM 10 //each column with THREAD_NUM threrads

//for actually large data test

#define DATA_SIZE 512 * 50 * 1024 //data 25600 x 1024 (row x column)

#define BLOCK_NUM 1024 //equal to column number

#define THREAD_NUM 512 //each column with THREAD_NUM threrads

*/

#define DATA_ROW DATA_SIZE/BLOCK_NUM //each column with DATAROW for sort

float data[DATA_SIZE]; //now use one dimension array, want to change to 2 array

int floatCompare(const void* a, const void* B); //for CPU sort

void shellsort2(float number, int MAX) ; //for CPU sort

global static void SortByGPU(float num, float result_set1, float * result_sum)

{

//extern

__shared__ float shared_set1[BLOCK_NUM];

//extern

__shared__ float shared_sum[BLOCK_NUM];

const int tid = threadIdx.x;

const int bid = blockIdx.x;

shared_set1[tid] = 0;

shared_sum[tid] = 0;

float temp;

int index, index_addgap;

int i, j,k,gap, MAX;

MAX = DATA_ROW;

for (gap = DATA_ROW ; gap > 0; gap /= 2){			//use shell sort

	//each thread calculate their part  for(k = 0; k < gap; k++) { }

	if (threadIdx.x <gap ) 

	{

		k = threadIdx.x;

		for(i = k+gap; i < MAX; i+=gap) { 

            for(j = i - gap; j >= k; j-=gap) { 

				index = j * BLOCK_NUM + bid ;			//Since I use one dimension array to simulate 2 dim

				index_addgap= (j+gap) * BLOCK_NUM + bid ;

                if(num[index] > num[index_addgap]) {    //SWAP(num[index], num[index+gap]); 

					temp = num[index];					//x

					num[index] = num[index_addgap];		//x = y

					num[index_addgap] = temp ;			//y = t

                } 

                else break;		

            } 

		}

	}

	__syncthreads();

}

}

void GenerateNumbers(float *number, int size)

{

int mycolumns=BLOCK_NUM;

for(int k = 0; k < mycolumns; k++) {	  

  for(int j = 0; j < (DATA_ROW); j++) {

	//number[j*mycolumns+k] = (float) (rand() % 1000 +0.32 ) *1  ; //just for float

	number[j*mycolumns+k] = (float) (DATA_ROW - j )  ; //for check result 

	}

}

for (int k=0; k< DATA_SIZE; k++)

{ 	printf("k=%d  %f\n",k, number[k] ); if (k>= 21) break;  //only print out 20 data

}

fprintf(stderr, "Data generated.\n");

getchar();

}

bool InitCUDA()

{

int count;

cudaGetDeviceCount(&count);

if(count == 0) {

    fprintf(stderr, "There is no device.\n");

    return false;

}



int i;

for(i = 0; i < count; i++) {

    cudaDeviceProp prop;

    if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) {

        if(prop.major >= 1) {

            break;

        }

    }

}

if(i == count) {

    fprintf(stderr, "There is no device supporting CUDA 1.x.\n");

    return false;

}

cudaSetDevice(i);

return true;

}

#define SWAP(x,y) {float t; t = x; x = y; y = t;}

int floatCompare(const void* a, const void* B)

{

union{

  float result; 

  int retval; 

}value;

value.result = ((float)a > (float)B);

return value.retval;

}

void shellsort2(float number, int MAX) {

int i, j, k, gap; 

for (gap = MAX / 2; gap > 0; gap /= 2){

    for(k = 0; k < gap; k++) { 

        for(i = k+gap; i < MAX; i+=gap) { 

            for(j = i - gap; j >= k; j-=gap) { 

                if(number[j] > number[j+gap]) { 

                    SWAP(number[j], number[j+gap]); 

                } 

                else 

                    break; 

            } 

        } 

    }  

} 

}

int main()

{

if(!InitCUDA()) {

    return 0;

}

printf(“CUDA initialized.\n”);

GenerateNumbers(data, DATA_SIZE);

float result_check[DATA_SIZE];

unsigned int timer;

cutCreateTimer(&timer);

cutStartTimer(timer);

float* gpudata, *result_set1, *result_sum;

// clock_t* time;

cudaMalloc((void**) &gpudata, sizeof(float) * DATA_SIZE);

// cudaMalloc((void**) &result_set1, sizeof(float) * BLOCK_NUM);

cudaMalloc((void**) &result_set1, sizeof(float) * BLOCK_NUM); //special add sum after sum square

cudaMalloc((void**) &result_sum      , sizeof(float) * BLOCK_NUM);

// cudaMalloc((void**) &time, sizeof(clock_t) * BLOCK_NUM * 2);

cudaMemcpy(gpudata, data, sizeof(float) * DATA_SIZE, cudaMemcpyHostToDevice);

//sumOfSquares<<<BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(float) * 2 >>>(gpudata, result_set1, result_sum );

SortByGPU<<<BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(float) * 2 >>>(gpudata, result_set1, result_sum  );

float set1[BLOCK_NUM], sum0[BLOCK_NUM];

//clock_t time_used[BLOCK_NUM * 2];

cudaMemcpy(&set1, result_set1, sizeof(float) * BLOCK_NUM , cudaMemcpyDeviceToHost);

cudaMemcpy(&sum0     , result_sum      , sizeof(float) * BLOCK_NUM,  cudaMemcpyDeviceToHost);

cudaMemcpy(&result_check , gpudata     , sizeof(float) * DATA_SIZE,  cudaMemcpyDeviceToHost);	//copy back data

cudaFree(gpudata);

cudaFree(result_set1);

cudaFree(time);

cutStopTimer(timer);

float GPUTime = cutGetTimerValue(timer);

printf(“GPU time): %0.7f\n”, GPUTime);

for (int k=0; k< DATA_SIZE; k++)

{ 	printf("k=%d  %f\n",k, result_check[k] ); if (k>= 21) break;  //only print out 20 data

	if (k> 21) break;

}

fprintf(stderr, "Data generated.\n");

// next for CPU

cutResetTimer(timer);

cutStartTimer(timer);		//Timer begin

float tmp;

int mycolumns=BLOCK_NUM;

float be_sorted_data[DATA_ROW]={0};

for(int k = 0; k < mycolumns; k++) {	  

  for(int j = 0; j < (DATA_ROW); j++) {			//loop for each column

	tmp=data[j*mycolumns+k];					//each data for column j

	be_sorted_data[j]=tmp;

	}

	shellsort2( be_sorted_data, DATA_ROW);

}

cutStopTimer(timer);		//Timer end

float CPUTime = cutGetTimerValue(timer);

printf("CPU time): %0.7f\n", CPUTime);

printf("GPU time): %0.7f\n", GPUTime);

getchar();

return 0;

}

[/codebox]
For_cuda_sort2.zip (63.9 KB)

From what I’ve read, the bitonic sort is very effective on the GPU. There’s an example project using it included with the CUDA SDK you should take a look at.

Thank for your response.

Actually, I tried the bitonic sort method before.
But, I need to handle float numbers instead of integer and each column with 25600 data points . Then use 512 (or 1024) blocks of multithreads for individual column sorting.

The CUDA bitonic example I tried but with my capability I just can not direct adapt it or modify it.

I hope to identify which method is suitable for me, the bitonic method or any parallel method (like above shell sort).

Can you or anyone help to provide the bitonic code or modify my code?

Many thanks,