Efficient kernel sort algorithm for very small Arrays

Hi, Guys

i need a fast sorting algorithm for small Arrays. I already tried out the bitonic sort algorithm from the example code, but the performace was horrible. Sorting a 256 elements array takes 5 ms,

on a 560 gtx.

The globale Kernel:

__global__ void initCCL (const uchar* device_fdata,uchar* device_buffer,uint* buffer,int width,int height,int pitch,int block2Log) {

	const uint x =  blockIdx.x *blockDim.x+ threadIdx.x;

	const uint y = blockIdx.y *blockDim.y+ threadIdx.y;

	const uint idx = x + y * pitch; 

	

	const int localIdx = threadIdx.x + blockDim.y * threadIdx.y;

	

	

	__shared__ uint unionBuffer[TILE_SIZE];	

	__shared__ uint updateBuffer[TILE_SIZE];

	

	__shared__ uint2 tupelBuffer[TILE_SIZE*3];

	__shared__ uint2 tupelSwapBuffer[TILE_SIZE*3]; 

	__shared__ uint s_key[TILE_SIZE*3];

	__shared__ uint s_ident[TILE_SIZE*3];

	uint px = device_fdata[idx];

	updateBuffer[localIdx] = 0;

	

	s_key[localIdx] = 0;

	s_ident[localIdx] =localIdx;

	/*

	* Every Pixel in  a Quad is connected to each other

	* so every Pixel is the same group

	*/

	uint quadid = QUADID(threadIdx.x,threadIdx.y);

	//quadid = (px>0)*quadid;

	

	if(px < 1) {

		quadid= 0;		

	}else {		

		quadid = QUADID(threadIdx.x,threadIdx.y);

	}

	unionBuffer[localIdx] = quadid;

	__syncthreads();

	/*

	const uint tn = THREADSNEEDED(2);

	uint currentVal = 0;

	uint rt,r,rb;

	if(localIdx< 64) {

		if(localIdx < tn){

		const uint temp2 = localIdx * 2 + ((localIdx /7)*2) +1;

		currentVal = unionBuffer[temp2];

		rt = unionBuffer[LIMIT_MIN(temp2 -16,0)];

		r = unionBuffer[temp2+1];

		rb = unionBuffer[LIMIT_MAX(temp2+17,255)];

		

		uint2 a = {currentVal*3,rt} ;	

		sizeswap(a);

		tupelBuffer[localIdx] = a;

		s_key[localIdx *3 ] = a.y;

		

		uint2 b = {currentVal,r};

		sizeswap(b);

		tupelBuffer[localIdx*3+1] = b;

		s_key[localIdx *3 +1 ] = b.y;

		

		uint2 c = {currentVal,rb};

		sizeswap(c);

		tupelBuffer[localIdx*3+2] = c;

		s_key[localIdx * 3 +1 ]= b.y;

		s_ident[localIdx *   3] = localIdx *   3;

		s_ident[localIdx * 3+1] = localIdx * 3+1;

		s_ident[localIdx * 3+2] = localIdx * 3+2;

		}else {

			

			//Build Min Buffer

		}

	}

	*/

	if(localIdx < 128)

	bitonic_merge_sort(s_key,s_ident,localIdx,256);

	__syncthreads();

	//Reduce the tupel buffer

	device_buffer[idx] = unionBuffer[localIdx];

}

And the sorting Algorithm:

inline __device__ void bitonic_merge_sort (uint* s_key,uint* s_val,const uint& localIdx,const uint& arrayLength,uint dir){

	     for(uint size = 2; size < arrayLength; size <<= 1){

        //Bitonic merge

        uint ddd = dir ^ ( (localIdx& (size / 2)) != 0 );

        for(uint stride = size / 2; stride > 0; stride >>= 1){

            __syncthreads();

            uint pos = 2 * localIdx - (localIdx & (stride - 1));

            Comparator(

                s_key[pos +      0], s_val[pos +      0],

                s_key[pos + stride], s_val[pos + stride],

                ddd

            );

        }

    }

//ddd == dir for the last bitonic merge step

    {

        for(uint stride = arrayLength / 2; stride > 0; stride >>= 1){

            __syncthreads();

            uint pos = 2 * localIdx - (localIdx & (stride - 1));

            Comparator(

                s_key[pos +      0], s_val[pos +      0],

                s_key[pos + stride], s_val[pos + stride],

                dir

            );

        }

    }

__syncthreads();

 }

Any Idea why the code is so slow?

thx

For such small arrays, why not sort them on the host? I would imagine the poor performance would be because most of the 5 ms consists of GPU overhead.

You could try radix sort from Duanne Meril since you are sorting integers but for an array of 256 elements i don’t know if it will be faster than bitonic sort

I need to sort them in the Kernel, because i have to eliminate dublicates. The algorithm you see isnt complete, so it would be a big overhead to transfer back und elimate dublicates.