Hi, Guys
i need a fast sorting algorithm for small Arrays. I already tried out the bitonic sort algorithm from the example code, but the performace was horrible. Sorting a 256 elements array takes 5 ms,
on a 560 gtx.
The globale Kernel:
__global__ void initCCL (const uchar* device_fdata,uchar* device_buffer,uint* buffer,int width,int height,int pitch,int block2Log) {
const uint x = blockIdx.x *blockDim.x+ threadIdx.x;
const uint y = blockIdx.y *blockDim.y+ threadIdx.y;
const uint idx = x + y * pitch;
const int localIdx = threadIdx.x + blockDim.y * threadIdx.y;
__shared__ uint unionBuffer[TILE_SIZE];
__shared__ uint updateBuffer[TILE_SIZE];
__shared__ uint2 tupelBuffer[TILE_SIZE*3];
__shared__ uint2 tupelSwapBuffer[TILE_SIZE*3];
__shared__ uint s_key[TILE_SIZE*3];
__shared__ uint s_ident[TILE_SIZE*3];
uint px = device_fdata[idx];
updateBuffer[localIdx] = 0;
s_key[localIdx] = 0;
s_ident[localIdx] =localIdx;
/*
* Every Pixel in a Quad is connected to each other
* so every Pixel is the same group
*/
uint quadid = QUADID(threadIdx.x,threadIdx.y);
//quadid = (px>0)*quadid;
if(px < 1) {
quadid= 0;
}else {
quadid = QUADID(threadIdx.x,threadIdx.y);
}
unionBuffer[localIdx] = quadid;
__syncthreads();
/*
const uint tn = THREADSNEEDED(2);
uint currentVal = 0;
uint rt,r,rb;
if(localIdx< 64) {
if(localIdx < tn){
const uint temp2 = localIdx * 2 + ((localIdx /7)*2) +1;
currentVal = unionBuffer[temp2];
rt = unionBuffer[LIMIT_MIN(temp2 -16,0)];
r = unionBuffer[temp2+1];
rb = unionBuffer[LIMIT_MAX(temp2+17,255)];
uint2 a = {currentVal*3,rt} ;
sizeswap(a);
tupelBuffer[localIdx] = a;
s_key[localIdx *3 ] = a.y;
uint2 b = {currentVal,r};
sizeswap(b);
tupelBuffer[localIdx*3+1] = b;
s_key[localIdx *3 +1 ] = b.y;
uint2 c = {currentVal,rb};
sizeswap(c);
tupelBuffer[localIdx*3+2] = c;
s_key[localIdx * 3 +1 ]= b.y;
s_ident[localIdx * 3] = localIdx * 3;
s_ident[localIdx * 3+1] = localIdx * 3+1;
s_ident[localIdx * 3+2] = localIdx * 3+2;
}else {
//Build Min Buffer
}
}
*/
if(localIdx < 128)
bitonic_merge_sort(s_key,s_ident,localIdx,256);
__syncthreads();
//Reduce the tupel buffer
device_buffer[idx] = unionBuffer[localIdx];
}
And the sorting Algorithm:
inline __device__ void bitonic_merge_sort (uint* s_key,uint* s_val,const uint& localIdx,const uint& arrayLength,uint dir){
for(uint size = 2; size < arrayLength; size <<= 1){
//Bitonic merge
uint ddd = dir ^ ( (localIdx& (size / 2)) != 0 );
for(uint stride = size / 2; stride > 0; stride >>= 1){
__syncthreads();
uint pos = 2 * localIdx - (localIdx & (stride - 1));
Comparator(
s_key[pos + 0], s_val[pos + 0],
s_key[pos + stride], s_val[pos + stride],
ddd
);
}
}
//ddd == dir for the last bitonic merge step
{
for(uint stride = arrayLength / 2; stride > 0; stride >>= 1){
__syncthreads();
uint pos = 2 * localIdx - (localIdx & (stride - 1));
Comparator(
s_key[pos + 0], s_val[pos + 0],
s_key[pos + stride], s_val[pos + stride],
dir
);
}
}
__syncthreads();
}
Any Idea why the code is so slow?
thx