Hello
I’m writing counting radix sort and I have a problem. I have a list of N numbers and my code works only for floor( N/Block_size) of them.
(cellOld[particleID] = index of particle’s cell)
[codebox]global void CUDA_sortCellArray(int* cellOld, int* cellId, int* parId, PSYSATTR* A){
int indx = blockDim.x * blockIdx.x + threadIdx.x;
int thId = threadIdx.x;
int count = (*A).count;
if(indx < count){
int newId = 0;
int RnewId = 0;
int thCell = cellOld[indx];
for(int i=0; (i*BLOCK_SIZE)<count; i++){
__shared__ int cell[BLOCK_SIZE];
if(i*BLOCK_SIZE+thId < count) cell[thId] = cellOld[i*BLOCK_SIZE+ thId];
syncthreads();
for(int j=0; j<BLOCK_SIZE && ((i*BLOCK_SIZE+j)<count); j++){
if(cell[j]<thCell) newId+=1;
if(cell[j]==thCell && (i*BLOCK_SIZE+j)<indx) newId+=1;
}
syncthreads();
}
// Here I'm calculating new index without shared memory and it works alright
for(int j=0; j<count; j++){
if(cellOld[j]<thCell) RnewId+=1;
if(cellOld[j]==thCell && (j)<indx) RnewId+=1;
}
cellId[newId] = indx;
parId[newId] = newId;
}
}[/codebox]
Have anyone any idea what might be wrong??