So I first tried Bitonic Sort on GPU using CUDA and it worked successfully. I also had CUDA Merge sort and that worked well too. The same functions, when I combined them into a single project to be run in an if else choice, the __syncthreads() on my Bitonic Sort GPU function is always throwing an error. Can’t seem to figure out why.
This is the function
// GPU kernel for Bitonic Sort
global void bitonicSortGPU(int* arr, int size) {
shared int sharedArr[8192];
int tid = threadIdx.x;
int gid = threadIdx.x + blockIdx.x * blockDim.x;
// Load data from global memory to shared memory
if (gid < size) {
sharedArr[tid] = arr[gid];
}
else {
// Set out-of-range elements to a large value (sentinel)
sharedArr[tid] = INT_MAX;
}
// Synchronize to ensure all threads have loaded the data
__syncthreads();
// Bitonic sort algorithm
for (int k = 2; k <= size; k *= 2) {
for (int j = k / 2; j > 0; j /= 2) {
int ixj = tid ^ j;
// Check if the indices are within bounds
if (ixj < size) {
// Sort in ascending order
if (tid < ixj) {
if ((tid & k) == 0 && sharedArr[tid] > sharedArr[ixj]) {
int temp = sharedArr[tid];
sharedArr[tid] = sharedArr[ixj];
sharedArr[ixj] = temp;
}
if ((tid & k) != 0 && sharedArr[tid] < sharedArr[ixj]) {
int temp = sharedArr[tid];
sharedArr[tid] = sharedArr[ixj];
sharedArr[ixj] = temp;
}
}
}
// Synchronize after each comparison and swap
__syncthreads();
}
}
// Copy sorted data back to global memory
if (gid < size) {
arr[gid] = sharedArr[tid];
}
}
This is the way I am calling the function. I have allocated necessary cuda containers outside the else block and deallocating after outside.
else
{
// GPU variables
int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;
cudaEventRecord(startGPU);
bitonicSortGPU <<<blocksPerGrid, threadsPerBlock >>> (gpuArr, size);
cudaEventRecord(stopGPU);
// Perform CPU Bitonic Sort and measure time
startCPU = clock();
bitonicSortCPU(carr, size);
endCPU = clock();
}