My Syncthreads function seems to be a pain in the a**

So I first tried Bitonic Sort on GPU using CUDA and it worked successfully. I also had CUDA Merge sort and that worked well too. The same functions, when I combined them into a single project to be run in an if else choice, the __syncthreads() on my Bitonic Sort GPU function is always throwing an error. Can’t seem to figure out why.

This is the function
// GPU kernel for Bitonic Sort
global void bitonicSortGPU(int* arr, int size) {
shared int sharedArr[8192];

int tid = threadIdx.x;
int gid = threadIdx.x + blockIdx.x * blockDim.x;

// Load data from global memory to shared memory
if (gid < size) {
    sharedArr[tid] = arr[gid];
}
else {
    // Set out-of-range elements to a large value (sentinel)
    sharedArr[tid] = INT_MAX;
}

// Synchronize to ensure all threads have loaded the data
__syncthreads();

// Bitonic sort algorithm
for (int k = 2; k <= size; k *= 2) {
    for (int j = k / 2; j > 0; j /= 2) {
        int ixj = tid ^ j;

        // Check if the indices are within bounds
        if (ixj < size) {
            // Sort in ascending order
            if (tid < ixj) {
                if ((tid & k) == 0 && sharedArr[tid] > sharedArr[ixj]) {
                    int temp = sharedArr[tid];
                    sharedArr[tid] = sharedArr[ixj];
                    sharedArr[ixj] = temp;
                }
                if ((tid & k) != 0 && sharedArr[tid] < sharedArr[ixj]) {
                    int temp = sharedArr[tid];
                    sharedArr[tid] = sharedArr[ixj];
                    sharedArr[ixj] = temp;
                }
            }
        }

        // Synchronize after each comparison and swap
        __syncthreads();
    }
}

// Copy sorted data back to global memory
if (gid < size) {
    arr[gid] = sharedArr[tid];
}

}

This is the way I am calling the function. I have allocated necessary cuda containers outside the else block and deallocating after outside.
else
{
// GPU variables
int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;

    cudaEventRecord(startGPU);
    bitonicSortGPU <<<blocksPerGrid, threadsPerBlock >>> (gpuArr, size);
    cudaEventRecord(stopGPU);

    // Perform CPU Bitonic Sort and measure time
    startCPU = clock();
    bitonicSortCPU(carr, size);
    endCPU = clock();
}