Random failures in quick-sort. Compute-sanitizer says there's no error

When I compile and run the program below as a single .cu file, it runs successfully a random number of times, then fails. It then re-tries with exactly the same input, and it randomly succeeds or fails.

Windows 11 - 23H2
Driver 561.09
Compiled for: compute_89, sm_89
RTX4070
NVCC with MSVC 2022 as the host compiler

// hybrid quicksort
// bugs with driver: 561.09
// RTX 4070 used only for compute, screen attached to iGPU
// MSVC 2022 - NVIDIA compiler: compute_89, sm_89
// compute-sanitizer: no errors
// when chunk size is greater than 1024, it does quicksort steps,
// splitting chunks into: left, middle (just counted), right
// when chunk size is 1024 or less, it runs a parallel odd-even sort (todo: shear-sort 1024 + network 32)

#ifndef __CUDACC__
#define __CUDACC__
#endif
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cuda_device_runtime_api.h>
#include <device_functions.h>

#include <cstdio> // fprintf in the error handler below
#include <iostream>
#include <vector>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

__global__ void resetNumTasks( int* numTasks)
{
    if (threadIdx.x == 0)
    {
        numTasks[0] = 0;
        numTasks[1] = 0;
    }
}

__global__ void copyTasksBack( int* arr,  int* leftMem,  int* rightMem,  int* numTasks,
    int* tasks, int* tasks2, int* tasks3, int* tasks4)
{
    const int id = threadIdx.x;
    const int n = numTasks[0];
    const int n2 = numTasks[1];
    const int steps = 1 + n / 1024;
    const int steps2 = 1 + n2 / 1024;

    // make quick-sort tasks usable
    for (int i = 0; i < steps; i++)
    {
        const int curId = id + i * 1024;
        if (curId < n)
        {
            tasks[curId * 2] = tasks2[curId * 2];
            tasks[curId * 2 + 1] = tasks2[curId * 2 + 1];
        }
    }


    // make brute-force tasks usable
    for (int i = 0; i < steps2; i++)
    {
        const int curId = id + i * 1024;
        if (curId < n2)
        {
            tasks3[curId * 2] = tasks4[curId * 2];
            tasks3[curId * 2 + 1] = tasks4[curId * 2 + 1];
        }
    }

}


// XOR swap; safe here because the two indices always differ
#define compareSwap(a,x,y) if(a[y]<a[x]){a[x]^=a[y];a[y]^=a[x];a[x]^=a[y];}

__global__ void bruteSort(int * __restrict__ arr, int* __restrict__ tasks3)
{
    const int id = threadIdx.x;
    const int gid = blockIdx.x;
    const int startIncluded = tasks3[gid * 2];
    const int stopIncluded = tasks3[gid * 2 + 1];
    const int num = stopIncluded - startIncluded + 1;

   
    
    __shared__ int cache[1024];
    if (startIncluded + id <= stopIncluded)
    {
        cache[id] = arr[startIncluded+id];
    }
    __syncthreads();

    
    for (int i = 0; i < num; i++)
    {
        if (id +1< num)
        {
            if ((id % 2 == 0))
            {
                compareSwap(cache, id, id+1)
            }
        }
        __syncthreads();
        if (id +1 < num)
        {
            if ((id % 2 == 1))
            {
                compareSwap(cache, id, id + 1)
            }
        }
        __syncthreads();
    }
    

    if (startIncluded + id <= stopIncluded)
    {
        arr[startIncluded + id]= cache[id];
    }
}

// task pattern: 
//              task 0      task 1      task 2      task 3      ---> array chunks to sort (no overlap)
//              start stop  start stop  start stop  start stop  ---> tasks buffer
//              block 0     block 1     block 2     block 3     ---> cuda blocks
__global__ void quickSortWithoutStreamCompaction(
    int* __restrict__ arr, int* __restrict__ leftMem, int* __restrict__ rightMem, int* __restrict__ numTasks,
    int* __restrict__ tasks, int* __restrict__ tasks2, int* __restrict__ tasks4)
{
    // 1 block = 1 chunk of data
    const int gid = blockIdx.x;
    const int id = threadIdx.x;


    const int startIncluded = tasks[gid * 2];
    const int stopIncluded = tasks[gid * 2 + 1];
    const int num = stopIncluded - startIncluded + 1;

    __shared__ int indexLeft;
    __shared__ int indexMid;
    __shared__ int indexRight;


    const int bd = blockDim.x;
    const int pivot = arr[stopIncluded];


    int nLeft = 0;
    int nMid = 0;
    int nRight = 0;
    if (id == 0)
    {
        indexLeft = 0;
        indexMid = 0;
        indexRight = 0;
    }
    __syncthreads();
    
    const int stepsArray = (num / bd) + 1;
    for (int i = 0; i < stepsArray; i++)
    {
        const int curId = i * bd + id;
        if (curId < num)
        {
            const auto data = arr[curId + startIncluded];
            if (data < pivot)
                leftMem[startIncluded + atomicAdd(&indexLeft, 1)] = data;
            else if (data > pivot)
                rightMem[startIncluded + atomicAdd(&indexRight, 1)] = data;
            else
                atomicAdd(&indexMid, 1); // this is a counting-sort-like optimization for one of worst-cases
        }
    }
    

    __syncthreads();
    nLeft = indexLeft;
    nMid = indexMid;
    nRight = indexRight;

    // move left
    const int stepsLeft = (nLeft / bd) + 1;
    for (int i = 0; i < stepsLeft; i++)
    {
        const int curId = i * bd + id;
        if (curId < nLeft)
        {
            arr[curId + startIncluded] = leftMem[startIncluded + curId];

        }
    }
    

    // move mid
    const int stepsMid = (nMid / bd) + 1;
    for (int i = 0; i < stepsMid; i++)
    {
        const int curId = i * bd + id;
        if (curId < nMid)
        {
            arr[curId + startIncluded+nLeft] = pivot;

        }
    }
    

    
    // move right
    const int stepsRight = (nRight / bd) + 1;
    for (int i = 0; i < stepsRight; i++)
    {
        const int curId = i * bd + id;
        if (curId< nRight)
        {
            arr[curId + startIncluded + nLeft + nMid] = rightMem[startIncluded + curId];
        }
    }
    
    __syncthreads();

    if (nLeft + nRight + nMid != num)
        printf(" @@ ERROR: wrong partition values @@");
    if (id == 0)
    {
        // push new "quick" task
        if (nLeft > 1)
        {            
            if (nLeft <= 1024) // push new "brute-force" task
            {
                const int index = atomicAdd(&numTasks[1], 1);
                tasks4[index * 2] = startIncluded;
                tasks4[index * 2 + 1] = startIncluded + nLeft-1;
            }
            else// push new "quick" task
            {
                const int index = atomicAdd(&numTasks[0], 1);
                tasks2[index * 2] = startIncluded;
                tasks2[index * 2 + 1] = startIncluded + nLeft-1;
            }            
        }
        

        
        if (nRight > 1)
        {

            if (nRight <= 1024) // push new "brute-force" task
            {
                const int index = atomicAdd(&numTasks[1], 1);
                tasks4[index * 2] = stopIncluded-nRight+1;
                tasks4[index * 2 + 1] = stopIncluded;
            }
            else // push new "quick" task
            {
                const int index = atomicAdd(&numTasks[0], 1);
                tasks2[index * 2] = stopIncluded - nRight+1;
                tasks2[index * 2 + 1] = stopIncluded;
            }
            
        }
    }
}



void test()
{
    constexpr int n = 18000;
    int* data, * left, * right, * numTasks;
    int* tasks, * tasks2,*tasks3,*tasks4;
    std::vector< int> hostData(n),backup(n);
    std::vector<int> hostTasks(2);
   
    gpuErrchk( cudaSetDevice(0));
    gpuErrchk( cudaDeviceSynchronize());
    gpuErrchk( cudaMalloc(&data, n * sizeof(int)));
    gpuErrchk( cudaMalloc(&left, n * sizeof(int)));
    gpuErrchk( cudaMalloc(&right, n * sizeof(int)));
    gpuErrchk( cudaMalloc(&numTasks, 2 * sizeof(int)));
    gpuErrchk( cudaMalloc(&tasks, n * sizeof(int)));
    gpuErrchk( cudaMalloc(&tasks2, n * sizeof(int)));
    gpuErrchk( cudaMalloc(&tasks3, n * sizeof(int)));
    gpuErrchk( cudaMalloc(&tasks4, n * sizeof(int)));


    int numTasksHost[2];
    int nQuickTask = 1;
    int nBruteTask =0;
        
    for (int j = 0; j < 4000; j++)
    {
        for (int i = 0; i < n; i++)
        {
            hostData[i] = rand(); // alternative worst case: n - i
            backup[i] = hostData[i];
        }
        auto qSort = [&]() {
            numTasksHost[0] = 1; // launch 1 block first
            numTasksHost[1] = 0;
            hostTasks[0] = 0;
            hostTasks[1] = n - 1; // first block's chunk limits: 0 - n-1
            gpuErrchk(cudaMemcpy((void*)data, hostData.data(), n * sizeof(int), cudaMemcpyHostToDevice));
            gpuErrchk(cudaMemcpy((void*)numTasks, numTasksHost, 2 * sizeof(int), cudaMemcpyHostToDevice));
            gpuErrchk(cudaMemcpy((void*)tasks, hostTasks.data(), 2 * sizeof(int), cudaMemcpyHostToDevice)); // host only gives 1 task with 2 parameters
            nQuickTask = 1;
            nBruteTask = 0;

            while (nQuickTask > 0 || nBruteTask > 0)
            {

                //qSortMain << <1, 1 >> > (data, left, right, 0, numTasks, tasks, tasks2);
                if (nQuickTask > 0)
                    quickSortWithoutStreamCompaction << <nQuickTask, 1024 >> > (data, left, right, numTasks, tasks, tasks2, tasks4);
                gpuErrchk(cudaGetLastError());
                if (nBruteTask > 0)
                    bruteSort << <nBruteTask, 1024 >> > (data, tasks3);
                gpuErrchk(cudaGetLastError());
                gpuErrchk(cudaDeviceSynchronize());





                gpuErrchk(cudaMemcpy(numTasksHost, (void*)numTasks, 2 * sizeof(int), cudaMemcpyDeviceToHost));
                nQuickTask = numTasksHost[0];
                nBruteTask = numTasksHost[1];

                if (nQuickTask > 0 || nBruteTask > 0)
                {
                    copyTasksBack << <1, 1024 >> > (data, left, right, numTasks, tasks, tasks2, tasks3, tasks4);
                    gpuErrchk(cudaGetLastError());
                    resetNumTasks << <1, 1 >> > (numTasks);
                    gpuErrchk(cudaGetLastError());
                }
                else
                {
                    break;
                }
                gpuErrchk(cudaDeviceSynchronize());


            }
            gpuErrchk(cudaMemcpy(hostData.data(), (void*)data, n * sizeof(int), cudaMemcpyDeviceToHost));
        };

        qSort();
        bool err = false,err2=false;
        for (int i = 0; i < n - 2; i++)
            if (hostData[i] > hostData[i + 1])
            {
                std::cout << "error at: " << i << ": " << hostData[i] << " " << hostData[i + 1] << " " << hostData[i + 2] << std::endl;
                err = true;
                j = 1000000; // force the outer test loop to end
                // re-testing with same input:
                std::cout << "re-testing with exactly same input elements:" << std::endl;
                for (int i = 0; i < n; i++)
                {
                    hostData[i] = backup[i];
                }
                qSort();
                err = false;
                for (int i = 0; i < n - 2; i++)
                    if (hostData[i] > hostData[i + 1])
                    {
                        std::cout << "Error happened again!" << std::endl;
                        err = true;
                        
                        break;
                    }

                if (!err)
                {
                    std::cout << "quicksort completed successfully with same input!!!" << std::endl;
                    // for (int i = 0; i < 35; i++)
                     //    std::cout << hostData[i] << " ";
                    err2 = true;
                }
                break;
            }

        if (!err && !err2)
        {
            std::cout << "quicksort completed successfully " << j << std::endl;
            // for (int i = 0; i < 35; i++)
             //    std::cout << hostData[i] << " ";
        }
    }

    gpuErrchk(cudaFree(data));
    gpuErrchk(cudaFree(left));
    gpuErrchk(cudaFree(right));
    gpuErrchk(cudaFree(tasks));
    gpuErrchk(cudaFree(tasks2));
    gpuErrchk(cudaFree(tasks3));
    gpuErrchk(cudaFree(tasks4));
    gpuErrchk(cudaFree(numTasks));

}

int main()
{
    test();

    return 0;
}

The algorithm does the following:

  • Takes a task made of just a start index and an inclusive end index for a chunk of the array.
  • Splits it into 3 parts: left, pivot(s), right.
  • If a resulting chunk (left or right) has more than 1 element, it is added to a task list atomically.
  • Chunks of 1024 elements or fewer go to the brute-force task list; larger ones go to the quicksort task list.
  • Chunks keep running until there is no quicksort chunk nor brute-force chunk left in the lists.
  • The quicksort and brute-force kernels are launched together (with a pipelined task-list approach) to use the GPU better (see the condensed host loop below).

For all quicksort tasks, only a single kernel is launched (1 block per chunk).

For all brute-force tasks, another kernel is launched (1 block per chunk). This works, but for arrays bigger than 1024 elements it randomly fails, and when it fails, compute-sanitizer reports no error. If the algorithm were wrong, how could it succeed 1000 times in a row on a 4M-element input? The error probability also drops as the array grows, right down to 1024 elements, where the pure brute-force path always works.
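For reference, the host-side driver loop in the listing above condenses to this shape (error checking omitted for brevity):

while (nQuickTask > 0 || nBruteTask > 0)
{
    // consume the current task lists; kernels append new tasks to tasks2/tasks4
    if (nQuickTask > 0)
        quickSortWithoutStreamCompaction<<<nQuickTask, 1024>>>(data, left, right, numTasks, tasks, tasks2, tasks4);
    if (nBruteTask > 0)
        bruteSort<<<nBruteTask, 1024>>>(data, tasks3);
    cudaDeviceSynchronize();

    // read back how many tasks of each kind were produced
    cudaMemcpy(numTasksHost, numTasks, 2 * sizeof(int), cudaMemcpyDeviceToHost);
    nQuickTask = numTasksHost[0];
    nBruteTask = numTasksHost[1];
    if (nQuickTask == 0 && nBruteTask == 0) break;

    // ping-pong: copy tasks2/tasks4 into the live lists (tasks/tasks3), reset counters
    copyTasksBack<<<1, 1024>>>(data, left, right, numTasks, tasks, tasks2, tasks3, tasks4);
    resetNumTasks<<<1, 1>>>(numTasks);
    cudaDeviceSynchronize();
}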


It was using the dynamic-parallelism feature before, and I thought the error came from that feature, so to be sure I converted it to the host-launched version above. Still the same behavior.

What kind of bug causes a kernel to fail randomly, but at a decreasing rate as input size increases? It seems odd to me: normally more data should make it fail sooner in terms of the number of sorts. It almost looks as if more data causes cache thrashing or something similar that hides an error that is perhaps cache-related.


The graphics card is not used for screen output; the iGPU drives the screen. So I guess there shouldn't be any display-related issue.


The __syncthreads() calls are not inside diverging branches.
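For clarity, this is the hazardous pattern being ruled out (an illustrative snippet, not taken from the code above; someCount stands for a hypothetical per-thread condition):

if (threadIdx.x < someCount) // divergent: only some threads enter
{
    __syncthreads(); // undefined behavior: not all threads of the block reach the barrier
}

Every barrier in the kernels above is reached by all 1024 threads of the block.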


There is no other application using the same GPU (all memory belongs to this app; tests used array sizes only up to 4M elements).


RAM is stable, and the GPU/VRAM are not overclocked (I even tried down-clocking).

try running your code with:

compute-sanitizer --tool initcheck ...

when I do that, errors are reported:

# nvcc -o t286 t286.cu -arch=sm_89 -lineinfo
# compute-sanitizer --tool initcheck ./t286
========= COMPUTE-SANITIZER
========= Uninitialized __global__ memory read of size 4 bytes
=========     at 0x420 in /root/bobc/t286.cu:50:copyTasksBack(int *, int *, int *, int *, int *, int *, int *, int *)
=========     by thread (0,0,0) in block (0,0,0)
=========     Address 0x7ff6da846a00
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x32e950]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame:libcudart_static_4d8b33a106dceb3c07a56e26de61f2d53bb62a68 [0x126be]
=========                in /root/bobc/./t286
=========     Host Frame:cudaLaunchKernel [0x728ce]
=========                in /root/bobc/./t286
=========     Host Frame:cudaError cudaLaunchKernel<char>(char const*, dim3, dim3, void**, unsigned long, CUstream_st*) [0xc61f]
=========                in /root/bobc/./t286
=========     Host Frame:__device_stub__Z13copyTasksBackPiS_S_S_S_S_S_S_(int*, int*, int*, int*, int*, int*, int*, int*) [0xbf93]
=========                in /root/bobc/./t286
=========     Host Frame:copyTasksBack(int*, int*, int*, int*, int*, int*, int*, int*) [0xbffd]
=========                in /root/bobc/./t286
=========     Host Frame:test()::{lambda()#1}::operator()() const [0xb114]
=========                in /root/bobc/./t286
=========     Host Frame:test() [0xb649]
=========                in /root/bobc/./t286
=========     Host Frame:main [0xbc06]
=========                in /root/bobc/./t286
=========     Host Frame:../sysdeps/nptl/libc_start_call_main.h:58:__libc_start_call_main [0x29d90]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:../csu/libc-start.c:379:__libc_start_main [0x29e40]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:_start [0xab95]
=========                in /root/bobc/./t286
=========
========= Uninitialized __global__ memory read of size 4 bytes
=========     at 0x460 in /root/bobc/t286.cu:51:copyTasksBack(int *, int *, int *, int *, int *, int *, int *, int *)
=========     by thread (0,0,0) in block (0,0,0)
=========     Address 0x7ff6da846a04
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x32e950]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame:libcudart_static_4d8b33a106dceb3c07a56e26de61f2d53bb62a68 [0x126be]
=========                in /root/bobc/./t286
=========     Host Frame:cudaLaunchKernel [0x728ce]
=========                in /root/bobc/./t286
=========     Host Frame:cudaError cudaLaunchKernel<char>(char const*, dim3, dim3, void**, unsigned long, CUstream_st*) [0xc61f]
=========                in /root/bobc/./t286
=========     Host Frame:__device_stub__Z13copyTasksBackPiS_S_S_S_S_S_S_(int*, int*, int*, int*, int*, int*, int*, int*) [0xbf93]
=========                in /root/bobc/./t286
=========     Host Frame:copyTasksBack(int*, int*, int*, int*, int*, int*, int*, int*) [0xbffd]
=========                in /root/bobc/./t286
=========     Host Frame:test()::{lambda()#1}::operator()() const [0xb114]
=========                in /root/bobc/./t286
=========     Host Frame:test() [0xb649]
=========                in /root/bobc/./t286
=========     Host Frame:main [0xbc06]
=========                in /root/bobc/./t286
=========     Host Frame:../sysdeps/nptl/libc_start_call_main.h:58:__libc_start_call_main [0x29d90]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:../csu/libc-start.c:379:__libc_start_main [0x29e40]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:_start [0xab95]
=========                in /root/bobc/./t286
=========

As indicated above, it is often useful to compile your code with -lineinfo when using compute-sanitizer. See here. Unit 12 of this online series also discusses debugging.
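To see what initcheck is complaining about in isolation, here is a minimal standalone example (not from your code): cudaMalloc does not initialize the allocation, so a device-side read before any write is flagged as an uninitialized global memory read.

#include <cuda_runtime.h>

__global__ void readUninit(const int* p, int* out)
{
    *out = p[0]; // reads memory that was never written
}

int main()
{
    int *p = nullptr, *out = nullptr;
    cudaMalloc(&p, sizeof(int));   // allocation succeeds, but contents are indeterminate
    cudaMalloc(&out, sizeof(int));
    readUninit<<<1, 1>>>(p, out);  // initcheck reports the read of p[0]
    cudaDeviceSynchronize();
    return 0;
}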


I applied the flags and now it shows the relevant lines, thank you. But still, the allocations are not failing:

    gpuErrchk( cudaMalloc(&tasks, n * sizeof(int)));
    gpuErrchk( cudaMalloc(&tasks2, n * sizeof(int)));
    gpuErrchk( cudaMalloc(&tasks3, n * sizeof(int)));
    gpuErrchk( cudaMalloc(&tasks4, n * sizeof(int)));

but a new kernel resetting them fails:

__global__ void resetTasks(int * tasks, int * tasks2, int * tasks3, int * tasks4, const int n)
{
    const int id = threadIdx.x + blockIdx.x * blockDim.x;
    if (id < n)
    {
        tasks[id] = -1;
        tasks2[id] = -1;
        tasks3[id] = -1;
        tasks4[id] = -1;
    }
}

...

resetTasks << <1 + n / 1024, 1024 >> > (tasks, tasks2, tasks3, tasks4, n);

output:

GPUassert: an illegal memory access was encountered C:\Users\admin\Desktop\opencl\CudaApp\kernel2.cu 330

It looks like another kernel is failing now. I think I can solve the rest of the problems using this line info. Thank you.
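As an alternative to the reset kernel, the buffers could also be filled right after allocation with cudaMemset (it writes a byte value, so 0xFF in every byte of an int yields -1):

gpuErrchk(cudaMemset(tasks, 0xFF, n * sizeof(int)));
gpuErrchk(cudaMemset(tasks2, 0xFF, n * sizeof(int)));
gpuErrchk(cudaMemset(tasks3, 0xFF, n * sizeof(int)));
gpuErrchk(cudaMemset(tasks4, 0xFF, n * sizeof(int)));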

I didn't say anything about allocations "failing", and that is not what that error report from compute-sanitizer initcheck means. You may wish to read the compute-sanitizer docs or study the resources I linked.

