When I compile and run the program below as a single .cu file, it runs successfully a random number of times and then fails. It then retries with exactly the same input, and that retry randomly succeeds or fails.
Windows 11 - 23H2
Driver 561.09
Compiled for: compute_89, sm_89
RTX4070
MSVC 2022’s NVCC compiler
// hybrid quicksort
// bugs with driver: 561.09
// RTX-4070 alone for computing, screen attached to iGPU
// MSVC 2022 - Nvidia Compiler: compute_89, sm_89
// compute-sanitizer: no errors
// when chunk size is greater than 1024, it does quicksort steps
// continues splitting chunks like: left, middle(just count), right
// when chunk size is 1024 or less, executes parallel odd-even sort (todo: shear-sort 1024 + network 32)
#ifndef __CUDACC__
#define __CUDACC__
#endif
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cuda_device_runtime_api.h>
#include <device_functions.h>
#include<iostream>
#include<vector>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Reports a CUDA runtime error (message, file, line) on stderr and, by default,
// terminates the process. Intended to be invoked via the gpuErrchk() macro.
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    // Fast path: nothing to report.
    if (code == cudaSuccess)
        return;

    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code); // NOTE(review): the OS may truncate this exit status — confirm acceptable
}
// Clears both device-side task counters so the next pass starts empty.
// numTasks[0] = pending quick-sort tasks, numTasks[1] = pending brute-force tasks.
// Launched as <<<1,1>>>; the guard makes any launch shape safe anyway.
__global__ void resetNumTasks( int* numTasks)
{
    if (threadIdx.x != 0)
        return;
    numTasks[0] = 0;
    numTasks[1] = 0;
}
// Publishes the freshly generated tasks so the next pass can consume them:
// copies the staging buffers (tasks2 -> tasks for quick-sort chunks,
// tasks4 -> tasks3 for brute-force chunks). Each task is a [start, stop]
// index pair, so task k occupies elements 2k and 2k+1 of its buffer.
// numTasks[0]/numTasks[1] give the number of quick/brute tasks to copy.
// arr, leftMem and rightMem are unused here; they are kept so the call sites
// keep their existing argument lists.
// Launched as a single block; the loops below stride by the actual block size
// (previously hard-coded to 1024, which silently broke any other launch shape).
__global__ void copyTasksBack( int* arr, int* leftMem, int* rightMem, int* numTasks,
int* tasks, int* tasks2, int* tasks3, int* tasks4)
{
    const int id = threadIdx.x;
    const int bd = blockDim.x;
    const int n = numTasks[0];   // new quick-sort task count
    const int n2 = numTasks[1];  // new brute-force task count
    const int steps = 1 + n / bd;
    const int steps2 = 1 + n2 / bd;
    // make quick-sort tasks usable
    for (int i = 0; i < steps; i++)
    {
        const int curId = id + i * bd;
        if (curId < n)
        {
            tasks[curId * 2] = tasks2[curId * 2];
            tasks[curId * 2 + 1] = tasks2[curId * 2 + 1];
        }
    }
    // make brute-force tasks usable
    for (int i = 0; i < steps2; i++)
    {
        const int curId = id + i * bd;
        if (curId < n2)
        {
            tasks3[curId * 2] = tasks4[curId * 2];
            tasks3[curId * 2 + 1] = tasks4[curId * 2 + 1];
        }
    }
}
// Compare-and-swap: puts the smaller of a[x], a[y] into a[x].
// Uses the XOR-swap trick, which is only safe because below x and y are always
// distinct indices (id and id+1) touched by exactly one thread per phase.
#define compareSwap(a,x,y) if(a[y]<a[x]){a[x]^=a[y];a[y]^=a[x];a[x]^=a[y];}
// Sorts one small chunk (num <= 1024 elements) per block, in place, using
// parallel odd-even transposition sort staged in shared memory.
// Grid: one block per brute-force task in tasks3 ([start, stop] inclusive pairs).
// Block: assumes blockDim.x == 1024 (one thread per potential element slot) —
// callers launch it as <<<nBruteTask, 1024>>>.
__global__ void bruteSort(int * __restrict__ arr, int* __restrict__ tasks3)
{
const int id = threadIdx.x;
const int gid = blockIdx.x;
const int startIncluded = tasks3[gid * 2];
const int stopIncluded = tasks3[gid * 2 + 1];
const int num = stopIncluded - startIncluded + 1;
__shared__ int cache[1024];
// Stage the chunk into shared memory. cache[id] stays uninitialized for
// id >= num, but those slots are never compared nor written back.
if (startIncluded + id <= stopIncluded)
{
cache[id] = arr[startIncluded+id];
}
__syncthreads();
// num rounds of (even phase + odd phase) = 2*num phases; odd-even
// transposition sort needs only num phases, so this fully sorts the chunk.
// num is uniform across the block, so every thread reaches every barrier.
for (int i = 0; i < num; i++)
{
// Even phase: threads with even id compare pairs (id, id+1).
if (id +1< num)
{
if ((id % 2 == 0))
{
compareSwap(cache, id, id+1)
}
}
__syncthreads();
// Odd phase: threads with odd id compare pairs (id, id+1).
if (id +1 < num)
{
if ((id % 2 == 1))
{
compareSwap(cache, id, id + 1)
}
}
__syncthreads();
}
// Write the sorted chunk back to global memory; the loop above always ends
// with a barrier, so neighbor writes to cache are visible here.
if (startIncluded + id <= stopIncluded)
{
arr[startIncluded + id]= cache[id];
}
}
// task pattern:
// task 0 task 1 task 2 task 3 ---> array chunks to sort (no overlap)
// start stop start stop start stop start stop ---> tasks buffer
// block 0 block 1 block 2 block 3 ---> cuda blocks
//
// One quick-sort partition step per block: the block's chunk
// [startIncluded, stopIncluded] is split around pivot = arr[stopIncluded] into
// left (< pivot), mid (== pivot, only counted and later refilled with the pivot
// value) and right (> pivot). Sub-chunks of size > 1 are pushed as new tasks:
// size <= 1024 into the brute-force staging list (tasks4 / numTasks[1]),
// larger ones into the quick-sort staging list (tasks2 / numTasks[0]).
// leftMem/rightMem are element-sized scratch buffers indexed inside the chunk's
// own [startIncluded, ...] range; since chunks of concurrently running blocks
// are disjoint, blocks never overlap in these buffers.
__global__ void quickSortWithoutStreamCompaction(
int* __restrict__ arr, int* __restrict__ leftMem, int* __restrict__ rightMem, int* __restrict__ numTasks,
int* __restrict__ tasks, int* __restrict__ tasks2, int* __restrict__ tasks4)
{
// 1 block = 1 chunk of data
const int gid = blockIdx.x;
const int id = threadIdx.x;
const int startIncluded = tasks[gid * 2];
const int stopIncluded = tasks[gid * 2 + 1];
const int num = stopIncluded - startIncluded + 1;
// Block-shared partition counters, bumped with shared-memory atomics below.
__shared__ int indexLeft;
__shared__ int indexMid;
__shared__ int indexRight;
const int bd = blockDim.x;
// Every thread reads the pivot before this block writes arr; other blocks only
// touch their own disjoint chunks, so this read is stable.
const int pivot = arr[stopIncluded];
int nLeft = 0;
int nMid = 0;
int nRight = 0;
if (id == 0)
{
indexLeft = 0;
indexMid = 0;
indexRight = 0;
}
__syncthreads();
// Partition phase: scatter elements into leftMem/rightMem via atomic slots.
// NOTE(review): slot order within leftMem/rightMem is nondeterministic
// (depends on atomic timing), so intermediate layouts differ run-to-run even
// though the final sorted result should not.
const int stepsArray = (num / bd) + 1;
for (int i = 0; i < stepsArray; i++)
{
const int curId = i * bd + id;
if (curId < num)
{
const auto data = arr[curId + startIncluded];
if (data < pivot)
leftMem[startIncluded + atomicAdd(&indexLeft, 1)] = data;
else if (data > pivot)
rightMem[startIncluded + atomicAdd(&indexRight, 1)] = data;
else
atomicAdd(&indexMid, 1); // this is a counting-sort-like optimization for one of worst-cases
}
}
// Barrier: all reads of arr and writes to leftMem/rightMem above complete and
// become visible block-wide before the copy-back below overwrites arr.
__syncthreads();
nLeft = indexLeft;
nMid = indexMid;
nRight = indexRight;
// The three copy-back loops below write disjoint sub-ranges of the chunk
// ([left][mid][right]), so no barrier is needed between them.
// move left
const int stepsLeft = (nLeft / bd) + 1;
for (int i = 0; i < stepsLeft; i++)
{
const int curId = i * bd + id;
if (curId < nLeft)
{
arr[curId + startIncluded] = leftMem[startIncluded + curId];
}
}
// move mid
const int stepsMid = (nMid / bd) + 1;
for (int i = 0; i < stepsMid; i++)
{
const int curId = i * bd + id;
if (curId < nMid)
{
arr[curId + startIncluded+nLeft] = pivot;
}
}
// move right
const int stepsRight = (nRight / bd) + 1;
for (int i = 0; i < stepsRight; i++)
{
const int curId = i * bd + id;
if (curId< nRight)
{
arr[curId + startIncluded + nLeft + nMid] = rightMem[startIncluded + curId];
}
}
// Ensure the whole chunk is rewritten before thread 0 publishes child tasks.
__syncthreads();
// Sanity check; nMid >= 1 always (the pivot matches itself), which also
// guarantees every child chunk is strictly smaller than num (termination).
if (nLeft + nRight + nMid != num)
printf(" @@ ERROR: wrong partition values @@");
// Thread 0 enqueues child chunks; numTasks atomics are global, so concurrent
// blocks get unique slots in tasks2/tasks4.
if (id == 0)
{
// push new "quick" task
if (nLeft > 1)
{
if (nLeft <= 1024) // push new "brute-force" task
{
const int index = atomicAdd(&numTasks[1], 1);
tasks4[index * 2] = startIncluded;
tasks4[index * 2 + 1] = startIncluded + nLeft-1;
}
else// push new "quick" task
{
const int index = atomicAdd(&numTasks[0], 1);
tasks2[index * 2] = startIncluded;
tasks2[index * 2 + 1] = startIncluded + nLeft-1;
}
}
if (nRight > 1)
{
if (nRight <= 1024) // push new "brute-force" task
{
const int index = atomicAdd(&numTasks[1], 1);
// right chunk starts right after the mid section: start+nLeft+nMid == stop-nRight+1
tasks4[index * 2] = stopIncluded-nRight+1;
tasks4[index * 2 + 1] = stopIncluded;
}
else // push new "quick" task
{
const int index = atomicAdd(&numTasks[0], 1);
tasks2[index * 2] = stopIncluded - nRight+1;
tasks2[index * 2 + 1] = stopIncluded;
}
}
}
}
// Host-side driver: repeatedly fills an array with random values, sorts it on
// the GPU with the task-queue hybrid quicksort above, and verifies the result
// on the CPU. On the first failure it immediately re-runs the sort on the
// identical input (from `backup`) to check whether the failure reproduces.
// Fix vs. original: the verification loops ran `i < n - 2`, so the last
// adjacent pair hostData[n-2]/hostData[n-1] was never checked and an unsorted
// final element passed silently. The bound is now n - 1 and the three-value
// debug print is guarded against reading past the end.
void test()
{
    constexpr int n = 18000;
    int* data, * left, * right, * numTasks;
    int* tasks, * tasks2, * tasks3, * tasks4;
    std::vector<int> hostData(n), backup(n);
    std::vector<int> hostTasks(2);
    gpuErrchk(cudaSetDevice(0));
    gpuErrchk(cudaDeviceSynchronize());
    // Scratch buffers are all n ints: leftMem/rightMem hold at most one entry
    // per array element, and n ints give room for n/2 [start,stop] task pairs.
    gpuErrchk(cudaMalloc(&data, n * sizeof(int)));
    gpuErrchk(cudaMalloc(&left, n * sizeof(int)));
    gpuErrchk(cudaMalloc(&right, n * sizeof(int)));
    gpuErrchk(cudaMalloc(&numTasks, 2 * sizeof(int)));
    gpuErrchk(cudaMalloc(&tasks, n * sizeof(int)));
    gpuErrchk(cudaMalloc(&tasks2, n * sizeof(int)));
    gpuErrchk(cudaMalloc(&tasks3, n * sizeof(int)));
    gpuErrchk(cudaMalloc(&tasks4, n * sizeof(int)));
    int numTasksHost[2];
    int nQuickTask = 1;
    int nBruteTask = 0;
    for (int j = 0; j < 4000; j++)
    {
        for (int i = 0; i < n; i++)
        {
            hostData[i] = rand();
            backup[i] = hostData[i]; // keep a copy so a failing input can be replayed
        }
        // Runs one full GPU sort of hostData, looping kernel passes until both
        // the quick-sort and brute-force task lists drain.
        auto qSort = [&]() {
            numTasksHost[0] = 1; // launch 1 block first
            numTasksHost[1] = 0;
            hostTasks[0] = 0;
            hostTasks[1] = n - 1; // first block's chunk limits: 0 - n-1
            gpuErrchk(cudaMemcpy((void*)data, hostData.data(), n * sizeof(int), cudaMemcpyHostToDevice));
            gpuErrchk(cudaMemcpy((void*)numTasks, numTasksHost, 2 * sizeof(int), cudaMemcpyHostToDevice));
            gpuErrchk(cudaMemcpy((void*)tasks, hostTasks.data(), 2 * sizeof(int), cudaMemcpyHostToDevice)); // host only gives 1 task with 2 parameters
            nQuickTask = 1;
            nBruteTask = 0;
            while (nQuickTask > 0 || nBruteTask > 0)
            {
                // Both kernels go to the default stream, so they serialize:
                // quicksort pushes tasks for the NEXT pass while bruteSort
                // finishes tasks queued in the PREVIOUS pass.
                if (nQuickTask > 0)
                    quickSortWithoutStreamCompaction << <nQuickTask, 1024 >> > (data, left, right, numTasks, tasks, tasks2, tasks4);
                gpuErrchk(cudaGetLastError());
                if (nBruteTask > 0)
                    bruteSort << <nBruteTask, 1024 >> > (data, tasks3);
                gpuErrchk(cudaGetLastError());
                gpuErrchk(cudaDeviceSynchronize());
                gpuErrchk(cudaMemcpy(numTasksHost, (void*)numTasks, 2 * sizeof(int), cudaMemcpyDeviceToHost));
                nQuickTask = numTasksHost[0];
                nBruteTask = numTasksHost[1];
                if (nQuickTask > 0 || nBruteTask > 0)
                {
                    // Move staged tasks into the live buffers, then zero counters.
                    copyTasksBack << <1, 1024 >> > (data, left, right, numTasks, tasks, tasks2, tasks3, tasks4);
                    gpuErrchk(cudaGetLastError());
                    resetNumTasks << <1, 1 >> > (numTasks);
                    gpuErrchk(cudaGetLastError());
                }
                else
                {
                    break;
                }
                gpuErrchk(cudaDeviceSynchronize());
            }
            gpuErrchk(cudaMemcpy(hostData.data(), (void*)data, n * sizeof(int), cudaMemcpyDeviceToHost));
        };
        qSort();
        bool err = false, err2 = false;
        // Verify every adjacent pair, including the final one at (n-2, n-1).
        for (int i = 0; i < n - 1; i++)
            if (hostData[i] > hostData[i + 1])
            {
                std::cout << "error at: " << i << ": " << hostData[i] << " " << hostData[i + 1];
                if (i + 2 < n) // third value is only context for debugging
                    std::cout << " " << hostData[i + 2];
                std::cout << std::endl;
                err = true;
                j = 1000000; // stop the outer stress loop after handling this case
                // re-testing with same input:
                std::cout << "re-testing with exactly same input elements:" << std::endl;
                for (int k = 0; k < n; k++)
                {
                    hostData[k] = backup[k];
                }
                qSort();
                err = false;
                for (int k = 0; k < n - 1; k++)
                    if (hostData[k] > hostData[k + 1])
                    {
                        std::cout << "Error happened again!" << std::endl;
                        err = true;
                        break;
                    }
                if (!err)
                {
                    std::cout << "quicksort completed successfully with same input!!!" << std::endl;
                    err2 = true; // sorted on replay: the failure did not reproduce
                }
                break;
            }
        if (!err && !err2)
        {
            std::cout << "quicksort completed successfully " << j << std::endl;
        }
    }
    gpuErrchk(cudaFree(data));
    gpuErrchk(cudaFree(left));
    gpuErrchk(cudaFree(right));
    gpuErrchk(cudaFree(tasks));
    gpuErrchk(cudaFree(tasks2));
    gpuErrchk(cudaFree(tasks3));
    gpuErrchk(cudaFree(tasks4));
    gpuErrchk(cudaFree(numTasks));
}
// Entry point: runs the GPU hybrid-quicksort stress test.
int main()
{
    test();
    return 0;
}
Algorithm is doing this:
- Takes a task that is made of just a start point and an end point inclusive index for a chunk of array.
- Splits it into 3 parts, left, pivot(s),right
- if a resulting chunk (left or right) is bigger than 1 element, it is added to the quick-sort task list atomically
- if that chunk's size is 1024 or less, it is added to the brute-force task list atomically instead
- keep running chunks until there’s no quicksort chunk nor brute-force chunk left in list
- quick-sort & brute-force kernels are launched together (with pipelined task list approach) to use gpu better
For all quick-sort tasks, only single kernel is launched (1 block per chunk).
For all brute-force tasks, another kernel is launched (1 block per chunk). This works but for arrays bigger than 1024, it randomly fails. When it fails, compute-sanitizer says no error. If algorithm is wrong, then how can it work successfully for 1000 times in a row for 4M sized input (and much less probability of error, the bigger the array the less the error probability, until 1024 which is brute force that works)?
It was using dynamic-parallelism feature before and I thought the error was from this feature but to be sure about this I converted it to host-launched version as above. Still same behavior.
What kind of bug causes a kernel to randomly fail but with decreasing rate with increased input size? It’s a bit weird to me. Normally more data should make it fail quicker in terms of number of sorts. But it almost looks like too much data does some cache-thrashing or similar thing to hide the error related to perhaps caching.
The graphics card is not used for screen output. I have iGPU giving output to screen. So there shouldn’t be any kind of screen related bug I guess.
__syncthreads()
instructions are not inside diverging branches.
There is no other application using the same gpu (all memory belongs to this app, also tests used only array sizes up to 4M elements).
RAM is stable, GPU/VRAM are not overclocked (even tried down-clocking).