Hello, I’m new to CUDA and recently learned about race conditions.
So I created a simple kernel that has a race condition and observed the results.
__global__ void raceKernel(float *A, float *result, int n) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < n && col < n) {
        *result = A[row*n + col];  // every in-range thread races on this single write
    }
}
I expected the result to be effectively random, determined by whichever thread happens to write last.
But when I run this kernel many times with different launch parameters, I see a lot of repeated outcomes.
And for some combinations of input matrix size and block size, the result always comes from the same (0th) lane of an arbitrary warp.
Example: matrix size 48x48, block size = 16x16
I launched the kernel 10000 times with the same parameters.
// WarpIdx X Y means the result came from warp X, thread (lane) Y within that warp
Iteration 1 - Result: 1056 - WarpIdx: 33 0
Iteration 2 - Result: 960 - WarpIdx: 30 0
Iteration 6 - Result: 1440 - WarpIdx: 45 0
Iteration 284 - Result: 1248 - WarpIdx: 39 0
Iteration 292 - Result: 96 - WarpIdx: 3 0
Iteration 332 - Result: 864 - WarpIdx: 27 0
Iteration 338 - Result: 1856 - WarpIdx: 58 0
Iteration 486 - Result: 672 - WarpIdx: 21 0
Iteration 1002 - Result: 768 - WarpIdx: 24 0
Iteration 2277 - Result: 1664 - WarpIdx: 52 0
Iteration 6334 - Result: 288 - WarpIdx: 9 0
Iteration 6622 - Result: 2048 - WarpIdx: 64 0
12 different outcomes
1 unique warp indexes: 0
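To spell out the WarpIdx columns: they are just the quotient and remainder of the result by 32, under my assumption that the flat element index matches the linear thread id. For example, for the first iteration:

int warp = 1056 / 32;  // = 33, the warp index under my assumption
int lane = 1056 % 32;  // = 0, the lane (thread) inside that warp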
I only tested a couple of combinations, but they all show the same property (listed as matrix width, block width):
- 2048, 32
- 24, 8
- 16, 16
- or any matrix width that is divisible by 32 with a block width of 32
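My mental model for why these combinations force lane 0 is sketched below, under my assumption that lanes follow the row-major linear thread id (threadIdx.y * blockDim.x + threadIdx.x); the helper is hypothetical and not part of my test program:

// Hypothetical helper: the lane that matrix element (row, col) would
// map to, assuming warps are formed from the row-major linear thread id.
int laneOf(int row, int col, int blockW, int blockH) {
    int tx = col % blockW;       // threadIdx.x of the owning thread
    int ty = row % blockH;       // threadIdx.y of the owning thread
    int tid = ty * blockW + tx;  // linear thread id within the block
    return tid % 32;             // lane within its warp
}

With blockW = 32 this reduces to col % 32, and when the matrix width is also a multiple of 32 the flat index row*n + col is a multiple of 32 exactly when col % 32 == 0, i.e. lane 0.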
And there are other interesting outcomes too:
- 8, 6 - the result is always 48
- 16, 16 - the results are always multiples of 32
- 16, 32 - the results are multiples of 64
Iteration 1 - Result: 128 - WarpIdx: 4 0
Iteration 2 - Result: 720 - WarpIdx: 22 16
Iteration 112 - Result: 192 - WarpIdx: 6 0
Iteration 275 - Result: 656 - WarpIdx: 20 16
Iteration 505 - Result: 448 - WarpIdx: 14 0
Iteration 684 - Result: 384 - WarpIdx: 12 0
Iteration 9306 - Result: 528 - WarpIdx: 16 16
7 different outcomes
2 unique warp indexes: 0 - 16
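Spelling one of those out: 720 from iteration 2 gives 720 / 32 = 22 and 720 % 32 = 16, which is where the lane-16 entries come from.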
I did not test all combinations because my code assumes the total number of threads and the number of threads in a block are both divisible by 32.
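For completeness, this is the kind of guard I would add to enforce that assumption (hypothetical; it is not in the code below):

// Hypothetical check: both the block and the set of useful threads
// must be a whole number of warps for my warp-index arithmetic to hold.
if ((blockSize * blockSize) % 32 != 0 || (n * n) % 32 != 0) {
    fprintf(stderr, "configuration violates the divisible-by-32 assumption\n");
    return 1;
}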
Here is the full CUDA C code I used:
#include <cuda_runtime.h>
#include <stdio.h>

// Returns 1 if value already appears in the first `size` entries of array.
int is_in(float value, float *array, int size) {
    for (int i = 0; i < size; i++) {
        if (array[i] == value) {
            return 1;
        }
    }
    return 0;
}

__global__ void raceKernel(float *A, float *result, int n) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < n && col < n) {
        *result = A[row*n + col];  // every in-range thread races on this single write
    }
}

int main() {
    int n = 48;
    int size = n*n * sizeof(float);
    float *A = (float *)malloc(size);
    for (int a = 0; a < n*n; a++) {
        A[a] = a;  // each element stores its own flat index
    }

    float *dA;
    cudaMalloc((void**)&dA, size);
    cudaMemcpy(dA, A, size, cudaMemcpyHostToDevice);

    float result;
    float *dresult;
    cudaMalloc((void**)&dresult, sizeof(float));

    int blockSize = 16; // max is 32 (32x32 = 1024 threads per block)
    int gridSize = (n + blockSize - 1)/blockSize;
    dim3 blockDim(blockSize, blockSize);
    dim3 gridDim(gridSize, gridSize);

    int iters = 10000;
    int nResults = 0;
    float results[iters];
    int nIdxs = 0;
    float warpIdxs[32];

    for (int i = 0; i < iters; i++) {
        results[i] = -1; // placeholder; only overwritten when a new outcome shows up
        result = 0.0;
        cudaMemcpy(dresult, &result, sizeof(float), cudaMemcpyHostToDevice);
        raceKernel<<<gridDim, blockDim>>>(dA, dresult, n);
        cudaMemcpy(&result, dresult, sizeof(float), cudaMemcpyDeviceToHost);
        if (!is_in(result, results, i)) { // report each outcome only once
            int idx = (int)result % 32;   // index inside a warp
            if (!is_in(idx, warpIdxs, nIdxs)) {
                warpIdxs[nIdxs] = idx;
                nIdxs++;
            }
            results[i] = result;
            nResults++;
            printf("Iteration %d - Result: %d - WarpIdx: %d %d\n",
                   i+1, (int)result, (int)result/32, idx);
        }
    }

    printf("%d different outcomes\n", nResults);
    printf("%d unique warp indexes: ", nIdxs);
    for (int j = 0; j < nIdxs; j++) {
        printf("%d - ", (int)warpIdxs[j]);
    }
    printf("\n");

    cudaFree(dA);
    cudaFree(dresult);
    free(A);
    return 0;
}
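In case it matters, I build and run it like this (race.cu is just my local file name):

nvcc -o race race.cu
./race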
Again, I’m new to CUDA and might be missing a crucial point here, but if the way I’m calculating this and my assumptions about warps are correct, I am curious about the reason for this behaviour.
Thanks.