Hello! I am trying to understand why the following code hangs:
#include <cassert>
#include <cmath>
#include <ctime>
#include <exception>
#include <fstream>
#include <iostream>
#include <iterator>
#include <map>
#include <mutex>
#include <numeric>
#include <random>
#include <set>
#include <sstream>
#include <string>
#include <thread>
#include <vector>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
void gpuAssert(cudaError_t code, const char *file, int line) {
if (code != cudaSuccess) {
printf("GPU kernel assert: %s %s %d\n", cudaGetErrorString(code), file, line);
assert(false);
}
}
__constant__ int C[13][13] = {
{1, },
{1, 1, },
{1, 2, 1, },
{1, 3, 3, 1, },
{1, 4, 6, 4, 1, },
{1, 5, 10, 10, 5, 1, },
{1, 6, 15, 20, 15, 6, 1, },
{1, 7, 21, 35, 35, 21, 7, 1, },
{1, 8, 28, 56, 70, 56, 28, 8, 1, },
{1, 9, 36, 84, 126, 126, 84, 36, 9, 1, },
{1, 10, 45, 120, 210, 252, 210, 120, 45, 10, 1, },
{1, 11, 55, 165, 330, 462, 462, 330, 165, 55, 11, 1, },
{1, 12, 66, 220, 495, 792, 924, 792, 495, 220, 66, 12, 1, },
};
__device__ int64_t cnt_such() {
const int left_cnts[7] = {0, 1, 1, 1, 1, 2, 5};
int64_t cnt_such = 1;
for (int k = 1, remaining = 12; k < 7; ++k) {
assert(remaining >= 0);
assert(left_cnts[k] >= 0);
cnt_such *= C[remaining][left_cnts[k]];
remaining -= left_cnts[k];
}
return cnt_such;
}
const int k1 = 512, k2 = 128, k12 = k1 * k2;
__global__ void hpc_preprocess_hater_arrangement() {
int64_t id = blockIdx.x * blockDim.x + threadIdx.x, size = cnt_such();
int64_t ubsync = size / (8 * k12) * (8 * k12);
assert(size == 1995840);
assert(ubsync == 1572864);
int cnt_syncs = 0;
for (int64_t anchor = id * 8; anchor < size; anchor += k12 * 8) {
if (anchor < ubsync) {
cnt_syncs++;
__syncthreads();
}
}
printf("%d ", cnt_syncs);
//assert(cnt_syncs == 3);
}
int main() {
std::cout << "started" << std::endl;
hpc_preprocess_hater_arrangement<<<k1,k2>>>();
gpuErrchk(cudaDeviceSynchronize());
}
The code seems to be minimal reproducible example, in particular, cnt_such() cannot be replaced with the constant it always returns. I compile without optimizations: just nvcc expl.cu
.
It is easy to prove that __syncthreads gets called exactly 3 times in all threads. Furthermore, there even seems to be the same number of instructions passed by the threads before these calls (although I hope it is not important).
Also, if I comment __syncthreads() and uncomment assert(cnt_syncs == 3), then the code terminates and all asserts pass, which further supports the claim that syncthreads gets called exactly three times, so there it no situation that some threads call syncthreads and others don’t and those who call wait for those who don’t forever.
Please help understanding why this code hangs
This code doesn’t compute anything useful, because actually it is a minimization of the real 800-lines code that computes something useful. In particular, it seems obvious that I would better remove syncthreads here, as it is supposed to do nothing in this case, but in the original code this function does something useful, which is omitted here. So please help understanding why this exact code hangs
Thanks!