Host was blocked after calling a nestted kernel! (꒦_꒦)

I wrote CUDA code like below: once the total parent threads exceeds a number, program is blocked in hundreds of frames(seems the more sub-threads configed the more stably the program is o(╥﹏╥)o), and the threads counter is not correct and unstable when error happens. so what’s the rule of configing a nestted CUDA kernel?

__global__ void  sub_kernel(int* counter)
{
    int index = atomicAdd(counter, 1);
}

__global__ void  kernel(int * counter)
{        
    sub_kernel << <800, 80 >> > (counter);  // this version has much more threads but work well (very slow) !!
    // sub_kernel << <10, 10 >> > (counter);  // will cuase host die in hundreds of frames
}

int TT()
{
    int* fragCounter_dev = nullptr, counter = 0;
    CHECK(cudaMalloc(&fragCounter_dev, sizeof(int)), End);

    kernel << <32,  128>> > (fragCounter_dev);
    // kernel << <32, 32 >> > (fragCounter_dev); // ok for any sub-kernel config

    CHECK(cudaGetLastError(), End);
    CHECK(cudaDeviceSynchronize(), End);  // blocked here in case of error
    CHECK(cudaMemcpy(&counter, fragCounter_dev, 4, cudaMemcpyDeviceToHost), End);
    CHECK(cudaFree(fragCounter_dev), End);

End:
    return counter;
}

int main(int argc, const char** argv)
{

int key = 0;
int frame = 0;
while (key != 0x20)
{
    int threads = Cuda::TT();
    cout << ++frame << "--" << threads << endl;

    cv::waitKey(10);
}
return 0;

}

Device properties:
//////////////////////////////////////////////////
totalGlobalMem 8589606912
sharedMemPerBlock 49152
regsPerBlock 65536
warpSize 32
memPitch 2147483647
maxThreadsPerBlock 1024

  • maxThreadsDim 0x000000524cd8f674 {1024, 1024, 64}
  • maxGridSize 0x000000524cd8f680 {2147483647, 65535, 65535}
    clockRate 1695000
    totalConstMem 65536
    major 7
    minor 5
    textureAlignment 512
    texturePitchAlignment 32
    deviceOverlap 1
    multiProcessorCount 34
    kernelExecTimeoutEnabled 1
    egrated 0
    canMapHostMemory 1
    computeMode 0
    maxTexture1D 131072
    maxTexture1DMipmap 32768
    maxTexture1DLinear 268435456
1 Like