cudaDeviceSynchronize hang -- a bug?

Hi all,

Recently I’ve been working on a parallel computing project on the Jetson TX1, and sometimes cudaDeviceSynchronize() hangs.

The problem description is listed below:

  1. In the kernel, I use curand to generate uniformly distributed doubles for a Monte Carlo simulation.
  2. Some calculation is then done to see whether the sampled value passes a check.
  3. After the kernel launch, I call cudaDeviceSynchronize(), then cudaMemcpy(). I know the cudaDeviceSynchronize() is unnecessary here, because cudaMemcpy() synchronizes first anyway.
  4. If I remove the cudaDeviceSynchronize() before cudaMemcpy(), I sometimes get an "unspecified launch failure" instead; the error-checking pattern I mean is sketched right after this list.
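
For reference, this is roughly the error-checking pattern around a launch (a simplified sketch, not my exact code; the kernel launch itself is a placeholder):

#include <cstdio>
#include <cuda_runtime.h>

void LaunchAndCheck()
{
    // MyKernel<<<blks, THREADS_PER_BLOCK>>>(...);   // placeholder for the real launch
    cudaError_t launchErr = cudaGetLastError();      // catches launch-configuration errors
    cudaError_t syncErr   = cudaDeviceSynchronize(); // catches errors raised while the kernel ran
    if (launchErr != cudaSuccess || syncErr != cudaSuccess)
    {
        printf("kernel failed: %s / %s\n",
               cudaGetErrorString(launchErr), cudaGetErrorString(syncErr));
    }
}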

Some of the configuration:

  1. The kernel timeout is disabled, so I can now run kernels on the GPU for a very long time: more than 200 seconds. (Although sometimes the problem appears within 0.5 seconds.) A sketch for verifying this setting follows this list.
  2. The total number of threads is set to 5000 or 2300.
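
To double-check that the watchdog timeout really is disabled, the device attribute can be queried like this (a small standalone sketch, assuming device 0):

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    int timeoutEnabled = -1;
    // 1 means the watchdog can kill long-running kernels; 0 means it cannot.
    cudaDeviceGetAttribute(&timeoutEnabled, cudaDevAttrKernelExecTimeout, 0);
    printf("kernel execution timeout enabled: %d\n", timeoutEnabled);
    return 0;
}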


Sometimes the program works properly, but sometimes it hangs. When it hangs, the GPU usage (GR3D in the tegrastats output) drops from 99% to 0%:
RAM 823/3854MB (lfb 658x4MB) SWAP 0/0MB (cached 0MB) cpu [65%,12%,11%,6%]@1734 EMC 13%@1600 AVP 0%@408 VDE 0 GR3D 99%@998 EDP limit 1734
RAM 825/3854MB (lfb 657x4MB) SWAP 0/0MB (cached 0MB) cpu [53%,8%,5%,100%]@1734 EMC 7%@1600 AVP 0%@408 VDE 0 GR3D 99%@998 EDP limit 1734
RAM 830/3854MB (lfb 656x4MB) SWAP 0/0MB (cached 0MB) cpu [48%,10%,5%,100%]@1734 EMC 2%@1600 AVP 0%@408 VDE 0 GR3D 99%@998 EDP limit 1734
RAM 830/3854MB (lfb 656x4MB) SWAP 0/0MB (cached 0MB) cpu [44%,38%,33%,52%]@1734 EMC 1%@1600 AVP 0%@80 VDE 0 GR3D 0%@998 EDP limit 1734
RAM 830/3854MB (lfb 656x4MB) SWAP 0/0MB (cached 0MB) cpu [32%,28%,99%,6%]@1734 EMC 0%@1600 AVP 0%@80 VDE 0 GR3D 0%@998 EDP limit 1734
RAM 830/3854MB (lfb 656x4MB) SWAP 0/0MB (cached 0MB) cpu [58%,4%,100%,7%]@1734 EMC 0%@1600 AVP 0%@80 VDE 0 GR3D 0%@998 EDP limit 1734
RAM 830/3854MB (lfb 656x4MB) SWAP 0/0MB (cached 0MB) cpu [57%,5%,100%,7%]@1734 EMC 0%@1600 AVP 0%@80 VDE 0 GR3D 0%@998 EDP limit 1734

Then I attached gdb to the process and got the backtrace below:
(gdb) bt
#0 0xf415d0e0 in nanosleep () at …/sysdeps/unix/syscall-template.S:81
#1 0xf417a738 in usleep (useconds=20) at …/sysdeps/unix/sysv/linux/usleep.c:32
#2 0xf3735e84 in ?? () from /usr/lib/arm-linux-gnueabihf/tegra/libcuda.so.1
#3 0xf3672e84 in ?? () from /usr/lib/arm-linux-gnueabihf/tegra/libcuda.so.1
#4 0xf367303c in ?? () from /usr/lib/arm-linux-gnueabihf/tegra/libcuda.so.1
#5 0xf3661288 in ?? () from /usr/lib/arm-linux-gnueabihf/tegra/libcuda.so.1
#6 0xf36317fc in cuCtxSynchronize () from /usr/lib/arm-linux-gnueabihf/tegra/libcuda.so.1
#7 0xf7271844 in ?? () from /usr/local/cuda-7.0/lib/libcudart.so.7.0
#8 0xf7293b44 in cudaDeviceSynchronize () from /usr/local/cuda-7.0/lib/libcudart.so.7.0

Actually, I suspect the problem is related to these nvlink warnings:
nvlink warning : Stack size for entry function ‘_Z20DestroySamplerKernelPP8ISampleri’ cannot be statically determined
nvlink warning : Stack size for entry function ‘_Z20DestroySamplerKernelPP8ISampleri’ cannot be statically determined
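
As far as I understand, nvlink prints this warning because the kernel makes virtual calls (here through ISampler*), and the target of a virtual call is only known at run time, so the linker cannot compute a worst-case stack size; that would also explain why the warning only changes, but does not disappear, after breaking the inheritance below. In case the default per-thread stack is too small, it can be enlarged before the first kernel launch (a sketch; 4096 bytes is an arbitrary guess, not a measured requirement):

// Host code, before any kernel launch.
cudaDeviceSetLimit(cudaLimitStackSize, 4096);

size_t stackSize = 0;
cudaDeviceGetLimit(&stackSize, cudaLimitStackSize);
printf("per-thread device stack: %zu bytes\n", stackSize);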

I tried to remove this warning by breaking the inheritance, changing

class CudaSampler : public ISampler
{
//...
};

to

class CudaSampler 
{
//...
};

the warnings changed to:
nvlink warning : Stack size for entry function ‘_Z20DestroySamplerKernelPP11CudaSampleri’ cannot be statically determined
nvlink warning : Stack size for entry function ‘_Z20DestroySamplerKernelPP11CudaSampleri’ cannot be statically determined

and with this change the program runs correctly on TX1. So I extracted the related code into the files below and created a new project, which still produces the link warnings above.

I have reviewed the code with another developer, and we both agree that the use of the virtual destructor is correct.

My opinion about this warning is that it should not be a warning at all, because this use of inheritance and abstract classes conforms to the C++ standard.

So, is anything wrong here, or are we missing some important point?

ISampler.h:

#ifndef _I_SAMPLER_H
#define _I_SAMPLER_H

class ISampler
{
public:
    // Note: the initialization may have overloads
    __device__
    virtual void InitializeT2(double logT2Max, double logT2Min, int nItems) = 0;
    __device__
    virtual void InitializeRandom(double start, double end) = 0;
    __device__
    virtual int& Get(int& dp) = 0;
    __device__
    virtual ~ISampler() {};
};

#endif

IRandom.h

#ifndef _I_RANDOM_H
#define _I_RANDOM_H

class IRandom
{
public:
    __device__
    virtual double Next() = 0;
    __device__
    virtual ~IRandom() {};
};

#endif

CudaSampler.h

#ifndef _CUDA_SAMPLER_H
#define _CUDA_SAMPLER_H

#include "ISampler.h"
#include <cuda_runtime.h>

class IRandom;

class CudaSampler : public ISampler
{
public:
    __device__
    CudaSampler(double* t2Pool, int nT2, int threadId);

public:
    __device__
    virtual void InitializeT2(double logT2Max, double logT2Min, int nItems);
    __device__
    virtual void InitializeRandom(double start, double end);
    __device__
    virtual int& Get(int& dp);
    __device__
    virtual ~CudaSampler();

private:
    IRandom* mPRNG;
    int mThreadId;
};

#endif

CudaUniformRandom.h

#ifndef _CUDA_UNIFORM_RANDOM_H
#define _CUDA_UNIFORM_RANDOM_H

#include <curand_kernel.h>
#include "IRandom.h"

class CudaUniformRandom : public IRandom
{
public:
    __device__
    CudaUniformRandom(int id);
    __device__
    virtual double Next();
    __device__
    virtual ~CudaUniformRandom();
private:
    curandState* mPRNGState;
};

#endif

CudaSampler.cu

#include "CudaSampler.h"
#include "CudaUniformRandom.h"

__device__
CudaSampler::CudaSampler(double* t2Pool, int nT2, int threadId)
    : mPRNG(0)
    , mThreadId(threadId)
{
    (void)nT2;
    (void)t2Pool;
}

__device__
void CudaSampler::InitializeT2(double, double, int)
{
}

__device__
void CudaSampler::InitializeRandom(double, double)
{
    mPRNG = new CudaUniformRandom(mThreadId);
}

__device__
int& CudaSampler::Get(int& s)
{
    mPRNG->Next();
    return s;
}

__device__
CudaSampler::~CudaSampler()
{
    delete mPRNG;
}

CudaUniformRandom.cu

#include "CudaUniformRandom.h"

__device__
CudaUniformRandom::CudaUniformRandom(int id) : mPRNGState(0)
{
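    // Note: in-kernel malloc returns NULL when the device heap is exhausted;
    // checking the result here would turn a later crash into a detectable error.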
    mPRNGState = (curandState*)malloc(sizeof(curandState)); // new curandState();
    curand_init(id, 0, 0, mPRNGState);
}

__device__
double CudaUniformRandom::Next()
{
    return curand_uniform(mPRNGState);
}

__device__
CudaUniformRandom::~CudaUniformRandom()
{
    // delete mPRNGState;
    free(mPRNGState);
}

Client code:
Client.cu

#include <cuda.h>
#include "CudaSampler.h"
#include "Utilities.h"

__global__
void SetupSamplerKernel(ISampler** samplers, int nItems)
{
    int threadId = blockDim.x * blockIdx.x + threadIdx.x;
    if (threadId < nItems)
    {
        samplers[threadId] = new CudaSampler(0, 0, threadId);
        samplers[threadId]->InitializeT2(0.0, 0.0, 0);
        samplers[threadId]->InitializeRandom(0.0, 0.0);
    }
}

__global__
void DestroySamplerKernel(ISampler** samplers, int nItems)
{
    int threadId = blockDim.x * blockIdx.x + threadIdx.x;
    if (threadId < nItems)
    {
        delete samplers[threadId];
    }
}

__global__
void RunKernel(ISampler** samplers, int nItems)
{
    int threadId = blockDim.x * blockIdx.x + threadIdx.x;
    if (threadId < nItems)
    {
        for (int i = 0; i < threadId * 2; ++i)
        {
            samplers[threadId]->Get(i);
        }
    }
}

void Run()
{
    const int nThreads = 5000;
    ISampler** samplers = 0;
    cudaMalloc(reinterpret_cast<void**>(&samplers), nThreads * sizeof(ISampler*));
    int blks = (nThreads + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    SetupSamplerKernel<<<blks, THREADS_PER_BLOCK>>>(samplers, nThreads);
    cudaDeviceSynchronize();
    RunKernel<<<blks, THREADS_PER_BLOCK>>>(samplers, nThreads);
    cudaDeviceSynchronize();
    DestroySamplerKernel<<<blks, THREADS_PER_BLOCK>>>(samplers, nThreads);
    cudaFree(samplers);
}
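
One more thing worth ruling out: SetupSamplerKernel creates every sampler with in-kernel new/malloc, so all 5000 threads allocate from the device heap, which defaults to only 8 MB. If the heap runs out, device-side new/malloc returns NULL and the next dereference would produce exactly an "unspecified launch failure". The limit can be raised before the first kernel launch of the process (a sketch; 128 MB is an arbitrary value, not a measured requirement):

// Host code; must run before the first kernel launch.
cudaDeviceSetLimit(cudaLimitMallocHeapSize, 128 * 1024 * 1024);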

Hi raof01,

Regarding this CUDA-specific programming issue, I suggest filing it on the “CUDA Programming and Performance” board here:
https://devtalk.nvidia.com/default/board/57/

Thanks