Got wrong result when not using cudaDeviceSynchronize in threads.

Hi, I packed 2 cuda reduce kernels as a device function and tried to call the function inside another kernel. I find that a cudaDeviceSynchronize() function must be called after the cudaReduceSum() to get the correct result.

The stream of the kernels is set to 0 (the default), so my understanding is that the launches should run serially within each thread, and thus no explicit synchronization should be needed.

Could somebody tell me what is the real issue here? Thanks!

// NOTE(review): excerpt of a CDP (CUDA Dynamic Parallelism) kernel quoted in a
// forum post — several parameters (comps, dim, X, Pi, Mu, Sigma, weight,
// tol_spt) and the local maxArg are unused here, presumably consumed by code
// trimmed from the post.
__global__ void kernM(dim3 grid,dim3 block,
                          const unsigned int comps,
                          const unsigned int pts,
                          const unsigned int dim,
                          const float *X, float* loggamma,
                          float *working, float *Pi,float *Mu,
                          float* Sigma, const float weight,
                          const float tol_spt){

    const unsigned int tid = threadIdx.x;

    // One workspace slice per parent thread so the child kernels launched by
    // different threads do not share buffers.
    float *loggammaK=&loggamma[tid*pts];
    float *workingK=&working[tid*pts*dim*dim];

    // Device-side launch (CDP). Child grids execute asynchronously with
    // respect to the launching parent thread — even on stream 0 — so
    // "serial within each thread" does NOT hold for child kernels.
    cudaReduceSum(grid,block,pts,loggammaK,workingK,0);
    // Required: joins this thread's child grids before workingK[0] is read
    // below; without it the read races with the still-running child kernel.
    // NOTE(review): device-side cudaDeviceSynchronize() is deprecated since
    // CUDA 11.6 and removed in CUDA 12 — confirm the toolkit version in use.
    cudaDeviceSynchronize();

    const float maxArg=workingK[0];

    // Second reduction, in place over the partial-sum buffer.
    cudaReduceSum(grid,block,pts,workingK,workingK,0);
    cudaDeviceSynchronize();

}

// Block-level sum reduction: each block sums up to 2*blockDim.x elements of
// d_in (first add performed during the load) and writes its partial sum to
// d_out[blockIdx.x].
//
// Expectations (unchecked): launch with ceil(num / (2*blockDim.x)) blocks;
// blockDim.x must be a power of two with 32 <= blockDim.x <= 1024 (the static
// shared array holds 1024 floats, and the tail phase syncs a full warp).
// Out-of-range threads contribute 0, so num need not be a multiple of the tile.
__global__
void kernReduceSum(unsigned int num,const float *d_in, float *d_out)
{
    unsigned int tid = threadIdx.x;
    // Each thread covers two elements, blockDim.x apart.
    unsigned int idx = blockIdx.x * (blockDim.x * 2) + threadIdx.x;

    volatile __shared__ float sdata[1024];
    sdata[tid] = 0;

    if (idx + blockDim.x < num) {
        sdata[tid] = d_in[idx] + d_in[idx + blockDim.x];
    }
    else if (idx < num) {
        sdata[tid] = d_in[idx];   // tail: only the first element is in range
    }
    __syncthreads();

    // Tree reduction in shared memory down to 64 surviving elements.
    if (blockDim.x >= 1024) { if (tid < 512) { sdata[tid] += sdata[tid + 512]; } __syncthreads(); }
    if (blockDim.x >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
    if (blockDim.x >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
    if (blockDim.x >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads(); }

    // Final warp. FIX: the original relied on implicit warp-synchronous
    // execution (volatile shared memory, no barriers), which is not valid
    // under Volta+ independent thread scheduling and can produce wrong sums.
    // Each step now (a) restricts the writers to lanes below the offset, so no
    // lane writes a location another lane reads in the same step, and
    // (b) separates dependent steps with __syncwarp(), reached by all 32
    // lanes of warp 0.
    if (tid < 32) {
        if (blockDim.x >= 64) sdata[tid] += sdata[tid + 32];
        __syncwarp();
        if (tid < 16) sdata[tid] += sdata[tid + 16];
        __syncwarp();
        if (tid < 8) sdata[tid] += sdata[tid + 8];
        __syncwarp();
        if (tid < 4) sdata[tid] += sdata[tid + 4];
        __syncwarp();
        if (tid < 2) sdata[tid] += sdata[tid + 2];
        __syncwarp();
        if (tid < 1) sdata[tid] += sdata[tid + 1];
    }
    if (tid == 0) d_out[blockIdx.x] = sdata[0];
}

// Two-pass sum reduction driver: the first pass writes one partial sum per
// block into dOut; if more than one partial was produced, a second single-block
// pass folds them into dOut[0], in place.
//
// Preconditions (unchecked): block.x <= 1024 (size of kernReduceSum's static
// shared array); the second pass can only fully reduce up to 2*block.x
// partials — TODO confirm (grid.x+1)/2 never exceeds that at the call sites.
// NOTE(review): both launches are asynchronous and unchecked; callers — host
// or CDP device code — must synchronize before reading dOut, which is why the
// poster's kernel needed cudaDeviceSynchronize() after calling this.
__host__ __device__
void cudaReduceSum(const dim3 grid,const dim3 block,
                   const unsigned int num, const float* dIn,float *dOut,
                   cudaStream_t stream) {
    // First pass: half as many blocks as `grid` since each thread loads two
    // elements; `num` inputs in dIn, (grid.x+1)/2 partial sums out to dOut.
    kernReduceSum<<<(grid.x+1)/2,block,0,stream>>>(num,dIn,dOut);
    if((grid.x+1)/2!=1) {
        // n = grid.x/2 + grid.x%2 == ceil(grid.x/2) partials remain.
        const int n = (grid.x / 2) + (grid.x % 2);
        kernReduceSum << < 1, block ,0,stream>> > (n, dOut, dOut);
    }
}

Another issue is that I tried to create a stream in a host function, but cudaStreamCreate function is very slow. Any possible reason? Thanks!

// Times the process's first cudaStreamCreate() call.
// NOTE(review): the first CUDA runtime call triggers lazy context
// creation/initialization, so this measures CUDA start-up overhead rather
// than stream creation itself — issue another CUDA call before the timed
// region, or time a second cudaStreamCreate() for comparison (see reply below).
auto start = std::chrono::high_resolution_clock::now();

    cudaStream_t streams[1];

    cudaStreamCreate(&streams[0]);

auto end = std::chrono::high_resolution_clock::now();

    auto duration =
            std::chrono::duration_cast<std::chrono::duration<float>>(end - start);
    std::cout << "Time elapsed: " << duration.count() << "s" << std::endl;

   // result: Time elapsed: 3.15592s

First of all, I suggest providing a complete code that someone else could test if you want help with a question like this.

Second, a CDP (CUDA Dynamic Parallelism) code can be challenging. I recommend doing rigorous error checking, just as you would in host code, on device code kernel launches and use of the device runtime API. Without a complete code, it's impossible to tell how many outstanding kernel launches you will have, since you are launching kernels on each device thread. The machine has capacity limits here, including a pending launch limit. The cudaDeviceSynchronize() calls could possibly be keeping you from hitting pending launch limits when you are launching a sequence of kernels.

If you are having trouble with a CDP code, you might want to read the relevant section of the programming guide. It’s not all that long:

https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cuda-dynamic-parallelism

Regarding the long time for stream creation, maybe this is just CUDA start-up overhead. If this is the first call, try issuing another CUDA call before timing this call, or else do a 2nd stream create and time that also.

Thanks! txbob.