cudaDeviceSynchronize from device code is deprecated

so I have this function

__global__ void ndCudaInitBodyArray(ndCudaSceneInfo& info)
{
	if (info.m_frameIsValid)
	{
		const unsigned bodyCount = info.m_bodyArray.m_size - 1;
		const unsigned blocksCount = (bodyCount + D_THREADS_PER_BLOCK - 1) / D_THREADS_PER_BLOCK;
		ndCudaInitBodyArrayInternal <<< blocksCount, D_THREADS_PER_BLOCK, 0 >>> (info);
		ndCudaMergeAabbInternal <<< 1, D_THREADS_PER_BLOCK, 0 >>> (info);

		// device-side sync so that m_cellsCount, written by the child kernels, is visible below
		cudaDeviceSynchronize();
		info.m_histogram.m_size = info.m_cellsCount;
		if (info.m_cellsCount > info.m_histogram.m_capacity)
		{
			cuInvalidateFrame(info, __FUNCTION__, __LINE__);
			return;
		}
		const unsigned blocksCells = (info.m_cellsCount + D_THREADS_PER_BLOCK - 1) / D_THREADS_PER_BLOCK;
		ndCudaCountAabbInternal <<< blocksCells, D_THREADS_PER_BLOCK, 0 >>> (info);
	}
}

and I get this warning

builds\newton-4.00\sdk\dNewton\dExtensions\dCuda>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin\nvcc.exe" -gencode=arch=compute_52,code="sm_52,compute_52" --use-local-env -ccbin "C:\Program Files\Microsoft Visual Studio\2022\Professional\VC\Tools\MSVC\14.32.31326\bin\HostX64\x64" -x cu -rdc=true -I"C:\Development\newton-dynamics\newton-4.00\sdk\dCore"
-4.00\sdk\dNewton\dExtensions\dCuda\dContext\ndCudaSceneInfo.h(130): warning #1444-D: function "cudaDeviceSynchronize"
1>C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\include\cuda_device_runtime_api.h(142): here
was declared deprecated ("Use of cudaDeviceSynchronize from device code is deprecated and will not be supported in a future release.")

The docs say this:

  • Use of cudaDeviceSynchronize in device code was deprecated in CUDA 11.6 and will not be supported in a future release. Note that this is different from host-side cudaDeviceSynchronize, which is still supported.

I read all the docs regarding dynamic parallelism; most of them come from Mark Harris. All of them say that if you read data produced by a child kernel launched from a parent kernel, the app must call cudaDeviceSynchronize.

But the docs now say that this is not correct, so what is the correct way of doing it?

If I do not do it that way, it simply does not work, because for some reason the child kernels are still running when the parent returns.
I spent hour after hour debugging this behavior, and it seems to be the only way that works.

Before, when I was not using dynamic parallelism, this worked, but since the number of items generated by the kernel ndCudaMergeAabbInternal
determines the number of blocks, I had to pass a ridiculous number of blocks
and check in the kernel whether the block index was smaller than the valid block count,
which made the child kernels increasingly more complex.

So my question is: what is the correct way of syncing on the child kernels after SDK 11.6?


That methodology is deprecated.

So you should not synchronize on child kernel completion in a parent kernel. If you need to consume results from the child kernel in device code, launch a new kernel (see the sketch below).

Yes, this will require refactoring your code. There is no zero-impact workaround that I am aware of.
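For instance, here is a minimal sketch of that refactoring (hypothetical kernel names, not your actual code): the first kernel launches its children and exits without syncing, and the code that consumed the child results moves into a second kernel launched from the host. A parent grid is not considered complete until all of its child grids have completed, and kernels launched into the same stream are serialized, so the second kernel is guaranteed to see the children's results:

#include <cstdio>

__device__ int result = 0;

__global__ void child() { result = 7; }

__global__ void part1(){
  // launch the child and return; no device-side sync needed
  if (threadIdx.x == 0 && blockIdx.x == 0)
    child<<<1,1>>>();
}

__global__ void part2(){
  // part1 and all of its child grids completed before part2 started
  printf("result: %d\n", result);
}

int main(){
  part1<<<1,1>>>();        // default stream
  part2<<<1,1>>>();        // serialized after part1 and its children
  cudaDeviceSynchronize(); // host-side sync is still supported
}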


What does “launch a new kernel” mean exactly? Does it mean a new child kernel inside the parent kernel?

You can launch a new child kernel, but that doesn't create a device-wide parent-to-child sync point. If what you desire is a device-wide parent-to-child synchronization, then it's a new parent kernel.

#include <cstdio>
__device__ int x = 0;

__global__ void child(int s){
  // child device code
  printf("child: %d val: %d\n", s, x);
  x = s;
}

__global__ void parent(){

  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  x = 0;
  if (idx == 0) {
    child<<<1,1>>>(1);
    child<<<1,1>>>(2);
    //cudaDeviceSynchronize(); // possible future deprecation
  }
  printf("parent val: %d\n", x);
}

int main(){

  parent<<<1,1>>>();
  child<<<1,1>>>(3);
  cudaDeviceSynchronize();
}
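(Compile with relocatable device code, e.g. nvcc -rdc=true, which CUDA dynamic parallelism requires.)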

Without the cudaDeviceSynchronize() in the parent kernel, my expectation is that the printout would be:

parent val: 0
child: 1 val: 0
child: 2 val: 1
child: 3 val: 2

with the caveat that the parent val: 0 printout could theoretically appear at any position in the printout sequence (except the last position), and could print out any value of 0, 1, or 2. Note that the second invocation of child is guaranteed to see the results of the first invocation of child; stream semantics guarantee this. Likewise, for the same reason, the invocation of child from host code is also guaranteed to see any updates from the previous invocation of parent from host code (including the child grids launched by parent).


This is embarrassing, but here goes.
After completely switching to the GPU side, I used CUDA Dynamic Parallelism to write a lot of numerical calculation code that runs entirely on the GPU. The printf() function provided by CUDA is highly functional and its output can be redirected to a file, so results can be saved without transferring them back to the host through memory.
Another reason is to keep the memory used for intermediate calculations local to the GPU side, hidden from the host.
Sometimes it's much more intuitive and easier to allocate/deallocate memory using new/delete inside __global__ functions (even though we can't use unique_ptr inside __global__).
The code below is an integral calculation program that runs entirely on the GPU.
cudaDeviceSynchronize() is embedded in the important parts, and removing it would be very painful.
If cudaDeviceSynchronize() support is dropped, I declare that I will quit being an NVIDIA user.

constexpr size_t Ki = 1024;
constexpr size_t Mi = Ki * Ki;
constexpr size_t Gi = Mi * Ki;

template<typename X = double>
struct Sinc2 {
  __device__
  X operator()(X const& x) const {
		・・・
  }
};

template<
  template<typename> typename Functor, 
  typename X
>
__global__
void
coreIntegrate(
  const X s,
  const X e,
  const X delta,
  X * fx
){
	・・・
}

constexpr size_t BLKMAX = Ki;
constexpr size_t THRDMAX = Ki;
constexpr size_t DIVISION = Mi;

template<
  template<typename> typename Functor,
  typename X
>
__global__
void
Integrate(
  const X s,
  const X e,
  X *result
) {
  X* fx = new X[DIVISION+1];

  X delta = (e - s) / static_cast<X>(DIVISION);
  coreIntegrate<Functor, X> <<< BLKMAX, THRDMAX >>> (s, e, delta, fx);
  Sumfx <<< 1, THRDMAX >>> (fx, sumfx);

  __syncthreads();
  cudaDeviceSynchronize();

	・・・

  delete[] fx;
  delete[] sumfx;
}



__global__
void
GPUMain() {
	・・・
  double * dresult = new double;
  Integrate<Sinc2, double> <<< 1, 1 >>> ( 0.0, 10.0, dresult);

  __syncthreads();
  cudaDeviceSynchronize();

  printf("%20.18e\n",*dresult);

  // \int_{0}^{x} sinc(t)^2 dt, x=0...100;
	・・・	
  for (size_t i = 0; i <= Ki*100; i++) {
    Integrate<Sinc2, double> <<< 1, 1 >>> ( x, x+delta, dresult);
    //x = i * delta;
    x += delta;
    __syncthreads();
    cudaDeviceSynchronize();

    result[i] = *dresult;
  }

  double sum = 0.;
  x = 0.;
  for (size_t i = 0; i <= Ki * 100; i++ ) {
    //x = i * delta;
    printf("%20.18e %20.18e\n", x, sum);
    x += delta;
    sum += result[i];
  }
  
  delete[] result;
  delete dresult;

}


int
main() {
  cudaDeviceSetLimit(cudaLimitMallocHeapSize, size_t(Gi + Gi));
  GPUMain<<<1,1>>>();
  cudaDeviceSynchronize(); // host-side sync so the GPU work finishes before the program exits
}
  

Wasn’t there some new CUDA task graph support for device code in CUDA 10 or was it announced for a future Toolkit version?

FWIW, CUDA 12 has introduced new CDP functionality. This may also be of interest.
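In particular, CUDA 12's revised dynamic parallelism (CDP2) adds named streams for device-side launches. Here is a minimal sketch using the tail-launch stream, which runs the child only after the parent grid has fully completed and thereby replaces the old device-side cudaDeviceSynchronize() pattern (this assumes a CDP2-capable toolkit and GPU, compiled with -rdc=true):

#include <cstdio>

__device__ int x = 0;

__global__ void consumer(){
  // tail-launched: runs only after the whole parent grid has finished,
  // so the parent's write to x is guaranteed to be visible here
  printf("consumer sees x = %d\n", x);
}

__global__ void producer(){
  x = 42;
  if (threadIdx.x == 0 && blockIdx.x == 0)
    consumer<<<1, 1, 0, cudaStreamTailLaunch>>>();
}

int main(){
  producer<<<1, 1>>>();
  cudaDeviceSynchronize(); // host-side sync is still supported
}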