Depricated cudaDeviceSynchronize() in Dynamic Parallelism

Hello!
I’m trying to write a CUDA program with Dynamic Parallelism.
My simplified situation is:

__device__ int f(int i)
{
    return i + 1;
}

__global__ void f_i(int* data, size_t size)
{
    size_t pos = (blockDim.x * blockIdx.x) + threadIdx.x;
    if (pos < size)
        data[pos] = f(data[pos]);
}

__global__ void run(int* data, size_t size)
{
    // 1024, because it is correct for my Titan Black,
    // and cudaGetDeviceProperties(...) is host-only function.
    blocks            = max(1.0, ceil(static_cast<double>(m_size) / 1024.0));
    threads_per_block = min(1024, m_size);    
    f_i<<<blocks, threads_per_block>>>(data, size);
    cudaDeviceSynchronize(); // Can I use it in kernels with Dynamic Parallelism?
}

void main()
{
    // Transfer some data to GPU.    
    run<<<1, 1>>>(data, size); // All program logic is in GPU (main thread need to do manipulations with data).
    // Transfer some data from GPU.
}

In fact i have a lot of kernels, which calls each other. I understand, that <<<1, 1>>> is a bad idea, but it is
only place without parallelism.

There is the warning in “CUDA C Programming Guide” with text: “Explicit synchronization with child kernels from
a parent block (i.e. using cudaDeviceSynchronize() in device code) is deprecated in CUDA 11.6, and is slated for
removal in a future CUDA release.”

Also there is the compiler warning with text: “Use of cudaDeviceSynchronize from device code is deprecated and will not be supported in a future release. Disable this warning with -D__CDPRT_SUPPRESS_SYNC_DEPRECATION_WARNING.”

But all examples in “CUDA C Programming Guide” are with cudaDeviceSynchronize().
The Guide advises to use __syncthreads(), but what if I need to synchronize all thread blocks?
And I took wrong results, when I tried to use __syncthreads() in child kernels instead of cudaDeviceSynchronize() in parent kernel.

I wrote a partial solution with stream and event:

template<typename... Arguments>
__device__ __host__ cudaError_t
run_kernel(size_t blocksDim, size_t threadsDim, __global__ void (*kernel)(Arguments...), Arguments... args)
{
    cudaStream_t stream;
    cudaEvent_t  event;

    // I think it is a huge overhead, if there are a lot of kernel invocations.
    // But it works and not depricated.
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    cudaEventCreateWithFlags(&event, cudaEventDisableTiming);

    kernel<<<blocksDim, threadsDim, 0, stream>>>(args...); // I need only 1D-arrays.

    cudaEventRecord(event, stream);
    cudaStreamWaitEvent(stream, event, cudaStreamNonBlocking);
    cudaStreamDestroy(stream);
    cudaEventDestroy(event);

    return cudaGetLastError();
}

Now synchronised kernel can be invoked as run_kernel(blocks, threads, f_i, data, size), and it’s better, than without it.
Sometimes it looses some data (sometimes not).

And I have two warnings for run_kernel definition in compile-time:
“warning #181-D: argument is incompatible with corresponding format string conversion”
and
“warning #1835-D: attribute “global” does not apply here”.

It also writes, that “global” directive is ignored. Without global there is no warnings, but results are totaly wrong.

Maybe my toolchain is not configured properly (it’s about warnings: not functionality). This is my cmake CUDA initialization:

macro(for_all_targets)
    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-rdc=true") # For Dynamic Parallelism.
endmacro()

macro(after_add_target target_name)
    pkg_config_wraper(${ARGV0} cuda)
    set_property(TARGET ${ARGV0} PROPERTY CUDA_ARCHITECTURES 35) # My Titan Black.
    set_target_properties(${ARGV0} PROPERTIES CUDA_SEPARABLE_COMPILATION ON) # For Dynamic Parallelism.
    if(CMAKE_BUILD_TYPE STREQUAL "Debug")
        target_compile_options(${ARGV0} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-G>)
    endif()
    target_compile_options(vi_test_total PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
                            --expt-relaxed-constexpr  # For std::numeric_limits and others.
                            --use_fast_math           # For GPU math intrinsics.
                            --extended-lambda         # For lambda-functions [] __device__ () {}.
                            >)
    target_include_directories(${ARGV0} PUBLIC ${BASE_SOURCE_DIR}/lib/public/CUDA/)
endmacro()

Where pkg_config_wraper (it works properly and I put it here just for understanding) is:

set(PKG_CONFIG "pkg-config")

function (pkg_config_wraper target package)
    execute_process(COMMAND ${PKG_CONFIG} --cflags --libs ${ARGV1} OUTPUT_VARIABLE PKG_OUTPUT)
    string(REPLACE "\n" " " PKG_OUTPUT ${PKG_OUTPUT})
    string(REPLACE " " ";" PKG_LIST ${PKG_OUTPUT})
    foreach (compiler_option ${PKG_LIST})
        string(SUBSTRING ${compiler_option} 0 2 type)
        string(SUBSTRING ${compiler_option} 2 -1 value)
        if (${type} STREQUAL "-I")
            target_include_directories(${ARGV0} PUBLIC ${value})
        elseif (${type} STREQUAL "-L")
            link_directories(${value})
        elseif (${type} STREQUAL "-l")
            target_link_libraries(${ARGV0} PUBLIC ${compiler_option})
        elseif ()
            add_compile_options(${compiler_option})
        endif ()
    endforeach ()
endfunction (pkg_config_wraper)

Also I think, that it can be not so fast, as cudaDeviceSynchronize().

It’s OK, when I use cudaDeviceSynchronize(), but I don’t want to use deprecated functions.
And I don’t want to see many warnings too.

And one more little question.
Is there a device-analog of cudaGetDeviceProperties?
I need too obtain maximum threads per block parameter like that:

cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
max_threads_per_block = prop.maxThreadsPerBlock;

But it’s only host code.

The main question is how to synchronise threads without cudaDeviceSynchronize() inside dynamic-parallel kernels.

Thanks!