Another question about bizarre grid nesting and synchronization behavior in the CUDA samples

Hi everyone,

Sorry to bother you again with another problem that has me puzzled, and many thanks to the person who answered my last question. While working my way forward, I am now stuck on another tutorial script from the samples repo: cdpSimplePrint.cu in 0_simple:

/**
 * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <helper_cuda.h>
#include <helper_string.h>

////////////////////////////////////////////////////////////////////////////////
// Variable on the GPU used to generate unique identifiers of blocks.
////////////////////////////////////////////////////////////////////////////////
__device__ int g_uids = 0;

////////////////////////////////////////////////////////////////////////////////
// Print a simple message to signal the block which is currently executing.
////////////////////////////////////////////////////////////////////////////////
__device__ void print_info(int depth, int thread, int uid, int parent_uid)
{
    if (threadIdx.x == 0)
    {
        if (depth == 0)
            printf("BLOCK %d launched by the host\n", uid);
        else
        {
            char buffer[32];

            for (int i = 0 ; i < depth ; ++i)
            {
                buffer[3*i+0] = '|';
                buffer[3*i+1] = ' ';
                buffer[3*i+2] = ' ';
            }

            buffer[3*depth] = '\0';
            printf("%sBLOCK %d launched by thread %d of block %d\n", buffer, uid, thread, parent_uid);
        }
    }

    __syncthreads();
}

////////////////////////////////////////////////////////////////////////////////
// The kernel using CUDA dynamic parallelism.
//
// It generates a unique identifier for each block. Prints the information
// about that block. Finally, if the 'max_depth' has not been reached, the
// block launches new blocks directly from the GPU.
////////////////////////////////////////////////////////////////////////////////
__global__ void cdp_kernel(int max_depth, int depth, int thread, int parent_uid)
{
    // We create a unique ID per block. Thread 0 does that and shares the value with the other threads.
    __shared__ int s_uid;

    if (threadIdx.x == 0)
    {
        s_uid = atomicAdd(&g_uids, 1);
    }

    __syncthreads();

    // We print the ID of the block and information about its parent.
    print_info(depth, thread, s_uid, parent_uid);

    // We launch new blocks if we haven't reached the max_depth yet.
    if (++depth >= max_depth)
    {
        return;
    }

    cdp_kernel<<<gridDim.x, blockDim.x>>>(max_depth, depth, threadIdx.x, s_uid);
}

////////////////////////////////////////////////////////////////////////////////
// Main entry point.
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("starting Simple Print (CUDA Dynamic Parallelism)\n");

    // Parse a few command-line arguments.
    int max_depth = 2;

    if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
        checkCmdLineFlag(argc, (const char **)argv, "h"))
    {
        printf("Usage: %s depth=<max_depth>\t(where max_depth is a value between 1 and 8).\n", argv[0]);
        exit(EXIT_SUCCESS);
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "depth"))
    {
        max_depth = getCmdLineArgumentInt(argc, (const char **)argv, "depth");

        if (max_depth < 1 || max_depth > 8)
        {
            printf("depth parameter has to be between 1 and 8\n");
            exit(EXIT_FAILURE);
        }
    }

    // Find/set the device.
    int device_count = 0, device = -1;
    
    if(checkCmdLineFlag(argc, (const char **)argv, "device"))
    {
        device = getCmdLineArgumentInt(argc, (const char **)argv, "device");

        cudaDeviceProp properties;
        checkCudaErrors(cudaGetDeviceProperties(&properties, device));
        
        if (properties.major > 3 || (properties.major == 3 && properties.minor >= 5))
        {
            std::cout << "Running on GPU " << device << " (" << properties.name << ")" << std::endl;
        }
        else
        {
            std::cout << "ERROR: cdpsimplePrint requires GPU devices with compute SM 3.5 or higher."<< std::endl;
            std::cout << "Current GPU device has compute SM" << properties.major <<"."<< properties.minor <<". Exiting..." << std::endl;
            exit(EXIT_FAILURE);
        }

    }
    else
    {
        checkCudaErrors(cudaGetDeviceCount(&device_count));
        for (int i = 0 ; i < device_count ; ++i)
        {
            cudaDeviceProp properties;
            checkCudaErrors(cudaGetDeviceProperties(&properties, i));
            if (properties.major > 3 || (properties.major == 3 && properties.minor >= 5))
            {
                device = i;
                std::cout << "Running on GPU " << i << " (" << properties.name << ")" << std::endl;
                break;
            }
            std::cout << "GPU " << i << " (" << properties.name << ") does not support CUDA Dynamic Parallelism" << std::endl;
        }
    }
    if (device == -1)
    {
              std::cerr << "cdpSimplePrint requires GPU devices with compute SM 3.5 or higher.  Exiting..." << std::endl;
              exit(EXIT_WAIVED);
     }
    cudaSetDevice(device);

    // Print a message describing what the sample does.
    printf("***************************************************************************\n");
    printf("The CPU launches 2 blocks of 2 threads each. On the device each thread will\n");
    printf("launch 2 blocks of 2 threads each. The GPU we will do that recursively\n");
    printf("until it reaches max_depth=%d\n\n", max_depth);
    printf("In total 2");
    int num_blocks = 2, sum = 2;

    for (int i = 1 ; i < max_depth ; ++i)
    {
        num_blocks *= 4;
        printf("+%d", num_blocks);
        sum += num_blocks;
    }

    printf("=%d blocks are launched!!! (%d from the GPU)\n", sum, sum-2);
    printf("***************************************************************************\n\n");

    // We set the recursion limit for CDP to max_depth.
    cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, max_depth);

    // Launch the kernel from the CPU.
    printf("Launching cdp_kernel() with CUDA Dynamic Parallelism:\n\n");
    cdp_kernel<<<2, 2>>>(max_depth, 0, 0, -1);
    checkCudaErrors(cudaGetLastError());

    // Finalize.
    checkCudaErrors(cudaDeviceSynchronize());
    
    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    checkCudaErrors(cudaDeviceReset());

    exit(EXIT_SUCCESS);
}

With CUDA Dynamic Parallelism, each thread of parent blocks 0 and 1 launches a child grid of 2 blocks, and the children recursively do the same as their parents. Before running it, I expected output with some randomness, like the listing below (a quick block-count check follows it):

BLOCK 0 launched by the host
BLOCK 1 launched by the host
|  BLOCK 2 launched by thread 0 of block 0
|  BLOCK 4 launched by thread 0 of block 1
|  BLOCK 7 launched by thread 1 of block 0
|  BLOCK 8 launched by thread 1 of block 1
|  BLOCK 3 launched by thread 0 of block 0
|  BLOCK 5 launched by thread 0 of block 1
|  BLOCK 6 launched by thread 1 of block 0
|  BLOCK 9 launched by thread 1 of block 1
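
For reference, here is a quick host-side sanity check of the block counts, mirroring the counting loop in the sample's main() (this is just my own throwaway snippet, not part of the sample). Every block runs 2 threads and each thread launches a child grid of 2 blocks, so each level multiplies the block count by 4: 10 blocks in total for max_depth=2 (IDs 0-9, as above) and 42 for max_depth=3 (IDs 0-41).

#include <cstdio>

int main()
{
    for (int max_depth = 1; max_depth <= 3; ++max_depth)
    {
        // Same arithmetic as the loop in the sample's main().
        int num_blocks = 2, sum = 2;

        for (int i = 1; i < max_depth; ++i)
        {
            num_blocks *= 4;   // 2 threads per block x 2 child blocks per thread
            sum += num_blocks;
        }

        printf("max_depth=%d: %d blocks in total (%d launched from the GPU)\n",
               max_depth, sum, sum - 2);
    }
    return 0;
}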

However, every time I run it, the result comes out in a pattern like this:

BLOCK 0 launched by the host
BLOCK 1 launched by the host
|  BLOCK 2 launched by thread 0 of block 0
|  BLOCK 4 launched by thread 0 of block 1
|  BLOCK 3 launched by thread 0 of block 0
|  BLOCK 5 launched by thread 0 of block 1
|  BLOCK 6 launched by thread 1 of block 0
|  BLOCK 7 launched by thread 1 of block 0
|  BLOCK 8 launched by thread 1 of block 1
|  BLOCK 9 launched by thread 1 of block 1

For the child blocks it always puts the ones launched by thread 0 ahead of the ones launched by thread 1. And when I change max_depth to 3, the output is even more interesting:

BLOCK 0 launched by the host
BLOCK 1 launched by the host
|  BLOCK 2 launched by thread 0 of block 0
|  BLOCK 4 launched by thread 0 of block 1
|  BLOCK 5 launched by thread 0 of block 1
|  BLOCK 3 launched by thread 0 of block 0
|  |  BLOCK 6 launched by thread 0 of block 4
|  |  BLOCK 7 launched by thread 0 of block 4
|  |  BLOCK 8 launched by thread 0 of block 2
|  |  BLOCK 9 launched by thread 0 of block 2
|  |  BLOCK 11 launched by thread 0 of block 5
|  |  BLOCK 10 launched by thread 0 of block 5
|  |  BLOCK 12 launched by thread 0 of block 3
|  |  BLOCK 13 launched by thread 0 of block 3
|  |  BLOCK 14 launched by thread 1 of block 4
|  |  BLOCK 16 launched by thread 1 of block 2
|  |  BLOCK 17 launched by thread 1 of block 2
|  |  BLOCK 15 launched by thread 1 of block 4
|  |  BLOCK 18 launched by thread 1 of block 3
|  |  BLOCK 19 launched by thread 1 of block 3
|  |  BLOCK 21 launched by thread 1 of block 5
|  |  BLOCK 20 launched by thread 1 of block 5
|  BLOCK 22 launched by thread 1 of block 0
|  BLOCK 23 launched by thread 1 of block 0
|  BLOCK 24 launched by thread 1 of block 1
|  BLOCK 25 launched by thread 1 of block 1
|  |  BLOCK 27 launched by thread 0 of block 22
|  |  BLOCK 26 launched by thread 0 of block 22
|  |  BLOCK 28 launched by thread 0 of block 23
|  |  BLOCK 29 launched by thread 0 of block 23
|  |  BLOCK 30 launched by thread 0 of block 24
|  |  BLOCK 31 launched by thread 0 of block 24
|  |  BLOCK 32 launched by thread 0 of block 25
|  |  BLOCK 33 launched by thread 0 of block 25
|  |  BLOCK 34 launched by thread 1 of block 22
|  |  BLOCK 35 launched by thread 1 of block 22
|  |  BLOCK 37 launched by thread 1 of block 23
|  |  BLOCK 36 launched by thread 1 of block 23
|  |  BLOCK 39 launched by thread 1 of block 24
|  |  BLOCK 38 launched by thread 1 of block 24
|  |  BLOCK 40 launched by thread 1 of block 25
|  |  BLOCK 41 launched by thread 1 of block 25

It seems that instead of all the blocks executing concurrently, the output comes out in something like a depth-first tree order keyed by the launching thread index (not even by block). How is that possible? Is it something triggered by grid nesting?
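
In case it helps the discussion: to check whether this reflects the actual execution order or just the order in which the printf output gets flushed, I am thinking of tagging each line with a raw GPU timestamp, roughly like this (an untested sketch of mine that would replace print_info in the sample; clock64() is a per-SM counter, so the values are only loosely comparable across blocks):

__device__ void print_info_timed(int depth, int thread, int uid, int parent_uid)
{
    if (threadIdx.x == 0)
    {
        // clock64() gives a raw per-SM cycle counter; it is not a global clock,
        // but it should still show whether blocks really ran in the printed order.
        long long t = clock64();

        if (depth == 0)
            printf("t=%lld BLOCK %d launched by the host\n", t, uid);
        else
            printf("t=%lld BLOCK %d launched by thread %d of block %d (depth %d)\n",
                   t, uid, thread, parent_uid, depth);
    }

    __syncthreads();
}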