Hi everyone,
Sorry to bother you again with another problem that has me puzzled — and many thanks to the person who answered my last question. While moving forward through the tutorial scripts in the samples repo, I am now stuck on cdpSimplePrint.cu in 0_Simple:
/**
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <helper_cuda.h>
#include <helper_string.h>
////////////////////////////////////////////////////////////////////////////////
// Variable on the GPU used to generate unique identifiers of blocks.
////////////////////////////////////////////////////////////////////////////////
__device__ int g_uids = 0;  // next unused block ID; claimed with atomicAdd in cdp_kernel
////////////////////////////////////////////////////////////////////////////////
// Print a simple message to signal the block which is currently executing.
////////////////////////////////////////////////////////////////////////////////
// Print one line identifying the executing block: its unique ID, its nesting
// depth (rendered as a "|  " prefix per level), and which thread of which
// parent block launched it. Only thread 0 of the block prints; all threads
// then meet at a barrier so the whole block leaves together.
//
// Preconditions: depth <= 8 (enforced by main), so the 32-byte indentation
// buffer (3*8 + 1 = 25 bytes used) cannot overflow.
__device__ void print_info(int depth, int thread, int uid, int parent_uid)
{
    if (threadIdx.x == 0)
    {
        if (depth == 0)
            printf("BLOCK %d launched by the host\n", uid);
        else
        {
            // Build the "|  |  ..." indentation prefix, one 3-char cell per level.
            char buffer[32];

            for (int i = 0 ; i < depth ; ++i)
            {
                buffer[3*i+0] = '|';
                buffer[3*i+1] = ' ';
                buffer[3*i+2] = ' ';
            }

            buffer[3*depth] = '\0';
            printf("%sBLOCK %d launched by thread %d of block %d\n", buffer, uid, thread, parent_uid);
        }
    }

    // __syncthreads() is outside the divergent branch: every thread of the
    // block reaches it, as CUDA requires.
    __syncthreads();
}
////////////////////////////////////////////////////////////////////////////////
// The kernel using CUDA dynamic parallelism.
//
// It generates a unique identifier for each block. Prints the information
// about that block. Finally, if the 'max_depth' has not been reached, the
// block launches new blocks directly from the GPU.
////////////////////////////////////////////////////////////////////////////////
// Kernel demonstrating CUDA Dynamic Parallelism: every block prints itself,
// then (below max_depth) every thread launches a child grid of the same shape.
__global__ void cdp_kernel(int max_depth, int depth, int thread, int parent_uid)
{
    // Thread 0 draws a fresh block ID from the global counter and publishes
    // it to the rest of the block through shared memory.
    __shared__ int s_uid;

    if (threadIdx.x == 0)
    {
        s_uid = atomicAdd(&g_uids, 1);
    }

    // Barrier: s_uid must be written before any thread reads it.
    __syncthreads();

    // Report this block's ID and which thread/block launched it.
    print_info(depth, thread, s_uid, parent_uid);

    // Recurse from the device: each thread launches its own child grid with
    // the same grid/block shape, until the nesting depth reaches max_depth.
    const int next_depth = depth + 1;

    if (next_depth < max_depth)
    {
        cdp_kernel<<<gridDim.x, blockDim.x>>>(max_depth, next_depth, threadIdx.x, s_uid);
    }
}
////////////////////////////////////////////////////////////////////////////////
// Main entry point.
////////////////////////////////////////////////////////////////////////////////
// Main entry point: parses -depth/-device flags, selects a CDP-capable GPU
// (compute capability >= 3.5), sets the device-runtime recursion limit, and
// launches cdp_kernel with 2 blocks of 2 threads from the host.
int main(int argc, char **argv)
{
    printf("starting Simple Print (CUDA Dynamic Parallelism)\n");

    // Parse a few command-line arguments.
    int max_depth = 2;

    if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
        checkCmdLineFlag(argc, (const char **)argv, "h"))
    {
        printf("Usage: %s depth=<max_depth>\t(where max_depth is a value between 1 and 8).\n", argv[0]);
        exit(EXIT_SUCCESS);
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "depth"))
    {
        max_depth = getCmdLineArgumentInt(argc, (const char **)argv, "depth");

        // Depth is capped at 8 so print_info's 32-byte indent buffer cannot
        // overflow and the block count stays bounded.
        if (max_depth < 1 || max_depth > 8)
        {
            printf("depth parameter has to be between 1 and 8\n");
            exit(EXIT_FAILURE);
        }
    }

    // Find/set the device. Dynamic parallelism requires SM 3.5 or higher.
    int device_count = 0, device = -1;

    if (checkCmdLineFlag(argc, (const char **)argv, "device"))
    {
        // User pinned a specific device; verify it supports CDP.
        device = getCmdLineArgumentInt(argc, (const char **)argv, "device");

        cudaDeviceProp properties;
        checkCudaErrors(cudaGetDeviceProperties(&properties, device));

        if (properties.major > 3 || (properties.major == 3 && properties.minor >= 5))
        {
            std::cout << "Running on GPU " << device << " (" << properties.name << ")" << std::endl;
        }
        else
        {
            std::cout << "ERROR: cdpsimplePrint requires GPU devices with compute SM 3.5 or higher."<< std::endl;
            std::cout << "Current GPU device has compute SM" << properties.major <<"."<< properties.minor <<". Exiting..." << std::endl;
            exit(EXIT_FAILURE);
        }
    }
    else
    {
        // No device specified: scan for the first CDP-capable GPU.
        checkCudaErrors(cudaGetDeviceCount(&device_count));

        for (int i = 0 ; i < device_count ; ++i)
        {
            cudaDeviceProp properties;
            checkCudaErrors(cudaGetDeviceProperties(&properties, i));

            if (properties.major > 3 || (properties.major == 3 && properties.minor >= 5))
            {
                device = i;
                std::cout << "Running on GPU " << i << " (" << properties.name << ")" << std::endl;
                break;
            }

            std::cout << "GPU " << i << " (" << properties.name << ") does not support CUDA Dynamic Parallelism" << std::endl;
        }
    }

    if (device == -1)
    {
        std::cerr << "cdpSimplePrint requires GPU devices with compute SM 3.5 or higher. Exiting..." << std::endl;
        exit(EXIT_WAIVED);
    }

    // FIX: cudaSetDevice can fail (e.g. invalid ordinal); check its return
    // code like every other CUDA call in this file.
    checkCudaErrors(cudaSetDevice(device));

    // Print a message describing what the sample does.
    printf("***************************************************************************\n");
    printf("The CPU launches 2 blocks of 2 threads each. On the device each thread will\n");
    printf("launch 2 blocks of 2 threads each. The GPU we will do that recursively\n");
    printf("until it reaches max_depth=%d\n\n", max_depth);
    printf("In total 2");

    // Each block has 2 threads, each of which launches 2 child blocks, so
    // the block count multiplies by 4 per level of depth.
    int num_blocks = 2, sum = 2;

    for (int i = 1 ; i < max_depth ; ++i)
    {
        num_blocks *= 4;
        printf("+%d", num_blocks);
        sum += num_blocks;
    }

    printf("=%d blocks are launched!!! (%d from the GPU)\n", sum, sum-2);
    printf("***************************************************************************\n\n");

    // We set the recursion limit for CDP to max_depth.
    // FIX: this call was unchecked; a failure here would silently break the
    // nested launches, so wrap it in checkCudaErrors as well.
    checkCudaErrors(cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, max_depth));

    // Launch the kernel from the CPU.
    printf("Launching cdp_kernel() with CUDA Dynamic Parallelism:\n\n");
    cdp_kernel<<<2, 2>>>(max_depth, 0, 0, -1);
    checkCudaErrors(cudaGetLastError());

    // Finalize: wait for all (nested) work to complete; this is also where
    // asynchronous execution errors from the kernel surface.
    checkCudaErrors(cudaDeviceSynchronize());

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice. It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    checkCudaErrors(cudaDeviceReset());

    exit(EXIT_SUCCESS);
}
With CUDA Dynamic Parallelism, parent blocks 0 and 1 each have two threads, and each thread launches 2 child blocks, which recursively execute the same function as their parent. Before running it, I was expecting output with some randomness, like this:
BLOCK 0 launched by the host
BLOCK 1 launched by the host
| BLOCK 2 launched by thread 0 of block 0
| BLOCK 4 launched by thread 0 of block 1
| BLOCK 7 launched by thread 1 of block 0
| BLOCK 8 launched by thread 1 of block 1
| BLOCK 3 launched by thread 0 of block 0
| BLOCK 5 launched by thread 0 of block 1
| BLOCK 6 launched by thread 1 of block 0
| BLOCK 9 launched by thread 1 of block 1
However, every time it returns a result with a pattern like this:
BLOCK 0 launched by the host
BLOCK 1 launched by the host
| BLOCK 2 launched by thread 0 of block 0
| BLOCK 4 launched by thread 0 of block 1
| BLOCK 3 launched by thread 0 of block 0
| BLOCK 5 launched by thread 0 of block 1
| BLOCK 6 launched by thread 1 of block 0
| BLOCK 7 launched by thread 1 of block 0
| BLOCK 8 launched by thread 1 of block 1
| BLOCK 9 launched by thread 1 of block 1
where for the child blocks it always puts thread 0 ahead of thread 1. And when I change max_depth to 3, the output is even more interesting:
BLOCK 0 launched by the host
BLOCK 1 launched by the host
| BLOCK 2 launched by thread 0 of block 0
| BLOCK 4 launched by thread 0 of block 1
| BLOCK 5 launched by thread 0 of block 1
| BLOCK 3 launched by thread 0 of block 0
| | BLOCK 6 launched by thread 0 of block 4
| | BLOCK 7 launched by thread 0 of block 4
| | BLOCK 8 launched by thread 0 of block 2
| | BLOCK 9 launched by thread 0 of block 2
| | BLOCK 11 launched by thread 0 of block 5
| | BLOCK 10 launched by thread 0 of block 5
| | BLOCK 12 launched by thread 0 of block 3
| | BLOCK 13 launched by thread 0 of block 3
| | BLOCK 14 launched by thread 1 of block 4
| | BLOCK 16 launched by thread 1 of block 2
| | BLOCK 17 launched by thread 1 of block 2
| | BLOCK 15 launched by thread 1 of block 4
| | BLOCK 18 launched by thread 1 of block 3
| | BLOCK 19 launched by thread 1 of block 3
| | BLOCK 21 launched by thread 1 of block 5
| | BLOCK 20 launched by thread 1 of block 5
| BLOCK 22 launched by thread 1 of block 0
| BLOCK 23 launched by thread 1 of block 0
| BLOCK 24 launched by thread 1 of block 1
| BLOCK 25 launched by thread 1 of block 1
| | BLOCK 27 launched by thread 0 of block 22
| | BLOCK 26 launched by thread 0 of block 22
| | BLOCK 28 launched by thread 0 of block 23
| | BLOCK 29 launched by thread 0 of block 23
| | BLOCK 30 launched by thread 0 of block 24
| | BLOCK 31 launched by thread 0 of block 24
| | BLOCK 32 launched by thread 0 of block 25
| | BLOCK 33 launched by thread 0 of block 25
| | BLOCK 34 launched by thread 1 of block 22
| | BLOCK 35 launched by thread 1 of block 22
| | BLOCK 37 launched by thread 1 of block 23
| | BLOCK 36 launched by thread 1 of block 23
| | BLOCK 39 launched by thread 1 of block 24
| | BLOCK 38 launched by thread 1 of block 24
| | BLOCK 40 launched by thread 1 of block 25
| | BLOCK 41 launched by thread 1 of block 25
It seems that instead of executing all threads concurrently, it runs in a depth-first tree-search pattern ordered by thread (not even by block)? How is that possible? Is it caused by something triggered by grid nesting?