Passing dynamically allocated memory in kernel to sub kernel via dynamic parallelism

Lianguage · April 23, 2019, 9:59am

Is this something that is supported?

Currently I’m attempting this as follows in a reduce kernel as follows:

__device__ void d_reduce_easy( 
	opFunct_intint reduce_funct,
	int*           d_input,
	int            size
){
	int numLaunches = size;
	int depth = 0;
	int buffer_size = 0;

	//int* d_levels = new int[MAX_REDUCE_POW];
	int* d_levels = (int*)malloc(MAX_REDUCE_POW * sizeof(int));

	while( numLaunches > 1){
		d_levels[depth] = numLaunches;
		numLaunches = d_idiv_ceil(numLaunches, MAX_SHARMEM);
		buffer_size += numLaunches;
		depth++;
	}
	d_levels[depth] = numLaunches;
	buffer_size += numLaunches;

	//int* d_buff = new int[buffer_size];
	int* d_buff = (int*)malloc(buffer_size * sizeof(int)) ;

	cudaDeviceSynchronize();
	reduce_with_buffer<<<1,numLaunches,sizeof(int)>>>(
		reduce_funct,
		d_input,
		d_buff,
		d_levels,
		depth,
		0,
		0,
		1
	);
	cudaDeviceSynchronize();
	d_input[0] = d_buff[0];

	free(d_levels);
	free(d_buff);
	//delete d_levels;
	//delete d_buff;
}

__global__ void reduce_easy( 
	opFunct_intint reduce_funct,
	int*           d_input,
	int            size
){
	d_reduce_easy(reduce_funct, d_input, size);
}

int sum_reduce_easier(std::vector<int> h_input){
	int *d_input = NULL;
	gpuErrchk( cudaMalloc( (void**)&d_input , h_input.size() * sizeof(int) ));
	gpuErrchk( cudaMemcpy( d_input, h_input.data() , h_input.size() * sizeof(int), cudaMemcpyHostToDevice ) );
	
	cudaMemcpyFromSymbol(&h_funcp_sumpair, p_sumpair, sizeof(opFunct_intint));

	cudaDeviceSynchronize();
	reduce_easy<<<1,1>>>(h_funcp_sumpair, d_input, h_input.size());

	int reduced_sum;
	gpuErrchk( cudaMemcpy( &reduced_sum, d_input, sizeof(int) , cudaMemcpyDeviceToHost ) );
	cudaFree(d_input);

	return reduced_sum;
	//return 1234;
}

The kernel called dynamically : reduce_with_buffer
has been tested with host side code and seems to work fine, however when I adapt the host side code to run on the kernel I’m experiencing problems, where the data in the pointer seems to retain the values from a previous call to the kernel and I get incorrect values.

Is there some kind of bug or is this just not supported? Thanks.

Edit: I’m testing on a gtx980ti

Lianguage · April 24, 2019, 12:31pm

So I’ve done some more testing, it seems like the problem is something to do with dynamic parallelism.

Robert_Crovella · April 24, 2019, 4:53pm

I’m not able to compile or run the code you have posted.

The basic idea seems to work fine for me:

$ cat t451.cu
#include <cstdio>
#include <cassert>

__global__ void ck(int *data, int n){

  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  if (idx < n)
    data[idx] += idx;
}

__global__ void pk(int n){

  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  int *data = (int *)malloc(n*sizeof(int));
  assert(data != NULL);
  for (int i = 0; i < n; i++) data[i] = idx;
  ck<<<(n+255)/256,256>>>(data, n);
  cudaDeviceSynchronize();
  for (int i = 0; i < n; i++)
    printf("parent thread: %d, data[%d] = %d\n", idx, i, data[i]);
}

int main(){

  pk<<<1,1>>>(5);
  cudaDeviceSynchronize();
}
$ nvcc -arch=sm_35 -rdc=true -o t451 t451.cu -lcudadevrt
$ cuda-memcheck ./t451
========= CUDA-MEMCHECK
parent thread: 0, data[0] = 0
parent thread: 0, data[1] = 1
parent thread: 0, data[2] = 2
parent thread: 0, data[3] = 3
parent thread: 0, data[4] = 4
========= ERROR SUMMARY: 0 errors
$

Lianguage · April 25, 2019, 4:17am

Hi Rob

Thanks for your reply, sorry I got lazy with my code snippet. I’ve made 2 compilable examples, one demonstrating the reduce working correctly and the other which adds another layer of kernel call causing the reduce algorithm to not work correctly for some reason.

I’ve debugged it a fair bit and I noticed that somewhere along the line some values are not taking the correct values but haven’t found the root cause why.

Anyways here are the code snippets:

// nobug.cu
// this should work correctly, giving the correct result
// 
// liang@z10pe:~/sandbox/dpbug$ nvcc -g -G -std=c++11 -arch=sm_50 -rdc=true -lcudadevrt nobug.cu -o nobug
// liang@z10pe:~/sandbox/dpbug$ ./nobug 
// reduce expected:2048
// reduce actual  :2048

#include <iostream>

#include <assert.h>
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
#include <memory>

#define MAX_THREADS 256
#define MAX_SHARMEM 512
#define MAX_SAFE_REDUCE 262684
#define MAX_REDUCE_POW 4
#define MAX_REDUCE 33554432

//http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
#define gpuErrchk(ans) { gpuAssert ((ans), __FILE__, __LINE__); }

//http://stackoverflow.com/questions/2080364/using-assert-within-kernel-invocation
#define  gpuKernelAssert(X) if(!(X)){printf("tid %d:%s,%d\n",threadIdx.x,__FILE__,__LINE__);return;}

cudaDeviceProp gpu_properties;

inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){
	if (code != cudaSuccess)
	{
		fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
		if (abort) exit(code);
	}
}

void saferCudaFree(void* dev_ptr ){
	if(dev_ptr != NULL){
		gpuErrchk( cudaFree(dev_ptr));
	}
}

int hostStepBlockDim(int size){
	if( size > MAX_THREADS ){
		return MAX_THREADS;
	}
	else {
		return size;
	}
}

int hostStepGridDim(int size){
	return (size/MAX_THREADS + 1);
}

__device__ int devStepBlockDim(int size){
	if( size > MAX_THREADS ){
		return MAX_THREADS;
	}
	else {
		return size;
	}
}

__device__ int devStepGridDim(int size){
	return (size/MAX_THREADS + 1);
}

__device__ int devStepSharedGridDim(int size){
	return (size/MAX_SHARMEM + 1);
}

__device__ int devStepShareMem(int size){
	if( size > MAX_SHARMEM ){
		return MAX_SHARMEM;
	}
	else{
		return size;
	}
}

__device__ int d_idiv_ceil( int numerator, int denominator ){
	return numerator / denominator + (((numerator < 0) ^ (denominator > 0)) && (numerator%denominator));
}

typedef int (*opFunct_intint) (int, int);

__device__ int sumpair( int target, int addor){
	return target + addor;
}

const __device__ opFunct_intint p_sumpair = sumpair;

opFunct_intint h_funcp_sumpair;

__global__ void reduce_with_buffer(
	opFunct_intint reduce_funct,
	int*           d_input,
	int*           d_buffer,
	int*           d_levels,
	int            depth,
	int            bookmark,
	int            origindex,
	int            maxindex
){
	int tid = threadIdx.x;

	int neworigindex1 = 2 * tid + bookmark;//origin index, as in the current cuda thread.
	int neworigindex2 = 2 * tid + 1 + bookmark; // called it origin index because subkernels spawn that also reduce

	if( depth > 0){
		int newdepth = depth - 1;
		int newmaxindex;
		if(newdepth == 0){
			newmaxindex = d_levels[newdepth];
		}
		else{
			newmaxindex = maxindex + d_levels[newdepth];		
		}
		
		if( neworigindex1 < newmaxindex){
			int newbookmark1 = 2 * tid * MAX_SHARMEM;
			if( newdepth != 0){ newbookmark1 += maxindex;}

			int nextNumElems = devStepShareMem( newmaxindex - newbookmark1 );
			int nextBLOCKSZ  = d_idiv_ceil(nextNumElems, 2 );
			int shareMemSz   = nextNumElems * sizeof(int);

			//cudaDeviceSynchronize();

			reduce_with_buffer<<<1,nextBLOCKSZ,shareMemSz>>>( 
				reduce_funct,
				d_input,
				d_buffer,
				d_levels,
				newdepth,
				newbookmark1, 
				neworigindex1,
				newmaxindex
			);
		}
		if( neworigindex2 < newmaxindex ){
			int newbookmark2 = (2 * tid + 1 ) * MAX_SHARMEM;
			if( newdepth != 0){ newbookmark2 += maxindex;}			

			int nextNumElems = devStepShareMem( newmaxindex - newbookmark2 );
			int nextBLOCKSZ  = d_idiv_ceil(nextNumElems, 2 );
			int shareMemSz   = nextNumElems * sizeof(int);

			//cudaDeviceSynchronize();
			
			reduce_with_buffer<<<1,nextBLOCKSZ,shareMemSz>>>( 
				reduce_funct,
				d_input,
				d_buffer,
				d_levels,
				newdepth, 
				newbookmark2, 
				neworigindex2,
				newmaxindex
			);
		}
	}
	cudaDeviceSynchronize();

	
	int* d_data = NULL;
	if( depth == 0 ){
		d_data = d_input;
	}
	else{
		d_data = d_buffer;
	}

	if( maxindex - bookmark >= MAX_SHARMEM  ){
		extern __shared__ int sdata[];

		sdata[ 2 * tid ]     = d_data[ neworigindex1 ];
		sdata[ 2 * tid + 1 ] = d_data[ neworigindex2 ];
		__syncthreads();

		for( int s = blockDim.x ; s > 0  ; s >>= 1 ){
			if(tid < s){
				sdata[tid] = reduce_funct(sdata[tid], sdata[tid + s]);
			}
			__syncthreads();
		}
		__syncthreads();

		if( tid == 0 ){
			d_buffer[origindex] = sdata[0];
		}

	}
	else{ //else should deal with all cases where number of elements is not equal to the max size

		extern __shared__ int sdata[];

		if( neworigindex1 < maxindex ){
			sdata[ 2 * tid ] = d_data[ neworigindex1 ];			
		}
		else{
			sdata[ 2 * tid ] = 0;
		}

		if( neworigindex2 < maxindex ){
			sdata[ 2 * tid + 1 ] = d_data[ neworigindex2 ];	
		}
		else{
			sdata[ 2 * tid + 1 ] = 0;
		}
		__syncthreads();

		int s = maxindex - bookmark; //Assumes that if blockDim.x !=  MAX_THREADS, then it must be the last block.

		if( s > 1 && (s & 1) == 1 && tid == 0 ){
			sdata[0] = reduce_funct(sdata[0], sdata[ s - 1 ]);
		}
		s >>= 1;
		for( ; s > 0 && tid < s ; s >>= 1 ){
			sdata[tid] = reduce_funct(sdata[tid], sdata[tid + s]);
			if( tid == 0 && s > 1 && (s & 1) == 1 ){
				sdata[0] = reduce_funct(sdata[0], sdata[ s - 1 ]);
			}
			__syncthreads();
		}
		//__syncthreads();

		if( tid == 0 ){
			d_buffer[origindex] = sdata[0];
		}
	}
	cudaDeviceSynchronize();
	
}

int idiv_ceil ( int numerator, int denominator ){
	return numerator / denominator + (((numerator < 0) ^ (denominator > 0)) && (numerator%denominator));
}

int sum_reduce_buffered(std::vector<int> h_input){
	
	int num_launches = h_input.size();
	int depth = 0;
	int buffer_size = 0;

	std::vector<int> h_levels(MAX_REDUCE_POW);
	//std::vector<int> h_levels();
	while( num_launches > 1){
		h_levels[depth] = num_launches;
		//h_levels.push_back(num_launches);
		num_launches = idiv_ceil(num_launches, MAX_SHARMEM);
		buffer_size += num_launches;
		depth++;
	}
	h_levels[depth] = num_launches;
	//h_levels.push_back(num_launches);
	buffer_size += num_launches;

	int* d_levels = NULL;
	int* d_buffer = NULL;
	int* d_input  = NULL;
	gpuErrchk( cudaMalloc( (void**)&d_levels , h_levels.size() * sizeof(int) ));
	gpuErrchk( cudaMalloc( (void**)&d_buffer, buffer_size * sizeof(int) ));
	gpuErrchk( cudaMalloc( (void**)&d_input, h_input.size() * sizeof(int) ));

	gpuErrchk( cudaMemcpy( d_input  , h_input.data()  , h_input.size()  * sizeof(int) , cudaMemcpyHostToDevice ) );
	gpuErrchk( cudaMemcpy( d_levels , h_levels.data() , h_levels.size() * sizeof(int) , cudaMemcpyHostToDevice ) );

	cudaMemcpyFromSymbol(&h_funcp_sumpair, p_sumpair, sizeof(opFunct_intint));

	cudaDeviceSynchronize();

	reduce_with_buffer<<<1,num_launches,sizeof(int)>>>(
		h_funcp_sumpair,
		d_input,
		d_buffer,
		d_levels,
		depth,
		0,//bookmark
		0,//origindex
		1 //maxindex
	);

	int reduced_sum;
	gpuErrchk( cudaMemcpy( &reduced_sum, d_buffer, sizeof(int) , cudaMemcpyDeviceToHost ) );
	cudaFree(d_input);
	cudaFree(d_buffer);
	cudaFree(d_levels);

	return reduced_sum;

}

int reduceTestBuffered( int numsum ){
	std::vector<int> h_ones( numsum, 1 );
	int h_sum = sum_reduce_buffered(h_ones);
	
	return h_sum;
}

int main(){
	int expected = 2048;
	int actual =  reduceTestBuffered(expected);
	std::cout << "reduce expected:" << expected << std::endl;
	std::cout << "reduce actual  :" << actual << std::endl;

	return 0;
}

Heres the code which demonstrates the miscalculation:

// dpbug.cu
// expected result is different from actual in reduce operation
// reduce algorithm almost identical to nobug.cu except that it adds 1 level of Dynamic parallelism depth to the process
//
// liang@z10pe:~/sandbox/dpbug$ nvcc -g -G -std=c++11 -arch=sm_50 -rdc=true -lcudadevrt dpbug.cu -o dpbug
// liang@z10pe:~/sandbox/dpbug$ ./dpbug 
// reduce expected:2048
// reduce actual  :512

#include <iostream>

#include <assert.h>
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
#include <memory>

#define MAX_THREADS 256
#define MAX_SHARMEM 512
#define MAX_SAFE_REDUCE 262684
#define MAX_REDUCE_POW 4
#define MAX_REDUCE 33554432

//http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
#define gpuErrchk(ans) { gpuAssert ((ans), __FILE__, __LINE__); }

//http://stackoverflow.com/questions/2080364/using-assert-within-kernel-invocation
#define  gpuKernelAssert(X) if(!(X)){printf("tid %d:%s,%d\n",threadIdx.x,__FILE__,__LINE__);return;}

cudaDeviceProp gpu_properties;

inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){
	if (code != cudaSuccess)
	{
		fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
		if (abort) exit(code);
	}
}

void saferCudaFree(void* dev_ptr ){
	if(dev_ptr != NULL){
		gpuErrchk( cudaFree(dev_ptr));
	}
}

int hostStepBlockDim(int size){
	if( size > MAX_THREADS ){
		return MAX_THREADS;
	}
	else {
		return size;
	}
}

int hostStepGridDim(int size){
	return (size/MAX_THREADS + 1);
}

__device__ int devStepBlockDim(int size){
	if( size > MAX_THREADS ){
		return MAX_THREADS;
	}
	else {
		return size;
	}
}

__device__ int devStepGridDim(int size){
	return (size/MAX_THREADS + 1);
}

__device__ int devStepSharedGridDim(int size){
	return (size/MAX_SHARMEM + 1);
}

__device__ int devStepShareMem(int size){
	if( size > MAX_SHARMEM ){
		return MAX_SHARMEM;
	}
	else{
		return size;
	}
}

__device__ int d_idiv_ceil( int numerator, int denominator ){
	return numerator / denominator + (((numerator < 0) ^ (denominator > 0)) && (numerator%denominator));
}

typedef int (*opFunct_intint) (int, int);

__device__ int sumpair( int target, int addor){
	return target + addor;
}

const __device__ opFunct_intint p_sumpair = sumpair;

opFunct_intint h_funcp_sumpair;

__global__ void reduce_with_buffer(
	opFunct_intint reduce_funct,
	int*           d_input,
	int*           d_buffer,
	int*           d_levels,
	int            depth,
	int            bookmark,
	int            origindex,
	int            maxindex
){
	int tid = threadIdx.x;

	int neworigindex1 = 2 * tid + bookmark;//origin index, as in the current cuda thread.
	int neworigindex2 = 2 * tid + 1 + bookmark; // called it origin index because subkernels spawn that also reduce

	if( depth > 0){
		int newdepth = depth - 1;
		int newmaxindex;
		if(newdepth == 0){
			newmaxindex = d_levels[newdepth];
		}
		else{
			newmaxindex = maxindex + d_levels[newdepth];		
		}
		
		if( neworigindex1 < newmaxindex){
			int newbookmark1 = 2 * tid * MAX_SHARMEM;
			if( newdepth != 0){ newbookmark1 += maxindex;}

			int nextNumElems = devStepShareMem( newmaxindex - newbookmark1 );
			int nextBLOCKSZ  = d_idiv_ceil(nextNumElems, 2 );
			int shareMemSz   = nextNumElems * sizeof(int);

			//cudaDeviceSynchronize();

			reduce_with_buffer<<<1,nextBLOCKSZ,shareMemSz>>>( 
				reduce_funct,
				d_input,
				d_buffer,
				d_levels,
				newdepth,
				newbookmark1, 
				neworigindex1,
				newmaxindex
			);
		}
		if( neworigindex2 < newmaxindex ){
			int newbookmark2 = (2 * tid + 1 ) * MAX_SHARMEM;
			if( newdepth != 0){ newbookmark2 += maxindex;}			

			int nextNumElems = devStepShareMem( newmaxindex - newbookmark2 );
			int nextBLOCKSZ  = d_idiv_ceil(nextNumElems, 2 );
			int shareMemSz   = nextNumElems * sizeof(int);

			//cudaDeviceSynchronize();
			
			reduce_with_buffer<<<1,nextBLOCKSZ,shareMemSz>>>( 
				reduce_funct,
				d_input,
				d_buffer,
				d_levels,
				newdepth, 
				newbookmark2, 
				neworigindex2,
				newmaxindex
			);
		}
	}
	cudaDeviceSynchronize();

	
	int* d_data = NULL;
	if( depth == 0 ){
		d_data = d_input;
	}
	else{
		d_data = d_buffer;
	}

	if( maxindex - bookmark >= MAX_SHARMEM  ){
		extern __shared__ int sdata[];

		sdata[ 2 * tid ]     = d_data[ neworigindex1 ];
		sdata[ 2 * tid + 1 ] = d_data[ neworigindex2 ];
		__syncthreads();

		for( int s = blockDim.x ; s > 0  ; s >>= 1 ){
			if(tid < s){
				sdata[tid] = reduce_funct(sdata[tid], sdata[tid + s]);
			}
			__syncthreads();
		}
		__syncthreads();

		if( tid == 0 ){
			d_buffer[origindex] = sdata[0];
		}

	}
	else{ //else should deal with all cases where number of elements is not equal to the max size

		extern __shared__ int sdata[];

		if( neworigindex1 < maxindex ){
			sdata[ 2 * tid ] = d_data[ neworigindex1 ];			
		}
		else{
			sdata[ 2 * tid ] = 0;
		}

		if( neworigindex2 < maxindex ){
			sdata[ 2 * tid + 1 ] = d_data[ neworigindex2 ];	
		}
		else{
			sdata[ 2 * tid + 1 ] = 0;
		}
		__syncthreads();

		int s = maxindex - bookmark; //Assumes that if blockDim.x !=  MAX_THREADS, then it must be the last block.

		if( s > 1 && (s & 1) == 1 && tid == 0 ){
			sdata[0] = reduce_funct(sdata[0], sdata[ s - 1 ]);
		}
		s >>= 1;
		for( ; s > 0 && tid < s ; s >>= 1 ){
			sdata[tid] = reduce_funct(sdata[tid], sdata[tid + s]);
			if( tid == 0 && s > 1 && (s & 1) == 1 ){
				sdata[0] = reduce_funct(sdata[0], sdata[ s - 1 ]);
			}
			__syncthreads();
		}
		//__syncthreads();

		if( tid == 0 ){
			d_buffer[origindex] = sdata[0];
		}
	}
	cudaDeviceSynchronize();
	
}

__global__ void reduce_easy( 
	opFunct_intint reduce_funct,
	int*           d_input,
	int*           d_buffer,
	int*           d_levelss,
	int            depp,
	int            size
){
	reduce_with_buffer<<<1,1,sizeof(int)>>>(
		reduce_funct,
		d_input,
		d_buffer,
		d_levelss,
		depp,
		0,
		0,
		1
	);

}

int idiv_ceil ( int numerator, int denominator ){
	return numerator / denominator + (((numerator < 0) ^ (denominator > 0)) && (numerator%denominator));
}

int sum_reduce_easier(std::vector<int> h_input){
	
	int num_launches = h_input.size();
	int depth = 0;
	int buffer_size = 0;

	std::vector<int> h_levels(MAX_REDUCE_POW);
	//std::vector<int> h_levels();
	while( num_launches > 1){
		h_levels[depth] = num_launches;
		//h_levels.push_back(num_launches);
		num_launches = idiv_ceil(num_launches, MAX_SHARMEM);
		buffer_size += num_launches;
		depth++;
	}
	h_levels[depth] = num_launches;
	//h_levels.push_back(num_launches);
	buffer_size += num_launches;

	int* d_levels = NULL;
	int* d_buffer = NULL;
	int* d_input  = NULL;
	gpuErrchk( cudaMalloc( (void**)&d_levels , h_levels.size() * sizeof(int) ));
	gpuErrchk( cudaMalloc( (void**)&d_buffer, buffer_size * sizeof(int) ));
	gpuErrchk( cudaMalloc( (void**)&d_input, h_input.size() * sizeof(int) ));

	gpuErrchk( cudaMemcpy( d_input  , h_input.data()  , h_input.size()  * sizeof(int) , cudaMemcpyHostToDevice ) );
	gpuErrchk( cudaMemcpy( d_levels , h_levels.data() , h_levels.size() * sizeof(int) , cudaMemcpyHostToDevice ) );

	cudaMemcpyFromSymbol(&h_funcp_sumpair, p_sumpair, sizeof(opFunct_intint));

	cudaDeviceSynchronize();

	reduce_easy<<<1,1>>>(h_funcp_sumpair, d_input, d_buffer, d_levels, depth, h_input.size());

	int reduced_sum;
	gpuErrchk( cudaMemcpy( &reduced_sum, d_buffer, sizeof(int) , cudaMemcpyDeviceToHost ) );
	cudaFree(d_input);
	cudaFree(d_buffer);
	cudaFree(d_levels);

	return reduced_sum;

}

int reduceTestEasier( int numsum ){
	std::vector<int> h_ones( numsum, 1 );
	int h_sum = sum_reduce_easier(h_ones);
	
	return h_sum;
}

int main(){
	int expected = 2048;
	int actual =  reduceTestEasier(expected);
	std::cout << "reduce expected:" << expected << std::endl;
	std::cout << "reduce actual  :" << actual << std::endl;

	return 0;
}

One more thing is that it doesn’t seem to be related to my hardware as I’ve gotten identical results on a GV100.

The incorrect values I’ve found are:

5.1 - Easy reduceKernel
-------------------------------------------------------------------------------
../unitest.cpp:333
...............................................................................

../unitest.cpp:337: FAILED:
  CHECK( numsum == 2048 )
with expansion:
  1025 (0x401) == 2048 (0x800)

../unitest.cpp:371: FAILED:
  CHECK( numsum == 515 )
with expansion:
  512 (0x200) == 515 (0x203)

../unitest.cpp:374: FAILED:
  CHECK( numsum == 1024 )
with expansion:
  512 (0x200) == 1024 (0x400)

../unitest.cpp:377: FAILED:
  CHECK( numsum == 2048 )
with expansion:
  55312896 (0x34c0200) == 2048 (0x800)

../unitest.cpp:380: FAILED:
  REQUIRE( numsum == 2049 )
with expansion:
  1025 (0x401) == 2049 (0x801)

What seems to be happening is that once recursion depth reaches 3 for Dynamically called Kernels, there seems to be some funny behaviour that I’ve yet to figure out. This is because for numbers greater than 262684 ( 512 x 512 ) the “working” example also yeilds strange results.

I’m saying that these results are strange because as far as I know the recursion depth limitation for DP is 24, and I don’t think I’ve exceeded any memory limits from kernel call overheads due to DP, especially on dpbug.cu

Any more help on this would be much appreciated.

Thanks

Robert_Crovella · May 2, 2019, 12:05pm

First of all, any time you are having trouble with a CUDA program, I recommend running your program with cuda-memcheck.

When I run your program with cuda-memcheck it is reporting CUDA kernel execution errors. Whether you understand the errors or not, its good practice to at least be aware of them, before asking others for help.

Second, when using CDP, and especially if you are having trouble, I encourage you to use proper CUDA error checking in device code (e.g. on any device runtime API calls, and after launching any kernels, in a fashion nearly identical to how you would do it in host code). This information is also useful to the debug process. (Of course, you should be doing proper CUDA error checking in host code also. Your error checking after a host code kernel launch is not quite correct either.)

When I add that error checking and printf of error string to your in-kernel cudaDeviceSynchronize after the kernel launch(es), I get output like this:

CDP kernel error: cudaDeviceSynchronize failed because caller's grid depth exceeds cudaLimitDevRuntimeSyncDepth
CDP kernel error: cudaDeviceSynchronize failed because caller's grid depth exceeds cudaLimitDevRuntimeSyncDepth

You can then review more detail on the meaning of that in the CDP documentation:

https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#nesting-and-synchronization-depth

Your recursion is exceeding the maximum sync depth limit, and you will need to fix that. Increase the limit, or limit your recursion depth.

I may choose to ignore debug-oriented questions where the asker has not demonstrated implementation of proper CUDA error checking, and reported the results of a run of their non-functional code with cuda-memcheck. Hopefully you can understand why. Hypothetically speaking, if someone is unwilling to invest the effort in rigorous, proper CUDA error checking, I would often consider it a waste of my time to do that work for them.

Lianguage · May 2, 2019, 11:36pm

The first result in google is literally the comment in stackoverflow you edited after posting that, and also literally a comment in my own code :p

I may not have used it correctly.

Thanks rob I’ll dig a bit further.

Robert_Crovella · May 3, 2019, 3:06am

Sorry if my posting here was misinterpreted.

I have now deleted that line.

Good luck!

Topic		Replies	Views
CUDA Dynamic Parallelism API and Principles Technical Blog	19	949	May 2, 2017
Dynamic parallelism, Kernel didn't launch CUDA Programming and Performance	6	1901	September 12, 2016
CUDA Dynamic Parallelism, the code doesn't run as expected CUDA Programming and Performance	18	468	July 20, 2024
Reduction Sum using Dynamic Parallelism working upto a magical number? CUDA Programming and Performance	1	1172	August 10, 2019
Parallel Reduction CUDA Programming and Performance	10	3764	June 26, 2011
Performance drops with dynamic parallelism CUDA Programming and Performance cuda , dynamic-control	12	733	June 3, 2024
Learning by coding recursive sum using dynamic parallelism CUDA Programming and Performance	2	735	January 17, 2018
Understanding and adjusting Mark Harris's array reduction CUDA Programming and Performance	11	4480	August 26, 2018
Inconsistent results for reduction, except while printf or cudamemcheck CUDA Programming and Performance	29	2447	September 13, 2016
dynamic parallelism CUDA Programming and Performance	3	1112	December 30, 2012

Passing dynamically allocated memory in kernel to sub kernel via dynamic parallelism

Related topics