Hi Rob
Thanks for your reply, sorry I got lazy with my code snippet. I’ve made 2 compilable examples, one demonstrating the reduce working correctly and the other which adds another layer of kernel call causing the reduce algorithm to not work correctly for some reason.
I’ve debugged it a fair bit and I noticed that somewhere along the line some values are not taking the correct values but haven’t found the root cause why.
Anyways here are the code snippets:
// nobug.cu
// this should work correctly, giving the correct result
//
// liang@z10pe:~/sandbox/dpbug$ nvcc -g -G -std=c++11 -arch=sm_50 -rdc=true -lcudadevrt nobug.cu -o nobug
// liang@z10pe:~/sandbox/dpbug$ ./nobug
// reduce expected:2048
// reduce actual :2048
#include <iostream>
#include <assert.h>
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
#include <memory>
#define MAX_THREADS 256
#define MAX_SHARMEM 512
#define MAX_SAFE_REDUCE 262684
#define MAX_REDUCE_POW 4
#define MAX_REDUCE 33554432
//http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
#define gpuErrchk(ans) { gpuAssert ((ans), __FILE__, __LINE__); }
//http://stackoverflow.com/questions/2080364/using-assert-within-kernel-invocation
#define gpuKernelAssert(X) if(!(X)){printf("tid %d:%s,%d\n",threadIdx.x,__FILE__,__LINE__);return;}
cudaDeviceProp gpu_properties;
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
void saferCudaFree(void* dev_ptr ){
if(dev_ptr != NULL){
gpuErrchk( cudaFree(dev_ptr));
}
}
int hostStepBlockDim(int size){
if( size > MAX_THREADS ){
return MAX_THREADS;
}
else {
return size;
}
}
int hostStepGridDim(int size){
return (size/MAX_THREADS + 1);
}
__device__ int devStepBlockDim(int size){
if( size > MAX_THREADS ){
return MAX_THREADS;
}
else {
return size;
}
}
__device__ int devStepGridDim(int size){
return (size/MAX_THREADS + 1);
}
__device__ int devStepSharedGridDim(int size){
return (size/MAX_SHARMEM + 1);
}
__device__ int devStepShareMem(int size){
if( size > MAX_SHARMEM ){
return MAX_SHARMEM;
}
else{
return size;
}
}
__device__ int d_idiv_ceil( int numerator, int denominator ){
return numerator / denominator + (((numerator < 0) ^ (denominator > 0)) && (numerator%denominator));
}
typedef int (*opFunct_intint) (int, int);
__device__ int sumpair( int target, int addor){
return target + addor;
}
const __device__ opFunct_intint p_sumpair = sumpair;
opFunct_intint h_funcp_sumpair;
__global__ void reduce_with_buffer(
opFunct_intint reduce_funct,
int* d_input,
int* d_buffer,
int* d_levels,
int depth,
int bookmark,
int origindex,
int maxindex
){
int tid = threadIdx.x;
int neworigindex1 = 2 * tid + bookmark;//origin index, as in the current cuda thread.
int neworigindex2 = 2 * tid + 1 + bookmark; // called it origin index because subkernels spawn that also reduce
if( depth > 0){
int newdepth = depth - 1;
int newmaxindex;
if(newdepth == 0){
newmaxindex = d_levels[newdepth];
}
else{
newmaxindex = maxindex + d_levels[newdepth];
}
if( neworigindex1 < newmaxindex){
int newbookmark1 = 2 * tid * MAX_SHARMEM;
if( newdepth != 0){ newbookmark1 += maxindex;}
int nextNumElems = devStepShareMem( newmaxindex - newbookmark1 );
int nextBLOCKSZ = d_idiv_ceil(nextNumElems, 2 );
int shareMemSz = nextNumElems * sizeof(int);
//cudaDeviceSynchronize();
reduce_with_buffer<<<1,nextBLOCKSZ,shareMemSz>>>(
reduce_funct,
d_input,
d_buffer,
d_levels,
newdepth,
newbookmark1,
neworigindex1,
newmaxindex
);
}
if( neworigindex2 < newmaxindex ){
int newbookmark2 = (2 * tid + 1 ) * MAX_SHARMEM;
if( newdepth != 0){ newbookmark2 += maxindex;}
int nextNumElems = devStepShareMem( newmaxindex - newbookmark2 );
int nextBLOCKSZ = d_idiv_ceil(nextNumElems, 2 );
int shareMemSz = nextNumElems * sizeof(int);
//cudaDeviceSynchronize();
reduce_with_buffer<<<1,nextBLOCKSZ,shareMemSz>>>(
reduce_funct,
d_input,
d_buffer,
d_levels,
newdepth,
newbookmark2,
neworigindex2,
newmaxindex
);
}
}
cudaDeviceSynchronize();
int* d_data = NULL;
if( depth == 0 ){
d_data = d_input;
}
else{
d_data = d_buffer;
}
if( maxindex - bookmark >= MAX_SHARMEM ){
extern __shared__ int sdata[];
sdata[ 2 * tid ] = d_data[ neworigindex1 ];
sdata[ 2 * tid + 1 ] = d_data[ neworigindex2 ];
__syncthreads();
for( int s = blockDim.x ; s > 0 ; s >>= 1 ){
if(tid < s){
sdata[tid] = reduce_funct(sdata[tid], sdata[tid + s]);
}
__syncthreads();
}
__syncthreads();
if( tid == 0 ){
d_buffer[origindex] = sdata[0];
}
}
else{ //else should deal with all cases where number of elements is not equal to the max size
extern __shared__ int sdata[];
if( neworigindex1 < maxindex ){
sdata[ 2 * tid ] = d_data[ neworigindex1 ];
}
else{
sdata[ 2 * tid ] = 0;
}
if( neworigindex2 < maxindex ){
sdata[ 2 * tid + 1 ] = d_data[ neworigindex2 ];
}
else{
sdata[ 2 * tid + 1 ] = 0;
}
__syncthreads();
int s = maxindex - bookmark; //Assumes that if blockDim.x != MAX_THREADS, then it must be the last block.
if( s > 1 && (s & 1) == 1 && tid == 0 ){
sdata[0] = reduce_funct(sdata[0], sdata[ s - 1 ]);
}
s >>= 1;
for( ; s > 0 && tid < s ; s >>= 1 ){
sdata[tid] = reduce_funct(sdata[tid], sdata[tid + s]);
if( tid == 0 && s > 1 && (s & 1) == 1 ){
sdata[0] = reduce_funct(sdata[0], sdata[ s - 1 ]);
}
__syncthreads();
}
//__syncthreads();
if( tid == 0 ){
d_buffer[origindex] = sdata[0];
}
}
cudaDeviceSynchronize();
}
int idiv_ceil ( int numerator, int denominator ){
return numerator / denominator + (((numerator < 0) ^ (denominator > 0)) && (numerator%denominator));
}
int sum_reduce_buffered(std::vector<int> h_input){
int num_launches = h_input.size();
int depth = 0;
int buffer_size = 0;
std::vector<int> h_levels(MAX_REDUCE_POW);
//std::vector<int> h_levels();
while( num_launches > 1){
h_levels[depth] = num_launches;
//h_levels.push_back(num_launches);
num_launches = idiv_ceil(num_launches, MAX_SHARMEM);
buffer_size += num_launches;
depth++;
}
h_levels[depth] = num_launches;
//h_levels.push_back(num_launches);
buffer_size += num_launches;
int* d_levels = NULL;
int* d_buffer = NULL;
int* d_input = NULL;
gpuErrchk( cudaMalloc( (void**)&d_levels , h_levels.size() * sizeof(int) ));
gpuErrchk( cudaMalloc( (void**)&d_buffer, buffer_size * sizeof(int) ));
gpuErrchk( cudaMalloc( (void**)&d_input, h_input.size() * sizeof(int) ));
gpuErrchk( cudaMemcpy( d_input , h_input.data() , h_input.size() * sizeof(int) , cudaMemcpyHostToDevice ) );
gpuErrchk( cudaMemcpy( d_levels , h_levels.data() , h_levels.size() * sizeof(int) , cudaMemcpyHostToDevice ) );
cudaMemcpyFromSymbol(&h_funcp_sumpair, p_sumpair, sizeof(opFunct_intint));
cudaDeviceSynchronize();
reduce_with_buffer<<<1,num_launches,sizeof(int)>>>(
h_funcp_sumpair,
d_input,
d_buffer,
d_levels,
depth,
0,//bookmark
0,//origindex
1 //maxindex
);
int reduced_sum;
gpuErrchk( cudaMemcpy( &reduced_sum, d_buffer, sizeof(int) , cudaMemcpyDeviceToHost ) );
cudaFree(d_input);
cudaFree(d_buffer);
cudaFree(d_levels);
return reduced_sum;
}
int reduceTestBuffered( int numsum ){
std::vector<int> h_ones( numsum, 1 );
int h_sum = sum_reduce_buffered(h_ones);
return h_sum;
}
int main(){
int expected = 2048;
int actual = reduceTestBuffered(expected);
std::cout << "reduce expected:" << expected << std::endl;
std::cout << "reduce actual :" << actual << std::endl;
return 0;
}
Heres the code which demonstrates the miscalculation:
// dpbug.cu
// expected result is different from actual in reduce operation
// reduce algorithm almost identical to nobug.cu except that it adds 1 level of Dynamic parallelism depth to the process
//
// liang@z10pe:~/sandbox/dpbug$ nvcc -g -G -std=c++11 -arch=sm_50 -rdc=true -lcudadevrt dpbug.cu -o dpbug
// liang@z10pe:~/sandbox/dpbug$ ./dpbug
// reduce expected:2048
// reduce actual :512
#include <iostream>
#include <assert.h>
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
#include <memory>
#define MAX_THREADS 256
#define MAX_SHARMEM 512
#define MAX_SAFE_REDUCE 262684
#define MAX_REDUCE_POW 4
#define MAX_REDUCE 33554432
//http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
#define gpuErrchk(ans) { gpuAssert ((ans), __FILE__, __LINE__); }
//http://stackoverflow.com/questions/2080364/using-assert-within-kernel-invocation
#define gpuKernelAssert(X) if(!(X)){printf("tid %d:%s,%d\n",threadIdx.x,__FILE__,__LINE__);return;}
cudaDeviceProp gpu_properties;
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
void saferCudaFree(void* dev_ptr ){
if(dev_ptr != NULL){
gpuErrchk( cudaFree(dev_ptr));
}
}
int hostStepBlockDim(int size){
if( size > MAX_THREADS ){
return MAX_THREADS;
}
else {
return size;
}
}
int hostStepGridDim(int size){
return (size/MAX_THREADS + 1);
}
__device__ int devStepBlockDim(int size){
if( size > MAX_THREADS ){
return MAX_THREADS;
}
else {
return size;
}
}
__device__ int devStepGridDim(int size){
return (size/MAX_THREADS + 1);
}
__device__ int devStepSharedGridDim(int size){
return (size/MAX_SHARMEM + 1);
}
__device__ int devStepShareMem(int size){
if( size > MAX_SHARMEM ){
return MAX_SHARMEM;
}
else{
return size;
}
}
__device__ int d_idiv_ceil( int numerator, int denominator ){
return numerator / denominator + (((numerator < 0) ^ (denominator > 0)) && (numerator%denominator));
}
typedef int (*opFunct_intint) (int, int);
__device__ int sumpair( int target, int addor){
return target + addor;
}
const __device__ opFunct_intint p_sumpair = sumpair;
opFunct_intint h_funcp_sumpair;
__global__ void reduce_with_buffer(
opFunct_intint reduce_funct,
int* d_input,
int* d_buffer,
int* d_levels,
int depth,
int bookmark,
int origindex,
int maxindex
){
int tid = threadIdx.x;
int neworigindex1 = 2 * tid + bookmark;//origin index, as in the current cuda thread.
int neworigindex2 = 2 * tid + 1 + bookmark; // called it origin index because subkernels spawn that also reduce
if( depth > 0){
int newdepth = depth - 1;
int newmaxindex;
if(newdepth == 0){
newmaxindex = d_levels[newdepth];
}
else{
newmaxindex = maxindex + d_levels[newdepth];
}
if( neworigindex1 < newmaxindex){
int newbookmark1 = 2 * tid * MAX_SHARMEM;
if( newdepth != 0){ newbookmark1 += maxindex;}
int nextNumElems = devStepShareMem( newmaxindex - newbookmark1 );
int nextBLOCKSZ = d_idiv_ceil(nextNumElems, 2 );
int shareMemSz = nextNumElems * sizeof(int);
//cudaDeviceSynchronize();
reduce_with_buffer<<<1,nextBLOCKSZ,shareMemSz>>>(
reduce_funct,
d_input,
d_buffer,
d_levels,
newdepth,
newbookmark1,
neworigindex1,
newmaxindex
);
}
if( neworigindex2 < newmaxindex ){
int newbookmark2 = (2 * tid + 1 ) * MAX_SHARMEM;
if( newdepth != 0){ newbookmark2 += maxindex;}
int nextNumElems = devStepShareMem( newmaxindex - newbookmark2 );
int nextBLOCKSZ = d_idiv_ceil(nextNumElems, 2 );
int shareMemSz = nextNumElems * sizeof(int);
//cudaDeviceSynchronize();
reduce_with_buffer<<<1,nextBLOCKSZ,shareMemSz>>>(
reduce_funct,
d_input,
d_buffer,
d_levels,
newdepth,
newbookmark2,
neworigindex2,
newmaxindex
);
}
}
cudaDeviceSynchronize();
int* d_data = NULL;
if( depth == 0 ){
d_data = d_input;
}
else{
d_data = d_buffer;
}
if( maxindex - bookmark >= MAX_SHARMEM ){
extern __shared__ int sdata[];
sdata[ 2 * tid ] = d_data[ neworigindex1 ];
sdata[ 2 * tid + 1 ] = d_data[ neworigindex2 ];
__syncthreads();
for( int s = blockDim.x ; s > 0 ; s >>= 1 ){
if(tid < s){
sdata[tid] = reduce_funct(sdata[tid], sdata[tid + s]);
}
__syncthreads();
}
__syncthreads();
if( tid == 0 ){
d_buffer[origindex] = sdata[0];
}
}
else{ //else should deal with all cases where number of elements is not equal to the max size
extern __shared__ int sdata[];
if( neworigindex1 < maxindex ){
sdata[ 2 * tid ] = d_data[ neworigindex1 ];
}
else{
sdata[ 2 * tid ] = 0;
}
if( neworigindex2 < maxindex ){
sdata[ 2 * tid + 1 ] = d_data[ neworigindex2 ];
}
else{
sdata[ 2 * tid + 1 ] = 0;
}
__syncthreads();
int s = maxindex - bookmark; //Assumes that if blockDim.x != MAX_THREADS, then it must be the last block.
if( s > 1 && (s & 1) == 1 && tid == 0 ){
sdata[0] = reduce_funct(sdata[0], sdata[ s - 1 ]);
}
s >>= 1;
for( ; s > 0 && tid < s ; s >>= 1 ){
sdata[tid] = reduce_funct(sdata[tid], sdata[tid + s]);
if( tid == 0 && s > 1 && (s & 1) == 1 ){
sdata[0] = reduce_funct(sdata[0], sdata[ s - 1 ]);
}
__syncthreads();
}
//__syncthreads();
if( tid == 0 ){
d_buffer[origindex] = sdata[0];
}
}
cudaDeviceSynchronize();
}
__global__ void reduce_easy(
opFunct_intint reduce_funct,
int* d_input,
int* d_buffer,
int* d_levelss,
int depp,
int size
){
reduce_with_buffer<<<1,1,sizeof(int)>>>(
reduce_funct,
d_input,
d_buffer,
d_levelss,
depp,
0,
0,
1
);
}
int idiv_ceil ( int numerator, int denominator ){
return numerator / denominator + (((numerator < 0) ^ (denominator > 0)) && (numerator%denominator));
}
int sum_reduce_easier(std::vector<int> h_input){
int num_launches = h_input.size();
int depth = 0;
int buffer_size = 0;
std::vector<int> h_levels(MAX_REDUCE_POW);
//std::vector<int> h_levels();
while( num_launches > 1){
h_levels[depth] = num_launches;
//h_levels.push_back(num_launches);
num_launches = idiv_ceil(num_launches, MAX_SHARMEM);
buffer_size += num_launches;
depth++;
}
h_levels[depth] = num_launches;
//h_levels.push_back(num_launches);
buffer_size += num_launches;
int* d_levels = NULL;
int* d_buffer = NULL;
int* d_input = NULL;
gpuErrchk( cudaMalloc( (void**)&d_levels , h_levels.size() * sizeof(int) ));
gpuErrchk( cudaMalloc( (void**)&d_buffer, buffer_size * sizeof(int) ));
gpuErrchk( cudaMalloc( (void**)&d_input, h_input.size() * sizeof(int) ));
gpuErrchk( cudaMemcpy( d_input , h_input.data() , h_input.size() * sizeof(int) , cudaMemcpyHostToDevice ) );
gpuErrchk( cudaMemcpy( d_levels , h_levels.data() , h_levels.size() * sizeof(int) , cudaMemcpyHostToDevice ) );
cudaMemcpyFromSymbol(&h_funcp_sumpair, p_sumpair, sizeof(opFunct_intint));
cudaDeviceSynchronize();
reduce_easy<<<1,1>>>(h_funcp_sumpair, d_input, d_buffer, d_levels, depth, h_input.size());
int reduced_sum;
gpuErrchk( cudaMemcpy( &reduced_sum, d_buffer, sizeof(int) , cudaMemcpyDeviceToHost ) );
cudaFree(d_input);
cudaFree(d_buffer);
cudaFree(d_levels);
return reduced_sum;
}
int reduceTestEasier( int numsum ){
std::vector<int> h_ones( numsum, 1 );
int h_sum = sum_reduce_easier(h_ones);
return h_sum;
}
int main(){
int expected = 2048;
int actual = reduceTestEasier(expected);
std::cout << "reduce expected:" << expected << std::endl;
std::cout << "reduce actual :" << actual << std::endl;
return 0;
}
One more thing is that it doesn’t seem to be related to my hardware as I’ve gotten identical results on a GV100.
The incorrect values I’ve found are:
5.1 - Easy reduceKernel
-------------------------------------------------------------------------------
../unitest.cpp:333
...............................................................................
../unitest.cpp:337: FAILED:
CHECK( numsum == 2048 )
with expansion:
1025 (0x401) == 2048 (0x800)
../unitest.cpp:371: FAILED:
CHECK( numsum == 515 )
with expansion:
512 (0x200) == 515 (0x203)
../unitest.cpp:374: FAILED:
CHECK( numsum == 1024 )
with expansion:
512 (0x200) == 1024 (0x400)
../unitest.cpp:377: FAILED:
CHECK( numsum == 2048 )
with expansion:
55312896 (0x34c0200) == 2048 (0x800)
../unitest.cpp:380: FAILED:
REQUIRE( numsum == 2049 )
with expansion:
1025 (0x401) == 2049 (0x801)
What seems to be happening is that once recursion depth reaches 3 for Dynamically called Kernels, there seems to be some funny behaviour that I’ve yet to figure out. This is because for numbers greater than 262684 ( 512 x 512 ) the “working” example also yeilds strange results.
I’m saying that these results are strange because as far as I know the recursion depth limitation for DP is 24, and I don’t think I’ve exceeded any memory limits from kernel call overheads due to DP, especially on dpbug.cu
Any more help on this would be much appreciated.
Thanks