Hello,
I’ve been playing with the following dynamic parallelism kernel to sum n elements.
Even when I comment out all syncs, the code still works, that is, it produces the correct result.
I have no idea why that is the case.
In fact, reading online, it looks like it should not work.
Help ?
#include <stdio.h>
typedef long long int number;
__global__ void gpusum_dyn_par_1 (number *input, number *output) {
int tid = threadIdx.x;
int size = 2*blockDim.x; // the number of threads we are launching is half of the number of elements that we need to process
// shift the data to the region this block is going to operate on
number *ptr_input = input + blockIdx.x*size;
number *ptr_output = &output[blockIdx.x];
// do the sum by hand and exit
if (size == 2 && tid == 0) {
ptr_output[0] = ptr_input[0] + ptr_input[1];
return;
}
int s = size/2;
// check that there is something to do and that this thread is supposed to do something
if(s > 1 && tid < s) {
ptr_input[tid] += ptr_input[tid + s];
}
// wait for everyone in the block to get here. we also need this to see what other threads in the block have been doing to memory
//__syncthreads();
// the thread 0 of each block launches a new grid of children threads
if(tid==0) {
// here we are only launching one block. Notice again that we only lauch half has many threads as the numbe of elements] we need to process
gpusum_dyn_par_1 <<<1, s/2>>>(ptr_input,ptr_output);
// this sync forces all the children to finish before parent-thread-0 continues
//cudaDeviceSynchronize();
}
// sync at block level again
//__syncthreads();
}
int main(){
int n = 32*1024 *1024;
int n_per_block = 1024;
int threads_per_block = n_per_block / 2;
int num_blocks = n / n_per_block;
number * v = (number *) malloc(n * sizeof(number));
for (int i = 0; i < n ; i++){
v[i] = i;
}
number * v_out = (number * ) malloc( num_blocks * sizeof(number) );
number * d_v;
number * d_v_out;
cudaMalloc( (void**)&d_v , n*sizeof(number) );
cudaMalloc( (void**)&d_v_out , num_blocks*sizeof(number) );
cudaMemcpy( (void*) d_v , (void*) v , n*sizeof(number) , cudaMemcpyHostToDevice );
gpusum_dyn_par_1<<< num_blocks , threads_per_block >>>(d_v, d_v_out);
cudaMemcpy( (void*) v_out , (void*) d_v_out , num_blocks*sizeof(number) , cudaMemcpyDeviceToHost );
number total_gpu = 0;
for (int i = 0; i < num_blocks ; i++){
total_gpu = total_gpu + v_out[i];
}
}
The code is only to work with power two size inputs.
I am compiling with
/usr/local/cuda-8.0/bin/nvcc -ccbin g++ -I…/…/common/inc -dc -arch=sm_35 -o sum_gpu_dyn_par.o -c sum_gpu_dyn_par.cu
/usr/local/cuda-8.0/bin/nvcc -ccbin g++ -arch=sm_35 -o a.out sum_gpu_dyn_par.o -lcudadevrt