Hello all,

I am trying to do a parallel sum using dynamic parallelism. I know it is very inefficient. I’m doing it to learn.

Basically, I am trying to replicate the behavior of the following sequential C code but in CUDA.

```
double rec_sum(double * v, int n){
if (n <=1)
return v[0];
double sum_left = rec_sum(v,n/2);
double sum_right = rec_sum(&v[n/2],n - (n/2));
return sum_left + sum_right;
}
```

However, I cannot get it to work. Any help? Maybe something to do with syncs, shared memory and those sort of things … !?

Here is my FULL CUDA code, which I cam compiling with

nvcc -arch=sm_35 -dc recursive_sum.cu

nvcc -arch=sm_35 recursive_sum.o -lcudadevrt

I am working with a TeslaK40.

The code now compiles and works but produces the wrong output.

3.380729 (correct) vs 4.165090 (wrong)

```
#include <stdio.h>
double rec_sum(double * v, int n){
if (n <=1)
return v[0];
double sum_left = rec_sum(v,n/2);
double sum_right = rec_sum(&v[n/2],n - (n/2));
return sum_left + sum_right;
}
__global__ void kernel_rec_sum_global_mem(double * v, int n){
if (n <=1){
if (threadIdx.x==0){
v[0] = v[0] + v[1];
}
return;
}else{
if (threadIdx.x == 0){
kernel_rec_sum_global_mem<<<1,2>>>(v , n/2 );
}
if (threadIdx.x == 1){
kernel_rec_sum_global_mem<<<1,2>>>(&v[n/2], n - (n/2));
}
__syncthreads(); //makes sure all the threads have launched their sub kernel
if(threadIdx.x == 0){
cudaDeviceSynchronize(); //makes sure that all the sub kernels have finished as far as thread 0 is concerned
}
__syncthreads();// makes sure global memory is consistent
if(threadIdx.x == 0){
v[0] = v[0] + v[n/2];
}
}
}
int main(){
int n = 2<<3;
double *v = (double *) malloc(n * sizeof(double));
for (int i = 0; i < n; i++){
v[i] = 1.0/(1.0 + ((double )i ));
}
double total;
// test recursive sum
total = rec_sum(v, n);
printf("%f \n", total);
// test recursive sum with dynamic parallelism
cudaSetDevice( 0 );
cudaDeviceReset();
double *v_dev;
double *sum_dev;
cudaMalloc((void**)&v_dev, n*sizeof(double));
cudaMalloc((void**)&sum_dev, 1*sizeof(double));
cudaMemcpy(v_dev, v, n*sizeof(double), cudaMemcpyHostToDevice);
kernel_rec_sum_global_mem<<<1,2>>>(v_dev, n);
cudaDeviceSynchronize();
total = 0;
cudaMemcpy(&total, v_dev, 1*sizeof(double), cudaMemcpyDeviceToHost);
printf("%f \n", total);
cudaFree(v_dev);
cudaFree(sum_dev);
return 0;
}
```

The first FULL CUDA code of my post had an obvious error. I was passing shared memory into a kernel argument, which is not allowed.

```
#include <stdio.h>
double rec_sum(double * v, int n){
if (n <=1)
return v[0];
double sum_left = rec_sum(v,n/2);
double sum_right = rec_sum(&v[n/2],n - (n/2));
return sum_left + sum_right;
}
__global__ void kernel_rec_sum(double* sum, double * v, int n){
if (n <=1){
*sum = v[0];
return;
}
__shared__ double sum_left;
__shared__ double sum_right;
if (threadIdx.x == 0){
kernel_rec_sum<<<1,2>>>(&sum_left, v , n/2 );
}
if (threadIdx.x == 1){
kernel_rec_sum<<<1,2>>>(&sum_right, &v[n/2], n - (n/2) );
}
__syncthreads(); //makes sure all the threads have launched their sub kernel
if(threadIdx.x == 0){
cudaDeviceSynchronize(); //makes sure that all the sub kernels have finished as far as thread 0 is concerned
}
__syncthreads();// makes sure shared memory is coordinated
if(threadIdx.x == 0){
*sum = sum_left + sum_right;
}
}
int main(){
int n = 2<<3;
double *v = (double *) malloc(n * sizeof(double));
for (int i = 0; i < n; i++){
v[i] = 1.0/(1.0 + ((double )i ));
}
double total;
// test recursive sum
total = rec_sum(v, n);
cend(&cputime);
printf("%f , %f\n", total,cputime);
// test recursive sum with dynamic parallelism
cudaSetDevice( 0 );
cudaDeviceReset();
double *v_dev;
double *sum_dev;
cudaMalloc((void**)&v_dev, n*sizeof(double));
cudaMalloc((void**)&sum_dev, 1*sizeof(double));
cudaMemcpy(v_dev, v, n*sizeof(double), cudaMemcpyHostToDevice);
kernel_rec_sum<<<1,2>>>(sum_dev, v_dev, n);
cudaDeviceSynchronize();
total = 0;
cudaMemcpy(&total, sum_dev, 1*sizeof(double), cudaMemcpyDeviceToHost);
printf("%f \n", total);
cudaFree(v_dev);
cudaFree(sum_dev);
return 0;
}
```