I would like to get familiar with dynamic parallelism in CUDA 5.0 and, perhaps, exploiting it to accelerate a certain code of mine. I’m using K20 cards which have the needed compute capability (3.5). Below is the code without dynamic parallelism.

```
__global__ void interpolation(cufftDoubleComplex *Uj, double *points, cufftDoubleComplex *result, int N, int M)
{
int i = threadIdx.x + blockDim.x * blockIdx.x;
int PP;
double P;
const double alfa=(2.-1./cc)*pi_double-0.01;
double phi_cap_s;
cufftDoubleComplex temp;
double cc_points=cc*points[i];
double r_cc_points=rint(cc*points[i]);
temp = make_cuDoubleComplex(0.,0.);
if(i<M) {
for(int m=0; m<(2*K+1); m++) {
P = (K*K-(cc_points-(r_cc_points+m-K))*(cc_points-(r_cc_points+m-K)));
if(P>0.) phi_cap_s = (1./pi_double)*((sinh(alfa*sqrt(P)))/sqrt(P));
if(P<0.) phi_cap_s = (1./pi_double)*((sin(alfa*sqrt(-P)))/sqrt(-P));
if(P==0.) phi_cap_s = alfa/pi_double;
PP = modulo((r_cc_points + m -K ),(cc*N));
temp.x = temp.x+phi_cap_s*Uj[PP].x;
temp.y = temp.y+phi_cap_s*Uj[PP].y;
}
result[i] = temp;
}
}
```

Here is the code exploiting dynamic parallelism

```
__global__ void series_terms(double *points, cufftDoubleComplex *Uj, double *P, int N)
{
int m = threadIdx.x;
double cc_points=cc*points[0];
double r_cc_points=rint(cc*points[0]);
P[m] = (K*K-(cc_points-(r_cc_points+m-K))*(cc_points-(r_cc_points+m-K)));
}
__global__ void interpolation(cufftDoubleComplex *Uj, double *points, cufftDoubleComplex *result, int N, int M)
{
int i = threadIdx.x + blockDim.x * blockIdx.x;
int PP;
const double alfa=(2.-1./cc)*pi_double-0.01;
double phi_cap_s;
cufftDoubleComplex temp;
double cc_points=cc*points[i];
double r_cc_points=rint(cc*points[i]);
temp = make_cuDoubleComplex(0.,0.);
dim3 dimBlock(2*K+1,1); dim3 dimGrid(1,1);
if(i<M) {
// --- Global memory variables
double P2;
__syncthreads();
double* P; cudaMalloc((void**)&P,sizeof(double)*(2*K+1));
series_terms<<<dimGrid,dimBlock>>>(&points[i],Uj,P,N);
cudaDeviceSynchronize();
for(int m=0; m<(2*K+1); m++) {
if(P[m]>0.) phi_cap_s = (1./pi_double)*((sinh(alfa*sqrt(P[m])))/sqrt(P[m]));
if(P[m]<0.) phi_cap_s = (1./pi_double)*((sin(alfa*sqrt(-P[m])))/sqrt(-P[m]));
if(P[m]==0.) phi_cap_s = alfa/pi_double;
PP = modulo((r_cc_points + m -K ),(cc*N));
temp.x = temp.x+phi_cap_s*Uj[PP].x;
temp.y = temp.y+phi_cap_s*Uj[PP].y;
}
P2 = (K*K-(cc_points-(r_cc_points+2-K))*(cc_points-(r_cc_points+2-K)));
result[i] = make_cuDoubleComplex(P[2],P2);
}
}
```

As you can see, the two codes have a different output. In the dynamic programming case, result compares the values of P as calculated by the series_terms (child) kernel and by the parent kernel. The two outputs are different for certain locations i and coincide for other locations i. If I run the code many times, the number of locations for which the two outputs differ reduce until they coincide from a certain run on.

I believe I have a synchronization problem, but I cannot figure out how to solve the problem.

Any ideas?