I have studied many posts with similar titles, however, none of the resolutions satisfies my needs. In a nutshell, I want to replace a massive loop over the same kernel (always loading the same, fixed matrix) into one kernel-call with a single load of the matrix and a subsequent loop. The problem is not even loading the NxN matrix repeatedly but launching a kernel with N blocks for N^2 times when N>1000, for which the overhead completely dominates the computation time. But having the loop inside a parent kernel requires dynamic parallelism, i.e., calls to child-kernels by certain threads within the loop, which would require across-block synchronization before and/or after such calls at parent-level, for which cudaDeviceSynchronize() (or any other sync) apparently does not work. (The typical advice is to end the parent kernel to sync, which is not an option for me here!)

I enclosed a complete model CUDA-C code to illustrate my problem, where the cudaDeviceSynchronize() commands in the parent kernel don't have the desired effect and the while-loop races ahead uncontrollably (as the "loop" variable in each thread indicates). Here, each block of the parent loads a row of a fixed matrix M and repeatedly adds the rows to an evolving vector v. Each block results in a sum w which then the child trivially puts back into v for the next iteration. (In reality, more complex operations are involved.) Instead of the default inputs, running with n=8 and maxiter=10 should illustrate the problem.

```
/* nvcc -arch=sm_35 -rdc=true DynKerTestRedux.cu -o DynKerTestRedux.x */
#include<stdio.h>
#include<stdlib.h>
#define MAXTHREADS 256
#define MIN(a, b) ((a) < (b) ? (a) : (b))
/* Child kernel: publish the per-block sums w back into the iteration
 * vector v (one element per thread), so the parent's next pass reads
 * the updated v.
 * Expected launch: <<<mingrid, minblock>>> with mingrid*minblock >= n.
 * Fix: mingrid*minblock can exceed n when n is not a multiple of the
 * block size (e.g. n=300, block=256 -> 512 threads), so the tail
 * threads previously read/wrote out of bounds on v and w. Guard them.
 * NOTE: device-side printf is for debugging only; it serializes threads. */
__global__
void child(float *v,float *w,int n){
int index = blockDim.x*blockIdx.x+threadIdx.x;
if (index < n){
v[index] = w[index];
printf("child: %5d %f\n",index,v[index]);
}
}
/* Parent kernel: each block owns one matrix row (blockIdx.x), repeatedly
 * forms the row-sum w[blockIdx.x] of v + Mrow, and block 0's thread 0
 * launches `child` to fold w back into v, until *iter reaches *maxiter.
 * Launched as <<<n, minblock, n*sizeof(float)>>>.
 *
 * NOTE(review): the synchronization strategy here is the known defect
 * this code demonstrates — see the inline comments below. */
__global__
void parent(float *M,float *v,float *w,int *iter,int *maxiter,int maxiterations,int mingrid,int minblock,int n){
extern __shared__ float Mrow[];//one row of matrix in each block
__shared__ float r[MAXTHREADS];//needed for efficient reduction
int i,j,loop=0;
float thisr;
i = threadIdx.x;
// Strided copy of row blockIdx.x of M into shared memory, once per block.
while(i < n){
Mrow[i] = M[n*blockIdx.x+i];//shared storage, loaded only once
i += blockDim.x;//best for coalescence
}
// Only one thread in the whole grid initializes the iteration counters.
// NOTE(review): other blocks may reach the *iter read at the loop exit
// before this store is visible — there is no grid-wide ordering here.
if( (threadIdx.x == 0) && (blockIdx.x == 0)){
*iter = 0;
*maxiter = maxiterations;
}
__syncthreads();// block-local barrier only; does NOT order across blocks
do {//iter many times
i = threadIdx.x;
thisr = 0;
// Per-thread partial sum over this block's row, strided by blockDim.x.
while(i < n){
thisr += v[i] + Mrow[i];
i += blockDim.x;//best for coalescence
}
r[threadIdx.x] = thisr;
__syncthreads();
// Tree reduction in shared memory.
// NOTE(review): correct only when blockDim.x is a power of two; for
// n not a power of two (minblock = MIN(256,n)) odd widths drop lanes
// and r[threadIdx.x + j] can read uninitialized slots.
for (j=blockDim.x/2; j>0; j>>=1) {//reduction
if (threadIdx.x < j) r[threadIdx.x] += r[threadIdx.x + j];
__syncthreads();
}
if (threadIdx.x == 0){
w[blockIdx.x] = r[0]/n;
printf("parent: %5d %5d %5d %f\n",blockIdx.x,*iter,loop++,w[blockIdx.x]);
}
// NOTE(review): device-side cudaDeviceSynchronize() never was a
// grid-wide barrier — it only waited for child grids launched by the
// CALLING thread — and it is deprecated (CUDA 11.6) and removed
// (CUDA 12) in device code. It cannot stop other blocks from racing
// ahead here; that is exactly the observed failure. A grid-wide
// barrier needs cooperative groups' this_grid().sync() with a
// cooperative launch, or splitting the loop into separate launches.
cudaDeviceSynchronize();
if( (threadIdx.x == 0) && (blockIdx.x == 0)){//single-thread process!!!
child<<<mingrid,minblock>>>(v,w,n);
atomicAdd(iter,1);
}
cudaDeviceSynchronize();
// NOTE(review): plain (non-atomic, non-fenced) read of *iter — each
// block sees the counter at an arbitrary point, so blocks run
// different iteration counts (the runaway `loop` values).
}while( *iter < *maxiter );
}
/* Abort with file:line and the CUDA error string when a runtime call
 * fails. Kernel launches report config errors only via
 * cudaGetLastError(), and async execution errors at the next sync. */
#define CUDA_CHECK(call) do { \
    cudaError_t err_ = (call); \
    if (err_ != cudaSuccess) { \
        fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                cudaGetErrorString(err_)); \
        exit(EXIT_FAILURE); \
    } \
} while (0)

/* Driver: builds an n x n all-ones matrix M and a zero vector v, then
 * launches `parent` with one block per row and n*sizeof(float) dynamic
 * shared memory (one row per block; note 48 KB default limit caps n at
 * ~12K floats). Usage: prog [n maxiterations]. */
int main(int argc, char **argv){
float *h_M,*M,*h_v,*v,*w;
int n,minblock,mingrid,i,j,maxiterations;
int *iter,*maxiter;
/* Device 1 was hard-coded; fall back to device 0 on single-GPU boxes. */
if (cudaSetDevice(1) != cudaSuccess){
cudaGetLastError();/* clear the failed-set error */
CUDA_CHECK(cudaSetDevice(0));
}
n = 8192;//typical defaults
maxiterations = 1000000;
if(argc > 2){
n = atoi(argv[1]);
maxiterations = atoi(argv[2]);
}
if(n <= 0 || maxiterations <= 0){
fprintf(stderr,"usage: %s [n maxiterations] (both > 0)\n",argv[0]);
return EXIT_FAILURE;
}
minblock = MIN(MAXTHREADS,n);
mingrid = (n + minblock - 1) / minblock;//ceil-div so mingrid*minblock >= n
printf("init: %5d %5d %5d\n",n,mingrid,minblock);
h_M = (float *) malloc (sizeof(float) * n * n);//flattened matrix M
h_v = (float *) malloc (sizeof(float) * n);
if(h_M == NULL || h_v == NULL){
fprintf(stderr,"host allocation failed for n=%d\n",n);
return EXIT_FAILURE;
}
for( i=0; i<n; i++) for( j=0; j<n; j++) h_M[i*n+j] = 1;
for( i=0; i<n; i++) h_v[i] = 0;
CUDA_CHECK(cudaMalloc((void**)&iter, sizeof(int)));
CUDA_CHECK(cudaMalloc((void**)&maxiter, sizeof(int)));
CUDA_CHECK(cudaMalloc((void**)&M, sizeof(float) * n * n));
CUDA_CHECK(cudaMalloc((void**)&v, sizeof(float) * n));
CUDA_CHECK(cudaMalloc((void**)&w, sizeof(float) * n));
CUDA_CHECK(cudaMemcpy(M, h_M, sizeof(float) * n * n,cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(v, h_v, sizeof(float) * n,cudaMemcpyHostToDevice));
parent<<<n,minblock,n*sizeof(float)>>>(M,v,w,iter,maxiter,maxiterations,mingrid,minblock,n);
CUDA_CHECK(cudaGetLastError());/* catch bad launch config immediately */
CUDA_CHECK(cudaDeviceSynchronize());/* surface async kernel errors */
/* cudaMemcpy on the default stream blocks until the copy completes. */
CUDA_CHECK(cudaMemcpy(h_v, v, sizeof(float) * n,cudaMemcpyDeviceToHost));
for( i=0; i<n; i++) printf("result: %5d %f\n",i,h_v[i]);
/* Release everything — the original leaked all five device buffers
 * and both host arrays. */
CUDA_CHECK(cudaFree(iter));
CUDA_CHECK(cudaFree(maxiter));
CUDA_CHECK(cudaFree(M));
CUDA_CHECK(cudaFree(v));
CUDA_CHECK(cudaFree(w));
free(h_M);
free(h_v);
return(0);
}
```