Sync Kernel in the Kernel Function

Hi,

I noticed that if I do the following:

int i, n;

n = 10000;

for (i = 0; i < n; i++) {

   // One kernel launch per pass; each pass produces `width` ints in `out`.
   launch_kernel<<<gridDim, blockDim>>>(in, out, width);

   // cudaMemcpy takes four arguments -- the copy direction was missing.
   // The loop counter is `i` (`ii` is not declared in this snippet).
   // Pass i is stored at element offset i*width so that consecutive passes
   // do not overlap (assumes hout is an int* with room for n*width ints --
   // TODO confirm against the real declaration of hout).
   cudaMemcpy(hout + i*width, out, width*sizeof(int), cudaMemcpyDeviceToHost);

}

Then the driver crashes on Windows (for large n), but not on Linux. I think that is because CUDA needs to go through the Windows API, but I am not 100% sure.

Anyway, is there a way to place this for loop inside the kernel and synchronize once all of the computation is done? It is clear that __syncthreads() would not work, since it only synchronizes the threads within a single block, but what would?

thanks

bump

You will have to elaborate a little more on how the kernel works and what it has to do.

I am trying to make sure that all threads in the grid have completed their task before doing something else. I have something like

__device__ static short allOnes(int * arry, int n) {
	// Reports whether each of the first n entries of arry equals 1.
	// Returns 1 when they all do (including n <= 0, where nothing is
	// inspected) and 0 otherwise.
	short stillOnes = 1;
	int pos = 0;

	// Stop at the first entry that is not 1; later entries are never read.
	while (stillOnes && pos < n) {
		stillOnes = (arry[pos] == 1) ? 1 : 0;
		++pos;
	}

	return stillOnes;
}

/*
 * Computes rows 1 .. height-1 of `out` inside a single kernel launch.
 *
 * Row 0 of `out` (its first `width` ints) is the input. For each i in
 * [1, height), element x of row i is fun(left, right), where left and right
 * are elements x-1 and x+1 of row i-1; neighbours outside [0, width) read
 * as 0.
 *
 * Launch layout: 1-D grid of 1-D blocks, one thread per column, so
 * gridDim.x * blockDim.x must cover `width`, and numBlocks must equal
 * gridDim.x.
 * Dynamic shared memory: at least (2*blockDim.x + 5) ints; the
 * 2*(blockDim.x + 3) ints used by the existing launch is sufficient.
 *
 * Grid-wide barrier: before rows 2 and later, every block must see the
 * halo values its neighbours wrote for the previous row. Each block
 * publishes the row number it is about to compute in lock[bx] and then
 * waits until every entry of lock[0 .. numBlocks-1] has reached that
 * number. The numbers only ever grow, so no flag is reset between rounds
 * and a fast block cannot confuse a slow one.
 *
 * Requirements the kernel cannot enforce by itself:
 *  - `lock` must hold at least numBlocks ints and should be zeroed by the
 *    host before the launch (e.g. with cudaMemset). Each block also clears
 *    its own flag on entry, but that alone cannot protect against another
 *    block polling a stale value before this block has started.
 *  - Every block of the grid must be resident on the device at the same
 *    time; otherwise the spin-wait never completes. A plain <<<...>>>
 *    launch does not guarantee this. On CUDA 9 and later the supported
 *    mechanism is a cooperative launch with
 *    cooperative_groups::this_grid().sync().
 */
__global__ static void compute_kernel(int * out, int width, int height, int * lock, int numBlocks) {

	extern __shared__ int buffer[];

	int tx = threadIdx.x, bx = blockIdx.x, dx = blockDim.x;

	// Two tiles in the dynamic shared buffer. newSmem uses indices 1..dx;
	// oldSmem uses 0..dx+1 (one halo cell on each side). The offset must be
	// derived from the block *size* (dx), not the block index, so that the
	// tiles never overlap and stay inside the allocation.
	int * newSmem = buffer;
	int * oldSmem = &buffer[dx + 3];

	// Flags and halo values are produced by other blocks, so they are
	// accessed through volatile pointers to force a real memory access.
	volatile int * lockVol = lock;
	const volatile int * outVol = out;

	int i, j, index = bx*dx + tx;

	// Best-effort reset of this block's own flag (see the header comment).
	if (tx == 0)
		lockVol[bx] = 0;

	for (i = 1; i < height; i++) {

		if (i == 1) {

			// First pass: load this block's slice of row 0.
			oldSmem[tx + 1] = index < width ? out[index] : 0;

		} else {

			// Announce that this block has finished row i-1. All of the
			// block's writes to that row were fenced and followed by a
			// __syncthreads() at the end of the previous pass.
			if (tx == 0)
				lockVol[bx] = i;

			// The threads of the block share the polling work: each one
			// waits on a strided subset of the per-block flags.
			for (j = tx; j < numBlocks; j += dx) {
				while (lockVol[j] < i)
					;
			}

			// Keep the halo reads below ordered after the flag reads.
			__threadfence();

		}

		__syncthreads();

		// Halo cells come from row i-1 in global memory. The left halo is
		// only fetched when this block owns at least one in-range column,
		// which also keeps the read inside the row.
		if (tx == 0)
			oldSmem[0] = (index > 0 && index < width) ? outVol[index + (i-1)*width - 1] : 0;

		if (tx == dx - 1)
			oldSmem[dx + 1] = index + 1 < width ? outVol[index + (i-1)*width + 1] : 0;

		__syncthreads();

		if (index < width) {

			newSmem[tx+1] = fun(oldSmem[tx], oldSmem[tx+2]);

			out[index + i*width] = newSmem[tx+1];

		} else {

			// Out-of-range columns keep acting as zero neighbours.
			newSmem[tx+1] = 0;

		}

		// Make this thread's row-i result visible device-wide before the
		// block signals arrival at the next barrier.
		__threadfence();

		__syncthreads();

		oldSmem[tx+1] = newSmem[tx+1];

		__syncthreads();

	}

}

<snip>

	// Lock array handed to the kernel: one int per block plus one extra slot.
	cudaMalloc((void **) &d_lock, (gridsize.x+1) * sizeof(int));

	// cudaMalloc does not clear the allocation; start every flag at zero so
	// the kernel never observes leftover values.
	cudaMemset(d_lock, 0, (gridsize.x+1) * sizeof(int));

	// blockDim is a device-side built-in and is not usable in host code; the
	// block size on the host is blocksize.x (assumes blocksize is a dim3,
	// like gridsize -- TODO confirm).
	compute_kernel<<<gridsize, blocksize, 2 * (blocksize.x+3) * sizeof(int)>>>(d_out, width, steps, d_lock, gridsize.x);

Tell me if you need any more information.