Hi,
I noticed that if I do the following:
// Launch the kernel n times, copying each result row back to the host
// after every launch.
int i, n;
n = 10000;
for (i = 0; i < n; i++) {
    launch_kernel<<<gridDim, blockDim>>>(in, out, width);
    // cudaMemcpy has no default direction; the kind argument is mandatory.
    // The copy is blocking, so it also waits for the launch above to finish.
    // NOTE(review): assumes `hout` is a host int* with room for n*width ints
    // and `out` is a device pointer -- confirm against the allocations.
    // Each iteration stores `width` ints, so the destination advances by
    // i*width elements (pointer arithmetic already scales by sizeof(int)).
    cudaMemcpy(hout + i*width, out, width*sizeof(int), cudaMemcpyDeviceToHost);
}
then the driver crashes on Windows (for large n), but not on Linux. I think that is because CUDA needs to go through the Windows API, but I am not 100% sure.
Anyway, is there a way to place this for loop inside the kernel, and sync when all computation is done? It is clear that __syncthreads() would not work, but what will?
thanks
You will have to elaborate a little more on how the kernel works and what it has to do.
I am trying to make sure that all threads in the grid have completed their task before doing something else. I have something like:
/*
 * Returns 1 when every one of the first n entries of arry equals 1,
 * otherwise 0. An empty range (n <= 0) yields 1.
 *
 * The array is read through a volatile pointer so that every element is
 * fetched with a real memory load on each call. This matters when the
 * entries are flags written by other thread blocks: a plain int* lets the
 * compiler or cache reuse a previously loaded value.
 *
 * Also callable from host code, which allows checking it on the CPU.
 */
__host__ __device__ static short allOnes(const volatile int * arry, int n) {
    for (int k = 0; k < n; ++k) {
        if (arry[k] != 1)
            return 0;  // first non-1 entry decides the result
    }
    return 1;
}
/*
 * Fills rows 1 .. height-1 of `out`. Row 0 of `out` is the input; cell x of
 * row i is fun(row[i-1][x-1], row[i-1][x+1]), with 0 substituted for
 * neighbours outside [0, width). `fun` is defined elsewhere.
 *
 * Launch layout: 1-D grid and 1-D blocks, one thread per column
 * (index = blockIdx.x * blockDim.x + threadIdx.x); threads with
 * index >= width only take part in synchronisation.
 *
 * Dynamic shared memory: at least 2 * (blockDim.x + 3) * sizeof(int) bytes.
 *
 * Grid-wide barrier: because a block's halo cells of row i-1 are produced by
 * neighbouring blocks, all blocks synchronise before every row after the
 * first. The barrier is a monotonically increasing arrival counter kept in
 * lock[numBlocks].
 *
 * Preconditions:
 *   - lock[numBlocks] must be 0 when the kernel starts (zero it from the
 *     host before every launch; it cannot be initialised safely in-kernel
 *     because nothing orders that write against other blocks).
 *   - numBlocks must equal gridDim.x.
 *   - every block of the grid must be resident on the device at the same
 *     time; otherwise the spin-wait never finishes. Where available, a
 *     cooperative launch with cooperative_groups::this_grid().sync() is the
 *     supported way to get the same barrier.
 */
__global__ static void compute_kernel(int * out, int width, int height, int * lock, int numBlocks) {
    extern __shared__ int buffer[];
    const int tx = threadIdx.x, bx = blockIdx.x, dx = blockDim.x;
    // Two staging rows of dx + 2 cells each (one halo cell on either side).
    // The second row starts at dx + 3 so the two never overlap.
    int * newSmem = buffer;
    int * oldSmem = &buffer[dx + 3];
    // Volatile views force real memory loads for data written by other blocks.
    volatile int * lockVol = lock;
    volatile int * outVol = out;
    const int index = bx*dx + tx;
    const bool inRange = index < width;
    int i;

    // Seed the interior of the previous-row buffer from row 0.
    oldSmem[tx + 1] = inRange ? out[index] : 0;

    for (i = 1; i < height; i++) {
        if (i > 1) {
            // Grid barrier: wait until every block has published row i-1.
            // All threads of this block fenced their stores and passed a
            // __syncthreads() at the end of the previous iteration, so
            // thread 0 may now announce the block's arrival.
            if (tx == 0) {
                atomicAdd(&lock[numBlocks], 1);
                // This is barrier number i-1; each one adds numBlocks arrivals.
                while (lockVol[numBlocks] < (i - 1) * numBlocks)
                    ;
                __threadfence();
            }
            __syncthreads();
        }

        // Halo cells come from global memory because neighbouring blocks
        // own them. Out-of-range blocks skip the left read entirely so they
        // never index past the rows they belong to.
        if (tx == 0)
            oldSmem[0] = (inRange && index > 0) ? outVol[index + (i-1)*width - 1] : 0;
        if (tx == dx - 1)
            oldSmem[dx + 1] = index + 1 < width ? outVol[index + (i-1)*width + 1] : 0;
        __syncthreads();

        if (inRange) {
            newSmem[tx + 1] = fun(oldSmem[tx], oldSmem[tx + 2]);
            out[index + i*width] = newSmem[tx + 1];
            // Make the store visible device-wide before this block signals
            // the next barrier.
            __threadfence();
        }
        __syncthreads();

        // Out-of-range cells keep their initial 0 instead of picking up
        // uninitialised shared memory.
        if (inRange)
            oldSmem[tx + 1] = newSmem[tx + 1];
        __syncthreads();
    }
}
<snip>
// One int per block plus one extra slot for grid-wide coordination.
cudaMalloc((void **) &d_lock, (gridsize.x+1) * sizeof(int));
// cudaMalloc does not initialise memory; start the lock array from a known
// all-zero state before the kernel touches it.
cudaMemset(d_lock, 0, (gridsize.x+1) * sizeof(int));
// Shared memory: two staging rows of (threads per block + 3) ints each.
// `blockDim` is a device-side built-in and is not usable here; the host-side
// block size is taken from the launch configuration instead.
// NOTE(review): assumes `blocksize` is a dim3 like `gridsize` -- confirm.
compute_kernel<<<gridsize, blocksize, 2 * (blocksize.x+3) * sizeof(int)>>>(d_out, width, steps, d_lock, gridsize.x);
Tell me if you need any more information.