Blocks execution problem Unexpectable behavior

I have encountered the next problem:

Program returns the right result, when I’m using only one block:

…

kernel<<< 1, N>>>(d_x, N, dimx, dimy, d_ind, BOX,

		epsilon, K, d_box, d_lis, scalx, scaly, 

		BOX1, d_boxx1, d_boxy1, d_lisx1, d_lisy1, d_mxi, d_myi,d_dxy2, d_psi, d_xx, d_yy, d_indx, d_indy, d_boxx, d_boxy, d_lisx, d_lisy);

…

But it returns the wrong value, when I’m splitting it into several blocks:

…

kernel<<< N/BLOCKSIZE, BLOCKSIZE >>>(d_x, N, dimx, dimy, d_ind, BOX,

		epsilon, K, d_box, d_lis, scalx, scaly, 

		BOX1, d_boxx1, d_boxy1, d_lisx1, d_lisy1, d_mxi, d_myi,d_dxy2, d_psi, d_xx, d_yy, d_indx, d_indy, d_boxx, d_boxy, d_lisx, d_lisy);

…

Where N is a size of input data.

Here is the kernel code:

#include "kernel.cuh"

#define _K 7

#define _dimx 4

#define _dimy 4

__global__ void kernel(double *x, int N, int dimx, int dimy, int* ind, int bs,

		double epsgrid, int K, int *box, int *lis, double scalx, double scaly, 

		int bs1, int *boxx1,int * boxy1, int *lisx1, int *lisy1, int *mxi, int *myi, double* dxy2, double* psi,

		double* xx, double* yy, int* indx, int* indy, int* boxx, int* boxy, int* lisx, int* lisy) {

		

		int k, d;

		int i = threadIdx.x + blockIdx.x * blockDim.x;

		double epsx, epsy;

		double dy, dx;

		int nx2, ny2;

		double xc [_dimx+_dimy]; //dimx+dimy

		int nn [_K+1]; //K + 1

		for (d = 0; d < dimx + dimy; d++)

			xc[d] = x[d*N + ind[i]];

		neiK(x, N, dimx + dimy, 0, dimx, ind[i], bs, epsgrid, K, box, lis, nn);

		epsx = 0;

		for (d = 0; d < dimx; d++)

			for (k = 1; k <= K; k++)

				if ((dx = fabs(xc[d] - x[d*N + nn[k]])) > epsx)

					epsx = dx;

		epsy = 0;

		for (d = dimx; d < dimx + dimy; d++)

			for (k = 1; k <= K; k++)

				if ((dy = fabs(xc[d] - x[d*N + nn[k]])) > epsy)

					epsy = dy;

		nx2 = neiE1(x, ind[i], scalx, bs1, epsx, boxx1, lisx1, mxi);

		ny2 = neiE1(&x[dimx*N], ind[i], scaly, bs1, epsy, boxy1, lisy1, myi);

		dxy2[i] = psi[nx2] + psi[ny2];

		__syncthreads();

		

                //Reduction

                if(i == 0)

			for (k = 1; k<N; k++) {

				dxy2[0] += dxy2[k];

		}

		

}

__device__ void neiK(double *x, int N, int dim, int comp1, int comp2, int i, int bs,

		double epsgrid, int K, int *box, int *lis, int *nn) { 

	int k, ix, iy, ix1, iy1, ix2, jj, step, ib = bs - 1;

	int el;

	double dd, dy;

	int d;

	 double dn [_K+1]; //K+1

	 double xx [_dimx]; //dimx || dimy

	

	for (k = 0; k < dim; k++)

		xx[k] = x[k*N + i];

	dn[0] = 0;

	for (k = 1; k <= K; k++)

		dn[k] = 1e30;

	ix = (int) (xx[comp1] / epsgrid) & ib;

	iy = (int) (xx[comp2] / epsgrid) & ib;

	jj = 0;

	while (dn[K] > epsgrid * (jj - 1)) {

		step = (jj) ? 2 * jj : 1;

		for (ix1 = ix - jj; ix1 <= ix + jj; ix1++) {

			ix2 = ix1 & ib;

			for (iy1 = iy - jj; iy1 <= iy + jj; iy1 += step) {

				el = box[ix2*bs + (iy1 & ib)];

				while (el != -1) {

					if (el != i) {

						dd = fabs(xx[0] - x[el]);

						for (d = 1; d < dim; d++)

							if ((dy = fabs(xx[d] - x[d*N + el])) > dd)

								dd = dy;

						if (dd < dn[K]) {

							k = K;

							while (dd < dn[k]) {

								if (k < K) {

									dn[k + 1] = dn[k];

									nn[k + 1] = nn[k];

								}

								k--;

							}

							dn[k + 1] = dd;

							nn[k + 1] = el;

						}

					}

					el = lis[el];

				}

			}

		}

		for (ix1 = ix - jj; ix1 <= ix + jj; ix1 += step) {

			ix2 = ix1 & ib;

			for (iy1 = iy - jj + 1; iy1 <= iy + jj - 1; iy1++) {

				el = box[ix2*bs +(iy1 & ib)];

				while (el != -1) {

					if (el != i) {

						dd = fabs(xx[0] - x[el]);

						for (d = 1; d < dim; d++)

							if ((dy = fabs(xx[d] - x[d*N + el])) > dd)

								dd = dy;

						if ((dy = fabs(xx[1] - x[1*N + el])) > dd)

							dd = dy;

						if (dd < dn[K]) {

							k = K;

							while (dd < dn[k]) {

								if (k < K) {

									dn[k + 1] = dn[k];

									nn[k + 1] = nn[k];

								}

								k--;

							}

							dn[k + 1] = dd;

							nn[k + 1] = el;

						}

					}

					el = lis[el];

				}

			}

		}

		jj++;

		if (jj == bs / 2)

			break;

	}

	if (jj == (bs / 2)) { //half of the layer

		for (ix1 = ix - jj; ix1 < ix + jj; ix1++) {

			ix2 = ix1 & ib;

			iy1 = iy - jj;

			el = box[ix2*bs +(iy1 & ib)];

			while (el != -1) {

				if (el != i) {

					dd = fabs(xx[0] - x[el]);

					for (d = 1; d < dim; d++)

						if ((dy = fabs(xx[d] - x[d*N + el])) > dd)

							dd = dy;

					if (dd < dn[K]) {

						k = K;

						while (dd < dn[k]) {

							if (k < K) {

								dn[k + 1] = dn[k];

								nn[k + 1] = nn[k];

							}

							k--;

						}

						dn[k + 1] = dd;

						nn[k + 1] = el;

					}

				}

				el = lis[el];

			}

		}

		ix1 = ix - jj;

		ix2 = ix1 & ib;

		for (iy1 = iy - jj + 1; iy1 <= iy + jj - 1; iy1++) {

			el = box[ix2*bs +(iy1 & ib)];

			while (el != -1) {

				if (el != i) {

					dd = fabs(xx[0] - x[el]);

					for (d = 1; d < dim; d++)

						if ((dy = fabs(xx[d] - x[d*N + el])) > dd)

							dd = dy;

					if (dd < dn[K]) {

						k = K;

						while (dd < dn[k]) {

							if (k < K) {

								dn[k + 1] = dn[k];

								nn[k + 1] = nn[k];

							}

							k--;

						}

						dn[k + 1] = dd;

						nn[k + 1] = el;

					}

				}

				el = lis[el];

			}

		}

	}

}

__device__ int neiE1(double *x, int i, double scal, int bs, double eps, int *box,

		int *lis, int *mxi) {

	double dd;

	int mi, mp, mm, nx = 0;

	double xc = x[i];

	mp = int((xc + eps) * scal);

	if (mp > bs)

		mp = bs;

	mm = int((xc - eps) * scal);

	if (mm < 0)

		mm = 0;

	mi = box[mp];

	while (mi >= 0) {

		dd = x[mi] - xc;

		if (fabs(dd) <= eps)

			nx++;

		mi = lis[mi];

	}

	if (mm >= mp)

		return nx - 1;

	mi = box[mm];

	while (mi >= 0) {

		dd = xc - x[mi];

		if (fabs(dd) <= eps)

			nx++;

		mi = lis[mi];

	}

	nx += mxi[mp - 1] - mxi[mm];

	return nx - 1;

}

Have absolutely no idea: what could be the reason. Please, help. Tried different GPUs with compute capabilities 1.1 (GF 9600GT) and 1.3 (Tesla C2050). Both Windows and Linux. Same problem.

Without really having looked at the kernel code, I notice two things:

    If N is not a multiple of BLOCKSIZE, you need to round the number of blocks up (not down, as you currently do) and check at the beginning of the kernel whether the thread id is still within the desired range.

    You use __syncthreads() even though you don’t use shared memory. I assume you do this because of a data dependency in one of the many arrays passed to the kernel. However, __syncthreads() does not synchronize threads belonging to different blocks, so this is going to fail with more than one block.

How could I check the range of thread id? I should add code like

if (i < N) {

...

}

to the kernel?

And one more question.

Yes, I’m using __syncthreads()because of a data dependency. How could I synchronize threads in different blocks?

You can’t and trying to implement any algorithm that relies on interblock syncronization is probably not going to be successful. If you have a data dependency which requires synchronization, then use multiple kernel launches.

THANK YOU!!! I’ve removed the interblock syncronization and all works great!!!