Blocks execution problem Unexpectable behavior

A1taiR · February 1, 2011, 7:55pm

I have encountered the next problem:

Program returns the right result, when Iâ€™m using only one block:

â€¦

kernel<<< 1, N>>>(d_x, N, dimx, dimy, d_ind, BOX,

		epsilon, K, d_box, d_lis, scalx, scaly, 

		BOX1, d_boxx1, d_boxy1, d_lisx1, d_lisy1, d_mxi, d_myi,d_dxy2, d_psi, d_xx, d_yy, d_indx, d_indy, d_boxx, d_boxy, d_lisx, d_lisy);

â€¦

But it returns the wrong value, when Iâ€™m splitting it into several blocks:

â€¦

kernel<<< N/BLOCKSIZE, BLOCKSIZE >>>(d_x, N, dimx, dimy, d_ind, BOX,

		epsilon, K, d_box, d_lis, scalx, scaly, 

		BOX1, d_boxx1, d_boxy1, d_lisx1, d_lisy1, d_mxi, d_myi,d_dxy2, d_psi, d_xx, d_yy, d_indx, d_indy, d_boxx, d_boxy, d_lisx, d_lisy);

â€¦

Where N is a size of input data.

Here is the kernel code:

#include "kernel.cuh"

#define _K 7

#define _dimx 4

#define _dimy 4

__global__ void kernel(double *x, int N, int dimx, int dimy, int* ind, int bs,

		double epsgrid, int K, int *box, int *lis, double scalx, double scaly, 

		int bs1, int *boxx1,int * boxy1, int *lisx1, int *lisy1, int *mxi, int *myi, double* dxy2, double* psi,

		double* xx, double* yy, int* indx, int* indy, int* boxx, int* boxy, int* lisx, int* lisy) {

		

		int k, d;

		int i = threadIdx.x + blockIdx.x * blockDim.x;

		double epsx, epsy;

		double dy, dx;

		int nx2, ny2;

		double xc [_dimx+_dimy]; //dimx+dimy

		int nn [_K+1]; //K + 1

		for (d = 0; d < dimx + dimy; d++)

			xc[d] = x[d*N + ind[i]];

		neiK(x, N, dimx + dimy, 0, dimx, ind[i], bs, epsgrid, K, box, lis, nn);

		epsx = 0;

		for (d = 0; d < dimx; d++)

			for (k = 1; k <= K; k++)

				if ((dx = fabs(xc[d] - x[d*N + nn[k]])) > epsx)

					epsx = dx;

		epsy = 0;

		for (d = dimx; d < dimx + dimy; d++)

			for (k = 1; k <= K; k++)

				if ((dy = fabs(xc[d] - x[d*N + nn[k]])) > epsy)

					epsy = dy;

		nx2 = neiE1(x, ind[i], scalx, bs1, epsx, boxx1, lisx1, mxi);

		ny2 = neiE1(&x[dimx*N], ind[i], scaly, bs1, epsy, boxy1, lisy1, myi);

		dxy2[i] = psi[nx2] + psi[ny2];

		__syncthreads();

		

                //Reduction

                if(i == 0)

			for (k = 1; k<N; k++) {

				dxy2[0] += dxy2[k];

		}

		

}

__device__ void neiK(double *x, int N, int dim, int comp1, int comp2, int i, int bs,

		double epsgrid, int K, int *box, int *lis, int *nn) { 

	int k, ix, iy, ix1, iy1, ix2, jj, step, ib = bs - 1;

	int el;

	double dd, dy;

	int d;

	 double dn [_K+1]; //K+1

	 double xx [_dimx]; //dimx || dimy

	

	for (k = 0; k < dim; k++)

		xx[k] = x[k*N + i];

	dn[0] = 0;

	for (k = 1; k <= K; k++)

		dn[k] = 1e30;

	ix = (int) (xx[comp1] / epsgrid) & ib;

	iy = (int) (xx[comp2] / epsgrid) & ib;

	jj = 0;

	while (dn[K] > epsgrid * (jj - 1)) {

		step = (jj) ? 2 * jj : 1;

		for (ix1 = ix - jj; ix1 <= ix + jj; ix1++) {

			ix2 = ix1 & ib;

			for (iy1 = iy - jj; iy1 <= iy + jj; iy1 += step) {

				el = box[ix2*bs + (iy1 & ib)];

				while (el != -1) {

					if (el != i) {

						dd = fabs(xx[0] - x[el]);

						for (d = 1; d < dim; d++)

							if ((dy = fabs(xx[d] - x[d*N + el])) > dd)

								dd = dy;

						if (dd < dn[K]) {

							k = K;

							while (dd < dn[k]) {

								if (k < K) {

									dn[k + 1] = dn[k];

									nn[k + 1] = nn[k];

								}

								k--;

							}

							dn[k + 1] = dd;

							nn[k + 1] = el;

						}

					}

					el = lis[el];

				}

			}

		}

		for (ix1 = ix - jj; ix1 <= ix + jj; ix1 += step) {

			ix2 = ix1 & ib;

			for (iy1 = iy - jj + 1; iy1 <= iy + jj - 1; iy1++) {

				el = box[ix2*bs +(iy1 & ib)];

				while (el != -1) {

					if (el != i) {

						dd = fabs(xx[0] - x[el]);

						for (d = 1; d < dim; d++)

							if ((dy = fabs(xx[d] - x[d*N + el])) > dd)

								dd = dy;

						if ((dy = fabs(xx[1] - x[1*N + el])) > dd)

							dd = dy;

						if (dd < dn[K]) {

							k = K;

							while (dd < dn[k]) {

								if (k < K) {

									dn[k + 1] = dn[k];

									nn[k + 1] = nn[k];

								}

								k--;

							}

							dn[k + 1] = dd;

							nn[k + 1] = el;

						}

					}

					el = lis[el];

				}

			}

		}

		jj++;

		if (jj == bs / 2)

			break;

	}

	if (jj == (bs / 2)) { //half of the layer

		for (ix1 = ix - jj; ix1 < ix + jj; ix1++) {

			ix2 = ix1 & ib;

			iy1 = iy - jj;

			el = box[ix2*bs +(iy1 & ib)];

			while (el != -1) {

				if (el != i) {

					dd = fabs(xx[0] - x[el]);

					for (d = 1; d < dim; d++)

						if ((dy = fabs(xx[d] - x[d*N + el])) > dd)

							dd = dy;

					if (dd < dn[K]) {

						k = K;

						while (dd < dn[k]) {

							if (k < K) {

								dn[k + 1] = dn[k];

								nn[k + 1] = nn[k];

							}

							k--;

						}

						dn[k + 1] = dd;

						nn[k + 1] = el;

					}

				}

				el = lis[el];

			}

		}

		ix1 = ix - jj;

		ix2 = ix1 & ib;

		for (iy1 = iy - jj + 1; iy1 <= iy + jj - 1; iy1++) {

			el = box[ix2*bs +(iy1 & ib)];

			while (el != -1) {

				if (el != i) {

					dd = fabs(xx[0] - x[el]);

					for (d = 1; d < dim; d++)

						if ((dy = fabs(xx[d] - x[d*N + el])) > dd)

							dd = dy;

					if (dd < dn[K]) {

						k = K;

						while (dd < dn[k]) {

							if (k < K) {

								dn[k + 1] = dn[k];

								nn[k + 1] = nn[k];

							}

							k--;

						}

						dn[k + 1] = dd;

						nn[k + 1] = el;

					}

				}

				el = lis[el];

			}

		}

	}

}

__device__ int neiE1(double *x, int i, double scal, int bs, double eps, int *box,

		int *lis, int *mxi) {

	double dd;

	int mi, mp, mm, nx = 0;

	double xc = x[i];

	mp = int((xc + eps) * scal);

	if (mp > bs)

		mp = bs;

	mm = int((xc - eps) * scal);

	if (mm < 0)

		mm = 0;

	mi = box[mp];

	while (mi >= 0) {

		dd = x[mi] - xc;

		if (fabs(dd) <= eps)

			nx++;

		mi = lis[mi];

	}

	if (mm >= mp)

		return nx - 1;

	mi = box[mm];

	while (mi >= 0) {

		dd = xc - x[mi];

		if (fabs(dd) <= eps)

			nx++;

		mi = lis[mi];

	}

	nx += mxi[mp - 1] - mxi[mm];

	return nx - 1;

}

Have absolutely no idea: what could be the reason. Please, help. Tried different GPUs with compute capabilities 1.1 (GF 9600GT) and 1.3 (Tesla C2050). Both Windows and Linux. Same problem.

tera · February 2, 2011, 2:25am

Without really having looked at the kernel code, I notice two things:

[*]If N is not a multiple of BLOCKSIZE, you need to round the number of blocks up (not down, as you currently do) and check at the beginning of the kernel whether the thread id is still within the desired range.

[*]You use __syncthreads() even though you don’t use shared memory. I assume you do this because of a data dependency in one of the many arrays passed to the kernel. However, __syncthreads() does not synchronize threads belonging to different blocks, so this is going to fail with more than one block.

A1taiR · February 2, 2011, 8:50am

How could I check the range of thread id? I should add code like

if (i < N) {

...

}

to the kernel?

And one more question.

Yes, I’m using __syncthreads()because of a data dependency. How could I synchronize threads in different blocks?

avidday · February 2, 2011, 9:42am

You can’t and trying to implement any algorithm that relies on interblock syncronization is probably not going to be successful. If you have a data dependency which requires synchronization, then use multiple kernel launches.

A1taiR · February 2, 2011, 9:56am

THANK YOU!!! I’ve removed the interblock syncronization and all works great!!!

Topic		Replies	Views
do not understand thread/block division CUDA Programming and Performance	10	2894	April 23, 2012
Bug report: Incorrect block scheduling CUDA Programming and Performance	18	7876	February 19, 2010
Number of threads affecting answer; this should not happen a VERY strange error.. CUDA Programming and Performance	8	2578	July 17, 2009
Unexpected Behaviour 2+ block thread communication issues CUDA Programming and Performance	0	2302	February 23, 2010
Calculation stalls when using more than one block CUDA Programming and Performance	4	2920	August 17, 2010
Missing Kernel executions CUDA Programming and Performance	2	921	June 27, 2012
Wrong output when adding blocks what am I doing wrong? CUDA Programming and Performance	13	12090	December 4, 2007
Possible CUDA bug for multiple active blocks CUDA Programming and Performance	34	46839	June 17, 2009
synchronization and block independence CUDA Programming and Performance	3	1608	December 19, 2009
Not all blocks working properly in a computation CUDA Developer Tools	0	347	May 11, 2020

Blocks execution problem Unexpectable behavior

Related topics