I have encountered the next problem:
Program returns the right result, when I’m using only one block:
…
kernel<<< 1, N>>>(d_x, N, dimx, dimy, d_ind, BOX,
epsilon, K, d_box, d_lis, scalx, scaly,
BOX1, d_boxx1, d_boxy1, d_lisx1, d_lisy1, d_mxi, d_myi,d_dxy2, d_psi, d_xx, d_yy, d_indx, d_indy, d_boxx, d_boxy, d_lisx, d_lisy);
…
But it returns the wrong value, when I’m splitting it into several blocks:
…
kernel<<< N/BLOCKSIZE, BLOCKSIZE >>>(d_x, N, dimx, dimy, d_ind, BOX,
epsilon, K, d_box, d_lis, scalx, scaly,
BOX1, d_boxx1, d_boxy1, d_lisx1, d_lisy1, d_mxi, d_myi,d_dxy2, d_psi, d_xx, d_yy, d_indx, d_indy, d_boxx, d_boxy, d_lisx, d_lisy);
…
Where N is a size of input data.
Here is the kernel code:
#include "kernel.cuh"
#define _K 7
#define _dimx 4
#define _dimy 4
__global__ void kernel(double *x, int N, int dimx, int dimy, int* ind, int bs,
double epsgrid, int K, int *box, int *lis, double scalx, double scaly,
int bs1, int *boxx1,int * boxy1, int *lisx1, int *lisy1, int *mxi, int *myi, double* dxy2, double* psi,
double* xx, double* yy, int* indx, int* indy, int* boxx, int* boxy, int* lisx, int* lisy) {
int k, d;
int i = threadIdx.x + blockIdx.x * blockDim.x;
double epsx, epsy;
double dy, dx;
int nx2, ny2;
double xc [_dimx+_dimy]; //dimx+dimy
int nn [_K+1]; //K + 1
for (d = 0; d < dimx + dimy; d++)
xc[d] = x[d*N + ind[i]];
neiK(x, N, dimx + dimy, 0, dimx, ind[i], bs, epsgrid, K, box, lis, nn);
epsx = 0;
for (d = 0; d < dimx; d++)
for (k = 1; k <= K; k++)
if ((dx = fabs(xc[d] - x[d*N + nn[k]])) > epsx)
epsx = dx;
epsy = 0;
for (d = dimx; d < dimx + dimy; d++)
for (k = 1; k <= K; k++)
if ((dy = fabs(xc[d] - x[d*N + nn[k]])) > epsy)
epsy = dy;
nx2 = neiE1(x, ind[i], scalx, bs1, epsx, boxx1, lisx1, mxi);
ny2 = neiE1(&x[dimx*N], ind[i], scaly, bs1, epsy, boxy1, lisy1, myi);
dxy2[i] = psi[nx2] + psi[ny2];
__syncthreads();
//Reduction
if(i == 0)
for (k = 1; k<N; k++) {
dxy2[0] += dxy2[k];
}
}
__device__ void neiK(double *x, int N, int dim, int comp1, int comp2, int i, int bs,
double epsgrid, int K, int *box, int *lis, int *nn) {
int k, ix, iy, ix1, iy1, ix2, jj, step, ib = bs - 1;
int el;
double dd, dy;
int d;
double dn [_K+1]; //K+1
double xx [_dimx]; //dimx || dimy
for (k = 0; k < dim; k++)
xx[k] = x[k*N + i];
dn[0] = 0;
for (k = 1; k <= K; k++)
dn[k] = 1e30;
ix = (int) (xx[comp1] / epsgrid) & ib;
iy = (int) (xx[comp2] / epsgrid) & ib;
jj = 0;
while (dn[K] > epsgrid * (jj - 1)) {
step = (jj) ? 2 * jj : 1;
for (ix1 = ix - jj; ix1 <= ix + jj; ix1++) {
ix2 = ix1 & ib;
for (iy1 = iy - jj; iy1 <= iy + jj; iy1 += step) {
el = box[ix2*bs + (iy1 & ib)];
while (el != -1) {
if (el != i) {
dd = fabs(xx[0] - x[el]);
for (d = 1; d < dim; d++)
if ((dy = fabs(xx[d] - x[d*N + el])) > dd)
dd = dy;
if (dd < dn[K]) {
k = K;
while (dd < dn[k]) {
if (k < K) {
dn[k + 1] = dn[k];
nn[k + 1] = nn[k];
}
k--;
}
dn[k + 1] = dd;
nn[k + 1] = el;
}
}
el = lis[el];
}
}
}
for (ix1 = ix - jj; ix1 <= ix + jj; ix1 += step) {
ix2 = ix1 & ib;
for (iy1 = iy - jj + 1; iy1 <= iy + jj - 1; iy1++) {
el = box[ix2*bs +(iy1 & ib)];
while (el != -1) {
if (el != i) {
dd = fabs(xx[0] - x[el]);
for (d = 1; d < dim; d++)
if ((dy = fabs(xx[d] - x[d*N + el])) > dd)
dd = dy;
if ((dy = fabs(xx[1] - x[1*N + el])) > dd)
dd = dy;
if (dd < dn[K]) {
k = K;
while (dd < dn[k]) {
if (k < K) {
dn[k + 1] = dn[k];
nn[k + 1] = nn[k];
}
k--;
}
dn[k + 1] = dd;
nn[k + 1] = el;
}
}
el = lis[el];
}
}
}
jj++;
if (jj == bs / 2)
break;
}
if (jj == (bs / 2)) { //half of the layer
for (ix1 = ix - jj; ix1 < ix + jj; ix1++) {
ix2 = ix1 & ib;
iy1 = iy - jj;
el = box[ix2*bs +(iy1 & ib)];
while (el != -1) {
if (el != i) {
dd = fabs(xx[0] - x[el]);
for (d = 1; d < dim; d++)
if ((dy = fabs(xx[d] - x[d*N + el])) > dd)
dd = dy;
if (dd < dn[K]) {
k = K;
while (dd < dn[k]) {
if (k < K) {
dn[k + 1] = dn[k];
nn[k + 1] = nn[k];
}
k--;
}
dn[k + 1] = dd;
nn[k + 1] = el;
}
}
el = lis[el];
}
}
ix1 = ix - jj;
ix2 = ix1 & ib;
for (iy1 = iy - jj + 1; iy1 <= iy + jj - 1; iy1++) {
el = box[ix2*bs +(iy1 & ib)];
while (el != -1) {
if (el != i) {
dd = fabs(xx[0] - x[el]);
for (d = 1; d < dim; d++)
if ((dy = fabs(xx[d] - x[d*N + el])) > dd)
dd = dy;
if (dd < dn[K]) {
k = K;
while (dd < dn[k]) {
if (k < K) {
dn[k + 1] = dn[k];
nn[k + 1] = nn[k];
}
k--;
}
dn[k + 1] = dd;
nn[k + 1] = el;
}
}
el = lis[el];
}
}
}
}
__device__ int neiE1(double *x, int i, double scal, int bs, double eps, int *box,
int *lis, int *mxi) {
double dd;
int mi, mp, mm, nx = 0;
double xc = x[i];
mp = int((xc + eps) * scal);
if (mp > bs)
mp = bs;
mm = int((xc - eps) * scal);
if (mm < 0)
mm = 0;
mi = box[mp];
while (mi >= 0) {
dd = x[mi] - xc;
if (fabs(dd) <= eps)
nx++;
mi = lis[mi];
}
if (mm >= mp)
return nx - 1;
mi = box[mm];
while (mi >= 0) {
dd = xc - x[mi];
if (fabs(dd) <= eps)
nx++;
mi = lis[mi];
}
nx += mxi[mp - 1] - mxi[mm];
return nx - 1;
}
Have absolutely no idea: what could be the reason. Please, help. Tried different GPUs with compute capabilities 1.1 (GF 9600GT) and 1.3 (Tesla C2050). Both Windows and Linux. Same problem.