0 result

Hi,
I commented on the last few lines of code. This code is supposed to implement Finite element method with Dirichlet boundary condition. The outcome of this kernel should produce meaningful vectors: local_stiff & local_load.But this kernel produces 0 for the values of this vector. If I use debug mode to compile it, it creates correct output, namely local_stiff, local_load are correct. But release mode are incorrect. These two vectors are 0 as output. So I get inside and try to comment out individual lines to find out what the problem is. I realize after trial and error that
in the last few lines stiff_local[xxx] = y; when xxx is an variable, then all output are 0, however I used these lines in the upper part of the program. So if I comment out the last few lines with stiff_local, then there are non-zero output.

I don’t know what is wrong, is it a bug of the compiler??

----------------code------------------

global void kernel_local_fem(MATKEY *lstif_key, VECKEY *lld_key, MATVAL local_stiff, VECVAL * local_load, struct Elem * elem, struct Coor * coor, int * id, double * const_gauss, int ne){
int ind = THREADNUM
blockIdx.x+threadIdx.x;
if(ind >= ne){
return;
}

/* calculate the local stiffness matrix element wise*/
double x1 = coor[elem[ind].first-1].x;
double x2 = coor[elem[ind].second-1].x;
double x3 = coor[elem[ind].third-1].x;
double y1 = coor[elem[ind].first-1].y;
double y2 = coor[elem[ind].second-1].y;
double y3 = coor[elem[ind].third-1].y;
double diff_x13 = x1 - x3;
double diff_x23 = x2 - x3;
double diff_y13 = y1 - y3;
double diff_y23 = y2 - y3;
double absDet = fabs(diff_x13*diff_y23-diff_x23*diff_y13);
double c = 1.0/2.0/absDet;
double l00 = c*(diff_y23*diff_y23+diff_x23*diff_x23);
double l01 = c*(-diff_y23*diff_y13-diff_x23*diff_x13);
double l11 = c*(diff_y13*diff_y13+diff_x13*diff_x13);
int sbase = 9*ind;
/* update the local stiffness matrix element wise. 
	All exterior node element will have one of the index as 0.
	Update the value of each local stiffness matrix first*/
local_stiff[sbase] = l00;
local_stiff[sbase+1] = l01;
local_stiff[sbase+4] = l11;
local_stiff[sbase+2] = -l00 - l01;
local_stiff[sbase+5] = -l11 - l01;
local_stiff[sbase+8] = l00 + l11 + 2*l01;
local_stiff[sbase+3] = l01;
local_stiff[sbase+6] = -l00 - l01;
local_stiff[sbase+7] = -l11 - l01;

/* Update the global index of each local stiffness matrix*/

VECKEY key[3];

for(int i = 0; i < 3; i++){
	key[i] = elem[ind][i] - 1;
}

for(int i = 0; i < 3; i++){
	for(int j = 0; j < 3; j++){
		lstif_key[sbase+3*i+j].r = key[i]; 
		lstif_key[sbase+3*i+j].c = key[j]; 
	}
}

/*
	printf("%f\n", local_stiff[sbase]);
	printf("%f\n", local_stiff[sbase+1]);
	printf("%f\n", local_stiff[sbase+2]);
	printf("%f\n", local_stiff[sbase+3]);
	printf("%f\n", local_stiff[sbase+4]);
	printf("%f\n", local_stiff[sbase+5]);
	printf("%f\n", local_stiff[sbase+6]);
	printf("%f\n", local_stiff[sbase+7]);
	printf("%f\n", local_stiff[sbase+8]);
 */

//use gauss rule to interpolate
int lbase = 3*ind;
local_load[lbase] = 0;
local_load[lbase+1] = 0;
local_load[lbase+2] = 0;

/* calculate local load vector element wise */
for(int i = 0; i < 16; i++){
	int gbase = i*3;
	double x = const_gauss[gbase];
	double y = const_gauss[gbase+1];
	double hx = diff_x13*x+diff_x23*y+x3;
	double hy = diff_y13*x+diff_y23*y+y3;
	local_load[lbase] += const_gauss[gbase+2]*(fk(hx,hy)*x);
	local_load[lbase+1] += const_gauss[gbase+2]*(fk(hx,hy)*y);
	local_load[lbase+2] += const_gauss[gbase+2]*(fk(hx,hy)*(1-x-y));
}

/* update the local load vector value */
local_load[lbase] *= absDet;
local_load[lbase+1] *= absDet;
local_load[lbase+2] *= absDet;

/* update the global index of each local load vector*/

for(int i = 0; i < 3; i++){
	lld_key[lbase+i] = key[i];
}

/* modify the matrix according to the boundary condition
	if i,j are both boundary node, a_ij = 1, l_i = g_i 
	if i,j are both inner node, a_ij unchanged, l_i unchanged
	if i is boundary node, and j is inner, a_ij = 0, l_i = g_i
	if i is inner node, and j is boundary, a_ij = 0, l_i -= a_ij*g_j
 */
double bdry_vec[3];
for(int i = 0; i < 3; i++){
	bdry_vec[i] = bdry(coor[key[i]].x, coor[key[i]].y);
}

for(int i = 0; i < 3; i++){
	if(id[key[i]] == -1){
		local_load[lbase+i] = bdry_vec[i];
		for(int j = 0; j < 3; j++){
			local_stiff[sbase+3*i+j] = 0; // this line makes local_stiff and local_load 0
		}
		local_stiff[sbase+3*i+i] = 1.0; // this line makes local_stiff and local_load 0
	}else{
		for(int j = 0; j < 3; j++){
			if(id[key[j]] == -1){
				local_load[lbase+i] -= bdry_vec[i]*local_stiff[sbase+3*i+j]; // this line makes local_stiff and local_load 0
				local_stiff[sbase+3*i+j] = 0; // this line makes local_stiff and local_load 0
			}
		}
	}
}

}

find out that if i change the block size it become ok, i guess some block level resource is not enough,
glad to have a fix.