Hi…
I’m developing a simulation of atom behavior in crystals (BCC and FCC lattices; Fe, Au, Cu, …).
I have 2 different CUDA devices:
- GeForce 9800 GX2 (compute capability 1.1)
- Tesla D870 (compute capability 1.0)
This is the problematic part of the device code that generates the atom network:
/*
 * Generates the atom network. The outer loop visits sizeB base positions
 * (curB, advanced by (bx, by, bz)); for each one the inner loop takes sizeG
 * steps of (gx, gy, gz) starting from that base position. Every visited
 * point is tested against all cntTriangs triangles and, if it passes, an
 * Atom record is written to out[idxOut].
 */
for(int i = 0; i < sizeB; i++){
/* Restart the inner walk from the current base position and base cell id. */
curG.x = curB.x; curG.y = curB.y; curG.z = curB.z;
cellID_G = cellID_B;
for(int j = 0; j < sizeG; j++){
mask = maskBase;
/* If curG is behind a triangle (grain border), the inline device function
   IsBehindTriang returns 0, so cntConj ends up 0 after the loop below;
   it stays 1 only if every triangle test returned 1. */
cntConj = 1;
for(uint it = 0; it < cntTriangs; it++){
/* Plane coefficients of triangle `it`: normal (a, b, c) and offset d.
   triangShared is presumably a shared-memory copy, judging by its name - confirm. */
a = triangShared[it].n.x; b = triangShared[it].n.y; c = triangShared[it].n.z; d = triangShared[it].d;
cntConj *= IsBehindTriang(a, b, c, d, curG.x, curG.y, curG.z);
}
/* Block-wide barrier. NOTE(review): no shared-memory write is visible between
   the barriers in this excerpt, so they may be unnecessary - confirm. */
__syncthreads();
/* Write the record only for points that passed every triangle test. */
if(cntConj == 1){
out[idxOut].type = type;
out[idxOut].tempr = tempr;
out[idxOut].id = cellID_G;
out[idxOut].grainID = grainID;
out[idxOut].mask = mask;
out[idxOut].pos.x = curG.x;
out[idxOut].pos.y = curG.y;
out[idxOut].pos.z = curG.z;
/* Neighbour ("mate") ids: this cell's id plus the per-direction offset mateID[dir]. */
out[idxOut].mate[DIR_BCC_0] = cellID_G + mateID[DIR_BCC_0];
out[idxOut].mate[DIR_BCC_1] = cellID_G + mateID[DIR_BCC_1];
out[idxOut].mate[DIR_BCC_2] = cellID_G + mateID[DIR_BCC_2];
out[idxOut].mate[DIR_BCC_3] = cellID_G + mateID[DIR_BCC_3];
out[idxOut].mate[DIR_BCC_4] = cellID_G + mateID[DIR_BCC_4];
out[idxOut].mate[DIR_BCC_5] = cellID_G + mateID[DIR_BCC_5];
out[idxOut].mate[DIR_BCC_6] = cellID_G + mateID[DIR_BCC_6];
out[idxOut].mate[DIR_BCC_7] = cellID_G + mateID[DIR_BCC_7];
}
__syncthreads();
/* Advance position, output slot and cell id on every iteration, whether or not
   a record was written; slots of rejected points are left untouched here. */
curG.x += gx; curG.y += gy; curG.z += gz;
idxOut++;
cellID_G++;
}
/* Move to the next base position; the base cell id advances by stepB. */
curB.x += bx; curB.y += by; curB.z += bz;
cellID_B += stepB;
}
When I run this code on the GeForce 9800 GX2, the atom network is generated correctly…
On the Tesla I receive CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES…
When I disable this part of the code, the Tesla works fine:
//out[idxOut].mate[DIR_BCC_0] = cellID_G + mateID[DIR_BCC_0];
//out[idxOut].mate[DIR_BCC_1] = cellID_G + mateID[DIR_BCC_1];
//out[idxOut].mate[DIR_BCC_2] = cellID_G + mateID[DIR_BCC_2];
//out[idxOut].mate[DIR_BCC_3] = cellID_G + mateID[DIR_BCC_3];
//out[idxOut].mate[DIR_BCC_4] = cellID_G + mateID[DIR_BCC_4];
//out[idxOut].mate[DIR_BCC_5] = cellID_G + mateID[DIR_BCC_5];
//out[idxOut].mate[DIR_BCC_6] = cellID_G + mateID[DIR_BCC_6];
//out[idxOut].mate[DIR_BCC_7] = cellID_G + mateID[DIR_BCC_7];
Any ideas what causes this problem?
Used stuff:
/* Vec is an alias for CUDA's built-in float3 vector type (members x, y, z). */
#define Vec float3
/* cntConj: running product of IsBehindTriang results for the current point.
   cntConjMate: the same kind of product, one entry per mate direction (DIR_BCC_n).
   NOTE(review): these only ever need to hold 0 or 1; `long` is 64 bits wide on
   64-bit Linux, so a narrower type may lower per-thread resource use - confirm. */
long cntConj, cntConjMate[MAX_MATES];
/* Output array of Atom records; the kernel excerpt indexes it with idxOut. */
Atom* out ...
/* One atom record as written to out[] by the network-generation code. */
struct Atom{
char type;             /* atom type value supplied to the kernel */
ushort tempr;          /* set from the kernel's tempr value; presumably a temperature - confirm */
int id;                /* cell id of this atom (cellID_G at write time) */
uint grainID;          /* id of the grain the atom belongs to */
long mask;             /* bit mask: starts as maskBase, per-mate bits may be OR-ed in */
Vec pos;               /* position of the atom (curG at write time) */
int mate[MAX_MATES];   /* neighbour cell ids, indexed by DIR_BCC_n: own id + mateID[dir] */
};
My second problem is that I want to extend the inner loop so that it checks the current curB and all of its closest mates too:
/*
 * Tests the current point (curG) and each of its eight BCC mate positions
 * against every triangle. Each result is multiplied into the matching
 * accumulator, so an accumulator stays 1 only if all triangle tests for that
 * position returned 1.
 *
 * NOTE(review): assumes cntConj and every cntConjMate[] entry were reset to 1
 * before this loop; the excerpt does not show that initialisation.
 */
for(uint it = 0; it < cntTriangs; it++){
    /* Plane coefficients of triangle `it`: normal (a, b, c) and offset d. */
    a = triangShared[it].n.x; b = triangShared[it].n.y; c = triangShared[it].n.z; d = triangShared[it].d;
    cntConj *= IsBehindTriang(a, b, c, d, curG.x, curG.y, curG.z);
    cntConjMate[DIR_BCC_0] *= IsBehindTriang(a, b, c, d, matePos[DIR_BCC_0].x, matePos[DIR_BCC_0].y, matePos[DIR_BCC_0].z);
    cntConjMate[DIR_BCC_1] *= IsBehindTriang(a, b, c, d, matePos[DIR_BCC_1].x, matePos[DIR_BCC_1].y, matePos[DIR_BCC_1].z);
    cntConjMate[DIR_BCC_2] *= IsBehindTriang(a, b, c, d, matePos[DIR_BCC_2].x, matePos[DIR_BCC_2].y, matePos[DIR_BCC_2].z);
    cntConjMate[DIR_BCC_3] *= IsBehindTriang(a, b, c, d, matePos[DIR_BCC_3].x, matePos[DIR_BCC_3].y, matePos[DIR_BCC_3].z);
    cntConjMate[DIR_BCC_4] *= IsBehindTriang(a, b, c, d, matePos[DIR_BCC_4].x, matePos[DIR_BCC_4].y, matePos[DIR_BCC_4].z);
    cntConjMate[DIR_BCC_5] *= IsBehindTriang(a, b, c, d, matePos[DIR_BCC_5].x, matePos[DIR_BCC_5].y, matePos[DIR_BCC_5].z);
    cntConjMate[DIR_BCC_6] *= IsBehindTriang(a, b, c, d, matePos[DIR_BCC_6].x, matePos[DIR_BCC_6].y, matePos[DIR_BCC_6].z);
    /* Mate 7 accumulates into its own slot; it previously overwrote slot 6,
       leaving slot 7 untouched and corrupting the result for mate 6. */
    cntConjMate[DIR_BCC_7] *= IsBehindTriang(a, b, c, d, matePos[DIR_BCC_7].x, matePos[DIR_BCC_7].y, matePos[DIR_BCC_7].z);
}
/* Block-wide barrier after the triangle loop. NOTE(review): no shared-memory
   write is visible around it in this excerpt, so it may be unnecessary - confirm. */
__syncthreads();
/* Each cntConjMate[] entry is expected to be 0 or 1 here, so the multiplication
   either keeps or zeroes the per-direction constant before it is OR-ed into mask.
   NOTE(review): cntConjMate[DIR_BCC_7] is read below - verify that the preceding
   loop really updates index 7. */
mask |= (CELL_MASK_MATE_0_BIT_1 * cntConjMate[DIR_BCC_0]);
mask |= (CELL_MASK_MATE_1_BIT_1 * cntConjMate[DIR_BCC_1]);
mask |= (CELL_MASK_MATE_2_BIT_1 * cntConjMate[DIR_BCC_2]);
mask |= (CELL_MASK_MATE_3_BIT_1 * cntConjMate[DIR_BCC_3]);
mask |= (CELL_MASK_MATE_4_BIT_1 * cntConjMate[DIR_BCC_4]);
mask |= (CELL_MASK_MATE_5_BIT_1 * cntConjMate[DIR_BCC_5]);
mask |= (CELL_MASK_MATE_6_BIT_1 * cntConjMate[DIR_BCC_6]);
mask |= (CELL_MASK_MATE_7_BIT_1 * cntConjMate[DIR_BCC_7]);
/* Write the record only for points that passed every triangle test. */
if(cntConj == 1){
.........
out[idxOut].mask = mask; /* This causes CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES on GX2 and Tesla too... */
.........
}
__syncthreads();
So after the inner for loop, each entry of the cntConjMate array is either 0 or 1,
and I want to set the bit for that mate direction in the atom mask…
When I write this mask to global memory, CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES is thrown…
I’m developing on Linux (Ubuntu 8.04, 64-bit) with NVIDIA driver version 180.06.
Thanks for any help
Regards BasY