Since upgrading to 2.3 I’m getting new exceptions in emulation mode that resolve to cudaError_t = 25, “incorrect use of __syncthreads()” that I did not get before the upgrade. I have included some sample code wrapped in a try-catch loop that illustrates this problem. I see no reason for this code to trigger that error. Were there changes made in 2.3 that would explain this, and what must I do other than wrapping all my kernel calls in try-catch loops and filtering for that error?
[ Host side ]
extern "C"
void ProcessBreedEvents( AllDeviceData &d, int t, int sz, int ne_obs[MAX_T_OBS] ) {
< snip >
cudaThreadSynchronize();
cudaStream_t eStream;
cudaStreamCreate( &eStream );
try {
_RPTF0(_CRT_ERROR, " ProcessBreedEvents -> kProcess_IBreedEvents ..... \n");
kProcessIBreedEvents<<< dimGridLi, dimBlock, 0, eStream >>>( d.breedEvents, d.popILoci, d.LI_all_freq,
d.ObHomL1, d.Hmz, d.sumHmz, d.Ne, d.sumNe, d.ObHomL1Tot_obs, t, sz );
while( cudaStreamQuery(eStream) != 0) { };
err = cudaGetLastError();
cudaStreamDestroy(eStream);
if (err != 0) throw(err);
}
catch( cudaError_t e ) {
_RPTF1(_CRT_ERROR, " Error from kProcess_IBreedEvents: %d\n", e);
_RPTF1(_CRT_ERROR, " Error: %s\n",cudaGetErrorString(e));
}
catch( ... ) {
_RPTF0(_CRT_ERROR, " Unknown execption caught from kProcess_IBreedEvents \n");
}
}
[ Device side ]
__global__ void
kProcessIBreedEvents( BreedEvent* brdEvents, I_Locus* iLoci, FreqMatrix li_af, float* homL1,
float* hmz, unsigned int* sumHmz, unsigned int* d_ne, unsigned int* sumNe,
float* obHomTot_obs, int t, int sz)
{
__shared__ unsigned int popInc[256];
__shared__ unsigned int hmzInc[256];
int tNdx;
int gNdx;
int locNdx;
int afNdx1;
int afNdx2;
int e; // the event (sz)
int s; // the locus (NUM_X_LOCI)
int iTmp1;
int iTmp2;
volatile int num_loci = li_af.mZ;
volatile int num_ter = li_af.mX;
volatile int max_alleles = li_af.mY;
__syncthreads(); // added this one out of desperation, no joy!
if (threadIdx.x == 0 && s == 0) {
for (e=0; e<blockDim.x; e++) {
popInc[e] = 0;
hmzInc[e] = 0;
}
}
__syncthreads();
e = threadIdx.x + __mul24(blockIdx.x,blockDim.x); // sz
s = blockIdx.y; // num_loci
gNdx = t + e*MAX_T_OBS;
if (e < sz && s < num_loci && brdEvents[gNdx].male_id != USHORT_MAX && brdEvents[gNdx].female_id != USHORT_MAX) {
tNdx = brdEvents[gNdx].terNdx;
locNdx = brdEvents[gNdx].male_id + __mul24(s,MAX_POP_SIZE);
afNdx1 = tNdx + __mul24(num_ter,iLoci[locNdx].aID_1) + s*__mul24(num_ter,max_alleles);
afNdx2 = tNdx + __mul24(num_ter,iLoci[locNdx].aID_2) + s*__mul24(num_ter,max_alleles);
if (s == 0 && iLoci[brdEvents[gNdx].male_id].aID_1 == iLoci[brdEvents[gNdx].male_id].aID_2 ) hmzInc[e] = hmzInc[e] + 1;
if (s == 0) popInc[e] = popInc[e] + 1;
li_af.elements[afNdx1] = li_af.elements[afNdx1] + 1.0f;
li_af.elements[afNdx2] = li_af.elements[afNdx2] + 1.0f;
locNdx = brdEvents[gNdx].female_id + __mul24(s,MAX_POP_SIZE);
afNdx1 = tNdx + __mul24(num_ter,iLoci[locNdx].aID_1) + s*__mul24(num_ter,max_alleles);
afNdx2 = tNdx + __mul24(num_ter,iLoci[locNdx].aID_2) + s*__mul24(num_ter,max_alleles);
if (s == 0 && iLoci[brdEvents[gNdx].female_id].aID_1 == iLoci[brdEvents[gNdx].female_id].aID_2) hmzInc[e] = hmzInc[e] + 1;
if (s == 0) popInc[e] = popInc[e] + 1;
li_af.elements[afNdx1] = li_af.elements[afNdx1]+1.0f;
li_af.elements[afNdx2] = li_af.elements[afNdx2]+1.0f;
}
__syncthreads();
// end of block
if (threadIdx.x == blockDim.x-1) {
if (s == 0) {
iTmp1 = 0;
iTmp2 = 0;
for (e=0; e<blockDim.x; e++) {
iTmp1 = iTmp1 + popInc[e];
iTmp2 = iTmp2 + hmzInc[e];
}
atomicAdd(&d_ne[t],iTmp1);
atomicAdd(&sumNe[t],iTmp1);
atomicAdd(&sumHmz[t],iTmp2);
}
__threadfence();
if (blockIdx.x == gridDim.x-1 && blockIdx.y == gridDim.y-1) {
obHomTot_obs[t] = __fdiv_rn((float)sumHmz[t],(float)d_ne[t]);
hmz[t] = (float) sumHmz[t];
homL1[t] = (float) sumHmz[t];
}
}
}
The output:
ProcessBreedEvents → kProcess_IBreedEvents …
First-chance exception at 0x76dd619d in PiSim.exe: Microsoft C++ exception: cudaError at memory location 0x0012c180…
First-chance exception at 0x76dd619d in PiSim.exe: Microsoft C++ exception: cudaError at memory location 0x0012c184…
First-chance exception at 0x76dd619d in PiSim.exe: Microsoft C++ exception: cudaError at memory location 0x0012c4c8…
Error from kProcess_IBreedEvents: 25
Error: incorrect use of __syncthreads()
Does this error make any sense? Could it be the __threadfence()?
Thanks in advance for any help that can be provided.
- R