CUDA 2.3 problems with error 25 = "incorrect use of __syncthreads()"

Since upgrading to 2.3 I’m getting new exceptions in emulation mode that resolve to cudaError_t = 25, “incorrect use of __syncthreads()” that I did not get before the upgrade. I have included some sample code wrapped in a try-catch loop that illustrates this problem. I see no reason for this code to trigger that error. Were there changes made in 2.3 that would explain this, and what must I do other than wrapping all my kernel calls in try-catch loops and filtering for that error?

[ Host side ]

extern "C"

void ProcessBreedEvents( AllDeviceData &d, int t, int sz, int ne_obs[MAX_T_OBS] ) { 

	<  snip >

	

	cudaThreadSynchronize();

	cudaStream_t eStream;

	

	cudaStreamCreate( &eStream );

	try {

		_RPTF0(_CRT_ERROR, "	 ProcessBreedEvents -> kProcess_IBreedEvents .....  \n");

		kProcessIBreedEvents<<< dimGridLi, dimBlock, 0, eStream >>>( d.breedEvents, d.popILoci, d.LI_all_freq, 

				d.ObHomL1, d.Hmz, d.sumHmz, d.Ne, d.sumNe, d.ObHomL1Tot_obs, t, sz );

		while( cudaStreamQuery(eStream) != 0) { };

		err = cudaGetLastError();

		cudaStreamDestroy(eStream);

		if (err != 0) throw(err);

	}

	catch( cudaError_t e ) {

		_RPTF1(_CRT_ERROR, "  Error from kProcess_IBreedEvents: %d\n", e);

		_RPTF1(_CRT_ERROR, "		   Error: %s\n",cudaGetErrorString(e));

	}

	catch( ... ) {

		_RPTF0(_CRT_ERROR, "  Unknown execption caught from kProcess_IBreedEvents \n");

	}

}

[ Device side ]

__global__ void

kProcessIBreedEvents( BreedEvent* brdEvents, I_Locus* iLoci, FreqMatrix li_af, float* homL1, 

					 float* hmz, unsigned int* sumHmz, unsigned int* d_ne, unsigned int* sumNe, 

					 float* obHomTot_obs, int t, int sz)

{

	__shared__ unsigned int popInc[256]; 

	__shared__ unsigned int hmzInc[256];

	int tNdx;

	int gNdx;

	int locNdx;

	int afNdx1;

	int afNdx2;

	int e;  //   the event (sz)

	int s;  // the locus (NUM_X_LOCI)

	int iTmp1;

	int iTmp2;

	volatile int num_loci = li_af.mZ;

	volatile int num_ter = li_af.mX;

	volatile int max_alleles = li_af.mY;

	__syncthreads();  // added this one out of desperation, no joy!

	if (threadIdx.x == 0 && s == 0) {

		for (e=0; e<blockDim.x; e++) {

			popInc[e] = 0;  

			hmzInc[e] = 0; 

		}

	}

	__syncthreads();

	e = threadIdx.x + __mul24(blockIdx.x,blockDim.x); // sz

	s = blockIdx.y;  // num_loci

	gNdx = t + e*MAX_T_OBS;

	if (e < sz && s < num_loci && brdEvents[gNdx].male_id != USHORT_MAX && brdEvents[gNdx].female_id != USHORT_MAX) {

			tNdx = brdEvents[gNdx].terNdx;

			locNdx = brdEvents[gNdx].male_id + __mul24(s,MAX_POP_SIZE);

			afNdx1 = tNdx + __mul24(num_ter,iLoci[locNdx].aID_1) + s*__mul24(num_ter,max_alleles);

			afNdx2 = tNdx + __mul24(num_ter,iLoci[locNdx].aID_2) + s*__mul24(num_ter,max_alleles);

			if (s == 0 && iLoci[brdEvents[gNdx].male_id].aID_1 == iLoci[brdEvents[gNdx].male_id].aID_2 ) hmzInc[e] = hmzInc[e] + 1;

			if (s == 0) popInc[e] = popInc[e] + 1;

			li_af.elements[afNdx1] =  li_af.elements[afNdx1] + 1.0f;

			li_af.elements[afNdx2] =  li_af.elements[afNdx2] + 1.0f;

			

			locNdx = brdEvents[gNdx].female_id + __mul24(s,MAX_POP_SIZE);

			afNdx1 = tNdx +  __mul24(num_ter,iLoci[locNdx].aID_1) + s*__mul24(num_ter,max_alleles);

			afNdx2 = tNdx +  __mul24(num_ter,iLoci[locNdx].aID_2) + s*__mul24(num_ter,max_alleles);

			if (s == 0 && iLoci[brdEvents[gNdx].female_id].aID_1 == iLoci[brdEvents[gNdx].female_id].aID_2) hmzInc[e] = hmzInc[e] + 1;

			if (s == 0) popInc[e] = popInc[e] + 1;

			li_af.elements[afNdx1] =  li_af.elements[afNdx1]+1.0f;

			li_af.elements[afNdx2] =  li_af.elements[afNdx2]+1.0f;

	} 

	__syncthreads();

	// end of block

	if (threadIdx.x == blockDim.x-1) {

		if (s == 0) {

			iTmp1 = 0;

			iTmp2 = 0;

			for (e=0; e<blockDim.x; e++) {

				iTmp1 = iTmp1 + popInc[e];

				iTmp2 = iTmp2 + hmzInc[e];

			}

			atomicAdd(&d_ne[t],iTmp1);

			atomicAdd(&sumNe[t],iTmp1);

			atomicAdd(&sumHmz[t],iTmp2);	

		}	

		__threadfence();

		if (blockIdx.x == gridDim.x-1 && blockIdx.y == gridDim.y-1) {

			obHomTot_obs[t] = __fdiv_rn((float)sumHmz[t],(float)d_ne[t]);

			hmz[t] = (float) sumHmz[t];

			homL1[t] = (float) sumHmz[t];

		}

	}

}

The output:

ProcessBreedEvents -> kProcess_IBreedEvents …

First-chance exception at 0x76dd619d in PiSim.exe: Microsoft C++ exception: cudaError at memory location 0x0012c180…

First-chance exception at 0x76dd619d in PiSim.exe: Microsoft C++ exception: cudaError at memory location 0x0012c184…

First-chance exception at 0x76dd619d in PiSim.exe: Microsoft C++ exception: cudaError at memory location 0x0012c4c8…

    Error from kProcess_IBreedEvents: 25

            Error: incorrect use of __syncthreads()

Does this error make any sense? Could it be the __threadfence()?

Thanks in advance for any help that can be provided.

  • R

Did I post this question in the wrong forum, and if not then why no responses? Nvidia guys did you miss this post?

  • R

A simple and complete reproduction case is usually very appreciated by the nVidia guys. In fact, this is the first thing they tend to ask for ;)

Christian