Yet Another question about wired crash.

Hi Guys,

Just tried a very simple root finding algorithm as following. And every time I try to run it, it will crash on GPU. It runs well with EmuDebug mode. Does any one have any idea about what is wrong?

Thanks,

  • Yong.

////////////////////////////////////////////////////////////////////////////////////////////
#define GRID_DIM 32
#define BLOCK_DIM 32

int main( int argc, char** argv)
{
CUT_CHECK_DEVICE();

float *result;
CUDA_SAFE_CALL(cudaMalloc((void **)&result, sizeof(float)));

float *h_a0, *d_a0;
float *h_b0, *d_b0;
float *d_c;
h_a0 = (float*)malloc(sizeof(float)*GRID_DIM);
h_b0 = (float*)malloc(sizeof(float)*GRID_DIM);
CUDA_SAFE_CALL(cudaMalloc((void**)&d_a0, GRID_DIM*sizeof(float)));
CUDA_SAFE_CALL(cudaMalloc((void**)&d_b0, GRID_DIM*sizeof(float)));
CUDA_SAFE_CALL(cudaMalloc((void**)&d_c, GRID_DIM*sizeof(float)));
for(int i=0; i<GRID_DIM; i++){
	h_a0[i] = 0.0;
	h_b0[i] = 0.5;
}
CUDA_SAFE_CALL(cudaMemcpy(d_a0, h_a0, GRID_DIM*sizeof(float), cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(d_b0, h_b0, GRID_DIM*sizeof(float), cudaMemcpyHostToDevice));

NSectionRootFinder<<<32, BLOCK_DIM>>>(d_a0,d_b0,d_c);
printf("Final Result = %f\n", d_c[0]);
free(h_a0);
free(h_b0);
cudaFree(d_a0);
cudaFree(d_b0);
cudaFree(d_c);
CUT_CHECK_ERROR("Kernel execution failed");
CUT_EXIT(argc, argv);
return 0;

}

/////////////////////////////////////////////////////////////////////////////////
//
// Target function.
//
device float f(float x){
float f = 0;
f = (x-0.25)*(x-0.25);
return f;
}

// N-section root finding algorithm.

global void NSectionRootFinder(float *a, float *b, float *res)
{
shared float a_n[BLOCK_DIM];
shared float f_an[BLOCK_DIM];
shared bool bIsResult;
shared float result;

bIsResult = false;
int index;
__shared__ float a0, b0;

a0 = a[blockIdx.x];
b0 = b[blockIdx.x];
int iterCount = 0;

float fa0, fb0;
fa0 = 1.0;
fb0 = 1.0;
float ai;

while(!bIsResult){

	ai = a0+ (b0-a0)/BLOCK_DIM * threadIdx.x;
	a_n[threadIdx.x] = ai;
	f_an[threadIdx.x] = f(ai);
	__syncthreads();

	//printf("(%d,%d): f[%f]=%f\n", blockIdx.x, threadIdx.x, a_n[threadIdx.x], f_an[threadIdx.x]);

	if(threadIdx.x < BLOCK_DIM-1){
		index = threadIdx.x;
		if(f_an[index] * f_an[index+1] <=0){
			a0 = a_n[index];
			b0 = a_n[index+1];
			fa0 = f_an[index];
			fb0 = f_an[index+1];
			//printf("(%d,%d), Get it!!\n", blockIdx.x, threadIdx.x);
		}
	}
	__syncthreads();
	//printf("a0=%f, b0=%f. fa0=%f, fb0=%f\n", a0, b0, fa0, fb0);

	if(fa0*fb0>0){
		if(!bIsResult)
			result = 0.0;
		break;
	}
	
	if(fabsf(a0-b0) <= TOL|| fa0<=TOL || fb0<=TOL){
		bIsResult = true;
		if(fa0<=TOL){
			//printf("Final result a0= %f\n", a0);
			result = a0;
		}
		if(fb0<=TOL){
			//printf("Final result b0= %f\n", b0);
			result = b0;
		}
		break;
	}
	__syncthreads();
	iterCount++;
}
res[blockIdx.x] = result;
//printf("Final result = %f\n", result);

}

you’re calling syncthreads from inside a branch (the while loop). Some threads may exit the loop while others are still in the body resulting in a crash when sync_threads is called. That’s my guess.

Do you have any idea about how can I avoid the syncThread in the loop? In the case above, I have to synchronize all the thread before I move to next iteration.

Is there any signal type of function I can call in CUDA so I can notify other thread to so something according to thread?