Hi Guys,
Just tried a very simple root finding algorithm as following. And every time I try to run it, it will crash on GPU. It runs well with EmuDebug mode. Does any one have any idea about what is wrong?
Thanks,
- Yong.
////////////////////////////////////////////////////////////////////////////////////////////
#define GRID_DIM 32
#define BLOCK_DIM 32
int main( int argc, char** argv)
{
CUT_CHECK_DEVICE();
float *result;
CUDA_SAFE_CALL(cudaMalloc((void **)&result, sizeof(float)));
float *h_a0, *d_a0;
float *h_b0, *d_b0;
float *d_c;
h_a0 = (float*)malloc(sizeof(float)*GRID_DIM);
h_b0 = (float*)malloc(sizeof(float)*GRID_DIM);
CUDA_SAFE_CALL(cudaMalloc((void**)&d_a0, GRID_DIM*sizeof(float)));
CUDA_SAFE_CALL(cudaMalloc((void**)&d_b0, GRID_DIM*sizeof(float)));
CUDA_SAFE_CALL(cudaMalloc((void**)&d_c, GRID_DIM*sizeof(float)));
for(int i=0; i<GRID_DIM; i++){
h_a0[i] = 0.0;
h_b0[i] = 0.5;
}
CUDA_SAFE_CALL(cudaMemcpy(d_a0, h_a0, GRID_DIM*sizeof(float), cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(d_b0, h_b0, GRID_DIM*sizeof(float), cudaMemcpyHostToDevice));
NSectionRootFinder<<<32, BLOCK_DIM>>>(d_a0,d_b0,d_c);
printf("Final Result = %f\n", d_c[0]);
free(h_a0);
free(h_b0);
cudaFree(d_a0);
cudaFree(d_b0);
cudaFree(d_c);
CUT_CHECK_ERROR("Kernel execution failed");
CUT_EXIT(argc, argv);
return 0;
}
/////////////////////////////////////////////////////////////////////////////////
//
// Target function.
//
device float f(float x){
float f = 0;
f = (x-0.25)*(x-0.25);
return f;
}
// N-section root finding algorithm.
global void NSectionRootFinder(float *a, float *b, float *res)
{
shared float a_n[BLOCK_DIM];
shared float f_an[BLOCK_DIM];
shared bool bIsResult;
shared float result;
bIsResult = false;
int index;
__shared__ float a0, b0;
a0 = a[blockIdx.x];
b0 = b[blockIdx.x];
int iterCount = 0;
float fa0, fb0;
fa0 = 1.0;
fb0 = 1.0;
float ai;
while(!bIsResult){
ai = a0+ (b0-a0)/BLOCK_DIM * threadIdx.x;
a_n[threadIdx.x] = ai;
f_an[threadIdx.x] = f(ai);
__syncthreads();
//printf("(%d,%d): f[%f]=%f\n", blockIdx.x, threadIdx.x, a_n[threadIdx.x], f_an[threadIdx.x]);
if(threadIdx.x < BLOCK_DIM-1){
index = threadIdx.x;
if(f_an[index] * f_an[index+1] <=0){
a0 = a_n[index];
b0 = a_n[index+1];
fa0 = f_an[index];
fb0 = f_an[index+1];
//printf("(%d,%d), Get it!!\n", blockIdx.x, threadIdx.x);
}
}
__syncthreads();
//printf("a0=%f, b0=%f. fa0=%f, fb0=%f\n", a0, b0, fa0, fb0);
if(fa0*fb0>0){
if(!bIsResult)
result = 0.0;
break;
}
if(fabsf(a0-b0) <= TOL|| fa0<=TOL || fb0<=TOL){
bIsResult = true;
if(fa0<=TOL){
//printf("Final result a0= %f\n", a0);
result = a0;
}
if(fb0<=TOL){
//printf("Final result b0= %f\n", b0);
result = b0;
}
break;
}
__syncthreads();
iterCount++;
}
res[blockIdx.x] = result;
//printf("Final result = %f\n", result);
}