how to debug random errors?

I have a program that runs fine like 9/10 times, the occassionally it just screws up and goes fubar for no reason. (like all threads will write 0 to an array)

Here is part of the code:

__global__ void UT_k_rmsd_calc(uint numAtoms, uint numConfs, uint ctr_ID, 

                               float3 const * const d_X, float *d_rmsds) {

// need to use dynamic alloc in the end after all -_-'''''

    extern __shared__ float smem[];     

uint tid = threadIdx.x;

    uint t = blockDim.x*blockIdx.x + tid;

float *s_rmsds = smem;

    float3 *s_conf_A = (float3 *) &s_rmsds[THREADSPERBLOCK];

//we cannot have more than TPB atoms in our system!

    //cooperative loading of coords into ctr_ID.

    if(t < numAtoms) {

        s_conf_A[tid] = d_X[ctr_ID*numAtoms + tid];

    }

__syncthreads(); 

if(t < numConfs) {

        calc_rmsd(tid, numAtoms, &d_X[tid*numAtoms], s_conf_A, s_rmsds);

        __syncthreads();

        d_rmsds[t] = s_rmsds[tid];

    }

}

__syncthreads() can only be used in locations that all threads can reach. Not all threads can reach the body of the following if-statement:

if (t < numConfs) { ... }

From the CUDA C Progamming Guide, section B.6:

“__syncthreads() is allowed in conditional code but only if the conditional evaluates identically across the entire thread block, otherwise the code execution is likely to hang or produce unintended side effects.”

Does the problem go away when you re-write the if-statement as follows:

if(t < numConfs) {        

   calc_rmsd(tid, numAtoms, &d_X[tid*numAtoms], s_conf_A, s_rmsds);

}

__syncthreads();

if(t < numConfs) {  

   d_rmsds[t] = s_rmsds[tid];

}

I have a similar problem. I run the following cuda kernel always with the same input data. Each execution (some “consecutive” threads of) about 5% of blocks gives wrong results. There are blocks that fail every execution, but most of the blocks fails every now and then.

I tested the code on different PCs (all running Linux) and on different GPUs (GeForce GTX480 and Tesla C2075). There are not substantial differences between GeForce and Tesla.

It’s quite difficult for me to explain in detail the purpose of this code here, it should calculate a step of gauss-jordan reduction for a N*N complex matrix. Basically, I want to subtract to every element of the matrix a linear combination of its top row and leftmost column elements.

Sorry for my english, I hope it’s possible to understand. External Image

I can attach the whole code, if needed.

Thank you.

#define N 1024

#define N2 N*N

#define twN 2*N

#define ReduceThreadsX 32

#define ReduceThreadsY 8

#define ReduceBlocksX 32

#define ReduceBlocksY 128

//

// r is the index of the column to be read.

//

// x: depth (discriminates between real and complex part), y: length, z: heigth

//

__global__ void matrixReduce (unsigned r, double * dev_M) {

  if ( ( (blockIdx.y+1)*blockDim.y - 1) > r && ( (blockIdx.z+1)*blockDim.z - 1) > r ){ // Run the block only if it contains elements to be evaluated.

    __shared__ double Piv[2];

    unsigned tx = threadIdx.x;

    unsigned ty = threadIdx.y;

    unsigned tz = threadIdx.z;

if (ty == 0 && tz == 0){

      Piv[tx] = dev_M[2*r*(N+1)+tx];

    }

    __syncthreads();

if (Piv[0] != 0 || Piv[1] != 0){ //If pivot equals 0, do nothing

    __shared__ double Top[2][ReduceThreadsX];

    __shared__ double Lft[2][ReduceThreadsY];

    __shared__ double Buf[2][ReduceThreadsX][ReduceThreadsY];

unsigned tidy = threadIdx.y + blockIdx.y * blockDim.y;

    unsigned tidz = threadIdx.z + blockIdx.z * blockDim.z;

// load the needed part of the top row (at index r)

    if ( tz == 0 && tidy > r){

      Top[tx][ty] = dev_M[twN*r + 2*tidy + tx];

    }

// loa the needed part of the left column (at index r)

    if ( ty == 0 && tidz > r ){

      Lft[tx][tz] = dev_M[twN*tidz + 2*r + tx];

    }

    __syncthreads();

//update the matrix element

    if ( (tidy > r) && (tidz > r) ){

      unsigned glbidx = twN*tidz + 2*tidy + tx;

      Buf[tx][ty][tz] = dev_M[glbidx];

//BufR[tx][ty] -= ( LftR[ty]*(TopR[tx]*PivR+TopI[tx]*PivI) + LftI[ty]*(TopR[tx]*PivI-TopI[tx]*PivR) )/(PivR*PivR + PivI*PivI);

      //BufI[tx][ty] -= ( LftI[ty]*(TopR[tx]*PivR+TopI[tx]*PivI) - LftR[ty]*(TopR[tx]*PivI-TopI[tx]*PivR) )/(PivR*PivR + PivI*PivI);

      Buf[tx][ty][tz] -= ( Lft[tx][tz]*(Top[0][ty]*Piv[0]+Top[1][ty]*Piv[1]) + (-2.0*tx+1)*Lft[(tx+1)%2][tz]*(Top[0][ty]*Piv[1]-Top[1][ty]*Piv[0]) )/(Piv[0]*Piv[0] + Piv[1]*Piv[1]);

dev_M[glbidx] = Buf[tx][ty][tz];

    }

    }

}

}

Some news: CPU and GPU give the same result if I change the line:

Buf[tx][ty][tz] -= ( Lft[tx][tz]*(Top[0][ty]*Piv[0]+Top[1][ty]*Piv[1]) + (-2.0*tx+1)*Lft[(tx+1)%2][tz]*(Top[0][ty]*Piv[1]-Top[1][ty]*Piv[0]) )/(Piv[0]*Piv[0] + Piv[1]*Piv[1]);

into:

Buf[tx][ty][tz] = ( Lft[tx][tz]*(Top[0][ty]*Piv[0]+Top[1][ty]*Piv[1]) + (-2.0*tx+1)*Lft[(tx+1)%2][tz]*(Top[0][ty]*Piv[1]-Top[1][ty]*Piv[0]) )/(Piv[0]*Piv[0] + Piv[1]*Piv[1]);

The only change is that the 2nd has = instead of -=.

The only problem is that I want to update the value of Buf[tx][ty][tz], I don’t want to replace it.

Hum: you are sure about this line?

shared double Buf[2][ReduceThreadsX][ReduceThreadsY];

Are you sure you launch your kernel with a blocksize of [font=“Courier New”](2, ReduceThreadsX, ReduceThreadsY)[/font]?

Indeed the is no need to use a shared variable here, a scalar local to each thread would do just fine.

Blocksize was correctly set, but I declared gridsize as (2, ReduceBlocksX, ReduceBlocksY), instead of (1, ReduceBlocksX, ReduceBlocksY). It’s then quite simple to understand why I sometimes had errors…

Buf, declared as shared gives no differences in result, but degrades performance of about 10%.

Thank you very much, your help was really worthwhile!