Why does this kernel "need" a __syncthreads() statement to work?

I’ve been dealing with what I think is a weird race condition in CUDA, specifically with the code shown below:

__global__ void getGandDGG(float *gg, float *dgg, float3 *xi, float3 *g, long N)
{
    int j = threadIdx.x + blockDim.x * blockIdx.x;
    int i = threadIdx.y + blockDim.y * blockIdx.y;

    float gg_T, gg_tau, gg_beta;
    float dgg_T, dgg_tau, dgg_beta;

    gg_T = g[N*i+j].x * g[N*i+j].x;
    gg_tau = g[N*i+j].y * g[N*i+j].y;
    gg_beta = g[N*i+j].z * g[N*i+j].z;

    dgg_T = (xi[N*i+j].x + g[N*i+j].x) * xi[N*i+j].x;
    dgg_tau = (xi[N*i+j].y + g[N*i+j].y) * xi[N*i+j].y;
    dgg_beta = (xi[N*i+j].z + g[N*i+j].z) * xi[N*i+j].z;

    __syncthreads();

    gg[N*i+j] = gg_T + gg_tau + gg_beta;

    dgg[N*i+j] = dgg_T + dgg_tau + dgg_beta;
}

If I remove the __syncthreads() statement, the sum-reduced value of dgg is inf. This means that one or more of the values in the float array is inf. However, if I keep the __syncthreads() statement, or if I add a printf() statement, the sum-reduced value of dgg is a finite float number.

When I call the kernel:

getGandDGG<<<numBlocksNN, threadsPerBlockNN>>>(device_gg_vector, device_dgg_vector, xi, device_g, N);
gpuErrchk(cudaDeviceSynchronize());

As far as I know, gpuErrchk(cudaDeviceSynchronize()); should block the CPU thread until all GPU threads have finished. So the kernel call without a __syncthreads() should work, since each thread reads from and writes to its own unique global index.
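For reference, the full pattern around the launch looks like this; as I understand it, adding a cudaGetLastError() check before the synchronize also catches launch-time errors (a sketch only, using the same names as above):

getGandDGG<<<numBlocksNN, threadsPerBlockNN>>>(device_gg_vector, device_dgg_vector, xi, device_g, N);
gpuErrchk(cudaGetLastError());        // catches errors at launch time (e.g. a bad configuration)
gpuErrchk(cudaDeviceSynchronize());   // blocks until the kernel finishes; catches execution errors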

Also, since the total data array is 512x512 and the grid exactly covers it, it should not be necessary to add an if statement to guard against out-of-bounds reads or writes.

For example, with threadsPerBlockNN.x = 16 and threadsPerBlockNN.y = 16, then
numBlocksNN.x = 512/16 and numBlocksNN.y = 512/16.
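In code, that launch configuration is (a sketch; the ceiling division is redundant here because 512 divides evenly by 16, but it makes the full-coverage intent explicit):

dim3 threadsPerBlockNN(16, 16);
// Ceiling division: for a 512x512 array this yields exactly 32x32 blocks,
// so every thread maps to a valid (i, j) and no bounds check is needed.
dim3 numBlocksNN((512 + threadsPerBlockNN.x - 1) / threadsPerBlockNN.x,
                 (512 + threadsPerBlockNN.y - 1) / threadsPerBlockNN.y);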

Since one or more values of dgg are inf, I tried to verify this by putting the following in the kernel in place of the __syncthreads():

if(i == 252 && j == 327){
    printf("ggs T: %f, tau: %f, beta: %f\n", gg_T, gg_tau, gg_beta);
    printf("dggs T: %f, tau: %f, beta: %f\n", dgg_T, dgg_tau, dgg_beta);
}

I expected one of these values to be inf, and I chose that 2D id because that is where I get nan in my image. However, when I added that code, the sum-reduced value of dgg wasn’t inf anymore; it was a finite float number…
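In hindsight, a more systematic check than probing a single index would be a sketch like this at the end of the kernel, flagging any thread that produces a non-finite result (isinf and isnan are available in device code):

float dgg_val = dgg_T + dgg_tau + dgg_beta;
if (isinf(dgg_val) || isnan(dgg_val)) {
    // Report the offending index instead of guessing one.
    printf("non-finite dgg at i=%d j=%d: %f %f %f\n", i, j, dgg_T, dgg_tau, dgg_beta);
}
dgg[N*i+j] = dgg_val;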

What am I missing? Why the difference?

Thanks.

Well, you asked this question on Stack Overflow, and you were advised that you needed to provide an MCVE.

I would suggest that you do that.

If your claim is accurate, then it shouldn’t take more than about 20 extra lines of code, at the most.

Ready…

Here it is:

Compile with nvcc…

I found that if I fwrite my arrays with the __syncthreads() statement in place, the arrays are all right, but when I do the fwrite without __syncthreads(), the arrays aren’t correct.
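To be concrete, the dump I mean is something like this (host_dgg and the filename are placeholders, not the exact code I ran):

// Copy the device result back and write it out as a raw float file.
float *host_dgg = (float*)malloc(M*N*sizeof(float));
gpuErrchk(cudaMemcpy(host_dgg, device_dgg_vector, M*N*sizeof(float), cudaMemcpyDeviceToHost));
FILE *fp_dgg = fopen("dgg.raw", "wb");
fwrite(host_dgg, sizeof(float), M*N, fp_dgg);
fclose(fp_dgg);
free(host_dgg);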

I don’t get any difference in output whether I run with or without __syncthreads();

The output is always this:

dgg: inf, gg: 413284564992.000000

So I see no evidence of your claim that __syncthreads() makes a difference.

When I use this modified version of your file:

$ cat main.cu
#include <stdio.h>
#include <assert.h>
#include <math.h>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

__host__ long NearestPowerOf2(long n)
{
  if (!n) return n;  //(0 == 2^0)

  int x = 1;
  while(x < n)
  {
      x <<= 1;
  }
  return x;
}

bool isPow2(unsigned int x)
{
    return ((x&(x-1))==0);
}

__global__ void getGGandDGG(float *gg, float *dgg, float3 *xi, float3 *g, long N)
{
  int j = threadIdx.x + blockDim.x * blockIdx.x;
  int i = threadIdx.y + blockDim.y * blockIdx.y;

  float gg_T, gg_tau, gg_beta;
  float dgg_T, dgg_tau, dgg_beta;

  gg_T = g[N*i+j].x * g[N*i+j].x;
  gg_tau = g[N*i+j].y * g[N*i+j].y;
  gg_beta = g[N*i+j].z * g[N*i+j].z;

  dgg_T = (xi[N*i+j].x + g[N*i+j].x) * xi[N*i+j].x;
  dgg_tau = (xi[N*i+j].y + g[N*i+j].y) * xi[N*i+j].y;
  dgg_beta = (xi[N*i+j].z + g[N*i+j].z) * xi[N*i+j].z;

  __syncthreads();
  /*if(i==252 && j == 327){
    printf("ggs T: %f, tau: %f, beta: %f\n", gg_T, gg_tau, gg_beta);
    printf("dggs T: %f, tau: %f, beta: %f\n", dgg_T, dgg_tau, dgg_beta);
  }*/

  gg[N*i+j] = gg_T + gg_tau + gg_beta;
  if (__isinf(gg[N*i+j])) {assert(0);};
  dgg[N*i+j] = dgg_T + dgg_tau + dgg_beta;
  if (__isinf(dgg[N*i+j])) {assert(0);};
}

template <bool nIsPow2>
__global__ void deviceReduceKernel(float *g_idata, float *g_odata, unsigned int n)
{
    extern __shared__ float sdata[];

    // perform first level of reduction,
    // reading from global memory, writing to shared memory
    unsigned int tid = threadIdx.x;
    unsigned int blockSize = blockDim.x;
    unsigned int i = blockIdx.x*blockSize*2 + threadIdx.x;
    unsigned int gridSize = blockSize*2*gridDim.x;

    float mySum = 0.f;

    // we reduce multiple elements per thread.  The number is determined by the
    // number of active thread blocks (via gridDim).  More blocks will result
    // in a larger gridSize and therefore fewer elements per thread
    while (i < n)
    {
        mySum += g_idata[i];

        // ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays
        if (nIsPow2 || i + blockSize < n)
            mySum += g_idata[i+blockSize];

        i += gridSize;
    }

    // each thread puts its local sum into shared memory
    sdata[tid] = mySum;
    __syncthreads();

// do reduction in shared mem
    if ((blockSize >= 512) && (tid < 256))
    {
        sdata[tid] = mySum = mySum + sdata[tid + 256];
    }

    __syncthreads();

    if ((blockSize >= 256) && (tid < 128))
    {
        sdata[tid] = mySum = mySum + sdata[tid + 128];
    }

    __syncthreads();

    if ((blockSize >= 128) && (tid <  64))
    {
        sdata[tid] = mySum = mySum + sdata[tid +  64];
    }

    __syncthreads();

#if (__CUDA_ARCH__ >= 300 )
    if ( tid < 32 )
    {
        // Fetch final intermediate sum from 2nd warp
        if (blockSize >=  64) mySum += sdata[tid + 32];
        // Reduce final warp using shuffle
        for (int offset = warpSize/2; offset > 0; offset /= 2)
        {
            mySum += __shfl_down(mySum, offset);
        }
    }
#else
    // fully unroll reduction within a single warp
    if ((blockSize >=  64) && (tid < 32))
    {
        sdata[tid] = mySum = mySum + sdata[tid + 32];
    }

    __syncthreads();

    if ((blockSize >=  32) && (tid < 16))
    {
        sdata[tid] = mySum = mySum + sdata[tid + 16];
    }

    __syncthreads();

    if ((blockSize >=  16) && (tid <  8))
    {
        sdata[tid] = mySum = mySum + sdata[tid +  8];
    }

    __syncthreads();

    if ((blockSize >=   8) && (tid <  4))
    {
        sdata[tid] = mySum = mySum + sdata[tid +  4];
    }

    __syncthreads();

    if ((blockSize >=   4) && (tid <  2))
    {
        sdata[tid] = mySum = mySum + sdata[tid +  2];
    }

    __syncthreads();

    if ((blockSize >=   2) && ( tid <  1))
    {
        sdata[tid] = mySum = mySum + sdata[tid +  1];
    }

    __syncthreads();
#endif

    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = mySum;
}

__host__ float deviceReduce(float *in, long N)
{
  float *device_out;
  gpuErrchk(cudaMalloc(&device_out, sizeof(float)*1024));
  gpuErrchk(cudaMemset(device_out, 0, sizeof(float)*1024));

  int threads = 512;
  int blocks = min((int(NearestPowerOf2(N)) + threads - 1) / threads, 1024);
  int smemSize = (threads <= 32) ? 2 * threads * sizeof(float) : threads * sizeof(float);

  bool isPower2 = isPow2(N);
  if(isPower2){
    deviceReduceKernel<true><<<blocks, threads, smemSize>>>(in, device_out, N);
    gpuErrchk(cudaDeviceSynchronize());
  }else{
    deviceReduceKernel<false><<<blocks, threads, smemSize>>>(in, device_out, N);
    gpuErrchk(cudaDeviceSynchronize());
  }

  float *h_odata = (float *) malloc(blocks*sizeof(float));
  float sum = 0.0;

  gpuErrchk(cudaMemcpy(h_odata, device_out, blocks * sizeof(float),cudaMemcpyDeviceToHost));
  for (int i=0; i<blocks; i++)
  {
    sum += h_odata[i];
  }
  cudaFree(device_out);
  free(h_odata);
  return sum;
}

__host__ int main(int argc, char **argv) {
  long M = 512;
  long N = 512;

  float *host_xi_T, *host_xi_tau, *host_xi_beta;
  float *host_g_T, *host_g_tau, *host_g_beta;

  host_xi_T = (float*)malloc(M*N*sizeof(float));
  host_xi_tau = (float*)malloc(M*N*sizeof(float));
  host_xi_beta = (float*)malloc(M*N*sizeof(float));

  host_g_T = (float*)malloc(M*N*sizeof(float));
  host_g_tau = (float*)malloc(M*N*sizeof(float));
  host_g_beta = (float*)malloc(M*N*sizeof(float));

  FILE *fp_xi_T = NULL;
  FILE *fp_xi_tau = NULL;
  FILE *fp_xi_beta = NULL;

  FILE *fp_g_T = NULL;
  FILE *fp_g_tau = NULL;
  FILE *fp_g_beta = NULL;

  fp_xi_T=fopen("xi_T.raw", "r");
  fp_xi_tau=fopen("xi_tau.raw", "r");
  fp_xi_beta=fopen("xi_beta.raw", "r");

  fp_g_T=fopen("g_T.raw", "r");
  fp_g_tau=fopen("g_tau.raw", "r");
  fp_g_beta=fopen("g_beta.raw", "r");

  fread(host_xi_T, sizeof(float), M*N, fp_xi_T);
  fread(host_xi_tau, sizeof(float), M*N, fp_xi_tau);
  fread(host_xi_beta, sizeof(float), M*N, fp_xi_beta);

  fread(host_g_T, sizeof(float), M*N, fp_g_T);
  fread(host_g_tau, sizeof(float), M*N, fp_g_tau);
  fread(host_g_beta, sizeof(float), M*N, fp_g_beta);

  float3 *host_xi = (float3*)malloc(M*N*sizeof(float3));
  float3 *host_g = (float3*)malloc(M*N*sizeof(float3));

  for(int m=0;m<M*N;m++){
    host_xi[m].x = host_xi_T[m];
    assert(!isinf(host_xi[m].x));
    host_xi[m].y = host_xi_tau[m];
    assert(!isinf(host_xi[m].y));
    host_xi[m].z = host_xi_beta[m];
    assert(!isinf(host_xi[m].z));

    host_g[m].x = host_g_T[m];
    assert(!isinf(host_g[m].x));
    host_g[m].y = host_g_tau[m];
    assert(!isinf(host_g[m].y));
    host_g[m].z = host_g_beta[m];
    assert(!isinf(host_g[m].z));
  }

  fclose(fp_xi_T);
  fclose(fp_xi_tau);
  fclose(fp_xi_beta);

  fclose(fp_g_T);
  fclose(fp_g_tau);
  fclose(fp_g_beta);

  free(host_xi_T);
  free(host_xi_tau);
  free(host_xi_beta);

  free(host_g_T);
  free(host_g_tau);
  free(host_g_beta);

  float3 *device_g, *xi;

  gpuErrchk(cudaMalloc((void**)&device_g, sizeof(float3)*M*N));
  gpuErrchk(cudaMemset(device_g, 0, sizeof(float3)*M*N));
  gpuErrchk(cudaMalloc((void**)&xi, sizeof(float3)*M*N));
  gpuErrchk(cudaMemset(xi, 0, sizeof(float3)*M*N));

  gpuErrchk(cudaMemcpy2D(xi, sizeof(float3), host_xi, sizeof(float3), sizeof(float3), M*N, cudaMemcpyHostToDevice));

  gpuErrchk(cudaMemcpy2D(device_g, sizeof(float3), host_g, sizeof(float3), sizeof(float3), M*N, cudaMemcpyHostToDevice));

  ///////////////////vectors for gg and dgg////////////////////
  float *device_gg_vector, *device_dgg_vector;

  //////////////////////////////////GPU MEMORY///////////////
  gpuErrchk(cudaMalloc((void**)&device_gg_vector, sizeof(float)*M*N));
  gpuErrchk(cudaMemset(device_gg_vector, 0, sizeof(float)*M*N));

  gpuErrchk(cudaMalloc((void**)&device_dgg_vector, sizeof(float)*M*N));
  gpuErrchk(cudaMemset(device_dgg_vector, 0, sizeof(float)*M*N));

  dim3 threadsPerBlockNN(16, 16);
  dim3 numBlocksNN(M/16, N/16);

  float gg, dgg;

  dgg = gg = 0.0;

  getGGandDGG<<<numBlocksNN, threadsPerBlockNN>>>(device_gg_vector, device_dgg_vector, xi, device_g, N);
  gpuErrchk(cudaDeviceSynchronize());

  gg = deviceReduce(device_gg_vector, M*N);
  dgg = deviceReduce(device_dgg_vector, M*N);

  printf("dgg: %f, gg: %f\n", dgg, gg);
}

I get this output:

$ nvcc -o main main.cu
nvcc warning : The 'compute_20', 'sm_20', and 'sm_21' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).
$ ./main
main: main.cu:259: int main(int, char**): Assertion `!isinf(host_xi[m].y)' failed.
Aborted (core dumped)
$

So you are reading in an inf value from one of your raw data files, and that inf value is propagating through to the output.
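If you want to locate the bad value(s), a minimal host-side scan right after the corresponding fread would do. For example (a sketch, shown for host_xi_tau since the failed assertion was on the .y component):

// Report every non-finite element read from the raw file.
for (long m = 0; m < M*N; m++) {
    if (isinf(host_xi_tau[m]) || isnan(host_xi_tau[m])) {
        printf("non-finite value in xi_tau.raw at index %ld: %f\n", m, host_xi_tau[m]);
    }
}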