NVCC bug? Unspecified launch failure

I am implementing a CUDA kernel for a 12x12 (complex) matrix inversion by LDL_dag decomposition.

The code is production-proven: it has been verified to work on CPU architectures.

Right now I am seeing strange behaviour, and my current conclusion is that it may be a bug in the NVCC compiler chain.

I reproduced the error with a reduced setup. Sorry for the 280 lines.

With CUDA 4.2 if I compile it with

nvcc -arch sm_20 -o main main.cu

and run it on a Fermi card on Linux 64 I get:

kernel call: unspecified launch failure

In the kernel body there is a #if switch. If I set it to #if 0 (so that the alternate if-block gets used) the error disappears. But the code executed should be the same/equivalent in either case. (hasOrderedRep is true!!)

#include "cuda.h"

#include "stdio.h"

#include <iostream>

using namespace std;

/////////////////////////////////////

//

// First some basic types I need

// Implementation of a templated

// scalar and complex number

// Thin device-side wrapper around a single value of type T, exposing it
// through elem(). All member functions are __device__-only.
template<class T> class RScalar
{
public:
  __device__ RScalar() {}

  __device__ ~RScalar() {}

  // Converting copy from an RScalar with a (possibly) different
  // underlying type.
  template<class T1> __device__
  RScalar(const RScalar<T1>& rhs) : F(rhs.elem()) {}

  // Implicit conversion from a raw value.
  template<class T1> __device__
  RScalar(const T1& rhs) : F(rhs) {}

  // Converting assignment from another RScalar.
  template<class T1> __device__ inline
  RScalar& operator=(const RScalar<T1>& rhs)
  {
    F = rhs.elem();
    return *this;
  }

public:
  // Mutable / read-only access to the wrapped value.
  __device__ T& elem() {return F;}
  __device__ const T& elem() const {return F;}

private:
  T F;  // wrapped value; note the default ctor leaves it uninitialized
};

// Device-side complex number with real and imaginary parts of type T.
// Companion to RScalar; all member functions are __device__-only.
template<class T> class RComplex
{
public:
  __device__ RComplex() {}

  __device__ ~RComplex() {}

  // Build from two RScalar parts (possibly of different underlying types).
  template<class T1, class T2> __device__
  RComplex(const RScalar<T1>& _re, const RScalar<T2>& _im)
    : re(_re.elem()), im(_im.elem()) {}

  // Build from two raw values.
  template<class T1, class T2> __device__
  RComplex(const T1& _re, const T2& _im) : re(_re), im(_im) {}

  // Build a purely real value; the imaginary part is value-initialized.
  template<class T1> __device__
  RComplex(const T1& _re) : re(_re), im() {}

  // Scale both components by a real scalar.
  template<class T1> __device__ inline
  RComplex& operator*=(const RScalar<T1>& rhs)
  {
    re *= rhs.elem();
    im *= rhs.elem();
    return *this;
  }

  // Component-wise subtraction.
  template<class T1> __device__ inline
  RComplex& operator-=(const RComplex<T1>& rhs)
  {
    re -= rhs.real();
    im -= rhs.imag();
    return *this;
  }

  // Division by another complex number; delegates to the free operator/.
  template<class T1> __device__ inline
  RComplex& operator/=(const RComplex<T1>& rhs)
  {
    RComplex<T> q;
    q = *this / rhs;
    re = q.real();
    im = q.imag();
    return *this;
  }

public:
  // Mutable / read-only access to the components.
  __device__ T& real() {return re;}
  __device__ const T& real() const {return re;}
  __device__ T& imag() {return im;}
  __device__ const T& imag() const {return im;}

private:
  T re;  // real part
  T im;  // imaginary part
};

// Complex multiplication: (a+bi)(c+di) = (ac - bd) + (ad + bc)i.
template<class T> __device__ RComplex<T>
operator*(const RComplex<T>& __restrict__ l,
          const RComplex<T>& __restrict__ r)
{
  const T out_re = l.real()*r.real() - l.imag()*r.imag();
  const T out_im = l.real()*r.imag() + l.imag()*r.real();
  return RComplex<T>(out_re, out_im);
}

// Complex division via multiplication by the conjugate:
// l / r = l * conj(r) / |r|^2.
// NOTE(review): no guard against |r|^2 == 0 (division by zero yields
// inf/NaN); callers are expected to pass a nonzero divisor.
template<class T> __device__ RComplex<T>
operator/(const RComplex<T>& l, const RComplex<T>& r)
{
  const T inv_norm = T(1.0) / (r.real()*r.real() + r.imag()*r.imag());
  const T out_re = (l.real()*r.real() + l.imag()*r.imag()) * inv_norm;
  const T out_im = (l.imag()*r.real() - l.real()*r.imag()) * inv_norm;
  return RComplex<T>(out_re, out_im);
}

// Multiply a complex number by a real scalar, component-wise.
template<class T> __device__ RComplex<T>
operator*(const RComplex<T>& l, const RScalar<T>& r)
{
  const T s = r.elem();
  return RComplex<T>(l.real()*s, l.imag()*s);
}

//

//

//////////////////////////////////////////////

#define REALT float   // underlying real type used throughout

#define Nc 3          // number of colors; per-block matrices are 2*Nc x 2*Nc = 6x6

// Packed storage for two 6x6 complex matrices per site (one per 'block'):
// 'diag' holds the 2*Nc real diagonal entries, 'offd' the
// 2*Nc*Nc - Nc = 15 strictly-lower-triangular complex entries, packed so
// that element (j,i) with j > i sits at index j*(j-1)/2 + i.
struct PrimitiveClovTriang
{
  RScalar<REALT>   diag[2][2*Nc];
  RComplex<REALT>  offd[2][2*Nc*Nc-Nc];
};

// Per-site kernel: for each lattice site, performs an in-place LDL^dag
// decomposition and inversion of the two packed 6x6 matrices stored in
// tri[site] (real diagonal in 'diag', packed lower triangle in 'offd'),
// writing the inverse back over the input.
//
// NOTE(review): there is no bounds check on 'site' against the number of
// allocated sites -- the launch configuration must guarantee that
// gridDim * blockDim equals the site count, otherwise tri[site] is an
// out-of-bounds access.
__global__ void kernel(bool hasOrderedRep, int * siteTable, 
		       PrimitiveClovTriang* tri)
{
  RScalar<REALT> zip=0;   // reusable zero for building purely-real complexes
  int N = 2*Nc;           // matrix dimension (6)
  int site;               // lattice site handled by this thread
  //
  // First basic block results in an error,
  // second, runs fine! Since hasOrderedRep
  // is true, the code blocks should be
  // identical.
  //
#if 1
  if (hasOrderedRep) {
    // Ordered representation: site index comes directly from the
    // thread/block coordinates.
    site = blockDim.x * blockIdx.x + 
      blockDim.x * gridDim.x * blockIdx.y + 
      threadIdx.x;
  } else {
    // Unordered representation: indirect lookup through siteTable.
    int idx0 = blockDim.x * blockIdx.x + 
      blockDim.x * gridDim.x * blockIdx.y + 
      threadIdx.x;
    site = ((int*)(siteTable))[idx0];
  }
#else
  site = blockDim.x * blockIdx.x + blockDim.x * gridDim.x * blockIdx.y + threadIdx.x;
#endif
  int site_neg_logdet=0;  // counts negative diagonal entries (determinant sign)
  for(int block=0; block < 2; block++) { 
    // Thread-local working copies of this block's matrix.
    RScalar<REALT> inv_d[6];      // diagonal D of the factorization / inverse
    RComplex<REALT> inv_offd[15]; // packed lower-triangular part L
    RComplex<REALT> v[6];         // scratch column vector
    RScalar<REALT>  diag_g[6];    // reciprocals 1/D(i,i)
    // Load the site's matrix from global memory.
    for(int i=0; i < N; i++) { 
      inv_d[i] = tri[site].diag[block][i];
    }
    for(int i=0; i < 15; i++) { 
      inv_offd[i]  =tri[site].offd[block][i];
    }
    // --- In-place LDL^dag decomposition ---
    // Packed index convention: element (j,i), j > i, lives at j*(j-1)/2 + i.
    for(int j=0; j < N; ++j) { 
      // v[i] = D(i,i) * conj(L(j,i)) for i < j.
      for(int i=0; i < j; i++) { 
	int elem_ji = j*(j-1)/2 + i;
	RComplex<REALT> A_ii = RComplex<REALT>( inv_d[i], zip );
	v[i] = A_ii*RComplex<REALT>(inv_offd[elem_ji].real(),-inv_offd[elem_ji].imag());
      }
      // D(j,j) = A(j,j) - sum_{k<j} L(j,k) * v[k]; only the real part is kept.
      v[j] = RComplex<REALT>(inv_d[j],zip);
      for(int k=0; k < j; k++) { 
	int elem_jk = j*(j-1)/2 + k;
	v[j] -= inv_offd[elem_jk]*v[k];
      }
      inv_d[j].elem() = v[j].real();
      // Update column j of L for the remaining rows k > j.
      for(int k=j+1; k < N; k++) { 
	int elem_kj = k*(k-1)/2 + j;
	for(int l=0; l < j; l++) { 
	  int elem_kl = k*(k-1)/2 + l;
	  inv_offd[elem_kj] -= inv_offd[elem_kl] * v[l];
	}
	inv_offd[elem_kj] /= v[j];
      }
    }
    // Reciprocal of the diagonal; also track the determinant's sign.
    // NOTE(review): no guard against a zero diagonal entry (divide by zero).
    RScalar<REALT> one;
    one.elem() = (REALT)1;
    for(int i=0; i < N; i++) { 
      diag_g[i].elem() = one.elem()/inv_d[i].elem();
      //      ((PScalar<PScalar<RScalar<float> > > *)(args->dev_ptr[ 1 ] ))[site] .elem().elem().elem() += log(fabs(inv_d[i].elem()));
      if( inv_d[i].elem() < 0 ) { 
	site_neg_logdet++;
      }
    }
    // --- Build the inverse column by column via forward/backward solves ---
    RComplex<REALT> sum;  // NOTE(review): declared but never used
    for(int k = 0; k < N; ++k) {
      // Start from the k-th unit column scaled by 1/D(k,k).
      for(int i = 0; i < k; ++i) {
	v[i].real()=v[i].imag()=0;
      }
      v[k] = RComplex<REALT>(diag_g[k],zip);
      // Forward substitution with L, rescaling by 1/D as we go.
      for(int i = k+1; i < N; ++i) {
	v[i].real()=v[i].imag()=0;
	for(int j = k; j < i; ++j) {
	  int elem_ij = i*(i-1)/2+j;	
	  v[i] -= inv_offd[elem_ij] *inv_d[j]*v[j];
	}
	v[i] *= diag_g[i];
      }
      // Backward substitution with L^dag (conjugated packed entries).
      for(int i = N-2; (int)i >= (int)k; --i) {
	for(int j = i+1; j < N; ++j) {
	  int elem_ji = j*(j-1)/2 + i;
	  v[i] -= RComplex<REALT>(inv_offd[elem_ji].real(),-inv_offd[elem_ji].imag()) * v[j];
	}
      }
      // Store column k of the inverse back into the packed layout.
      inv_d[k].elem() = v[k].real();
      for(int i = k+1; i < N; ++i) {
	int elem_ik = i*(i-1)/2+k;
	inv_offd[elem_ik] = v[i];
      }
    }
    // Write the inverted matrix back to global memory.
    for(int i=0; i < N; i++) { 
      tri[site].diag[block][i] = inv_d[i];
    }
    for(int i=0; i < 15; i++) { 
      tri[site].offd[block][i] = inv_offd[i];
    }
  }
  // Placeholder kept from the full application (it records the determinant
  // sign there); intentionally empty in this reduced repro.
  if( site_neg_logdet != 0 ) { 
  }
}

// Host driver for the reduced repro: allocates one site's worth of device
// data, launches the kernel with hasOrderedRep = true, and reports the
// resulting CUDA error status. Returns 0 on success, 1 on any CUDA error.
int main()
{
  const int sites = 1;

  dim3 blocksPerGrid( 1 , 1 , 1 );
  dim3 threadsPerBlock( sites , 1, 1);   // one thread per site

  PrimitiveClovTriang* tri_dev = 0;
  int * siteTable = 0;

  cudaError_t err;

  err = cudaMalloc( (void**)&tri_dev , sizeof(PrimitiveClovTriang) * sites );
  if (err != cudaSuccess) {
    cout << "cudaMalloc(tri_dev): " << cudaGetErrorString(err) << endl;
    return 1;
  }

  err = cudaMalloc( (void**)&siteTable , sizeof(int) * sites );
  if (err != cudaSuccess) {
    cout << "cudaMalloc(siteTable): " << cudaGetErrorString(err) << endl;
    cudaFree(tri_dev);
    return 1;
  }

  // Zero both buffers so the kernel reads defined data: tri_dev is read
  // unconditionally, and siteTable is read on the !hasOrderedRep path.
  cudaMemset( tri_dev , 0 , sizeof(PrimitiveClovTriang) * sites );
  cudaMemset( siteTable , 0 , sizeof(int) * sites );

  bool ord=true;

  kernel<<< blocksPerGrid , threadsPerBlock , 0 >>>( ord , siteTable , tri_dev );

  // The launch is asynchronous: synchronize first so that execution errors
  // ("unspecified launch failure") are surfaced by cudaGetLastError.
  cudaDeviceSynchronize();
  cudaError_t kernel_call = cudaGetLastError();

  cout << "kernel call: " << string(cudaGetErrorString(kernel_call)) << endl;

  cudaFree(tri_dev);
  cudaFree(siteTable);

  return (kernel_call == cudaSuccess) ? 0 : 1;
}

Update:

I just tried on a different machine which has CUDA 4.0 installed. There the error does not appear (same Fermi card model). It might really be an NVCC bug in CUDA 4.2. Since they switched to LLVM starting with CUDA 4.1, this is likely a bug in the new compiler suite.

We have had a similar experience.

There exist a dozen of kernels developed by us with version 3.x of CUDA.
When compiled with the nvcc in CUDA 4.2, some of the kernels produced bad results while others are good.
Then we tried the nvcc in CUDA 4.1; the compiled kernels produced exactly the same bad results as 4.2.

Finally we tried nvcc in CUDA 5.0 preview, all results are good now.