cusolverDnSgeqrf

I am trying to use cusolverDnSgeqrf in CUDA 7.0. The code below always returns CUSOLVER_STATUS_EXECUTION_FAILED. Can someone please explain what I am doing wrong? Also, is there any example code for the new cuSOLVER capabilities? I can't seem to find any.

OS : Ubuntu 14.04
Driver : 346.46
GPU : 690

void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) { exit(code); }
   }
}
// Checks the result of a CUDA runtime call via gpuAssert. This is a macro
// rather than a function so that __FILE__ and __LINE__ expand at the call
// site; as a function every failure was reported at this helper's own line.
// The do/while(0) wrapper makes `gpuErrchk(x);` behave as a single statement.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

int main()
{
  cusolverStatus_t status;

  cusolverDnHandle_t solver_handle;
  cusolverDnCreate(&solver_handle);

  int M = 4;
  int N = 3;
  int minMN = min(M,N);

  float *h_A = (float *)malloc(M * N * sizeof(float));

  h_A[0] = 1.0f;
  h_A[1] = 4.0f;
  h_A[2] = 7.0f;
  h_A[3] = 10.0f;

  h_A[4] = 2.0f;
  h_A[5] = 5.0f;
  h_A[6] = 8.0f;
  h_A[7] = 11.0f;

  h_A[8] = 3.0f;
  h_A[9] = 6.0f;
  h_A[10] = 9.0f;
  h_A[11] = 12.0f;

  float *d_A;
  gpuErrchk(cudaMalloc(&d_A, M * N * sizeof(float)));
  gpuErrchk(cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice));

  int work_size = 0;
  status = cusolverDnSgeqrf_bufferSize(solver_handle, M, N, d_A, M, &work_size);
  cout << "status = " << status << endl;
  cout << "work_size = " << work_size << endl;
  if (status != CUSOLVER_STATUS_SUCCESS )
    cout << "Initialization of cuSolver failed." << endl;

  int devInfo = 0;
  float* TAU;
  gpuErrchk(cudaMalloc(&TAU, minMN * sizeof(float)));
  float* Workspace;
  gpuErrchk(cudaMalloc(&Workspace, work_size * sizeof(float)));
  status = cusolverDnSgeqrf(solver_handle, M, N, d_A, M, TAU, Workspace, work_size, &devInfo);
  cout << "devInfo = " << devInfo << endl;
  cout << "status = " << status << endl;
  switch(status)
  {
      case CUSOLVER_STATUS_SUCCESS:
        cout << "SVD computation success" << endl;
        break;
      case CUSOLVER_STATUS_NOT_INITIALIZED :
        cout << "Library cuSolver not initialized correctly" << endl;
        break;
      case CUSOLVER_STATUS_INVALID_VALUE:
        cout << "Invalid parameters passed" << endl;
        break;
      case CUSOLVER_STATUS_INTERNAL_ERROR:
        cout << "Internal operation failed" << endl;
        break;
      case CUSOLVER_STATUS_EXECUTION_FAILED:
        cout << "Execution failed" << endl;
        break;
  }

  cudaDeviceSynchronize();

  float* h_U = (float *)malloc(M * N * sizeof(float));
  gpuErrchk(cudaMemcpy(h_U, d_A, M * N * sizeof(float), cudaMemcpyDeviceToHost));
  
  cusolverDnDestroy(solver_handle);

  return 0;
}

Bah. Never mind. I had devInfo wrong: the last argument of cusolverDnSgeqrf must point to device memory (an int allocated with cudaMalloc), not to a host variable. It is working now.