Problem with cusparseXcoo2csr and cuda-gdb

I’m trying to convert a COO matrix with ~200,000 elements to CSR format. The code I use is:

#include<iostream>

#include<cstdlib>

#include<cmath>

#include<iomanip>

#include"hamiltonian.h" //my header for code which makes the matrix

#include"cuda.h"

#include"cuda_runtime.h"

#include"cublas_v2.h"

#include"cusparse_v2.h"

#include"cuComplex.h"

using namespace std;

/*
 * Builds a CSR row-pointer array on the device for each of `how_many`
 * COO matrices by calling cusparseXcoo2csr on their row-index arrays.
 *
 * Parameters:
 *   how_many    - number of matrices in `Hamiltonian` / entries in `num_Elem`.
 *   num_Elem    - host array; num_Elem[i] is the number of stored elements
 *                 of matrix i.
 *   Hamiltonian - array of matrix structs; this function reads `.sectordim`
 *                 (matrix dimension) and `.rows` (COO row indices) of each.
 *                 NOTE(review): cusparseXcoo2csr requires `.rows` to be a
 *                 DEVICE pointer whose entries are sorted by row - confirm
 *                 against the code that fills the struct.
 *   max_Iter, num_Eig, conv_req - not used by the code in this function yet.
 *
 * The first failure is reported on std::cout and stops further work. Every
 * resource created here (handles, descriptors, device buffers) is released
 * before returning, because the function has no way of handing them to the
 * caller. If more work is added that consumes the row pointers, it belongs
 * before the cleanup section at the bottom.
 */
__host__ void lanczos(const int how_many, const int* num_Elem, d_hamiltonian*& Hamiltonian, int max_Iter, const int num_Eig, const double conv_req)
{
  (void)max_Iter;
  (void)num_Eig;
  (void)conv_req;

  if (how_many <= 0 || num_Elem == NULL || Hamiltonian == NULL)
  {
    std::cout<<"lanczos: no matrices to process"<<std::endl;
    return;
  }

  bool ok = true;

  cublasHandle_t linalghandle = NULL;
  cusparseHandle_t sparsehandle = NULL;

  // calloc so that every slot starts out NULL; the cleanup code below relies
  // on that to know which entries were actually created.
  cusparseMatDescr_t* H_descr = (cusparseMatDescr_t*)calloc(how_many, sizeof(cusparseMatDescr_t));
  int** d_H_rowptrs = (int**)calloc(how_many, sizeof(int*));

  if (H_descr == NULL || d_H_rowptrs == NULL)
  {
    std::cout<<"lanczos: host allocation failed"<<std::endl;
    ok = false;
  }

  if (ok)
  {
    cublasStatus_t cublas_status = cublasCreate(&linalghandle);
    if (cublas_status != CUBLAS_STATUS_SUCCESS)
    {
      std::cout<<"Initializing CUBLAS failed! Error: "<<cublas_status<<std::endl;
      linalghandle = NULL;
      ok = false;
    }
  }

  if (ok)
  {
    cusparseStatus_t cusparse_status = cusparseCreate(&sparsehandle);
    if (cusparse_status != CUSPARSE_STATUS_SUCCESS)
    {
      std::cout<<"Failed to initialize CUSPARSE! Error: "<<cusparse_status<<std::endl;
      sparsehandle = NULL;
      ok = false;
    }
  }

  // One general, zero-based descriptor per matrix.
  for (int i = 0; ok && i < how_many; i++)
  {
    cusparseStatus_t cusparse_status = cusparseCreateMatDescr(&H_descr[i]);
    if (cusparse_status != CUSPARSE_STATUS_SUCCESS)
    {
      std::cout<<"Error creating matrix description: "<<cusparse_status<<std::endl;
      H_descr[i] = NULL;
      ok = false;
      break;
    }

    cusparse_status = cusparseSetMatType(H_descr[i], CUSPARSE_MATRIX_TYPE_GENERAL);
    if (cusparse_status != CUSPARSE_STATUS_SUCCESS)
    {
      std::cout<<"Error setting matrix type: "<<cusparse_status<<std::endl;
      ok = false;
      break;
    }

    cusparse_status = cusparseSetMatIndexBase(H_descr[i], CUSPARSE_INDEX_BASE_ZERO);
    if (cusparse_status != CUSPARSE_STATUS_SUCCESS)
    {
      std::cout<<"Error setting matrix index base: "<<cusparse_status<<std::endl;
      ok = false;
      break;
    }
  }

  if (ok)
  {
    cout<<"Done creating descriptions"<<endl;
  }

  for (int i = 0; ok && i < how_many; i++)
  {
    const int dim = Hamiltonian[i].sectordim; //pull dimension of matrix out of custom struct

    if (dim < 0 || num_Elem[i] < 0)
    {
      std::cout<<"lanczos: invalid size for matrix "<<i<<std::endl;
      ok = false;
      break;
    }

    // A CSR row-pointer array has one entry per row plus a final end marker.
    cudaError_t status = cudaMalloc((void**)&d_H_rowptrs[i], ((size_t)dim + 1)*sizeof(int));
    if (status != cudaSuccess)
    {
      std::cout<<"Error allocating d_H_rowptrs: "<<cudaGetErrorString(status)<<std::endl;
      d_H_rowptrs[i] = NULL;
      ok = false;
      break;
    }

    cusparseStatus_t cusparse_status = cusparseXcoo2csr(sparsehandle, Hamiltonian[i].rows, num_Elem[i], dim, d_H_rowptrs[i], CUSPARSE_INDEX_BASE_ZERO);
    if (cusparse_status != CUSPARSE_STATUS_SUCCESS)
    {
      std::cout<<"Error converting to CSR: "<<cusparse_status<<std::endl;
      ok = false;
      break;
    }
  }

  if (ok)
  {
    // The conversion runs asynchronously; synchronizing here makes a fault
    // inside it (e.g. a host pointer passed as row indices) show up at this
    // point instead of at some unrelated later call.
    cudaError_t status = cudaDeviceSynchronize();
    if (status != cudaSuccess)
    {
      std::cout<<"Error executing CSR conversion: "<<cudaGetErrorString(status)<<std::endl;
    }
  }

  // ---- cleanup: runs on success and on every failure path ----
  if (d_H_rowptrs != NULL)
  {
    for (int i = 0; i < how_many; i++)
    {
      if (d_H_rowptrs[i] != NULL)
      {
        cudaFree(d_H_rowptrs[i]);
      }
    }
    free(d_H_rowptrs);
  }

  if (H_descr != NULL)
  {
    for (int i = 0; i < how_many; i++)
    {
      if (H_descr[i] != NULL)
      {
        cusparseDestroyMatDescr(H_descr[i]);
      }
    }
    free(H_descr);
  }

  if (sparsehandle != NULL)
  {
    cusparseDestroy(sparsehandle);
  }

  if (linalghandle != NULL)
  {
    cublasDestroy(linalghandle);
  }
}

When I try to run this, I get a CUSPARSE_STATUS_EXECUTION_FAILED error. I compiled it (this function is part of a larger project, but I am sure the matrix I am passing it is correct) using nvcc -w -g -G -gencode arch=compute_20,code=sm_21 -lcublas -lcusparse lanczos.cu

When I try to examine the problem in cuda-gdb, I get

warning: no loadable sections found in added symbol-file /tmp/cuda-dbg/27861/elf.76ffb0.8895a0.o.FuXAwf

warning: no loadable sections found in added symbol-file /tmp/cuda-dbg/27861/elf.76ffb0.88bb50.o.t0xXIx

warning: no loadable sections found in added symbol-file /tmp/cuda-dbg/27861/elf.76ffb0.a70a60.o.ugEDcJ

warning: no loadable sections found in added symbol-file /tmp/cuda-dbg/27861/elf.76ffb0.b3e0d0.o.imoyDd

I’m running Ubuntu 10.10 x86_64 with CUDA 4.1 and 2.6.35-31-generic as my kernel. I’m trying to test this on a 560Ti.

Any idea what’s wrong?

I’m trying to convert a COO matrix with ~200,000 elements to CSR format. The code I use is:

#include<iostream>

#include<cstdlib>

#include<cmath>

#include<iomanip>

#include"hamiltonian.h" //my header for code which makes the matrix

#include"cuda.h"

#include"cuda_runtime.h"

#include"cublas_v2.h"

#include"cusparse_v2.h"

#include"cuComplex.h"

using namespace std;

/*
 * Builds a CSR row-pointer array on the device for each of `how_many`
 * COO matrices by calling cusparseXcoo2csr on their row-index arrays.
 *
 * Parameters:
 *   how_many    - number of matrices in `Hamiltonian` / entries in `num_Elem`.
 *   num_Elem    - host array; num_Elem[i] is the number of stored elements
 *                 of matrix i.
 *   Hamiltonian - array of matrix structs; this function reads `.sectordim`
 *                 (matrix dimension) and `.rows` (COO row indices) of each.
 *                 NOTE(review): cusparseXcoo2csr requires `.rows` to be a
 *                 DEVICE pointer whose entries are sorted by row - confirm
 *                 against the code that fills the struct.
 *   max_Iter, num_Eig, conv_req - not used by the code in this function yet.
 *
 * The first failure is reported on std::cout and stops further work. Every
 * resource created here (handles, descriptors, device buffers) is released
 * before returning, because the function has no way of handing them to the
 * caller. If more work is added that consumes the row pointers, it belongs
 * before the cleanup section at the bottom.
 */
__host__ void lanczos(const int how_many, const int* num_Elem, d_hamiltonian*& Hamiltonian, int max_Iter, const int num_Eig, const double conv_req)
{
  (void)max_Iter;
  (void)num_Eig;
  (void)conv_req;

  if (how_many <= 0 || num_Elem == NULL || Hamiltonian == NULL)
  {
    std::cout<<"lanczos: no matrices to process"<<std::endl;
    return;
  }

  bool ok = true;

  cublasHandle_t linalghandle = NULL;
  cusparseHandle_t sparsehandle = NULL;

  // calloc so that every slot starts out NULL; the cleanup code below relies
  // on that to know which entries were actually created.
  cusparseMatDescr_t* H_descr = (cusparseMatDescr_t*)calloc(how_many, sizeof(cusparseMatDescr_t));
  int** d_H_rowptrs = (int**)calloc(how_many, sizeof(int*));

  if (H_descr == NULL || d_H_rowptrs == NULL)
  {
    std::cout<<"lanczos: host allocation failed"<<std::endl;
    ok = false;
  }

  if (ok)
  {
    cublasStatus_t cublas_status = cublasCreate(&linalghandle);
    if (cublas_status != CUBLAS_STATUS_SUCCESS)
    {
      std::cout<<"Initializing CUBLAS failed! Error: "<<cublas_status<<std::endl;
      linalghandle = NULL;
      ok = false;
    }
  }

  if (ok)
  {
    cusparseStatus_t cusparse_status = cusparseCreate(&sparsehandle);
    if (cusparse_status != CUSPARSE_STATUS_SUCCESS)
    {
      std::cout<<"Failed to initialize CUSPARSE! Error: "<<cusparse_status<<std::endl;
      sparsehandle = NULL;
      ok = false;
    }
  }

  // One general, zero-based descriptor per matrix.
  for (int i = 0; ok && i < how_many; i++)
  {
    cusparseStatus_t cusparse_status = cusparseCreateMatDescr(&H_descr[i]);
    if (cusparse_status != CUSPARSE_STATUS_SUCCESS)
    {
      std::cout<<"Error creating matrix description: "<<cusparse_status<<std::endl;
      H_descr[i] = NULL;
      ok = false;
      break;
    }

    cusparse_status = cusparseSetMatType(H_descr[i], CUSPARSE_MATRIX_TYPE_GENERAL);
    if (cusparse_status != CUSPARSE_STATUS_SUCCESS)
    {
      std::cout<<"Error setting matrix type: "<<cusparse_status<<std::endl;
      ok = false;
      break;
    }

    cusparse_status = cusparseSetMatIndexBase(H_descr[i], CUSPARSE_INDEX_BASE_ZERO);
    if (cusparse_status != CUSPARSE_STATUS_SUCCESS)
    {
      std::cout<<"Error setting matrix index base: "<<cusparse_status<<std::endl;
      ok = false;
      break;
    }
  }

  if (ok)
  {
    cout<<"Done creating descriptions"<<endl;
  }

  for (int i = 0; ok && i < how_many; i++)
  {
    const int dim = Hamiltonian[i].sectordim; //pull dimension of matrix out of custom struct

    if (dim < 0 || num_Elem[i] < 0)
    {
      std::cout<<"lanczos: invalid size for matrix "<<i<<std::endl;
      ok = false;
      break;
    }

    // A CSR row-pointer array has one entry per row plus a final end marker.
    cudaError_t status = cudaMalloc((void**)&d_H_rowptrs[i], ((size_t)dim + 1)*sizeof(int));
    if (status != cudaSuccess)
    {
      std::cout<<"Error allocating d_H_rowptrs: "<<cudaGetErrorString(status)<<std::endl;
      d_H_rowptrs[i] = NULL;
      ok = false;
      break;
    }

    cusparseStatus_t cusparse_status = cusparseXcoo2csr(sparsehandle, Hamiltonian[i].rows, num_Elem[i], dim, d_H_rowptrs[i], CUSPARSE_INDEX_BASE_ZERO);
    if (cusparse_status != CUSPARSE_STATUS_SUCCESS)
    {
      std::cout<<"Error converting to CSR: "<<cusparse_status<<std::endl;
      ok = false;
      break;
    }
  }

  if (ok)
  {
    // The conversion runs asynchronously; synchronizing here makes a fault
    // inside it (e.g. a host pointer passed as row indices) show up at this
    // point instead of at some unrelated later call.
    cudaError_t status = cudaDeviceSynchronize();
    if (status != cudaSuccess)
    {
      std::cout<<"Error executing CSR conversion: "<<cudaGetErrorString(status)<<std::endl;
    }
  }

  // ---- cleanup: runs on success and on every failure path ----
  if (d_H_rowptrs != NULL)
  {
    for (int i = 0; i < how_many; i++)
    {
      if (d_H_rowptrs[i] != NULL)
      {
        cudaFree(d_H_rowptrs[i]);
      }
    }
    free(d_H_rowptrs);
  }

  if (H_descr != NULL)
  {
    for (int i = 0; i < how_many; i++)
    {
      if (H_descr[i] != NULL)
      {
        cusparseDestroyMatDescr(H_descr[i]);
      }
    }
    free(H_descr);
  }

  if (sparsehandle != NULL)
  {
    cusparseDestroy(sparsehandle);
  }

  if (linalghandle != NULL)
  {
    cublasDestroy(linalghandle);
  }
}

When I try to run this, I get a CUSPARSE_STATUS_EXECUTION_FAILED error. I compiled it (this function is part of a larger project, but I am sure the matrix I am passing it is correct) using nvcc -w -g -G -gencode arch=compute_20,code=sm_21 -lcublas -lcusparse lanczos.cu

When I try to examine the problem in cuda-gdb, I get

warning: no loadable sections found in added symbol-file /tmp/cuda-dbg/27861/elf.76ffb0.8895a0.o.FuXAwf

warning: no loadable sections found in added symbol-file /tmp/cuda-dbg/27861/elf.76ffb0.88bb50.o.t0xXIx

warning: no loadable sections found in added symbol-file /tmp/cuda-dbg/27861/elf.76ffb0.a70a60.o.ugEDcJ

warning: no loadable sections found in added symbol-file /tmp/cuda-dbg/27861/elf.76ffb0.b3e0d0.o.imoyDd

I’m running Ubuntu 10.10 x86_64 with CUDA 4.1 and 2.6.35-31-generic as my kernel. I’m trying to test this on a 560Ti.

Any idea what’s wrong?

The library team recommends double-checking that the input matrix satisfies the requirements of the CUSPARSE library:

“Sparse matrices are assumed to be stored in row-major COO format, in other words, the index arrays are first sorted by row indices and then within the same row by column indices. Also it is assumed that each pair of row and column indices appears only once.” (see the documentation section on coordinate format for details).

If the matrix satisfies the requirements but conversion still fails, I would suggest filing a bug. In that case, the library team would prefer that the matrix be attached as a file in Matrix Market format (http://math.nist.gov/MatrixMarket/formats.html).

The library team recommends double-checking that the input matrix satisfies the requirements of the CUSPARSE library:

“Sparse matrices are assumed to be stored in row-major COO format, in other words, the index arrays are first sorted by row indices and then within the same row by column indices. Also it is assumed that each pair of row and column indices appears only once.” (see the documentation section on coordinate format for details).

If the matrix satisfies the requirements but conversion still fails, I would suggest filing a bug. In that case, the library team would prefer that the matrix be attached as a file in Matrix Market format (http://math.nist.gov/MatrixMarket/formats.html).

I’ve looked at the matrix again - it is sorted by row, but within a row the elements are not column-ordered. Could this be enough to cause the function to fail? Here’s what the matrix looks like:

(0,0) - 4

(0,4) - 0.5

(0,11) - 0.5

(0,46) - 0.5

(0,165) - 0.5

(0,502) - 0.5

(0,1293) - 0.5

(0,3008) - 0.5

(0,6439) - 0.5

(1,1) - 2

(1,5) - 0.5

(1,18) - 0.5

(1,2) - 0.5

(1,53) - 0.5

(1,4) - 0.5

(1,9) - 0.5

(1,495) - 0.5

(1,165) - 0.5

(1,509) - 0.5

(1,1300) - 0.5

(1,3015) - 0.5

(1,6446) - 0.5

Where I’ve written the matrix as (row, col) - value. I’ve gone through and ensured that (row,col) pairs are not repeated. Would writing the matrix like this prevent Xcoo2csr from working?

Edit: the problem persists when I change my earlier code so that the matrix looks like:

(0,0) - 4

(0,4) - 0.5

(0,11) - 0.5

(0,46) - 0.5

(0,165) - 0.5

(0,502) - 0.5

(0,1293) - 0.5

(0,3008) - 0.5

(0,6439) - 0.5

(1,1) - 2

(1,2) - 0.5

(1,4) - 0.5

(1,5) - 0.5

(1,9) - 0.5

(1,18) - 0.5

(1,53) - 0.5

(1,165) - 0.5

(1,495) - 0.5

(1,509) - 0.5

(1,1300) - 0.5

(1,3015) - 0.5

(1,6446) - 0.5

Now I get a segfault instead of EXECUTION_FAILED. As inputs to the function, I’m passing:

Num_elem: 232518

Dim: 12870

row indices: 0x2027a0000

row pointers: 0x200800000

The number of elements and matrix dimension are both correct.

I’ve looked at the matrix again - it is sorted by row, but within a row the elements are not column-ordered. Could this be enough to cause the function to fail? Here’s what the matrix looks like:

(0,0) - 4

(0,4) - 0.5

(0,11) - 0.5

(0,46) - 0.5

(0,165) - 0.5

(0,502) - 0.5

(0,1293) - 0.5

(0,3008) - 0.5

(0,6439) - 0.5

(1,1) - 2

(1,5) - 0.5

(1,18) - 0.5

(1,2) - 0.5

(1,53) - 0.5

(1,4) - 0.5

(1,9) - 0.5

(1,495) - 0.5

(1,165) - 0.5

(1,509) - 0.5

(1,1300) - 0.5

(1,3015) - 0.5

(1,6446) - 0.5

Where I’ve written the matrix as (row, col) - value. I’ve gone through and ensured that (row,col) pairs are not repeated. Would writing the matrix like this prevent Xcoo2csr from working?

Edit: the problem persists when I change my earlier code so that the matrix looks like:

(0,0) - 4

(0,4) - 0.5

(0,11) - 0.5

(0,46) - 0.5

(0,165) - 0.5

(0,502) - 0.5

(0,1293) - 0.5

(0,3008) - 0.5

(0,6439) - 0.5

(1,1) - 2

(1,2) - 0.5

(1,4) - 0.5

(1,5) - 0.5

(1,9) - 0.5

(1,18) - 0.5

(1,53) - 0.5

(1,165) - 0.5

(1,495) - 0.5

(1,509) - 0.5

(1,1300) - 0.5

(1,3015) - 0.5

(1,6446) - 0.5

Now I get a segfault instead of EXECUTION_FAILED. As inputs to the function, I’m passing:

Num_elem: 232518

Dim: 12870

row indices: 0x2027a0000

row pointers: 0x200800000

The number of elements and matrix dimension are both correct.

I have never used CUSPARSE. Violating the matrix layout requirements presumably causes some sort of error status to be returned by CUSPARSE, but I don’t know what the “correct” error status is.

The fact that you are now getting a segfault suggests that there may be a problem of a different kind, namely that somewhere in your code host and device pointers are getting mixed up. Passing a host pointer instead of a device pointer to a kernel would cause it to fail (the kernel experiences the equivalent of a segfault), which is the problem you encountered initially. Likewise, de-referencing a device pointer in host code would cause a segfault, which is what you are seeing now.

I have never used CUSPARSE. Violating the matrix layout requirements presumably causes some sort of error status to be returned by CUSPARSE, but I don’t know what the “correct” error status is.

The fact that you are now getting a segfault suggests that there may be a problem of a different kind, namely that somewhere in your code host and device pointers are getting mixed up. Passing a host pointer instead of a device pointer to a kernel would cause it to fail (the kernel experiences the equivalent of a segfault), which is the problem you encountered initially. Likewise, de-referencing a device pointer in host code would cause a segfault, which is what you are seeing now.

After some more testing, I think the real problem I was having was actually occurring later on. If I comment out a make_cuDoubleComplex, the conversion works. If not, it fails. It’s strange.

Thanks a lot for your help!

After some more testing, I think the real problem I was having was actually occurring later on. If I comment out a make_cuDoubleComplex, the conversion works. If not, it fails. It’s strange.

Thanks a lot for your help!