Fortran CUSPARSE Bindings

I have been trying to implement a simple sparse matrix-vector multiplication with Compressed Sparse Row (CSR) format into some FORTRAN code that I have, needless to say unsuccessfully. I created a subroutine that would call the FORTRAN CUSPARSE bindings (fortran_cusparse.c) and modeled it after the users guide provided with the CUSPARSE library. Before calling the subroutine, the matrix-vector system has already been set-up in CSR format and passed as such.

I have no problems compiling or linking to CUSPARSE (version 4), but everytime I execute the program I get a “segmentation fault”. I have no idea where the problem may lie, I know it must be something with the address of the array(s) but otherwise I thought I have done everything correct. Can anyone see an obvious fault with the code - maybe some “fresh eyes” could help?

Thank you.

subroutine spmv_csr_gpu(avals, ia, ja, b, x, mm, nn)

! Local and passed-in variable(s)

          integer mm, nn, descrA

          integer handle, stat1, stat2, stat3, stat4, stat5

          real, dimension(nn) :: avals, davals

          integer, dimension(mm + 1) :: ia, dia

          integer, dimension(nn) :: ja, dja

          real, dimension(mm) :: b, db

          real, dimension(mm) :: x, dx

! Allocate memory and copy data to device structure(s)

          stat1 = cuda_malloc(davals, nn)

          stat2 = cuda_malloc(dia, (mm + 1))

          stat3 = cuda_malloc(dja, nn)

          stat4 = cuda_malloc(db, mm)

          stat5 = cuda_malloc(dx, mm)

          if ((stat1 /= 0) .or. (stat2 /= 0) .or. (stat3 /= 0) .or. &

              (stat4 /= 0) .or. (stat5 /= 0)) then

             write(*, *) "Device memory allocation failed"

            stop

          endif

          stat1 = cuda_memcpy_fort2c_real(davals, avals, nn, 1)

          stat2 = cuda_memcpy_fort2c_real(db, b, mm, 1)

          stat3 = cuda_memcpy_fort2c_real(dx, x, mm, 1)

          stat4 = cuda_memcpy_fort2c_int(dia, ia, (mm + 1), 1)

          stat5 = cuda_memcpy_fort2c_int(dja, ja, nn, 1)

          if ((stat1 /= 0) .or. (stat2 /= 0) .or. (stat3 /=0) .or. &

              (stat4 /= 0) .or. (stat5 /= 0)) then

             call cuda_free(davals)

             call cuda_free(db)

             call cuda_free(dx)

             call cuda_free(dia)

             call cuda_free(dja)

             write(*, *) "Host to device copy failed"

             stop

          endif

! Initialize the CUSPARSE library

          stat1 = cusparse_create(handle)

          if (stat1 /= 0) then

             call cuda_free(davals)

             call cuda_free(db)

             call cuda_free(dx)

             call cuda_free(dia)

             call cuda_free(dja)

             write(*, *) "CUSPARSE initialization failed"

             stop

          endif

! Create matrix type descriptor

          stat1 = cusparse_create_mat_descr(descrA)

          if (stat1 /= 0) then

             call cuda_free(davals)

             call cuda_free(db)

             call cuda_free(dx)

             call cuda_free(dia)

             call cuda_free(dja)

             call cusparse_destroy(handle)

             write(*, *) "Create matrix descriptor failed"

             stop

          endif

! Set matrix type and index base

          stat1 = cusparse_set_mat_type(descrA, 0)

          stat2 = cusparse_set_mat_index_base(descrA, 1)

          if ((stat1 /= 0) .or. (stat2 /= 0)) then

             call cuda_free(davals)

             call cuda_free(db)

             call cuda_free(dx)

             call cuda_free(dia)

             call cuda_free(dja)

             call cusparse_destroy_mat_descr(descrA)

             call cusparse_destroy(handle)

             write(*, *) "Setting matrix descriptor failed"

             stop

          endif

! Execute sparse matrix-vector CSR format

          stat1 = cusparse_dcsrmv(handle, 0, mm, nn, nn, &

                  1.0, descrA, davals, dia, dja, db, 1.0, dx)

if (stat1 /= 0) then

             call cuda_free(davals)

             call cuda_free(db)

             call cuda_free(dx)

             call cuda_free(dia)

             call cuda_free(dja)

             call cusparse_destroy_mat_descr(descrA)

             call cusparse_destroy(handle)

             write(*, *) "Matrix-vector multiplication failed"

             stop

          endif

! Copy memory from device to host

          stat1 = cuda_memcpy_c2fort_real(x, dx, mm, 2)

          if (stat1 /= 0) then

             call cuda_free(davals)

             call cuda_free(db)

             call cuda_free(dx)

             call cuda_free(dia)

             call cuda_free(dja)

             call cusparse_destroy_mat_descr(descrA)

             call cusparse_destroy(handle)

             write(*, *) "Memcpy from device to host failed"

             stop

          endif

end subroutine spmv_csr_gpu

I found a possible problem with my code.

When I check the actual matrix index base, as defined by the “cusparse_set_mat_index_base”, the value returned is 0. Shouldn’t this value be 1, considering I set the index base to 1?

Thanks.