I have been trying to implement a simple sparse matrix-vector multiplication with Compressed Sparse Row (CSR) format into some FORTRAN code that I have, needless to say unsuccessfully. I created a subroutine that would call the FORTRAN CUSPARSE bindings (fortran_cusparse.c) and modeled it after the users guide provided with the CUSPARSE library. Before calling the subroutine, the matrix-vector system has already been set-up in CSR format and passed as such.
I have no problems compiling or linking to CUSPARSE (version 4), but everytime I execute the program I get a “segmentation fault”. I have no idea where the problem may lie, I know it must be something with the address of the array(s) but otherwise I thought I have done everything correct. Can anyone see an obvious fault with the code - maybe some “fresh eyes” could help?
Thank you.
subroutine spmv_csr_gpu(avals, ia, ja, b, x, mm, nn)
! Local and passed-in variable(s)
integer mm, nn, descrA
integer handle, stat1, stat2, stat3, stat4, stat5
real, dimension(nn) :: avals, davals
integer, dimension(mm + 1) :: ia, dia
integer, dimension(nn) :: ja, dja
real, dimension(mm) :: b, db
real, dimension(mm) :: x, dx
! Allocate memory and copy data to device structure(s)
stat1 = cuda_malloc(davals, nn)
stat2 = cuda_malloc(dia, (mm + 1))
stat3 = cuda_malloc(dja, nn)
stat4 = cuda_malloc(db, mm)
stat5 = cuda_malloc(dx, mm)
if ((stat1 /= 0) .or. (stat2 /= 0) .or. (stat3 /= 0) .or. &
(stat4 /= 0) .or. (stat5 /= 0)) then
write(*, *) "Device memory allocation failed"
stop
endif
stat1 = cuda_memcpy_fort2c_real(davals, avals, nn, 1)
stat2 = cuda_memcpy_fort2c_real(db, b, mm, 1)
stat3 = cuda_memcpy_fort2c_real(dx, x, mm, 1)
stat4 = cuda_memcpy_fort2c_int(dia, ia, (mm + 1), 1)
stat5 = cuda_memcpy_fort2c_int(dja, ja, nn, 1)
if ((stat1 /= 0) .or. (stat2 /= 0) .or. (stat3 /=0) .or. &
(stat4 /= 0) .or. (stat5 /= 0)) then
call cuda_free(davals)
call cuda_free(db)
call cuda_free(dx)
call cuda_free(dia)
call cuda_free(dja)
write(*, *) "Host to device copy failed"
stop
endif
! Initialize the CUSPARSE library
stat1 = cusparse_create(handle)
if (stat1 /= 0) then
call cuda_free(davals)
call cuda_free(db)
call cuda_free(dx)
call cuda_free(dia)
call cuda_free(dja)
write(*, *) "CUSPARSE initialization failed"
stop
endif
! Create matrix type descriptor
stat1 = cusparse_create_mat_descr(descrA)
if (stat1 /= 0) then
call cuda_free(davals)
call cuda_free(db)
call cuda_free(dx)
call cuda_free(dia)
call cuda_free(dja)
call cusparse_destroy(handle)
write(*, *) "Create matrix descriptor failed"
stop
endif
! Set matrix type and index base
stat1 = cusparse_set_mat_type(descrA, 0)
stat2 = cusparse_set_mat_index_base(descrA, 1)
if ((stat1 /= 0) .or. (stat2 /= 0)) then
call cuda_free(davals)
call cuda_free(db)
call cuda_free(dx)
call cuda_free(dia)
call cuda_free(dja)
call cusparse_destroy_mat_descr(descrA)
call cusparse_destroy(handle)
write(*, *) "Setting matrix descriptor failed"
stop
endif
! Execute sparse matrix-vector CSR format
stat1 = cusparse_dcsrmv(handle, 0, mm, nn, nn, &
1.0, descrA, davals, dia, dja, db, 1.0, dx)
if (stat1 /= 0) then
call cuda_free(davals)
call cuda_free(db)
call cuda_free(dx)
call cuda_free(dia)
call cuda_free(dja)
call cusparse_destroy_mat_descr(descrA)
call cusparse_destroy(handle)
write(*, *) "Matrix-vector multiplication failed"
stop
endif
! Copy memory from device to host
stat1 = cuda_memcpy_c2fort_real(x, dx, mm, 2)
if (stat1 /= 0) then
call cuda_free(davals)
call cuda_free(db)
call cuda_free(dx)
call cuda_free(dia)
call cuda_free(dja)
call cusparse_destroy_mat_descr(descrA)
call cusparse_destroy(handle)
write(*, *) "Memcpy from device to host failed"
stop
endif
end subroutine spmv_csr_gpu