Problem with the Fortran interface cusparseSpMV when computing a complex SpMV

Hello, I am learning to use PGI Fortran and CUDA, and I am trying to compute a complex SpMV using the Fortran interface cusparseSpMV.

When I compute a complex SpMV with cusparseSpMV, the program always raises the error "0: copyout Memcpy (host=..., dev=..., size=...) FAILED: 700 (an illegal memory access was encountered)" when I copy the result from device to host after invoking cusparseSpMV. If I comment out that cudaMemcpy, the deallocation of device memory that follows raises the error "0: DEALLOCATE: an illegal memory access was encountered" instead.

status=cusparseSpMV(handle,CUSPARSE_OPERATION_NON_TRANSPOSE,alpha,matrix,vecX,beta,vecY,CUDA_C_64F,CUSPARSE_SPMV_ALG_DEFAULT,buffer)

status=cudaMemcpy(wfOut,g_wfOut,nWf)

The same code for a real SpMV runs correctly, so I wonder where the problem is. Thanks a lot.

Environment: CUDA 11.7, HPC SDK 22.7
Compile command: nvfortran -cuda -cudalib=cusparse -gpu=cc60 cf.f90

Code:

program product
    use cudafor
    use cusparse
    use iso_c_binding

    implicit none

    complex*8,allocatable :: wf(:), element(:), wfOut(:)
    integer*4,allocatable:: rowIdx(:), colIdx(:),csrRowPtr(:)

    integer*4 :: nWf, nz, status, i
    real*8 :: alpha=1.0, beta=0.0

    integer*8 :: buffersize

    !integer*4,allocatable,device :: buffer(:)
    type(c_devptr) :: buffer

    complex*8,allocatable,device :: g_wf(:), g_element(:), g_wfOut(:)
    integer*4,allocatable,device :: g_rowIdx(:), g_colIdx(:), g_csrRowPtr(:)

    type(cusparseHandle) :: handle
    type(cusparseSpMatDescr) :: matrix
    type(cusparseDnVecDescr) :: vecX, vecY

    integer*4 :: start, end, countrate
    real*8 :: t

    nWf=53*53*53
    nz=6945539

    allocate(wf(nWf), wfOut(nWf), csrRowPtr(nWf+1))
    allocate(rowIdx(nz), colIdx(nz), element(nz))

    allocate(g_wf(nWf), g_wfOut(nWf), g_csrRowPtr(nWf+1))
    allocate(g_rowIdx(nz), g_colIdx(nz), g_element(nz))

    !read data
    open(18,file='element')
    read(18,*) (element(i),i=1,nz)
    close(18)
    open(18,file='rowIdx')
    read(18,*) (rowIdx(i),i=1,nz)
    close(18)
    open(18,file='colIdx')
    read(18,*) (colIdx(i),i=1,nz)
    close(18)
    open(18,file='WF')
    read(18,*) (wf(i),i=1,nWf)
    close(18)

    rowIdx=rowIdx+1
    colIdx=colIdx+1

    status=cudaMemcpy(g_wf,wf,nWf,cudaMemcpyHostToDevice)
    status=cudaMemcpy(g_rowIdx,rowIdx,nz,cudaMemcpyHostToDevice)
    status=cudaMemcpy(g_colIdx,colIdx,nz,cudaMemcpyHostToDevice)
    status=cudaMemcpy(g_element,element,nz,cudaMemcpyHostToDevice)
    status=cudaMemcpy(g_wfOut,wfOut,nWf,cudaMemcpyHostToDevice)

    !check
    wf=(0.0,0.0)
    write(*,*) wf(1)
    status= cudaMemcpy(wf,g_wf,nWf,cudaMemcpyDeviceToHost)
    write(*,*) wf(1)

    status=cusparseCreate(handle)

    status=cusparseXcoo2csr(handle, g_rowIdx, nz, nWf, g_csrRowPtr, CUSPARSE_INDEX_BASE_ONE)

    status=cusparseCreateCsr(matrix,nWf,nWf,nz,g_csrRowPtr,g_colIdx,g_element,CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ONE, CUDA_C_64F)

    status=cusparseCreateDnVec(vecX, nWf, g_wf, CUDA_C_64F)
    status=cusparseCreateDnVec(vecY, nWf, g_wfOut, CUDA_C_64F)

    status=cusparseSpMV_bufferSize(handle,CUSPARSE_OPERATION_NON_TRANSPOSE,alpha,matrix,vecX,beta,vecY,CUDA_C_64F,CUSPARSE_SPMV_ALG_DEFAULT,buffersize)

    status=cudaMalloc(buffer,buffersize)

    !call system_clock(start,countrate)
    !do i=1,2000,1
    !status=cusparseSpMV(handle,CUSPARSE_OPERATION_NON_TRANSPOSE,alpha,matrix,vecX,beta,vecY,CUDA_C_64F,CUSPARSE_SPMV_ALG_DEFAULT,buffer)
    !end do

    !call system_clock(end,countrate)
    !t=(end-start)/real(countrate)
    !write(*,*) t

    status=cusparseSpMV(handle,CUSPARSE_OPERATION_NON_TRANSPOSE,alpha,matrix,vecX,beta,vecY,CUDA_C_64F,CUSPARSE_SPMV_ALG_DEFAULT,buffer)

    status=cudaMemcpy(wfOut,g_wfOut,nWf)
    !write(*,*) wfOut(100)

    status=cudaFree(buffer)
    status=cusparseDestroy(handle)
    status=cusparseDestroyDnVec(vecX)
    status=cusparseDestroyDnVec(vecY)
    status=cusparseDestroySpMat(matrix)

    deallocate(wf)
    deallocate(rowIdx)
    deallocate(colIdx)
    deallocate(element)

    deallocate(g_colIdx)
    deallocate(g_csrRowPtr)
    deallocate(g_element)
    deallocate(g_rowIdx)
    deallocate(g_wf)
    deallocate(g_wfOut)
    !deallocate(buffer)

end program product

cf.f90 (3.4 KB)

Hi dluck.w,

Comparing how you call cusparseSpMV against the interface docs, the problem may be that alpha and beta should be declared as "real*4", not "real*8".

Since the data files are missing, I commented out the reads and set the arrays to 1. I could recreate the illegal address error, and after changing to "real*4" it no longer occurs. I can't be 100% sure given the missing input files, but this is likely the issue.

-Mat

Thanks! I tried the solution you mentioned, but it didn't work, and I don't think the problem is there: the interface file cusparse.f90 uses "!pgi$ ignore_tkr (tkrd) alpha, (tkrd) beta", which tells the compiler to ignore the type, kind, rank, and device attribute of alpha and beta at the call site. I also tried declaring alpha and beta as "complex" to match the compute type, and it still didn't work.

From the error information, I guess that cusparseSpMV is corrupting my data arrays; maybe something is wrong with the buffer. If you change the complex*8 arrays to real*8 and "CUDA_C_64F" to "CUDA_R_64F", the program runs with no error and gets the right result. If you only change "CUDA_C_64F" to "CUDA_R_64F", the program runs with no error but returns (0.0, 0.0), which I guess is caused by the mismatched compute type.

I also checked the PGI Fortran CUDA Library Interfaces documentation; it says nothing about "computeType", although other parameters such as alg or index_type are defined in the "enum, bind(C)" style.
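
(For illustration, those "enum, bind(C)" definitions look like the following sketch; the enumerator values here are taken from the CUDA C header library_types.h and should be checked against the cusparse.f90 shipped with the SDK.)

enum, bind(C)
    enumerator :: CUDA_R_32F = 0, CUDA_R_64F = 1
    enumerator :: CUDA_C_32F = 4, CUDA_C_64F = 5
end enum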

Data attached! The element file of the sparse matrix is too large to upload, but it has the complex form "(real, imag)", just like WF.

colIdx (44.8 MB)
rowIdx (44.8 MB)
WF (5.3 MB)

Also: I noticed that in the error message "copyout Memcpy (host=..., dev=..., size=...) FAILED ...", the size is only half that of my complex vector, which suggests it is being treated as a real vector instead of a complex one.
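
(Working out the numbers supports this: nWf = 53*53*53 = 148,877 elements; as complex*8 the vector occupies 148,877 * 8 = 1,191,016 bytes, while a CUDA_C_64F (double-complex) vector of the same length would occupy 148,877 * 16 = 2,382,032 bytes, exactly twice as much.)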

I tried the solution you mentioned, but it didn't work, and I don't think the problem is there.

Is it the same or a different error? I was using an A100, but I just tried on a P100 and got an out-of-memory error.

% nvfortran -cuda -cudalib=cusparse -Mpreprocess -DUSE_R8 cf.f90
% a.out
 (0.000000,0.000000)
 (2.8394249E-03,-1.9894680E-03)
0: cudaMalloc: 0 bytes requested; not enough memory: 0(no error)

The interface file cusparse.f90 uses "!pgi$ ignore_tkr (tkrd) alpha, (tkrd) beta". I also tried declaring alpha and beta as "complex" to match the compute type, and it still didn't work.

“alpha” and “beta” are passed in as C void* pointers but dereferenced as floats. So if you’re passing in other types like double or complex, then it’s likely to fail.
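
For reference, the cuSPARSE documentation says alpha and beta are read with the type given by computeType, so the matching Fortran declarations would be (a sketch of the mapping, not code from this thread):

! Host scalars must match computeType:
!   CUDA_R_32F -> real*4      CUDA_R_64F -> real*8
!   CUDA_C_32F -> complex*8   CUDA_C_64F -> complex*16
complex*16 :: alpha = (1.0d0, 0.0d0), beta = (0.0d0, 0.0d0)   ! for CUDA_C_64F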

While there may be other issues, as I show below, if I simply change the declaration of alpha and beta to real*4, then the illegal address error goes away.

% cat cf.f90
program product
    use cudafor
    use cusparse
    use iso_c_binding

    implicit none

    complex*8,allocatable :: wf(:), element(:), wfOut(:)
    integer*4,allocatable:: rowIdx(:), colIdx(:),csrRowPtr(:)

    integer*4 :: nWf, nz, status, i
#ifdef USE_R8
    real*8 :: alpha=1.0, beta=0.0
#else
    real*4 :: alpha=1.0, beta=0.0
#endif

    integer*8 :: buffersize

    !integer*4,allocatable,device :: buffer(:)
    type(c_devptr) :: buffer

    complex*8,allocatable,device :: g_wf(:), g_element(:), g_wfOut(:)
    integer*4,allocatable,device :: g_rowIdx(:), g_colIdx(:), g_csrRowPtr(:)

    type(cusparseHandle) :: handle
    type(cusparseSpMatDescr) :: matrix
    type(cusparseDnVecDescr) :: vecX, vecY

    integer*4 :: start, end, countrate
    real*8 :: t

    nWf=53*53*53
    nz=6945539

    allocate(wf(nWf), wfOut(nWf), csrRowPtr(nWf+1))
    allocate(rowIdx(nz), colIdx(nz), element(nz))

    allocate(g_wf(nWf), g_wfOut(nWf), g_csrRowPtr(nWf+1))
    allocate(g_rowIdx(nz), g_colIdx(nz), g_element(nz))

    !read data
!    open(18,file='element')
!    read(18,*) (element(i),i=1,nz)
!    close(18)
    open(18,file='rowIdx')
    read(18,*) (rowIdx(i),i=1,nz)
    close(18)
    open(18,file='colIdx')
    read(18,*) (colIdx(i),i=1,nz)
    close(18)
    open(18,file='WF')
    read(18,*) (wf(i),i=1,nWf)
    close(18)

    element = 1
!    rowIdx = 1
!    colIdx = 1
!    WF=1

    rowIdx=rowIdx+1
    colIdx=colIdx+1

    status=cudaMemcpy(g_wf,wf,nWf,cudaMemcpyHostToDevice)
    status=cudaMemcpy(g_rowIdx,rowIdx,nz,cudaMemcpyHostToDevice)
    status=cudaMemcpy(g_colIdx,colIdx,nz,cudaMemcpyHostToDevice)
    status=cudaMemcpy(g_element,element,nz,cudaMemcpyHostToDevice)
    status=cudaMemcpy(g_wfOut,wfOut,nWf,cudaMemcpyHostToDevice)

    !check
    wf=(0.0,0.0)
    write(*,*) wf(1)
    status= cudaMemcpy(wf,g_wf,nWf,cudaMemcpyDeviceToHost)
    write(*,*) wf(1)

    status=cusparseCreate(handle)

    status=cusparseXcoo2csr(handle, g_rowIdx, nz, nWf, g_csrRowPtr, CUSPARSE_INDEX_BASE_ONE)

    status=cusparseCreateCsr(matrix,nWf,nWf,nz,g_csrRowPtr,g_colIdx,g_element,CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ONE, CUDA_C_64F)

    status=cusparseCreateDnVec(vecX, nWf, g_wf, CUDA_C_64F)
    status=cusparseCreateDnVec(vecY, nWf, g_wfOut, CUDA_C_64F)

    status=cusparseSpMV_bufferSize(handle,CUSPARSE_OPERATION_NON_TRANSPOSE,alpha,matrix,vecX,beta,vecY,CUDA_C_64F,CUSPARSE_SPMV_ALG_DEFAULT,buffersize)

    status=cudaMalloc(buffer,buffersize)

    !call system_clock(start,countrate)
    !do i=1,2000,1
    !status=cusparseSpMV(handle,CUSPARSE_OPERATION_NON_TRANSPOSE,alpha,matrix,vecX,beta,vecY,CUDA_C_64F,CUSPARSE_SPMV_ALG_DEFAULT,buffer)
    !end do

    !call system_clock(end,countrate)
    !t=(end-start)/real(countrate)
    !write(*,*) t

    status=cusparseSpMV(handle,CUSPARSE_OPERATION_NON_TRANSPOSE,alpha,matrix,vecX,beta,vecY,CUDA_C_64F,CUSPARSE_SPMV_ALG_DEFAULT,buffer)

    status=cudaMemcpy(wfOut,g_wfOut,nWf)
    !write(*,*) wfOut(100)

    status=cudaFree(buffer)
    status=cusparseDestroy(handle)
    status=cusparseDestroyDnVec(vecX)
    status=cusparseDestroyDnVec(vecY)
    status=cusparseDestroySpMat(matrix)

    deallocate(wf)
    deallocate(rowIdx)
    deallocate(colIdx)
    deallocate(element)

    deallocate(g_colIdx)
    deallocate(g_csrRowPtr)
    deallocate(g_element)
    deallocate(g_rowIdx)
    deallocate(g_wf)
    deallocate(g_wfOut)
    !deallocate(buffer)

end program product
% nvfortran -cuda -cudalib=cusparse -Mpreprocess -DUSE_R8 cf.f90; a.out
 (0.000000,0.000000)
 (2.8394249E-03,-1.9894680E-03)
0: copyout Memcpy (host=0x14aa1d31a230, dev=0x14a9f3400000, size=1191016) FAILED: 700(an illegal memory access was encountered)
% nvfortran -cuda -cudalib=cusparse -Mpreprocess -DUSE_R4 cf.f90 ; a.out
 (0.000000,0.000000)
 (2.8394249E-03,-1.9894680E-03)

Now it runs with no error when using a V100 and setting alpha and beta to "real*4" (I was using a 1050 Ti before).

But the result is wrong: if you uncomment this line, the printed result is (0.0, 0.0).

!write(*,*) wfOut(100)

I think the problem is that a double-complex number should be declared as complex(8) or complex*16 instead of complex*8, since complex*8 is single precision.
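
A quick way to see the size mismatch (a minimal standard-Fortran check, no CUDA needed):

program kinds
    implicit none
    complex*8  :: c8     ! single precision: two 4-byte reals
    complex*16 :: c16    ! double precision: two 8-byte reals
    ! storage_size returns bits; divide by 8 for bytes
    write(*,*) 'complex*8  bytes: ', storage_size(c8)/8     ! prints 8,  matches CUDA_C_32F
    write(*,*) 'complex*16 bytes: ', storage_size(c16)/8    ! prints 16, matches CUDA_C_64F
end program kinds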

I made a modification to your code and tested it on an A100 (CUDA 22.5). It works fine.

cf-test-complex.f90 (3.5 KB)
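
For readers who cannot open the attachment, the essential change is presumably along these lines (a sketch based on the fix described above; the attached cf-test-complex.f90 is the authoritative version):

! Double-precision complex to match CUDA_C_64F (16 bytes per element)
complex*16, allocatable          :: wf(:), element(:), wfOut(:)
complex*16, allocatable, device  :: g_wf(:), g_element(:), g_wfOut(:)
! The alpha/beta scalars should also match the compute type
complex*16 :: alpha = (1.0d0, 0.0d0), beta = (0.0d0, 0.0d0)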