Hello,
As the title indicates, while profiling a program I found significant time being lost in cudaMemcpyAsync calls that I could not account for. The program is accelerated with OpenACC and uses cuBLAS and cuSPARSE. Below is the minimal program I created to reproduce this behavior; it shows the same profiling pattern:
PROGRAM TEST_MAIN
  !----------------------------------------------------
  ! test program for specific CUDA kernels
  !----------------------------------------------------
  USE CUDAFOR
  USE CUBLAS_V2
  USE CUSPARSE
  USE OPENACC
  !$ USE OMP_LIB
  IMPLICIT NONE
  !----------------------------------------------------
  INTEGER :: N, NNZ
  INTEGER, ALLOCATABLE, DIMENSION(:) :: ROWPTR, COLPTR
  DOUBLE PRECISION, ALLOCATABLE, DIMENSION(:) :: VALPTR, VEC1, VEC2
  DOUBLE PRECISION :: COEFF1, COEFF2, COEFF3
  INTEGER :: I
  ! CUDA VARIABLES
  TYPE(cublasHandle) :: cub_h
  TYPE(cusparseHandle) :: cusp_h
  TYPE(cusparseMatDescr) :: descrA
  TYPE(c_devptr) :: buffer
  TYPE(cusparseSpMatDescr) :: mat
  TYPE(cusparseDnMatDescr) :: vecOne, vecTwo
  TYPE(c_devptr) :: devptr
  INTEGER(8) :: bsize
  INTEGER :: cub_stat, cusp_stat

  N = 52329; NNZ = 1375396
  ALLOCATE(ROWPTR(NNZ), COLPTR(NNZ), VALPTR(NNZ))
  CALL READ_COO('mat.mtx', ROWPTR, COLPTR, VALPTR, N, NNZ)
  ALLOCATE(VEC1(N), VEC2(N))
  CALL RAND_VEC(VEC1, 35647)
  VEC2(:) = 0.0D0
  !$ACC DATA COPYIN(ROWPTR, COLPTR, VALPTR, VEC1) COPY(VEC2)
  ! INITIALIZATION OF CUDA API
  cub_stat = cublasInit()
  cub_stat = cublasCreate(cub_h)
  cub_stat = cublasSetStream(cub_h, acc_get_cuda_stream(acc_async_sync))
  cub_stat = cublasSetPointerMode(cub_h, CUBLAS_POINTER_MODE_DEVICE)
  cusp_stat = cusparseCreate(cusp_h)
  IF (cusp_stat .NE. CUSPARSE_STATUS_SUCCESS) &
    WRITE(*,*) 'cusparseCreate error: ', cusp_stat
  cusp_stat = cusparseCreateMatDescr(descrA)
  cusp_stat = cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)
  cusp_stat = cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE)
  cusp_stat = cusparseSetStream(cusp_h, acc_get_cuda_stream(acc_async_sync))
  ! CREATE THE cuSPARSE DESCRIPTORS FROM THE DEVICE ADDRESSES OF THE OpenACC-MANAGED ARRAYS
  !$ACC HOST_DATA USE_DEVICE(VEC1, VEC2, ROWPTR, COLPTR, VALPTR)
  cusp_stat = cusparseCreateDnMat(vecOne, N, 1, N, VEC1, CUDA_R_64F, CUSPARSE_ORDER_COL)
  IF (cusp_stat .NE. CUSPARSE_STATUS_SUCCESS) PRINT *, "cusparseCreateDnMat: ", cusp_stat
  cusp_stat = cusparseCreateDnMat(vecTwo, N, 1, N, VEC2, CUDA_R_64F, CUSPARSE_ORDER_COL)
  IF (cusp_stat .NE. CUSPARSE_STATUS_SUCCESS) PRINT *, "cusparseCreateDnMat: ", cusp_stat
  cusp_stat = cusparseCreateCoo(mat, N, N, NNZ, ROWPTR, COLPTR, VALPTR, &
                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ONE, CUDA_R_64F)
  IF (cusp_stat .NE. CUSPARSE_STATUS_SUCCESS) PRINT *, "cusparseCreateCoo: ", cusp_stat
  !$ACC END HOST_DATA
  DO I = 1, 100000
    ! SPARSE MATRIX-VECTOR PRODUCT VEC2 = A * VEC1 (SpMM WITH N x 1 DENSE MATRICES)
    !$ACC HOST_DATA USE_DEVICE(VEC1, VEC2)
    cusp_stat = cusparseDnMatSetValues(vecOne, VEC1)
    IF (cusp_stat .NE. CUSPARSE_STATUS_SUCCESS) PRINT *, "cusparseDnMatSetValues: ", cusp_stat
    cusp_stat = cusparseDnMatSetValues(vecTwo, VEC2)
    IF (cusp_stat .NE. CUSPARSE_STATUS_SUCCESS) PRINT *, "cusparseDnMatSetValues: ", cusp_stat
    cusp_stat = cusparseSpMM_buffersize(cusp_h, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, &
                                        1.0D0, mat, vecOne, 0.0D0, vecTwo, CUDA_R_64F, CUSPARSE_COOMM_ALG1, bsize)
    IF (cusp_stat .NE. CUSPARSE_STATUS_SUCCESS) PRINT *, "cusparseSpMM_buffersize: ", cusp_stat
    IF (bsize .GT. 0) buffer = acc_malloc(bsize)
    cusp_stat = cusparseSpMM(cusp_h, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, &
                             1.0D0, mat, vecOne, 0.0D0, vecTwo, CUDA_R_64F, CUSPARSE_COOMM_ALG1, buffer)
    IF (cusp_stat .NE. CUSPARSE_STATUS_SUCCESS) PRINT *, "cusparseSpMM: ", cusp_stat
    ! -------------------------------------------------------------------------------
    cusp_stat = cusparseDnMatGetValues(vecTwo, devptr)
    IF (cusp_stat .NE. CUSPARSE_STATUS_SUCCESS) PRINT *, "cusparseDnMatGetValues: ", cusp_stat
    ! -------------------------------------------------------------------------------
    IF (bsize .GT. 0) CALL acc_free(buffer)
    !$ACC END HOST_DATA
    ! DOT PRODUCTS AND NORM VIA cuBLAS
    !$ACC HOST_DATA USE_DEVICE(VEC1, VEC2)
    cub_stat = cublasDdot(cub_h, N, VEC1, 1, VEC2, 1, COEFF1)
    !$ACC END HOST_DATA
    COEFF1 = 1.0D0 / COEFF1
    !$ACC HOST_DATA USE_DEVICE(VEC1, VEC2)
    cub_stat = cublasDdot(cub_h, N, VEC2, 1, VEC1, 1, COEFF2)
    !$ACC END HOST_DATA
    COEFF2 = COEFF1 + COEFF2
    !$ACC HOST_DATA USE_DEVICE(VEC2)
    cub_stat = cublasDnrm2(cub_h, N, VEC2, 1, COEFF3)
    !$ACC END HOST_DATA
  END DO
  !$ACC END DATA

  DEALLOCATE(ROWPTR, COLPTR, VALPTR, VEC1)
  DO I = 1, 10
    PRINT '(1X, A,I7,A,E18.12)', "VEC2(", I, ")=", VEC2(I)
  END DO
  PRINT *, ""
  PRINT *, "COEFF3 =", COEFF3
!----------------------------------------------------
CONTAINS
  !----------------------------------------------------
  SUBROUTINE READ_COO(FILENAME, ROWPTR, COLPTR, VALUEPTR, N, NNZ)
    IMPLICIT NONE
    CHARACTER(LEN=*), INTENT(IN) :: FILENAME
    ! N AND NNZ MUST BE DECLARED BEFORE THEY ARE USED AS ARRAY BOUNDS BELOW
    INTEGER, INTENT(IN) :: N, NNZ
    INTEGER, INTENT(OUT) :: ROWPTR(NNZ), COLPTR(NNZ)
    DOUBLE PRECISION, INTENT(OUT) :: VALUEPTR(NNZ)
    INTEGER :: NROWS, NCOLS, NNZDUM
    INTEGER :: I, STATUS
    CHARACTER(LEN=100) :: LINE
    ! OPEN THE FILE
    OPEN(UNIT=57, FILE=FILENAME, STATUS='OLD', ACTION='READ')
    ! READ THE HEADER
    READ(57, '(A)') LINE
    PRINT *, "COO FILE DESCRIPTION: ", LINE
    ! READ ARRAY INFORMATION
    READ(57, *) NROWS, NCOLS, NNZDUM
    IF (NNZ .NE. NNZDUM) THEN
      PRINT *, "NNZ IS INITIALIZED INCORRECTLY, CORRECT IT WITH THE FOLLOWING VALUE: ", NNZDUM
      STOP
    END IF
    IF (NROWS .EQ. NCOLS .AND. N .NE. NROWS) THEN
      PRINT *, "N IS INITIALIZED INCORRECTLY, CORRECT IT WITH THE FOLLOWING VALUE: ", NROWS
      STOP
    ELSE IF (NROWS .NE. NCOLS) THEN
      PRINT *, "THE MATRIX IS NOT SQUARE. USE A DIFFERENT FILE"
      STOP
    END IF
    ! READ THE NON-ZERO ENTRIES
    DO I = 1, NNZ
      READ(57, *) ROWPTR(I), COLPTR(I), VALUEPTR(I)
    END DO
    CLOSE(57)
  END SUBROUTINE READ_COO
  !----------------------------------------------------
  SUBROUTINE RAND_VEC(VEC, SEED)
    IMPLICIT NONE
    INTEGER, INTENT(IN) :: SEED
    DOUBLE PRECISION, INTENT(OUT) :: VEC(:)
    INTEGER, ALLOCATABLE, DIMENSION(:) :: SEED_temp
    INTEGER :: NSEED
    CALL RANDOM_SEED(SIZE = NSEED)
    ALLOCATE(SEED_temp(NSEED))
    SEED_temp = SEED
    CALL RANDOM_SEED(PUT = SEED_temp)
    DEALLOCATE(SEED_temp)
    CALL RANDOM_NUMBER(VEC)
  END SUBROUTINE RAND_VEC
END PROGRAM TEST_MAIN
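For clarity, each iteration of the main loop is numerically equivalent to the host-only sketch below (plain Fortran, no cuSPARSE/cuBLAS, using the same variables as the program plus an extra INTEGER loop index K). It is only meant to document what the GPU calls compute:

! Host-side equivalent of one loop iteration (illustrative sketch only)
VEC2(:) = 0.0D0
DO K = 1, NNZ                              ! VEC2 = A * VEC1 for the 1-based COO matrix
  VEC2(ROWPTR(K)) = VEC2(ROWPTR(K)) + VALPTR(K) * VEC1(COLPTR(K))
END DO
COEFF1 = 1.0D0 / DOT_PRODUCT(VEC1, VEC2)   ! first cublasDdot, then inverted on the host
COEFF2 = COEFF1 + DOT_PRODUCT(VEC2, VEC1)  ! second cublasDdot
COEFF3 = SQRT(DOT_PRODUCT(VEC2, VEC2))     ! cublasDnrm2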
The Makefile:
Makefile.zip (586 Bytes)
The referenced sparse matrix I’ve used:
mat.zip (12.7 MB)
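For anyone who would rather not download the archive: READ_COO expects a very simple layout, a single description line, then "NROWS NCOLS NNZ", then one "row col value" triplet per line with 1-based indices (the attached matrix has N = 52329 and NNZ = 1375396). A made-up miniature example, just to show the format (N and NNZ in the program would have to match it):

% any single description line (READ_COO only prints it)
3 3 4
1 1 4.0
2 2 5.0
3 1 1.0
3 3 6.0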
The profiling results came out as follows:
CUDA API Statistics:
Time(%) Total Time (ns) Num Calls Average (ns) Minimum (ns) Maximum (ns) StdDev (ns) Name
------- --------------- --------- ------------- ------------ ------------ ------------- -------------------------------
69,0 11.026.705.921 400.000 27.566,0 4.093 8.489.368 28.086,0 cudaMemcpyAsync
17,0 2.754.990.543 700.001 3.935,0 3.151 117.025.046 139.912,0 cudaLaunchKernel
5,0 797.426.465 4 199.356.616,0 669 329.861.696 144.931.998,0 cudaFree
2,0 421.809.473 100.000 4.218,0 3.858 342.241 1.206,0 cudaMemsetAsync
1,0 265.545.335 900.000 295,0 237 341.989 966,0 cudaStreamGetCaptureInfo_v10010
1,0 221.953.285 300.001 739,0 649 344.767 1.873,0 cudaStreamSynchronize
1,0 196.496.407 300.000 655,0 588 346.016 1.480,0 cudaEventRecord
1,0 158.580.843 300.000 528,0 441 347.150 1.904,0 cudaEventQuery
0,0 20.469.157 1 20.469.157,0 20.469.157 20.469.157 0,0 cuMemAllocManaged
0,0 323.526 1.492 216,0 100 6.072 254,0 cuGetProcAddress
0,0 245.891 6 40.981,0 2.393 109.781 44.916,0 cudaMalloc
0,0 22.300 36 619,0 338 6.019 1.008,0 cudaEventCreateWithFlags
0,0 7.557 4 1.889,0 1.058 2.410 609,0 cuInit
CUDA Kernel Statistics:
Time(%) Total Time (ns) Instances Average (ns) Minimum (ns) Maximum (ns) StdDev (ns) Name
------- --------------- --------- ------------ ------------ ------------ ----------- ----------------------------------------------------------------------------------------------------
58,0 5.652.060.757 100.000 56.520,0 53.248 8.171.019 25.693,0 void cusparse::cusparseCooMMSmallKernel<(unsigned int)128, (cusparseOperation_t)0, (cusparseOperati…
23,0 2.290.980.528 200.000 11.454,0 11.040 22.591 406,0 void nrm2_kernel<double, double, double, (int)0, (int)0, (int)128>(cublasNrm2Params<T1, T3>)
9,0 910.286.258 200.000 4.551,0 3.775 14.592 748,0 void dot_kernel<double, (int)128, (int)0, cublasDotParams<cublasGemvTensor<const double>, cublasGem…
8,0 799.740.337 200.000 3.998,0 3.583 14.272 406,0 void reduce_1Block_kernel<double, (int)128, (int)7, cublasGemvTensorStridedBatched<double>, cublasG…
0,0 242.525 1 242.525,0 242.525 242.525 0,0 __pgi_dev_cumemset_8n
CUDA Memory Operation Statistics (by time):
Time(%) Total Time (ns) Count Average (ns) Minimum (ns) Maximum (ns) StdDev (ns) Operation
------- --------------- ------- ------------ ------------ ------------ ----------- ---------------------------------
43,0 375.483.435 300.000 1.251,0 864 13.600 131,0 [CUDA memcpy DtoH]
31,0 268.208.025 100.000 2.682,0 2.272 477.116 1.503,0 [CUDA memset]
24,0 208.413.987 100.000 2.084,0 1.856 2.848 68,0 [CUDA memcpy HtoD]
0,0 2.994.783 443 6.760,0 2.270 86.719 12.915,0 [CUDA Unified Memory memcpy HtoD]
0,0 17.307 6 2.884,0 1.568 4.799 1.405,0 [CUDA Unified Memory memcpy DtoH]
CUDA Memory Operation Statistics (by size):
Total (MB) Count Average (MB) Minimum (MB) Maximum (MB) StdDev (MB) Operation
---------- ------- ------------ ------------ ------------ ----------- ---------------------------------
41863,200 100.000 0,419 0,419 0,419 0,000 [CUDA memset]
1024,800 100.000 0,010 0,010 0,010 0,000 [CUDA memcpy HtoD]
24,183 443 0,055 0,004 1,036 0,158 [CUDA Unified Memory memcpy HtoD]
2,400 300.000 0,000 0,000 0,000 0,000 [CUDA memcpy DtoH]
0,131 6 0,022 0,004 0,049 0,020 [CUDA Unified Memory memcpy DtoH]
Looking at the memory-operation tables, the 400,000 cudaMemcpyAsync calls work out to four per loop iteration: three device-to-host copies of 8 bytes each (2.4 MB over 300,000 copies), which look like the scalar results of the two cublasDdot calls and the cublasDnrm2 call, plus one host-to-device copy of roughly 10 KB, even though I never copy data explicitly inside the loop. I have tried for a while to get rid of these transfers, but I could not find a well-documented case similar to mine, and I am not sure which OpenACC directives would be appropriate here either. Thanks in advance.