cudaMemcpyAsync execution before and after Level 1 cuBLAS kernel calls

Hello,

As indicated by the title, while profiling a program I found major time losses due to cudaMemcpyAsync calls that did not make sense. The entire program is accelerated with OpenACC, and cuBLAS and cuSPARSE are used. The following is the minimal program I created to test this behavior, and it shows the same profiling result:

PROGRAM TEST_MAIN
	!----------------------------------------------------
	! test program for specific CUDA kernels
	!----------------------------------------------------
	USE CUDAFOR
	USE CUBLAS_V2
	USE CUSPARSE
	USE OPENACC
	
	!$ USE OMP_LIB
	
	IMPLICIT NONE
	
	!----------------------------------------------------
	INTEGER :: N, NNZ
	INTEGER, ALLOCATABLE, DIMENSION(:) :: ROWPTR, COLPTR
	DOUBLE PRECISION, ALLOCATABLE, DIMENSION(:) :: VALPTR, VEC1, VEC2
	DOUBLE PRECISION :: COEFF1, COEFF2, COEFF3 
	
	INTEGER :: I
	
	! CUDA VARIABLES
	TYPE(cublasHandle) :: cub_h
	
	TYPE(cusparseHandle) :: cusp_h
	TYPE(cusparseMatDescr) :: descrA
	TYPE(c_devptr) :: buffer
	TYPE(cusparseSpMatDescr) :: mat
	TYPE(cusparseDnMatDescr) :: vecOne, vecTwo
	TYPE(c_devptr) :: devptr
	
	INTEGER(8) :: bsize
	INTEGER :: cub_stat, cusp_stat
	
	N = 52329; NNZ = 1375396
	
	ALLOCATE(ROWPTR(NNZ), COLPTR(NNZ), VALPTR(NNZ))
	
	CALL READ_COO('mat.mtx', ROWPTR, COLPTR, VALPTR, N, NNZ)
	
	ALLOCATE(VEC1(N), VEC2(N))
	
	CALL RAND_VEC(VEC1, 35647)
	VEC2(:) = 0.0D0
	
	!$ACC DATA COPYIN(ROWPTR, COLPTR, VALPTR, VEC1) COPY(VEC2)
	
	! INITIALIZATION OF CUDA API
	cub_stat = cublasInit()
	cub_stat = cublasCreate(cub_h)
	cub_stat = cublasSetStream(cub_h, acc_get_cuda_stream(acc_async_sync))
	cub_stat = cublasSetPointerMode(cub_h, CUBLAS_POINTER_MODE_DEVICE)
	
	cusp_stat = cusparseCreate(cusp_h)
	if (cusp_stat .NE. CUSPARSE_STATUS_SUCCESS) &
	  write(*,*) 'cusparseCreate error for transformation: ', cusp_stat
	cusp_stat = cusparseCreateMatDescr(descrA)
	cusp_stat = cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)
	cusp_stat = cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE)
	cusp_stat = cusparseSetStream(cusp_h, acc_get_cuda_stream(acc_async_sync))
	
	!$ACC HOST_DATA USE_DEVICE(VEC1, VEC2, ROWPTR, COLPTR, VALPTR)
	
	cusp_stat = cusparseCreateDnMat(vecOne, N, 1, N, VEC1, CUDA_R_64F, CUSPARSE_ORDER_COL)
	IF (cusp_stat .NE. CUSPARSE_STATUS_SUCCESS) PRINT *,"cusparseCreateDnMat: ", cusp_stat
	  
	cusp_stat = cusparseCreateDnMat(vecTwo, N, 1, N, VEC2, CUDA_R_64F, CUSPARSE_ORDER_COL)
	IF (cusp_stat .NE. CUSPARSE_STATUS_SUCCESS) PRINT *,"cusparseCreateDnMat: ", cusp_stat
	  
	cusp_stat = cusparseCreateCoo(mat, N, N, NNZ, ROWPTR, COLPTR, VALPTR, &
	                   CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ONE, CUDA_R_64F)
	IF (cusp_stat .NE. CUSPARSE_STATUS_SUCCESS) print *,"cusparseCreateCoo: ", cusp_stat
	
	!$ACC END HOST_DATA
	
	DO I = 1, 100000
	  !$ACC HOST_DATA USE_DEVICE(VEC1, VEC2)
	  
	  cusp_stat = cusparseDnMatSetValues(vecOne, VEC1)
	  IF (cusp_stat .NE. CUSPARSE_STATUS_SUCCESS) PRINT *, "cusparseDnMatSetValues: ", cusp_stat
	  
	  cusp_stat = cusparseDnMatSetValues(vecTwo, VEC2)
	  IF (cusp_stat .NE. CUSPARSE_STATUS_SUCCESS) PRINT *, "cusparseDnMatSetValues: ", cusp_stat
	  
	  cusp_stat = cusparseSpMM_buffersize(cusp_h, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, &
	                     1.0D0, mat, vecOne, 0.0D0, vecTwo, CUDA_R_64F, CUSPARSE_COOMM_ALG1, bsize)
	  IF (cusp_stat .NE. CUSPARSE_STATUS_SUCCESS) print *,"cusparseSpMM_buffersize: ", cusp_stat
	                  
	  IF (bsize .GT. 0) buffer = acc_malloc(bsize)
	
	  cusp_stat = cusparseSpMM(cusp_h, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, &
	                   1.0D0, mat, vecOne, 0.0D0, vecTwo, CUDA_R_64F, CUSPARSE_COOMM_ALG1, buffer)
	  IF (cusp_stat .NE. CUSPARSE_STATUS_SUCCESS) PRINT *,"cusparseSpMM: ", cusp_stat	   
	  
	  ! -------------------------------------------------------------------------------
	  cusp_stat = cusparseDnMatGetValues(vecTwo, devptr)
	  IF (cusp_stat .NE. CUSPARSE_STATUS_SUCCESS) PRINT *, "cusparseDnMatGetValues: ", cusp_stat
	  ! -------------------------------------------------------------------------------
	  
	  IF (bsize .GT. 0) CALL acc_free(buffer)
	  
	  !$ACC END HOST_DATA
	  
	  !$ACC HOST_DATA USE_DEVICE(VEC1, VEC2)
	  cub_stat = cublasDdot(cub_h, N, VEC1, 1, VEC2, 1, COEFF1)
	  !$ACC END HOST_DATA
	  
	  COEFF1 = 1.0D0 / COEFF1
	  
	  !$ACC HOST_DATA USE_DEVICE(VEC1, VEC2)
	  cub_stat = cublasDdot(cub_h, N, VEC2, 1, VEC1, 1, COEFF2)
	  !$ACC END HOST_DATA
	  
	  COEFF2 = COEFF1 + COEFF2
	  
	  !$ACC HOST_DATA USE_DEVICE(VEC2)
	  cub_stat = cublasDnrm2(cub_h, N, VEC2, 1, COEFF3)
	  !$ACC END HOST_DATA
	
	END DO
	
	!$ACC END DATA
	
	DEALLOCATE(ROWPTR, COLPTR, VALPTR, VEC1)
	
	DO I = 1, 10
	  PRINT '(1X, A,I7,A,E18.12)', "VEC2(", I, ")=", VEC2(I)
	END DO
	
	PRINT *, ""
	
	PRINT *, "COEFF3 =", COEFF3
	
	!----------------------------------------------------
	CONTAINS
	!----------------------------------------------------
	SUBROUTINE READ_COO(FILENAME, ROWPTR, COLPTR, VALUEPTR, N, NNZ)
	
	IMPLICIT NONE
	
	CHARACTER(LEN=*), INTENT(IN) :: FILENAME
	
	INTEGER, INTENT(OUT) :: ROWPTR(NNZ), COLPTR(NNZ)
	DOUBLE PRECISION, INTENT(OUT) :: VALUEPTR(NNZ)
	INTEGER, INTENT(INOUT) :: N, NNZ
	
	INTEGER :: NROWS, NCOLS, NNZDUM
	INTEGER :: I, STATUS
	CHARACTER(LEN=100) :: LINE

	! OPEN THE FILE
	OPEN(UNIT=57, FILE=FILENAME, STATUS='OLD', ACTION='READ')
	
	! READ THE HEADER
	READ(57, '(A)') LINE
	PRINT*, "COO FILE DESCRIPTION: ", LINE
	! READ ARRAY INFORMATION
	READ(57, *) NROWS, NCOLS, NNZDUM
	
	IF (NNZ .NE. NNZDUM) THEN
	  PRINT*, "NNZ IS INITIALIZED INCORRECTLY, CORRECT IT WITH THE FOLLOWING VALUE: ", NNZDUM
	  STOP
	END IF
	
	IF (NROWS .EQ. NCOLS .AND. N .NE. NROWS) THEN
	  PRINT*, "N IS INITIALIZED INCORRECTLY, CORRECT IT WITH THE FOLLOWING VALUE: ", NROWS
	  STOP
	ELSE IF (NROWS .NE. NCOLS) THEN
	  PRINT*, "THE MATRIX IS NOT SYMMETRIC. USE A DIFFERENT FILE"
	  STOP 
	END IF
																	
	! READ THE NON-ZERO ENTRIES
	DO I = 1, NNZ
	  READ(57, *) ROWPTR(I), COLPTR(I), VALUEPTR(I)
	END DO

	CLOSE(57)
	END SUBROUTINE READ_COO
	!----------------------------------------------------
	SUBROUTINE RAND_VEC(VEC, SEED)
	
	IMPLICIT NONE
	
	INTEGER, INTENT(IN) :: SEED
	
	DOUBLE PRECISION, INTENT(OUT) :: VEC(:)
	
	INTEGER, ALLOCATABLE, DIMENSION(:) :: SEED_temp
	INTEGER :: NSEED
	
	CALL RANDOM_SEED(SIZE = NSEED)
	ALLOCATE(SEED_temp(NSEED))
	
	SEED_temp = SEED
	CALL RANDOM_SEED(PUT=SEED_temp)
	DEALLOCATE(SEED_temp)
	
	CALL RANDOM_NUMBER(VEC)
	
	END SUBROUTINE RAND_VEC

END PROGRAM TEST_MAIN

The Makefile:
Makefile.zip (586 Bytes)

The referenced sparse matrix I’ve used:
mat.zip (12.7 MB)

The profiling results came out as follows:

CUDA API Statistics:

 Time(%)  Total Time (ns)  Num Calls  Average (ns)   Minimum (ns)  Maximum (ns)   StdDev (ns)                Name              
 -------  ---------------  ---------  -------------  ------------  ------------  -------------  -------------------------------
    69,0   11.026.705.921    400.000       27.566,0         4.093     8.489.368       28.086,0  cudaMemcpyAsync                
    17,0    2.754.990.543    700.001        3.935,0         3.151   117.025.046      139.912,0  cudaLaunchKernel               
     5,0      797.426.465          4  199.356.616,0           669   329.861.696  144.931.998,0  cudaFree                       
     2,0      421.809.473    100.000        4.218,0         3.858       342.241        1.206,0  cudaMemsetAsync                
     1,0      265.545.335    900.000          295,0           237       341.989          966,0  cudaStreamGetCaptureInfo_v10010
     1,0      221.953.285    300.001          739,0           649       344.767        1.873,0  cudaStreamSynchronize          
     1,0      196.496.407    300.000          655,0           588       346.016        1.480,0  cudaEventRecord                
     1,0      158.580.843    300.000          528,0           441       347.150        1.904,0  cudaEventQuery                 
     0,0       20.469.157          1   20.469.157,0    20.469.157    20.469.157            0,0  cuMemAllocManaged              
     0,0          323.526      1.492          216,0           100         6.072          254,0  cuGetProcAddress               
     0,0          245.891          6       40.981,0         2.393       109.781       44.916,0  cudaMalloc                     
     0,0           22.300         36          619,0           338         6.019        1.008,0  cudaEventCreateWithFlags       
     0,0            7.557          4        1.889,0         1.058         2.410          609,0  cuInit                         



CUDA Kernel Statistics:

 Time(%)  Total Time (ns)  Instances  Average (ns)  Minimum (ns)  Maximum (ns)  StdDev (ns)                                                  Name                                                
 -------  ---------------  ---------  ------------  ------------  ------------  -----------  ----------------------------------------------------------------------------------------------------
    58,0    5.652.060.757    100.000      56.520,0        53.248     8.171.019     25.693,0  void cusparse::cusparseCooMMSmallKernel<(unsigned int)128, (cusparseOperation_t)0, (cusparseOperati…
    23,0    2.290.980.528    200.000      11.454,0        11.040        22.591        406,0  void nrm2_kernel<double, double, double, (int)0, (int)0, (int)128>(cublasNrm2Params<T1, T3>)        
     9,0      910.286.258    200.000       4.551,0         3.775        14.592        748,0  void dot_kernel<double, (int)128, (int)0, cublasDotParams<cublasGemvTensor<const double>, cublasGem…
     8,0      799.740.337    200.000       3.998,0         3.583        14.272        406,0  void reduce_1Block_kernel<double, (int)128, (int)7, cublasGemvTensorStridedBatched<double>, cublasG…
     0,0          242.525          1     242.525,0       242.525       242.525          0,0  __pgi_dev_cumemset_8n                                                                               



CUDA Memory Operation Statistics (by time):

 Time(%)  Total Time (ns)   Count   Average (ns)  Minimum (ns)  Maximum (ns)  StdDev (ns)              Operation            
 -------  ---------------  -------  ------------  ------------  ------------  -----------  ---------------------------------
    43,0      375.483.435  300.000       1.251,0           864        13.600        131,0  [CUDA memcpy DtoH]               
    31,0      268.208.025  100.000       2.682,0         2.272       477.116      1.503,0  [CUDA memset]                    
    24,0      208.413.987  100.000       2.084,0         1.856         2.848         68,0  [CUDA memcpy HtoD]               
     0,0        2.994.783      443       6.760,0         2.270        86.719     12.915,0  [CUDA Unified Memory memcpy HtoD]
     0,0           17.307        6       2.884,0         1.568         4.799      1.405,0  [CUDA Unified Memory memcpy DtoH]



CUDA Memory Operation Statistics (by size):

 Total (MB)   Count   Average (MB)  Minimum (MB)  Maximum (MB)  StdDev (MB)              Operation            
 ----------  -------  ------------  ------------  ------------  -----------  ---------------------------------
 41863,200   100.000  0,419         0,419         0,419         0,000        [CUDA memset]                    
 1024,800    100.000  0,010         0,010         0,010         0,000        [CUDA memcpy HtoD]               
 24,183          443  0,055         0,004         1,036         0,158        [CUDA Unified Memory memcpy HtoD]
 2,400       300.000  0,000         0,000         0,000         0,000        [CUDA memcpy DtoH]               
 0,131             6  0,022         0,004         0,049         0,020        [CUDA Unified Memory memcpy DtoH]

I’ve tried for a while to fix this, but I could not find a well-documented case similar to mine; I’m not quite sure what the appropriate OpenACC directives are either. Thanks in advance.

For better or worse, I’m not seeing the extra data movement when I profile your code. You’re using CUDA managed memory, so the OpenACC data regions are effectively no-ops and any data movement would be handled implicitly by the driver.
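
To illustrate the point, here’s a minimal sketch of my own (not taken from your code): compiled with -gpu=mem:managed, the ALLOCATE below is backed by cudaMallocManaged, so the COPYIN clause issues no explicit copy and the host sees the result of the device loop without any update directive:

PROGRAM MANAGED_DEMO
	! Sketch only: with -gpu=mem:managed the data region below is effectively
	! a no-op; the driver migrates pages when the GPU first touches VEC1.
	IMPLICIT NONE
	INTEGER, PARAMETER :: N = 52329
	INTEGER :: I
	DOUBLE PRECISION, ALLOCATABLE :: VEC1(:)
	
	ALLOCATE(VEC1(N))
	VEC1(:) = 1.0D0
	
	!$ACC DATA COPYIN(VEC1)
	!$ACC PARALLEL LOOP
	DO I = 1, N
	  VEC1(I) = 2.0D0 * VEC1(I)
	END DO
	!$ACC END DATA
	
	! With managed memory this prints 2.0 even though the clause was only COPYIN.
	PRINT *, VEC1(1)
END PROGRAM MANAGED_DEMO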

What compiler version, GPU, CPU, and OS are you using?

I tried a few older compiler versions with CUDA 11.4 on an A100, but still got a similar profile:


    ** CUDA API Summary (cuda_api_sum):

 Time (%)  Total Time (ns)  Num Calls    Avg (ns)      Med (ns)     Min (ns)    Max (ns)    StdDev (ns)             Name
 --------  ---------------  ---------  ------------  ------------  ----------  -----------  ------------  ------------------------
     88.4      482,246,710          7  68,892,387.1       1,231.0         311  185,531,228  87,955,134.9  cudaFree
      4.9       26,729,165          8   3,341,145.6       7,792.5       3,823   26,678,924   9,429,886.9  cudaLaunchKernel
      3.8       20,475,345          1  20,475,345.0  20,475,345.0  20,475,345   20,475,345           0.0  cuMemAllocManaged
      2.3       12,812,575          4   3,203,143.8      26,893.5       7,204   12,751,584   6,365,641.3  cudaMemcpyAsync
      0.4        2,092,495          1   2,092,495.0   2,092,495.0   2,092,495    2,092,495           0.0  cuMemAllocHost_v2
      0.1          513,058          4     128,264.5       1,928.0         901      508,301     253,359.2  cudaStreamSynchronize
      0.1          345,317          8      43,164.6      13,806.5       2,981      181,670      64,070.7  cudaMalloc
      0.0          145,535          6      24,255.8       3,822.0       2,281      122,684      48,313.4  cuMemAlloc_v2
      0.0           41,358          2      20,679.0      20,679.0      17,465       23,893       4,545.3  cudaMemcpy
      0.0           30,050          1      30,050.0      30,050.0      30,050       30,050           0.0  cudaMemsetAsync
      0.0           23,707          5       4,741.4       3,237.0       2,837       10,632       3,323.2  cuMemcpyHtoDAsync_v2
      0.0           21,249         36         590.3         478.5         390        2,136         351.2  cudaEventCreateWithFlags
      0.0            6,598          4       1,649.5       1,685.5       1,434        1,793         167.6  cuInit
      0.0            4,913          3       1,637.7         975.0         856        3,082       1,252.2  cudaEventQuery
      0.0            4,222          3       1,407.3         910.0         791        2,521         966.3  cudaEventRecord
      0.0            1,961          1       1,961.0       1,961.0       1,961        1,961           0.0  cuStreamSynchronize
      0.0            1,737          4         434.3         365.5         125          881         321.9  cuCtxSetCurrent

Also, you can try setting the environment variable “NV_ACC_NOTIFY=2” to have the OpenACC runtime print out when it copies data to/from the device. However, I only see the updates to the Fortran array descriptors, which is nominal.

% a.out
 COO FILE DESCRIPTION:
 %%MatrixMarket matrix coordinate real symmetric
upload CUDA data  file=main.F90 function=test_main line=46 device=0 threadid=1 variable=descriptor bytes=128
upload CUDA data  file=main.F90 function=test_main line=46 device=0 threadid=1 variable=descriptor bytes=128
upload CUDA data  file=main.F90 function=test_main line=46 device=0 threadid=1 variable=descriptor bytes=128
upload CUDA data  file=main.F90 function=test_main line=46 device=0 threadid=1 variable=descriptor bytes=128
upload CUDA data  file=main.F90 function=test_main line=46 device=0 threadid=1 variable=descriptor bytes=128
 VEC2(      1)=0.318294736206E+05
 VEC2(      2)=0.301576735205E+04
 VEC2(      3)=0.102868564206E+05
 VEC2(      4)=0.941770843568E+03
 VEC2(      5)=-.939634371297E+03
 VEC2(      6)=0.568890630369E+05
 VEC2(      7)=0.458691286003E+05
 VEC2(      8)=0.390423407609E+05
 VEC2(      9)=0.145847183304E+05
 VEC2(     10)=0.122101395388E+05

 COEFF3 =    0.000000000000000

Hello Mat, thanks for the reply. I’ll list the compiler version, CPU, GPU, and the OS:

nvfortran --version
nvfortran 21.9-0 64-bit target on x86-64 Linux -tp skylake 
NVIDIA Compilers and Tools
Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
lscpu
Architecture:           x86_64
  CPU op-mode(s):       32-bit, 64-bit
  Address sizes:        46 bits physical, 48 bits virtual
  Byte Order:           Little Endian
CPU(s):                 72
  On-line CPU(s) list:  0-71
Vendor ID:              GenuineIntel
  Model name:           Intel(R) Xeon(R) Gold 6254 CPU @ 3.10GHz
    CPU family:         6
    Model:              85
    Thread(s) per core: 2
    Core(s) per socket: 18
    Socket(s):          2
    Stepping:           7
    CPU max MHz:        4000,0000
    CPU min MHz:        1200,0000
    BogoMIPS:           6200.00
    Flags:              fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca c
                        mov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm 
                        pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_per
                        fmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid ap
                        erfmperf pni pclmulqdq dtes64 monitor ds_cpl smx est tm2 s
                        sse3 sdbg fma cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic
                         movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand
                         lahf_lm abm 3dnowprefetch cpuid_fault epb cat_l3 cdp_l3 i
                        nvpcid_single intel_ppin ssbd mba ibrs ibpb stibp ibrs_enh
                        anced fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms inv
                        pcid rtm cqm mpx rdt_a avx512f avx512dq rdseed adx smap cl
                        flushopt clwb intel_pt avx512cd avx512bw avx512vl xsaveopt
                         xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total
                         cqm_mbm_local dtherm ida arat pln pts hwp hwp_act_window 
                        hwp_epp hwp_pkg_req pku ospke avx512_vnni md_clear flush_l
                        1d arch_capabilities
Caches (sum of all):    
  L1d:                  1,1 MiB (36 instances)
  L1i:                  1,1 MiB (36 instances)
  L2:                   36 MiB (36 instances)
  L3:                   49,5 MiB (2 instances)
NUMA:                   
  NUMA node(s):         2
  NUMA node0 CPU(s):    0-17,36-53
  NUMA node1 CPU(s):    18-35,54-71
Vulnerabilities:        
  Itlb multihit:        KVM: Mitigation: VMX unsupported
  L1tf:                 Not affected
  Mds:                  Not affected
  Meltdown:             Not affected
  Mmio stale data:      Mitigation; Clear CPU buffers; SMT vulnerable
  Retbleed:             Mitigation; Enhanced IBRS
  Spec store bypass:    Mitigation; Speculative Store Bypass disabled via prctl an
                        d seccomp
  Spectre v1:           Mitigation; usercopy/swapgs barriers and __user pointer sa
                        nitization
  Spectre v2:           Mitigation; Enhanced IBRS, IBPB conditional, RSB filling, 
                        PBRSB-eIBRS SW sequence
  Srbds:                Not affected
  Tsx async abort:      Mitigation; Clear CPU buffers; SMT vulnerable
inxi -G
Graphics:
  Device-1: NVIDIA GV100GL [Quadro GV100] driver: nvidia v: 550.120
  Device-2: NVIDIA GP107GL [Quadro P620] driver: nvidia v: 550.120
  Display: server: X.org v: 1.21.1.4 with: Xwayland v: 22.1.5 driver: N/A
    tty: 82x45
  API: OpenGL Message: GL data unavailable in console. Try -G --display
cat /etc/os-release
NAME="openSUSE Leap"
VERSION="15.5"
ID="opensuse-leap"
ID_LIKE="suse opensuse"
VERSION_ID="15.5"
PRETTY_NAME="openSUSE Leap 15.5"
ANSI_COLOR="0;32"
CPE_NAME="cpe:/o:opensuse:leap:15.5"
BUG_REPORT_URL="https://bugs.opensuse.org"
HOME_URL="https://www.opensuse.org/"
DOCUMENTATION_URL="https://en.opensuse.org/Portal:Leap"
LOGO="distributor-logo-Leap"

If you have any further suggestions, I’d be glad to hear them. Thanks for your help.

Unfortunately I don’t have a similar system, so I can’t reproduce your environment.

What I’d suggest is to try updating your compiler to our latest, currently 24.9, as well as the latest CUDA driver.

I don’t think the extra data movement would be coming from the OpenACC runtime, but possibly the older cuBLAS had an issue where it was copying memory even when the data is managed.

Hello Mat, I’ve tried your suggestions, using the latest Fortran compiler and CUDA driver, albeit on a different system, though it should behave mostly the same. If it matters, the system I used has an i7-10750H and an RTX 2060 Mobile, and it runs Debian 12.7. The profile is similar to yours:


 ** CUDA API Summary (cuda_api_sum):

 Time (%)  Total Time (ns)  Num Calls    Avg (ns)      Med (ns)     Min (ns)    Max (ns)   StdDev (ns)               Name             
 --------  ---------------  ---------  ------------  ------------  ----------  ----------  -----------  ------------------------------
     66.9       50,248,137    200,001         251.2         233.0         186   1,468,443      3,588.1  cudaLaunchKernel              
     27.5       20,666,658          1  20,666,658.0  20,666,658.0  20,666,658  20,666,658          0.0  cuMemAllocManaged             
      4.1        3,083,388          3   1,027,796.0   1,383,332.0         639   1,699,417    903,474.4  cudaFree                      
      0.6          469,297          1     469,297.0     469,297.0     469,297     469,297          0.0  cuMemAllocHost_v2             
      0.3          223,272      1,634         136.6         102.0          58      13,689        361.7  cuGetProcAddress_v2           
      0.2          171,449          1     171,449.0     171,449.0     171,449     171,449          0.0  cudaStreamSynchronize         
      0.2          123,305          6      20,550.8       5,718.5       1,503      58,787     26,082.1  cudaMalloc                    
      0.2          123,033          6      20,505.5       2,772.5       1,825     106,898     42,368.8  cuMemAlloc_v2                 
      0.0           19,588          5       3,917.6       2,184.0       1,786      10,853      3,898.8  cuMemcpyHtoDAsync_v2          
      0.0           13,667         36         379.6         226.5         219       1,512        322.4  cudaEventCreateWithFlags      
      0.0            4,465          5         893.0         881.0         664       1,126        163.9  cuInit                        
      0.0            2,445          1       2,445.0       2,445.0       2,445       2,445          0.0  cuStreamSynchronize           
      0.0            1,812          4         453.0         229.0          66       1,288        566.4  cuCtxSetCurrent               
      0.0              659          2         329.5         329.5         213         446        164.8  cudaGetDriverEntryPoint_v11030
      0.0              374          4          93.5          93.0          78         110         13.1  cuModuleGetLoadingMode   

I’ve compiled the file with the flags -cuda -cudalib=cublas,cusparse -acc -gpu=mem:managed -Minfo=accel -O2 -mp. More interesting, in my opinion, is that the result I get from the program is different each run:

% ./main
 COO FILE DESCRIPTION: 
 %%MatrixMarket matrix coordinate real symmetric                                                     
 VEC2(      1)=0.318294736206E+05
 VEC2(      2)=0.301576735205E+04
 VEC2(      3)=0.154936627654E+03
 VEC2(      4)=0.941770843568E+03
 VEC2(      5)=0.651393963001E+06
 VEC2(      6)=-.319008443587E+03
 VEC2(      7)=0.291472920584E+03
 VEC2(      8)=0.291472920584E+03
 VEC2(      9)=0.564062103306E+04
 VEC2(     10)=0.126752489920E+05
 
 COEFF3 =    0.000000000000000
% NV_ACC_NOTIFY=2 ./main
 COO FILE DESCRIPTION: 
 %%MatrixMarket matrix coordinate real symmetric                                                     
upload CUDA data  file=/home/tarik/Desktop/OpenACC Testing/pcg_cu/memcpyasync_testing/test_main.f90 function=test_main line=46 device=0 threadid=1 variable=descriptor bytes=128
upload CUDA data  file=/home/tarik/Desktop/OpenACC Testing/pcg_cu/memcpyasync_testing/test_main.f90 function=test_main line=46 device=0 threadid=1 variable=descriptor bytes=128
upload CUDA data  file=/home/tarik/Desktop/OpenACC Testing/pcg_cu/memcpyasync_testing/test_main.f90 function=test_main line=46 device=0 threadid=1 variable=descriptor bytes=128
upload CUDA data  file=/home/tarik/Desktop/OpenACC Testing/pcg_cu/memcpyasync_testing/test_main.f90 function=test_main line=46 device=0 threadid=1 variable=descriptor bytes=128
upload CUDA data  file=/home/tarik/Desktop/OpenACC Testing/pcg_cu/memcpyasync_testing/test_main.f90 function=test_main line=46 device=0 threadid=1 variable=descriptor bytes=128
 VEC2(      1)=0.318294736206E+05
 VEC2(      2)=0.301576735205E+04
 VEC2(      3)=0.154936627654E+03
 VEC2(      4)=0.941770843568E+03
 VEC2(      5)=-.939634371297E+03
 VEC2(      6)=0.618591692514E+03
 VEC2(      7)=-.146400416129E+04
 VEC2(      8)=0.401647919323E+04
 VEC2(      9)=-.742575744290E+02
 VEC2(     10)=-.291472920584E+03
 
 COEFF3 =    0.000000000000000 

I couldn’t profile the kernel calls within the program, which is why I don’t know for certain whether they work. I do know that the scalar value returned by the program should not be zero. I’m curious about the reason this occurs, and I’m thankful for your help in advance.
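
For reference, this is the kind of check I could add after each cuBLAS call inside the loop to see whether the calls report errors (only a sketch; IERR is a new integer I would have to declare, and CUBLAS_STATUS_SUCCESS / cudaSuccess come from the modules already used):

	!$ACC HOST_DATA USE_DEVICE(VEC2)
	cub_stat = cublasDnrm2(cub_h, N, VEC2, 1, COEFF3)
	!$ACC END HOST_DATA
	IF (cub_stat .NE. CUBLAS_STATUS_SUCCESS) PRINT *, "cublasDnrm2: ", cub_stat
	
	IERR = cudaDeviceSynchronize()            ! wait for the asynchronous kernel
	IERR = cudaGetLastError()                 ! pick up any deferred launch error
	IF (IERR .NE. cudaSuccess) PRINT *, TRIM(cudaGetErrorString(IERR))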

I think there are other issues with your code, though exactly what, I’m not sure. For example, if I remove managed memory and instead rely on the OpenACC data directives, then I see a crash in the cuSparse “coo” kernel. My best guess is that there’s something wrong with “vecOne” and “vecTwo”, but I’m not an expert in using cuSparse.
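
If it helps narrow things down, one cross-check I’d try (rough sketch only, I haven’t run it) is to compare VEC2 after a cusparseSpMM call against a plain CPU loop over the same stored COO triplets. REF would be a new DOUBLE PRECISION array of size N and IERR a new integer; since beta is 0 and VEC1 never changes, VEC2 should equal A*VEC1 on every iteration:

	! Placed right after the cusparseSpMM call, outside the HOST_DATA region.
	IERR = cudaDeviceSynchronize()            ! make sure the SpMM has finished
	!$ACC UPDATE SELF(VEC2)                   ! no-op with managed memory, needed without it
	
	REF(:) = 0.0D0
	DO I = 1, NNZ                             ! same 1-based COO triplets given to cuSPARSE
	  REF(ROWPTR(I)) = REF(ROWPTR(I)) + VALPTR(I) * VEC1(COLPTR(I))
	END DO
	
	PRINT *, "MAX |VEC2 - REF| =", MAXVAL(ABS(VEC2 - REF))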

I’ll ask Brent to take a look, but if he doesn’t know either, then we may need to get advice from the cuSparse folks.

-Mat

Hello Mat, thank you for your response. I’m waiting for further input before proceeding. Thanks to everyone for their help in advance.

Hey, are there any updates on this?