Hi there,
I ran my CUDA code on a Titan V (CUDA 9.2) and a P4000 (CUDA 10.1). The program mainly uses CUDA streams (cudaStream). The performance surprised me: the execution time on the Titan V (~13.5 s) is slower than on the P4000 (~4 s). The nvprof logs are shown below.
Thanks for the help.
Titan V
==29799== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 20.63% 10.7626s 1 10.7626s 10.7626s 10.7626s dissociation_excitedwater_b1a1(float*, float*, float*, int*, int*)
20.62% 10.7613s 1 10.7613s 10.7613s 10.7613s dissociation_excitedwater_rd(float*, float*, float*, int*, int*)
20.59% 10.7459s 1 10.7459s 10.7459s 10.7459s dissociation_dissociativewater(float*, float*, float*, int*, int*)
17.15% 8.94757s 1 8.94757s 8.94757s 8.94757s dissociation_excitedwater_a1b1(float*, float*, float*, int*, int*)
16.41% 8.56250s 1 8.56250s 8.56250s 8.56250s dissociation_ionizedwater(float*, float*, float*, int*, int*)
4.58% 2.39087s 1 2.39087s 2.39087s 2.39087s thermalisation_subexelectrons(float*, float*, float*, float*, int*, int*, int*)
0.01% 7.7554ms 1 7.7554ms 7.7554ms 7.7554ms setupcuseed(void)
0.00% 620.70us 10 62.069us 1.5360us 106.43us [CUDA memcpy DtoH]
0.00% 560.60us 46 12.187us 1.1200us 96.543us [CUDA memcpy HtoD]
0.00% 17.312us 1 17.312us 17.312us 17.312us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__copy_if::CopyIfAgent<thrust::zip_iterator<thrust::tuple<thrust::detail::normal_iterator<thrust::device_ptr<int>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, thrust::cuda_cub::__copy_if::no_stencil_tag_*, thrust::zip_iterator<thrust::tuple<thrust::detail::normal_iterator<thrust::device_ptr<int>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, thrust::detail::unary_negate<first_element_equal_255>, int, int*>, thrust::zip_iterator<thrust::tuple<thrust::detail::normal_iterator<thrust::device_ptr<int>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, thrust::cuda_cub::__copy_if::no_stencil_tag_*, thrust::zip_iterator<thrust::tuple<thrust::detail::normal_iterator<thrust::device_ptr<int>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, thrust::detail::unary_negate<first_element_equal_255>, int, int*, thrust::cuda_cub::cub::ScanTileState<int, bool=1>, unsigned 
long>(thrust::device_ptr<int>, thrust::detail::normal_iterator<thrust::device_ptr<int>>, float, thrust::device_ptr<float>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::null_type)
0.00% 4.7990us 2 2.3990us 2.2720us 2.5270us [CUDA memcpy HtoA]
0.00% 1.7600us 1 1.7600us 1.7600us 1.7600us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__copy_if::InitAgent<thrust::cuda_cub::cub::ScanTileState<int, bool=1>, int*, int>, thrust::cuda_cub::cub::ScanTileState<int, bool=1>, unsigned long, int*>(bool=1, thrust::cuda_cub::cub::ScanTileState<int, bool=1>, int*)
API calls: 78.00% 10.9146s 5 2.18292s 6.0800us 10.9145s cudaDeviceSynchronize
19.55% 2.73574s 2 1.36787s 4.4130us 2.73573s cudaStreamSynchronize
2.29% 320.17ms 8 40.022ms 5.0950us 318.86ms cudaMalloc
0.06% 7.7562ms 1 7.7562ms 7.7562ms 7.7562ms cudaThreadSynchronize
0.04% 6.2802ms 10 628.02us 29.930us 1.6997ms cudaMemcpyAsync
0.01% 1.9322ms 192 10.063us 367ns 414.20us cuDeviceGetAttribute
0.01% 1.8378ms 9 204.20us 22.964us 1.4845ms cudaLaunchKernel
0.01% 1.0573ms 1 1.0573ms 1.0573ms 1.0573ms cudaDeviceReset
0.01% 852.25us 10 85.225us 23.150us 130.74us cudaMemcpy
0.01% 707.79us 8 88.474us 1.3040us 207.03us cudaFree
0.00% 558.08us 2 279.04us 263.68us 294.40us cuDeviceTotalMem
0.00% 311.59us 36 8.6550us 6.4330us 56.632us cudaMemcpyToSymbol
0.00% 166.86us 2 83.431us 81.136us 85.726us cuDeviceGetName
0.00% 120.54us 2 60.272us 3.1980us 117.35us cudaMallocArray
0.00% 101.77us 2 50.883us 8.4390us 93.328us cudaFreeArray
0.00% 38.540us 5 7.7080us 4.9520us 17.549us cudaStreamCreate
0.00% 30.546us 5 6.1090us 4.2140us 11.923us cudaStreamDestroy
0.00% 19.932us 2 9.9660us 6.6230us 13.309us cudaFuncGetAttributes
0.00% 15.558us 2 7.7790us 5.5440us 10.014us cudaMemcpyToArray
0.00% 13.933us 1 13.933us 13.933us 13.933us cudaSetDevice
0.00% 5.6580us 2 2.8290us 1.2760us 4.3820us cudaBindTextureToArray
0.00% 5.3770us 2 2.6880us 1.1480us 4.2290us cudaUnbindTexture
0.00% 5.2600us 2 2.6300us 1.9520us 3.3080us cuDeviceGetPCIBusId
0.00% 4.2110us 4 1.0520us 460ns 2.0690us cudaGetDevice
0.00% 3.7030us 4 925ns 441ns 1.6860us cudaDeviceGetAttribute
0.00% 3.4030us 3 1.1340us 396ns 2.0830us cuDeviceGetCount
0.00% 3.1960us 4 799ns 419ns 1.4720us cuDeviceGet
0.00% 1.4940us 3 498ns 270ns 825ns cudaGetLastError
0.00% 989ns 4 247ns 181ns 352ns cudaPeekAtLastError
0.00% 929ns 2 464ns 341ns 588ns cudaGetChannelDesc
P4000
==29339== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 25.39% 1.92013s 1 1.92013s 1.92013s 1.92013s dissociation_ionizedwater(float*, float*, float*, int*, int*)
19.83% 1.50023s 1 1.50023s 1.50023s 1.50023s dissociation_excitedwater_b1a1(float*, float*, float*, int*, int*)
17.68% 1.33765s 1 1.33765s 1.33765s 1.33765s dissociation_excitedwater_rd(float*, float*, float*, int*, int*)
13.16% 995.54ms 1 995.54ms 995.54ms 995.54ms dissociation_excitedwater_a1b1(float*, float*, float*, int*, int*)
12.01% 908.45ms 1 908.45ms 908.45ms 908.45ms dissociation_dissociativewater(float*, float*, float*, int*, int*)
11.55% 873.78ms 1 873.78ms 873.78ms 873.78ms thermalisation_subexelectrons(float*, float*, float*, float*, int*, int*, int*)
0.34% 25.885ms 1 25.885ms 25.885ms 25.885ms setupcuseed(void)
0.01% 1.1205ms 10 112.05us 768ns 179.49us [CUDA memcpy DtoH]
0.01% 958.00us 46 20.825us 608ns 174.40us [CUDA memcpy HtoD]
0.00% 47.073us 1 47.073us 47.073us 47.073us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__copy_if::CopyIfAgent<thrust::zip_iterator<thrust::tuple<thrust::detail::normal_iterator<thrust::device_ptr<int>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, thrust::cuda_cub::__copy_if::no_stencil_tag_*, thrust::zip_iterator<thrust::tuple<thrust::detail::normal_iterator<thrust::device_ptr<int>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, thrust::detail::unary_negate<first_element_equal_255>, int, int*>, thrust::zip_iterator<thrust::tuple<thrust::detail::normal_iterator<thrust::device_ptr<int>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, thrust::cuda_cub::__copy_if::no_stencil_tag_*, thrust::zip_iterator<thrust::tuple<thrust::detail::normal_iterator<thrust::device_ptr<int>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, thrust::detail::unary_negate<first_element_equal_255>, int, int*, thrust::cuda_cub::cub::ScanTileState<int, bool=1>, unsigned 
long>(thrust::device_ptr<int>, thrust::detail::normal_iterator<thrust::device_ptr<int>>, float, thrust::device_ptr<float>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::null_type)
0.00% 3.9360us 2 1.9680us 1.9520us 1.9840us [CUDA memcpy HtoA]
0.00% 1.0880us 1 1.0880us 1.0880us 1.0880us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__copy_if::InitAgent<thrust::cuda_cub::cub::ScanTileState<int, bool=1>, int*, int>, thrust::cuda_cub::cub::ScanTileState<int, bool=1>, unsigned long, int*>(bool=1, thrust::cuda_cub::cub::ScanTileState<int, bool=1>, int*)
API calls: 73.19% 2.85564s 5 571.13ms 1.2560us 2.85558s cudaDeviceSynchronize
23.24% 906.68ms 2 453.34ms 1.9820us 906.67ms cudaStreamSynchronize
2.71% 105.71ms 8 13.213ms 5.2270us 105.31ms cudaMalloc
0.66% 25.888ms 1 25.888ms 25.888ms 25.888ms cudaThreadSynchronize
0.07% 2.7327ms 10 273.27us 15.910us 580.26us cudaMemcpyAsync
0.07% 2.5458ms 9 282.87us 8.8240us 2.3514ms cudaLaunchKernel
0.03% 1.1119ms 10 111.19us 52.750us 214.12us cudaMemcpy
0.01% 367.12us 8 45.889us 1.9290us 171.02us cudaFree
0.01% 305.01us 1 305.01us 305.01us 305.01us cudaDeviceReset
0.01% 231.17us 36 6.4210us 4.4840us 40.333us cudaMemcpyToSymbol
0.00% 148.84us 1 148.84us 148.84us 148.84us cuDeviceTotalMem
0.00% 104.93us 97 1.0810us 100ns 44.627us cuDeviceGetAttribute
0.00% 92.328us 2 46.164us 3.6840us 88.644us cudaMallocArray
0.00% 76.192us 2 38.096us 6.1380us 70.054us cudaFreeArray
0.00% 47.226us 1 47.226us 47.226us 47.226us cuDeviceGetName
0.00% 15.776us 2 7.8880us 5.1180us 10.658us cudaMemcpyToArray
0.00% 13.963us 5 2.7920us 1.5340us 7.0260us cudaStreamDestroy
0.00% 12.460us 5 2.4920us 1.0860us 7.9290us cudaStreamCreate
0.00% 8.6190us 2 4.3090us 1.9910us 6.6280us cudaFuncGetAttributes
0.00% 4.7870us 2 2.3930us 1.4890us 3.2980us cudaBindTextureToArray
0.00% 3.5200us 2 1.7600us 695ns 2.8250us cudaUnbindTexture
0.00% 2.8140us 1 2.8140us 2.8140us 2.8140us cudaSetDevice
0.00% 2.4750us 1 2.4750us 2.4750us 2.4750us cuDeviceGetPCIBusId
0.00% 1.7220us 4 430ns 226ns 966ns cudaGetDevice
0.00% 1.4440us 4 361ns 204ns 764ns cudaDeviceGetAttribute
0.00% 1.1200us 3 373ns 113ns 852ns cuDeviceGetCount
0.00% 1.0260us 2 513ns 114ns 912ns cuDeviceGet
0.00% 961ns 2 480ns 380ns 581ns cudaGetChannelDesc
0.00% 440ns 4 110ns 80ns 159ns cudaPeekAtLastError
0.00% 196ns 1 196ns 196ns 196ns cuDeviceGetUuid
0.00% 146ns 1 146ns 146ns 146ns cudaGetLastError