Hi,
I am trying to profile caffe with nvprof.
The problem is that when Caffe reaches around 9000 iterations, it crashes with a segmentation fault.
However, if I run the same application without nvprof it finishes successfully.
The command that I am running is:
/usr/local/cuda-8.0/bin/nvprof --profile-child-processes --print-gpu-summary ./build/tools/caffe train …
The output of the error:
[…]
I0110 11:33:31.030156 119795 sgd_solver.cpp:106] Iteration 9200, lr = 0.001
*** Aborted at 1484066012 (unix time) try "date -d @1484066012" if you are using GNU date ***
PC: @ 0x0 (unknown)
*** SIGSEGV (@0x0) received by PID 119795 (TID 0x3fffb7fff190) from PID 0; stack trace: ***
@ 0x3fffb7f90478 ([vdso]+0x477)
@ 0x3fff9aa2b908 (unknown)
@ 0x3fff9a9ec6ec (unknown)
@ 0x3fff9a9e7178 (unknown)
@ 0x3fff9b2702ac (unknown)
@ 0x3fff9b0ad858 (unknown)
@ 0x3fff9b1452ac (unknown)
@ 0x3fff9b20b288 (unknown)
@ 0x3fff9b19dcc8 (unknown)
@ 0x3fff9b0b6094 (unknown)
@ 0x3fff9b0b7134 (unknown)
@ 0x3fff9afabe9c (unknown)
@ 0x3fff9afabfdc (unknown)
@ 0x3fff9b13e770 cuMemcpy
@ 0x3fffb700c578 (unknown)
@ 0x3fffb6fe1870 (unknown)
@ 0x3fffb7027658 cudaMemcpy
@ 0x3fffb78cc738 caffe::caffe_copy<>()
@ 0x3fffb78e5f9c caffe::BasePrefetchingDataLayer<>::Forward_gpu()
@ 0x3fffb786d8d0 caffe::Net<>::ForwardFromTo()
@ 0x3fffb786dca8 caffe::Net<>::Forward()
@ 0x3fffb788c690 caffe::Solver<>::Step()
@ 0x3fffb788ce68 caffe::Solver<>::Solve()
@ 0x3fffb78e4f08 caffe::P2PSync<>::Run()
@ 0x10013668 train()
@ 0x100109a0 main
@ 0x3fffb6dd4700 generic_start_main.isra.0
@ 0x3fffb6dd48f4 __libc_start_main
@ 0x0 (unknown)
==119795== Profiling application: ./build/tools/caffe train --solver=examples/cifar10/cifar10_full_solver.prototxt -gpu all
==119795== Profiling result:
Time(%) Time Calls Avg Min Max Name
30.43% 28.9407s 15176 1.9070ms 620.35us 3.4906ms void cudnn::detail::divNorm_bw_45d_kernel<float, float, bool=0>(cudnnTensorStruct, float const , float const , cudnnTensorStruct, float const , float const , float const , float const , float, cudnnTensorStruct, cudnnLRNStruct)
17.24% 16.3984s 95880 171.03us 43.648us 362.82us void cudnn::detail::implicit_convolve_sgemm<float, int=128, int=5, int=5, int=3, int=3, int=3, int=1, bool=1, bool=0>(int, int, int, float const , int, cudnn::detail::implicit_convolve_sgemm<float, int=128, int=5, int=5, int=3, int=3, int=3, int=1, bool=1, bool=0>, float const , kernel_conv_params, int, float, float)
16.65% 15.8349s 23970 660.61us 273.98us 1.2552ms void cudnn::detail::precomputed_convolve_sgemm<float, int=512, int=6, int=8, int=3, int=3, int=5, int=1, bool=1>(int, int, int, float const , int, cudnn::detail::precomputed_convolve_sgemm<float, int=512, int=6, int=8, int=3, int=3, int=5, int=1, bool=1>, float const , kernel_conv_params, int, float, float, int)
7.82% 7.43760s 22764 326.73us 203.55us 749.47us void cudnn::detail::wgrad_alg0_engine<float, int=512, int=6, int=5, int=3, int=3, int=3, bool=1, int=512>(int, int, int, float const , int, cudnn::detail::wgrad_alg0_engine<float, int=512, int=6, int=5, int=3, int=3, int=3, bool=1, int=512>, float const , kernel_grad_params, int, float, int, int)
7.47% 7.10733s 15176 468.33us 342.02us 763.68us void cudnn::detail::dgrad_alg1_engine<float, int=512, int=6, int=5, int=3, int=3, int=3, bool=1, bool=0>(int, int, int, float const , int, float const , int, cudnn::detail::dgrad_alg1_engine<float, int=512, int=6, int=5, int=3, int=3, int=3, bool=1, bool=0>, kernel_grad_params, int, int, float, int)
7.42% 7.05360s 15980 441.40us 75.264us 843.94us void cudnn::detail::divNorm_fw_45d_kernel<float, float, bool=0>(cudnnTensorStruct, float const , float const , cudnnTensorStruct, float const , float const , float, cudnnTensorStruct, cudnnLRNStruct)
5.53% 5.26066s 22764 231.10us 27.296us 444.83us void calc_bias_diff<int=2, float, float, int=128, int=0>(cudnnTensorStruct, float const , cudnnTensorStruct, float, float, float, int)
2.14% 2.03573s 7588 268.28us 266.91us 298.85us void caffe::MaxPoolBackward(int, float const , int const , float const , int, int, int, int, int, int, int, int, int, int, int, int, caffe::MaxPoolBackward)
0.68% 646.09ms 15178 42.567us 30.272us 60.128us void cudnn::detail::pooling_bw_kernel_avg<float, float, cudnn::detail::averpooling_func, int=1>(cudnnTensorStruct, float const , float const , cudnn::detail::pooling_bw_kernel_avg<float, float, cudnn::detail::averpooling_func, int=1>, float const , cudnn::detail::pooling_bw_kernel_avg<float, float, cudnn::detail::averpooling_func, int=1>, cudnnTensorStruct, cudnnPoolingStruct, float, cudnnPoolingStruct, int, cudnn::reduced_divisor, float)
0.53% 499.49ms 7990 62.514us 61.376us 70.240us void caffe::MaxPoolForward(int, float const , int, int, int, int, int, int, int, int, int, int, int, int, caffe::MaxPoolForward, int, float const )
0.45% 428.62ms 7990 53.644us 52.448us 56.192us void add_tensor_kernel_v3<int=2, float, float, int=64, int=1, int=2, int=4, int=2>(cudnnTensorStruct, float, cudnnTensorStruct, float const , float, float)
0.45% 424.04ms 15176 27.941us 11.808us 128.22us void setTensor4d_kernel<float, float, int=16, int=16>(cudnnTensor4dStruct, float, float)
0.42% 397.82ms 22765 17.474us 7.7120us 25.088us void cudnn::detail::activation_bw_4d_kernel<float, float, int=128, int=1, int=4, cudnn::detail::relu_func<float, cudnnNanPropagation_t=1, bool=0>>(cudnnTensorStruct, float const , float const , cudnn::detail::activation_bw_4d_kernel<float, float, int=128, int=1, int=4, cudnn::detail::relu_func<float, cudnnNanPropagation_t=1, bool=0>>, float const , cudnnTensorStruct, float, cudnnTensorStruct, int, double)
0.39% 366.97ms 32068 11.443us 543ns 49.920us [CUDA memcpy HtoD]
0.38% 362.24ms 7990 45.336us 43.744us 51.520us void gemm_kernel1x1_core<float, bool=0, bool=0, bool=0, bool=1, bool=0>(float, float const , float const , int, int, int, int, int, int, float, float, float, float, int)
0.28% 269.51ms 15980 16.865us 12.064us 23.872us void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func, int=1>(cudnnTensorStruct, float const , cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func, int=1>, cudnnTensorStruct, cudnnPoolingStruct, float, cudnnPoolingStruct, int, cudnn::reduced_divisor, float)
0.23% 223.45ms 23970 9.3220us 4.8640us 16.672us void cudnn::detail::activation_fw_4d_kernel<float, float, int=128, int=1, int=4, cudnn::detail::relu_func<float, cudnnNanPropagation_t=1, bool=0>>(cudnnTensorStruct, float const , cudnn::detail::activation_fw_4d_kernel<float, float, int=128, int=1, int=4, cudnn::detail::relu_func<float, cudnnNanPropagation_t=1, bool=0>>, cudnnTensorStruct, float, cudnnTensorStruct, int, double)
0.21% 204.43ms 15980 12.792us 10.879us 17.408us void add_tensor_kernel_v3<int=2, float, float, int=32, int=1, int=4, int=2, int=2>(cudnnTensorStruct, float, cudnnTensorStruct, float const , float, float)
0.14% 132.39ms 7590 17.442us 16.704us 18.848us maxwell_sgemm_128x64_raggedMn_nn
0.14% 132.38ms 22764 5.8150us 2.8160us 9.4400us void cudnn::detail::scale_filter_kernel<int=16, int=16>(cudnnFilter4dStruct, float, float)
0.13% 126.93ms 7590 16.722us 16.288us 21.439us gen_kmul4_sgemmNT2_core(float const , int, float const , int, float, int, int, int, int, float const , float const , float, float, int)
0.09% 83.902ms 7589 11.055us 10.720us 12.160us [CUDA memcpy PtoP]
0.09% 83.588ms 56910 1.4680us 1.0870us 3.0720us void axpy_kernel_val<float, float, int=0>(cublasAxpyParamsVal<float, float, float>)
0.08% 78.356ms 46734 1.6760us 1.5360us 3.3920us caffe::sync_conv_groups(void)
0.08% 72.626ms 23970 3.0290us 2.4960us 5.6640us void kern_precompute_indices<bool=0>(int, int, int, int, int, int, int)
0.07% 68.371ms 23570 2.9000us 1.4720us 6.7520us [CUDA memcpy DtoD]
0.05% 48.440ms 7590 6.3820us 5.6640us 7.1360us void gemv2N_kernel_val<float, float, float, int=128, int=32, int=4, int=4, int=1>(float, float, cublasGemv2Params_v2<float, float, float>)
0.05% 47.178ms 30352 1.5540us 1.1200us 2.2720us void caffe::SGDUpdate(int, float, float, caffe::SGDUpdate, caffe::SGDUpdate)
0.05% 43.337ms 15980 2.7110us 2.1750us 3.6160us void asum_kernel<float, float, int=0>(cublasAsumParams<float, float>)
0.04% 42.392ms 7990 5.3050us 4.9600us 6.0160us void cudnn::detail::softmax_fw_kernel<int=2, float, float, int=256, int=1, int=1, int=0>(cudnnTensorStruct, float const , cudnn::detail::softmax_fw_kernel<int=2, float, float, int=256, int=1, int=1, int=0>, cudnnTensorStruct, int, float, cudnnTensorStruct, int, int)
0.04% 41.438ms 15980 2.5930us 2.0160us 3.6800us void dot_kernel<float, float, float, int=128, int=0, int=0>(cublasDotParams<float, float>)
0.04% 36.933ms 68376 540ns 511ns 6.3680us [CUDA memset]
0.04% 33.534ms 15980 2.0980us 1.4400us 3.1360us void reduce_1Block_kernel<float, float, float, int=128, int=7>(float, int, float*)
0.03% 30.348ms 7990 3.7980us 3.2000us 4.6080us void caffe::SoftmaxLossForwardGPU(int, float const , float const , caffe::SoftmaxLossForwardGPU, int, int, int, bool, int, float const *)
0.03% 28.461ms 7590 3.7490us 3.4880us 4.2880us void caffe::SoftmaxLossBackwardGPU(int, float const , float const , caffe::SoftmaxLossBackwardGPU, int, int, int, bool, int, float const *)
0.03% 23.908ms 7990 2.9920us 2.6560us 3.6160us void gemmk1_kernel<float, int=256, int=5, bool=0, bool=0, bool=0, bool=0>(cublasGemmk1Params, float const *, float const , float)
0.02% 22.398ms 11384 1.9670us 1.4720us 2.7520us void scal_kernel_val<float, float, int=0>(cublasScalParamsVal<float, float>)
0.02% 14.462ms 24796 583ns 543ns 42.912us [CUDA memcpy DtoH]
0.01% 11.037ms 3794 2.9090us 2.2720us 3.7760us void caffe::add_kernel(int, float const , float const , caffe::add_kernel)
==119795== Warning: Some profiling data are not recorded. Make sure cudaProfilerStop() or cuProfilerStop() is called before application exit to flush profile data.
======== Error: Application received signal 11