Profiling TensorFlow

sgovind · May 6, 2019, 11:10pm

Greetings all!

I am running a GAN, specifically DCGAN (https://github.com/carpedm20/DCGAN-tensorflow), using TensorFlow. I notice that about 73% of the execution time is spent in a module called dgrad_engine. Can anyone tell me what this dgrad_engine is?

Thanks,
Govind

sgovind · May 8, 2019, 5:02pm

Hi there!

For your information, I am attaching the nvprof output for the DCGAN run. Can anyone tell me what is happening? Any pointers would be very useful.

50.73% 529.09ms 202 2.6193ms 1.7008ms 3.5510ms void cudnn::detail::dgrad_engine<float, int=512, int=6, int=5, int=3, int=3, int=3, bool=1>(int, int, int, float const , int, float const , int, cudnn::detail::dgrad_engine<float, int=512, int=6, int=5, int=3, int=3, int=3, bool=1>, kernel_grad_params, int, int, float, int, int, int)
23.93% 249.54ms 202 1.2354ms 413.25us 2.0831ms void cudnn::detail::dgrad_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1>(int, int, int, float const , int, float const , int, cudnn::detail::dgrad_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1>, kernel_grad_params, int, int, float, int, int, int)
4.24% 44.209ms 400 110.52us 2.6560us 354.98us void tensorflow::functor::ShuffleInTensor3Simple<float, int=2, int=1, int=0, bool=0>(int, float const , tensorflow::functor::Dimension<int=3>, tensorflow::functor::ShuffleInTensor3Simple<float, int=2, int=1, int=0, bool=0>)
3.96% 41.267ms 400 103.17us 40.608us 212.42us void tensorflow::functor::PadInputCustomKernelNCHW<float, int=4>(int, float const , tensorflow::functor::Dimension<int=4>, tensorflow::functor::PadInputCustomKernelNCHW<float, int=4>, tensorflow::functor::Dimension, float const )
2.96% 30.837ms 400 77.092us 38.880us 143.39us void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, bool=1, int=1>(float, cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, bool=1, int=1>, cudnnTensorStruct, float const , float, cudnnTensorStruct, float, cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, bool=1, int=1> const , cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, bool=1, int=1> const , cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, bool=1, int=1> const , cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, bool=1, int=1> const , cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, bool=1, int=1>)
2.93% 30.575ms 400 76.437us 30.048us 160.03us void tensorflow::BiasNCHWKernel(int, float const , float const , tensorflow::BiasNCHWKernel, int, int)
2.47% 25.741ms 400 64.352us 8.7360us 142.75us void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, int=1, int=1, long>, int=16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const , float const >, Eigen::TensorMap<Eigen::Tensor<float const , int=1, int=1, long>, int=16, Eigen::MakePointer> const , Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op, Eigen::TensorMap<Eigen::Tensor<float const , int=1, int=1, long>, int=16, Eigen::MakePointer> const > const > const > const , Eigen::GpuDevice>, long>(float, int=1)
2.40% 25.075ms 100 250.75us 238.79us 298.53us [CUDA memcpy DtoH]
1.34% 14.015ms 408 34.350us 11.264us 68.544us void scalePackedTensor_kernel<float, float>(cudnnTensor4dStruct, float, float)
1.33% 13.853ms 2 6.9263ms 2.6264ms 11.226ms void cudnn::detail::dgrad2d_alg1_1<float, int=0, int=4, int=6, int=3, int=2, int=4, bool=1, bool=1>(int, int, int, float const , int, float const , int, cudnn::detail::dgrad2d_alg1_1<float, int=0, int=4, int=6, int=3, int=2, int=4, bool=1, bool=1>, kernel_grad_params, int, int, float, int, int)
0.72% 7.4815ms 6 1.2469ms 439.33us 4.4560ms cudnn_maxwell_gcgemm_64x64_nt_batched
0.41% 4.2772ms 100 42.771us 42.272us 43.393us void tensorflow::functor::SwapDimension1And2InTensor3UsingTiles<unsigned int, int=1024, int=2, int=1024, bool=0>(unsigned int const , tensorflow::functor::Dimension<int=3>, tensorflow::functor::SwapDimension1And2InTensor3UsingTiles<unsigned int, int=1024, int=2, int=1024, bool=0>)
0.38% 3.9506ms 100 39.505us 38.816us 41.568us sgemm_32x32x32_NN_vec
0.34% 3.4948ms 4 873.69us 10.400us 2.6387ms void fft2d_r2c_32x32<float, bool=0, unsigned int=5, bool=0>(float2, float const , int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
0.33% 3.4724ms 150 23.149us 608ns 1.1002ms [CUDA memcpy HtoD]
0.30% 3.1440ms 2 1.5720ms 514.88us 2.6291ms void cudnn::detail::dgrad2d_alg1_1<float, int=0, int=6, int=7, int=5, int=4, int=5, bool=1, bool=1>(int, int, int, float const , int, float const , int, cudnn::detail::dgrad2d_alg1_1<float, int=0, int=6, int=7, int=5, int=4, int=5, bool=1, bool=1>, kernel_grad_params, int, int, float, int, int)
0.30% 3.1030ms 15 206.86us 101.35us 701.96us void fft2d_r2c_32x32<float, bool=1, unsigned int=0, bool=0>(float2, float const , int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
0.28% 2.9091ms 100 29.091us 28.321us 29.985us void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, int=1, int=1, int>, int=16, Eigen::MakePointer>, Eigen::TensorCwiseUnaryOp<Eigen::internal::scalar_tanh_op, Eigen::TensorMap<Eigen::Tensor<float const , int=1, int=1, int>, int=16, Eigen::MakePointer> const > const > const , Eigen::GpuDevice>, int>(float, int=1)
0.22% 2.3077ms 100 23.076us 22.752us 23.584us void tensorflow::functor::SwapDimension1And2InTensor3UsingTiles<unsigned int, int=256, int=32, int=32, bool=0>(unsigned int const , tensorflow::functor::Dimension<int=3>, tensorflow::functor::SwapDimension1And2InTensor3UsingTiles<unsigned int, int=256, int=32, int=32, bool=0>)
0.15% 1.6115ms 9 179.05us 173.60us 186.47us maxwell_gcgemm_64x32_nt
0.15% 1.5889ms 100 15.889us 14.944us 19.936us void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, int=2, int=1, int>, int=16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const , int=2, int=1, int>, int=16, Eigen::MakePointer> const , Eigen::TensorBroadcastingOp<Eigen::array<long, unsigned long=2> const , Eigen::TensorMap<Eigen::Tensor<float const , int=2, int=1, int>, int=16, Eigen::MakePointer> const > const > const > const , Eigen::GpuDevice>, int>(float, int=2)
0.07% 751.14us 13 57.780us 11.392us 216.64us void fft2d_c2r_32x32<float, bool=0, bool=0, unsigned int=0, bool=0, bool=0>(float, float2 const , int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float, float*, int2, int, int)
0.06% 576.32us 2 288.16us 218.21us 358.12us void fft2d_c2r_32x32<float, bool=0, bool=0, unsigned int=1, bool=0, bool=0>(float*, float2 const , int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float, float*, int2, int, int)