• Network Type: Yolo_v4
• TLT Version: 4.0.1
When I convert my dataset with the default TFRecord conversion settings, training on the resulting TFRecords sometimes fails with the segmentation fault below (a sketch of the conversion spec I use is included after the log):
=================================
[cb68cf58055a:00179] *** Process received signal ***
[cb68cf58055a:00179] Signal: Segmentation fault (11)
[cb68cf58055a:00179] Signal code: (-6)
[cb68cf58055a:00179] Failing at address: 0x3e8000000b3
[cb68cf58055a:00179] [ 0] /usr/lib/x86_64-linux-gnu/libc.so.6(+0x43090)[0x7f57da102090]
[cb68cf58055a:00179] [ 1] /usr/lib/x86_64-linux-gnu/libc.so.6(+0x18b9e6)[0x7f57da24a9e6]
[cb68cf58055a:00179] [ 2] ==== backtrace (tid: 374) ====
0 0x0000000000043090 killpg() ???:0
1 0x000000000018b9e6 __nss_database_lookup() ???:0
2 0x0000000000f8400f tensorflow::batch_util::CopyElementToSlice() ???:0
3 0x0000000005f60320 std::_Function_handler<void (tensorflow::Status const&), tensorflow::data::experimental::(anonymous namespace)::MapAndBatchDatasetOp::Dataset::Iterator::CallFunction(std::shared_ptr<tensorflow::data::IteratorContext>, std::shared_ptr<tensorflow::data::experimental::(anonymous namespace)::MapAndBatchDatasetOp::Dataset::Iterator::BatchResult> const&, long long)::{lambda(tensorflow::Status)#1}>::_M_invoke() map_and_batch_dataset_op.cc:0
4 0x0000000006ba6135 std::_Function_handler<void (tensorflow::Status const&), std::_Bind<tensorflow::data::InstantiatedCapturedFunction::RunAsync(tensorflow::data::IteratorContext*, std::vector<tensorflow::Tensor, std::allocator<tensorflow::Tensor> >&&, std::vector<tensorflow::Tensor, std::allocator<tensorflow::Tensor> >*, std::function<void (tensorflow::Status const&)>, std::string const&) const::{lambda(std::function<void (tensorflow::Status const&)> const&, tensorflow::data::IteratorContext*, std::function<void ()> const&, std::string const&, std::shared_ptr<tensorflow::data::(anonymous namespace)::SimpleStepStatsCollector> const&, tensorflow::Status)#3} (std::function<void (tensorflow::Status const&)>, tensorflow::data::IteratorContext*, std::function<void ()>, std::string, std::shared_ptr<tensorflow::data::(anonymous namespace)::SimpleStepStatsCollector>, std::_Placeholder<1>)> >::_M_invoke() captured_function.cc:0
5 0x00000000011442cd std::_Function_handler<void (tensorflow::Status const&), std::_Bind<tensorflow::FunctionLibraryRuntimeImpl::Run(tensorflow::FunctionLibraryRuntime::Options const&, unsigned long long, tensorflow::CallFrameInterface*, std::function<void (tensorflow::Status const&)>)::{lambda(std::function<void (tensorflow::Status const&)>, tensorflow::Status const&)#1} (std::function<void (tensorflow::Status const&)>, std::_Placeholder<1>)> >::_M_invoke() function.cc:0
6 0x000000000118ea26 std::_Function_handler<void (tensorflow::Status const&), std::_Bind<tensorflow::ProcessFunctionLibraryRuntime::Run(tensorflow::FunctionLibraryRuntime::Options const&, unsigned long long, tensorflow::CallFrameInterface*, std::function<void (tensorflow::Status const&)>) const::{lambda(std::function<void (tensorflow::Status const&)>&, tensorflow::Status const&)#1} (std::function<void (tensorflow::Status const&)>, std::_Placeholder<1>)> >::_M_invoke() process_function_library_runtime.cc:0
7 0x000000000118ce7d std::_Function_handler<void (tensorflow::Status const&), tensorflow::ProcessFunctionLibraryRuntime::Run(tensorflow::FunctionLibraryRuntime::Options const&, unsigned long long, absl::Span<tensorflow::Tensor const>, std::vector<tensorflow::Tensor, std::allocator<tensorflow::Tensor> >*, std::function<void (tensorflow::Status const&)>) const::{lambda(tensorflow::Status const&)#1}::operator()(tensorflow::Status const&) const::{lambda(tensorflow::Status const&)#1}>::_M_invoke() process_function_library_runtime.cc:0
8 0x0000000002ba37de tensorflow::ReffedStatusCallback::~ReffedStatusCallback() ???:0
9 0x0000000001193a3a tensorflow::ProcessFunctionLibraryRuntime::CleanUp() ???:0
10 0x0000000001193bb7 std::_Function_handler<void (tensorflow::Status const&), tensorflow::ProcessFunctionLibraryRuntime::Run(tensorflow::FunctionLibraryRuntime::Options const&, unsigned long long, absl::Span<tensorflow::Tensor const>, std::vector<tensorflow::Tensor, std::allocator<tensorflow::Tensor> >*, std::function<void (tensorflow::Status const&)>) const::{lambda(tensorflow::Status const&)#1}>::_M_invoke() process_function_library_runtime.cc:0
11 0x0000000002ba37de tensorflow::ReffedStatusCallback::~ReffedStatusCallback() ???:0
12 0x00000000011961be tensorflow::ProcessFunctionLibraryRuntime::RunMultiDevice(tensorflow::FunctionLibraryRuntime::Options const&, unsigned long long, absl::Span<tensorflow::Tensor const>, std::vector<tensorflow::Tensor, std::allocator<tensorflow::Tensor> >*, std::vector<std::unique_ptr<tensorflow::ProcessFunctionLibraryRuntime::CleanUpItem, std::default_delete<tensorflow::ProcessFunctionLibraryRuntime::CleanUpItem> >, std::allocator<std::unique_ptr<tensorflow::ProcessFunctionLibraryRuntime::CleanUpItem, std::default_delete<tensorflow::ProcessFunctionLibraryRuntime::CleanUpItem> > > >*, std::function<void (tensorflow::Status const&)>) const::{lambda(tensorflow::Status const&)#5}::operator()() process_function_library_runtime.cc:0
13 0x00000000011457c7 std::_Function_handler<void (tensorflow::Status const&), tensorflow::FunctionLibraryRuntimeImpl::Run(tensorflow::FunctionLibraryRuntime::Options const&, unsigned long long, absl::Span<tensorflow::Tensor const>, std::vector<tensorflow::Tensor, std::allocator<tensorflow::Tensor> >*, std::function<void (tensorflow::Status const&)>)::{lambda(tensorflow::Status const&)#2}>::_M_invoke() function.cc:0
14 0x00000000011f1725 Eigen::ThreadPoolTempl<tensorflow::thread::EigenEnvironment>::WorkerLoop() ???:0
15 0x00000000011ee268 std::_Function_handler<void (), tensorflow::thread::EigenEnvironment::CreateThread(std::function<void ()>)::{lambda()#1}>::_M_invoke() ???:0
16 0x00000000018d69a0 execute_native_thread_routine() /dt9-src/libstdc++-v3/src/nonshared11/../c++11/thread.cc:80
17 0x00000000018d69a0 std::unique_ptr<std::thread::_State, std::default_delete<std::thread::_State> >::~unique_ptr() /dt9-build/x86_64-pc-linux-gnu/libstdc++-v3/include/bits/unique_ptr.h:292
18 0x00000000018d69a0 execute_native_thread_routine() /dt9-src/libstdc++-v3/src/nonshared11/../c++11/thread.cc:79
19 0x0000000000008609 start_thread() ???:0
20 0x000000000011f133 clone() ???:0
=================================
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/../libtensorflow_framework.so.1(_ZN10tensorflow10batch_util18CopyElementToSliceENS_6TensorEPS1_x+0x10f)[0x7f57d538200f]
[cb68cf58055a:00179] [ 3] /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/../libtensorflow_cc.so.1(+0x5f60320)[0x7f56c3f42320]
[cb68cf58055a:00179] [ 4] /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/../libtensorflow_cc.so.1(+0x6ba6135)[0x7f56c4b88135]
[cb68cf58055a:00179] [ 5] /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/../libtensorflow_framework.so.1(+0x11442cd)[0x7f57d55422cd]
[cb68cf58055a:00179] [ 6] /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/../libtensorflow_framework.so.1(+0x118ea26)[0x7f57d558ca26]
[cb68cf58055a:00179] [ 7] /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/../libtensorflow_framework.so.1(+0x118ce7d)[0x7f57d558ae7d]
[cb68cf58055a:00179] [ 8] /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/../libtensorflow_cc.so.1(_ZN10tensorflow20ReffedStatusCallbackD0Ev+0x4e)[0x7f56c0b857de]
[cb68cf58055a:00179] [ 9] /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/../libtensorflow_framework.so.1(_ZNK10tensorflow29ProcessFunctionLibraryRuntime7CleanUpEPSt6vectorISt10unique_ptrINS0_11CleanUpItemESt14default_deleteIS3_EESaIS6_EESt8functionIFvRKNS_6StatusEEE+0x2ba)[0x7f57d5591a3a]
[cb68cf58055a:00179] [10] /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/../libtensorflow_framework.so.1(+0x1193bb7)[0x7f57d5591bb7]
[cb68cf58055a:00179] [11] /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/../libtensorflow_cc.so.1(_ZN10tensorflow20ReffedStatusCallbackD0Ev+0x4e)[0x7f56c0b857de]
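For reference, my conversion spec is essentially the default KITTI-style template; it looks roughly like the sketch below (directory names, split, and shard values are placeholders, not my exact settings):

kitti_config {
  root_directory_path: "/workspace/tao-experiments/data/training"
  image_dir_name: "image_2"
  label_dir_name: "label_2"
  image_extension: ".png"
  partition_mode: "random"
  num_partitions: 2
  val_split: 14
  num_shards: 10
}
image_directory_path: "/workspace/tao-experiments/data/training"

The TFRecords are then generated with something like:

tao yolo_v4 dataset_convert -d <conversion_spec.txt> -o /workspace/tao-experiments/data/training/tfrecords/train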
However, everything works fine when I switch to the sequence format for training. Is there a problem with the TFRecord format in YOLOv4?
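For clarity, this is roughly how the two dataset_config variants I am comparing look in the training spec (paths are placeholders; other fields such as target_class_mapping are omitted):

# TFRecord format (crashes occasionally)
dataset_config {
  data_sources: {
    tfrecords_path: "/workspace/tao-experiments/data/training/tfrecords/train*"
    image_directory_path: "/workspace/tao-experiments/data/training"
  }
  ...
}

# Sequence format (works fine)
dataset_config {
  data_sources: {
    label_directory_path: "/workspace/tao-experiments/data/training/label_2"
    image_directory_path: "/workspace/tao-experiments/data/training/image_2"
  }
  ...
}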
It is worth mentioning that, in my experience, training from TFRecords works fine in most cases; the crash only occurs in a few situations.
I am not sure whether this is caused by the dataset itself or by a problem with the TFRecord format.
As a supplementary note: my custom dataset conversion script is fixed (the same script is used for every dataset), so in theory it should not be the source of the problem.