I am trying to train a Resnet18 model with 5 classes with TLT 2.0 . Here’s my detectnet_v2_train_resnet18_kitti.txt
random_seed: 42
dataset_config {
data_sources {
tfrecords_path: “/workspace/tlt-experiments/data/tfrecords/kitti_trainval/*”
image_directory_path: “/workspace/tlt-experiments/data/training”
}
image_extension: “png”
target_class_mapping {
key: “car”
value: “car”
}
target_class_mapping {
key: “cyclist”
value: “cyclist”
}
target_class_mapping {
key: “tram”
value: “tram”
}
target_class_mapping {
key: “van”
value: “van”
}
target_class_mapping {
key: “truck”
value: “truck”
}
validation_fold: 0
}
augmentation_config {
preprocessing {
output_image_width: 1248
output_image_height: 384
min_bbox_width: 1.0
min_bbox_height: 1.0
output_image_channel: 3
}
spatial_augmentation {
hflip_probability: 0.5
zoom_min: 1.0
zoom_max: 1.0
translate_max_x: 8.0
translate_max_y: 8.0
}
color_augmentation {
hue_rotation_max: 25.0
saturation_shift_max: 0.20000000298
contrast_scale_max: 0.10000000149
contrast_center: 0.5
}
}
postprocessing_config {
target_class_config {
key: “car”
value {
clustering_config {
coverage_threshold: 0.00499999988824
dbscan_eps: 0.20000000298
dbscan_min_samples: 0.0500000007451
minimum_bounding_box_height: 20
}
}
}
target_class_config {
key: “cyclist”
value {
clustering_config {
coverage_threshold: 0.00499999988824
dbscan_eps: 0.15000000596
dbscan_min_samples: 0.0500000007451
minimum_bounding_box_height: 20
}
}
}
target_class_config {
key: “truck”
value {
clustering_config {
coverage_threshold: 0.00749999983236
dbscan_eps: 0.230000004172
dbscan_min_samples: 0.0500000007451
minimum_bounding_box_height: 20
}
}
}
target_class_config {
key: “van”
value {
clustering_config {
coverage_threshold: 0.00499999988824
dbscan_eps: 0.20000000298
dbscan_min_samples: 0.0500000007451
minimum_bounding_box_height: 20
}
}
}
target_class_config {
key: “tram”
value {
clustering_config {
coverage_threshold: 0.00499999988824
dbscan_eps: 0.20000000298
dbscan_min_samples: 0.0500000007451
minimum_bounding_box_height: 20
}
}
}
}
model_config {
pretrained_model_file: “/workspace/tlt-experiments/detectnet_v2/pretrained_resnet18/tlt_pretrained_detectnet_v2_vresnet18/resnet18.hdf5”
num_layers: 18
use_batch_norm: true
objective_set {
bbox {
scale: 35.0
offset: 0.5
}
cov {
}
}
training_precision {
backend_floatx: FLOAT32
}
arch: “resnet”
}
evaluation_config {
validation_period_during_training: 10
first_validation_epoch: 30
minimum_detection_ground_truth_overlap {
key: “car”
value: 0.699999988079
}
minimum_detection_ground_truth_overlap {
key: “cyclist”
value: 0.5
}
minimum_detection_ground_truth_overlap {
key: “truck”
value: 0.5
}
minimum_detection_ground_truth_overlap {
key: “van”
value: 0.5
}
minimum_detection_ground_truth_overlap {
key: “tram”
value: 0.5
}
evaluation_box_config {
key: “car”
value {
minimum_height: 20
maximum_height: 9999
minimum_width: 10
maximum_width: 9999
}
}
evaluation_box_config {
key: “cyclist”
value {
minimum_height: 20
maximum_height: 9999
minimum_width: 10
maximum_width: 9999
}
}
evaluation_box_config {
key: “truck”
value {
minimum_height: 20
maximum_height: 9999
minimum_width: 10
maximum_width: 9999
}
}
evaluation_box_config {
key: “van”
value {
minimum_height: 20
maximum_height: 9999
minimum_width: 10
maximum_width: 9999
}
}
evaluation_box_config {
key: “tram”
value {
minimum_height: 20
maximum_height: 9999
minimum_width: 10
maximum_width: 9999
}
}
average_precision_mode: INTEGRATE
}
cost_function_config {
target_classes {
name: “car”
class_weight: 1.0
coverage_foreground_weight: 0.0500000007451
objectives {
name: “cov”
initial_weight: 1.0
weight_target: 1.0
}
objectives {
name: “bbox”
initial_weight: 10.0
weight_target: 10.0
}
}
target_classes {
name: “truck”
class_weight: 8.0
coverage_foreground_weight: 0.0500000007451
objectives {
name: “cov”
initial_weight: 1.0
weight_target: 1.0
}
objectives {
name: “bbox”
initial_weight: 10.0
weight_target: 1.0
}
}
target_classes {
name: “van”
class_weight: 4.0
coverage_foreground_weight: 0.0500000007451
objectives {
name: “cov”
initial_weight: 1.0
weight_target: 1.0
}
objectives {
name: “bbox”
initial_weight: 10.0
weight_target: 10.0
}
}
target_classes {
name: “tram”
class_weight: 4.0
coverage_foreground_weight: 0.0500000007451
objectives {
name: “cov”
initial_weight: 1.0
weight_target: 1.0
}
objectives {
name: “bbox”
initial_weight: 10.0
weight_target: 10.0
}
}
target_classes {
name: “cyclist”
class_weight: 4.0
coverage_foreground_weight: 0.0500000007451
objectives {
name: “cov”
initial_weight: 1.0
weight_target: 1.0
}
objectives {
name: “bbox”
initial_weight: 10.0
weight_target: 10.0
}
}
enable_autoweighting: true
max_objective_weight: 0.999899983406
min_objective_weight: 9.99999974738e-05
}
training_config {
batch_size_per_gpu: 4
num_epochs: 120
learning_rate {
soft_start_annealing_schedule {
min_learning_rate: 5e-06
max_learning_rate: 5e-04
soft_start: 0.10000000149
annealing: 0.699999988079
}
}
regularizer {
type: L1
weight: 3.00000002618e-09
}
optimizer {
adam {
epsilon: 9.99999993923e-09
beta1: 0.899999976158
beta2: 0.999000012875
}
}
cost_scaling {
initial_exponent: 20.0
increment: 0.005
decrement: 1.0
}
checkpoint_interval: 10
}
bbox_rasterizer_config {
target_class_config {
key: “car”
value {
cov_center_x: 0.5
cov_center_y: 0.5
cov_radius_x: 0.40000000596
cov_radius_y: 0.40000000596
bbox_min_radius: 1.0
}
}
target_class_config {
key: “truck”
value {
cov_center_x: 0.5
cov_center_y: 0.5
cov_radius_x: 1.0
cov_radius_y: 1.0
bbox_min_radius: 1.0
}
}
target_class_config {
key: “van”
value {
cov_center_x: 0.5
cov_center_y: 0.5
cov_radius_x: 1.0
cov_radius_y: 1.0
bbox_min_radius: 1.0
}
}
target_class_config {
key: “tram”
value {
cov_center_x: 0.5
cov_center_y: 0.5
cov_radius_x: 1.0
cov_radius_y: 1.0
bbox_min_radius: 1.0
}
}
target_class_config {
key: “cyclist”
value {
cov_center_x: 0.5
cov_center_y: 0.5
cov_radius_x: 1.0
cov_radius_y: 1.0
bbox_min_radius: 1.0
}
}
deadzone_radius: 0.400000154972
}
But I am receiving the following error during training step:
Using TensorFlow backend.
2020-09-02 02:21:01.275355: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.0
--------------------------------------------------------------------------
[[44670,1],0]: A high-performance Open MPI point-to-point messaging module
was unable to find any relevant network interfaces:
Module: OpenFabrics (openib)
Host: 28a74eaf1a18
Another transport will be used instead, although this may result in
lower performance.
NOTE: You can disable this warning by setting the MCA parameter
btl_base_warn_component_unused to 0.
--------------------------------------------------------------------------
2020-09-02 02:21:04.022657: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-09-02 02:21:04.025557: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-09-02 02:21:04.025875: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1618] Found device 0 with properties:
name: GeForce RTX 2070 major: 7 minor: 5 memoryClockRate(GHz): 1.86
pciBusID: 0000:01:00.0
2020-09-02 02:21:04.025893: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.0
2020-09-02 02:21:04.025960: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10.0
2020-09-02 02:21:04.026911: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcufft.so.10.0
2020-09-02 02:21:04.027191: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcurand.so.10.0
2020-09-02 02:21:04.028565: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusolver.so.10.0
2020-09-02 02:21:04.029569: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusparse.so.10.0
2020-09-02 02:21:04.029643: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2020-09-02 02:21:04.029763: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-09-02 02:21:04.030125: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-09-02 02:21:04.030421: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1746] Adding visible gpu devices: 0
2020-09-02 02:21:04.030446: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.0
2020-09-02 02:21:04.663087: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1159] Device interconnect StreamExecutor with strength 1 edge matrix:
2020-09-02 02:21:04.663123: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1165] 0
2020-09-02 02:21:04.663132: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1178] 0: N
2020-09-02 02:21:04.663328: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-09-02 02:21:04.663773: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-09-02 02:21:04.664101: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-09-02 02:21:04.664401: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1304] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 5750 MB memory) -> physical GPU (device: 0, name: GeForce RTX 2070, pci bus id: 0000:01:00.0, compute capability: 7.5)
2020-09-02 02:21:04,665 [INFO] iva.detectnet_v2.scripts.train: Loading experiment spec at /workspace/examples/detectnet_v2/specs/detectnet_v2_train_resnet18_kitti.txt.
2020-09-02 02:21:04,666 [INFO] iva.detectnet_v2.spec_handler.spec_loader: Merging specification from /workspace/examples/detectnet_v2/specs/detectnet_v2_train_resnet18_kitti.txt
2020-09-02 02:21:05,235 [INFO] iva.detectnet_v2.scripts.train: Cannot iterate over exactly 6434 samples with a batch size of 16; each epoch will therefore take one extra step.
__________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
==================================================================================================
input_1 (InputLayer) (None, 3, 128, 512) 0
__________________________________________________________________________________________________
conv1 (Conv2D) (None, 64, 64, 256) 9472 input_1[0][0]
__________________________________________________________________________________________________
bn_conv1 (BatchNormalization) (None, 64, 64, 256) 256 conv1[0][0]
__________________________________________________________________________________________________
activation_1 (Activation) (None, 64, 64, 256) 0 bn_conv1[0][0]
__________________________________________________________________________________________________
block_1a_conv_1 (Conv2D) (None, 64, 32, 128) 36928 activation_1[0][0]
__________________________________________________________________________________________________
block_1a_bn_1 (BatchNormalizati (None, 64, 32, 128) 256 block_1a_conv_1[0][0]
__________________________________________________________________________________________________
block_1a_relu_1 (Activation) (None, 64, 32, 128) 0 block_1a_bn_1[0][0]
__________________________________________________________________________________________________
block_1a_conv_2 (Conv2D) (None, 64, 32, 128) 36928 block_1a_relu_1[0][0]
__________________________________________________________________________________________________
block_1a_conv_shortcut (Conv2D) (None, 64, 32, 128) 4160 activation_1[0][0]
__________________________________________________________________________________________________
block_1a_bn_2 (BatchNormalizati (None, 64, 32, 128) 256 block_1a_conv_2[0][0]
__________________________________________________________________________________________________
block_1a_bn_shortcut (BatchNorm (None, 64, 32, 128) 256 block_1a_conv_shortcut[0][0]
__________________________________________________________________________________________________
add_1 (Add) (None, 64, 32, 128) 0 block_1a_bn_2[0][0]
block_1a_bn_shortcut[0][0]
__________________________________________________________________________________________________
block_1a_relu (Activation) (None, 64, 32, 128) 0 add_1[0][0]
__________________________________________________________________________________________________
block_1b_conv_1 (Conv2D) (None, 64, 32, 128) 36928 block_1a_relu[0][0]
__________________________________________________________________________________________________
block_1b_bn_1 (BatchNormalizati (None, 64, 32, 128) 256 block_1b_conv_1[0][0]
__________________________________________________________________________________________________
block_1b_relu_1 (Activation) (None, 64, 32, 128) 0 block_1b_bn_1[0][0]
__________________________________________________________________________________________________
block_1b_conv_2 (Conv2D) (None, 64, 32, 128) 36928 block_1b_relu_1[0][0]
__________________________________________________________________________________________________
block_1b_bn_2 (BatchNormalizati (None, 64, 32, 128) 256 block_1b_conv_2[0][0]
__________________________________________________________________________________________________
add_2 (Add) (None, 64, 32, 128) 0 block_1b_bn_2[0][0]
block_1a_relu[0][0]
__________________________________________________________________________________________________
block_1b_relu (Activation) (None, 64, 32, 128) 0 add_2[0][0]
__________________________________________________________________________________________________
block_2a_conv_1 (Conv2D) (None, 128, 16, 64) 73856 block_1b_relu[0][0]
__________________________________________________________________________________________________
block_2a_bn_1 (BatchNormalizati (None, 128, 16, 64) 512 block_2a_conv_1[0][0]
__________________________________________________________________________________________________
block_2a_relu_1 (Activation) (None, 128, 16, 64) 0 block_2a_bn_1[0][0]
__________________________________________________________________________________________________
block_2a_conv_2 (Conv2D) (None, 128, 16, 64) 147584 block_2a_relu_1[0][0]
__________________________________________________________________________________________________
block_2a_conv_shortcut (Conv2D) (None, 128, 16, 64) 8320 block_1b_relu[0][0]
__________________________________________________________________________________________________
block_2a_bn_2 (BatchNormalizati (None, 128, 16, 64) 512 block_2a_conv_2[0][0]
__________________________________________________________________________________________________
block_2a_bn_shortcut (BatchNorm (None, 128, 16, 64) 512 block_2a_conv_shortcut[0][0]
__________________________________________________________________________________________________
add_3 (Add) (None, 128, 16, 64) 0 block_2a_bn_2[0][0]
block_2a_bn_shortcut[0][0]
__________________________________________________________________________________________________
block_2a_relu (Activation) (None, 128, 16, 64) 0 add_3[0][0]
__________________________________________________________________________________________________
block_2b_conv_1 (Conv2D) (None, 128, 16, 64) 147584 block_2a_relu[0][0]
__________________________________________________________________________________________________
block_2b_bn_1 (BatchNormalizati (None, 128, 16, 64) 512 block_2b_conv_1[0][0]
__________________________________________________________________________________________________
block_2b_relu_1 (Activation) (None, 128, 16, 64) 0 block_2b_bn_1[0][0]
__________________________________________________________________________________________________
block_2b_conv_2 (Conv2D) (None, 128, 16, 64) 147584 block_2b_relu_1[0][0]
__________________________________________________________________________________________________
block_2b_bn_2 (BatchNormalizati (None, 128, 16, 64) 512 block_2b_conv_2[0][0]
__________________________________________________________________________________________________
add_4 (Add) (None, 128, 16, 64) 0 block_2b_bn_2[0][0]
block_2a_relu[0][0]
__________________________________________________________________________________________________
block_2b_relu (Activation) (None, 128, 16, 64) 0 add_4[0][0]
__________________________________________________________________________________________________
block_3a_conv_1 (Conv2D) (None, 256, 8, 32) 295168 block_2b_relu[0][0]
__________________________________________________________________________________________________
block_3a_bn_1 (BatchNormalizati (None, 256, 8, 32) 1024 block_3a_conv_1[0][0]
__________________________________________________________________________________________________
block_3a_relu_1 (Activation) (None, 256, 8, 32) 0 block_3a_bn_1[0][0]
__________________________________________________________________________________________________
block_3a_conv_2 (Conv2D) (None, 256, 8, 32) 590080 block_3a_relu_1[0][0]
__________________________________________________________________________________________________
block_3a_conv_shortcut (Conv2D) (None, 256, 8, 32) 33024 block_2b_relu[0][0]
__________________________________________________________________________________________________
block_3a_bn_2 (BatchNormalizati (None, 256, 8, 32) 1024 block_3a_conv_2[0][0]
__________________________________________________________________________________________________
block_3a_bn_shortcut (BatchNorm (None, 256, 8, 32) 1024 block_3a_conv_shortcut[0][0]
__________________________________________________________________________________________________
add_5 (Add) (None, 256, 8, 32) 0 block_3a_bn_2[0][0]
block_3a_bn_shortcut[0][0]
__________________________________________________________________________________________________
block_3a_relu (Activation) (None, 256, 8, 32) 0 add_5[0][0]
__________________________________________________________________________________________________
block_3b_conv_1 (Conv2D) (None, 256, 8, 32) 590080 block_3a_relu[0][0]
__________________________________________________________________________________________________
block_3b_bn_1 (BatchNormalizati (None, 256, 8, 32) 1024 block_3b_conv_1[0][0]
__________________________________________________________________________________________________
block_3b_relu_1 (Activation) (None, 256, 8, 32) 0 block_3b_bn_1[0][0]
__________________________________________________________________________________________________
block_3b_conv_2 (Conv2D) (None, 256, 8, 32) 590080 block_3b_relu_1[0][0]
__________________________________________________________________________________________________
block_3b_bn_2 (BatchNormalizati (None, 256, 8, 32) 1024 block_3b_conv_2[0][0]
__________________________________________________________________________________________________
add_6 (Add) (None, 256, 8, 32) 0 block_3b_bn_2[0][0]
block_3a_relu[0][0]
__________________________________________________________________________________________________
block_3b_relu (Activation) (None, 256, 8, 32) 0 add_6[0][0]
__________________________________________________________________________________________________
block_4a_conv_1 (Conv2D) (None, 512, 8, 32) 1180160 block_3b_relu[0][0]
__________________________________________________________________________________________________
block_4a_bn_1 (BatchNormalizati (None, 512, 8, 32) 2048 block_4a_conv_1[0][0]
__________________________________________________________________________________________________
block_4a_relu_1 (Activation) (None, 512, 8, 32) 0 block_4a_bn_1[0][0]
__________________________________________________________________________________________________
block_4a_conv_2 (Conv2D) (None, 512, 8, 32) 2359808 block_4a_relu_1[0][0]
__________________________________________________________________________________________________
block_4a_conv_shortcut (Conv2D) (None, 512, 8, 32) 131584 block_3b_relu[0][0]
__________________________________________________________________________________________________
block_4a_bn_2 (BatchNormalizati (None, 512, 8, 32) 2048 block_4a_conv_2[0][0]
__________________________________________________________________________________________________
block_4a_bn_shortcut (BatchNorm (None, 512, 8, 32) 2048 block_4a_conv_shortcut[0][0]
__________________________________________________________________________________________________
add_7 (Add) (None, 512, 8, 32) 0 block_4a_bn_2[0][0]
block_4a_bn_shortcut[0][0]
__________________________________________________________________________________________________
block_4a_relu (Activation) (None, 512, 8, 32) 0 add_7[0][0]
__________________________________________________________________________________________________
block_4b_conv_1 (Conv2D) (None, 512, 8, 32) 2359808 block_4a_relu[0][0]
__________________________________________________________________________________________________
block_4b_bn_1 (BatchNormalizati (None, 512, 8, 32) 2048 block_4b_conv_1[0][0]
__________________________________________________________________________________________________
block_4b_relu_1 (Activation) (None, 512, 8, 32) 0 block_4b_bn_1[0][0]
__________________________________________________________________________________________________
block_4b_conv_2 (Conv2D) (None, 512, 8, 32) 2359808 block_4b_relu_1[0][0]
__________________________________________________________________________________________________
block_4b_bn_2 (BatchNormalizati (None, 512, 8, 32) 2048 block_4b_conv_2[0][0]
__________________________________________________________________________________________________
add_8 (Add) (None, 512, 8, 32) 0 block_4b_bn_2[0][0]
block_4a_relu[0][0]
__________________________________________________________________________________________________
block_4b_relu (Activation) (None, 512, 8, 32) 0 add_8[0][0]
__________________________________________________________________________________________________
output_bbox (Conv2D) (None, 12, 8, 32) 6156 block_4b_relu[0][0]
__________________________________________________________________________________________________
output_cov (Conv2D) (None, 3, 8, 32) 1539 block_4b_relu[0][0]
==================================================================================================
Total params: 11,203,023
Trainable params: 11,183,823
Non-trainable params: 19,200
__________________________________________________________________________________________________
2020-09-02 02:21:14,391 [INFO] modulus.blocks.data_loaders.multi_source_loader.data_loader: Serial augmentation enabled = False
2020-09-02 02:21:14,391 [INFO] modulus.blocks.data_loaders.multi_source_loader.data_loader: Pseudo sharding enabled = False
2020-09-02 02:21:14,391 [INFO] modulus.blocks.data_loaders.multi_source_loader.data_loader: Max Image Dimensions (all sources): (0, 0)
2020-09-02 02:21:14,391 [INFO] modulus.blocks.data_loaders.multi_source_loader.data_loader: number of cpus: 8, io threads: 16, compute threads: 8, buffered batches: 4
2020-09-02 02:21:14,391 [INFO] modulus.blocks.data_loaders.multi_source_loader.data_loader: total dataset size 6434, number of sources: 1, batch size per gpu: 16, steps: 403
2020-09-02 02:21:14,520 [INFO] iva.detectnet_v2.dataloader.default_dataloader: Bounding box coordinates were detected in the input specification! Bboxes will be automatically converted to polygon coordinates.
2020-09-02 02:21:14.557716: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-09-02 02:21:14.558334: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1618] Found device 0 with properties:
name: GeForce RTX 2070 major: 7 minor: 5 memoryClockRate(GHz): 1.86
pciBusID: 0000:01:00.0
2020-09-02 02:21:14.558397: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.0
2020-09-02 02:21:14.558441: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10.0
2020-09-02 02:21:14.558470: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcufft.so.10.0
2020-09-02 02:21:14.558491: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcurand.so.10.0
2020-09-02 02:21:14.558513: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusolver.so.10.0
2020-09-02 02:21:14.558533: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusparse.so.10.0
2020-09-02 02:21:14.558551: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2020-09-02 02:21:14.558626: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-09-02 02:21:14.558945: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-09-02 02:21:14.559228: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1746] Adding visible gpu devices: 0
2020-09-02 02:21:14,810 [INFO] modulus.blocks.data_loaders.multi_source_loader.data_loader: shuffle: True - shard 0 of 1
2020-09-02 02:21:14,817 [INFO] modulus.blocks.data_loaders.multi_source_loader.data_loader: sampling 1 datasets with weights:
2020-09-02 02:21:14,817 [INFO] modulus.blocks.data_loaders.multi_source_loader.data_loader: source: 0 weight: 1.000000
2020-09-02 02:21:15,348 [INFO] iva.detectnet_v2.scripts.train: Found 6434 samples in training set
2020-09-02 02:21:17,864 [INFO] modulus.blocks.data_loaders.multi_source_loader.data_loader: Serial augmentation enabled = False
2020-09-02 02:21:17,864 [INFO] modulus.blocks.data_loaders.multi_source_loader.data_loader: Pseudo sharding enabled = False
2020-09-02 02:21:17,864 [INFO] modulus.blocks.data_loaders.multi_source_loader.data_loader: Max Image Dimensions (all sources): (0, 0)
2020-09-02 02:21:17,864 [INFO] modulus.blocks.data_loaders.multi_source_loader.data_loader: number of cpus: 8, io threads: 16, compute threads: 8, buffered batches: 4
2020-09-02 02:21:17,864 [INFO] modulus.blocks.data_loaders.multi_source_loader.data_loader: total dataset size 1047, number of sources: 1, batch size per gpu: 16, steps: 66
2020-09-02 02:21:17,893 [INFO] iva.detectnet_v2.dataloader.default_dataloader: Bounding box coordinates were detected in the input specification! Bboxes will be automatically converted to polygon coordinates.
2020-09-02 02:21:18,142 [INFO] modulus.blocks.data_loaders.multi_source_loader.data_loader: shuffle: False - shard 0 of 1
2020-09-02 02:21:18,148 [INFO] modulus.blocks.data_loaders.multi_source_loader.data_loader: sampling 1 datasets with weights:
2020-09-02 02:21:18,148 [INFO] modulus.blocks.data_loaders.multi_source_loader.data_loader: source: 0 weight: 1.000000
2020-09-02 02:21:18,503 [INFO] iva.detectnet_v2.scripts.train: Found 1047 samples in validation set
2020-09-02 02:21:24.279957: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-09-02 02:21:24.280279: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1618] Found device 0 with properties:
name: GeForce RTX 2070 major: 7 minor: 5 memoryClockRate(GHz): 1.86
pciBusID: 0000:01:00.0
2020-09-02 02:21:24.280325: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.0
2020-09-02 02:21:24.280371: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10.0
2020-09-02 02:21:24.280404: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcufft.so.10.0
2020-09-02 02:21:24.280429: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcurand.so.10.0
2020-09-02 02:21:24.280481: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusolver.so.10.0
2020-09-02 02:21:24.280551: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusparse.so.10.0
2020-09-02 02:21:24.280581: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2020-09-02 02:21:24.280681: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-09-02 02:21:24.280994: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-09-02 02:21:24.281300: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1746] Adding visible gpu devices: 0
2020-09-02 02:21:24.282484: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1159] Device interconnect StreamExecutor with strength 1 edge matrix:
2020-09-02 02:21:24.282515: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1165] 0
2020-09-02 02:21:24.282524: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1178] 0: N
2020-09-02 02:21:24.282671: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-09-02 02:21:24.283107: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-09-02 02:21:24.283484: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1304] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 5750 MB memory) -> physical GPU (device: 0, name: GeForce RTX 2070, pci bus id: 0000:01:00.0, compute capability: 7.5)
2020-09-02 02:21:24.948619: W tensorflow/core/framework/op_kernel.cc:1651] OP_REQUIRES failed at save_restore_v2_ops.cc:184 : Not found: Key cost_sums/cyclist-bbox not found in checkpoint
Traceback (most recent call last):
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1365, in _do_call
return fn(*args)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1350, in _run_fn
target_list, run_metadata)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1443, in _call_tf_sessionrun
run_metadata)
tensorflow.python.framework.errors_impl.NotFoundError: 2 root error(s) found.
(0) Not found: Key cost_sums/cyclist-bbox not found in checkpoint
[[{{node save/RestoreV2}}]]
(1) Not found: Key cost_sums/cyclist-bbox not found in checkpoint
[[{{node save/RestoreV2}}]]
[[save/RestoreV2/_877]]
0 successful operations.
0 derived errors ignored.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/saver.py", line 1290, in restore
{self.saver_def.filename_tensor_name: save_path})
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 956, in run
run_metadata_ptr)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1180, in _run
feed_dict_tensor, options, run_metadata)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1359, in _do_run
run_metadata)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1384, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.NotFoundError: 2 root error(s) found.
(0) Not found: Key cost_sums/cyclist-bbox not found in checkpoint
[[node save/RestoreV2 (defined at /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:1748) ]]
(1) Not found: Key cost_sums/cyclist-bbox not found in checkpoint
[[node save/RestoreV2 (defined at /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:1748) ]]
[[save/RestoreV2/_877]]
0 successful operations.
0 derived errors ignored.
Original stack trace for 'save/RestoreV2':
File "/usr/local/bin/tlt-train-g1", line 8, in <module>
sys.exit(main())
File "/home/vpraveen/.cache/dazel/_dazel_vpraveen/715c8bafe7816f3bb6f309cd506049bb/execroot/ai_infra/bazel-out/k8-py3-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/common/magnet_train.py", line 55, in main
File "<decorator-gen-2>", line 2, in main
File "/home/vpraveen/.cache/dazel/_dazel_vpraveen/715c8bafe7816f3bb6f309cd506049bb/execroot/ai_infra/bazel-out/k8-py3-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/utilities/timer.py", line 46, in wrapped_fn
File "/home/vpraveen/.cache/dazel/_dazel_vpraveen/715c8bafe7816f3bb6f309cd506049bb/execroot/ai_infra/bazel-out/k8-py3-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 773, in main
File "/home/vpraveen/.cache/dazel/_dazel_vpraveen/715c8bafe7816f3bb6f309cd506049bb/execroot/ai_infra/bazel-out/k8-py3-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 691, in run_experiment
File "/home/vpraveen/.cache/dazel/_dazel_vpraveen/715c8bafe7816f3bb6f309cd506049bb/execroot/ai_infra/bazel-out/k8-py3-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 624, in train_gridbox
File "/home/vpraveen/.cache/dazel/_dazel_vpraveen/715c8bafe7816f3bb6f309cd506049bb/execroot/ai_infra/bazel-out/k8-py3-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 147, in run_training_loop
File "/home/vpraveen/.cache/dazel/_dazel_vpraveen/715c8bafe7816f3bb6f309cd506049bb/execroot/ai_infra/bazel-out/k8-py3-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/training/utilities.py", line 143, in get_singular_monitored_session
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1104, in __init__
stop_grace_period_secs=stop_grace_period_secs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 727, in __init__
self._sess = self._coordinated_creator.create_session()
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 878, in create_session
self.tf_sess = self._session_creator.create_session()
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 638, in create_session
self._scaffold.finalize()
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 229, in finalize
self._saver = training_saver._get_saver_or_default() # pylint: disable=protected-access
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/saver.py", line 599, in _get_saver_or_default
saver = Saver(sharded=True, allow_empty=True)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/saver.py", line 828, in __init__
self.build()
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/saver.py", line 840, in build
self._build(self._filename, build_save=True, build_restore=True)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/saver.py", line 878, in _build
build_restore=build_restore)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/saver.py", line 502, in _build_internal
restore_sequentially, reshape)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/saver.py", line 381, in _AddShardedRestoreOps
name="restore_shard"))
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/saver.py", line 328, in _AddRestoreOps
restore_sequentially)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/saver.py", line 575, in bulk_restore
return io_ops.restore_v2(filename_tensor, names, slices, dtypes)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/gen_io_ops.py", line 1696, in restore_v2
name=name)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/op_def_library.py", line 794, in _apply_op_helper
op_def=op_def)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/util/deprecation.py", line 507, in new_func
return func(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py", line 3357, in create_op
attrs, op_def, compute_device)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py", line 3426, in _create_op_internal
op_def=op_def)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py", line 1748, in __init__
self._traceback = tf_stack.extract_stack()
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/saver.py", line 1300, in restore
names_to_keys = object_graph_key_mapping(save_path)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/saver.py", line 1618, in object_graph_key_mapping
object_graph_string = reader.get_tensor(trackable.OBJECT_GRAPH_PROTO_KEY)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/pywrap_tensorflow_internal.py", line 915, in get_tensor
return CheckpointReader_GetTensor(self, compat.as_bytes(tensor_str))
tensorflow.python.framework.errors_impl.NotFoundError: Key _CHECKPOINTABLE_OBJECT_GRAPH not found in checkpoint
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/bin/tlt-train-g1", line 8, in <module>
sys.exit(main())
File "/home/vpraveen/.cache/dazel/_dazel_vpraveen/715c8bafe7816f3bb6f309cd506049bb/execroot/ai_infra/bazel-out/k8-py3-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/common/magnet_train.py", line 55, in main
File "<decorator-gen-2>", line 2, in main
File "/home/vpraveen/.cache/dazel/_dazel_vpraveen/715c8bafe7816f3bb6f309cd506049bb/execroot/ai_infra/bazel-out/k8-py3-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/utilities/timer.py", line 46, in wrapped_fn
File "/home/vpraveen/.cache/dazel/_dazel_vpraveen/715c8bafe7816f3bb6f309cd506049bb/execroot/ai_infra/bazel-out/k8-py3-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 773, in main
File "/home/vpraveen/.cache/dazel/_dazel_vpraveen/715c8bafe7816f3bb6f309cd506049bb/execroot/ai_infra/bazel-out/k8-py3-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 691, in run_experiment
File "/home/vpraveen/.cache/dazel/_dazel_vpraveen/715c8bafe7816f3bb6f309cd506049bb/execroot/ai_infra/bazel-out/k8-py3-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 624, in train_gridbox
File "/home/vpraveen/.cache/dazel/_dazel_vpraveen/715c8bafe7816f3bb6f309cd506049bb/execroot/ai_infra/bazel-out/k8-py3-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 147, in run_training_loop
File "/home/vpraveen/.cache/dazel/_dazel_vpraveen/715c8bafe7816f3bb6f309cd506049bb/execroot/ai_infra/bazel-out/k8-py3-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/training/utilities.py", line 143, in get_singular_monitored_session
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1104, in __init__
stop_grace_period_secs=stop_grace_period_secs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 727, in __init__
self._sess = self._coordinated_creator.create_session()
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 878, in create_session
self.tf_sess = self._session_creator.create_session()
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 647, in create_session
init_fn=self._scaffold.init_fn)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/session_manager.py", line 290, in prepare_session
config=config)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/session_manager.py", line 204, in _restore_checkpoint
saver.restore(sess, checkpoint_filename_with_path)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/saver.py", line 1306, in restore
err, "a Variable name or other graph key that is missing")
tensorflow.python.framework.errors_impl.NotFoundError: Restoring from checkpoint failed. This is most likely due to a Variable name or other graph key that is missing from the checkpoint. Please ensure that you have not altered the graph expected based on the checkpoint. Original error:
2 root error(s) found.
(0) Not found: Key cost_sums/cyclist-bbox not found in checkpoint
[[node save/RestoreV2 (defined at /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:1748) ]]
(1) Not found: Key cost_sums/cyclist-bbox not found in checkpoint
[[node save/RestoreV2 (defined at /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py:1748) ]]
[[save/RestoreV2/_877]]
0 successful operations.
0 derived errors ignored.
Original stack trace for 'save/RestoreV2':
File "/usr/local/bin/tlt-train-g1", line 8, in <module>
sys.exit(main())
File "/home/vpraveen/.cache/dazel/_dazel_vpraveen/715c8bafe7816f3bb6f309cd506049bb/execroot/ai_infra/bazel-out/k8-py3-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/common/magnet_train.py", line 55, in main
File "<decorator-gen-2>", line 2, in main
File "/home/vpraveen/.cache/dazel/_dazel_vpraveen/715c8bafe7816f3bb6f309cd506049bb/execroot/ai_infra/bazel-out/k8-py3-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/utilities/timer.py", line 46, in wrapped_fn
File "/home/vpraveen/.cache/dazel/_dazel_vpraveen/715c8bafe7816f3bb6f309cd506049bb/execroot/ai_infra/bazel-out/k8-py3-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 773, in main
File "/home/vpraveen/.cache/dazel/_dazel_vpraveen/715c8bafe7816f3bb6f309cd506049bb/execroot/ai_infra/bazel-out/k8-py3-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 691, in run_experiment
File "/home/vpraveen/.cache/dazel/_dazel_vpraveen/715c8bafe7816f3bb6f309cd506049bb/execroot/ai_infra/bazel-out/k8-py3-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 624, in train_gridbox
File "/home/vpraveen/.cache/dazel/_dazel_vpraveen/715c8bafe7816f3bb6f309cd506049bb/execroot/ai_infra/bazel-out/k8-py3-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/scripts/train.py", line 147, in run_training_loop
File "/home/vpraveen/.cache/dazel/_dazel_vpraveen/715c8bafe7816f3bb6f309cd506049bb/execroot/ai_infra/bazel-out/k8-py3-fastbuild/bin/magnet/packages/iva/build_wheel.runfiles/ai_infra/iva/detectnet_v2/training/utilities.py", line 143, in get_singular_monitored_session
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1104, in __init__
stop_grace_period_secs=stop_grace_period_secs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 727, in __init__
self._sess = self._coordinated_creator.create_session()
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 878, in create_session
self.tf_sess = self._session_creator.create_session()
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 638, in create_session
self._scaffold.finalize()
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 229, in finalize
self._saver = training_saver._get_saver_or_default() # pylint: disable=protected-access
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/saver.py", line 599, in _get_saver_or_default
saver = Saver(sharded=True, allow_empty=True)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/saver.py", line 828, in __init__
self.build()
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/saver.py", line 840, in build
self._build(self._filename, build_save=True, build_restore=True)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/saver.py", line 878, in _build
build_restore=build_restore)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/saver.py", line 502, in _build_internal
restore_sequentially, reshape)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/saver.py", line 381, in _AddShardedRestoreOps
name="restore_shard"))
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/saver.py", line 328, in _AddRestoreOps
restore_sequentially)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/saver.py", line 575, in bulk_restore
return io_ops.restore_v2(filename_tensor, names, slices, dtypes)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/gen_io_ops.py", line 1696, in restore_v2
name=name)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/op_def_library.py", line 794, in _apply_op_helper
op_def=op_def)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/util/deprecation.py", line 507, in new_func
return func(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py", line 3357, in create_op
attrs, op_def, compute_device)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py", line 3426, in _create_op_internal
op_def=op_def)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py", line 1748, in __init__
self._traceback = tf_stack.extract_stack()