@Morganh I am still having issues with detectnet. How can all values be 0?
{"validation cost": 0.00477045, "mean average precision": 0.0, "average_precision": {"gruenerStein": 0, "gelbeRutsche": 0, "orangePlatte": 0, "schwarzeStange": 0.0, "gelbesAuge": 0.0, "blauerBalken": 0}, "date": "2/15/2022", "time": "17:41:33", "status": "Evaluation Complete"}
Could you share your latest training spec file?
Here you go:
random_seed: 42
dataset_config {
data_sources {
tfrecords_path: "/workspace/projects/lego/data/tfrecords_detectnetv2_resnet18/tfrecord-*"
image_directory_path: "/workspace/projects/lego/data/kitti_detection_1000x1000/train"
}
image_extension: "jpg"
target_class_mapping {
key: "blauerBalken"
value: "blauerBalken"
}
target_class_mapping {
key: "gelbeRutsche"
value: "gelbeRutsche"
}
target_class_mapping {
key: "gelbesAuge"
value: "gelbesAuge"
}
target_class_mapping {
key: "gruenerStein"
value: "gruenerStein"
}
target_class_mapping {
key: "orangePlatte"
value: "orangePlatte"
}
target_class_mapping {
key: "schwarzeStange"
value: "schwarzeStange"
}
validation_fold: 0
}
augmentation_config {
preprocessing {
output_image_width: 480
output_image_height: 480
min_bbox_width: 1.0
min_bbox_height: 1.0
output_image_channel: 3
}
spatial_augmentation {
hflip_probability: 0.5
vflip_probability: 0.5
zoom_min: 1.0
zoom_max: 1.0
translate_max_x: 8.0
translate_max_y: 8.0
}
color_augmentation {
hue_rotation_max: 5.0
saturation_shift_max: 0.8999999761581421
contrast_scale_max: 0.10000000149011612
contrast_center: 0.5
}
}
postprocessing_config {
target_class_config {
key: "blauerBalken"
value {
clustering_config {
coverage_threshold: 0.004999999888241291
minimum_bounding_box_height: 20
dbscan_eps: 0.15000000596046448
dbscan_min_samples: 0.05000000074505806
}
}
}
target_class_config {
key: "gelbeRutsche"
value {
clustering_config {
coverage_threshold: 0.004999999888241291
minimum_bounding_box_height: 20
dbscan_eps: 0.15000000596046448
dbscan_min_samples: 0.05000000074505806
}
}
}
target_class_config {
key: "gelbesAuge"
value {
clustering_config {
coverage_threshold: 0.004999999888241291
minimum_bounding_box_height: 20
dbscan_eps: 0.15000000596046448
dbscan_min_samples: 0.05000000074505806
}
}
}
target_class_config {
key: "gruenerStein"
value {
clustering_config {
coverage_threshold: 0.004999999888241291
minimum_bounding_box_height: 20
dbscan_eps: 0.15000000596046448
dbscan_min_samples: 0.05000000074505806
}
}
}
target_class_config {
key: "orangePlatte"
value {
clustering_config {
coverage_threshold: 0.004999999888241291
minimum_bounding_box_height: 20
dbscan_eps: 0.15000000596046448
dbscan_min_samples: 0.05000000074505806
}
}
}
target_class_config {
key: "schwarzeStange"
value {
clustering_config {
coverage_threshold: 0.004999999888241291
minimum_bounding_box_height: 20
dbscan_eps: 0.15000000596046448
dbscan_min_samples: 0.05000000074505806
}
}
}
}
model_config {
pretrained_model_file: "/workspace/repositories/pretrained_detectnet_v2/pretrained_detectnet_v2_vresnet18/resnet18.hdf5"
num_layers: 18
use_batch_norm: true
objective_set {
bbox {
scale: 35.0
offset: 0.5
}
cov {
}
}
freeze_blocks: 0.0
freeze_blocks: 1.0
arch: "resnet"
all_projections: true
}
evaluation_config {
validation_period_during_training: 2
first_validation_epoch: 2
minimum_detection_ground_truth_overlap {
key: "blauerBalken"
value: 0.6000000238418579
}
minimum_detection_ground_truth_overlap {
key: "gelbeRutsche"
value: 0.6000000238418579
}
minimum_detection_ground_truth_overlap {
key: "gelbesAuge"
value: 0.6000000238418579
}
minimum_detection_ground_truth_overlap {
key: "gruenerStein"
value: 0.6000000238418579
}
minimum_detection_ground_truth_overlap {
key: "orangePlatte"
value: 0.6000000238418579
}
minimum_detection_ground_truth_overlap {
key: "schwarzeStange"
value: 0.6000000238418579
}
evaluation_box_config {
key: "blauerBalken"
value {
minimum_height: 20
maximum_height: 200
minimum_width: 10
maximum_width: 200
}
}
evaluation_box_config {
key: "gelbeRutsche"
value {
minimum_height: 20
maximum_height: 200
minimum_width: 10
maximum_width: 200
}
}
evaluation_box_config {
key: "gelbesAuge"
value {
minimum_height: 20
maximum_height: 200
minimum_width: 10
maximum_width: 200
}
}
evaluation_box_config {
key: "gruenerStein"
value {
minimum_height: 20
maximum_height: 200
minimum_width: 10
maximum_width: 200
}
}
evaluation_box_config {
key: "orangePlatte"
value {
minimum_height: 20
maximum_height: 200
minimum_width: 10
maximum_width: 200
}
}
evaluation_box_config {
key: "schwarzeStange"
value {
minimum_height: 20
maximum_height: 200
minimum_width: 10
maximum_width: 200
}
}
average_precision_mode: INTEGRATE
}
cost_function_config {
target_classes {
name: "gruenerStein"
class_weight: 1.0
coverage_foreground_weight: 0.05000000074505806
objectives {
name: "cov"
initial_weight: 1.0
weight_target: 1.0
}
objectives {
name: "bbox"
initial_weight: 10.0
weight_target: 10.0
}
}
target_classes {
name: "gelbeRutsche"
class_weight: 1.0
coverage_foreground_weight: 0.05000000074505806
objectives {
name: "cov"
initial_weight: 1.0
weight_target: 1.0
}
objectives {
name: "bbox"
initial_weight: 10.0
weight_target: 10.0
}
}
target_classes {
name: "orangePlatte"
class_weight: 1.0
coverage_foreground_weight: 0.05000000074505806
objectives {
name: "cov"
initial_weight: 1.0
weight_target: 1.0
}
objectives {
name: "bbox"
initial_weight: 10.0
weight_target: 10.0
}
}
target_classes {
name: "schwarzeStange"
class_weight: 1.0
coverage_foreground_weight: 0.05000000074505806
objectives {
name: "cov"
initial_weight: 1.0
weight_target: 1.0
}
objectives {
name: "bbox"
initial_weight: 10.0
weight_target: 10.0
}
}
target_classes {
name: "gelbesAuge"
class_weight: 1.0
coverage_foreground_weight: 0.05000000074505806
objectives {
name: "cov"
initial_weight: 1.0
weight_target: 1.0
}
objectives {
name: "bbox"
initial_weight: 10.0
weight_target: 10.0
}
}
target_classes {
name: "blauerBalken"
class_weight: 1.0
coverage_foreground_weight: 0.05000000074505806
objectives {
name: "cov"
initial_weight: 1.0
weight_target: 1.0
}
objectives {
name: "bbox"
initial_weight: 10.0
weight_target: 10.0
}
}
enable_autoweighting: true
max_objective_weight: 0.9998999834060669
min_objective_weight: 9.999999747378752e-05
}
training_config {
batch_size_per_gpu: 1
num_epochs: 120
learning_rate {
soft_start_annealing_schedule {
min_learning_rate: 4.999999873689376e-06
max_learning_rate: 0.0005000000237487257
soft_start: 0.10000000149011612
annealing: 0.699999988079071
}
}
regularizer {
type: L1
weight: 3.000000026176508e-09
}
optimizer {
adam {
epsilon: 9.99999993922529e-09
beta1: 0.8999999761581421
beta2: 0.9990000128746033
}
}
cost_scaling {
initial_exponent: 20.0
increment: 0.005
decrement: 1.0
}
checkpoint_interval: 1
}
bbox_rasterizer_config {
target_class_config {
key: "blauerBalken"
value {
cov_center_x: 0.5
cov_center_y: 0.5
cov_radius_x: 0.4000000059604645
cov_radius_y: 0.4000000059604645
bbox_min_radius: 1.0
}
}
target_class_config {
key: "gelbeRutsche"
value {
cov_center_x: 0.5
cov_center_y: 0.5
cov_radius_x: 0.4000000059604645
cov_radius_y: 0.4000000059604645
bbox_min_radius: 1.0
}
}
target_class_config {
key: "gelbesAuge"
value {
cov_center_x: 0.5
cov_center_y: 0.5
cov_radius_x: 0.4000000059604645
cov_radius_y: 0.4000000059604645
bbox_min_radius: 1.0
}
}
target_class_config {
key: "gruenerStein"
value {
cov_center_x: 0.5
cov_center_y: 0.5
cov_radius_x: 0.4000000059604645
cov_radius_y: 0.4000000059604645
bbox_min_radius: 1.0
}
}
target_class_config {
key: "orangePlatte"
value {
cov_center_x: 0.5
cov_center_y: 0.5
cov_radius_x: 0.4000000059604645
cov_radius_y: 0.4000000059604645
bbox_min_radius: 1.0
}
}
target_class_config {
key: "schwarzeStange"
value {
cov_center_x: 0.5
cov_center_y: 0.5
cov_radius_x: 0.4000000059604645
cov_radius_y: 0.4000000059604645
bbox_min_radius: 1.0
}
}
deadzone_radius: 0.6700000166893005
}
You mentioned earlier that your training images are resized to 1000x1000.
But the training spec is set to
output_image_width: 480 output_image_height: 480
For this case, you need to add "enable_auto_resize: True".
i.e,
output_image_width: 480 output_image_height: 480 enable_auto_resize: True
See more info in DetectNet_v2 - NVIDIA Docs
Good catch, thanks!
Sadly, it did not fix my issue. I tried with enable_auto_resize: True
and also with a dataset preformatted to 480x480.
One thing that I noticed while training was that nvidia-smi
did not show any usage of the gpu whatsoever (power and ram usage wise, as util is always N/A).
Moreover there was no significant usage of the cpu or ram either, so I really don't know what the program is doing at all.
Could you share the result of nvidia-smi?
Could you share the training log and upload it as a file?
nvidia-smi
before training:
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.00 Driver Version: 510.06 CUDA Version: 11.6 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 NVIDIA GeForce ... On | 00000000:09:00.0 Off | N/A |
| 0% 45C P8 24W / 350W | 706MiB / 24576MiB | N/A Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 1 NVIDIA GeForce ... On | 00000000:0A:00.0 Off | N/A |
| 35% 34C P8 N/A / 19W | 71MiB / 2048MiB | N/A Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
and while training:
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.00 Driver Version: 510.06 CUDA Version: 11.6 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 NVIDIA GeForce ... On | 00000000:09:00.0 Off | N/A |
| 0% 47C P8 24W / 350W | 938MiB / 24576MiB | N/A Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 1 NVIDIA GeForce ... On | 00000000:0A:00.0 Off | N/A |
| 52% 71C P0 N/A / 19W | 1840MiB / 2048MiB | N/A Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
That can't be right. The gpu power consumption should be much higher if there were any usage of it.
Also, I cannot believe that I can only run training with a batch size of 1 with 25GB vram. It doesn't get used at all?
.tao_mounts.json
:
{
"Mounts": [
{
"source": "~/nvidia-tao/projects",
"destination": "/workspace/projects"
},
{
"source": "~/nvidia-tao/repositories",
"destination": "/workspace/repositories"
}
],
"Envs": [
{
"variable": "CUDA_VISIBLE_DEVICES",
"value": "0"
},
{
"variable": "DALI_DISABLE_NVML",
"value": "1"
}
],
"DockerOptions": {
"shm_size": "16G",
"ulimits": {
"memlock": -1,
"stack": 67108864
},
"user": "1000:1000",
"ports": {}
}
}
Will add the train.log
later.
- Are you running WSL?
- Why did you set as above? Could you share the user guide link?
Yes, I am running ubuntu inside wsl 2.
I set CUDA_VISIBLE_DEVICES
so I don't accidentally use the second gpu, which is much slower.
DALI_DISABLE_NVML
is a result of some issue around dali (A nvml internal driver error occurred (on WSL2) · Issue #2921 · NVIDIA/DALI · GitHub)
To narrow down, could you try one of below two ways?
- Please follow TAO Toolkit Launcher — TAO Toolkit 3.22.05 documentation and try.
- Or, login the tao docker directly with below command
docker run --runtime=nvidia -it --rm --gpus 0 -v /var/run/docker.sock:/var/run/docker.sock nvcr.io/nvidia/tao/tao-toolkit-tf:v3.21.11-tf1.15.4-py3 /bin/bash
then, run detectnet_v2 training.
#
detectnet_v2 train xxx
--runtime nvidia does result in an error stating that the runtime cannot be found, but without it the container starts and i can call nvidia-smi
.
I'll try to run the training from the container now.
Here is the log until epoch 56. There is no change between epochs, so I stopped the training early.
train.log (672.2 KB)
So, you did not install nvidia-docker2. It is required. See https://docs.nvidia.com/tao/tao-toolkit/text/tao_toolkit_quick_start_guide.html?highlight=nvidia%20docker2#software-requirements
I'll check back with my system admin. It did work fine just a couple of days back with a dssd model.
Edit:
nvidia-docker2 is already the newest version (2.9.1-1).
Maybe I have to restart the docker daemon…
Does tao not use --runtime nvidia? I cannot start containers using the flag and this guide does not mention --runtime nvidia
(Installation Guide — NVIDIA Cloud Native Technologies documentation)
Any error?
Just docker: Error response from daemon: Unknown runtime specified nvidia.
The output of docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi
is
Thu Feb 17 18:59:11 2022
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.00 Driver Version: 510.06 CUDA Version: 11.6 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 NVIDIA GeForce ... On | 00000000:09:00.0 Off | N/A |
| 0% 37C P8 14W / 350W | 520MiB / 24576MiB | N/A Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 1 NVIDIA GeForce ... On | 00000000:0A:00.0 Off | N/A |
| 35% 33C P8 N/A / 19W | 81MiB / 2048MiB | N/A Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
For above error, please refer to Docker instantiation failed with error: 500 Server Error: Internal Server Error ("OCI runtime create failed...) - #15 by Morganh
sudo systemctl restart docker
This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.