Transfer learning is the process of transferring learned features from one application to another. It is a commonly used training technique in which you take a model trained on one task and re-train it on a different task. Train Adapt Optimize (TAO) Toolkit is a simple and easy-to-use Python-based AI toolkit for taking purpose-built AI models and customizing them with your own data.
This AutoML notebook identifies optimal hyperparameters (e.g., learning rate, batch size, weight regularizer, or number of layers) to obtain better accuracy or faster convergence on classification models.
The workflow starts with selecting a model topology, then creating and uploading datasets, configuring parameters, training with AutoML, and comparing the resulting models.
The server requirements can be found here.
import os
import glob
import subprocess
import getpass
import uuid
import json
namespace = 'default'
# Available models (#FIXME 1):
# 1. classification - https://docs.nvidia.com/tao/tao-toolkit/text/image_classification.html
# 2. multitask-classification - https://docs.nvidia.com/tao/tao-toolkit/text/multitask_image_classification.html
# classification is the same as multi-class classification
model_name = "classification" # FIXME1 (Add the model name from the above mentioned list)
dataset_to_be_used = "custom" # FIXME2 example: default/custom; default for the dataset used in this tutorial notebook; custom for a different dataset
# SKIP this step IF you have already installed the TAO-Client wheel.
! pip3 install nvidia-tao-client
Requirement already satisfied: nvidia-tao-client in /home/keirton/Downloads/ENTER/lib/python3.9/site-packages (4.0.0) Requirement already satisfied: requests>=2.27.1 in /home/keirton/Downloads/ENTER/lib/python3.9/site-packages (from nvidia-tao-client) (2.28.1) Requirement already satisfied: click>=8.0.4 in /home/keirton/Downloads/ENTER/lib/python3.9/site-packages (from nvidia-tao-client) (8.1.3) Requirement already satisfied: certifi>=2017.4.17 in /home/keirton/Downloads/ENTER/lib/python3.9/site-packages (from requests>=2.27.1->nvidia-tao-client) (2021.10.8) Requirement already satisfied: urllib3<1.27,>=1.21.1 in /home/keirton/Downloads/ENTER/lib/python3.9/site-packages (from requests>=2.27.1->nvidia-tao-client) (1.26.8) Requirement already satisfied: idna<4,>=2.5 in /home/keirton/Downloads/ENTER/lib/python3.9/site-packages (from requests>=2.27.1->nvidia-tao-client) (3.3) Requirement already satisfied: charset-normalizer<3,>=2 in /home/keirton/Downloads/ENTER/lib/python3.9/site-packages (from requests>=2.27.1->nvidia-tao-client) (2.0.4)
# View the version of the TAO-Client
! tao-client --version
tao-client, version 4.0.0
# Define the node_addr and port number
node_addr = "127.0.1.1" # FIXME3 example: 10.137.149.22
node_port = "32080" # FIXME4 example: 32334
# In host machine, node ip_address and port number can be obtained as follows,
# ip_address: hostname -i
# port_number: kubectl get service ingress-nginx-controller -o jsonpath='{.spec.ports[0].nodePort}'
%env BASE_URL=http://{node_addr}:{node_port}/{namespace}/api/v1
env: BASE_URL=http://127.0.1.1:32080/default/api/v1
!echo $BASE_URL
http://127.0.1.1:32080/default/api/v1
# FIXME: Set the ngc_api_key variable
ngc_api_key = "DELETED" # FIXME5 example: zZYtczM5amdtdDcwNjk0cnA2bGU2bXQ3bnQ6NmQ4NjNhMDItMTdmZS00Y2QxLWI2ZjktNmE5M2YxZTc0OGyM
# Exchange NGC_API_KEY for JWT
identity = json.loads(subprocess.getoutput(f'tao-client login --ngc-api-key {ngc_api_key}'))
%env USER={identity['user_id']}
%env TOKEN={identity['token']}
# Get PVC ID
pvc_id = subprocess.getoutput(f'kubectl get pvc tao-toolkit-api-pvc -n {namespace} -o jsonpath="{{.spec.volumeName}}"')
print(pvc_id)
pvc-847745ac-9707-4cd3-91e5-9a1330082d57
# Get NFS server info
provisioner = json.loads(subprocess.getoutput(f'helm get values nfs-subdir-external-provisioner -o json'))
nfs_server = provisioner['nfs']['server']
nfs_path = provisioner['nfs']['path']
print(nfs_server, nfs_path)
192.168.11.37 /mnt/nfs_share
user = getpass.getuser()
home = os.path.expanduser('~')
! echo "Password for {user}"
password = getpass.getpass()
Password for keirton
# Mount shared volume
! mkdir -p ~/shared
command = "apt-get -y install nfs-common >> /dev/null"
! echo {password} | sudo -S -k {command}
command = f"mount -t nfs {nfs_server}:{nfs_path}/{namespace}-tao-toolkit-api-pvc-{pvc_id} ~/shared"
! echo {password} | sudo -S -k {command} && echo DONE
DONE
If you are using a custom dataset, it should follow this directory structure; in that case, skip the "Split dataset into train and val sets" step.
DATA_DIR
├── images_test
│   ├── class_name_1
│   │   ├── image_name_1.jpg
│   │   ├── image_name_2.jpg
│   │   └── ...
│   └── class_name_n
│       ├── image_name_3.jpg
│       ├── image_name_4.jpg
│       └── ...
├── images_train
│   ├── class_name_1
│   │   ├── image_name_5.jpg
│   │   ├── image_name_6.jpg
│   │   └── ...
│   └── class_name_n
│       ├── image_name_7.jpg
│       ├── image_name_8.jpg
│       └── ...
└── images_val
    ├── class_name_1
    │   ├── image_name_9.jpg
    │   ├── image_name_10.jpg
    │   └── ...
    └── class_name_n
        ├── image_name_11.jpg
        ├── image_name_12.jpg
        └── ...
For multi-task classification:
We will use the Fashion Product Images (Small) dataset for this tutorial, available on Kaggle. The trained classification network will perform three tasks: article category classification, base color classification, and target season classification.
To download the dataset, you will need a Kaggle account. After logging in, you can download the dataset zip file here. The downloaded file is archive.zip, with a subfolder called myntradataset. Unzip the contents of this subfolder into the workdir created in the cell above; you should then have a folder called images and a CSV file called styles.csv.
If you are using a custom dataset, it should follow this directory structure:
DATA_DIR
├── images
│   ├── image_name_1.jpg
│   ├── image_name_2.jpg
│   └── ...
└── styles.csv
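Before training it can help to verify that every row in styles.csv has a matching file under images/. This is a minimal standard-library sketch; the `id` column name and `.jpg` suffix follow the preprocessing code later in this notebook:

```python
import csv
import os

def missing_images(data_dir):
    """Return styles.csv entries whose image file is absent from images/."""
    images = set(os.listdir(os.path.join(data_dir, "images")))
    missing = []
    with open(os.path.join(data_dir, "styles.csv"), newline="") as f:
        for row in csv.DictReader(f):
            fname = row["id"] + ".jpg"  # styles.csv keys images by numeric id
            if fname not in images:
                missing.append(fname)
    return missing
```

The split code below drops such rows anyway, but checking up front makes silent data loss visible.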
DATA_DIR = "DATA_DIR" # FIXME6
os.environ['DATA_DIR']= DATA_DIR
!mkdir -p $DATA_DIR
if dataset_to_be_used == "default":
    if model_name == "classification":
        if not os.path.exists(os.path.join(DATA_DIR, "VOCtrainval_11-May-2012.tar")):
            print("Download VOC tar data into ", DATA_DIR)
        else:
            !tar -xf $DATA_DIR/VOCtrainval_11-May-2012.tar -C $DATA_DIR
    elif model_name == "multitask-classification":
        if not os.path.exists(os.path.join(DATA_DIR, "archive.zip")):
            print("Download Fashion zip data into ", DATA_DIR)
        else:
            !unzip -uq $DATA_DIR/archive.zip -d $DATA_DIR/
# Check the dataset is present
if model_name == "classification" and dataset_to_be_used == "default":
    !if [ ! -d $DATA_DIR/VOCdevkit ]; then echo 'Images folder NOT found.'; else echo 'Found images folder.'; fi
    !rm -rf $DATA_DIR/split
elif model_name == "multitask-classification":
    !if [ ! -d $DATA_DIR/images ]; then echo 'Images folder NOT found.'; else echo 'Found images folder.'; fi
    !if [ ! -f $DATA_DIR/styles.csv ]; then echo 'CSV file NOT found.'; else echo 'Found CSV file.'; fi
    # Create subdirectories and remove existing files in them
    !mkdir -p $DATA_DIR/images_train && rm -rf $DATA_DIR/images_train/*
    !mkdir -p $DATA_DIR/images_val && rm -rf $DATA_DIR/images_val/*
    !mkdir -p $DATA_DIR/images_test && rm -rf $DATA_DIR/images_test/*
# Split dataset into train and val sets
if model_name == "classification" and dataset_to_be_used == "default":
    !python3 -m pip install tqdm
    from os.path import join as join_path
    import os
    import glob
    import re
    import shutil

    DATA_DIR = os.environ.get('DATA_DIR')
    source_dir = join_path(DATA_DIR, "VOCdevkit/VOC2012")
    target_dir = join_path(DATA_DIR, "formatted")

    suffix = '_trainval.txt'
    classes_dir = join_path(source_dir, "ImageSets", "Main")
    images_dir = join_path(source_dir, "JPEGImages")
    classes_files = glob.glob(classes_dir + "/*" + suffix)
    for file in classes_files:
        # Get the class name from the filename and make an output class folder
        classname = os.path.basename(file)
        if classname.endswith(suffix):
            classname = classname[:-len(suffix)]
            target_dir_path = join_path(target_dir, classname)
            if not os.path.exists(target_dir_path):
                os.makedirs(target_dir_path)
        else:
            continue
        print(classname)
        with open(file) as f:
            content = f.readlines()
        for line in content:
            tokens = re.split(r'\s+', line)
            if tokens[1] == '1':
                # Copy this image into the target class folder
                target_file_path = join_path(target_dir_path, tokens[0] + '.jpg')
                src_file_path = join_path(images_dir, tokens[0] + '.jpg')
                shutil.copyfile(src_file_path, target_file_path)

    from random import shuffle
    from tqdm import tqdm

    DATA_DIR = os.environ.get('DATA_DIR')
    SOURCE_DIR = os.path.join(DATA_DIR, 'formatted')
    TARGET_DIR = os.path.join(DATA_DIR, 'split')
    # List the class directories
    print(os.walk(SOURCE_DIR))
    dir_list = next(os.walk(SOURCE_DIR))[1]
    # For each class, create matching dirs under split/ and copy a 70/10/20 train/val/test split
    for dir_i in tqdm(dir_list):
        newdir_train = os.path.join(TARGET_DIR, 'images_train', dir_i)
        newdir_val = os.path.join(TARGET_DIR, 'images_val', dir_i)
        newdir_test = os.path.join(TARGET_DIR, 'images_test', dir_i)
        if not os.path.exists(newdir_train):
            os.makedirs(newdir_train)
        if not os.path.exists(newdir_val):
            os.makedirs(newdir_val)
        if not os.path.exists(newdir_test):
            os.makedirs(newdir_test)
        img_list = glob.glob(os.path.join(SOURCE_DIR, dir_i, '*.jpg'))
        # Shuffle the data
        shuffle(img_list)
        for j in range(int(len(img_list) * 0.7)):
            shutil.copy2(img_list[j], os.path.join(TARGET_DIR, 'images_train', dir_i))
        for j in range(int(len(img_list) * 0.7), int(len(img_list) * 0.8)):
            shutil.copy2(img_list[j], os.path.join(TARGET_DIR, 'images_val', dir_i))
        for j in range(int(len(img_list) * 0.8), len(img_list)):
            shutil.copy2(img_list[j], os.path.join(TARGET_DIR, 'images_test', dir_i))
    print('Done splitting dataset.')
elif model_name == "multitask-classification" and dataset_to_be_used == "default":
    !python3 -m pip install numpy
    !python3 -m pip install pandas
    import os
    import shutil
    import numpy as np
    import pandas as pd

    df = pd.read_csv(os.environ['DATA_DIR'] + '/styles.csv', error_bad_lines=False, warn_bad_lines=False)
    df = df[['id', 'baseColour', 'subCategory', 'season']]
    df = df.dropna()
    category_cls = df.subCategory.value_counts()[:10].index  # 10-class category task
    season_cls = ['Spring', 'Summer', 'Fall', 'Winter']  # 4-class season task
    color_cls = df.baseColour.value_counts()[:11].index  # 11-class base-color task
    # Keep only valid rows
    df = df[df.subCategory.isin(category_cls) & df.season.isin(season_cls) & df.baseColour.isin(color_cls)]
    df.columns = ['fname', 'base_color', 'category', 'season']
    df.fname = df.fname.astype(str)
    df.fname = df.fname + '.jpg'
    # Remove entries whose image file is missing
    all_img_files = os.listdir(os.environ['DATA_DIR'] + '/images')
    df = df[df.fname.isin(all_img_files)]
    idx = np.arange(len(df))
    np.random.shuffle(idx)
    train_split_idx = int(len(df) * 0.8)
    train_df = df.iloc[idx[:train_split_idx]]
    val_df = df.iloc[idx[train_split_idx:]]
    # A simple sanity check
    assert len(train_df.season.unique()) == 4 and len(train_df.base_color.unique()) == 11 and \
        len(train_df.category.unique()) == 10, 'Training set misses some classes, re-run this cell!'
    assert len(val_df.season.unique()) == 4 and len(val_df.base_color.unique()) == 11 and \
        len(val_df.category.unique()) == 10, 'Validation set misses some classes, re-run this cell!'
    for image_name in train_df["fname"]:
        source_file_name = os.path.join(DATA_DIR, "images", image_name)
        destination_file_name = os.path.join(DATA_DIR, "images_train", image_name)
        shutil.copy(source_file_name, destination_file_name)
    # Validation images are copied into both images_train and images_val
    for image_name in val_df["fname"]:
        source_file_name = os.path.join(DATA_DIR, "images", image_name)
        destination_file_name = os.path.join(DATA_DIR, "images_train", image_name)
        shutil.copy(source_file_name, destination_file_name)
        destination_file_name = os.path.join(DATA_DIR, "images_val", image_name)
        shutil.copy(source_file_name, destination_file_name)
    # Save the processed data labels
    train_df.to_csv(os.environ['DATA_DIR'] + '/train.csv', index=False)
    val_df.to_csv(os.environ['DATA_DIR'] + '/val.csv', index=False)
# Verify
if model_name == "classification":
    !if [ ! -d $DATA_DIR/split/images_train ]; then echo 'train folder NOT found.'; else echo 'Found train images folder.'; fi
    !if [ ! -d $DATA_DIR/split/images_val ]; then echo 'val folder NOT found.'; else echo 'Found val images folder.'; fi
    !if [ ! -d $DATA_DIR/split/images_test ]; then echo 'test folder NOT found.'; else echo 'Found test images folder.'; fi
elif model_name == "multitask-classification":
    import pandas as pd
    print("Number of images in the train set: {}".format(
        len(pd.read_csv(os.environ['DATA_DIR'] + '/train.csv'))
    ))
    print("Number of images in the validation set: {}".format(
        len(pd.read_csv(os.environ['DATA_DIR'] + '/val.csv'))
    ))
Found train images folder. Found val images folder. Found test images folder.
if model_name == "classification":
    ds_format = "default"
elif model_name == "multitask-classification":
    ds_format = "custom"
train_dataset_id = subprocess.getoutput(f"tao-client {model_name} dataset-create --dataset_type image_classification --dataset_format {ds_format}")
print(train_dataset_id)
aa75d56e-245b-47e8-95df-083ab25fe238
if model_name == "classification":
    ! rsync -ah --info=progress2 {DATA_DIR}/split/images_train ~/shared/users/{os.environ['USER']}/datasets/{train_dataset_id}/
elif model_name == "multitask-classification":
    ! rsync -ah --info=progress2 {DATA_DIR}/images_train ~/shared/users/{os.environ['USER']}/datasets/{train_dataset_id}/
    ! rsync -ah --info=progress2 {DATA_DIR}/train.csv ~/shared/users/{os.environ['USER']}/datasets/{train_dataset_id}/
    ! rsync -ah --info=progress2 {DATA_DIR}/val.csv ~/shared/users/{os.environ['USER']}/datasets/{train_dataset_id}/
! echo DONE
2.05G 100% 12.47MB/s 0:02:36 (xfr#18948, to-chk=0/18954) DONE
eval_dataset_id = subprocess.getoutput(f"tao-client {model_name} dataset-create --dataset_type image_classification --dataset_format {ds_format}")
print(eval_dataset_id)
a515188f-440e-451e-8c6e-5ec8f1eed22c
if model_name == "classification":
    ! rsync -ah --info=progress2 {DATA_DIR}/split/images_val ~/shared/users/{os.environ['USER']}/datasets/{eval_dataset_id}/
elif model_name == "multitask-classification":
    ! rsync -ah --info=progress2 {DATA_DIR}/images_val ~/shared/users/{os.environ['USER']}/datasets/{eval_dataset_id}/
    ! rsync -ah --info=progress2 {DATA_DIR}/val.csv ~/shared/users/{os.environ['USER']}/datasets/{eval_dataset_id}/
! echo DONE
194.57M 100% 12.96MB/s 0:00:14 (xfr#1783, to-chk=0/1789) DONE
pattern = os.path.join(home, 'shared', 'users', os.environ['USER'], 'datasets', '*', 'metadata.json')
datasets = []
for metadata_path in glob.glob(pattern):
    with open(metadata_path, 'r') as metadata_file:
        datasets.append(json.load(metadata_file))
print(json.dumps(datasets, indent=2))
[ { "id": "a515188f-440e-451e-8c6e-5ec8f1eed22c", "created_on": "2022-12-20T02:11:50.131868", "last_modified": "2022-12-20T02:11:50.131880", "name": "My Dataset", "description": "My TAO Dataset", "version": "1.0.0", "logo": "https://www.nvidia.com", "type": "image_classification", "format": "default", "actions": [] }, { "id": "04c3cfc9-df7e-4fea-8394-240e66557b87", "created_on": "2022-12-20T00:12:44.156935", "last_modified": "2022-12-20T00:12:44.156945", "name": "My Dataset", "description": "My TAO Dataset", "version": "1.0.0", "logo": "https://www.nvidia.com", "type": "image_classification", "format": "default", "actions": [] }, { "id": "aa75d56e-245b-47e8-95df-083ab25fe238", "created_on": "2022-12-20T02:09:11.027728", "last_modified": "2022-12-20T02:09:11.027736", "name": "My Dataset", "description": "My TAO Dataset", "version": "1.0.0", "logo": "https://www.nvidia.com", "type": "image_classification", "format": "default", "actions": [] }, { "id": "b4fe854e-cdb2-4e60-a9eb-74ffd7f575e4", "created_on": "2022-12-20T00:10:15.883554", "last_modified": "2022-12-20T00:10:15.883561", "name": "My Dataset", "description": "My TAO Dataset", "version": "1.0.0", "logo": "https://www.nvidia.com", "type": "image_classification", "format": "default", "actions": [] }, { "id": "62b87f27-2e99-4cc8-86e6-a36c96db133b", "created_on": "2022-12-19T21:33:55.932868", "last_modified": "2022-12-19T21:33:55.932874", "name": "My Dataset", "description": "My TAO Dataset", "version": "1.0.0", "logo": "https://www.nvidia.com", "type": "image_classification", "format": "default", "actions": [] } ]
network_arch = model_name.replace("-","_")
if network_arch == "classification":
    encode_key = "nvidia_tlt"
else:
    encode_key = "tlt_encode"
model_id = subprocess.getoutput(f"tao-client {model_name} model-create --network_arch {network_arch} --encryption_key {encode_key} ")
print(model_id)
print(model_name)
38dbf121-b31d-4592-ac79-604d54327730 classification
metadata_path = os.path.join(home, 'shared', 'users', os.environ['USER'], 'models', model_id, 'metadata.json')
with open(metadata_path, "r") as metadata_file:
    metadata = json.load(metadata_file)
metadata["train_datasets"] = [train_dataset_id]
metadata["eval_dataset"] = eval_dataset_id
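This read-modify-write pattern on metadata.json recurs throughout the notebook. It could be wrapped in a small helper so each update stays atomic in one call; this helper is our own convenience, not part of the TAO client:

```python
import json

def update_metadata(metadata_path, **fields):
    """Read a metadata.json file, update the given fields, and write it back."""
    with open(metadata_path, "r") as f:
        metadata = json.load(f)
    metadata.update(fields)          # apply the keyword arguments as top-level keys
    with open(metadata_path, "w") as f:
        json.dump(metadata, f, indent=2)
    return metadata
```

For example, the two assignments above could become `update_metadata(metadata_path, train_datasets=[train_dataset_id], eval_dataset=eval_dataset_id)`; note this writes immediately, whereas the notebook defers the write to a later cell.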
# Assign a pretrained model to each network architecture.
# Query {BASE_URL}/model to list all available pretrained models, then adjust this map for your
# experiments; for example, if you change the number of layers to 34, pick the matching
# pretrained model name.
pretrained_map = {"classification": "pretrained_classification:efficientnet_b1_relu",
                  "multitask_classification": "pretrained_classification:resnet10"}
pattern = os.path.join(home, 'shared', 'users', '*', 'models', '*', 'metadata.json')
ptm_id = None
for ptm_metadata_path in glob.glob(pattern):
    with open(ptm_metadata_path, 'r') as metadata_file:
        ptm_metadata = json.load(metadata_file)
    ngc_path = ptm_metadata.get("ngc_path")
    metadata_network_arch = ptm_metadata.get("network_arch")
    if metadata_network_arch == network_arch and ngc_path.endswith(pretrained_map[network_arch]):
        ptm_id = ptm_metadata["id"]
        break
metadata["ptm"] = ptm_id
print(ptm_id)
f2658a12-4f78-495d-be88-0474f0f5849b
# View default automl specs enabled
! tao-client {model_name} model-automl-defaults --id {model_id} | tee ~/shared/users/{os.environ['USER']}/models/{model_id}/specs/automl_defaults.json
[ "train_config.lr_config.step.learning_rate", "train_config.optimizer.sgd.lr", "train_config.optimizer.sgd.nesterov", "train_config.reg_config.type", "train_config.reg_config.weight_decay" ]
Refer to these hyperlinks for the parameters supported by each network; beyond the default AutoML-enabled parameters, you can add more as needed: Multiclass_classification, Multitask_classification
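For example, the two lists in the next cell could be populated as follows. The keys shown here are taken from this notebook's own default train spec; always confirm the exact parameter keys against the linked per-network documentation:

```python
# Illustrative only: keys must match the network's spec schema (see the linked docs).
additional_automl_parameters = ["train_config.optimizer.sgd.momentum"]  # sweep SGD momentum as well
remove_default_automl_parameters = ["train_config.reg_config.type"]     # drop a default-enabled parameter
```

Anything listed in `remove_default_automl_parameters` is excluded from the search space even if AutoML enables it by default.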
# Choose the AutoML algorithm: "Bayesian" or "HyperBand".
automl_algorithm = "Bayesian"  # FIXME7 example: Bayesian/HyperBand

metric = "kpi"  # Don't change this; multiple metrics will be supported in the future
additional_automl_parameters = []  # Extra parameters to sweep, beyond the default enabled ones (see the links above)
remove_default_automl_parameters = []  # Hyperparameters to remove from the default AutoML set

metadata["automl_algorithm"] = automl_algorithm
metadata["automl_enabled"] = True
metadata["metric"] = metric
metadata["automl_add_hyperparameters"] = str(additional_automl_parameters)
metadata["automl_remove_hyperparameters"] = str(remove_default_automl_parameters)

with open(metadata_path, "w") as metadata_file:
    json.dump(metadata, metadata_file, indent=2)

print(json.dumps(metadata, indent=2))
{ "id": "38dbf121-b31d-4592-ac79-604d54327730", "created_on": "2022-12-20T02:12:04.861276", "last_modified": "2022-12-20T02:12:04.861291", "name": "My Model", "description": "My TAO Model", "version": "1.0.0", "logo": "https://www.nvidia.com", "ngc_path": "", "encryption_key": "nvidia_tlt", "read_only": false, "public": false, "network_arch": "classification", "dataset_type": "image_classification", "actions": [ "train", "evaluate", "prune", "retrain", "export", "convert", "inference" ], "train_datasets": [ "aa75d56e-245b-47e8-95df-083ab25fe238" ], "eval_dataset": "a515188f-440e-451e-8c6e-5ec8f1eed22c", "inference_dataset": null, "additional_id_info": null, "calibration_dataset": null, "ptm": "f2658a12-4f78-495d-be88-0474f0f5849b", "automl_enabled": true, "automl_algorithm": "Bayesian", "metric": "kpi", "automl_add_hyperparameters": "[]", "automl_remove_hyperparameters": "[]" }
# Default train model specs
! tao-client {model_name} model-train-defaults --id {model_id} | tee ~/shared/users/{os.environ['USER']}/models/{model_id}/specs/train.json
{ "eval_config": { "batch_size": 256, "enable_center_crop": true, "n_workers": 2, "top_k": 3 }, "model_config": { "all_projections": true, "arch": "resnet", "batch_norm_config": { "epsilon": 1e-05, "momentum": 0.9 }, "dropout": 0.001, "input_image_size": "3,224,224", "n_layers": 18, "retain_head": false, "use_batch_norm": true }, "train_config": { "batch_size_per_gpu": 64, "enable_center_crop": true, "enable_color_augmentation": true, "enable_random_crop": true, "label_smoothing": 0.0, "lr_config": { "step": { "gamma": 0.1, "learning_rate": 0.06, "step_size": 10 } }, "mixup_alpha": 0.1, "n_epochs": 80, "n_workers": 2, "optimizer": { "sgd": { "decay": 0.0, "lr": 0.01, "momentum": 0.9, "nesterov": false } }, "preprocess_mode": "caffe", "random_seed": 42, "reg_config": { "scope": "Conv2D,Dense", "type": "L2", "weight_decay": 5e-05 } } }
# Customize train model specs
specs_path = os.path.join(home, 'shared', 'users', os.environ['USER'], 'models', model_id, 'specs', 'train.json')
with open(specs_path, "r") as specs_file:
    specs = json.load(specs_file)

# Apply changes to any of the parameters listed in the previous cell as required.
# Example for multitask-classification (the parameter key may differ per network):
# specs["training_config"]["num_epochs"] = 10

# Example for classification
specs["train_config"]["n_epochs"] = 10
specs["model_config"]["arch"] = "efficientnet_b1"
specs["model_config"]["activation_type"] = "relu"

with open(specs_path, "w") as specs_file:
    json.dump(specs, specs_file, indent=2)

print(json.dumps(specs, indent=2))
{ "eval_config": { "batch_size": 256, "enable_center_crop": true, "n_workers": 2, "top_k": 3 }, "model_config": { "all_projections": true, "arch": "efficientnet_b1", "batch_norm_config": { "epsilon": 1e-05, "momentum": 0.9 }, "dropout": 0.001, "input_image_size": "3,224,224", "n_layers": 18, "retain_head": false, "use_batch_norm": true, "activation_type": "relu" }, "train_config": { "batch_size_per_gpu": 64, "enable_center_crop": true, "enable_color_augmentation": true, "enable_random_crop": true, "label_smoothing": 0.0, "lr_config": { "step": { "gamma": 0.1, "learning_rate": 0.06, "step_size": 10 } }, "mixup_alpha": 0.1, "n_epochs": 10, "n_workers": 2, "optimizer": { "sgd": { "decay": 0.0, "lr": 0.01, "momentum": 0.9, "nesterov": false } }, "preprocess_mode": "caffe", "random_seed": 42, "reg_config": { "scope": "Conv2D,Dense", "type": "L2", "weight_decay": 5e-05 } } }
train_job_id = subprocess.getoutput(f"tao-client {model_name} model-train --id " + model_id)
print(train_job_id)
87e3c22a-977a-49ee-b0cc-a155e834e67a
# Utility function to tail the log file for the upcoming cell
def my_tail(logs_dir, log_file):
    %env LOG_FILE={logs_dir}/{log_file}
    ! mkdir -p {logs_dir}
    ! [ ! -f "$LOG_FILE" ] && touch $LOG_FILE && chmod 666 $LOG_FILE
    ! tail -f -n +1 $LOG_FILE | while read LINE; do echo "$LINE"; [[ "$LINE" == "EOF" ]] && pkill -P $$ tail; done
# Set poll_automl_stats to True to see only the remaining time, remaining epochs, etc.
# Set poll_automl_stats to False to skip the stats and view the training logs instead (log viewing is supported only for Bayesian).
# Training times for different models, benchmarked on a 1x V100 GPU machine, can be found here: https://docs.nvidia.com/tao/tao-toolkit/text/automl/automl.html#results-of-automl-experiments
poll_automl_stats = True

if poll_automl_stats:
    import time
    from IPython.display import clear_output

    stats_path = os.path.join(home, 'shared', 'users', os.environ['USER'], 'models', model_id, train_job_id, "automl_metadata.json")
    controller_json_path = os.path.join(home, 'shared', 'users', os.environ['USER'], 'models', model_id, train_job_id, "controller.json")
    while True:
        time.sleep(15)
        clear_output(wait=True)
        if os.path.exists(stats_path):
            try:
                with open(stats_path, "r") as stats_file:
                    stats_dict = json.load(stats_file)
                print(json.dumps(stats_dict, indent=2))
                if float(stats_dict["Number of epochs yet to start"]) == 0.0:
                    break
            except json.JSONDecodeError:
                print("Stats are being written to file; they will be visible on screen in a few seconds")
else:
    # Print the log file - supported only for Bayesian (the file won't exist until the backend Toolkit container is running, which can take several minutes)
    if automl_algorithm == "Bayesian":
        logs_dir = os.path.join(home, 'shared', 'users', os.environ['USER'], 'models', model_id)
        max_recommendations = metadata.get("automl_max_recommendations", 20)
        for experiment_num in range(max_recommendations):
            log_file = f"{train_job_id}/experiment_{experiment_num}/log.txt"
            while True:
                if os.path.exists(os.path.join(logs_dir, log_file)):
                    break
            print(f"\n\nViewing experiment {experiment_num}\n\n")
            my_tail(logs_dir, log_file)
{ "best_map": 0.0, "Estimated time for automl completion": "Will be updated after completing one epoch", "Current experiment number": 1, "Number of epochs yet to start": Infinity, "Time per epoch in seconds": Infinity }
# The config and the weights of the best configuration are in the best_model folder.
# It takes a few seconds for the backend to copy the winning AutoML experiment into best_model.
!python3 -m pip install pandas
import pandas as pd

automl_job_dir = f"{home}/shared/users/{os.environ['USER']}/models/{model_id}/{train_job_id}"
best_model_path = f"{automl_job_dir}/best_model"
while True:
    if os.path.exists(best_model_path) and len(os.listdir(best_model_path)) > 0 and os.path.exists(f"{best_model_path}/controller.json"):
        # List the binary model file(s)
        print("\nCheckpoints for the best performing experiment")
        if os.path.exists(best_model_path + "/weights") and len(os.listdir(best_model_path + "/weights")) > 0:
            print(f"Folder: {best_model_path}/weights")
            print("Files:", os.listdir(best_model_path + "/weights"))
        else:
            print(f"Folder: {best_model_path}")
            print("Files:", os.listdir(best_model_path))
        with open(f"{best_model_path}/controller.json", "r") as artifacts_file:
            experiment_artifacts = json.load(artifacts_file)
        data_frame = pd.DataFrame(experiment_artifacts)
        # Print the experiment id/number and the corresponding result
        print("\nResults of all experiments")
        with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):
            print(data_frame[["id", "result"]])
        print("\nConfig/spec file for the best performing experiment (the recommendation_id.kitti with the maximum result value in the dataframe)")
        # List the recommendation config file of the best performing checkpoint
        !ls {best_model_path}/*.kitti
        break
Collecting pandas Downloading pandas-1.5.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.2 MB) |████████████████████████████████| 12.2 MB 3.4 MB/s eta 0:00:01 Collecting numpy>=1.20.3 Downloading numpy-1.23.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB) |████████████████████████████████| 17.1 MB 66.9 MB/s eta 0:00:01 Requirement already satisfied: pytz>=2020.1 in /home/keirton/Downloads/ENTER/lib/python3.9/site-packages (from pandas) (2022.6) Requirement already satisfied: python-dateutil>=2.8.1 in /home/keirton/Downloads/ENTER/lib/python3.9/site-packages (from pandas) (2.8.2) Requirement already satisfied: six>=1.5 in /home/keirton/Downloads/ENTER/lib/python3.9/site-packages (from python-dateutil>=2.8.1->pandas) (1.16.0) Installing collected packages: numpy, pandas Successfully installed numpy-1.23.5 pandas-1.5.2
! rm -rf ~/shared/users/{os.environ['USER']}/models/{model_id}
! echo DONE
! rm -rf ~/shared/users/{os.environ['USER']}/datasets/{train_dataset_id}
! rm -rf ~/shared/users/{os.environ['USER']}/datasets/{eval_dataset_id}
! echo DONE
command = "umount ~/shared"
! echo {password} | sudo -S -k {command} && echo DONE
! pip3 uninstall -y nvidia-tao-client