Hello,
I am training a Mask R-CNN model with TensorFlow on a custom dataset (sample_data) in which each image annotation is in the Pascal VOC XML format. The Mask R-CNN implementation comes from the https://github.com/ahmedfgad/Mask-RCNN-TF2 repo,
written for TensorFlow 2.2.0.
The config used for the model:
Configurations:
BACKBONE resnet101
BACKBONE_STRIDES [4, 8, 16, 32, 64]
BATCH_SIZE 1
BBOX_STD_DEV [0.1 0.1 0.2 0.2]
COMPUTE_BACKBONE_SHAPE None
DETECTION_MAX_INSTANCES 100
DETECTION_MIN_CONFIDENCE 0.7
DETECTION_NMS_THRESHOLD 0.3
FPN_CLASSIF_FC_LAYERS_SIZE 1024
GPU_COUNT 1
GRADIENT_CLIP_NORM 5.0
IMAGES_PER_GPU 1
IMAGE_CHANNEL_COUNT 3
IMAGE_MAX_DIM 1024
IMAGE_META_SIZE 19
IMAGE_MIN_DIM 800
IMAGE_MIN_SCALE 0
IMAGE_RESIZE_MODE square
IMAGE_SHAPE [1024 1024 3]
LEARNING_MOMENTUM 0.9
LEARNING_RATE 0.0001
LOSS_WEIGHTS {'rpn_class_loss': 1.0, 'rpn_bbox_loss': 1.0, 'mrcnn_class_loss': 1.0, 'mrcnn_bbox_loss': 1.0, 'mrcnn_mask_loss': 1.0}
MASK_POOL_SIZE 14
MASK_SHAPE [28, 28]
MAX_GT_INSTANCES 100
MEAN_PIXEL [123.7 116.8 103.9]
MINI_MASK_SHAPE (56, 56)
NAME corn_cfg
NUM_CLASSES 7
POOL_SIZE 7
POST_NMS_ROIS_INFERENCE 1000
POST_NMS_ROIS_TRAINING 2000
PRE_NMS_LIMIT 6000
ROI_POSITIVE_RATIO 0.33
RPN_ANCHOR_RATIOS [0.5, 1, 2]
RPN_ANCHOR_SCALES (32, 64, 128, 256, 512)
RPN_ANCHOR_STRIDE 1
RPN_BBOX_STD_DEV [0.1 0.1 0.2 0.2]
RPN_NMS_THRESHOLD 0.7
RPN_TRAIN_ANCHORS_PER_IMAGE 256
STEPS_PER_EPOCH 100
TOP_DOWN_PYRAMID_SIZE 256
TRAIN_BN False
TRAIN_ROIS_PER_IMAGE 200
USE_MINI_MASK True
USE_RPN_ROIS True
VALIDATION_STEPS 50
WEIGHT_DECAY 0.0001
I have run the model with both TensorFlow builds. With tensorflow-cpu, data generation is fast (almost instant) and training proceeds as expected with sensible loss values.
With tensorflow-gpu, however, loading the model takes very long, the epochs only start after another 7-10 minutes, and the reported loss is NaN.
I’ve tried to:
- lower the learning rate to 1e-5,
- turn multiprocessing off,
- set workers = 1,
- change the optimizer to Adam.
System specs: Intel i5-12400F, 12 GB RAM, RTX 3060 (12 GB).
- All cuDNN and cudatoolkit versions are installed as specified in the TensorFlow documentation.
The training code is:
from os import listdir
import imgaug
import numpy as np
from xml.etree import ElementTree
from numpy import zeros
from numpy import asarray
from mrcnn.utils import Dataset
from matplotlib import pyplot
from mrcnn.visualize import display_instances
from mrcnn.utils import extract_bboxes
from mrcnn.config import Config
from mrcnn.model import MaskRCNN
import mrcnn.model as mrmodel
import warnings
import tensorflow as tf
import time
# Silence every Python warning for the rest of the run.
# NOTE(review): this also hides TensorFlow/CUDA warnings that could help when
# debugging GPU-only problems - consider removing it while investigating.
warnings.filterwarnings('ignore')
# tf.test.is_gpu_available() is deprecated in TF 2.x; listing the physical GPU
# devices is the documented replacement.
# NOTE(review): a non-empty device list only shows that TensorFlow can see a
# GPU, not that this TensorFlow build was compiled for that GPU generation -
# confirm the TF / CUDA / GPU combination against TensorFlow's tested build
# configurations.
gpu_available = bool(tf.config.list_physical_devices('GPU'))
# A bare expression prints nothing outside a notebook, so report explicitly.
print('GPU available:', gpu_available)
class CornDataset(Dataset):
    """Maize pest/disease dataset built from Pascal VOC XML box annotations.

    Images are read from ``<dataset_dir>/images/`` and annotations from
    ``<dataset_dir>/annots/``. Image file stems must be integers because the
    train / test split is made on the numeric image id.
    """

    # Class names in class-id order; ids start at 1.
    CLASS_LABELS = (
        'fall-armyworm-larva',
        'fall-armyworm-larval-damage',
        'fall-armyworm-frass',
        'fall-armyworm-egg',
        'healthy-maize',
        'maize-streak-disease',
    )

    # Annotation labels that are treated as another class.
    LABEL_ALIASES = {
        'healthy-images': 'healthy-maize',
        'none-healthy': 'healthy-maize',
    }

    def load_dataset(self, dataset_dir, is_train=True, split_id=6770):
        """Register the classes and the images that belong to one split.

        Args:
            dataset_dir: directory that contains ``images/`` and ``annots/``.
            is_train: when true, keep the images whose numeric id is below
                ``split_id``; otherwise keep the remaining images.
            split_id: first image id that belongs to the test/val split.

        Raises:
            ValueError: if a file stem in the images directory is not an
                integer.
        """
        for class_id, label in enumerate(self.CLASS_LABELS, start=1):
            self.add_class("dataset", class_id, label)

        images_dir = dataset_dir + '/images/'
        annotations_dir = dataset_dir + '/annots/'

        for filename in listdir(images_dir):
            print(filename)
            # Strip the extension whatever its length (.jpg, .jpeg, .png, ...).
            stem, dot, _extension = filename.rpartition('.')
            image_id = stem if dot else filename

            # Ids below split_id form the train set, the rest the test/val set.
            in_train_split = int(image_id) < split_id
            if in_train_split != bool(is_train):
                continue

            self.add_image(
                'dataset',
                image_id=image_id,
                path=images_dir + filename,
                annotation=annotations_dir + image_id + '.xml',
                class_ids=[0, 1, 2, 3, 4, 5, 6],
            )

    def extract_boxes(self, filename):
        """Parse one Pascal VOC annotation file.

        Args:
            filename: path of the XML annotation file.

        Returns:
            A ``(boxes, width, height)`` tuple where ``boxes`` is a list of
            ``[xmin, ymin, xmax, ymax, name]`` entries, one per ``object``
            element, and ``width`` / ``height`` come from the ``size`` element.
        """
        root = ElementTree.parse(filename).getroot()

        boxes = []
        for obj in root.findall('.//object'):
            name = obj.find('name').text
            xmin = int(obj.find('./bndbox/xmin').text)
            ymin = int(obj.find('./bndbox/ymin').text)
            xmax = int(obj.find('./bndbox/xmax').text)
            ymax = int(obj.find('./bndbox/ymax').text)
            boxes.append([xmin, ymin, xmax, ymax, name])

        width = int(root.find('.//size/width').text)
        height = int(root.find('.//size/height').text)
        return boxes, width, height

    def load_mask(self, image_id):
        """Build rectangular instance masks from the image's bounding boxes.

        Every annotated box with a known label becomes one mask channel whose
        box area is set to 1. Boxes with an unrecognised label are skipped
        (with a message), so the number of mask channels always equals the
        number of returned class ids.

        Args:
            image_id: index into ``self.image_info``.

        Returns:
            A ``(masks, class_ids)`` tuple: ``masks`` is a uint8 array of
            shape ``[height, width, instances]`` holding 0/1 values and
            ``class_ids`` is an int32 array with one class id per channel.
        """
        info = self.image_info[image_id]
        annotation_path = info['annotation']
        boxes, width, height = self.extract_boxes(annotation_path)

        masks = zeros([height, width, len(boxes)], dtype='uint8')
        class_ids = []
        kept = 0
        for xmin, ymin, xmax, ymax, label in boxes:
            class_name = self.LABEL_ALIASES.get(label, label)
            if class_name not in self.CLASS_LABELS:
                # Without this guard the mask would gain a channel that has
                # no matching class id.
                print('Skipping unknown label %r in %s' % (label, annotation_path))
                continue
            # The class is carried by class_ids; the mask itself stays binary.
            masks[ymin:ymax, xmin:xmax, kept] = 1
            class_ids.append(self.class_names.index(class_name))
            kept += 1

        # Drop the channels reserved for skipped boxes.
        return masks[:, :, :kept], asarray(class_ids, dtype='int32')

    def image_reference(self, image_id):
        """Return the file path recorded for ``image_id``."""
        return self.image_info[image_id]['path']
import random

# Dataset locations, relative to the working directory.
dataset_dir = 'final_dataset/'
validset_dir = 'validation/'  # not used below; kept in case other code reads it

# train set
train_set = CornDataset()
train_set.load_dataset(dataset_dir, is_train=True)
train_set.prepare()
print('Train: %d' % len(train_set.image_ids))

# test/val set
test_set = CornDataset()
test_set.load_dataset(dataset_dir, is_train=False)
test_set.prepare()
print('Test: %d' % len(test_set.image_ids))

# Sanity check: show one random training image with its masks and boxes.
# randrange() excludes the upper bound; randint(0, n) could return n itself,
# which is one past the last valid image index.
num = random.randrange(len(train_set.image_ids))
# define image id
image_id = num
# load the image
image = train_set.load_image(image_id)
# load the masks and the class ids
mask, class_ids = train_set.load_mask(image_id)
# extract bounding boxes from the masks
bbox = extract_bboxes(mask)
# display image with masks and bounding boxes
display_instances(image, bbox, mask, class_ids, train_set.class_names)
class CornConfig(Config):
    """Training configuration for the corn dataset.

    Only the values below are overridden; every other setting keeps the
    value inherited from ``Config``.
    """

    # name of this configuration
    NAME = "corn_cfg"

    # background + 5 diseases + 1 healthy class
    NUM_CLASSES = 7

    IMAGES_PER_GPU = 1

    LEARNING_RATE = 0.0001

    # training / validation steps run per epoch
    STEPS_PER_EPOCH = 100
    VALIDATION_STEPS = 50
# Build the configuration object and print its values.
config = CornConfig()
config.display()
import os
# Root of this experiment; the path is specific to the author's machine.
ROOT_DIR = "/home/mehathab/Desktop/maskrcnn_drY-run"
# Directory to save logs and trained model
DEFAULT_LOGS_DIR = os.path.join(ROOT_DIR, "logs")
# Two models share the same config and log directory: one in training mode
# and one in inference mode (the latter is handed to the mAP callback below).
model = MaskRCNN(mode='training', model_dir=DEFAULT_LOGS_DIR, config=config)
model_inference = MaskRCNN(mode="inference", config=config, model_dir=DEFAULT_LOGS_DIR)
# Load the MS COCO weights by layer name, excluding the output layers.
# NOTE(review): presumably excluded because their shapes depend on
# NUM_CLASSES, which differs from COCO - confirm.
WEIGHT_PATH = 'mask_rcnn_coco.h5'
model.load_weights(WEIGHT_PATH, by_name=True,
exclude=["mrcnn_class_logits", "mrcnn_bbox_fc", "mrcnn_bbox", "mrcnn_mask"])
# Train with layers='heads' (the output layers); an earlier variant of this
# call used layers='3+' instead.
# The callback receives both models and the test set, and is configured with
# calculate_map_at_every_X_epoch=5.
mean_average_precision_callback = mrmodel.MeanAveragePrecisionCallback(model,
model_inference,
test_set,
calculate_map_at_every_X_epoch=5,
verbose=1)
# Run training for 100 epochs at the learning rate defined in the config.
model.train(train_set,test_set,
learning_rate=config.LEARNING_RATE,
epochs=100,
layers='heads',
custom_callbacks=[mean_average_precision_callback])