TensorFlow GPU - Mask R-CNN loss is NaN values only

Hello,

I am training a Mask R-CNN model using TensorFlow on a custom dataset (sample_data) where each image annotation is in the Pascal VOC XML format. The model comes from the https://github.com/ahmedfgad/Mask-RCNN-TF2 repo, for TensorFlow 2.2.0.
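
For reference, each annotation file looks roughly like this (illustrative values only; the loader below reads the size block plus each object's name and bndbox):

<annotation>
    <size>
        <width>1024</width>
        <height>768</height>
        <depth>3</depth>
    </size>
    <object>
        <name>fall-armyworm-larva</name>
        <bndbox>
            <xmin>120</xmin>
            <ymin>85</ymin>
            <xmax>240</xmax>
            <ymax>190</ymax>
        </bndbox>
    </object>
</annotation>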

Config used for the model:

Configurations:
BACKBONE                       resnet101
BACKBONE_STRIDES               [4, 8, 16, 32, 64]
BATCH_SIZE                     1
BBOX_STD_DEV                   [0.1 0.1 0.2 0.2]
COMPUTE_BACKBONE_SHAPE         None
DETECTION_MAX_INSTANCES        100
DETECTION_MIN_CONFIDENCE       0.7
DETECTION_NMS_THRESHOLD        0.3
FPN_CLASSIF_FC_LAYERS_SIZE     1024
GPU_COUNT                      1
GRADIENT_CLIP_NORM             5.0
IMAGES_PER_GPU                 1
IMAGE_CHANNEL_COUNT            3
IMAGE_MAX_DIM                  1024
IMAGE_META_SIZE                19
IMAGE_MIN_DIM                  800
IMAGE_MIN_SCALE                0
IMAGE_RESIZE_MODE              square
IMAGE_SHAPE                    [1024 1024    3]
LEARNING_MOMENTUM              0.9
LEARNING_RATE                  0.0001
LOSS_WEIGHTS                   {'rpn_class_loss': 1.0, 'rpn_bbox_loss': 1.0, 'mrcnn_class_loss': 1.0, 'mrcnn_bbox_loss': 1.0, 'mrcnn_mask_loss': 1.0}
MASK_POOL_SIZE                 14
MASK_SHAPE                     [28, 28]
MAX_GT_INSTANCES               100
MEAN_PIXEL                     [123.7 116.8 103.9]
MINI_MASK_SHAPE                (56, 56)
NAME                           corn_cfg
NUM_CLASSES                    7
POOL_SIZE                      7
POST_NMS_ROIS_INFERENCE        1000
POST_NMS_ROIS_TRAINING         2000
PRE_NMS_LIMIT                  6000
ROI_POSITIVE_RATIO             0.33
RPN_ANCHOR_RATIOS              [0.5, 1, 2]
RPN_ANCHOR_SCALES              (32, 64, 128, 256, 512)
RPN_ANCHOR_STRIDE              1
RPN_BBOX_STD_DEV               [0.1 0.1 0.2 0.2]
RPN_NMS_THRESHOLD              0.7
RPN_TRAIN_ANCHORS_PER_IMAGE    256
STEPS_PER_EPOCH                100
TOP_DOWN_PYRAMID_SIZE          256
TRAIN_BN                       False
TRAIN_ROIS_PER_IMAGE           200
USE_MINI_MASK                  True
USE_RPN_ROIS                   True
VALIDATION_STEPS               50
WEIGHT_DECAY                   0.0001

When running the model with tensorflow-cpu (using both versions), data generation is very fast (almost instant) and training proceeds as expected, with proper loss values.

But when using tensorflow-gpu, loading the model takes far too long, the epochs only start after another 7-10 minutes, and every loss it produces is NaN.

I've tried the following (a sketch for catching where the first NaN shows up is after this list):

  • lowering the learning rate to 1e-5,
  • turning multiprocessing off,
  • setting workers = 1,
  • changing the optimizer to Adam.
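
A minimal sketch for narrowing down where the NaN first appears (assuming the standard tf.keras/tf.debugging APIs; TerminateOnNaN aborts training on the first NaN batch, and enable_check_numerics reports the first op that produces inf/NaN, at some speed cost):

import tensorflow as tf

# report the first op that emits inf/NaN (debug only; slows training)
tf.debugging.enable_check_numerics()

# abort training as soon as a batch loss turns NaN instead of
# logging NaN for the whole epoch
nan_guard = tf.keras.callbacks.TerminateOnNaN()

# model.train already accepts custom callbacks, e.g.:
# model.train(..., custom_callbacks=[nan_guard])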

System specs: i5-12400F, 12 GB RAM, RTX 3060 (12 GB),

  • all cuDNN and CUDA toolkit versions installed according to the TensorFlow documentation.
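
A quick check to confirm TensorFlow actually sees the card and isn't choking on memory (a sketch using the standard tf.config API; set_memory_growth keeps TF from reserving all 12 GB at startup):

import tensorflow as tf

gpus = tf.config.list_physical_devices('GPU')
print('GPUs visible:', gpus)

# allocate GPU memory on demand instead of reserving it all up front
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

# optional and very verbose: log which device every op lands on
# tf.debugging.set_log_device_placement(True)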

The code for training is:

from os import listdir
import imgaug
import numpy as np
from xml.etree import ElementTree
from numpy import zeros
from numpy import asarray
from mrcnn.utils import Dataset
from matplotlib import pyplot
from mrcnn.visualize import display_instances
from mrcnn.utils import extract_bboxes

from mrcnn.config import Config
from mrcnn.model import MaskRCNN
import mrcnn.model as mrmodel
import warnings
import tensorflow as tf
import time
warnings.filterwarnings('ignore')
# tf.test.is_gpu_available() is deprecated; query the device list instead
gpu_available = tf.config.list_physical_devices('GPU')
print(gpu_available)


class CornDataset(Dataset):
    # load the dataset definitions
    def load_dataset(self, dataset_dir, is_train=True):
        # define classes
        self.add_class("dataset", 1, "fall-armyworm-larva")
        self.add_class("dataset", 2, "fall-armyworm-larval-damage")
        self.add_class("dataset", 3, "fall-armyworm-frass")
        self.add_class("dataset", 4, "fall-armyworm-egg")
        self.add_class("dataset", 5, "healthy-maize")
        self.add_class("dataset", 6, "maize-streak-disease")
        
        # define data locations
        images_dir = dataset_dir + '/images/'
        annotations_dir = dataset_dir + '/annots/'
       
             
        # find all images
        for filename in listdir(images_dir):
            print(filename)
            # extract image id (strip the '.jpg' / '.jpeg' extension)
            if filename.endswith('.jpeg'):
                image_id = filename[:-5]
            else:
                image_id = filename[:-4]

            # skip all images with id >= 6770 if we are building the train set
            if is_train and int(image_id) >= 6770:
                continue
            # skip all images with id < 6770 if we are building the test/val set
            if not is_train and int(image_id) < 6770:
                continue

            img_path = images_dir + filename
            ann_path = annotations_dir + image_id + '.xml'
            # add to dataset
            self.add_image('dataset', image_id=image_id, path=img_path, annotation=ann_path, class_ids=[0, 1, 2, 3, 4, 5, 6])

    # extract bounding boxes from an annotation file
    def extract_boxes(self, filename):
        # load and parse the file
        tree = ElementTree.parse(filename)
        # get the root of the document
        root = tree.getroot()
        # extract each bounding box
        boxes = list()
        for box in root.findall('.//object'):
            name = box.find('name').text  # add the label name to the box list
            xmin = int(box.find('./bndbox/xmin').text)
            ymin = int(box.find('./bndbox/ymin').text)
            xmax = int(box.find('./bndbox/xmax').text)
            ymax = int(box.find('./bndbox/ymax').text)
            coors = [xmin, ymin, xmax, ymax, name]
            boxes.append(coors)

        # extract image dimensions
        width = int(root.find('.//size/width').text)
        height = int(root.find('.//size/height').text)
        return boxes, width, height

    # load the masks for an image
    def load_mask(self, image_id):
        # get details of image
        info = self.image_info[image_id]
        # define box file location
        path = info['annotation']

        # load XML
        boxes, w, h = self.extract_boxes(path)
        # create one array for all masks, each on a different channel
        masks = zeros([h, w, len(boxes)], dtype='uint8')
        # create masks
        class_ids = list()
        for i in range(len(boxes)):
            box = boxes[i]
            row_s, row_e = box[1], box[3]
            col_s, col_e = box[0], box[2]

            # box[4] holds the name of the class
            if box[4] == 'fall-armyworm-larva':
                masks[row_s:row_e, col_s:col_e, i] = 1
                class_ids.append(self.class_names.index('fall-armyworm-larva'))
            elif box[4] == 'fall-armyworm-larval-damage':
                masks[row_s:row_e, col_s:col_e, i] = 2
                class_ids.append(self.class_names.index('fall-armyworm-larval-damage'))
            elif box[4] == 'fall-armyworm-frass':
                masks[row_s:row_e, col_s:col_e, i] = 3
                class_ids.append(self.class_names.index('fall-armyworm-frass'))
            elif box[4] == 'fall-armyworm-egg':
                masks[row_s:row_e, col_s:col_e, i] = 4
                class_ids.append(self.class_names.index('fall-armyworm-egg'))
            elif box[4] in ('healthy-maize', 'healthy-images', 'none-healthy'):
                masks[row_s:row_e, col_s:col_e, i] = 5
                class_ids.append(self.class_names.index('healthy-maize'))
            elif box[4] == 'maize-streak-disease':
                masks[row_s:row_e, col_s:col_e, i] = 6
                class_ids.append(self.class_names.index('maize-streak-disease'))

        return masks, asarray(class_ids, dtype='int32')
        

    # load an image reference
    def image_reference(self, image_id):
        info = self.image_info[image_id]
        return info['path']


dataset_dir='final_dataset/'
validset_dir = 'validation/'

train_set = CornDataset()
train_set.load_dataset(dataset_dir, is_train=True)
train_set.prepare()
print('Train: %d' % len(train_set.image_ids))

# test/val set
test_set = CornDataset()
test_set.load_dataset(dataset_dir, is_train=False)
test_set.prepare()
print('Test: %d' % len(test_set.image_ids))

import random
# define a random image id (randint's upper bound is inclusive, hence the -1)
image_id = random.randint(0, len(train_set.image_ids) - 1)
# load the image
image = train_set.load_image(image_id)
# load the masks and the class ids
mask, class_ids = train_set.load_mask(image_id)
# extract bounding boxes from the masks
bbox = extract_bboxes(mask)
# display image with masks and bounding boxes
display_instances(image, bbox, mask, class_ids, train_set.class_names)


class CornConfig(Config):
    # define the name of the configuration
    NAME = "corn_cfg"
    # number of classes (background + 5 diseases + 1 healthy)
    NUM_CLASSES = 1 + 6
    IMAGES_PER_GPU = 1
    # number of training steps per epoch
    STEPS_PER_EPOCH = 100
    VALIDATION_STEPS = 50

    # skip detections with < 80% confidence
#     DETECTION_MIN_CONFIDENCE = 0.8
    LEARNING_RATE = 1e-4
#     BATCH_SIZE = 28

# prepare config
config = CornConfig()
config.display()

import os
ROOT_DIR = "/home/mehathab/Desktop/maskrcnn_drY-run"
# Directory to save logs and trained model
DEFAULT_LOGS_DIR = os.path.join(ROOT_DIR, "logs")

# define the model
model = MaskRCNN(mode='training', model_dir=DEFAULT_LOGS_DIR, config=config)
model_inference = MaskRCNN(mode="inference", config=config, model_dir=DEFAULT_LOGS_DIR)

# load weights (mscoco) and exclude the output layers
WEIGHT_PATH = 'mask_rcnn_coco.h5'
model.load_weights(WEIGHT_PATH, by_name=True,
                   exclude=["mrcnn_class_logits", "mrcnn_bbox_fc",  "mrcnn_bbox", "mrcnn_mask"])

# train weights (output layers or 'heads')
# history = model.train(train_set, test_set, learning_rate=config.LEARNING_RATE, epochs=100, layers='3+')

mean_average_precision_callback = mrmodel.MeanAveragePrecisionCallback(model,
                                                                        model_inference,
                                                                        test_set,
                                                                        calculate_map_at_every_X_epoch=5,
                                                                        verbose=1)
model.train(train_set, test_set,
            learning_rate=config.LEARNING_RATE,
            epochs=100,
            layers='heads',
            custom_callbacks=[mean_average_precision_callback])
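
Since NaN losses are often caused by bad ground truth rather than the GPU itself, this is a scan worth running before training (a sketch built on the extract_boxes method above; it flags zero-area or out-of-image boxes, which produce empty masks and can poison the loss):

# check every annotation for degenerate boxes before training
for info in train_set.image_info:
    boxes, w, h = train_set.extract_boxes(info['annotation'])
    for xmin, ymin, xmax, ymax, name in boxes:
        if xmax <= xmin or ymax <= ymin:
            print('zero-area box:', info['id'], name)
        if xmin < 0 or ymin < 0 or xmax > w or ymax > h:
            print('out-of-bounds box:', info['id'], name)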

Hey, I'll start off by saying I basically don't know anything; I started learning everything I know about a week ago. I have the exact same issue: same driver, same code, everything. I think the issue is that CUDA and cuDNN are supposed to be versions 10.1.0 and 7.6.5, respectively, but our driver wants to run version 11.2.0 and whatever the matching cuDNN is. I'm not sure if there is a way to downgrade the driver to a compatible version? Just thought I'd spitball ideas and hope.
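
If it helps, here is one way to see which CUDA/cuDNN your TensorFlow build was compiled against versus what the driver offers (tf.sysconfig.get_build_info needs TF 2.3+, so this is just a sketch; on 2.2 you'd compare against the tested-configurations table in the TF docs). An Ampere card like the RTX 3060 generally needs a CUDA 11.x build, which would also explain the very slow start-up (PTX JIT compilation) on a CUDA 10.1 build:

import tensorflow as tf

# what this TensorFlow build was compiled against (TF >= 2.3)
build = tf.sysconfig.get_build_info()
print('CUDA :', build.get('cuda_version'))
print('cuDNN:', build.get('cudnn_version'))

# compare with the highest CUDA version the driver supports:
#   run `nvidia-smi` and read the CUDA version in the header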