Nvprof error when running cifar10_keras

==== environment ====
docker: centos7
GPU: 780 3G
CUDA Version: 10.0
tensorflow-gpu: 1.14
keras: 2.3.1

I want to profile cifar10_keras with nvprof, but the run fails. nvprof reports only the single error line below, which is too little information for me to identify the cause.

==42== Error: Internal profiling error 4055:34.

I have attached my test script and the command I run below; I hope someone can help me.
command
nvprof --metrics flop_count_sp python3 nvprof_keras.py train --batch_size 1 model.json
nvprof_keras.py
import argparse
import tensorflow as tf
import nni
from nni.networkmorphism_tuner.graph import json_to_graph
import keras.backend.tensorflow_backend as KTF
from keras import backend as K
from keras.layers import Dense
from keras.models import Sequential
from keras.initializers import Constant
from keras.datasets import cifar10
from keras.utils import multi_gpu_model, to_categorical
import utils
import os

# Build a session whose GPU options have allow_growth enabled and register it
# as the session used by the Keras TensorFlow backend.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
sess = tf.Session(config=config)
KTF.set_session(sess)

def get_args():
    """Parse the command-line arguments of this script.

    Positional arguments are ``mode`` and ``model_json_file`` (in that
    order); the optional flags are ``--batch_size``, ``--optimizer`` and
    ``--epochs``.

    Returns:
        The ``argparse.Namespace`` produced by ``parse_args()``.
    """
    cli = argparse.ArgumentParser("cifar10")

    # Positional arguments, in the order they must appear on the command line.
    cli.add_argument(
        "mode",
        type=str,
        help='Enter "train" or "test". Calculate FLOPs for training one batch or testing one batch.',
    )
    cli.add_argument(
        "model_json_file",
        type=str,
        help="Model file to load in json format.",
    )

    # Optional flags as (flag, type, default, help text) rows.
    optional_flags = (
        ("--batch_size", int, 1, "batch size"),
        ("--optimizer", str, "SGD", "optimizer"),
        ("--epochs", int, 200, "epoch limit"),
    )
    for flag, kind, fallback, description in optional_flags:
        cli.add_argument(flag, type=kind, default=fallback, help=description)

    return cli.parse_args()

# Module-level state: the data loaders and the model start out unset and are
# assigned by parse_rev_args(); args holds the parsed command line.
trainloader = testloader = net = None
args = get_args()

def build_graph_from_json(ir_model_json):
    """Turn a JSON model description into a Keras model.

    The JSON is handed to ``json_to_graph`` and the value returned by the
    resulting graph's ``produce_keras_model()`` is passed back to the caller.
    """
    return json_to_graph(ir_model_json).produce_keras_model()

def parse_rev_args(receive_msg):
    """Load CIFAR-10 and build the model, storing both in module globals.

    Sets ``trainloader`` and ``testloader`` to ``(images, labels)`` tuples
    (images cast to float32 and divided by 255, labels one-hot encoded over
    10 classes) and ``net`` to the model built from ``receive_msg``.  When
    ``CUDA_VISIBLE_DEVICES`` lists more than one comma-separated entry,
    ``net`` is wrapped with ``multi_gpu_model``.

    Args:
        receive_msg: Model description handed to ``build_graph_from_json``.

    Returns:
        Always 0.
    """
    global trainloader
    global testloader
    global net

    (x_train, y_train), (x_test, y_test) = cifar10.load_data()
    y_train = to_categorical(y_train, 10)
    y_test = to_categorical(y_test, 10)
    # Scale the pixel values by 1/255 as float32.
    x_train = x_train.astype("float32") / 255.0
    x_test = x_test.astype("float32") / 255.0
    trainloader = (x_train, y_train)
    testloader = (x_test, y_test)

    # Model
    net = build_graph_from_json(receive_msg)
    # The session handle and its graph were never used here, so they are no
    # longer bound to locals.
    # NOTE(review): the call itself is kept in case fetching the backend
    # session matters for setup - confirm whether it can be dropped.
    K.get_session()

    # Parallel model.  Only the environment lookup sits inside the try, so a
    # KeyError raised while building the multi-GPU model is not mistaken for
    # a missing CUDA_VISIBLE_DEVICES variable.
    try:
        available_devices = os.environ["CUDA_VISIBLE_DEVICES"]
    except KeyError:
        print("parallel model not support in this config settings")
    else:
        gpus = len(available_devices.split(","))
        if gpus > 1:
            net = multi_gpu_model(net, gpus)
    return 0

def train_eval(mode, batch_size):
    """Run one batch through the model, then terminate the process.

    Compiles the module-level ``net`` and feeds it the first ``batch_size``
    samples of the training set (``mode == "train"``) or of the test set
    (any other mode), using ``batch_size`` as the batch size in both cases.

    Args:
        mode: "train" calls ``fit``; any other value calls ``evaluate``.
        batch_size: Number of samples to use; converted with ``int()``.

    Raises:
        SystemExit: Always, once the batch has run; this function never
            returns to its caller.
    """
    bs_explore = int(batch_size)

    # Compile the model
    net.compile(
        loss="categorical_crossentropy", optimizer="SGD", metrics=["accuracy"]
    )

    if mode == "train":
        x_train, y_train = trainloader
        net.fit(
            x=x_train[:bs_explore],
            y=y_train[:bs_explore],
            batch_size=bs_explore,
        )
    else:
        x_test, y_test = testloader
        # Pass batch_size explicitly so the samples are evaluated as one
        # batch, matching the train branch, instead of being split by
        # evaluate()'s own default batch size.
        net.evaluate(
            x_test[:bs_explore],
            y_test[:bs_explore],
            batch_size=bs_explore,
        )

    # Same effect as the interactive-only exit() helper (exit status 0),
    # without depending on the site module having installed it.
    raise SystemExit(0)

if __name__ == "__main__":
    # Read the whole file rather than only its first line, so a model
    # description that spans several lines is not truncated.
    with open(args.model_json_file) as file_read:
        RCV_CONFIG = file_read.read()
    parse_rev_args(RCV_CONFIG)
    train_eval(args.mode, args.batch_size)

model.json
{"input_shape": [32, 32, 3], "vis": null, "weighted": false, "operation_history": [], "layer_id_to_input_node_ids": {"0": [0], "1": [1], "2": [2], "3": [3], "4": [4], "5": [5], "6": [6], "7": [7], "8": [8], "9": [9], "10": [10], "11": [11], "12": [12], "13": [13], "14": [14], "15": [15], "16": [16]}, "layer_id_to_output_node_ids": {"0": [1], "1": [2], "2": [3], "3": [4], "4": [5], "5": [6], "6": [7], "7": [8], "8": [9], "9": [10], "10": [11], "11": [12], "12": [13], "13": [14], "14": [15], "15": [16], "16": [17]}, "adj_list": {"0": [[1, 0]], "1": [[2, 1]], "2": [[3, 2]], "3": [[4, 3]], "4": [[5, 4]], "5": [[6, 5]], "6": [[7, 6]], "7": [[8, 7]], "8": [[9, 8]], "9": [[10, 9]], "10": [[11, 10]], "11": [[12, 11]], "12": [[13, 12]], "13": [[14, 13]], "14": [[15, 14]], "15": [[16, 15]], "16": [[17, 16]], "17": []}, "reverse_adj_list": {"0": [], "1": [[0, 0]], "2": [[1, 1]], "3": [[2, 2]], "4": [[3, 3]], "5": [[4, 4]], "6": [[5, 5]], "7": [[6, 6]], "8": [[7, 7]], "9": [[8, 8]], "10": [[9, 9]], "11": [[10, 10]], "12": [[11, 11]], "13": [[12, 12]], "14": [[13, 13]], "15": [[14, 14]], "16": [[15, 15]], "17": [[16, 16]]}, "node_list": [[0, [32, 32, 3]], [1, [32, 32, 3]], [2, [32, 32, 3]], [3, [32, 32, 64]], [4, [16, 16, 64]], [5, [16, 16, 64]], [6, [16, 16, 64]], [7, [16, 16, 64]], [8, [8, 8, 64]], [9, [8, 8, 64]], [10, [8, 8, 64]], [11, [8, 8, 64]], [12, [4, 4, 64]], [13, [64]], [14, [64]], [15, [64]], [16, [64]], [17, [10]]], "layer_list": [[0, ["StubReLU", 0, 1]], [1, ["StubBatchNormalization2d", 1, 2, 3]], [2, ["StubConv2d", 2, 3, 3, 64, 3, 1, 1]], [3, ["StubPooling2d", 3, 4, 2, 2, 0]], [4, ["StubReLU", 4, 5]], [5, ["StubBatchNormalization2d", 5, 6, 64]], [6, ["StubConv2d", 6, 7, 64, 64, 3, 1, 1]], [7, ["StubPooling2d", 7, 8, 2, 2, 0]], [8, ["StubReLU", 8, 9]], [9, ["StubBatchNormalization2d", 9, 10, 64]], [10, ["StubConv2d", 10, 11, 64, 64, 3, 1, 1]], [11, ["StubPooling2d", 11, 12, 2, 2, 0]], [12, ["StubGlobalPooling2d", 12, 13]], [13, ["StubDropout2d", 13, 14, 0.25]], 
[14, ["StubDense", 14, 15, 64, 64]], [15, ["StubReLU", 15, 16]], [16, ["StubDense", 16, 17, 64, 10]]]}