Nvprof error when running cifar10_keras

hongji.chen · April 7, 2020, 12:43pm

==== environment====
docker: centos7
GPU: 780 3G
CUDA Version: 10.0
tensorflow-gpu: 1.14
keras: 2.3.1

I want to use nvprof to profile a run of cifar10_keras. nvprof reports very little information about the error, so I cannot work out what is causing it.

==42== Error: Internal profiling error 4055:34.

I have attached my test code and the command I run below; I hope someone can help me.
command
nvprof --metrics flop_count_sp python3 nvprof_keras.py train --batch_size 1 model.json
nvprof_keras.py
import argparse
import tensorflow as tf
import nni
from nni.networkmorphism_tuner.graph import json_to_graph
import keras.backend.tensorflow_backend as KTF
from keras import backend as K
from keras.layers import Dense
from keras.models import Sequential
from keras.initializers import Constant
from keras.datasets import cifar10
from keras.utils import multi_gpu_model, to_categorical
import utils
import os

# Create a TensorFlow session whose GPU options have allow_growth switched on,
# then register it as the session used by the Keras TensorFlow backend.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
sess = tf.Session(config=config)
KTF.set_session(sess)

def get_args():
    """Parse the command-line arguments of the cifar10 script.

    Two positional arguments (``mode`` and ``model_json_file``) are required;
    ``--batch_size``, ``--optimizer`` and ``--epochs`` are optional and fall
    back to 1, "SGD" and 200 respectively.

    Returns:
        The ``argparse.Namespace`` produced by parsing ``sys.argv``.
    """
    # (name, help) for each required positional string argument.
    positionals = (
        (
            "mode",
            'Enter "train" or "test". Calculate FLOPs for training one batch or testing one batch.',
        ),
        ("model_json_file", "Model file to load in json format."),
    )
    # (flag, type, default, help) for each optional argument.
    options = (
        ("--batch_size", int, 1, "batch size"),
        ("--optimizer", str, "SGD", "optimizer"),
        ("--epochs", int, 200, "epoch limit"),
    )

    cli = argparse.ArgumentParser(prog="cifar10")
    for name, help_text in positionals:
        cli.add_argument(name, type=str, help=help_text)
    for flag, kind, fallback, help_text in options:
        cli.add_argument(flag, type=kind, default=fallback, help=help_text)
    return cli.parse_args()

# Module-level state shared by the functions below. The data tuples and the
# model start out unset; parse_rev_args() assigns them.
trainloader = testloader = net = None
# The command line is parsed once, when the module is loaded.
args = get_args()

def build_graph_from_json(ir_model_json):
    """Build a Keras model from a JSON model description.

    The JSON text is handed to ``json_to_graph`` and the model returned by the
    resulting graph's ``produce_keras_model()`` is passed back to the caller.
    """
    return json_to_graph(ir_model_json).produce_keras_model()

def parse_rev_args(receive_msg):
    """Load CIFAR-10, build the model and store both in module globals.

    Sets the globals ``trainloader`` and ``testloader`` to ``(x, y)`` tuples
    and ``net`` to the model built from ``receive_msg``. When the
    ``CUDA_VISIBLE_DEVICES`` environment variable lists more than one device,
    ``net`` is wrapped with ``multi_gpu_model``.

    Args:
        receive_msg: JSON model description, passed unchanged to
            ``build_graph_from_json``.

    Returns:
        Always 0.
    """
    global trainloader
    global testloader
    global net

    (x_train, y_train), (x_test, y_test) = cifar10.load_data()
    # Convert the labels to categorical form over 10 classes.
    y_train = to_categorical(y_train, 10)
    y_test = to_categorical(y_test, 10)
    # Cast the images to float32 and divide by 255.
    x_train = x_train.astype("float32")
    x_test = x_test.astype("float32")
    x_train /= 255.0
    x_test /= 255.0
    trainloader = (x_train, y_train)
    testloader = (x_test, y_test)

    # Model
    net = build_graph_from_json(receive_msg)
    # NOTE(review): the session (and its graph) was fetched here but never
    # used. The call is kept in case obtaining the session at this point is
    # relied upon -- confirm and remove if not.
    K.get_session()

    # Parallel model. Only the environment lookup is guarded, so a KeyError
    # raised while building the multi-GPU model is no longer swallowed and
    # misreported as a missing environment variable.
    try:
        available_devices = os.environ["CUDA_VISIBLE_DEVICES"]
    except KeyError:
        print("parallel model not support in this config settings")
    else:
        gpus = len(available_devices.split(","))
        if gpus > 1:
            net = multi_gpu_model(net, gpus)
    return 0

def train_eval(mode, batch_size):
    """Compile the global model, run one fit or evaluate call, then exit.

    Args:
        mode: ``"train"`` runs ``net.fit`` on the first ``batch_size`` training
            samples; any other value runs ``net.evaluate`` on the first
            ``batch_size`` test samples.
        batch_size: Number of samples to use; converted with ``int()``. In
            train mode it is also passed to ``fit`` as the batch size.

    Raises:
        SystemExit: Always, once the fit/evaluate call has returned.
    """
    global trainloader
    global testloader
    global net
    bs_explore = int(batch_size)

    # Compile the model
    net.compile(
        loss="categorical_crossentropy", optimizer='SGD', metrics=["accuracy"]
    )

    (x_train, y_train) = trainloader
    (x_test, y_test) = testloader
    # NOTE(review): trial_id is not used below. The call is kept because any
    # side effects of nni.get_trial_id() are not visible here -- confirm and
    # remove if it is a pure getter.
    trial_id = nni.get_trial_id()
    if mode == "train":
        net.fit(
            x=x_train[0:bs_explore],
            y=y_train[0:bs_explore],
            batch_size=bs_explore,
        )
    else:
        # The evaluation result is computed but not reported anywhere.
        net.evaluate(x_test[0:bs_explore], y_test[0:bs_explore])
    # Terminate the process. SystemExit is raised directly because the exit()
    # helper is installed by the site module for interactive use and is not
    # guaranteed to exist in every interpreter configuration.
    raise SystemExit

if __name__ == "__main__":
    # Read the whole model description, not just its first line: a JSON file
    # that contains a line break would otherwise be passed on truncated.
    with open(args.model_json_file) as file_read:
        RCV_CONFIG = file_read.read()
    parse_rev_args(RCV_CONFIG)
    train_eval(args.mode, args.batch_size)

model.json
{"input_shape": [32, 32, 3], "vis": null, "weighted": false, "operation_history": [], "layer_id_to_input_node_ids": {"0": [0], "1": [1], "2": [2], "3": [3], "4": [4], "5": [5], "6": [6], "7": [7], "8": [8], "9": [9], "10": [10], "11": [11], "12": [12], "13": [13], "14": [14], "15": [15], "16": [16]}, "layer_id_to_output_node_ids": {"0": [1], "1": [2], "2": [3], "3": [4], "4": [5], "5": [6], "6": [7], "7": [8], "8": [9], "9": [10], "10": [11], "11": [12], "12": [13], "13": [14], "14": [15], "15": [16], "16": [17]}, "adj_list": {"0": [[1, 0]], "1": [[2, 1]], "2": [[3, 2]], "3": [[4, 3]], "4": [[5, 4]], "5": [[6, 5]], "6": [[7, 6]], "7": [[8, 7]], "8": [[9, 8]], "9": [[10, 9]], "10": [[11, 10]], "11": [[12, 11]], "12": [[13, 12]], "13": [[14, 13]], "14": [[15, 14]], "15": [[16, 15]], "16": [[17, 16]], "17": []}, "reverse_adj_list": {"0": [], "1": [[0, 0]], "2": [[1, 1]], "3": [[2, 2]], "4": [[3, 3]], "5": [[4, 4]], "6": [[5, 5]], "7": [[6, 6]], "8": [[7, 7]], "9": [[8, 8]], "10": [[9, 9]], "11": [[10, 10]], "12": [[11, 11]], "13": [[12, 12]], "14": [[13, 13]], "15": [[14, 14]], "16": [[15, 15]], "17": [[16, 16]]}, "node_list": [[0, [32, 32, 3]], [1, [32, 32, 3]], [2, [32, 32, 3]], [3, [32, 32, 64]], [4, [16, 16, 64]], [5, [16, 16, 64]], [6, [16, 16, 64]], [7, [16, 16, 64]], [8, [8, 8, 64]], [9, [8, 8, 64]], [10, [8, 8, 64]], [11, [8, 8, 64]], [12, [4, 4, 64]], [13, [64]], [14, [64]], [15, [64]], [16, [64]], [17, [10]]], "layer_list": [[0, ["StubReLU", 0, 1]], [1, ["StubBatchNormalization2d", 1, 2, 3]], [2, ["StubConv2d", 2, 3, 3, 64, 3, 1, 1]], [3, ["StubPooling2d", 3, 4, 2, 2, 0]], [4, ["StubReLU", 4, 5]], [5, ["StubBatchNormalization2d", 5, 6, 64]], [6, ["StubConv2d", 6, 7, 64, 64, 3, 1, 1]], [7, ["StubPooling2d", 7, 8, 2, 2, 0]], [8, ["StubReLU", 8, 9]], [9, ["StubBatchNormalization2d", 9, 10, 64]], [10, ["StubConv2d", 10, 11, 64, 64, 3, 1, 1]], [11, ["StubPooling2d", 11, 12, 2, 2, 0]], [12, ["StubGlobalPooling2d", 12, 13]], [13, ["StubDropout2d", 13, 14, 0.25]], 
[14, ["StubDense", 14, 15, 64, 64]], [15, ["StubReLU", 15, 16]], [16, ["StubDense", 16, 17, 64, 10]]]}

Topic		Replies	Views
nvprof error Application received signal 11 CUDA Programming and Performance	10	5334	May 12, 2021
Profiling deadloop (replay kernel) with nvprof on deep neural network Visual Profiler and nvprof	8	3306	August 24, 2017
nvprof: Internal profiling error 4277:5 on Tesla P100, but not on GTX 1070 Visual Profiler and nvprof	12	3982	October 12, 2021
nvprof shows DRAM throughput greater than theoretically possible Visual Profiler and nvprof	10	1773	January 11, 2018
Parsing custom tensorflow model DeepStream SDK	31	571	September 4, 2023
Internal profiling error 4292:1 CUDA Programming and Performance	1	1925	December 14, 2019
Error importing keras.models in tensorflow 2.5.0+nv21.8 / jetpack v46 Jetson Nano tensorflow	11	6558	March 9, 2022
Cannot profile RTX 2060 KO (TU104) with CUDA 11.0 on windows and ubuntu Visual Profiler and nvprof nvbugs	8	2755	July 27, 2020
Can't Get NCU GUI To Import Properly Nsight Compute	8	1339	October 5, 2020
DLProf crash Profiling Linux Targets nsight , deep-learning-profiler	10	2048	September 1, 2021

Nvprof error when running cifar10_keras

Related topics