could not find any implementation for node 2-layer MLP, try increasing the workspace size with IBuilder::setMaxWorkspaceSize()

TensorFlow 1.11.0
TensorRT 5.0.0.10
Ubuntu 16.04
Python 2.7
CUDA 9.0
cuDNN 7.1.2

I ran into a problem:

[TensorRT] ERROR: Internal error: could not find any implementation for node 2-layer MLP, try increasing the workspace size with IBuilder::setMaxWorkspaceSize()
[TensorRT] ERROR: …/builder/tacticOptimizer.cpp (1228) - OutOfMemory Error in computeCosts: 0

I am using the sample from the tutorial, which is located in "TensorRT-5.0.0.10/python/sample/end_to_end_mnist".
This sample works fine when I simply run it in a terminal.
But when I use the model (the Keras model in "end_to_end_mnist") and try to build an engine from scratch,
it gets an error.

I followed https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#optimizing_int8_python. I have frozen the model and parsed it successfully, but I fail to build an engine.

I tried both decreasing the batch size and increasing the workspace size, but neither worked.
Does anyone know how to solve this problem?

FILE_ONE:

import tensorflow as tf
import numpy as np
import tensorrt as trt
import sys,os

(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0
print(type(x_train))
NUM_TRAIN = 60000
NUM_TEST = 10000
x_train = np.reshape(x_train, (NUM_TRAIN, 28, 28, 1))
x_test = np.reshape(x_test, (NUM_TEST, 28, 28, 1))

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.InputLayer(input_shape=[28, 28, 1]))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(512, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(10, activation=tf.nn.softmax))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model on the data

model.fit(x_train, y_train, epochs=1, verbose=1)

# Evaluate the model on test data

model.evaluate(x_test, y_test)

output_names = model.output.op.name
sess = tf.keras.backend.get_session()
frozen_graph = tf.graph_util.convert_variables_to_constants(sess, sess.graph.as_graph_def(), [output_names])
frozen_graph = tf.graph_util.remove_training_nodes(frozen_graph)

# Save the model

with open("testkeras_uff", "wb") as ofile:
    ofile.write(frozen_graph.SerializeToString())

class ModelData(object):
    MODEL_FILE = os.path.join(os.path.dirname(__file__), "testkeras_uff")
    INPUT_NAME = "input_1"
    INPUT_SHAPE = (1, 28, 28)
    OUTPUT_NAME = "dense_1/Softmax"
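To double-check that INPUT_NAME and OUTPUT_NAME match the frozen graph, the node names can be printed before converting to UFF; a minimal sketch:

# Sketch: list the node names in the frozen GraphDef so the names passed to the
# UFF parser (input_1, dense_1/Softmax) can be confirmed against the real graph.
for node in frozen_graph.node:
    print(node.name)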

BEFORE RUNNING FILE_TWO YOU SHOULD run "convert-to-uff testkeras_uff" in a terminal.
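Alternatively, the conversion can be done from Python; a minimal sketch, assuming the uff converter package that ships with TensorRT is importable and the frozen graph was saved as "testkeras_uff" as above:

import uff

# Sketch: convert the frozen GraphDef file to UFF from Python instead of the CLI;
# writes testkeras_uff.uff alongside the frozen graph.
uff.from_tensorflow_frozen_model("testkeras_uff",
                                 ["dense_1/Softmax"],
                                 output_filename="testkeras_uff.uff")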
FILE_TWO:

import tensorflow as tf
import numpy as np
import tensorrt as trt
import sys,os
import pycuda.driver as cuda
import pycuda.autoinit
import argparse

model_file = "testkeras_uff.uff"
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network, trt.UffParser() as parser:
    # Parse the Uff Network
    parser.register_input("input_1", (1, 28, 28))
    parser.register_output("dense_1/Softmax")
    parser.parse(model_file, network)
    # Build and return an engine.
    builder.build_cuda_engine(network)

    builder.max_batch_size = 1
    builder.max_workspace_size = 1 << 20

    with builder.build_cuda_engine(network) as engine:

        h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=np.float32)
        h_output = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=np.float32)
        # Allocate device memory for inputs and outputs.
        d_input = cuda.mem_alloc(h_input.nbytes)
        d_output = cuda.mem_alloc(h_output.nbytes)
        # Create a stream in which to copy inputs/outputs and run inference.
        stream = cuda.Stream()

        with engine.create_execution_context() as context:
            # Transfer input data to the GPU.
            cuda.memcpy_htod_async(d_input, h_input, stream)
            # Run inference.
            context.execute_async(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(h_output, d_output, stream)
            # Synchronize the stream.
            stream.synchronize()
            # Return the host output.
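Note that h_input is copied to the GPU without being filled here; a minimal sketch of loading one MNIST test image into it first, assuming x_test holds the raw 28x28 images from tf.keras.datasets (load_test_image is a hypothetical helper name, not part of the sample):

# Sketch of a hypothetical helper: copy one MNIST test image, scaled to [0, 1] and
# flattened to the 1x28x28 input volume, into the page-locked buffer before the
# memcpy_htod_async call above.
def load_test_image(x_test, index, pagelocked_buffer):
    img = x_test[index].astype(np.float32).ravel() / 255.0
    np.copyto(pagelocked_buffer, img)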

Thanks.

Hello,

I modified /workspace/tensorrt/samples/python/end_to_end_tensorflow_mnist/sample.py to set max_workspace_size and max_batch_size and ran with no issues.

def build_engine(model_file):
    # For more information on TRT basics, refer to the introductory samples.
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network, trt.UffParser() as parser:
        builder.max_workspace_size = 1 << 20  # common.GiB(1)  <-- added
        builder.max_batch_size = 1            # <-- added

        # Parse the Uff Network
        parser.register_input(ModelData.INPUT_NAME, ModelData.INPUT_SHAPE)
        parser.register_output(ModelData.OUTPUT_NAME)
        parser.parse(model_file, network)
        # Build and return an engine.
        return builder.build_cuda_engine(network)
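With the options applied before parsing and building, the rest of your inference code can consume the returned engine directly; a short usage sketch, assuming the same UFF file name as in your post:

# Sketch: build the engine with the options set up front, then create a context.
with build_engine("testkeras_uff.uff") as engine, engine.create_execution_context() as context:
    # Binding 0 is the input and binding 1 the output for this two-layer MLP.
    print([engine.get_binding_shape(i) for i in range(engine.num_bindings)])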

But I do see the same symptom with the Keras MNIST model. Reviewing now and will report back with what we find.

Thanks NVES, it works.

Very glad that you replied.

I also suggest editing the code at https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#build_engine_python, which is a little confusing as it stands.

Thanks again.

I'm seeing a similar error.

@NVES
But I am on an NVIDIA Tesla V100 GPU, and my environment is:

TensorRT 5.0.2.6
CentOS 7
CUDA 9.0
cuDNN 7.3.1

If I set builder->setMaxWorkspaceSize(1 << 20) it works, but when I set builder->setMaxWorkspaceSize(1 << 32) it shows this error:

ERROR: Internal error: could not find any implementation for node 2-layer MLP, try increasing the workspace size with IBuilder::setMaxWorkspaceSize()
ERROR: ../builder/tacticOptimizer.cpp (1230) - OutOfMemory Error in computeCosts: 0

My code is as follows:

    IBuilder* builder = createInferBuilder(gLogger);
    INetworkDefinition* network = builder->createNetwork();

    ICaffeParser* parser = createCaffeParser();
    // parser->setPluginFactory(&pluginFactory);

    bool mEnableFp16 = builder->platformHasFastFp16();
    bool mEnableInt8 = builder->platformHasFastInt8();
    printf(LOG_GIE "platform %s Fp16 support.\n", mEnableFp16 ? "has" : "does not have");
    printf(LOG_GIE "platform %s Int8 support.\n", mEnableInt8 ? "has" : "does not have");

    DataType modelDataType = mEnableFp16 ? DataType::kHALF : DataType::kFLOAT;
    // DataType modelDataType = useInt8 ? DataType::kINT8 : DataType::kFLOAT;

    printf(LOG_GIE "loading %s \n", deployFile.c_str());

    const IBlobNameToTensor *blobNameToTensor =	parser->parse(deployFile.c_str(),
                                                              modelFile.c_str(),
                                                              *network,
                                                              modelDataType);

    assert(blobNameToTensor != nullptr);

    for (int i = 0, n = network->getNbInputs(); i < n; i++)
    {
        Dims3 dims = static_cast<Dims3&&>(network->getInput(i)->getDimensions());
        std::cout << "Input \"" << network->getInput(i)->getName() << "\": " << dims.d[0] << "x" << dims.d[1] << "x" <<
        dims.d[2] << std::endl;
    }

    for (auto& s : outputs) network->markOutput(*blobNameToTensor->find(s.c_str()));

    builder->setMaxBatchSize(maxBatchSize);
    builder->setMaxWorkspaceSize(1 << 32);

    // set up the network for paired-fp16 format
    if(mEnableFp16)builder->setHalf2Mode(true);

    ICudaEngine* engine = builder->buildCudaEngine(*network);
    assert(engine);

    network->destroy();
    parser->destroy();

    gieModelStream = engine->serialize();
    engine->destroy();
    builder->destroy();
    // pluginFactory.destroyPlugin();

    std::ofstream ofs("serialized_engine.trt", std::ios::out | std::ios::binary);
    ofs.write((char*)(gieModelStream->data()), gieModelStream->size());
    ofs.close();
    gieModelStream->destroy();
    shutdownProtobufLibrary();

Can you give me some advice?