TensorRT (C++) inference strange behavior on Jetson AGX Xavier

I developed two distinct models, for two use cases, to analyze some vibration patterns: one for when the system is turned on and one for when the system is shut down (so no vibration is detected).
The entire training process uses TensorFlow 2.7.0 (an autoencoder in Python) to create .h5 models, which are converted to .onnx model files and then to .engine files for the Jetson platform (Jetson AGX Xavier, CUDA).

Jetson AGX Xavier specs:
CUDA: 11.4.315
cuDNN: 8.6.0
TensorRT: 8.5.2.2
JetPack: 5.1.3
python3 -c "import tensorflow as tf; print('TensorFlow version:', tf.__version__)"
TensorFlow version: 2.11.0

Autoencoder training script in Python (sample):

  input_img = tf.keras.layers.Input(shape=(2000, lines))

  # Encoder
  x = tf.keras.layers.Conv1D(12, 128, padding='same')(input_img)
  x = tf.keras.layers.MaxPooling1D(4)(x)  # Downsample: 2000 -> 500

  x = tf.keras.layers.Conv1D(12, 64, padding='same')(x)
  x = tf.keras.layers.MaxPooling1D(2)(x)  # Downsample: 500 -> 250

  x = tf.keras.layers.Conv1D(12, 16, padding='same')(x)
  x = tf.keras.layers.MaxPooling1D(2)(x)  # Downsample: 250 -> 125

  # Bottleneck
  x = tf.keras.layers.Flatten()(x)
  x = tf.keras.layers.Dense(self.__config['MODEL']['ENCODED_STATE_SIZE'])(x)

  # Decoder
  x = tf.keras.layers.Dense(125 * 12)(x)  # Expand to match last encoder feature size
  x = tf.keras.layers.Reshape((125, 12))(x)

  x = tf.keras.layers.UpSampling1D(2)(x)  # Upsample: 125 -> 250
  x = tf.keras.layers.Conv1D(12, 16, padding='same')(x)

  x = tf.keras.layers.UpSampling1D(2)(x)  # Upsample: 250 -> 500
  x = tf.keras.layers.Conv1D(12, 64, padding='same')(x)

  x = tf.keras.layers.UpSampling1D(4)(x)  # Upsample: 500 -> 2000
  x = tf.keras.layers.Conv1D(lines, 128, padding='same')(x)  # Correct Final Layer

  # Model definition
  self.__model = tf.keras.models.Model(input_img, x)
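
For reference, loading one of the saved .h5 models and printing its input/output shapes shows what the exported ONNX/TensorRT engine should expose. This is just a rough sketch; the file name is a placeholder for one of my two trained models:

import tensorflow as tf

# Placeholder path; in my case one of the two trained .h5 files
model = tf.keras.models.load_model("model_on.h5")

# With this architecture the decoder reconstructs the input,
# so both shapes come out as (None, 2000, lines)
print("input :", model.input_shape)
print("output:", model.output_shape)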

It doesn’t matter which model I use, the inference result values are the SAME, exactly the same values, as if the neural network had learned nothing…
Below you can see two comparative charts with the inference values.

Please don’t assume the data might be corrupted; I have collected enough data to train for both cases and I’ve checked its validity.

The confusing part is that inference works in Python, using TensorFlow 2.7.0 with GPU on Ubuntu Focal x86_64… I mean, there I do see different values between the two charts.

On the Jetson I’ve made a Python script to convert the .h5 model file into .onnx and then into .engine format:

import tf2onnx
import tensorflow as tf
import argparse
import subprocess


def convert_h5_to_onnx(h5_model_path, onnx_model_path):
    print("Converting .h5 model to ONNX...")

    model = tf.keras.models.load_model(h5_model_path)

    model_proto, _ = tf2onnx.convert.from_keras(model, opset=13)
    
    with open(onnx_model_path, "wb") as f:
        f.write(model_proto.SerializeToString())
    
    print(f"ONNX model saved at {onnx_model_path}")

def convert_onnx_to_trt(onnx_model_path, engine_model_path, trt_precision_mode):
    print("Converting ONNX model to TensorRT Engine...")

    fp_precision_flag = '--fp16' if trt_precision_mode.upper() == 'FP16' else ''
    
    trtexec_path = "/usr/src/tensorrt/bin/trtexec"

    command = f"{trtexec_path} --onnx={onnx_model_path} --saveEngine={engine_model_path} {fp_precision_flag}"
    
    process = subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    
    if process.returncode != 0:
        print(f"Error in converting to TensorRT engine:\n{process.stderr.decode('utf-8')}")
    else:
        print(f"TensorRT engine saved at {engine_model_path}")

# Main
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Convert a .h5 model to ONNX and TensorRT engine format")
    parser.add_argument("--h5_model_path", type=str, required=True, help="Path to the .h5 model file")
    parser.add_argument("--onnx_model_path", type=str, required=True, help="Path to save the converted ONNX model")
    parser.add_argument("--engine_model_path", type=str, required=True, help="Path to save the converted TensorRT engine")
    parser.add_argument("--trt_precision_mode", type=str, choices=['FP32', 'FP16'], default="FP16", help="Precision mode for TensorRT engine (FP32 or FP16)")

    args = parser.parse_args()

    convert_h5_to_onnx(args.h5_model_path, args.onnx_model_path)

    convert_onnx_to_trt(args.onnx_model_path, args.engine_model_path, args.trt_precision_mode)
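
To rule out the .h5 → .onnx step, one check I am considering is comparing the Keras and ONNX outputs on the same random input, along these lines (a rough sketch, assuming onnxruntime is installed; the file names are placeholders):

import numpy as np
import tensorflow as tf
import onnxruntime as ort

keras_model = tf.keras.models.load_model("model_on.h5")   # placeholder path
sess = ort.InferenceSession("model_on.onnx")               # placeholder path

# Random input with the model's shape: (batch=1, 2000, 3)
x = np.random.rand(1, 2000, 3).astype(np.float32)

keras_out = keras_model.predict(x)
input_name = sess.get_inputs()[0].name
onnx_out = sess.run(None, {input_name: x})[0]

# If the conversion is fine, the two outputs should match closely
print("max abs diff:", np.max(np.abs(keras_out - onnx_out)))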

“RunInference” is my C/C++ inference function using TensorRT (as input data, I use the FFTs of the raw values):

void RunInference(ICudaEngine* engine, IExecutionContext* context, int input_index, int output_index, kiss_fft_cpx* x_fft, kiss_fft_cpx* y_fft, kiss_fft_cpx* z_fft, float* predicted_output, int g_code, const char* clientName) {
	
    int batchSize = 1;
    int input_size = batchSize * 2000 * 3 * sizeof(float);  // [1, 2000, 3]
    int output_size = batchSize * 3 * sizeof(float);        // [1, 3]

    // Prepare normalized input data and set DC component to zero
    float input_data[2000 * 3];
    const int MN = 4000;
	
    for (int i = 0; i < 2000; i++) {
        input_data[i * 3 + 0] = sqrt(x_fft[i].r * x_fft[i].r + x_fft[i].i * x_fft[i].i) / MN; 
        input_data[i * 3 + 1] = sqrt(y_fft[i].r * y_fft[i].r + y_fft[i].i * y_fft[i].i) / MN; 
        input_data[i * 3 + 2] = sqrt(z_fft[i].r * z_fft[i].r + z_fft[i].i * z_fft[i].i) / MN; 
    }
	
    // Set DC component to zero
    input_data[0] = 0;  // X-axis
    input_data[1] = 0;  // Y-axis
    input_data[2] = 0;  // Z-axis

    // Allocate GPU buffers for input and output
    void* buffers[2];

    write_log(LOG_DEBUG, "RunInference for '%s' - input_index = %d, output_index = %d", clientName, input_index, output_index);

    if (cudaMalloc(&buffers[input_index], input_size) != cudaSuccess) {
        write_log(LOG_ERROR, "RunInference for '%s' - Failed to allocate memory for input buffer", clientName);
        return;
    }
    if (cudaMalloc(&buffers[output_index], output_size) != cudaSuccess) {
        write_log(LOG_ERROR, "RunInference for '%s' - Failed to allocate memory for output buffer", clientName);
        cudaFree(buffers[input_index]);
        return;
    }

    if (cudaMemset(buffers[input_index], 0, input_size) != cudaSuccess) {
        write_log(LOG_ERROR, "RunInference for '%s' - Failed to memset input buffer to zero", clientName);
        cudaFree(buffers[input_index]);
        cudaFree(buffers[output_index]);
        return;
    }
    if (cudaMemset(buffers[output_index], 0, output_size) != cudaSuccess) {
        write_log(LOG_ERROR, "RunInference for '%s' - Failed to memset output buffer to zero", clientName);
        cudaFree(buffers[input_index]);
        cudaFree(buffers[output_index]);
        return;
    }
				
    // Copy the input data to the GPU
    cudaMemcpy(buffers[input_index], input_data, input_size, cudaMemcpyHostToDevice);

    // Launch inference
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    context->enqueueV2(buffers, stream, nullptr);
    cudaStreamSynchronize(stream);

    // Copy the output data from GPU to CPU
    cudaMemcpy(predicted_output, buffers[output_index], output_size, cudaMemcpyDeviceToHost);

    // Free GPU memory
    cudaFree(buffers[input_index]);
    cudaFree(buffers[output_index]);
    cudaStreamDestroy(stream);
}
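
To double-check the buffer sizes hard-coded above against what the engine itself reports, my idea is to dump the binding names, shapes and data types with the TensorRT Python bindings on the Jetson. A rough sketch (the engine path is a placeholder):

import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)
runtime = trt.Runtime(logger)

with open("model_on.engine", "rb") as f:   # placeholder path
    engine = runtime.deserialize_cuda_engine(f.read())

# Print every binding the engine exposes, so the C++ buffer sizes
# and binding indices can be checked against them
for i in range(engine.num_bindings):
    print(i,
          engine.get_binding_name(i),
          "input" if engine.binding_is_input(i) else "output",
          engine.get_binding_shape(i),
          engine.get_binding_dtype(i))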

This is how I load one model in the app and how I call the inference function:

IRuntime* runtime = createInferRuntime(gLogger);
if (!runtime) {
    write_log(LOG_ERROR, "client_handler: Failed to create runtime for client %s", client.ClientName);
    return (void*)-1;
}

std::vector<char> engine_data = loadEngine(client.ModelPath, client.ClientName);

ICudaEngine* engine = runtime->deserializeCudaEngine(engine_data.data(), engine_data.size(), nullptr);
if (!engine) {
    write_log(LOG_ERROR, "client_handler: Failed to create engine for thread %s", client.ClientName);
    return (void*)-1;
}

IExecutionContext* context = engine->createExecutionContext();
if (!context) {
    write_log(LOG_ERROR, "client_handler: Failed to create execution context for thread %s", client.ClientName);
    engine->destroy();
    return (void*)-1;
}

int input_index = engine->getBindingIndex(client.ModelInputBindingName);   // from config file
int output_index = engine->getBindingIndex(client.ModelOutputBindingName); // from config file

RunInference(engine, context, input_index, output_index, x_fft, y_fft, z_fft, predicted_output, client.G_code, client.ClientName);

// Synchronize the GPU to ensure all operations are completed
cudaDeviceSynchronize();

// Check for CUDA errors after synchronization
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
    write_log(LOG_ERROR, "CUDA error after synchronization in thread '%s': %s", client.ClientName, cudaGetErrorString(err));
} else {
    write_log(LOG_INFO, "GPU synchronized successfully for thread '%s'", client.ClientName);
}

context->destroy();
engine->destroy();
runtime->destroy();

I want to point out that the application does detect the vibrations, but I don’t understand why the range of output values doesn’t change depending on which of the two trained models is loaded. I suspect the problem might be in the model conversion or in the TensorRT inference process / function in C/C++.
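
One more check I have in mind, to decide whether the problem is in the conversion or in my C/C++ pipeline, is to save one preprocessed window from the app (FFT magnitudes divided by 4000, DC bins zeroed, exactly as in RunInference) and run it through the original .h5 model in Python. A rough sketch (the .npy and .h5 file names are placeholders):

import numpy as np
import tensorflow as tf

# One preprocessed window dumped from the C++ app, shape (2000, 3):
# magnitude of the FFT divided by 4000, DC bins set to zero
window = np.load("window.npy").astype(np.float32)   # placeholder file

model = tf.keras.models.load_model("model_on.h5")   # placeholder path
out = model.predict(window[np.newaxis, ...])         # add the batch dimension

# These values can then be compared with what RunInference returns
# for the exact same window on the Jetson
print(out.shape, out.min(), out.max())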

Do you have any suggestions?