Mismatch between TensorRT conv layer and PyTorch conv layer output

I am seeing a huge difference between the TensorRT inference output and the PyTorch layer output. I am using a (1, 28, 28) input and (20, 5, 5) kernels for the validation. Can you please let me know if I am missing anything in the code snippet below?

CUDA 10.0
TensorRT 5.0.2

############## Initialize weights and data #################
weights = torch.load('weights.pt')
test_data = np.random.rand(1, 28, 28)
test_data_vector = np.reshape(786, -1)

PyTorch Inference

torch_data = torch.tensor(test_data).float().unsqueeze(0)
pyt_output = torch.nn.functional.conv2d(torch_data, weights['conv1.weight'], weights['conv1.bias'], stride=1)
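
As a quick sanity check on shapes (just the standard output-size formula for a valid convolution, not part of the original code): with a 28x28 input, a 5x5 kernel, stride 1 and no padding, each output map should be (28 - 5) + 1 = 24 pixels per side, so both frameworks should produce a (1, 20, 24, 24) tensor.

print(pyt_output.shape)  # expected: torch.Size([1, 20, 24, 24])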

TensorRT Inference

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(TRT_LOGGER)
network = builder.create_network()
builder.max_workspace_size = 1 << 30

input_tensor = network.add_input(name="data", dtype=trt.float32, shape=(1, 28, 28))
conv1_w = weights['conv1.weight'].numpy()
conv1_b = weights['conv1.bias'].numpy()
conv1 = network.add_convolution(input=input_tensor, num_output_maps=20, kernel_shape=(5, 5), kernel=conv1_w, bias=conv1_b)
conv1.stride = (1, 1)
conv1.get_output(0).name = "output"
network.mark_output(tensor=conv1.get_output(0))

engine = builder.build_cuda_engine(network)
context = engine.create_execution_context()

inputs, outputs, bindings, stream = allocate_buffers(engine)

Copy to the pagelocked input buffer

np.copyto(inputs[0].host, test_data_vector)

For more information on performing inference, refer to the introductory samples.

The do_inference function will return a list of outputs - we only have one in this case.

trt_output = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
print((trt_output[0].reshape(1, 20, 24, 24)))
print('############')
print(pyt_output.numpy())
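
Rather than eyeballing the two printed arrays, a small comparison along these lines (not in the original code, the names are just illustrative) makes the size of the mismatch explicit:

trt_result = trt_output[0].reshape(1, 20, 24, 24)
ref = pyt_output.numpy()
print('max abs diff:', np.max(np.abs(trt_result - ref)))
print('allclose:', np.allclose(trt_result, ref, atol=1e-5))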

Imports

import torch
import tensorrt as trt
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit

Simple helper data class that’s a little nicer to use than a 2-tuple.

class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

Allocates all buffers required for an engine, i.e. host/device inputs/outputs.

def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers.
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream

This function is generalized for multiple inputs/outputs.

inputs and outputs are expected to be lists of HostDeviceMem objects.

def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream.
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]

I apologize for the post, I had a typo in my code: "test_data_vector = np.reshape(786, -1)". Changing it to "test_data_vector = test_data.astype(trt.nptype(trt.float32)).ravel()" fixed the issue.
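
For anyone hitting the same symptom: np.reshape(786, -1) just turns the scalar 786 into a one-element array, so the np.copyto call broadcasts that single value across the entire pagelocked input buffer and the engine never sees the actual test data, while PyTorch runs on the real input. A minimal corrected preparation, assuming the inputs list returned by allocate_buffers above, looks like:

test_data = np.random.rand(1, 28, 28)
# Flatten to a float32 vector so it matches the engine's 'data' input binding.
test_data_vector = test_data.astype(trt.nptype(trt.float32)).ravel()
np.copyto(inputs[0].host, test_data_vector)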