How to quantize a model for TensorRT?

I want to quantize a model to INT8 and run inference with TensorRT.
I followed this page and wrote the code below, but it did not work.

import copy
import os
import time

import modelopt.torch.quantization as mtq
import pandas as pd
import torch
import torch.cuda
import torch.nn as nn
import torch.nn.utils.prune as prune
import torch.optim as optim
import torch.quantization
import torch_tensorrt
from modelopt.torch.quantization.utils import export_torch_mode
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms


# Run on the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Dataset layout: <data_path>/annotations/{trainval,test}.txt and <data_path>/images/.
data_path = '../../dataset/oxford_pets'
train_annotations, test_annotations = (
    os.path.join(data_path, 'annotations', split_file)
    for split_file in ('trainval.txt', 'test.txt')
)
img_dir = os.path.join(data_path, 'images')


class OxfordPetsDataset(Dataset):
    """Image/label dataset driven by an Oxford-IIIT Pet annotation file.

    Dataset home page: https://www.robots.ox.ac.uk/~vgg/data/pets/

    The annotation file is parsed as space-delimited text without a header
    row. Column 0 is used as the image name (``.jpg`` is appended) and
    column 1 as the class id, which is shifted down by one when returned.

    Args:
        annotations_file: Path to the space-delimited annotation file.
        img_dir: Directory that contains the ``<name>.jpg`` images.
        transform: Optional callable applied to the RGB image before it is
            returned.
    """

    def __init__(self, annotations_file, img_dir, transform=None):
        self.img_labels = pd.read_csv(annotations_file, delimiter=' ', header=None)
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the annotation file."""
        return len(self.img_labels)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for annotation row ``idx``."""
        img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 0] + '.jpg')
        # convert() returns a new in-memory image, so the file handle can be
        # released as soon as the conversion is done.
        with Image.open(img_path) as img_file:
            image = img_file.convert('RGB')
        # Class ids in the file are shifted down by one to make them 0-based;
        # int() hands back a plain Python integer instead of a NumPy scalar.
        label = int(self.img_labels.iloc[idx, 1]) - 1

        # Explicit None check: a container-like transform that happens to be
        # empty is falsy and would otherwise be skipped.
        if self.transform is not None:
            image = self.transform(image)

        return image, label


batch_size = 32


# Both splits share resize -> tensor -> normalize; only the training split
# inserts a random horizontal flip between the resize and the tensor conversion.
data_transforms = {
    split: transforms.Compose([
        transforms.Resize((224, 224)),
        *augmentations,
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    for split, augmentations in (
        ('train', [transforms.RandomHorizontalFlip()]),
        ('test', []),
    )
}

# One dataset per split, each paired with its own annotation file and pipeline.
train_dataset, test_dataset = (
    OxfordPetsDataset(
        annotations_file=annotations,
        img_dir=img_dir,
        transform=data_transforms[split],
    )
    for split, annotations in (
        ('train', train_annotations),
        ('test', test_annotations),
    )
)

# Training batches are reshuffled every epoch; test batches keep file order.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=batch_size)

# Start from ImageNet weights. ``weights=ResNet50_Weights.IMAGENET1K_V1`` is the
# non-deprecated spelling of ``pretrained=True`` and selects the same checkpoint.
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, 37)  # 37 class
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

num_epochs = 5

print("Start training...")

model.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # .item() detaches the scalar so no autograd graph is kept alive.
        running_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader)}')

# torch.save does not create missing parent directories.
os.makedirs('model', exist_ok=True)
torch.save(model.state_dict(), 'model/original_oxford_pets_resnet50.pth')


def evaluate_model(model, test_loader, device):
    """Measure top-1 accuracy and mean per-batch forward time of ``model``.

    Args:
        model: Module called as ``model(inputs)``; its output is reduced with
            ``argmax`` over dimension 1 to obtain the predicted class.
        test_loader: Iterable yielding ``(inputs, labels)`` batches.
        device: ``torch.device`` or device string the batches are moved to.

    Returns:
        Tuple ``(accuracy, avg_inference_time)``: accuracy in percent and the
        mean wall-clock seconds per batch spent inside the forward call.

    Raises:
        ValueError: If ``test_loader`` yields no samples.
    """
    model.eval()
    # CUDA kernels are launched asynchronously, so the timer is only meaningful
    # if the GPU is drained right before and right after the forward pass.
    # On a CPU device there is nothing to wait for, and calling
    # torch.cuda.synchronize() without CUDA raises.
    needs_cuda_sync = torch.device(device).type == "cuda"
    correct = 0
    total = 0
    num_batches = 0
    inference_time = 0.0

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            if needs_cuda_sync:
                torch.cuda.synchronize()
            # perf_counter is monotonic, unlike time.time().
            start_time = time.perf_counter()
            outputs = model(inputs)
            if needs_cuda_sync:
                torch.cuda.synchronize()
            inference_time += time.perf_counter() - start_time

            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            # Counted here so that loaders without __len__ work as well.
            num_batches += 1

    if total == 0:
        raise ValueError("test_loader yielded no samples to evaluate")

    accuracy = 100 * correct / total
    avg_inference_time = inference_time / num_batches
    return accuracy, avg_inference_time


# Rebuild the architecture (no pretrained weights requested) with the
# 37-way head, then restore the weights saved after fine-tuning.
original_model = models.resnet50()
num_ftrs = original_model.fc.in_features
original_model.fc = nn.Linear(num_ftrs, 37)
# map_location lets a checkpoint saved from GPU tensors load on whatever
# `device` resolved to; weights_only=True restricts unpickling to plain
# tensors/containers, which is all a state_dict needs.
original_model.load_state_dict(
    torch.load(
        'model/original_oxford_pets_resnet50.pth',
        map_location=device,
        weights_only=True,
    )
)
original_model = original_model.to(device)

# Quantization works on a separate copy so original_model stays untouched.
ptqs_model = copy.deepcopy(original_model)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


def calibrate_loop(model, train_dataloader=train_loader):
    """Run ``model`` once over ``train_dataloader`` and print loss/accuracy.

    This function is handed to ``mtq.quantize`` as its ``forward_loop``.
    Only forward passes are performed; no parameters are updated here.

    Args:
        model: Module to run; it is switched to eval mode.
        train_dataloader: Iterable of ``(data, labels)`` batches. Defaults to
            the module-level ``train_loader`` as bound at definition time.

    Raises:
        ValueError: If ``train_dataloader`` yields no samples.
    """
    model.eval()
    total = 0
    correct = 0
    loss_sum = 0.0
    # Forward passes only: without no_grad every batch would build an autograd
    # graph, and summing the loss tensors would keep all of them alive.
    with torch.no_grad():
        for data, labels in train_dataloader:
            data, labels = data.to(device), labels.to(device)
            logits = model(data)
            batch_samples = labels.size(0)
            # criterion (nn.CrossEntropyLoss with default settings) returns the
            # batch mean; scale it back to a sum so that dividing by the sample
            # count below gives a true per-sample average.
            loss_sum += criterion(logits, labels).item() * batch_samples
            preds = logits.argmax(dim=1)
            total += batch_samples
            correct += (preds == labels).sum().item()

    if total == 0:
        raise ValueError("calibration dataloader yielded no samples")

    print(f"PTQ Loss: {loss_sum / total:.5f}, Accuracy: {100 * correct / total:.2f}%")


quant_type = "int8"
quant_cfg = mtq.INT8_DEFAULT_CFG if quant_type == "int8" else mtq.FP8_DEFAULT_CFG

# Quantize ptqs_model, using calibrate_loop as the forward loop. The returned
# model is kept, which is the usage shown in the modelopt documentation.
# A failure must stop the script: the code below exports ptqs_model and builds
# a reduced-precision TensorRT engine from it regardless, so swallowing the
# error here would feed an unquantized model into that build.
try:
    ptqs_model = mtq.quantize(ptqs_model, quant_cfg, forward_loop=calibrate_loop)
except Exception as err:
    print(f"Quantization failed: {str(err)}")
    raise


# Export the quantized model, compile it with Torch-TensorRT, and evaluate the
# compiled module on the test split.
with torch.no_grad():
    with export_torch_mode():
        # A single test batch serves as the example input for export/compile.
        input_tensor = next(iter(test_loader))[0].cuda()

        # NOTE: _export is a private torch.export entry point and may change
        # between PyTorch releases.
        from torch.export._trace import _export
        exp_program = _export(ptqs_model, (input_tensor,))

        enabled_precisions = {torch.int8} if quant_type == "int8" else {torch.float8_e4m3fn}
        trt_ptq_int8_model = torch_tensorrt.dynamo.compile(
            exp_program,
            inputs=[input_tensor],
            enabled_precisions=enabled_precisions,
            min_block_size=1,
            debug=False
        )

        # NOTE(review): the module is compiled from one example batch; a final
        # test batch with fewer samples may not match that shape — confirm, or
        # build the loader with drop_last=True / compile with dynamic shapes.
        total, correct, loss = 0, 0, 0.0
        for data, labels in test_loader:
            data, labels = data.cuda(), labels.cuda()
            out = trt_ptq_int8_model(data)
            batch_samples = labels.size(0)
            # criterion returns the batch mean; weight it by the batch size so
            # loss / total is a per-sample average, and use .item() so a Python
            # float is accumulated rather than a CUDA tensor.
            loss += criterion(out, labels).item() * batch_samples
            preds = torch.max(out, 1)[1]
            total += batch_samples
            correct += (preds == labels).sum().item()

        print(f"Test Loss: {loss / total:.5f}, Test Accuracy: {100 * correct / total:.2f}%")

I executed this code and encountered the error below.

INFO:torch_tensorrt.dynamo._compiler:Compilation Settings: CompilationSettings(enabled_precisions={<dtype.i8: 3>}, debug=False, workspace_size=0, min_block_size=1, torch_executed_ops=set(), pass_through_build_failures=False, max_aux_streams=None, version_compatible=False, optimization_level=None, use_python_runtime=False, truncate_double=False, use_fast_partitioner=True, enable_experimental_decompositions=False, device=Device(type=DeviceType.GPU, gpu_id=0), require_full_compilation=False, disable_tf32=False, assume_dynamic_shape_support=False, sparse_weights=False, refit=False, engine_capability=<EngineCapability.STANDARD: 1>, num_avg_timing_iters=1, dla_sram_size=1048576, dla_local_dram_size=1073741824, dla_global_dram_size=536870912, dryrun=False, hardware_compatible=False, timing_cache_path='/tmp/timing_cache.bin')

INFO:torch_tensorrt.dynamo._compiler:Partitioning the graph via the fast partitioner
INFO:torch_tensorrt [TensorRT Conversion Context]:The logger passed into createInferBuilder differs from one already provided for an existing builder, runtime, or refitter. Uses of the global logger, returned by nvinfer1::getLogger(), will return the existing value.
INFO:torch_tensorrt [TensorRT Conversion Context]:[MemUsageChange] Init CUDA: CPU +0, GPU +0, now: CPU 2074, GPU 6806 (MiB)
INFO:torch_tensorrt.dynamo.conversion._TRTInterpreter:TRT INetwork construction elapsed time: 0:00:00.004684
INFO:torch_tensorrt [TensorRT Conversion Context]:BuilderFlag::kTF32 is set but hardware does not support TF32. Disabling TF32.
WARNING:torch_tensorrt [TensorRT Conversion Context]:Calibrator is not being used. Users must provide dynamic range for all tensors that are not Int32 or Bool.
ERROR:torch_tensorrt [TensorRT Conversion Context]:IBuilder::buildSerializedNetwork: Error Code 4: Internal Error (Calibration failure occurred with no scaling factors detected. This could be due to no int8 calibrator or insufficient custom scales for network layers. Please see int8 sample to setup calibration correctly.)
---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
Cell In[22], line 9
      6 exp_program = _export(ptqs_model, (input_tensor,))
      8 enabled_precisions = {torch.int8} if quant_type == "int8" else {torch.float8_e4m3fn}
----> 9 trt_ptq_int8_model = torch_tensorrt.dynamo.compile(
     10     exp_program,
     11     inputs=[input_tensor],
     12     enabled_precisions=enabled_precisions,
     13     min_block_size=1,
     14     debug=False
     15 )
     17 # 推論
     18 total, correct, loss = 0, 0, 0.0

File /usr/local/lib/python3.8/dist-packages/torch_tensorrt/dynamo/_compiler.py:230, in compile(exported_program, inputs, device, disable_tf32, assume_dynamic_shape_support, sparse_weights, enabled_precisions, engine_capability, refit, debug, num_avg_timing_iters, workspace_size, dla_sram_size, dla_local_dram_size, dla_global_dram_size, truncate_double, require_full_compilation, min_block_size, torch_executed_ops, torch_executed_modules, pass_through_build_failures, max_aux_streams, version_compatible, optimization_level, use_python_runtime, use_fast_partitioner, enable_experimental_decompositions, dryrun, hardware_compatible, timing_cache_path, **kwargs)
    228 settings = CompilationSettings(**compilation_options)
    229 logger.info("Compilation Settings: %s\n", settings)
--> 230 trt_gm = compile_module(gm, inputs, settings)
    231 return trt_gm

File /usr/local/lib/python3.8/dist-packages/torch_tensorrt/dynamo/_compiler.py:418, in compile_module(gm, sample_inputs, settings)
    416     # Create TRT engines from submodule
    417     if not settings.dryrun:
--> 418         trt_module = convert_module(
    419             submodule,
    420             submodule_inputs,
    421             settings=settings,
    422             name=name,
    423         )
    425         trt_modules[name] = trt_module
    427 sample_outputs = gm(
    428     *get_torch_inputs(sample_inputs, to_torch_device(settings.device))
    429 )

File /usr/local/lib/python3.8/dist-packages/torch_tensorrt/dynamo/conversion/_conversion.py:106, in convert_module(module, inputs, settings, name)
     91 def convert_module(
     92     module: torch.fx.GraphModule,
     93     inputs: Sequence[Input],
     94     settings: CompilationSettings = CompilationSettings(),
     95     name: str = "",
     96 ) -> PythonTorchTensorRTModule | TorchTensorRTModule:
     97     """Convert an FX module to a TRT module
     98     Args:
     99         module: FX GraphModule to convert
   (...)
    104         _PythonTorchTensorRTModule or TorchTensorRTModule
    105     """
--> 106     interpreter_result = interpret_module_to_result(module, inputs, settings)
    108     if settings.use_python_runtime or not ENABLED_FEATURES.torch_tensorrt_runtime:
    109         if not settings.use_python_runtime:

File /usr/local/lib/python3.8/dist-packages/torch_tensorrt/dynamo/conversion/_conversion.py:87, in interpret_module_to_result(module, inputs, settings)
     73 output_dtypes = infer_module_output_dtypes(
     74     module,
     75     inputs,
     76     settings.device,
     77     truncate_double=settings.truncate_double,
     78 )
     80 interpreter = TRTInterpreter(
     81     module,
     82     inputs,
   (...)
     85     compilation_settings=settings,
     86 )
---> 87 interpreter_result = interpreter.run()
     88 return interpreter_result

File /usr/local/lib/python3.8/dist-packages/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py:344, in TRTInterpreter.run(self, strict_type_constraints, algorithm_selector, tactic_sources)
    337 self._create_timing_cache(
    338     builder_config, self.compilation_settings.timing_cache_path
    339 )
    341 serialized_engine = self.builder.build_serialized_network(
    342     self.ctx.net, builder_config
    343 )
--> 344 assert serialized_engine
    346 _LOGGER.info(
    347     f"Build TRT engine elapsed time: {datetime.now() - build_engine_start_time}"
    348 )
    349 _LOGGER.info(f"TRT Engine uses: {serialized_engine.nbytes} bytes of Memory")

AssertionError: 

Even when I ran the code from the documentation exactly as written, I encountered the same error.
How can I solve this issue?

Additional Information:

# Print the installed PyTorch, TensorRT and Torch-TensorRT versions.
import torch
import tensorrt as trt
import torch_tensorrt
print(f"PyTorch: {torch.__version__}")
print(f"TensorRT: {trt.__version__}")
print(f"Torch-TensorRT: {torch_tensorrt.__version__}")
PyTorch: 2.4.1+cu121
TensorRT: 10.1.0
Torch-TensorRT: 2.4.0+cu121

# I cannot use torch_tensorrt.ptq with my version.
calib = torch_tensorrt.ptq.DataLoaderCalibration()
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[3], line 1
----> 1 calib = torch_tensorrt.ptq.DataLoaderCalibration()

AttributeError: module 'torch_tensorrt' has no attribute 'ptq'

Dataset: The Oxford-IIIT Pet Dataset