I want to quantize a model to INT8 and run inference on it with TensorRT.
I followed this page and wrote the code below, but it did not work.
import copy
import os
import time

import modelopt.torch.quantization as mtq
import pandas as pd
import torch
import torch.cuda
import torch.nn as nn
import torch.nn.utils.prune as prune
import torch.optim as optim
import torch.quantization
import torch_tensorrt
from modelopt.torch.quantization.utils import export_torch_mode
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Dataset root, resolved relative to the current working directory.
data_path = '../../dataset/oxford_pets'
img_dir = os.path.join(data_path, 'images')

# Annotation files for the train/validation split and the test split.
train_annotations, test_annotations = (
    os.path.join(data_path, 'annotations', split_file)
    for split_file in ('trainval.txt', 'test.txt')
)
class OxfordPetsDataset(Dataset):
    """Map-style dataset that reads images listed in an annotations file.

    https://www.robots.ox.ac.uk/~vgg/data/pets/

    The annotations file is parsed as headerless, space-delimited text.
    Column 0 is used as the image file stem and column 1 as the class id,
    which is shifted down by one so that labels start at 0.

    Args:
        annotations_file: Path of the space-delimited annotations file.
        img_dir: Directory containing the ``<stem>.jpg`` image files.
        transform: Optional callable applied to the loaded RGB image.
    """

    def __init__(self, annotations_file, img_dir, transform=None):
        # Needs pandas available as ``pd`` at module scope.
        self.img_labels = pd.read_csv(annotations_file, delimiter=' ', header=None)
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the annotations file."""
        return len(self.img_labels)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for row ``idx`` of the annotations file."""
        img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 0] + '.jpg')
        image = Image.open(img_path).convert('RGB')
        label = self.img_labels.iloc[idx, 1] - 1
        # Compare against None explicitly: a transform object that happens
        # to be falsy (e.g. defines __len__) must still be applied.
        if self.transform is not None:
            image = self.transform(image)
        return image, label
batch_size = 32

# Per-channel mean / std handed to Normalize for both splits.
_NORM_MEAN = (0.485, 0.456, 0.406)
_NORM_STD = (0.229, 0.224, 0.225)


def _make_transform(augment):
    """Build the preprocessing pipeline; ``augment`` adds a random horizontal flip."""
    steps = [transforms.Resize((224, 224))]
    if augment:
        steps.append(transforms.RandomHorizontalFlip())
    steps.append(transforms.ToTensor())
    steps.append(transforms.Normalize(list(_NORM_MEAN), list(_NORM_STD)))
    return transforms.Compose(steps)


# Only the training pipeline is augmented; the test pipeline is deterministic.
data_transforms = {
    'train': _make_transform(augment=True),
    'test': _make_transform(augment=False),
}

train_dataset = OxfordPetsDataset(train_annotations, img_dir, data_transforms['train'])
test_dataset = OxfordPetsDataset(test_annotations, img_dir, data_transforms['test'])

# Training batches are reshuffled each epoch; test batches keep file order.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
# Fine-tune a pretrained ResNet-50 with a fresh 37-way classification head.
# `weights=` replaces the deprecated `pretrained=True`; IMAGENET1K_V1 is the
# checkpoint that `pretrained=True` selected, so the starting point is the same.
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, 37)  # 37 class
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
num_epochs = 5

print("Start training...")
model.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # Mean of the per-batch losses for this epoch.
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader)}')

# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing the checkpoint.
os.makedirs('model', exist_ok=True)
torch.save(model.state_dict(), 'model/original_oxford_pets_resnet50.pth')
def evaluate_model(model, test_loader, device):
    """Measure top-1 accuracy and mean forward-pass time per batch.

    Args:
        model: Module to evaluate. It is switched to eval mode.
        test_loader: Iterable yielding ``(inputs, labels)`` batches.
        device: Device (object or string) the batches are moved to.

    Returns:
        A tuple ``(accuracy, avg_inference_time)``: accuracy in percent and
        the mean wall-clock seconds spent in ``model(inputs)`` per batch.

    Raises:
        ZeroDivisionError: If ``test_loader`` yields no samples.
    """
    # CUDA kernels are launched asynchronously, so the GPU has to be drained
    # before reading the clock. On any other device there is nothing to wait
    # for, and torch.cuda.synchronize() fails without a usable CUDA runtime.
    on_cuda = torch.device(device).type == "cuda"

    model.eval()
    correct = 0
    total = 0
    num_batches = 0
    inference_time = 0.0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            if on_cuda:
                torch.cuda.synchronize()
            # perf_counter is monotonic and intended for measuring intervals.
            start_time = time.perf_counter()
            outputs = model(inputs)
            if on_cuda:
                torch.cuda.synchronize()
            inference_time += time.perf_counter() - start_time
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            # Counted by hand so loaders without __len__ are accepted too.
            num_batches += 1
    accuracy = 100 * correct / total
    avg_inference_time = inference_time / num_batches
    return accuracy, avg_inference_time
# Rebuild the fine-tuned architecture and restore the saved weights.
original_model = models.resnet50()
num_ftrs = original_model.fc.in_features
original_model.fc = nn.Linear(num_ftrs, 37)
original_model.load_state_dict(
    torch.load(
        'model/original_oxford_pets_resnet50.pth',
        # Remap storages so a checkpoint written on a GPU machine also loads
        # when `device` is the CPU.
        map_location=device,
        # The file only holds a state_dict, so restrict unpickling to tensors
        # and plain containers instead of arbitrary Python objects.
        weights_only=True,
    )
)
original_model = original_model.to(device)

# Quantization mutates the module it is given, so work on a separate copy and
# keep the floating-point model intact.
ptqs_model = copy.deepcopy(original_model)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
def calibrate_loop(model, train_dataloader=train_loader):
    """Run ``model`` over ``train_dataloader`` and print loss and accuracy.

    Passed as ``forward_loop`` to the quantization call, which invokes it
    with the model being calibrated.

    Args:
        model: Module to run. It is switched to eval mode.
        train_dataloader: Iterable of ``(data, labels)`` batches. Defaults to
            the module-level ``train_loader`` (bound when this function is
            defined).
    """
    model.eval()

    # Feed batches to wherever the model lives rather than assuming CUDA.
    first_param = next(model.parameters(), None)
    target = first_param.device if first_param is not None else device

    total = 0
    correct = 0
    loss_sum = 0.0
    # Only forward passes are needed here. Without no_grad the accumulated
    # loss would keep every batch's autograd graph alive for the whole pass.
    with torch.no_grad():
        for data, labels in train_dataloader:
            data, labels = data.to(target), labels.to(target)
            logits = model(data)
            n = labels.size(0)
            # `criterion` (nn.CrossEntropyLoss with default reduction) returns
            # the batch mean; weight it by batch size so that dividing by
            # `total` below yields the true per-sample mean.
            loss_sum += criterion(logits, labels).item() * n
            preds = torch.max(logits, 1)[1]
            total += n
            correct += (preds == labels).sum().item()
    print(f"PTQ Loss: {loss_sum / total:.5f}, Accuracy: {100 * correct / total:.2f}%")
# Choose the quantization recipe and insert calibrated quantizers into
# ptqs_model in place.
quant_type = "int8"
quant_cfg = mtq.INT8_DEFAULT_CFG if quant_type == "int8" else mtq.FP8_DEFAULT_CFG
try:
    mtq.quantize(ptqs_model, quant_cfg, forward_loop=calibrate_loop)
except Exception as err:
    print(f"Quantization failed: {str(err)}")
    # Do not carry on with an unquantized model: building an INT8 engine from
    # it later fails with a much less helpful "no scaling factors" error.
    raise
# Private export entry point, used as in the original script.
from torch.export._trace import _export

# NOTE(review): if the engine build still reports "no scaling factors
# detected" after quantization succeeded, check that the installed
# Torch-TensorRT release can convert ModelOpt INT8 quantize ops --
# TODO confirm against the Torch-TensorRT release notes.
with torch.no_grad():
    with export_torch_mode():
        # The example batch fixes the input shape the engine is built for.
        input_tensor = next(iter(test_loader))[0].cuda()
        exp_program = _export(ptqs_model, (input_tensor,))
        enabled_precisions = {torch.int8} if quant_type == "int8" else {torch.float8_e4m3fn}
        trt_ptq_int8_model = torch_tensorrt.dynamo.compile(
            exp_program,
            inputs=[input_tensor],
            enabled_precisions=enabled_precisions,
            min_block_size=1,
            debug=False,
        )

        compiled_batch = input_tensor.size(0)
        total, correct, loss = 0, 0, 0.0
        for data, labels in test_loader:
            data, labels = data.cuda(), labels.cuda()
            n = labels.size(0)
            if n < compiled_batch:
                # The last batch of the loader can be short, but the engine
                # was built for a fixed batch size: pad with copies of the
                # final sample and drop the padded rows from the output.
                filler = data[-1:].expand(compiled_batch - n, *data.shape[1:])
                data = torch.cat([data, filler])
            out = trt_ptq_int8_model(data)[:n]
            # criterion returns the batch mean; weight by batch size so the
            # printed value is a per-sample mean.
            loss += criterion(out, labels).item() * n
            preds = torch.max(out, 1)[1]
            total += n
            correct += (preds == labels).sum().item()
        print(f"Test Loss: {loss / total:.5f}, Test Accuracy: {100 * correct / total:.2f}%")
I executed this code and encountered the error below.
INFO:torch_tensorrt.dynamo._compiler:Compilation Settings: CompilationSettings(enabled_precisions={<dtype.i8: 3>}, debug=False, workspace_size=0, min_block_size=1, torch_executed_ops=set(), pass_through_build_failures=False, max_aux_streams=None, version_compatible=False, optimization_level=None, use_python_runtime=False, truncate_double=False, use_fast_partitioner=True, enable_experimental_decompositions=False, device=Device(type=DeviceType.GPU, gpu_id=0), require_full_compilation=False, disable_tf32=False, assume_dynamic_shape_support=False, sparse_weights=False, refit=False, engine_capability=<EngineCapability.STANDARD: 1>, num_avg_timing_iters=1, dla_sram_size=1048576, dla_local_dram_size=1073741824, dla_global_dram_size=536870912, dryrun=False, hardware_compatible=False, timing_cache_path='/tmp/timing_cache.bin')
INFO:torch_tensorrt.dynamo._compiler:Partitioning the graph via the fast partitioner
INFO:torch_tensorrt [TensorRT Conversion Context]:The logger passed into createInferBuilder differs from one already provided for an existing builder, runtime, or refitter. Uses of the global logger, returned by nvinfer1::getLogger(), will return the existing value.
INFO:torch_tensorrt [TensorRT Conversion Context]:[MemUsageChange] Init CUDA: CPU +0, GPU +0, now: CPU 2074, GPU 6806 (MiB)
INFO:torch_tensorrt.dynamo.conversion._TRTInterpreter:TRT INetwork construction elapsed time: 0:00:00.004684
INFO:torch_tensorrt [TensorRT Conversion Context]:BuilderFlag::kTF32 is set but hardware does not support TF32. Disabling TF32.
WARNING:torch_tensorrt [TensorRT Conversion Context]:Calibrator is not being used. Users must provide dynamic range for all tensors that are not Int32 or Bool.
ERROR:torch_tensorrt [TensorRT Conversion Context]:IBuilder::buildSerializedNetwork: Error Code 4: Internal Error (Calibration failure occurred with no scaling factors detected. This could be due to no int8 calibrator or insufficient custom scales for network layers. Please see int8 sample to setup calibration correctly.)
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
Cell In[22], line 9
6 exp_program = _export(ptqs_model, (input_tensor,))
8 enabled_precisions = {torch.int8} if quant_type == "int8" else {torch.float8_e4m3fn}
----> 9 trt_ptq_int8_model = torch_tensorrt.dynamo.compile(
10 exp_program,
11 inputs=[input_tensor],
12 enabled_precisions=enabled_precisions,
13 min_block_size=1,
14 debug=False
15 )
17 # 推論
18 total, correct, loss = 0, 0, 0.0
File /usr/local/lib/python3.8/dist-packages/torch_tensorrt/dynamo/_compiler.py:230, in compile(exported_program, inputs, device, disable_tf32, assume_dynamic_shape_support, sparse_weights, enabled_precisions, engine_capability, refit, debug, num_avg_timing_iters, workspace_size, dla_sram_size, dla_local_dram_size, dla_global_dram_size, truncate_double, require_full_compilation, min_block_size, torch_executed_ops, torch_executed_modules, pass_through_build_failures, max_aux_streams, version_compatible, optimization_level, use_python_runtime, use_fast_partitioner, enable_experimental_decompositions, dryrun, hardware_compatible, timing_cache_path, **kwargs)
228 settings = CompilationSettings(**compilation_options)
229 logger.info("Compilation Settings: %s\n", settings)
--> 230 trt_gm = compile_module(gm, inputs, settings)
231 return trt_gm
File /usr/local/lib/python3.8/dist-packages/torch_tensorrt/dynamo/_compiler.py:418, in compile_module(gm, sample_inputs, settings)
416 # Create TRT engines from submodule
417 if not settings.dryrun:
--> 418 trt_module = convert_module(
419 submodule,
420 submodule_inputs,
421 settings=settings,
422 name=name,
423 )
425 trt_modules[name] = trt_module
427 sample_outputs = gm(
428 *get_torch_inputs(sample_inputs, to_torch_device(settings.device))
429 )
File /usr/local/lib/python3.8/dist-packages/torch_tensorrt/dynamo/conversion/_conversion.py:106, in convert_module(module, inputs, settings, name)
91 def convert_module(
92 module: torch.fx.GraphModule,
93 inputs: Sequence[Input],
94 settings: CompilationSettings = CompilationSettings(),
95 name: str = "",
96 ) -> PythonTorchTensorRTModule | TorchTensorRTModule:
97 """Convert an FX module to a TRT module
98 Args:
99 module: FX GraphModule to convert
(...)
104 _PythonTorchTensorRTModule or TorchTensorRTModule
105 """
--> 106 interpreter_result = interpret_module_to_result(module, inputs, settings)
108 if settings.use_python_runtime or not ENABLED_FEATURES.torch_tensorrt_runtime:
109 if not settings.use_python_runtime:
File /usr/local/lib/python3.8/dist-packages/torch_tensorrt/dynamo/conversion/_conversion.py:87, in interpret_module_to_result(module, inputs, settings)
73 output_dtypes = infer_module_output_dtypes(
74 module,
75 inputs,
76 settings.device,
77 truncate_double=settings.truncate_double,
78 )
80 interpreter = TRTInterpreter(
81 module,
82 inputs,
(...)
85 compilation_settings=settings,
86 )
---> 87 interpreter_result = interpreter.run()
88 return interpreter_result
File /usr/local/lib/python3.8/dist-packages/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py:344, in TRTInterpreter.run(self, strict_type_constraints, algorithm_selector, tactic_sources)
337 self._create_timing_cache(
338 builder_config, self.compilation_settings.timing_cache_path
339 )
341 serialized_engine = self.builder.build_serialized_network(
342 self.ctx.net, builder_config
343 )
--> 344 assert serialized_engine
346 _LOGGER.info(
347 f"Build TRT engine elapsed time: {datetime.now() - build_engine_start_time}"
348 )
349 _LOGGER.info(f"TRT Engine uses: {serialized_engine.nbytes} bytes of Memory")
AssertionError:
Even though I followed the code from the documentation exactly, I encountered the same error.
How can I solve this issue?
Additional Information:
# Report the installed versions of the three packages involved.
import torch
import tensorrt as trt
import torch_tensorrt

version_report = (
    f"PyTorch: {torch.__version__}",
    f"TensorRT: {trt.__version__}",
    f"Torch-TensorRT: {torch_tensorrt.__version__}",
)
for report_line in version_report:
    print(report_line)
PyTorch: 2.4.1+cu121
TensorRT: 10.1.0
Torch-TensorRT: 2.4.0+cu121
# I also cannot use the `torch_tensorrt.ptq` calibration API with my version.
calib = torch_tensorrt.ptq.DataLoaderCalibration()
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
Cell In[3], line 1
----> 1 calib = torch_tensorrt.ptq.DataLoaderCalibration()
AttributeError: module 'torch_tensorrt' has no attribute 'ptq'
Dataset: The Oxford-IIIT Pet Dataset