What is TensorRT?
TensorRT is NVIDIA's high-performance deep learning inference optimizer and runtime. It delivers:
- 2-6x faster inference compared to PyTorch/TensorFlow
- Reduced memory footprint via quantization
- Optimized for production with batching and caching
- Hardware-specific optimization for each GPU architecture
💡 When to Use TensorRT
Use TensorRT when you need maximum performance for production inference—especially for vision models and LLMs where latency matters.
TensorRT-LLM for Large Language Models
Installation
# Docker (recommended)
# Starts an interactive container from NVIDIA's TensorRT image (tag 24.01-py3)
# with all host GPUs exposed to it.
docker run --gpus all -it nvcr.io/nvidia/tensorrt:24.01-py3
# Or pip install
# NOTE(review): tensorrt-llm wheels may need NVIDIA's package index as an extra
# index URL depending on the release — confirm against the install guide.
pip install tensorrt tensorrt-llm
# Verify installation
# Prints the version of the TensorRT Python package that was installed.
python -c "import tensorrt; print(tensorrt.__version__)"
Convert LLaMA to TensorRT-LLM
# Clone TensorRT-LLM
git clone https://github.com/NVIDIA/TensorRT-LLM.git
cd TensorRT-LLM/examples/llama
# Convert checkpoint
# Reads the model from --model_dir and writes a float16 checkpoint to
# ./llama_checkpoint, using a tensor-parallel size of 1.
python convert_checkpoint.py \
--model_dir /path/to/llama-3.1-8b \
--output_dir ./llama_checkpoint \
--dtype float16 \
--tp_size 1 # Tensor parallelism
# Build TensorRT engine
# Consumes the checkpoint written above and writes the engine to ./llama_engine.
# The max_* values are fixed at build time (batch 8, 2048 input / 512 output tokens).
# NOTE(review): trtllm-build flag names change between releases (for example
# --max_output_len, --paged_kv_cache and --use_custom_all_reduce were renamed
# or removed in later versions) — confirm with `trtllm-build --help`.
trtllm-build \
--checkpoint_dir ./llama_checkpoint \
--output_dir ./llama_engine \
--gemm_plugin float16 \
--max_batch_size 8 \
--max_input_len 2048 \
--max_output_len 512 \
--paged_kv_cache enable \
--use_custom_all_reduce enable
Running Inference
from tensorrt_llm import LLM, SamplingParams
# Sampling settings shared by every prompt in the batch.
sampling_params = SamplingParams(temperature=0.7, top_p=0.9, max_tokens=100)

# Prompts are submitted together as a single batch.
prompts = ["The future of AI is", "Machine learning enables"]

# Load the engine directory produced by trtllm-build.
llm = LLM(model="./llama_engine")

# Generate, then print the first candidate completion for each prompt.
outputs = llm.generate(prompts, sampling_params)
for completion in outputs:
    print(completion.outputs[0].text)
TensorRT for Vision Models
PyTorch to TensorRT
import torch
import tensorrt as trt
from torch2trt import torch2trt
# Load PyTorch model
model = torch.hub.load('pytorch/vision', 'resnet50', pretrained=True).cuda().eval()

# Create sample input
x = torch.randn(1, 3, 224, 224).cuda()

# Convert to TensorRT
model_trt = torch2trt(
    model,
    [x],
    fp16_mode=True,
    max_batch_size=32,
)

# Save engine (stored as the converted module's state dict)
torch.save(model_trt.state_dict(), 'resnet50_trt.pth')

# Benchmark
import time


def _mean_latency(module, sample, iters=100, warmup=10):
    """Return the mean wall-clock seconds per ``module(sample)`` call.

    Args:
        module: Callable model to time.
        sample: Input passed unchanged on every call.
        iters: Number of timed calls the mean is taken over.
        warmup: Untimed calls made first so one-time start-up costs
            are not counted in the measurement.
    """
    # Inference only: skip autograd bookkeeping so it is not part of the timing.
    with torch.no_grad():
        for _ in range(warmup):
            module(sample)
        # GPU work is asynchronous; synchronize so the clock brackets real work.
        torch.cuda.synchronize()
        start = time.perf_counter()
        for _ in range(iters):
            module(sample)
        torch.cuda.synchronize()
        elapsed = time.perf_counter() - start
    return elapsed / iters


# PyTorch
pytorch_time = _mean_latency(model, x)

# TensorRT
tensorrt_time = _mean_latency(model_trt, x)

print(f"PyTorch: {pytorch_time*1000:.2f}ms")
print(f"TensorRT: {tensorrt_time*1000:.2f}ms")
print(f"Speedup: {pytorch_time/tensorrt_time:.2f}x")
ONNX to TensorRT
import tensorrt as trt
def build_engine(
    onnx_path,
    engine_path,
    precision="fp16",
    input_name="input",
    min_shape=(1, 3, 224, 224),
    opt_shape=(8, 3, 224, 224),
    max_shape=(32, 3, 224, 224),
):
    """Build a serialized TensorRT engine from an ONNX file and save it.

    Args:
        onnx_path: Path of the ONNX model to parse.
        engine_path: Path the serialized engine is written to.
        precision: "fp16" sets the FP16 builder flag, "int8" sets the INT8
            flag; any other value sets neither flag.
        input_name: Name of the network input the optimization profile
            applies to.
        min_shape: Smallest input shape of the optimization profile.
        opt_shape: Input shape the profile is optimized for.
        max_shape: Largest input shape of the optimization profile.

    Returns:
        The serialized engine, or None if ONNX parsing or the engine build
        failed. Nothing is written to ``engine_path`` on failure.
    """
    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )
    parser = trt.OnnxParser(network, logger)

    # Parse ONNX
    with open(onnx_path, 'rb') as f:
        if not parser.parse(f.read()):
            for index in range(parser.num_errors):
                print(parser.get_error(index))
            return None

    # Build config
    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)  # 1GB

    if precision == "fp16":
        config.set_flag(trt.BuilderFlag.FP16)
    elif precision == "int8":
        config.set_flag(trt.BuilderFlag.INT8)
        # Need calibration for INT8: no calibrator is attached here.

    # Dynamic shapes
    profile = builder.create_optimization_profile()
    profile.set_shape(input_name, min=min_shape, opt=opt_shape, max=max_shape)
    config.add_optimization_profile(profile)

    # Build engine
    engine = builder.build_serialized_network(network, config)
    if engine is None:
        # Check before opening the output file so a failed build does not
        # leave an empty engine file behind or crash inside f.write().
        print(f"Failed to build TensorRT engine from {onnx_path}")
        return None

    with open(engine_path, 'wb') as f:
        f.write(engine)
    return engine
# Convert
build_engine("model.onnx", "model.engine", "fp16")
INT8 Quantization
INT8 can provide up to 2x additional speedup over FP16:
import tensorrt as trt
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
class Int8Calibrator(trt.IInt8EntropyCalibrator2):
    """Entropy calibrator that feeds host calibration samples to TensorRT.

    Samples are copied to one reusable device buffer, ``batch_size`` samples
    at a time. The calibration cache is read from and written to
    ``cache_file``.
    """

    def __init__(self, calibration_data, cache_file, batch_size=32):
        """Store the data and allocate a device buffer for one batch.

        Args:
            calibration_data: Indexable, sliceable collection of samples;
                element 0 must expose ``nbytes`` (used to size the buffer).
            cache_file: Path of the calibration cache file.
            batch_size: Number of samples handed to TensorRT per batch.
        """
        super().__init__()
        self.cache_file = cache_file
        self.data = calibration_data
        self.batch_idx = 0
        self.batch_size = batch_size
        # Allocate GPU memory for calibration batch
        self.device_input = cuda.mem_alloc(
            self.data[0].nbytes * self.batch_size
        )

    def get_batch_size(self):
        """Return the number of samples in each calibration batch."""
        return self.batch_size

    def get_batch(self, names):
        """Copy the next full batch to the device and return its pointer.

        Returns:
            A one-element list holding the device buffer address, or None
            once fewer than ``batch_size`` samples remain.
        """
        end = self.batch_idx + self.batch_size
        # Only full batches are served: a short final batch would leave the
        # tail of the device buffer holding samples from the previous batch.
        if end > len(self.data):
            return None
        batch = self.data[self.batch_idx:end]
        cuda.memcpy_htod(self.device_input, np.ascontiguousarray(batch))
        self.batch_idx = end
        return [int(self.device_input)]

    def read_calibration_cache(self):
        """Return the cache file's bytes, or None if it cannot be read."""
        try:
            with open(self.cache_file, 'rb') as f:
                return f.read()
        except OSError:
            # A missing or unreadable cache means calibration has to run.
            return None

    def write_calibration_cache(self, cache):
        """Write the calibration cache bytes to ``cache_file``."""
        with open(self.cache_file, 'wb') as f:
            f.write(cache)
# Build INT8 engine with calibration
def build_int8_engine(onnx_path, engine_path, calibration_data,
                      cache_file="calibration.cache"):
    """Build an INT8 TensorRT engine from an ONNX file using calibration.

    Args:
        onnx_path: Path of the ONNX model to parse.
        engine_path: Path the serialized engine is written to.
        calibration_data: Samples passed to ``Int8Calibrator``.
        cache_file: Path of the calibration cache used by the calibrator.

    Returns:
        The serialized engine, or None if ONNX parsing or the engine build
        failed. Nothing is written to ``engine_path`` on failure.
    """
    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )
    parser = trt.OnnxParser(network, logger)

    with open(onnx_path, 'rb') as f:
        if not parser.parse(f.read()):
            # Report parser errors instead of building from a bad network.
            for index in range(parser.num_errors):
                print(parser.get_error(index))
            return None

    config = builder.create_builder_config()
    config.set_flag(trt.BuilderFlag.INT8)

    calibrator = Int8Calibrator(calibration_data, cache_file)
    config.int8_calibrator = calibrator

    engine = builder.build_serialized_network(network, config)
    if engine is None:
        # Check before opening the output file so a failed build does not
        # leave an empty engine file behind or crash inside f.write().
        print(f"Failed to build INT8 TensorRT engine from {onnx_path}")
        return None

    with open(engine_path, 'wb') as f:
        f.write(engine)
    return engine
⚠️ INT8 Accuracy
INT8 quantization may reduce accuracy. Always validate on your test set. Use representative calibration data for best results.
Stable Diffusion with TensorRT
# Using diffusers with TensorRT backend
pip install optimum[onnxruntime-gpu]
from optimum.onnxruntime import ORTStableDiffusionXLPipeline

# Convert and optimize
# The checkpoint below is Stable Diffusion XL, so it is loaded with the XL
# pipeline class rather than the base Stable Diffusion pipeline class.
pipe = ORTStableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    export=True,
    provider="TensorrtExecutionProvider",
    provider_options={
        "trt_fp16_enable": True,
        "trt_engine_cache_enable": True,
        "trt_engine_cache_path": "./trt_cache"
    }
)

# First run builds engine (slow)
# Subsequent runs use cached engine (fast)
image = pipe(
    "a beautiful sunset over mountains",
    num_inference_steps=20
).images[0]
Benchmark Results
| Model | PyTorch FP16 | TensorRT FP16 | TensorRT INT8 |
|---|---|---|---|
| ResNet-50 | 2.1 ms | 0.8 ms | 0.5 ms |
| BERT-base | 3.2 ms | 1.1 ms | 0.7 ms |
| LLaMA-8B (per token) | 15 ms | 6 ms | 4 ms |
| SDXL (1 image) | 4.2 sec | 1.8 sec | - |
Benchmarks on A100 80GB, batch size 1 unless noted
Run TensorRT on Cloud GPUs
A100 and H100 GPUs from $1.50/hr with TensorRT pre-installed.
Start Optimizing →
Production Deployment
Triton Inference Server with TensorRT
# model_repository/
# └── resnet50/
# ├── config.pbtxt
# └── 1/
# └── model.plan # TensorRT engine
# config.pbtxt
# Triton model configuration for the TensorRT engine stored at
# resnet50/1/model.plan in the model repository.
name: "resnet50"
platform: "tensorrt_plan"
# With max_batch_size > 0 the dims below omit the batch dimension.
max_batch_size: 32
input [
{
name: "input"
# NOTE(review): data_type must match the engine's real I/O tensor type.
# Setting only the FP16 builder flag may leave engine inputs/outputs as FP32
# — confirm against the built engine.
data_type: TYPE_FP16
dims: [ 3, 224, 224 ]
}
]
output [
{
name: "output"
data_type: TYPE_FP16
dims: [ 1000 ]
}
]
# Run two instances of the model on GPU.
instance_group [
{
count: 2
kind: KIND_GPU
}
]
# Let Triton merge individual requests into batches of the preferred sizes,
# holding a request in the queue for at most 100 microseconds.
dynamic_batching {
preferred_batch_size: [ 8, 16, 32 ]
max_queue_delay_microseconds: 100
}
Client Code
import tritonclient.http as httpclient
import numpy as np
# Connect to the Triton server's HTTP endpoint.
client = httpclient.InferenceServerClient(url="localhost:8000")

# Prepare input: one random FP16 image wrapped as the "input" tensor.
image = np.random.randn(1, 3, 224, 224).astype(np.float16)
image_input = httpclient.InferInput("input", image.shape, "FP16")
image_input.set_data_from_numpy(image)
inputs = [image_input]

# Request only the "output" tensor back.
outputs = [httpclient.InferRequestedOutput("output")]

# Inference
result = client.infer("resnet50", inputs, outputs=outputs)
predictions = result.as_numpy("output")
print(f"Top prediction: {np.argmax(predictions)}")
Optimization Tips
1. Use Optimal Batch Sizes
TensorRT performs best with power-of-2 batch sizes. Profile different sizes:
# Profile different batch sizes
for batch_size in [1, 2, 4, 8, 16, 32]:
# Build engine with this max batch
# Measure throughput
pass
2. Enable All Optimizations
# TensorRT-LLM optimizations
# Only the optimization flags are shown; a runnable command also needs
# --checkpoint_dir and --output_dir, as in the engine build command earlier.
# NOTE(review): flag availability differs between TensorRT-LLM releases —
# confirm with `trtllm-build --help`.
trtllm-build \
--gemm_plugin float16 \
--paged_kv_cache enable \
--use_custom_all_reduce enable \
--multiple_profiles enable \
--use_fused_mlp enable
3. Use Latest TensorRT Version
Each version brings significant improvements. TensorRT 10.x is 15-30% faster than 8.x for many workloads.
4. Match Engine to GPU
TensorRT engines are GPU-specific. Build separate engines for A100 vs H100.
Troubleshooting
Engine Build Fails
- Check CUDA/TensorRT version compatibility
- Reduce max batch size or sequence length
- Try without INT8 first
Slower Than Expected
- Ensure GPU is in maximum performance mode
- Check for CPU bottlenecks in preprocessing
- Use CUDA graphs for repeated inference
Accuracy Issues with INT8
- Use more calibration samples
- Try different calibration algorithms
- Fall back to FP16 for sensitive layers
Conclusion
TensorRT is essential for production ML inference. Key takeaways:
- 2-4x speedup over PyTorch with FP16
- Additional 2x possible with INT8
- Use TensorRT-LLM for transformer models
- Combine with Triton for production serving
Deploy your TensorRT-optimized models on GPUBrazil for maximum performance at minimal cost.