TensorRT Optimization: 3x Faster Inference for LLMs and Vision Models

What is TensorRT?

TensorRT is NVIDIA's high-performance deep learning inference optimizer and runtime. It delivers:

2-6x faster inference compared to PyTorch/TensorFlow
Reduced memory footprint via quantization
Optimized for production with batching and caching
Hardware-specific optimization for each GPU architecture

💡 When to Use TensorRT

Use TensorRT when you need maximum performance for production inference—especially for vision models and LLMs where latency matters.

TensorRT-LLM for Large Language Models

Installation

# Docker (recommended)
docker run --gpus all -it nvcr.io/nvidia/tensorrt:24.01-py3

# Or pip install
pip install tensorrt tensorrt-llm

# Verify installation
python -c "import tensorrt; print(tensorrt.__version__)"

Convert LLaMA to TensorRT-LLM

# Clone TensorRT-LLM
git clone https://github.com/NVIDIA/TensorRT-LLM.git
cd TensorRT-LLM/examples/llama

# Convert checkpoint
python convert_checkpoint.py \
    --model_dir /path/to/llama-3.1-8b \
    --output_dir ./llama_checkpoint \
    --dtype float16 \
    --tp_size 1  # Tensor parallelism

# Build TensorRT engine
trtllm-build \
    --checkpoint_dir ./llama_checkpoint \
    --output_dir ./llama_engine \
    --gemm_plugin float16 \
    --max_batch_size 8 \
    --max_input_len 2048 \
    --max_output_len 512 \
    --paged_kv_cache enable \
    --use_custom_all_reduce enable

Running Inference

from tensorrt_llm import LLM, SamplingParams

# Decoding settings applied to every prompt in the batch
sampling_params = SamplingParams(temperature=0.7, top_p=0.9, max_tokens=100)

# Prompts that are completed together in one generate() call
prompts = ["The future of AI is", "Machine learning enables"]

# Point the high-level LLM API at the engine directory
llm = LLM(model="./llama_engine")

outputs = llm.generate(prompts, sampling_params)

# Print the first returned candidate for each prompt
for completion in outputs:
    first_candidate = completion.outputs[0]
    print(first_candidate.text)

TensorRT for Vision Models

PyTorch to TensorRT

import time

import torch
import tensorrt as trt
from torch2trt import torch2trt

# Load PyTorch model
model = torch.hub.load('pytorch/vision', 'resnet50', pretrained=True).cuda().eval()

# Create sample input
x = torch.randn(1, 3, 224, 224).cuda()

# Convert to TensorRT (FP16 kernels allowed, engine accepts up to batch 32)
model_trt = torch2trt(
    model,
    [x],
    fp16_mode=True,
    max_batch_size=32,
)

# Save engine
torch.save(model_trt.state_dict(), 'resnet50_trt.pth')


def benchmark(fn, inp, iters=100, warmup=10):
    """Return the mean wall-clock seconds per call of ``fn(inp)``.

    Args:
        fn: Callable to time (a PyTorch module or the converted TRT module).
        inp: Input passed to ``fn`` on every call.
        iters: Number of timed calls that are averaged.
        warmup: Untimed calls made first so one-off start-up cost of the
            first calls is not counted in the average.
    """
    # Inference only: skip autograd bookkeeping so it does not skew timings
    with torch.no_grad():
        for _ in range(warmup):
            fn(inp)
        # CUDA launches are asynchronous; drain the queue before and after
        # the timed region so the clock covers exactly `iters` calls.
        torch.cuda.synchronize()
        start = time.perf_counter()
        for _ in range(iters):
            fn(inp)
        torch.cuda.synchronize()
        elapsed = time.perf_counter() - start
    return elapsed / iters


# Benchmark
pytorch_time = benchmark(model, x)
tensorrt_time = benchmark(model_trt, x)

print(f"PyTorch: {pytorch_time*1000:.2f}ms")
print(f"TensorRT: {tensorrt_time*1000:.2f}ms")
print(f"Speedup: {pytorch_time/tensorrt_time:.2f}x")

ONNX to TensorRT

import tensorrt as trt

def build_engine(onnx_path, engine_path, precision="fp16"):
    """Build a serialized TensorRT engine from an ONNX file and save it.

    The optimization profile below is registered for a tensor named
    "input" with batch sizes 1..32 (optimum 8) and a fixed 3x224x224
    image shape, so the ONNX model is expected to expose such an input.

    Args:
        onnx_path: Path of the ONNX model to parse.
        engine_path: Path the serialized engine is written to.
        precision: "fp16" sets the FP16 builder flag, "int8" sets the INT8
            builder flag; any other value leaves both flags unset.

    Returns:
        The serialized engine, or None if ONNX parsing or the engine
        build failed (nothing is written to ``engine_path`` in that case).
    """
    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )
    parser = trt.OnnxParser(network, logger)

    # Parse ONNX
    with open(onnx_path, 'rb') as f:
        if not parser.parse(f.read()):
            for idx in range(parser.num_errors):
                print(parser.get_error(idx))
            return None

    # Build config
    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)  # 1GB

    if precision == "fp16":
        config.set_flag(trt.BuilderFlag.FP16)
    elif precision == "int8":
        config.set_flag(trt.BuilderFlag.INT8)
        # Need calibration for INT8: no calibrator is attached here
        # (build_int8_engine shows the calibrated path).

    # Dynamic shapes
    profile = builder.create_optimization_profile()
    profile.set_shape(
        "input",
        min=(1, 3, 224, 224),
        opt=(8, 3, 224, 224),
        max=(32, 3, 224, 224)
    )
    config.add_optimization_profile(profile)

    # Build engine
    engine = builder.build_serialized_network(network, config)
    if engine is None:
        # The builder signals failure by returning None; bail out before
        # opening engine_path so no empty/truncated engine file is left.
        print(f"TensorRT engine build failed for {onnx_path}")
        return None

    with open(engine_path, 'wb') as f:
        f.write(engine)

    return engine

# Convert
build_engine("model.onnx", "model.engine", "fp16")

INT8 Quantization

INT8 can provide up to ~2x additional speedup over FP16 (roughly 1.5-1.6x in the benchmarks below), at the cost of a calibration step:

import tensorrt as trt
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit

class Int8Calibrator(trt.IInt8EntropyCalibrator2):
    """Entropy calibrator that feeds fixed-size batches to TensorRT.

    Args:
        calibration_data: Indexable, sliceable collection of samples; each
            sample must expose ``nbytes`` (e.g. a NumPy array whose first
            axis is the sample axis).
        cache_file: Path used to load/store the calibration cache.
        batch_size: Number of samples per calibration batch.

    Only full batches are served. A trailing remainder smaller than
    ``batch_size`` is skipped, so provide at least ``batch_size`` samples.
    """

    def __init__(self, calibration_data, cache_file, batch_size=32):
        super().__init__()
        self.cache_file = cache_file
        self.data = calibration_data
        self.batch_idx = 0
        self.batch_size = batch_size

        # Allocate GPU memory for one full calibration batch
        self.device_input = cuda.mem_alloc(
            self.data[0].nbytes * self.batch_size
        )

    def get_batch_size(self):
        """Return the number of samples in every batch from get_batch()."""
        return self.batch_size

    def get_batch(self, names):
        """Copy the next full batch to the GPU and return its device pointer.

        Returns a one-element list holding the device address, or None once
        fewer than ``batch_size`` samples remain.
        """
        end = self.batch_idx + self.batch_size
        if end > len(self.data):
            # A short batch would leave the tail of the device buffer
            # holding stale data from the previous copy, so stop here.
            return None

        batch = self.data[self.batch_idx:end]
        cuda.memcpy_htod(self.device_input, np.ascontiguousarray(batch))
        self.batch_idx = end
        return [int(self.device_input)]

    def read_calibration_cache(self):
        """Return the cached calibration bytes, or None if unreadable."""
        try:
            with open(self.cache_file, 'rb') as f:
                return f.read()
        except OSError:
            # Missing or unreadable cache: fall back to running calibration
            return None

    def write_calibration_cache(self, cache):
        """Write the calibration cache bytes to ``cache_file``."""
        with open(self.cache_file, 'wb') as f:
            f.write(cache)

# Build INT8 engine with calibration
def build_int8_engine(onnx_path, engine_path, calibration_data):
    """Build an INT8 TensorRT engine from an ONNX file using calibration.

    Args:
        onnx_path: Path of the ONNX model to parse.
        engine_path: Path the serialized engine is written to.
        calibration_data: Samples handed to Int8Calibrator; the calibration
            cache is stored in "calibration.cache".

    Returns:
        The serialized engine, or None if ONNX parsing or the engine
        build failed (nothing is written to ``engine_path`` in that case).
    """
    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )
    parser = trt.OnnxParser(network, logger)

    with open(onnx_path, 'rb') as f:
        # Stop on a failed parse instead of building from a partial network
        if not parser.parse(f.read()):
            for idx in range(parser.num_errors):
                print(parser.get_error(idx))
            return None

    config = builder.create_builder_config()
    config.set_flag(trt.BuilderFlag.INT8)

    # Keep a local reference so the calibrator outlives the build call
    calibrator = Int8Calibrator(calibration_data, "calibration.cache")
    config.int8_calibrator = calibrator

    engine = builder.build_serialized_network(network, config)
    if engine is None:
        # The builder signals failure by returning None; bail out before
        # opening engine_path so no empty/truncated engine file is left.
        print(f"TensorRT INT8 engine build failed for {onnx_path}")
        return None

    with open(engine_path, 'wb') as f:
        f.write(engine)

    return engine

⚠️ INT8 Accuracy

INT8 quantization may reduce accuracy. Always validate on your test set. Use representative calibration data for best results.

Stable Diffusion with TensorRT

# Using Hugging Face Optimum (ONNX Runtime) with the TensorRT execution provider
pip install optimum[onnxruntime-gpu]

from optimum.onnxruntime import ORTStableDiffusionXLPipeline

# The checkpoint below is SDXL, so it is loaded with the XL pipeline class
# rather than the base Stable Diffusion pipeline class.
# export=True converts the PyTorch weights to ONNX; the TensorRT execution
# provider is then configured for FP16 with an on-disk engine cache.
pipe = ORTStableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    export=True,
    provider="TensorrtExecutionProvider",
    provider_options={
        "trt_fp16_enable": True,
        "trt_engine_cache_enable": True,
        "trt_engine_cache_path": "./trt_cache"
    }
)

# First run builds engine (slow)
# Subsequent runs use cached engine (fast)
image = pipe(
    "a beautiful sunset over mountains",
    num_inference_steps=20
).images[0]

Benchmark Results

Model	PyTorch FP16	TensorRT FP16	TensorRT INT8
ResNet-50	2.1 ms	0.8 ms	0.5 ms
BERT-base	3.2 ms	1.1 ms	0.7 ms
LLaMA-8B (per token)	15 ms	6 ms	4 ms
SDXL (1 image)	4.2 sec	1.8 sec	-

Benchmarks on A100 80GB, batch size 1 unless noted

Run TensorRT on Cloud GPUs

A100 and H100 GPUs from $1.50/hr with TensorRT pre-installed.

Start Optimizing →

Production Deployment

Triton Inference Server with TensorRT

# model_repository/
# └── resnet50/
#     ├── config.pbtxt
#     └── 1/
#         └── model.plan  # TensorRT engine

# config.pbtxt
name: "resnet50"
platform: "tensorrt_plan"
max_batch_size: 32

input [
  {
    name: "input"
    data_type: TYPE_FP16
    dims: [ 3, 224, 224 ]
  }
]

output [
  {
    name: "output"
    data_type: TYPE_FP16
    dims: [ 1000 ]
  }
]

instance_group [
  {
    count: 2
    kind: KIND_GPU
  }
]

dynamic_batching {
  preferred_batch_size: [ 8, 16, 32 ]
  max_queue_delay_microseconds: 100
}

Client Code

import numpy as np
import tritonclient.http as httpclient

# Connect to the server's HTTP endpoint
client = httpclient.InferenceServerClient(url="localhost:8000")

# Random FP16 array of shape (1, 3, 224, 224) standing in for a real image
image = np.random.randn(1, 3, 224, 224).astype(np.float16)

# Describe the request tensor, then attach the data to it
infer_input = httpclient.InferInput("input", image.shape, "FP16")
infer_input.set_data_from_numpy(image)
inputs = [infer_input]

# Ask the server to return the tensor named "output"
outputs = [httpclient.InferRequestedOutput("output")]

# Run the request against the "resnet50" model and read the result
result = client.infer("resnet50", inputs, outputs=outputs)
predictions = result.as_numpy("output")

top_index = np.argmax(predictions)
print(f"Top prediction: {top_index}")

Optimization Tips

1. Use Optimal Batch Sizes

TensorRT performs best with power-of-2 batch sizes. Profile different sizes:

# Sweep the candidate batch sizes (placeholder loop body)
candidate_batch_sizes = (1, 2, 4, 8, 16, 32)
for batch_size in candidate_batch_sizes:
    # Step 1: build an engine with this max batch
    # Step 2: measure throughput
    ...

2. Enable All Optimizations

# TensorRT-LLM optimizations
trtllm-build \
    --gemm_plugin float16 \
    --paged_kv_cache enable \
    --use_custom_all_reduce enable \
    --multiple_profiles enable \
    --use_fused_mlp enable

3. Use Latest TensorRT Version

Each version brings significant improvements. TensorRT 10.x is 15-30% faster than 8.x for many workloads.

4. Match Engine to GPU

TensorRT engines are GPU-specific. Build separate engines for A100 vs H100.

Troubleshooting

Engine Build Fails

Check CUDA/TensorRT version compatibility
Reduce max batch size or sequence length
Try without INT8 first

Slower Than Expected

Ensure GPU is in maximum performance mode
Check for CPU bottlenecks in preprocessing
Use CUDA graphs for repeated inference

Accuracy Issues with INT8

Use more calibration samples
Try different calibration algorithms
Fall back to FP16 for sensitive layers

Conclusion

TensorRT is essential for production ML inference. Key takeaways:

2-4x speedup over PyTorch with FP16
Up to ~2x additional speedup possible with INT8 (validate accuracy first)
Use TensorRT-LLM for transformer models
Combine with Triton for production serving

Deploy your TensorRT-optimized models on GPUBrazil for maximum performance at minimal cost.