What is TensorRT?

TensorRT is NVIDIA's high-performance deep learning inference optimizer and runtime. It delivers:

💡 When to Use TensorRT

Use TensorRT when you need maximum performance for production inference—especially for vision models and LLMs where latency matters.

TensorRT-LLM for Large Language Models

Installation

# Docker (recommended): start an interactive container from NVIDIA's
# TensorRT 24.01 image with all GPUs exposed to it
docker run --gpus all -it nvcr.io/nvidia/tensorrt:24.01-py3

# Or pip install the TensorRT and TensorRT-LLM Python packages
pip install tensorrt tensorrt-llm

# Verify installation by importing the bindings and printing their version
python -c "import tensorrt; print(tensorrt.__version__)"

Convert LLaMA to TensorRT-LLM

# Clone TensorRT-LLM and switch to its LLaMA example directory,
# which is where convert_checkpoint.py is run from
git clone https://github.com/NVIDIA/TensorRT-LLM.git
cd TensorRT-LLM/examples/llama

# Convert checkpoint: read the model from --model_dir and write a float16
# checkpoint to ./llama_checkpoint
python convert_checkpoint.py \
    --model_dir /path/to/llama-3.1-8b \
    --output_dir ./llama_checkpoint \
    --dtype float16 \
    --tp_size 1  # Tensor parallelism degree

# Build TensorRT engine from the converted checkpoint into ./llama_engine.
# Limits baked into the engine: batch size 8, 2048 input tokens, 512 output tokens.
# NOTE(review): trtllm-build flag names differ between TensorRT-LLM releases;
# confirm these against `trtllm-build --help` for the installed version.
trtllm-build \
    --checkpoint_dir ./llama_checkpoint \
    --output_dir ./llama_engine \
    --gemm_plugin float16 \
    --max_batch_size 8 \
    --max_input_len 2048 \
    --max_output_len 512 \
    --paged_kv_cache enable \
    --use_custom_all_reduce enable

Running Inference

from tensorrt_llm import LLM, SamplingParams

# Point the high-level API at the engine directory written by trtllm-build
llm = LLM(model="./llama_engine")

# Prompts are submitted together in a single generate() call
prompts = ["The future of AI is", "Machine learning enables"]

# Sampling settings shared by every prompt
sampling_params = SamplingParams(temperature=0.7, top_p=0.9, max_tokens=100)

outputs = llm.generate(prompts, sampling_params)

# Each result may hold several candidates; print the text of the first one
for output in outputs:
    print(output.outputs[0].text)

TensorRT for Vision Models

PyTorch to TensorRT

import torch
import tensorrt as trt
from torch2trt import torch2trt

# Load PyTorch model (moved to the GPU and switched to eval mode)
model = torch.hub.load('pytorch/vision', 'resnet50', pretrained=True).cuda().eval()

# Create sample input; torch2trt traces the model with this tensor
x = torch.randn(1, 3, 224, 224).cuda()

# Convert to TensorRT
model_trt = torch2trt(
    model,
    [x],
    fp16_mode=True,
    max_batch_size=32
)

# Save engine
torch.save(model_trt.state_dict(), 'resnet50_trt.pth')

# Benchmark
import time


def _mean_latency(module, inputs, iters=100, warmup=10):
    """Return the mean wall-clock seconds per forward pass of ``module``.

    Args:
        module: Callable model taking ``inputs`` as its only argument.
        inputs: Tensor passed to every forward call.
        iters: Number of timed forward passes.
        warmup: Number of untimed passes run first, so one-time start-up
            costs are not included in the measurement.

    Returns:
        Elapsed seconds divided by ``iters``.
    """
    # Inference only: skip autograd bookkeeping during the measurement.
    with torch.no_grad():
        for _ in range(warmup):
            module(inputs)
        # CUDA kernels run asynchronously; drain the queue before and after
        # the timed region so the clock covers exactly the timed passes.
        torch.cuda.synchronize()
        start = time.perf_counter()
        for _ in range(iters):
            module(inputs)
        torch.cuda.synchronize()
    return (time.perf_counter() - start) / iters


# PyTorch
pytorch_time = _mean_latency(model, x)

# TensorRT
tensorrt_time = _mean_latency(model_trt, x)

print(f"PyTorch: {pytorch_time*1000:.2f}ms")
print(f"TensorRT: {tensorrt_time*1000:.2f}ms")
print(f"Speedup: {pytorch_time/tensorrt_time:.2f}x")

ONNX to TensorRT

import tensorrt as trt

def build_engine(onnx_path, engine_path, precision="fp16", input_name="input"):
    """Build a serialized TensorRT engine from an ONNX file and save it.

    Args:
        onnx_path: Path of the ONNX model to parse.
        engine_path: Destination file for the serialized engine.
        precision: ``"fp16"`` sets the FP16 builder flag and ``"int8"`` sets
            the INT8 flag; any other value leaves the builder flags untouched.
            No calibrator is attached here, so an ``"int8"`` build relies on
            whatever quantization information the network already carries.
        input_name: Name of the network input the dynamic-shape profile is
            registered for. Must match the input name in the ONNX model.

    Returns:
        The serialized engine, or ``None`` if ONNX parsing or the engine
        build failed. Nothing is written to ``engine_path`` on failure.
    """
    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )
    parser = trt.OnnxParser(network, logger)

    # Parse ONNX; report every parser error before giving up
    with open(onnx_path, 'rb') as f:
        if not parser.parse(f.read()):
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            return None

    # Build config
    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)  # 1GB

    if precision == "fp16":
        config.set_flag(trt.BuilderFlag.FP16)
    elif precision == "int8":
        config.set_flag(trt.BuilderFlag.INT8)
        # Need calibration for INT8

    # Dynamic shapes: batch dimension may range from 1 to 32, tuned for 8
    profile = builder.create_optimization_profile()
    profile.set_shape(
        input_name,
        min=(1, 3, 224, 224),
        opt=(8, 3, 224, 224),
        max=(32, 3, 224, 224)
    )
    config.add_optimization_profile(profile)

    # Build engine. A failed build yields None; check before opening the
    # output file so a failure neither truncates an existing engine nor
    # surfaces as a TypeError from f.write(None).
    engine = builder.build_serialized_network(network, config)
    if engine is None:
        print("TensorRT engine build failed; see the logger output for details")
        return None

    with open(engine_path, 'wb') as f:
        f.write(engine)

    return engine

# Convert
build_engine("model.onnx", "model.engine", "fp16")

INT8 Quantization

INT8 can provide 2x additional speedup over FP16:

import tensorrt as trt
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit

class Int8Calibrator(trt.IInt8EntropyCalibrator2):
    """Entropy calibrator that feeds fixed-size batches from in-memory data.

    Args:
        calibration_data: Indexable, sliceable collection of samples (for
            example a NumPy array whose first axis is the sample axis). Every
            sample is assumed to have the same byte size as the first one.
        cache_file: Path used to read and write the calibration cache.
        batch_size: Number of samples copied to the GPU per calibration step.

    Raises:
        ValueError: If ``calibration_data`` is empty or ``batch_size`` is not
            positive.
    """

    def __init__(self, calibration_data, cache_file, batch_size=32):
        super().__init__()
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")
        # len() rather than truthiness: NumPy arrays reject bool() on size > 1
        if len(calibration_data) == 0:
            raise ValueError("calibration_data must contain at least one sample")

        self.cache_file = cache_file
        self.data = calibration_data
        self.batch_idx = 0
        self.batch_size = batch_size

        # Allocate GPU memory for one full calibration batch
        self.device_input = cuda.mem_alloc(
            self.data[0].nbytes * self.batch_size
        )

    def get_batch_size(self):
        """Return the number of samples in every batch handed to TensorRT."""
        return self.batch_size

    def get_batch(self, names):
        """Copy the next full batch to the GPU and return its device pointer.

        Returns:
            A one-element list holding the device address of the batch, or
            ``None`` once fewer than ``batch_size`` samples remain. A trailing
            partial batch is skipped because the device buffer is sized for a
            full batch and its tail would otherwise hold stale data.
        """
        end = self.batch_idx + self.batch_size
        if end > len(self.data):
            return None

        batch = np.ascontiguousarray(self.data[self.batch_idx:end])
        cuda.memcpy_htod(self.device_input, batch)
        self.batch_idx = end
        return [int(self.device_input)]

    def read_calibration_cache(self):
        """Return the cached calibration bytes, or ``None`` if unreadable."""
        try:
            with open(self.cache_file, 'rb') as f:
                return f.read()
        except OSError:
            # Missing or unreadable cache: a None return means no cache is used
            return None

    def write_calibration_cache(self, cache):
        """Persist the calibration cache to ``cache_file``."""
        with open(self.cache_file, 'wb') as f:
            f.write(cache)

# Build INT8 engine with calibration
def build_int8_engine(onnx_path, engine_path, calibration_data,
                      cache_file="calibration.cache"):
    """Build an INT8 TensorRT engine from an ONNX file using calibration.

    Args:
        onnx_path: Path of the ONNX model to parse.
        engine_path: Destination file for the serialized engine.
        calibration_data: Samples handed to ``Int8Calibrator``.
        cache_file: Path of the calibration cache read and written by the
            calibrator.

    Returns:
        The serialized engine, or ``None`` if ONNX parsing or the engine
        build failed. Nothing is written to ``engine_path`` on failure.
    """
    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )
    parser = trt.OnnxParser(network, logger)

    # Stop on parse errors instead of building from a partial network
    with open(onnx_path, 'rb') as f:
        if not parser.parse(f.read()):
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            return None

    config = builder.create_builder_config()
    config.set_flag(trt.BuilderFlag.INT8)

    # Keep a local reference to the calibrator for the duration of the build
    calibrator = Int8Calibrator(calibration_data, cache_file)
    config.int8_calibrator = calibrator

    # A failed build yields None; check before opening the output file so a
    # failure does not truncate an existing engine or raise from f.write(None)
    engine = builder.build_serialized_network(network, config)
    if engine is None:
        print("TensorRT INT8 engine build failed; see the logger output for details")
        return None

    with open(engine_path, 'wb') as f:
        f.write(engine)

    return engine

⚠️ INT8 Accuracy

INT8 quantization may reduce accuracy. Always validate on your test set. Use representative calibration data for best results.

Stable Diffusion with TensorRT

# Using diffusers with TensorRT backend
pip install optimum[onnxruntime-gpu]

# The checkpoint below is an SDXL model, which Optimum serves through its
# dedicated XL pipeline class rather than the base Stable Diffusion one.
from optimum.onnxruntime import ORTStableDiffusionXLPipeline

# Convert and optimize: export=True converts the checkpoint to ONNX on load,
# and the provider settings route execution through ONNX Runtime's TensorRT
# execution provider with FP16 enabled and engine caching in ./trt_cache
pipe = ORTStableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    export=True,
    provider="TensorrtExecutionProvider",
    provider_options={
        "trt_fp16_enable": True,
        "trt_engine_cache_enable": True,
        "trt_engine_cache_path": "./trt_cache"
    }
)

# First run builds engine (slow)
# Subsequent runs use cached engine (fast)
image = pipe(
    "a beautiful sunset over mountains",
    num_inference_steps=20
).images[0]

Benchmark Results

| Model | PyTorch FP16 | TensorRT FP16 | TensorRT INT8 |
|---|---|---|---|
| ResNet-50 | 2.1 ms | 0.8 ms | 0.5 ms |
| BERT-base | 3.2 ms | 1.1 ms | 0.7 ms |
| LLaMA-8B (per token) | 15 ms | 6 ms | 4 ms |
| SDXL (1 image) | 4.2 sec | 1.8 sec | - |

Benchmarks on A100 80GB, batch size 1 unless noted

Run TensorRT on Cloud GPUs

A100 and H100 GPUs from $1.50/hr with TensorRT pre-installed.

Start Optimizing →

Production Deployment

Triton Inference Server with TensorRT

# model_repository/
# └── resnet50/
#     ├── config.pbtxt
#     └── 1/
#         └── model.plan  # TensorRT engine

# config.pbtxt
# Triton model configuration for the "resnet50" model, served from a
# TensorRT plan file.
name: "resnet50"
platform: "tensorrt_plan"
# Largest batch size accepted for this model
max_batch_size: 32

# Single FP16 input; dims are listed without the batch dimension
input [
  {
    name: "input"
    data_type: TYPE_FP16
    dims: [ 3, 224, 224 ]
  }
]

# Single FP16 output with 1000 values per sample
output [
  {
    name: "output"
    data_type: TYPE_FP16
    dims: [ 1000 ]
  }
]

# Run two instances of the model on GPU
instance_group [
  {
    count: 2
    kind: KIND_GPU
  }
]

# Let the server combine requests into batches, preferring sizes 8, 16 or 32
# and holding a request in the queue for at most 100 microseconds
dynamic_batching {
  preferred_batch_size: [ 8, 16, 32 ]
  max_queue_delay_microseconds: 100
}

Client Code

import tritonclient.http as httpclient
import numpy as np

# Connect to the server's HTTP endpoint
client = httpclient.InferenceServerClient(url="localhost:8000")

# Random FP16 tensor standing in for one preprocessed image
image = np.random.randn(1, 3, 224, 224).astype(np.float16)

# Describe the request tensor, then attach the data to it
infer_input = httpclient.InferInput("input", image.shape, "FP16")
infer_input.set_data_from_numpy(image)
inputs = [infer_input]

# Ask the server to return the tensor named "output"
outputs = [httpclient.InferRequestedOutput("output")]

# Run the request against the "resnet50" model and fetch the result as NumPy
result = client.infer("resnet50", inputs, outputs=outputs)
predictions = result.as_numpy("output")

print(f"Top prediction: {np.argmax(predictions)}")

Optimization Tips

1. Use Optimal Batch Sizes

TensorRT performs best with power-of-2 batch sizes. Profile different sizes:

# Sweep the power-of-two batch sizes from 1 through 32
for batch_size in (2 ** exponent for exponent in range(6)):
    # Placeholder body: build an engine with this max batch size,
    # then measure its throughput
    pass

2. Enable All Optimizations

# TensorRT-LLM optimizations
# Only the optimization flags are shown here.
# NOTE(review): a runnable command presumably also needs --checkpoint_dir and
# --output_dir, as in the engine build step earlier in this guide; flag
# availability varies by TensorRT-LLM release, so check `trtllm-build --help`.
trtllm-build \
    --gemm_plugin float16 \
    --paged_kv_cache enable \
    --use_custom_all_reduce enable \
    --multiple_profiles enable \
    --use_fused_mlp enable

3. Use Latest TensorRT Version

Each version brings significant improvements. TensorRT 10.x is 15-30% faster than 8.x for many workloads.

4. Match Engine to GPU

TensorRT engines are GPU-specific. Build separate engines for A100 vs H100.

Troubleshooting

Engine Build Fails

Slower Than Expected

Accuracy Issues with INT8

Conclusion

TensorRT is essential for production ML inference. Key takeaways:

Deploy your TensorRT-optimized models on GPUBrazil for maximum performance at minimal cost.