Data Loading Optimization

Your expensive GPU sits idle while waiting for data. This is one of the most common—and overlooked—performance bottlenecks in deep learning.

# Watch GPU utilization in real-time
watch -n 0.1 nvidia-smi

# Or use nvtop for better visualization
nvtop
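
The same utilization counters can be sampled from inside a script via NVML (assumes the nvidia-ml-py bindings are installed):

# A minimal sketch: sample GPU utilization every 100 ms for ~5 seconds
import time
import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)   # GPU 0

for _ in range(50):
    util = pynvml.nvmlDeviceGetUtilizationRates(handle)
    print(f"GPU util: {util.gpu:3d}%  memory util: {util.memory:3d}%")
    time.sleep(0.1)

pynvml.nvmlShutdown()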

If utilization keeps bouncing between low and high values, the GPU is waiting on data. The helper below quantifies how much time goes to data loading:

import time
import torch

def profile_dataloader(dataloader, model=None, num_batches=100):
    """Compare pure data-loading time against loading plus a forward pass."""
    # Pass your real model for a meaningful comparison; with the Identity
    # fallback the data-loading share will naturally look close to 100%.
    if model is None:
        model = torch.nn.Identity()
    model = model.cuda()

    # Warmup: spin up the workers and fill their prefetch queues
    warmup_iter = iter(dataloader)
    for _ in range(10):
        next(warmup_iter)

    # Pass 1: data loading + host-to-device copy only
    start = time.time()
    for i, (data, target) in enumerate(dataloader):
        if i >= num_batches:
            break
        data = data.cuda()
    torch.cuda.synchronize()
    data_time = time.time() - start

    # Pass 2: data loading + copy + forward pass
    start = time.time()
    for i, (data, target) in enumerate(dataloader):
        if i >= num_batches:
            break
        data = data.cuda()
        _ = model(data)
    torch.cuda.synchronize()
    total_time = time.time() - start

    data_loading_pct = (data_time / total_time) * 100
    print(f"Data loading: {data_time:.2f}s ({data_loading_pct:.1f}% of total)")
    print(f"Total time: {total_time:.2f}s")

    if data_loading_pct > 20:
        print("⚠️  Data loading is a bottleneck!")
    else:
        print("✓ Data loading is acceptable")

# Usage
# profile_dataloader(train_loader)
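
Another way to measure, without a separate profiling pass, is to time how long each step blocks on the loader inside the real training loop. A minimal sketch, where train_step is a placeholder for your own forward/backward code:

def train_with_data_timing(model, loader, train_step, log_every=50):
    """Log what fraction of step time is spent waiting on the DataLoader."""
    data_wait, step_total = 0.0, 0.0
    end = time.time()
    for i, (data, target) in enumerate(loader):
        data_wait += time.time() - end            # time blocked on the loader
        train_step(model, data.cuda(), target.cuda())
        torch.cuda.synchronize()                  # include the GPU work in step time
        step_total += time.time() - end
        end = time.time()
        if (i + 1) % log_every == 0:
            print(f"Data wait: {100 * data_wait / step_total:.1f}% of step time")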

The single most important DataLoader parameter is num_workers:

from torch.utils.data import DataLoader

# Bad: Single-threaded data loading
train_loader = DataLoader(
    dataset,
    batch_size=32,
    num_workers=0  # ❌ Default - very slow!
)

# Good: Multi-process data loading
train_loader = DataLoader(
    dataset,
    batch_size=32,
    num_workers=4,        # ✓ Use 4-8 workers
    pin_memory=True,      # ✓ Faster GPU transfer
    persistent_workers=True  # ✓ Keep workers alive (PyTorch 1.7+)
)
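
pin_memory pairs with non_blocking=True: copies from pinned host memory can overlap with GPU compute. A minimal sketch of that transfer pattern in the training loop:

def train_one_epoch(model, criterion, optimizer, loader):
    for data, target in loader:
        # Asynchronous host-to-device copies; requires pin_memory=True in the loader
        data = data.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)
        optimizer.zero_grad()
        loss = criterion(model(data), target)
        loss.backward()
        optimizer.step()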

How many workers is right for your dataset and machine? Benchmark it:

import time

def benchmark_dataloader(dataset, batch_size, num_workers_list=[0, 2, 4, 8, 16]):
    """Test different num_workers settings"""
    results = {}

    for num_workers in num_workers_list:
        loader = DataLoader(
            dataset,
            batch_size=batch_size,
            num_workers=num_workers,
            pin_memory=True
        )

        start = time.time()
        for i, (data, target) in enumerate(loader):
            if i >= 100:  # Test 100 batches
                break
            data = data.cuda()
        elapsed = time.time() - start

        results[num_workers] = elapsed
        print(f"num_workers={num_workers}: {elapsed:.2f}s")

    # Find optimal
    optimal = min(results, key=results.get)
    print(f"\n✓ Optimal num_workers: {optimal}")
    return optimal

# Usage
# optimal_workers = benchmark_dataloader(train_dataset, batch_size=32)

General guidelines (a starting-point heuristic is sketched after this list):

  • Start with num_workers=4
  • Increase to 8-16 for complex augmentations
  • Don’t exceed number of CPU cores
  • More workers ≠ always better (diminishing returns + overhead)
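
A reasonable default, assuming nothing more than the CPU core count (tune it with the benchmark above; the cap values here are arbitrary):

import os

def default_num_workers(num_gpus=1, per_gpu=4, cap=16):
    """Heuristic: a few workers per GPU, never more than the available cores."""
    cores = os.cpu_count() or 1
    return max(1, min(per_gpu * num_gpus, cores, cap))

# train_loader = DataLoader(dataset, batch_size=32,
#                           num_workers=default_num_workers(), pin_memory=True)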

Load next batch while GPU processes current batch:

class DataPrefetcher:
    """Prefetch the next batch to the GPU on a side stream while the current
    batch is being processed. The wrapped DataLoader should use pin_memory=True
    so the non_blocking copies can actually overlap with compute."""
    def __init__(self, loader):
        self.loader = iter(loader)
        self.stream = torch.cuda.Stream()
        self.preload()

    def preload(self):
        try:
            self.next_data, self.next_target = next(self.loader)
        except StopIteration:
            self.next_data = None
            self.next_target = None
            return

        with torch.cuda.stream(self.stream):
            self.next_data = self.next_data.cuda(non_blocking=True)
            self.next_target = self.next_target.cuda(non_blocking=True)

    def next(self):
        torch.cuda.current_stream().wait_stream(self.stream)
        data = self.next_data
        target = self.next_target
        if data is not None:
            data.record_stream(torch.cuda.current_stream())
        if target is not None:
            target.record_stream(torch.cuda.current_stream())
        self.preload()
        return data, target

# Usage
prefetcher = DataPrefetcher(train_loader)
data, target = prefetcher.next()
while data is not None:
    # Your training code
    optimizer.zero_grad()
    output = model(data)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()

    data, target = prefetcher.next()

CPU-side augmentation is the next usual suspect. Compare a standard PIL/torchvision pipeline with GPU-accelerated alternatives:

# Slow: PIL-based transforms
from torchvision import transforms

slow_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(0.4, 0.4, 0.4),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                        std=[0.229, 0.224, 0.225])
])

# Fast: GPU-accelerated transforms (NVIDIA DALI)
import nvidia.dali as dali
import nvidia.dali.fn as fn
import nvidia.dali.types as types

@dali.pipeline_def
def create_dali_pipeline(data_dir):
    # batch_size, num_threads and device_id are supplied when the pipeline is built
    images, labels = fn.readers.file(
        file_root=data_dir,
        random_shuffle=True,
        name="Reader"
    )
    images = fn.decoders.image(images, device="mixed")  # JPEG decode on GPU
    images = fn.random_resized_crop(images, size=224, device="gpu")
    mirror = fn.random.coin_flip(probability=0.5)       # random horizontal flip
    images = fn.crop_mirror_normalize(
        images,
        device="gpu",
        dtype=types.FLOAT,
        output_layout="CHW",
        mirror=mirror,
        mean=[0.485 * 255, 0.456 * 255, 0.406 * 255],
        std=[0.229 * 255, 0.224 * 255, 0.225 * 255]
    )
    return images, labels
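
To consume the pipeline from PyTorch, DALI provides a plugin iterator. A sketch, assuming an image-folder layout under a placeholder data path:

from nvidia.dali.plugin.pytorch import DALIGenericIterator

pipe = create_dali_pipeline(data_dir="/path/to/train",
                            batch_size=32, num_threads=4, device_id=0)
pipe.build()
train_iter = DALIGenericIterator(pipe, ["data", "label"], reader_name="Reader")

for batch in train_iter:
    data = batch[0]["data"]                      # already on the GPU, CHW float
    label = batch[0]["label"].squeeze(-1).long()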

A DALI pipeline like this can be 2-3x faster than CPU augmentation. A lighter-weight alternative is Kornia, which runs torchvision-style augmentations as regular PyTorch ops on the GPU:

import torch
import torch.nn as nn
import kornia.augmentation as K

class GPUAugmentation(nn.Module):
    """Apply augmentation on GPU"""
    def __init__(self):
        super().__init__()
        self.transform = nn.Sequential(
            K.RandomResizedCrop(size=(224, 224)),
            K.RandomHorizontalFlip(),
            K.ColorJitter(0.4, 0.4, 0.4, 0.1),
            K.Normalize(mean=torch.tensor([0.485, 0.456, 0.406]),
                        std=torch.tensor([0.229, 0.224, 0.225]))
        )

    def forward(self, x):
        return self.transform(x)

# Usage in training loop (the Dataset's CPU-side transform now only needs ToTensor)
augmentation = GPUAugmentation().cuda()

for data, target in train_loader:
    data = data.cuda()
    data = augmentation(data)  # Apply on GPU
    output = model(data)

If the dataset fits in RAM, cache it once and serve every epoch from memory:

class CachedDataset(torch.utils.data.Dataset):
    """Cache entire dataset in RAM"""
    def __init__(self, dataset):
        self.dataset = dataset
        self.cache = {}
        print("Caching dataset...")
        for i in range(len(dataset)):
            self.cache[i] = dataset[i]
            if i % 1000 == 0:
                print(f"Cached {i}/{len(dataset)} samples")

    def __getitem__(self, idx):
        return self.cache[idx]

    def __len__(self):
        return len(self.dataset)

# Usage
cached_dataset = CachedDataset(original_dataset)
# With samples already in RAM there is little work left for the workers,
# so a couple of workers (or even num_workers=0) is usually enough
train_loader = DataLoader(cached_dataset, batch_size=32, num_workers=2)

Another option: pre-decode images once so the workers skip JPEG decoding entirely:

import numpy as np
from PIL import Image
from torch.utils.data import Dataset

# Slow: load and decode JPEGs on the fly
class SlowImageDataset(Dataset):
    def __init__(self, image_paths, transform):
        self.image_paths, self.transform = image_paths, transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img = Image.open(self.image_paths[idx])  # JPEG decode on every access
        return self.transform(img)

# Fast: pre-decoded arrays stored as .npy (same dataset, different __getitem__)
class FastImageDataset(SlowImageDataset):
    def __getitem__(self, idx):
        img = np.load(self.image_paths[idx])  # already decoded
        return self.transform(img)

# Pre-process script
import numpy as np
from PIL import Image
from pathlib import Path

def preprocess_dataset(image_dir, output_dir):
    """Convert images to pre-decoded numpy arrays"""
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    for img_path in Path(image_dir).glob("*.jpg"):
        img = Image.open(img_path).convert("RGB")  # ensure a consistent 3-channel layout
        img_array = np.array(img)

        output_path = output_dir / f"{img_path.stem}.npy"
        np.save(output_path, img_array)

A drop-in CPU-side speedup is pillow-simd, a SIMD-optimized fork of Pillow:

# Install the faster Pillow replacement
pip uninstall pillow
pip install pillow-simd

# Can be 4-6x faster for image decoding

Storage itself can be the bottleneck, especially HDDs and network filesystems:

# Check if data is on slow storage
import time
from pathlib import Path

def benchmark_storage(data_path, num_samples=1000):
    """Benchmark storage read speed"""
    files = list(Path(data_path).glob("*.jpg"))[:num_samples]

    start = time.time()
    for f in files:
        _ = f.read_bytes()
    elapsed = time.time() - start

    throughput = len(files) / elapsed
    print(f"Storage throughput: {throughput:.1f} images/sec")

    if throughput < 100:
        print("⚠️  Slow storage detected!")
        print("Consider: SSD, NVMe, or RAM disk")

# If data is on network storage, copy to local SSD
# Add this to your training script startup

LOCAL_DATA="/tmp/dataset"
REMOTE_DATA="/network/slow/storage/dataset"

if [ ! -d "$LOCAL_DATA" ]; then
    echo "Copying dataset to local SSD..."
    mkdir -p "$LOCAL_DATA"
    rsync -av --progress "$REMOTE_DATA/" "$LOCAL_DATA/"
fi

# Use LOCAL_DATA in your training
python train.py --data-path $LOCAL_DATA

The default collate_fn adds per-sample Python overhead. For fixed-size images you can pre-allocate the batch tensor and fill it directly:

def fast_collate_fn(batch):
    """Faster collation for specific data types"""
    # Pre-allocate tensor
    imgs = torch.zeros((len(batch), 3, 224, 224))
    targets = torch.zeros(len(batch), dtype=torch.long)

    for i, (img, target) in enumerate(batch):
        imgs[i] = img
        targets[i] = target

    return imgs, targets

train_loader = DataLoader(
    dataset,
    batch_size=32,
    collate_fn=fast_collate_fn,  # Use custom collation
    num_workers=4
)
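
For genuinely variable-size samples (e.g. token sequences), the usual pattern is to pad inside a custom collate_fn. A minimal sketch, assuming each sample is a 1-D LongTensor plus an integer label:

from torch.nn.utils.rnn import pad_sequence

def pad_collate_fn(batch):
    """Pad variable-length sequences to the longest one in the batch."""
    seqs, labels = zip(*batch)
    lengths = torch.tensor([len(s) for s in seqs])
    padded = pad_sequence(seqs, batch_first=True, padding_value=0)
    return padded, lengths, torch.tensor(labels)

# loader = DataLoader(dataset, batch_size=32, collate_fn=pad_collate_fn, num_workers=4)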

A rough summary of expected gains:

| Optimization     | Expected Speedup | Effort                   |
|------------------|------------------|--------------------------|
| num_workers=4    | 2-4x             | Low ⭐                   |
| pin_memory=True  | 1.1-1.2x         | Low ⭐                   |
| Data prefetching | 1.2-1.5x         | Medium                   |
| GPU augmentation | 1.5-3x           | Medium                   |
| Cache dataset    | 2-5x             | Low (if it fits in RAM)  |
| Fast storage     | 2-10x            | High (hardware)          |
| DALI pipeline    | 2-3x             | High                     |

Practical checklist:

  1. Always set num_workers ≥ 4 (easiest win)

  2. Enable pin_memory for GPU training:

    DataLoader(..., pin_memory=True)
  3. Use persistent_workers to avoid respawning:

    DataLoader(..., persistent_workers=True)
  4. Profile before optimizing - measure actual bottleneck

  5. Test with different batch sizes - larger batches → less data loading overhead

  6. Monitor CPU usage - if the cores are maxed out, reduce augmentation complexity (a quick probe is sketched after this list)

  7. Tune prefetch_factor (PyTorch 1.7+) - how many batches each worker loads ahead:

    DataLoader(..., prefetch_factor=4)  # Default is 2 batches per worker
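
For point 6, a quick CPU/RAM probe (assumes psutil is installed: pip install psutil). If the CPU sits near 100% while the GPU idles, the workers cannot keep up:

import time
import psutil

for _ in range(10):
    # cpu_percent(interval=1.0) averages utilization over a 1-second window
    print(f"CPU: {psutil.cpu_percent(interval=1.0):5.1f}%  "
          f"RAM: {psutil.virtual_memory().percent:5.1f}%")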

Putting it all together:

from torch.utils.data import DataLoader

# Production-ready configuration
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=8,              # Multi-process loading
    pin_memory=True,            # Faster GPU transfer
    persistent_workers=True,    # Keep workers alive
    prefetch_factor=2,          # Prefetch 2 batches
    drop_last=True,             # Avoid small last batch
)

Common DataLoader issues and fixes:

# Workers time out because per-sample processing is slow
train_loader = DataLoader(
    dataset,
    num_workers=4,
    timeout=600  # Wait up to 10 minutes for a batch before raising an error
)

# Workers use too much memory: 'spawn' workers start from a clean interpreter
# instead of inheriting the parent process's memory via fork (slower startup)
import torch.multiprocessing as mp

mp.set_start_method('spawn', force=True)  # Instead of 'fork'

# ...or simply reduce num_workers

# The first iterations of an epoch are slow: that's worker startup, which is
# normal; persistent_workers=True keeps workers alive between epochs

Key takeaways:

  • Data loading is often the bottleneck, not compute
  • Start with num_workers=4 and pin_memory=True
  • Profile to identify actual bottleneck before complex optimizations
  • GPU augmentation can be 2-3x faster than CPU
  • Fast storage matters - SSD >> HDD, local >> network
  • Cache small datasets in RAM for maximum speed
  • Monitor GPU utilization to detect data bottlenecks