# Data Loading Optimization
## The Data Loading Problem
Your expensive GPU sits idle while waiting for data. This is one of the most common, and most overlooked, performance bottlenecks in deep learning.
## Quick Diagnosis
### Monitor GPU Utilization
```bash
# Watch GPU utilization in real-time
watch -n 0.1 nvidia-smi
# Or use nvtop for better visualization
nvtop
# Utilization that keeps jumping between low and high values = data bottleneck
```
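If you prefer to log utilization from inside a script, you can poll `nvidia-smi` programmatically. A minimal sketch (assumes `nvidia-smi` is on `PATH`; the sample count and 100 ms interval are arbitrary choices):

```python
import subprocess
import time

def sample_gpu_utilization(num_samples=50, interval_s=0.1):
    """Poll nvidia-smi and report min/mean/max GPU utilization."""
    samples = []
    for _ in range(num_samples):
        out = subprocess.run(
            ["nvidia-smi", "--query-gpu=utilization.gpu",
             "--format=csv,noheader,nounits"],
            capture_output=True, text=True, check=True,
        )
        # One line per GPU; sample the first GPU here
        samples.append(int(out.stdout.splitlines()[0]))
        time.sleep(interval_s)
    print(f"GPU util: min={min(samples)}% "
          f"mean={sum(samples) / len(samples):.0f}% max={max(samples)}%")
    # Large min/max swings while training suggest a data bottleneck

# sample_gpu_utilization()
```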
### Profile Your Data Pipeline

```python
import time
import torch

def profile_dataloader(dataloader, num_batches=100):
    """Profile data loading speed"""
    # Identity is a stand-in; substitute your real model for a realistic ratio
    model = torch.nn.Identity().cuda()

    # Warmup (spins up workers, fills caches)
    for i, (data, _) in enumerate(dataloader):
        if i >= 10:
            break

    # Profile data loading alone
    start = time.time()
    for i, (data, target) in enumerate(dataloader):
        if i >= num_batches:
            break
        data = data.cuda()
    torch.cuda.synchronize()
    data_time = time.time() - start

    # Profile with dummy compute
    start = time.time()
    for i, (data, target) in enumerate(dataloader):
        if i >= num_batches:
            break
        data = data.cuda()
        _ = model(data)  # Minimal compute
    torch.cuda.synchronize()
    total_time = time.time() - start

    data_loading_pct = (data_time / total_time) * 100
    print(f"Data loading: {data_time:.2f}s ({data_loading_pct:.1f}% of total)")
    print(f"Total time: {total_time:.2f}s")
    if data_loading_pct > 20:
        print("⚠️ Data loading is a bottleneck!")
    else:
        print("✓ Data loading is acceptable")

# Usage
# profile_dataloader(train_loader)
```
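PyTorch's built-in profiler can also break out time spent waiting on the DataLoader. A minimal sketch using `torch.profiler` (the function name and batch cap are mine, not a standard recipe):

```python
import torch
from torch.profiler import profile, ProfilerActivity

def profile_with_torch_profiler(model, dataloader, num_batches=20):
    """Inspect where time goes, including DataLoader waits."""
    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
        for i, (data, target) in enumerate(dataloader):
            if i >= num_batches:
                break
            data = data.cuda(non_blocking=True)
            _ = model(data)
    # DataLoader wait time shows up in rows named after enumerate(DataLoader)
    print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=15))
```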
## Solution 1: Increase num_workers

The single most important parameter:
PyTorch:

```python
from torch.utils.data import DataLoader

# Bad: Single-threaded data loading
train_loader = DataLoader(
    dataset,
    batch_size=32,
    num_workers=0  # ❌ Default - very slow!
)

# Good: Multi-process data loading
train_loader = DataLoader(
    dataset,
    batch_size=32,
    num_workers=4,            # ✓ Use 4-8 workers
    pin_memory=True,          # ✓ Faster GPU transfer
    persistent_workers=True   # ✓ Keep workers alive (PyTorch 1.7+)
)
```

TensorFlow:

```python
import tensorflow as tf

# Bad: No prefetching or parallelization
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.batch(32)

# Good: Multi-threaded with prefetching
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.cache()                     # ✓ Cache in memory if it fits
train_dataset = train_dataset.shuffle(buffer_size=10000)  # ✓ Shuffle after cache so order varies per epoch
train_dataset = train_dataset.batch(32)
train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)  # ✓ Auto-tune prefetching

# Or with explicit parallelization
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.map(
    preprocess_fn,
    num_parallel_calls=tf.data.AUTOTUNE  # ✓ Parallel preprocessing
)
train_dataset = train_dataset.batch(32)
train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)
```
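`pin_memory=True` pays off most when paired with asynchronous host-to-device copies on the training side. A minimal sketch of the matching loop (assumes `model`, `criterion`, and `optimizer` from your training setup):

```python
# pin_memory=True in the DataLoader plus non_blocking=True here lets the
# host-to-device copy overlap with GPU compute
for data, target in train_loader:
    data = data.cuda(non_blocking=True)
    target = target.cuda(non_blocking=True)

    optimizer.zero_grad()
    output = model(data)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()
```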
### Finding Optimal num_workers

```python
import time

def benchmark_dataloader(dataset, batch_size, num_workers_list=(0, 2, 4, 8, 16)):
    """Test different num_workers settings"""
    results = {}
    for num_workers in num_workers_list:
        loader = DataLoader(
            dataset,
            batch_size=batch_size,
            num_workers=num_workers,
            pin_memory=True
        )
        start = time.time()
        for i, (data, target) in enumerate(loader):
            if i >= 100:  # Test 100 batches
                break
            data = data.cuda()
        elapsed = time.time() - start
        results[num_workers] = elapsed
        print(f"num_workers={num_workers}: {elapsed:.2f}s")

    # Find optimal
    optimal = min(results, key=results.get)
    print(f"\n✓ Optimal num_workers: {optimal}")
    return optimal

# Usage
# optimal_workers = benchmark_dataloader(train_dataset, batch_size=32)
```

General guidelines:
- Start with `num_workers=4`
- Increase to 8-16 for complex augmentations
- Don't exceed the number of CPU cores (see the sketch below)
- More workers ≠ always better (diminishing returns + overhead)
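As a starting point, you can derive a worker count from the machine itself. A hedged heuristic sketch (the cap of 8 is an assumption, not a universal rule):

```python
import os

def suggested_num_workers(max_workers=8):
    """Heuristic starting point: one worker per core, capped."""
    cores = os.cpu_count() or 1
    return min(cores, max_workers)

# Benchmark around this value rather than trusting it blindly, e.g.
# benchmark_dataloader(dataset, 32, num_workers_list=[suggested_num_workers()])
```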
## Solution 2: Data Prefetching
Load the next batch while the GPU processes the current batch:

```python
class DataPrefetcher:
    """Prefetch data to GPU while processing current batch"""
    def __init__(self, loader):
        self.loader = iter(loader)
        self.stream = torch.cuda.Stream()
        self.preload()

    def preload(self):
        try:
            self.next_data, self.next_target = next(self.loader)
        except StopIteration:
            self.next_data = None
            self.next_target = None
            return
        with torch.cuda.stream(self.stream):
            self.next_data = self.next_data.cuda(non_blocking=True)
            self.next_target = self.next_target.cuda(non_blocking=True)

    def next(self):
        torch.cuda.current_stream().wait_stream(self.stream)
        data = self.next_data
        target = self.next_target
        if data is not None:
            data.record_stream(torch.cuda.current_stream())
        if target is not None:
            target.record_stream(torch.cuda.current_stream())
        self.preload()
        return data, target

# Usage
prefetcher = DataPrefetcher(train_loader)
data, target = prefetcher.next()
while data is not None:
    # Your training code
    optimizer.zero_grad()
    output = model(data)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()
    data, target = prefetcher.next()
```
## Solution 3: Optimize Data Augmentation

### Use Efficient Augmentation Libraries
```python
# Slow: PIL-based transforms
from torchvision import transforms

slow_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(0.4, 0.4, 0.4),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# Fast: GPU-accelerated transforms (NVIDIA DALI)
import nvidia.dali as dali
import nvidia.dali.fn as fn

@dali.pipeline_def
def create_dali_pipeline(data_dir, batch_size, num_threads):
    images, labels = fn.readers.file(
        file_root=data_dir,
        random_shuffle=True,
        name="Reader"
    )
    images = fn.decoders.image(images, device="mixed")  # Decode on GPU
    images = fn.random_resized_crop(images, size=224, device="gpu")
    images = fn.flip(images, horizontal=1, device="gpu")
    images = fn.normalize(images, device="gpu",
                          mean=[0.485 * 255, 0.456 * 255, 0.406 * 255],
                          stddev=[0.229 * 255, 0.224 * 255, 0.225 * 255])
    return images, labels

# Can be 2-3x faster than CPU augmentation
```
### Use kornia for GPU Augmentation

```python
import torch
import torch.nn as nn
import kornia.augmentation as K

class GPUAugmentation(nn.Module):
    """Apply augmentation on GPU"""
    def __init__(self):
        super().__init__()
        self.transform = nn.Sequential(
            K.RandomResizedCrop(size=(224, 224)),
            K.RandomHorizontalFlip(),
            K.ColorJitter(0.4, 0.4, 0.4, 0.1),
            K.Normalize(mean=torch.tensor([0.485, 0.456, 0.406]),
                        std=torch.tensor([0.229, 0.224, 0.225]))
        )

    def forward(self, x):
        return self.transform(x)

# Usage in training loop
augmentation = GPUAugmentation().cuda()
for data, target in train_loader:
    data = data.cuda()
    data = augmentation(data)  # Apply on GPU
    output = model(data)
```
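Once augmentation moves to the GPU, the CPU-side transform can shrink to decoding plus tensor conversion, which lightens the load on workers. A sketch of the matching loader side (`cpu_transform` is an assumed name; the resize/crop sizes are placeholders chosen so batches have uniform shape):

```python
from torchvision import transforms

# Worker-side transform only needs to produce fixed-size tensors;
# all randomness happens in GPUAugmentation on the device
cpu_transform = transforms.Compose([
    transforms.Resize(256),      # Cheap deterministic resize
    transforms.CenterCrop(224),  # Uniform shape so the default collate can batch
    transforms.ToTensor(),
])
```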
## Solution 4: Cache Small Datasets

If the dataset fits in RAM, cache it:
```python
class CachedDataset(torch.utils.data.Dataset):
    """Cache entire dataset in RAM"""
    def __init__(self, dataset):
        self.dataset = dataset
        self.cache = {}
        print("Caching dataset...")
        for i in range(len(dataset)):
            self.cache[i] = dataset[i]
            if i % 1000 == 0:
                print(f"Cached {i}/{len(dataset)} samples")

    def __getitem__(self, idx):
        return self.cache[idx]

    def __len__(self):
        return len(self.dataset)

# Usage
cached_dataset = CachedDataset(original_dataset)
train_loader = DataLoader(cached_dataset, batch_size=32, num_workers=4)
```
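If every sample has the same shape, stacking the whole dataset into a pair of tensors is even simpler and avoids per-sample Python overhead. A hedged sketch using `torch.utils.data.TensorDataset` (assumes samples are `(image_tensor, label)` pairs of uniform shape):

```python
import torch
from torch.utils.data import TensorDataset, DataLoader

def to_tensor_dataset(dataset):
    """Materialize a map-style dataset into two big tensors."""
    images = torch.stack([dataset[i][0] for i in range(len(dataset))])
    labels = torch.tensor([dataset[i][1] for i in range(len(dataset))])
    return TensorDataset(images, labels)

# With everything already in RAM as tensors, num_workers=0 is often fastest
# train_loader = DataLoader(to_tensor_dataset(original_dataset), batch_size=32)
```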
## Solution 5: Optimize Image Loading

### Use Efficient Image Formats
```python
import numpy as np
from PIL import Image
from pathlib import Path
from torch.utils.data import Dataset

# Slow: Load and decode on-the-fly
class SlowImageDataset(Dataset):
    def __getitem__(self, idx):
        img = Image.open(self.image_paths[idx])  # Decode JPEG
        img = self.transform(img)
        return img

# Fast: Pre-decode and store as .npy or .pt
class FastImageDataset(Dataset):
    def __getitem__(self, idx):
        img = np.load(self.image_paths[idx])  # Already decoded
        img = self.transform(img)
        return img

# Pre-process script
def preprocess_dataset(image_dir, output_dir):
    """Convert images to pre-decoded numpy arrays"""
    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True)
    for img_path in Path(image_dir).glob("*.jpg"):
        img = Image.open(img_path)
        img_array = np.array(img)
        output_path = output_dir / f"{img_path.stem}.npy"
        np.save(output_path, img_array)
```
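Pre-decoded `.npy` files also allow memory-mapped reads, so workers touch only the bytes they actually use instead of loading whole arrays. A small sketch (the file name and crop indices are placeholders; assumes arrays were saved with `np.save` as above):

```python
import numpy as np

# Memory-map the array: pages are read from disk only when accessed
img = np.load("sample.npy", mmap_mode="r")
crop = np.asarray(img[100:324, 100:324])  # Copies just the cropped region
```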
### Use Pillow-SIMD

```bash
# Install faster PIL replacement
pip uninstall pillow
pip install pillow-simd
# Can be 4-6x faster for image loading
```
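To confirm the install actually pays off on your data, time decoding before and after. A minimal sketch (the directory path and sample count are placeholders):

```python
import time
from pathlib import Path
from PIL import Image

def benchmark_decode(image_dir, num_images=500):
    """Time JPEG decode throughput with the currently installed PIL."""
    paths = list(Path(image_dir).glob("*.jpg"))[:num_images]
    start = time.time()
    for p in paths:
        Image.open(p).convert("RGB")  # Forces a full decode
    elapsed = time.time() - start
    print(f"{len(paths) / elapsed:.1f} images/sec")

# Run once with pillow, once with pillow-simd, and compare
# benchmark_decode("/path/to/images")
```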
## Solution 6: Reduce I/O Bottleneck

### Use Faster Storage
```python
# Check if data is on slow storage
import time
from pathlib import Path

def benchmark_storage(data_path, num_samples=1000):
    """Benchmark storage read speed"""
    # Note: the OS page cache makes repeat runs fast; use fresh files for a fair test
    files = list(Path(data_path).glob("*.jpg"))[:num_samples]
    start = time.time()
    for f in files:
        _ = f.read_bytes()
    elapsed = time.time() - start
    throughput = len(files) / elapsed
    print(f"Storage throughput: {throughput:.1f} images/sec")
    if throughput < 100:
        print("⚠️ Slow storage detected!")
        print("Consider: SSD, NVMe, or RAM disk")
```
### Copy to Local SSD

```bash
# If data is on network storage, copy to local SSD
# Add this to your training script startup
LOCAL_DATA="/tmp/dataset"
REMOTE_DATA="/network/slow/storage/dataset"
if [ ! -d "$LOCAL_DATA" ]; then
    echo "Copying dataset to local SSD..."
    mkdir -p "$LOCAL_DATA"
    rsync -av --progress "$REMOTE_DATA"/ "$LOCAL_DATA"/
fi
# Use LOCAL_DATA in your training
python train.py --data-path "$LOCAL_DATA"
```
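If the dataset is small enough, a RAM-backed tmpfs mount goes one step further than a local SSD. A hedged sketch (requires root; the 16G size and paths are placeholders, and contents vanish on reboot):

```bash
# Mount a RAM disk and stage the dataset there
sudo mkdir -p /mnt/ramdisk
sudo mount -t tmpfs -o size=16G tmpfs /mnt/ramdisk
rsync -a "$LOCAL_DATA"/ /mnt/ramdisk/dataset/
python train.py --data-path /mnt/ramdisk/dataset
```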
## Solution 7: Optimize Batch Assembly

### Use Efficient Collation
```python
# The default collate_fn can be slow for variable-size data
def fast_collate_fn(batch):
    """Faster collation for specific data types"""
    # Pre-allocate tensors
    imgs = torch.zeros((len(batch), 3, 224, 224))
    targets = torch.zeros(len(batch), dtype=torch.long)
    for i, (img, target) in enumerate(batch):
        imgs[i] = img
        targets[i] = target
    return imgs, targets

train_loader = DataLoader(
    dataset,
    batch_size=32,
    collate_fn=fast_collate_fn,  # Use custom collation
    num_workers=4
)
```
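For genuinely variable-length samples (e.g. token sequences), the usual pattern is a padding collate built on `torch.nn.utils.rnn.pad_sequence`. A hedged sketch (assumes each sample is a `(sequence_tensor, label)` pair):

```python
import torch
from torch.nn.utils.rnn import pad_sequence

def pad_collate_fn(batch):
    """Pad variable-length sequences to the longest in the batch."""
    seqs, labels = zip(*batch)
    lengths = torch.tensor([len(s) for s in seqs])
    padded = pad_sequence(list(seqs), batch_first=True)  # (batch, max_len, ...)
    return padded, lengths, torch.tensor(labels)

# train_loader = DataLoader(dataset, batch_size=32, collate_fn=pad_collate_fn)
```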
## Performance Comparison

| Optimization | Expected Speedup | Effort |
|---|---|---|
| num_workers=4 | 2-4x | Low ⭐ |
| pin_memory=True | 1.1-1.2x | Low ⭐ |
| Data prefetching | 1.2-1.5x | Medium |
| GPU augmentation | 1.5-3x | Medium |
| Cache dataset | 2-5x | Low (if fits RAM) |
| Fast storage | 2-10x | High (hardware) |
| DALI pipeline | 2-3x | High |
## Best Practices Checklist

- Always set `num_workers ≥ 4` (easiest win)
- Enable `pin_memory` for GPU training: `DataLoader(..., pin_memory=True)`
- Use `persistent_workers` to avoid respawning: `DataLoader(..., persistent_workers=True)`
- Profile before optimizing: measure the actual bottleneck
- Test with different batch sizes: larger batches → less data loading overhead
- Monitor CPU usage: if maxed out, reduce augmentation complexity
- Use `prefetch_factor` (PyTorch 1.7+): `DataLoader(..., prefetch_factor=2)` loads 2 batches ahead per worker
## Complete Optimized DataLoader
PyTorch:

```python
from torch.utils.data import DataLoader

# Production-ready configuration
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=8,            # Multi-process loading
    pin_memory=True,          # Faster GPU transfer
    persistent_workers=True,  # Keep workers alive
    prefetch_factor=2,        # Prefetch 2 batches per worker
    drop_last=True,           # Avoid small last batch
)
```

TensorFlow:

```python
import tensorflow as tf

# Production-ready configuration
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))

# Optimize pipeline
train_dataset = (
    train_dataset
    .map(preprocess_fn,
         num_parallel_calls=tf.data.AUTOTUNE)  # Parallel preprocessing
    .cache()                     # Cache in memory (only if preprocess_fn is deterministic)
    .shuffle(buffer_size=10000)  # Shuffle after cache so order varies per epoch
    .batch(32, drop_remainder=True)  # Avoid small last batch
    .prefetch(tf.data.AUTOTUNE)      # Auto-tune prefetching
)

# For advanced control
options = tf.data.Options()
options.threading.private_threadpool_size = 8  # Number of threads
options.threading.max_intra_op_parallelism = 1
train_dataset = train_dataset.with_options(options)
```
## Debugging Data Loading Issues

### Issue: Workers Timing Out
```python
# Increase timeout if processing is slow
train_loader = DataLoader(
    dataset,
    num_workers=4,
    timeout=600  # Wait up to 10 minutes for a batch
)
```

### Issue: Memory Leak with num_workers > 0
```python
# Workers can appear to leak memory when forked; switching the start
# method to 'spawn' avoids copy-on-write memory growth
import torch.multiprocessing as mp
mp.set_start_method('spawn', force=True) # Instead of 'fork'
# Or reduce num_workers
```
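A related, well-known gotcha: storing samples or metadata in plain Python lists inside a Dataset makes every worker's reads touch object refcounts, so copy-on-write pages get duplicated and memory appears to leak. Converting to numpy arrays avoids this. A hedged sketch (`load_sample` is a hypothetical helper for your own loading logic):

```python
import numpy as np
import torch

class ArrayBackedDataset(torch.utils.data.Dataset):
    """Store metadata as numpy arrays, not Python lists, so forked
    workers do not duplicate copy-on-write pages via refcount updates."""
    def __init__(self, paths, labels):
        self.paths = np.array(paths)    # Instead of a Python list
        self.labels = np.array(labels)

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        return load_sample(self.paths[idx]), self.labels[idx]  # load_sample: your loader
```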
### Issue: Slow First Epoch

Workers need warmup, so a slow first epoch is normal. Use `persistent_workers=True` so workers are not respawned on every subsequent epoch.
## Key Takeaways

- Data loading is often the bottleneck, not compute
- Start with `num_workers=4` and `pin_memory=True`
- Profile to identify the actual bottleneck before complex optimizations
- GPU augmentation can be 2-3x faster than CPU
- Fast storage matters: SSD >> HDD, local >> network
- Cache small datasets in RAM for maximum speed
- Monitor GPU utilization to detect data bottlenecks