Learning Rate Optimization
Why Learning Rate Matters
The learning rate is arguably the most important hyperparameter in deep learning, and its value largely determines how training behaves:
- Too High: Diverges, produces NaN losses, or oscillates wildly
- Too Low: Trains painfully slowly or gets stuck in poor local minima
- Just Right: Converges quickly to good solutions
Quick Start: Finding Your Learning Rate
The Learning Rate Range Test
The most reliable method to find a good learning rate:
PyTorch:

```python
import torch
import matplotlib.pyplot as plt


def find_lr(model, train_loader, optimizer, criterion,
            start_lr=1e-7, end_lr=10, num_iter=100):
    """Perform a learning rate range test."""
    lrs, losses = [], []
    lr_mult = (end_lr / start_lr) ** (1 / num_iter)
    lr = start_lr
    optimizer.param_groups[0]['lr'] = lr
    best_loss = float('inf')
    batch_iter = iter(train_loader)

    for i in range(num_iter):
        try:
            data, target = next(batch_iter)
        except StopIteration:
            batch_iter = iter(train_loader)
            data, target = next(batch_iter)
        data, target = data.cuda(), target.cuda()

        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)

        # Stop if loss explodes
        if loss.item() > 4 * best_loss or torch.isnan(loss):
            break
        if loss.item() < best_loss:
            best_loss = loss.item()

        lrs.append(lr)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()

        # Update learning rate
        lr *= lr_mult
        optimizer.param_groups[0]['lr'] = lr

    # Plot results
    plt.figure(figsize=(10, 6))
    plt.plot(lrs, losses)
    plt.xscale('log')
    plt.xlabel('Learning Rate')
    plt.ylabel('Loss')
    plt.title('Learning Rate Range Test')
    plt.grid(True)
    plt.savefig('lr_range_test.png')

    return lrs, losses


# Usage
# lrs, losses = find_lr(model, train_loader, optimizer, criterion)
```

TensorFlow:

```python
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt


def find_lr(model, train_dataset, start_lr=1e-7, end_lr=10, num_iter=100):
    """Perform a learning rate range test."""
    lrs, losses = [], []
    lr_mult = (end_lr / start_lr) ** (1 / num_iter)
    lr = start_lr
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    best_loss = float('inf')
    train_iter = iter(train_dataset)

    for i in range(num_iter):
        try:
            data, target = next(train_iter)
        except StopIteration:
            train_iter = iter(train_dataset)
            data, target = next(train_iter)

        with tf.GradientTape() as tape:
            output = model(data, training=True)
            loss = loss_fn(target, output)

        # Stop if loss explodes
        if loss.numpy() > 4 * best_loss or np.isnan(loss.numpy()):
            break
        if loss.numpy() < best_loss:
            best_loss = loss.numpy()

        lrs.append(lr)
        losses.append(loss.numpy())

        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # Update learning rate
        lr *= lr_mult
        optimizer.learning_rate.assign(lr)

    # Plot results
    plt.figure(figsize=(10, 6))
    plt.plot(lrs, losses)
    plt.xscale('log')
    plt.xlabel('Learning Rate')
    plt.ylabel('Loss')
    plt.title('Learning Rate Range Test')
    plt.grid(True)
    plt.savefig('lr_range_test.png')

    return lrs, losses


# Usage
# lrs, losses = find_lr(model, train_dataset)
```

How to interpret:
- Look for the steepest downward slope in the loss curve
- Pick a learning rate from the middle of that slope
- A good pick is usually about 10x smaller than the learning rate at which the loss starts to increase (or use a small helper like the one sketched below)
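If you prefer a programmatic suggestion over eyeballing the plot, here is a minimal sketch (a hypothetical `suggest_lr` helper, not part of any library) that assumes the `lrs` and `losses` lists returned by `find_lr` above and picks the learning rate at the steepest downward slope of a lightly smoothed loss curve:

```python
import numpy as np


def suggest_lr(lrs, losses, skip_start=5, skip_end=5):
    """Pick the LR at the steepest downward slope of the smoothed loss curve."""
    lrs = np.array(lrs[skip_start:-skip_end])
    losses = np.array(losses[skip_start:-skip_end])

    # Light smoothing to reduce batch-to-batch noise
    kernel = np.ones(5) / 5
    smoothed = np.convolve(losses, kernel, mode='same')

    # Slope of the loss with respect to log(lr); the most negative
    # value marks the steepest descent
    grads = np.gradient(smoothed, np.log10(lrs))
    return lrs[np.argmin(grads)]


# suggested = suggest_lr(lrs, losses)
# print(f"Suggested learning rate: {suggested:.2e}")
```

Treat the result as a starting point and sanity-check it against the plot.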
Learning Rate Schedules
1. Cosine Annealing (Recommended)
Smoothly decreases the learning rate following a cosine curve:
PyTorch:

```python
from torch.optim.lr_scheduler import CosineAnnealingLR

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
scheduler = CosineAnnealingLR(optimizer, T_max=100, eta_min=1e-6)

for epoch in range(100):
    train(model, train_loader, optimizer)
    scheduler.step()
```

TensorFlow:

```python
import numpy as np
import tensorflow as tf

# Create cosine decay schedule
lr_schedule = tf.keras.optimizers.schedules.CosineDecay(
    initial_learning_rate=1e-3,
    decay_steps=100 * steps_per_epoch,  # 100 epochs
    alpha=1e-6  # Minimum LR, as a fraction of the initial learning rate
)
optimizer = tf.keras.optimizers.AdamW(learning_rate=lr_schedule)

# Or use a callback for epoch-based scheduling
cosine_callback = tf.keras.callbacks.LearningRateScheduler(
    lambda epoch: 1e-3 * 0.5 * (1 + np.cos(np.pi * epoch / 100))
)
model.fit(train_dataset, epochs=100, callbacks=[cosine_callback])
```

Pros:
- Smooth decay prevents sudden performance drops
- Works well for most architectures
- Can help escape local minima when combined with warm restarts (see the sketch below)
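The warm-restart variant mentioned above is available in PyTorch as `CosineAnnealingWarmRestarts`; a minimal sketch, reusing the `model`, `train_loader`, and `train` helper assumed in the snippets above:

```python
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
# First cycle lasts 10 epochs; each later cycle is twice as long (T_mult=2)
scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=2, eta_min=1e-6)

for epoch in range(100):
    train(model, train_loader, optimizer)
    scheduler.step()  # advance the cosine cycle once per epoch
```

Each restart snaps the learning rate back to its maximum, which is what can kick the model out of a poor local minimum.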
2. One Cycle Policy
Increases then decreases the learning rate in one cycle:
PyTorch:

```python
from torch.optim.lr_scheduler import OneCycleLR

optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
scheduler = OneCycleLR(
    optimizer,
    max_lr=0.1,
    epochs=100,
    steps_per_epoch=len(train_loader)
)

for epoch in range(100):
    for batch in train_loader:
        train_step(batch)
        scheduler.step()  # Call after each batch!
```

TensorFlow:

```python
import tensorflow as tf

# Calculate total steps
total_steps = 100 * steps_per_epoch


# One cycle schedule: warmup -> peak -> decay
def one_cycle_schedule(step):
    if step < total_steps * 0.3:  # Warmup phase
        return 0.1 * (step / (total_steps * 0.3))
    else:  # Decay phase
        progress = (step - total_steps * 0.3) / (total_steps * 0.7)
        return 0.1 * (1 - progress)


# Apply it with a callback that sets the learning rate before every batch
# (or subclass tf.keras.optimizers.schedules.LearningRateSchedule, as in
# the warmup example later in this guide)
class OneCycleScheduler(tf.keras.callbacks.Callback):
    def __init__(self, max_lr, total_steps):
        super().__init__()
        self.max_lr = max_lr
        self.total_steps = total_steps
        self.step = 0

    def on_train_batch_begin(self, batch, logs=None):
        lr = one_cycle_schedule(self.step)
        self.model.optimizer.learning_rate.assign(lr)
        self.step += 1


optimizer = tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.9)
# model.fit(train_dataset, epochs=100,
#           callbacks=[OneCycleScheduler(max_lr=0.1, total_steps=total_steps)])
```

Best for:
- Fast convergence (often beats other schedules)
- Limited training time/budget
- When you know total training iterations upfront
3. Reduce on Plateau
Decreases the LR when metrics stop improving:
PyTorch:

```python
from torch.optim.lr_scheduler import ReduceLROnPlateau

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,   # Reduce by half
    patience=5,   # Wait 5 epochs
    min_lr=1e-6
)

for epoch in range(100):
    train(model, train_loader, optimizer)
    val_loss = validate(model, val_loader)
    scheduler.step(val_loss)  # Pass validation loss
```

TensorFlow:

```python
import tensorflow as tf

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

# Use the ReduceLROnPlateau callback
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,   # Reduce by half
    patience=5,   # Wait 5 epochs
    min_lr=1e-6,
    verbose=1
)

model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy')
model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=100,
    callbacks=[reduce_lr]
)
```

Best for:
- Unknown optimal training length
- When validation loss is your primary metric
- Conservative training approaches
Optimizer-Specific Tips
AdamW (Most Popular)
PyTorch:

```python
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-3,            # Good default
    weight_decay=0.01,  # Regularization
    betas=(0.9, 0.999)
)
```

TensorFlow:

```python
optimizer = tf.keras.optimizers.AdamW(
    learning_rate=1e-3,  # Good default
    weight_decay=0.01,   # Regularization
    beta_1=0.9,
    beta_2=0.999
)
```

Typical LR ranges:
- Transformers: 1e-4 to 5e-4
- CNNs: 1e-3 to 3e-3
- Small models: 1e-3 to 1e-2
SGD with Momentum
PyTorch:

```python
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=0.1,            # Usually 10-100x higher than Adam
    momentum=0.9,
    weight_decay=1e-4,
    nesterov=True      # Often helps
)
```

TensorFlow:

```python
optimizer = tf.keras.optimizers.SGD(
    learning_rate=0.1,  # Usually 10-100x higher than Adam
    momentum=0.9,
    nesterov=True       # Often helps
)

# Note: older Keras SGD versions have no weight_decay argument; add L2
# regularization via kernel_regularizer in layers (newer Keras optimizers
# accept weight_decay directly).
```

Typical LR ranges:
- ResNets: 0.1 (with decay)
- Small CNNs: 0.01 to 0.1
Warmup Strategy
Gradually increase the learning rate at the start of training:
PyTorch:

```python
import math


def get_lr_with_warmup(current_step, warmup_steps, max_lr, total_steps):
    """Calculate learning rate with linear warmup and cosine decay."""
    if current_step < warmup_steps:
        # Linear warmup
        return max_lr * current_step / warmup_steps
    else:
        # Cosine decay
        progress = (current_step - warmup_steps) / (total_steps - warmup_steps)
        return max_lr * 0.5 * (1 + math.cos(math.pi * progress))


# Usage in training loop
total_steps = 100000
for step in range(total_steps):
    lr = get_lr_with_warmup(step, warmup_steps=1000,
                            max_lr=1e-3, total_steps=total_steps)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
```

TensorFlow:

```python
import numpy as np
import tensorflow as tf


class WarmupCosineDecay(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, warmup_steps, max_lr, total_steps):
        super().__init__()
        self.warmup_steps = warmup_steps
        self.max_lr = max_lr
        self.total_steps = total_steps

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        warmup_steps = tf.cast(self.warmup_steps, tf.float32)
        total_steps = tf.cast(self.total_steps, tf.float32)

        # Linear warmup
        warmup_lr = self.max_lr * step / warmup_steps

        # Cosine decay
        progress = (step - warmup_steps) / (total_steps - warmup_steps)
        cosine_lr = self.max_lr * 0.5 * (1 + tf.cos(np.pi * progress))

        return tf.cond(step < warmup_steps, lambda: warmup_lr, lambda: cosine_lr)


# Usage
lr_schedule = WarmupCosineDecay(
    warmup_steps=1000,
    max_lr=1e-3,
    total_steps=100000
)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
```

Why warmup helps:
- Prevents early training instability
- Allows batch normalization statistics to stabilize
- Essential for large batch training
- Recommended warmup: 1-5% of total training steps (a scheduler-based version is sketched below)
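To avoid hand-setting the LR each step as in the PyTorch loop above, the same warmup-plus-cosine shape can be expressed as a multiplicative factor and handed to `torch.optim.lr_scheduler.LambdaLR`. A minimal sketch, assuming the optimizer's base LR is the desired peak LR; the `lr_factor` helper and the step counts are illustrative, not from any library:

```python
import math

import torch
from torch.optim.lr_scheduler import LambdaLR

total_steps = 100_000
warmup_steps = int(0.03 * total_steps)  # ~3% warmup, within the 1-5% guideline


def lr_factor(step):
    # LambdaLR multiplies the optimizer's base LR by this factor (0..1)
    if step < warmup_steps:
        return step / warmup_steps  # linear warmup
    progress = (step - warmup_steps) / (total_steps - warmup_steps)
    return 0.5 * (1 + math.cos(math.pi * progress))  # cosine decay toward 0


optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)  # base LR = peak LR
scheduler = LambdaLR(optimizer, lr_lambda=lr_factor)

# In the training loop, call scheduler.step() once after every optimizer.step()
```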
Common Issues & Solutions
Loss Goes to NaN
Check for:

1. Learning rate too high → reduce by 10x
2. Gradient explosion → add gradient clipping
3. Bad initialization → use proper init (Xavier/He)

```python
# Add gradient clipping (after loss.backward(), before optimizer.step())
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
```

Training Too Slow
Try:

1. Increase learning rate (use the LR range test)
2. Use AdamW instead of SGD
3. Add learning rate warmup
4. Try the OneCycleLR scheduler

Validation Loss Increases
Solutions:

1. Reduce learning rate
2. Add/increase weight decay
3. Use a learning rate decay schedule
4. Add dropout or other regularization

Real-World Example Configurations
Vision Transformers (ViT)
PyTorch:

```python
from torch.optim.lr_scheduler import CosineAnnealingLR

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-3,
    weight_decay=0.05,
    betas=(0.9, 0.999)
)
scheduler = CosineAnnealingLR(
    optimizer,
    T_max=epochs,
    eta_min=1e-6
)

# With warmup
warmup_epochs = 5
```

TensorFlow:

```python
import tensorflow as tf

lr_schedule = tf.keras.optimizers.schedules.CosineDecay(
    initial_learning_rate=1e-3,
    decay_steps=epochs * steps_per_epoch,
    alpha=1e-6
)
optimizer = tf.keras.optimizers.AdamW(
    learning_rate=lr_schedule,
    weight_decay=0.05,
    beta_1=0.9,
    beta_2=0.999
)

# With warmup: use the WarmupCosineDecay class from above
```

ResNet/CNN
PyTorch:

```python
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=0.1,
    momentum=0.9,
    weight_decay=1e-4
)

# Step decay every 30 epochs
scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=30,
    gamma=0.1
)
```

TensorFlow:

```python
optimizer = tf.keras.optimizers.SGD(
    learning_rate=0.1,
    momentum=0.9
)


# Step decay callback: divide the LR by 10 every 30 epochs
def lr_schedule(epoch):
    return 0.1 * (0.1 ** (epoch // 30))


lr_callback = tf.keras.callbacks.LearningRateScheduler(lr_schedule)

model.compile(optimizer=optimizer, loss='categorical_crossentropy')
model.fit(train_dataset, epochs=100, callbacks=[lr_callback])
```

Fine-tuning Pretrained Models
PyTorch:

```python
# Different learning rates for different parameter groups
optimizer = torch.optim.AdamW([
    {'params': model.backbone.parameters(), 'lr': 1e-5},  # Pretrained
    {'params': model.head.parameters(), 'lr': 1e-3}       # New layers
], weight_decay=0.01)
```

TensorFlow:

```python
# Keras layers have no per-layer learning rate, so use one optimizer per
# layer group and apply each to that group's variables in a custom loop
backbone_optimizer = tf.keras.optimizers.AdamW(learning_rate=1e-5, weight_decay=0.01)
head_optimizer = tf.keras.optimizers.AdamW(learning_rate=1e-3, weight_decay=0.01)

backbone_vars = [v for layer in model.layers[:-2] for v in layer.trainable_variables]
head_vars = [v for layer in model.layers[-2:] for v in layer.trainable_variables]
# In the training step, split the gradients accordingly and call
# backbone_optimizer.apply_gradients(...) and head_optimizer.apply_gradients(...)
```

Key Takeaways
- Always run a learning rate range test for new models/datasets
- Start with proven configurations for your architecture type
- Use warmup for the first 1-5% of training
- OneCycleLR often converges fastest
- AdamW is a safe default optimizer
- Monitor training curves - they tell you if LR is wrong
- When fine-tuning, use 10-100x lower learning rates