# Training Best Practices

This guide covers best practices for training DREAM models effectively.
## Learning Rate

### Recommended Settings

```python
import torch.optim as optim

optimizer = optim.AdamW(model.parameters(), lr=1e-3)
```

### Learning Rate Scheduling
Use a scheduler to adjust the learning rate during training:

```python
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=5,
)

# Training loop
for epoch in range(num_epochs):
    train_loss = train_one_epoch()
    val_loss = validate()
    scheduler.step(val_loss)
```

### Learning Rate Warmup
For large models, use warmup:

```python
from torch.optim.lr_scheduler import LinearLR, ConstantLR, SequentialLR

# Ramp up over the first 1000 steps, then hold the base learning rate
warmup = LinearLR(optimizer, start_factor=0.1, end_factor=1.0, total_iters=1000)
main = ConstantLR(optimizer, factor=1.0)
scheduler = SequentialLR(optimizer, schedulers=[warmup, main], milestones=[1000])
```
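Warmup schedules like this count optimizer steps rather than epochs, so step the scheduler once per batch. A minimal sketch (`train_step` is a hypothetical helper that runs forward, backward, and `optimizer.step()`):

```python
for batch in train_loader:
    train_step(batch)   # hypothetical helper: forward, backward, optimizer.step()
    scheduler.step()    # advance the warmup schedule by one optimizer step
```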
## Gradient Clipping

Gradient clipping prevents gradient explosion in recurrent models:

```python
# Clip gradients by norm
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

# Or clip by value
torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=0.5)
```

Recommended: `max_norm=1.0` for most tasks.
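Clipping must happen after `loss.backward()` and before `optimizer.step()`. A minimal sketch of the ordering, reusing names from the earlier examples:

```python
for batch, target in dataloader:
    optimizer.zero_grad()
    output, state = model(batch)
    loss = criterion(output, target)
    loss.backward()
    # Clip after backward() so the accumulated gradients are rescaled,
    # and before step() so the optimizer sees the clipped values
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
```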
## State Management

### Truncated BPTT

For long sequences, detach the state to limit how far back gradients propagate:

```python
state = model.init_state(batch_size)

for chunk, chunk_target in chunks:  # chunk_target: targets for this chunk (assumed)
    output, state = model(chunk, state)
    loss = criterion(output, chunk_target)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    # Detach state for truncated BPTT: gradients stop at the chunk boundary
    state = state.detach()
```

### State Initialization
Initialize the state fresh for each epoch:

```python
for epoch in range(num_epochs):
    state = model.init_state(batch_size, device='cuda')
    for batch in dataloader:
        output, state = model(batch, state)
        # ...
```

## Initialization
### Default Initialization

DREAM uses Xavier initialization by default:

```python
model = DREAM(input_dim=64, hidden_dim=128)
# Already initialized
```

### Custom Initialization
For specific needs:

```python
def init_weights(m):
    if hasattr(m, 'weight') and m.weight is not None and m.weight.dim() > 1:
        torch.nn.init.xavier_uniform_(m.weight)
    if hasattr(m, 'bias') and m.bias is not None:
        torch.nn.init.zeros_(m.bias)

model.apply(init_weights)
```

## Monitoring
### Key Metrics to Track

```python
# During training
with torch.no_grad():
    print(f"Loss: {loss.item():.4f}")
    print(f"Average surprise: {state.avg_surprise.mean().item():.4f}")
    print(f"U norm: {state.U.norm().item():.4f}")
    print(f"Adaptive tau: {state.adaptive_tau.mean().item():.4f}")
```
### TensorBoard Logging

```python
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter()

for step, (batch, target) in enumerate(dataloader):
    output, state = model(batch)
    loss = criterion(output, target)
    writer.add_scalar('Loss/train', loss.item(), step)
    writer.add_scalar('Surprise', state.avg_surprise.mean().item(), step)
    writer.add_scalar('U_norm', state.U.norm().item(), step)
```

## Batch Size
### Guidelines

| Scenario | Batch Size |
|---|---|
| Small model, GPU | 64-128 |
| Large model, GPU | 16-32 |
| CPU training | 4-16 |
| Memory-constrained | 1-8 with gradient accumulation (see the sketch below) |
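If you are memory-constrained but want a fixed effective batch size, one heuristic (purely illustrative; the 256 MB-per-sample figure and the target of 64 are assumptions) is to derive the per-step batch size from free GPU memory and make up the difference with accumulation:

```python
import torch

free_bytes, total_bytes = torch.cuda.mem_get_info()  # free/total memory on the current device

# Hypothetical heuristic: one sample per ~256 MB of free memory, capped to a sane range
batch_size = max(1, min(128, int(free_bytes / (256 * 1024**2))))
effective_batch = 64  # assumed target
accumulation_steps = max(1, effective_batch // batch_size)
print(f"batch_size={batch_size}, accumulation_steps={accumulation_steps}")
```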
### Gradient Accumulation

For small GPUs:

```python
accumulation_steps = 4

optimizer.zero_grad()
for i, (batch, target) in enumerate(dataloader):
    output, state = model(batch)
    loss = criterion(output, target) / accumulation_steps
    loss.backward()

    if (i + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
```

## Mixed Precision Training
Use mixed precision for faster training on modern GPUs:

```python
from torch.cuda.amp import autocast, GradScaler

scaler = GradScaler()

for batch, target in dataloader:
    optimizer.zero_grad()

    with autocast():
        output, state = model(batch)
        loss = criterion(output, target)

    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
```

Benefits:

- 2-3x faster on Volta/Ampere GPUs
- Reduced memory usage
- Larger batch sizes possible
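On Ampere and newer GPUs, `bfloat16` autocast is an alternative worth considering: its float32-sized exponent range usually makes loss scaling, and therefore `GradScaler`, unnecessary. A minimal sketch:

```python
import torch

for batch, target in dataloader:
    optimizer.zero_grad()
    # bfloat16 has the same exponent range as float32, so no GradScaler is needed
    with torch.autocast(device_type='cuda', dtype=torch.bfloat16):
        output, state = model(batch)
        loss = criterion(output, target)
    loss.backward()
    optimizer.step()
```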
## Regularization

### Dropout

Add dropout between layers:

```python
import torch.nn as nn

class DREAMWithDropout(nn.Module):
    def __init__(self, input_dim, hidden_dim, dropout=0.1):
        super().__init__()
        self.dream = DREAM(input_dim, hidden_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        output, state = self.dream(x)
        return self.dropout(output), state
```

### Weight Decay
Use AdamW with weight decay:

```python
optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
```
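A common refinement (a general PyTorch practice, not something DREAM requires) is to exclude biases and other one-dimensional parameters from weight decay using parameter groups:

```python
decay, no_decay = [], []
for name, param in model.named_parameters():
    # Biases and norm scales are 1-D; decaying them rarely helps
    (no_decay if param.dim() <= 1 else decay).append(param)

optimizer = optim.AdamW([
    {'params': decay, 'weight_decay': 1e-4},
    {'params': no_decay, 'weight_decay': 0.0},
], lr=1e-3)
```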
### Early Stopping

```python
best_val_loss = float('inf')
patience = 10
patience_counter = 0

for epoch in range(num_epochs):
    train_loss = train_one_epoch()
    val_loss = validate()

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        torch.save(model.state_dict(), 'best_model.pt')
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch}")
            break
```
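After stopping, reload the best checkpoint saved above rather than keeping the final, over-trained weights:

```python
model.load_state_dict(torch.load('best_model.pt'))
```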
## Common Issues and Solutions

### Issue: Loss is NaN

Causes:

- Learning rate too high
- Time step too large
- Unstable initialization

Solutions:

```python
# Reduce learning rate
optimizer = optim.AdamW(model.parameters(), lr=1e-4)

# Smaller time step
config = DREAMConfig(time_step=0.05)

# Larger time constant
config = DREAMConfig(ltc_tau_sys=15.0)

# Smaller weights
config = DREAMConfig(target_norm=1.5)
```

### Issue: Model Doesn't Learn
Check:

```python
# 1. Gradients are flowing
for name, param in model.named_parameters():
    if param.grad is None:
        print(f"No gradient for {name}")

# 2. Surprise is non-zero
print(f"Surprise: {state.avg_surprise.mean().item()}")

# 3. Input is normalized
print(f"Input mean: {x.mean().item()}, std: {x.std().item()}")
```

Solutions:

```python
# Increase plasticity
config = DREAMConfig(base_plasticity=0.2)

# Reduce threshold
config = DREAMConfig(base_threshold=0.3)

# Enable LTC
config = DREAMConfig(ltc_enabled=True)
```

### Issue: CUDA Out of Memory
Solutions:

```python
# 1. Reduce batch size
batch_size = 16  # instead of 32

# 2. Reduce hidden dimension
config = DREAMConfig(hidden_dim=128)  # instead of 256

# 3. Reduce rank
config = DREAMConfig(rank=8)  # instead of 16

# 4. Use gradient accumulation
accumulation_steps = 4

# 5. Use mixed precision
scaler = GradScaler()
```

### Issue: Training is Slow
Optimizations:

```python
# 1. Use GPU
model = model.to('cuda')

# 2. Reduce rank
config = DREAMConfig(rank=8)

# 3. Disable LTC if not needed
config = DREAMConfig(ltc_enabled=False)

# 4. Use mixed precision
with autocast():
    output, state = model(x)

# 5. Use larger time step (if stable)
config = DREAMConfig(time_step=0.15)
```

## Complete Training Example
```python
import torch
import torch.nn as nn
from torch.utils.data import DataLoader  # train_loader / val_loader assumed built elsewhere
from torch.cuda.amp import autocast, GradScaler

from dream import DREAM

# Model
model = DREAM(input_dim=39, hidden_dim=256, rank=16).to('cuda')
classifier = nn.Linear(256, 10).to('cuda')

# Optimizer
optimizer = torch.optim.AdamW(
    list(model.parameters()) + list(classifier.parameters()),
    lr=1e-3,
    weight_decay=1e-4
)

# Scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=5
)

# Loss
criterion = nn.CrossEntropyLoss()

# Scaler for mixed precision
scaler = GradScaler()

# Training loop
best_val_acc = 0
for epoch in range(50):
    model.train()
    total_loss = 0

    for batch_x, batch_y in train_loader:
        batch_x = batch_x.to('cuda')
        batch_y = batch_y.to('cuda')

        optimizer.zero_grad()

        with autocast():
            _, final_state = model(batch_x, return_sequences=False)
            logits = classifier(final_state.h)
            loss = criterion(logits, batch_y)

        scaler.scale(loss).backward()

        # Gradient clipping (unscale first so the norm is computed on real gradients)
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(
            list(model.parameters()) + list(classifier.parameters()),
            max_norm=1.0
        )

        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

    # Validation
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch_x, batch_y in val_loader:
            batch_x = batch_x.to('cuda')
            _, final_state = model(batch_x, return_sequences=False)
            logits = classifier(final_state.h)
            predictions = logits.argmax(dim=-1)
            correct += (predictions == batch_y.to('cuda')).sum().item()
            total += batch_y.size(0)

    val_acc = correct / total
    avg_loss = total_loss / len(train_loader)

    print(f"Epoch {epoch}: Loss={avg_loss:.4f}, Val Acc={val_acc:.4f}")
    scheduler.step(avg_loss)

    # Save best model
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save({
            'model': model.state_dict(),
            'classifier': classifier.state_dict(),
            'val_acc': val_acc,
        }, 'best_model.pt')
```

## Next Steps
- Configuration Guide - Tune parameters
- Examples - Real-world examples
- Troubleshooting - Common issues