
Training Best Practices

This guide covers best practices for training DREAM models effectively.

Learning Rate

Start with AdamW and a learning rate of 1e-3, then tune it against validation loss:

import torch.optim as optim

optimizer = optim.AdamW(model.parameters(), lr=1e-3)

Learning Rate Scheduling

Use a scheduler to adjust learning rate during training:

scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, 
    mode='min', 
    factor=0.5, 
    patience=5,
    verbose=True
)

# Training loop
for epoch in range(num_epochs):
    train_loss = train_one_epoch()
    val_loss = validate()
    
    scheduler.step(val_loss)

Learning Rate Warmup

For large models, use warmup:

from torch.optim.lr_scheduler import LinearLR, CosineAnnealingLR, SequentialLR

# total_steps = total number of optimizer steps planned for the run
warmup = LinearLR(optimizer, start_factor=0.1, end_factor=1.0, total_iters=1000)
decay = CosineAnnealingLR(optimizer, T_max=total_steps - 1000)
scheduler = SequentialLR(optimizer, schedulers=[warmup, decay], milestones=[1000])

SequentialLR expects one scheduler per phase, so the warmup hands off to a main schedule (cosine decay here) at step 1000. Call scheduler.step() once per optimizer step, not once per epoch, so the warmup covers the first 1000 iterations.

Gradient Clipping

Gradient clipping prevents exploding gradients, which are common in recurrent models:

# Clip gradients
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

# Or clip by value
torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=0.5)

Recommended: max_norm=1.0 for most tasks.
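
Clipping belongs after the backward pass and before the optimizer step. A minimal sketch of the ordering inside the training loop:

loss.backward()

# Gradients now exist; clip them before the optimizer consumes them
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

optimizer.step()
optimizer.zero_grad()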


State Management

Truncated BPTT

For long sequences, detach state to limit backpropagation:

state = model.init_state(batch_size)

for chunk in chunks:
    output, state = model(chunk, state)
    
    # Detach state for truncated BPTT
    state = state.detach()
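
If your state is a plain tuple of tensors rather than an object that exposes detach() (an assumption about your setup, not part of the DREAM API shown here), a small helper keeps the same pattern:

def detach_state(state):
    # Stop gradients at the chunk boundary for every tensor in the state
    return tuple(s.detach() for s in state)

state = detach_state(state)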

State Initialization

Initialize state fresh for each epoch:

for epoch in range(num_epochs):
    state = model.init_state(batch_size, device='cuda')
    
    for batch in dataloader:
        output, state = model(batch, state)
        # ...

Initialization

Default Initialization

DREAM uses Xavier initialization by default:

model = DREAM(input_dim=64, hidden_dim=128)
# Already initialized

Custom Initialization

For specific needs:

def init_weights(m):
    if hasattr(m, 'weight') and m.weight is not None and m.weight.dim() > 1:
        torch.nn.init.xavier_uniform_(m.weight)
    if hasattr(m, 'bias') and m.bias is not None:
        torch.nn.init.zeros_(m.bias)

model.apply(init_weights)

Monitoring

Key Metrics to Track

# During training
with torch.no_grad():
    print(f"Loss: {loss.item():.4f}")
    print(f"Average surprise: {state.avg_surprise.mean().item():.4f}")
    print(f"U norm: {state.U.norm().item():.4f}")
    print(f"Adaptive tau: {state.adaptive_tau.mean().item():.4f}")

TensorBoard Logging

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter()

for step, (inputs, target) in enumerate(dataloader):
    output, state = model(inputs)
    loss = criterion(output, target)
    
    writer.add_scalar('Loss/train', loss.item(), step)
    writer.add_scalar('Surprise', state.avg_surprise.mean().item(), step)
    writer.add_scalar('U_norm', state.U.norm().item(), step)

Batch Size

Guidelines

Scenario                  Batch Size
Small model, GPU          64-128
Large model, GPU          16-32
CPU training              4-16
Memory-constrained        1-8 (with gradient accumulation)

Gradient Accumulation

On GPUs with limited memory, accumulate gradients over several batches to simulate a larger batch size:

accumulation_steps = 4
optimizer.zero_grad()

for i, (inputs, target) in enumerate(dataloader):
    output, state = model(inputs)
    loss = criterion(output, target) / accumulation_steps
    
    loss.backward()
    
    if (i + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()

Mixed Precision Training

Use mixed precision for faster training on modern GPUs:

from torch.cuda.amp import autocast, GradScaler

scaler = GradScaler()

for inputs, target in dataloader:
    optimizer.zero_grad()
    
    with autocast():
        output, state = model(inputs)
        loss = criterion(output, target)
    
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()

Benefits:

  • 2-3x faster on Volta/Ampere GPUs
  • Reduced memory usage
  • Larger batch sizes possible

Regularization

Dropout

Add dropout between layers:

class DREAMWithDropout(nn.Module):
    def __init__(self, input_dim, hidden_dim, dropout=0.1):
        super().__init__()
        self.dream = DREAM(input_dim, hidden_dim)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        output, state = self.dream(x)
        return self.dropout(output), state

Weight Decay

Use AdamW with weight decay:

optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
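
Biases and other 1-D parameters are often excluded from weight decay. A sketch using parameter groups (the p.dim() == 1 rule is a common heuristic, not something DREAM requires):

decay, no_decay = [], []
for p in model.parameters():
    # Heuristic: 1-D tensors (biases, gains) skip weight decay
    (no_decay if p.dim() == 1 else decay).append(p)

optimizer = optim.AdamW([
    {'params': decay, 'weight_decay': 1e-4},
    {'params': no_decay, 'weight_decay': 0.0},
], lr=1e-3)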

Early Stopping

best_val_loss = float('inf')
patience = 10
patience_counter = 0

for epoch in range(num_epochs):
    train_loss = train_one_epoch()
    val_loss = validate()
    
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        torch.save(model.state_dict(), 'best_model.pt')
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch}")
            break
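
When training stops, restore the best checkpoint before evaluating:

model.load_state_dict(torch.load('best_model.pt'))
model.eval()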

Common Issues and Solutions

Issue: Loss is NaN

Causes:

  • Learning rate too high
  • Time step too large
  • Unstable initialization

Solutions:

# Reduce learning rate
optimizer = optim.AdamW(model.parameters(), lr=1e-4)

# Smaller time step
config = DREAMConfig(time_step=0.05)

# Larger time constant
config = DREAMConfig(ltc_tau_sys=15.0)

# Smaller weights
config = DREAMConfig(target_norm=1.5)

Issue: Model Doesn't Learn

Check:

# 1. Gradients are flowing
for name, param in model.named_parameters():
    if param.grad is None:
        print(f"No gradient for {name}")

# 2. Surprise is non-zero
print(f"Surprise: {state.avg_surprise.mean().item()}")

# 3. Input is normalized
print(f"Input mean: {x.mean().item()}, std: {x.std().item()}")

Solutions:

# Increase plasticity
config = DREAMConfig(base_plasticity=0.2)

# Reduce threshold
config = DREAMConfig(base_threshold=0.3)

# Enable LTC
config = DREAMConfig(ltc_enabled=True)

Issue: CUDA Out of Memory

Solutions:

# 1. Reduce batch size
batch_size = 16  # instead of 32

# 2. Reduce hidden dimension
config = DREAMConfig(hidden_dim=128)  # instead of 256

# 3. Reduce rank
config = DREAMConfig(rank=8)  # instead of 16

# 4. Use gradient accumulation
accumulation_steps = 4

# 5. Use mixed precision
scaler = GradScaler()
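
Gradient accumulation and mixed precision combine cleanly. A sketch of the two together:

scaler = GradScaler()
accumulation_steps = 4
optimizer.zero_grad()

for i, (inputs, target) in enumerate(dataloader):
    with autocast():
        output, state = model(inputs)
        # Scale the loss so accumulated gradients match a full-batch update
        loss = criterion(output, target) / accumulation_steps
    
    scaler.scale(loss).backward()
    
    if (i + 1) % accumulation_steps == 0:
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()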

Issue: Training is Slow

Optimizations:

# 1. Use GPU
model = model.to('cuda')

# 2. Reduce rank
config = DREAMConfig(rank=8)

# 3. Disable LTC if not needed
config = DREAMConfig(ltc_enabled=False)

# 4. Use mixed precision
with autocast():
    output, state = model(x)

# 5. Use larger time step (if stable)
config = DREAMConfig(time_step=0.15)

Complete Training Example

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast, GradScaler
from dream import DREAM

# Model
model = DREAM(input_dim=39, hidden_dim=256, rank=16).to('cuda')
classifier = nn.Linear(256, 10).to('cuda')

# Optimizer
optimizer = torch.optim.AdamW(
    list(model.parameters()) + list(classifier.parameters()),
    lr=1e-3,
    weight_decay=1e-4
)

# Scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=5
)

# Loss
criterion = nn.CrossEntropyLoss()

# Scaler for mixed precision
scaler = GradScaler()

# Training loop
best_val_acc = 0
for epoch in range(50):
    model.train()
    total_loss = 0
    
    for batch_x, batch_y in train_loader:
        batch_x = batch_x.to('cuda')
        batch_y = batch_y.to('cuda')
        
        optimizer.zero_grad()
        
        with autocast():
            _, final_state = model(batch_x, return_sequences=False)
            logits = classifier(final_state.h)
            loss = criterion(logits, batch_y)
        
        scaler.scale(loss).backward()
        
        # Gradient clipping
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(
            list(model.parameters()) + list(classifier.parameters()),
            max_norm=1.0
        )
        
        scaler.step(optimizer)
        scaler.update()
        
        total_loss += loss.item()
    
    # Validation
    model.eval()
    correct = 0
    total = 0
    
    with torch.no_grad():
        for batch_x, batch_y in val_loader:
            batch_x = batch_x.to('cuda')
            _, final_state = model(batch_x, return_sequences=False)
            logits = classifier(final_state.h)
            predictions = logits.argmax(dim=-1)
            correct += (predictions == batch_y.to('cuda')).sum().item()
            total += batch_y.size(0)
    
    val_acc = correct / total
    avg_loss = total_loss / len(train_loader)
    
    print(f"Epoch {epoch}: Loss={avg_loss:.4f}, Val Acc={val_acc:.4f}")
    
    scheduler.step(avg_loss)
    
    # Save best model
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save({
            'model': model.state_dict(),
            'classifier': classifier.state_dict(),
            'val_acc': val_acc,
        }, 'best_model.pt')
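
To run inference with the saved checkpoint (a sketch reusing the classes above; new_x stands in for your own input batch):

checkpoint = torch.load('best_model.pt')
model.load_state_dict(checkpoint['model'])
classifier.load_state_dict(checkpoint['classifier'])
model.eval()
classifier.eval()

with torch.no_grad():
    _, final_state = model(new_x.to('cuda'), return_sequences=False)
    predictions = classifier(final_state.h).argmax(dim=-1)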
