
Backup & Sync Scripts

Essential for protecting your work:

  • Dataset backups - Protect valuable preprocessed data
  • Model checkpoints - Save training progress
  • Code synchronization - Keep multiple machines in sync
  • Remote backups - Off-site redundancy

backup_dataset.sh - Simple dataset backup
#!/bin/bash
# Backup dataset to external drive or network storage

SOURCE="/data/datasets/imagenet"
DEST="/mnt/backup/datasets/imagenet"

# Create backup with progress
rsync -avh --progress "$SOURCE/" "$DEST/"

# With compression (slower but smaller)
rsync -avhz --progress "$SOURCE/" "$DEST/"

# Exclude certain files
rsync -avh --progress \
    --exclude='*.tmp' \
    --exclude='.cache' \
    "$SOURCE/" "$DEST/"
incremental_backup.sh - Only copy changes
#!/bin/bash
# Faster backups - only sync changes

SOURCE="/data/datasets"
DEST="/mnt/backup/datasets"
LOG="/var/log/dataset_backup.log"

# Incremental with deletion of removed files
rsync -avh --progress \
    --delete \
    --log-file="$LOG" \
    "$SOURCE/" "$DEST/"

echo "Backup completed: $(date)" >> "$LOG"
Create compressed archive
#!/bin/bash
# Create compressed backup archive

DATASET="/data/datasets/coco"
BACKUP_DIR="/mnt/backup"
DATE=$(date +%Y%m%d)

# Create compressed tar archive
tar -czf "$BACKUP_DIR/coco_${DATE}.tar.gz" \
    -C /data/datasets coco

# With progress (requires pv): measure the uncompressed tar stream, then compress
tar -cf - -C /data/datasets coco | \
    pv -s "$(du -sb /data/datasets/coco | awk '{print $1}')" | \
    gzip > "$BACKUP_DIR/coco_${DATE}.tar.gz"

backup_checkpoints.sh - Sync checkpoints to backup
#!/bin/bash
# Continuously backup training checkpoints

SOURCE="/home/user/experiments/model_v1/checkpoints"
DEST="/mnt/backup/checkpoints/model_v1"

# Watch and sync checkpoints as they're created
while true; do
    rsync -avh --progress \
        --include='*.pt' \
        --include='*.pth' \
        --include='*.ckpt' \
        --exclude='*' \
        "$SOURCE/" "$DEST/"

    # Check every 5 minutes
    sleep 300
done
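
Polling every five minutes is simple but can lag behind training; if inotify-tools is available, an event-driven variant (a sketch, same paths as above) syncs each checkpoint as soon as the file is fully written:

#!/bin/bash
# Event-driven checkpoint sync (assumes inotify-tools is installed)
SOURCE="/home/user/experiments/model_v1/checkpoints"
DEST="/mnt/backup/checkpoints/model_v1"

# close_write fires once a file opened for writing has been closed
inotifywait -m -e close_write --format '%f' "$SOURCE" | while read -r FILE; do
    case "$FILE" in
        *.pt|*.pth|*.ckpt)
            rsync -avh "$SOURCE/$FILE" "$DEST/"
            ;;
    esac
done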
cleanup_checkpoints.sh - Keep only N best checkpoints
#!/bin/bash
# Keep only the 5 most recent checkpoints to save space

CHECKPOINT_DIR="/home/user/experiments/checkpoints"
KEEP_COUNT=5

# Find and keep only N most recent .pt files
cd "$CHECKPOINT_DIR"
ls -t *.pt | tail -n +$((KEEP_COUNT + 1)) | xargs -r rm

echo "Kept $KEEP_COUNT most recent checkpoints"
sync_checkpoints.py - Smart checkpoint syncing
#!/usr/bin/env python3
"""
Sync only best checkpoints based on metrics
"""
import os
import shutil
import torch
from pathlib import Path

def sync_best_checkpoints(source_dir, dest_dir, metric='val_loss', keep_best=5):
    """
    Copy only the best N checkpoints based on metric
    """
    checkpoints = []

    # Find all checkpoints
    for ckpt_path in Path(source_dir).glob('*.pt'):
        try:
            # Load checkpoint metadata
            ckpt = torch.load(ckpt_path, map_location='cpu')
            if metric in ckpt:
                checkpoints.append((ckpt_path, ckpt[metric]))
        except Exception:
            # Skip unreadable checkpoints or ones without a metadata dict
            continue

    # Sort by metric (assuming lower is better for loss)
    checkpoints.sort(key=lambda x: x[1])

    # Copy best N checkpoints
    Path(dest_dir).mkdir(parents=True, exist_ok=True)

    for ckpt_path, metric_val in checkpoints[:keep_best]:
        dest_path = Path(dest_dir) / ckpt_path.name
        shutil.copy2(ckpt_path, dest_path)
        print(f"Copied {ckpt_path.name} ({metric}={metric_val:.4f})")

if __name__ == "__main__":
    sync_best_checkpoints(
        source_dir="/experiments/checkpoints",
        dest_dir="/backup/checkpoints",
        metric="val_loss",
        keep_best=5
    )
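
A natural way to run this is on a schedule alongside the other cron jobs below; the script path here is illustrative:

# Sync the best checkpoints once an hour (crontab entry)
0 * * * * /usr/bin/python3 /home/user/scripts/sync_checkpoints.py >> /var/log/ckpt_sync.log 2>&1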

sync_to_remote.sh - Sync to remote server
#!/bin/bash
# Sync local files to remote server via SSH

LOCAL_DIR="/home/user/datasets"
REMOTE_USER="username"
REMOTE_HOST="remote.server.com"
REMOTE_DIR="/data/datasets"

# Sync to remote
rsync -avhz --progress \
    -e "ssh -p 22" \
    "$LOCAL_DIR/" \
    "${REMOTE_USER}@${REMOTE_HOST}:${REMOTE_DIR}/"

# With bandwidth limit (10000 KB/s, roughly 10 MB/s) - useful on slow connections
rsync -avhz --progress \
    --bwlimit=10000 \
    -e "ssh -p 22" \
    "$LOCAL_DIR/" \
    "${REMOTE_USER}@${REMOTE_HOST}:${REMOTE_DIR}/"
backup_to_s3.sh - Backup to AWS S3
#!/bin/bash
# Sync datasets to AWS S3 for cloud backup

# Requires the AWS CLI (e.g. pip install awscli) and configured credentials (aws configure)

LOCAL_DIR="/data/datasets/imagenet"
S3_BUCKET="s3://my-ml-backups/datasets/imagenet"

# Sync to S3 (only upload new/changed files); GLACIER storage is cheaper for backups
aws s3 sync "$LOCAL_DIR" "$S3_BUCKET" \
    --storage-class GLACIER \
    --exclude "*.tmp" \
    --exclude ".cache/*"

# Download from S3 (note: GLACIER objects must be restored before they can be downloaded)
# aws s3 sync "$S3_BUCKET" "$LOCAL_DIR"
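
The CLI also has a --dryrun flag, useful before the first real upload (and for periodic checks) to show what would change without transferring anything:

# Preview the sync without uploading
aws s3 sync "$LOCAL_DIR" "$S3_BUCKET" --dryrun \
    --exclude "*.tmp" \
    --exclude ".cache/*"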

Set up automated backups with cron
# Edit crontab
crontab -e

# Add these lines for automated backups:

# Daily dataset backup at 2 AM
0 2 * * * /home/user/scripts/backup_dataset.sh >> /var/log/backup.log 2>&1

# Hourly checkpoint sync
0 * * * * /home/user/scripts/backup_checkpoints.sh >> /var/log/checkpoints.log 2>&1

# Weekly remote sync on Sundays at 3 AM
0 3 * * 0 /home/user/scripts/sync_to_remote.sh >> /var/log/remote_sync.log 2>&1
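
After saving the crontab, confirm the entries are installed and that the jobs actually write to their logs:

# List installed cron jobs for the current user
crontab -l

# Follow the most recent backup log output
tail -f /var/log/backup.log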
full_backup.sh - Comprehensive backup script
#!/bin/bash
# Complete backup solution with logging and notifications

# Configuration
DATASETS="/data/datasets"
CHECKPOINTS="/home/user/experiments"
CODE="/home/user/code"
BACKUP_ROOT="/mnt/backup"
DATE=$(date +%Y%m%d_%H%M%S)
LOG_FILE="/var/log/backup_${DATE}.log"

# Email for notifications (requires mail configured)
NOTIFY_EMAIL="[email protected]"

# Logging function
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
}

# Start backup
log "Starting backup process..."

# 1. Backup datasets
log "Backing up datasets..."
rsync -avh --progress \
    "$DATASETS/" "$BACKUP_ROOT/datasets/" \
    >> "$LOG_FILE" 2>&1

# 2. Backup checkpoints
log "Backing up checkpoints..."
rsync -avh --progress --prune-empty-dirs \
    --include='*/' \
    --include='*.pt' --include='*.pth' --include='*.ckpt' \
    --exclude='*' \
    "$CHECKPOINTS/" "$BACKUP_ROOT/checkpoints/" \
    >> "$LOG_FILE" 2>&1

# 3. Backup code
log "Backing up code..."
rsync -avh --progress \
    --exclude='.git' --exclude='__pycache__' \
    "$CODE/" "$BACKUP_ROOT/code/" \
    >> "$LOG_FILE" 2>&1

# 4. Create compressed archive of critical files
log "Creating compressed archive..."
tar -czf "$BACKUP_ROOT/archives/critical_${DATE}.tar.gz" \
    "$CODE" "$CHECKPOINTS" \
    >> "$LOG_FILE" 2>&1

# Check whether the archive step succeeded (rsync failures above are recorded in the log)
if [ $? -eq 0 ]; then
    log "Backup completed successfully!"
    echo "Backup successful: $DATE" | mail -s "Backup Success" "$NOTIFY_EMAIL"
else
    log "ERROR: Backup failed!"
    echo "Backup FAILED: $DATE. Check $LOG_FILE" | mail -s "Backup FAILED" "$NOTIFY_EMAIL"
    exit 1
fi

# Cleanup old backups (keep last 7 days)
log "Cleaning up old backups..."
find "$BACKUP_ROOT/archives" -name "critical_*.tar.gz" -mtime +7 -delete

log "Backup process complete."

dataset_snapshot.sh - Version datasets
#!/bin/bash
# Create versioned snapshots of datasets

DATASET="/data/datasets/my_dataset"
SNAPSHOT_DIR="/data/snapshots"
VERSION=$(date +%Y%m%d)

# Create hard-link snapshot (fast, space-efficient)
cp -al "$DATASET" "$SNAPSHOT_DIR/my_dataset_v${VERSION}"

echo "Created snapshot: my_dataset_v${VERSION}"
Git LFS - Version code and small data files
# For projects with code + small datasets
git lfs install

# Track large files
git lfs track "*.pth"
git lfs track "*.h5"
git lfs track "data/*.csv"

# Commit and push
git add .gitattributes
git commit -m "Setup Git LFS"
git push
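
If a clone ends up with LFS pointer files instead of the real content (for example when the smudge filter is skipped), these commands show what is tracked and fetch the actual data:

# Show which files are managed by Git LFS
git lfs ls-files

# Download and check out the real file contents
git lfs pull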

verify_backup.sh - Check backup integrity
#!/bin/bash
# Verify backup matches source

SOURCE="/data/datasets/imagenet"
BACKUP="/mnt/backup/datasets/imagenet"

# Dry-run comparison: any itemized change means the backup differs from the source
rsync -ain --omit-dir-times --delete "$SOURCE/" "$BACKUP/" > /tmp/backup_diff.txt

if [ -s /tmp/backup_diff.txt ]; then
    echo "WARNING: Backup differs from source!"
    cat /tmp/backup_diff.txt
    exit 1
else
    echo "Backup verified: matches source."
    exit 0
fi
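
The dry run above compares file size and modification time; for stronger assurance (at the cost of reading every file), a checksum pass can be added:

# Checksum-based comparison (slow on large datasets)
rsync -ainc --omit-dir-times --delete "$SOURCE/" "$BACKUP/" > /tmp/backup_checksum_diff.txt

[ -s /tmp/backup_checksum_diff.txt ] && echo "Checksum mismatch found" || echo "Checksums match"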
check_backup_age.sh - Alert if backup is old
#!/bin/bash
# Alert if last backup is older than N days

BACKUP_DIR="/mnt/backup/datasets"
MAX_AGE_DAYS=2

LAST_BACKUP=$(find "$BACKUP_DIR" -type f -printf '%T@ %p\n' | sort -n | tail -1 | cut -f2- -d" ")

if [ -z "$LAST_BACKUP" ]; then
    echo "WARNING: No backup files found in $BACKUP_DIR"
    exit 1
fi

BACKUP_AGE_DAYS=$(( ($(date +%s) - $(stat -c %Y "$LAST_BACKUP")) / 86400 ))

if [ $BACKUP_AGE_DAYS -gt $MAX_AGE_DAYS ]; then
    echo "WARNING: Last backup is $BACKUP_AGE_DAYS days old (threshold: $MAX_AGE_DAYS)"
    exit 1
else
    echo "Backup is up to date ($BACKUP_AGE_DAYS days old)"
    exit 0
fi

Backup Best Practices

  1. 3-2-1 Rule: 3 copies, 2 different media, 1 offsite
  2. Automate: Use cron for regular backups
  3. Verify: Test restoring from backups periodically (see the sketch after this list)
  4. Monitor: Set up alerts for backup failures
  5. Compress: Use compression for long-term storage
  6. Version: Keep multiple versions of critical data
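
As a concrete version of rule 3, the sketch below (illustrative paths) restores a single file from the backup into a scratch directory and checks it against the original:

#!/bin/bash
# Spot-check a restore: copy one backed-up file and compare it to the source
SOURCE="/data/datasets/imagenet"
BACKUP="/mnt/backup/datasets/imagenet"
SCRATCH=$(mktemp -d)

# Pick an arbitrary file from the backup
SAMPLE=$(find "$BACKUP" -type f | head -1)
REL_PATH="${SAMPLE#$BACKUP/}"

cp "$SAMPLE" "$SCRATCH/"
if cmp -s "$SCRATCH/$(basename "$SAMPLE")" "$SOURCE/$REL_PATH"; then
    echo "Restore test passed for $REL_PATH"
else
    echo "Restore test FAILED for $REL_PATH"
fi

rm -rf "$SCRATCH"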