GPU Monitoring Scripts

Effective GPU monitoring helps you:

  • Track GPU utilization and memory usage
  • Detect performance bottlenecks
  • Monitor temperature and prevent thermal throttling
  • Log metrics for analysis
  • Set up alerts for issues

# Basic GPU status
nvidia-smi

# Simplified output
nvidia-smi --query-gpu=index,name,temperature.gpu,utilization.gpu,memory.used,memory.total --format=csv

# Watch mode (updates every 2 seconds)
watch -n 2 nvidia-smi
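
For scripting, the same CSV query can be read from Python with only the standard library. A minimal sketch (the query string mirrors the one above):

#!/usr/bin/env python3
# Parse nvidia-smi's CSV query output without extra dependencies
import subprocess

QUERY = "index,name,temperature.gpu,utilization.gpu,memory.used,memory.total"

def query_gpus():
    """Return one dict per GPU, parsed from nvidia-smi's CSV output."""
    out = subprocess.run(
        ["nvidia-smi", f"--query-gpu={QUERY}", "--format=csv,noheader,nounits"],
        capture_output=True, text=True, check=True,
    ).stdout
    gpus = []
    for line in out.strip().splitlines():
        idx, name, temp, util, mem_used, mem_total = [f.strip() for f in line.split(",")]
        gpus.append({
            "index": int(idx),
            "name": name,
            "temp_c": int(temp),
            "util_pct": int(util),
            "mem_used_mb": int(mem_used),
            "mem_total_mb": int(mem_total),
        })
    return gpus

if __name__ == "__main__":
    for gpu in query_gpus():
        print(gpu)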

gpu_logger.sh - Log GPU metrics to file
#!/bin/bash
# Log GPU metrics continuously

LOG_FILE="gpu_metrics_$(date +%Y%m%d_%H%M%S).csv"
INTERVAL=5  # seconds

# Create header
echo "timestamp,gpu_id,gpu_name,temp_c,utilization_%,memory_used_mb,memory_total_mb,power_w" > "$LOG_FILE"

# Log continuously
while true; do
    nvidia-smi --query-gpu=timestamp,index,name,temperature.gpu,utilization.gpu,memory.used,memory.total,power.draw \
        --format=csv,noheader,nounits >> "$LOG_FILE"
    sleep $INTERVAL
done

Usage:

chmod +x gpu_logger.sh
./gpu_logger.sh &  # Run in background
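
Once the log has accumulated samples, it can be summarized offline. A minimal sketch, assuming the column layout written by the header above (pass the actual log file name as the first argument):

#!/usr/bin/env python3
# Summarize a CSV produced by gpu_logger.sh
import csv
import sys
from collections import defaultdict

def summarize(path):
    """Print average utilization and peak temperature per GPU."""
    temps = defaultdict(list)
    utils = defaultdict(list)
    with open(path, newline="") as f:
        for row in csv.DictReader(f, skipinitialspace=True):
            gpu = row["gpu_id"]
            temps[gpu].append(float(row["temp_c"]))
            utils[gpu].append(float(row["utilization_%"]))
    for gpu in sorted(temps):
        print(f"GPU {gpu}: avg util {sum(utils[gpu]) / len(utils[gpu]):.1f}%, "
              f"max temp {max(temps[gpu]):.0f}°C ({len(temps[gpu])} samples)")

if __name__ == "__main__":
    # Usage: python3 summarize_gpu_log.py gpu_metrics_YYYYMMDD_HHMMSS.csv
    summarize(sys.argv[1])
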
gpu_dashboard.sh - Real-time terminal dashboard
#!/bin/bash
# Real-time GPU monitoring dashboard

while true; do
    clear
    echo "=== GPU Monitoring Dashboard ==="
    echo "Time: $(date '+%Y-%m-%d %H:%M:%S')"
    echo ""

    # GPU utilization
    nvidia-smi --query-gpu=index,name,utilization.gpu,memory.used,memory.total,temperature.gpu,power.draw \
        --format=csv,noheader | \
    while IFS=',' read -r idx name util mem_used mem_total temp power; do
        echo "GPU $idx: $name"
        echo "  Utilization: $util"
        echo "  Memory: $mem_used / $mem_total"
        echo "  Temperature: $temp"
        echo "  Power: $power"
        echo ""
    done

    # Running processes
    echo "=== Active Processes ==="
    nvidia-smi pmon -c 1 | grep -v "#"

    sleep 2
done

gpu_monitor.py - Python monitoring
#!/usr/bin/env python3
import pynvml
import time
from datetime import datetime

# Initialize NVIDIA Management Library
pynvml.nvmlInit()

def monitor_gpus(interval=2):
    """Monitor all GPUs continuously"""
    device_count = pynvml.nvmlDeviceGetCount()

    try:
        while True:
            print(f"\n{'='*60}")
            print(f"Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
            print(f"{'='*60}")

            for i in range(device_count):
                handle = pynvml.nvmlDeviceGetHandleByIndex(i)

                # Get device info (older pynvml builds return bytes here)
                name = pynvml.nvmlDeviceGetName(handle)
                if isinstance(name, bytes):
                    name = name.decode()

                # Get utilization
                util = pynvml.nvmlDeviceGetUtilizationRates(handle)

                # Get memory info
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)

                # Get temperature
                temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)

                # Get power
                power = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0  # Convert to W

                print(f"\nGPU {i}: {name}")
                print(f"  Utilization: {util.gpu}%")
                print(f"  Memory: {mem.used / 1024**3:.2f} GB / {mem.total / 1024**3:.2f} GB ({mem.used/mem.total*100:.1f}%)")
                print(f"  Temperature: {temp}°C")
                print(f"  Power: {power:.2f} W")

            time.sleep(interval)

    except KeyboardInterrupt:
        print("\nMonitoring stopped.")
    finally:
        pynvml.nvmlShutdown()

if __name__ == "__main__":
    # Install: pip install nvidia-ml-py (or the legacy nvidia-ml-py3)
    monitor_gpus(interval=2)
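
nvidia-smi pmon in the dashboard script lists per-process usage; the same information is available from Python through NVML. A minimal sketch that prints PID and memory per GPU (resolving PIDs to process names would need something like psutil):

import pynvml

pynvml.nvmlInit()
try:
    for i in range(pynvml.nvmlDeviceGetCount()):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
        print(f"GPU {i}: {len(procs)} compute process(es)")
        for p in procs:
            # usedGpuMemory can be None when the driver does not report it
            mem_mb = p.usedGpuMemory / 1024**2 if p.usedGpuMemory else 0
            print(f"  PID {p.pid}: {mem_mb:.0f} MB")
finally:
    pynvml.nvmlShutdown()
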
gpu_alert.py - Email/Slack alerts
#!/usr/bin/env python3
import pynvml
import time
import smtplib
from email.mime.text import MIMEText

pynvml.nvmlInit()

# Alert thresholds
TEMP_THRESHOLD = 80  # °C
MEMORY_THRESHOLD = 90  # %
CHECK_INTERVAL = 60  # seconds

def send_alert(message):
    """Send email alert (configure SMTP settings)"""
    sender = "gpu-monitor@example.com"
    receiver = "admin@example.com"

    msg = MIMEText(message)
    msg['Subject'] = 'GPU Alert'
    msg['From'] = sender
    msg['To'] = receiver

    # Configure your SMTP server
    # smtp = smtplib.SMTP('smtp.gmail.com', 587)
    # smtp.starttls()
    # smtp.login(sender, 'your_password')
    # smtp.send_message(msg)
    # smtp.quit()

    print(f"ALERT: {message}")

def check_gpus():
    """Check GPU health and send alerts"""
    device_count = pynvml.nvmlDeviceGetCount()

    for i in range(device_count):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        name = pynvml.nvmlDeviceGetName(handle)
        if isinstance(name, bytes):
            name = name.decode()

        # Check temperature
        temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
        if temp >= TEMP_THRESHOLD:
            send_alert(f"GPU {i} ({name}) temperature {temp}°C exceeds threshold {TEMP_THRESHOLD}°C")

        # Check memory
        mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
        mem_percent = (mem.used / mem.total) * 100
        if mem_percent >= MEMORY_THRESHOLD:
            send_alert(f"GPU {i} ({name}) memory usage {mem_percent:.1f}% exceeds threshold {MEMORY_THRESHOLD}%")

if __name__ == "__main__":
    try:
        while True:
            check_gpus()
            time.sleep(CHECK_INTERVAL)
    except KeyboardInterrupt:
        print("\nAlert monitoring stopped.")
    finally:
        pynvml.nvmlShutdown()
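
The script prints alerts and shows a commented-out email path; for the Slack half, a small helper can post the same message to a Slack incoming webhook. A minimal sketch (the webhook URL is a placeholder; create one in your Slack workspace and drop it in):

import json
import urllib.request

SLACK_WEBHOOK_URL = "https://hooks.slack.com/services/XXX/YYY/ZZZ"  # placeholder

def send_slack_alert(message):
    """POST the alert text to a Slack incoming webhook."""
    payload = json.dumps({"text": f"GPU Alert: {message}"}).encode("utf-8")
    req = urllib.request.Request(
        SLACK_WEBHOOK_URL,
        data=payload,
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req, timeout=10) as resp:
        resp.read()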

nvtop - Interactive GPU monitor
# Install nvtop (like htop for GPUs)
sudo apt install nvtop

# Run
nvtop

Features:

  • Real-time GPU usage graphs
  • Process monitoring
  • Multi-GPU support
  • Color-coded interface

gpustat - Quick GPU status
# Install
pip install gpustat

# Basic usage
gpustat

# Watch mode
gpustat --watch

# With color
gpustat --color

# JSON output for scripting
gpustat --json
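
The JSON output is handy for your own scripts. A minimal sketch that shells out to gpustat and pulls a few fields (the field names match current gpustat output; run gpustat --json once to confirm them for your version):

import json
import subprocess

def gpustat_json():
    """Return gpustat's JSON report as a Python dict."""
    out = subprocess.run(["gpustat", "--json"],
                         capture_output=True, text=True, check=True).stdout
    return json.loads(out)

if __name__ == "__main__":
    for gpu in gpustat_json()["gpus"]:
        print(f"GPU {gpu['index']} ({gpu['name']}): "
              f"{gpu['utilization.gpu']}% util, "
              f"{gpu['memory.used']}/{gpu['memory.total']} MiB")
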
DCGM exporter for Prometheus
# Run NVIDIA's DCGM exporter (listens on port 9400 by default)
docker run -d --gpus all \
  -p 9400:9400 \
  nvidia/dcgm-exporter:latest

# Access metrics at http://localhost:9400/metrics
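
To sanity-check the exporter before wiring up Prometheus, you can scrape the endpoint directly. A minimal sketch (port 9400 matches the run command above; the DCGM_FI_DEV_* names are the exporter's default metrics and may vary with its configuration):

import urllib.request

METRICS_URL = "http://localhost:9400/metrics"
INTERESTING = ("DCGM_FI_DEV_GPU_UTIL", "DCGM_FI_DEV_GPU_TEMP", "DCGM_FI_DEV_FB_USED")

with urllib.request.urlopen(METRICS_URL, timeout=10) as resp:
    for line in resp.read().decode("utf-8").splitlines():
        # Print only the utilization, temperature, and memory samples
        if line.startswith(INTERESTING):
            print(line)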

Then configure Grafana to visualize:

  • GPU utilization over time
  • Memory usage trends
  • Temperature monitoring
  • Power consumption

Monitor during PyTorch training
import torch
import pynvml

pynvml.nvmlInit()

def log_gpu_usage(epoch, batch_idx):
    """Log GPU usage during training"""
    if batch_idx % 100 == 0:  # Log every 100 batches
        for i in range(torch.cuda.device_count()):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            util = pynvml.nvmlDeviceGetUtilizationRates(handle)

            print(f"Epoch {epoch}, Batch {batch_idx}, GPU {i}: "
                  f"Util {util.gpu}%, Mem {mem.used/1024**3:.1f}GB/{mem.total/1024**3:.1f}GB")

# In training loop
for epoch in range(num_epochs):
    for batch_idx, (data, target) in enumerate(train_loader):
        # Your training code
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        # Log GPU usage
        log_gpu_usage(epoch, batch_idx)
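
NVML reports device-wide memory, which includes PyTorch's cached-but-unused pool; PyTorch's own allocator counters separate the two. A minimal sketch using the built-in torch.cuda statistics:

import torch

def log_torch_memory(device=0):
    """Log PyTorch's allocator statistics for one device."""
    allocated = torch.cuda.memory_allocated(device) / 1024**3   # tensors currently in use
    reserved = torch.cuda.memory_reserved(device) / 1024**3     # pool held by the caching allocator
    peak = torch.cuda.max_memory_allocated(device) / 1024**3    # high-water mark since last reset
    print(f"GPU {device}: allocated {allocated:.2f} GB, "
          f"reserved {reserved:.2f} GB, peak {peak:.2f} GB")

# Call torch.cuda.reset_peak_memory_stats(device) at the start of each epoch
# if you want per-epoch peaks.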

check_throttling.sh
#!/bin/bash
# Check if GPUs are throttling due to temperature or power

nvidia-smi --query-gpu=index,name,clocks_throttle_reasons.active,temperature.gpu,power.draw \
    --format=csv

# Reason codes (bitmask; multiple reasons may be active at once):
# 0x0000000000000000 - Not throttled
# 0x0000000000000001 - GPU idle
# 0x0000000000000002 - Applications clocks setting
# 0x0000000000000004 - SW power cap
# 0x0000000000000008 - HW slowdown
# 0x0000000000000010 - Sync boost
# 0x0000000000000020 - SW thermal slowdown
# 0x0000000000000040 - HW thermal slowdown
# 0x0000000000000080 - HW power brake slowdown
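
The same bitmask can be decoded programmatically. A minimal sketch using pynvml's nvmlDeviceGetCurrentClocksThrottleReasons (the bit values mirror the table above):

#!/usr/bin/env python3
import pynvml

# Bit values as documented in check_throttling.sh above
REASONS = {
    0x0000000000000001: "GPU idle",
    0x0000000000000002: "Applications clocks setting",
    0x0000000000000004: "SW power cap",
    0x0000000000000008: "HW slowdown",
    0x0000000000000010: "Sync boost",
    0x0000000000000020: "SW thermal slowdown",
    0x0000000000000040: "HW thermal slowdown",
    0x0000000000000080: "HW power brake slowdown",
}

pynvml.nvmlInit()
try:
    for i in range(pynvml.nvmlDeviceGetCount()):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        mask = pynvml.nvmlDeviceGetCurrentClocksThrottleReasons(handle)
        active = [label for bit, label in REASONS.items() if mask & bit]
        print(f"GPU {i}: {', '.join(active) if active else 'not throttled'}")
finally:
    pynvml.nvmlShutdown()
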
find_memory_leak.py
import torch
import gc

def find_tensors():
    """Print every tensor the garbage collector is still tracking."""
    count = 0
    for obj in gc.get_objects():
        try:
            if torch.is_tensor(obj):
                count += 1
                print(type(obj), obj.size(), obj.device)
        except Exception:
            # Some objects raise when inspected; skip them
            pass
    print(f"Total live tensors: {count}")

# Call periodically during training; a steadily growing count suggests a leak
find_tensors()

Best Practices

  1. Monitor Continuously - Run monitoring during long training runs
  2. Log Metrics - Save GPU logs for post-analysis
  3. Set Alerts - Get notified of temperature/memory issues
  4. Check Baselines - Know your GPU’s normal behavior
  5. Use Multiple Tools - Combine nvidia-smi, nvtop, and custom scripts