SLURM & HPC Clusters

SLURM (Simple Linux Utility for Resource Management) is the most widely used open-source job scheduler for HPC clusters. It manages:

  • Job queuing and scheduling
  • Resource allocation (CPUs, GPUs, memory)
  • Job monitoring and accounting
  • Fair-share scheduling

This guide covers SLURM usage for deep learning workloads on HPC clusters.
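
Before writing a job script, it is worth checking which partitions and GPU resources the cluster actually exposes; the partition names and gres strings shown by these commands vary by site:

# Summary of partitions, their state, and time limits
sinfo -s

# Partition, node list, and generic resources (e.g. GPU types/counts) per node
sinfo -o "%P %N %G"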


Basic Job Submission

submit_job.sh - Basic batch job
#!/bin/bash
#SBATCH --job-name=my_training
#SBATCH --output=logs/%x_%j.out
#SBATCH --error=logs/%x_%j.err
#SBATCH --time=24:00:00
#SBATCH --partition=gpu
#SBATCH --gres=gpu:1
#SBATCH --cpus-per-task=4
#SBATCH --mem=32G

# Your commands here
module load python/3.10 cuda/12.1
source ~/venvs/ml/bin/activate
python train.py --epochs 100

Submit with: sbatch submit_job.sh
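
SLURM does not create missing output directories, so create logs/ before the first submission; sbatch then prints the job ID it assigned (123456 below is only an example):

# One-time setup, then submit
mkdir -p logs
sbatch submit_job.sh
# Submitted batch job 123456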

Common #SBATCH directives:

Directive          Description          Example
--job-name         Job name             --job-name=training
--output           stdout file          --output=logs/%x_%j.out
--error            stderr file          --error=logs/%x_%j.err
--time             Time limit           --time=24:00:00 (24h)
--partition        Queue/partition      --partition=gpu
--gres             Generic resources    --gres=gpu:2 (2 GPUs)
--cpus-per-task    CPU cores            --cpus-per-task=8
--mem              Memory               --mem=64G
--nodes            Number of nodes      --nodes=2
--ntasks           Number of tasks      --ntasks=4
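
Any of these directives can also be overridden on the command line at submission time, which avoids editing the script for one-off changes; the values here are only illustrative:

# Command-line options take precedence over #SBATCH lines in the script
sbatch --time=2:00:00 --mem=16G --job-name=quick_test submit_job.sh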

GPU Job Scripts

single_gpu.sh - Single-GPU training
#!/bin/bash
#SBATCH --partition=gpu
#SBATCH --gres=gpu:1
#SBATCH --cpus-per-task=4
#SBATCH --mem=32G
#SBATCH --time=48:00:00
#SBATCH --job-name=single_gpu_train

module load cuda/12.1 cudnn/8.9
source ~/venvs/pytorch/bin/activate

python train.py \
    --model resnet50 \
    --batch-size 128 \
    --epochs 100
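
It can help to log which node and GPU the job actually received before training starts; a few optional lines (assuming the cluster sets CUDA_VISIBLE_DEVICES for GPU allocations) that could go right before the python call:

# Sanity checks: node, job ID, and driver-visible devices
echo "Job $SLURM_JOB_ID on $(hostname)"
echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
nvidia-smi --query-gpu=name,memory.total --format=csv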

multi_gpu.sh - Multi-GPU training on one node (DDP)
#!/bin/bash
#SBATCH --partition=gpu
#SBATCH --gres=gpu:4          # Request 4 GPUs
#SBATCH --cpus-per-task=16     # 4 CPUs per GPU
#SBATCH --mem=128G
#SBATCH --time=72:00:00
#SBATCH --job-name=multi_gpu_ddp

module load cuda/12.1
source ~/venvs/pytorch/bin/activate

# PyTorch DistributedDataParallel
torchrun --standalone --nnodes=1 --nproc_per_node=4 \
    train.py --distributed
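
torchrun sets OMP_NUM_THREADS=1 (with a warning) when the variable is unset, which can starve data loading and other CPU-side work; a common tweak is to split the SLURM CPU allocation across the four local processes:

# 16 CPUs / 4 GPU processes = 4 OpenMP threads per process
export OMP_NUM_THREADS=$((SLURM_CPUS_PER_TASK / 4))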

multi_node.sh - Multi-node distributed training
#!/bin/bash
#SBATCH --partition=gpu
#SBATCH --nodes=2              # 2 nodes
#SBATCH --ntasks-per-node=1    # 1 task per node
#SBATCH --gres=gpu:4           # 4 GPUs per node
#SBATCH --cpus-per-task=16
#SBATCH --mem=128G
#SBATCH --time=96:00:00

# Get master node address
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_PORT=29500

# Launch distributed training
srun torchrun \
    --nnodes=$SLURM_NNODES \
    --nproc_per_node=4 \
    --node_rank=$SLURM_NODEID \
    --master_addr=$MASTER_ADDR \
    --master_port=$MASTER_PORT \
    train.py --distributed
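
Before committing to a 96-hour run, a short test job with the same layout can confirm that every node is reachable and that the rendezvous variables resolve as expected:

# One task per node should print each node's hostname
srun --ntasks=$SLURM_NNODES --ntasks-per-node=1 hostname
echo "MASTER_ADDR=$MASTER_ADDR MASTER_PORT=$MASTER_PORT NNODES=$SLURM_NNODES"

# For debugging inter-node communication problems:
# export NCCL_DEBUG=INFO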

Monitoring and Managing Jobs

# List your jobs
squeue -u $USER

# With more details
squeue -u $USER -o "%.18i %.9P %.30j %.8T %.10M %.6D %R"

# Watch in real-time
watch -n 1 squeue -u $USER
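
While a job is running you can also follow its output file (named after the %x_%j pattern above; the job ID 123456 is an example) and, if job accounting is enabled on the cluster, query live resource usage with sstat:

# Follow a running job's output
tail -f logs/my_training_123456.out

# Live memory/CPU usage of the batch step of a running job
sstat -j JOBID.batch --format=JobID,MaxRSS,AveCPU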

# Cancel job
scancel JOBID

# Cancel all your jobs
scancel -u $USER

# Cancel jobs by name
scancel --name=training

# Hold job (prevent from starting)
scontrol hold JOBID

# Release held job
scontrol release JOBID

# Update job (before it starts)
scontrol update JobId=JOBID TimeLimit=48:00:00

Job Arrays

Run multiple similar jobs efficiently:

job_array.sh - Parameter sweep
#!/bin/bash
#SBATCH --array=0-4           # 5 jobs: indices 0-4
#SBATCH --partition=gpu
#SBATCH --gres=gpu:1
#SBATCH --time=24:00:00
#SBATCH --job-name=sweep
#SBATCH --output=logs/sweep_%A_%a.out

# Learning rates to test
LRS=(0.1 0.01 0.001 0.0001 0.00001)

# Get learning rate for this array task
LR=${LRS[$SLURM_ARRAY_TASK_ID]}

# Run training with this learning rate
python train.py --lr $LR --output_dir results/lr_$LR

Submit: sbatch job_array.sh
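
For sweeps over more than one hyperparameter, a common variant reads one set of arguments per task from a text file; params.txt and its contents here are hypothetical:

# params.txt: one set of flags per line, e.g. "--lr 0.01 --batch-size 128"
#SBATCH --array=1-20%4        # 20 tasks, at most 4 running at once
ARGS=$(sed -n "${SLURM_ARRAY_TASK_ID}p" params.txt)
python train.py $ARGS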

Manage array:

# Check array jobs
squeue -u $USER -r

# Cancel specific array task
scancel JOBID_3

# Cancel entire array
scancel JOBID

Job Dependencies

Chain jobs so that each step starts only after the previous one succeeds:

# Job 1: Preprocess data
JOB1=$(sbatch --parsable preprocess.sh)

# Job 2: Train (waits for Job 1)
JOB2=$(sbatch --dependency=afterok:$JOB1 train.sh)

# Job 3: Evaluate (waits for Job 2)
sbatch --dependency=afterok:$JOB2 evaluate.sh
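
If preprocess.sh fails, the dependent jobs stay pending with reason DependencyNeverSatisfied instead of running on bad data; the reason column of squeue shows this, and newer SLURM versions can cancel such jobs automatically:

# Show state and the reason a dependent job is (still) pending
squeue -j $JOB2 -o "%.18i %.8T %r"

# Optionally let SLURM cancel jobs whose dependency can never be satisfied
sbatch --dependency=afterok:$JOB1 --kill-on-invalid-dep=yes train.sh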

Email Notifications

Add these directives to any job script to be emailed when the job begins, ends, or fails:

#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH [email protected]

Checkpointing and Automatic Resubmission

Jobs that outlive their time limit can save state shortly before SLURM kills them and resubmit themselves:

checkpoint_resume.sh
#!/bin/bash
#SBATCH --partition=gpu
#SBATCH --gres=gpu:1
#SBATCH --time=24:00:00
#SBATCH --signal=B:USR1@600  # Send SIGUSR1 to the batch shell 10 min before timeout

# Checkpoint handler: runs when SLURM delivers SIGUSR1
checkpoint() {
    echo "Time limit approaching, checkpointing..."
    # Your checkpoint save code (e.g. forward the signal to the training process)
    touch checkpoint_signal
}

trap checkpoint USR1

# Run training in the background and wait on it: bash defers traps while a
# foreground command runs, so the signal would otherwise be handled too late
python train.py --resume_if_exists &
wait

# If the time limit was hit, resubmit this script to continue from the checkpoint
if [ -f checkpoint_signal ]; then
    rm -f checkpoint_signal
    sbatch "$0"  # Resubmit this script
fi
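
An alternative to resubmitting a fresh copy is to let SLURM requeue the same job ID; this is a sketch assuming the cluster permits requeueing (some sites disable it):

# Requeue-based variant: add to the #SBATCH header
#SBATCH --requeue
#SBATCH --open-mode=append    # keep appending to the same output file

# ...and inside the USR1 handler, after saving the checkpoint:
scontrol requeue $SLURM_JOB_ID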

Job Efficiency

After a job completes, check how much of the requested CPU and memory it actually used:

seff JOBID

Example output:

Job ID: 123456
Cluster: mycluster
User/Group: user/group
State: COMPLETED (exit code 0)
Cores: 4
CPU Utilized: 23:45:30
CPU Efficiency: 98.52% of 24:06:00 core-walltime
Memory Utilized: 28.5 GB
Memory Efficiency: 89.06% of 32.0 GB

Use these numbers to right-size future requests:

# Start with a conservative estimate
#SBATCH --time=4:00:00
#SBATCH --mem=16G

# Check actual usage with seff
# Adjust for production run
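
Where seff is not installed, sacct reports similar numbers for completed jobs:

# Elapsed time, CPU time, and peak memory for a finished job
sacct -j JOBID --format=JobID,Elapsed,AllocCPUS,TotalCPU,MaxRSS,ReqMem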

Best Practices

  1. Test with Short Jobs - Debug with --time=1:00:00 first
  2. Request Exact GPUs - Use --gres=gpu:a100:2 for specific GPU types (see the sketch after this list for discovering which types a cluster offers)
  3. Use Job Arrays - For parameter sweeps instead of many separate jobs
  4. Checkpoint Frequently - Save progress every epoch or hour
  5. Monitor Efficiency - Use seff to optimize resource requests
  6. Clean Up - Remove old output files and checkpoints
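
To see which GPU types a cluster actually offers before pinning one with --gres (the a100 name in item 2 is only an example), the gres column of sinfo is usually enough:

# GPU types (gres) available in each partition; exact strings are site-specific
sinfo -o "%P %G" | sort -u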

Troubleshooting

# Why is job pending?
squeue -j JOBID --start

# Check partition limits
scontrol show partition gpu

# Check your limits
sacctmgr show assoc where user=$USER format=user,account,partition,maxjobs,maxsubmit

# Out of memory? Check actual memory usage first
seff JOBID

# Increase memory in job script
#SBATCH --mem=64G

# Or memory per CPU
#SBATCH --mem-per-cpu=4G

# Job failed? Check the job output
cat slurm-JOBID.out

# Check system logs
sacct -j JOBID --format=JobID,State,ExitCode,DerivedExitCode

# Common causes:
# - Out of memory (OOM)
# - Time limit exceeded
# - Node failure

Quick Reference

# Submit job
sbatch script.sh

# List jobs
squeue -u $USER

# Cancel job
scancel JOBID

# Job details
scontrol show job JOBID

# Job efficiency
seff JOBID

# Interactive session (GPU example below)
srun --pty bash

# Cluster info
sinfo

# Your account info
sacctmgr show user $USER

# Job history
sacct -u $USER --starttime=2025-01-01
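
For the interactive session above, requesting a GPU and explicit resources usually looks like this (partition name, GPU count, and limits are examples to adapt):

# Interactive shell on a GPU node
srun --partition=gpu --gres=gpu:1 --cpus-per-task=4 \
     --mem=16G --time=2:00:00 --pty bash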

See the official SLURM documentation (https://slurm.schedmd.com/) for full command and directive references.