#!/bin/bash
#SBATCH --job-name=swr-teamprojekt
#SBATCH --partition=a100
#SBATCH --time=00:30:00

### Request 2 nodes with 1 GPU each, i.e. 2 GPUs in total (WORLD_SIZE==2).
### Note: --gres=gpu:x must equal --ntasks-per-node.
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --gres=gpu:a100:1
#SBATCH --cpus-per-task=8
#SBATCH --mem=64gb
#SBATCH --chdir=/mnt/lustre/mladm/mfa252/SWR2-cool-projekt-main/
#SBATCH --output=/mnt/lustre/mladm/mfa252/%x-%j.out

### Pick any free 5-digit MASTER_PORT; the distributed rendezvous fails if
### another job on the same node is already using it.
### Set WORLD_SIZE = GPUs per node * number of nodes.
export MASTER_PORT=18120
export WORLD_SIZE=2

### Use the first node in the allocation as the master address
### (customized for vgg slurm), e.g. master(gnodee[2-5],gnoded1) == gnodee2.
echo "NODELIST=${SLURM_NODELIST}"
master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_ADDR="$master_addr"
echo "MASTER_ADDR=${MASTER_ADDR}"

export NCCL_DEBUG="INFO"

source venv/bin/activate

### The command to run: srun launches one task per node (--ntasks-per-node=1).
srun ./train.sh
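
### For reference only: a minimal sketch of what ./train.sh might contain,
### assuming a PyTorch DDP entry point (the script name train.py is an
### assumption, not confirmed by this repository). srun starts this script
### once per task, and each task derives its ranks from Slurm's per-task
### environment variables:
#
#   #!/bin/bash
#   export RANK=$SLURM_PROCID          # global rank, 0..WORLD_SIZE-1
#   export LOCAL_RANK=$SLURM_LOCALID   # GPU index within this node
#   exec python train.py               # init_process_group() reads MASTER_ADDR,
#                                      # MASTER_PORT, WORLD_SIZE, and RANK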