aboutsummaryrefslogtreecommitdiff
path: root/distributed.sh
diff options
context:
space:
mode:
Diffstat (limited to 'distributed.sh')
-rwxr-xr-x distributed.sh 34
1 files changed, 34 insertions, 0 deletions
diff --git a/distributed.sh b/distributed.sh
new file mode 100755
index 0000000..4949159
--- /dev/null
+++ b/distributed.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+### Slurm batch script: exports MASTER_ADDR, MASTER_PORT and WORLD_SIZE, then starts ./train.sh via srun.
+#SBATCH --job-name=swr-teamprojekt
+#SBATCH --partition=a100
+#SBATCH --time=00:30:00
+
+### Requests 2 nodes with 1 GPU each, i.e. 2 GPUs in total (WORLD_SIZE==2).
+### Note: the GPU count in --gres should equal --ntasks-per-node.
+#SBATCH --nodes=2
+#SBATCH --ntasks-per-node=1
+#SBATCH --gres=gpu:a100:1
+#SBATCH --cpus-per-task=8
+#SBATCH --mem=64gb
+#SBATCH --chdir=/mnt/lustre/mladm/mfa252/SWR2-cool-projekt-main/
+#SBATCH --output=/mnt/lustre/mladm/mfa252/%x-%j.out
+
+### Change the 5-digit MASTER_PORT as you wish; Slurm will raise an error if it collides with another job's port.
+### WORLD_SIZE is GPUs per node * number of nodes: taken from SLURM_NTASKS, falling back to 2 when it is unset.
+export MASTER_PORT=18120
+export WORLD_SIZE="${SLURM_NTASKS:-2}"
+
+### Use the first hostname of the job's node list as the master address,
+### e.g. gnodee[2-5],gnoded1 -> gnodee2. Abort if no hostname could be resolved.
+echo "NODELIST=${SLURM_JOB_NODELIST}"
+MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
+export MASTER_ADDR
+echo "MASTER_ADDR=${MASTER_ADDR:?no hostname resolved from SLURM_JOB_NODELIST}"
+
+export NCCL_DEBUG="INFO"
+### The venv path is relative to the --chdir directory above; stop here if activation fails.
+source venv/bin/activate || exit 1
+
+### the command to run
+srun ./train.sh