diff options
author | Marvin Borner | 2023-08-19 19:16:06 +0200 |
---|---|---|
committer | Marvin Borner | 2023-08-19 19:16:06 +0200 |
commit | 9cbbcacee6814837799b5fa941f8de7edbda4a7e (patch) | |
tree | 1ab4b40cc7c4d7729e9be895701d7f37ef438776 /distributed.sh | |
parent | ec177107cb3a1a31d2fc49cc4990413af287305e (diff) |
Experimented some more with distributed training
For now decided against it, so I will overwrite the changes soon.
Diffstat (limited to 'distributed.sh')
-rwxr-xr-x | distributed.sh | 34 |
1 files changed, 34 insertions, 0 deletions
#!/bin/bash
#
# distributed.sh -- Slurm batch script (added as a new file, mode 100755).
#
# Exports the rendezvous variables MASTER_ADDR, MASTER_PORT and WORLD_SIZE
# plus NCCL_DEBUG, activates the local virtualenv, and launches ./train.sh
# through srun.
# NOTE(review): presumably ./train.sh reads these variables -- confirm there.
#
# Keep every #SBATCH directive above the first executable command; sbatch
# does not process directives that appear after it.

#SBATCH --job-name=swr-teamprojekt
#SBATCH --partition=a100
#SBATCH --time=00:30:00

### Request 2 nodes with 1 GPU each, i.e. 2 GPUs in total (WORLD_SIZE == 2).
### Note: the GPU count in --gres=gpu:x should equal --ntasks-per-node.
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --gres=gpu:a100:1
#SBATCH --cpus-per-task=8
#SBATCH --mem=64gb
#SBATCH --chdir=/mnt/lustre/mladm/mfa252/SWR2-cool-projekt-main/
#SBATCH --output=/mnt/lustre/mladm/mfa252/%x-%j.out

### Pick any free 5-digit MASTER_PORT; per the original note, reusing a port
### that another job already occupies results in an error.
### WORLD_SIZE must equal GPUs per node * number of nodes (see --nodes and
### --ntasks-per-node above) -- update it whenever those directives change.
export MASTER_PORT=18120
export WORLD_SIZE=2

### Without a node list there is nothing to derive MASTER_ADDR from, so stop
### early instead of exporting an empty address.
if [[ -z "${SLURM_JOB_NODELIST:-}" ]]; then
  printf 'error: SLURM_JOB_NODELIST is not set; submit this script via sbatch\n' >&2
  exit 1
fi
printf 'NODELIST=%s\n' "${SLURM_JOB_NODELIST}"

### Use the first node of the allocation as the master address,
### e.g. master(gnodee[2-5],gnoded1) == gnodee2.
### The full host list is captured first so that a failing scontrol is
### detected; the first line is then cut out with a parameter expansion.
if ! node_names=$(scontrol show hostnames "${SLURM_JOB_NODELIST}"); then
  printf 'error: scontrol could not expand node list "%s"\n' \
    "${SLURM_JOB_NODELIST}" >&2
  exit 1
fi
master_addr=${node_names%%$'\n'*}
if [[ -z "${master_addr}" ]]; then
  printf 'error: no hostname found in node list "%s"\n' \
    "${SLURM_JOB_NODELIST}" >&2
  exit 1
fi
export MASTER_ADDR="${master_addr}"
printf 'MASTER_ADDR=%s\n' "${MASTER_ADDR}"

export NCCL_DEBUG="INFO"

### Relative path: resolved against the job's working directory (--chdir).
### Abort if activation fails rather than starting the job without the venv.
# shellcheck source=/dev/null
source venv/bin/activate || {
  printf 'error: could not activate venv/bin/activate\n' >&2
  exit 1
}

### The command to run; its exit status becomes the script's exit status.
srun ./train.sh