diff options
Diffstat (limited to 'distributed.sh')
-rwxr-xr-x | distributed.sh | 34 |
1 files changed, 0 insertions, 34 deletions
diff --git a/distributed.sh b/distributed.sh
deleted file mode 100755
index 4949159..0000000
--- a/distributed.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/bin/bash
-
-#SBATCH --job-name=swr-teamprojekt
-#SBATCH --partition=a100
-#SBATCH --time=00:30:00
-
-### e.g. request 4 nodes with 1 gpu each, totally 4 gpus (WORLD_SIZE==4)
-### Note: --gres=gpu:x should equal to ntasks-per-node
-#SBATCH --nodes=2
-#SBATCH --ntasks-per-node=1
-#SBATCH --gres=gpu:a100:1
-#SBATCH --cpus-per-task=8
-#SBATCH --mem=64gb
-#SBATCH --chdir=/mnt/lustre/mladm/mfa252/SWR2-cool-projekt-main/
-#SBATCH --output=/mnt/lustre/mladm/mfa252/%x-%j.out
-
-### change 5-digit MASTER_PORT as you wish, slurm will raise Error if duplicated with others
-### change WORLD_SIZE as gpus/node * num_nodes
-export MASTER_PORT=18120
-export WORLD_SIZE=2
-
-### get the first node name as master address - customized for vgg slurm
-### e.g. master(gnodee[2-5],gnoded1) == gnodee2
-echo "NODELIST="${SLURM_NODELIST}
-master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
-export MASTER_ADDR=$master_addr
-echo "MASTER_ADDR="$MASTER_ADDR
-
-export NCCL_DEBUG="INFO"
-
-source venv/bin/activate
-
-### the command to run
-srun ./train.sh