diff options
author | Marvin | 2023-08-19 21:39:02 +0200 |
---|---|---|
committer | GitHub | 2023-08-19 21:39:02 +0200 |
commit | f88c9afc6e9efcb6f79a959779114095c23e0cef (patch) | |
tree | b2a46dfff4c908aae734ebe5e4caa32e2d3f1d48 | |
parent | b5aee436d95c6eb54adb7dc3f405249520ff7e9b (diff) | |
parent | 0f94b144fbd79b721e994f0350fe8ee19c7a691c (diff) |
Merge pull request #18 from Algo-Boys/not-distributed-but-still-cool
Not distributed but still cool
-rwxr-xr-x | hpc.sh | 19 | ||||
-rwxr-xr-x | hpc_train.sh | 3 | ||||
-rw-r--r-- | swr2_asr/train.py | 69 |
3 files changed, 37 insertions, 54 deletions
@@ -0,0 +1,19 @@ +#!/bin/bash + +#SBATCH --job-name=swr-teamprojekt +#SBATCH --partition=a100 +#SBATCH --time=00:30:00 + +### Note: --gres=gpu:x should equal to ntasks-per-node +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --gres=gpu:a100:1 +#SBATCH --cpus-per-task=8 +#SBATCH --mem=64gb +#SBATCH --chdir=/mnt/lustre/mladm/mfa252/SWR2-cool-projekt-main/ +#SBATCH --output=/mnt/lustre/mladm/mfa252/%x-%j.out + +source venv/bin/activate + +### the command to run +srun ./hpc_train.sh diff --git a/hpc_train.sh b/hpc_train.sh new file mode 100755 index 0000000..c7d1636 --- /dev/null +++ b/hpc_train.sh @@ -0,0 +1,3 @@ +#!/bin/sh + +yes no | python -m swr2_asr.train --epochs=100 --batch_size=30 --dataset_path=/mnt/lustre/mladm/mfa252/data diff --git a/swr2_asr/train.py b/swr2_asr/train.py index 56d10c0..ad8c9e9 100644 --- a/swr2_asr/train.py +++ b/swr2_asr/train.py @@ -7,8 +7,6 @@ import torch.nn as nn import torch.optim as optim import torch.nn.functional as F from torch.utils.data import DataLoader -from torch.utils.data.distributed import DistributedSampler -import torch.distributed as dist import torchaudio from .loss_scores import cer, wer @@ -394,7 +392,6 @@ def run( learning_rate: float, batch_size: int, epochs: int, - world_size: int, load: bool, path: str, dataset_path: str, @@ -411,8 +408,6 @@ def run( "learning_rate": learning_rate, "batch_size": batch_size, "epochs": epochs, - "world_size": world_size, - "distributed": world_size > 1, } use_cuda = torch.cuda.is_available() @@ -422,37 +417,16 @@ def run( download_dataset = not os.path.isdir(path) train_dataset = MultilingualLibriSpeech( - dataset_path, "mls_polish_opus", split="dev", download=download_dataset + dataset_path, "mls_german_opus", split="dev", download=download_dataset ) test_dataset = MultilingualLibriSpeech( - dataset_path, "mls_polish_opus", split="test", download=False + dataset_path, "mls_german_opus", split="test", download=False ) - # initialize distributed training - ngpus_per_node = torch.cuda.device_count() - if hparams["distributed"]: - if "SLURM_PROCID" in os.environ: # for slurm scheduler - hparams["rank"] = int(os.environ["SLURM_PROCID"]) - hparams["gpu"] = hparams["rank"] % ngpus_per_node - dist.init_process_group( - backend="nccl", - init_method="env://", - world_size=hparams["world_size"], - rank=hparams["rank"], - ) - - train_sampler = ( - DistributedSampler(train_dataset, shuffle=True) - if hparams["distributed"] - else None - ) train_loader = DataLoader( train_dataset, batch_size=hparams["batch_size"], shuffle=True, - sampler=train_sampler, - num_workers=hparams["world_size"], # TODO? - pin_memory=True, collate_fn=lambda x: data_processing(x, "train"), ) @@ -460,12 +434,13 @@ def run( test_dataset, batch_size=hparams["batch_size"], shuffle=True, - sampler=None, - num_workers=hparams["world_size"], # TODO? - pin_memory=True, collate_fn=lambda x: data_processing(x, "train"), ) + # enable flag to find the most compatible algorithms in advance + if use_cuda: + torch.backends.cudnn.benchmark = True + model = SpeechRecognitionModel( hparams["n_cnn_layers"], hparams["n_rnn_layers"], @@ -476,19 +451,6 @@ def run( hparams["dropout"], ).to(device) - if hparams["distributed"]: - if "gpu" in hparams: - torch.cuda.set_device(hparams["gpu"]) - model.cuda(hparams["gpu"]) - model = torch.nn.parallel.DistributedDataParallel( - model, device_ids=[hparams["gpu"]] - ) - model_without_ddp = model.module - else: - model.cuda() - model = torch.nn.parallel.DistributedDataParallel(model) - model_without_ddp = model.module - print( "Num Model Parameters", sum([param.nelement() for param in model.parameters()]) ) @@ -520,23 +482,23 @@ def run( epoch, iter_meter, ) - if epoch % 3 == 0 or epoch == epochs: - torch.save( - {"epoch": epoch, "model_state_dict": model.state_dict(), "loss": loss}, - path, - ) + test(model=model, device=device, test_loader=test_loader, criterion=criterion) + print("saving epoch", str(epoch)) + torch.save( + {"epoch": epoch, "model_state_dict": model.state_dict(), "loss": loss}, + path + str(epoch), + ) @click.command() @click.option("--learning_rate", default=1e-3, help="Learning rate") -@click.option("--batch_size", default=1, help="Batch size") +@click.option("--batch_size", default=10, help="Batch size") @click.option("--epochs", default=1, help="Number of epochs") -@click.option("--world_size", default=1, help="Number of nodes for distribution") @click.option("--load", default=False, help="Do you want to load a model?") @click.option( "--path", - default="models/model.pt", + default="model", help="Path where the model will be saved to/loaded from", ) @click.option( @@ -548,17 +510,16 @@ def run_cli( learning_rate: float, batch_size: int, epochs: int, - world_size: int, load: bool, path: str, dataset_path: str, ) -> None: """Runs the training script.""" + run( learning_rate=learning_rate, batch_size=batch_size, epochs=epochs, - world_size=world_size, load=load, path=path, dataset_path=dataset_path, |