From fd3106c2cce565d378def73b0d77b0123f68523b Mon Sep 17 00:00:00 2001 From: JoJoBarthold2 Date: Sat, 19 Aug 2023 13:26:01 +0200 Subject: train now returns loss so it can be saved ( amen ) --- swr2_asr/train.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'swr2_asr') diff --git a/swr2_asr/train.py b/swr2_asr/train.py index 81312d9..6eaf4c1 100644 --- a/swr2_asr/train.py +++ b/swr2_asr/train.py @@ -351,7 +351,9 @@ def train( [{batch_idx * len(spectrograms)}/{data_len} \ ({100.0 * batch_idx / len(train_loader)}%)]\t \ Loss: {loss.item()}" + ) + return loss def test(model, device, test_loader, criterion): @@ -460,7 +462,7 @@ def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3) -> No iter_meter = IterMeter() for epoch in range(1, epochs + 1): - train( + loss = train( model, device, train_loader, -- cgit v1.2.3 From d5568bb9f51c4b586c7bd8537140cb1e201f5840 Mon Sep 17 00:00:00 2001 From: JoJoBarthold2 Date: Sat, 19 Aug 2023 13:29:20 +0200 Subject: also now saves loss ( hahah funny meme) | || || |_ --- swr2_asr/train.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'swr2_asr') diff --git a/swr2_asr/train.py b/swr2_asr/train.py index 6eaf4c1..9a8620f 100644 --- a/swr2_asr/train.py +++ b/swr2_asr/train.py @@ -351,10 +351,8 @@ def train( [{batch_idx * len(spectrograms)}/{data_len} \ ({100.0 * batch_idx / len(train_loader)}%)]\t \ Loss: {loss.item()}" - ) - return loss - + return loss.item() def test(model, device, test_loader, criterion): """Test""" @@ -476,7 +474,7 @@ def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3) -> No torch.save({ 'epoch': epoch, 'model_state_dict': model.state_dict(), - },MODEL_SAVE_PATH) + 'loss': loss},MODEL_SAVE_PATH) test(model=model, device=device, test_loader=test_loader, criterion=criterion) -- cgit v1.2.3 From 631ed7a3f7230cb61023875f3a0945542f6e97a9 Mon Sep 17 00:00:00 2001 From: JoJoBarthold2 Date: Sat, 19 Aug 2023 13:31:28 +0200 Subject: fix --- swr2_asr/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'swr2_asr') diff --git a/swr2_asr/train.py b/swr2_asr/train.py index 9a8620f..2e72dee 100644 --- a/swr2_asr/train.py +++ b/swr2_asr/train.py @@ -352,7 +352,7 @@ def train( ({100.0 * batch_idx / len(train_loader)}%)]\t \ Loss: {loss.item()}" ) - return loss.item() + return loss.item() def test(model, device, test_loader, criterion): """Test""" -- cgit v1.2.3 From aea161ee7f2c96aab529ca22675fb54cdcadbd12 Mon Sep 17 00:00:00 2001 From: JoJoBarthold2 Date: Sat, 19 Aug 2023 14:13:57 +0200 Subject: loading now works --- swr2_asr/train.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'swr2_asr') diff --git a/swr2_asr/train.py b/swr2_asr/train.py index 2e72dee..346be0b 100644 --- a/swr2_asr/train.py +++ b/swr2_asr/train.py @@ -9,8 +9,8 @@ from torch.utils.data import DataLoader import torchaudio from .loss_scores import cer, wer -MODEL_SAVE_PATH = "models/model.pt" -LOSS + + class TextTransform: """Maps characters to integers and vice versa""" @@ -388,7 +388,7 @@ def test(model, device, test_loader, criterion): ) -def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3) -> None: +def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3,load: bool=False, path: str="models/model.pt") -> None: """Runs the training script.""" hparams = { "n_cnn_layers": 3, @@ -446,10 +446,14 @@ def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3) -> No print( "Num Model 
Parameters", sum([param.nelement() for param in model.parameters()]) ) - optimizer = optim.AdamW(model.parameters(), hparams["learning_rate"]) criterion = nn.CTCLoss(blank=28).to(device) - + if load: + checkpoint = torch.load(path) + model.load_state_dict(checkpoint['model_state_dict']) + optimizer.load_state_dict(checkpoint['optimizer_state_dict']) + epoch = checkpoint['epoch'] + loss = checkpoint['loss'] scheduler = optim.lr_scheduler.OneCycleLR( optimizer, max_lr=hparams["learning_rate"], @@ -474,7 +478,7 @@ def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3) -> No torch.save({ 'epoch': epoch, 'model_state_dict': model.state_dict(), - 'loss': loss},MODEL_SAVE_PATH) + 'loss': loss},path) test(model=model, device=device, test_loader=test_loader, criterion=criterion) @@ -482,10 +486,13 @@ def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3) -> No @click.option("--learning-rate", default=1e-3, help="Learning rate") @click.option("--batch_size", default=1, help="Batch size") @click.option("--epochs", default=1, help="Number of epochs") -def run_cli(learning_rate: float, batch_size: int, epochs: int) -> None: +@click.option("--load", default = False, help="Do you want to load a model?") +@click.option("--path",default="models/model.pt", + help= "Path where the model will be saved to/loaded from" ) +def run_cli(learning_rate: float, batch_size: int, epochs: int, load:bool,path:str) -> None: """Runs the training script.""" - run(learning_rate=learning_rate, batch_size=batch_size, epochs=epochs) + run(learning_rate=learning_rate, batch_size=batch_size, epochs=epochs,load= load, path = path) if __name__ == "__main__": - run(learning_rate=5e-4, batch_size=16, epochs=1) + run(learning_rate=5e-4, batch_size=16, epochs=1,load=False, path= "models/model.pt") -- cgit v1.2.3 From 66c37e72ef2dc7c88e1814627f35e506c7c09648 Mon Sep 17 00:00:00 2001 From: Marvin Borner Date: Sat, 19 Aug 2023 14:19:20 +0200 Subject: Started distribution --- swr2_asr/train.py | 41 +++++++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) (limited to 'swr2_asr') diff --git a/swr2_asr/train.py b/swr2_asr/train.py index 81312d9..29c2293 100644 --- a/swr2_asr/train.py +++ b/swr2_asr/train.py @@ -1,11 +1,14 @@ """Training script for the ASR model.""" from AudioLoader.speech import MultilingualLibriSpeech +import os import click import torch import torch.nn as nn import torch.optim as optim import torch.nn.functional as F from torch.utils.data import DataLoader +from torch.utils.data.distributed import DistributedSampler +import torch.distributed as dist import torchaudio from .loss_scores import cer, wer @@ -388,7 +391,7 @@ def test(model, device, test_loader, criterion): ) -def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3) -> None: +def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3, world_size: int = 1) -> None: """Runs the training script.""" hparams = { "n_cnn_layers": 3, @@ -401,6 +404,8 @@ def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3) -> No "learning_rate": learning_rate, "batch_size": batch_size, "epochs": epochs, + "world_size": world_size, + "distributed": world_size > 1, } use_cuda = torch.cuda.is_available() @@ -415,22 +420,34 @@ def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3) -> No "/Volumes/pherkel/SWR2-ASR/", "mls_german_opus", split="test", download=False ) - kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {} + # 
initialize distributed training + ngpus_per_node = torch.cuda.device_count() + if hparams["distributed"]: + if 'SLURM_PROCID' in os.environ: # for slurm scheduler + hparams["rank"] = int(os.environ['SLURM_PROCID']) + hparams["gpu"] = hparams["rank"] % ngpus_per_node + dist.init_process_group(backend="nccl", init_method="env://", + world_size=hparams["world_size"], rank=hparams["rank"]) + train_sampler = DistributedSampler(train_dataset, shuffle=True) train_loader = DataLoader( train_dataset, batch_size=hparams["batch_size"], shuffle=True, + sampler=train_sampler, + num_workers=hparams["world_size"], # TODO? + pin_memory=True, collate_fn=lambda x: data_processing(x, "train"), - **kwargs, ) test_loader = DataLoader( test_dataset, batch_size=hparams["batch_size"], shuffle=True, + sampler=None, + num_workers=hparams["world_size"], # TODO? + pin_memory=True, collate_fn=lambda x: data_processing(x, "train"), - **kwargs, ) model = SpeechRecognitionModel( @@ -443,6 +460,17 @@ def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3) -> No hparams["dropout"], ).to(device) + if hparams["distributed"]: + if "gpu" in hparams: + torch.cuda.set_device(hparams["gpu"]) + model.cuda(hparams["gpu"]) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[hparams["gpu"]]) + model_without_ddp = model.module + else: + model.cuda() + model = torch.nn.parallel.DistributedDataParallel(model) + model_without_ddp = model.module + print( "Num Model Parameters", sum([param.nelement() for param in model.parameters()]) ) @@ -482,9 +510,10 @@ def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3) -> No @click.option("--learning-rate", default=1e-3, help="Learning rate") @click.option("--batch_size", default=1, help="Batch size") @click.option("--epochs", default=1, help="Number of epochs") -def run_cli(learning_rate: float, batch_size: int, epochs: int) -> None: +@click.option("--world_size", default=1, help="Number of nodes for distribution") +def run_cli(learning_rate: float, batch_size: int, epochs: int, world_size: int) -> None: """Runs the training script.""" - run(learning_rate=learning_rate, batch_size=batch_size, epochs=epochs) + run(learning_rate=learning_rate, batch_size=batch_size, epochs=epochs, world_size=world_size) if __name__ == "__main__": -- cgit v1.2.3 From ec177107cb3a1a31d2fc49cc4990413af287305e Mon Sep 17 00:00:00 2001 From: Marvin Borner Date: Sat, 19 Aug 2023 14:40:47 +0200 Subject: Fixed some distribution thingies --- pyproject.toml | 2 +- swr2_asr/train.py | 98 +++++++++++++++++++++++++++++++++++++++---------------- 2 files changed, 71 insertions(+), 29 deletions(-) (limited to 'swr2_asr') diff --git a/pyproject.toml b/pyproject.toml index fdd89a5..1c29b7c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ readme = "readme.md" packages = [{include = "swr2_asr"}] [tool.poetry.dependencies] -python = "~3.10" +python = "^3.10" torch = "2.0.0" torchaudio = "2.0.1" audioloader = {git = "https://github.com/marvinborner/AudioLoader.git"} diff --git a/swr2_asr/train.py b/swr2_asr/train.py index ba002e0..56d10c0 100644 --- a/swr2_asr/train.py +++ b/swr2_asr/train.py @@ -13,8 +13,6 @@ import torchaudio from .loss_scores import cer, wer - - class TextTransform: """Maps characters to integers and vice versa""" @@ -357,6 +355,7 @@ def train( ) return loss.item() + def test(model, device, test_loader, criterion): """Test""" print("\nevaluating...") @@ -391,7 +390,15 @@ def test(model, device, test_loader, criterion): ) -def run(learning_rate: 
float = 5e-4, batch_size: int = 8, epochs: int = 3, world_size: int = 1, load: bool=False, path: str="models/model.pt") -> None: +def run( + learning_rate: float, + batch_size: int, + epochs: int, + world_size: int, + load: bool, + path: str, + dataset_path: str, +) -> None: """Runs the training script.""" hparams = { "n_cnn_layers": 3, @@ -413,29 +420,38 @@ def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3, world device = torch.device("cuda" if use_cuda else "cpu") # pylint: disable=no-member # device = torch.device("mps") + download_dataset = not os.path.isdir(path) train_dataset = MultilingualLibriSpeech( - "/Volumes/pherkel/SWR2-ASR/", "mls_german_opus", split="dev", download=False + dataset_path, "mls_polish_opus", split="dev", download=download_dataset ) test_dataset = MultilingualLibriSpeech( - "/Volumes/pherkel/SWR2-ASR/", "mls_german_opus", split="test", download=False + dataset_path, "mls_polish_opus", split="test", download=False ) # initialize distributed training ngpus_per_node = torch.cuda.device_count() if hparams["distributed"]: - if 'SLURM_PROCID' in os.environ: # for slurm scheduler - hparams["rank"] = int(os.environ['SLURM_PROCID']) + if "SLURM_PROCID" in os.environ: # for slurm scheduler + hparams["rank"] = int(os.environ["SLURM_PROCID"]) hparams["gpu"] = hparams["rank"] % ngpus_per_node - dist.init_process_group(backend="nccl", init_method="env://", - world_size=hparams["world_size"], rank=hparams["rank"]) + dist.init_process_group( + backend="nccl", + init_method="env://", + world_size=hparams["world_size"], + rank=hparams["rank"], + ) - train_sampler = DistributedSampler(train_dataset, shuffle=True) + train_sampler = ( + DistributedSampler(train_dataset, shuffle=True) + if hparams["distributed"] + else None + ) train_loader = DataLoader( train_dataset, batch_size=hparams["batch_size"], shuffle=True, sampler=train_sampler, - num_workers=hparams["world_size"], # TODO? + num_workers=hparams["world_size"], # TODO? pin_memory=True, collate_fn=lambda x: data_processing(x, "train"), ) @@ -445,7 +461,7 @@ def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3, world batch_size=hparams["batch_size"], shuffle=True, sampler=None, - num_workers=hparams["world_size"], # TODO? + num_workers=hparams["world_size"], # TODO? 
pin_memory=True, collate_fn=lambda x: data_processing(x, "train"), ) @@ -464,7 +480,9 @@ def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3, world if "gpu" in hparams: torch.cuda.set_device(hparams["gpu"]) model.cuda(hparams["gpu"]) - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[hparams["gpu"]]) + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[hparams["gpu"]] + ) model_without_ddp = model.module else: model.cuda() @@ -478,10 +496,10 @@ def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3, world criterion = nn.CTCLoss(blank=28).to(device) if load: checkpoint = torch.load(path) - model.load_state_dict(checkpoint['model_state_dict']) - optimizer.load_state_dict(checkpoint['optimizer_state_dict']) - epoch = checkpoint['epoch'] - loss = checkpoint['loss'] + model.load_state_dict(checkpoint["model_state_dict"]) + optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) + epoch = checkpoint["epoch"] + loss = checkpoint["loss"] scheduler = optim.lr_scheduler.OneCycleLR( optimizer, max_lr=hparams["learning_rate"], @@ -502,22 +520,46 @@ def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3, world epoch, iter_meter, ) - if epoch%3 == 0 or epoch == epochs: - torch.save({ - 'epoch': epoch, - 'model_state_dict': model.state_dict(), - 'loss': loss},path) + if epoch % 3 == 0 or epoch == epochs: + torch.save( + {"epoch": epoch, "model_state_dict": model.state_dict(), "loss": loss}, + path, + ) test(model=model, device=device, test_loader=test_loader, criterion=criterion) @click.command() -@click.option("--learning-rate", default=1e-3, help="Learning rate") +@click.option("--learning_rate", default=1e-3, help="Learning rate") @click.option("--batch_size", default=1, help="Batch size") @click.option("--epochs", default=1, help="Number of epochs") @click.option("--world_size", default=1, help="Number of nodes for distribution") -@click.option("--load", default = False, help="Do you want to load a model?") -@click.option("--path",default="models/model.pt", - help= "Path where the model will be saved to/loaded from" ) -def run_cli(learning_rate: float, batch_size: int, epochs: int, world_size: int, load: bool, path: str) -> None: +@click.option("--load", default=False, help="Do you want to load a model?") +@click.option( + "--path", + default="models/model.pt", + help="Path where the model will be saved to/loaded from", +) +@click.option( + "--dataset_path", + default="data/", + help="Path for the dataset directory", +) +def run_cli( + learning_rate: float, + batch_size: int, + epochs: int, + world_size: int, + load: bool, + path: str, + dataset_path: str, +) -> None: """Runs the training script.""" - run(learning_rate=learning_rate, batch_size=batch_size, epochs=epochs, world_size=world_size, load=load, path=path) + run( + learning_rate=learning_rate, + batch_size=batch_size, + epochs=epochs, + world_size=world_size, + load=load, + path=path, + dataset_path=dataset_path, + ) -- cgit v1.2.3
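A note on the checkpointing commits above: train() now returns loss.item() — a plain Python float detached from the autograd graph — and the run loop periodically torch.save()s a dict holding the epoch, the model state and that loss. In their final state, however, the saved dict does not include 'optimizer_state_dict', while the --load branch reads checkpoint['optimizer_state_dict'], so resuming from a checkpoint written by this code would raise a KeyError. A minimal sketch of a consistent save/resume pair, assuming the same PyTorch model and optimizer objects as in the patches (the helper names below are illustrative, not part of the repository):

    import torch

    def save_checkpoint(path, model, optimizer, epoch, loss):
        # Persist everything needed to resume training later.
        torch.save(
            {
                "epoch": epoch,
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),  # not yet saved by the patches above
                "loss": loss,
            },
            path,
        )

    def load_checkpoint(path, model, optimizer, device="cpu"):
        # Restore model and optimizer state; returns (last_epoch, last_loss).
        checkpoint = torch.load(path, map_location=device)
        model.load_state_dict(checkpoint["model_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        return checkpoint["epoch"], checkpoint["loss"]

Saving only every third epoch (epoch % 3 == 0) and at the final epoch, as the patch does, keeps disk traffic low; saving the scheduler state as well would additionally make resumption of the OneCycleLR schedule exact, but that is beyond what these commits attempt.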
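The distribution commits wire up the usual PyTorch pattern: derive the process rank from SLURM_PROCID, call dist.init_process_group() with the NCCL backend and the env:// rendezvous (which expects MASTER_ADDR and MASTER_PORT to be set in the environment), hand the training DataLoader a DistributedSampler, and wrap the model in DistributedDataParallel. One caveat in the patch as written: the train loader still passes shuffle=True together with sampler=train_sampler, and DataLoader treats those two options as mutually exclusive, so the distributed code path would raise a ValueError. A short sketch of the intended setup, under the assumption of one process per GPU launched by SLURM (helper names are illustrative, not part of the repository):

    import os

    import torch
    import torch.distributed as dist
    from torch.nn.parallel import DistributedDataParallel as DDP
    from torch.utils.data import DataLoader
    from torch.utils.data.distributed import DistributedSampler

    def setup_distributed(world_size):
        # One process per GPU; the rank comes from the SLURM scheduler as in the patch.
        rank = int(os.environ["SLURM_PROCID"])
        gpu = rank % torch.cuda.device_count()
        dist.init_process_group(
            backend="nccl", init_method="env://", world_size=world_size, rank=rank
        )
        torch.cuda.set_device(gpu)
        return rank, gpu

    def make_train_loader(dataset, batch_size, distributed, collate_fn):
        # shuffle and sampler are mutually exclusive: shuffle only when there is no sampler.
        sampler = DistributedSampler(dataset, shuffle=True) if distributed else None
        loader = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=(sampler is None),
            sampler=sampler,
            num_workers=4,  # illustrative; the patch reuses world_size here (see the TODOs)
            pin_memory=True,
            collate_fn=collate_fn,
        )
        return sampler, loader

    def wrap_model(model, gpu):
        # DDP handles gradient synchronization across processes.
        return DDP(model.cuda(gpu), device_ids=[gpu])

When training with a DistributedSampler, calling sampler.set_epoch(epoch) at the start of each epoch is what actually reshuffles the data between epochs; the patches above do not do this yet.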