From fd3106c2cce565d378def73b0d77b0123f68523b Mon Sep 17 00:00:00 2001 From: JoJoBarthold2 Date: Sat, 19 Aug 2023 13:26:01 +0200 Subject: train now returns loss so it can be saved ( amen ) --- swr2_asr/train.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'swr2_asr') diff --git a/swr2_asr/train.py b/swr2_asr/train.py index 81312d9..6eaf4c1 100644 --- a/swr2_asr/train.py +++ b/swr2_asr/train.py @@ -351,7 +351,9 @@ def train( [{batch_idx * len(spectrograms)}/{data_len} \ ({100.0 * batch_idx / len(train_loader)}%)]\t \ Loss: {loss.item()}" + ) + return loss def test(model, device, test_loader, criterion): @@ -460,7 +462,7 @@ def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3) -> No iter_meter = IterMeter() for epoch in range(1, epochs + 1): - train( + loss = train( model, device, train_loader, -- cgit v1.2.3 From d5568bb9f51c4b586c7bd8537140cb1e201f5840 Mon Sep 17 00:00:00 2001 From: JoJoBarthold2 Date: Sat, 19 Aug 2023 13:29:20 +0200 Subject: also now saves loss ( hahah funny meme) | || || |_ --- swr2_asr/train.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'swr2_asr') diff --git a/swr2_asr/train.py b/swr2_asr/train.py index 6eaf4c1..9a8620f 100644 --- a/swr2_asr/train.py +++ b/swr2_asr/train.py @@ -351,10 +351,8 @@ def train( [{batch_idx * len(spectrograms)}/{data_len} \ ({100.0 * batch_idx / len(train_loader)}%)]\t \ Loss: {loss.item()}" - ) - return loss - + return loss.item() def test(model, device, test_loader, criterion): """Test""" @@ -476,7 +474,7 @@ def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3) -> No torch.save({ 'epoch': epoch, 'model_state_dict': model.state_dict(), - },MODEL_SAVE_PATH) + 'loss': loss},MODEL_SAVE_PATH) test(model=model, device=device, test_loader=test_loader, criterion=criterion) -- cgit v1.2.3 From 631ed7a3f7230cb61023875f3a0945542f6e97a9 Mon Sep 17 00:00:00 2001 From: JoJoBarthold2 Date: Sat, 19 Aug 2023 13:31:28 +0200 Subject: fix --- swr2_asr/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'swr2_asr') diff --git a/swr2_asr/train.py b/swr2_asr/train.py index 9a8620f..2e72dee 100644 --- a/swr2_asr/train.py +++ b/swr2_asr/train.py @@ -352,7 +352,7 @@ def train( ({100.0 * batch_idx / len(train_loader)}%)]\t \ Loss: {loss.item()}" ) - return loss.item() + return loss.item() def test(model, device, test_loader, criterion): """Test""" -- cgit v1.2.3 From aea161ee7f2c96aab529ca22675fb54cdcadbd12 Mon Sep 17 00:00:00 2001 From: JoJoBarthold2 Date: Sat, 19 Aug 2023 14:13:57 +0200 Subject: loading now works --- swr2_asr/train.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'swr2_asr') diff --git a/swr2_asr/train.py b/swr2_asr/train.py index 2e72dee..346be0b 100644 --- a/swr2_asr/train.py +++ b/swr2_asr/train.py @@ -9,8 +9,8 @@ from torch.utils.data import DataLoader import torchaudio from .loss_scores import cer, wer -MODEL_SAVE_PATH = "models/model.pt" -LOSS + + class TextTransform: """Maps characters to integers and vice versa""" @@ -388,7 +388,7 @@ def test(model, device, test_loader, criterion): ) -def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3) -> None: +def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3,load: bool=False, path: str="models/model.pt") -> None: """Runs the training script.""" hparams = { "n_cnn_layers": 3, @@ -446,10 +446,14 @@ def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3) -> No print( "Num Model 
Parameters", sum([param.nelement() for param in model.parameters()]) ) - optimizer = optim.AdamW(model.parameters(), hparams["learning_rate"]) criterion = nn.CTCLoss(blank=28).to(device) - + if load: + checkpoint = torch.load(path) + model.load_state_dict(checkpoint['model_state_dict']) + optimizer.load_state_dict(checkpoint['optimizer_state_dict']) + epoch = checkpoint['epoch'] + loss = checkpoint['loss'] scheduler = optim.lr_scheduler.OneCycleLR( optimizer, max_lr=hparams["learning_rate"], @@ -474,7 +478,7 @@ def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3) -> No torch.save({ 'epoch': epoch, 'model_state_dict': model.state_dict(), - 'loss': loss},MODEL_SAVE_PATH) + 'loss': loss},path) test(model=model, device=device, test_loader=test_loader, criterion=criterion) @@ -482,10 +486,13 @@ def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3) -> No @click.option("--learning-rate", default=1e-3, help="Learning rate") @click.option("--batch_size", default=1, help="Batch size") @click.option("--epochs", default=1, help="Number of epochs") -def run_cli(learning_rate: float, batch_size: int, epochs: int) -> None: +@click.option("--load", default = False, help="Do you want to load a model?") +@click.option("--path",default="models/model.pt", + help= "Path where the model will be saved to/loaded from" ) +def run_cli(learning_rate: float, batch_size: int, epochs: int, load:bool,path:str) -> None: """Runs the training script.""" - run(learning_rate=learning_rate, batch_size=batch_size, epochs=epochs) + run(learning_rate=learning_rate, batch_size=batch_size, epochs=epochs,load= load, path = path) if __name__ == "__main__": - run(learning_rate=5e-4, batch_size=16, epochs=1) + run(learning_rate=5e-4, batch_size=16, epochs=1,load=False, path= "models/model.pt") -- cgit v1.2.3 From 66c37e72ef2dc7c88e1814627f35e506c7c09648 Mon Sep 17 00:00:00 2001 From: Marvin Borner Date: Sat, 19 Aug 2023 14:19:20 +0200 Subject: Started distribution --- swr2_asr/train.py | 41 +++++++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) (limited to 'swr2_asr') diff --git a/swr2_asr/train.py b/swr2_asr/train.py index 81312d9..29c2293 100644 --- a/swr2_asr/train.py +++ b/swr2_asr/train.py @@ -1,11 +1,14 @@ """Training script for the ASR model.""" from AudioLoader.speech import MultilingualLibriSpeech +import os import click import torch import torch.nn as nn import torch.optim as optim import torch.nn.functional as F from torch.utils.data import DataLoader +from torch.utils.data.distributed import DistributedSampler +import torch.distributed as dist import torchaudio from .loss_scores import cer, wer @@ -388,7 +391,7 @@ def test(model, device, test_loader, criterion): ) -def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3) -> None: +def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3, world_size: int = 1) -> None: """Runs the training script.""" hparams = { "n_cnn_layers": 3, @@ -401,6 +404,8 @@ def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3) -> No "learning_rate": learning_rate, "batch_size": batch_size, "epochs": epochs, + "world_size": world_size, + "distributed": world_size > 1, } use_cuda = torch.cuda.is_available() @@ -415,22 +420,34 @@ def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3) -> No "/Volumes/pherkel/SWR2-ASR/", "mls_german_opus", split="test", download=False ) - kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {} + # 
initialize distributed training + ngpus_per_node = torch.cuda.device_count() + if hparams["distributed"]: + if 'SLURM_PROCID' in os.environ: # for slurm scheduler + hparams["rank"] = int(os.environ['SLURM_PROCID']) + hparams["gpu"] = hparams["rank"] % ngpus_per_node + dist.init_process_group(backend="nccl", init_method="env://", + world_size=hparams["world_size"], rank=hparams["rank"]) + train_sampler = DistributedSampler(train_dataset, shuffle=True) train_loader = DataLoader( train_dataset, batch_size=hparams["batch_size"], shuffle=True, + sampler=train_sampler, + num_workers=hparams["world_size"], # TODO? + pin_memory=True, collate_fn=lambda x: data_processing(x, "train"), - **kwargs, ) test_loader = DataLoader( test_dataset, batch_size=hparams["batch_size"], shuffle=True, + sampler=None, + num_workers=hparams["world_size"], # TODO? + pin_memory=True, collate_fn=lambda x: data_processing(x, "train"), - **kwargs, ) model = SpeechRecognitionModel( @@ -443,6 +460,17 @@ def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3) -> No hparams["dropout"], ).to(device) + if hparams["distributed"]: + if "gpu" in hparams: + torch.cuda.set_device(hparams["gpu"]) + model.cuda(hparams["gpu"]) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[hparams["gpu"]]) + model_without_ddp = model.module + else: + model.cuda() + model = torch.nn.parallel.DistributedDataParallel(model) + model_without_ddp = model.module + print( "Num Model Parameters", sum([param.nelement() for param in model.parameters()]) ) @@ -482,9 +510,10 @@ def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3) -> No @click.option("--learning-rate", default=1e-3, help="Learning rate") @click.option("--batch_size", default=1, help="Batch size") @click.option("--epochs", default=1, help="Number of epochs") -def run_cli(learning_rate: float, batch_size: int, epochs: int) -> None: +@click.option("--world_size", default=1, help="Number of nodes for distribution") +def run_cli(learning_rate: float, batch_size: int, epochs: int, world_size: int) -> None: """Runs the training script.""" - run(learning_rate=learning_rate, batch_size=batch_size, epochs=epochs) + run(learning_rate=learning_rate, batch_size=batch_size, epochs=epochs, world_size=world_size) if __name__ == "__main__": -- cgit v1.2.3 From ec177107cb3a1a31d2fc49cc4990413af287305e Mon Sep 17 00:00:00 2001 From: Marvin Borner Date: Sat, 19 Aug 2023 14:40:47 +0200 Subject: Fixed some distribution thingies --- pyproject.toml | 2 +- swr2_asr/train.py | 98 +++++++++++++++++++++++++++++++++++++++---------------- 2 files changed, 71 insertions(+), 29 deletions(-) (limited to 'swr2_asr') diff --git a/pyproject.toml b/pyproject.toml index fdd89a5..1c29b7c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ readme = "readme.md" packages = [{include = "swr2_asr"}] [tool.poetry.dependencies] -python = "~3.10" +python = "^3.10" torch = "2.0.0" torchaudio = "2.0.1" audioloader = {git = "https://github.com/marvinborner/AudioLoader.git"} diff --git a/swr2_asr/train.py b/swr2_asr/train.py index ba002e0..56d10c0 100644 --- a/swr2_asr/train.py +++ b/swr2_asr/train.py @@ -13,8 +13,6 @@ import torchaudio from .loss_scores import cer, wer - - class TextTransform: """Maps characters to integers and vice versa""" @@ -357,6 +355,7 @@ def train( ) return loss.item() + def test(model, device, test_loader, criterion): """Test""" print("\nevaluating...") @@ -391,7 +390,15 @@ def test(model, device, test_loader, criterion): ) -def run(learning_rate: 
float = 5e-4, batch_size: int = 8, epochs: int = 3, world_size: int = 1, load: bool=False, path: str="models/model.pt") -> None: +def run( + learning_rate: float, + batch_size: int, + epochs: int, + world_size: int, + load: bool, + path: str, + dataset_path: str, +) -> None: """Runs the training script.""" hparams = { "n_cnn_layers": 3, @@ -413,29 +420,38 @@ def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3, world device = torch.device("cuda" if use_cuda else "cpu") # pylint: disable=no-member # device = torch.device("mps") + download_dataset = not os.path.isdir(path) train_dataset = MultilingualLibriSpeech( - "/Volumes/pherkel/SWR2-ASR/", "mls_german_opus", split="dev", download=False + dataset_path, "mls_polish_opus", split="dev", download=download_dataset ) test_dataset = MultilingualLibriSpeech( - "/Volumes/pherkel/SWR2-ASR/", "mls_german_opus", split="test", download=False + dataset_path, "mls_polish_opus", split="test", download=False ) # initialize distributed training ngpus_per_node = torch.cuda.device_count() if hparams["distributed"]: - if 'SLURM_PROCID' in os.environ: # for slurm scheduler - hparams["rank"] = int(os.environ['SLURM_PROCID']) + if "SLURM_PROCID" in os.environ: # for slurm scheduler + hparams["rank"] = int(os.environ["SLURM_PROCID"]) hparams["gpu"] = hparams["rank"] % ngpus_per_node - dist.init_process_group(backend="nccl", init_method="env://", - world_size=hparams["world_size"], rank=hparams["rank"]) + dist.init_process_group( + backend="nccl", + init_method="env://", + world_size=hparams["world_size"], + rank=hparams["rank"], + ) - train_sampler = DistributedSampler(train_dataset, shuffle=True) + train_sampler = ( + DistributedSampler(train_dataset, shuffle=True) + if hparams["distributed"] + else None + ) train_loader = DataLoader( train_dataset, batch_size=hparams["batch_size"], shuffle=True, sampler=train_sampler, - num_workers=hparams["world_size"], # TODO? + num_workers=hparams["world_size"], # TODO? pin_memory=True, collate_fn=lambda x: data_processing(x, "train"), ) @@ -445,7 +461,7 @@ def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3, world batch_size=hparams["batch_size"], shuffle=True, sampler=None, - num_workers=hparams["world_size"], # TODO? + num_workers=hparams["world_size"], # TODO? 
pin_memory=True, collate_fn=lambda x: data_processing(x, "train"), ) @@ -464,7 +480,9 @@ def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3, world if "gpu" in hparams: torch.cuda.set_device(hparams["gpu"]) model.cuda(hparams["gpu"]) - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[hparams["gpu"]]) + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[hparams["gpu"]] + ) model_without_ddp = model.module else: model.cuda() @@ -478,10 +496,10 @@ def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3, world criterion = nn.CTCLoss(blank=28).to(device) if load: checkpoint = torch.load(path) - model.load_state_dict(checkpoint['model_state_dict']) - optimizer.load_state_dict(checkpoint['optimizer_state_dict']) - epoch = checkpoint['epoch'] - loss = checkpoint['loss'] + model.load_state_dict(checkpoint["model_state_dict"]) + optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) + epoch = checkpoint["epoch"] + loss = checkpoint["loss"] scheduler = optim.lr_scheduler.OneCycleLR( optimizer, max_lr=hparams["learning_rate"], @@ -502,22 +520,46 @@ def run(learning_rate: float = 5e-4, batch_size: int = 8, epochs: int = 3, world epoch, iter_meter, ) - if epoch%3 == 0 or epoch == epochs: - torch.save({ - 'epoch': epoch, - 'model_state_dict': model.state_dict(), - 'loss': loss},path) + if epoch % 3 == 0 or epoch == epochs: + torch.save( + {"epoch": epoch, "model_state_dict": model.state_dict(), "loss": loss}, + path, + ) test(model=model, device=device, test_loader=test_loader, criterion=criterion) @click.command() -@click.option("--learning-rate", default=1e-3, help="Learning rate") +@click.option("--learning_rate", default=1e-3, help="Learning rate") @click.option("--batch_size", default=1, help="Batch size") @click.option("--epochs", default=1, help="Number of epochs") @click.option("--world_size", default=1, help="Number of nodes for distribution") -@click.option("--load", default = False, help="Do you want to load a model?") -@click.option("--path",default="models/model.pt", - help= "Path where the model will be saved to/loaded from" ) -def run_cli(learning_rate: float, batch_size: int, epochs: int, world_size: int, load: bool, path: str) -> None: +@click.option("--load", default=False, help="Do you want to load a model?") +@click.option( + "--path", + default="models/model.pt", + help="Path where the model will be saved to/loaded from", +) +@click.option( + "--dataset_path", + default="data/", + help="Path for the dataset directory", +) +def run_cli( + learning_rate: float, + batch_size: int, + epochs: int, + world_size: int, + load: bool, + path: str, + dataset_path: str, +) -> None: """Runs the training script.""" - run(learning_rate=learning_rate, batch_size=batch_size, epochs=epochs, world_size=world_size, load=load, path=path) + run( + learning_rate=learning_rate, + batch_size=batch_size, + epochs=epochs, + world_size=world_size, + load=load, + path=path, + dataset_path=dataset_path, + ) -- cgit v1.2.3
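A note on the checkpointing commits above: train() now returns loss.item() — a plain Python float detached from the autograd graph — and the run loop periodically torch.save()s a dict holding the epoch, the model state and that loss. In their final state, however, the saved dict does not include 'optimizer_state_dict', while the --load branch reads checkpoint['optimizer_state_dict'], so resuming from a checkpoint written by this code would raise a KeyError. A minimal sketch of a consistent save/resume pair, assuming the same PyTorch model and optimizer objects as in the patches (the helper names below are illustrative, not part of the repository):

    import torch

    def save_checkpoint(path, model, optimizer, epoch, loss):
        # Persist everything needed to resume training later.
        torch.save(
            {
                "epoch": epoch,
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),  # not yet saved by the patches above
                "loss": loss,
            },
            path,
        )

    def load_checkpoint(path, model, optimizer, device="cpu"):
        # Restore model and optimizer state; returns (last_epoch, last_loss).
        checkpoint = torch.load(path, map_location=device)
        model.load_state_dict(checkpoint["model_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        return checkpoint["epoch"], checkpoint["loss"]

Saving only every third epoch (epoch % 3 == 0) and at the final epoch, as the patch does, keeps disk traffic low; saving the scheduler state as well would additionally make resumption of the OneCycleLR schedule exact, but that is beyond what these commits attempt.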
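The distribution commits wire up the usual PyTorch pattern: derive the process rank from SLURM_PROCID, call dist.init_process_group() with the NCCL backend and the env:// rendezvous (which expects MASTER_ADDR and MASTER_PORT to be set in the environment), hand the training DataLoader a DistributedSampler, and wrap the model in DistributedDataParallel. One caveat in the patch as written: the train loader still passes shuffle=True together with sampler=train_sampler, and DataLoader treats those two options as mutually exclusive, so the distributed code path would raise a ValueError. A short sketch of the intended setup, under the assumption of one process per GPU launched by SLURM (helper names are illustrative, not part of the repository):

    import os

    import torch
    import torch.distributed as dist
    from torch.nn.parallel import DistributedDataParallel as DDP
    from torch.utils.data import DataLoader
    from torch.utils.data.distributed import DistributedSampler

    def setup_distributed(world_size):
        # One process per GPU; the rank comes from the SLURM scheduler as in the patch.
        rank = int(os.environ["SLURM_PROCID"])
        gpu = rank % torch.cuda.device_count()
        dist.init_process_group(
            backend="nccl", init_method="env://", world_size=world_size, rank=rank
        )
        torch.cuda.set_device(gpu)
        return rank, gpu

    def make_train_loader(dataset, batch_size, distributed, collate_fn):
        # shuffle and sampler are mutually exclusive: shuffle only when there is no sampler.
        sampler = DistributedSampler(dataset, shuffle=True) if distributed else None
        loader = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=(sampler is None),
            sampler=sampler,
            num_workers=4,  # illustrative; the patch reuses world_size here (see the TODOs)
            pin_memory=True,
            collate_fn=collate_fn,
        )
        return sampler, loader

    def wrap_model(model, gpu):
        # DDP handles gradient synchronization across processes.
        return DDP(model.cuda(gpu), device_ids=[gpu])

When training with a DistributedSampler, calling sampler.set_epoch(epoch) at the start of each epoch is what actually reshuffles the data between epochs; the patches above do not do this yet.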