-rw-r--r--  config.cluster.yaml              | 12
-rw-r--r--  data/own/swabian.flac            | bin 0 -> 407453 bytes
-rwxr-xr-x  hpc.sh                           | 10
-rwxr-xr-x  hpc_train.sh                     |  2
-rw-r--r--  swr2_asr/inference.py            |  7
-rw-r--r--  swr2_asr/utils/visualization.py  | 36
6 files changed, 44 insertions(+), 23 deletions(-)
diff --git a/config.cluster.yaml b/config.cluster.yaml
index a3def0e..7af0aca 100644
--- a/config.cluster.yaml
+++ b/config.cluster.yaml
@@ -4,18 +4,18 @@ model:
rnn_dim: 512
n_feats: 128 # number of mel features
stride: 2
- dropout: 0.25 # recommended to be around 0.4-0.6 for smaller datasets, 0.1 for really large datasets
+ dropout: 0.2 # recommended to be around 0.4-0.6 for smaller datasets, 0.1 for really large datasets
training:
learning_rate: 0.0005
- batch_size: 64 # recommended: the maximum number that fits on the GPU (a batch size of 32 fits on a 12GB GPU)
+ batch_size: 400 # recommended: the maximum number that fits on the GPU (a batch size of 32 fits on a 12GB GPU)
epochs: 150
eval_every_n: 5 # evaluate every n epochs
- num_workers: 8 # number of workers for dataloader
+ num_workers: 12 # number of workers for dataloader
device: "cuda" # device to run inference on if gpu is available, else "cpu" will be set automatically
dataset:
- download: True
+ download: False
dataset_root_path: "/mnt/lustre/mladm/mfa252/data" # files will be downloaded into this dir
language_name: "mls_german_opus"
limited_supervision: False # set to True if you want to use limited supervision
@@ -26,9 +26,9 @@ tokenizer:
tokenizer_path: "data/tokenizers/char_tokenizer_german.json"
checkpoints:
- model_load_path: "data/runs/epoch31" # path to load model from
+ model_load_path: "data/runs/epoch50" # path to load model from
model_save_path: "data/runs/epoch" # path to save model to
inference:
model_load_path: ~ # path to load model from
- device: "cuda" # device to run inference on if gpu is available, else "cpu" will be set automatically
\ No newline at end of file
+ device: "cuda" # device to run inference on if gpu is available, else "cpu" will be set automatically
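These values feed swr2_asr.train through --config_path. A minimal sketch of reading the changed fields back, assuming the project parses the YAML into a plain dict with PyYAML (the actual loader in swr2_asr may differ):

import yaml  # PyYAML; an assumption here, not confirmed by this diff

with open("config.cluster.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# The fields touched by this commit, read back as plain Python values.
print(config["training"]["batch_size"])   # 400 after this change
print(config["training"]["num_workers"])  # 12 after this change
print(config["dataset"]["download"])      # False: data is already on the cluster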
diff --git a/data/own/swabian.flac b/data/own/swabian.flac
new file mode 100644
index 0000000..69891ba
--- /dev/null
+++ b/data/own/swabian.flac
Binary files differ
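The new swabian.flac looks like a self-recorded sample for trying the model on Swabian speech. It can be inspected with torchaudio, which inference.py already uses; the shape and sample rate noted below are illustrative, not read from the file:

import torchaudio

waveform, sample_rate = torchaudio.load("data/own/swabian.flac")
print(waveform.shape, sample_rate)  # e.g. torch.Size([1, n_samples]) and 16000 or 48000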
diff --git a/hpc.sh b/hpc.sh
index ba0c5eb..718b6d2 100755
--- a/hpc.sh
+++ b/hpc.sh
@@ -2,15 +2,15 @@
#SBATCH --job-name=swr-teamprojekt
#SBATCH --partition=a100
-#SBATCH --time=00:30:00
+#SBATCH --time=24:00:00
### Note: --gres=gpu:x should equal ntasks-per-node
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
-#SBATCH --gres=gpu:a100:1
-#SBATCH --cpus-per-task=8
-#SBATCH --mem=64gb
-#SBATCH --chdir=/mnt/lustre/mladm/mfa252/SWR2-cool-projekt-main/
+#SBATCH --gres=gpu:a100:4
+#SBATCH --cpus-per-task=16
+#SBATCH --mem=32gb
+#SBATCH --chdir=/mnt/lustre/mladm/mfa252/ref/
#SBATCH --output=/mnt/lustre/mladm/mfa252/%x-%j.out
source venv/bin/activate
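The job now requests four A100s and 16 CPUs per task but still a single task; whether training actually uses all four GPUs depends on the training script (multi-GPU via nn.DataParallel would also explain the "module." fix below). A quick sanity check, run inside the allocation, using only standard torch and SLURM names:

import os
import torch

# Should print 4 with --gres=gpu:a100:4 (SLURM sets CUDA_VISIBLE_DEVICES).
print("visible GPUs:", torch.cuda.device_count())
# Should match --cpus-per-task=16.
print("CPUs per task:", os.environ.get("SLURM_CPUS_PER_TASK"))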
diff --git a/hpc_train.sh b/hpc_train.sh
index 2280087..9b21a53 100755
--- a/hpc_train.sh
+++ b/hpc_train.sh
@@ -1,3 +1,3 @@
#!/bin/sh
-yes no | python -m swr2_asr.train --config_path config.cluster.yaml
+python -m swr2_asr.train --config_path config.cluster.yaml
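Dropping the `yes no |` pipe fits the `download: False` change above: `yes no` writes an endless stream of "no" lines to stdin, so any interactive prompt in the training script was being auto-answered with "no". A hypothetical sketch of such a guard (swr2_asr.train's actual prompt, if any, is not shown in this diff):

def confirm_download(download: bool) -> None:
    # Hypothetical prompt of the kind `yes no |` was feeding; with
    # download: False it never fires, so the pipe is unnecessary.
    if download:
        answer = input("Dataset not found. Download it now? [yes/no] ")
        if answer.strip().lower() != "yes":
            raise SystemExit("Aborting: dataset download declined.")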
diff --git a/swr2_asr/inference.py b/swr2_asr/inference.py
index 3f6a44e..3c58af0 100644
--- a/swr2_asr/inference.py
+++ b/swr2_asr/inference.py
@@ -66,7 +66,12 @@ def main(config_path: str, file_path: str) -> None:
).to(device)
checkpoint = torch.load(inference_config["model_load_path"], map_location=device)
- model.load_state_dict(checkpoint["model_state_dict"], strict=True)
+
+ state_dict = {
+ k[len("module.") :] if k.startswith("module.") else k: v
+ for k, v in checkpoint["model_state_dict"].items()
+ }
+ model.load_state_dict(state_dict, strict=True)
model.eval()
waveform, sample_rate = torchaudio.load(file_path) # pylint: disable=no-member
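The "module." prefix appears when a checkpoint is saved from a model wrapped in torch.nn.DataParallel, which matches the move to four GPUs in hpc.sh; the loader above strips the prefix so an unwrapped model can load the weights. An alternative sketch is to unwrap on the save side, so checkpoints load cleanly either way (save_checkpoint is an illustrative helper, not a function from this repo):

import torch
from torch import nn

def save_checkpoint(model: nn.Module, path: str) -> None:
    # nn.DataParallel keeps the real model in .module; saving its state
    # dict keeps parameter keys free of the "module." prefix.
    inner = model.module if isinstance(model, nn.DataParallel) else model
    torch.save({"model_state_dict": inner.state_dict()}, path)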
diff --git a/swr2_asr/utils/visualization.py b/swr2_asr/utils/visualization.py
index a55d0d5..b288c5a 100644
--- a/swr2_asr/utils/visualization.py
+++ b/swr2_asr/utils/visualization.py
@@ -4,19 +4,35 @@ import matplotlib.pyplot as plt
import torch
-def plot(epochs, path):
+def plot(path):
"""Plots the losses over the epochs"""
- losses = []
+ train_losses = []
test_losses = []
cers = []
wers = []
- for epoch in range(1, epochs + 1):
- current_state = torch.load(path + str(epoch))
- losses.append(current_state["loss"])
- test_losses.append(current_state["test_loss"])
- cers.append(current_state["avg_cer"])
- wers.append(current_state["avg_wer"])
- plt.plot(losses)
- plt.plot(test_losses)
+ epoch = 5
+ while True:
+ try:
+ current_state = torch.load(path + str(epoch), map_location=torch.device("cpu"))
+ except FileNotFoundError:
+ break
+ train_losses.append((epoch, current_state["train_loss"].item()))
+ test_losses.append((epoch, current_state["test_loss"]))
+ cers.append((epoch, current_state["avg_cer"]))
+ wers.append((epoch, current_state["avg_wer"]))
+ epoch += 5
+
+ plt.plot(*zip(*train_losses), label="train_loss")
+ plt.plot(*zip(*test_losses), label="test_loss")
+ plt.plot(*zip(*cers), label="cer")
+ plt.plot(*zip(*wers), label="wer")
+ plt.xlabel("epoch")
+ plt.ylabel("score")
+ plt.title("Model performance every 5 epochs")
+ plt.legend()
plt.savefig("losses.svg")
+
+
+if __name__ == "__main__":
+ plot("data/runs/epoch")
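plot() now discovers checkpoints by probing data/runs/epoch5, epoch10, ... until one is missing. For reference, this is the checkpoint layout it assumes, reconstructed from the keys it reads; the real training checkpoints presumably contain more fields, such as the model state:

import torch

checkpoint = {
    "train_loss": torch.tensor(0.42),  # stored as a tensor, hence .item() in plot()
    "test_loss": 0.55,                 # the remaining metrics are plain floats
    "avg_cer": 0.12,
    "avg_wer": 0.31,
}
torch.save(checkpoint, "data/runs/epoch5")  # next: epoch10, epoch15, ...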