diff options
-rw-r--r-- | config.cluster.yaml | 12 | ||||
-rw-r--r-- | data/own/swabian.flac | bin | 0 -> 407453 bytes | |||
-rwxr-xr-x | hpc.sh | 10 | ||||
-rwxr-xr-x | hpc_train.sh | 2 | ||||
-rw-r--r-- | swr2_asr/inference.py | 7 | ||||
-rw-r--r-- | swr2_asr/utils/visualization.py | 36 |
6 files changed, 44 insertions, 23 deletions
diff --git a/config.cluster.yaml b/config.cluster.yaml index a3def0e..7af0aca 100644 --- a/config.cluster.yaml +++ b/config.cluster.yaml @@ -4,18 +4,18 @@ model: rnn_dim: 512 n_feats: 128 # number of mel features stride: 2 - dropout: 0.25 # recommended to be around 0.4-0.6 for smaller datasets, 0.1 for really large datasets + dropout: 0.2 # recommended to be around 0.4-0.6 for smaller datasets, 0.1 for really large datasets training: learning_rate: 0.0005 - batch_size: 64 # recommended to maximum number that fits on the GPU (batch size of 32 fits on a 12GB GPU) + batch_size: 400 # recommended to maximum number that fits on the GPU (batch size of 32 fits on a 12GB GPU) epochs: 150 eval_every_n: 5 # evaluate every n epochs - num_workers: 8 # number of workers for dataloader + num_workers: 12 # number of workers for dataloader device: "cuda" # device to run inference on if gpu is available, else "cpu" will be set automatically dataset: - download: True + download: False dataset_root_path: "/mnt/lustre/mladm/mfa252/data" # files will be downloaded into this dir language_name: "mls_german_opus" limited_supervision: False # set to True if you want to use limited supervision @@ -26,9 +26,9 @@ tokenizer: tokenizer_path: "data/tokenizers/char_tokenizer_german.json" checkpoints: - model_load_path: "data/runs/epoch31" # path to load model from + model_load_path: "data/runs/epoch50" # path to load model from model_save_path: "data/runs/epoch" # path to save model to inference: model_load_path: ~ # path to load model from - device: "cuda" # device to run inference on if gpu is available, else "cpu" will be set automatically
\ No newline at end of file + device: "cuda" # device to run inference on if gpu is available, else "cpu" will be set automatically diff --git a/data/own/swabian.flac b/data/own/swabian.flac Binary files differ new file mode 100644 index 0000000..69891ba --- /dev/null +++ b/data/own/swabian.flac @@ -2,15 +2,15 @@ #SBATCH --job-name=swr-teamprojekt #SBATCH --partition=a100 -#SBATCH --time=00:30:00 +#SBATCH --time=24:00:00 ### Note: --gres=gpu:x should equal to ntasks-per-node #SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 -#SBATCH --gres=gpu:a100:1 -#SBATCH --cpus-per-task=8 -#SBATCH --mem=64gb -#SBATCH --chdir=/mnt/lustre/mladm/mfa252/SWR2-cool-projekt-main/ +#SBATCH --gres=gpu:a100:4 +#SBATCH --cpus-per-task=16 +#SBATCH --mem=32gb +#SBATCH --chdir=/mnt/lustre/mladm/mfa252/ref/ #SBATCH --output=/mnt/lustre/mladm/mfa252/%x-%j.out source venv/bin/activate diff --git a/hpc_train.sh b/hpc_train.sh index 2280087..9b21a53 100755 --- a/hpc_train.sh +++ b/hpc_train.sh @@ -1,3 +1,3 @@ #!/bin/sh -yes no | python -m swr2_asr.train --config_path config.cluster.yaml +python -m swr2_asr.train --config_path config.cluster.yaml diff --git a/swr2_asr/inference.py b/swr2_asr/inference.py index 3f6a44e..3c58af0 100644 --- a/swr2_asr/inference.py +++ b/swr2_asr/inference.py @@ -66,7 +66,12 @@ def main(config_path: str, file_path: str) -> None: ).to(device) checkpoint = torch.load(inference_config["model_load_path"], map_location=device) - model.load_state_dict(checkpoint["model_state_dict"], strict=True) + + state_dict = { + k[len("module.") :] if k.startswith("module.") else k: v + for k, v in checkpoint["model_state_dict"].items() + } + model.load_state_dict(state_dict, strict=True) model.eval() waveform, sample_rate = torchaudio.load(file_path) # pylint: disable=no-member diff --git a/swr2_asr/utils/visualization.py b/swr2_asr/utils/visualization.py index a55d0d5..b288c5a 100644 --- a/swr2_asr/utils/visualization.py +++ b/swr2_asr/utils/visualization.py @@ -4,19 +4,35 @@ import
matplotlib.pyplot as plt import torch -def plot(epochs, path): +def plot(path): """Plots the losses over the epochs""" - losses = [] + train_losses = [] test_losses = [] cers = [] wers = [] - for epoch in range(1, epochs + 1): - current_state = torch.load(path + str(epoch)) - losses.append(current_state["loss"]) - test_losses.append(current_state["test_loss"]) - cers.append(current_state["avg_cer"]) - wers.append(current_state["avg_wer"]) - plt.plot(losses) - plt.plot(test_losses) + epoch = 5 + while True: + try: + current_state = torch.load(path + str(epoch), map_location=torch.device("cpu")) + except FileNotFoundError: + break + train_losses.append((epoch, current_state["train_loss"].item())) + test_losses.append((epoch, current_state["test_loss"])) + cers.append((epoch, current_state["avg_cer"])) + wers.append((epoch, current_state["avg_wer"])) + epoch += 5 + + plt.plot(*zip(*train_losses), label="train_loss") + plt.plot(*zip(*test_losses), label="test_loss") + plt.plot(*zip(*cers), label="cer") + plt.plot(*zip(*wers), label="wer") + plt.xlabel("epoch") + plt.ylabel("score") + plt.title("Model performance for 5n epochs") + plt.legend() plt.savefig("losses.svg") + + +if __name__ == "__main__": + plot("data/runs/epoch") |