From 58b30927bd870604a4077a8af9ec3cad7b0be21c Mon Sep 17 00:00:00 2001
From: Pherkel
Date: Mon, 11 Sep 2023 21:52:42 +0200
Subject: changed config to yaml!

---
 swr2_asr/inference.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'swr2_asr/inference.py')

diff --git a/swr2_asr/inference.py b/swr2_asr/inference.py
index c3eec42..f8342f7 100644
--- a/swr2_asr/inference.py
+++ b/swr2_asr/inference.py
@@ -1,11 +1,12 @@
 """Training script for the ASR model."""
+from typing import TypedDict
+
 import torch
-import torchaudio
 import torch.nn.functional as F
-from typing import TypedDict
+import torchaudio
 
-from swr2_asr.tokenizer import CharTokenizer
 from swr2_asr.model_deep_speech import SpeechRecognitionModel
+from swr2_asr.utils.tokenizer import CharTokenizer
 
 
 class HParams(TypedDict):
@@ -28,8 +29,7 @@ def greedy_decoder(output, tokenizer, collapse_repeated=True):
     arg_maxes = torch.argmax(output, dim=2)  # pylint: disable=no-member
     blank_label = tokenizer.encode(" ").ids[0]
     decodes = []
-    targets = []
-    for i, args in enumerate(arg_maxes):
+    for _i, args in enumerate(arg_maxes):
         decode = []
         for j, index in enumerate(args):
             if index != blank_label:
@@ -44,7 +44,7 @@ def main() -> None:
     """inference function."""
 
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    device = torch.device(device)
+    device = torch.device(device)  # pylint: disable=no-member
 
     tokenizer = CharTokenizer.from_file("char_tokenizer_german.json")
 
@@ -90,7 +90,7 @@ def main() -> None:
     model.load_state_dict(state_dict)
 
     # waveform, sample_rate = torchaudio.load("test.opus")
-    waveform, sample_rate = torchaudio.load("marvin_rede.flac")
+    waveform, sample_rate = torchaudio.load("marvin_rede.flac")  # pylint: disable=no-member
     if sample_rate != spectrogram_hparams["sample_rate"]:
         resampler = torchaudio.transforms.Resample(sample_rate, spectrogram_hparams["sample_rate"])
         waveform = resampler(waveform)
@@ -103,7 +103,7 @@ def main() -> None:
     specs = [spec]
     specs = torch.nn.utils.rnn.pad_sequence(specs, batch_first=True).unsqueeze(1).transpose(2, 3)
 
-    output = model(specs)
+    output = model(specs)  # pylint: disable=not-callable
     output = F.log_softmax(output, dim=2)
     output = output.transpose(0, 1)  # (time, batch, n_class)
     decodes = greedy_decoder(output, tokenizer)
--
cgit v1.2.3
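Note: the greedy_decoder touched by this patch performs best-path CTC decoding: take the argmax class per frame, collapse consecutive repeats, and drop the blank label. A minimal self-contained sketch of the same idea, with a plain id-to-char dict standing in for the CharTokenizer (whose API these patches do not show; the vocabulary below is hypothetical):

    import torch

    id_to_char = {0: " ", 1: "a", 2: "b", 3: "c"}  # hypothetical vocabulary
    blank_id = 0  # the script derives this from the tokenizer; here 0 acts as blank

    output = torch.randn(1, 6, 4).log_softmax(dim=2)  # dummy (batch, time, n_class) log-probs
    arg_maxes = torch.argmax(output, dim=2)           # best class per frame

    for args in arg_maxes:
        decoded = []
        prev = -1  # no previous frame yet
        for index in args:
            index = int(index)
            # collapse consecutive repeats, then drop blanks
            if index != blank_id and index != prev:
                decoded.append(id_to_char[index])
            prev = index
        print("".join(decoded))

In the script itself the blank id comes from the tokenizer (here, the id of " ").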
From 6f5513140f153206cfa91df3077e67ce58043d35 Mon Sep 17 00:00:00 2001
From: Pherkel
Date: Mon, 11 Sep 2023 22:58:19 +0200
Subject: model loading is broken :(

---
 config.philipp.yaml   |   9 +++-
 config.train.yaml     |  28 ----------
 config.yaml           |  34 ++++++++++++
 swr2_asr/inference.py | 140 ++++++++++++++++++++++----------------------
 swr2_asr/train.py     |   2 +-
 5 files changed, 103 insertions(+), 110 deletions(-)
 delete mode 100644 config.train.yaml
 create mode 100644 config.yaml

(limited to 'swr2_asr/inference.py')

diff --git a/config.philipp.yaml b/config.philipp.yaml
index 6b905cd..4a723c6 100644
--- a/config.philipp.yaml
+++ b/config.philipp.yaml
@@ -12,6 +12,7 @@ training:
   epochs: 3
   eval_every_n: 1 # evaluate every n epochs
   num_workers: 4 # number of workers for dataloader
+  device: "cuda" # device to run inference on if gpu is available, else "cpu" will be set automatically
 
 dataset:
   download: True
@@ -25,5 +26,9 @@ tokenizer:
   tokenizer_path: "data/tokenizers/char_tokenizer_german.json"
 
 checkpoints:
-  model_load_path: ~ # path to load model from
-  model_save_path: ~ # path to save model to
\ No newline at end of file
+  model_load_path: "data/runs/epoch30" # path to load model from
+  model_save_path: ~ # path to save model to
+
+inference:
+  model_load_path: "data/runs/epoch30" # path to load model from
+  device: "cuda" # device to run inference on if gpu is available, else "cpu" will be set automatically
\ No newline at end of file
diff --git a/config.train.yaml b/config.train.yaml
deleted file mode 100644
index c82439d..0000000
--- a/config.train.yaml
+++ /dev/null
@@ -1,28 +0,0 @@
-model:
-  n_cnn_layers: 3
-  n_rnn_layers: 5
-  rnn_dim: 512
-  n_feats: 128 # number of mel features
-  stride: 2
-  dropout: 0.25 # recommended to be around 0.4 for smaller datasets, 0.1 for really large datasets
-
-training:
-  learning_rate: 5e-4
-  batch_size: 8 # recommended to maximum number that fits on the GPU (batch size of 32 fits on a 12GB GPU)
-  epochs: 3
-  eval_every_n: 3 # evaluate every n epochs
-  num_workers: 8 # number of workers for dataloader
-
-dataset:
-  download: True
-  dataset_root_path: "YOUR/PATH" # files will be downloaded into this dir
-  language_name: "mls_german_opus"
-  limited_supervision: False # set to True if you want to use limited supervision
-  dataset_percentage: 1.0 # percentage of dataset to use (1.0 = 100%)
-
-tokenizer:
-  tokenizer_path: "data/tokenizers/char_tokenizer_german.yaml"
-
-checkpoints:
-  model_load_path: "YOUR/PATH" # path to load model from
-  model_save_path: "YOUR/PATH" # path to save model to
\ No newline at end of file
diff --git a/config.yaml b/config.yaml
new file mode 100644
index 0000000..e5ff43a
--- /dev/null
+++ b/config.yaml
@@ -0,0 +1,34 @@
+model:
+  n_cnn_layers: 3
+  n_rnn_layers: 5
+  rnn_dim: 512
+  n_feats: 128 # number of mel features
+  stride: 2
+  dropout: 0.3 # recommended to be around 0.4 for smaller datasets, 0.1 for really large datasets
+
+training:
+  learning_rate: 5e-4
+  batch_size: 8 # recommended to maximum number that fits on the GPU (batch size of 32 fits on a 12GB GPU)
+  epochs: 3
+  eval_every_n: 3 # evaluate every n epochs
+  num_workers: 8 # number of workers for dataloader
+
+dataset:
+  download: True
+  dataset_root_path: "YOUR/PATH" # files will be downloaded into this dir
+  language_name: "mls_german_opus"
+  limited_supervision: False # set to True if you want to use limited supervision
+  dataset_percentage: 1.0 # percentage of dataset to use (1.0 = 100%)
+  shuffle: True
+
+tokenizer:
+  tokenizer_path: "data/tokenizers/char_tokenizer_german.yaml"
+
+checkpoints:
+  model_load_path: "YOUR/PATH" # path to load model from
+  model_save_path: "YOUR/PATH" # path to save model to
+
+inference:
+  model_load_path: "YOUR/PATH" # path to load model from
+  beam_width: 10 # beam width for beam search
+  device: "cuda" # device to run inference on if gpu is available, else "cpu" will be set automatically
\ No newline at end of file
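Note: the new config.yaml groups all settings by top-level key, and each consumer reads only its own section. A minimal sketch of how the sections are pulled apart (mirroring the inference.py rewrite below; it assumes config.yaml exists in the working directory):

    import yaml

    with open("config.yaml", "r", encoding="utf-8") as yaml_file:
        config_dict = yaml.safe_load(yaml_file)

    # one dict per top-level key; a missing section defaults to {}
    model_config = config_dict.get("model", {})          # n_cnn_layers, rnn_dim, n_feats, ...
    inference_config = config_dict.get("inference", {})  # model_load_path, device, ...
    print(model_config["n_feats"], inference_config["device"])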
diff --git a/swr2_asr/inference.py b/swr2_asr/inference.py
index f8342f7..6495a9a 100644
--- a/swr2_asr/inference.py
+++ b/swr2_asr/inference.py
@@ -1,35 +1,20 @@
 """Training script for the ASR model."""
-from typing import TypedDict
-
+import click
 import torch
 import torch.nn.functional as F
 import torchaudio
+import yaml
 
 from swr2_asr.model_deep_speech import SpeechRecognitionModel
 from swr2_asr.utils.tokenizer import CharTokenizer
 
 
-class HParams(TypedDict):
-    """Type for the hyperparameters of the model."""
-
-    n_cnn_layers: int
-    n_rnn_layers: int
-    rnn_dim: int
-    n_class: int
-    n_feats: int
-    stride: int
-    dropout: float
-    learning_rate: float
-    batch_size: int
-    epochs: int
-
-
-def greedy_decoder(output, tokenizer, collapse_repeated=True):
+def greedy_decoder(output, tokenizer: CharTokenizer, collapse_repeated=True):
     """Greedily decode a sequence."""
     arg_maxes = torch.argmax(output, dim=2)  # pylint: disable=no-member
-    blank_label = tokenizer.encode(" ").ids[0]
+    blank_label = tokenizer.get_blank_token()
     decodes = []
-    for _i, args in enumerate(arg_maxes):
+    for args in arg_maxes:
         decode = []
         for j, index in enumerate(args):
             if index != blank_label:
@@ -40,75 +25,72 @@ def greedy_decoder(output, tokenizer, collapse_repeated=True):
     return decodes
 
 
-def main() -> None:
+@click.command()
+@click.option(
+    "--config_path",
+    default="config.yaml",
+    help="Path to yaml config file",
+    type=click.Path(exists=True),
+)
+@click.option(
+    "--file_path",
+    help="Path to audio file",
+    type=click.Path(exists=True),
+)
+def main(config_path: str, file_path: str) -> None:
     """inference function."""
-
-    device = "cuda" if torch.cuda.is_available() else "cpu"
+    with open(config_path, "r", encoding="utf-8") as yaml_file:
+        config_dict = yaml.safe_load(yaml_file)
+
+    # Create separate dictionaries for each top-level key
+    model_config = config_dict.get("model", {})
+    tokenizer_config = config_dict.get("tokenizer", {})
+    inference_config = config_dict.get("inference", {})
+
+    if inference_config["device"] == "cpu":
+        device = "cpu"
+    elif inference_config["device"] == "cuda":
+        device = "cuda" if torch.cuda.is_available() else "cpu"
     device = torch.device(device)  # pylint: disable=no-member
 
-    tokenizer = CharTokenizer.from_file("char_tokenizer_german.json")
-
-    spectrogram_hparams = {
-        "sample_rate": 16000,
-        "n_fft": 400,
-        "win_length": 400,
-        "hop_length": 160,
-        "n_mels": 128,
-        "f_min": 0,
-        "f_max": 8000,
-        "power": 2.0,
-    }
-
-    hparams = HParams(
-        n_cnn_layers=3,
-        n_rnn_layers=5,
-        rnn_dim=512,
-        n_class=tokenizer.get_vocab_size(),
-        n_feats=128,
-        stride=2,
-        dropout=0.1,
-        learning_rate=0.1,
-        batch_size=30,
-        epochs=100,
-    )
+    tokenizer = CharTokenizer.from_file(tokenizer_config["tokenizer_path"])
 
     model = SpeechRecognitionModel(
-        hparams["n_cnn_layers"],
-        hparams["n_rnn_layers"],
-        hparams["rnn_dim"],
-        hparams["n_class"],
-        hparams["n_feats"],
-        hparams["stride"],
-        hparams["dropout"],
+        model_config["n_cnn_layers"],
+        model_config["n_rnn_layers"],
+        model_config["rnn_dim"],
+        tokenizer.get_vocab_size(),
+        model_config["n_feats"],
+        model_config["stride"],
+        model_config["dropout"],
     ).to(device)
 
-    checkpoint = torch.load("model8", map_location=device)
-    state_dict = {
-        k[len("module.") :] if k.startswith("module.") else k: v
-        for k, v in checkpoint["model_state_dict"].items()
-    }
-    model.load_state_dict(state_dict)
-
-    # waveform, sample_rate = torchaudio.load("test.opus")
-    waveform, sample_rate = torchaudio.load("marvin_rede.flac")  # pylint: disable=no-member
-    if sample_rate != spectrogram_hparams["sample_rate"]:
-        resampler = torchaudio.transforms.Resample(sample_rate, spectrogram_hparams["sample_rate"])
+    checkpoint = torch.load(inference_config["model_load_path"], map_location=device)
+    print(checkpoint["model_state_dict"].keys())
+    model.load_state_dict(checkpoint["model_state_dict"], strict=False)
+    model.eval()
+    waveform, sample_rate = torchaudio.load(file_path)  # pylint: disable=no-member
+    if waveform.shape[0] != 1:
+        waveform = waveform[1]
+        waveform = waveform.unsqueeze(0)
+    if sample_rate != 16000:
+        resampler = torchaudio.transforms.Resample(sample_rate, 16000)
         waveform = resampler(waveform)
+        sample_rate = 16000
+
+    data_processing = torchaudio.transforms.MelSpectrogram(n_mels=model_config["n_feats"])
+
+    spec = data_processing(waveform).squeeze(0).transpose(0, 1)
 
-    spec = (
-        torchaudio.transforms.MelSpectrogram(**spectrogram_hparams)(waveform)
-        .squeeze(0)
-        .transpose(0, 1)
-    )
-    specs = [spec]
-    specs = torch.nn.utils.rnn.pad_sequence(specs, batch_first=True).unsqueeze(1).transpose(2, 3)
+    spec = spec.unsqueeze(0)
+    spec = spec.transpose(1, 2)
+    spec = spec.unsqueeze(0)
+    output = model(spec)  # pylint: disable=not-callable
+    output = F.log_softmax(output, dim=2)  # (batch, time, n_class)
+    decoded_preds = greedy_decoder(output, tokenizer)
 
-    output = model(specs)  # pylint: disable=not-callable
-    output = F.log_softmax(output, dim=2)
-    output = output.transpose(0, 1)  # (time, batch, n_class)
-    decodes = greedy_decoder(output, tokenizer)
-    print(decodes)
+    print(decoded_preds)
 
 
 if __name__ == "__main__":
-    main()
+    main()  # pylint: disable=no-value-for-parameter
diff --git a/swr2_asr/train.py b/swr2_asr/train.py
index ca70d21..ec25918 100644
--- a/swr2_asr/train.py
+++ b/swr2_asr/train.py
@@ -263,7 +263,7 @@ def main(config_path: str):
     prev_epoch = 0
 
     if checkpoints_config["model_load_path"] is not None:
-        checkpoint = torch.load(checkpoints_config["model_load_path"])
+        checkpoint = torch.load(checkpoints_config["model_load_path"], map_location=device)
         model.load_state_dict(checkpoint["model_state_dict"])
         optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
         prev_epoch = checkpoint["epoch"]
--
cgit v1.2.3
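Note: after the click rewrite, both arguments arrive via the command line and are validated by click.Path(exists=True) before main() runs. Assuming the script is invoked as a module from the repository root (an assumption; the entry point is not shown in these patches), an invocation might look like:

    python -m swr2_asr.inference --config_path config.yaml --file_path path/to/audio.flac

--config_path falls back to config.yaml when omitted; --file_path has no default and must be supplied.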
From 96fee5f59f67187292ddf37db4660c5085fb66b5 Mon Sep 17 00:00:00 2001
From: Pherkel
Date: Mon, 11 Sep 2023 23:08:45 +0200
Subject: changed name to match pre-trained weights

---
 swr2_asr/inference.py         |  4 +--
 swr2_asr/model_deep_speech.py | 68 +++++++++++++++----------------------------
 2 files changed, 25 insertions(+), 47 deletions(-)

(limited to 'swr2_asr/inference.py')

diff --git a/swr2_asr/inference.py b/swr2_asr/inference.py
index 6495a9a..3f6a44e 100644
--- a/swr2_asr/inference.py
+++ b/swr2_asr/inference.py
@@ -66,9 +66,9 @@ def main(config_path: str, file_path: str) -> None:
     ).to(device)
 
     checkpoint = torch.load(inference_config["model_load_path"], map_location=device)
-    print(checkpoint["model_state_dict"].keys())
-    model.load_state_dict(checkpoint["model_state_dict"], strict=False)
+    model.load_state_dict(checkpoint["model_state_dict"], strict=True)
     model.eval()
+
     waveform, sample_rate = torchaudio.load(file_path)  # pylint: disable=no-member
     if waveform.shape[0] != 1:
         waveform = waveform[1]
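Note: the renames in the model file below matter because load_state_dict matches checkpoint entries against keys derived from module attribute names; with strict=True, a checkpoint saved from a model whose GRU attribute was called BiGRU only loads into a model that uses the same attribute name. A small illustration (hypothetical module, not the project's class):

    import torch
    from torch import nn

    class TinyRNN(nn.Module):
        def __init__(self):
            super().__init__()
            self.BiGRU = nn.GRU(input_size=4, hidden_size=4, batch_first=True)

    # keys are built from the attribute name, e.g. 'BiGRU.weight_ih_l0'
    print(list(TinyRNN().state_dict().keys()))

Renaming the attribute to bi_gru would rename every one of these keys and make a strict load of the old checkpoint fail with missing/unexpected keys.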
diff --git a/swr2_asr/model_deep_speech.py b/swr2_asr/model_deep_speech.py
index 77f4c8a..73f5a81 100644
--- a/swr2_asr/model_deep_speech.py
+++ b/swr2_asr/model_deep_speech.py
@@ -10,8 +10,8 @@ from torch import nn
 class CNNLayerNorm(nn.Module):
     """Layer normalization built for cnns input"""
 
-    def __init__(self, n_feats: int):
-        super().__init__()
+    def __init__(self, n_feats):
+        super(CNNLayerNorm, self).__init__()
         self.layer_norm = nn.LayerNorm(n_feats)
 
     def forward(self, data):
@@ -22,34 +22,22 @@ class CNNLayerNorm(nn.Module):
 
 
 class ResidualCNN(nn.Module):
-    """Residual CNN inspired by https://arxiv.org/pdf/1603.05027.pdf"""
+    """Residual CNN inspired by https://arxiv.org/pdf/1603.05027.pdf
+    except with layer norm instead of batch norm
+    """
 
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        kernel: int,
-        stride: int,
-        dropout: float,
-        n_feats: int,
-    ):
-        super().__init__()
+    def __init__(self, in_channels, out_channels, kernel, stride, dropout, n_feats):
+        super(ResidualCNN, self).__init__()
         self.cnn1 = nn.Conv2d(in_channels, out_channels, kernel, stride, padding=kernel // 2)
-        self.cnn2 = nn.Conv2d(
-            out_channels,
-            out_channels,
-            kernel,
-            stride,
-            padding=kernel // 2,
-        )
+        self.cnn2 = nn.Conv2d(out_channels, out_channels, kernel, stride, padding=kernel // 2)
         self.dropout1 = nn.Dropout(dropout)
         self.dropout2 = nn.Dropout(dropout)
         self.layer_norm1 = CNNLayerNorm(n_feats)
         self.layer_norm2 = CNNLayerNorm(n_feats)
 
     def forward(self, data):
-        """x (batch, channel, feature, time)"""
+        """data (batch, channel, feature, time)"""
         residual = data  # (batch, channel, feature, time)
         data = self.layer_norm1(data)
         data = F.gelu(data)
@@ -64,18 +52,12 @@ class ResidualCNN(nn.Module):
 
 
 class BidirectionalGRU(nn.Module):
-    """Bidirectional GRU with Layer Normalization and Dropout"""
+    """Bidirectional GRU layer"""
 
-    def __init__(
-        self,
-        rnn_dim: int,
-        hidden_size: int,
-        dropout: float,
-        batch_first: bool,
-    ):
-        super().__init__()
+    def __init__(self, rnn_dim, hidden_size, dropout, batch_first):
+        super(BidirectionalGRU, self).__init__()
 
-        self.bi_gru = nn.GRU(
+        self.BiGRU = nn.GRU(  # pylint: disable=invalid-name
            input_size=rnn_dim,
            hidden_size=hidden_size,
            num_layers=1,
@@ -86,11 +68,11 @@ class BidirectionalGRU(nn.Module):
         self.dropout = nn.Dropout(dropout)
 
     def forward(self, data):
-        """data (batch, time, feature)"""
+        """x (batch, time, feature)"""
         data = self.layer_norm(data)
         data = F.gelu(data)
+        data, _ = self.BiGRU(data)
         data = self.dropout(data)
-        data, _ = self.bi_gru(data)
         return data
 
 
@@ -98,18 +80,14 @@ class SpeechRecognitionModel(nn.Module):
     """Speech Recognition Model Inspired by DeepSpeech 2"""
 
     def __init__(
-        self,
-        n_cnn_layers: int,
-        n_rnn_layers: int,
-        rnn_dim: int,
-        n_class: int,
-        n_feats: int,
-        stride: int = 2,
-        dropout: float = 0.1,
+        self, n_cnn_layers, n_rnn_layers, rnn_dim, n_class, n_feats, stride=2, dropout=0.1
     ):
-        super().__init__()
-        n_feats //= 2
-        self.cnn = nn.Conv2d(1, 32, 3, stride=stride, padding=3 // 2)
+        super(SpeechRecognitionModel, self).__init__()
+        n_feats = n_feats // 2
+        self.cnn = nn.Conv2d(
+            1, 32, 3, stride=stride, padding=3 // 2
+        )  # cnn for extracting heirachal features
+
         # n residual cnn layers with filter size of 32
         self.rescnn_layers = nn.Sequential(
             *[
@@ -137,7 +115,7 @@ class SpeechRecognitionModel(nn.Module):
         )
 
     def forward(self, data):
-        """data (batch, channel, feature, time)"""
+        """x (batch, channel, feature, time)"""
         data = self.cnn(data)
         data = self.rescnn_layers(data)
         sizes = data.size()
--
cgit v1.2.3
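Note: the model's forward expects input shaped (batch, channel, feature, time), which is what the chain of squeeze/transpose/unsqueeze calls in inference.py produces. A shape walk-through under the repository's assumed defaults (16 kHz mono audio, n_feats=128 as in config.yaml):

    import torch
    import torchaudio

    waveform = torch.randn(1, 16000)  # hypothetical one-second mono clip
    mel = torchaudio.transforms.MelSpectrogram(n_mels=128)

    spec = mel(waveform).squeeze(0).transpose(0, 1)  # (time, n_mels)
    spec = spec.unsqueeze(0)     # (1, time, n_mels)
    spec = spec.transpose(1, 2)  # (1, n_mels, time)
    spec = spec.unsqueeze(0)     # (1, 1, n_mels, time) = (batch, channel, feature, time)
    print(spec.shape)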