diff options
author | Pherkel | 2023-09-18 18:13:46 +0200 |
---|---|---|
committer | GitHub | 2023-09-18 18:13:46 +0200 |
commit | f94506764bde3e4d41dc593e9d11aa7330c00e30 (patch) | |
tree | 6fc438536a72e195805c1aea97926f4c9bbd4f85 /swr2_asr/utils/data.py | |
parent | 8b3a0b47813733ef67befa6959a4d24f8518b5b7 (diff) | |
parent | 21a3b1d7cc8544fa0031b8934283382bdfd1d8f1 (diff) |
Merge pull request #38 from Algo-Boys/decoder
Decoder
Diffstat (limited to 'swr2_asr/utils/data.py')
-rw-r--r-- | swr2_asr/utils/data.py | 20 |
1 files changed, 19 insertions, 1 deletions
diff --git a/swr2_asr/utils/data.py b/swr2_asr/utils/data.py
index d551c98..74cd572 100644
--- a/swr2_asr/utils/data.py
+++ b/swr2_asr/utils/data.py
@@ -6,7 +6,7 @@ import numpy as np
 import torch
 import torchaudio
 from torch import Tensor, nn
-from torch.utils.data import DataLoader, Dataset
+from torch.utils.data import Dataset
 from torchaudio.datasets.utils import _extract_tar
 
 from swr2_asr.utils.tokenizer import CharTokenizer
@@ -343,3 +343,21 @@ class MLSDataset(Dataset):
             dataset_lookup_entry["chapterid"],
             idx,
         )  # type: ignore
+
+
+def create_lexicon(vocab_counts_path, lexicon_path):
+    """Create a lexicon from the vocab_counts.txt file"""
+    words_list = []
+    with open(vocab_counts_path, "r", encoding="utf-8") as file:
+        for line in file:
+            words = line.split()
+            if len(words) >= 1:
+                word = words[0]
+                words_list.append(word)
+
+    with open(lexicon_path, "w", encoding="utf-8") as file:
+        for word in words_list:
+            file.write(f"{word} ")
+            for char in word:
+                file.write(char + " ")
+            file.write("<SPACE>")