diff options
author | JoJoBarthold2 | 2023-09-16 15:48:16 +0200 |
---|---|---|
committer | JoJoBarthold2 | 2023-09-16 15:48:16 +0200 |
commit | ec8bfe9df205608282e5297635363fc8fc8fe55b (patch) | |
tree | 8cf949da526bd59f2b44ed0a59b4352ab84e4147 /swr2_asr/utils | |
parent | ea42dd50f167307d52fb128823904fe46f1118ec (diff) |
Created a method that writes the lexicon to a file
Diffstat (limited to 'swr2_asr/utils')
-rw-r--r-- | swr2_asr/utils/data.py | 21 |
1 files changed, 21 insertions, 0 deletions
def create_lexicon(vocab_counts_path, lexicon_path):
    """Write a character-level lexicon built from a vocabulary file.

    The first whitespace-separated token of every non-blank line in
    ``vocab_counts_path`` is taken as a word; anything after it on the
    line is ignored. For each word, one line is written to
    ``lexicon_path``: the word, then its characters separated by single
    spaces, then a closing ``|``. Example: ``able a b l e |``.

    Args:
        vocab_counts_path: Path of the input text file with one word
            (optionally followed by further columns) per line.
        lexicon_path: Path of the lexicon file to create. An existing
            file is overwritten.
    """
    # Explicit UTF-8 keeps non-ASCII words intact regardless of the
    # platform's default encoding.
    with open(vocab_counts_path, "r", encoding="utf-8") as vocab_file:
        split_lines = (line.split() for line in vocab_file)
        # Blank / whitespace-only lines split to an empty list and are skipped.
        words = [fields[0] for fields in split_lines if fields]

    with open(lexicon_path, "w", encoding="utf-8") as lexicon_file:
        # One entry per line; without the trailing newline every entry
        # would run together on a single line.
        lexicon_file.writelines(
            f"{word} {' '.join(word)} |\n" for word in words
        )
\ No newline at end of file |