# Recovered from commit ec8bfe9 ("created a method that prints the lexicon"),
# which added this function to swr2_asr/utils/data.py.
def create_lexicon(vocab_counts_path, lexicon_path):
    """Write a character-level lexicon built from a vocabulary-count file.

    The first whitespace-separated token of every non-blank line in
    ``vocab_counts_path`` is taken as a word. Each word is written to
    ``lexicon_path`` on its own line as the word itself, followed by its
    characters separated by single spaces and a terminating ``|``,
    e.g. ``cat c a t |``.

    Args:
        vocab_counts_path: Path of the input file. Only the first token of
            each line is used; anything after it is ignored.
        lexicon_path: Path of the lexicon file to create or overwrite.

    Raises:
        OSError: If either file cannot be opened.
    """
    # Read every word before opening the output, so the input is fully
    # consumed before anything is truncated or written.
    # Explicit UTF-8 keeps non-ASCII words independent of the platform default.
    with open(vocab_counts_path, "r", encoding="utf-8") as vocab_file:
        words = [tokens[0] for tokens in map(str.split, vocab_file) if tokens]

    with open(lexicon_path, "w", encoding="utf-8") as lexicon_file:
        # One entry per line: without the trailing newline all entries would
        # run together on a single line.
        lexicon_file.writelines(
            f"{word} {' '.join(word)} |\n" for word in words
        )