-rw-r--r-- | .gitignore                                 |  11 |
-rw-r--r-- | data/tokenizers/char_tokenizer_german.json |  38 |
-rw-r--r-- | swr2_asr/utils/tokenizer.py                | 110 |
3 files changed, 101 insertions, 58 deletions
diff --git a/.gitignore b/.gitignore
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,7 @@
 # Training files
-data/
+data/*
+!data/tokenizers
+
 
 # Mac
 **/.DS_Store
@@ -163,10 +165,3 @@ dmypy.json
 
 # Cython debug symbols
 cython_debug/
-
-# PyCharm
-# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
-# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
-# and can be added to the global gitignore or merged into this file. For a more nuclear
-# option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
diff --git a/data/tokenizers/char_tokenizer_german.json b/data/tokenizers/char_tokenizer_german.json
new file mode 100644
index 0000000..20db079
--- /dev/null
+++ b/data/tokenizers/char_tokenizer_german.json
@@ -0,0 +1,38 @@
+_ 0
+<BLANK> 1
+<UNK> 2
+<SPACE> 3
+a 4
+b 5
+c 6
+d 7
+e 8
+f 9
+g 10
+h 11
+i 12
+j 13
+k 14
+l 15
+m 16
+n 17
+o 18
+p 19
+q 20
+r 21
+s 22
+t 23
+u 24
+v 25
+w 26
+x 27
+y 28
+z 29
+é 30
+à 31
+ä 32
+ö 33
+ß 34
+ü 35
+- 36
+' 37
diff --git a/swr2_asr/utils/tokenizer.py b/swr2_asr/utils/tokenizer.py
index d92465a..5482bbe 100644
--- a/swr2_asr/utils/tokenizer.py
+++ b/swr2_asr/utils/tokenizer.py
@@ -1,59 +1,18 @@
 """Tokenizer for Multilingual Librispeech datasets"""
+from datetime import datetime
+import os
+
+from tqdm.autonotebook import tqdm
+
+
 class CharTokenizer:
     """Maps characters to integers and vice versa"""
 
     def __init__(self):
-        char_map_str = """
-        _
-        <BLANK>
-        <UNK>
-        <SPACE>
-        a
-        b
-        c
-        d
-        e
-        f
-        g
-        h
-        i
-        j
-        k
-        l
-        m
-        n
-        o
-        p
-        q
-        r
-        s
-        t
-        u
-        v
-        w
-        x
-        y
-        z
-        é
-        à
-        ä
-        ö
-        ß
-        ü
-        -
-        '
-
-        """
-
         self.char_map = {}
         self.index_map = {}
-        for idx, char in enumerate(char_map_str.strip().split("\n")):
-            char = char.strip()
-            self.char_map[char] = idx
-            self.index_map[idx] = char
-        self.index_map[1] = " "
 
     def encode(self, text: str) -> list[int]:
         """Use a character map and convert text to an integer sequence"""
@@ -91,7 +50,59 @@ class CharTokenizer:
         """Get the integer representation of the <SPACE> character"""
         return self.char_map["<SPACE>"]
 
-    # TODO: add train function
+    @staticmethod
+    def train(dataset_path: str, language: str) -> "CharTokenizer":
+        """Train the tokenizer on a dataset"""
+        chars = set()
+        root_path = os.path.join(dataset_path, language)
+        for split in os.listdir(root_path):
+            split_dir = os.path.join(root_path, split)
+            if os.path.isdir(split_dir):
+                transcript_path = os.path.join(split_dir, "transcripts.txt")
+
+                with open(transcript_path, "r", encoding="utf-8") as transcripts:
+                    lines = transcripts.readlines()
+                    lines = [line.split(" ", 1)[1] for line in lines]
+                    lines = [line.strip() for line in lines]
+                    lines = [line.lower() for line in lines]
+
+                for line in tqdm(lines, desc=f"Training tokenizer on {split_dir} split"):
+                    chars.update(line)
+
+        # sort chars; the literal space is represented by <SPACE> instead
+        chars.remove(" ")
+        chars = sorted(chars)
+
+        train_tokenizer = CharTokenizer()
+
+        train_tokenizer.char_map["_"] = 0
+        train_tokenizer.char_map["<BLANK>"] = 1
+        train_tokenizer.char_map["<UNK>"] = 2
+        train_tokenizer.char_map["<SPACE>"] = 3
+
+        train_tokenizer.index_map[0] = "_"
+        train_tokenizer.index_map[1] = "<BLANK>"
+        train_tokenizer.index_map[2] = "<UNK>"
+        train_tokenizer.index_map[3] = "<SPACE>"
+
+        offset = 4
+
+        for idx, char in enumerate(chars):
+            idx += offset
+            train_tokenizer.char_map[char] = idx
+            train_tokenizer.index_map[idx] = char
+
+        train_tokenizer_dir = os.path.join("data/tokenizers")
+        train_tokenizer_path = os.path.join(
+            train_tokenizer_dir,
+            f"char_tokenizer_{language}_{datetime.now().strftime('%Y-%m-%d_%H-%M')}.json",
+        )
+
+        if not os.path.exists(train_tokenizer_dir):
+            os.makedirs(train_tokenizer_dir)
+        train_tokenizer.save(train_tokenizer_path)
+
+        return train_tokenizer
 
     def save(self, path: str) -> None:
         """Save the tokenizer to a file"""
@@ -114,8 +125,7 @@ class CharTokenizer:
 
 
 if __name__ == "__main__":
-    tokenizer = CharTokenizer()
-    tokenizer.save("data/tokenizers/char_tokenizer_german.json")
+    tokenizer = CharTokenizer.train("/Volumes/pherkel 1/SWR2-ASR", "mls_german_opus")
     print(tokenizer.char_map)
     print(tokenizer.index_map)
    print(tokenizer.get_vocab_size())