"""Tokenizer for Multilingual Librispeech datasets""" class CharTokenizer: """Maps characters to integers and vice versa""" def __init__(self): char_map_str = """ _ a b c d e f g h i j k l m n o p q r s t u v w x y z é à ä ö ß ü - ' """ self.char_map = {} self.index_map = {} for idx, char in enumerate(char_map_str.strip().split("\n")): char = char.strip() self.char_map[char] = idx self.index_map[idx] = char self.index_map[1] = " " def encode(self, text: str) -> list[int]: """Use a character map and convert text to an integer sequence""" int_sequence = [] for char in text: if char == " ": char = self.char_map[""] elif char not in self.char_map: char = self.char_map[""] else: char = self.char_map[char] int_sequence.append(char) return int_sequence def decode(self, labels: list[int]) -> str: """Use a character map and convert integer labels to an text sequence""" string = [] for i in labels: string.append(self.index_map[i]) return "".join(string).replace("", " ") def get_vocab_size(self) -> int: """Get the number of unique characters in the dataset""" return len(self.char_map) def get_blank_token(self) -> int: """Get the integer representation of the character""" return self.char_map[""] def get_unk_token(self) -> int: """Get the integer representation of the character""" return self.char_map[""] def get_space_token(self) -> int: """Get the integer representation of the character""" return self.char_map[""] # TODO: add train function def save(self, path: str) -> None: """Save the tokenizer to a file""" with open(path, "w", encoding="utf-8") as file: for char, index in self.char_map.items(): file.write(f"{char} {index}\n") @staticmethod def from_file(tokenizer_file: str) -> "CharTokenizer": """Instantiate a CharTokenizer from a file""" load_tokenizer = CharTokenizer() with open(tokenizer_file, "r", encoding="utf-8") as file: for line in file: line = line.strip() if line: char, index = line.split() tokenizer.char_map[char] = int(index) tokenizer.index_map[int(index)] = char return load_tokenizer if __name__ == "__main__": tokenizer = CharTokenizer() tokenizer.save("data/tokenizers/char_tokenizer_german.json") print(tokenizer.char_map) print(tokenizer.index_map) print(tokenizer.get_vocab_size()) print(tokenizer.get_blank_token()) print(tokenizer.get_unk_token()) print(tokenizer.get_space_token()) print(tokenizer.encode("hallo welt")) print(tokenizer.decode([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))