"""Tokenizer for use with Multilingual Librispeech"""
import json
import os
from dataclasses import dataclass
from typing import Type
import click
from tokenizers import Tokenizer, normalizers
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer
from tqdm import tqdm
class TokenizerType:
"""Base class for tokenizers.
    Exposes the same interface as tokenizers from the Hugging Face `tokenizers` library."""
def encode(self, sequence: str) -> list[int]:
"""Encode a sequence to a list of integer labels"""
raise NotImplementedError
def decode(self, labels: list[int], remove_special_tokens: bool) -> str:
"""Decode a list of integer labels to a sequence"""
raise NotImplementedError
def decode_batch(self, labels: list[list[int]]) -> list[str]:
"""Decode a batch of integer labels to a list of sequences"""
raise NotImplementedError
def get_vocab_size(self) -> int:
"""Get the size of the vocabulary"""
raise NotImplementedError
def enable_padding(
self,
length: int = -1,
direction: str = "right",
pad_id: int = 0,
pad_type_id: int = 0,
pad_token: str = "[PAD]",
) -> None:
"""Enable padding for the tokenizer"""
raise NotImplementedError
def save(self, path: str) -> None:
"""Save the tokenizer to a file"""
raise NotImplementedError
@staticmethod
def from_file(path: str) -> "TokenizerType":
"""Load the tokenizer from a file"""
raise NotImplementedError
MyTokenizerType = Type[TokenizerType]
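# The alias above lets other modules accept a tokenizer *class* (either CharTokenizer
# below or a wrapper around a Hugging Face tokenizer) and load it themselves.
# Hypothetical sketch, not part of this module:
#
#   def load_tokenizer(tokenizer_cls: MyTokenizerType, path: str) -> TokenizerType:
#       return tokenizer_cls.from_file(path)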
@dataclass
class Encoding:
"""Simple dataclass to represent an encoding"""
ids: list[int]
tokens: list[str]
class CharTokenizer(TokenizerType):
"""Very simple tokenizer for use with Multilingual Librispeech
Simply checks what characters are in the dataset and uses them as tokens.
Exposes the same interface as tokenizers from the huggingface library, i.e.
encode, decode, decode_batch, get_vocab_size, save, from_file and train.
"""
def __init__(self):
self.char_map = {}
self.index_map = {}
self.add_tokens(["<UNK>", "<SPACE>"])
def add_tokens(self, tokens: list[str]):
"""Manually add tokens to the tokenizer
Args:
tokens (list[str]): List of tokens to add
"""
for token in tokens:
if token not in self.char_map:
self.char_map[token] = len(self.char_map)
self.index_map[len(self.index_map)] = token
def train(self, dataset_path: str, language: str, split: str):
"""Train the tokenizer on the given dataset
Args:
dataset_path (str): Path to the MLS dataset
language (str): Language to use
split (str): Split to use
"""
if split not in ["train", "dev", "test", "all"]:
raise ValueError("Split must be one of train, dev, test, all")
if split == "all":
splits = ["train", "dev", "test"]
else:
splits = [split]
chars: set = set()
for s_plit in splits:
transcript_path = os.path.join(dataset_path, language, s_plit, "transcripts.txt")
with open(
transcript_path,
"r",
encoding="utf-8",
) as file:
lines = file.readlines()
            # each transcript line is "<utterance_id><whitespace><text>"; keep only the text
            lines = [line.split(maxsplit=1)[1].strip() for line in lines]
            for line in tqdm(lines, desc=f"Training tokenizer on {s_plit} split"):
                # encode() lowercases its input, so collect lowercase characters
                chars.update(line.lower())
        # spaces are always encoded as <SPACE>, so the raw space character needs no entry
        chars.discard(" ")
        # sort for a deterministic vocabulary and append after the special tokens
        self.add_tokens(sorted(chars))
def encode(self, sequence: str):
"""Use a character map and convert text to an integer sequence
automatically maps spaces to <SPACE> and makes everything lowercase
unknown characters are mapped to the <UNK> token
"""
int_sequence = []
sequence = sequence.lower()
for char in sequence:
if char == " ":
mapped_char = self.char_map["<SPACE>"]
elif char not in self.char_map:
mapped_char = self.char_map["<UNK>"]
else:
mapped_char = self.char_map[char]
int_sequence.append(mapped_char)
return Encoding(ids=int_sequence, tokens=list(sequence))
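    # Worked example (hypothetical ids; the exact values depend on the training data
    # and are only fixed for the special tokens <UNK>=0 and <SPACE>=1):
    #   encode("Hi du") -> Encoding(ids=[<id of "h">, <id of "i">, 1, <id of "d">, <id of "u">],
    #                               tokens=["h", "i", " ", "d", "u"])
    #   characters never seen during training are mapped to <UNK> (id 0)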
def decode(self, labels: list[int], remove_special_tokens: bool = True):
"""Use a character map and convert integer labels to an text sequence
Args:
labels (list[int]): List of integer labels
remove_special_tokens (bool): Whether to remove special tokens.
Defaults to True.
"""
        string = []
        for i in labels:
            token = self.index_map[i]
            if remove_special_tokens and token == "<UNK>":
                continue
            if token == "<SPACE>":
                token = " "
            string.append(token)
        return "".join(string)
def decode_batch(self, labels: list[list[int]]):
"""Use a character map and convert integer labels to an text sequence"""
strings = []
for label in labels:
string = []
for i in label:
if self.index_map[i] == "<UNK>":
continue
if self.index_map[i] == "<SPACE>":
string.append(" ")
string.append(self.index_map[i])
strings.append("".join(string).replace("<SPACE>", " "))
return strings
def get_vocab_size(self):
"""Get the size of the vocabulary"""
return len(self.char_map)
def save(self, path: str):
"""Save the tokenizer to a file"""
os.makedirs(os.path.dirname(path), exist_ok=True)
with open(path, "w", encoding="utf-8") as file:
# save it in the following format:
# {"char_map": {"a": 0, "b": 1, ...}, "index_map": {0: "a", 1: "b", ...}}
json.dump(
{"char_map": self.char_map, "index_map": self.index_map},
file,
ensure_ascii=False,
)
@staticmethod
def from_file(path: str) -> "CharTokenizer":
"""Load the tokenizer from a file"""
char_tokenizer = CharTokenizer()
with open(path, "r", encoding="utf-8") as file:
            # saved as {"char_map": {"a": 0, ...}, "index_map": {"0": "a", ...}};
            # JSON serializes the integer keys of index_map as strings
            saved_file = json.load(file)
        char_tokenizer.char_map = saved_file["char_map"]
        # convert the index_map keys back to integers
        char_tokenizer.index_map = {int(k): v for k, v in saved_file["index_map"].items()}
        return char_tokenizer
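# Usage sketch for CharTokenizer (the dataset layout and file paths below are
# assumptions for illustration, not requirements of this module):
#
#   tok = CharTokenizer()
#   tok.train("data", "mls_german_opus", "train")  # reads data/mls_german_opus/train/transcripts.txt
#   tok.save("data/tokenizers/char_tokenizer_german.json")
#   tok = CharTokenizer.from_file("data/tokenizers/char_tokenizer_german.json")
#   enc = tok.encode("hallo welt")
#   assert tok.decode(enc.ids) == "hallo welt"  # holds if all characters were seen in training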
@click.command()
@click.option("--dataset_path", default="data", help="Path to the MLS dataset")
@click.option("--language", default="mls_german_opus", help="Language to use")
@click.option("--split", default="train", help="Split to use (including all)")
@click.option("--out_path", default="tokenizer.json", help="Path to save the tokenizer to")
@click.option("--vocab_size", default=2000, help="Size of the vocabulary")
def train_bpe_tokenizer_cli(
dataset_path: str,
language: str,
split: str,
out_path: str,
vocab_size: int,
):
"""Train a Byte-Pair Encoder tokenizer on the MLS dataset"""
train_bpe_tokenizer(
dataset_path,
language,
split,
out_path,
vocab_size,
)
def train_bpe_tokenizer(
dataset_path: str,
language: str,
split: str,
out_path: str,
vocab_size: int,
):
"""Train a Byte-Pair Encoder tokenizer on the MLS dataset
Assumes that the MLS dataset is located in the dataset_path and there is a
transcripts.txt file in the split folder.
Args:
dataset_path (str): Path to the MLS dataset
language (str): Language to use
split (str): Split to use
out_path (str): Path to save the tokenizer to
vocab_size (int): Size of the vocabulary
"""
if split not in ["train", "dev", "test", "all"]:
raise ValueError("Split must be one of train, dev, test, all")
if split == "all":
splits = ["train", "dev", "test"]
else:
splits = [split]
lines = []
for s_plit in splits:
transcripts_path = os.path.join(dataset_path, language, s_plit, "transcripts.txt")
if not os.path.exists(transcripts_path):
raise FileNotFoundError(
f"Could not find transcripts.txt in {transcripts_path}. "
"Please make sure that the dataset is downloaded."
)
with open(
transcripts_path,
"r",
encoding="utf-8",
) as file:
sp_lines = file.readlines()
        # each transcript line is "<utterance_id><whitespace><text>"; keep only the text
        sp_lines = [line.split(maxsplit=1)[1].strip() for line in sp_lines]
        # append as one batch; train_from_iterator accepts an iterator of list[str]
        lines.append(sp_lines)
bpe_tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    # space, hyphen, lowercase Latin letters, and German/French accented characters
    initial_alphabet = list(" abcdefghijklmnopqrstuvwxyzäöüß-éèàùçâêîôûëï")
trainer = BpeTrainer(
special_tokens=["[UNK]"],
vocab_size=vocab_size,
initial_alphabet=initial_alphabet,
show_progress=True,
) # type: ignore
bpe_tokenizer.pre_tokenizer = Whitespace() # type: ignore
bpe_tokenizer.normalizer = normalizers.Lowercase() # type: ignore
bpe_tokenizer.train_from_iterator(lines, trainer=trainer)
bpe_tokenizer.save(out_path)
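# A tokenizer trained this way can be reloaded with the standard `tokenizers` API,
# e.g. (using whatever path was passed as out_path):
#
#   bpe = Tokenizer.from_file(out_path)
#   enc = bpe.encode("hallo welt")
#   print(enc.tokens, enc.ids)
#   print(bpe.decode(enc.ids))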
@click.command()
@click.option("--dataset_path", default="data", help="Path to the MLS dataset")
@click.option("--language", default="mls_german_opus", help="Language to use")
@click.option("--split", default="train", help="Split to use")
@click.option("--out_path", default="tokenizer_chars.txt", help="Path to save the tokenizer to")
def train_char_tokenizer_cli(
dataset_path: str,
language: str,
split: str,
out_path: str,
):
"""Train a Byte-Pair Encoder tokenizer on the MLS dataset"""
train_char_tokenizer(dataset_path, language, split, out_path)
def train_char_tokenizer(
dataset_path: str,
language: str,
split: str,
out_path: str,
):
"""Train a Byte-Pair Encoder tokenizer on the MLS dataset
Assumes that the MLS dataset is located in the dataset_path and there is a
transcripts.txt file in the split folder.
Args:
dataset_path (str): Path to the MLS dataset
language (str): Language to use
split (str): Split to use
out_path (str): Path to save the tokenizer to
"""
char_tokenizer = CharTokenizer()
char_tokenizer.train(dataset_path, language, split)
char_tokenizer.save(out_path)
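# Example programmatic call (argument values are illustrative only):
#
#   train_char_tokenizer(
#       dataset_path="data",
#       language="mls_german_opus",
#       split="all",
#       out_path="data/tokenizers/char_tokenizer_german.json",
#   )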
if __name__ == "__main__":
    tokenizer = CharTokenizer.from_file("data/tokenizers/char_tokenizer_german.json")
print(tokenizer.decode(tokenizer.encode("Fichier non trouvé").ids))