aboutsummaryrefslogtreecommitdiff
path: root/swr2_asr/tokenizer.py
diff options
context:
space:
mode:
Diffstat (limited to 'swr2_asr/tokenizer.py')
-rw-r--r-- swr2_asr/tokenizer.py 1
1 file changed, 1 insertion, 0 deletions
diff --git a/swr2_asr/tokenizer.py b/swr2_asr/tokenizer.py
index 5758da7..8e3bf09 100644
--- a/swr2_asr/tokenizer.py
+++ b/swr2_asr/tokenizer.py
@@ -302,6 +302,7 @@ def train_bpe_tokenizer(
"ü",
]
+ # TODO: add padding token / whitespace token / special tokens
trainer = BpeTrainer(
special_tokens=["[UNK]"],
vocab_size=vocab_size,