From cbbe597ce360e938e152bfbe8ea19e3090a43671 Mon Sep 17 00:00:00 2001 From: JoJoBarthold2 Date: Wed, 6 Sep 2023 14:52:58 +0200 Subject: deleted a TODO --- swr2_asr/tokenizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/swr2_asr/tokenizer.py b/swr2_asr/tokenizer.py index e4df93b..2e2fb57 100644 --- a/swr2_asr/tokenizer.py +++ b/swr2_asr/tokenizer.py @@ -156,7 +156,7 @@ class CharTokenizer(TokenizerType): """ string = [] for i in labels: - if remove_special_tokens and self.index_map[f"{i}"] == "": + if remove_special_tokens and self.index_map[f"{i}"] == "": continue if remove_special_tokens and self.index_map[f"{i}"] == "<SPACE>": string.append(" ") @@ -329,7 +329,7 @@ def train_bpe_tokenizer( "ΓΌ", ] - # TODO: add padding token / whitespace token / special tokens + trainer = BpeTrainer( special_tokens=["[UNK]"], vocab_size=vocab_size, -- cgit v1.2.3