diff options
author | JoJoBarthold2 | 2023-09-06 14:52:58 +0200 |
---|---|---|
committer | JoJoBarthold2 | 2023-09-06 14:52:58 +0200 |
commit | cbbe597ce360e938e152bfbe8ea19e3090a43671 (patch) | |
tree | b5f989f35b5ede60828bea7005ae744e292080f5 | |
parent | 9bb4ffbd0ef84bfea428112495cc63266e3b0955 (diff) |
deleted a TODO
-rw-r--r-- | swr2_asr/tokenizer.py | 4 |
1 file changed, 2 insertions, 2 deletions
```diff
diff --git a/swr2_asr/tokenizer.py b/swr2_asr/tokenizer.py
index e4df93b..2e2fb57 100644
--- a/swr2_asr/tokenizer.py
+++ b/swr2_asr/tokenizer.py
@@ -156,7 +156,7 @@ class CharTokenizer(TokenizerType):
         """
         string = []
         for i in labels:
-            if remove_special_tokens and self.index_map[f"{i}"] == "<UNK>":
+            if remove_special_tokens and self.index_map[f"{i}"] == "<UNK>":
                 continue
             if remove_special_tokens and self.index_map[f"{i}"] == "<SPACE>":
                 string.append(" ")
@@ -329,7 +329,7 @@ def train_bpe_tokenizer(
         "ü",
     ]
 
-    # TODO: add padding token / whitespace token / special tokens
+
     trainer = BpeTrainer(
         special_tokens=["[UNK]"],
         vocab_size=vocab_size,
```