aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJoJoBarthold22023-09-06 14:52:58 +0200
committerJoJoBarthold22023-09-06 14:52:58 +0200
commitcbbe597ce360e938e152bfbe8ea19e3090a43671 (patch)
treeb5f989f35b5ede60828bea7005ae744e292080f5
parent9bb4ffbd0ef84bfea428112495cc63266e3b0955 (diff)
deleted a TODO
-rw-r--r--swr2_asr/tokenizer.py4
1 files changed, 2 insertions, 2 deletions
diff --git a/swr2_asr/tokenizer.py b/swr2_asr/tokenizer.py
index e4df93b..2e2fb57 100644
--- a/swr2_asr/tokenizer.py
+++ b/swr2_asr/tokenizer.py
@@ -156,7 +156,7 @@ class CharTokenizer(TokenizerType):
"""
string = []
for i in labels:
- if remove_special_tokens and self.index_map[f"{i}"] == "<UNK>":
+ if remove_special_tokens and self.index_map[f"{i}"] == "<UNK>":
continue
if remove_special_tokens and self.index_map[f"{i}"] == "<SPACE>":
string.append(" ")
@@ -329,7 +329,7 @@ def train_bpe_tokenizer(
"ü",
]
- # TODO: add padding token / whitespace token / special tokens
+
trainer = BpeTrainer(
special_tokens=["[UNK]"],
vocab_size=vocab_size,