From 3ae21cbc432113531aa15e0cebd8a34c3767ba35 Mon Sep 17 00:00:00 2001
From: Pherkel
Date: Sun, 20 Aug 2023 14:52:15 +0200
Subject: added todos

---
 swr2_asr/tokenizer.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'swr2_asr/tokenizer.py')

diff --git a/swr2_asr/tokenizer.py b/swr2_asr/tokenizer.py
index d9cd622..79d6727 100644
--- a/swr2_asr/tokenizer.py
+++ b/swr2_asr/tokenizer.py
@@ -26,7 +26,7 @@ class CharTokenizer:
     Simply checks what characters are in the dataset and uses them as tokens.
 
     Exposes the same interface as tokenizers from the huggingface library, i.e.
-    encode, decode, decode_batch, save, from_file and train.
+    encode, decode, decode_batch, get_vocab_size, save, from_file and train.
     """
 
     def __init__(self):
@@ -140,6 +140,10 @@ class CharTokenizer:
             strings.append("".join(string).replace("", " "))
         return strings
 
+    def get_vocab_size(self):
+        """Get the size of the vocabulary"""
+        return len(self.char_map)
+
     def save(self, path: str):
         """Save the tokenizer to a file"""
         with open(path, "w", encoding="utf-8") as file:
-- 
cgit v1.2.3
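
For context (not part of the patch above): a minimal usage sketch of how the
new get_vocab_size() method would typically be consumed, e.g. to size the
output layer of a downstream acoustic model. Only get_vocab_size(), char_map,
and the method names listed in the docstring come from this diff; the train()
argument and the file names below are assumptions for illustration.

    from swr2_asr.tokenizer import CharTokenizer

    tokenizer = CharTokenizer()
    tokenizer.train("data/corpus.txt")    # assumed signature: builds char_map from the dataset
    tokenizer.save("char_tokenizer.json")

    # get_vocab_size() is equivalent to len(tokenizer.char_map); a model
    # would use it to size its output (classification) layer.
    num_classes = tokenizer.get_vocab_size()
    print(f"output layer needs {num_classes} classes")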