diff options
Diffstat (limited to 'swr2_asr/tokenizer.py')
-rw-r--r-- | swr2_asr/tokenizer.py | 6 |
1 files changed, 5 insertions, 1 deletions
diff --git a/swr2_asr/tokenizer.py b/swr2_asr/tokenizer.py
index d9cd622..79d6727 100644
--- a/swr2_asr/tokenizer.py
+++ b/swr2_asr/tokenizer.py
@@ -26,7 +26,7 @@ class CharTokenizer:
     Simply checks what characters are in the dataset and uses them as tokens.
 
     Exposes the same interface as tokenizers from the huggingface library, i.e.
-    encode, decode, decode_batch, save, from_file and train.
+    encode, decode, decode_batch, get_vocab_size, save, from_file and train.
     """
 
     def __init__(self):
@@ -140,6 +140,10 @@ class CharTokenizer:
             strings.append("".join(string).replace("<SPACE>", " "))
         return strings
 
+    def get_vocab_size(self):
+        """Get the size of the vocabulary"""
+        return len(self.char_map)
+
     def save(self, path: str):
         """Save the tokenizer to a file"""
         with open(path, "w", encoding="utf-8") as file: