aboutsummaryrefslogtreecommitdiff
path: root/swr2_asr/tokenizer.py
diff options
context:
space:
mode:
Diffstat (limited to 'swr2_asr/tokenizer.py')
-rw-r--r-- swr2_asr/tokenizer.py 6
1 files changed, 5 insertions, 1 deletions
diff --git a/swr2_asr/tokenizer.py b/swr2_asr/tokenizer.py
index d9cd622..79d6727 100644
--- a/swr2_asr/tokenizer.py
+++ b/swr2_asr/tokenizer.py
@@ -26,7 +26,7 @@ class CharTokenizer:
Simply checks what characters are in the dataset and uses them as tokens.
Exposes the same interface as tokenizers from the huggingface library, i.e.
- encode, decode, decode_batch, save, from_file and train.
+ encode, decode, decode_batch, get_vocab_size, save, from_file and train.
"""
def __init__(self):
@@ -140,6 +140,10 @@ class CharTokenizer:
strings.append("".join(string).replace("<SPACE>", " "))
return strings
+ def get_vocab_size(self):
+ """Get the size of the vocabulary"""
+ return len(self.char_map)
+
def save(self, path: str):
"""Save the tokenizer to a file"""
with open(path, "w", encoding="utf-8") as file: