diff options
author | Pherkel | 2023-08-21 18:29:29 +0200 |
---|---|---|
committer | Pherkel | 2023-08-21 18:29:29 +0200 |
commit | d65e728575e07a54cec52ccb57af3cafedaac1a2 (patch) | |
tree | 72e757f7faaaaddf2d3b03a6d733be4d90732cc4 | |
parent | 4c31ecc1bb748242d4740ab5b42514598006d10b (diff) |
adjust char tokenizer interface
-rw-r--r-- | swr2_asr/tokenizer.py | 10 |
1 files changed, 7 insertions, 3 deletions
diff --git a/swr2_asr/tokenizer.py b/swr2_asr/tokenizer.py
index a665159..4dbb386 100644
--- a/swr2_asr/tokenizer.py
+++ b/swr2_asr/tokenizer.py
@@ -155,14 +155,18 @@ class CharTokenizer:
                 ensure_ascii=False,
             )
 
-    def from_file(self, path: str):
+    @staticmethod
+    def from_file(path: str) -> "CharTokenizer":
         """Load the tokenizer from a file"""
+        char_tokenizer = CharTokenizer()
         with open(path, "r", encoding="utf-8") as file:
             # load it in the following format:
             # {"char_map": {"a": 0, "b": 1, ...}, "index_map": {0: "a", 1: "b", ...}}
             saved_file = json.load(file)
-        self.char_map = saved_file["char_map"]
-        self.index_map = saved_file["index_map"]
+        char_tokenizer.char_map = saved_file["char_map"]
+        char_tokenizer.index_map = saved_file["index_map"]
+
+        return char_tokenizer
 
 
 @click.command()