From d65e728575e07a54cec52ccb57af3cafedaac1a2 Mon Sep 17 00:00:00 2001 From: Pherkel Date: Mon, 21 Aug 2023 18:29:29 +0200 Subject: adjust char tokenizer interface --- swr2_asr/tokenizer.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/swr2_asr/tokenizer.py b/swr2_asr/tokenizer.py index a665159..4dbb386 100644 --- a/swr2_asr/tokenizer.py +++ b/swr2_asr/tokenizer.py @@ -155,14 +155,18 @@ class CharTokenizer: ensure_ascii=False, ) - def from_file(self, path: str): + @staticmethod + def from_file(path: str) -> "CharTokenizer": """Load the tokenizer from a file""" + char_tokenizer = CharTokenizer() with open(path, "r", encoding="utf-8") as file: # load it in the following format: # {"char_map": {"a": 0, "b": 1, ...}, "index_map": {0: "a", 1: "b", ...}} saved_file = json.load(file) - self.char_map = saved_file["char_map"] - self.index_map = saved_file["index_map"] + char_tokenizer.char_map = saved_file["char_map"] + char_tokenizer.index_map = saved_file["index_map"] + + return char_tokenizer @click.command() -- cgit v1.2.3