aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Pherkel 2023-08-21 18:29:29 +0200
committer Pherkel 2023-08-21 18:29:29 +0200
commit d65e728575e07a54cec52ccb57af3cafedaac1a2 (patch)
tree 72e757f7faaaaddf2d3b03a6d733be4d90732cc4
parent 4c31ecc1bb748242d4740ab5b42514598006d10b (diff)
adjust char tokenizer interface
-rw-r--r-- swr2_asr/tokenizer.py 10
1 files changed, 7 insertions, 3 deletions
diff --git a/swr2_asr/tokenizer.py b/swr2_asr/tokenizer.py
index a665159..4dbb386 100644
--- a/swr2_asr/tokenizer.py
+++ b/swr2_asr/tokenizer.py
@@ -155,14 +155,18 @@ class CharTokenizer:
ensure_ascii=False,
)
- def from_file(self, path: str):
+ @staticmethod
+ def from_file(path: str) -> "CharTokenizer":
"""Load the tokenizer from a file"""
+ char_tokenizer = CharTokenizer()
with open(path, "r", encoding="utf-8") as file:
# load it in the following format:
# {"char_map": {"a": 0, "b": 1, ...}, "index_map": {0: "a", 1: "b", ...}}
saved_file = json.load(file)
- self.char_map = saved_file["char_map"]
- self.index_map = saved_file["index_map"]
+ char_tokenizer.char_map = saved_file["char_map"]
+ char_tokenizer.index_map = saved_file["index_map"]
+
+ return char_tokenizer
@click.command()