From 8be140b38183b7465b5888a15b536a5f7fa66db6 Mon Sep 17 00:00:00 2001 From: Pherkel Date: Mon, 11 Sep 2023 20:45:32 +0200 Subject: added tokenizer to git and tokenizer training routing --- data/tokenizers/char_tokenizer_german.json | 38 ++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 data/tokenizers/char_tokenizer_german.json (limited to 'data/tokenizers/char_tokenizer_german.json') diff --git a/data/tokenizers/char_tokenizer_german.json b/data/tokenizers/char_tokenizer_german.json new file mode 100644 index 0000000..20db079 --- /dev/null +++ b/data/tokenizers/char_tokenizer_german.json @@ -0,0 +1,38 @@ +_ 0 + 1 + 2 + 3 +a 4 +b 5 +c 6 +d 7 +e 8 +f 9 +g 10 +h 11 +i 12 +j 13 +k 14 +l 15 +m 16 +n 17 +o 18 +p 19 +q 20 +r 21 +s 22 +t 23 +u 24 +v 25 +w 26 +x 27 +y 28 +z 29 +é 30 +à 31 +ä 32 +ö 33 +ß 34 +ü 35 +- 36 +' 37 -- cgit v1.2.3