aboutsummaryrefslogtreecommitdiff
path: root/data/tokenizers
diff options
context:
space:
mode:
authorPherkel2023-09-11 20:45:32 +0200
committerPherkel2023-09-11 20:45:32 +0200
commit8be140b38183b7465b5888a15b536a5f7fa66db6 (patch)
tree68737b56d9859c139eb8e998cf50813ec7c68bdf /data/tokenizers
parentc078ce6789c134aa05607903d3bf9e4be64df45d (diff)
added tokenizer to git and tokenizer training routing
Diffstat (limited to 'data/tokenizers')
-rw-r--r--data/tokenizers/char_tokenizer_german.json38
1 files changed, 38 insertions, 0 deletions
diff --git a/data/tokenizers/char_tokenizer_german.json b/data/tokenizers/char_tokenizer_german.json
new file mode 100644
index 0000000..20db079
--- /dev/null
+++ b/data/tokenizers/char_tokenizer_german.json
@@ -0,0 +1,38 @@
+_ 0
+<BLANK> 1
+<UNK> 2
+<SPACE> 3
+a 4
+b 5
+c 6
+d 7
+e 8
+f 9
+g 10
+h 11
+i 12
+j 13
+k 14
+l 15
+m 16
+n 17
+o 18
+p 19
+q 20
+r 21
+s 22
+t 23
+u 24
+v 25
+w 26
+x 27
+y 28
+z 29
+é 30
+à 31
+ä 32
+ö 33
+ß 34
+ü 35
+- 36
+' 37