about summary refs log tree commit diff
path: root/swr2_asr/utils
diff options
context:
space:
mode:
author: Philipp Merkel <unknown> 2023-09-11 22:36:28 +0000
committer: Philipp Merkel <unknown> 2023-09-11 22:36:28 +0000
commit 4aff1fcd70cd8601541a1dd5bd820b0263ed1362 (patch)
tree fe30e408ad30e25e7ea2891e223240e7316986c0 /swr2_asr/utils
parent 3811dc68de2e2572b3656b8f4460553136eb11b4 (diff)
fix: switched up training and test splits in train.py
Diffstat (limited to 'swr2_asr/utils')
-rw-r--r--  swr2_asr/utils/data.py       31
-rw-r--r--  swr2_asr/utils/tokenizer.py  12
2 files changed, 0 insertions, 43 deletions
diff --git a/swr2_asr/utils/data.py b/swr2_asr/utils/data.py
index 10f0ea8..d551c98 100644
--- a/swr2_asr/utils/data.py
+++ b/swr2_asr/utils/data.py
@@ -134,11 +134,6 @@ class MLSDataset(Dataset):
def initialize_limited(self) -> None:
"""Initializes the limited supervision dataset"""
- # get file handles
- # get file paths
- # get transcripts
- # create train or validation split
-
handles = set()
train_root_path = os.path.join(self.dataset_path, self.language, "train")
@@ -348,29 +343,3 @@ class MLSDataset(Dataset):
dataset_lookup_entry["chapterid"],
idx,
) # type: ignore
-
-
-if __name__ == "__main__":
- DATASET_PATH = "/Volumes/pherkel/SWR2-ASR"
- LANGUAGE = "mls_german_opus"
- split = Split.DEV
- DOWNLOAD = False
-
- dataset = MLSDataset(DATASET_PATH, LANGUAGE, split, download=DOWNLOAD)
-
- dataloader = DataLoader(
- dataset,
- batch_size=1,
- shuffle=True,
- collate_fn=DataProcessing(
- "train", CharTokenizer.from_file("data/tokenizers/char_tokenizer_german.json")
- ),
- )
-
- for batch in dataloader:
- print(batch)
- break
-
- print(len(dataset))
-
- print(dataset[0])
diff --git a/swr2_asr/utils/tokenizer.py b/swr2_asr/utils/tokenizer.py
index 22569eb..1cc7b84 100644
--- a/swr2_asr/utils/tokenizer.py
+++ b/swr2_asr/utils/tokenizer.py
@@ -120,15 +120,3 @@ class CharTokenizer:
load_tokenizer.char_map[char] = int(index)
load_tokenizer.index_map[int(index)] = char
return load_tokenizer
-
-
-if __name__ == "__main__":
- tokenizer = CharTokenizer.train("/Volumes/pherkel 1/SWR2-ASR", "mls_german_opus")
- print(tokenizer.char_map)
- print(tokenizer.index_map)
- print(tokenizer.get_vocab_size())
- print(tokenizer.get_blank_token())
- print(tokenizer.get_unk_token())
- print(tokenizer.get_space_token())
- print(tokenizer.encode("hallo welt"))
- print(tokenizer.decode([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))