diff options
author | Philipp Merkel | 2023-09-11 22:36:28 +0000 |
---|---|---|
committer | Philipp Merkel | 2023-09-11 22:36:28 +0000 |
commit | 4aff1fcd70cd8601541a1dd5bd820b0263ed1362 (patch) | |
tree | fe30e408ad30e25e7ea2891e223240e7316986c0 /swr2_asr/utils/data.py | |
parent | 3811dc68de2e2572b3656b8f4460553136eb11b4 (diff) |
fix: switched up training and test splits in train.py
Diffstat (limited to 'swr2_asr/utils/data.py')
-rw-r--r-- | swr2_asr/utils/data.py | 31 |
1 file changed, 0 insertions, 31 deletions
```diff
diff --git a/swr2_asr/utils/data.py b/swr2_asr/utils/data.py
index 10f0ea8..d551c98 100644
--- a/swr2_asr/utils/data.py
+++ b/swr2_asr/utils/data.py
@@ -134,11 +134,6 @@ class MLSDataset(Dataset):
 
     def initialize_limited(self) -> None:
         """Initializes the limited supervision dataset"""
-        # get file handles
-        # get file paths
-        # get transcripts
-        # create train or validation split
-
         handles = set()
 
         train_root_path = os.path.join(self.dataset_path, self.language, "train")
@@ -348,29 +343,3 @@ class MLSDataset(Dataset):
             dataset_lookup_entry["chapterid"],
             idx,
         )  # type: ignore
-
-
-if __name__ == "__main__":
-    DATASET_PATH = "/Volumes/pherkel/SWR2-ASR"
-    LANGUAGE = "mls_german_opus"
-    split = Split.DEV
-    DOWNLOAD = False
-
-    dataset = MLSDataset(DATASET_PATH, LANGUAGE, split, download=DOWNLOAD)
-
-    dataloader = DataLoader(
-        dataset,
-        batch_size=1,
-        shuffle=True,
-        collate_fn=DataProcessing(
-            "train", CharTokenizer.from_file("data/tokenizers/char_tokenizer_german.json")
-        ),
-    )
-
-    for batch in dataloader:
-        print(batch)
-        break
-
-    print(len(dataset))
-
-    print(dataset[0])
```