Spaces:
Paused
Paused
| # -*- coding: utf-8 -*- | |
| # file: compress_datasets.py | |
| # time: 19:13 2023/2/5 | |
| # author: yangheng <hy345@exeter.ac.uk> | |
| # github: https://github.com/yangheng95 | |
| # huggingface: https://huggingface.co/yangheng | |
| # google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en | |
| # Copyright (C) 2021. All Rights Reserved. | |
| # -*- coding: utf-8 -*- | |
| # file: zip_datasets.py | |
| # time: 05/11/2022 17:10 | |
| # author: yangheng <hy345@exeter.ac.uk> | |
| # github: https://github.com/yangheng95 | |
| # GScholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en | |
| # ResearchGate: https://www.researchgate.net/profile/Heng-Yang-17/research | |
| # Copyright (C) 2022. All Rights Reserved. | |
| import os | |
| import shutil | |
| import zipfile | |
| from pathlib import Path | |
| import findfile | |
| from pyabsa.utils.pyabsa_utils import fprint | |
def cascade_zip_datasets():
    """Compress each dataset folder under ``integrated_datasets`` into a zip.

    For every directory returned by
    ``findfile.find_dirs("integrated_datasets", "datasets", recursive=1)``
    (skipping the literal entries ``"integrated_datasets"`` and
    ``"integrated_datasets.zip"``), each directory returned by
    ``findfile.find_dirs(<that directory>, "")`` is written to
    ``integrated_datasets/<task_name>.<dataset_name>.zip`` (lowercased),
    where both names are the final path components of the respective
    directories.

    Member names inside each archive are the lowercased relative paths of
    the files, exactly as before; the files themselves are read from their
    real, case-preserved paths.

    NOTE(review): the exact matching semantics of ``findfile.find_dirs``
    (what ``recursive=1`` and an empty key select) are not visible here --
    confirm against the ``findfile`` documentation.
    """
    # Collect the task-level folders that hold the individual datasets.
    datasets = findfile.find_dirs("integrated_datasets", "datasets", recursive=1)
    for dataset in datasets:
        # Never archive the root folder (or a previously built root archive).
        if dataset in [
            "integrated_datasets",
            "integrated_datasets.zip",
        ]:
            continue
        task_name = Path(dataset).name
        for d in findfile.find_dirs(dataset, ""):
            fprint(f"compressing dataset: {d}")
            dataset_name = Path(d).name
            archive_path = f"integrated_datasets/{task_name}.{dataset_name}.zip".lower()
            # The context manager guarantees the archive is finalised and the
            # handle released even if reading one of the files fails.
            with zipfile.ZipFile(archive_path, "w", zipfile.ZIP_DEFLATED) as zip_file:
                for root, _dirs, files in os.walk(d):
                    for file in files:
                        file_path = os.path.join(root, file)
                        # Read from the real path; only the name stored in the
                        # archive is lowercased. Lowercasing the source path
                        # breaks on case-sensitive filesystems.
                        zip_file.write(file_path, arcname=file_path.lower())
if __name__ == "__main__":
    # Optional refresh step, currently disabled: wipe the local
    # "integrated_datasets" folder and download every available dataset
    # again before compressing. Kept here as a reference recipe.
    #
    #   if os.path.exists('integrated_datasets'):
    #       try:
    #           shutil.rmtree('integrated_datasets')
    #       except:
    #           os.system('rm -rf integrated_datasets')
    #
    #   from pyabsa import download_all_available_datasets
    #
    #   download_all_available_datasets()

    # Script entry point: build one zip archive per dataset folder.
    cascade_zip_datasets()