# Spaces:
# Runtime error
# Runtime error
# Kevin @ Laronix Dec. 2022
# Data processing at Laronix
import csv
import pdb
import sys
from pathlib import Path

import librosa
import numpy as np
import pandas as pd
import soundfile as sf
from rich.progress import track
def read_transcript(txt_path):
    """Return the first line of *txt_path* with surrounding whitespace removed.

    Only the first line is used as the transcript. The line terminator is
    stripped so that it neither ends up inside the ``text`` column nor
    introduces a line break inside a field of the tab-separated id/text file.
    """
    with open(txt_path, "r", encoding="utf-8") as f:
        return f.readline().strip()


def count_words(text):
    """Return the number of whitespace-separated tokens in *text*.

    ``str.split()`` without an argument collapses runs of whitespace, so
    repeated spaces do not inflate the count and an empty string counts as 0.
    """
    return len(text.split())


def build_record(wav_path, text, thre_len):
    """Build one data-log row for an utterance.

    Returns ``[wav_path, id, text, len, length_label]`` where ``id`` is the
    stem of *wav_path*, ``len`` is the word count of *text*, and
    ``length_label`` is 1 when the word count is <= *thre_len* and 0 otherwise.
    """
    n_words = count_words(text)
    label = 1 if n_words <= thre_len else 0
    return [wav_path, Path(wav_path).stem, text, n_words, label]


def pair_files(wavs, txts):
    """Pair wav and transcript paths by position and return a list of tuples.

    Both lists are expected to be sorted the same way by the caller.

    Raises:
        ValueError: if the two lists have different lengths, because
            positional pairing would then silently drop or misalign files.

    A warning is printed to stderr for every pair whose file stems differ;
    such pairs are still returned.
    """
    if len(wavs) != len(txts):
        raise ValueError(
            "Found %d wav files but %d txt files; cannot pair them by position"
            % (len(wavs), len(txts))
        )
    pairs = list(zip(wavs, txts))
    for wav_path, txt_path in pairs:
        if Path(wav_path).stem != Path(txt_path).stem:
            print(
                "Warning: pairing %s with %s (file stems differ)"
                % (wav_path, txt_path),
                file=sys.stderr,
            )
    return pairs


def resample_wav(wav_path, out_dir, origin_sr, target_sr):
    """Load *wav_path* at *origin_sr*, resample to *target_sr*, write to *out_dir*.

    The output file is ``<out_dir>/<stem of wav_path>.wav``.
    """
    wav, sr = librosa.load(wav_path, sr=origin_sr)
    resampled = librosa.resample(wav, orig_sr=sr, target_sr=target_sr)
    sf.write(
        Path(out_dir) / (Path(wav_path).stem + ".wav"),
        data=resampled,
        samplerate=target_sr,
    )


def main(argv):
    """Run the preprocessing pipeline.

    *argv* follows ``sys.argv`` layout:
    ``[prog, wavdir, txtdir, thre_len, origin_sr, target_sr]``.

    Writes resampled audio for utterances whose transcript has at most
    ``thre_len`` words into ``./data/<wavdir stem>_<origin>_<target>_len<thre>``,
    a full data log to ``<that dir>.datalog`` and a tab-separated id/text
    file to ``<that dir>.txt``.
    """
    if len(argv) < 6:
        raise SystemExit(
            "Usage: %s WAVDIR TXTDIR THRE_LEN ORIGIN_SR TARGET_SR" % argv[0]
        )

    wavdir = argv[1]
    txtdir = argv[2]
    thre_len = int(argv[3])
    origin_sr = int(argv[4])
    target_sr = int(argv[5])

    wavs = sorted(Path(wavdir).glob("**/*.wav"))
    txts = sorted(Path(txtdir).glob("**/*.txt"))

    # Kept as a string: the log/text file names are built by appending a
    # suffix to it, and the printed messages show it verbatim.
    target_dir = "./data/%s_%d_%d_len%d" % (
        Path(wavdir).stem,
        origin_sr,
        target_sr,
        thre_len,
    )
    # parents=True so a missing ./data directory is created as well.
    Path(target_dir).mkdir(parents=True, exist_ok=True)

    tables = []
    for wav_path, txt_path in track(
        pair_files(wavs, txts), description="Processing...", total=len(wavs)
    ):
        record = build_record(wav_path, read_transcript(txt_path), thre_len)
        tables.append(record)
        # Only sentences with at most thre_len words are resampled for training.
        if record[-1] == 1:
            resample_wav(wav_path, target_dir, origin_sr, target_sr)

    D = pd.DataFrame(
        tables, columns=["wav_path", "id", "text", "len", "length_label"]
    )
    D.to_csv(target_dir + ".datalog", sep=",")
    print("Check data log at %s" % (target_dir + ".datalog"))

    D[["id", "text"]].to_csv(
        target_dir + ".txt",
        sep="\t",
        header=False,
        index=False,
        quoting=csv.QUOTE_NONE,
    )
    print("Generate id_text at %s" % (target_dir + ".txt"))
    print("Finish")


if __name__ == "__main__":
    main(sys.argv)