-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmake_data_all.py
71 lines (53 loc) · 2.36 KB
/
make_data_all.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import glob2
import os
import random
import numpy as np
from sklearn.model_selection import train_test_split
# Directory whose immediate (non-directory) entries are the feature files to
# split. Only the top level is read; sub-directories are ignored.
ppg_dir = '/mnt/data1/waris/repo/vq-bnf/translation-all/vq64/ppgs'

# os.walk yields nothing for a missing/unreadable directory, which would make
# a bare next() raise an opaque StopIteration; fall back to an empty listing
# and report the problem explicitly instead.
_, _, files = next(os.walk(ppg_dir), (None, [], []))
if not files:
    raise FileNotFoundError(
        f'no files found in {ppg_dir!r}: directory is missing or empty'
    )

# Directory listings come back in arbitrary, filesystem-dependent order.
# Sorting makes the seeded split below reproducible across machines.
files.sort()

# 95% / 5% train/dev split with a fixed seed. The file list is passed twice,
# so the "labels" outputs are simply a second copy of the same split.
data_train, data_test, labels_train, labels_test = train_test_split(
    files, files, test_size=0.05, random_state=42
)
# train_list = "/mnt/data1/waris/datasets/data/arctic_dataset/all_data_for_ac_vc_train/SV2TTS/synthesizer/train_split.txt"
# dev_list = "/mnt/data1/waris/datasets/data/arctic_dataset/all_data_for_ac_vc_train/SV2TTS/synthesizer/dev_split.txt"
# with open(train_list, encoding="utf-8") as f:
# train_metadata = [line.strip().split("|") for line in f]
# Build the training entries and group utterance ids by speaker.
#   train_list: "<speaker>/<utterance>" for every training file, in split order
#   spk_dict:   speaker -> list of that speaker's utterance ids (training only)
train_list = []
spk_dict = {}
for name in data_train:
    # File names are split on "-" into exactly three parts; the first part is
    # ignored, the second is the speaker and the third the utterance file.
    # A name with a different number of hyphens raises ValueError here.
    _, spkr, fid = name.split("-")
    # Keep only the text before the first "." (drops the extension).
    fid = fid.split(".")[0]
    train_list.append(f'{spkr}/{fid}')
    spk_dict.setdefault(spkr, []).append(fid)
# Write the training list: one "<speaker>/<utterance>/<partner>" line per
# entry, where <partner> is an utterance id drawn at random from the same
# speaker's training utterances (it may be the utterance itself).
with open('train_all.txt', mode='wt', encoding='utf-8') as out:
    for item in train_list:
        # Entries are "<speaker>/<utterance>"; only the speaker is needed to
        # look up candidate partners.
        speaker, _ = item.split('/')
        partner = random.choice(spk_dict[speaker])
        out.write(f'{item}/{partner}' + '\n')
# with open(dev_list, encoding="utf-8") as f:
# dev_metadata = [line.strip().split("|") for line in f]
# Write the dev list: one "<speaker>/<utterance>/<utterance>" line per held-out
# file. Unlike the training list, the third field repeats the utterance id
# rather than being a randomly chosen one.
with open('dev_all.txt', mode='wt', encoding='utf-8') as myfile:
    for name in data_test:
        # Same "<ignored>-<speaker>-<utterance>.<ext>" parsing as for the
        # training files; names without exactly two hyphens raise ValueError.
        _, spkr, fid = name.split("-")
        fid = fid.split(".")[0]
        myfile.write(f'{spkr}/{fid}/{fid}\n')
# wav_file_list = glob2.glob(f"/mnt/data1/waris/datasets/data/arctic_dataset/all_data_for_ac_vc_train/**/*.wav")
# ids = []
# for t in wav_file_list:
# spkr = t.split('.')[0].split('/')[-3]
# fid = t.split('.')[0].split('/')[-1]
# wav = t.split('.')[0].split('/')[-2]
# # with open('/path/to/filename.txt', mode='wt', encoding='utf-8') as myfile:
# ids.append(f'{spkr}/{fid}')
# ids = np.array(ids)
# np.random.shuffle(ids)
# data_train, data_test, labels_train, labels_test = train_test_split(ids, ids, test_size=0.05, random_state=42)
# with open('train.txt', mode='wt', encoding='utf-8') as myfile:
# for s in data_train:
# myfile.write(s)
# myfile.write('\n')
# with open('dev.txt', mode='wt', encoding='utf-8') as myfile:
# for s in data_test:
# myfile.write(s)
# myfile.write('\n')