Spaces:
Running
Running
| # Copyright (c) 2023 Amphion. | |
| # | |
| # This source code is licensed under the MIT license found in the | |
| # LICENSE file in the root directory of this source tree. | |
| import json | |
| from tqdm import tqdm | |
| import os | |
| import librosa | |
| from utils.util import has_existed | |
def get_lines(file):
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        file: Path of the text file to read.

    Returns:
        A list with one string per line of the file, each passed through
        ``str.strip()``. Blank lines are kept as empty strings.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file, "r", encoding="utf-8") as f:
        return [line.strip() for line in f]
def _get_wav_duration(audio_file):
    """Return the duration, as reported by librosa, of the file at ``audio_file``."""
    # Newer librosa releases name this keyword ``path`` and deprecate
    # ``filename``; older releases only accept ``filename`` and raise
    # TypeError on the unknown keyword, so fall back in that case.
    try:
        return librosa.get_duration(path=audio_file)
    except TypeError:
        return librosa.get_duration(filename=audio_file)


def get_uid2utt(opencpop_path, dataset, dataset_type):
    """Build the utterance metadata list for one split of the dataset.

    Reads ``<opencpop_path>/segments/<dataset_type>.txt``, takes the first
    ``|``-separated field of every non-empty line as the utterance id, and
    measures the duration of ``<opencpop_path>/segments/wavs/<uid>.wav``.

    Args:
        opencpop_path: Root directory of the dataset.
        dataset: Dataset name stored in every entry under ``"Dataset"``.
        dataset_type: Split name; selects the ``<dataset_type>.txt`` file.

    Returns:
        A tuple ``(uid2utt, hours)`` where ``uid2utt`` is a list of dicts
        with keys ``Dataset``, ``index``, ``Singer``, ``Uid``, ``Path`` and
        ``Duration``, and ``hours`` is the summed duration divided by 3600.
    """
    file = os.path.join(opencpop_path, "segments", "{}.txt".format(dataset_type))
    lines = get_lines(file)

    uid2utt = []
    total_duration = 0
    for line in tqdm(lines):
        # A blank line (e.g. a trailing newline in the segment file) would
        # yield an empty uid and a non-existent ".wav" path; skip it.
        if not line:
            continue

        uid = line.split("|")[0]
        audio_file = os.path.join(opencpop_path, "segments/wavs/{}.wav".format(uid))
        duration = _get_wav_duration(audio_file)

        uid2utt.append(
            {
                "Dataset": dataset,
                # Consecutive position among the emitted entries.
                "index": len(uid2utt),
                "Singer": "female1",
                "Uid": uid,
                "Path": audio_file,
                "Duration": duration,
            }
        )
        total_duration += duration

    return uid2utt, total_duration / 3600
def main(dataset, output_path, dataset_path):
    """Write ``train.json`` and ``test.json`` metadata files for the dataset.

    For each of the ``train`` and ``test`` splits, collects the utterance
    metadata with ``get_uid2utt`` and dumps it as JSON to
    ``<output_path>/<dataset>/<split>.json``, then prints the split's total
    duration in hours.

    Args:
        dataset: Dataset name; used as the output sub-directory and stored
            in every metadata entry.
        output_path: Directory under which ``<dataset>/`` is created.
        dataset_path: Root directory of the dataset on disk.
    """
    print("-" * 10)
    print("Dataset splits for {}...\n".format(dataset))

    save_dir = os.path.join(output_path, dataset)
    opencpop_path = dataset_path

    for dataset_type in ["train", "test"]:
        output_file = os.path.join(save_dir, "{}.json".format(dataset_type))
        # NOTE(review): has_existed comes from utils.util; presumably it
        # reports that the output is already present -- confirm. When it
        # returns a truthy value the split is skipped.
        if has_existed(output_file):
            continue

        res, hours = get_uid2utt(opencpop_path, dataset, dataset_type)

        # Save
        os.makedirs(save_dir, exist_ok=True)
        # ensure_ascii=False emits non-ASCII characters verbatim, so the
        # file handle must be UTF-8 rather than the platform default.
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(res, f, indent=4, ensure_ascii=False)

        print("{}_{}_hours= {}".format(dataset, dataset_type, hours))