雾聪
2023-08-07 f8d1c79fe355efb18ae49e4363307dfec3ab89ce
egs/callhome/eend_ola/local/split.py
New file
@@ -0,0 +1,117 @@
import argparse
import os
# Base names of the per-split files this script writes into ``<root>/.work``.
_OUTPUT_NAMES = ('reco2dur', 'spk2utt', 'segments', 'utt2spk')


def _utt_to_rec(utt):
    """Derive the recording ID embedded in an utterance ID.

    Takes the text that follows the first ``data`` in *utt*, drops its
    leading character and its last two ``_``-separated fields, and prefixes
    the remainder with ``data_``.  Presumably the two dropped fields are the
    segment start/end -- TODO confirm against the data preparation recipe.

    Raises:
        IndexError: if *utt* does not contain ``data``.
    """
    tail = utt.split('data')[1]
    return 'data_' + '_'.join(tail[1:].split('_')[:-2])


def _read_fields(path):
    """Yield the whitespace-separated fields of every non-blank line."""
    with open(path) as f:
        for line in f:
            parts = line.split()
            if parts:
                yield parts


def _write_lines(path, lines):
    """Write *lines* newline-separated, without a trailing newline.

    An empty *lines* produces an empty file.
    """
    with open(path, 'w') as f:
        f.write('\n'.join(lines))


def main(argv=None):
    """Split reco2dur/spk2utt/segments/utt2spk to match the split scp lists.

    Reads ``reco2dur``, ``spk2utt``, ``segments`` and ``utt2spk`` from
    ``root_path`` and, for every list file ``<name>.<idx>`` found in
    ``root_path/.work``, writes ``reco2dur.<idx>``, ``spk2utt.<idx>``,
    ``segments.<idx>`` and ``utt2spk.<idx>`` next to it, restricted to the
    recordings named in the first column of that list file.

    Output line formats:
        reco2dur.<idx>:  ``<rec> <dur>``
        spk2utt.<idx>:   ``<spk> <utt>`` (one line per utterance)
        segments.<idx>:  ``<utt> <rec> <start> <end>``
        utt2spk.<idx>:   ``<utt> <spk>``

    Args:
        argv: command-line arguments (without the program name); ``None``
            means ``sys.argv[1:]``.

    Raises:
        KeyError: if a recording listed in a split file is missing from one
            of the four input tables.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('root_path', help='raw data path')
    args = parser.parse_args(argv)
    root_path = args.root_path
    work_path = os.path.join(root_path, ".work")

    # recording ID -> duration (kept as the original string)
    reco2dur_dict = {}
    for parts in _read_fields(os.path.join(root_path, 'reco2dur')):
        reco2dur_dict[parts[0]] = parts[1]

    # recording ID -> [(speaker, utterance), ...]
    spk2utt_dict = {}
    for parts in _read_fields(os.path.join(root_path, 'spk2utt')):
        spk = parts[0]
        for utt in parts[1:]:
            spk2utt_dict.setdefault(_utt_to_rec(utt), []).append((spk, utt))

    # recording ID -> [(utterance, start, end), ...]
    segment_dict = {}
    for parts in _read_fields(os.path.join(root_path, 'segments')):
        segment_dict.setdefault(parts[1], []).append(
            (parts[0], parts[2], parts[3]))

    # recording ID -> [(utterance, speaker), ...]
    utt2spk_dict = {}
    for parts in _read_fields(os.path.join(root_path, 'utt2spk')):
        utt2spk_dict.setdefault(_utt_to_rec(parts[0]), []).append(
            (parts[0], parts[1]))

    output_prefixes = tuple(name + '.' for name in _OUTPUT_NAMES)
    for file in sorted(os.listdir(work_path)):
        scp_file = os.path.join(work_path, file)
        # Skip the files a previous run of this script produced (they live in
        # the same directory) and anything that is not a regular file, so the
        # script can be re-run safely.
        if file.startswith(output_prefixes) or not os.path.isfile(scp_file):
            continue

        # The split index is the last dot-separated field of the file *name*;
        # the directory part is ignored because ".work" itself contains a dot.
        idx = file.split('.')[-1]
        keys = [parts[0] for parts in _read_fields(scp_file)]

        split_lines = {
            'reco2dur': [' '.join((key, reco2dur_dict[key])) for key in keys],
            'spk2utt': [' '.join(item)
                        for key in keys for item in spk2utt_dict[key]],
            'segments': [' '.join((utt, key, start, end))
                         for key in keys
                         for utt, start, end in segment_dict[key]],
            'utt2spk': [' '.join(item)
                        for key in keys for item in utt2spk_dict[key]],
        }
        for name in _OUTPUT_NAMES:
            out_file = os.path.join(work_path, '{}.{}'.format(name, idx))
            _write_lines(out_file, split_lines[name])


if __name__ == '__main__':
    main()