# -*- coding: utf-8 -*- """ Process the textgrid files """ import argparse import codecs from distutils.util import strtobool from pathlib import Path import textgrid import pdb def get_args(): parser = argparse.ArgumentParser(description="process the textgrid files") parser.add_argument("--path", type=str, required=True, help="Data path") args = parser.parse_args() return args class Segment(object): def __init__(self, uttid, text): self.uttid = uttid self.text = text def main(args): text = codecs.open(Path(args.path) / "text", "r", "utf-8") spk2utt = codecs.open(Path(args.path) / "spk2utt", "r", "utf-8") utt2spk = codecs.open(Path(args.path) / "utt2spk_all_fifo", "r", "utf-8") spk2id = codecs.open(Path(args.path) / "spk2id", "w", "utf-8") spkid_map = {} meetingid_map = {} for line in spk2utt: spkid = line.strip().split(" ")[0] meeting_id = spkid.split("-")[0] if meeting_id not in meetingid_map: meetingid_map[meeting_id] = 1 else: meetingid_map[meeting_id] += 1 spkid_map[spkid] = meetingid_map[meeting_id] spk2id.write("%s %s\n" % (spkid, meetingid_map[meeting_id])) utt2spklist = {} for line in utt2spk: uttid = line.strip().split(" ")[0] spkid = line.strip().split(" ")[1] spklist = spkid.split("$") tmp = [] for index in range(len(spklist)): tmp.append(spkid_map[spklist[index]]) utt2spklist[uttid] = tmp # parse the textgrid file for each utterance all_segments = [] for line in text: uttid = line.strip().split(" ")[0] context = line.strip().split(" ")[1] spklist = utt2spklist[uttid] length_text = len(context) cnt = 0 tmp_text = "" for index in range(length_text): if context[index] != "$": tmp_text += str(spklist[cnt]) else: tmp_text += "$" cnt += 1 tmp_seg = Segment(uttid,tmp_text) all_segments.append(tmp_seg) text.close() utt2spk.close() spk2utt.close() spk2id.close() text_id = codecs.open(Path(args.path) / "text_id", "w", "utf-8") for i in range(len(all_segments)): uttid_tmp = all_segments[i].uttid text_tmp = all_segments[i].text text_id.write("%s %s\n" % (uttid_tmp, text_tmp)) text_id.close() if __name__ == "__main__": args = get_args() main(args)