import codecs
import os
import sys

import zhconv

# Merge per-utterance decoding output into one text line per meeting.
#
# Usage: <script> <decode_result> <utt2spk_file>
#
#   decode_result  text file, one "<uttid> <token> <token> ..." per line
#   utt2spk_file   text file, one "<uttid> <spkid>" per line
#
# The result is written to a file named "text_merge" in the same directory
# as <decode_result>, one "<meeting> <text>" line per meeting, where the
# texts of the meeting's speakers are joined with "$".


def _read_utt2text(path):
    """Return a dict mapping utterance id to its text, read from *path*.

    Each line is split on whitespace: the first field is the utterance id
    and the remaining fields are concatenated without separators. Blank
    lines are skipped, a line holding only an id maps to the empty string,
    and when an id occurs more than once the last occurrence wins.
    """
    utt2text = {}
    with codecs.open(path, "r", "utf-8") as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                # Nothing to key the entry on; the original skipped these too.
                continue
            utt2text[fields[0]] = "".join(fields[1:])
    return utt2text


def _read_utt2spk(path):
    """Return a dict mapping utterance id to speaker id, read from *path*.

    Blank lines are ignored.

    Raises:
        ValueError: if a non-blank line does not have exactly two fields.
    """
    utt2spk = {}
    with codecs.open(path, "r", "utf-8") as handle:
        for lineno, line in enumerate(handle, 1):
            fields = line.split()
            if not fields:
                continue
            if len(fields) != 2:
                raise ValueError(
                    "%s:%d: expected '<uttid> <spkid>', got %r"
                    % (path, lineno, line)
                )
            utt2spk[fields[0]] = fields[1]
    return utt2spk


def _merge_by_speaker(utt2text, utt2spk):
    """Concatenate each speaker's utterance texts in ascending time order.

    The ordering key is the second-to-last "-"-separated field of the
    utterance id parsed as an integer (presumably the segment start time;
    confirm against the data preparation that generates the ids).

    Raises:
        KeyError: if an utterance has no entry in *utt2spk*.
        IndexError, ValueError: if an utterance id has fewer than two
            "-"-separated fields or the ordering field is not an integer.
    """
    spk2segments = {}
    for uttid, text in utt2text.items():
        if uttid not in utt2spk:
            raise KeyError(
                "utterance %r has no entry in the utt2spk file" % uttid
            )
        start = int(uttid.split("-")[-2])
        spk2segments.setdefault(utt2spk[uttid], []).append((start, text))

    spk2text = {}
    for spk, segments in spk2segments.items():
        # Sort on the time key only; the sort is stable, so segments with
        # equal keys keep their order of appearance in the decode result.
        segments.sort(key=lambda segment: segment[0])
        spk2text[spk] = "".join(text for _, text in segments)
    return spk2text


def _merge_by_meeting(spk2text):
    """Join the speakers' texts of each meeting with "$".

    The meeting id is the first "-"-separated field of the speaker id
    (an earlier revision of this script used the third field instead).
    Speakers are joined in the iteration order of *spk2text*.
    """
    meeting2parts = {}
    for spk, text in spk2text.items():
        meeting = spk.split("-")[0]
        meeting2parts.setdefault(meeting, []).append(text)

    meeting2text = {}
    for meeting, parts in meeting2parts.items():
        meeting2text[meeting] = "$".join(parts)
    return meeting2text


def main(argv=None):
    """Run the merge and return the path of the file that was written.

    Args:
        argv: ``[decode_result, utt2spk_file]``; defaults to ``sys.argv[1:]``.
            Extra arguments are ignored.

    Returns:
        The path of the written "text_merge" file.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        sys.exit("usage: %s <decode_result> <utt2spk_file>" % sys.argv[0])
    decode_result, utt2spk_file = args[0], args[1]

    # os.path.dirname() yields "" for a bare file name, so the output lands
    # next to the input instead of at "/text_merge" as string splicing did.
    merged_result = os.path.join(os.path.dirname(decode_result), "text_merge")

    utt2text = _read_utt2text(decode_result)
    utt2spk = _read_utt2spk(utt2spk_file)
    meeting2text = _merge_by_meeting(_merge_by_speaker(utt2text, utt2spk))

    with codecs.open(merged_result, "w", "utf-8") as out:
        for meeting, text in meeting2text.items():
            out.write("%s %s\n" % (meeting, text))
    return merged_result


if __name__ == "__main__":
    main()