From f8d1c79fe355efb18ae49e4363307dfec3ab89ce Mon Sep 17 00:00:00 2001
From: 雾聪 <wucong.lyb@alibaba-inc.com>
Date: Mon, 07 Aug 2023 16:14:11 +0800
Subject: [PATCH] Merge branch 'main' of https://github.com/alibaba-damo-academy/FunASR into main

---
 egs/callhome/eend_ola/local/split.py |  117 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 117 insertions(+), 0 deletions(-)

diff --git a/egs/callhome/eend_ola/local/split.py b/egs/callhome/eend_ola/local/split.py
new file mode 100644
index 0000000..7ad1bad
--- /dev/null
+++ b/egs/callhome/eend_ola/local/split.py
@@ -0,0 +1,117 @@
+import argparse
+import os
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('root_path', help='raw data path')
+    args = parser.parse_args()
+
+    root_path = args.root_path
+    work_path = os.path.join(root_path, ".work")
+    scp_files = os.listdir(work_path)
+
+    reco2dur_dict = {}
+    with open(os.path.join(root_path, 'reco2dur')) as f:
+        lines = f.readlines()
+        for line in lines:
+            parts = line.strip().split()
+            reco2dur_dict[parts[0]] = parts[1]
+
+    spk2utt_dict = {}
+    with open(os.path.join(root_path, 'spk2utt')) as f:
+        lines = f.readlines()
+        for line in lines:
+            parts = line.strip().split()
+            spk = parts[0]
+            utts = parts[1:]
+            for utt in utts:
+                tmp = utt.split('data')
+                rec = 'data_' + '_'.join(tmp[1][1:].split('_')[:-2])
+                if rec in spk2utt_dict.keys():
+                    spk2utt_dict[rec].append((spk, utt))
+                else:
+                    spk2utt_dict[rec] = []
+                    spk2utt_dict[rec].append((spk, utt))
+
+    segment_dict = {}
+    with open(os.path.join(root_path, 'segments')) as f:
+        lines = f.readlines()
+        for line in lines:
+            parts = line.strip().split()
+            if parts[1] in segment_dict.keys():
+                segment_dict[parts[1]].append((parts[0], parts[2], parts[3]))
+            else:
+                segment_dict[parts[1]] = []
+                segment_dict[parts[1]].append((parts[0], parts[2], parts[3]))
+
+    utt2spk_dict = {}
+    with open(os.path.join(root_path, 'utt2spk')) as f:
+        lines = f.readlines()
+        for line in lines:
+            parts = line.strip().split()
+            utt = parts[0]
+            tmp = utt.split('data')
+            rec = 'data_' + '_'.join(tmp[1][1:].split('_')[:-2])
+            if rec in utt2spk_dict.keys():
+                utt2spk_dict[rec].append((parts[0], parts[1]))
+            else:
+                utt2spk_dict[rec] = []
+                utt2spk_dict[rec].append((parts[0], parts[1]))
+
+    for file in scp_files:
+        scp_file = os.path.join(work_path, file)
+        idx = scp_file.split('.')[-1]
+        reco2dur_file = os.path.join(work_path, 'reco2dur.{}'.format(str(idx)))
+        spk2utt_file = os.path.join(work_path, 'spk2utt.{}'.format(str(idx)))
+        segment_file = os.path.join(work_path, 'segments.{}'.format(str(idx)))
+        utt2spk_file = os.path.join(work_path, 'utt2spk.{}'.format(str(idx)))
+
+        fpp = open(scp_file)
+        scp_lines = fpp.readlines()
+        keys = []
+        for line in scp_lines:
+            name = line.strip().split()[0]
+            keys.append(name)
+
+        with open(reco2dur_file, 'w') as f:
+            lines = []
+            for key in keys:
+                string = key + ' ' + reco2dur_dict[key]
+                lines.append(string + '\n')
+            lines[-1] = lines[-1][:-1]
+            f.writelines(lines)
+
+        with open(spk2utt_file, 'w') as f:
+            lines = []
+            for key in keys:
+                items = spk2utt_dict[key]
+                for item in items:
+                    string = item[0]
+                    for it in item[1:]:
+                        string += ' '
+                        string += it
+                    lines.append(string + '\n')
+            lines[-1] = lines[-1][:-1]
+            f.writelines(lines)
+
+        with open(segment_file, 'w') as f:
+            lines = []
+            for key in keys:
+                items = segment_dict[key]
+                for item in items:
+                    string = item[0] + ' ' + key + ' ' + item[1] + ' ' + item[2]
+                    lines.append(string + '\n')
+            lines[-1] = lines[-1][:-1]
+            f.writelines(lines)
+
+        with open(utt2spk_file, 'w') as f:
+            lines = []
+            for key in keys:
+                items = utt2spk_dict[key]
+                for item in items:
+                    string = item[0] + ' ' + item[1]
+                    lines.append(string + '\n')
+            lines[-1] = lines[-1][:-1]
+            f.writelines(lines)
+
+        fpp.close()

--
Gitblit v1.9.1