From 5d41556d3968d2f9f48b64c0c08b9bfa5fbb323c Mon Sep 17 00:00:00 2001
From: zhifu gao <zhifu.gzf@alibaba-inc.com>
Date: 星期三, 10 五月 2023 10:00:59 +0800
Subject: [PATCH] Merge pull request #482 from alibaba-damo-academy/dev_sx
---
funasr/utils/timestamp_tools.py | 38 ++++++++++++++++++++++++++++++++------
1 files changed, 32 insertions(+), 6 deletions(-)
diff --git a/funasr/utils/timestamp_tools.py b/funasr/utils/timestamp_tools.py
index 87cc49e..489d317 100644
--- a/funasr/utils/timestamp_tools.py
+++ b/funasr/utils/timestamp_tools.py
@@ -94,19 +94,33 @@
res.append({
'text': text_postprocessed.split(),
"start": time_stamp_postprocessed[0][0],
- "end": time_stamp_postprocessed[-1][1]
+ "end": time_stamp_postprocessed[-1][1],
+ 'text_seg': text_postprocessed.split(),
+ "ts_list": time_stamp_postprocessed,
})
return res
if len(punc_id_list) != len(time_stamp_postprocessed):
print(" warning length mistach!!!!!!")
- sentence_text = ''
+ sentence_text = ""
+ sentence_text_seg = ""
+ ts_list = []
sentence_start = time_stamp_postprocessed[0][0]
sentence_end = time_stamp_postprocessed[0][1]
texts = text_postprocessed.split()
punc_stamp_text_list = list(zip_longest(punc_id_list, time_stamp_postprocessed, texts, fillvalue=None))
for punc_stamp_text in punc_stamp_text_list:
punc_id, time_stamp, text = punc_stamp_text
- sentence_text += text if text is not None else ''
+ # sentence_text += text if text is not None else ''
+ if text is not None:
+ if 'a' <= text[0] <= 'z' or 'A' <= text[0] <= 'Z':
+ sentence_text += ' ' + text
+ elif len(sentence_text) and ('a' <= sentence_text[-1] <= 'z' or 'A' <= sentence_text[-1] <= 'Z'):
+ sentence_text += ' ' + text
+ else:
+ sentence_text += text
+ sentence_text_seg += text + ' '
+ ts_list.append(time_stamp)
+
punc_id = int(punc_id) if punc_id is not None else 1
sentence_end = time_stamp[1] if time_stamp is not None else sentence_end
@@ -115,27 +129,39 @@
res.append({
'text': sentence_text,
"start": sentence_start,
- "end": sentence_end
+ "end": sentence_end,
+ "text_seg": sentence_text_seg,
+ "ts_list": ts_list
})
sentence_text = ''
+ sentence_text_seg = ''
+ ts_list = []
sentence_start = sentence_end
elif punc_id == 3:
sentence_text += '.'
res.append({
'text': sentence_text,
"start": sentence_start,
- "end": sentence_end
+ "end": sentence_end,
+ "text_seg": sentence_text_seg,
+ "ts_list": ts_list
})
sentence_text = ''
+ sentence_text_seg = ''
+ ts_list = []
sentence_start = sentence_end
elif punc_id == 4:
sentence_text += '?'
res.append({
'text': sentence_text,
"start": sentence_start,
- "end": sentence_end
+ "end": sentence_end,
+ "text_seg": sentence_text_seg,
+ "ts_list": ts_list
})
sentence_text = ''
+ sentence_text_seg = ''
+ ts_list = []
sentence_start = sentence_end
return res
--
Gitblit v1.9.1