Shi Xian
2024-06-12 1300d38bf0604abbcfd0b8f493a1a70f38b502e3
funasr/utils/timestamp_tools.py
@@ -185,3 +185,88 @@
            ts_list = []
            sentence_start = sentence_end
    return res
def timestamp_sentence_en(
    punc_id_list, timestamp_postprocessed, text_postprocessed, return_raw_text=False
):
    """Group whitespace-separated tokens into punctuated sentences with timestamps.

    The three inputs are walked in lockstep (one punctuation id, one timestamp
    and one token per step). Whenever a punctuation id greater than 1 is seen,
    the tokens accumulated so far are closed off as one sentence.

    Args:
        punc_id_list: Per-token punctuation ids. Ids greater than 1 select
            ``punc_list[id - 2]``; any other id adds no punctuation. ``None``
            or an empty sequence disables sentence splitting.
        timestamp_postprocessed: Per-token timestamps; each item is indexed
            with ``[0]`` for its start and ``[1]`` for its end.
        text_postprocessed: Token string; it is split on whitespace.
        return_raw_text: When true, every sentence dict also carries
            ``"raw_text"``, the space-joined tokens without punctuation.

    Returns:
        A list of dicts with the keys ``"text"``, ``"start"``, ``"end"`` and
        ``"timestamp"`` (plus ``"raw_text"`` on request). The list is empty
        when the text or the timestamps are ``None`` or empty. Without
        punctuation ids a single dict is returned whose ``"text"`` is the
        *list* of tokens.

    Note:
        * The inputs may differ in length; the shorter ones are padded with
          ``None``. A missing punctuation id counts as "no punctuation", a
          missing timestamp is stored as ``None`` in ``"timestamp"`` and a
          missing token contributes no text.
        * Tokens that follow the last sentence-closing punctuation id are not
          emitted.
    """
    punc_list = [",", ".", "?", ","]
    res = []
    if text_postprocessed is None or timestamp_postprocessed is None:
        return res
    # ``len`` (not truthiness) so that array-like inputs keep working.
    if len(timestamp_postprocessed) == 0 or len(text_postprocessed) == 0:
        return res
    if punc_id_list is None or len(punc_id_list) == 0:
        res.append(
            {
                "text": text_postprocessed.split(),
                "start": timestamp_postprocessed[0][0],
                "end": timestamp_postprocessed[-1][1],
                "timestamp": timestamp_postprocessed,
            }
        )
        return res
    if len(punc_id_list) != len(timestamp_postprocessed):
        logging.warning("length mismatch between punc and timestamp")

    sentence_text = ""
    sentence_text_seg = ""
    ts_list = []
    sentence_start = timestamp_postprocessed[0][0]
    sentence_end = timestamp_postprocessed[0][1]
    texts = text_postprocessed.split()

    for punc_id, timestamp, text in zip_longest(
        punc_id_list, timestamp_postprocessed, texts, fillvalue=None
    ):
        if text is not None:
            # Insert a space when either side of the join is an ASCII letter;
            # otherwise the token is glued directly onto the sentence.
            if "a" <= text[0] <= "z" or "A" <= text[0] <= "Z":
                sentence_text += " " + text
            elif sentence_text and (
                "a" <= sentence_text[-1] <= "z" or "A" <= sentence_text[-1] <= "Z"
            ):
                sentence_text += " " + text
            else:
                sentence_text += text
            sentence_text_seg += text + " "
        ts_list.append(timestamp)

        punc_id = int(punc_id) if punc_id is not None else 1
        if timestamp is not None:
            sentence_end = timestamp[1]

        # Drop the separator added in front of the first token of a sentence.
        # ``startswith`` is safe on an empty string, which happens when the
        # token list is shorter than the punctuation/timestamp lists.
        if sentence_text.startswith(" "):
            sentence_text = sentence_text[1:]

        if punc_id > 1:
            sentence_text += punc_list[punc_id - 2]
            # ``endswith`` likewise tolerates an empty raw segment.
            if sentence_text_seg.endswith(" "):
                sentence_text_seg = sentence_text_seg[:-1]
            sentence = {
                "text": sentence_text,
                "start": sentence_start,
                "end": sentence_end,
                "timestamp": ts_list,
            }
            if return_raw_text:
                sentence["raw_text"] = sentence_text_seg
            res.append(sentence)

            # Start a new sentence; it begins where the previous one ended.
            sentence_text = ""
            sentence_text_seg = ""
            ts_list = []
            sentence_start = sentence_end
    return res