| | |
| | | from funasr.utils.types import str2triple_str |
| | | from funasr.utils.types import str_or_none |
| | | from funasr.utils.vad_utils import slice_padding_fbank |
| | | |
| | | from tqdm import tqdm |
| | | |
| | | def inference_asr( |
| | | maxlenratio: float, |
| | |
| | | |
| | | batch_size_token_ms_cum = 0 |
| | | beg_idx = 0 |
| | | for j, _ in enumerate(range(0, n)): |
| | | beg_asr_total = time.time() |
| | | for j, _ in enumerate(tqdm(range(0, n))): |
| | | batch_size_token_ms_cum += (sorted_data[j][0][1] - sorted_data[j][0][0]) |
| | | if j < n - 1 and (batch_size_token_ms_cum + sorted_data[j + 1][0][1] - sorted_data[j + 1][0][0]) < batch_size_token_ms and (sorted_data[j + 1][0][1] - sorted_data[j + 1][0][0]) < batch_size_token_threshold_s: |
| | | continue |
| | |
| | | beg_idx = end_idx |
| | | batch = {"speech": speech_j, "speech_lengths": speech_lengths_j} |
| | | batch = to_device(batch, device=device) |
| | | print("batch: ", speech_j.shape[0]) |
| | | # print("batch: ", speech_j.shape[0]) |
| | | beg_asr = time.time() |
| | | results = speech2text(**batch) |
| | | end_asr = time.time() |
| | | print("time cost asr: ", end_asr - beg_asr) |
| | | # print("time cost asr: ", end_asr - beg_asr) |
| | | |
| | | if len(results) < 1: |
| | | results = [["", [], [], [], [], [], []]] |
| | | results_sorted.extend(results) |
| | | |
| | | end_asr_total = time.time() |
| | | print("total time cost asr: ", end_asr_total-beg_asr_total) |
| | | restored_data = [0] * n |
| | | for j in range(n): |
| | | index = sorted_data[j][1] |
| | |
| | | quantize_modules: Optional[List[str]] = None, |
| | | quantize_dtype: Optional[str] = "float16", |
| | | streaming: Optional[bool] = False, |
| | | simu_streaming: Optional[bool] = False, |
| | | fake_streaming: Optional[bool] = False, |
| | | full_utt: Optional[bool] = False, |
| | | chunk_size: Optional[int] = 16, |
| | | left_context: Optional[int] = 16, |
| | |
| | | quantize_modules=quantize_modules, |
| | | quantize_dtype=quantize_dtype, |
| | | streaming=streaming, |
| | | simu_streaming=simu_streaming, |
| | | fake_streaming=fake_streaming, |
| | | full_utt=full_utt, |
| | | chunk_size=chunk_size, |
| | | left_context=left_context, |
| | |
| | | final_hyps = speech2text.streaming_decode( |
| | | speech[_end: len(speech)], is_final=True |
| | | ) |
| | | elif speech2text.simu_streaming: |
| | | final_hyps = speech2text.simu_streaming_decode(**batch) |
| | | elif speech2text.fake_streaming: |
| | | final_hyps = speech2text.fake_streaming_decode(**batch) |
| | | elif speech2text.full_utt: |
| | | final_hyps = speech2text.full_utt_decode(**batch) |
| | | else: |
| | |
| | | group.add_argument("--lm_weight", type=float, default=1.0, help="RNNLM weight") |
| | | group.add_argument("--ngram_weight", type=float, default=0.9, help="ngram weight") |
| | | group.add_argument("--streaming", type=str2bool, default=False) |
| | | group.add_argument("--simu_streaming", type=str2bool, default=False) |
| | | group.add_argument("--fake_streaming", type=str2bool, default=False) |
| | | group.add_argument("--full_utt", type=str2bool, default=False) |
| | | group.add_argument("--chunk_size", type=int, default=16) |
| | | group.add_argument("--left_context", type=int, default=16) |