Merge branch 'main' of github.com:alibaba-damo-academy/FunASR
merge
| | |
| | | |
| | | <a name="whats-new"></a> |
| | | ## What's new: |
| | | - 2024/10/29: Real-time Transcription Service 1.12 released,The 2pass-offline mode supports the SensevoiceSmal model;([docs](runtime/readme.md)); |
| | | - 2024/10/10:Added support for the Whisper-large-v3-turbo model, a multitasking model that can perform multilingual speech recognition, speech translation, and language identification. It can be downloaded from the [modelscope](examples/industrial_data_pretraining/whisper/demo.py), and [openai](examples/industrial_data_pretraining/whisper/demo_from_openai.py). |
| | | - 2024/09/26: Offline File Transcription Service 4.6, Offline File Transcription Service of English 1.7,Real-time Transcription Service 1.11 released,fix memory leak & Support the SensevoiceSmall onnx model;File Transcription Service 2.0 GPU released, Fix GPU memory leak; ([docs](runtime/readme.md)); |
| | | - 2024/09/25:keyword spotting models are new supported. Supports fine-tuning and inference for four models: [fsmn_kws](https://modelscope.cn/models/iic/speech_sanm_kws_phone-xiaoyun-commands-online), [fsmn_kws_mt](https://modelscope.cn/models/iic/speech_sanm_kws_phone-xiaoyun-commands-online), [sanm_kws](https://modelscope.cn/models/iic/speech_sanm_kws_phone-xiaoyun-commands-offline), [sanm_kws_streaming](https://modelscope.cn/models/iic/speech_sanm_kws_phone-xiaoyun-commands-online). |
| | |
| | | |
| | | <a name="最新动态"></a> |
| | | ## 最新动态 |
| | | - 2024/10/29: 中文实时语音听写服务 1.12 发布,2pass-offline模式支持SensevoiceSmall模型;详细信息参阅([部署文档](runtime/readme_cn.md)) |
| | | - 2024/10/10:新增加Whisper-large-v3-turbo模型支持,多语言语音识别/翻译/语种识别,支持从 [modelscope](examples/industrial_data_pretraining/whisper/demo.py)仓库下载,也支持从 [openai](examples/industrial_data_pretraining/whisper/demo_from_openai.py)仓库下载模型。 |
| | | - 2024/09/26: 中文离线文件转写服务 4.6、英文离线文件转写服务 1.7、中文实时语音听写服务 1.11 发布,修复ONNX内存泄漏、支持SensevoiceSmall onnx模型;中文离线文件转写服务GPU 2.0 发布,修复显存泄漏; 详细信息参阅([部署文档](runtime/readme_cn.md)) |
| | | - 2024/09/25:新增语音唤醒模型,支持[fsmn_kws](https://modelscope.cn/models/iic/speech_sanm_kws_phone-xiaoyun-commands-online), [fsmn_kws_mt](https://modelscope.cn/models/iic/speech_sanm_kws_phone-xiaoyun-commands-online), [sanm_kws](https://modelscope.cn/models/iic/speech_sanm_kws_phone-xiaoyun-commands-offline), [sanm_kws_streaming](https://modelscope.cn/models/iic/speech_sanm_kws_phone-xiaoyun-commands-online) 4个模型的微调和推理。 |
| | |
| | | training_data_to_tokens, |
| | | ) |
| | | |
| | | |
| | | """ |
| | | Runs Evaluation on data in the format of : <semiotic class>\t<unnormalized text>\t<`self` if trivial class or normalized text> |
| | | like the Google text normalization data https://www.kaggle.com/richardwilliamsproat/text-normalization-for-english-russian-and-polish |
| | | """ |
| | | |
| | | |
| | | def parse_args(): |
| | | parser = ArgumentParser() |
| | | parser.add_argument("--input", help="input file path", type=str) |
| | | parser.add_argument("--input", help="input file path", type=str, required=True) |
| | | parser.add_argument( |
| | | "--lang", |
| | | help="language", |
| | |
| | | ) |
| | | return parser.parse_args() |
| | | |
| | | |
| | | if __name__ == "__main__": |
| | | # Example usage: |
| | | # python run_evaluate.py --input=<INPUT> --cat=<CATEGORY> --filter |
| | | args = parse_args() |
| | | if args.lang == "en": |
| | | from fun_text_processing.inverse_text_normalization.en.clean_eval_data import ( |
| | | filter_loaded_data, |
| | | ) |
| | | from fun_text_processing.inverse_text_normalization.en.clean_eval_data import filter_loaded_data |
| | | |
| | | file_path = args.input |
| | | inverse_normalizer = InverseNormalizer() |
| | | |
| | |
| | | if args.filter: |
| | | training_data = filter_loaded_data(training_data) |
| | | |
| | | # Evaluate at sentence level if no specific category is provided |
| | | if args.category is None: |
| | | print("Sentence level evaluation...") |
| | | sentences_un_normalized, sentences_normalized, _ = training_data_to_sentences(training_data) |
| | |
| | | ) |
| | | print("- Accuracy: " + str(sentences_accuracy)) |
| | | |
| | | # Evaluate at token level |
| | | print("Token level evaluation...") |
| | | tokens_per_type = training_data_to_tokens(training_data, category=args.category) |
| | | token_accuracy = {} |
| | | for token_type in tokens_per_type: |
| | | for token_type, (tokens_un_normalized, tokens_normalized) in tokens_per_type.items(): |
| | | print("- Token type: " + token_type) |
| | | tokens_un_normalized, tokens_normalized = tokens_per_type[token_type] |
| | | print(" - Data: " + str(len(tokens_normalized)) + " tokens") |
| | | tokens_prediction = inverse_normalizer.inverse_normalize_list(tokens_normalized) |
| | | print(" - Denormalized. Evaluating...") |
| | |
| | | tokens_prediction, tokens_un_normalized, input=tokens_normalized |
| | | ) |
| | | print(" - Accuracy: " + str(token_accuracy[token_type])) |
| | | token_count_per_type = { |
| | | token_type: len(tokens_per_type[token_type][0]) for token_type in tokens_per_type |
| | | } |
| | | |
| | | # Calculate weighted token accuracy |
| | | token_count_per_type = {token_type: len(tokens) for token_type, (tokens, _) in tokens_per_type.items()} |
| | | token_weighted_accuracy = [ |
| | | token_count_per_type[token_type] * accuracy |
| | | for token_type, accuracy in token_accuracy.items() |
| | |
| | | if token_type not in known_types: |
| | | raise ValueError("Unexpected token type: " + token_type) |
| | | |
| | | # Output table summarizing evaluation results if no specific category is provided |
| | | if args.category is None: |
| | | c1 = ["Class", "sent level"] + known_types |
| | | c2 = ["Num Tokens", len(sentences_normalized)] + [ |
| | | token_count_per_type[known_type] if known_type in tokens_per_type else "0" |
| | | for known_type in known_types |
| | | str(token_count_per_type.get(known_type, 0)) for known_type in known_types |
| | | ] |
| | | c3 = ["Denormalization", sentences_accuracy] + [ |
| | | token_accuracy[known_type] if known_type in token_accuracy else "0" |
| | | for known_type in known_types |
| | | c3 = ["Denormalization", str(sentences_accuracy)] + [ |
| | | str(token_accuracy.get(known_type, "0")) for known_type in known_types |
| | | ] |
| | | |
| | | for i in range(len(c1)): |
| | | print(f"{str(c1[i]):10s} | {str(c2[i]):10s} | {str(c3[i]):5s}") |
| | | print(f"{c1[i]:10s} | {c2[i]:10s} | {c3[i]:5s}") |
| | | else: |
| | | print(f"numbers\t{token_count_per_type[args.category]}") |
| | | print(f"Denormalization\t{token_accuracy[args.category]}") |
| | |
| | | import time |
| | | import math |
| | | import torch |
| | | import numpy as np |
| | | from torch import nn |
| | | from enum import Enum |
| | | from dataclasses import dataclass |
| | |
| | | cache["stats"].data_buf_all = torch.cat( |
| | | (cache["stats"].data_buf_all, cache["stats"].waveform[0]) |
| | | ) |
| | | for offset in range( |
| | | 0, cache["stats"].waveform.shape[1] - frame_sample_length + 1, frame_shift_length |
| | | ): |
| | | cache["stats"].decibel.append( |
| | | 10 |
| | | * math.log10( |
| | | (cache["stats"].waveform[0][offset : offset + frame_sample_length]) |
| | | .square() |
| | | .sum() |
| | | + 0.000001 |
| | | ) |
| | | ) |
| | | |
| | | waveform_numpy = cache["stats"].waveform.numpy() |
| | | |
| | | offsets = np.arange(0, waveform_numpy.shape[1] - frame_sample_length + 1, frame_shift_length) |
| | | frames = waveform_numpy[0, offsets[:, np.newaxis] + np.arange(frame_sample_length)] |
| | | |
| | | decibel_numpy = 10 * np.log10(np.sum(np.square(frames), axis=1) + 0.000001) |
| | | decibel_numpy = decibel_numpy.tolist() |
| | | |
| | | cache["stats"].decibel.extend(decibel_numpy) |
| | | |
| | | |
| | | def ComputeScores(self, feats: torch.Tensor, cache: dict = {}) -> None: |
| | | scores = self.encoder(feats, cache=cache["encoder"]).to("cpu") # return B * T * D |
| | |
| | | cur_seg = cache["stats"].output_data_buf[-1] |
| | | if cur_seg.end_ms != start_frm * self.vad_opts.frame_in_ms: |
| | | print("warning\n") |
| | | out_pos = len(cur_seg.buffer) # cur_seg.buff现在没做任何操作 |
| | | data_to_pop = 0 |
| | | if end_point_is_sent_end: |
| | | data_to_pop = expected_sample_number |
| | |
| | | expected_sample_number = len(cache["stats"].data_buf) |
| | | |
| | | cur_seg.doa = 0 |
| | | for sample_cpy_out in range(0, data_to_pop): |
| | | # cur_seg.buffer[out_pos ++] = data_buf_.back(); |
| | | out_pos += 1 |
| | | for sample_cpy_out in range(data_to_pop, expected_sample_number): |
| | | # cur_seg.buffer[out_pos++] = data_buf_.back() |
| | | out_pos += 1 |
| | | if cur_seg.end_ms != start_frm * self.vad_opts.frame_in_ms: |
| | | print("Something wrong with the VAD algorithm\n") |
| | | cache["stats"].data_buf_start_frame += frm_cnt |
| | |
| | | assert len(cache["stats"].sil_pdf_ids) == self.vad_opts.silence_pdf_num |
| | | if len(cache["stats"].sil_pdf_ids) > 0: |
| | | assert len(cache["stats"].scores) == 1 # 只支持batch_size = 1的测试 |
| | | sil_pdf_scores = [ |
| | | cache["stats"].scores[0][t][sil_pdf_id] for sil_pdf_id in cache["stats"].sil_pdf_ids |
| | | ] |
| | | sum_score = sum(sil_pdf_scores) |
| | | """ |
| | | - Change type of `sum_score` to float. The reason is that `sum_score` is a tensor with single element. |
| | | and `torch.Tensor` is slower `float` when tensor has only one element. |
| | | - Put the iteration of `sil_pdf_ids` inside `sum()` to reduce the overhead of creating a new list. |
| | | - The default `sil_pdf_ids` is [0], the `if` statement is used to reduce the overhead of expression |
| | | generation, which result in a mere (~2%) performance gain. |
| | | """ |
| | | if len(cache["stats"].sil_pdf_ids) > 1: |
| | | sum_score = sum(cache["stats"].scores[0][t][sil_pdf_id].item() for sil_pdf_id in cache["stats"].sil_pdf_ids) |
| | | else: |
| | | sum_score = cache["stats"].scores[0][t][cache["stats"].sil_pdf_ids[0]].item() |
| | | noise_prob = math.log(sum_score) * self.vad_opts.speech_2_noise_ratio |
| | | total_score = 1.0 |
| | | sum_score = total_score - sum_score |
| | |
| | | |
| | | | TIME | INFO | IMAGE VERSION | IMAGE ID | |
| | | |------------|-------------------------------------------------------------------------------------|-------------------------------------|--------------| |
| | | | 2024.10.29 | The 2pass-offline mode supports the SensevoiceSmal model | funasr-runtime-sdk-online-cpu-0.1.12 | f5febc5cf13a | |
| | | | 2024.09.26 | Fix memory leak | funasr-runtime-sdk-online-cpu-0.1.11 | e51a36c42771 | |
| | | | 2024.05.15 | Adapting to FunASR 1.0 model structure | funasr-runtime-sdk-online-cpu-0.1.10 | 1c2adfcff84d | |
| | | | 2024.03.05 | docker image supports ARM64 platform, update modelscope | funasr-runtime-sdk-online-cpu-0.1.9 | 4a875e08c7a2 | |
| | |
| | | ### Pull Docker Image |
| | | Use the following command to pull and start the FunASR software package docker image: |
| | | ```shell |
| | | sudo docker pull registry.cn-hangzhou.aliyuncs.com/funasr_repo/funasr:funasr-runtime-sdk-online-cpu-0.1.10 |
| | | sudo docker pull registry.cn-hangzhou.aliyuncs.com/funasr_repo/funasr:funasr-runtime-sdk-online-cpu-0.1.12 |
| | | mkdir -p ./funasr-runtime-resources/models |
| | | sudo docker run -p 10096:10095 -it --privileged=true -v $PWD/funasr-runtime-resources/models:/workspace/models registry.cn-hangzhou.aliyuncs.com/funasr_repo/funasr:funasr-runtime-sdk-online-cpu-0.1.10 |
| | | sudo docker run -p 10096:10095 -it --privileged=true -v $PWD/funasr-runtime-resources/models:/workspace/models registry.cn-hangzhou.aliyuncs.com/funasr_repo/funasr:funasr-runtime-sdk-online-cpu-0.1.12 |
| | | ``` |
| | | |
| | | ### Launching the Server |
| | |
| | | |
| | | | 时间 | 详情 | 镜像版本 | 镜像ID | |
| | | |:-----------|:----------------------------------|--------------------------------------|--------------| |
| | | | 2024.10.29 | 2pass-offline模式支持SensevoiceSmall模型 | funasr-runtime-sdk-online-cpu-0.1.12 | f5febc5cf13a | |
| | | | 2024.09.26 | 修复内存泄漏 | funasr-runtime-sdk-online-cpu-0.1.11 | e51a36c42771 | |
| | | | 2024.05.15 | 适配FunASR 1.0模型结构 | funasr-runtime-sdk-online-cpu-0.1.10 | 1c2adfcff84d | |
| | | | 2024.03.05 | docker镜像支持arm64平台,升级modelscope版本 | funasr-runtime-sdk-online-cpu-0.1.9 | 4a875e08c7a2 | |
| | |
| | | |
| | | ```shell |
| | | sudo docker pull \ |
| | | registry.cn-hangzhou.aliyuncs.com/funasr_repo/funasr:funasr-runtime-sdk-online-cpu-0.1.11 |
| | | registry.cn-hangzhou.aliyuncs.com/funasr_repo/funasr:funasr-runtime-sdk-online-cpu-0.1.12 |
| | | mkdir -p ./funasr-runtime-resources/models |
| | | sudo docker run -p 10096:10095 -it --privileged=true \ |
| | | -v $PWD/funasr-runtime-resources/models:/workspace/models \ |
| | | registry.cn-hangzhou.aliyuncs.com/funasr_repo/funasr:funasr-runtime-sdk-online-cpu-0.1.11 |
| | | registry.cn-hangzhou.aliyuncs.com/funasr_repo/funasr:funasr-runtime-sdk-online-cpu-0.1.12 |
| | | ``` |
| | | |
| | | ### 服务端启动 |
| | |
| | | --hotword /workspace/models/hotwords.txt > log.txt 2>&1 & |
| | | |
| | | # 如果您想关闭ssl,增加参数:--certfile 0 |
| | | # 如果您想使用时间戳或者nn热词模型进行部署,请设置--model-dir为对应模型: |
| | | # 如果您想使用SenseVoiceSmall模型、时间戳、nn热词模型进行部署,请设置--model-dir为对应模型: |
| | | # iic/SenseVoiceSmall-onnx |
| | | # damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-onnx(时间戳) |
| | | # damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404-onnx(nn热词) |
| | | # 如果您想在服务端加载热词,请在宿主机文件./funasr-runtime-resources/models/hotwords.txt配置热词(docker映射地址为/workspace/models/hotwords.txt): |
| | | # 每行一个热词,格式(热词 权重):阿里巴巴 20(注:热词理论上无限制,但为了兼顾性能和效果,建议热词长度不超过10,个数不超过1k,权重1~100) |
| | | # SenseVoiceSmall-onnx识别结果中“<|zh|><|NEUTRAL|><|Speech|> ”分别为对应的语种、情感、事件信息 |
| | | ``` |
| | | 服务端详细参数介绍可参考[服务端用法详解](#服务端用法详解) |
| | | ### 客户端测试与使用 |
| | |
| | | _FUNASRAPI FUNASR_RESULT FunTpassInferBuffer(FUNASR_HANDLE handle, FUNASR_HANDLE online_handle, const char* sz_buf, |
| | | int n_len, std::vector<std::vector<std::string>> &punc_cache, bool input_finished=true, |
| | | int sampling_rate=16000, std::string wav_format="pcm", ASR_TYPE mode=ASR_TWO_PASS, |
| | | const std::vector<std::vector<float>> &hw_emb={{0.0}}, bool itn=true, FUNASR_DEC_HANDLE dec_handle=nullptr); |
| | | const std::vector<std::vector<float>> &hw_emb={{0.0}}, bool itn=true, FUNASR_DEC_HANDLE dec_handle=nullptr, |
| | | std::string svs_lang="auto", bool svs_itn=true); |
| | | _FUNASRAPI void FunTpassUninit(FUNASR_HANDLE handle); |
| | | _FUNASRAPI void FunTpassOnlineUninit(FUNASR_HANDLE handle); |
| | | |
| | |
| | | virtual void StartUtterance() = 0; |
| | | virtual void EndUtterance() = 0; |
| | | virtual void Reset() = 0; |
| | | virtual string GreedySearch(float* in, int n_len, int64_t token_nums, bool is_stamp=false, std::vector<float> us_alphas={0}, std::vector<float> us_cif_peak={0}){return "";}; |
| | | virtual void InitAsr(const std::string &am_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num){}; |
| | | virtual void InitAsr(const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num){}; |
| | | virtual void InitAsr(const std::string &am_model, const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num){}; |
| | | virtual void InitAsr(const std::string &am_model, const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, |
| | | const std::string &am_config, const std::string &token_file, const std::string &online_token_file, int thread_num){}; |
| | | virtual void InitLm(const std::string &lm_file, const std::string &lm_config, const std::string &lex_file){}; |
| | | virtual void InitFstDecoder(){}; |
| | | virtual std::string Forward(float *din, int len, bool input_finished, const std::vector<std::vector<float>> &hw_emb={{0.0}}, void* wfst_decoder=nullptr){return "";}; |
| | |
| | | bool UseVad(){return use_vad;}; |
| | | bool UsePunc(){return use_punc;}; |
| | | bool UseITN(){return use_itn;}; |
| | | std::string GetModelType(){return model_type;}; |
| | | |
| | | private: |
| | | bool use_vad=false; |
| | | bool use_punc=false; |
| | | bool use_itn=false; |
| | | std::string model_type = MODEL_PARA; |
| | | }; |
| | | |
| | | TpassStream *CreateTpassStream(std::map<std::string, std::string>& model_path, int thread_num=1); |
| | |
| | | _FUNASRAPI FUNASR_RESULT FunTpassInferBuffer(FUNASR_HANDLE handle, FUNASR_HANDLE online_handle, const char* sz_buf, |
| | | int n_len, std::vector<std::vector<std::string>> &punc_cache, bool input_finished, |
| | | int sampling_rate, std::string wav_format, ASR_TYPE mode, |
| | | const std::vector<std::vector<float>> &hw_emb, bool itn, FUNASR_DEC_HANDLE dec_handle) |
| | | const std::vector<std::vector<float>> &hw_emb, bool itn, FUNASR_DEC_HANDLE dec_handle, |
| | | std::string svs_lang, bool svs_itn) |
| | | { |
| | | funasr::TpassStream* tpass_stream = (funasr::TpassStream*)handle; |
| | | funasr::TpassOnlineStream* tpass_online_stream = (funasr::TpassOnlineStream*)online_handle; |
| | |
| | | |
| | | funasr::AudioFrame* frame = nullptr; |
| | | while(audio->FetchChunck(frame) > 0){ |
| | | string msg = ((funasr::ParaformerOnline*)asr_online_handle)->Forward(frame->data, frame->len, frame->is_final); |
| | | string msg = (asr_online_handle)->Forward(frame->data, frame->len, frame->is_final); |
| | | if(mode == ASR_ONLINE){ |
| | | ((funasr::ParaformerOnline*)asr_online_handle)->online_res += msg; |
| | | if(frame->is_final){ |
| | |
| | | len = new int[1]; |
| | | buff[0] = frame->data; |
| | | len[0] = frame->len; |
| | | vector<string> msgs = ((funasr::Paraformer*)asr_handle)->Forward(buff, len, frame->is_final, hw_emb, dec_handle); |
| | | vector<string> msgs; |
| | | if(tpass_stream->GetModelType() == MODEL_SVS){ |
| | | msgs = (tpass_stream->asr_handle)->Forward(buff, len, true, svs_lang, svs_itn, 1); |
| | | }else{ |
| | | msgs = (tpass_stream->asr_handle)->Forward(buff, len, true, hw_emb, dec_handle, 1); |
| | | } |
| | | string msg = msgs.size()>0?msgs[0]:""; |
| | | std::vector<std::string> msg_vec = funasr::SplitStr(msg, " | "); // split with timestamp |
| | | if(msg_vec.size()==0){ |
| | |
| | | p_result->stamp += cur_stamp + "]"; |
| | | } |
| | | |
| | | string msg_punc = punc_online_handle->AddPunc(msg.c_str(), punc_cache[1]); |
| | | if(input_finished){ |
| | | msg_punc += "。"; |
| | | } |
| | | p_result->tpass_msg = msg_punc; |
| | | #if !defined(__APPLE__) |
| | | if(tpass_stream->UseITN() && itn){ |
| | | string msg_itn = tpass_stream->itn_handle->Normalize(msg_punc); |
| | | // TimestampSmooth |
| | | if(!(p_result->stamp).empty()){ |
| | | std::string new_stamp = funasr::TimestampSmooth(p_result->tpass_msg, msg_itn, p_result->stamp); |
| | | if(!new_stamp.empty()){ |
| | | p_result->stamp = new_stamp; |
| | | } |
| | | if (tpass_stream->GetModelType() == MODEL_PARA){ |
| | | string msg_punc = punc_online_handle->AddPunc(msg.c_str(), punc_cache[1]); |
| | | if(input_finished){ |
| | | msg_punc += "。"; |
| | | } |
| | | p_result->tpass_msg = msg_itn; |
| | | } |
| | | p_result->tpass_msg = msg_punc; |
| | | |
| | | #if !defined(__APPLE__) |
| | | if(tpass_stream->UseITN() && itn){ |
| | | string msg_itn = tpass_stream->itn_handle->Normalize(msg_punc); |
| | | // TimestampSmooth |
| | | if(!(p_result->stamp).empty()){ |
| | | std::string new_stamp = funasr::TimestampSmooth(p_result->tpass_msg, msg_itn, p_result->stamp); |
| | | if(!new_stamp.empty()){ |
| | | p_result->stamp = new_stamp; |
| | | } |
| | | } |
| | | p_result->tpass_msg = msg_itn; |
| | | } |
| | | #endif |
| | | }else{ |
| | | p_result->tpass_msg = msg; |
| | | } |
| | | if (!(p_result->stamp).empty()){ |
| | | p_result->stamp_sents = funasr::TimestampSentence(p_result->tpass_msg, p_result->stamp); |
| | | } |
| | |
| | | |
| | | namespace funasr { |
| | | |
| | | ParaformerOnline::ParaformerOnline(Paraformer* para_handle, std::vector<int> chunk_size) |
| | | :para_handle_(std::move(para_handle)),chunk_size(chunk_size),session_options_{}{ |
| | | InitOnline( |
| | | para_handle_->fbank_opts_, |
| | | para_handle_->encoder_session_, |
| | | para_handle_->decoder_session_, |
| | | para_handle_->en_szInputNames_, |
| | | para_handle_->en_szOutputNames_, |
| | | para_handle_->de_szInputNames_, |
| | | para_handle_->de_szOutputNames_, |
| | | para_handle_->means_list_, |
| | | para_handle_->vars_list_); |
| | | ParaformerOnline::ParaformerOnline(Model* offline_handle, std::vector<int> chunk_size, std::string model_type) |
| | | :offline_handle_(std::move(offline_handle)),chunk_size(chunk_size),session_options_{}{ |
| | | if(model_type == MODEL_PARA){ |
| | | Paraformer* para_handle = dynamic_cast<Paraformer*>(offline_handle_); |
| | | InitOnline( |
| | | para_handle->fbank_opts_, |
| | | para_handle->encoder_session_, |
| | | para_handle->decoder_session_, |
| | | para_handle->en_szInputNames_, |
| | | para_handle->en_szOutputNames_, |
| | | para_handle->de_szInputNames_, |
| | | para_handle->de_szOutputNames_, |
| | | para_handle->means_list_, |
| | | para_handle->vars_list_, |
| | | para_handle->frame_length, |
| | | para_handle->frame_shift, |
| | | para_handle->n_mels, |
| | | para_handle->lfr_m, |
| | | para_handle->lfr_n, |
| | | para_handle->encoder_size, |
| | | para_handle->fsmn_layers, |
| | | para_handle->fsmn_lorder, |
| | | para_handle->fsmn_dims, |
| | | para_handle->cif_threshold, |
| | | para_handle->tail_alphas); |
| | | }else if(model_type == MODEL_SVS){ |
| | | SenseVoiceSmall* svs_handle = dynamic_cast<SenseVoiceSmall*>(offline_handle_); |
| | | InitOnline( |
| | | svs_handle->fbank_opts_, |
| | | svs_handle->encoder_session_, |
| | | svs_handle->decoder_session_, |
| | | svs_handle->en_szInputNames_, |
| | | svs_handle->en_szOutputNames_, |
| | | svs_handle->de_szInputNames_, |
| | | svs_handle->de_szOutputNames_, |
| | | svs_handle->means_list_, |
| | | svs_handle->vars_list_, |
| | | svs_handle->frame_length, |
| | | svs_handle->frame_shift, |
| | | svs_handle->n_mels, |
| | | svs_handle->lfr_m, |
| | | svs_handle->lfr_n, |
| | | svs_handle->encoder_size, |
| | | svs_handle->fsmn_layers, |
| | | svs_handle->fsmn_lorder, |
| | | svs_handle->fsmn_dims, |
| | | svs_handle->cif_threshold, |
| | | svs_handle->tail_alphas); |
| | | } |
| | | InitCache(); |
| | | } |
| | | |
| | |
| | | vector<const char*> &de_szInputNames, |
| | | vector<const char*> &de_szOutputNames, |
| | | vector<float> &means_list, |
| | | vector<float> &vars_list){ |
| | | vector<float> &vars_list, |
| | | int frame_length_, |
| | | int frame_shift_, |
| | | int n_mels_, |
| | | int lfr_m_, |
| | | int lfr_n_, |
| | | int encoder_size_, |
| | | int fsmn_layers_, |
| | | int fsmn_lorder_, |
| | | int fsmn_dims_, |
| | | float cif_threshold_, |
| | | float tail_alphas_){ |
| | | fbank_opts_ = fbank_opts; |
| | | encoder_session_ = encoder_session; |
| | | decoder_session_ = decoder_session; |
| | |
| | | means_list_ = means_list; |
| | | vars_list_ = vars_list; |
| | | |
| | | frame_length = para_handle_->frame_length; |
| | | frame_shift = para_handle_->frame_shift; |
| | | n_mels = para_handle_->n_mels; |
| | | lfr_m = para_handle_->lfr_m; |
| | | lfr_n = para_handle_->lfr_n; |
| | | encoder_size = para_handle_->encoder_size; |
| | | fsmn_layers = para_handle_->fsmn_layers; |
| | | fsmn_lorder = para_handle_->fsmn_lorder; |
| | | fsmn_dims = para_handle_->fsmn_dims; |
| | | cif_threshold = para_handle_->cif_threshold; |
| | | tail_alphas = para_handle_->tail_alphas; |
| | | frame_length = frame_length_; |
| | | frame_shift = frame_shift_; |
| | | n_mels = n_mels_; |
| | | lfr_m = lfr_m_; |
| | | lfr_n = lfr_n_; |
| | | encoder_size = encoder_size_; |
| | | fsmn_layers = fsmn_layers_; |
| | | fsmn_lorder = fsmn_lorder_; |
| | | fsmn_dims = fsmn_dims_; |
| | | cif_threshold = cif_threshold_; |
| | | tail_alphas = tail_alphas_; |
| | | |
| | | // other vars |
| | | sqrt_factor = std::sqrt(encoder_size); |
| | | for(int i=0; i<fsmn_lorder*fsmn_dims; i++){ |
| | | fsmn_init_cache_.emplace_back(0); |
| | | } |
| | | chunk_len = chunk_size[1]*frame_shift*lfr_n*para_handle_->asr_sample_rate/1000; |
| | | chunk_len = chunk_size[1]*frame_shift*lfr_n*offline_handle_->GetAsrSampleRate()/1000; |
| | | |
| | | frame_sample_length_ = para_handle_->asr_sample_rate / 1000 * frame_length; |
| | | frame_shift_sample_length_ = para_handle_->asr_sample_rate / 1000 * frame_shift; |
| | | frame_sample_length_ = offline_handle_->GetAsrSampleRate() / 1000 * frame_length; |
| | | frame_shift_sample_length_ = offline_handle_->GetAsrSampleRate() / 1000 * frame_shift; |
| | | |
| | | } |
| | | |
| | |
| | | |
| | | std::vector<int64_t> decoder_shape = decoder_tensor[0].GetTensorTypeAndShapeInfo().GetShape(); |
| | | float* float_data = decoder_tensor[0].GetTensorMutableData<float>(); |
| | | result = para_handle_->GreedySearch(float_data, list_frame.size(), decoder_shape[2]); |
| | | result = offline_handle_->GreedySearch(float_data, list_frame.size(), decoder_shape[2]); |
| | | } |
| | | }catch (std::exception const &e) |
| | | { |
| | |
| | | if(is_first_chunk){ |
| | | is_first_chunk = false; |
| | | } |
| | | ExtractFeats(para_handle_->asr_sample_rate, wav_feats, waves, input_finished); |
| | | ExtractFeats(offline_handle_->GetAsrSampleRate(), wav_feats, waves, input_finished); |
| | | if(wav_feats.size() == 0){ |
| | | return result; |
| | | } |
| | |
| | | vector<const char*> &de_szInputNames, |
| | | vector<const char*> &de_szOutputNames, |
| | | vector<float> &means_list, |
| | | vector<float> &vars_list); |
| | | vector<float> &vars_list, |
| | | int frame_length_, |
| | | int frame_shift_, |
| | | int n_mels_, |
| | | int lfr_m_, |
| | | int lfr_n_, |
| | | int encoder_size_, |
| | | int fsmn_layers_, |
| | | int fsmn_lorder_, |
| | | int fsmn_dims_, |
| | | float cif_threshold_, |
| | | float tail_alphas_); |
| | | |
| | | void StartUtterance() |
| | | { |
| | |
| | | { |
| | | } |
| | | |
| | | Paraformer* para_handle_ = nullptr; |
| | | // from para_handle_ |
| | | Model* offline_handle_ = nullptr; |
| | | // from offline_handle_ |
| | | knf::FbankOptions fbank_opts_; |
| | | std::shared_ptr<Ort::Session> encoder_session_ = nullptr; |
| | | std::shared_ptr<Ort::Session> decoder_session_ = nullptr; |
| | |
| | | vector<const char*> de_szOutputNames_; |
| | | vector<float> means_list_; |
| | | vector<float> vars_list_; |
| | | // configs from para_handle_ |
| | | // configs from offline_handle_ |
| | | int frame_length = 25; |
| | | int frame_shift = 10; |
| | | int n_mels = 80; |
| | |
| | | double sqrt_factor; |
| | | |
| | | public: |
| | | ParaformerOnline(Paraformer* para_handle, std::vector<int> chunk_size); |
| | | ParaformerOnline(Model* offline_handle, std::vector<int> chunk_size, std::string model_type=MODEL_PARA); |
| | | ~ParaformerOnline(); |
| | | void Reset(); |
| | | void ResetCache(); |
| | |
| | | string Forward(float* din, int len, bool input_finished, const std::vector<std::vector<float>> &hw_emb={{0.0}}, void* wfst_decoder=nullptr); |
| | | string Rescoring(); |
| | | |
| | | int GetAsrSampleRate() { return para_handle_->asr_sample_rate; }; |
| | | int GetAsrSampleRate() { return offline_handle_->GetAsrSampleRate(); }; |
| | | |
| | | // 2pass |
| | | std::string online_res; |
| | |
| | | } |
| | | |
| | | // 2pass |
| | | void Paraformer::InitAsr(const std::string &am_model, const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num){ |
| | | void Paraformer::InitAsr(const std::string &am_model, const std::string &en_model, const std::string &de_model, |
| | | const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, const std::string &online_token_file, int thread_num){ |
| | | // online |
| | | InitAsr(en_model, de_model, am_cmvn, am_config, token_file, thread_num); |
| | | InitAsr(en_model, de_model, am_cmvn, am_config, online_token_file, thread_num); |
| | | |
| | | // offline |
| | | try { |
| | |
| | | exit(-1); |
| | | } |
| | | |
| | | // string strName; |
| | | // GetInputName(m_session_.get(), strName); |
| | | // m_strInputNames.push_back(strName.c_str()); |
| | | // GetInputName(m_session_.get(), strName,1); |
| | | // m_strInputNames.push_back(strName); |
| | | |
| | | // if (use_hotword) { |
| | | // GetInputName(m_session_.get(), strName, 2); |
| | | // m_strInputNames.push_back(strName); |
| | | // } |
| | | |
| | | // // support time stamp |
| | | // size_t numOutputNodes = m_session_->GetOutputCount(); |
| | | // for(int index=0; index<numOutputNodes; index++){ |
| | | // GetOutputName(m_session_.get(), strName, index); |
| | | // m_strOutputNames.push_back(strName); |
| | | // } |
| | | |
| | | // for (auto& item : m_strInputNames) |
| | | // m_szInputNames.push_back(item.c_str()); |
| | | // for (auto& item : m_strOutputNames) |
| | | // m_szOutputNames.push_back(item.c_str()); |
| | | GetInputNames(m_session_.get(), m_strInputNames, m_szInputNames); |
| | | GetOutputNames(m_session_.get(), m_strOutputNames, m_szOutputNames); |
| | | } |
| | |
| | | // online |
| | | void InitAsr(const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num); |
| | | // 2pass |
| | | void InitAsr(const std::string &am_model, const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num); |
| | | void InitAsr(const std::string &am_model, const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, |
| | | const std::string &am_config, const std::string &token_file, const std::string &online_token_file, int thread_num); |
| | | void InitHwCompiler(const std::string &hw_model, int thread_num); |
| | | void InitSegDict(const std::string &seg_dict_model); |
| | | std::vector<std::vector<float>> CompileHotwordEmbedding(std::string &hotwords); |
| | |
| | | LoadCmvn(am_cmvn.c_str()); |
| | | } |
| | | |
| | | // online |
| | | void SenseVoiceSmall::InitAsr(const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num){ |
| | | |
| | | LoadOnlineConfigFromYaml(am_config.c_str()); |
| | | // knf options |
| | | fbank_opts_.frame_opts.dither = 0; |
| | | fbank_opts_.mel_opts.num_bins = n_mels; |
| | | fbank_opts_.frame_opts.samp_freq = asr_sample_rate; |
| | | fbank_opts_.frame_opts.window_type = window_type; |
| | | fbank_opts_.frame_opts.frame_shift_ms = frame_shift; |
| | | fbank_opts_.frame_opts.frame_length_ms = frame_length; |
| | | fbank_opts_.energy_floor = 0; |
| | | fbank_opts_.mel_opts.debug_mel = false; |
| | | |
| | | // session_options_.SetInterOpNumThreads(1); |
| | | session_options_.SetIntraOpNumThreads(thread_num); |
| | | session_options_.SetGraphOptimizationLevel(ORT_ENABLE_ALL); |
| | | // DisableCpuMemArena can improve performance |
| | | session_options_.DisableCpuMemArena(); |
| | | |
| | | try { |
| | | encoder_session_ = std::make_unique<Ort::Session>(env_, ORTSTRING(en_model).c_str(), session_options_); |
| | | LOG(INFO) << "Successfully load model from " << en_model; |
| | | } catch (std::exception const &e) { |
| | | LOG(ERROR) << "Error when load am encoder model: " << e.what(); |
| | | exit(-1); |
| | | } |
| | | |
| | | try { |
| | | decoder_session_ = std::make_unique<Ort::Session>(env_, ORTSTRING(de_model).c_str(), session_options_); |
| | | LOG(INFO) << "Successfully load model from " << de_model; |
| | | } catch (std::exception const &e) { |
| | | LOG(ERROR) << "Error when load am decoder model: " << e.what(); |
| | | exit(-1); |
| | | } |
| | | |
| | | // encoder |
| | | string strName; |
| | | GetInputName(encoder_session_.get(), strName); |
| | | en_strInputNames.push_back(strName.c_str()); |
| | | GetInputName(encoder_session_.get(), strName,1); |
| | | en_strInputNames.push_back(strName); |
| | | |
| | | GetOutputName(encoder_session_.get(), strName); |
| | | en_strOutputNames.push_back(strName); |
| | | GetOutputName(encoder_session_.get(), strName,1); |
| | | en_strOutputNames.push_back(strName); |
| | | GetOutputName(encoder_session_.get(), strName,2); |
| | | en_strOutputNames.push_back(strName); |
| | | |
| | | for (auto& item : en_strInputNames) |
| | | en_szInputNames_.push_back(item.c_str()); |
| | | for (auto& item : en_strOutputNames) |
| | | en_szOutputNames_.push_back(item.c_str()); |
| | | |
| | | // decoder |
| | | int de_input_len = 4 + fsmn_layers; |
| | | int de_out_len = 2 + fsmn_layers; |
| | | for(int i=0;i<de_input_len; i++){ |
| | | GetInputName(decoder_session_.get(), strName, i); |
| | | de_strInputNames.push_back(strName.c_str()); |
| | | } |
| | | |
| | | for(int i=0;i<de_out_len; i++){ |
| | | GetOutputName(decoder_session_.get(), strName,i); |
| | | de_strOutputNames.push_back(strName); |
| | | } |
| | | |
| | | for (auto& item : de_strInputNames) |
| | | de_szInputNames_.push_back(item.c_str()); |
| | | for (auto& item : de_strOutputNames) |
| | | de_szOutputNames_.push_back(item.c_str()); |
| | | |
| | | online_vocab = new Vocab(token_file.c_str()); |
| | | phone_set_ = new PhoneSet(token_file.c_str()); |
| | | LoadCmvn(am_cmvn.c_str()); |
| | | } |
| | | |
| | | // 2pass |
| | | void SenseVoiceSmall::InitAsr(const std::string &am_model, const std::string &en_model, const std::string &de_model, |
| | | const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, const std::string &online_token_file, int thread_num){ |
| | | // online |
| | | InitAsr(en_model, de_model, am_cmvn, am_config, online_token_file, thread_num); |
| | | |
| | | // offline |
| | | try { |
| | | m_session_ = std::make_unique<Ort::Session>(env_, ORTSTRING(am_model).c_str(), session_options_); |
| | | LOG(INFO) << "Successfully load model from " << am_model; |
| | | } catch (std::exception const &e) { |
| | | LOG(ERROR) << "Error when load am onnx model: " << e.what(); |
| | | exit(-1); |
| | | } |
| | | |
| | | GetInputNames(m_session_.get(), m_strInputNames, m_szInputNames); |
| | | GetOutputNames(m_session_.get(), m_strOutputNames, m_szOutputNames); |
| | | vocab = new Vocab(token_file.c_str()); |
| | | } |
| | | |
| | | void SenseVoiceSmall::LoadOnlineConfigFromYaml(const char* filename){ |
| | | |
| | | YAML::Node config; |
| | | try{ |
| | | config = YAML::LoadFile(filename); |
| | | }catch(exception const &e){ |
| | | LOG(ERROR) << "Error loading file, yaml file error or not exist."; |
| | | exit(-1); |
| | | } |
| | | |
| | | try{ |
| | | YAML::Node frontend_conf = config["frontend_conf"]; |
| | | YAML::Node encoder_conf = config["encoder_conf"]; |
| | | YAML::Node decoder_conf = config["decoder_conf"]; |
| | | YAML::Node predictor_conf = config["predictor_conf"]; |
| | | |
| | | this->window_type = frontend_conf["window"].as<string>(); |
| | | this->n_mels = frontend_conf["n_mels"].as<int>(); |
| | | this->frame_length = frontend_conf["frame_length"].as<int>(); |
| | | this->frame_shift = frontend_conf["frame_shift"].as<int>(); |
| | | this->lfr_m = frontend_conf["lfr_m"].as<int>(); |
| | | this->lfr_n = frontend_conf["lfr_n"].as<int>(); |
| | | |
| | | this->encoder_size = encoder_conf["output_size"].as<int>(); |
| | | this->fsmn_dims = encoder_conf["output_size"].as<int>(); |
| | | |
| | | this->fsmn_layers = decoder_conf["num_blocks"].as<int>(); |
| | | this->fsmn_lorder = decoder_conf["kernel_size"].as<int>()-1; |
| | | |
| | | this->cif_threshold = predictor_conf["threshold"].as<double>(); |
| | | this->tail_alphas = predictor_conf["tail_threshold"].as<double>(); |
| | | |
| | | this->asr_sample_rate = frontend_conf["fs"].as<int>(); |
| | | |
| | | |
| | | }catch(exception const &e){ |
| | | LOG(ERROR) << "Error when load argument from vad config YAML."; |
| | | exit(-1); |
| | | } |
| | | } |
| | | |
| | | void SenseVoiceSmall::LoadConfigFromYaml(const char* filename){ |
| | | |
| | | YAML::Node config; |
| | |
| | | { |
| | | if(vocab){ |
| | | delete vocab; |
| | | } |
| | | if(online_vocab){ |
| | | delete online_vocab; |
| | | } |
| | | if(lm_vocab){ |
| | | delete lm_vocab; |
| | |
| | | return str_lang + str_emo + str_event + " " + text; |
| | | } |
| | | |
| | | string SenseVoiceSmall::GreedySearch(float * in, int n_len, int64_t token_nums, bool is_stamp, std::vector<float> us_alphas, std::vector<float> us_cif_peak) |
| | | { |
| | | vector<int> hyps; |
| | | int Tmax = n_len; |
| | | for (int i = 0; i < Tmax; i++) { |
| | | int max_idx; |
| | | float max_val; |
| | | FindMax(in + i * token_nums, token_nums, max_val, max_idx); |
| | | hyps.push_back(max_idx); |
| | | } |
| | | if(!is_stamp){ |
| | | return online_vocab->Vector2StringV2(hyps, language); |
| | | }else{ |
| | | std::vector<string> char_list; |
| | | std::vector<std::vector<float>> timestamp_list; |
| | | std::string res_str; |
| | | online_vocab->Vector2String(hyps, char_list); |
| | | std::vector<string> raw_char(char_list); |
| | | TimestampOnnx(us_alphas, us_cif_peak, char_list, res_str, timestamp_list); |
| | | |
| | | return PostProcess(raw_char, timestamp_list); |
| | | } |
| | | } |
| | | |
| | | void SenseVoiceSmall::LfrCmvn(std::vector<std::vector<float>> &asr_feats) { |
| | | |
| | | std::vector<std::vector<float>> out_feats; |
| | |
| | | class SenseVoiceSmall : public Model { |
| | | private: |
| | | Vocab* vocab = nullptr; |
| | | Vocab* online_vocab = nullptr; |
| | | Vocab* lm_vocab = nullptr; |
| | | SegDict* seg_dict = nullptr; |
| | | PhoneSet* phone_set_ = nullptr; |
| | | const float scale = 1.0; |
| | | |
| | | void LoadConfigFromYaml(const char* filename); |
| | | void LoadOnlineConfigFromYaml(const char* filename); |
| | | void LoadCmvn(const char *filename); |
| | | void LfrCmvn(std::vector<std::vector<float>> &asr_feats); |
| | | |
| | |
| | | ~SenseVoiceSmall(); |
| | | void InitAsr(const std::string &am_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num); |
| | | // online |
| | | // void InitAsr(const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num); |
| | | void InitAsr(const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num); |
| | | // 2pass |
| | | // void InitAsr(const std::string &am_model, const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num); |
| | | void InitAsr(const std::string &am_model, const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, |
| | | const std::string &token_file, const std::string &online_token_file, int thread_num); |
| | | // void InitHwCompiler(const std::string &hw_model, int thread_num); |
| | | // void InitSegDict(const std::string &seg_dict_model); |
| | | std::vector<std::vector<float>> CompileHotwordEmbedding(std::string &hotwords); |
| | |
| | | void FbankKaldi(float sample_rate, const float* waves, int len, std::vector<std::vector<float>> &asr_feats); |
| | | std::vector<std::string> Forward(float** din, int* len, bool input_finished=true, std::string svs_lang="auto", bool svs_itn=true, int batch_in=1); |
| | | string CTCSearch( float * in, std::vector<int32_t> paraformer_length, std::vector<int64_t> outputShape); |
| | | |
| | | string GreedySearch( float* in, int n_len, int64_t token_nums, |
| | | bool is_stamp=false, std::vector<float> us_alphas={0}, std::vector<float> us_cif_peak={0}); |
| | | string Rescoring(); |
| | | string GetLang(){return language;}; |
| | | int GetAsrSampleRate() { return asr_sample_rate; }; |
| | |
| | | int asr_sample_rate = MODEL_SAMPLE_RATE; |
| | | int batch_size_ = 1; |
| | | int blank_id = 0; |
| | | float cif_threshold = 1.0; |
| | | float tail_alphas = 0.45; |
| | | //dict |
| | | std::map<std::string, int> lid_map = { |
| | | {"auto", 0}, |
| | |
| | | } |
| | | |
| | | if(tpass_obj->asr_handle){ |
| | | asr_online_handle = make_unique<ParaformerOnline>((Paraformer*)(tpass_obj->asr_handle).get(), chunk_size); |
| | | asr_online_handle = make_unique<ParaformerOnline>((tpass_obj->asr_handle).get(), chunk_size, tpass_stream->GetModelType()); |
| | | }else{ |
| | | LOG(ERROR)<<"asr_handle is null"; |
| | | exit(-1); |
| | |
| | | string am_cmvn_path; |
| | | string am_config_path; |
| | | string token_path; |
| | | string online_token_path; |
| | | string hw_compile_model_path; |
| | | string seg_dict_path; |
| | | |
| | | asr_handle = make_unique<Paraformer>(); |
| | | if (model_path.at(MODEL_DIR).find(MODEL_SVS) != std::string::npos) |
| | | { |
| | | asr_handle = make_unique<SenseVoiceSmall>(); |
| | | model_type = MODEL_SVS; |
| | | }else{ |
| | | asr_handle = make_unique<Paraformer>(); |
| | | } |
| | | |
| | | bool enable_hotword = false; |
| | | hw_compile_model_path = PathAppend(model_path.at(MODEL_DIR), MODEL_EB_NAME); |
| | |
| | | am_model_path = PathAppend(model_path.at(OFFLINE_MODEL_DIR), MODEL_NAME); |
| | | en_model_path = PathAppend(model_path.at(ONLINE_MODEL_DIR), ENCODER_NAME); |
| | | de_model_path = PathAppend(model_path.at(ONLINE_MODEL_DIR), DECODER_NAME); |
| | | online_token_path = PathAppend(model_path.at(ONLINE_MODEL_DIR), TOKEN_PATH); |
| | | if(model_path.find(QUANTIZE) != model_path.end() && model_path.at(QUANTIZE) == "true"){ |
| | | am_model_path = PathAppend(model_path.at(OFFLINE_MODEL_DIR), QUANT_MODEL_NAME); |
| | | en_model_path = PathAppend(model_path.at(ONLINE_MODEL_DIR), QUANT_ENCODER_NAME); |
| | |
| | | am_config_path = PathAppend(model_path.at(ONLINE_MODEL_DIR), AM_CONFIG_NAME); |
| | | token_path = PathAppend(model_path.at(MODEL_DIR), TOKEN_PATH); |
| | | |
| | | asr_handle->InitAsr(am_model_path, en_model_path, de_model_path, am_cmvn_path, am_config_path, token_path, thread_num); |
| | | asr_handle->InitAsr(am_model_path, en_model_path, de_model_path, am_cmvn_path, am_config_path, token_path, online_token_path, thread_num); |
| | | }else{ |
| | | LOG(ERROR) <<"Can not find offline-model-dir or online-model-dir"; |
| | | exit(-1); |
| | |
| | | The FunASR real-time speech-to-text service software package not only performs real-time speech-to-text conversion, but also applies high-precision transcription correction at the end of each sentence and outputs punctuated text, supporting multiple concurrent requests.
| | | In order to meet the needs of different users for different scenarios, different tutorials are prepared: |
| | | |
| | | ### Whats-new |
| | | ### Whats-new+ |
| | | - 2024/10/29: Real-time Transcription Service 1.12 released, the 2pass-offline mode supports the SensevoiceSmall model, docker image version funasr-runtime-sdk-online-cpu-0.1.12 (f5febc5cf13a)
| | | - 2024/09/26: Real-time Transcription Service 1.11 released,Fix memory leak, docker image version funasr-runtime-sdk-online-cpu-0.1.11 (e51a36c42771) |
| | | - 2024/05/15: Real-time Transcription Service 1.10 released,adapting to FunASR 1.0 model structure, docker image version funasr-runtime-sdk-online-cpu-0.1.10 (1c2adfcff84d) |
| | | - 2024/03/05: Real-time Transcription Service 1.9 released,docker image supports ARM64 platform, update modelscope, docker image version funasr-runtime-sdk-online-cpu-0.1.9 (4a875e08c7a2) |
| | |
| | | 为了支持不同用户的需求,针对不同场景,准备了不同的图文教程: |
| | | |
| | | ### 最新动态 |
| | | - 2024/10/29: 中文实时语音听写服务 1.12 发布,2pass-offline模式支持SensevoiceSmall模型,docker镜像版本funasr-runtime-sdk-online-cpu-0.1.12 (f5febc5cf13a) |
| | | - 2024/09/26: 中文实时语音听写服务 1.11 发布,修复内存泄漏,docker镜像版本funasr-runtime-sdk-online-cpu-0.1.11 (e51a36c42771) |
| | | - 2024/05/15: 中文实时语音听写服务 1.10 发布,适配FunASR 1.0模型结构,docker镜像版本funasr-runtime-sdk-online-cpu-0.1.10 (1c2adfcff84d) |
| | | - 2024/03/05: 中文实时语音听写服务 1.9 发布,docker镜像支持arm64平台,升级modelscope版本,docker镜像版本funasr-runtime-sdk-online-cpu-0.1.9 (4a875e08c7a2) |
| | |
| | | void run(const std::string& uri, const std::vector<string>& wav_list, |
| | | const std::vector<string>& wav_ids, int audio_fs, std::string asr_mode, |
| | | std::vector<int> chunk_size, const std::unordered_map<std::string, int>& hws_map, |
| | | bool is_record=false, int use_itn=1) { |
| | | bool is_record=false, int use_itn=1, int svs_itn=1) { |
| | | // Create a new connection to the given URI |
| | | websocketpp::lib::error_code ec; |
| | | typename websocketpp::client<T>::connection_ptr con = |
| | |
| | | websocketpp::lib::thread asio_thread(&websocketpp::client<T>::run, |
| | | &m_client); |
| | | if(is_record){ |
| | | send_rec_data(asr_mode, chunk_size, hws_map, use_itn); |
| | | send_rec_data(asr_mode, chunk_size, hws_map, use_itn, svs_itn); |
| | | }else{ |
| | | send_wav_data(wav_list[0], wav_ids[0], audio_fs, asr_mode, chunk_size, hws_map, use_itn); |
| | | send_wav_data(wav_list[0], wav_ids[0], audio_fs, asr_mode, chunk_size, hws_map, use_itn, svs_itn); |
| | | } |
| | | |
| | | WaitABit(); |
| | |
| | | // send wav to server |
| | | void send_wav_data(string wav_path, string wav_id, int audio_fs, std::string asr_mode, |
| | | std::vector<int> chunk_vector, const std::unordered_map<std::string, int>& hws_map, |
| | | int use_itn) { |
| | | int use_itn, int svs_itn) { |
| | | uint64_t count = 0; |
| | | std::stringstream val; |
| | | |
| | |
| | | jsonbegin["audio_fs"] = sampling_rate; |
| | | jsonbegin["is_speaking"] = true; |
| | | jsonbegin["itn"] = true; |
| | | jsonbegin["svs_itn"] = true; |
| | | if(use_itn == 0){ |
| | | jsonbegin["itn"] = false; |
| | | } |
| | | if(svs_itn == 0){ |
| | | jsonbegin["svs_itn"] = false; |
| | | } |
| | | if(!hws_map.empty()){ |
| | | LOG(INFO) << "hotwords: "; |
| | |
| | | } |
| | | |
| | | void send_rec_data(std::string asr_mode, std::vector<int> chunk_vector, |
| | | const std::unordered_map<std::string, int>& hws_map, int use_itn) { |
| | | const std::unordered_map<std::string, int>& hws_map, int use_itn, int svs_itn) { |
| | | // first message |
| | | bool wait = false; |
| | | while (1) { |
| | |
| | | jsonbegin["audio_fs"] = sample_rate; |
| | | jsonbegin["is_speaking"] = true; |
| | | jsonbegin["itn"] = true; |
| | | jsonbegin["svs_itn"] = true; |
| | | if(use_itn == 0){ |
| | | jsonbegin["itn"] = false; |
| | | } |
| | | if(svs_itn == 0){ |
| | | jsonbegin["svs_itn"] = false; |
| | | } |
| | | if(!hws_map.empty()){ |
| | | LOG(INFO) << "hotwords: "; |
| | |
| | | "", "use-itn", |
| | | "use-itn is 1 means use itn, 0 means not use itn", false, 1, |
| | | "int"); |
| | | TCLAP::ValueArg<int> svs_itn_( |
| | | "", "svs-itn", |
| | | "svs-itn is 1 means use itn and punc, 0 means not use", false, 1, "int"); |
| | | TCLAP::ValueArg<std::string> hotword_("", HOTWORD, |
| | | "the hotword file, one hotword perline, Format: Hotword Weight (could be: 阿里巴巴 20)", false, "", "string"); |
| | | |
| | |
| | | cmd.add(thread_num_); |
| | | cmd.add(is_ssl_); |
| | | cmd.add(use_itn_); |
| | | cmd.add(svs_itn_); |
| | | cmd.add(hotword_); |
| | | cmd.parse(argc, argv); |
| | | |
| | |
| | | std::string asr_mode = asr_mode_.getValue(); |
| | | std::string chunk_size_str = chunk_size_.getValue(); |
| | | int use_itn = use_itn_.getValue(); |
| | | int svs_itn = svs_itn_.getValue(); |
| | | // get chunk_size |
| | | std::vector<int> chunk_size; |
| | | std::stringstream ss(chunk_size_str); |
| | |
| | | |
| | | c.m_client.set_tls_init_handler(bind(&OnTlsInit, ::_1)); |
| | | |
| | | c.run(uri, tmp_wav_list, tmp_wav_ids, audio_fs, asr_mode, chunk_size, hws_map, true, use_itn); |
| | | c.run(uri, tmp_wav_list, tmp_wav_ids, audio_fs, asr_mode, chunk_size, hws_map, true, use_itn, svs_itn); |
| | | } else { |
| | | WebsocketClient<websocketpp::config::asio_client> c(is_ssl); |
| | | |
| | | c.run(uri, tmp_wav_list, tmp_wav_ids, audio_fs, asr_mode, chunk_size, hws_map, true, use_itn); |
| | | c.run(uri, tmp_wav_list, tmp_wav_ids, audio_fs, asr_mode, chunk_size, hws_map, true, use_itn, svs_itn); |
| | | } |
| | | |
| | | }else{ |
| | |
| | | tmp_wav_ids.emplace_back(wav_ids[wav_i + i]); |
| | | |
| | | client_threads.emplace_back( |
| | | [uri, tmp_wav_list, tmp_wav_ids, audio_fs, asr_mode, chunk_size, is_ssl, hws_map, use_itn]() { |
| | | [uri, tmp_wav_list, tmp_wav_ids, audio_fs, asr_mode, chunk_size, is_ssl, hws_map, use_itn, svs_itn]() { |
| | | if (is_ssl == 1) { |
| | | WebsocketClient<websocketpp::config::asio_tls_client> c(is_ssl); |
| | | |
| | | c.m_client.set_tls_init_handler(bind(&OnTlsInit, ::_1)); |
| | | |
| | | c.run(uri, tmp_wav_list, tmp_wav_ids, audio_fs, asr_mode, chunk_size, hws_map, false, use_itn); |
| | | c.run(uri, tmp_wav_list, tmp_wav_ids, audio_fs, asr_mode, chunk_size, hws_map, false, use_itn, svs_itn); |
| | | } else { |
| | | WebsocketClient<websocketpp::config::asio_client> c(is_ssl); |
| | | |
| | | c.run(uri, tmp_wav_list, tmp_wav_ids, audio_fs, asr_mode, chunk_size, hws_map, false, use_itn); |
| | | c.run(uri, tmp_wav_list, tmp_wav_ids, audio_fs, asr_mode, chunk_size, hws_map, false, use_itn, svs_itn); |
| | | } |
| | | }); |
| | | } |
| | |
| | | s_itn_path=""; |
| | | s_lm_path=""; |
| | | } |
| | | found = s_offline_asr_path.find(MODEL_SVS); |
| | | if (found != std::string::npos) { |
| | | model_path["model-revision"]="v2.0.5"; |
| | | s_lm_path=""; |
| | | model_path[LM_DIR]=""; |
| | | } |
| | | |
| | | if (access(s_offline_asr_path.c_str(), F_OK) == 0) { |
| | | // local |
| | |
| | | int audio_fs, |
| | | std::string wav_format, |
| | | FUNASR_HANDLE& tpass_online_handle, |
| | | FUNASR_DEC_HANDLE& decoder_handle) { |
| | | FUNASR_DEC_HANDLE& decoder_handle, |
| | | std::string svs_lang, |
| | | bool sys_itn) { |
| | | // lock for each connection |
| | | if(!tpass_online_handle){ |
| | | scoped_lock guard(thread_lock); |
| | |
| | | subvector.data(), subvector.size(), |
| | | punc_cache, false, audio_fs, |
| | | wav_format, (ASR_TYPE)asr_mode_, |
| | | hotwords_embedding, itn, decoder_handle); |
| | | hotwords_embedding, itn, decoder_handle, |
| | | svs_lang, sys_itn); |
| | | |
| | | } else { |
| | | scoped_lock guard(thread_lock); |
| | |
| | | buffer.data(), buffer.size(), punc_cache, |
| | | is_final, audio_fs, |
| | | wav_format, (ASR_TYPE)asr_mode_, |
| | | hotwords_embedding, itn, decoder_handle); |
| | | hotwords_embedding, itn, decoder_handle, |
| | | svs_lang, sys_itn); |
| | | } else { |
| | | scoped_lock guard(thread_lock); |
| | | msg["access_num"]=(int)msg["access_num"]-1; |
| | |
| | | data_msg->msg["audio_fs"] = 16000; // default is 16k |
| | | data_msg->msg["access_num"] = 0; // the number of access for this object, when it is 0, we can free it saftly |
| | | data_msg->msg["is_eof"]=false; // if this connection is closed |
| | | data_msg->msg["svs_lang"]="auto"; |
| | | data_msg->msg["svs_itn"]=true; |
| | | FUNASR_DEC_HANDLE decoder_handle = |
| | | FunASRWfstDecoderInit(tpass_handle, ASR_TWO_PASS, global_beam_, lattice_beam_, am_scale_); |
| | | data_msg->decoder_handle = decoder_handle; |
| | |
| | | if (jsonresult.contains("itn")) { |
| | | msg_data->msg["itn"] = jsonresult["itn"]; |
| | | } |
| | | if (jsonresult.contains("svs_lang")) { |
| | | msg_data->msg["svs_lang"] = jsonresult["svs_lang"]; |
| | | } |
| | | if (jsonresult.contains("svs_itn")) { |
| | | msg_data->msg["svs_itn"] = jsonresult["svs_itn"]; |
| | | } |
| | | LOG(INFO) << "jsonresult=" << jsonresult |
| | | << ", msg_data->msg=" << msg_data->msg; |
| | | if ((jsonresult["is_speaking"] == false || |
| | |
| | | msg_data->msg["audio_fs"], |
| | | msg_data->msg["wav_format"], |
| | | std::ref(msg_data->tpass_online_handle), |
| | | std::ref(msg_data->decoder_handle))); |
| | | std::ref(msg_data->decoder_handle), |
| | | msg_data->msg["svs_lang"], |
| | | msg_data->msg["svs_itn"])); |
| | | msg_data->msg["access_num"]=(int)(msg_data->msg["access_num"])+1; |
| | | } |
| | | catch (std::exception const &e) |
| | |
| | | msg_data->msg["audio_fs"], |
| | | msg_data->msg["wav_format"], |
| | | std::ref(msg_data->tpass_online_handle), |
| | | std::ref(msg_data->decoder_handle))); |
| | | std::ref(msg_data->decoder_handle), |
| | | msg_data->msg["svs_lang"], |
| | | msg_data->msg["svs_itn"])); |
| | | msg_data->msg["access_num"]=(int)(msg_data->msg["access_num"])+1; |
| | | } |
| | | } |
| | |
| | | int audio_fs, |
| | | std::string wav_format, |
| | | FUNASR_HANDLE& tpass_online_handle, |
| | | FUNASR_DEC_HANDLE& decoder_handle); |
| | | FUNASR_DEC_HANDLE& decoder_handle, |
| | | std::string svs_lang, |
| | | bool sys_itn); |
| | | |
| | | void initAsr(std::map<std::string, std::string>& model_path, int thread_num); |
| | | void on_message(websocketpp::connection_hdl hdl, message_ptr msg); |