/** * Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. * MIT License (https://opensource.org/licenses/MIT) */ #ifndef VAD_SERVER_FSMNVAD_H #define VAD_SERVER_FSMNVAD_H #include "precomp.h" namespace funasr { class FsmnVad : public VadModel { /** * Author: Speech Lab of DAMO Academy, Alibaba Group * Deep-FSMN for Large Vocabulary Continuous Speech Recognition * https://arxiv.org/abs/1803.05030 */ public: FsmnVad(); ~FsmnVad(); void Test(); void InitVad(const std::string &vad_model, const std::string &vad_cmvn, const std::string &vad_config, int thread_num); std::vector> Infer(std::vector &waves, bool input_finished=true); void Forward( const std::vector> &chunk_feats, std::vector> *out_prob, std::vector> *in_cache, bool is_final); void Reset(); int GetVadSampleRate() { return vad_sample_rate_; }; std::shared_ptr vad_session_ = nullptr; Ort::Env env_; Ort::SessionOptions session_options_; std::vector vad_in_names_; std::vector vad_out_names_; std::vector> in_cache_; knf::FbankOptions fbank_opts_; std::vector means_list_; std::vector vars_list_; int vad_sample_rate_ = MODEL_SAMPLE_RATE; int vad_silence_duration_ = VAD_SILENCE_DURATION; int vad_max_len_ = VAD_MAX_LEN; double vad_speech_noise_thres_ = VAD_SPEECH_NOISE_THRES; int lfr_m = VAD_LFR_M; int lfr_n = VAD_LFR_N; private: void ReadModel(const char* vad_model); void LoadConfigFromYaml(const char* filename); static void GetInputOutputInfo( const std::shared_ptr &session, std::vector *in_names, std::vector *out_names); void FbankKaldi(float sample_rate, std::vector> &vad_feats, std::vector &waves); void LfrCmvn(std::vector> &vad_feats); void LoadCmvn(const char *filename); void InitCache(); }; } // namespace funasr #endif //VAD_SERVER_FSMNVAD_H