python/FunASR-XL.git

			@@ -16,7 +16,7 @@
			}

			// offline
			void ParaformerTorch::InitAsr(const std::string &am_model, const std::string &am_cmvn, const std::string &am_config, int thread_num){
			void ParaformerTorch::InitAsr(const std::string &am_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num){
			LoadConfigFromYaml(am_config.c_str());
			// knf options
			fbank_opts_.frame_opts.dither = 0;
			@@ -28,8 +28,8 @@
			fbank_opts_.energy_floor = 0;
			fbank_opts_.mel_opts.debug_mel = false;

			vocab = new Vocab(am_config.c_str());
			phone_set_ = new PhoneSet(am_config.c_str());
			vocab = new Vocab(token_file.c_str());
			phone_set_ = new PhoneSet(token_file.c_str());
			LoadCmvn(am_cmvn.c_str());

			torch::DeviceType device = at::kCPU;
			@@ -281,10 +281,10 @@
			if(asr_feats.size() != 0){
			LfrCmvn(asr_feats);
			}
			int32_t num_frames = asr_feats.size() / feature_dim;
			int32_t num_frames = asr_feats.size();
			paraformer_length.emplace_back(num_frames);
			if(max_size < asr_feats.size()){
			max_size = asr_feats.size();
			if(max_size < asr_feats.size()*feature_dim){
			max_size = asr_feats.size()*feature_dim;
			max_frames = num_frames;
			}

			@@ -373,6 +373,9 @@
			}
			}
			results.push_back(result);
			if (wfst_decoder){
			wfst_decoder->StartUtterance();
			}
			}
			}
			catch (std::exception const &e)