| | |
| | | } |
| | | |
| | | // offline |
// InitAsr: offline acoustic-model initialisation.
// Visible steps: load the YAML config, zero the fbank dither / energy floor and
// disable mel debugging, build the vocabulary and phone set, load CMVN stats,
// then load the TorchScript acoustic model and put it in eval mode.
// NOTE(review): this region interleaves two revisions of the function. Two
// signatures are present (with and without `token_file`), and the `try {` that
// matches the `catch` below as well as the closing braces are not visible in
// this view.
| | | void ParaformerTorch::InitAsr(const std::string &am_model, const std::string &am_cmvn, const std::string &am_config, int thread_num){ |
| | | void ParaformerTorch::InitAsr(const std::string &am_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num){ |
| | | LoadConfigFromYaml(am_config.c_str()); |
| | | // knf options |
| | | fbank_opts_.frame_opts.dither = 0; |
| | |
| | | fbank_opts_.energy_floor = 0; |
| | | fbank_opts_.mel_opts.debug_mel = false; |
| | | |
// NOTE(review): Vocab / PhoneSet are constructed twice below, first from
// am_config and then from token_file. These look like the old and new form of
// the same two statements; confirm which pair is live, since keeping both would
// overwrite (and leak) the first allocation.
| | | vocab = new Vocab(am_config.c_str()); |
| | | phone_set_ = new PhoneSet(am_config.c_str()); |
| | | vocab = new Vocab(token_file.c_str()); |
| | | phone_set_ = new PhoneSet(token_file.c_str()); |
| | | LoadCmvn(am_cmvn.c_str()); |
| | | |
| | | torch::DeviceType device = at::kCPU; |
| | |
// Load the TorchScript module from am_model onto `device`, wrap it in a
// shared TorchModule, switch it to eval mode, then turn off graph-executor
// optimisation and install a STATIC fusion strategy with depth 0.
| | | torch::jit::script::Module model = torch::jit::load(am_model, device); |
| | | model_ = std::make_shared<TorchModule>(std::move(model)); |
| | | LOG(INFO) << "Successfully load model from " << am_model; |
| | | torch::NoGradGuard no_grad; |
| | | model_->eval(); |
| | | torch::jit::setGraphExecutorOptimize(false); |
| | | torch::jit::FusionStrategy static0 = {{torch::jit::FusionBehavior::STATIC, 0}}; |
| | | torch::jit::setFusionStrategy(static0); |
// Any std::exception raised while loading is logged and the process exits with -1.
| | | } catch (std::exception const &e) { |
| | | LOG(ERROR) << "Error when load am model: " << am_model << e.what(); |
| | | exit(-1); |
| | |
| | | |
| | | void ParaformerTorch::InitHwCompiler(const std::string &hw_model, int thread_num) { |
| | | // TODO |
| | | torch::DeviceType device = at::kCPU; |
| | | #ifdef USE_GPU |
| | | if (!torch::cuda::is_available()) { |
| | | // LOG(ERROR) << "CUDA is not available! Please check your GPU settings"; |
| | | exit(-1); |
| | | } else { |
| | | // LOG(INFO) << "CUDA is available, running on GPU"; |
| | | device = at::kCUDA; |
| | | } |
| | | #endif |
| | | |
| | | try { |
| | | torch::jit::script::Module model = torch::jit::load(hw_model, device); |
| | | hw_model_ = std::make_shared<TorchModule>(std::move(model)); |
| | | LOG(INFO) << "Successfully load model from " << hw_model; |
| | | torch::NoGradGuard no_grad; |
| | | hw_model_->eval(); |
| | | } catch (std::exception const &e) { |
| | | LOG(ERROR) << "Error when load hw model: " << hw_model << e.what(); |
| | | exit(-1); |
| | | } |
| | | use_hotword = true; |
| | | } |
| | | |
| | |
// Releases the raw-pointer members owned by this object: vocab, lm_vocab,
// seg_dict and phone_set_. Each pointer is deleted only when non-null and is
// reset to nullptr afterwards.
// NOTE(review): the signature line for this body is not visible in this view;
// it is presumably the ParaformerTorch destructor, to be confirmed.
| | | { |
| | | if(vocab){ |
| | | delete vocab; |
| | | vocab = nullptr; |
| | | } |
| | | if(lm_vocab){ |
| | | delete lm_vocab; |
| | | lm_vocab = nullptr; |
| | | } |
| | | if(seg_dict){ |
| | | delete seg_dict; |
| | | seg_dict = nullptr; |
| | | } |
| | | if(phone_set_){ |
| | | delete phone_set_; |
| | | phone_set_ = nullptr; |
| | | } |
| | | } |
| | | |
| | |
| | | asr_feats = out_feats; |
| | | } |
| | | |
// Forward: runs the acoustic model on raw audio and decodes the scores to text.
// NOTE(review): this region interleaves lines from two revisions of the
// function: a single-utterance form returning `string` and a batched form
// returning `std::vector<std::string>`. Two signatures, two final `return`
// statements and duplicated declarations (`result`, `asr_feats`,
// `us_alphas_tensor`, `us_peaks_tensor`) are all present. Some lines are also
// missing from this view: the body of the `#ifdef USE_GPU` under "// 2. forward",
// the `#ifdef` matching the first `#endif` inside the second try block, an
// `#else` in the timestamp section, and the opening brace of the last catch.
// The comments below only describe what the visible statements do.
| | | string ParaformerTorch::Forward(float* din, int len, bool input_finished, const std::vector<std::vector<float>> &hw_emb, void* decoder_handle) |
| | | std::vector<std::string> ParaformerTorch::Forward(float** din, int* len, bool input_finished, const std::vector<std::vector<float>> &hw_emb, void* decoder_handle, int batch_in) |
| | | { |
| | | vector<std::string> results; |
| | | string result=""; |
| | | |
| | | WfstDecoder* wfst_decoder = (WfstDecoder*)decoder_handle; |
| | | int32_t in_feat_dim = fbank_opts_.mel_opts.num_bins; |
| | | int32_t feature_dim = lfr_m*in_feat_dim; |
| | | |
// Single-utterance form: extract fbank features for one buffer, return "" when
// no frames were produced, apply LfrCmvn and flatten the frames into wav_feats.
| | | std::vector<std::vector<float>> asr_feats; |
| | | FbankKaldi(asr_sample_rate, din, len, asr_feats); |
| | | if(asr_feats.size() == 0){ |
| | | return ""; |
| | | } |
| | | LfrCmvn(asr_feats); |
| | | int32_t feat_dim = lfr_m*in_feat_dim; |
| | | int32_t num_frames = asr_feats.size(); |
| | | |
| | | std::vector<float> wav_feats; |
| | | for (const auto &frame_feat: asr_feats) { |
| | | wav_feats.insert(wav_feats.end(), frame_feat.begin(), frame_feat.end()); |
| | | } |
// Batched form: for every utterance, extract features, apply LfrCmvn when the
// utterance produced frames, record its frame count in paraformer_length, track
// the largest utterance (max_size / max_frames) and keep the flattened features.
| | | std::vector<vector<float>> feats_batch; |
| | | std::vector<int32_t> paraformer_length; |
| | | paraformer_length.emplace_back(num_frames); |
| | | int max_size = 0; |
| | | int max_frames = 0; |
| | | for(int index=0; index<batch_in; index++){ |
| | | std::vector<std::vector<float>> asr_feats; |
| | | FbankKaldi(asr_sample_rate, din[index], len[index], asr_feats); |
| | | if(asr_feats.size() != 0){ |
| | | LfrCmvn(asr_feats); |
| | | } |
| | | int32_t num_frames = asr_feats.size(); |
| | | paraformer_length.emplace_back(num_frames); |
| | | if(max_size < asr_feats.size()*feature_dim){ |
| | | max_size = asr_feats.size()*feature_dim; |
| | | max_frames = num_frames; |
| | | } |
| | | |
| | | torch::NoGradGuard no_grad; |
| | | model_->eval(); |
| | | std::vector<float> flattened; |
| | | for (const auto& sub_vector : asr_feats) { |
| | | flattened.insert(flattened.end(), sub_vector.begin(), sub_vector.end()); |
| | | } |
| | | feats_batch.emplace_back(flattened); |
| | | } |
| | | |
// No utterance produced any frames: return one empty string per batch item.
| | | if(max_frames == 0){ |
| | | for(int index=0; index<batch_in; index++){ |
| | | results.push_back(result); |
| | | } |
| | | return results; |
| | | } |
| | | |
| | | // padding |
// Each utterance is resized to max_size (std::vector::resize zero-fills the new
// tail) and copied into one dense buffer laid out as
// batch_in x max_frames x feature_dim.
| | | std::vector<float> all_feats(batch_in * max_frames * feature_dim); |
| | | for(int index=0; index<batch_in; index++){ |
| | | feats_batch[index].resize(max_size); |
| | | std::memcpy(&all_feats[index * max_frames * feature_dim], feats_batch[index].data(), |
| | | max_frames * feature_dim * sizeof(float)); |
| | | } |
// Wrap the feature buffer and the per-utterance frame counts as tensors. Both
// the single-utterance and the batched form of these statements are visible.
| | | torch::Tensor feats = |
| | | torch::from_blob(wav_feats.data(), |
| | | {1, num_frames, feat_dim}, torch::kFloat).contiguous(); |
| | | torch::from_blob(all_feats.data(), |
| | | {batch_in, max_frames, feature_dim}, torch::kFloat).contiguous(); |
| | | torch::Tensor feat_lens = torch::from_blob(paraformer_length.data(), |
| | | {1}, torch::kInt32); |
| | | {batch_in}, torch::kInt32); |
| | | |
| | | // 2. forward |
| | | #ifdef USE_GPU |
| | |
| | | #endif |
| | | std::vector<torch::jit::IValue> inputs = {feats, feat_lens}; |
| | | |
| | | string result=""; |
| | | std::vector<float> batch_embedding; |
| | | std::vector<float> embedding; |
// Hotword path: flatten hw_emb row by row, repeat it once per batch item and
// append it to the model inputs as a tensor of shape
// {batch_in, hw_emb.size(), hw_emb[0].size()}. An empty hw_emb, or any
// exception raised here, yields one empty string per batch item.
| | | try{ |
| | | if (use_hotword) { |
| | | if(hw_emb.size()<=0){ |
| | | LOG(ERROR) << "hw_emb is null"; |
| | | for(int index=0; index<batch_in; index++){ |
| | | results.push_back(result); |
| | | } |
| | | return results; |
| | | } |
| | | |
| | | embedding.reserve(hw_emb.size() * hw_emb[0].size()); |
| | | for (auto item : hw_emb) { |
| | | embedding.insert(embedding.end(), item.begin(), item.end()); |
| | | } |
| | | batch_embedding.reserve(batch_in * embedding.size()); |
| | | for (size_t index = 0; index < batch_in; ++index) { |
| | | batch_embedding.insert(batch_embedding.end(), embedding.begin(), embedding.end()); |
| | | } |
| | | |
| | | torch::Tensor tensor_hw_emb = |
| | | torch::from_blob(batch_embedding.data(), |
| | | {batch_in, static_cast<int64_t>(hw_emb.size()), static_cast<int64_t>(hw_emb[0].size())}, torch::kFloat).contiguous(); |
| | | #ifdef USE_GPU |
| | | tensor_hw_emb = tensor_hw_emb.to(at::kCUDA); |
| | | #endif |
| | | inputs.emplace_back(tensor_hw_emb); |
| | | } |
| | | }catch (std::exception const &e) |
| | | { |
| | | LOG(ERROR)<<e.what(); |
| | | for(int index=0; index<batch_in; index++){ |
| | | results.push_back(result); |
| | | } |
| | | return results; |
| | | } |
| | | |
| | | try { |
| | | if(inputs.size() == 0){ |
| | | LOG(ERROR) << "inputs of forward is null"; |
| | | for(int index=0; index<batch_in; index++){ |
| | | results.push_back(result); |
| | | } |
| | | return results; |
| | | } |
// The model output is unpacked as a tuple: element 0 is read as am_scores and
// element 1 as valid_token_lens. When the tuple has 4 elements, elements 2 and 3
// are read as us_alphas / us_peaks and passed to the decoders below.
| | | auto outputs = model_->forward(inputs).toTuple()->elements(); |
| | | torch::Tensor am_scores; |
| | | torch::Tensor valid_token_lens; |
| | |
| | | am_scores = outputs[0].toTensor(); |
| | | valid_token_lens = outputs[1].toTensor(); |
| | | #endif |
| | | // timestamp |
| | | |
| | | torch::Tensor us_alphas_tensor; |
| | | torch::Tensor us_peaks_tensor; |
| | | if(outputs.size() == 4){ |
| | | torch::Tensor us_alphas_tensor; |
| | | torch::Tensor us_peaks_tensor; |
| | | #ifdef USE_GPU |
| | | us_alphas_tensor = outputs[2].toTensor().to(at::kCPU); |
| | | us_peaks_tensor = outputs[3].toTensor().to(at::kCPU); |
| | |
| | | us_alphas_tensor = outputs[2].toTensor(); |
| | | us_peaks_tensor = outputs[3].toTensor(); |
| | | #endif |
| | | } |
| | | |
| | | int us_alphas_shape_1 = us_alphas_tensor.size(1); |
| | | float* us_alphas_data = us_alphas_tensor.data_ptr<float>(); |
| | | std::vector<float> us_alphas(us_alphas_shape_1); |
| | | for (int i = 0; i < us_alphas_shape_1; i++) { |
| | | us_alphas[i] = us_alphas_data[i]; |
| | | } |
| | | |
| | | int us_peaks_shape_1 = us_peaks_tensor.size(1); |
| | | float* us_peaks_data = us_peaks_tensor.data_ptr<float>(); |
| | | std::vector<float> us_peaks(us_peaks_shape_1); |
| | | for (int i = 0; i < us_peaks_shape_1; i++) { |
| | | us_peaks[i] = us_peaks_data[i]; |
| | | } |
| | | if (lm_ == nullptr) { |
| | | result = GreedySearch(am_scores[0].data_ptr<float>(), valid_token_lens[0].item<int>(), am_scores.size(2), true, us_alphas, us_peaks); |
| | | } else { |
| | | result = BeamSearch(wfst_decoder, am_scores[0].data_ptr<float>(), valid_token_lens[0].item<int>(), am_scores.size(2)); |
| | | if (input_finished) { |
| | | result = FinalizeDecode(wfst_decoder, true, us_alphas, us_peaks); |
| | | // timestamp |
// Per-utterance decoding: GreedySearch when lm_ is null, otherwise BeamSearch
// followed by FinalizeDecode once input_finished is set. With 4 model outputs
// the us_alphas / us_peaks rows are copied and handed to the decoder.
// NOTE(review): those copies hold paraformer_length[index]*3 values; the
// factor 3 is presumably the timestamp upsampling rate, to be confirmed.
| | | for(int index=0; index<batch_in; index++){ |
| | | result=""; |
| | | if(outputs.size() == 4){ |
| | | float* us_alphas_data = us_alphas_tensor[index].data_ptr<float>(); |
| | | std::vector<float> us_alphas(paraformer_length[index]*3); |
| | | for (int i = 0; i < us_alphas.size(); i++) { |
| | | us_alphas[i] = us_alphas_data[i]; |
| | | } |
| | | |
| | | float* us_peaks_data = us_peaks_tensor[index].data_ptr<float>(); |
| | | std::vector<float> us_peaks(paraformer_length[index]*3); |
| | | for (int i = 0; i < us_peaks.size(); i++) { |
| | | us_peaks[i] = us_peaks_data[i]; |
| | | } |
| | | if (lm_ == nullptr) { |
| | | result = GreedySearch(am_scores[index].data_ptr<float>(), valid_token_lens[index].item<int>(), am_scores.size(2), true, us_alphas, us_peaks); |
| | | } else { |
| | | result = BeamSearch(wfst_decoder, am_scores[index].data_ptr<float>(), valid_token_lens[index].item<int>(), am_scores.size(2)); |
| | | if (input_finished) { |
| | | result = FinalizeDecode(wfst_decoder, true, us_alphas, us_peaks); |
| | | } |
| | | } |
| | | }else{ |
| | | if (lm_ == nullptr) { |
| | | result = GreedySearch(am_scores[index].data_ptr<float>(), valid_token_lens[index].item<int>(), am_scores.size(2)); |
| | | } else { |
| | | result = BeamSearch(wfst_decoder, am_scores[index].data_ptr<float>(), valid_token_lens[index].item<int>(), am_scores.size(2)); |
| | | if (input_finished) { |
| | | result = FinalizeDecode(wfst_decoder); |
| | | } |
| | | } |
| | | } |
// Store this utterance's text, then call StartUtterance() on the decoder (when
// a decoder handle was supplied) before moving on to the next batch item.
| | | results.push_back(result); |
| | | if (wfst_decoder){ |
| | | wfst_decoder->StartUtterance(); |
| | | } |
| | | }else{ |
| | | if (lm_ == nullptr) { |
| | | result = GreedySearch(am_scores[0].data_ptr<float>(), valid_token_lens[0].item<int>(), am_scores.size(2)); |
| | | } else { |
| | | result = BeamSearch(wfst_decoder, am_scores[0].data_ptr<float>(), valid_token_lens[0].item<int>(), am_scores.size(2)); |
| | | if (input_finished) { |
| | | result = FinalizeDecode(wfst_decoder); |
| | | } |
| | | } |
| | | } |
| | | } |
// Exceptions raised during inference or decoding are logged; whatever has been
// collected so far is still returned.
| | | catch (std::exception const &e) |
| | |
| | | LOG(ERROR)<<e.what(); |
| | | } |
| | | |
| | | return result; |
| | | return results; |
| | | } |
| | | |
| | | std::vector<std::vector<float>> ParaformerTorch::CompileHotwordEmbedding(std::string &hotwords) { |
| | | // TODO |
| | | std::vector<std::vector<float>> result(1, std::vector<float>(10, 0.0f)); |
| | | int embedding_dim = encoder_size; |
| | | std::vector<std::vector<float>> hw_emb; |
| | | if (!use_hotword) { |
| | | std::vector<float> vec(embedding_dim, 0); |
| | | hw_emb.push_back(vec); |
| | | return hw_emb; |
| | | } |
| | | int max_hotword_len = 10; |
| | | std::vector<int32_t> hotword_matrix; |
| | | std::vector<int32_t> lengths; |
| | | int hotword_size = 1; |
| | | int real_hw_size = 0; |
| | | if (!hotwords.empty()) { |
| | | std::vector<std::string> hotword_array = split(hotwords, ' '); |
| | | hotword_size = hotword_array.size() + 1; |
| | | hotword_matrix.reserve(hotword_size * max_hotword_len); |
| | | for (auto hotword : hotword_array) { |
| | | std::vector<std::string> chars; |
| | | if (EncodeConverter::IsAllChineseCharactor((const U8CHAR_T*)hotword.c_str(), hotword.size())) { |
| | | KeepChineseCharacterAndSplit(hotword, chars); |
| | | } else { |
| | | // for english |
| | | std::vector<std::string> words = split(hotword, ' '); |
| | | for (auto word : words) { |
| | | std::vector<string> tokens = seg_dict->GetTokensByWord(word); |
| | | chars.insert(chars.end(), tokens.begin(), tokens.end()); |
| | | } |
| | | } |
| | | if(chars.size()==0){ |
| | | continue; |
| | | } |
| | | std::vector<int32_t> hw_vector(max_hotword_len, 0); |
| | | int vector_len = std::min(max_hotword_len, (int)chars.size()); |
| | | int chs_oov = false; |
| | | for (int i=0; i<vector_len; i++) { |
| | | hw_vector[i] = phone_set_->String2Id(chars[i]); |
| | | if(hw_vector[i] == -1){ |
| | | chs_oov = true; |
| | | break; |
| | | } |
| | | } |
| | | if(chs_oov){ |
| | | LOG(INFO) << "OOV: " << hotword; |
| | | continue; |
| | | } |
| | | LOG(INFO) << hotword; |
| | | lengths.push_back(vector_len); |
| | | real_hw_size += 1; |
| | | hotword_matrix.insert(hotword_matrix.end(), hw_vector.begin(), hw_vector.end()); |
| | | } |
| | | hotword_size = real_hw_size + 1; |
| | | } |
| | | std::vector<int32_t> blank_vec(max_hotword_len, 0); |
| | | blank_vec[0] = 1; |
| | | hotword_matrix.insert(hotword_matrix.end(), blank_vec.begin(), blank_vec.end()); |
| | | lengths.push_back(1); |
| | | |
| | | torch::Tensor feats = |
| | | torch::from_blob(hotword_matrix.data(), |
| | | {hotword_size, max_hotword_len}, torch::kInt32).contiguous(); |
| | | |
| | | // 2. forward |
| | | #ifdef USE_GPU |
| | | feats = feats.to(at::kCUDA); |
| | | #endif |
| | | std::vector<torch::jit::IValue> inputs = {feats}; |
| | | std::vector<std::vector<float>> result; |
| | | try { |
| | | auto output = hw_model_->forward(inputs); |
| | | torch::Tensor emb_tensor; |
| | | #ifdef USE_GPU |
| | | emb_tensor = output.toTensor().to(at::kCPU); |
| | | #else |
| | | emb_tensor = output.toTensor(); |
| | | #endif |
| | | assert(emb_tensor.size(0) == max_hotword_len); |
| | | assert(emb_tensor.size(1) == hotword_size); |
| | | embedding_dim = emb_tensor.size(2); |
| | | |
| | | float* floatData = emb_tensor.data_ptr<float>(); |
| | | for (int j = 0; j < hotword_size; j++) |
| | | { |
| | | int start_pos = hotword_size * (lengths[j] - 1) * embedding_dim + j * embedding_dim; |
| | | std::vector<float> embedding; |
| | | embedding.insert(embedding.begin(), floatData + start_pos, floatData + start_pos + embedding_dim); |
| | | result.push_back(embedding); |
| | | } |
| | | } |
| | | catch (std::exception const &e) |
| | | { |
| | | LOG(ERROR)<<e.what(); |
| | | } |
| | | return result; |
| | | } |
| | | |