Yabin Li
2024-06-25 b7060884fa4b8b85f79462644a5c99062d223da0
runtime/onnxruntime/src/paraformer-torch.cpp
@@ -16,7 +16,7 @@
}
// offline
void ParaformerTorch::InitAsr(const std::string &am_model, const std::string &am_cmvn, const std::string &am_config, int thread_num){
void ParaformerTorch::InitAsr(const std::string &am_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num){
    LoadConfigFromYaml(am_config.c_str());
    // knf options
    fbank_opts_.frame_opts.dither = 0;
@@ -28,8 +28,8 @@
    fbank_opts_.energy_floor = 0;
    fbank_opts_.mel_opts.debug_mel = false;
    vocab = new Vocab(am_config.c_str());
   phone_set_ = new PhoneSet(am_config.c_str());
    vocab = new Vocab(token_file.c_str());
   phone_set_ = new PhoneSet(token_file.c_str());
    LoadCmvn(am_cmvn.c_str());
    torch::DeviceType device = at::kCPU;
@@ -50,8 +50,13 @@
        torch::jit::script::Module model = torch::jit::load(am_model, device);
        model_ = std::make_shared<TorchModule>(std::move(model)); 
        LOG(INFO) << "Successfully load model from " << am_model;
        torch::NoGradGuard no_grad;
        model_->eval();
        torch::jit::setGraphExecutorOptimize(false);
        torch::jit::FusionStrategy static0 = {{torch::jit::FusionBehavior::STATIC, 0}};
        torch::jit::setFusionStrategy(static0);
    } catch (std::exception const &e) {
        LOG(ERROR) << "Error when load am model: " << am_model << ", " << e.what();
        LOG(ERROR) << "Error when load am model: " << am_model << e.what();
        exit(-1);
    }
}
@@ -100,6 +105,27 @@
void ParaformerTorch::InitHwCompiler(const std::string &hw_model, int thread_num) {
    // TODO
    torch::DeviceType device = at::kCPU;
    #ifdef USE_GPU
    if (!torch::cuda::is_available()) {
        // LOG(ERROR) << "CUDA is not available! Please check your GPU settings";
        exit(-1);
    } else {
        // LOG(INFO) << "CUDA is available, running on GPU";
        device = at::kCUDA;
    }
    #endif
    try {
        torch::jit::script::Module model = torch::jit::load(hw_model, device);
        hw_model_ = std::make_shared<TorchModule>(std::move(model));
        LOG(INFO) << "Successfully load model from " << hw_model;
        torch::NoGradGuard no_grad;
        hw_model_->eval();
    } catch (std::exception const &e) {
        LOG(ERROR) << "Error when load hw model: " << hw_model << e.what();
        exit(-1);
    }
    use_hotword = true;
}
@@ -111,15 +137,19 @@
{
    if(vocab){
        delete vocab;
        vocab = nullptr;
    }
    if(lm_vocab){
        delete lm_vocab;
        lm_vocab = nullptr;
    }
    if(seg_dict){
        delete seg_dict;
        seg_dict = nullptr;
    }
    if(phone_set_){
        delete phone_set_;
        phone_set_ = nullptr;
    }
}
@@ -265,34 +295,58 @@
    asr_feats = out_feats;
}
string ParaformerTorch::Forward(float* din, int len, bool input_finished, const std::vector<std::vector<float>> &hw_emb, void* decoder_handle)
std::vector<std::string> ParaformerTorch::Forward(float** din, int* len, bool input_finished, const std::vector<std::vector<float>> &hw_emb, void* decoder_handle, int batch_in)
{
    vector<std::string> results;
    string result="";
    WfstDecoder* wfst_decoder = (WfstDecoder*)decoder_handle;
    int32_t in_feat_dim = fbank_opts_.mel_opts.num_bins;
    int32_t feature_dim = lfr_m*in_feat_dim;
    std::vector<std::vector<float>> asr_feats;
    FbankKaldi(asr_sample_rate, din, len, asr_feats);
    if(asr_feats.size() == 0){
      return "";
    }
    LfrCmvn(asr_feats);
    int32_t feat_dim = lfr_m*in_feat_dim;
    int32_t num_frames = asr_feats.size();
    std::vector<float> wav_feats;
    for (const auto &frame_feat: asr_feats) {
        wav_feats.insert(wav_feats.end(), frame_feat.begin(), frame_feat.end());
    }
    std::vector<vector<float>> feats_batch;
    std::vector<int32_t> paraformer_length;
    paraformer_length.emplace_back(num_frames);
    int max_size = 0;
    int max_frames = 0;
    for(int index=0; index<batch_in; index++){
        std::vector<std::vector<float>> asr_feats;
        FbankKaldi(asr_sample_rate, din[index], len[index], asr_feats);
        if(asr_feats.size() != 0){
            LfrCmvn(asr_feats);
        }
        int32_t num_frames  = asr_feats.size();
        paraformer_length.emplace_back(num_frames);
        if(max_size < asr_feats.size()*feature_dim){
            max_size = asr_feats.size()*feature_dim;
            max_frames = num_frames;
        }
    torch::NoGradGuard no_grad;
    model_->eval();
        std::vector<float> flattened;
        for (const auto& sub_vector : asr_feats) {
            flattened.insert(flattened.end(), sub_vector.begin(), sub_vector.end());
        }
        feats_batch.emplace_back(flattened);
    }
    if(max_frames == 0){
        for(int index=0; index<batch_in; index++){
            results.push_back(result);
        }
        return results;
    }
    // padding
    std::vector<float> all_feats(batch_in * max_frames * feature_dim);
    for(int index=0; index<batch_in; index++){
        feats_batch[index].resize(max_size);
        std::memcpy(&all_feats[index * max_frames * feature_dim], feats_batch[index].data(),
                        max_frames * feature_dim * sizeof(float));
    }
    torch::Tensor feats =
        torch::from_blob(wav_feats.data(),
                {1, num_frames, feat_dim}, torch::kFloat).contiguous();
        torch::from_blob(all_feats.data(),
                {batch_in, max_frames, feature_dim}, torch::kFloat).contiguous();
    torch::Tensor feat_lens = torch::from_blob(paraformer_length.data(),
                        {1}, torch::kInt32);
                        {batch_in}, torch::kInt32);
    // 2. forward
    #ifdef USE_GPU
@@ -301,8 +355,52 @@
    #endif
    std::vector<torch::jit::IValue> inputs = {feats, feat_lens};
    string result="";
    std::vector<float> batch_embedding;
    std::vector<float> embedding;
    try{
        if (use_hotword) {
            if(hw_emb.size()<=0){
                LOG(ERROR) << "hw_emb is null";
                for(int index=0; index<batch_in; index++){
                    results.push_back(result);
                }
                return results;
            }
            embedding.reserve(hw_emb.size() * hw_emb[0].size());
            for (auto item : hw_emb) {
                embedding.insert(embedding.end(), item.begin(), item.end());
            }
            batch_embedding.reserve(batch_in * embedding.size());
            for (size_t index = 0; index < batch_in; ++index) {
                batch_embedding.insert(batch_embedding.end(), embedding.begin(), embedding.end());
            }
            torch::Tensor tensor_hw_emb =
                torch::from_blob(batch_embedding.data(),
                        {batch_in, static_cast<int64_t>(hw_emb.size()), static_cast<int64_t>(hw_emb[0].size())}, torch::kFloat).contiguous();
            #ifdef USE_GPU
            tensor_hw_emb = tensor_hw_emb.to(at::kCUDA);
            #endif
            inputs.emplace_back(tensor_hw_emb);
        }
    }catch (std::exception const &e)
    {
        LOG(ERROR)<<e.what();
        for(int index=0; index<batch_in; index++){
            results.push_back(result);
        }
        return results;
    }
    try {
        if(inputs.size() == 0){
            LOG(ERROR) << "inputs of forward is null";
            for(int index=0; index<batch_in; index++){
                results.push_back(result);
            }
            return results;
        }
        auto outputs = model_->forward(inputs).toTuple()->elements();
        torch::Tensor am_scores;
        torch::Tensor valid_token_lens;
@@ -313,10 +411,10 @@
        am_scores = outputs[0].toTensor();
        valid_token_lens = outputs[1].toTensor();
        #endif
        // timestamp
        torch::Tensor us_alphas_tensor;
        torch::Tensor us_peaks_tensor;
        if(outputs.size() == 4){
            torch::Tensor us_alphas_tensor;
            torch::Tensor us_peaks_tensor;
            #ifdef USE_GPU
            us_alphas_tensor = outputs[2].toTensor().to(at::kCPU);
            us_peaks_tensor = outputs[3].toTensor().to(at::kCPU);
@@ -324,37 +422,45 @@
            us_alphas_tensor = outputs[2].toTensor();
            us_peaks_tensor = outputs[3].toTensor();
            #endif
        }
            int us_alphas_shape_1 = us_alphas_tensor.size(1);
            float* us_alphas_data = us_alphas_tensor.data_ptr<float>();
            std::vector<float> us_alphas(us_alphas_shape_1);
            for (int i = 0; i < us_alphas_shape_1; i++) {
                us_alphas[i] = us_alphas_data[i];
            }
            int us_peaks_shape_1 = us_peaks_tensor.size(1);
            float* us_peaks_data = us_peaks_tensor.data_ptr<float>();
            std::vector<float> us_peaks(us_peaks_shape_1);
            for (int i = 0; i < us_peaks_shape_1; i++) {
                us_peaks[i] = us_peaks_data[i];
            }
         if (lm_ == nullptr) {
                result = GreedySearch(am_scores[0].data_ptr<float>(), valid_token_lens[0].item<int>(), am_scores.size(2), true, us_alphas, us_peaks);
         } else {
             result = BeamSearch(wfst_decoder, am_scores[0].data_ptr<float>(), valid_token_lens[0].item<int>(), am_scores.size(2));
                if (input_finished) {
                    result = FinalizeDecode(wfst_decoder, true, us_alphas, us_peaks);
        // timestamp
        for(int index=0; index<batch_in; index++){
            result="";
            if(outputs.size() == 4){
                float* us_alphas_data = us_alphas_tensor[index].data_ptr<float>();
                std::vector<float> us_alphas(paraformer_length[index]*3);
                for (int i = 0; i < us_alphas.size(); i++) {
                    us_alphas[i] = us_alphas_data[i];
                }
                float* us_peaks_data = us_peaks_tensor[index].data_ptr<float>();
                std::vector<float> us_peaks(paraformer_length[index]*3);
                for (int i = 0; i < us_peaks.size(); i++) {
                    us_peaks[i] = us_peaks_data[i];
                }
                if (lm_ == nullptr) {
                    result = GreedySearch(am_scores[index].data_ptr<float>(), valid_token_lens[index].item<int>(), am_scores.size(2), true, us_alphas, us_peaks);
                } else {
                    result = BeamSearch(wfst_decoder, am_scores[index].data_ptr<float>(), valid_token_lens[index].item<int>(), am_scores.size(2));
                    if (input_finished) {
                        result = FinalizeDecode(wfst_decoder, true, us_alphas, us_peaks);
                    }
                }
            }else{
                if (lm_ == nullptr) {
                    result = GreedySearch(am_scores[index].data_ptr<float>(), valid_token_lens[index].item<int>(), am_scores.size(2));
                } else {
                    result = BeamSearch(wfst_decoder, am_scores[index].data_ptr<float>(), valid_token_lens[index].item<int>(), am_scores.size(2));
                    if (input_finished) {
                        result = FinalizeDecode(wfst_decoder);
                    }
                }
            }
            results.push_back(result);
         if (wfst_decoder){
            wfst_decoder->StartUtterance();
         }
        }else{
            if (lm_ == nullptr) {
                result = GreedySearch(am_scores[0].data_ptr<float>(), valid_token_lens[0].item<int>(), am_scores.size(2));
            } else {
                result = BeamSearch(wfst_decoder, am_scores[0].data_ptr<float>(), valid_token_lens[0].item<int>(), am_scores.size(2));
                if (input_finished) {
                    result = FinalizeDecode(wfst_decoder);
                }
            }
        }
    }
    catch (std::exception const &e)
@@ -362,12 +468,102 @@
        LOG(ERROR)<<e.what();
    }
    return result;
    return results;
}
std::vector<std::vector<float>> ParaformerTorch::CompileHotwordEmbedding(std::string &hotwords) {
    // TODO
    std::vector<std::vector<float>> result(1, std::vector<float>(10, 0.0f));
    int embedding_dim = encoder_size;
    std::vector<std::vector<float>> hw_emb;
    if (!use_hotword) {
        std::vector<float> vec(embedding_dim, 0);
        hw_emb.push_back(vec);
        return hw_emb;
    }
    int max_hotword_len = 10;
    std::vector<int32_t> hotword_matrix;
    std::vector<int32_t> lengths;
    int hotword_size = 1;
    int real_hw_size = 0;
    if (!hotwords.empty()) {
      std::vector<std::string> hotword_array = split(hotwords, ' ');
      hotword_size = hotword_array.size() + 1;
      hotword_matrix.reserve(hotword_size * max_hotword_len);
      for (auto hotword : hotword_array) {
        std::vector<std::string> chars;
        if (EncodeConverter::IsAllChineseCharactor((const U8CHAR_T*)hotword.c_str(), hotword.size())) {
          KeepChineseCharacterAndSplit(hotword, chars);
        } else {
          // for english
          std::vector<std::string> words = split(hotword, ' ');
          for (auto word : words) {
            std::vector<string> tokens = seg_dict->GetTokensByWord(word);
            chars.insert(chars.end(), tokens.begin(), tokens.end());
          }
        }
        if(chars.size()==0){
            continue;
        }
        std::vector<int32_t> hw_vector(max_hotword_len, 0);
        int vector_len = std::min(max_hotword_len, (int)chars.size());
        int chs_oov = false;
        for (int i=0; i<vector_len; i++) {
          hw_vector[i] = phone_set_->String2Id(chars[i]);
          if(hw_vector[i] == -1){
            chs_oov = true;
            break;
          }
        }
        if(chs_oov){
          LOG(INFO) << "OOV: " << hotword;
          continue;
        }
        LOG(INFO) << hotword;
        lengths.push_back(vector_len);
        real_hw_size += 1;
        hotword_matrix.insert(hotword_matrix.end(), hw_vector.begin(), hw_vector.end());
      }
      hotword_size = real_hw_size + 1;
    }
    std::vector<int32_t> blank_vec(max_hotword_len, 0);
    blank_vec[0] = 1;
    hotword_matrix.insert(hotword_matrix.end(), blank_vec.begin(), blank_vec.end());
    lengths.push_back(1);
    torch::Tensor feats =
        torch::from_blob(hotword_matrix.data(),
                {hotword_size, max_hotword_len}, torch::kInt32).contiguous();
    // 2. forward
    #ifdef USE_GPU
    feats = feats.to(at::kCUDA);
    #endif
    std::vector<torch::jit::IValue> inputs = {feats};
    std::vector<std::vector<float>> result;
    try {
        auto output = hw_model_->forward(inputs);
        torch::Tensor emb_tensor;
        #ifdef USE_GPU
        emb_tensor = output.toTensor().to(at::kCPU);
        #else
        emb_tensor = output.toTensor();
        #endif
        assert(emb_tensor.size(0) == max_hotword_len);
        assert(emb_tensor.size(1) == hotword_size);
        embedding_dim = emb_tensor.size(2);
        float* floatData = emb_tensor.data_ptr<float>();
        for (int j = 0; j < hotword_size; j++)
        {
            int start_pos = hotword_size * (lengths[j] - 1) * embedding_dim + j * embedding_dim;
            std::vector<float> embedding;
            embedding.insert(embedding.begin(), floatData + start_pos, floatData + start_pos + embedding_dim);
            result.push_back(embedding);
        }
    }
    catch (std::exception const &e)
    {
        LOG(ERROR)<<e.what();
    }
    return result;
}