From f479f94832ce6ca0d381ed3d8e53675dea24a6ec Mon Sep 17 00:00:00 2001
From: 北念 <lzr265946@alibaba-inc.com>
Date: 星期四, 02 十一月 2023 16:35:10 +0800
Subject: [PATCH] fix paraformer-16k-en finetune pipeline
---
funasr/runtime/onnxruntime/src/vocab.cpp | 44 ++++++++++++++++++++++++++++++++++++++++++--
1 files changed, 42 insertions(+), 2 deletions(-)
diff --git a/funasr/runtime/onnxruntime/src/vocab.cpp b/funasr/runtime/onnxruntime/src/vocab.cpp
index c29156f..2babc40 100644
--- a/funasr/runtime/onnxruntime/src/vocab.cpp
+++ b/funasr/runtime/onnxruntime/src/vocab.cpp
@@ -75,20 +75,52 @@
return false;
}
-string Vocab::Vector2StringV2(vector<int> in)
+// Restores the capitalization of the English pronoun "I" and of its
+// contractions "i'm", "i've" and "i'll"; any other word is returned as-is.
+string Vocab::WordFormat(std::string word)
+{
+    static const char* const kPronounForms[][2] = {
+        {"i", "I"}, {"i'm", "I'm"}, {"i've", "I've"}, {"i'll", "I'll"}};
+    // Exact, case-sensitive comparison against the lowercase forms only.
+    for (const auto& form : kPronounForms) {
+        if (word == form[0]) {
+            return form[1];
+        }
+    }
+    return word;
+}
+
+string Vocab::Vector2StringV2(vector<int> in, std::string language)
{
int i;
list<string> words;
int is_pre_english = false;
int pre_english_len = 0;
int is_combining = false;
- string combine = "";
+ std::string combine = "";
+ std::string unicodeChar = "▁";
for (auto it = in.begin(); it != in.end(); it++) {
string word = vocab[*it];
// step1 space character skips
if (word == "<s>" || word == "</s>" || word == "<unk>")
continue;
+ if (language == "en-bpe"){
+ size_t found = word.find(unicodeChar);
+ if(found != std::string::npos){
+ if (combine != ""){
+ combine = WordFormat(combine);
+ if (words.size() != 0){
+ combine = " " + combine;
+ }
+ words.push_back(combine);
+ }
+ combine = word.substr(3);
+ }else{
+ combine += word;
+ }
+ continue;
+ }
// step2 combie phoneme to full word
{
int sub_word = !(word.find("@@") == string::npos);
@@ -147,6 +179,14 @@
}
}
+ if (language == "en-bpe" and combine != ""){
+ combine = WordFormat(combine);
+ if (words.size() != 0){
+ combine = " " + combine;
+ }
+ words.push_back(combine);
+ }
+
stringstream ss;
for (auto it = words.begin(); it != words.end(); it++) {
ss << *it;
--
Gitblit v1.9.1