From a5a06217c17dc69aca53d1e247b7a6671d35373c Mon Sep 17 00:00:00 2001
From: manyeyes <32889020+manyeyes@users.noreply.github.com>
Date: Tue, 23 Jul 2024 14:41:00 +0800
Subject: [PATCH] Add support for SenseVoiceSmall onnx model in c# lib (#1946)
---
runtime/csharp/AliParaformerAsr/README.md | 9
runtime/csharp/AliParaformerAsr/AliParaformerAsr/OfflineModel.cs | 72 ++++++
runtime/csharp/AliParaformerAsr/AliParaformerAsr/Utils/PadHelper.cs | 55 ++++
runtime/csharp/AliParaformerAsr/AliParaformerAsr/Model/ModelOutputEntity.cs | 17 +
runtime/csharp/AliParaformerAsr/AliParaformerAsr/OfflineRecognizer.cs | 187 ++++-----------
runtime/csharp/AliParaformerAsr/AliParaformerAsr/Model/FrontendConfEntity.cs | 10
runtime/csharp/AliParaformerAsr/AliParaformerAsr/OfflineProjOfParaformer.cs | 113 +++++++++
runtime/csharp/AliParaformerAsr/AliParaformerAsr/OfflineProjOfSenseVoiceSmall.cs | 156 +++++++++++++
runtime/csharp/AliParaformerAsr/AliParaformerAsr/WavFrontend.cs | 3
runtime/csharp/AliParaformerAsr/AliParaformerAsr/AliParaformerAsr.csproj | 8
runtime/csharp/AliParaformerAsr/AliParaformerAsr/IOfflineProj.cs | 43 +++
runtime/csharp/AliParaformerAsr/AliParaformerAsr/Model/ModelConfEntity.cs | 12
12 files changed, 527 insertions(+), 158 deletions(-)
diff --git a/runtime/csharp/AliParaformerAsr/AliParaformerAsr/AliParaformerAsr.csproj b/runtime/csharp/AliParaformerAsr/AliParaformerAsr/AliParaformerAsr.csproj
index b3fe558..de0416f 100644
--- a/runtime/csharp/AliParaformerAsr/AliParaformerAsr/AliParaformerAsr.csproj
+++ b/runtime/csharp/AliParaformerAsr/AliParaformerAsr/AliParaformerAsr.csproj
@@ -1,16 +1,16 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
- <TargetFramework>net6.0</TargetFramework>
+ <TargetFramework>net8.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>
<ItemGroup>
- <PackageReference Include="KaldiNativeFbankSharp" Version="1.0.8" />
+ <PackageReference Include="KaldiNativeFbankSharp" Version="1.1.2" />
<PackageReference Include="Microsoft.Extensions.Logging" Version="7.0.0" />
- <PackageReference Include="Microsoft.ML.OnnxRuntime" Version="1.17.3" />
- <PackageReference Include="Microsoft.ML.OnnxRuntime.Managed" Version="1.17.3" />
+ <PackageReference Include="Microsoft.ML.OnnxRuntime" Version="1.18.1" />
+ <PackageReference Include="Microsoft.ML.OnnxRuntime.Managed" Version="1.18.1" />
<PackageReference Include="Newtonsoft.Json" Version="13.0.3" />
<PackageReference Include="YamlDotNet" Version="13.1.1" />
</ItemGroup>
diff --git a/runtime/csharp/AliParaformerAsr/AliParaformerAsr/IOfflineProj.cs b/runtime/csharp/AliParaformerAsr/AliParaformerAsr/IOfflineProj.cs
new file mode 100644
index 0000000..5cf2544
--- /dev/null
+++ b/runtime/csharp/AliParaformerAsr/AliParaformerAsr/IOfflineProj.cs
@@ -0,0 +1,43 @@
+锘�// See https://github.com/manyeyes for more information
+// Copyright (c) 2024 by manyeyes
+using AliParaformerAsr.Model;
+using Microsoft.ML.OnnxRuntime;
+
+namespace AliParaformerAsr
+{
+ internal interface IOfflineProj
+ {
+ InferenceSession ModelSession
+ {
+ get;
+ set;
+ }
+ int Blank_id
+ {
+ get;
+ set;
+ }
+ int Sos_eos_id
+ {
+ get;
+ set;
+ }
+ int Unk_id
+ {
+ get;
+ set;
+ }
+ int SampleRate
+ {
+ get;
+ set;
+ }
+ int FeatureDim
+ {
+ get;
+ set;
+ }
+ internal ModelOutputEntity ModelProj(List<OfflineInputEntity> modelInputs);
+ internal void Dispose();
+ }
+}
diff --git a/runtime/csharp/AliParaformerAsr/AliParaformerAsr/Model/FrontendConfEntity.cs b/runtime/csharp/AliParaformerAsr/AliParaformerAsr/Model/FrontendConfEntity.cs
index 4b0ca8e..d8c0021 100644
--- a/runtime/csharp/AliParaformerAsr/AliParaformerAsr/Model/FrontendConfEntity.cs
+++ b/runtime/csharp/AliParaformerAsr/AliParaformerAsr/Model/FrontendConfEntity.cs
@@ -1,11 +1,5 @@
锘�// See https://github.com/manyeyes for more information
// Copyright (c) 2023 by manyeyes
-using System;
-using System.Collections.Generic;
-using System.Linq;
-using System.Text;
-using System.Threading.Tasks;
-
namespace AliParaformerAsr.Model
{
public class FrontendConfEntity
@@ -15,9 +9,10 @@
private int _n_mels = 80;
private int _frame_length = 25;
private int _frame_shift = 10;
- private float _dither = 0.0F;
+ private float _dither = 1.0F;
private int _lfr_m = 7;
private int _lfr_n = 6;
+ private bool _snip_edges = false;
public int fs { get => _fs; set => _fs = value; }
public string window { get => _window; set => _window = value; }
@@ -27,5 +22,6 @@
public float dither { get => _dither; set => _dither = value; }
public int lfr_m { get => _lfr_m; set => _lfr_m = value; }
public int lfr_n { get => _lfr_n; set => _lfr_n = value; }
+ public bool snip_edges { get => _snip_edges; set => _snip_edges = value; }
}
}
diff --git a/runtime/csharp/AliParaformerAsr/AliParaformerAsr/Model/ModelConfEntity.cs b/runtime/csharp/AliParaformerAsr/AliParaformerAsr/Model/ModelConfEntity.cs
index 15af179..844f02e 100644
--- a/runtime/csharp/AliParaformerAsr/AliParaformerAsr/Model/ModelConfEntity.cs
+++ b/runtime/csharp/AliParaformerAsr/AliParaformerAsr/Model/ModelConfEntity.cs
@@ -1,11 +1,5 @@
锘�// See https://github.com/manyeyes for more information
// Copyright (c) 2023 by manyeyes
-using System;
-using System.Collections.Generic;
-using System.Linq;
-using System.Text;
-using System.Threading.Tasks;
-
namespace AliParaformerAsr.Model
{
public class ModelConfEntity
@@ -16,6 +10,9 @@
private float _predictor_weight = 1.0F;
private int _predictor_bias = 1;
private float _sampling_ratio = 0.75F;
+ private int _sos = 1;
+ private int _eos = 2;
+ private int _ignore_id = -1;
public float ctc_weight { get => _ctc_weight; set => _ctc_weight = value; }
public float lsm_weight { get => _lsm_weight; set => _lsm_weight = value; }
@@ -23,5 +20,8 @@
public float predictor_weight { get => _predictor_weight; set => _predictor_weight = value; }
public int predictor_bias { get => _predictor_bias; set => _predictor_bias = value; }
public float sampling_ratio { get => _sampling_ratio; set => _sampling_ratio = value; }
+ public int sos { get => _sos; set => _sos = value; }
+ public int eos { get => _eos; set => _eos = value; }
+ public int ignore_id { get => _ignore_id; set => _ignore_id = value; }
}
}
diff --git a/runtime/csharp/AliParaformerAsr/AliParaformerAsr/Model/ModelOutputEntity.cs b/runtime/csharp/AliParaformerAsr/AliParaformerAsr/Model/ModelOutputEntity.cs
new file mode 100644
index 0000000..6ee5de0
--- /dev/null
+++ b/runtime/csharp/AliParaformerAsr/AliParaformerAsr/Model/ModelOutputEntity.cs
@@ -0,0 +1,17 @@
+锘�// See https://github.com/manyeyes for more information
+// Copyright (c) 2024 by manyeyes
+using Microsoft.ML.OnnxRuntime.Tensors;
+
+namespace AliParaformerAsr.Model
+{
+ internal class ModelOutputEntity
+ {
+ private Tensor<float>? _model_out;
+ private int[]? _model_out_lens;
+ private Tensor<float>? _cif_peak_tensor;
+
+ public Tensor<float>? model_out { get => _model_out; set => _model_out = value; }
+ public int[]? model_out_lens { get => _model_out_lens; set => _model_out_lens = value; }
+ public Tensor<float>? cif_peak_tensor { get => _cif_peak_tensor; set => _cif_peak_tensor = value; }
+ }
+}
diff --git a/runtime/csharp/AliParaformerAsr/AliParaformerAsr/OfflineModel.cs b/runtime/csharp/AliParaformerAsr/AliParaformerAsr/OfflineModel.cs
new file mode 100644
index 0000000..28901ca
--- /dev/null
+++ b/runtime/csharp/AliParaformerAsr/AliParaformerAsr/OfflineModel.cs
@@ -0,0 +1,72 @@
+锘�// See https://github.com/manyeyes for more information
+// Copyright (c) 2024 by manyeyes
+using Microsoft.ML.OnnxRuntime;
+
+namespace AliParaformerAsr
+{
+ public enum OnnxRumtimeTypes
+ {
+ CPU = 0,
+
+ DML = 1,
+
+ CUDA = 2,
+ }
+ public class OfflineModel
+ {
+ private InferenceSession _modelSession;
+ private int _blank_id = 0;
+ private int sos_eos_id = 1;
+ private int _unk_id = 2;
+ private int _featureDim = 80;
+ private int _sampleRate = 16000;
+
+ public OfflineModel(string modelFilePath, int threadsNum = 2, OnnxRumtimeTypes rumtimeType = OnnxRumtimeTypes.CPU, int deviceId = 0)
+ {
+ _modelSession = initModel(modelFilePath, threadsNum, rumtimeType, deviceId);
+ }
+ public int Blank_id { get => _blank_id; set => _blank_id = value; }
+ public int Sos_eos_id { get => sos_eos_id; set => sos_eos_id = value; }
+ public int Unk_id { get => _unk_id; set => _unk_id = value; }
+ public int FeatureDim { get => _featureDim; set => _featureDim = value; }
+ public InferenceSession ModelSession { get => _modelSession; set => _modelSession = value; }
+ public int SampleRate { get => _sampleRate; set => _sampleRate = value; }
+
+ public InferenceSession initModel(string modelFilePath, int threadsNum = 2, OnnxRumtimeTypes rumtimeType = OnnxRumtimeTypes.CPU, int deviceId = 0)
+ {
+ var options = new SessionOptions();
+ switch (rumtimeType)
+ {
+ case OnnxRumtimeTypes.DML:
+ options.AppendExecutionProvider_DML(deviceId);
+ break;
+ case OnnxRumtimeTypes.CUDA:
+ options.AppendExecutionProvider_CUDA(deviceId);
+ break;
+ default:
+ options.AppendExecutionProvider_CPU(deviceId);
+ break;
+ }
+ //options.LogSeverityLevel = OrtLoggingLevel.ORT_LOGGING_LEVEL_INFO;
+ options.InterOpNumThreads = threadsNum;
+ InferenceSession onnxSession = new InferenceSession(modelFilePath, options);
+ return onnxSession;
+ }
+ protected virtual void Dispose(bool disposing)
+ {
+ if (disposing)
+ {
+ if (_modelSession != null)
+ {
+ _modelSession.Dispose();
+ }
+ }
+ }
+
+ internal void Dispose()
+ {
+ Dispose(disposing: true);
+ GC.SuppressFinalize(this);
+ }
+ }
+}
diff --git a/runtime/csharp/AliParaformerAsr/AliParaformerAsr/OfflineProjOfParaformer.cs b/runtime/csharp/AliParaformerAsr/AliParaformerAsr/OfflineProjOfParaformer.cs
new file mode 100644
index 0000000..82d75ef
--- /dev/null
+++ b/runtime/csharp/AliParaformerAsr/AliParaformerAsr/OfflineProjOfParaformer.cs
@@ -0,0 +1,113 @@
+锘�// See https://github.com/manyeyes for more information
+// Copyright (c) 2024 by manyeyes
+using AliParaformerAsr.Model;
+using Microsoft.ML.OnnxRuntime;
+using Microsoft.ML.OnnxRuntime.Tensors;
+using AliParaformerAsr.Utils;
+
+namespace AliParaformerAsr
+{
+ internal class OfflineProjOfParaformer : IOfflineProj, IDisposable
+ {
+ // To detect redundant calls
+ private bool _disposed;
+
+ private InferenceSession _modelSession;
+ private int _blank_id = 0;
+ private int _sos_eos_id = 1;
+ private int _unk_id = 2;
+
+ private int _featureDim = 80;
+ private int _sampleRate = 16000;
+
+ public OfflineProjOfParaformer(OfflineModel offlineModel)
+ {
+ _modelSession = offlineModel.ModelSession;
+ _blank_id = offlineModel.Blank_id;
+ _sos_eos_id = offlineModel.Sos_eos_id;
+ _unk_id = offlineModel.Unk_id;
+ _featureDim = offlineModel.FeatureDim;
+ _sampleRate = offlineModel.SampleRate;
+ }
+ public InferenceSession ModelSession { get => _modelSession; set => _modelSession = value; }
+ public int Blank_id { get => _blank_id; set => _blank_id = value; }
+ public int Sos_eos_id { get => _sos_eos_id; set => _sos_eos_id = value; }
+ public int Unk_id { get => _unk_id; set => _unk_id = value; }
+ public int FeatureDim { get => _featureDim; set => _featureDim = value; }
+ public int SampleRate { get => _sampleRate; set => _sampleRate = value; }
+
+ public ModelOutputEntity ModelProj(List<OfflineInputEntity> modelInputs)
+ {
+ int batchSize = modelInputs.Count;
+ float[] padSequence = PadHelper.PadSequence(modelInputs);
+ var inputMeta = _modelSession.InputMetadata;
+ var container = new List<NamedOnnxValue>();
+ foreach (var name in inputMeta.Keys)
+ {
+ if (name == "speech")
+ {
+ int[] dim = new int[] { batchSize, padSequence.Length / 560 / batchSize, 560 };
+ var tensor = new DenseTensor<float>(padSequence, dim, false);
+ container.Add(NamedOnnxValue.CreateFromTensor<float>(name, tensor));
+ }
+ if (name == "speech_lengths")
+ {
+ int[] dim = new int[] { batchSize };
+ int[] speech_lengths = new int[batchSize];
+ for (int i = 0; i < batchSize; i++)
+ {
+ speech_lengths[i] = padSequence.Length / 560 / batchSize;
+ }
+ var tensor = new DenseTensor<int>(speech_lengths, dim, false);
+ container.Add(NamedOnnxValue.CreateFromTensor<int>(name, tensor));
+ }
+ }
+ ModelOutputEntity modelOutputEntity = new ModelOutputEntity();
+ try
+ {
+ IDisposableReadOnlyCollection<DisposableNamedOnnxValue> results = _modelSession.Run(container);
+
+ if (results != null)
+ {
+ var resultsArray = results.ToArray();
+ modelOutputEntity.model_out = resultsArray[0].AsTensor<float>();
+ modelOutputEntity.model_out_lens = resultsArray[1].AsEnumerable<int>().ToArray();
+ if (resultsArray.Length >= 4)
+ {
+ Tensor<float> cif_peak_tensor = resultsArray[3].AsTensor<float>();
+ modelOutputEntity.cif_peak_tensor = cif_peak_tensor;
+ }
+ }
+ }
+ catch (Exception ex)
+ {
+ //
+ }
+ return modelOutputEntity;
+ }
+ protected virtual void Dispose(bool disposing)
+ {
+ if (!_disposed)
+ {
+ if (disposing)
+ {
+ if (_modelSession != null)
+ {
+ _modelSession.Dispose();
+ }
+ }
+ _disposed = true;
+ }
+ }
+
+ public void Dispose()
+ {
+ Dispose(disposing: true);
+ GC.SuppressFinalize(this);
+ }
+ ~OfflineProjOfParaformer()
+ {
+ Dispose(_disposed);
+ }
+ }
+}
diff --git a/runtime/csharp/AliParaformerAsr/AliParaformerAsr/OfflineProjOfSenseVoiceSmall.cs b/runtime/csharp/AliParaformerAsr/AliParaformerAsr/OfflineProjOfSenseVoiceSmall.cs
new file mode 100644
index 0000000..705e8ef
--- /dev/null
+++ b/runtime/csharp/AliParaformerAsr/AliParaformerAsr/OfflineProjOfSenseVoiceSmall.cs
@@ -0,0 +1,156 @@
+锘�// See https://github.com/manyeyes for more information
+// Copyright (c) 2024 by manyeyes
+using AliParaformerAsr.Model;
+using Microsoft.ML.OnnxRuntime;
+using Microsoft.ML.OnnxRuntime.Tensors;
+using AliParaformerAsr.Utils;
+
+namespace AliParaformerAsr
+{
+ internal class OfflineProjOfSenseVoiceSmall : IOfflineProj, IDisposable
+ {
+ // To detect redundant calls
+ private bool _disposed;
+
+ private InferenceSession _modelSession;
+ private int _blank_id = 0;
+ private int _sos_eos_id = 1;
+ private int _unk_id = 2;
+
+ private int _featureDim = 80;
+ private int _sampleRate = 16000;
+
+ private bool _use_itn = false;
+ private string _textnorm = "woitn";
+ private Dictionary<string, int> _lidDict = new Dictionary<string, int>() { { "auto", 0 }, { "zh", 3 }, { "en", 4 }, { "yue", 7 }, { "ja", 11 }, { "ko", 12 }, { "nospeech", 13 } };
+ private Dictionary<int, int> _lidIntDict = new Dictionary<int, int>() { { 24884, 3 }, { 24885, 4 }, { 24888, 7 }, { 24892, 11 }, { 24896, 12 }, { 24992, 13 } };
+ private Dictionary<string, int> _textnormDict = new Dictionary<string, int>() { { "withitn", 14 }, { "woitn", 15 } };
+ private Dictionary<int, int> _textnormIntDict = new Dictionary<int, int>() { { 25016, 14 }, { 25017, 15 } };
+
+ public OfflineProjOfSenseVoiceSmall(OfflineModel offlineModel)
+ {
+ _modelSession = offlineModel.ModelSession;
+ _blank_id = offlineModel.Blank_id;
+ _sos_eos_id = offlineModel.Sos_eos_id;
+ _unk_id = offlineModel.Unk_id;
+ _featureDim = offlineModel.FeatureDim;
+ _sampleRate = offlineModel.SampleRate;
+ }
+ public InferenceSession ModelSession { get => _modelSession; set => _modelSession = value; }
+ public int Blank_id { get => _blank_id; set => _blank_id = value; }
+ public int Sos_eos_id { get => _sos_eos_id; set => _sos_eos_id = value; }
+ public int Unk_id { get => _unk_id; set => _unk_id = value; }
+ public int FeatureDim { get => _featureDim; set => _featureDim = value; }
+ public int SampleRate { get => _sampleRate; set => _sampleRate = value; }
+
+ public ModelOutputEntity ModelProj(List<OfflineInputEntity> modelInputs)
+ {
+ int batchSize = modelInputs.Count;
+ float[] padSequence = PadHelper.PadSequence(modelInputs);
+ //
+ string languageValue = "ja";
+ int languageId = 0;
+ if (_lidDict.ContainsKey(languageValue))
+ {
+ languageId = _lidDict.GetValueOrDefault(languageValue);
+ }
+ string textnormValue = "withitn";
+ int textnormId = 15;
+ if (_textnormDict.ContainsKey(textnormValue))
+ {
+ textnormId = _textnormDict.GetValueOrDefault(textnormValue);
+ }
+ var inputMeta = _modelSession.InputMetadata;
+ var container = new List<NamedOnnxValue>();
+ foreach (var name in inputMeta.Keys)
+ {
+ if (name == "speech")
+ {
+ int[] dim = new int[] { batchSize, padSequence.Length / 560 / batchSize, 560 };
+ var tensor = new DenseTensor<float>(padSequence, dim, false);
+ container.Add(NamedOnnxValue.CreateFromTensor<float>(name, tensor));
+ }
+ if (name == "speech_lengths")
+ {
+
+ int[] dim = new int[] { batchSize };
+ int[] speech_lengths = new int[batchSize];
+ for (int i = 0; i < batchSize; i++)
+ {
+ speech_lengths[i] = padSequence.Length / 560 / batchSize;
+ }
+ var tensor = new DenseTensor<int>(speech_lengths, dim, false);
+ container.Add(NamedOnnxValue.CreateFromTensor<int>(name, tensor));
+ }
+ if (name == "language")
+ {
+ int[] language = new int[batchSize];
+ for (int i = 0; i < batchSize; i++)
+ {
+ language[i] = languageId;
+ }
+ int[] dim = new int[] { batchSize };
+ var tensor = new DenseTensor<int>(language, dim, false);
+ container.Add(NamedOnnxValue.CreateFromTensor<int>(name, tensor));
+ }
+ if (name == "textnorm")
+ {
+ int[] textnorm = new int[batchSize];
+ for (int i = 0; i < batchSize; i++)
+ {
+ textnorm[i] = textnormId;
+ }
+ int[] dim = new int[] { batchSize };
+ var tensor = new DenseTensor<int>(textnorm, dim, false);
+ container.Add(NamedOnnxValue.CreateFromTensor<int>(name, tensor));
+ }
+ }
+ ModelOutputEntity modelOutputEntity = new ModelOutputEntity();
+ try
+ {
+ IDisposableReadOnlyCollection<DisposableNamedOnnxValue> results = _modelSession.Run(container);
+
+ if (results != null)
+ {
+ var resultsArray = results.ToArray();
+ modelOutputEntity.model_out = resultsArray[0].AsTensor<float>();
+ modelOutputEntity.model_out_lens = resultsArray[1].AsEnumerable<int>().ToArray();
+ if (resultsArray.Length >= 4)
+ {
+ Tensor<float> cif_peak_tensor = resultsArray[3].AsTensor<float>();
+ modelOutputEntity.cif_peak_tensor = cif_peak_tensor;
+ }
+ }
+ }
+ catch (Exception ex)
+ {
+ //
+ }
+ return modelOutputEntity;
+ }
+ protected virtual void Dispose(bool disposing)
+ {
+ if (!_disposed)
+ {
+ if (disposing)
+ {
+ if (_modelSession != null)
+ {
+ _modelSession.Dispose();
+ }
+ }
+ _disposed = true;
+ }
+ }
+
+ public void Dispose()
+ {
+ Dispose(disposing: true);
+ GC.SuppressFinalize(this);
+ }
+ ~OfflineProjOfSenseVoiceSmall()
+ {
+ Dispose(_disposed);
+ }
+ }
+}
diff --git a/runtime/csharp/AliParaformerAsr/AliParaformerAsr/OfflineRecognizer.cs b/runtime/csharp/AliParaformerAsr/AliParaformerAsr/OfflineRecognizer.cs
index c2d7f68..3011d13 100644
--- a/runtime/csharp/AliParaformerAsr/AliParaformerAsr/OfflineRecognizer.cs
+++ b/runtime/csharp/AliParaformerAsr/AliParaformerAsr/OfflineRecognizer.cs
@@ -1,28 +1,17 @@
锘�// See https://github.com/manyeyes for more information
-// Copyright (c) 2023 by manyeyes
-using System.Linq;
-using System.Text;
-using System.Threading.Tasks;
+// Copyright (c) 2024 by manyeyes
using AliParaformerAsr.Model;
using AliParaformerAsr.Utils;
+using Microsoft.Extensions.Logging;
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
-using Microsoft.Extensions.Logging;
-using System.Text.RegularExpressions;
using Newtonsoft.Json.Linq;
+using System.Text.RegularExpressions;
// 模型文件地址： https://modelscope.cn/models/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-onnx
+// 模型文件地址： https://www.modelscope.cn/models/manyeyes/sensevoice-small-onnx
namespace AliParaformerAsr
{
- public enum OnnxRumtimeTypes
- {
- CPU = 0,
-
- DML = 1,
-
- CUDA = 2,
- }
-
/// <summary>
/// offline recognizer package
/// Copyright (c) 2023 by manyeyes
@@ -35,6 +24,8 @@
private string _frontend;
private FrontendConfEntity _frontendConfEntity;
private string[] _tokens;
+ private IOfflineProj? _offlineProj;
+ private OfflineModel _offlineModel;
/// <summary>
///
@@ -48,24 +39,9 @@
/// <param name="batchSize"></param>
/// <param name="threadsNum"></param>
/// <exception cref="ArgumentException"></exception>
- public OfflineRecognizer(string modelFilePath, string configFilePath, string mvnFilePath, string tokensFilePath, OnnxRumtimeTypes rumtimeType = OnnxRumtimeTypes.CPU, int deviceId = 0)
+ public OfflineRecognizer(string modelFilePath, string configFilePath, string mvnFilePath, string tokensFilePath, int threadsNum = 1, OnnxRumtimeTypes rumtimeType = OnnxRumtimeTypes.CPU, int deviceId = 0)
{
- var options = new SessionOptions();
- switch(rumtimeType)
- {
- case OnnxRumtimeTypes.DML:
- options.AppendExecutionProvider_DML(deviceId);
- break;
- case OnnxRumtimeTypes.CUDA:
- options.AppendExecutionProvider_CUDA(deviceId);
- break;
- default:
- options.AppendExecutionProvider_CPU(deviceId);
- break;
- }
- //options.LogSeverityLevel = OrtLoggingLevel.ORT_LOGGING_LEVEL_INFO;
-
- _onnxSession = new InferenceSession(modelFilePath, options);
+ _offlineModel = new OfflineModel(modelFilePath, threadsNum);
string[] tokenLines;
if (tokensFilePath.EndsWith(".txt"))
@@ -86,6 +62,18 @@
_tokens = tokenLines;
OfflineYamlEntity offlineYamlEntity = YamlHelper.ReadYaml<OfflineYamlEntity>(configFilePath);
+ switch (offlineYamlEntity.model.ToLower())
+ {
+ case "paraformer":
+ _offlineProj = new OfflineProjOfParaformer(_offlineModel);
+ break;
+ case "sensevoicesmall":
+ _offlineProj = new OfflineProjOfSenseVoiceSmall(_offlineModel);
+ break;
+ default:
+ _offlineProj = null;
+ break;
+ }
_wavFrontend = new WavFrontend(mvnFilePath, offlineYamlEntity.frontend_conf);
_frontend = offlineYamlEntity.frontend;
_frontendConfEntity = offlineYamlEntity.frontend_conf;
@@ -120,73 +108,40 @@
private OfflineOutputEntity Forward(List<OfflineInputEntity> modelInputs)
{
- int BatchSize = modelInputs.Count;
- float[] padSequence = PadSequence(modelInputs);
- var inputMeta = _onnxSession.InputMetadata;
- var container = new List<NamedOnnxValue>();
- foreach (var name in inputMeta.Keys)
- {
- if (name == "speech")
- {
- int[] dim = new int[] { BatchSize, padSequence.Length / 560 / BatchSize, 560 };//inputMeta["speech"].Dimensions[2]
- var tensor = new DenseTensor<float>(padSequence, dim, false);
- container.Add(NamedOnnxValue.CreateFromTensor<float>(name, tensor));
- }
- if (name == "speech_lengths")
- {
- int[] dim = new int[] { BatchSize };
- int[] speech_lengths = new int[BatchSize];
- for (int i = 0; i < BatchSize; i++)
- {
- speech_lengths[i] = padSequence.Length / 560 / BatchSize;
- }
- var tensor = new DenseTensor<int>(speech_lengths, dim, false);
- container.Add(NamedOnnxValue.CreateFromTensor<int>(name, tensor));
- }
- }
-
-
- IReadOnlyCollection<string> outputNames = new List<string>();
- outputNames.Append("logits");
- outputNames.Append("token_num");
- IDisposableReadOnlyCollection<DisposableNamedOnnxValue> results = null;
+ OfflineOutputEntity offlineOutputEntity = new OfflineOutputEntity();
try
{
- results = _onnxSession.Run(container);
+ ModelOutputEntity modelOutputEntity = _offlineProj.ModelProj(modelInputs);
+ if (modelOutputEntity != null)
+ {
+ offlineOutputEntity.Token_nums_length = modelOutputEntity.model_out_lens.AsEnumerable<int>().ToArray();
+ Tensor<float> logits_tensor = modelOutputEntity.model_out;
+ List<int[]> token_nums = new List<int[]> { };
+
+ for (int i = 0; i < logits_tensor.Dimensions[0]; i++)
+ {
+ int[] item = new int[logits_tensor.Dimensions[1]];
+ for (int j = 0; j < logits_tensor.Dimensions[1]; j++)
+ {
+ int token_num = 0;
+ for (int k = 1; k < logits_tensor.Dimensions[2]; k++)
+ {
+ token_num = logits_tensor[i, j, token_num] > logits_tensor[i, j, k] ? token_num : k;
+ }
+ item[j] = (int)token_num;
+ }
+ token_nums.Add(item);
+ }
+ offlineOutputEntity.Token_nums = token_nums;
+ }
}
catch (Exception ex)
{
//
}
- OfflineOutputEntity modelOutput = new OfflineOutputEntity();
- if (results != null)
- {
- var resultsArray = results.ToArray();
- modelOutput.Logits = resultsArray[0].AsEnumerable<float>().ToArray();
- modelOutput.Token_nums_length = resultsArray[1].AsEnumerable<int>().ToArray();
-
- Tensor<float> logits_tensor = resultsArray[0].AsTensor<float>();
- Tensor<Int64> token_nums_tensor = resultsArray[1].AsTensor<Int64>();
-
- List<int[]> token_nums = new List<int[]> { };
-
- for (int i = 0; i < logits_tensor.Dimensions[0]; i++)
- {
- int[] item = new int[logits_tensor.Dimensions[1]];
- for (int j = 0; j < logits_tensor.Dimensions[1]; j++)
- {
- int token_num = 0;
- for (int k = 1; k < logits_tensor.Dimensions[2]; k++)
- {
- token_num = logits_tensor[i, j, token_num] > logits_tensor[i, j, k] ? token_num : k;
- }
- item[j] = (int)token_num;
- }
- token_nums.Add(item);
- }
- modelOutput.Token_nums = token_nums;
- }
- return modelOutput;
+
+
+ return offlineOutputEntity;
}
private List<string> DecodeMulti(List<int[]> token_nums)
@@ -203,9 +158,9 @@
break;
}
- string tokenChar = _tokens[token];
+ string tokenChar = _tokens[token].Split("\t")[0];
- if (tokenChar != "</s>" && tokenChar != "<s>" && tokenChar != "<blank>")
+ if (tokenChar != "</s>" && tokenChar != "<s>" && tokenChar != "<blank>" && tokenChar != "<unk>")
{
if (IsChinese(tokenChar, true))
{
@@ -244,48 +199,6 @@
else
return false;
}
-
- private float[] PadSequence(List<OfflineInputEntity> modelInputs)
- {
- int max_speech_length = modelInputs.Max(x => x.SpeechLength);
- int speech_length = max_speech_length * modelInputs.Count;
- float[] speech = new float[speech_length];
- float[,] xxx = new float[modelInputs.Count, max_speech_length];
- for (int i = 0; i < modelInputs.Count; i++)
- {
- if (max_speech_length == modelInputs[i].SpeechLength)
- {
- for (int j = 0; j < xxx.GetLength(1); j++)
- {
-#pragma warning disable CS8602 // 解引用可能出现空引用。
- xxx[i, j] = modelInputs[i].Speech[j];
-#pragma warning restore CS8602 // 解引用可能出现空引用。
- }
- continue;
- }
- float[] nullspeech = new float[max_speech_length - modelInputs[i].SpeechLength];
- float[]? curr_speech = modelInputs[i].Speech;
- float[] padspeech = new float[max_speech_length];
- padspeech = _wavFrontend.ApplyCmvn(padspeech);
- Array.Copy(curr_speech, 0, padspeech, 0, curr_speech.Length);
- for (int j = 0; j < padspeech.Length; j++)
- {
-#pragma warning disable CS8602 // 解引用可能出现空引用。
- xxx[i, j] = padspeech[j];
-#pragma warning restore CS8602 // 解引用可能出现空引用。
- }
-
- }
- int s = 0;
- for (int i = 0; i < xxx.GetLength(0); i++)
- {
- for (int j = 0; j < xxx.GetLength(1); j++)
- {
- speech[s] = xxx[i, j];
- s++;
- }
- }
- return speech;
- }
+
}
}
\ No newline at end of file
diff --git a/runtime/csharp/AliParaformerAsr/AliParaformerAsr/Utils/PadHelper.cs b/runtime/csharp/AliParaformerAsr/AliParaformerAsr/Utils/PadHelper.cs
new file mode 100644
index 0000000..e9102e8
--- /dev/null
+++ b/runtime/csharp/AliParaformerAsr/AliParaformerAsr/Utils/PadHelper.cs
@@ -0,0 +1,55 @@
+锘縰sing AliParaformerAsr.Model;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace AliParaformerAsr.Utils
+{
+ internal static class PadHelper
+ {
+ public static float[] PadSequence(List<OfflineInputEntity> modelInputs)
+ {
+ int max_speech_length = modelInputs.Max(x => x.SpeechLength);
+ int speech_length = max_speech_length * modelInputs.Count;
+ float[] speech = new float[speech_length];
+ float[,] xxx = new float[modelInputs.Count, max_speech_length];
+ for (int i = 0; i < modelInputs.Count; i++)
+ {
+ if (max_speech_length == modelInputs[i].SpeechLength)
+ {
+ for (int j = 0; j < xxx.GetLength(1); j++)
+ {
+#pragma warning disable CS8602 // 解引用可能出现空引用。
+ xxx[i, j] = modelInputs[i].Speech[j];
+#pragma warning restore CS8602 // 解引用可能出现空引用。
+ }
+ continue;
+ }
+ float[] nullspeech = new float[max_speech_length - modelInputs[i].SpeechLength];
+ float[]? curr_speech = modelInputs[i].Speech;
+ float[] padspeech = new float[max_speech_length];
+ Array.Copy(curr_speech, 0, padspeech, 0, curr_speech.Length);
+ for (int j = 0; j < padspeech.Length; j++)
+ {
+#pragma warning disable CS8602 // 解引用可能出现空引用。
+ xxx[i, j] = padspeech[j];
+#pragma warning restore CS8602 // 解引用可能出现空引用。
+ }
+
+ }
+ int s = 0;
+ for (int i = 0; i < xxx.GetLength(0); i++)
+ {
+ for (int j = 0; j < xxx.GetLength(1); j++)
+ {
+ speech[s] = xxx[i, j];
+ s++;
+ }
+ }
+ speech = speech.Select(x => x == 0 ? -23.025850929940457F * 32768 : x).ToArray();
+ return speech;
+ }
+ }
+}
diff --git a/runtime/csharp/AliParaformerAsr/AliParaformerAsr/WavFrontend.cs b/runtime/csharp/AliParaformerAsr/AliParaformerAsr/WavFrontend.cs
index 5c1bec3..eb3a0b8 100644
--- a/runtime/csharp/AliParaformerAsr/AliParaformerAsr/WavFrontend.cs
+++ b/runtime/csharp/AliParaformerAsr/AliParaformerAsr/WavFrontend.cs
@@ -30,7 +30,8 @@
_fbank_beg_idx = 0;
_onlineFbank = new OnlineFbank(
dither: _frontendConfEntity.dither,
- snip_edges: false,
+ snip_edges: _frontendConfEntity.snip_edges,
+ window_type: _frontendConfEntity.window,
sample_rate: _frontendConfEntity.fs,
num_bins: _frontendConfEntity.n_mels
);
diff --git a/runtime/csharp/AliParaformerAsr/README.md b/runtime/csharp/AliParaformerAsr/README.md
index 887180d..4b2ef5b 100644
--- a/runtime/csharp/AliParaformerAsr/README.md
+++ b/runtime/csharp/AliParaformerAsr/README.md
@@ -1,4 +1,10 @@
# AliParaformerAsr
+##### 支持模型
+## paraformer-large offline onnx模型下载
+https://huggingface.co/manyeyes/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-onnx
+## SenseVoiceSmall offline onnx模型下载
+https://www.modelscope.cn/models/manyeyes/sensevoice-small-onnx
+
##### 简介：
项目中使用的Asr模型是阿里巴巴达摩院提供的Paraformer-large ASR模型。
**项目基于Net 6.0，使用C#编写，调用Microsoft.ML.OnnxRuntime对onnx模型进行解码，支持跨平台编译。项目以库的形式进行调用，部署非常方便。**
@@ -25,9 +31,6 @@
##### ASR常用参数（参考：asr.yaml文件）：
用于解码的asr.yaml配置参数，取自官方模型配置config.yaml原文件。便于跟进和升级。
-
-## paraformer-large offline onnx模型下载
-https://huggingface.co/manyeyes/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-onnx
## 离线（非流式）模型调用方法：
--
Gitblit v1.9.1