From c2a2575f198b1bfd452ea5769bec81bcce3d3a42 Mon Sep 17 00:00:00 2001
From: manyeyes <32889020+manyeyes@users.noreply.github.com>
Date: Wed, 21 Jun 2023 09:31:59 +0800
Subject: [PATCH] add c# assembly for fsmn vad (#650)
---
funasr/runtime/csharp/AliFsmnVadSharp/AliFsmnVadSharp.csproj | 37 +
funasr/runtime/csharp/AliFsmnVadSharp/Model/FrontendConfEntity.cs | 29
funasr/runtime/csharp/AliFsmnVadSharp/Model/SegmentEntity.cs | 22
funasr/runtime/csharp/AliFsmnVadSharp/Model/E2EVadFrameProbEntity.cs | 23
funasr/runtime/csharp/AliFsmnVadSharp.Examples/AliFsmnVadSharp.Examples.csproj | 18
funasr/runtime/csharp/AliFsmnVadSharp/E2EVadModel.cs | 717 +++++++++++++++++++
funasr/runtime/csharp/AliFsmnVadSharp/Model/VadOutputEntity.cs | 19
funasr/runtime/csharp/AliFsmnVadSharp.sln | 37 +
funasr/runtime/csharp/AliFsmnVadSharp/Model/EncoderConfEntity.cs | 35
funasr/runtime/csharp/AliFsmnVadSharp/Model/VadPostConfEntity.cs | 72 ++
funasr/runtime/csharp/AliFsmnVadSharp/AliFsmnVad.cs | 387 ++++++++++
funasr/runtime/csharp/AliFsmnVadSharp/Lib/kaldi-native-fbank-dll.dll | 0
funasr/runtime/csharp/README.md | 59 +
funasr/runtime/csharp/AliFsmnVadSharp/Model/E2EVadSpeechBufWithDoaEntity.cs | 98 ++
funasr/runtime/csharp/AliFsmnVadSharp/WavFrontend.cs | 185 +++++
funasr/runtime/csharp/AliFsmnVadSharp/WindowDetector.cs | 156 ++++
funasr/runtime/csharp/AliFsmnVadSharp/Model/VadYamlEntity.cs | 27
funasr/runtime/csharp/AliFsmnVadSharp/Utils/YamlHelper.cs | 28
funasr/runtime/csharp/AliFsmnVadSharp/Model/VadInputEntity.cs | 23
funasr/runtime/csharp/AliFsmnVadSharp/Model/CmvnEntity.cs | 17
funasr/runtime/csharp/AliFsmnVadSharp/DLL/KnfOnlineFbank.cs | 26
funasr/runtime/csharp/AliFsmnVadSharp.Examples/Program.cs | 61 +
funasr/runtime/csharp/AliFsmnVadSharp/DLL/KaldiNativeFbank.cs | 40 +
funasr/runtime/csharp/AliFsmnVadSharp/Struct/FbankData.cs | 6
24 files changed, 2,122 insertions(+), 0 deletions(-)
diff --git a/funasr/runtime/csharp/AliFsmnVadSharp.Examples/AliFsmnVadSharp.Examples.csproj b/funasr/runtime/csharp/AliFsmnVadSharp.Examples/AliFsmnVadSharp.Examples.csproj
new file mode 100644
index 0000000..b494bb5
--- /dev/null
+++ b/funasr/runtime/csharp/AliFsmnVadSharp.Examples/AliFsmnVadSharp.Examples.csproj
@@ -0,0 +1,18 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+ <PropertyGroup>
+ <OutputType>Exe</OutputType>
+ <TargetFramework>net6.0</TargetFramework>
+ <ImplicitUsings>enable</ImplicitUsings>
+ <Nullable>enable</Nullable>
+ </PropertyGroup>
+
+ <ItemGroup>
+ <PackageReference Include="NAudio" Version="2.1.0" />
+ </ItemGroup>
+
+ <ItemGroup>
+ <ProjectReference Include="..\AliFsmnVadSharp\AliFsmnVadSharp.csproj" />
+ </ItemGroup>
+
+</Project>
diff --git a/funasr/runtime/csharp/AliFsmnVadSharp.Examples/Program.cs b/funasr/runtime/csharp/AliFsmnVadSharp.Examples/Program.cs
new file mode 100644
index 0000000..dd3bf78
--- /dev/null
+++ b/funasr/runtime/csharp/AliFsmnVadSharp.Examples/Program.cs
@@ -0,0 +1,61 @@
+using AliFsmnVadSharp;
+using AliFsmnVadSharp.Model;
+using NAudio.Wave;
+
+internal static class Program
+{
+ [STAThread]
+ private static void Main()
+ {
+ string applicationBase = AppDomain.CurrentDomain.BaseDirectory;
+ string modelFilePath = applicationBase + "./speech_fsmn_vad_zh-cn-16k-common-pytorch/model.onnx";
+ string configFilePath = applicationBase + "./speech_fsmn_vad_zh-cn-16k-common-pytorch/vad.yaml";
+ string mvnFilePath = applicationBase + "./speech_fsmn_vad_zh-cn-16k-common-pytorch/vad.mvn";
+ int batchSize = 2;
+ TimeSpan start_time0 = new TimeSpan(DateTime.Now.Ticks);
+ AliFsmnVad aliFsmnVad = new AliFsmnVad(modelFilePath, configFilePath, mvnFilePath, batchSize);
+ TimeSpan end_time0 = new TimeSpan(DateTime.Now.Ticks);
+ double elapsed_milliseconds0 = end_time0.TotalMilliseconds - start_time0.TotalMilliseconds;
+ Console.WriteLine("load model and init config elapsed_milliseconds:{0}", elapsed_milliseconds0.ToString());
+ List<float[]> samples = new List<float[]>();
+ TimeSpan total_duration = new TimeSpan(0L);
+ for (int i = 0; i < 2; i++)
+ {
+ string wavFilePath = string.Format(applicationBase + "./speech_fsmn_vad_zh-cn-16k-common-pytorch/example/{0}.wav", i.ToString());//vad_example
+ if (!File.Exists(wavFilePath))
+ {
+ continue;
+ }
+ AudioFileReader _audioFileReader = new AudioFileReader(wavFilePath);
+ byte[] datas = new byte[_audioFileReader.Length];
+ _audioFileReader.Read(datas, 0, datas.Length);
+ TimeSpan duration = _audioFileReader.TotalTime;
+ float[] wavdata = new float[datas.Length / 4];
+ Buffer.BlockCopy(datas, 0, wavdata, 0, datas.Length);
+ float[] sample = wavdata.Select((float x) => x * 32768f).ToArray();
+ samples.Add(wavdata);
+ total_duration += duration;
+ }
+ TimeSpan start_time = new TimeSpan(DateTime.Now.Ticks);
+ //SegmentEntity[] segments_duration = aliFsmnVad.GetSegments(samples);
+ SegmentEntity[] segments_duration = aliFsmnVad.GetSegmentsByStep(samples);
+ TimeSpan end_time = new TimeSpan(DateTime.Now.Ticks);
+ Console.WriteLine("vad infer result:");
+ foreach (SegmentEntity segment in segments_duration)
+ {
+ Console.Write("[");
+ foreach (var x in segment.Segment)
+ {
+ Console.Write("[" + string.Join(",", x.ToArray()) + "]");
+ }
+ Console.Write("]\r\n");
+ }
+
+ double elapsed_milliseconds = end_time.TotalMilliseconds - start_time.TotalMilliseconds;
+ double rtf = elapsed_milliseconds / total_duration.TotalMilliseconds;
+ Console.WriteLine("elapsed_milliseconds:{0}", elapsed_milliseconds.ToString());
+ Console.WriteLine("total_duration:{0}", total_duration.TotalMilliseconds.ToString());
+ Console.WriteLine("rtf:{1}", "0".ToString(), rtf.ToString());
+ Console.WriteLine("------------------------");
+ }
+}
\ No newline at end of file
diff --git a/funasr/runtime/csharp/AliFsmnVadSharp.sln b/funasr/runtime/csharp/AliFsmnVadSharp.sln
new file mode 100644
index 0000000..8bf24aa
--- /dev/null
+++ b/funasr/runtime/csharp/AliFsmnVadSharp.sln
@@ -0,0 +1,37 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio Version 17
+VisualStudioVersion = 17.1.32210.238
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "AliFsmnVadSharp", "AliFsmnVadSharp\AliFsmnVadSharp.csproj", "{BFB82F2E-AD5B-405C-AAFF-3CE33C548748}"
+EndProject
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "AliFsmnVadSharp.Examples", "AliFsmnVadSharp.Examples\AliFsmnVadSharp.Examples.csproj", "{2FFA4D03-A62B-435B-B57B-7E49209810E1}"
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{212561CC-9836-4F45-A31B-298EF576F519}"
+ ProjectSection(SolutionItems) = preProject
+ license = license
+ README.md = README.md
+ EndProjectSection
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|Any CPU = Debug|Any CPU
+ Release|Any CPU = Release|Any CPU
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {BFB82F2E-AD5B-405C-AAFF-3CE33C548748}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {BFB82F2E-AD5B-405C-AAFF-3CE33C548748}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {BFB82F2E-AD5B-405C-AAFF-3CE33C548748}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {BFB82F2E-AD5B-405C-AAFF-3CE33C548748}.Release|Any CPU.Build.0 = Release|Any CPU
+ {2FFA4D03-A62B-435B-B57B-7E49209810E1}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {2FFA4D03-A62B-435B-B57B-7E49209810E1}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {2FFA4D03-A62B-435B-B57B-7E49209810E1}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {2FFA4D03-A62B-435B-B57B-7E49209810E1}.Release|Any CPU.Build.0 = Release|Any CPU
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+ GlobalSection(ExtensibilityGlobals) = postSolution
+ SolutionGuid = {FCC1BBCC-91A3-4223-B368-D272FB5108B6}
+ EndGlobalSection
+EndGlobal
diff --git a/funasr/runtime/csharp/AliFsmnVadSharp/AliFsmnVad.cs b/funasr/runtime/csharp/AliFsmnVadSharp/AliFsmnVad.cs
new file mode 100644
index 0000000..f42bfb1
--- /dev/null
+++ b/funasr/runtime/csharp/AliFsmnVadSharp/AliFsmnVad.cs
@@ -0,0 +1,387 @@
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using Microsoft.ML;
+using Microsoft.ML.OnnxRuntime;
+using Microsoft.ML.OnnxRuntime.Tensors;
+using Microsoft.Extensions.Logging;
+using AliFsmnVadSharp.Model;
+using AliFsmnVadSharp.Utils;
+
+namespace AliFsmnVadSharp
+{
+ public class AliFsmnVad
+ {
+ private InferenceSession _onnxSession;
+ private readonly ILogger<AliFsmnVad> _logger;
+ private string _frontend;
+ private WavFrontend _wavFrontend;
+ private int _batchSize = 1;
+ private int _max_end_sil = int.MinValue;
+ private EncoderConfEntity _encoderConfEntity;
+ private VadPostConfEntity _vad_post_conf;
+
+ public AliFsmnVad(string modelFilePath, string configFilePath, string mvnFilePath, int batchSize = 1)
+ {
+ Microsoft.ML.OnnxRuntime.SessionOptions options = new Microsoft.ML.OnnxRuntime.SessionOptions();
+ options.AppendExecutionProvider_CPU(0);
+ options.InterOpNumThreads = 1;
+ _onnxSession = new InferenceSession(modelFilePath, options);
+
+ VadYamlEntity vadYamlEntity = YamlHelper.ReadYaml<VadYamlEntity>(configFilePath);
+ _wavFrontend = new WavFrontend(mvnFilePath, vadYamlEntity.frontend_conf);
+ _frontend = vadYamlEntity.frontend;
+ _vad_post_conf = vadYamlEntity.vad_post_conf;
+ _batchSize = batchSize;
+ _max_end_sil = _max_end_sil != int.MinValue ? _max_end_sil : vadYamlEntity.vad_post_conf.max_end_silence_time;
+ _encoderConfEntity = vadYamlEntity.encoder_conf;
+
+ ILoggerFactory loggerFactory = new LoggerFactory();
+ _logger = new Logger<AliFsmnVad>(loggerFactory);
+ }
+
+ public SegmentEntity[] GetSegments(List<float[]> samples)
+ {
+ int waveform_nums = samples.Count;
+ _batchSize = Math.Min(waveform_nums, _batchSize);
+ SegmentEntity[] segments = new SegmentEntity[waveform_nums];
+ for (int beg_idx = 0; beg_idx < waveform_nums; beg_idx += _batchSize)
+ {
+ int end_idx = Math.Min(waveform_nums, beg_idx + _batchSize);
+ List<float[]> waveform_list = new List<float[]>();
+ for (int i = beg_idx; i < end_idx; i++)
+ {
+ waveform_list.Add(samples[i]);
+ }
+ List<VadInputEntity> vadInputEntitys = ExtractFeats(waveform_list);
+ try
+ {
+ int t_offset = 0;
+ int step = Math.Min(waveform_list.Max(x => x.Length), 6000);
+ bool is_final = true;
+ List<VadOutputEntity> vadOutputEntitys = Infer(vadInputEntitys);
+ for (int batch_num = beg_idx; batch_num < end_idx; batch_num++)
+ {
+ var scores = vadOutputEntitys[batch_num - beg_idx].Scores;
+ SegmentEntity[] segments_part = vadInputEntitys[batch_num].VadScorer.DefaultCall(scores, waveform_list[batch_num - beg_idx], is_final: is_final, max_end_sil: _max_end_sil, online: false);
+ if (segments_part.Length > 0)
+ {
+#pragma warning disable CS8602 // Dereference of a possibly null reference.
+ if (segments[batch_num] == null)
+ {
+ segments[batch_num] = new SegmentEntity();
+ }
+ segments[batch_num].Segment.AddRange(segments_part[0].Segment); //
+#pragma warning restore CS8602 // Dereference of a possibly null reference.
+
+ }
+ }
+ }
+ catch (OnnxRuntimeException ex)
+ {
+ _logger.LogWarning("input wav is silence or noise");
+ segments = null;
+ }
+// for (int batch_num = 0; batch_num < _batchSize; batch_num++)
+// {
+// List<float[]> segment_waveforms = new List<float[]>();
+// foreach (int[] segment in segments[beg_idx + batch_num].Segment)
+// {
+// // (int)(16000 * (segment[0] / 1000.0) * 2);
+// int frame_length = (((6000 * 400) / 400 - 1) * 160 + 400) / 60 / 1000;
+// int frame_start = segment[0] * frame_length;
+// int frame_end = segment[1] * frame_length;
+// float[] segment_waveform = new float[frame_end - frame_start];
+// Array.Copy(waveform_list[batch_num], frame_start, segment_waveform, 0, segment_waveform.Length);
+// segment_waveforms.Add(segment_waveform);
+// }
+// segments[beg_idx + batch_num].Waveform.AddRange(segment_waveforms);
+// }
+ }
+
+ return segments;
+ }
+
+ public SegmentEntity[] GetSegmentsByStep(List<float[]> samples)
+ {
+ int waveform_nums = samples.Count;
+ _batchSize=Math.Min(waveform_nums, _batchSize);
+ SegmentEntity[] segments = new SegmentEntity[waveform_nums];
+ for (int beg_idx = 0; beg_idx < waveform_nums; beg_idx += _batchSize)
+ {
+ int end_idx = Math.Min(waveform_nums, beg_idx + _batchSize);
+ List<float[]> waveform_list = new List<float[]>();
+ for (int i = beg_idx; i < end_idx; i++)
+ {
+ waveform_list.Add(samples[i]);
+ }
+ List<VadInputEntity> vadInputEntitys = ExtractFeats(waveform_list);
+ int feats_len = vadInputEntitys.Max(x => x.SpeechLength);
+ List<float[]> in_cache = new List<float[]>();
+ in_cache = PrepareCache(in_cache);
+ try
+ {
+ int step = Math.Min(vadInputEntitys.Max(x => x.SpeechLength), 6000 * 400);
+ bool is_final = true;
+ for (int t_offset = 0; t_offset < (int)(feats_len); t_offset += Math.Min(step, feats_len - t_offset))
+ {
+
+ if (t_offset + step >= feats_len - 1)
+ {
+ step = feats_len - t_offset;
+ is_final = true;
+ }
+ else
+ {
+ is_final = false;
+ }
+ List<VadInputEntity> vadInputEntitys_step = new List<VadInputEntity>();
+ foreach (VadInputEntity vadInputEntity in vadInputEntitys)
+ {
+ VadInputEntity vadInputEntity_step = new VadInputEntity();
+ float[]? feats = vadInputEntity.Speech;
+ int curr_step = Math.Min(feats.Length - t_offset, step);
+ if (curr_step <= 0)
+ {
+ vadInputEntity_step.Speech = new float[32000];
+ vadInputEntity_step.SpeechLength = 0;
+ vadInputEntity_step.InCaches = in_cache;
+ vadInputEntity_step.Waveform = new float[(((int)(32000) / 400 - 1) * 160 + 400)];
+ vadInputEntitys_step.Add(vadInputEntity_step);
+ continue;
+ }
+ float[]? feats_step = new float[curr_step];
+ Array.Copy(feats, t_offset, feats_step, 0, feats_step.Length);
+ float[]? waveform = vadInputEntity.Waveform;
+ float[]? waveform_step = new float[Math.Min(waveform.Length, ((int)(t_offset + step) / 400 - 1) * 160 + 400) - t_offset / 400 * 160];
+ Array.Copy(waveform, t_offset / 400 * 160, waveform_step, 0, waveform_step.Length);
+ vadInputEntity_step.Speech = feats_step;
+ vadInputEntity_step.SpeechLength = feats_step.Length;
+ vadInputEntity_step.InCaches = vadInputEntity.InCaches;
+ vadInputEntity_step.Waveform = waveform_step;
+ vadInputEntitys_step.Add(vadInputEntity_step);
+ }
+ List<VadOutputEntity> vadOutputEntitys = Infer(vadInputEntitys_step);
+ for (int batch_num = 0; batch_num < _batchSize; batch_num++)
+ {
+ vadInputEntitys[batch_num].InCaches = vadOutputEntitys[batch_num].OutCaches;
+ var scores = vadOutputEntitys[batch_num].Scores;
+ SegmentEntity[] segments_part = vadInputEntitys[batch_num].VadScorer.DefaultCall(scores, vadInputEntitys_step[batch_num].Waveform, is_final: is_final, max_end_sil: _max_end_sil, online: false);
+ if (segments_part.Length > 0)
+ {
+
+#pragma warning disable CS8602 // Dereference of a possibly null reference.
+ if (segments[beg_idx + batch_num] == null)
+ {
+ segments[beg_idx + batch_num] = new SegmentEntity();
+ }
+ if (segments_part[0] != null)
+ {
+ segments[beg_idx + batch_num].Segment.AddRange(segments_part[0].Segment);
+ }
+#pragma warning restore CS8602 // Dereference of a possibly null reference.
+
+ }
+ }
+ }
+ }
+ catch (OnnxRuntimeException ex)
+ {
+ _logger.LogWarning("input wav is silence or noise");
+ segments = null;
+ }
+// for (int batch_num = 0; batch_num < _batchSize; batch_num++)
+// {
+// List<float[]> segment_waveforms=new List<float[]>();
+// foreach (int[] segment in segments[beg_idx + batch_num].Segment)
+// {
+// // (int)(16000 * (segment[0] / 1000.0) * 2);
+// int frame_length = (((6000 * 400) / 400 - 1) * 160 + 400) / 60 / 1000;
+// int frame_start = segment[0] * frame_length;
+// int frame_end = segment[1] * frame_length;
+// if(frame_end > waveform_list[batch_num].Length)
+// {
+// break;
+// }
+// float[] segment_waveform = new float[frame_end - frame_start];
+// Array.Copy(waveform_list[batch_num], frame_start, segment_waveform, 0, segment_waveform.Length);
+// segment_waveforms.Add(segment_waveform);
+// }
+// segments[beg_idx + batch_num].Waveform.AddRange(segment_waveforms);
+// }
+
+ }
+ return segments;
+ }
+
+ private List<float[]> PrepareCache(List<float[]> in_cache)
+ {
+ if (in_cache.Count > 0)
+ {
+ return in_cache;
+ }
+
+ int fsmn_layers = _encoderConfEntity.fsmn_layers;
+
+ int proj_dim = _encoderConfEntity.proj_dim;
+ int lorder = _encoderConfEntity.lorder;
+
+ for (int i = 0; i < fsmn_layers; i++)
+ {
+ float[] cache = new float[1 * proj_dim * (lorder - 1) * 1];
+ in_cache.Add(cache);
+ }
+ return in_cache;
+ }
+
+ private List<VadInputEntity> ExtractFeats(List<float[]> waveform_list)
+ {
+ List<float[]> in_cache = new List<float[]>();
+ in_cache = PrepareCache(in_cache);
+ List<VadInputEntity> vadInputEntitys = new List<VadInputEntity>();
+ foreach (var waveform in waveform_list)
+ {
+ float[] fbanks = _wavFrontend.GetFbank(waveform);
+ float[] features = _wavFrontend.LfrCmvn(fbanks);
+ VadInputEntity vadInputEntity = new VadInputEntity();
+ vadInputEntity.Waveform = waveform;
+ vadInputEntity.Speech = features;
+ vadInputEntity.SpeechLength = features.Length;
+ vadInputEntity.InCaches = in_cache;
+ vadInputEntity.VadScorer = new E2EVadModel(_vad_post_conf);
+ vadInputEntitys.Add(vadInputEntity);
+ }
+ return vadInputEntitys;
+ }
+ /// <summary>
+ /// Converts a 1-D array into a 3-D array of shape [len, wid, height], where height = oneDimObj.Length / (len * wid).
+ /// </summary>
+ /// <param name="oneDimObj">The source 1-D array.</param>
+ /// <param name="len">Length of the first dimension.</param>
+ /// <param name="wid">Length of the second dimension.</param>
+ /// <returns>The 3-D array, or null when oneDimObj.Length is not divisible by len * wid.</returns>
+ public static T[,,] DimOneToThree<T>(T[] oneDimObj, int len, int wid)
+ {
+ if (oneDimObj.Length % (len * wid) != 0)
+ return null;
+ int height = oneDimObj.Length / (len * wid);
+ T[,,] threeDimObj = new T[len, wid, height];
+
+ for (int i = 0; i < oneDimObj.Length; i++)
+ {
+ threeDimObj[i / (wid * height), (i / height) % wid, i % height] = oneDimObj[i];
+ }
+ return threeDimObj;
+ }
+
+ private List<VadOutputEntity> Infer(List<VadInputEntity> vadInputEntitys)
+ {
+ List<VadOutputEntity> vadOutputEntities = new List<VadOutputEntity>();
+ foreach (VadInputEntity vadInputEntity in vadInputEntitys)
+ {
+ int batchSize = 1;//_batchSize
+ var inputMeta = _onnxSession.InputMetadata;
+ var container = new List<NamedOnnxValue>();
+ int[] dim = new int[] { batchSize, vadInputEntity.Speech.Length / 400 / batchSize, 400 };
+ var tensor = new DenseTensor<float>(vadInputEntity.Speech, dim, false);
+ container.Add(NamedOnnxValue.CreateFromTensor<float>("speech", tensor));
+
+ int i = 0;
+ foreach (var cache in vadInputEntity.InCaches)
+ {
+ int[] cache_dim = new int[] { 1, 128, cache.Length / 128 / 1, 1 };
+ var cache_tensor = new DenseTensor<float>(cache, cache_dim, false);
+ container.Add(NamedOnnxValue.CreateFromTensor<float>("in_cache" + i.ToString(), cache_tensor));
+ i++;
+ }
+
+ IDisposableReadOnlyCollection<DisposableNamedOnnxValue> results = _onnxSession.Run(container);
+ var resultsArray = results.ToArray();
+ VadOutputEntity vadOutputEntity = new VadOutputEntity();
+ for (int j = 0; j < resultsArray.Length; j++)
+ {
+ if (resultsArray[j].Name.Equals("logits"))
+ {
+ Tensor<float> tensors = resultsArray[0].AsTensor<float>();
+ var _scores = DimOneToThree<float>(tensors.ToArray(), 1, tensors.Dimensions[1]);
+ vadOutputEntity.Scores = _scores;
+ }
+ if (resultsArray[j].Name.StartsWith("out_cache"))
+ {
+ vadOutputEntity.OutCaches.Add(resultsArray[j].AsEnumerable<float>().ToArray());
+ }
+
+ }
+ vadOutputEntities.Add(vadOutputEntity);
+ }
+
+ return vadOutputEntities;
+ }
+
+ private float[] PadSequence(List<VadInputEntity> modelInputs)
+ {
+ int max_speech_length = modelInputs.Max(x => x.SpeechLength);
+ int speech_length = max_speech_length * modelInputs.Count;
+ float[] speech = new float[speech_length];
+ float[,] xxx = new float[modelInputs.Count, max_speech_length];
+ for (int i = 0; i < modelInputs.Count; i++)
+ {
+ if (max_speech_length == modelInputs[i].SpeechLength)
+ {
+ for (int j = 0; j < xxx.GetLength(1); j++)
+ {
+#pragma warning disable CS8602 // Dereference of a possibly null reference.
+ xxx[i, j] = modelInputs[i].Speech[j];
+#pragma warning restore CS8602 // Dereference of a possibly null reference.
+ }
+ continue;
+ }
+ float[] nullspeech = new float[max_speech_length - modelInputs[i].SpeechLength];
+ float[]? curr_speech = modelInputs[i].Speech;
+ float[] padspeech = new float[max_speech_length];
+ // ///////////////////////////////////////////////////
+ var arr_neg_mean = _onnxSession.ModelMetadata.CustomMetadataMap["neg_mean"].ToString().Split(',').ToArray();
+ double[] neg_mean = arr_neg_mean.Select(x => (double)Convert.ToDouble(x)).ToArray();
+ var arr_inv_stddev = _onnxSession.ModelMetadata.CustomMetadataMap["inv_stddev"].ToString().Split(',').ToArray();
+ double[] inv_stddev = arr_inv_stddev.Select(x => (double)Convert.ToDouble(x)).ToArray();
+
+ int dim = neg_mean.Length;
+ for (int j = 0; j < max_speech_length; j++)
+ {
+ int k = new Random().Next(0, dim);
+ padspeech[j] = (float)((float)(0 + neg_mean[k]) * inv_stddev[k]);
+ }
+ Array.Copy(curr_speech, 0, padspeech, 0, curr_speech.Length);
+ for (int j = 0; j < padspeech.Length; j++)
+ {
+#pragma warning disable CS8602 // Dereference of a possibly null reference.
+ xxx[i, j] = padspeech[j];
+#pragma warning restore CS8602 // Dereference of a possibly null reference.
+ }
+
+ }
+ int s = 0;
+ for (int i = 0; i < xxx.GetLength(0); i++)
+ {
+ for (int j = 0; j < xxx.GetLength(1); j++)
+ {
+ speech[s] = xxx[i, j];
+ s++;
+ }
+ }
+ return speech;
+ }
+
+
+
+
+
+
+
+
+
+
+
+
+ }
+}
\ No newline at end of file
diff --git a/funasr/runtime/csharp/AliFsmnVadSharp/AliFsmnVadSharp.csproj b/funasr/runtime/csharp/AliFsmnVadSharp/AliFsmnVadSharp.csproj
new file mode 100644
index 0000000..4991517
--- /dev/null
+++ b/funasr/runtime/csharp/AliFsmnVadSharp/AliFsmnVadSharp.csproj
@@ -0,0 +1,37 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+ <PropertyGroup>
+ <TargetFramework>net6.0</TargetFramework>
+ <ImplicitUsings>enable</ImplicitUsings>
+ <Nullable>enable</Nullable>
+ </PropertyGroup>
+
+ <ItemGroup>
+ <PackageReference Include="Microsoft.Extensions.Logging" Version="7.0.0" />
+ <PackageReference Include="Microsoft.ML.OnnxRuntime" Version="1.15.0" />
+ <PackageReference Include="YamlDotNet" Version="13.1.0" />
+ </ItemGroup>
+
+ <ItemGroup>
+ <None Update="Lib\kaldi-native-fbank-dll.dll">
+ <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+ <TargetPath>kaldi-native-fbank-dll.dll</TargetPath>
+ </None>
+ <None Update="speech_fsmn_vad_zh-cn-16k-common-pytorch\example\0.wav">
+ <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+ </None>
+ <None Update="speech_fsmn_vad_zh-cn-16k-common-pytorch\example\1.wav">
+ <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+ </None>
+ <None Update="speech_fsmn_vad_zh-cn-16k-common-pytorch\model.onnx">
+ <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+ </None>
+ <None Update="speech_fsmn_vad_zh-cn-16k-common-pytorch\vad.mvn">
+ <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+ </None>
+ <None Update="speech_fsmn_vad_zh-cn-16k-common-pytorch\vad.yaml">
+ <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+ </None>
+ </ItemGroup>
+
+</Project>
diff --git a/funasr/runtime/csharp/AliFsmnVadSharp/DLL/KaldiNativeFbank.cs b/funasr/runtime/csharp/AliFsmnVadSharp/DLL/KaldiNativeFbank.cs
new file mode 100644
index 0000000..af0ad36
--- /dev/null
+++ b/funasr/runtime/csharp/AliFsmnVadSharp/DLL/KaldiNativeFbank.cs
@@ -0,0 +1,40 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using System.Runtime.InteropServices;
+using AliFsmnVadSharp.Struct;
+
+namespace AliFsmnVadSharp.DLL
+{
+ public static class KaldiNativeFbank
+ {
+ private const string dllName = @"kaldi-native-fbank-dll";
+
+ [DllImport(dllName, EntryPoint = "GetFbankOptions", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)]
+ internal static extern IntPtr GetFbankOptions(float dither, bool snip_edges, float sample_rate, int num_bins, float frame_shift = 10.0f, float frame_length = 25.0f, float energy_floor = 0.0f, bool debug_mel = false, string window_type = "hamming");
+
+ [DllImport(dllName, EntryPoint = "GetOnlineFbank", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)]
+ internal static extern KnfOnlineFbank GetOnlineFbank(IntPtr opts);
+
+ [DllImport(dllName, EntryPoint = "AcceptWaveform", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)]
+ internal static extern void AcceptWaveform(KnfOnlineFbank knfOnlineFbank, float sample_rate, float[] samples, int samples_size);
+
+ [DllImport(dllName, EntryPoint = "InputFinished", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)]
+ internal static extern void InputFinished(KnfOnlineFbank knfOnlineFbank);
+
+ [DllImport(dllName, EntryPoint = "GetNumFramesReady", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)]
+ internal static extern int GetNumFramesReady(KnfOnlineFbank knfOnlineFbank);
+
+ [DllImport(dllName, EntryPoint = "AcceptWaveformxxx", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)]
+ internal static extern FbankDatas AcceptWaveformxxx(KnfOnlineFbank knfOnlineFbank, float sample_rate, float[] samples, int samples_size);
+
+ [DllImport(dllName, EntryPoint = "GetFbank", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)]
+ internal static extern void GetFbank(KnfOnlineFbank knfOnlineFbank,int frame, ref FbankData pData);
+
+ [DllImport(dllName, EntryPoint = "GetFbanks", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)]
+ internal static extern void GetFbanks(KnfOnlineFbank knfOnlineFbank, int framesNum, ref FbankDatas fbankDatas);
+
+ }
+}
diff --git a/funasr/runtime/csharp/AliFsmnVadSharp/DLL/KnfOnlineFbank.cs b/funasr/runtime/csharp/AliFsmnVadSharp/DLL/KnfOnlineFbank.cs
new file mode 100644
index 0000000..45549b2
--- /dev/null
+++ b/funasr/runtime/csharp/AliFsmnVadSharp/DLL/KnfOnlineFbank.cs
@@ -0,0 +1,26 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Runtime.InteropServices;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace AliFsmnVadSharp.DLL
+{
+ internal struct FbankData
+ {
+ public IntPtr data;
+ public int data_length;
+ };
+
+ internal struct FbankDatas
+ {
+ public IntPtr data;
+ public int data_length;
+ };
+
+ internal struct KnfOnlineFbank
+ {
+ public IntPtr impl;
+ };
+}
diff --git a/funasr/runtime/csharp/AliFsmnVadSharp/E2EVadModel.cs b/funasr/runtime/csharp/AliFsmnVadSharp/E2EVadModel.cs
new file mode 100644
index 0000000..ce519b1
--- /dev/null
+++ b/funasr/runtime/csharp/AliFsmnVadSharp/E2EVadModel.cs
@@ -0,0 +1,717 @@
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using AliFsmnVadSharp.Model;
+
+namespace AliFsmnVadSharp
+{
+    // State of the end-to-end VAD: waiting for a start point, inside a
+    // speech segment, or end point already emitted.
+    enum VadStateMachine
+    {
+        kVadInStateStartPointNotDetected = 1,
+        kVadInStateInSpeechSegment = 2,
+        kVadInStateEndPointDetected = 3,
+    }
+    // Single-utterance mode stops after the first end point; multiple
+    // mode resets and keeps detecting further segments.
+    enum VadDetectMode
+    {
+        kVadSingleUtteranceDetectMode = 0,
+        kVadMutipleUtteranceDetectMode = 1,
+    }
+
+
+ internal class E2EVadModel
+ {
+        // Post-processing options (thresholds, window sizes, mode flags).
+        private VadPostConfEntity _vad_opts = new VadPostConfEntity();
+        // Sliding-window sil<->speech transition detector.
+        private WindowDetector _windows_detector = new WindowDetector();
+        private bool _is_final = false;
+        // Index of the first frame still held in the logical data buffer.
+        private int _data_buf_start_frame = 0;
+        // Total number of frames scored so far across all chunks.
+        private int _frm_cnt = 0;
+        private int _latest_confirmed_speech_frame = 0;
+        private int _lastest_confirmed_silence_frame = -1;
+        private int _continous_silence_frame_count = 0;
+        private int _vad_state_machine = (int)VadStateMachine.kVadInStateStartPointNotDetected;
+        private int _confirmed_start_frame = -1;
+        private int _confirmed_end_frame = -1;
+        private int _number_end_time_detected = 0;
+        private int _sil_frame = 0;
+        // Output-class ids that represent silence in the acoustic model.
+        private int[] _sil_pdf_ids = new int[0];
+        // Running noise-floor estimate, in dB.
+        private double _noise_average_decibel = -100.0D;
+        private bool _pre_end_silence_detected = false;
+        private bool _next_seg = true;
+
+        // Confirmed (or growing) segments waiting to be returned to the caller.
+        private List<E2EVadSpeechBufWithDoaEntity> _output_data_buf;
+        private int _output_data_buf_offset = 0;
+        private List<E2EVadFrameProbEntity> _frame_probs = new List<E2EVadFrameProbEntity>();
+        // Max trailing silence (ms) before an end point is forced.
+        private int _max_end_sil_frame_cnt_thresh = 800 - 150;
+        private float _speech_noise_thres = 0.6F;
+        // Acoustic scores of the current chunk: [batch, frames, classes].
+        private float[,,] _scores = null;
+        // Global frame offset of the current chunk inside the stream.
+        private int _idx_pre_chunk = 0;
+        private bool _max_time_out = false;
+        // Per-frame energy (dB), accumulated over all chunks.
+        private List<double> _decibel = new List<double>();
+        // Remaining / total sample counts of the logical data buffer.
+        private int _data_buf_size = 0;
+        private int _data_buf_all_size = 0;
+
+        // Builds the VAD post-processor from the parsed vad_post_conf
+        // section of the model yaml and puts it into a clean state.
+        public E2EVadModel(VadPostConfEntity vadPostConfEntity)
+        {
+            _vad_opts = vadPostConfEntity;
+            _windows_detector = new WindowDetector(_vad_opts.window_size_ms,
+                _vad_opts.sil_to_speech_time_thres,
+                _vad_opts.speech_to_sil_time_thres,
+                _vad_opts.frame_in_ms);
+            AllResetDetection();
+        }
+
+        // Resets ALL state (stream counters, buffers, statistics) so the
+        // instance can process a brand-new stream; called from the ctor
+        // and after the final chunk of a query.
+        private void AllResetDetection()
+        {
+            _is_final = false;
+            _data_buf_start_frame = 0;
+            _frm_cnt = 0;
+            _latest_confirmed_speech_frame = 0;
+            _lastest_confirmed_silence_frame = -1;
+            _continous_silence_frame_count = 0;
+            _vad_state_machine = (int)VadStateMachine.kVadInStateStartPointNotDetected;
+            _confirmed_start_frame = -1;
+            _confirmed_end_frame = -1;
+            _number_end_time_detected = 0;
+            _sil_frame = 0;
+            _sil_pdf_ids = _vad_opts.sil_pdf_ids;
+            _noise_average_decibel = -100.0F;
+            _pre_end_silence_detected = false;
+            _next_seg = true;
+
+            _output_data_buf = new List<E2EVadSpeechBufWithDoaEntity>();
+            _output_data_buf_offset = 0;
+            _frame_probs = new List<E2EVadFrameProbEntity>();
+            _max_end_sil_frame_cnt_thresh = _vad_opts.max_end_silence_time - _vad_opts.speech_to_sil_time_thres;
+            _speech_noise_thres = _vad_opts.speech_noise_thres;
+            _scores = null;
+            _idx_pre_chunk = 0;
+            _max_time_out = false;
+            _decibel = new List<double>();
+            _data_buf_size = 0;
+            _data_buf_all_size = 0;
+            ResetDetection();
+        }
+
+        // Resets only per-segment detection state; stream-level counters
+        // (_frm_cnt, _decibel, _scores, ...) are kept so detection can
+        // continue within the same stream (multi-utterance mode).
+        private void ResetDetection()
+        {
+            _continous_silence_frame_count = 0;
+            _latest_confirmed_speech_frame = 0;
+            _lastest_confirmed_silence_frame = -1;
+            _confirmed_start_frame = -1;
+            _confirmed_end_frame = -1;
+            _vad_state_machine = (int)VadStateMachine.kVadInStateStartPointNotDetected;
+            _windows_detector.Reset();
+            _sil_frame = 0;
+            _frame_probs = new List<E2EVadFrameProbEntity>();
+        }
+
+        // Appends the energy (in dB) of every complete frame of `waveform`
+        // to _decibel and tracks the logical data-buffer sample counts.
+        private void ComputeDecibel(float[] waveform)
+        {
+            int frame_sample_length = (int)(_vad_opts.frame_length_ms * _vad_opts.sample_rate / 1000);
+            int frame_shift_length = (int)(_vad_opts.frame_in_ms * _vad_opts.sample_rate / 1000);
+            if (_data_buf_all_size == 0)
+            {
+                // First chunk of the stream.
+                _data_buf_all_size = waveform.Length;
+                _data_buf_size = _data_buf_all_size;
+            }
+            else
+            {
+                _data_buf_all_size += waveform.Length;
+            }
+
+            for (int offset = 0; offset < waveform.Length - frame_sample_length + 1; offset += frame_shift_length)
+            {
+                float[] _waveform_chunk = new float[frame_sample_length];
+                Array.Copy(waveform, offset, _waveform_chunk, 0, _waveform_chunk.Length);
+                float[] _waveform_chunk_pow = _waveform_chunk.Select(x => (float)Math.Pow((double)x, 2)).ToArray();
+                // The 1e-6 term guards Log10 against an all-zero frame.
+                _decibel.Add(
+                    10 * Math.Log10(
+                        _waveform_chunk_pow.Sum() + 0.000001
+                    )
+                );
+            }
+
+        }
+
+        // Stores the newest chunk of acoustic scores.  Note that the
+        // configured nn_eval_block_size is overwritten with the actual
+        // frame count of this chunk, so later loops iterate exactly once
+        // per scored frame.
+        private void ComputeScores(float[,,] scores)
+        {
+            _vad_opts.nn_eval_block_size = scores.GetLength(1);
+            _frm_cnt += scores.GetLength(1);
+            _scores = scores;
+        }
+
+        // Advances the logical start of the data buffer until it reaches
+        // frame_idx, shrinking _data_buf_size accordingly.
+        private void PopDataBufTillFrame(int frame_idx)// need check again
+        {
+            while (_data_buf_start_frame < frame_idx)
+            {
+                if (_data_buf_size >= (int)(_vad_opts.frame_in_ms * _vad_opts.sample_rate / 1000))
+                {
+                    _data_buf_start_frame += 1;
+                    _data_buf_size = _data_buf_all_size - _data_buf_start_frame * (int)(_vad_opts.frame_in_ms * _vad_opts.sample_rate / 1000);
+                }
+                else
+                {
+                    // BUGFIX: the original looped forever here -- when fewer
+                    // than one frame of samples remained, neither branch
+                    // advanced _data_buf_start_frame.  Bail out instead of
+                    // spinning; normal pops are unaffected.
+                    break;
+                }
+            }
+        }
+
+        // Moves `frm_cnt` frames starting at `start_frm` from the logical
+        // data buffer into the current output segment; opens a new segment
+        // when `first_frm_is_start_point` is set and closes it when
+        // `last_frm_is_end_point` is set.
+        private void PopDataToOutputBuf(int start_frm, int frm_cnt, bool first_frm_is_start_point,
+            bool last_frm_is_end_point, bool end_point_is_sent_end)
+        {
+            PopDataBufTillFrame(start_frm);
+            int expected_sample_number = (int)(frm_cnt * _vad_opts.sample_rate * _vad_opts.frame_in_ms / 1000);
+            if (last_frm_is_end_point)
+            {
+                // The last frame additionally carries the frame_length /
+                // frame_shift overlap.
+                int extra_sample = Math.Max(0, (int)(_vad_opts.frame_length_ms * _vad_opts.sample_rate / 1000 - _vad_opts.sample_rate * _vad_opts.frame_in_ms / 1000));
+                expected_sample_number += (int)(extra_sample);
+            }
+
+            if (end_point_is_sent_end)
+            {
+                expected_sample_number = Math.Max(expected_sample_number, _data_buf_size);
+            }
+            if (_data_buf_size < expected_sample_number)
+            {
+                Console.WriteLine("error in calling pop data_buf\n");
+            }
+
+            if (_output_data_buf.Count == 0 || first_frm_is_start_point)
+            {
+                // Open a new output segment starting at start_frm.
+                _output_data_buf.Add(new E2EVadSpeechBufWithDoaEntity());
+                _output_data_buf.Last().Reset();
+                _output_data_buf.Last().start_ms = start_frm * _vad_opts.frame_in_ms;
+                _output_data_buf.Last().end_ms = _output_data_buf.Last().start_ms;
+                _output_data_buf.Last().doa = 0;
+            }
+
+            E2EVadSpeechBufWithDoaEntity cur_seg = _output_data_buf.Last();
+            if (cur_seg.end_ms != start_frm * _vad_opts.frame_in_ms)
+            {
+                Console.WriteLine("warning\n");
+            }
+
+            int out_pos = cur_seg.buffer.Length; // cur_seg.buffer is not manipulated yet
+            int data_to_pop = 0;
+            if (end_point_is_sent_end)
+            {
+                data_to_pop = expected_sample_number;
+            }
+            else
+            {
+                data_to_pop = (int)(frm_cnt * _vad_opts.frame_in_ms * _vad_opts.sample_rate / 1000);
+            }
+            if (data_to_pop > _data_buf_size)
+            {
+                Console.WriteLine("VAD data_to_pop is bigger than _data_buf_size!!!\n");
+                data_to_pop = _data_buf_size;
+                expected_sample_number = _data_buf_size;
+            }
+
+
+            cur_seg.doa = 0;
+            // The two loops below only advance out_pos: the per-sample copy
+            // of the reference implementation is not ported (segment audio
+            // is not stored here), so they are effectively no-ops.
+            for (int sample_cpy_out = 0; sample_cpy_out < data_to_pop; sample_cpy_out++)
+            {
+                out_pos += 1;
+            }
+            for (int sample_cpy_out = data_to_pop; sample_cpy_out < expected_sample_number; sample_cpy_out++)
+            {
+                out_pos += 1;
+            }
+
+            if (cur_seg.end_ms != start_frm * _vad_opts.frame_in_ms)
+            {
+                Console.WriteLine("Something wrong with the VAD algorithm\n");
+            }
+
+            _data_buf_start_frame += frm_cnt;
+            cur_seg.end_ms = (start_frm + frm_cnt) * _vad_opts.frame_in_ms;
+            if (first_frm_is_start_point)
+            {
+                cur_seg.contain_seg_start_point = true;
+            }
+
+            if (last_frm_is_end_point)
+            {
+                cur_seg.contain_seg_end_point = true;
+            }
+        }
+
+        // Marks `valid_frame` as confirmed silence.  Before any start point
+        // has been found, buffered audio up to that frame can be discarded.
+        private void OnSilenceDetected(int valid_frame)
+        {
+            _lastest_confirmed_silence_frame = valid_frame;
+            if (_vad_state_machine == (int)VadStateMachine.kVadInStateStartPointNotDetected)
+            {
+                PopDataBufTillFrame(valid_frame);
+            }
+
+        }
+
+        // Marks `valid_frame` as confirmed speech and moves its audio span
+        // into the current output segment.
+        private void OnVoiceDetected(int valid_frame)
+        {
+            _latest_confirmed_speech_frame = valid_frame;
+            PopDataToOutputBuf(valid_frame, 1, false, false, false);
+        }
+
+        // Confirms a speech start at `start_frame` and opens a new output
+        // segment.  `fake_result` marks the zero-length segment emitted on
+        // silence timeout.
+        private void OnVoiceStart(int start_frame, bool fake_result = false)
+        {
+            if (_vad_opts.do_start_point_detection)
+            {
+                //do nothing
+            }
+            if (_confirmed_start_frame != -1)
+            {
+
+                Console.WriteLine("not reset vad properly\n");
+            }
+            else
+            {
+                _confirmed_start_frame = start_frame;
+            }
+            // NOTE(review): the Python reference implementation combines
+            // these conditions with AND (`not fake_result and state == ...`);
+            // with `||` a fake start still pops data into the output buffer.
+            // Verify which behavior is intended.
+            if (!fake_result || _vad_state_machine == (int)VadStateMachine.kVadInStateStartPointNotDetected)
+            {
+
+                PopDataToOutputBuf(_confirmed_start_frame, 1, true, false, false);
+            }
+        }
+
+        // Confirms a speech end at `end_frame`: flushes the remaining
+        // speech frames into the segment and closes it (unless fake).
+        private void OnVoiceEnd(int end_frame, bool fake_result, bool is_last_frame)
+        {
+            for (int t = _latest_confirmed_speech_frame + 1; t < end_frame; t++)
+            {
+                OnVoiceDetected(t);
+            }
+            if (_vad_opts.do_end_point_detection)
+            {
+                //do nothing
+            }
+            if (_confirmed_end_frame != -1)
+            {
+                Console.WriteLine("not reset vad properly\n");
+            }
+            else
+            {
+                _confirmed_end_frame = end_frame;
+            }
+            if (!fake_result)
+            {
+                _sil_frame = 0;
+                PopDataToOutputBuf(_confirmed_end_frame, 1, false, true, is_last_frame);
+            }
+            _number_end_time_detected += 1;
+        }
+
+        // If this is the final frame of the stream, force an end point at
+        // cur_frm_idx and move to the end-point-detected state.
+        private void MaybeOnVoiceEndIfLastFrame(bool is_final_frame, int cur_frm_idx)
+        {
+            if (is_final_frame)
+            {
+                OnVoiceEnd(cur_frm_idx, false, true);
+                _vad_state_machine = (int)VadStateMachine.kVadInStateEndPointDetected;
+            }
+        }
+
+        // Start-point detection latency in milliseconds.
+        private int GetLatency()
+        {
+            return (int)(LatencyFrmNumAtStartPoint() * _vad_opts.frame_in_ms);
+        }
+
+        // Start-point detection latency in frames: the transition-detector
+        // window plus, when extension is enabled, the lookback span.
+        private int LatencyFrmNumAtStartPoint()
+        {
+            int vad_latency = _windows_detector.GetWinSize();
+            if (_vad_opts.do_extend != 0)
+            {
+                vad_latency += (int)(_vad_opts.lookback_time_start_point / _vad_opts.frame_in_ms);
+            }
+            return vad_latency;
+        }
+
+        // Classifies frame t as speech or silence using frame energy (dB),
+        // SNR against the running noise floor, and the acoustic model's
+        // silence-class posteriors; updates the noise floor on silence.
+        private FrameState GetFrameState(int t)
+        {
+
+            FrameState frame_state = FrameState.kFrameStateInvalid;
+            double cur_decibel = _decibel[t];
+            double cur_snr = cur_decibel - _noise_average_decibel;
+            if (cur_decibel < _vad_opts.decibel_thres)
+            {
+                // Too quiet to be speech.  NOTE(review): the caller also runs
+                // DetectOneFrame on this frame, so it is processed twice here;
+                // this mirrors the reference implementation -- verify intent.
+                frame_state = FrameState.kFrameStateSil;
+                DetectOneFrame(frame_state, t, false);
+                return frame_state;
+            }
+
+
+            double sum_score = 0.0D;
+            double noise_prob = 0.0D;
+            Trace.Assert(_sil_pdf_ids.Length == _vad_opts.silence_pdf_num, "");
+            if (_sil_pdf_ids.Length > 0)
+            {
+                // only batch_size = 1 is supported (the assert message below
+                // is GBK-mojibake inherited from the patch encoding)
+                Trace.Assert(_scores.GetLength(0) == 1, "鍙敮鎸乥atch_size = 1鐨勬祴璇�");
+                float[] sil_pdf_scores = new float[_sil_pdf_ids.Length];
+                int j = 0;
+                foreach (int sil_pdf_id in _sil_pdf_ids)
+                {
+                    // _idx_pre_chunk maps the global frame index t onto the
+                    // frame axis of the current score chunk.
+                    sil_pdf_scores[j] = _scores[0,t - _idx_pre_chunk,sil_pdf_id];
+                    j++;
+                }
+                sum_score = sil_pdf_scores.Length == 0 ? 0 : sil_pdf_scores.Sum();
+                noise_prob = Math.Log(sum_score) * _vad_opts.speech_2_noise_ratio;
+                double total_score = 1.0D;
+                // Speech probability = 1 - silence probability.
+                sum_score = total_score - sum_score;
+            }
+            double speech_prob = Math.Log(sum_score);
+            if (_vad_opts.output_frame_probs)
+            {
+                E2EVadFrameProbEntity frame_prob = new E2EVadFrameProbEntity();
+                frame_prob.noise_prob = noise_prob;
+                frame_prob.speech_prob = speech_prob;
+                frame_prob.score = sum_score;
+                frame_prob.frame_id = t;
+                _frame_probs.Add(frame_prob);
+            }
+
+            if (Math.Exp(speech_prob) >= Math.Exp(noise_prob) + _speech_noise_thres)
+            {
+                // Looks like speech; still require minimum SNR and energy.
+                if (cur_snr >= _vad_opts.snr_thres && cur_decibel >= _vad_opts.decibel_thres)
+                {
+                    frame_state = FrameState.kFrameStateSpeech;
+                }
+                else
+                {
+                    frame_state = FrameState.kFrameStateSil;
+                }
+            }
+            else
+            {
+                frame_state = FrameState.kFrameStateSil;
+                // Update the noise floor with an exponential moving average;
+                // the first silence frame seeds it directly.
+                if (_noise_average_decibel < -99.9)
+                {
+                    _noise_average_decibel = cur_decibel;
+                }
+                else
+                {
+                    _noise_average_decibel = (cur_decibel + _noise_average_decibel * (_vad_opts.noise_frame_num_used_for_snr - 1)) / _vad_opts.noise_frame_num_used_for_snr;
+                }
+            }
+            return frame_state;
+        }
+
+        // Feeds one chunk of scores + audio through the detector and
+        // returns, per batch item, the segments that became available.
+        // Entries of the returned array stay null when a batch item yielded
+        // no new segment in this call -- callers must check for null.
+        //   score       : acoustic posteriors [batch, frames, classes]
+        //   waveform    : raw samples backing the scored frames
+        //   is_final    : true on the last chunk of the stream (resets state)
+        //   max_end_sil : trailing-silence threshold in ms
+        //   online      : streaming mode; may emit half-open segments
+        //                 [start, -1] then [-1, end]
+        public SegmentEntity[] DefaultCall(float[,,] score, float[] waveform,
+            bool is_final = false, int max_end_sil = 800, bool online = false
+            )
+        {
+            _max_end_sil_frame_cnt_thresh = max_end_sil - _vad_opts.speech_to_sil_time_thres;
+            // compute decibel for each frame
+            ComputeDecibel(waveform);
+            ComputeScores(score);
+            if (!is_final)
+            {
+                DetectCommonFrames();
+            }
+            else
+            {
+                DetectLastFrames();
+            }
+            int batchSize = score.GetLength(0);
+            SegmentEntity[] segments = new SegmentEntity[batchSize];
+            for (int batch_num = 0; batch_num < batchSize; batch_num++) // only support batch_size = 1 now
+            {
+                List<int[]> segment_batch = new List<int[]>();
+                if (_output_data_buf.Count > 0)
+                {
+                    for (int i = _output_data_buf_offset; i < _output_data_buf.Count; i++)
+                    {
+                        int start_ms;
+                        int end_ms;
+                        if (online)
+                        {
+                            // Online: only emit a segment once its start point
+                            // is known; the end may arrive in a later call.
+                            if (!_output_data_buf[i].contain_seg_start_point)
+                            {
+                                continue;
+                            }
+                            if (!_next_seg && !_output_data_buf[i].contain_seg_end_point)
+                            {
+                                continue;
+                            }
+                            start_ms = _next_seg ? _output_data_buf[i].start_ms : -1;
+                            if (_output_data_buf[i].contain_seg_end_point)
+                            {
+                                end_ms = _output_data_buf[i].end_ms;
+                                _next_seg = true;
+                                _output_data_buf_offset += 1;
+                            }
+                            else
+                            {
+                                end_ms = -1;
+                                _next_seg = false;
+                            }
+                        }
+                        else
+                        {
+                            // Offline: only fully closed segments are returned
+                            // (everything pending is flushed when is_final).
+                            if (!is_final && (!_output_data_buf[i].contain_seg_start_point || !_output_data_buf[i].contain_seg_end_point))
+                            {
+                                continue;
+                            }
+                            start_ms = _output_data_buf[i].start_ms;
+                            end_ms = _output_data_buf[i].end_ms;
+                            _output_data_buf_offset += 1;
+
+                        }
+                        int[] segment_ms = new int[] { start_ms, end_ms };
+                        segment_batch.Add(segment_ms);
+
+                    }
+
+                }
+
+                if (segment_batch.Count > 0)
+                {
+                    if (segments[batch_num] == null)
+                    {
+                        segments[batch_num] = new SegmentEntity();
+                    }
+                    segments[batch_num].Segment.AddRange(segment_batch);
+                }
+            }
+
+            if (is_final)
+            {
+                // reset class variables and clear the dict for the next query
+                AllResetDetection();
+            }
+
+            return segments;
+        }
+
+        // Runs detection over the frames of the newest (non-final) chunk.
+        private int DetectCommonFrames()
+        {
+            if (_vad_state_machine == (int)VadStateMachine.kVadInStateEndPointDetected)
+            {
+                return 0;
+            }
+            // Walk the chunk's frames from oldest to newest
+            // (nn_eval_block_size was set to this chunk's frame count).
+            for (int i = _vad_opts.nn_eval_block_size - 1; i > -1; i += -1)
+            {
+                FrameState frame_state = FrameState.kFrameStateInvalid;
+                frame_state = GetFrameState(_frm_cnt - 1 - i);
+                DetectOneFrame(frame_state, _frm_cnt - 1 - i, false);
+            }
+
+            // Advance the chunk-local score offset; GetLength(0) is the
+            // batch dimension, asserted to be 1 elsewhere.
+            _idx_pre_chunk += _scores.GetLength(1)* _scores.GetLength(0); //_scores.shape[1];
+            return 0;
+        }
+
+        // Runs detection over the final chunk of a stream; the very last
+        // frame is flagged is_final_frame so pending segments get closed.
+        private int DetectLastFrames()
+        {
+            if (_vad_state_machine == (int)VadStateMachine.kVadInStateEndPointDetected)
+            {
+                return 0;
+            }
+            for (int i = _vad_opts.nn_eval_block_size - 1; i > -1; i += -1)
+            {
+                FrameState frame_state = FrameState.kFrameStateInvalid;
+                frame_state = GetFrameState(_frm_cnt - 1 - i);
+                if (i != 0)
+                {
+                    DetectOneFrame(frame_state, _frm_cnt - 1 - i, false);
+                }
+                else
+                {
+                    // Last frame of the stream.
+                    DetectOneFrame(frame_state, _frm_cnt - 1, true);
+                }
+
+
+            }
+
+            return 0;
+        }
+
+        // Core per-frame state machine.  Feeds the frame's speech/sil label
+        // through the sliding-window detector and reacts to the resulting
+        // transition (sil->speech, speech->sil, speech->speech, sil->sil).
+        private void DetectOneFrame(FrameState cur_frm_state, int cur_frm_idx, bool is_final_frame)
+        {
+            FrameState tmp_cur_frm_state = FrameState.kFrameStateInvalid;
+            if (cur_frm_state == FrameState.kFrameStateSpeech)
+            {
+                // NOTE(review): Math.Abs(1.0) is a constant, so this branch is
+                // taken whenever fe_prior_thres < 1; this mirrors the
+                // reference implementation's `abs(1.0)`.
+                if (Math.Abs(1.0) > _vad_opts.fe_prior_thres)//Fabs
+                {
+                    tmp_cur_frm_state = FrameState.kFrameStateSpeech;
+                }
+                else
+                {
+                    tmp_cur_frm_state = FrameState.kFrameStateSil;
+                }
+            }
+            else if (cur_frm_state == FrameState.kFrameStateSil)
+            {
+                tmp_cur_frm_state = FrameState.kFrameStateSil;
+            }
+
+            AudioChangeState state_change = _windows_detector.DetectOneFrame(tmp_cur_frm_state, cur_frm_idx);
+            int frm_shift_in_ms = _vad_opts.frame_in_ms;
+            if (AudioChangeState.kChangeStateSil2Speech == state_change)
+            {
+                // Silence -> speech transition: possibly a new start point.
+                int silence_frame_count = _continous_silence_frame_count; // unused; kept to mirror the reference implementation
+                _continous_silence_frame_count = 0;
+                _pre_end_silence_detected = false;
+                int start_frame = 0;
+                if (_vad_state_machine == (int)VadStateMachine.kVadInStateStartPointNotDetected)
+                {
+                    // Back-date the start point by the detection latency.
+                    start_frame = Math.Max(_data_buf_start_frame, cur_frm_idx - LatencyFrmNumAtStartPoint());
+                    OnVoiceStart(start_frame);
+                    _vad_state_machine = (int)VadStateMachine.kVadInStateInSpeechSegment;
+                    for (int t = start_frame + 1; t < cur_frm_idx + 1; t++)
+                    {
+                        OnVoiceDetected(t);
+                    }
+
+                }
+                else if (_vad_state_machine == (int)VadStateMachine.kVadInStateInSpeechSegment)
+                {
+                    for (int t = _latest_confirmed_speech_frame + 1; t < cur_frm_idx; t++)
+                    {
+                        OnVoiceDetected(t);
+                    }
+                    // Force an end point when the segment exceeds the maximum
+                    // single-segment duration.
+                    if (cur_frm_idx - _confirmed_start_frame + 1 > _vad_opts.max_single_segment_time / frm_shift_in_ms)
+                    {
+                        OnVoiceEnd(cur_frm_idx, false, false);
+                        _vad_state_machine = (int)VadStateMachine.kVadInStateEndPointDetected;
+                    }
+
+                    else if (!is_final_frame)
+                    {
+                        OnVoiceDetected(cur_frm_idx);
+                    }
+                    else
+                    {
+                        MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx);
+                    }
+
+                }
+                else
+                {
+                    return;
+                }
+            }
+            else if (AudioChangeState.kChangeStateSpeech2Sil == state_change)
+            {
+                // Speech -> silence transition: keep collecting frames; the
+                // end point is only confirmed after enough trailing silence.
+                _continous_silence_frame_count = 0;
+                if (_vad_state_machine == (int)VadStateMachine.kVadInStateStartPointNotDetected)
+                { return; }
+                else if (_vad_state_machine == (int)VadStateMachine.kVadInStateInSpeechSegment)
+                {
+                    if (cur_frm_idx - _confirmed_start_frame + 1 > _vad_opts.max_single_segment_time / frm_shift_in_ms)
+                    {
+                        OnVoiceEnd(cur_frm_idx, false, false);
+                        _vad_state_machine = (int)VadStateMachine.kVadInStateEndPointDetected;
+                    }
+                    else if (!is_final_frame)
+                    {
+                        OnVoiceDetected(cur_frm_idx);
+                    }
+                    else
+                    {
+                        MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx);
+                    }
+
+                }
+                else
+                {
+                    return;
+                }
+            }
+            else if (AudioChangeState.kChangeStateSpeech2Speech == state_change)
+            {
+                // Continuing speech: watch only for the max-duration timeout.
+                _continous_silence_frame_count = 0;
+                if (_vad_state_machine == (int)VadStateMachine.kVadInStateInSpeechSegment)
+                {
+                    if (cur_frm_idx - _confirmed_start_frame + 1 > _vad_opts.max_single_segment_time / frm_shift_in_ms)
+                    {
+                        _max_time_out = true;
+                        OnVoiceEnd(cur_frm_idx, false, false);
+                        _vad_state_machine = (int)VadStateMachine.kVadInStateEndPointDetected;
+                    }
+                    else if (!is_final_frame)
+                    {
+                        OnVoiceDetected(cur_frm_idx);
+                    }
+                    else
+                    {
+                        MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx);
+                    }
+                }
+                else
+                {
+                    return;
+                }
+
+            }
+            else if (AudioChangeState.kChangeStateSil2Sil == state_change)
+            {
+                _continous_silence_frame_count += 1;
+                if (_vad_state_machine == (int)VadStateMachine.kVadInStateStartPointNotDetected)
+                {
+                    // silence timeout, return zero length decision
+                    if (((_vad_opts.detect_mode == (int)VadDetectMode.kVadSingleUtteranceDetectMode) && (
+                        _continous_silence_frame_count * frm_shift_in_ms > _vad_opts.max_start_silence_time)) || (is_final_frame && _number_end_time_detected == 0))
+                    {
+                        for (int t = _lastest_confirmed_silence_frame + 1; t < cur_frm_idx; t++)
+                        {
+                            OnSilenceDetected(t);
+                        }
+                        // Emit a fake zero-length segment and terminate.
+                        OnVoiceStart(0, true);
+                        OnVoiceEnd(0, true, false);
+                        _vad_state_machine = (int)VadStateMachine.kVadInStateEndPointDetected;
+                    }
+                    else
+                    {
+                        // Confirm silence only beyond the detection latency so a
+                        // late start point can still be back-dated.
+                        if (cur_frm_idx >= LatencyFrmNumAtStartPoint())
+                        {
+                            OnSilenceDetected(cur_frm_idx - LatencyFrmNumAtStartPoint());
+                        }
+                    }
+                }
+                else if (_vad_state_machine == (int)VadStateMachine.kVadInStateInSpeechSegment)
+                {
+                    // Enough trailing silence => confirm the end point,
+                    // back-dated by the lookback span.
+                    if (_continous_silence_frame_count * frm_shift_in_ms >= _max_end_sil_frame_cnt_thresh)
+                    {
+                        int lookback_frame = (int)(_max_end_sil_frame_cnt_thresh / frm_shift_in_ms);
+                        if (_vad_opts.do_extend != 0)
+                        {
+                            lookback_frame -= (int)(_vad_opts.lookahead_time_end_point / frm_shift_in_ms);
+                            lookback_frame -= 1;
+                            lookback_frame = Math.Max(0, lookback_frame);
+                        }
+
+                        OnVoiceEnd(cur_frm_idx - lookback_frame, false, false);
+                        _vad_state_machine = (int)VadStateMachine.kVadInStateEndPointDetected;
+                    }
+                    else if (cur_frm_idx - _confirmed_start_frame + 1 > _vad_opts.max_single_segment_time / frm_shift_in_ms)
+                    {
+                        OnVoiceEnd(cur_frm_idx, false, false);
+                        _vad_state_machine = (int)VadStateMachine.kVadInStateEndPointDetected;
+                    }
+
+                    else if (_vad_opts.do_extend != 0 && !is_final_frame)
+                    {
+                        // Within the lookahead window, silence frames are still
+                        // appended to the segment (extension).
+                        if (_continous_silence_frame_count <= (int)(_vad_opts.lookahead_time_end_point / frm_shift_in_ms))
+                        {
+                            OnVoiceDetected(cur_frm_idx);
+                        }
+                    }
+
+                    else
+                    {
+                        MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx);
+                    }
+                }
+                else
+                {
+                    return;
+                }
+
+            }
+
+            // In multi-utterance mode, start looking for the next segment
+            // as soon as an end point was emitted.
+            if (_vad_state_machine == (int)VadStateMachine.kVadInStateEndPointDetected && _vad_opts.detect_mode == (int)VadDetectMode.kVadMutipleUtteranceDetectMode)
+            {
+                ResetDetection();
+            }
+
+        }
+
+ }
+}
diff --git a/funasr/runtime/csharp/AliFsmnVadSharp/Lib/kaldi-native-fbank-dll.dll b/funasr/runtime/csharp/AliFsmnVadSharp/Lib/kaldi-native-fbank-dll.dll
new file mode 100644
index 0000000..cddc940
--- /dev/null
+++ b/funasr/runtime/csharp/AliFsmnVadSharp/Lib/kaldi-native-fbank-dll.dll
Binary files differ
diff --git a/funasr/runtime/csharp/AliFsmnVadSharp/Model/CmvnEntity.cs b/funasr/runtime/csharp/AliFsmnVadSharp/Model/CmvnEntity.cs
new file mode 100644
index 0000000..2f93df1
--- /dev/null
+++ b/funasr/runtime/csharp/AliFsmnVadSharp/Model/CmvnEntity.cs
@@ -0,0 +1,17 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace AliFsmnVadSharp.Model
+{
+    // Global CMVN statistics (per-dimension means and variances) loaded
+    // from the model's cmvn file and applied by the wav frontend.
+    internal class CmvnEntity
+    {
+        public List<float> Means { get; set; } = new List<float>();
+
+        public List<float> Vars { get; set; } = new List<float>();
+    }
+}
diff --git a/funasr/runtime/csharp/AliFsmnVadSharp/Model/E2EVadFrameProbEntity.cs b/funasr/runtime/csharp/AliFsmnVadSharp/Model/E2EVadFrameProbEntity.cs
new file mode 100644
index 0000000..58a4ca9
--- /dev/null
+++ b/funasr/runtime/csharp/AliFsmnVadSharp/Model/E2EVadFrameProbEntity.cs
@@ -0,0 +1,23 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace AliFsmnVadSharp.Model
+{
+    // Per-frame diagnostic record produced when output_frame_probs is on:
+    // log-probabilities, the speech score, the frame index and its state.
+    internal class E2EVadFrameProbEntity
+    {
+        public double noise_prob { get; set; } = 0.0;
+
+        public double speech_prob { get; set; } = 0.0;
+
+        public double score { get; set; } = 0.0;
+
+        public int frame_id { get; set; } = 0;
+
+        public int frm_state { get; set; } = 0;
+    }
+}
diff --git a/funasr/runtime/csharp/AliFsmnVadSharp/Model/E2EVadSpeechBufWithDoaEntity.cs b/funasr/runtime/csharp/AliFsmnVadSharp/Model/E2EVadSpeechBufWithDoaEntity.cs
new file mode 100644
index 0000000..8c2e7f7
--- /dev/null
+++ b/funasr/runtime/csharp/AliFsmnVadSharp/Model/E2EVadSpeechBufWithDoaEntity.cs
@@ -0,0 +1,98 @@
+// AliFsmnVadSharp, Version=1.0.0.0, Culture=neutral, PublicKeyToken=null
+// AliFsmnVadSharp.Model.E2EVadSpeechBufWithDoaEntity
+// One confirmed (or still growing) speech segment: start/end time in
+// milliseconds, an optional raw-audio buffer, flags saying whether the
+// segment's start/end boundaries are contained, and a direction-of-
+// arrival value (always 0 in this port).
+internal class E2EVadSpeechBufWithDoaEntity
+{
+    public int start_ms { get; set; }
+
+    public int end_ms { get; set; }
+
+    // Null until Reset() is called, then an (empty) byte array.
+    public byte[]? buffer { get; set; }
+
+    public bool contain_seg_start_point { get; set; }
+
+    public bool contain_seg_end_point { get; set; }
+
+    public int doa { get; set; }
+
+    // Returns the entity to its pristine state, except that buffer
+    // becomes an empty array rather than null.
+    public void Reset()
+    {
+        start_ms = 0;
+        end_ms = 0;
+        buffer = new byte[0];
+        contain_seg_start_point = false;
+        contain_seg_end_point = false;
+        doa = 0;
+    }
+}
diff --git a/funasr/runtime/csharp/AliFsmnVadSharp/Model/EncoderConfEntity.cs b/funasr/runtime/csharp/AliFsmnVadSharp/Model/EncoderConfEntity.cs
new file mode 100644
index 0000000..8365b12
--- /dev/null
+++ b/funasr/runtime/csharp/AliFsmnVadSharp/Model/EncoderConfEntity.cs
@@ -0,0 +1,35 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace AliFsmnVadSharp.Model
+{
+    // FSMN encoder hyper-parameters, bound from the yaml's encoder_conf
+    // section; defaults match the shipped fsmn-vad model.
+    public class EncoderConfEntity
+    {
+        public int input_dim { get; set; } = 400;
+        public int input_affine_dim { get; set; } = 140;
+        public int fsmn_layers { get; set; } = 4;
+        public int linear_dim { get; set; } = 250;
+        public int proj_dim { get; set; } = 128;
+        // Left/right memory order and stride of each FSMN block.
+        public int lorder { get; set; } = 20;
+        public int rorder { get; set; } = 0;
+        public int lstride { get; set; } = 1;
+        public int rstride { get; set; } = 0;
+        public int output_affine_dim { get; set; } = 140;
+        public int output_dim { get; set; } = 248;
+    }
+}
diff --git a/funasr/runtime/csharp/AliFsmnVadSharp/Model/FrontendConfEntity.cs b/funasr/runtime/csharp/AliFsmnVadSharp/Model/FrontendConfEntity.cs
new file mode 100644
index 0000000..22bb35a
--- /dev/null
+++ b/funasr/runtime/csharp/AliFsmnVadSharp/Model/FrontendConfEntity.cs
@@ -0,0 +1,29 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace AliFsmnVadSharp.Model
+{
+    // Wav frontend (fbank) settings, bound from the yaml's frontend_conf
+    // section; defaults match the shipped fsmn-vad model.
+    public class FrontendConfEntity
+    {
+        public int fs { get; set; } = 16000;            // sample rate, Hz
+        public string window { get; set; } = "hamming"; // analysis window
+        public int n_mels { get; set; } = 80;           // mel bins
+        public int frame_length { get; set; } = 25;     // ms
+        public int frame_shift { get; set; } = 10;      // ms
+        public float dither { get; set; } = 0.0F;
+        public int lfr_m { get; set; } = 5;             // low-frame-rate stacking
+        public int lfr_n { get; set; } = 1;             // low-frame-rate subsampling
+    }
+}
diff --git a/funasr/runtime/csharp/AliFsmnVadSharp/Model/SegmentEntity.cs b/funasr/runtime/csharp/AliFsmnVadSharp/Model/SegmentEntity.cs
new file mode 100644
index 0000000..bdb715d
--- /dev/null
+++ b/funasr/runtime/csharp/AliFsmnVadSharp/Model/SegmentEntity.cs
@@ -0,0 +1,22 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace AliFsmnVadSharp.Model
+{
+    // VAD result for one batch item: the detected [start_ms, end_ms]
+    // pairs and (optionally) the matching audio snippets.
+    public class SegmentEntity
+    {
+        public List<int[]> Segment { get; set; } = new List<int[]>();
+
+        public List<float[]> Waveform { get; set; } = new List<float[]>();
+    }
+}
diff --git a/funasr/runtime/csharp/AliFsmnVadSharp/Model/VadInputEntity.cs b/funasr/runtime/csharp/AliFsmnVadSharp/Model/VadInputEntity.cs
new file mode 100644
index 0000000..fcd63d8
--- /dev/null
+++ b/funasr/runtime/csharp/AliFsmnVadSharp/Model/VadInputEntity.cs
@@ -0,0 +1,23 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace AliFsmnVadSharp.Model
+{
+    // Input bundle for one VAD inference call.
+    internal class VadInputEntity
+    {
+        private float[]? _speech;       // samples fed to the onnx model
+        private int _speechLength;
+        // Presumably the FSMN hidden-state caches carried between chunks
+        // in streaming mode -- TODO confirm against AliFsmnVad usage.
+        private List<float[]> _inCaches = new List<float[]>();
+        private float[]? _waveform;     // raw audio kept for post-processing
+        private E2EVadModel _vad_scorer;// per-stream post-processing state
+
+        public float[]? Speech { get => _speech; set => _speech = value; }
+        public int SpeechLength { get => _speechLength; set => _speechLength = value; }
+        public List<float[]> InCaches { get => _inCaches; set => _inCaches = value; }
+        // NOTE(review): declared non-nullable but returns the nullable
+        // _waveform field, so it may hand out null.
+        public float[] Waveform { get => _waveform; set => _waveform = value; }
+        internal E2EVadModel VadScorer { get => _vad_scorer; set => _vad_scorer = value; }
+    }
+}
diff --git a/funasr/runtime/csharp/AliFsmnVadSharp/Model/VadOutputEntity.cs b/funasr/runtime/csharp/AliFsmnVadSharp/Model/VadOutputEntity.cs
new file mode 100644
index 0000000..fa8639e
--- /dev/null
+++ b/funasr/runtime/csharp/AliFsmnVadSharp/Model/VadOutputEntity.cs
@@ -0,0 +1,19 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace AliFsmnVadSharp.Model
+{
+    // Output bundle of one VAD inference call.
+    internal class VadOutputEntity
+    {
+        // Acoustic posteriors, indexed [batch, frame, class] by the
+        // post-processor (batch is asserted to be 1 there).
+        private float[,,]? _scores;
+        // Updated caches to feed into the next chunk's InCaches.
+        private List<float[]> _outCaches=new List<float[]>();
+        private float[]? _waveform;
+
+        public float[,,]? Scores { get => _scores; set => _scores = value; }
+        public List<float[]> OutCaches { get => _outCaches; set => _outCaches = value; }
+        // NOTE(review): non-nullable getter returns a nullable field.
+        public float[] Waveform { get => _waveform; set => _waveform = value; }
+    }
+}
diff --git a/funasr/runtime/csharp/AliFsmnVadSharp/Model/VadPostConfEntity.cs b/funasr/runtime/csharp/AliFsmnVadSharp/Model/VadPostConfEntity.cs
new file mode 100644
index 0000000..e566cf2
--- /dev/null
+++ b/funasr/runtime/csharp/AliFsmnVadSharp/Model/VadPostConfEntity.cs
@@ -0,0 +1,72 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace AliFsmnVadSharp.Model
+{
+    // Post-processing configuration of the FSMN VAD, bound from the
+    // yaml's vad_post_conf section.  Defaults mirror the shipped model;
+    // times are milliseconds and levels are decibels unless noted.
+    public class VadPostConfEntity
+    {
+        public int sample_rate { get; set; } = 16000;
+        // 0 = single utterance, 1 = multiple utterances (VadDetectMode).
+        public int detect_mode { get; set; } = 1;
+        public int snr_mode { get; set; } = 0;
+        public int max_end_silence_time { get; set; } = 800;
+        public int max_start_silence_time { get; set; } = 3000;
+        public bool do_start_point_detection { get; set; } = true;
+        public bool do_end_point_detection { get; set; } = true;
+        // Sliding-window size of the sil<->speech transition detector.
+        public int window_size_ms { get; set; } = 200;
+        public int sil_to_speech_time_thres { get; set; } = 150;
+        public int speech_to_sil_time_thres { get; set; } = 150;
+        public float speech_2_noise_ratio { get; set; } = 1.0F;
+        // Non-zero enables segment extension (lookback / lookahead).
+        public int do_extend { get; set; } = 1;
+        public int lookback_time_start_point { get; set; } = 200;
+        public int lookahead_time_end_point { get; set; } = 100;
+        public int max_single_segment_time { get; set; } = 60000;
+        public int nn_eval_block_size { get; set; } = 8;
+        public int dcd_block_size { get; set; } = 4;
+        public float snr_thres { get; set; } = -100.0F;
+        public int noise_frame_num_used_for_snr { get; set; } = 100;
+        public float decibel_thres { get; set; } = -100.0F;
+        public float speech_noise_thres { get; set; } = 0.6F;
+        public float fe_prior_thres { get; set; } = 0.0001F;
+        public int silence_pdf_num { get; set; } = 1;
+        // Output-class ids treated as silence by the post-processor.
+        public int[] sil_pdf_ids { get; set; } = new int[] { 0 };
+        public float speech_noise_thresh_low { get; set; } = -0.1F;
+        public float speech_noise_thresh_high { get; set; } = 0.3F;
+        public bool output_frame_probs { get; set; } = false;
+        // Frame shift and frame length of the decibel computation.
+        public int frame_in_ms { get; set; } = 10;
+        public int frame_length_ms { get; set; } = 25;
+    }
+}
diff --git a/funasr/runtime/csharp/AliFsmnVadSharp/Model/VadYamlEntity.cs b/funasr/runtime/csharp/AliFsmnVadSharp/Model/VadYamlEntity.cs
new file mode 100644
index 0000000..65e77ed
--- /dev/null
+++ b/funasr/runtime/csharp/AliFsmnVadSharp/Model/VadYamlEntity.cs
@@ -0,0 +1,27 @@
+锘縰sing System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace AliFsmnVadSharp.Model
+{
+ internal class VadYamlEntity
+ {
+ private int _input_size;
+ private string _frontend = "wav_frontend";
+ private FrontendConfEntity _frontend_conf=new FrontendConfEntity();
+ private string _model = "e2evad";
+ private string _encoder = "fsmn";
+ private EncoderConfEntity _encoder_conf=new EncoderConfEntity();
+ private VadPostConfEntity _vad_post_conf=new VadPostConfEntity();
+
+ public int input_size { get => _input_size; set => _input_size = value; }
+ public string frontend { get => _frontend; set => _frontend = value; }
+ public string model { get => _model; set => _model = value; }
+ public string encoder { get => _encoder; set => _encoder = value; }
+ public FrontendConfEntity frontend_conf { get => _frontend_conf; set => _frontend_conf = value; }
+ public EncoderConfEntity encoder_conf { get => _encoder_conf; set => _encoder_conf = value; }
+ public VadPostConfEntity vad_post_conf { get => _vad_post_conf; set => _vad_post_conf = value; }
+ }
+}
diff --git a/funasr/runtime/csharp/AliFsmnVadSharp/Struct/FbankData.cs b/funasr/runtime/csharp/AliFsmnVadSharp/Struct/FbankData.cs
new file mode 100644
index 0000000..bbad3dc
--- /dev/null
+++ b/funasr/runtime/csharp/AliFsmnVadSharp/Struct/FbankData.cs
@@ -0,0 +1,6 @@
+锘縰sing System.Runtime.InteropServices;
+
+namespace AliFsmnVadSharp.Struct
+{
+
+}
diff --git a/funasr/runtime/csharp/AliFsmnVadSharp/Utils/YamlHelper.cs b/funasr/runtime/csharp/AliFsmnVadSharp/Utils/YamlHelper.cs
new file mode 100644
index 0000000..0b460ff
--- /dev/null
+++ b/funasr/runtime/csharp/AliFsmnVadSharp/Utils/YamlHelper.cs
@@ -0,0 +1,28 @@
+锘縰sing System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using System.Text.Json;
+using YamlDotNet.Serialization;
+
+namespace AliFsmnVadSharp.Utils
+{
+ internal class YamlHelper
+ {
+ public static T ReadYaml<T>(string yamlFilePath)
+ {
+ if (!File.Exists(yamlFilePath))
+ {
+#pragma warning disable CS8603 // 鍙兘杩斿洖 null 寮曠敤銆�
+ return default(T);
+#pragma warning restore CS8603 // 鍙兘杩斿洖 null 寮曠敤銆�
+ }
+ StreamReader yamlReader = File.OpenText(yamlFilePath);
+ Deserializer yamlDeserializer = new Deserializer();
+ T info = yamlDeserializer.Deserialize<T>(yamlReader);
+ yamlReader.Close();
+ return info;
+ }
+ }
+}
diff --git a/funasr/runtime/csharp/AliFsmnVadSharp/WavFrontend.cs b/funasr/runtime/csharp/AliFsmnVadSharp/WavFrontend.cs
new file mode 100644
index 0000000..2c5b50f
--- /dev/null
+++ b/funasr/runtime/csharp/AliFsmnVadSharp/WavFrontend.cs
@@ -0,0 +1,185 @@
+锘縰sing System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using AliFsmnVadSharp.Model;
+using AliFsmnVadSharp.DLL;
+using AliFsmnVadSharp.Struct;
+using System.Runtime.InteropServices;
+
+namespace AliFsmnVadSharp
+{
+ internal class WavFrontend
+ {
+ private string _mvnFilePath;
+ private FrontendConfEntity _frontendConfEntity;
+ IntPtr _opts = IntPtr.Zero;
+ private CmvnEntity _cmvnEntity;
+
+ private static int _fbank_beg_idx = 0;
+
+ public WavFrontend(string mvnFilePath, FrontendConfEntity frontendConfEntity)
+ {
+ _mvnFilePath = mvnFilePath;
+ _frontendConfEntity = frontendConfEntity;
+ _fbank_beg_idx = 0;
+ _opts = KaldiNativeFbank.GetFbankOptions(
+ dither: _frontendConfEntity.dither,
+ snip_edges: true,
+ sample_rate: _frontendConfEntity.fs,
+ num_bins: _frontendConfEntity.n_mels
+ );
+ _cmvnEntity = LoadCmvn(mvnFilePath);
+ }
+
+ public float[] GetFbank(float[] samples)
+ {
+ float sample_rate = _frontendConfEntity.fs;
+ samples = samples.Select((float x) => x * 32768f).ToArray();
+ // method1
+ //FbankDatas fbankDatas = new FbankDatas();
+ //KaldiNativeFbank.GetFbanks(_knfOnlineFbank, framesNum,ref fbankDatas);
+ // method2
+ KnfOnlineFbank _knfOnlineFbank = KaldiNativeFbank.GetOnlineFbank(_opts);
+ KaldiNativeFbank.AcceptWaveform(_knfOnlineFbank, sample_rate, samples, samples.Length);
+ KaldiNativeFbank.InputFinished(_knfOnlineFbank);
+ int framesNum = KaldiNativeFbank.GetNumFramesReady(_knfOnlineFbank);
+ float[] fbanks = new float[framesNum * 80];
+ for (int i = 0; i < framesNum; i++)
+ {
+ FbankData fbankData = new FbankData();
+ KaldiNativeFbank.GetFbank(_knfOnlineFbank, i, ref fbankData);
+ float[] _fbankData = new float[fbankData.data_length];
+ Marshal.Copy(fbankData.data, _fbankData, 0, fbankData.data_length);
+ Array.Copy(_fbankData, 0, fbanks, i * 80, _fbankData.Length);
+ fbankData.data = IntPtr.Zero;
+ _fbankData = null;
+ }
+
+ samples = null;
+ GC.Collect();
+ return fbanks;
+ }
+
+
+ public float[] LfrCmvn(float[] fbanks)
+ {
+ float[] features = fbanks;
+ if (_frontendConfEntity.lfr_m != 1 || _frontendConfEntity.lfr_n != 1)
+ {
+ features = ApplyLfr(fbanks, _frontendConfEntity.lfr_m, _frontendConfEntity.lfr_n);
+ }
+ if (_cmvnEntity != null)
+ {
+ features = ApplyCmvn(features);
+ }
+ return features;
+ }
+
+ private float[] ApplyCmvn(float[] inputs)
+ {
+ var arr_neg_mean = _cmvnEntity.Means;
+ float[] neg_mean = arr_neg_mean.Select(x => (float)Convert.ToDouble(x)).ToArray();
+ var arr_inv_stddev = _cmvnEntity.Vars;
+ float[] inv_stddev = arr_inv_stddev.Select(x => (float)Convert.ToDouble(x)).ToArray();
+
+ int dim = neg_mean.Length;
+ int num_frames = inputs.Length / dim;
+
+ for (int i = 0; i < num_frames; i++)
+ {
+ for (int k = 0; k != dim; ++k)
+ {
+ inputs[dim * i + k] = (inputs[dim * i + k] + neg_mean[k]) * inv_stddev[k];
+ }
+ }
+ return inputs;
+ }
+
+ public float[] ApplyLfr(float[] inputs, int lfr_m, int lfr_n)
+ {
+ int t = inputs.Length / 80;
+ int t_lfr = (int)Math.Floor((double)(t / lfr_n));
+ float[] input_0 = new float[80];
+ Array.Copy(inputs, 0, input_0, 0, 80);
+ int tile_x = (lfr_m - 1) / 2;
+ t = t + tile_x;
+ float[] inputs_temp = new float[t * 80];
+ for (int i = 0; i < tile_x; i++)
+ {
+ Array.Copy(input_0, 0, inputs_temp, tile_x * 80, 80);
+ }
+ Array.Copy(inputs, 0, inputs_temp, tile_x * 80, inputs.Length);
+ inputs = inputs_temp;
+
+ float[] LFR_outputs = new float[t_lfr * lfr_m * 80];
+ for (int i = 0; i < t_lfr; i++)
+ {
+ if (lfr_m <= t - i * lfr_n)
+ {
+ Array.Copy(inputs, i * lfr_n * 80, LFR_outputs, i* lfr_m * 80, lfr_m * 80);
+ }
+ else
+ {
+ // process last LFR frame
+ int num_padding = lfr_m - (t - i * lfr_n);
+ float[] frame = new float[lfr_m * 80];
+ Array.Copy(inputs, i * lfr_n * 80, frame, 0, (t - i * lfr_n) * 80);
+
+ for (int j = 0; j < num_padding; j++)
+ {
+ Array.Copy(inputs, (t - 1) * 80, frame, (lfr_m - num_padding + j) * 80, 80);
+ }
+ Array.Copy(frame, 0, LFR_outputs, i * lfr_m * 80, frame.Length);
+ }
+ }
+ return LFR_outputs;
+ }
+
+ private CmvnEntity LoadCmvn(string mvnFilePath)
+ {
+ List<float> means_list = new List<float>();
+ List<float> vars_list = new List<float>();
+ FileStreamOptions options = new FileStreamOptions();
+ options.Access = FileAccess.Read;
+ options.Mode = FileMode.Open;
+ StreamReader srtReader = new StreamReader(mvnFilePath, options);
+ int i = 0;
+ while (!srtReader.EndOfStream)
+ {
+ string? strLine = srtReader.ReadLine();
+ if (!string.IsNullOrEmpty(strLine))
+ {
+ if (strLine.StartsWith("<AddShift>"))
+ {
+ i=1;
+ continue;
+ }
+ if (strLine.StartsWith("<Rescale>"))
+ {
+ i = 2;
+ continue;
+ }
+ if (strLine.StartsWith("<LearnRateCoef>") && i==1)
+ {
+ string[] add_shift_line = strLine.Substring(strLine.IndexOf("[") + 1, strLine.LastIndexOf("]") - strLine.IndexOf("[") - 1).Split(" ");
+ means_list = add_shift_line.Where(x => !string.IsNullOrEmpty(x)).Select(x => float.Parse(x.Trim())).ToList();
+ continue;
+ }
+ if (strLine.StartsWith("<LearnRateCoef>") && i==2)
+ {
+ string[] rescale_line = strLine.Substring(strLine.IndexOf("[") + 1, strLine.LastIndexOf("]") - strLine.IndexOf("[") - 1).Split(" ");
+ vars_list = rescale_line.Where(x => !string.IsNullOrEmpty(x)).Select(x => float.Parse(x.Trim())).ToList();
+ continue;
+ }
+ }
+ }
+ CmvnEntity cmvnEntity = new CmvnEntity();
+ cmvnEntity.Means = means_list;
+ cmvnEntity.Vars = vars_list;
+ return cmvnEntity;
+ }
+
+ }
+}
diff --git a/funasr/runtime/csharp/AliFsmnVadSharp/WindowDetector.cs b/funasr/runtime/csharp/AliFsmnVadSharp/WindowDetector.cs
new file mode 100644
index 0000000..785af32
--- /dev/null
+++ b/funasr/runtime/csharp/AliFsmnVadSharp/WindowDetector.cs
@@ -0,0 +1,156 @@
+锘縰sing System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace AliFsmnVadSharp
+{
+ public enum FrameState
+ {
+ kFrameStateInvalid = -1,
+ kFrameStateSpeech = 1,
+ kFrameStateSil = 0
+ }
+
+ /// <summary>
+ /// final voice/unvoice state per frame
+ /// </summary>
+ public enum AudioChangeState
+ {
+ kChangeStateSpeech2Speech = 0,
+ kChangeStateSpeech2Sil = 1,
+ kChangeStateSil2Sil = 2,
+ kChangeStateSil2Speech = 3,
+ kChangeStateNoBegin = 4,
+ kChangeStateInvalid = 5
+ }
+
+
+ internal class WindowDetector
+ {
+ private int _window_size_ms = 0; //window_size_ms;
+ private int _sil_to_speech_time = 0; //sil_to_speech_time;
+ private int _speech_to_sil_time = 0; //speech_to_sil_time;
+ private int _frame_size_ms = 0; //frame_size_ms;
+
+ private int _win_size_frame = 0;
+ private int _win_sum = 0;
+ private int[] _win_state = new int[0];// * _win_size_frame; // 鍒濆鍖栫獥
+
+ private int _cur_win_pos = 0;
+ private int _pre_frame_state = (int)FrameState.kFrameStateSil;
+ private int _cur_frame_state = (int)FrameState.kFrameStateSil;
+ private int _sil_to_speech_frmcnt_thres = 0; //int(sil_to_speech_time / frame_size_ms);
+ private int _speech_to_sil_frmcnt_thres = 0; //int(speech_to_sil_time / frame_size_ms);
+
+ private int _voice_last_frame_count = 0;
+ private int _noise_last_frame_count = 0;
+ private int _hydre_frame_count = 0;
+
+ public WindowDetector()
+ {
+
+ }
+
+ public WindowDetector(int window_size_ms, int sil_to_speech_time, int speech_to_sil_time, int frame_size_ms)
+ {
+ _window_size_ms = window_size_ms;
+ _sil_to_speech_time = sil_to_speech_time;
+ _speech_to_sil_time = speech_to_sil_time;
+ _frame_size_ms = frame_size_ms;
+
+ _win_size_frame = (int)(window_size_ms / frame_size_ms);
+ _win_sum = 0;
+ _win_state = new int[_win_size_frame];//[0] * _win_size_frame; // 鍒濆鍖栫獥
+
+ _cur_win_pos = 0;
+ _pre_frame_state = (int)FrameState.kFrameStateSil;
+ _cur_frame_state = (int)FrameState.kFrameStateSil;
+ _sil_to_speech_frmcnt_thres = (int)(sil_to_speech_time / frame_size_ms);
+ _speech_to_sil_frmcnt_thres = (int)(speech_to_sil_time / frame_size_ms);
+
+ _voice_last_frame_count = 0;
+ _noise_last_frame_count = 0;
+ _hydre_frame_count = 0;
+ }
+
+ public void Reset()
+ {
+ _cur_win_pos = 0;
+ _win_sum = 0;
+ _win_state = new int[_win_size_frame];
+ _pre_frame_state = (int)FrameState.kFrameStateSil;
+ _cur_frame_state = (int)FrameState.kFrameStateSil;
+ _voice_last_frame_count = 0;
+ _noise_last_frame_count = 0;
+ _hydre_frame_count = 0;
+ }
+
+
+ public int GetWinSize()
+ {
+ return _win_size_frame;
+ }
+
+ public AudioChangeState DetectOneFrame(FrameState frameState, int frame_count)
+ {
+
+
+ _cur_frame_state = (int)FrameState.kFrameStateSil;
+ if (frameState == FrameState.kFrameStateSpeech)
+ {
+ _cur_frame_state = 1;
+ }
+
+ else if (frameState == FrameState.kFrameStateSil)
+ {
+ _cur_frame_state = 0;
+ }
+
+ else
+ {
+ return AudioChangeState.kChangeStateInvalid;
+ }
+
+ _win_sum -= _win_state[_cur_win_pos];
+ _win_sum += _cur_frame_state;
+ _win_state[_cur_win_pos] = _cur_frame_state;
+ _cur_win_pos = (_cur_win_pos + 1) % _win_size_frame;
+
+ if (_pre_frame_state == (int)FrameState.kFrameStateSil && _win_sum >= _sil_to_speech_frmcnt_thres)
+ {
+ _pre_frame_state = (int)FrameState.kFrameStateSpeech;
+ return AudioChangeState.kChangeStateSil2Speech;
+ }
+
+
+ if (_pre_frame_state == (int)FrameState.kFrameStateSpeech && _win_sum <= _speech_to_sil_frmcnt_thres)
+ {
+ _pre_frame_state = (int)FrameState.kFrameStateSil;
+ return AudioChangeState.kChangeStateSpeech2Sil;
+ }
+
+
+ if (_pre_frame_state == (int)FrameState.kFrameStateSil)
+ {
+ return AudioChangeState.kChangeStateSil2Sil;
+ }
+
+ if (_pre_frame_state == (int)FrameState.kFrameStateSpeech)
+ {
+ return AudioChangeState.kChangeStateSpeech2Speech;
+ }
+
+ return AudioChangeState.kChangeStateInvalid;
+ }
+
+ private int FrameSizeMs()
+ {
+ return _frame_size_ms;
+ }
+
+
+
+ }
+}
diff --git a/funasr/runtime/csharp/README.md b/funasr/runtime/csharp/README.md
new file mode 100644
index 0000000..68175cd
--- /dev/null
+++ b/funasr/runtime/csharp/README.md
@@ -0,0 +1,59 @@
+# AliFsmnVadSharp
+##### 简介：
+项目中使用的VAD模型是阿里巴巴达摩院提供的FSMN-Monophone VAD模型。
+**项目基于Net 6.0，使用C#编写，调用Microsoft.ML.OnnxRuntime对onnx模型进行解码，支持跨平台编译。项目以库的形式进行调用，部署非常方便。**
+VAD整体流程的rtf在0.008左右。
+
+##### 用途：
+16k中文通用VAD模型：可用于检测长语音片段中有效语音的起止时间点.
+FSMN-Monophone VAD是达摩院语音团队提出的高效语音端点检测模型，用于检测输入音频中有效语音的起止时间点信息，并将检测出来的有效音频片段输入识别引擎进行识别，减少无效语音带来的识别错误。
+
+##### VAD常用参数调整说明（参考：vad.yaml文件）：
+max_end_silence_time：尾部连续检测到多长时间静音进行尾点判停，参数范围500ms～6000ms，默认值800ms(该值过低容易出现语音提前截断的情况)。
+speech_noise_thres：speech的得分减去noise的得分大于此值则判断为speech，参数范围：（-1,1）
+取值越趋于-1，噪音被误判定为语音的概率越大，FA越高
+取值越趋于+1，语音被误判定为噪音的概率越大，Pmiss越高
+通常情况下，该值会根据当前模型在长语音测试集上的效果取balance
+
+##### 模型获取
+
+##### 调用方式：
+###### 1.添加项目引用
+using AliFsmnVadSharp;
+
+###### 2.初始化模型和配置
+```csharp
+string applicationBase = AppDomain.CurrentDomain.BaseDirectory;
+string modelFilePath = applicationBase + "./speech_fsmn_vad_zh-cn-16k-common-pytorch/model.onnx";
+string configFilePath = applicationBase + "./speech_fsmn_vad_zh-cn-16k-common-pytorch/vad.yaml";
+string mvnFilePath = applicationBase + "./speech_fsmn_vad_zh-cn-16k-common-pytorch/vad.mvn";
+int batchSize = 2;//批量解码
+AliFsmnVad aliFsmnVad = new AliFsmnVad(modelFilePath, configFilePath, mvnFilePath, batchSize);
+```
+###### 3.调用
+方法一(适用于小文件)：
+```csharp
+SegmentEntity[] segments_duration = aliFsmnVad.GetSegments(samples);
+```
+方法二(适用于大文件)：
+```csharp
+SegmentEntity[] segments_duration = aliFsmnVad.GetSegmentsByStep(samples);
+```
+###### 4.输出结果：
+```
+load model and init config elapsed_milliseconds:463.5390625
+vad infer result:
+[[70,2340][2620,6200][6480,23670][23950,26250][26780,28990][29950,31430][31750,37600][38210,46900][47310,49630][49910,56460][56740,59540][59820,70450]]
+elapsed_milliseconds:662.796875
+total_duration:70470.625
+rtf:0.009405292985552491
+```
+输出的数据，例如：[70,2340]，是以毫秒为单位的segement的起止时间，可以以此为依据对音频进行分片。其中静音噪音部分已被去除。
+
+其他说明：
+测试用例：AliFsmnVadSharp.Examples。
+测试环境：windows11。
+测试用例中samples的计算,使用的是NAudio库。
+
+通过以下链接了解更多：
+https://www.modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary
--
Gitblit v1.9.1