runtime/csharp/AliFsmnVad/AliFsmnVadSharp.Examples/AliFsmnVadSharp.Examples.csproj
@@ -8,6 +8,7 @@ </PropertyGroup> <ItemGroup> <PackageReference Include="CommandLineParser" Version="2.9.1" /> <PackageReference Include="NAudio" Version="2.1.0" /> </ItemGroup> runtime/csharp/AliFsmnVad/AliFsmnVadSharp.Examples/Program.cs
@@ -1,61 +1,107 @@
using AliFsmnVadSharp;
using AliFsmnVadSharp.Model;
using CommandLine;
using NAudio.Wave;

/// <summary>
/// Command-line example that runs the FSMN voice-activity detector over one wav file
/// or over every "*.wav" file in a folder and prints the detected segments.
/// </summary>
internal static class Program
{
    /// <summary>
    /// Command-line options bound by CommandLineParser.
    /// </summary>
    public class ProgramParams
    {
        [Option('i', "input", Required = true, HelpText = "Input wav file/folder path.")]
        public string WavFilePath { get; set; } = string.Empty;

        [Option('m', "model", Default = "speech_fsmn_vad_zh-cn-16k-common-onnx", HelpText = "Model path.")]
        public string Model { get; set; } = string.Empty;
    }

    /// <summary>
    /// Entry point: resolves the model directory, loads the detector, runs it over the
    /// input files in batches and prints the segments followed by timing statistics.
    /// </summary>
    /// <param name="args">Raw command-line arguments.</param>
    /// <exception cref="DirectoryNotFoundException">The model directory does not exist.</exception>
    /// <exception cref="FileNotFoundException">The input path is neither a file nor a directory.</exception>
    [STAThread]
    private static void Main(string[] args)
    {
        // Value is null when the arguments could not be parsed (or help was requested);
        // bail out instead of dereferencing it.
        var argParams = Parser.Default.ParseArguments<ProgramParams>(args).Value;
        if (argParams is null)
        {
            Environment.ExitCode = 1;
            return;
        }

        // Accept the model path as given, otherwise retry relative to the application directory.
        string modelPath = argParams.Model;
        if (!Directory.Exists(modelPath))
        {
            modelPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, argParams.Model);
            if (!Directory.Exists(modelPath))
            {
                throw new DirectoryNotFoundException($"Model not found: {argParams.Model}");
            }
        }

        string modelFilePath = Path.Combine(modelPath, "model_quant.onnx");
        string configFilePath = Path.Combine(modelPath, "config.yaml");
        string mvnFilePath = Path.Combine(modelPath, "am.mvn");
        int batchSize = 1;
        using AliFsmnVad aliFsmnVad = new AliFsmnVad(modelFilePath, configFilePath, mvnFilePath, batchSize);

        List<string> wavFiles = new List<string>();
        if (File.Exists(argParams.WavFilePath))
        {
            wavFiles.Add(argParams.WavFilePath);
        }
        else if (Directory.Exists(argParams.WavFilePath))
        {
            wavFiles.AddRange(Directory.GetFiles(argParams.WavFilePath, "*.wav"));
        }
        else
        {
            throw new FileNotFoundException($"Invalid wav input path. {argParams.WavFilePath}");
        }

        // The measured time covers both wav loading and inference, as before.
        var stopwatch = System.Diagnostics.Stopwatch.StartNew();
        TimeSpan totalDuration = TimeSpan.Zero;
        for (int i = 0; i < wavFiles.Count; i += batchSize)
        {
            List<float[]> samples = new List<float[]>();
            foreach (var wavFile in wavFiles.Skip(i).Take(batchSize))
            {
                var (sample, duration) = LoadWavFile(wavFile);
                samples.Add(sample);
                totalDuration += duration;
            }

            SegmentEntity[] segments = aliFsmnVad.GetSegments(samples);
            Console.WriteLine("vad infer result:");
            foreach (SegmentEntity segment in segments)
            {
                Console.Write("[");
                foreach (var x in segment.Segment)
                {
                    Console.Write("[" + string.Join(",", x.ToArray()) + "]");
                }
                Console.Write("]\r\n");
            }
        }
        stopwatch.Stop();

        double elapsedMilliseconds = stopwatch.Elapsed.TotalMilliseconds;
        // Real-time factor: processing time divided by audio duration.
        double rtf = elapsedMilliseconds / totalDuration.TotalMilliseconds;
        Console.WriteLine("elapsed_milliseconds:{0}", elapsedMilliseconds.ToString());
        Console.WriteLine("total_duration:{0}", totalDuration.TotalMilliseconds.ToString());
        Console.WriteLine("rtf:{0}", rtf.ToString());
        Console.WriteLine("------------------------");
    }

    /// <summary>
    /// Reads an audio file through NAudio and returns its samples multiplied by 32768
    /// together with the duration reported by the reader.
    /// </summary>
    /// <param name="wavFilePath">Path of the audio file to load.</param>
    /// <returns>The scaled samples and the total play time of the file.</returns>
    private static (float[] sample, TimeSpan duration) LoadWavFile(string wavFilePath)
    {
        using var reader = new AudioFileReader(wavFilePath);
        byte[] datas = new byte[reader.Length];

        // A single Read call may return fewer bytes than requested; keep reading until
        // the buffer is full or the stream ends.
        int filled = 0;
        while (filled < datas.Length)
        {
            int read = reader.Read(datas, filled, datas.Length - filled);
            if (read <= 0)
            {
                break;
            }
            filled += read;
        }

        TimeSpan duration = reader.TotalTime;

        // The byte stream is reinterpreted as 4-byte floats.
        float[] sample = new float[filled / sizeof(float)];
        Buffer.BlockCopy(datas, 0, sample, 0, sample.Length * sizeof(float));

        // NOTE(review): presumably scales [-1, 1] floats to the 16-bit PCM amplitude range
        // expected by the frontend - confirm against WavFrontend.
        for (int i = 0; i < sample.Length; i++)
        {
            sample[i] *= 32768f;
        }
        return (sample, duration);
    }
}
runtime/csharp/AliFsmnVad/AliFsmnVadSharp/AliFsmnVad.cs
@@ -4,6 +4,8 @@ using Microsoft.ML.OnnxRuntime; using Microsoft.ML.OnnxRuntime.Tensors; // 模型文件下载地址:https://modelscope.cn/models/iic/speech_fsmn_vad_zh-cn-16k-common-onnx/ namespace AliFsmnVadSharp { public class AliFsmnVad : IDisposable @@ -28,9 +30,9 @@ VadYamlEntity vadYamlEntity = YamlHelper.ReadYaml<VadYamlEntity>(configFilePath); _wavFrontend = new WavFrontend(mvnFilePath, vadYamlEntity.frontend_conf); _frontend = vadYamlEntity.frontend; _vad_post_conf = vadYamlEntity.vad_post_conf; _vad_post_conf = vadYamlEntity.model_conf; _batchSize = batchSize; _max_end_sil = _max_end_sil != int.MinValue ? _max_end_sil : vadYamlEntity.vad_post_conf.max_end_silence_time; _max_end_sil = _max_end_sil != int.MinValue ? _max_end_sil : vadYamlEntity.model_conf.max_end_silence_time; _encoderConfEntity = vadYamlEntity.encoder_conf; ILoggerFactory loggerFactory = new LoggerFactory(); runtime/csharp/AliFsmnVad/AliFsmnVadSharp/Model/VadYamlEntity.cs
@@ -22,6 +22,6 @@ public string encoder { get => _encoder; set => _encoder = value; } public FrontendConfEntity frontend_conf { get => _frontend_conf; set => _frontend_conf = value; } public EncoderConfEntity encoder_conf { get => _encoder_conf; set => _encoder_conf = value; } public VadPostConfEntity vad_post_conf { get => _vad_post_conf; set => _vad_post_conf = value; } public VadPostConfEntity model_conf { get => _vad_post_conf; set => _vad_post_conf = value; } } } runtime/csharp/AliParaformerAsr/AliParaformerAsr.Examples/AliParaformerAsr.Examples.csproj
@@ -5,6 +5,7 @@ <TargetFramework>net6.0</TargetFramework> <ImplicitUsings>enable</ImplicitUsings> <Nullable>enable</Nullable> <Platforms>AnyCPU;x64</Platforms> </PropertyGroup> <ItemGroup> @@ -12,6 +13,7 @@ </ItemGroup> <ItemGroup> <PackageReference Include="CommandLineParser" Version="2.9.1" /> <PackageReference Include="NAudio" Version="2.1.0" /> </ItemGroup> runtime/csharp/AliParaformerAsr/AliParaformerAsr.Examples/Program.cs
@@ -1,66 +1,116 @@
using AliParaformerAsr;
using CommandLine;
using NAudio.Wave;

/// <summary>
/// Command-line example that transcribes one wav file, or every "*.wav" file in a folder,
/// with the Paraformer offline recognizer and prints the text plus timing statistics.
/// </summary>
internal static class Program
{
    /// <summary>
    /// Command-line options bound by CommandLineParser.
    /// </summary>
    public class ProgramParams
    {
        [Option('i', "input", Required = true, HelpText = "Input wav file/folder path.")]
        public string WavFilePath { get; set; } = string.Empty;

        [Option('m', "model", Default = "speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-onnx", HelpText = "Model path.")]
        public string Model { get; set; } = string.Empty;
    }

    /// <summary>
    /// Entry point: resolves the model directory, loads all input audio, runs recognition
    /// batch by batch and reports the elapsed time and real-time factor.
    /// </summary>
    /// <param name="args">Raw command-line arguments.</param>
    /// <exception cref="DirectoryNotFoundException">The model directory does not exist.</exception>
    /// <exception cref="FileNotFoundException">The input path is neither a file nor a directory.</exception>
    [STAThread]
    private static void Main(string[] args)
    {
        // Value is null when the arguments could not be parsed (or help was requested);
        // bail out instead of dereferencing it.
        var argParams = Parser.Default.ParseArguments<ProgramParams>(args).Value;
        if (argParams is null)
        {
            Environment.ExitCode = 1;
            return;
        }

        // Accept the model path as given, otherwise retry relative to the application directory.
        string modelPath = argParams.Model;
        if (!Directory.Exists(modelPath))
        {
            modelPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, argParams.Model);
            if (!Directory.Exists(modelPath))
            {
                throw new DirectoryNotFoundException($"Model not found: {argParams.Model}");
            }
        }

        string modelFilePath = Path.Combine(modelPath, "model_quant.onnx");
        string configFilePath = Path.Combine(modelPath, "asr.yaml");
        string mvnFilePath = Path.Combine(modelPath, "am.mvn");
        string tokensFilePath = Path.Combine(modelPath, "tokens.json");

        var offlineRecognizer = new OfflineRecognizer(modelFilePath, configFilePath, mvnFilePath, tokensFilePath, OnnxRumtimeTypes.CPU);

        List<float[]> samples = new List<float[]>();
        TimeSpan totalDuration = TimeSpan.Zero;
        if (File.Exists(argParams.WavFilePath))
        {
            var (sample, duration) = LoadWavFile(argParams.WavFilePath);
            samples.Add(sample);
            totalDuration += duration;
        }
        else if (Directory.Exists(argParams.WavFilePath))
        {
            int findWavCount = 0;
            foreach (var wavFilePath in Directory.EnumerateFiles(argParams.WavFilePath, "*.wav"))
            {
                var (sample, duration) = LoadWavFile(wavFilePath);
                samples.Add(sample);
                totalDuration += duration;
                findWavCount++;
            }
            Console.WriteLine($"Total WAV files found: {findWavCount} duration:{totalDuration}");
        }
        else
        {
            throw new FileNotFoundException($"Invalid wav input path. {argParams.WavFilePath}");
        }

        // Only recognition is timed; the audio has already been loaded above.
        var stopwatch = System.Diagnostics.Stopwatch.StartNew();

        // The recognizer accepts batched input, but batching brought little gain (it looked
        // like a net slowdown); revisit once the GPU path has been optimized.
        int batchSize = 1;
        for (int i = 0; i < samples.Count; i += batchSize)
        {
            List<float[]> batch = samples.Skip(i).Take(batchSize).ToList();
            List<string> results = offlineRecognizer.GetResults(batch);
            foreach (string result in results)
            {
                Console.WriteLine(result);
                Console.WriteLine("");
            }
        }
        stopwatch.Stop();

        double elapsedMilliseconds = stopwatch.Elapsed.TotalMilliseconds;
        double rtf = elapsedMilliseconds / totalDuration.TotalMilliseconds;
        Console.WriteLine("elapsed_milliseconds:{0}", elapsedMilliseconds.ToString());
        Console.WriteLine("total_duration:{0}", totalDuration.TotalMilliseconds.ToString());

        // The real-time factor is processing time divided by audio duration.
        // For example, a 10-second clip that takes 5 seconds to process has a factor of 0.5.
        // A factor of 1 means the system processes at real-time speed.
        // The smaller the value, the faster the processing.
        Console.WriteLine("Real-Time Factor :{0}", rtf.ToString());
        Console.WriteLine("end!");
    }

    /// <summary>
    /// Reads an audio file through NAudio and returns its samples multiplied by 32768
    /// together with the duration reported by the reader.
    /// </summary>
    /// <param name="wavFilePath">Path of the audio file to load.</param>
    /// <returns>The scaled samples and the total play time of the file.</returns>
    private static (float[] sample, TimeSpan duration) LoadWavFile(string wavFilePath)
    {
        using var reader = new AudioFileReader(wavFilePath);
        byte[] datas = new byte[reader.Length];

        // A single Read call may return fewer bytes than requested; keep reading until
        // the buffer is full or the stream ends.
        int filled = 0;
        while (filled < datas.Length)
        {
            int read = reader.Read(datas, filled, datas.Length - filled);
            if (read <= 0)
            {
                break;
            }
            filled += read;
        }

        TimeSpan duration = reader.TotalTime;

        // The byte stream is reinterpreted as 4-byte floats.
        float[] sample = new float[filled / sizeof(float)];
        Buffer.BlockCopy(datas, 0, sample, 0, sample.Length * sizeof(float));

        // NOTE(review): presumably scales [-1, 1] floats to the 16-bit PCM amplitude range
        // expected by the frontend - confirm against WavFrontend.
        for (int i = 0; i < sample.Length; i++)
        {
            sample[i] *= 32768f;
        }
        return (sample, duration);
    }
}
runtime/csharp/AliParaformerAsr/AliParaformerAsr/AliParaformerAsr.csproj
@@ -9,7 +9,9 @@ <ItemGroup> <PackageReference Include="KaldiNativeFbankSharp" Version="1.0.8" /> <PackageReference Include="Microsoft.Extensions.Logging" Version="7.0.0" /> <PackageReference Include="Microsoft.ML.OnnxRuntime" Version="1.15.1" /> <PackageReference Include="Microsoft.ML.OnnxRuntime" Version="1.17.3" /> <PackageReference Include="Microsoft.ML.OnnxRuntime.Managed" Version="1.17.3" /> <PackageReference Include="Newtonsoft.Json" Version="13.0.3" /> <PackageReference Include="YamlDotNet" Version="13.1.1" /> </ItemGroup> runtime/csharp/AliParaformerAsr/AliParaformerAsr/OfflineRecognizer.cs
@@ -9,9 +9,20 @@
using Microsoft.ML.OnnxRuntime.Tensors;
using Microsoft.Extensions.Logging;
using System.Text.RegularExpressions;
using Newtonsoft.Json.Linq;

// Model files: https://modelscope.cn/models/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-onnx
namespace AliParaformerAsr
{
    /// <summary>
    /// Selects which ONNX Runtime execution provider the recognizer appends to its session options.
    /// </summary>
    public enum OnnxRumtimeTypes
    {
        CPU = 0,
        DML = 1,
        CUDA = 2,
    }

    /// <summary>
    /// offline recognizer package
    /// Copyright (c) 2023 by manyeyes
@@ -24,35 +35,70 @@
        private string _frontend;
        private FrontendConfEntity _frontendConfEntity;
        private string[] _tokens;
        private int _batchSize = 1;

        /// <summary>
        /// Creates the inference session, loads the token table and initializes the wav frontend.
        /// </summary>
        /// <param name="modelFilePath">Path of the ONNX model file.</param>
        /// <param name="configFilePath">Path of the YAML configuration file.</param>
        /// <param name="mvnFilePath">Path of the mvn file handed to the wav frontend.</param>
        /// <param name="tokensFilePath">Token table: a ".txt" file with one token per line or a ".json" array.</param>
        /// <param name="rumtimeType">Execution provider. A GPU provider can be selected, but it is currently not recommended because the performance gain is limited.</param>
        /// <param name="deviceId">Device id; selects which GPU to run on when several are present. Ignored for the CPU provider.</param>
        /// <exception cref="ArgumentException">The tokens file is neither ".txt" nor ".json".</exception>
        public OfflineRecognizer(string modelFilePath, string configFilePath, string mvnFilePath, string tokensFilePath, OnnxRumtimeTypes rumtimeType = OnnxRumtimeTypes.CPU, int deviceId = 0)
        {
            var options = new SessionOptions();
            switch (rumtimeType)
            {
                case OnnxRumtimeTypes.DML:
                    options.AppendExecutionProvider_DML(deviceId);
                    break;
                case OnnxRumtimeTypes.CUDA:
                    options.AppendExecutionProvider_CUDA(deviceId);
                    break;
                default:
                    // The CPU provider's argument is the "use arena" flag, not a device id;
                    // pass the constant 0 so a non-zero deviceId cannot silently toggle it.
                    options.AppendExecutionProvider_CPU(0);
                    break;
            }
            //options.LogSeverityLevel = OrtLoggingLevel.ORT_LOGGING_LEVEL_INFO;

            _onnxSession = new InferenceSession(modelFilePath, options);

            // The token table may be plain text (one token per line) or a JSON array.
            // The extension check is ordinal and case-insensitive so "TOKENS.JSON" also works.
            string[] tokenLines;
            if (tokensFilePath.EndsWith(".txt", StringComparison.OrdinalIgnoreCase))
            {
                tokenLines = File.ReadAllLines(tokensFilePath);
            }
            else if (tokensFilePath.EndsWith(".json", StringComparison.OrdinalIgnoreCase))
            {
                string jsonContent = File.ReadAllText(tokensFilePath);
                JArray tokenArray = JArray.Parse(jsonContent);
                tokenLines = tokenArray.Select(t => t.ToString()).ToArray();
            }
            else
            {
                throw new ArgumentException("Invalid tokens file format. Only .txt and .json are supported.");
            }
            _tokens = tokenLines;

            OfflineYamlEntity offlineYamlEntity = YamlHelper.ReadYaml<OfflineYamlEntity>(configFilePath);
            _wavFrontend = new WavFrontend(mvnFilePath, offlineYamlEntity.frontend_conf);
            _frontend = offlineYamlEntity.frontend;
            _frontendConfEntity = offlineYamlEntity.frontend_conf;
            ILoggerFactory loggerFactory = new LoggerFactory();
            _logger = new Logger<OfflineRecognizer>(loggerFactory);
        }

        /// <summary>
        /// Recognizes a batch of audio samples: extracts features, runs the model forward
        /// pass and decodes the resulting token ids.
        /// </summary>
        /// <param name="samples">One float array of audio samples per utterance.</param>
        /// <returns>The decoded text results.</returns>
        public List<string> GetResults(List<float[]> samples)
        {
            _logger.LogInformation("get features begin");
            List<OfflineInputEntity> offlineInputEntities = ExtractFeats(samples);
            OfflineOutputEntity modelOutput = Forward(offlineInputEntities);
            List<string> text_results = DecodeMulti(modelOutput.Token_nums);
            return text_results;
        }
@@ -156,15 +202,18 @@
                    {
                        break;
                    }

                    string tokenChar = _tokens[token];

                    if (tokenChar != "</s>" && tokenChar != "<s>" && tokenChar != "<blank>")
                    {
                        if (IsChinese(tokenChar, true))
                        {
                            text_result += tokenChar;
                        }
                        else
                        {
                            text_result += "▁" + tokenChar + "▁";
                        }
                    }
                }
runtime/csharp/AliParaformerAsr/AliParaformerAsr/Utils/YamlHelper.cs
@@ -14,16 +14,19 @@ /// YamlHelper /// Copyright (c) 2023 by manyeyes /// </summary> internal class YamlHelper internal class YamlHelper { public static T ReadYaml<T>(string yamlFilePath) public static T ReadYaml<T>(string yamlFilePath) where T:new() { if (!File.Exists(yamlFilePath)) { #pragma warning disable CS8603 // 可能返回 null 引用。 return default(T); #pragma warning restore CS8603 // 可能返回 null 引用。 // 如果允许返回默认对象,则新建一个默认对象,否则应该是抛出异常 // If allowing to return a default object, create a new default object; otherwise, throw an exception return new T(); // throw new Exception($"not find yaml config file: {yamlFilePath}"); } StreamReader yamlReader = File.OpenText(yamlFilePath); Deserializer yamlDeserializer = new Deserializer(); T info = yamlDeserializer.Deserialize<T>(yamlReader);