// See https://github.com/manyeyes for more information
// Copyright (c) 2024 by manyeyes
using AliParaformerAsr.Model;
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
using AliParaformerAsr.Utils;

namespace AliParaformerAsr
{
    /// <summary>
    /// Offline projection for the SenseVoice-small model: packs a batch of inputs into the
    /// named ONNX inputs the session declares ("speech", "speech_lengths", "language",
    /// "textnorm"), runs the session and copies the outputs into a <see cref="ModelOutputEntity"/>.
    /// </summary>
    internal class OfflineProjOfSenseVoiceSmall : IOfflineProj, IDisposable
    {
        // Size of the last axis of the "speech" input; the padded buffer is viewed as
        // (batch, frames, 560). NOTE(review): presumably FeatureDim (80) x 7 stacked
        // frames - confirm against the model's front-end configuration.
        private const int SpeechFeatureWidth = 560;

        // Language / text-normalisation selection currently sent with every request.
        // NOTE(review): these are hard-coded rather than configurable, and the selected
        // text-norm ("withitn") disagrees with the _textnorm/_use_itn field defaults -
        // confirm the intended values before making them configurable.
        private const string SelectedLanguage = "ja";
        private const string SelectedTextnorm = "withitn";

        // Ids used when the selected key is missing from the lookup tables.
        private const int FallbackLanguageId = 0;
        private const int FallbackTextnormId = 15;

        // To detect redundant calls
        private bool _disposed;

        private InferenceSession _modelSession;
        private int _blank_id = 0;
        private int _sos_eos_id = 1;
        private int _unk_id = 2;
        private int _featureDim = 80;
        private int _sampleRate = 16000;

        // Not referenced anywhere in this class; kept so the field layout is unchanged.
        private bool _use_itn = false;
        private string _textnorm = "woitn";

        // Language name -> id passed in the "language" input.
        private readonly Dictionary<string, int> _lidDict = new Dictionary<string, int>()
        {
            { "auto", 0 },
            { "zh", 3 },
            { "en", 4 },
            { "yue", 7 },
            { "ja", 11 },
            { "ko", 12 },
            { "nospeech", 13 }
        };

        // Not referenced in this class. NOTE(review): looks like a token-id -> language-id
        // table - confirm before relying on it.
        private readonly Dictionary<int, int> _lidIntDict = new Dictionary<int, int>()
        {
            { 24884, 3 },
            { 24885, 4 },
            { 24888, 7 },
            { 24892, 11 },
            { 24896, 12 },
            { 24992, 13 }
        };

        // Text-normalisation mode name -> id passed in the "textnorm" input.
        private readonly Dictionary<string, int> _textnormDict = new Dictionary<string, int>()
        {
            { "withitn", 14 },
            { "woitn", 15 }
        };

        // Not referenced in this class. NOTE(review): looks like a token-id -> textnorm-id
        // table - confirm before relying on it.
        private readonly Dictionary<int, int> _textnormIntDict = new Dictionary<int, int>()
        {
            { 25016, 14 },
            { 25017, 15 }
        };

        /// <summary>
        /// Copies the session and the model parameters out of <paramref name="offlineModel"/>.
        /// </summary>
        /// <param name="offlineModel">Loaded model whose session and settings are used.</param>
        /// <exception cref="ArgumentNullException"><paramref name="offlineModel"/> is null.</exception>
        public OfflineProjOfSenseVoiceSmall(OfflineModel offlineModel)
        {
            if (offlineModel == null)
            {
                throw new ArgumentNullException(nameof(offlineModel));
            }

            _modelSession = offlineModel.ModelSession;
            _blank_id = offlineModel.Blank_id;
            _sos_eos_id = offlineModel.Sos_eos_id;
            _unk_id = offlineModel.Unk_id;
            _featureDim = offlineModel.FeatureDim;
            _sampleRate = offlineModel.SampleRate;
        }

        /// <summary>ONNX Runtime session used by <see cref="ModelProj"/>.</summary>
        public InferenceSession ModelSession { get => _modelSession; set => _modelSession = value; }

        /// <summary>Blank id taken from the model (defaults to 0).</summary>
        public int Blank_id { get => _blank_id; set => _blank_id = value; }

        /// <summary>Sos/eos id taken from the model (defaults to 1).</summary>
        public int Sos_eos_id { get => _sos_eos_id; set => _sos_eos_id = value; }

        /// <summary>Unk id taken from the model (defaults to 2).</summary>
        public int Unk_id { get => _unk_id; set => _unk_id = value; }

        /// <summary>Feature dimension taken from the model (defaults to 80).</summary>
        public int FeatureDim { get => _featureDim; set => _featureDim = value; }

        /// <summary>Sample rate taken from the model (defaults to 16000).</summary>
        public int SampleRate { get => _sampleRate; set => _sampleRate = value; }

        /// <summary>
        /// Runs the model on a batch of inputs.
        /// </summary>
        /// <param name="modelInputs">Batch to process; one entry per utterance.</param>
        /// <returns>
        /// The first output as <c>model_out</c>, the second as <c>model_out_lens</c> and, when the
        /// session yields at least four outputs, the fourth as <c>cif_peak_tensor</c>. An entity
        /// with none of these set is returned when the batch is empty or inference fails.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="modelInputs"/> is null.</exception>
        public ModelOutputEntity ModelProj(List<OfflineInputEntity> modelInputs)
        {
            if (modelInputs == null)
            {
                throw new ArgumentNullException(nameof(modelInputs));
            }

            ModelOutputEntity modelOutputEntity = new ModelOutputEntity();
            int batchSize = modelInputs.Count;
            if (batchSize == 0)
            {
                // Nothing to run; also avoids dividing by a zero batch size below.
                return modelOutputEntity;
            }

            float[] padSequence = PadHelper.PadSequence(modelInputs);
            List<NamedOnnxValue> container = BuildInputs(padSequence, batchSize);

            try
            {
                // NOTE(review): results is not disposed here because the tensors handed to
                // the caller presumably still reference its buffers - confirm ownership
                // before adding a using/Dispose.
                IDisposableReadOnlyCollection<DisposableNamedOnnxValue> results = _modelSession.Run(container);
                if (results != null)
                {
                    DisposableNamedOnnxValue[] resultsArray = results.ToArray();
                    modelOutputEntity.model_out = resultsArray[0].AsTensor<float>();
                    modelOutputEntity.model_out_lens = resultsArray[1].AsEnumerable<int>().ToArray();
                    if (resultsArray.Length >= 4)
                    {
                        modelOutputEntity.cif_peak_tensor = resultsArray[3].AsTensor<float>();
                    }
                }
            }
            catch (Exception ex)
            {
                // Best effort: callers receive a (possibly partially filled) entity instead
                // of an exception, but the failure is no longer silent.
                System.Diagnostics.Trace.TraceError("OfflineProjOfSenseVoiceSmall.ModelProj failed: {0}", ex);
            }

            return modelOutputEntity;
        }

        // Builds one NamedOnnxValue for every input name the session declares and this class knows.
        private List<NamedOnnxValue> BuildInputs(float[] padSequence, int batchSize)
        {
            int frameCount = padSequence.Length / SpeechFeatureWidth / batchSize;
            int languageId = _lidDict.TryGetValue(SelectedLanguage, out int lid) ? lid : FallbackLanguageId;
            int textnormId = _textnormDict.TryGetValue(SelectedTextnorm, out int tn) ? tn : FallbackTextnormId;

            var container = new List<NamedOnnxValue>();
            foreach (string name in _modelSession.InputMetadata.Keys)
            {
                switch (name)
                {
                    case "speech":
                        int[] speechDim = new int[] { batchSize, frameCount, SpeechFeatureWidth };
                        var speech = new DenseTensor<float>(padSequence, speechDim, false);
                        container.Add(NamedOnnxValue.CreateFromTensor(name, speech));
                        break;
                    case "speech_lengths":
                        // Every item reports the same (padded) frame count.
                        container.Add(CreateFilledInput(name, frameCount, batchSize));
                        break;
                    case "language":
                        container.Add(CreateFilledInput(name, languageId, batchSize));
                        break;
                    case "textnorm":
                        container.Add(CreateFilledInput(name, textnormId, batchSize));
                        break;
                    default:
                        // Unknown inputs are left unset, as before.
                        break;
                }
            }

            return container;
        }

        // Creates a 1-D int input of length batchSize with every element set to value.
        private static NamedOnnxValue CreateFilledInput(string name, int value, int batchSize)
        {
            int[] data = new int[batchSize];
            Array.Fill(data, value);
            var tensor = new DenseTensor<int>(data, new int[] { batchSize }, false);
            return NamedOnnxValue.CreateFromTensor(name, tensor);
        }

        /// <summary>
        /// Disposes the model session when called from <see cref="Dispose()"/>; does nothing
        /// on repeated calls.
        /// </summary>
        /// <param name="disposing">
        /// true when called from <see cref="Dispose()"/>, false when called from the finalizer.
        /// </param>
        protected virtual void Dispose(bool disposing)
        {
            if (_disposed)
            {
                return;
            }

            if (disposing)
            {
                _modelSession?.Dispose();
            }

            _disposed = true;
        }

        /// <summary>Disposes the model session and suppresses finalization.</summary>
        public void Dispose()
        {
            Dispose(disposing: true);
            GC.SuppressFinalize(this);
        }

        ~OfflineProjOfSenseVoiceSmall()
        {
            // The finalizer must not touch managed objects, so always pass false.
            Dispose(disposing: false);
        }
    }
}