manyeyes
2023-06-21 c2a2575f198b1bfd452ea5769bec81bcce3d3a42
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
 
namespace AliFsmnVadSharp
{
    public enum FrameState
    {
        kFrameStateInvalid = -1,
        kFrameStateSpeech = 1,
        kFrameStateSil = 0
    }
 
    /// <summary>
    /// final voice/unvoice state per frame
    /// </summary>
    public enum AudioChangeState
    {
        kChangeStateSpeech2Speech = 0,
        kChangeStateSpeech2Sil = 1,
        kChangeStateSil2Sil = 2,
        kChangeStateSil2Speech = 3,
        kChangeStateNoBegin = 4,
        kChangeStateInvalid = 5
    }
 
 
    internal class WindowDetector
    {
        private int _window_size_ms = 0; //window_size_ms;
        private int _sil_to_speech_time = 0; //sil_to_speech_time;
        private int _speech_to_sil_time = 0; //speech_to_sil_time;
        private int _frame_size_ms = 0; //frame_size_ms;
 
        private int _win_size_frame = 0;
        private int _win_sum = 0;
        private int[] _win_state = new int[0];// * _win_size_frame;  // 初始化窗
 
        private int _cur_win_pos = 0;
        private int _pre_frame_state = (int)FrameState.kFrameStateSil;
        private int _cur_frame_state = (int)FrameState.kFrameStateSil;
        private int _sil_to_speech_frmcnt_thres = 0; //int(sil_to_speech_time / frame_size_ms);
        private int _speech_to_sil_frmcnt_thres = 0; //int(speech_to_sil_time / frame_size_ms);
 
        private int _voice_last_frame_count = 0;
        private int _noise_last_frame_count = 0;
        private int _hydre_frame_count = 0;
 
        public WindowDetector()
        {
 
        }
 
        public WindowDetector(int window_size_ms, int sil_to_speech_time, int speech_to_sil_time, int frame_size_ms)
        {
            _window_size_ms = window_size_ms;
            _sil_to_speech_time = sil_to_speech_time;
            _speech_to_sil_time = speech_to_sil_time;
            _frame_size_ms = frame_size_ms;
 
            _win_size_frame = (int)(window_size_ms / frame_size_ms);
            _win_sum = 0;
            _win_state = new int[_win_size_frame];//[0] * _win_size_frame;  // 初始化窗
 
            _cur_win_pos = 0;
            _pre_frame_state = (int)FrameState.kFrameStateSil;
            _cur_frame_state = (int)FrameState.kFrameStateSil;
            _sil_to_speech_frmcnt_thres = (int)(sil_to_speech_time / frame_size_ms);
            _speech_to_sil_frmcnt_thres = (int)(speech_to_sil_time / frame_size_ms);
 
            _voice_last_frame_count = 0;
            _noise_last_frame_count = 0;
            _hydre_frame_count = 0;
        }
 
        public void Reset()
        {
            _cur_win_pos = 0;
            _win_sum = 0;
            _win_state = new int[_win_size_frame];
            _pre_frame_state = (int)FrameState.kFrameStateSil;
            _cur_frame_state = (int)FrameState.kFrameStateSil;
            _voice_last_frame_count = 0;
            _noise_last_frame_count = 0;
            _hydre_frame_count = 0;
        }
        
 
        public int GetWinSize()
        {
            return _win_size_frame;
        }
 
        public AudioChangeState DetectOneFrame(FrameState frameState, int frame_count)
        {
 
 
            _cur_frame_state = (int)FrameState.kFrameStateSil;
            if (frameState == FrameState.kFrameStateSpeech)
            {
                _cur_frame_state = 1;
            }
 
            else if (frameState == FrameState.kFrameStateSil)
            {
                _cur_frame_state = 0;
            }
 
            else
            {
                return AudioChangeState.kChangeStateInvalid;
            }
 
            _win_sum -= _win_state[_cur_win_pos];
            _win_sum += _cur_frame_state;
            _win_state[_cur_win_pos] = _cur_frame_state;
            _cur_win_pos = (_cur_win_pos + 1) % _win_size_frame;
 
            if (_pre_frame_state == (int)FrameState.kFrameStateSil && _win_sum >= _sil_to_speech_frmcnt_thres)
            {
                _pre_frame_state = (int)FrameState.kFrameStateSpeech;
                return AudioChangeState.kChangeStateSil2Speech;
            }
 
 
            if (_pre_frame_state == (int)FrameState.kFrameStateSpeech && _win_sum <= _speech_to_sil_frmcnt_thres)
            {
                _pre_frame_state = (int)FrameState.kFrameStateSil;
                return AudioChangeState.kChangeStateSpeech2Sil;
            }
 
 
            if (_pre_frame_state == (int)FrameState.kFrameStateSil)
            {
                return AudioChangeState.kChangeStateSil2Sil;
            }
 
            if (_pre_frame_state == (int)FrameState.kFrameStateSpeech)
            {
                return AudioChangeState.kChangeStateSpeech2Speech;
            }
 
            return AudioChangeState.kChangeStateInvalid;
        }
 
        private int FrameSizeMs()
        {
            return _frame_size_ms;
        }
 
 
 
    }
}