update with main (#1152)
* v0.8.7
* update cmd version
* set openfst HAVE_BIN/HAVE_SCRIPT off for win32
* 修复为支持新版本的热词 (#1137)
* update CMakeLists.txt
* Revert "update CMakeLists.txt"
This reverts commit 54bcd1f6742269fc1ce90d9871245db5cd6a1cbf.
* rm log.h for wins-websocket
* fix bug of websocket lock blocking
* update funasr-wss-server
* update model-revision by model name
* update funasr-wss-server-2pass
* 增加分角色语音识别对ERes2Net模型的支持。
* Update README.md (#1140)
minor fix
* automatically configure parameters such as decoder-thread-num
* update docs
* update docs
* update docs
* 分角色语音识别支持更多的模型
* update spk inference
* remove never-used code (#1151)
---------
Co-authored-by: 雾聪 <wucong.lyb@alibaba-inc.com>
Co-authored-by: 夜雨飘零 <yeyupiaoling@foxmail.com>
Co-authored-by: Ikko Eltociear Ashimine <eltociear@gmail.com>
Co-authored-by: Shi Xian <40013335+R1ckShi@users.noreply.github.com>
Co-authored-by: shixian.shi <shixian.shi@alibaba-inc.com>
| | |
| | | The contributors can be found in [contributors list](./Acknowledge.md) |
| | | |
| | | ## License |
| | | This project is licensed under the [The MIT License](https://opensource.org/licenses/MIT). FunASR also contains various third-party components and some code modified from other repos under other open source licenses. |
| | | This project is licensed under [The MIT License](https://opensource.org/licenses/MIT). FunASR also contains various third-party components and some code modified from other repos under other open source licenses. |
| | | The use of pretraining model is subject to [model license](./MODEL_LICENSE) |
| | | |
| | | |
| | |
| | | from funasr.utils.types import str2triple_str |
| | | from funasr.utils.types import str_or_none |
| | | from funasr.utils.vad_utils import slice_padding_fbank |
| | | from funasr.utils.speaker_utils import (check_audio_list, |
| | | sv_preprocess, |
| | | sv_chunk, |
| | | CAMPPlus, |
| | | extract_feature, |
| | | from funasr.utils.speaker_utils import (check_audio_list, |
| | | sv_preprocess, |
| | | sv_chunk, |
| | | extract_feature, |
| | | postprocess, |
| | | distribute_spk) |
| | | import funasr.modules.cnn as sv_module |
| | | from funasr.build_utils.build_model_from_file import build_model_from_file |
| | | from funasr.utils.cluster_backend import ClusterBackend |
| | | from funasr.utils.modelscope_utils import get_cache_dir |
| | |
| | | format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", |
| | | ) |
| | | |
| | | sv_model_file = asr_model_file.replace("model.pb", "campplus_cn_common.bin") |
| | | sv_model_config_path = asr_model_file.replace("model.pb", "sv_model_config.yaml") |
| | | if not os.path.exists(sv_model_config_path): |
| | | sv_model_config = {'sv_model_class': 'CAMPPlus','sv_model_file': 'campplus_cn_common.bin', 'models_config': {}} |
| | | else: |
| | | with open(sv_model_config_path, 'r') as f: |
| | | sv_model_config = yaml.load(f, Loader=yaml.FullLoader) |
| | | if sv_model_config['models_config'] is None: |
| | | sv_model_config['models_config'] = {} |
| | | sv_model_file = asr_model_file.replace("model.pb", sv_model_config['sv_model_file']) |
| | | |
| | | if param_dict is not None: |
| | | hotword_list_or_file = param_dict.get('hotword') |
| | |
| | | ##### speaker_verification ##### |
| | | ################################## |
| | | # load sv model |
| | | sv_model_dict = torch.load(sv_model_file, map_location=torch.device('cpu')) |
| | | sv_model = CAMPPlus() |
| | | if ngpu > 0: |
| | | sv_model_dict = torch.load(sv_model_file) |
| | | sv_model = getattr(sv_module, sv_model_config['sv_model_class'])(**sv_model_config['models_config']) |
| | | sv_model.cuda() |
| | | else: |
| | | sv_model_dict = torch.load(sv_model_file, map_location=torch.device('cpu')) |
| | | sv_model = getattr(sv_module, sv_model_config['sv_model_class'])(**sv_model_config['models_config']) |
| | | sv_model.load_state_dict(sv_model_dict) |
| | | print(f'load sv model params: {sv_model_file}') |
| | | sv_model.eval() |
| | | cb_model = ClusterBackend() |
| | | vad_segments = [] |
| | |
| | | embs = [] |
| | | for x in wavs: |
| | | x = extract_feature([x]) |
| | | if ngpu > 0: |
| | | x = x.cuda() |
| | | embs.append(sv_model(x)) |
| | | embs = torch.cat(embs) |
| | | embeddings.append(embs.detach().numpy()) |
| | | embeddings.append(embs.cpu().detach().numpy()) |
| | | embeddings = np.concatenate(embeddings) |
| | | labels = cb_model(embeddings) |
| | | sv_output = postprocess(segments, vad_segments, labels, embeddings) |
| New file |
| | |
| | | # Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved. |
| | | # Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0) |
| | | |
| | | """ This implementation is adapted from https://github.com/wenet-e2e/wespeaker.""" |
| | | |
| | | import torch |
| | | import torch.nn as nn |
| | | |
| | | |
class TAP(nn.Module):
    """Temporal average pooling.

    Collapses the time axis (last dimension) with the mean, i.e. only the
    first-order statistic is kept.
    """

    def __init__(self, **kwargs):
        # Stateless; **kwargs kept so all pooling layers share one interface.
        super(TAP, self).__init__()

    def forward(self, x):
        # Average over the temporal (last) axis, then flatten remaining
        # non-batch dims so both 2-D and 3-D inputs yield a (B, D) output.
        mean = x.mean(dim=-1)
        return mean.flatten(start_dim=1)
| | | |
| | | |
class TSDP(nn.Module):
    """Temporal standard-deviation pooling.

    Collapses the time axis with the standard deviation, i.e. only the
    second-order statistic is kept.
    """

    def __init__(self, **kwargs):
        # Stateless; **kwargs kept so all pooling layers share one interface.
        super(TSDP, self).__init__()

    def forward(self, x):
        # Variance over the temporal (last) axis; the epsilon keeps the
        # sqrt (and its gradient) finite for constant inputs.
        std = (torch.var(x, dim=-1) + 1e-8).sqrt()
        return std.flatten(start_dim=1)
| | | |
| | | |
class TSTP(nn.Module):
    """Temporal statistics pooling, as used in x-vector.

    Concatenates the temporal mean and standard deviation along the
    feature axis. Note that plain concatenation cannot exploit any
    interaction between the two statistics.
    """

    def __init__(self, **kwargs):
        # Stateless; **kwargs kept so all pooling layers share one interface.
        super(TSTP, self).__init__()

    def forward(self, x):
        # Both statistics are taken over the temporal (last) axis and
        # flattened so 2-D and 3-D inputs both yield (B, D) pieces.
        mean = x.mean(dim=-1).flatten(start_dim=1)
        std = (torch.var(x, dim=-1) + 1e-8).sqrt().flatten(start_dim=1)
        return torch.cat((mean, std), 1)
| | | |
| | | |
class ASTP(nn.Module):
    """Attentive statistics pooling, first used in ECAPA-TDNN.

    Learns channel- and (optionally) context-dependent attention weights
    over the time axis and returns the attention-weighted mean and
    standard deviation, concatenated along the feature axis.
    """

    def __init__(self, in_dim, bottleneck_dim=128, global_context_att=False):
        super(ASTP, self).__init__()
        self.global_context_att = global_context_att

        # Conv1d with kernel_size=1 acts as a per-frame Linear, so inputs
        # never need to be transposed.
        att_in_dim = in_dim * 3 if global_context_att else in_dim
        self.linear1 = nn.Conv1d(att_in_dim, bottleneck_dim,
                                 kernel_size=1)  # equals W and b in the paper
        self.linear2 = nn.Conv1d(bottleneck_dim, in_dim,
                                 kernel_size=1)  # equals V and k in the paper

    def forward(self, x):
        """
        x: (B, F, T) for tdnn-based architectures, or (B, C, F, T) for
        resnet architectures; batch first, time last. Returns (B, 2*F).
        """
        # Fold a resnet-style (B, C, F, T) input down to (B, C*F, T).
        if len(x.shape) == 4:
            x = x.reshape(x.shape[0], x.shape[1] * x.shape[2], x.shape[3])
        assert len(x.shape) == 3

        if self.global_context_att:
            # Append utterance-level mean/std as global-context channels.
            ctx_mean = torch.mean(x, dim=-1, keepdim=True).expand_as(x)
            ctx_std = torch.sqrt(
                torch.var(x, dim=-1, keepdim=True) + 1e-10).expand_as(x)
            x_in = torch.cat((x, ctx_mean, ctx_std), dim=1)
        else:
            x_in = x

        # tanh rather than ReLU: ReLU here may be hard to converge.
        alpha = torch.tanh(self.linear1(x_in))
        alpha = torch.softmax(self.linear2(alpha), dim=2)
        mean = torch.sum(alpha * x, dim=2)
        var = torch.sum(alpha * (x ** 2), dim=2) - mean ** 2
        std = torch.sqrt(var.clamp(min=1e-10))
        return torch.cat([mean, std], dim=1)
| New file |
| | |
| | | # Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved. |
| | | # Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0) |
| | | |
| | | from collections import OrderedDict |
| | | |
| | | import torch.nn.functional as F |
| | | from torch import nn |
| | | |
| | | from funasr.modules.cnn.layers import DenseLayer, StatsPool, TDNNLayer, CAMDenseTDNNBlock, TransitLayer, \ |
| | | BasicResBlock, get_nonlinear |
| | | |
| | | |
class FCM(nn.Module):
    """Front-end convolution module of CAM++.

    A small 2-D residual CNN applied before the TDNN trunk. It reduces
    the frequency axis by 8 (two stride-2 residual stages plus a
    stride-(2,1) conv) and exposes the flattened channel count as
    ``out_channels``.
    """

    def __init__(self,
                 block=BasicResBlock,
                 num_blocks=[2, 2],
                 m_channels=32,
                 feat_dim=80):
        super(FCM, self).__init__()
        self.in_planes = m_channels
        self.conv1 = nn.Conv2d(1, m_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(m_channels)

        self.layer1 = self._make_layer(block, m_channels, num_blocks[0], stride=2)
        # Fix: the second stage must use num_blocks[1]; the original read
        # num_blocks[0] here, silently ignoring the second entry
        # (identical behavior for the default [2, 2]).
        self.layer2 = self._make_layer(block, m_channels, num_blocks[1], stride=2)

        self.conv2 = nn.Conv2d(m_channels, m_channels, kernel_size=3, stride=(2, 1), padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(m_channels)
        # Frequency axis has been halved three times -> feat_dim // 8 bins.
        self.out_channels = m_channels * (feat_dim // 8)

    def _make_layer(self, block, planes, num_blocks, stride):
        # Only the first block of a stage carries the stride.
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        # Treat the 2-D feature map as a single-channel image: (B, 1, F, T).
        x = x.unsqueeze(1)
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = F.relu(self.bn2(self.conv2(out)))

        # Merge channels with the reduced frequency axis: (B, C*F', T).
        shape = out.shape
        out = out.reshape(shape[0], shape[1] * shape[2], shape[3])
        return out
| | | |
| | | |
class CAMPPlus(nn.Module):
    """CAM++ speaker-embedding network: FCM front-end + densely connected
    TDNN (D-TDNN) trunk.

    Takes (B, T, feat_dim) features and returns a (B, embedding_size)
    segment embedding, or frame-level features transposed to (B, T', C)
    when ``output_level='frame'``.

    NOTE(review): the submodule names and registration order below define
    the state_dict keys and the RNG-driven init order — do not reorder if
    pretrained checkpoints must keep loading.
    """
    def __init__(self,
                 feat_dim=80,
                 embedding_size=192,
                 growth_rate=32,
                 bn_size=4,
                 init_channels=128,
                 config_str='batchnorm-relu',
                 memory_efficient=True,
                 output_level='segment'):
        super(CAMPPlus, self).__init__()

        # 2-D convolutional front end; its flattened channel count feeds
        # the first TDNN layer.
        self.head = FCM(feat_dim=feat_dim)
        channels = self.head.out_channels
        self.output_level = output_level

        self.xvector = nn.Sequential(
            OrderedDict([

                ('tdnn',
                 TDNNLayer(channels,
                           init_channels,
                           5,
                           stride=2,
                           dilation=1,
                           padding=-1,
                           config_str=config_str)),
            ]))
        channels = init_channels
        # Three dense blocks (12/24/16 layers) each followed by a transit
        # layer that halves the channel count.
        for i, (num_layers, kernel_size,
                dilation) in enumerate(zip((12, 24, 16), (3, 3, 3), (1, 2, 2))):
            block = CAMDenseTDNNBlock(num_layers=num_layers,
                                      in_channels=channels,
                                      out_channels=growth_rate,
                                      bn_channels=bn_size * growth_rate,
                                      kernel_size=kernel_size,
                                      dilation=dilation,
                                      config_str=config_str,
                                      memory_efficient=memory_efficient)
            self.xvector.add_module('block%d' % (i + 1), block)
            # Dense connectivity: each layer adds growth_rate channels.
            channels = channels + num_layers * growth_rate
            self.xvector.add_module(
                'transit%d' % (i + 1),
                TransitLayer(channels,
                             channels // 2,
                             bias=False,
                             config_str=config_str))
            channels //= 2

        self.xvector.add_module(
            'out_nonlinear', get_nonlinear(config_str, channels))

        if self.output_level == 'segment':
            # Segment mode: statistics pooling (mean+std, hence channels*2)
            # followed by the embedding layer.
            self.xvector.add_module('stats', StatsPool())
            self.xvector.add_module(
                'dense',
                DenseLayer(
                    channels * 2, embedding_size, config_str='batchnorm_'))
        else:
            assert self.output_level == 'frame', '`output_level` should be set to \'segment\' or \'frame\'. '

        # Kaiming init for all conv/linear weights; biases zeroed.
        for m in self.modules():
            if isinstance(m, (nn.Conv1d, nn.Linear)):
                nn.init.kaiming_normal_(m.weight.data)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

    def forward(self, x):
        x = x.permute(0, 2, 1)  # (B,T,F) => (B,F,T)
        x = self.head(x)
        x = self.xvector(x)
        if self.output_level == 'frame':
            # Return time-major frame features: (B, C, T') -> (B, T', C).
            x = x.transpose(1, 2)
        return x
| New file |
| | |
| | | # Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved. |
| | | # Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0) |
| | | |
| | | """ Res2Net implementation is adapted from https://github.com/wenet-e2e/wespeaker. |
| | | ERes2Net incorporates both local and global feature fusion techniques to improve the performance. |
| | | The local feature fusion (LFF) fuses the features within one single residual block to extract the local signal. |
| | | The global feature fusion (GFF) takes acoustic features of different scales as input to aggregate global signal. |
| | | ERes2Net-Large is an upgraded version of ERes2Net that uses a larger number of parameters to achieve better |
| | | recognition performance. Parameters expansion, baseWidth, and scale can be modified to obtain optimal performance. |
| | | """ |
| | | |
| | | import math |
| | | |
| | | import torch |
| | | import torch.nn as nn |
| | | import torch.nn.functional as F |
| | | |
| | | import funasr.models.pooling.pooling_layers as pooling_layers |
| | | from funasr.modules.cnn.fusion import AFF |
| | | |
| | | |
class ReLU(nn.Hardtanh):
    """ReLU clipped at 20, implemented as Hardtanh over [0, 20]."""

    def __init__(self, inplace=False):
        super(ReLU, self).__init__(0, 20, inplace)

    def __repr__(self):
        # Mirror the legacy torch repr: "ReLU (inplace)" / "ReLU ()".
        suffix = 'inplace' if self.inplace else ''
        return '{} ({})'.format(self.__class__.__name__, suffix)
| | | |
| | | |
def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution without padding or bias (channel projection)."""
    return nn.Conv2d(in_planes,
                     out_planes,
                     kernel_size=1,
                     stride=stride,
                     padding=0,
                     bias=False)
| | | |
| | | |
def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding 1 and no bias (keeps spatial size at stride 1)."""
    return nn.Conv2d(in_planes,
                     out_planes,
                     kernel_size=3,
                     stride=stride,
                     padding=1,
                     bias=False)
| | | |
| | | |
class BasicBlockERes2Net(nn.Module):
    """Res2Net-style basic block used in the early ERes2Net stages.

    Splits channels into ``scale`` branches; each branch is summed with
    the previous branch's output before its own 3x3 conv (hierarchical
    residual connections), then all branches are re-concatenated.
    """

    expansion = 2

    def __init__(self, in_planes, planes, stride=1, baseWidth=32, scale=2):
        super(BasicBlockERes2Net, self).__init__()
        # Per-branch channel width, as in the Res2Net paper.
        width = int(math.floor(planes * (baseWidth / 64.0)))
        self.conv1 = conv1x1(in_planes, width * scale, stride)
        self.bn1 = nn.BatchNorm2d(width * scale)
        self.nums = scale

        # One 3x3 conv + BN pair per branch.
        self.convs = nn.ModuleList(
            [conv3x3(width, width) for _ in range(self.nums)])
        self.bns = nn.ModuleList(
            [nn.BatchNorm2d(width) for _ in range(self.nums)])
        self.relu = ReLU(inplace=True)

        self.conv3 = conv1x1(width * scale, planes * self.expansion)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
        # Projection shortcut when spatial size or channel count changes.
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes,
                          self.expansion * planes,
                          kernel_size=1,
                          stride=stride,
                          bias=False),
                nn.BatchNorm2d(self.expansion * planes))
        self.stride = stride
        self.width = width
        self.scale = scale

    def forward(self, x):
        y = self.relu(self.bn1(self.conv1(x)))
        branches = torch.split(y, self.width, 1)

        pieces = []
        prev = None
        for conv, bn, branch in zip(self.convs, self.bns, branches):
            # Each branch receives the previous branch's output (summed).
            prev = branch if prev is None else prev + branch
            prev = self.relu(bn(conv(prev)))
            pieces.append(prev)
        merged = torch.cat(pieces, 1)

        merged = self.bn3(self.conv3(merged))
        merged += self.shortcut(x)
        return self.relu(merged)
| | | |
| | | |
class BasicBlockERes2Net_diff_AFF(nn.Module):
    """ERes2Net block variant where adjacent branches are combined with
    attentional feature fusion (AFF) instead of plain addition."""

    expansion = 2

    def __init__(self, in_planes, planes, stride=1, baseWidth=32, scale=2):
        super(BasicBlockERes2Net_diff_AFF, self).__init__()
        # Per-branch channel width, as in the Res2Net paper.
        width = int(math.floor(planes * (baseWidth / 64.0)))
        self.conv1 = conv1x1(in_planes, width * scale, stride)
        self.bn1 = nn.BatchNorm2d(width * scale)
        self.nums = scale

        # One conv+BN per branch; one AFF per adjacent branch pair.
        self.convs = nn.ModuleList(
            [conv3x3(width, width) for _ in range(self.nums)])
        self.bns = nn.ModuleList(
            [nn.BatchNorm2d(width) for _ in range(self.nums)])
        self.fuse_models = nn.ModuleList(
            [AFF(channels=width) for _ in range(self.nums - 1)])
        self.relu = ReLU(inplace=True)

        self.conv3 = conv1x1(width * scale, planes * self.expansion)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
        # Projection shortcut when spatial size or channel count changes.
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes,
                          self.expansion * planes,
                          kernel_size=1,
                          stride=stride,
                          bias=False),
                nn.BatchNorm2d(self.expansion * planes))
        self.stride = stride
        self.width = width
        self.scale = scale

    def forward(self, x):
        y = self.relu(self.bn1(self.conv1(x)))
        branches = torch.split(y, self.width, 1)

        pieces = []
        prev = None
        for i, (conv, bn) in enumerate(zip(self.convs, self.bns)):
            if prev is None:
                prev = branches[i]
            else:
                # AFF-based fusion of the running output with this branch.
                prev = self.fuse_models[i - 1](prev, branches[i])
            prev = self.relu(bn(conv(prev)))
            pieces.append(prev)
        merged = torch.cat(pieces, 1)

        merged = self.bn3(self.conv3(merged))
        merged += self.shortcut(x)
        return self.relu(merged)
| | | |
| | | |
class ERes2Net(nn.Module):
    """ERes2Net speaker-embedding network.

    Uses plain Res2Net blocks (``block``) for stages 1-2 and AFF-fused
    blocks (``block_fuse``) for stages 3-4, then fuses the stage outputs
    bottom-up (fuse_mode12 -> fuse_mode123 -> fuse_mode1234) before
    pooling into a fixed-size embedding.

    Input: (B, T, feat_dim) features. Output: (B, embedding_size).
    NOTE(review): submodule names and creation order define state_dict
    keys and init order — keep as-is for checkpoint compatibility.
    """
    def __init__(self,
                 block=BasicBlockERes2Net,
                 block_fuse=BasicBlockERes2Net_diff_AFF,
                 num_blocks=[3, 4, 6, 3],
                 m_channels=32,
                 feat_dim=80,
                 embedding_size=192,
                 pooling_func='TSTP',
                 two_emb_layer=False):
        super(ERes2Net, self).__init__()
        self.in_planes = m_channels
        self.feat_dim = feat_dim
        self.embedding_size = embedding_size
        # Channels seen by pooling: freq axis shrinks by 8 (three stride-2
        # stages) while channels grow by 8.
        self.stats_dim = int(feat_dim / 8) * m_channels * 8
        self.two_emb_layer = two_emb_layer

        self.conv1 = nn.Conv2d(1,
                               m_channels,
                               kernel_size=3,
                               stride=1,
                               padding=1,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(m_channels)
        self.layer1 = self._make_layer(block,
                                       m_channels,
                                       num_blocks[0],
                                       stride=1)
        self.layer2 = self._make_layer(block,
                                       m_channels * 2,
                                       num_blocks[1],
                                       stride=2)
        # Deeper stages use the AFF-fused block variant.
        self.layer3 = self._make_layer(block_fuse,
                                       m_channels * 4,
                                       num_blocks[2],
                                       stride=2)
        self.layer4 = self._make_layer(block_fuse,
                                       m_channels * 8,
                                       num_blocks[3],
                                       stride=2)

        # Downsampling module for each layer: matches the next stage's
        # resolution/channels so stage outputs can be fused together.
        self.layer1_downsample = nn.Conv2d(m_channels * 2, m_channels * 4, kernel_size=3, stride=2, padding=1,
                                           bias=False)
        self.layer2_downsample = nn.Conv2d(m_channels * 4, m_channels * 8, kernel_size=3, padding=1, stride=2,
                                           bias=False)
        self.layer3_downsample = nn.Conv2d(m_channels * 8, m_channels * 16, kernel_size=3, padding=1, stride=2,
                                           bias=False)

        # Bottom-up fusion module
        self.fuse_mode12 = AFF(channels=m_channels * 4)
        self.fuse_mode123 = AFF(channels=m_channels * 8)
        self.fuse_mode1234 = AFF(channels=m_channels * 16)

        # TAP/TSDP produce one statistic; other poolers (TSTP/ASTP)
        # concatenate mean and std, doubling the pooled dimension.
        self.n_stats = 1 if pooling_func == 'TAP' or pooling_func == "TSDP" else 2
        self.pool = getattr(pooling_layers, pooling_func)(
            in_dim=self.stats_dim * block.expansion)
        self.seg_1 = nn.Linear(self.stats_dim * block.expansion * self.n_stats,
                               embedding_size)
        if self.two_emb_layer:
            self.seg_bn_1 = nn.BatchNorm1d(embedding_size, affine=False)
            self.seg_2 = nn.Linear(embedding_size, embedding_size)
        else:
            self.seg_bn_1 = nn.Identity()
            self.seg_2 = nn.Identity()

    def _make_layer(self, block, planes, num_blocks, stride):
        # Only the first block of a stage carries the stride.
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        x = x.permute(0, 2, 1)  # (B,T,F) => (B,F,T)
        x = x.unsqueeze_(1)  # in-place: adds the channel dim, (B,1,F,T)
        out = F.relu(self.bn1(self.conv1(x)))
        out1 = self.layer1(out)
        out2 = self.layer2(out1)
        # Fuse each stage with the downsampled fusion of earlier stages.
        out1_downsample = self.layer1_downsample(out1)
        fuse_out12 = self.fuse_mode12(out2, out1_downsample)
        out3 = self.layer3(out2)
        fuse_out12_downsample = self.layer2_downsample(fuse_out12)
        fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample)
        out4 = self.layer4(out3)
        fuse_out123_downsample = self.layer3_downsample(fuse_out123)
        fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample)
        stats = self.pool(fuse_out1234)

        embed_a = self.seg_1(stats)
        if self.two_emb_layer:
            # Optional second embedding head (x-vector style embed_b).
            out = F.relu(embed_a)
            out = self.seg_bn_1(out)
            embed_b = self.seg_2(out)
            return embed_b
        else:
            return embed_a
| | | |
| | | |
class BasicBlockRes2Net(nn.Module):
    """Standard Res2Net basic block.

    Unlike the ERes2Net variant, only ``scale - 1`` branches are
    convolved; the last channel split is passed through unchanged.
    """

    expansion = 2

    def __init__(self, in_planes, planes, stride=1, baseWidth=32, scale=2):
        super(BasicBlockRes2Net, self).__init__()
        # Per-branch channel width, as in the Res2Net paper.
        width = int(math.floor(planes * (baseWidth / 64.0)))
        self.conv1 = conv1x1(in_planes, width * scale, stride)
        self.bn1 = nn.BatchNorm2d(width * scale)
        self.nums = scale - 1
        # One 3x3 conv + BN pair per convolved branch.
        self.convs = nn.ModuleList(
            [conv3x3(width, width) for _ in range(self.nums)])
        self.bns = nn.ModuleList(
            [nn.BatchNorm2d(width) for _ in range(self.nums)])
        self.relu = ReLU(inplace=True)

        self.conv3 = conv1x1(width * scale, planes * self.expansion)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
        # Projection shortcut when spatial size or channel count changes.
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes,
                          self.expansion * planes,
                          kernel_size=1,
                          stride=stride,
                          bias=False),
                nn.BatchNorm2d(self.expansion * planes))
        self.stride = stride
        self.width = width
        self.scale = scale

    def forward(self, x):
        y = self.relu(self.bn1(self.conv1(x)))
        branches = torch.split(y, self.width, 1)

        pieces = []
        prev = None
        for i, (conv, bn) in enumerate(zip(self.convs, self.bns)):
            # Each branch receives the previous branch's output (summed).
            prev = branches[i] if prev is None else prev + branches[i]
            prev = self.relu(bn(conv(prev)))
            pieces.append(prev)
        # The final split bypasses the convolutions entirely.
        pieces.append(branches[self.nums])
        merged = torch.cat(pieces, 1)

        merged = self.bn3(self.conv3(merged))
        merged += self.shortcut(x)
        return self.relu(merged)
| | | |
| | | |
class Res2Net(nn.Module):
    """Plain Res2Net speaker-embedding backbone (no AFF stage fusion).

    Four stages of BasicBlockRes2Net, temporal pooling, then a linear
    embedding layer. Input: (B, T, feat_dim); output: (B, embedding_size).
    NOTE(review): submodule names and creation order define state_dict
    keys and init order — keep as-is for checkpoint compatibility.
    """
    def __init__(self,
                 block=BasicBlockRes2Net,
                 num_blocks=[3, 4, 6, 3],
                 m_channels=32,
                 feat_dim=80,
                 embedding_size=192,
                 pooling_func='TSTP',
                 two_emb_layer=False):
        super(Res2Net, self).__init__()
        self.in_planes = m_channels
        self.feat_dim = feat_dim
        self.embedding_size = embedding_size
        # Channels seen by pooling: freq axis shrinks by 8 (three stride-2
        # stages) while channels grow by 8.
        self.stats_dim = int(feat_dim / 8) * m_channels * 8
        self.two_emb_layer = two_emb_layer

        self.conv1 = nn.Conv2d(1,
                               m_channels,
                               kernel_size=3,
                               stride=1,
                               padding=1,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(m_channels)
        self.layer1 = self._make_layer(block,
                                       m_channels,
                                       num_blocks[0],
                                       stride=1)
        self.layer2 = self._make_layer(block,
                                       m_channels * 2,
                                       num_blocks[1],
                                       stride=2)
        self.layer3 = self._make_layer(block,
                                       m_channels * 4,
                                       num_blocks[2],
                                       stride=2)
        self.layer4 = self._make_layer(block,
                                       m_channels * 8,
                                       num_blocks[3],
                                       stride=2)

        # TAP/TSDP produce one statistic; other poolers (TSTP/ASTP)
        # concatenate mean and std, doubling the pooled dimension.
        self.n_stats = 1 if pooling_func == 'TAP' or pooling_func == "TSDP" else 2
        self.pool = getattr(pooling_layers, pooling_func)(
            in_dim=self.stats_dim * block.expansion)
        self.seg_1 = nn.Linear(self.stats_dim * block.expansion * self.n_stats,
                               embedding_size)
        if self.two_emb_layer:
            self.seg_bn_1 = nn.BatchNorm1d(embedding_size, affine=False)
            self.seg_2 = nn.Linear(embedding_size, embedding_size)
        else:
            self.seg_bn_1 = nn.Identity()
            self.seg_2 = nn.Identity()

    def _make_layer(self, block, planes, num_blocks, stride):
        # Only the first block of a stage carries the stride.
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        x = x.permute(0, 2, 1)  # (B,T,F) => (B,F,T)

        x = x.unsqueeze_(1)  # in-place: adds the channel dim, (B,1,F,T)
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)

        stats = self.pool(out)

        embed_a = self.seg_1(stats)
        if self.two_emb_layer:
            # Optional second embedding head (x-vector style embed_b).
            out = F.relu(embed_a)
            out = self.seg_bn_1(out)
            embed_b = self.seg_2(out)
            return embed_b
        else:
            return embed_a
| New file |
| | |
| | | # Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved. |
| | | # Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0) |
| | | |
| | | """ Res2Net implementation is adapted from https://github.com/wenet-e2e/wespeaker. |
| | | ERes2Net incorporates both local and global feature fusion techniques to improve the performance. |
| | | The local feature fusion (LFF) fuses the features within one single residual block to extract the local signal. |
| | | The global feature fusion (GFF) takes acoustic features of different scales as input to aggregate global signal. |
| | | ERes2Net-Large is an upgraded version of ERes2Net that uses a larger number of parameters to achieve better |
| | | recognition performance. Parameters expansion, baseWidth, and scale can be modified to obtain optimal performance. |
| | | """ |
| | | |
| | | import math |
| | | |
| | | import torch |
| | | import torch.nn as nn |
| | | import torch.nn.functional as F |
| | | |
| | | import funasr.models.pooling.pooling_layers as pooling_layers |
| | | from funasr.modules.cnn.fusion import AFF |
| | | |
| | | |
class ReLU(nn.Hardtanh):
    """Clipped ReLU: clamp(x, 0, 20), realized as Hardtanh(0, 20)."""

    def __init__(self, inplace=False):
        super(ReLU, self).__init__(0, 20, inplace)

    def __repr__(self):
        # Keep the legacy "ClassName (inplace)" repr format.
        return self.__class__.__name__ + ' (' + ('inplace' if self.inplace else '') + ')'
| | | |
| | | |
def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution without padding or bias (channel projection)."""
    return nn.Conv2d(
        in_planes, out_planes, kernel_size=1, stride=stride, padding=0,
        bias=False)
| | | |
| | | |
def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding 1 and no bias."""
    return nn.Conv2d(
        in_planes, out_planes, kernel_size=3, stride=stride, padding=1,
        bias=False)
| | | |
| | | |
class BasicBlockERes2Net(nn.Module):
    """Res2Net-style block for the larger ERes2Net variant.

    Same hierarchical-branch structure as the base model but with
    expansion=4, baseWidth=24 and scale=3 defaults.
    """

    expansion = 4

    def __init__(self, in_planes, planes, stride=1, baseWidth=24, scale=3):
        super(BasicBlockERes2Net, self).__init__()
        # Per-branch channel width, as in the Res2Net paper.
        width = int(math.floor(planes * (baseWidth / 64.0)))
        self.conv1 = conv1x1(in_planes, width * scale, stride)
        self.bn1 = nn.BatchNorm2d(width * scale)
        self.nums = scale

        # One 3x3 conv + BN pair per branch.
        self.convs = nn.ModuleList(
            [conv3x3(width, width) for _ in range(self.nums)])
        self.bns = nn.ModuleList(
            [nn.BatchNorm2d(width) for _ in range(self.nums)])
        self.relu = ReLU(inplace=True)

        self.conv3 = conv1x1(width * scale, planes * self.expansion)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
        # Projection shortcut when spatial size or channel count changes.
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes,
                          self.expansion * planes,
                          kernel_size=1,
                          stride=stride,
                          bias=False),
                nn.BatchNorm2d(self.expansion * planes))
        self.stride = stride
        self.width = width
        self.scale = scale

    def forward(self, x):
        y = self.relu(self.bn1(self.conv1(x)))
        branches = torch.split(y, self.width, 1)

        pieces = []
        prev = None
        for conv, bn, branch in zip(self.convs, self.bns, branches):
            # Each branch receives the previous branch's output (summed).
            prev = branch if prev is None else prev + branch
            prev = self.relu(bn(conv(prev)))
            pieces.append(prev)
        merged = torch.cat(pieces, 1)

        merged = self.bn3(self.conv3(merged))
        merged += self.shortcut(x)
        return self.relu(merged)
| | | |
| | | |
class BasicBlockERes2Net_diff_AFF(nn.Module):
    """Larger ERes2Net block variant with AFF-based branch fusion.

    Adjacent branches are combined via attentional feature fusion (AFF)
    rather than plain addition; expansion=4, baseWidth=24, scale=3.
    """

    expansion = 4

    def __init__(self, in_planes, planes, stride=1, baseWidth=24, scale=3):
        super(BasicBlockERes2Net_diff_AFF, self).__init__()
        # Per-branch channel width, as in the Res2Net paper.
        width = int(math.floor(planes * (baseWidth / 64.0)))
        self.conv1 = conv1x1(in_planes, width * scale, stride)
        self.bn1 = nn.BatchNorm2d(width * scale)

        self.nums = scale

        # One conv+BN per branch; one AFF per adjacent branch pair.
        self.convs = nn.ModuleList(
            [conv3x3(width, width) for _ in range(self.nums)])
        self.bns = nn.ModuleList(
            [nn.BatchNorm2d(width) for _ in range(self.nums)])
        self.fuse_models = nn.ModuleList(
            [AFF(channels=width) for _ in range(self.nums - 1)])
        self.relu = ReLU(inplace=True)

        self.conv3 = conv1x1(width * scale, planes * self.expansion)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
        # Projection shortcut when spatial size or channel count changes.
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes,
                          self.expansion * planes,
                          kernel_size=1,
                          stride=stride,
                          bias=False),
                nn.BatchNorm2d(self.expansion * planes))
        self.stride = stride
        self.width = width
        self.scale = scale

    def forward(self, x):
        y = self.relu(self.bn1(self.conv1(x)))
        branches = torch.split(y, self.width, 1)

        pieces = []
        prev = None
        for i, (conv, bn) in enumerate(zip(self.convs, self.bns)):
            if prev is None:
                prev = branches[i]
            else:
                # AFF-based fusion of the running output with this branch.
                prev = self.fuse_models[i - 1](prev, branches[i])
            prev = self.relu(bn(conv(prev)))
            pieces.append(prev)
        merged = torch.cat(pieces, 1)

        merged = self.bn3(self.conv3(merged))
        merged += self.shortcut(x)
        return self.relu(merged)
| | | |
| | | |
class ERes2NetAug(nn.Module):
    """ERes2Net speaker-embedding backbone with multi-stage AFF fusion.

    Stages 1-2 use plain ERes2Net blocks and stages 3-4 use AFF-fusion
    blocks; each stage output is additionally downsampled and fused with
    the next stage's output via AFF before statistics pooling produces a
    fixed-size embedding.

    Args:
        block: block class for the first two stages.
        block_fuse: block class for the last two stages.
        num_blocks: number of blocks per stage.
        m_channels: base channel width.
        feat_dim: input feature dimension (e.g. 80 fbank bins).
        embedding_size: output embedding dimension.
        pooling_func: name of a pooling class looked up in `pooling_layers`.
        two_emb_layer: if True, add a second embedding head (BN + Linear).
    """

    def __init__(self,
                 block=BasicBlockERes2Net,
                 block_fuse=BasicBlockERes2Net_diff_AFF,
                 num_blocks=[3, 4, 6, 3],
                 m_channels=64,
                 feat_dim=80,
                 embedding_size=192,
                 pooling_func='TSTP',
                 two_emb_layer=False):
        super(ERes2NetAug, self).__init__()
        self.in_planes = m_channels
        self.feat_dim = feat_dim
        self.embedding_size = embedding_size
        # Feature axis shrinks 8x over stages 2-4 while channels grow 8x.
        self.stats_dim = int(feat_dim / 8) * m_channels * 8
        self.two_emb_layer = two_emb_layer

        self.conv1 = nn.Conv2d(1,
                               m_channels,
                               kernel_size=3,
                               stride=1,
                               padding=1,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(m_channels)
        self.layer1 = self._make_layer(block,
                                       m_channels,
                                       num_blocks[0],
                                       stride=1)
        self.layer2 = self._make_layer(block,
                                       m_channels * 2,
                                       num_blocks[1],
                                       stride=2)
        self.layer3 = self._make_layer(block_fuse,
                                       m_channels * 4,
                                       num_blocks[2],
                                       stride=2)
        self.layer4 = self._make_layer(block_fuse,
                                       m_channels * 8,
                                       num_blocks[3],
                                       stride=2)

        # Strided 3x3 convs that match each stage output to the next stage's
        # shape so the pair can be fused by AFF.
        self.layer1_downsample = nn.Conv2d(m_channels * 4, m_channels * 8, kernel_size=3, padding=1, stride=2,
                                           bias=False)
        self.layer2_downsample = nn.Conv2d(m_channels * 8, m_channels * 16, kernel_size=3, padding=1, stride=2,
                                           bias=False)
        self.layer3_downsample = nn.Conv2d(m_channels * 16, m_channels * 32, kernel_size=3, padding=1, stride=2,
                                           bias=False)
        self.fuse_mode12 = AFF(channels=m_channels * 8)
        self.fuse_mode123 = AFF(channels=m_channels * 16)
        self.fuse_mode1234 = AFF(channels=m_channels * 32)

        # Mean-only poolings produce one stat; mean+std poolings produce two.
        self.n_stats = 1 if pooling_func == 'TAP' or pooling_func == "TSDP" else 2
        self.pool = getattr(pooling_layers, pooling_func)(
            in_dim=self.stats_dim * block.expansion)
        self.seg_1 = nn.Linear(self.stats_dim * block.expansion * self.n_stats,
                               embedding_size)
        if self.two_emb_layer:
            self.seg_bn_1 = nn.BatchNorm1d(embedding_size, affine=False)
            self.seg_2 = nn.Linear(embedding_size, embedding_size)
        else:
            self.seg_bn_1 = nn.Identity()
            self.seg_2 = nn.Identity()

    def _make_layer(self, block, planes, num_blocks, stride):
        """Stack `num_blocks` blocks; only the first block uses `stride`."""
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        """Map (B, T, F) features to a (B, embedding_size) speaker embedding."""
        x = x.permute(0, 2, 1)  # (B,T,F) => (B,F,T)

        # NOTE: in-place unsqueeze adds the conv channel dim: (B,1,F,T).
        x = x.unsqueeze_(1)
        out = F.relu(self.bn1(self.conv1(x)))
        out1 = self.layer1(out)
        out2 = self.layer2(out1)
        # Progressively fuse each stage with the (downsampled) running fusion.
        out1_downsample = self.layer1_downsample(out1)
        fuse_out12 = self.fuse_mode12(out2, out1_downsample)
        out3 = self.layer3(out2)
        fuse_out12_downsample = self.layer2_downsample(fuse_out12)
        fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample)
        out4 = self.layer4(out3)
        fuse_out123_downsample = self.layer3_downsample(fuse_out123)
        fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample)
        stats = self.pool(fuse_out1234)

        embed_a = self.seg_1(stats)
        if self.two_emb_layer:
            out = F.relu(embed_a)
            out = self.seg_bn_1(out)
            embed_b = self.seg_2(out)
            return embed_b
        else:
            return embed_a
| New file |
| | |
| | | from .DTDNN import CAMPPlus |
| | | from .ResNet import ERes2Net |
| | | from .ResNet_aug import ERes2NetAug |
| New file |
| | |
| | | # Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved. |
| | | # Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0) |
| | | |
| | | import torch |
| | | import torch.nn as nn |
| | | |
| | | |
class AFF(nn.Module):
    """Attentional feature fusion (AFF) of two same-shaped feature maps.

    A bottleneck conv stack computes per-position attention logits from the
    concatenation of the two inputs; the inputs are then blended with
    complementary weights that always sum to 2.
    """

    def __init__(self, channels=64, r=4):
        super(AFF, self).__init__()
        inter_channels = int(channels // r)

        # Bottleneck: 2C -> C/r -> C, all 1x1 convolutions.
        self.local_att = nn.Sequential(
            nn.Conv2d(channels * 2, inter_channels, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(inter_channels),
            nn.SiLU(inplace=True),
            nn.Conv2d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(channels),
        )

    def forward(self, x, ds_y):
        """Fuse `x` with `ds_y` (same shape); returns a tensor of that shape."""
        joint = torch.cat((x, ds_y), dim=1)
        # tanh keeps each weight in (0, 2); the two weights sum to 2.
        weight = 1.0 + torch.tanh(self.local_att(joint))
        return x * weight + ds_y * (2.0 - weight)
| | | |
| New file |
| | |
| | | # Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved. |
| | | # Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0) |
| | | |
| | | import torch |
| | | import torch.nn.functional as F |
| | | import torch.utils.checkpoint as cp |
| | | from torch import nn |
| | | |
| | | |
def get_nonlinear(config_str, channels):
    """Build an nn.Sequential from a '-'-separated specification string.

    Supported tokens: 'relu', 'prelu', 'batchnorm', and 'batchnorm_'
    (the trailing underscore selects a non-affine BatchNorm1d, registered
    under the module name 'batchnorm').

    Raises:
        ValueError: if the spec contains an unrecognized token.
    """
    factories = {
        'relu': ('relu', lambda: nn.ReLU(inplace=True)),
        'prelu': ('prelu', lambda: nn.PReLU(channels)),
        'batchnorm': ('batchnorm', lambda: nn.BatchNorm1d(channels)),
        'batchnorm_': ('batchnorm', lambda: nn.BatchNorm1d(channels, affine=False)),
    }
    nonlinear = nn.Sequential()
    for name in config_str.split('-'):
        if name not in factories:
            raise ValueError('Unexpected module ({}).'.format(name))
        module_name, make = factories[name]
        nonlinear.add_module(module_name, make())
    return nonlinear
| | | |
| | | |
def statistics_pooling(x, dim=-1, keepdim=False, unbiased=True, eps=1e-2):
    """Concatenate the mean and standard deviation of `x` along `dim`.

    Args:
        x: input tensor.
        dim: dimension to pool over (default: last).
        keepdim: if True, re-insert the pooled dimension in the output.
        unbiased: use Bessel's correction in the std estimate.
        eps: accepted for interface compatibility; currently unused.

    Returns:
        Tensor of [mean, std] concatenated along the last dimension.
    """
    pooled = torch.cat([x.mean(dim=dim), x.std(dim=dim, unbiased=unbiased)], dim=-1)
    return pooled.unsqueeze(dim=dim) if keepdim else pooled
| | | |
| | | |
class StatsPool(nn.Module):
    """Module wrapper around `statistics_pooling` (mean + std over time)."""

    def forward(self, x):
        # Pools over the last dimension; output size doubles (mean ‖ std).
        return statistics_pooling(x)
| | | |
| | | |
class TDNNLayer(nn.Module):
    """1D TDNN layer: Conv1d followed by the configured norm/activation stack.

    A negative `padding` requests "same"-style symmetric padding, which
    requires an odd kernel size.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 bias=False,
                 config_str='batchnorm-relu'):
        super(TDNNLayer, self).__init__()
        if padding < 0:
            assert kernel_size % 2 == 1, 'Expect equal paddings, but got even kernel size ({})'.format(
                kernel_size)
            # Symmetric padding that preserves the sequence length (stride=1).
            padding = (kernel_size - 1) // 2 * dilation
        self.linear = nn.Conv1d(in_channels,
                                out_channels,
                                kernel_size,
                                stride=stride,
                                padding=padding,
                                dilation=dilation,
                                bias=bias)
        # Norm/activation built from `config_str` (see get_nonlinear).
        self.nonlinear = get_nonlinear(config_str, out_channels)

    def forward(self, x):
        """Apply the convolution, then the nonlinearity, to (B, C, T) input."""
        x = self.linear(x)
        x = self.nonlinear(x)
        return x
| | | |
| | | |
class CAMLayer(nn.Module):
    """Context-aware masking (CAM) convolution layer.

    Computes a local Conv1d response and gates it with a sigmoid mask
    derived from global (mean) context plus coarse segment-pooled context.
    """

    def __init__(self,
                 bn_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 padding,
                 dilation,
                 bias,
                 reduction=2):
        super(CAMLayer, self).__init__()
        self.linear_local = nn.Conv1d(bn_channels,
                                      out_channels,
                                      kernel_size,
                                      stride=stride,
                                      padding=padding,
                                      dilation=dilation,
                                      bias=bias)
        # Bottleneck (1x1 convs) that maps context to a per-channel mask.
        self.linear1 = nn.Conv1d(bn_channels, bn_channels // reduction, 1)
        self.relu = nn.ReLU(inplace=True)
        self.linear2 = nn.Conv1d(bn_channels // reduction, out_channels, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the masked convolution output for (B, C, T) input."""
        y = self.linear_local(x)
        # Global mean context plus coarse per-segment context.
        context = x.mean(-1, keepdim=True) + self.seg_pooling(x)
        context = self.relu(self.linear1(context))
        m = self.sigmoid(self.linear2(context))
        return y * m

    def seg_pooling(self, x, seg_len=100, stype='avg'):
        """Pool (B, C, T) over fixed-length segments, broadcast back to T frames."""
        if stype == 'avg':
            seg = F.avg_pool1d(x, kernel_size=seg_len, stride=seg_len, ceil_mode=True)
        elif stype == 'max':
            seg = F.max_pool1d(x, kernel_size=seg_len, stride=seg_len, ceil_mode=True)
        else:
            raise ValueError('Wrong segment pooling type.')
        shape = seg.shape
        # Repeat each pooled value seg_len times, then trim to the input length.
        seg = seg.unsqueeze(-1).expand(*shape, seg_len).reshape(*shape[:-1], -1)
        seg = seg[..., :x.shape[-1]]
        return seg
| | | |
| | | |
class CAMDenseTDNNLayer(nn.Module):
    """Dense TDNN layer: 1x1 bottleneck followed by a CAM convolution.

    With `memory_efficient=True`, the bottleneck is recomputed during the
    backward pass via gradient checkpointing (training only).
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 bn_channels,
                 kernel_size,
                 stride=1,
                 dilation=1,
                 bias=False,
                 config_str='batchnorm-relu',
                 memory_efficient=False):
        super(CAMDenseTDNNLayer, self).__init__()
        assert kernel_size % 2 == 1, 'Expect equal paddings, but got even kernel size ({})'.format(
            kernel_size)
        # "Same"-style padding keeps the sequence length (stride=1).
        padding = (kernel_size - 1) // 2 * dilation
        self.memory_efficient = memory_efficient
        self.nonlinear1 = get_nonlinear(config_str, in_channels)
        self.linear1 = nn.Conv1d(in_channels, bn_channels, 1, bias=False)
        self.nonlinear2 = get_nonlinear(config_str, bn_channels)
        self.cam_layer = CAMLayer(bn_channels,
                                  out_channels,
                                  kernel_size,
                                  stride=stride,
                                  padding=padding,
                                  dilation=dilation,
                                  bias=bias)

    def bn_function(self, x):
        """Pre-activation bottleneck: nonlinearity then 1x1 conv."""
        return self.linear1(self.nonlinear1(x))

    def forward(self, x):
        if self.training and self.memory_efficient:
            # Trade compute for memory: recompute the bottleneck on backward.
            x = cp.checkpoint(self.bn_function, x)
        else:
            x = self.bn_function(x)
        x = self.cam_layer(self.nonlinear2(x))
        return x
| | | |
| | | |
class CAMDenseTDNNBlock(nn.ModuleList):
    """DenseNet-style block of CAM-TDNN layers.

    Layer i consumes the concatenation of the block input and all previous
    layer outputs, so its input width is `in_channels + i * out_channels`.
    """

    def __init__(self,
                 num_layers,
                 in_channels,
                 out_channels,
                 bn_channels,
                 kernel_size,
                 stride=1,
                 dilation=1,
                 bias=False,
                 config_str='batchnorm-relu',
                 memory_efficient=False):
        super(CAMDenseTDNNBlock, self).__init__()
        for i in range(num_layers):
            layer = CAMDenseTDNNLayer(in_channels=in_channels + i * out_channels,
                                      out_channels=out_channels,
                                      bn_channels=bn_channels,
                                      kernel_size=kernel_size,
                                      stride=stride,
                                      dilation=dilation,
                                      bias=bias,
                                      config_str=config_str,
                                      memory_efficient=memory_efficient)
            self.add_module('tdnnd%d' % (i + 1), layer)

    def forward(self, x):
        # Dense connectivity: append each layer's output to the running input.
        for layer in self:
            x = torch.cat([x, layer(x)], dim=1)
        return x
| | | |
| | | |
class TransitLayer(nn.Module):
    """Transition layer: pre-activation nonlinearity followed by a 1x1 conv,
    typically used to reduce the channel count between dense blocks.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 bias=True,
                 config_str='batchnorm-relu'):
        super(TransitLayer, self).__init__()
        self.nonlinear = get_nonlinear(config_str, in_channels)
        self.linear = nn.Conv1d(in_channels, out_channels, 1, bias=bias)

    def forward(self, x):
        """Apply the nonlinearity, then the 1x1 channel projection."""
        return self.linear(self.nonlinear(x))
| | | |
| | | |
class DenseLayer(nn.Module):
    """1x1 conv followed by the configured nonlinearity.

    Also accepts 2D (B, C) input by temporarily adding and removing a
    singleton time dimension around the convolution.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 bias=False,
                 config_str='batchnorm-relu'):
        super(DenseLayer, self).__init__()
        self.linear = nn.Conv1d(in_channels, out_channels, 1, bias=bias)
        self.nonlinear = get_nonlinear(config_str, out_channels)

    def forward(self, x):
        if x.dim() == 2:
            # (B, C) -> (B, C, 1) for Conv1d, then squeeze back.
            x = self.linear(x.unsqueeze(dim=-1)).squeeze(dim=-1)
        else:
            x = self.linear(x)
        return self.nonlinear(x)
| | | |
| | | |
class BasicResBlock(nn.Module):
    """Basic 2D residual block.

    The stride is applied to the first spatial axis only (stride tuple
    ``(stride, 1)``), so the second axis is never downsampled.
    """

    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(BasicResBlock, self).__init__()
        self.conv1 = nn.Conv2d(
            in_planes, planes, kernel_size=3, stride=(stride, 1),
            padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(
            planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        # Identity shortcut unless the shape changes; then a 1x1 projection.
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(
                    in_planes, self.expansion * planes, kernel_size=1,
                    stride=(stride, 1), bias=False),
                nn.BatchNorm2d(self.expansion * planes))

    def forward(self, x):
        residual = self.shortcut(x)
        y = F.relu(self.bn1(self.conv1(x)))
        y = self.bn2(self.conv2(y))
        return F.relu(y + residual)
| | |
| | | """ Some implementations are adapted from https://github.com/yuyq96/D-TDNN |
| | | """ |
| | | |
| | | import io |
| | | from typing import Union |
| | | |
| | | import librosa as sf |
| | | import numpy as np |
| | | import torch |
| | | import torch.nn.functional as F |
| | | import torch.utils.checkpoint as cp |
| | | import torchaudio.compliance.kaldi as Kaldi |
| | | from torch import nn |
| | | |
| | | import io |
| | | import os |
| | | from typing import Any, Dict, List, Union |
| | | |
| | | import numpy as np |
| | | import librosa as sf |
| | | import torch |
| | | import torchaudio |
| | | import logging |
| | | from funasr.utils.modelscope_file import File |
| | | from collections import OrderedDict |
| | | import torchaudio.compliance.kaldi as Kaldi |
| | | |
| | | |
| | | def check_audio_list(audio: list): |
| | |
| | | return segs |
| | | |
| | | |
class BasicResBlock(nn.Module):
    """Basic 2D residual block used by the CAM++ front-end (FCM).

    The stride is applied to the first spatial axis only (stride tuple
    ``(stride, 1)``), so the second axis is never downsampled.
    """

    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(BasicResBlock, self).__init__()
        self.conv1 = nn.Conv2d(
            in_planes,
            planes,
            kernel_size=3,
            stride=(stride, 1),
            padding=1,
            bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(
            planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        # Identity shortcut unless the shape changes; then a 1x1 projection.
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(
                    in_planes,
                    self.expansion * planes,
                    kernel_size=1,
                    stride=(stride, 1),
                    bias=False), nn.BatchNorm2d(self.expansion * planes))

    def forward(self, x):
        """Conv-BN-ReLU, Conv-BN, residual add, final ReLU."""
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out
| | | |
| | | |
class FCM(nn.Module):
    """Front-end convolution module (FCM) of CAM++.

    A small 2D residual stack that downsamples the feature axis by 8x and
    then folds the channel and (reduced) feature axes together, producing a
    (B, m_channels * feat_dim // 8, T) tensor for the TDNN trunk.

    Args:
        block: residual block class used for both stages.
        num_blocks: number of residual blocks in each of the two stages.
        m_channels: channel width used throughout the module.
        feat_dim: input feature dimension (e.g. 80 fbank bins).
    """

    def __init__(self,
                 block=BasicResBlock,
                 num_blocks=(2, 2),
                 m_channels=32,
                 feat_dim=80):
        super(FCM, self).__init__()
        self.in_planes = m_channels
        self.conv1 = nn.Conv2d(
            1, m_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(m_channels)

        self.layer1 = self._make_layer(
            block, m_channels, num_blocks[0], stride=2)
        # Fix: layer2 previously reused num_blocks[0], silently ignoring
        # num_blocks[1]. Behavior is identical for the default (2, 2).
        self.layer2 = self._make_layer(
            block, m_channels, num_blocks[1], stride=2)

        self.conv2 = nn.Conv2d(
            m_channels,
            m_channels,
            kernel_size=3,
            stride=(2, 1),
            padding=1,
            bias=False)
        self.bn2 = nn.BatchNorm2d(m_channels)
        # Output width after flattening: feature axis shrinks 2*2*2 = 8x.
        self.out_channels = m_channels * (feat_dim // 8)

    def _make_layer(self, block, planes, num_blocks, stride):
        """Stack `num_blocks` blocks; only the first block uses `stride`."""
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        """(B, F, T) features -> (B, m_channels * F // 8, T)."""
        x = x.unsqueeze(1)
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = F.relu(self.bn2(self.conv2(out)))

        shape = out.shape
        # Merge channel and reduced-frequency axes into one.
        out = out.reshape(shape[0], shape[1] * shape[2], shape[3])
        return out
| | | |
| | | |
class CAMPPlus(nn.Module):
    """CAM++ speaker-embedding network: FCM front-end + dense CAM-TDNN trunk.

    Args:
        feat_dim: input feature dimension per frame.
        embedding_size: output embedding dimension (segment level).
        growth_rate: channels added by each dense layer.
        bn_size: bottleneck width multiplier inside dense layers.
        init_channels: trunk width after the initial TDNN layer.
        config_str: norm/activation spec passed to get_nonlinear.
        memory_efficient: checkpoint dense-layer bottlenecks during training.
        output_level: 'segment' for pooled embeddings, 'frame' for
            frame-level features.
    """

    def __init__(self,
                 feat_dim=80,
                 embedding_size=192,
                 growth_rate=32,
                 bn_size=4,
                 init_channels=128,
                 config_str='batchnorm-relu',
                 memory_efficient=True,
                 output_level='segment'):
        super(CAMPPlus, self).__init__()

        self.head = FCM(feat_dim=feat_dim)
        channels = self.head.out_channels
        self.output_level = output_level

        # Initial TDNN: kernel 5, stride 2, "same"-style padding (padding=-1).
        self.xvector = nn.Sequential(
            OrderedDict([
                ('tdnn',
                 TDNNLayer(
                     channels,
                     init_channels,
                     5,
                     stride=2,
                     dilation=1,
                     padding=-1,
                     config_str=config_str)),
            ]))
        channels = init_channels
        # Three dense blocks (depths 12/24/16, dilations 1/2/2), each
        # followed by a transition layer that halves the channel count.
        for i, (num_layers, kernel_size, dilation) in enumerate(
                zip((12, 24, 16), (3, 3, 3), (1, 2, 2))):
            block = CAMDenseTDNNBlock(
                num_layers=num_layers,
                in_channels=channels,
                out_channels=growth_rate,
                bn_channels=bn_size * growth_rate,
                kernel_size=kernel_size,
                dilation=dilation,
                config_str=config_str,
                memory_efficient=memory_efficient)
            self.xvector.add_module('block%d' % (i + 1), block)
            channels = channels + num_layers * growth_rate
            self.xvector.add_module(
                'transit%d' % (i + 1),
                TransitLayer(
                    channels, channels // 2, bias=False,
                    config_str=config_str))
            channels //= 2

        self.xvector.add_module('out_nonlinear',
                                get_nonlinear(config_str, channels))

        if self.output_level == 'segment':
            # Mean+std pooling doubles the channel count before the embedding.
            self.xvector.add_module('stats', StatsPool())
            self.xvector.add_module(
                'dense',
                DenseLayer(
                    channels * 2, embedding_size, config_str='batchnorm_'))
        else:
            assert self.output_level == 'frame', '`output_level` should be set to \'segment\' or \'frame\'. '

        # Kaiming initialization for conv/linear weights.
        for m in self.modules():
            if isinstance(m, (nn.Conv1d, nn.Linear)):
                nn.init.kaiming_normal_(m.weight.data)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

    def forward(self, x):
        """(B, T, F) features -> (B, embedding_size) or frame-level output."""
        x = x.permute(0, 2, 1)  # (B,T,F) => (B,F,T)
        x = self.head(x)
        x = self.xvector(x)
        if self.output_level == 'frame':
            # Return (B, T', C) for frame-level consumers.
            x = x.transpose(1, 2)
        return x
| | | |
| | | |
def get_nonlinear(config_str, channels):
    """Build an nn.Sequential from a '-'-separated specification string.

    Supported tokens: 'relu', 'prelu', 'batchnorm', and 'batchnorm_'
    (the trailing underscore selects a non-affine BatchNorm1d, registered
    under the module name 'batchnorm').

    Raises:
        ValueError: if the spec contains an unrecognized token.
    """
    stack = nn.Sequential()
    for token in config_str.split('-'):
        if token == 'relu':
            stack.add_module('relu', nn.ReLU(inplace=True))
        elif token == 'prelu':
            stack.add_module('prelu', nn.PReLU(channels))
        elif token == 'batchnorm':
            stack.add_module('batchnorm', nn.BatchNorm1d(channels))
        elif token == 'batchnorm_':
            stack.add_module('batchnorm',
                             nn.BatchNorm1d(channels, affine=False))
        else:
            raise ValueError('Unexpected module ({}).'.format(token))
    return stack
| | | |
| | | |
def statistics_pooling(x, dim=-1, keepdim=False, unbiased=True, eps=1e-2):
    """Return [mean ‖ std] of `x` pooled along `dim`.

    `eps` is accepted for interface compatibility but is currently unused.
    With `keepdim=True` the pooled dimension is re-inserted in the output.
    """
    mu = x.mean(dim=dim)
    sigma = x.std(dim=dim, unbiased=unbiased)
    stats = torch.cat([mu, sigma], dim=-1)
    if keepdim:
        return stats.unsqueeze(dim=dim)
    return stats
| | | |
| | | |
class StatsPool(nn.Module):
    """Module wrapper around `statistics_pooling` (mean + std over time)."""

    def forward(self, x):
        # Pools over the last dimension; output size doubles (mean ‖ std).
        return statistics_pooling(x)
| | | |
| | | |
class TDNNLayer(nn.Module):
    """1D TDNN layer: Conv1d followed by the configured norm/activation stack.

    A negative `padding` requests "same"-style symmetric padding, which
    requires an odd kernel size.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 bias=False,
                 config_str='batchnorm-relu'):
        super(TDNNLayer, self).__init__()
        if padding < 0:
            assert kernel_size % 2 == 1, 'Expect equal paddings, but got even kernel size ({})'.format(
                kernel_size)
            # Symmetric padding that preserves the sequence length (stride=1).
            padding = (kernel_size - 1) // 2 * dilation
        self.linear = nn.Conv1d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=bias)
        # Norm/activation built from `config_str` (see get_nonlinear).
        self.nonlinear = get_nonlinear(config_str, out_channels)

    def forward(self, x):
        """Apply the convolution, then the nonlinearity, to (B, C, T) input."""
        x = self.linear(x)
        x = self.nonlinear(x)
        return x
| | | |
| | | |
| | | def extract_feature(audio): |
| | | features = [] |
| | | for au in audio: |
| | |
| | | features = torch.cat(features) |
| | | return features |
| | | |
| | | |
class CAMLayer(nn.Module):
    """Context-aware masking (CAM) convolution layer.

    Computes a local Conv1d response and gates it with a sigmoid mask
    derived from global (mean) context plus coarse segment-pooled context.
    """

    def __init__(self,
                 bn_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 padding,
                 dilation,
                 bias,
                 reduction=2):
        super(CAMLayer, self).__init__()
        self.linear_local = nn.Conv1d(
            bn_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=bias)
        # Bottleneck (1x1 convs) that maps context to a per-channel mask.
        self.linear1 = nn.Conv1d(bn_channels, bn_channels // reduction, 1)
        self.relu = nn.ReLU(inplace=True)
        self.linear2 = nn.Conv1d(bn_channels // reduction, out_channels, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the masked convolution output for (B, C, T) input."""
        y = self.linear_local(x)
        # Global mean context plus coarse per-segment context.
        context = x.mean(-1, keepdim=True) + self.seg_pooling(x)
        context = self.relu(self.linear1(context))
        m = self.sigmoid(self.linear2(context))
        return y * m

    def seg_pooling(self, x, seg_len=100, stype='avg'):
        """Pool (B, C, T) over fixed-length segments, broadcast back to T frames."""
        if stype == 'avg':
            seg = F.avg_pool1d(
                x, kernel_size=seg_len, stride=seg_len, ceil_mode=True)
        elif stype == 'max':
            seg = F.max_pool1d(
                x, kernel_size=seg_len, stride=seg_len, ceil_mode=True)
        else:
            raise ValueError('Wrong segment pooling type.')
        shape = seg.shape
        # Repeat each pooled value seg_len times, then trim to the input length.
        seg = seg.unsqueeze(-1).expand(*shape,
                                       seg_len).reshape(*shape[:-1], -1)
        seg = seg[..., :x.shape[-1]]
        return seg
| | | |
| | | |
class CAMDenseTDNNLayer(nn.Module):
    """Dense TDNN layer: 1x1 bottleneck followed by a CAM convolution.

    With `memory_efficient=True`, the bottleneck is recomputed during the
    backward pass via gradient checkpointing (training only).
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 bn_channels,
                 kernel_size,
                 stride=1,
                 dilation=1,
                 bias=False,
                 config_str='batchnorm-relu',
                 memory_efficient=False):
        super(CAMDenseTDNNLayer, self).__init__()
        assert kernel_size % 2 == 1, 'Expect equal paddings, but got even kernel size ({})'.format(
            kernel_size)
        # "Same"-style padding keeps the sequence length (stride=1).
        padding = (kernel_size - 1) // 2 * dilation
        self.memory_efficient = memory_efficient
        self.nonlinear1 = get_nonlinear(config_str, in_channels)
        self.linear1 = nn.Conv1d(in_channels, bn_channels, 1, bias=False)
        self.nonlinear2 = get_nonlinear(config_str, bn_channels)
        self.cam_layer = CAMLayer(
            bn_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=bias)

    def bn_function(self, x):
        """Pre-activation bottleneck: nonlinearity then 1x1 conv."""
        return self.linear1(self.nonlinear1(x))

    def forward(self, x):
        if self.training and self.memory_efficient:
            # Trade compute for memory: recompute the bottleneck on backward.
            x = cp.checkpoint(self.bn_function, x)
        else:
            x = self.bn_function(x)
        x = self.cam_layer(self.nonlinear2(x))
        return x
| | | |
| | | |
class CAMDenseTDNNBlock(nn.ModuleList):
    """DenseNet-style block of CAM-TDNN layers.

    Layer i consumes the concatenation of the block input and all previous
    layer outputs, so its input width is `in_channels + i * out_channels`.
    """

    def __init__(self,
                 num_layers,
                 in_channels,
                 out_channels,
                 bn_channels,
                 kernel_size,
                 stride=1,
                 dilation=1,
                 bias=False,
                 config_str='batchnorm-relu',
                 memory_efficient=False):
        super(CAMDenseTDNNBlock, self).__init__()
        for i in range(num_layers):
            layer = CAMDenseTDNNLayer(
                in_channels=in_channels + i * out_channels,
                out_channels=out_channels,
                bn_channels=bn_channels,
                kernel_size=kernel_size,
                stride=stride,
                dilation=dilation,
                bias=bias,
                config_str=config_str,
                memory_efficient=memory_efficient)
            self.add_module('tdnnd%d' % (i + 1), layer)

    def forward(self, x):
        # Dense connectivity: append each layer's output to the running input.
        for layer in self:
            x = torch.cat([x, layer(x)], dim=1)
        return x
| | | |
| | | |
class TransitLayer(nn.Module):
    """Transition layer: pre-activation nonlinearity followed by a 1x1 conv,
    typically used to reduce the channel count between dense blocks.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 bias=True,
                 config_str='batchnorm-relu'):
        super(TransitLayer, self).__init__()
        self.nonlinear = get_nonlinear(config_str, in_channels)
        self.linear = nn.Conv1d(in_channels, out_channels, 1, bias=bias)

    def forward(self, x):
        """Apply the nonlinearity, then the 1x1 channel projection."""
        x = self.nonlinear(x)
        x = self.linear(x)
        return x
| | | |
| | | |
class DenseLayer(nn.Module):
    """1x1 conv followed by the configured nonlinearity.

    Also accepts 2D (B, C) input by temporarily adding and removing a
    singleton time dimension around the convolution.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 bias=False,
                 config_str='batchnorm-relu'):
        super(DenseLayer, self).__init__()
        self.linear = nn.Conv1d(in_channels, out_channels, 1, bias=bias)
        self.nonlinear = get_nonlinear(config_str, out_channels)

    def forward(self, x):
        if len(x.shape) == 2:
            # (B, C) -> (B, C, 1) for Conv1d, then squeeze back.
            x = self.linear(x.unsqueeze(dim=-1)).squeeze(dim=-1)
        else:
            x = self.linear(x)
        x = self.nonlinear(x)
        return x
| | | |
| | | def postprocess(segments: list, vad_segments: list, |
| | | labels: np.ndarray, embeddings: np.ndarray) -> list: |
| | |
| | | sentence_spk = spk |
| | | d['spk'] = sentence_spk |
| | | sd_sentence_list.append(d) |
| | | return sd_sentence_list |
| | | return sd_sentence_list |
| | |
| | | public static final int CHUNK_INTERVAL = 10; |
| | | public static final int SEND_SIZE = 1920; |
| | | // 热词 |
| | | private String hotWords = "阿里巴巴 达摩院 夜雨飘零"; |
| | | private String hotWords = "阿里巴巴 20\n达摩院 20\n夜雨飘零 20\n"; |
| | | // 采样率 |
| | | public static final int SAMPLE_RATE = 16000; |
| | | // 声道数 |
| | |
| | | } |
| | | }); |
| | | String message = getMessage(true); |
| | | Log.d(TAG, "WebSocket发送消息: " + message); |
| | | webSocket.send(message); |
| | | |
| | | audioRecord.startRecording(); |
| | |
| | | obj.put("chunk_interval", CHUNK_INTERVAL); |
| | | obj.put("wav_name", "default"); |
| | | if (!hotWords.equals("")) { |
| | | obj.put("hotwords", hotWords); |
| | | JSONObject hotwordsJSON = new JSONObject(); |
| | | // 分割每一行字符串 |
| | | String[] hotWordsList = hotWords.split("\n"); |
| | | for (String s : hotWordsList) { |
| | | if (s.equals("")) { |
| | | Log.w(TAG, "hotWords为空"); |
| | | continue; |
| | | } |
| | | // 按照空格分割字符串 |
| | | String[] hotWordsArray = s.split(" "); |
| | | if (hotWordsArray.length != 2) { |
| | | Log.w(TAG, "hotWords格式不正确"); |
| | | continue; |
| | | } |
| | | hotwordsJSON.put(hotWordsArray[0], Integer.valueOf(hotWordsArray[1])); |
| | | } |
| | | obj.put("hotwords", hotwordsJSON.toString()); |
| | | } |
| | | obj.put("wav_format", "pcm"); |
| | | obj.put("is_speaking", isSpeaking); |
| | |
| | | android:background="@drawable/edittext_border" |
| | | android:minLines="3" |
| | | android:gravity="top" |
| | | android:hint="每个热词用空格隔开" /> |
| | | android:hint="每一行为:热词 权重" /> |
| | | |
| | | </LinearLayout> |
| | |
| | | --punc-dir damo/punc_ct-transformer_cn-en-common-vocab471067-large-onnx \ |
| | | --itn-dir thuduj12/fst_itn_zh \ |
| | | --lm-dir damo/speech_ngram_lm_zh-cn-ai-wesp-fst \ |
| | | --decoder-thread-num 32 \ |
| | | --io-thread-num 8 \ |
| | | --port 10095 \ |
| | | --certfile ../../../ssl_key/server.crt \ |
| | | --keyfile ../../../ssl_key/server.key \ |
| | | --hotword ../../hotwords.txt > log.out 2>&1 & |
| | |
| | | --punc-quant: True for quantized PUNC model, False for non-quantized PUNC model. Default is True. |
| | | --itn-dir modelscope model ID or local model path. |
| | | --port: Port number that the server listens on. Default is 10095. |
| | | --decoder-thread-num: The number of thread pools on the server side that can handle concurrent requests. The default value is 8. |
| | | --decoder-thread-num: The number of thread pools on the server side that can handle concurrent requests. |
| | | The script will automatically configure parameters decoder-thread-num and io-thread-num based on the server's thread count. |
| | | --io-thread-num: Number of IO threads that the server starts. |
| | | --model-thread-num: The number of internal threads for each recognition route to control the parallelism of the ONNX model. |
| | | The default value is 1. It is recommended that decoder-thread-num * model-thread-num equals the total number of threads. |
| | | --io-thread-num: Number of IO threads that the server starts. Default is 1. |
--certfile <string>: SSL certificate file. Default is ../../../ssl_key/server.crt. To disable SSL, set this to 0.
| | | --keyfile <string>: SSL key file. Default is ../../../ssl_key/server.key. |
--hotword: Hotword file path, one hotword per line (e.g. 阿里巴巴 20). If the client also provides hotwords, they are combined with the hotwords from this file.
| | |
| | | --model-dir damo/speech_paraformer-large_asr_nat-en-16k-common-vocab10020-onnx \ |
| | | --vad-dir damo/speech_fsmn_vad_zh-cn-16k-common-onnx \ |
| | | --punc-dir damo/punc_ct-transformer_cn-en-common-vocab471067-large-onnx \ |
| | | --decoder-thread-num 32 \ |
| | | --io-thread-num 8 \ |
| | | --port 10095 \ |
| | | --certfile ../../../ssl_key/server.crt \ |
| | | --keyfile ../../../ssl_key/server.key > log.out 2>&1 & |
| | | ``` |
| | |
| | | --punc-quant: True for quantized PUNC model, False for non-quantized PUNC model. Default is True. |
| | | --itn-dir modelscope model ID or local model path. |
| | | --port: Port number that the server listens on. Default is 10095. |
| | | --decoder-thread-num: The number of thread pools on the server side that can handle concurrent requests. The default value is 8. |
| | | --decoder-thread-num: The number of thread pools on the server side that can handle concurrent requests. |
| | | The script will automatically configure parameters decoder-thread-num and io-thread-num based on the server's thread count. |
| | | --io-thread-num: Number of IO threads that the server starts. |
| | | --model-thread-num: The number of internal threads for each recognition route to control the parallelism of the ONNX model. |
| | | The default value is 1. It is recommended that decoder-thread-num * model-thread-num equals the total number of threads. |
| | | --io-thread-num: Number of IO threads that the server starts. Default is 1. |
| | | --certfile <string>: SSL certificate file. Default is ../../../ssl_key/server.crt. If you want to disable SSL, set this to 0. |
| | | --keyfile <string>: SSL key file. Default is ../../../ssl_key/server.key. |
| | | ``` |
| | |
| | | --model-dir damo/speech_paraformer-large_asr_nat-en-16k-common-vocab10020-onnx \ |
| | | --vad-dir damo/speech_fsmn_vad_zh-cn-16k-common-onnx \ |
| | | --punc-dir damo/punc_ct-transformer_cn-en-common-vocab471067-large-onnx \ |
| | | --decoder-thread-num 32 \ |
| | | --io-thread-num 8 \ |
| | | --port 10095 \ |
| | | --certfile ../../../ssl_key/server.crt \ |
| | | --keyfile ../../../ssl_key/server.key > log.out 2>&1 & |
| | | ``` |
| | |
| | | --punc-quant True为量化PUNC模型,False为非量化PUNC模型,默认是True |
| | | --itn-dir modelscope model ID 或者 本地模型路径 |
| | | --port 服务端监听的端口号,默认为 10095 |
| | | --decoder-thread-num 服务端线程池个数(支持的最大并发路数),默认为 8 |
| | | --decoder-thread-num 服务端线程池个数(支持的最大并发路数), |
| | | 脚本会根据服务器线程数自动配置decoder-thread-num、io-thread-num |
| | | --io-thread-num 服务端启动的IO线程数 |
| | | --model-thread-num 每路识别的内部线程数(控制ONNX模型的并行),默认为 1, |
| | | 其中建议 decoder-thread-num*model-thread-num 等于总线程数 |
| | | --io-thread-num 服务端启动的IO线程数,默认为 1 |
| | | --certfile ssl的证书文件,默认为:../../../ssl_key/server.crt,如果需要关闭ssl,参数设置为0 |
| | | --keyfile ssl的密钥文件,默认为:../../../ssl_key/server.key |
| | | ``` |
| | |
| | | --model-dir damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-onnx \ |
| | | --vad-dir damo/speech_fsmn_vad_zh-cn-16k-common-onnx \ |
| | | --punc-dir damo/punc_ct-transformer_cn-en-common-vocab471067-large-onnx \ |
| | | --lm-dir damo/speech_ngram_lm_zh-cn-ai-wesp-fst \ |
| | | --itn-dir thuduj12/fst_itn_zh \ |
| | | --decoder-thread-num 32 \ |
| | | --io-thread-num 8 \ |
| | | --port 10095 \ |
| | | --certfile ../../../ssl_key/server.crt \ |
| | | --keyfile ../../../ssl_key/server.key \ |
| | | --hotword ../../hotwords.txt > log.out 2>&1 & |
| | |
| | | --lm-dir modelscope model ID 或者 本地模型路径 |
| | | --itn-dir modelscope model ID 或者 本地模型路径 |
| | | --port 服务端监听的端口号,默认为 10095 |
| | | --decoder-thread-num 服务端线程池个数(支持的最大并发路数),默认为 8 |
| | | --decoder-thread-num 服务端线程池个数(支持的最大并发路数), |
| | | 脚本会根据服务器线程数自动配置decoder-thread-num、io-thread-num |
| | | --io-thread-num 服务端启动的IO线程数 |
| | | --model-thread-num 每路识别的内部线程数(控制ONNX模型的并行),默认为 1, |
| | | 其中建议 decoder-thread-num*model-thread-num 等于总线程数 |
| | | --io-thread-num 服务端启动的IO线程数,默认为 1 |
| | | --certfile ssl的证书文件,默认为:../../../ssl_key/server.crt,如果需要关闭ssl,参数设置为0 |
| | | --keyfile ssl的密钥文件,默认为:../../../ssl_key/server.key |
| | | --hotword 热词文件路径,每行一个热词,格式:热词 权重(例如:阿里巴巴 20), |
| | |
| | | --vad-dir damo/speech_fsmn_vad_zh-cn-16k-common-onnx \ |
| | | --punc-dir damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727-onnx \ |
| | | --itn-dir thuduj12/fst_itn_zh \ |
| | | --decoder-thread-num 32 \ |
| | | --io-thread-num 8 \ |
| | | --port 10095 \ |
| | | --certfile ../../../ssl_key/server.crt \ |
| | | --keyfile ../../../ssl_key/server.key \ |
| | | --hotword ../../hotwords.txt > log.out 2>&1 & |
| | |
| | | --punc-quant: True for quantized PUNC model, False for non-quantized PUNC model. Default is True. |
| | | --itn-dir modelscope model ID or local model path. |
| | | --port: Port number that the server listens on. Default is 10095. |
| | | --decoder-thread-num: The number of thread pools on the server side that can handle concurrent requests. The default value is 8. |
| | | --decoder-thread-num: The number of thread pools on the server side that can handle concurrent requests. |
| | | The script will automatically configure parameters decoder-thread-num and io-thread-num based on the server's thread count. |
| | | --io-thread-num: Number of IO threads that the server starts. |
| | | --model-thread-num: The number of internal threads for each recognition route to control the parallelism of the ONNX model. |
| | | The default value is 1. It is recommended that decoder-thread-num * model-thread-num equals the total number of threads. |
| | | --io-thread-num: Number of IO threads that the server starts. Default is 1. |
| | | --certfile <string>: SSL certificate file. Default is ../../../ssl_key/server.crt. If you want to disable SSL, set this to 0. |
| | | --keyfile <string>: SSL key file. Default is ../../../ssl_key/server.key. |
| | | --hotword: Hotword file path, one hotword per line (e.g.: 阿里巴巴 20); if the client also provides hotwords, they are combined with the hotwords in this file. |
| | |
| | | --vad-dir damo/speech_fsmn_vad_zh-cn-16k-common-onnx \ |
| | | --punc-dir damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727-onnx \ |
| | | --itn-dir thuduj12/fst_itn_zh \ |
| | | --decoder-thread-num 32 \ |
| | | --io-thread-num 8 \ |
| | | --port 10095 \ |
| | | --certfile ../../../ssl_key/server.crt \ |
| | | --keyfile ../../../ssl_key/server.key \ |
| | | --hotword ../../hotwords.txt > log.out 2>&1 & |
| | |
| | | --punc-quant True为量化PUNC模型,False为非量化PUNC模型,默认是True |
| | | --itn-dir modelscope model ID 或者 本地模型路径 |
| | | --port 服务端监听的端口号,默认为 10095 |
| | | --decoder-thread-num 服务端线程池个数(支持的最大并发路数),默认为 8 |
| | | --decoder-thread-num 服务端线程池个数(支持的最大并发路数), |
| | | 脚本会根据服务器线程数自动配置decoder-thread-num、io-thread-num |
| | | --io-thread-num 服务端启动的IO线程数 |
| | | --model-thread-num 每路识别的内部线程数(控制ONNX模型的并行),默认为 1, |
| | | 其中建议 decoder-thread-num*model-thread-num 等于总线程数 |
| | | --io-thread-num 服务端启动的IO线程数,默认为 1 |
| | | --certfile ssl的证书文件,默认为:../../../ssl_key/server.crt,如果需要关闭ssl,参数设置为0 |
| | | --keyfile ssl的密钥文件,默认为:../../../ssl_key/server.key |
| | | --hotword 热词文件路径,每行一个热词,格式:热词 权重(例如:阿里巴巴 20), |
| | |
| | | option(HAVE_SPECIAL "Build special" ON) |
| | | |
| | | set(WITH_GFLAGS OFF CACHE BOOL "whether build glog with gflags" FORCE) |
| | | set(HAVE_BIN ON CACHE BOOL "Build the fst binaries" FORCE) |
| | | set(HAVE_SCRIPT ON CACHE BOOL "Build the fstscript" FORCE) |
| | | if (WIN32) |
| | | set(HAVE_BIN OFF CACHE BOOL "Build the fst binaries" FORCE) |
| | | set(HAVE_SCRIPT OFF CACHE BOOL "Build the fstscript" FORCE) |
| | | else() |
| | | set(HAVE_BIN ON CACHE BOOL "Build the fst binaries" FORCE) |
| | | set(HAVE_SCRIPT ON CACHE BOOL "Build the fstscript" FORCE) |
| | | endif (WIN32) |
| | | |
| | | set(HAVE_COMPACT OFF CACHE BOOL "Build compact" FORCE) |
| | | set(HAVE_CONST OFF CACHE BOOL "Build const" FORCE) |
| | |
| | | punc_dir="damo/punc_ct-transformer_cn-en-common-vocab471067-large-onnx" |
| | | itn_dir="thuduj12/fst_itn_zh" |
| | | lm_dir="damo/speech_ngram_lm_zh-cn-ai-wesp-fst" |
| | | decoder_thread_num=32 |
| | | model_thread_num=1 |
| | | io_thread_num=8 |
| | | port=10095 |
| | | certfile="../../../ssl_key/server.crt" |
| | | keyfile="../../../ssl_key/server.key" |
| | | hotword="../../hotwords.txt" |
| | | # set decoder_thread_num |
| | | decoder_thread_num=$(cat /proc/cpuinfo | grep "processor"|wc -l) || { echo "Get cpuinfo failed. Set decoder_thread_num = 32"; decoder_thread_num=32; } |
| | | multiple_io=16 |
| | | io_thread_num=$(( (decoder_thread_num + multiple_io - 1) / multiple_io )) |
| | | model_thread_num=1 |
| | | |
| | | . ../egs/aishell/transformer/utils/parse_options.sh || exit 1; |
| | | |
| | | if [ -z "$certfile" ] || [ "$certfile" = "0" ]; then |
| | | certfile="" |
| | | keyfile="" |
| | | fi |
| | | |
| | | cd /workspace/FunASR/runtime/websocket/build/bin |
| | | if [ -z "$certfile" ] || [ "$certfile" -eq 0 ]; then |
| | | ./funasr-wss-server \ |
| | | --download-model-dir "${download_model_dir}" \ |
| | | --model-dir "${model_dir}" \ |
| | | --vad-dir "${vad_dir}" \ |
| | | --punc-dir "${punc_dir}" \ |
| | | --itn-dir "${itn_dir}" \ |
| | | --lm-dir "${lm_dir}" \ |
| | | --decoder-thread-num ${decoder_thread_num} \ |
| | | --io-thread-num ${io_thread_num} \ |
| | | --model-thread-num ${model_thread_num} \ |
| | | --port ${port} \ |
| | | --certfile "" \ |
| | | --keyfile "" \ |
| | | --hotword "${hotword}" |
| | | else |
| | | ./funasr-wss-server \ |
| | | --download-model-dir "${download_model_dir}" \ |
| | | --model-dir "${model_dir}" \ |
| | |
| | | --certfile "${certfile}" \ |
| | | --keyfile "${keyfile}" \ |
| | | --hotword "${hotword}" |
| | | fi |
| | | |
| | |
| | | vad_dir="damo/speech_fsmn_vad_zh-cn-16k-common-onnx" |
| | | punc_dir="damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727-onnx" |
| | | itn_dir="thuduj12/fst_itn_zh" |
| | | decoder_thread_num=32 |
| | | model_thread_num=1 |
| | | io_thread_num=8 |
| | | port=10095 |
| | | certfile="../../../ssl_key/server.crt" |
| | | keyfile="../../../ssl_key/server.key" |
| | | hotword="../../hotwords.txt" |
| | | # set decoder_thread_num |
| | | decoder_thread_num=$(cat /proc/cpuinfo | grep "processor"|wc -l) || { echo "Get cpuinfo failed. Set decoder_thread_num = 32"; decoder_thread_num=32; } |
| | | multiple_io=16 |
| | | io_thread_num=$(( (decoder_thread_num + multiple_io - 1) / multiple_io )) |
| | | model_thread_num=1 |
| | | |
| | | . ../egs/aishell/transformer/utils/parse_options.sh || exit 1; |
| | | |
| | | if [ -z "$certfile" ] || [ "$certfile" = "0" ]; then |
| | | certfile="" |
| | | keyfile="" |
| | | fi |
| | | |
| | | cd /workspace/FunASR/runtime/websocket/build/bin |
| | | if [ -z "$certfile" ] || [ "$certfile" -eq 0 ]; then |
| | | ./funasr-wss-server-2pass \ |
| | | --download-model-dir "${download_model_dir}" \ |
| | | --model-dir "${model_dir}" \ |
| | | --online-model-dir "${online_model_dir}" \ |
| | | --vad-dir "${vad_dir}" \ |
| | | --punc-dir "${punc_dir}" \ |
| | | --itn-dir "${itn_dir}" \ |
| | | --decoder-thread-num ${decoder_thread_num} \ |
| | | --model-thread-num ${model_thread_num} \ |
| | | --io-thread-num ${io_thread_num} \ |
| | | --port ${port} \ |
| | | --certfile "" \ |
| | | --keyfile "" \ |
| | | --hotword "${hotword}" |
| | | else |
| | | ./funasr-wss-server-2pass \ |
| | | --download-model-dir "${download_model_dir}" \ |
| | | --model-dir "${model_dir}" \ |
| | |
| | | --certfile "${certfile}" \ |
| | | --keyfile "${keyfile}" \ |
| | | --hotword "${hotword}" |
| | | fi |
| | | |
| | |
| | | option(ENABLE_PORTAUDIO "Whether to build portaudio" ON) |
| | | |
| | | if(WIN32) |
| | | file(REMOVE ${PROJECT_SOURCE_DIR}/../onnxruntime/third_party/glog/src/config.h |
| | | ${PROJECT_SOURCE_DIR}/../onnxruntime/third_party/glog/src/glog/export.h |
| | | ${PROJECT_SOURCE_DIR}/../onnxruntime/third_party/glog/src/glog/logging.h |
| | | ${PROJECT_SOURCE_DIR}/../onnxruntime/third_party/glog/src/glog/raw_logging.h |
| | | ${PROJECT_SOURCE_DIR}/../onnxruntime/third_party/glog/src/glog/stl_logging.h |
| | | ${PROJECT_SOURCE_DIR}/../onnxruntime/third_party/glog/src/glog/vlog_is_on.h) |
| | | else() |
| | | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -fPIC") |
| | | endif() |
| | |
| | | try { |
| | | google::InitGoogleLogging(argv[0]); |
| | | FLAGS_logtostderr = true; |
| | | |
| | | TCLAP::CmdLine cmd("funasr-wss-server", ' ', "1.0"); |
| | | std::string tpass_version = ""; |
| | | #ifdef _WIN32 |
| | | tpass_version = "0.1.0"; |
| | | #endif |
| | | TCLAP::CmdLine cmd("funasr-wss-server", ' ', tpass_version); |
| | | TCLAP::ValueArg<std::string> download_model_dir( |
| | | "", "download-model-dir", |
| | | "Download model from Modelscope to download_model_dir", false, |
| | |
| | | "0.0.0.0", "string"); |
| | | TCLAP::ValueArg<int> port("", "port", "port", false, 10095, "int"); |
| | | TCLAP::ValueArg<int> io_thread_num("", "io-thread-num", "io thread num", |
| | | false, 8, "int"); |
| | | false, 2, "int"); |
| | | TCLAP::ValueArg<int> decoder_thread_num( |
| | | "", "decoder-thread-num", "decoder thread num", false, 8, "int"); |
| | | TCLAP::ValueArg<int> model_thread_num("", "model-thread-num", |
| | | "model thread num", false, 4, "int"); |
| | | "model thread num", false, 2, "int"); |
| | | |
| | | TCLAP::ValueArg<std::string> certfile( |
| | | "", "certfile", |
| | |
| | | std::string down_asr_path; |
| | | std::string down_asr_model; |
| | | |
| | | size_t found = s_offline_asr_path.find("speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404"); |
| | | if (found != std::string::npos) { |
| | | model_path["offline-model-revision"]="v1.2.4"; |
| | | } else{ |
| | | found = s_offline_asr_path.find("speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404"); |
| | | if (found != std::string::npos) { |
| | | model_path["offline-model-revision"]="v1.0.5"; |
| | | } |
| | | } |
| | | |
| | | if (access(s_offline_asr_path.c_str(), F_OK) == 0) { |
| | | // local |
| | | python_cmd_asr = python_cmd + " --model-name " + s_offline_asr_path + |
| | |
| | | down_asr_path = s_offline_asr_path; |
| | | } |
| | | else { |
| | | size_t found = s_offline_asr_path.find("speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404"); |
| | | if (found != std::string::npos) { |
| | | model_path["offline-model-revision"]="v1.2.4"; |
| | | } else{ |
| | | found = s_offline_asr_path.find("speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404"); |
| | | if (found != std::string::npos) { |
| | | model_path["offline-model-revision"]="v1.0.5"; |
| | | } |
| | | } |
| | | |
| | | // modelscope |
| | | LOG(INFO) << "Download model: " << s_offline_asr_path |
| | | << " from modelscope : "; |
| | |
| | | |
| | | google::InitGoogleLogging(argv[0]); |
| | | FLAGS_logtostderr = true; |
| | | |
| | | TCLAP::CmdLine cmd("funasr-wss-server", ' ', "1.0"); |
| | | std::string offline_version = ""; |
| | | #ifdef _WIN32 |
| | | offline_version = "0.1.0"; |
| | | #endif |
| | | TCLAP::CmdLine cmd("funasr-wss-server", ' ', offline_version); |
| | | TCLAP::ValueArg<std::string> download_model_dir( |
| | | "", "download-model-dir", |
| | | "Download model from Modelscope to download_model_dir", |
| | |
| | | "0.0.0.0", "string"); |
| | | TCLAP::ValueArg<int> port("", "port", "port", false, 10095, "int"); |
| | | TCLAP::ValueArg<int> io_thread_num("", "io-thread-num", "io thread num", |
| | | false, 8, "int"); |
| | | false, 2, "int"); |
| | | TCLAP::ValueArg<int> decoder_thread_num( |
| | | "", "decoder-thread-num", "decoder thread num", false, 8, "int"); |
| | | TCLAP::ValueArg<int> model_thread_num("", "model-thread-num", |
| | |
| | | std::string down_asr_path; |
| | | std::string down_asr_model; |
| | | |
| | | // modify model-revision by model name |
| | | size_t found = s_asr_path.find("speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404"); |
| | | if (found != std::string::npos) { |
| | | model_path["model-revision"]="v1.2.4"; |
| | | } |
| | | |
| | | found = s_asr_path.find("speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404"); |
| | | if (found != std::string::npos) { |
| | | model_path["model-revision"]="v1.0.5"; |
| | | } |
| | | |
| | | found = s_asr_path.find("speech_paraformer-large_asr_nat-en-16k-common-vocab10020"); |
| | | if (found != std::string::npos) { |
| | | model_path["model-revision"]="v1.0.0"; |
| | | s_itn_path=""; |
| | | s_lm_path=""; |
| | | } |
| | | |
| | | if (access(s_asr_path.c_str(), F_OK) == 0){ |
| | | // local |
| | | python_cmd_asr = python_cmd + " --model-name " + s_asr_path + " --export-dir ./ " + " --model_revision " + model_path["model-revision"]; |
| | | down_asr_path = s_asr_path; |
| | | }else{ |
| | | size_t found = s_asr_path.find("speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404"); |
| | | if (found != std::string::npos) { |
| | | model_path["model-revision"]="v1.2.4"; |
| | | } |
| | | |
| | | found = s_asr_path.find("speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404"); |
| | | if (found != std::string::npos) { |
| | | model_path["model-revision"]="v1.0.5"; |
| | | } |
| | | |
| | | found = s_asr_path.find("speech_paraformer-large_asr_nat-en-16k-common-vocab10020"); |
| | | if (found != std::string::npos) { |
| | | model_path["model-revision"]="v1.0.0"; |
| | | s_itn_path=""; |
| | | s_lm_path=""; |
| | | } |
| | | |
| | | // modelscope |
| | | LOG(INFO) << "Download model: " << s_asr_path << " from modelscope: "; |
| | | python_cmd_asr = python_cmd + " --model-name " + s_asr_path + " --export-dir " + s_download_model_dir + " --model_revision " + model_path["model-revision"]; |
| | |
| | | std::string wav_format, |
| | | FUNASR_HANDLE& tpass_online_handle) { |
| | | // lock for each connection |
| | | scoped_lock guard(thread_lock); |
| | | if(!tpass_online_handle){ |
| | | scoped_lock guard(thread_lock); |
| | | LOG(INFO) << "tpass_online_handle is free, return"; |
| | | msg["access_num"]=(int)msg["access_num"]-1; |
| | | return; |
| | |
| | | hotwords_embedding, itn); |
| | | |
| | | } else { |
| | | scoped_lock guard(thread_lock); |
| | | msg["access_num"]=(int)msg["access_num"]-1; |
| | | return; |
| | | } |
| | | } catch (std::exception const& e) { |
| | | scoped_lock guard(thread_lock); |
| | | LOG(ERROR) << e.what(); |
| | | msg["access_num"]=(int)msg["access_num"]-1; |
| | | return; |
| | |
| | | wav_format, (ASR_TYPE)asr_mode_, |
| | | hotwords_embedding, itn); |
| | | } else { |
| | | scoped_lock guard(thread_lock); |
| | | msg["access_num"]=(int)msg["access_num"]-1; |
| | | return; |
| | | } |
| | | } catch (std::exception const& e) { |
| | | scoped_lock guard(thread_lock); |
| | | LOG(ERROR) << e.what(); |
| | | msg["access_num"]=(int)msg["access_num"]-1; |
| | | return; |
| | |
| | | } catch (std::exception const& e) { |
| | | std::cerr << "Error: " << e.what() << std::endl; |
| | | } |
| | | scoped_lock guard(thread_lock); |
| | | msg["access_num"]=(int)msg["access_num"]-1; |
| | | |
| | | } |
| | |
| | | int audio_fs, |
| | | std::string wav_format, |
| | | FUNASR_DEC_HANDLE& decoder_handle) { |
| | | scoped_lock guard(thread_lock); |
| | | try { |
| | | int num_samples = buffer.size(); // the size of the buf |
| | | |
| | |
| | | } catch (std::exception const& e) { |
| | | std::cerr << "Error: " << e.what() << std::endl; |
| | | } |
| | | scoped_lock guard(thread_lock); |
| | | msg["access_num"]=(int)msg["access_num"]-1; |
| | | } |
| | | |
| | |
| | | # "soundfile>=0.12.1", |
| | | # "h5py>=3.1.0", |
| | | "kaldiio>=2.17.0", |
| | | # "torch_complex", |
| | | "torch_complex", |
| | | # "nltk>=3.4.5", |
| | | # ASR |
| | | "sentencepiece", # train |
| | |
| | | # "pypinyin>=0.44.0", |
| | | # "espnet_tts_frontend", |
| | | # ENH |
| | | # "pytorch_wpe", |
| | | "pytorch_wpe", |
| | | "editdistance>=0.5.2", |
| | | "tensorboard", |
| | | # "g2p", |
| | |
| | | "tqdm", |
| | | "hdbscan", |
| | | "umap", |
| | | "jaconv", |
| | | ], |
| | | # train: The modules invoked when training only. |
| | | "train": [ |