update with main (#1786)
* add cmakelist
* add paraformer-torch
* add debug for funasr-onnx-offline
* fix redefinition of jieba StdExtension.hpp
* add loading torch models
* update funasr-onnx-offline
* add SwitchArg for wss-server
* add SwitchArg for funasr-onnx-offline
* update cmakelist
* update funasr-onnx-offline-rtf
* add define condition
* add gpu define for offline-stream
* update com define
* update offline-stream
* update cmakelist
* update func CompileHotwordEmbedding
* add timestamp for paraformer-torch
* add C10_USE_GLOG for paraformer-torch
* update paraformer-torch
* fix func FunASRWfstDecoderInit
* update model.h
* fix func FunASRWfstDecoderInit
* fix tpass_stream
* update paraformer-torch
* add bladedisc for funasr-onnx-offline
* update comdefine
* update funasr-wss-server
* add log for torch
* fix GetValue BLADEDISC
* fix log
* update cmakelist
* update warmup to 10
* update funasrruntime
* add batch_size for wss-server
* add batch for bins
* add batch for offline-stream
* add batch for paraformer
* add batch for offline-stream
* fix func SetBatchSize
* add SetBatchSize for model
* add SetBatchSize for model
* fix func Forward
* fix padding
* update funasrruntime
* add dec reset for batch
* set batch default value
* add argv for CutSplit
* sort frame_queue
* sorted msgs
* fix FunOfflineInfer
* add dynamic batch for fetch
* fix FetchDynamic
* update run_server.sh
* update run_server.sh
* cpp http post server support (#1739)
* add cpp http server
* add some comment
* remove some comments
* del debug infos
* restore run_server.sh
* adapt to new model struct
* 修复了onnxruntime在macos下编译失败的错误 (#1748)
* Add files via upload
增加macos的编译支持
* Add files via upload
增加macos支持
* Add files via upload
target_link_directories(funasr PUBLIC ${ONNXRUNTIME_DIR}/lib)
target_link_directories(funasr PUBLIC ${FFMPEG_DIR}/lib)
添加 if(APPLE) 限制
---------
Co-authored-by: Yabin Li <wucong.lyb@alibaba-inc.com>
* Delete docs/images/wechat.png
* Add files via upload
* fixed the issues about seaco-onnx timestamp
* fix bug (#1764)
当语音识别结果包含 `http` 时,标点符号预测会把它当成 url
* fix empty asr result (#1765)
解码结果为空的语音片段,text 用空字符串
* docs
* docs
* docs
* docs
* docs
* keep empty speech result (#1772)
* docs
* docs
* update wechat QRcode
* Add python funasr api support for websocket srv (#1777)
* add python funasr_api support
* make small changes to README.md
* add core tools stream
* modified a little
* fix bug for timeout
* support for buffer decode
* add ffmpeg decode for buffer
* auto frontend
* auto frontend
* auto frontend
* auto frontend
* auto frontend
* auto frontend
* auto frontend
* auto frontend
* Dev gzf exp (#1785)
* resume from step
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* train_loss_avg train_acc_avg
* train_loss_avg train_acc_avg
* train_loss_avg train_acc_avg
* log step
* wav is not exist
* wav is not exist
* decoding
* decoding
* decoding
* wechat
* decoding key
* decoding key
* decoding key
* decoding key
* decoding key
* decoding key
* dynamic batch
* start_data_split_i=0
* total_time/accum_grad
* total_time/accum_grad
* total_time/accum_grad
* update avg slice
* update avg slice
* sensevoice sanm
* sensevoice sanm
* sensevoice sanm
---------
Co-authored-by: 北念 <lzr265946@alibaba-inc.com>
* auto frontend
---------
Co-authored-by: 雾聪 <wucong.lyb@alibaba-inc.com>
Co-authored-by: zhaomingwork <61895407+zhaomingwork@users.noreply.github.com>
Co-authored-by: szsteven008 <97944818+szsteven008@users.noreply.github.com>
Co-authored-by: Ephemeroptera <605686962@qq.com>
Co-authored-by: 彭震东 <zhendong.peng@qq.com>
Co-authored-by: Shi Xian <40013335+R1ckShi@users.noreply.github.com>
Co-authored-by: 维石 <shixian.shi@alibaba-inc.com>
Co-authored-by: 北念 <lzr265946@alibaba-inc.com>
| | |
| | | |
| | | result_list = [] |
| | | num_samples = len(data_list) |
| | | pbar = tqdm(colour="blue", total=num_samples + 1, dynamic_ncols=True) |
| | | # pbar = tqdm(colour="blue", total=num_samples + 1, dynamic_ncols=True) |
| | | |
| | | time0 = time.perf_counter() |
| | | for beg_idx in range(0, num_samples, batch_size): |
| | |
| | | "input": speech, |
| | | "input_len": speech_lengths, |
| | | "key": key_batch, |
| | | data_type: "fbank", |
| | | "data_type": "fbank", |
| | | } |
| | | result_list.append(batch) |
| | | |
| | | pbar.update(1) |
| | | description = f"{meta_data}, " |
| | | pbar.set_description(description) |
| | | # pbar.update(1) |
| | | # description = f"{meta_data}, " |
| | | # pbar.set_description(description) |
| | | |
| | | time_end = time.perf_counter() |
| | | pbar.set_description(f"time escaped total: {time_end - time0:0.3f}") |
| | | # pbar.set_description(f"time escaped total: {time_end - time0:0.3f}") |
| | | |
| | | return result_list |
| | |
| | | start_idx = self.rank * batches_per_rank |
| | | end_idx = start_idx + batches_per_rank |
| | | rank_batches = buffer_batches[start_idx + self.start_step : end_idx] |
| | | |
| | | self.batch_num = len(rank_batches) |
| | | |
| | | logging.info( |
| | | f"rank: {self.rank}, dataloader start from step: {self.start_step}, batch_num: {end_idx-start_idx}, batch_num_after_step: {len(rank_batches)}" |
| | | ) |
| | |
| | | import torch |
| | | import torch.nn as nn |
| | | import torch.nn.functional as F |
| | | from funasr.models.transformer.utils.nets_utils import make_pad_mask |
| | | |
| | | from funasr.register import tables |
| | | |
| | |
| | | query_proj = self.norm(self.linear(query_output.last_hidden_state)) |
| | | |
| | | return query_proj |
| | | |
| | | |
@tables.register("adaptor_classes", "Transformer")
class Transformer(nn.Module):
    """Adaptor that stacks frames, projects them, and applies encoder layers.

    Every ``downsample_rate`` consecutive frames are concatenated along the
    feature axis, passed through a two-layer feed-forward projection
    (``encoder_dim * downsample_rate -> ffn_dim -> llm_dim``), and then
    through a stack of ``EncoderLayer`` blocks of width ``llm_dim``.

    Args:
        downsample_rate: Number of consecutive frames merged into one.
        encoder_dim: Feature size of the incoming frames.
        llm_dim: Feature size of the output.
        ffn_dim: Hidden size of the projection between the two linear layers.
        **kwargs: Optional ``attention_heads`` (default 8),
            ``attention_dropout_rate`` (default 0.0), ``dropout_rate``
            (default 0.0) and ``n_layer`` (default 2) for the encoder stack.
    """

    def __init__(
        self, downsample_rate=2, encoder_dim=1280, llm_dim=4096, ffn_dim: int = 2048, **kwargs
    ):
        super().__init__()
        self.k = downsample_rate
        self.encoder_dim = encoder_dim
        self.llm_dim = llm_dim
        self.linear1 = nn.Linear(self.encoder_dim * self.k, ffn_dim)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(ffn_dim, self.llm_dim)
        # Imported here, as in the original, rather than at module import time.
        from funasr.models.transformer.encoder import EncoderLayer
        from funasr.models.transformer.attention import MultiHeadedAttention
        from funasr.models.transformer.positionwise_feed_forward import PositionwiseFeedForward

        dropout_rate = kwargs.get("dropout_rate", 0.0)
        self.blocks = nn.ModuleList(
            [
                EncoderLayer(
                    llm_dim,
                    MultiHeadedAttention(
                        kwargs.get("attention_heads", 8),
                        llm_dim,
                        kwargs.get("attention_dropout_rate", 0.0),
                    ),
                    PositionwiseFeedForward(
                        llm_dim,
                        llm_dim // 4,
                        dropout_rate,
                    ),
                    dropout_rate,
                )
                for _ in range(kwargs.get("n_layer", 2))
            ]
        )

    def forward(self, x, ilens=None):
        """Downsample ``x`` along dim 1 and run it through the encoder blocks.

        Args:
            x: 3-D tensor unpacked as ``(batch_size, seq_len, dim)``.
            ilens: Per-sample valid lengths of ``x`` along dim 1. When
                ``None``, every sample is treated as using the full
                ``seq_len`` (previously ``None`` raised ``TypeError``).

        Returns:
            Tuple ``(x, olens)``: the transformed tensor and the per-sample
            lengths after downsampling, ``(ilens - 1) // k + 1``.
        """
        batch_size, seq_len, dim = x.size()
        # Round the length up to a multiple of k by zero-padding the end of
        # dim 1, so no trailing frames are discarded.
        chunk_num = (seq_len - 1) // self.k + 1
        pad_num = chunk_num * self.k - seq_len
        x = F.pad(x, (0, 0, 0, pad_num, 0, 0), value=0.0)

        # Merge each group of k frames into a single, wider frame.
        x = x.contiguous()
        x = x.view(batch_size, chunk_num, dim * self.k)
        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)

        if ilens is None:
            # No lengths supplied: assume all samples span the whole input.
            ilens = torch.full((batch_size,), seq_len, dtype=torch.long, device=x.device)
        olens = (ilens - 1) // self.k + 1
        masks = (~make_pad_mask(olens)[:, None, :]).to(x.device)
        for block in self.blocks:
            x, masks = block(x, masks)
        return x, olens
| | |
| | | """Score.""" |
| | | ys_mask = subsequent_mask(len(ys), device=x.device).unsqueeze(0) |
| | | logp = self.forward(ys.unsqueeze(0), x.unsqueeze(0), cache=state) |
| | | logp = torch.log_softmax(logp, dim=-1) |
| | | return logp.squeeze(0)[-1, :], state |
| | | |
| | | |
| | |
| | | if isinstance(task, str): |
| | | task = [task] |
| | | task = "".join([f"<|{x}|>" for x in task]) |
| | | initial_prompt = kwargs.get("initial_prompt", f"<|startoftranscript|>{task}") |
| | | |
| | | sos = kwargs.get("model_conf").get("sos") |
| | | if isinstance(sos, str): |
| | | initial_prompt = kwargs.get("initial_prompt", f"<|startoftranscript|>{task}") |
| | | |
| | | language = DecodingOptions.get("language", None) |
| | | language = None if language == "auto" else language |
| | | language = DecodingOptions.get("language", None) |
| | | language = None if language == "auto" else language |
| | | |
| | | sos = f"{initial_prompt}<|{language}|>" if language is not None else initial_prompt |
| | | sos_int = tokenizer.encode(sos, allowed_special="all") |
| | | sos = f"{initial_prompt}<|{language}|>" if language is not None else initial_prompt |
| | | sos_int = tokenizer.encode(sos, allowed_special="all") |
| | | else: |
| | | language = DecodingOptions.get("language", None) |
| | | language = None if language == "auto" else language |
| | | initial_prompt = kwargs.get("initial_prompt", f"{task}") |
| | | initial_prompt_lid = f"{initial_prompt}<|{language}|>" if language is not None else initial_prompt |
| | | initial_prompt_lid_int = tokenizer.encode(initial_prompt_lid, allowed_special="all") |
| | | sos_int = [sos] + initial_prompt_lid_int |
| | | eos = kwargs.get("model_conf").get("eos") |
| | | eos_int = tokenizer.encode(eos, allowed_special="all") |
| | | if isinstance(eos, str): |
| | | eos_int = tokenizer.encode(eos, allowed_special="all") |
| | | else: |
| | | eos_int = [eos] |
| | | |
| | | self.beam_search.sos = sos_int |
| | | self.beam_search.eos = eos_int[0] |
| | | |
| | |
| | | self.beam_search.event_score_ga = DecodingOptions.get("gain_tokens_score", [1, 1, 1, 1]) |
| | | |
| | | encoder_out, encoder_out_lens = self.encode( |
| | | speech[None, :, :].permute(0, 2, 1), speech_lengths |
| | | speech[None, :, :], speech_lengths |
| | | ) |
| | | |
| | | if text_token_int is not None: |
| | |
| | | n_text_layer: int |
| | | |
| | | |
| | | # class LayerNorm(nn.LayerNorm): |
| | | # def forward(self, x: Tensor) -> Tensor: |
| | | # return super().forward(x.float()).type(x.dtype) |
| | | |
| | | |
class LayerNorm(nn.LayerNorm):
    """``nn.LayerNorm`` that normalizes in float32 and restores the input dtype.

    The input, weight and bias are cast to float32 for the normalization and
    the result is cast back to the dtype of the input. The constructor is
    inherited unchanged from ``nn.LayerNorm``.
    """

    def forward(self, input):
        """Apply layer normalization to ``input``.

        Args:
            input: Tensor whose trailing dimensions match
                ``self.normalized_shape``.

        Returns:
            The normalized tensor, with the same dtype as ``input``.
        """
        # weight / bias are None when the affine parameters are disabled.
        output = F.layer_norm(
            input.float(),
            self.normalized_shape,
            self.weight.float() if self.weight is not None else None,
            self.bias.float() if self.bias is not None else None,
            self.eps,
        )
        return output.type_as(input)
| | | |
| | | |
| | | class Linear(nn.Linear): |
| | |
| | | stochastic_depth_rate=0.0, |
| | | ): |
| | | """Construct an EncoderLayer object.""" |
| | | super(EncoderLayer, self).__init__() |
| | | super().__init__() |
| | | self.self_attn = self_attn |
| | | self.feed_forward = feed_forward |
| | | self.norm1 = LayerNorm(size) |
| | |
| | | self.train_acc_avg = train_acc_avg.detach().cpu().item() / self.world_size |
| | | |
| | | def forward_step(self, model, batch, loss_dict={}): |
| | | dtype = torch.bfloat16 |
| | | with maybe_autocast(dtype=self.dtype, use_deepspeed=self.use_deepspeed): |
| | | retval = model(**batch) |
| | | |