Dev gzf exp (#1700)
* resume from step
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* train_loss_avg train_acc_avg
* train_loss_avg train_acc_avg
* train_loss_avg train_acc_avg
* log step
* wav does not exist
* wav does not exist
* decoding
* decoding
* decoding
* wechat
* decoding key
* decoding key
* decoding key
* decoding key
* decoding key
| | |
| | | |
| | | model = AutoModel(model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch") |
| | | |
| | | mm = model.model |
| | | for p in mm.parameters(): |
| | | print(f"{p.numel()}") |
| | | res = model.generate(input=wav_file) |
| | | print(res) |
| | | |
| | | # [[beg1, end1], [beg2, end2], .., [begN, endN]] |
| | | # beg/end: ms |
| | | |
| | |
| | | ) |
| | | |
| | | res = model.generate( |
| | | input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav" |
| | | ) |
| | | res = model.generate( |
| | | input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav" |
| | | ) |
| | | |
| | | res = model.generate( |
| | | input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav" |
| | | input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", |
| | | cache={}, |
| | | ) |
| | | |
| | | print(res) |
| | |
| | | from funasr.train_utils.set_all_random_seed import set_all_random_seed |
| | | from funasr.train_utils.load_pretrained_model import load_pretrained_model |
| | | from funasr.utils import export_utils |
| | | from funasr.utils import misc |
| | | |
| | | try: |
| | | from funasr.models.campplus.utils import sv_chunk, postprocess, distribute_spk |
| | |
| | | |
| | | |
| | | def prepare_data_iterator(data_in, input_len=None, data_type=None, key=None): |
| | | """ |
| | | |
| | | :param input: |
| | | :param input_len: |
| | | :param data_type: |
| | | :param frontend: |
| | | :return: |
| | | """ |
| | | """ """ |
| | | data_list = [] |
| | | key_list = [] |
| | | filelist = [".scp", ".txt", ".json", ".jsonl", ".text"] |
| | |
| | | key_list.append(key) |
| | | else: |
| | | if key is None: |
| | | key = "rand_key_" + "".join(random.choice(chars) for _ in range(13)) |
| | | # key = "rand_key_" + "".join(random.choice(chars) for _ in range(13)) |
| | | key = misc.extract_filename_without_extension(data_in) |
| | | data_list = [data_in] |
| | | key_list = [key] |
| | | elif isinstance(data_in, (list, tuple)): |
| | |
| | | else: |
| | | # [audio sample point, fbank, text] |
| | | data_list = data_in |
| | | key_list = [ |
| | | "rand_key_" + "".join(random.choice(chars) for _ in range(13)) |
| | | for _ in range(len(data_in)) |
| | | ] |
| | | key_list = [] |
| | | for data_i in data_in: |
| | | if isinstance(data_i, str) and os.path.exists(data_i): |
| | | key = misc.extract_filename_without_extension(data_i) |
| | | else: |
| | | key = "rand_key_" + "".join(random.choice(chars) for _ in range(13)) |
| | | key_list.append(key) |
| | | |
| | | else: # raw text; audio sample point, fbank; bytes |
| | | if isinstance(data_in, bytes): # audio bytes |
| | | data_in = load_bytes(data_in) |
| | |
| | | class AutoModel: |
| | | |
| | | def __init__(self, **kwargs): |
| | | |
| | | log_level = getattr(logging, kwargs.get("log_level", "INFO").upper()) |
| | | logging.basicConfig(level=log_level) |
| | | |
| | | if not kwargs.get("disable_log", True): |
| | | tables.print() |
| | | |
| | |
| | | return cfg_item |
| | | |
| | | kwargs = to_plain_list(cfg) |
| | | log_level = getattr(logging, kwargs.get("log_level", "INFO").upper()) |
| | | |
| | | logging.basicConfig(level=log_level) |
| | | |
| | | if kwargs.get("debug", False): |
| | | import pdb |
| | |
| | | return cfg_item |
| | | |
| | | kwargs = to_plain_list(cfg) |
| | | log_level = getattr(logging, kwargs.get("log_level", "INFO").upper()) |
| | | |
| | | logging.basicConfig(level=log_level) |
| | | |
| | | if kwargs.get("debug", False): |
| | | import pdb |
| | |
| | | |
| | | eos = self.tokenizer.encode(self.eos, allowed_special="all") # [eos] |
| | | |
| | | ids = prompt_ids + target_ids + eos |
| | | ids = prompt_ids + target_ids + eos # [sos, task, lid, text, eos] |
| | | ids_lengths = len(ids) |
| | | |
| | | text = torch.tensor(ids, dtype=torch.int64) |
| | |
| | | is_pad_mask = kwargs.get("is_pad_mask", False) |
| | | is_pad_memory_mask = kwargs.get("is_pad_memory_mask", False) |
| | | |
| | | fsmn_cache = cache[layer]["fsmn_cache"] if len(cache) > 0 or cache is None else None |
| | | fsmn_cache = cache[layer]["fsmn_cache"] if cache is not None and len(cache) > 0 else None |
| | | # if fsmn_cache is not None: |
| | | # x = x[:, -1:] |
| | | att_res, fsmn_cache = self.attn(self.attn_ln(x), mask=None, cache=fsmn_cache) |
| | |
| | | def score(self, ys, state, x): |
| | | """Score.""" |
| | | # NOTE(review): ys_mask is built here but is not referenced again in the lines shown. |
| | | ys_mask = subsequent_mask(len(ys), device=x.device).unsqueeze(0) |
| | | # NOTE(review): logp is assigned twice in a row. This listing looks like a diff rendering, so the |
| | | # cache=state call is presumably the removed line and the cache=None call its replacement - confirm |
| | | # against the real file. If both lines execute, only the cache=None result is used below. |
| | | logp = self.forward(ys.unsqueeze(0), x.unsqueeze(0), cache=state) |
| | | logp = self.forward(ys.unsqueeze(0), x.unsqueeze(0), cache=None) |
| | | # Convert the forward output to log-probabilities over the last dimension. |
| | | logp = torch.log_softmax(logp, dim=-1) |
| | | # Drop the leading dimension added by unsqueeze(0), keep the last position's row, and hand back |
| | | # the same `state` object that was passed in (it is never reassigned in this method). |
| | | return logp.squeeze(0)[-1, :], state |
| | |
| | | stats = {} |
| | | |
| | | # 1. Forward decoder |
| | | # ys_pad: [sos, task, lid, text, eos] |
| | | decoder_out = self.model.decoder( |
| | | x=ys_pad, xa=encoder_out, hlens=encoder_out_lens, ys_in_lens=ys_pad_lens |
| | | ) |
| | | |
| | | # 2. Compute attention loss |
| | | mask = torch.ones_like(ys_pad) * (-1) |
| | | ys_pad_mask = (ys_pad * target_mask + mask * (1 - target_mask)).to(torch.int64) |
| | | ys_pad_mask[ys_pad_mask == 0] = -1 |
| | | mask = torch.ones_like(ys_pad) * (-1) # [sos, task, lid, text, eos]: [-1, -1, -1, -1] |
| | | ys_pad_mask = (ys_pad * target_mask + mask * (1 - target_mask)).to( |
| | | torch.int64 |
| | | ) # [sos, task, lid, text, eos]: [0, 0, 1, 1, 1] + [-1, -1, 0, 0, 0] |
| | | ys_pad_mask[ys_pad_mask == 0] = -1 # [-1, -1, lid, text, eos] |
| | | # decoder_out: [sos, task, lid, text] |
| | | # ys_pad_mask: [-1, lid, text, eos] |
| | | loss_att = self.criterion_att(decoder_out[:, :-1, :], ys_pad_mask[:, 1:]) |
| | | |
| | | with torch.no_grad(): |
| | |
| | | data_type=kwargs.get("data_type", "sound"), |
| | | tokenizer=tokenizer, |
| | | ) |
| | | |
| | | if ( |
| | | isinstance(kwargs.get("data_type", None), (list, tuple)) |
| | | and len(kwargs.get("data_type", [])) > 1 |
| | | ): |
| | | audio_sample_list, text_token_int_list = audio_sample_list |
| | | text_token_int = text_token_int_list[0] |
| | | else: |
| | | text_token_int = None |
| | | |
| | | time2 = time.perf_counter() |
| | | meta_data["load_data"] = f"{time2 - time1:0.3f}" |
| | | speech, speech_lengths = extract_fbank( |
| | |
| | | speech[None, :, :].permute(0, 2, 1), speech_lengths |
| | | ) |
| | | |
| | | if text_token_int is not None: |
| | | i = 0 |
| | | results = [] |
| | | ibest_writer = None |
| | | if kwargs.get("output_dir") is not None: |
| | | if not hasattr(self, "writer"): |
| | | self.writer = DatadirWriter(kwargs.get("output_dir")) |
| | | ibest_writer = self.writer[f"1best_recog"] |
| | | |
| | | # 1. Forward decoder |
| | | ys_pad = torch.tensor(sos_int + text_token_int, dtype=torch.int64).to(kwargs["device"])[ |
| | | None, : |
| | | ] |
| | | ys_pad_lens = torch.tensor([len(sos_int + text_token_int)], dtype=torch.int64).to( |
| | | kwargs["device"] |
| | | )[None, :] |
| | | decoder_out = self.model.decoder( |
| | | x=ys_pad, xa=encoder_out, hlens=encoder_out_lens, ys_in_lens=ys_pad_lens |
| | | ) |
| | | |
| | | token_int = decoder_out.argmax(-1)[0, :].tolist() |
| | | text = tokenizer.decode(token_int) |
| | | |
| | | result_i = {"key": key[i], "text": text} |
| | | results.append(result_i) |
| | | |
| | | if ibest_writer is not None: |
| | | # ibest_writer["token"][key[i]] = " ".join(token) |
| | | ibest_writer["text"][key[i]] = text |
| | | return results, meta_data |
| | | |
| | | # c. Passed the encoder result and the beam search |
| | | nbest_hyps = self.beam_search( |
| | | x=encoder_out[0], |
| | |
| | | # post process of one iteration |
| | | running_hyps = self.post_process(i, maxlen, maxlenratio, best, ended_hyps) |
| | | # end detection |
| | | # if len(ended_hyps) > 0: |
| | | # print(f"ended_hyps: {ended_hyps}") |
| | | if maxlenratio == 0.0 and end_detect([h.asdict() for h in ended_hyps], i): |
| | | logging.info(f"end detected at {i}") |
| | | break |
| | |
| | | |
| | | else: |
| | | x = x.to(mel.device) |
| | | # FIX(funasr): sense voice |
| | | # logits = model.logits(x[:, :-1], mel)[:, -1] |
| | | logits = model.logits(x[:, :], mel)[:, -1] |
| | | |
| | | logits = model.logits(x[:, :-1], mel)[:, -1] |
| | | # collect detected languages; suppress all non-language tokens |
| | | mask = torch.ones(logits.shape[-1], dtype=torch.bool) |
| | | mask[list(tokenizer.all_language_tokens)] = False |
| | |
| | | # config_json = os.path.join(model_path, "configuration.json") |
| | | # if os.path.exists(config_json): |
| | | # shutil.copy(config_json, os.path.join(kwargs.get("output_dir", "./"), "configuration.json")) |
| | | |
| | | |
| | | def extract_filename_without_extension(file_path): |
| | | """ |
| | | Extract the file name (without directory path or extension) from the given file path. |
| | | :param file_path: the full file path |
| | | :return: the file name, without path or extension |
| | | """ |
| | | # First, use os.path.basename to get the file-name part of the path (extension included) |
| | | filename_with_extension = os.path.basename(file_path) |
| | | # Then, use os.path.splitext to separate the file name from the extension |
| | | # (splitext removes only the final suffix, e.g. "a.tar.gz" -> "a.tar") |
| | | filename, extension = os.path.splitext(filename_with_extension) |
| | | # Return the file name without the extension |
| | | return filename |