Merge branch 'main' of github.com:alibaba-damo-academy/FunASR
merge
| | |
| | | # Sphinx build info version 1 |
| | | # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. |
| | | config: a62852d90c3e533904d811bbf85f977d |
| | | config: 160d25833895e2f6c62a4c315cacc3b9 |
| | | tags: 645f666f9bcd5a90fca523b33c5a78b7 |
| | |
| | | # Sphinx build info version 1 |
| | | # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. |
| | | config: 06d9c1d4093817b45b9d4df7ab350eaf |
| | | config: a4d4595bd4f85adbedc556dc23e6150a |
| | | tags: 645f666f9bcd5a90fca523b33c5a78b7 |
| | |
| | | mkdir -p ${sa_asr_exp}/log |
| | | INIT_FILE=${sa_asr_exp}/ddp_init |
| | | |
| | | if [ ! -f "exp/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pb" ]; then |
| | | if [ ! -f "exp/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pth" ]; then |
| | | # download xvector extractor model file |
| | | python local/download_xvector_model.py exp |
| | | log "Successfully download the pretrained xvector extractor to exp/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pb" |
| | | log "Successfully download the pretrained xvector extractor to exp/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pth" |
| | | fi |
| | | |
| | | if [ -f $INIT_FILE ];then |
| | |
| | | --init_param "${asr_exp}/valid.acc.ave.pb:decoder.decoders.3:decoder.decoder4.2" \ |
| | | --init_param "${asr_exp}/valid.acc.ave.pb:decoder.decoders.4:decoder.decoder4.3" \ |
| | | --init_param "${asr_exp}/valid.acc.ave.pb:decoder.decoders.5:decoder.decoder4.4" \ |
| | | --init_param "exp/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pb:encoder:spk_encoder" \ |
| | | --init_param "exp/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pb:decoder:spk_encoder:decoder.output_dense" \ |
| | | --init_param "exp/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pth:encoder:spk_encoder" \ |
| | | --init_param "exp/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pth:decoder:spk_encoder:decoder.output_dense" \ |
| | | --valid_data_path_and_name_and_type "${_asr_valid_dir}/${_scp},speech,${_type}" \ |
| | | --valid_data_path_and_name_and_type "${_asr_valid_dir}/text,text,text" \ |
| | | --valid_data_path_and_name_and_type "${_asr_valid_dir}/oracle_profile_nopadding.scp,profile,npy" \ |
| | |
| | | n_fft: 400 |
| | | win_length: 400 |
| | | hop_length: 160 |
| | | use_channel: 0 |
| | | |
| | | # encoder related |
| | | encoder: conformer |
| | |
| | | n_fft: 400 |
| | | win_length: 400 |
| | | hop_length: 160 |
| | | use_channel: 0 |
| | | |
| | | # encoder related |
| | | asr_encoder: conformer |
| | |
| | | if isinstance(speech, np.ndarray): |
| | | speech = torch.tensor(speech) |
| | | |
| | | feats = speech.unsqueeze(0).to(getattr(torch, self.dtype)) |
| | | feats_lengths = feats.new_full([1], dtype=torch.long, fill_value=feats.size(1)) |
| | | if self.frontend is not None: |
| | | speech = torch.unsqueeze(speech, axis=0) |
| | | speech_lengths = speech.new_full([1], dtype=torch.long, fill_value=speech.size(1)) |
| | | feats, feats_lengths = self.frontend(speech, speech_lengths) |
| | | else: |
| | | feats = speech.unsqueeze(0).to(getattr(torch, self.dtype)) |
| | | feats_lengths = feats.new_full([1], dtype=torch.long, fill_value=feats.size(1)) |
| | | |
| | | if self.asr_model.normalize is not None: |
| | | feats, feats_lengths = self.asr_model.normalize(feats, feats_lengths) |
| | |
| | | |
| | | if isinstance(speech, np.ndarray): |
| | | speech = torch.tensor(speech) |
| | | |
| | | feats = speech.unsqueeze(0).to(getattr(torch, self.dtype)) |
| | | feats_lengths = feats.new_full([1], dtype=torch.long, fill_value=feats.size(1)) |
| | | |
| | | if self.frontend is not None: |
| | | speech = torch.unsqueeze(speech, axis=0) |
| | | speech_lengths = speech.new_full([1], dtype=torch.long, fill_value=speech.size(1)) |
| | | feats, feats_lengths = self.frontend(speech, speech_lengths) |
| | | else: |
| | | feats = speech.unsqueeze(0).to(getattr(torch, self.dtype)) |
| | | feats_lengths = feats.new_full([1], dtype=torch.long, fill_value=feats.size(1)) |
| | | |
| | | feats = to_device(feats, device=self.device) |
| | | feats_lengths = to_device(feats_lengths, device=self.device) |
| | | |
| | | enc_out, _ = self.asr_model.encoder(feats, feats_lengths) |
| | | enc_out, _, _ = self.asr_model.encoder(feats, feats_lengths) |
| | | |
| | | nbest_hyps = self.beam_search(enc_out[0]) |
| | | |
| | |
| | | args = parse_args() |
| | | |
| | | # setup local gpu_id |
| | | os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu_id) |
| | | if args.ngpu > 0: |
| | | os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu_id) |
| | | |
| | | # DDP settings |
| | | if args.ngpu > 1: |
| | |
| | | |
| | | # re-compute batch size: when dataset type is small |
| | | if args.dataset_type == "small": |
| | | if args.batch_size is not None: |
| | | if args.batch_size is not None and args.ngpu > 0: |
| | | args.batch_size = args.batch_size * args.ngpu |
| | | if args.batch_bins is not None: |
| | | if args.batch_bins is not None and args.ngpu > 0: |
| | | args.batch_bins = args.batch_bins * args.ngpu |
| | | |
| | | main(args=args) |
| | |
| | | |
| | | # 7. Build iterator factories |
| | | if args.dataset_type == "large": |
| | | from funasr.datasets.large_datasets.build_dataloader import ArkDataLoader |
| | | train_iter_factory = ArkDataLoader(args.train_data_file, args.token_list, args.dataset_conf, |
| | | frontend_conf=args.frontend_conf if hasattr(args, |
| | | "frontend_conf") else None, |
| | | seg_dict_file=args.seg_dict_file if hasattr(args, |
| | | "seg_dict_file") else None, |
| | | punc_dict_file=args.punc_list if hasattr(args, |
| | | "punc_list") else None, |
| | | bpemodel_file=args.bpemodel if hasattr(args, "bpemodel") else None, |
| | | mode="train") |
| | | valid_iter_factory = ArkDataLoader(args.valid_data_file, args.token_list, args.dataset_conf, |
| | | frontend_conf=args.frontend_conf if hasattr(args, |
| | | "frontend_conf") else None, |
| | | seg_dict_file=args.seg_dict_file if hasattr(args, |
| | | "seg_dict_file") else None, |
| | | punc_dict_file=args.punc_list if hasattr(args, |
| | | "punc_list") else None, |
| | | bpemodel_file=args.bpemodel if hasattr(args, "bpemodel") else None, |
| | | mode="eval") |
| | | from funasr.datasets.large_datasets.build_dataloader import LargeDataLoader |
| | | train_iter_factory = LargeDataLoader(args, mode="train") |
| | | valid_iter_factory = LargeDataLoader(args, mode="eval") |
| | | |
| | | elif args.dataset_type == "small": |
| | | train_iter_factory = cls.build_iter_factory( |
| | | args=args, |
| | |
| | | default=get_default_kwargs(CTC), |
| | | help="The keyword arguments for CTC class.", |
| | | ) |
| | | group.add_argument( |
| | | "--joint_network_conf", |
| | | action=NestedDictAction, |
| | | default=None, |
| | | help="The keyword arguments for joint network class.", |
| | | ) |
| | | |
| | | group = parser.add_argument_group(description="Preprocess related") |
| | | group.add_argument( |
| | |
| | | num_optimizers: int = 1 |
| | | |
| | | class_choices_list = [ |
| | | model_choices, |
| | | frontend_choices, |
| | | specaug_choices, |
| | | normalize_choices, |
| | |
| | | try: |
| | | model_class = model_choices.get_class(args.model) |
| | | except AttributeError: |
| | | model_class = model_choices.get_class("asr") |
| | | model_class = model_choices.get_class("rnnt_unified") |
| | | |
| | | model = model_class( |
| | | vocab_size=vocab_size, |