| | |
| | | fusion_encoder_class = tables.encoder_classes.get(fusion_encoder) |
| | | fusion_encoder = fusion_encoder_class(**fusion_encoder_conf) |
| | | bias_predictor_class = tables.encoder_classes.get(bias_predictor) |
| | | bias_predictor = bias_predictor_class(bias_predictor_conf) |
| | | bias_predictor = bias_predictor_class(**bias_predictor_conf) |
| | | |
| | | if decoder is not None: |
| | | decoder_class = tables.decoder_classes.get(decoder) |
| | |
| | | else: |
| | | # extract fbank feats |
| | | time1 = time.perf_counter() |
| | | pdb.set_trace() |
| | | audio_sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000), |
| | | data_type=kwargs.get("data_type", "sound"), |
| | | tokenizer=tokenizer) |