| | |
| | | import torch |
| | | import random |
| | | |
| | | |
| | | from funasr.register import tables |
| | | from funasr.utils.load_utils import extract_fbank, load_audio_text_image_video |
| | | |
| | |
| | | index_ds: str = None, |
| | | frontend=None, |
| | | tokenizer=None, |
| | | is_training: bool = True, |
| | | int_pad_value: int = -1, |
| | | float_pad_value: float = 0.0, |
| | | **kwargs, |
| | |
| | | super().__init__() |
| | | index_ds_class = tables.index_ds_classes.get(index_ds) |
| | | self.index_ds = index_ds_class(path, **kwargs) |
| | | |
| | | self.preprocessor_speech = None |
| | | self.preprocessor_text = None |
| | | |
| | | if is_training: |
| | | preprocessor_speech = kwargs.get("preprocessor_speech", None) |
| | | if preprocessor_speech: |
| | | preprocessor_speech_class = tables.preprocessor_classes.get(preprocessor_speech) |
| | |
| | | data_src = load_audio_text_image_video(source, fs=self.fs) |
| | | if self.preprocessor_speech: |
| | | data_src = self.preprocessor_speech(data_src, fs=self.fs) |
| | | |
| | | speech, speech_lengths = extract_fbank( |
| | | data_src, data_type=self.data_type, frontend=self.frontend, is_final=True |
| | | ) # speech: [b, T, d] |
| | |
| | | target = item["target"] |
| | | if self.preprocessor_text: |
| | | target = self.preprocessor_text(target) |
| | | |
| | | if self.tokenizer: |
| | | ids = self.tokenizer.encode(target) |
| | | text = torch.tensor(ids, dtype=torch.int64) |