Examples:
    >>> import soundfile
    >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
    >>> audio, rate = soundfile.read("speech.wav")
    >>> speech2text(audio)
    [(text, token, token_int, hypothesis object), ...]
# Input as audio signal
if isinstance(speech, np.ndarray):
    speech = torch.tensor(speech)

# torch.squeeze(x, 2) drops the third axis only when it has size 1
if speech.dim() == 3:
    speech = torch.squeeze(speech, 2)
# speech = speech.unsqueeze(0).to(getattr(torch, self.dtype))
speech = speech.to(getattr(torch, self.dtype))
# lengths: (1,)
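# A hedged sketch of the step the comment above refers to, following the
# upstream ESPnet pattern (assumption: speech is (1, nsamples) at this point,
# and "lengths"/"batch" are the locals that the omitted lines build):
lengths = speech.new_full([1], dtype=torch.long, fill_value=speech.size(1))
batch = {"speech": speech, "speech_lengths": lengths}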
# Build the streaming data loader. Assumption: the enclosing call is
# ASRTask.build_streaming_iterator, inferred from the preprocess_fn argument
# below; fs/mc look like multichannel extensions to the upstream signature.
loader = ASRTask.build_streaming_iterator(
    data_path_and_name_and_type,
    dtype=dtype,
    batch_size=batch_size,
    fs=fs,
    mc=True,
    key_file=key_file,
    num_workers=num_workers,
    preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False),
)
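# A minimal usage sketch, assuming the upstream ESPnet inference loop:
# the loader yields (keys, batch); "_lengths" entries are dropped and the
# batch dimension removed before calling the Speech2Text wrapper above.
for keys, batch in loader:
    batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
    results = speech2text(**batch)
    for text, token, token_int, hyp in results:
        print(keys[0], text)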