| | |
| | | |
| | | |
| | | from funasr.models.paraformer.search import Hypothesis |
| | | from funasr.models.sense_voice.utils.ctc_alignment import ctc_forced_align |
| | | from .utils.ctc_alignment import ctc_forced_align |
| | | |
| | | |
| | | class SinusoidalPositionEncoder(torch.nn.Module): |
| | |
| | | |
| | | if output_timestamp: |
| | | from itertools import groupby |
| | | |
| | | timestamp = [] |
| | | tokens = tokenizer.text2tokens(text)[4:] |
| | | logits_speech = self.ctc.softmax(encoder_out)[i, 4:encoder_out_lens[i].item(), :] |
| | |
| | | timestamp_new = [] |
| | | for i, t in enumerate(timestamp): |
| | | word, start, end = t |
| | | if word == '▁': |
| | | if word == "▁": |
| | | continue |
| | | if i == 0: |
| | | # timestamp_new.append([word, start, end]) |
| | |
| | | # timestamp_new[-1][0] += word |
| | | timestamp_new[-1][1] = int(end*1000) |
| | | return timestamp_new |
| | | |
| | | def export(self, **kwargs): |
| | | from export_meta import export_rebuild_model |
| | | |
| | |
| | | return models |
| | | |
| | | return results, meta_data |
| | | |