| | |
| | | #!/usr/bin/env python |
| | | import re |
| | | import numpy as np |
| | | from funasr.datasets.large_datasets.utils.hotword_utils import sample_hotword |
| | | |
| | | def forward_segment(text, seg_dict): |
| | | word_list = [] |
| | |
| | | vocab=None, |
| | | seg_dict=None, |
| | | punc_dict=None, |
| | | bpe_tokenizer=None): |
| | | bpe_tokenizer=None, |
| | | hw_config=None): |
| | | assert "text" in data |
| | | assert isinstance(vocab, dict) |
| | | text = data["text"] |
| | |
| | | text = seg_tokenize(text, seg_dict) |
| | | |
| | | length = len(text) |
| | | if 'hw_tag' in data: |
| | | hotword_indxs = sample_hotword(length, **hw_config) |
| | | data['hotword_indxs'] = hotword_indxs |
| | | del data['hw_tag'] |
| | | for i in range(length): |
| | | x = text[i] |
| | | if i == length-1 and "punc" in data and x.startswith("vad:"): |