#!/usr/bin/env python
import re
import numpy as np
from funasr.datasets.large_datasets.utils.hotword_utils import sample_hotword
def forward_segment(text, seg_dict):
    """Return the word segmentation of *text*.

    NOTE(review): in this chunk the body is an empty stub — it ignores both
    arguments and always returns a fresh empty list. Confirm against the full
    file whether a forward-maximum-matching implementation was dropped here.
    """
    return []
| | | |
def seg_tokenize(txt, seg_dict):
    """Map each word in *txt* to its segmentation from *seg_dict*.

    Fixes merge-conflict residue in the original: the flattened text carried
    two mutually exclusive ``else`` branches for the in-vocabulary test (an
    unconditional ``<unk>`` and the per-character fallback). The fallback
    belongs inside the out-of-vocabulary branch.

    Args:
        txt: iterable of word strings; lookup is case-insensitive
            (each word is lower-cased first).
        seg_dict: dict mapping a lowercase word or single character to its
            space-separated token string.

    Returns:
        list[str]: the flattened token sequence; any item with no dictionary
        entry contributes a literal ``"<unk>"`` token.
    """
    # Matches words made only of CJK ideographs (U+4E00-U+9FA5) and ASCII
    # digits; such words can be decomposed character-by-character when the
    # whole word is unknown.
    pattern = re.compile(r'^[\u4E00-\u9FA50-9]+$')
    out_txt = ""
    for word in txt:
        word = word.lower()
        if word in seg_dict:
            out_txt += seg_dict[word] + " "
        else:
            if pattern.match(word):
                # Unknown CJK/digit word: fall back to per-character lookup.
                for char in word:
                    if char in seg_dict:
                        out_txt += seg_dict[char] + " "
                    else:
                        out_txt += "<unk>" + " "
            else:
                out_txt += "<unk>" + " "
    return out_txt.strip().split()
| | | |
| | | def tokenize(data, |
| | | vocab=None, |
| | | seg_dict=None, |
| | | punc_dict=None, |
| | | bpe_tokenizer=None): |
| | | bpe_tokenizer=None, |
| | | hw_config=None): |
| | | assert "text" in data |
| | | assert isinstance(vocab, dict) |
| | | text = data["text"] |
| | | token = [] |
| | | vad = -2 |
| | | |
| | | if bpe_tokenizer is not None: |
| | | text = bpe_tokenizer.text2tokens("".join(text)) |
| | | |
| | | text = bpe_tokenizer.text2tokens(" ".join(text)) |
| | | if seg_dict is not None: |
| | | assert isinstance(seg_dict, dict) |
| | | txt = forward_segment("".join(text).lower(), seg_dict) |
| | | text = seg_tokenize(txt, seg_dict) |
| | | text = seg_tokenize(text, seg_dict) |
| | | |
| | | length = len(text) |
| | | if 'hw_tag' in data: |
| | | pre_index = None |
| | | if hw_config['pre_hwlist'] is not None and hw_config['pre_prob'] > 0: |
| | | # enable preset hotword detect in sampling |
| | | for hw in hw_config['pre_hwlist']: |
| | | hw = " ".join(seg_tokenize(hw, seg_dict)) |
| | | _find = " ".join(text).find(hw) |
| | | if _find != -1: |
| | | # _find = text[:_find].count(" ") # bpe sometimes |
| | | pre_index = [_find, _find + max(hw.count(" "), 1)] |
| | | break |
| | | hotword_indxs = sample_hotword(length, **hw_config, pre_index=pre_index) |
| | | data['hotword_indxs'] = hotword_indxs |
| | | del data['hw_tag'] |
| | | for i in range(length): |
| | | x = text[i] |
| | | if i == length-1 and "punc" in data and text[i].startswith("vad:"): |
| | | vad = x[-1][4:] |
| | | if i == length-1 and "punc" in data and x.startswith("vad:"): |
| | | vad = x[4:] |
| | | if len(vad) == 0: |
| | | vad = -1 |
| | | else: |