| | |
| | | |
| | | ## option 1, download model automatically |
| | | model_name_or_model_dir="iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" |
| | | |
| | | model_name_or_model_dir="iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" |
| | | |
| | | ## option 2, download model by git |
| | | #local_path_root=${workspace}/modelscope_models |
| | |
| | | options = whisper.DecodingOptions(**DecodingOptions) |
| | | |
| | | result = whisper.decode(self.model, speech, options) |
| | | |
| | | text = f"{result.text}\n" |
| | | results = [] |
| | | result_i = {"key": key[0], "text": result.text} |
| | | result_i = {"key": key[0], "text": text} |
| | | |
| | | results.append(result_i) |
| | | |
| | |
| | | return seg_dict |
| | | |
| | | def seg_tokenize(txt, seg_dict): |
| | | pattern = re.compile(r'^[\u4E00-\u9FA50-9]+$') |
| | | # pattern = re.compile(r'^[\u4E00-\u9FA50-9]+$') |
| | | pattern = re.compile(r"([\u4E00-\u9FA5A-Za-z0-9])") |
| | | out_txt = "" |
| | | for word in txt: |
| | | word = word.lower() |