Merge pull request #256 from alibaba-damo-academy/dev_wjm
Dev wjm
| | |
| | | from funasr.utils import asr_utils, wav_utils, postprocess_utils |
| | | import pdb |
| | | |
| | | header_colors = '\033[95m' |
| | | end_colors = '\033[0m' |
| | | |
| | | global_asr_language: str = 'zh-cn' |
| | | global_sample_rate: Union[int, Dict[Any, int]] = { |
| | |
| | | |
| | | def seg_tokenize(txt, seg_dict): |
| | | out_txt = "" |
| | | pattern = re.compile(r"([\u4E00-\u9FA5A-Za-z0-9])") |
| | | for word in txt: |
| | | if pattern.match(word): |
| | | if word in seg_dict: |
| | | out_txt += seg_dict[word] + " " |
| | | else: |
| | | out_txt += "<unk>" + " " |
| | | if word in seg_dict: |
| | | out_txt += seg_dict[word] + " " |
| | | else: |
| | | continue |
| | | out_txt += "<unk>" + " " |
| | | return out_txt.strip().split() |
| | | |
| | | def seg_tokenize_wo_pattern(txt, seg_dict): |
| | |
| | | def test_uniasr_2pass_zhcn_16k_common_vocab8358_offline(self): |
| | | inference_pipeline = pipeline( |
| | | task=Tasks.auto_speech_recognition, |
| | | model='damo/speech_UniASauto_speech_recognitionR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline') |
| | | model='damo/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline') |
| | | rec_result = inference_pipeline( |
| | | audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav', |
| | | param_dict={"decoding_model": "offline"}) |