zhifu gao
2023-02-10 2edb3a1bf04fbd44c132756c0e6610ca62eccc97
Merge pull request #93 from alibaba-damo-academy/dev_lzr

update paraformer-large model RESULTS.md and support for turning off timestamps
8个文件已修改
3个文件已添加
167 ■■■■■ 已修改文件
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/RESULTS.md 23 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/RESULTS.md 25 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/RESULTS.md 75 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/bin/asr_inference.py 2 ●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/bin/asr_inference_paraformer.py 8 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/bin/asr_inference_paraformer_timestamp.py 2 ●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/bin/asr_inference_paraformer_vad.py 11 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/bin/asr_inference_paraformer_vad_punc.py 11 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/bin/asr_inference_uniasr.py 2 ●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/bin/asr_inference_uniasr_vad.py 2 ●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/utils/postprocess_utils.py 6 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/RESULTS.md
New file
@@ -0,0 +1,23 @@
# Paraformer-Large
- Model link: <https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/summary>
- Model size: 220M
# Environments
- date: `Fri Feb 10 13:34:24 CST 2023`
- python version: `3.7.12`
- FunASR version: `0.1.6`
- pytorch version: `pytorch 1.7.0`
- Git hash: ``
- Commit date: ``
# Benchmark Results
## AISHELL-1
- Decode config:
  - Decode without CTC
  - Decode without LM
| testset CER(%) | base model|finetune model |
|:--------------:|:---------:|:-------------:|
| dev            | 1.75      |1.62           |
| test           | 1.95      |1.78           |
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/RESULTS.md
New file
@@ -0,0 +1,25 @@
# Paraformer-Large
- Model link: <https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/summary>
- Model size: 220M
# Environments
- date: `Fri Feb 10 13:34:24 CST 2023`
- python version: `3.7.12`
- FunASR version: `0.1.6`
- pytorch version: `pytorch 1.7.0`
- Git hash: ``
- Commit date: ``
# Benchmark Results
## AISHELL-2
- Decode config:
  - Decode without CTC
  - Decode without LM
| testset      | base model|finetune model|
|:------------:|:---------:|:------------:|
| dev_ios      | 2.80      |2.60          |
| test_android | 3.13      |2.84          |
| test_ios     | 2.85      |2.82          |
| test_mic     | 3.06      |2.88          |
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/RESULTS.md
New file
@@ -0,0 +1,75 @@
# Paraformer-Large
- Model link: <https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary>
- Model size: 220M
# Environments
- date: `Tue Nov 22 18:48:39 CST 2022`
- python version: `3.7.12`
- FunASR version: `0.1.0`
- pytorch version: `pytorch 1.7.0`
- Git hash: ``
- Commit date: ``
# Benchmark Results
## AISHELL-1
- Decode config:
  - Decode without CTC
  - Decode without LM
| testset   | CER(%)|
|:---------:|:-----:|
| dev       | 1.75  |
| test      | 1.95  |
## AISHELL-2
- Decode config:
  - Decode without CTC
  - Decode without LM
| testset      | CER(%)|
|:------------:|:-----:|
| dev_ios      | 2.80  |
| test_android | 3.13  |
| test_ios     | 2.85  |
| test_mic     | 3.06  |
## Wenetspeech
- Decode config:
  - Decode without CTC
  - Decode without LM
| testset   | CER(%)|
|:---------:|:-----:|
| dev       | 3.57  |
| test      | 6.97  |
| test_net  | 6.74  |
## SpeechIO TIOBE
- Decode config 1:
  - Decode without CTC
  - Decode without LM
  - With text norm
- Decode config 2:
  - Decode without CTC
  - Decode with Transformer-LM
  - LM weight: 0.15
  - With text norm
| testset | w/o LM | w/ LM |
|:------------------:|:----:|:----:|
|SPEECHIO_ASR_ZH00001| 0.49 | 0.35 |
|SPEECHIO_ASR_ZH00002| 3.23 | 2.86 |
|SPEECHIO_ASR_ZH00003| 1.13 | 0.80 |
|SPEECHIO_ASR_ZH00004| 1.33 | 1.10 |
|SPEECHIO_ASR_ZH00005| 1.41 | 1.18 |
|SPEECHIO_ASR_ZH00006| 5.25 | 4.85 |
|SPEECHIO_ASR_ZH00007| 5.51 | 4.97 |
|SPEECHIO_ASR_ZH00008| 3.69 | 3.18 |
|SPEECHIO_ASR_ZH00009| 3.02 | 2.78 |
|SPEECHIO_ASR_ZH00010| 3.35 | 2.99 |
|SPEECHIO_ASR_ZH00011| 1.54 | 1.25 |
|SPEECHIO_ASR_ZH00012| 2.06 | 1.68 |
|SPEECHIO_ASR_ZH00013| 2.57 | 2.25 |
|SPEECHIO_ASR_ZH00014| 3.86 | 3.08 |
|SPEECHIO_ASR_ZH00015| 3.34 | 2.67 |
funasr/bin/asr_inference.py
@@ -453,7 +453,7 @@
                    ibest_writer["score"][key] = str(hyp.score)
                
                if text is not None:
                    text_postprocessed = postprocess_utils.sentence_postprocess(token)
                    text_postprocessed, _ = postprocess_utils.sentence_postprocess(token)
                    item = {'key': key, 'value': text_postprocessed}
                    asr_result_list.append(item)
                    finish_count += 1
funasr/bin/asr_inference_paraformer.py
@@ -428,7 +428,11 @@
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )
    hotword_list_or_file = param_dict['hotword']
    if param_dict is not None:
        hotword_list_or_file = param_dict.get('hotword')
    else:
        hotword_list_or_file = None
    if ngpu >= 1 and torch.cuda.is_available():
        device = "cuda"
    else:
@@ -539,7 +543,7 @@
                        ibest_writer["rtf"][key] = rtf_cur
                    if text is not None:
                        text_postprocessed = postprocess_utils.sentence_postprocess(token)
                        text_postprocessed, _ = postprocess_utils.sentence_postprocess(token)
                        item = {'key': key, 'value': text_postprocessed}
                        asr_result_list.append(item)
                        finish_count += 1
funasr/bin/asr_inference_paraformer_timestamp.py
@@ -436,7 +436,7 @@
                    ibest_writer["score"][key] = str(hyp.score)
    
                if text is not None:
                    text_postprocessed = postprocess_utils.sentence_postprocess(token)
                    text_postprocessed, _ = postprocess_utils.sentence_postprocess(token)
                    item = {'key': key, 'value': text_postprocessed}
                    asr_result_list.append(item)
                    finish_count += 1
funasr/bin/asr_inference_paraformer_vad.py
@@ -242,6 +242,11 @@
            inference=True,
        )
        
        if param_dict is not None:
            use_timestamp = param_dict.get('use_timestamp', True)
        else:
            use_timestamp = True
        finish_count = 0
        file_count = 1
        lfr_factor = 6
@@ -284,8 +289,10 @@
                text, token, token_int = result[0], result[1], result[2]
                time_stamp = None if len(result) < 4 else result[3]
               
                if use_timestamp and time_stamp is not None:
                postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp)
                else:
                    postprocessed_result = postprocess_utils.sentence_postprocess(token)
                text_postprocessed = ""
                time_stamp_postprocessed = ""
                text_postprocessed_punc = postprocessed_result
@@ -293,6 +300,8 @@
                    text_postprocessed, time_stamp_postprocessed, word_lists = postprocessed_result[0], \
                                                                               postprocessed_result[1], \
                                                                               postprocessed_result[2]
                else:
                    text_postprocessed, word_lists = postprocessed_result[0], postprocessed_result[1]
                    text_postprocessed_punc = text_postprocessed
                    if len(word_lists) > 0 and text2punc is not None:
                        text_postprocessed_punc, punc_id_list = text2punc(word_lists, 20)
funasr/bin/asr_inference_paraformer_vad_punc.py
@@ -571,6 +571,11 @@
            inference=True,
        )
    
        if param_dict is not None:
            use_timestamp = param_dict.get('use_timestamp', True)
        else:
            use_timestamp = True
        finish_count = 0
        file_count = 1
        lfr_factor = 6
@@ -613,7 +618,10 @@
                text, token, token_int = result[0], result[1], result[2]
                time_stamp = None if len(result) < 4 else result[3]
    
                if use_timestamp and time_stamp is not None:
                postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp)
                else:
                    postprocessed_result = postprocess_utils.sentence_postprocess(token)
                text_postprocessed = ""
                time_stamp_postprocessed = ""
                text_postprocessed_punc = postprocessed_result
@@ -621,6 +629,9 @@
                    text_postprocessed, time_stamp_postprocessed, word_lists = postprocessed_result[0], \
                                                                               postprocessed_result[1], \
                                                                               postprocessed_result[2]
                else:
                    text_postprocessed, word_lists = postprocessed_result[0], postprocessed_result[1]
                    text_postprocessed_punc = text_postprocessed
                    if len(word_lists) > 0 and text2punc is not None:
                        text_postprocessed_punc, punc_id_list = text2punc(word_lists, 20)
funasr/bin/asr_inference_uniasr.py
@@ -492,7 +492,7 @@
                    ibest_writer["score"][key] = str(hyp.score)
    
                if text is not None:
                    text_postprocessed = postprocess_utils.sentence_postprocess(token)
                    text_postprocessed, _ = postprocess_utils.sentence_postprocess(token)
                    item = {'key': key, 'value': text_postprocessed}
                    asr_result_list.append(item)
                    finish_count += 1
funasr/bin/asr_inference_uniasr_vad.py
@@ -492,7 +492,7 @@
                    ibest_writer["score"][key] = str(hyp.score)
    
                if text is not None:
                    text_postprocessed = postprocess_utils.sentence_postprocess(token)
                    text_postprocessed, _ = postprocess_utils.sentence_postprocess(token)
                    item = {'key': key, 'value': text_postprocessed}
                    asr_result_list.append(item)
                    finish_count += 1
funasr/utils/postprocess_utils.py
@@ -232,5 +232,9 @@
        return sentence, ts_lists, real_word_lists
    else:
        word_lists = abbr_dispose(word_lists)
        real_word_lists = []
        for ch in word_lists:
            if ch != ' ':
                real_word_lists.append(ch)
        sentence = ''.join(word_lists).strip()
        return sentence
        return sentence, real_word_lists