12 files modified
4 files added
1 file renamed
2 files deleted
| | |
| | | <a name="quick-start"></a> |
| | | ## Quick Start |
| | | |
| | | Below is a quick start tutorial. Test audio files ([Mandarin](https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav), [English](https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_en.wav)). |
| | | |
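| | | As a quick sanity check, the Mandarin test file above can also be transcribed from Python with the `AutoModel` interface used later in this tutorial. This is a minimal sketch; the model name and revision are the ones referenced in the finetuning scripts below, and passing a URL as input assumes remote audio is accepted the same way it is in the wav.scp files further down. |
| | | |
| | | from funasr import AutoModel |
| | | |
| | | # Download the offline Paraformer model automatically and transcribe one file. |
| | | model = AutoModel(model="iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch", |
| | |                   model_revision="v2.0.4") |
| | | res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav") |
| | | print(res) |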
| | | ### Command-line usage |
| | | |
| | |
| | | <a name="快速开始"></a> |
| | | ## 快速开始 |
| | | |
| | | 下面为快速上手教程,测试音频([中文](https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav),[英文](https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_en.wav)) |
| | | |
| | | ### 可执行命令行 |
| | | |
| New file |
| | |
| | | {"key": "BAC009S0764W0121", "source": "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0764W0121.wav", "source_len": 90, "target": "甚至出现交易几乎停滞的情况", "target_len": 13} |
| | | {"key": "BAC009S0916W0489", "source": "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0916W0489.wav", "source_len": 90, "target": "湖北一公司以员工名义贷款数十员工负债千万", "target_len": 20} |
| | | {"key": "asr_example_cn_en", "source": "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_cn_en.wav", "source_len": 91, "target": "所有只要处理 data 不管你是做 machine learning 做 deep learning 做 data analytics 做 data science 也好 scientist 也好通通都要都做的基本功啊那 again 先先对有一些也许对", "target_len": 19} |
| | | {"key": "ID0012W0014", "source": "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_en.wav", "source_len": 88, "target": "he tried to think how it could be", "target_len": 8} |
| | |
| | | BAC009S0764W0121 甚至出现交易几乎停滞的情况 |
| | | BAC009S0916W0489 湖北一公司以员工名义贷款数十员工负债千万 |
| | | asr_example_cn_en 所有只要处理 data 不管你是做 machine learning 做 deep learning 做 data analytics 做 data science 也好 scientist 也好通通都要都做的基本功啊那 again 先先对有一些也许对 |
| | | ID0012W0014 he tried to think how it could be |
| | |
| | | BAC009S0764W0121 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0764W0121.wav |
| | | BAC009S0916W0489 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0916W0489.wav |
| | | asr_example_cn_en https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_cn_en.wav |
| | | ID0012W0014 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_en.wav |
| New file |
| | |
| | | {"key": "ID0012W0013", "source": "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", "source_len": 88, "target": "欢迎大家来体验达摩院推出的语音识别模型", "target_len": 19} |
| | | {"key": "ID0012W0014", "source": "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_en.wav", "source_len": 88, "target": "he tried to think how it could be", "target_len": 8} |
| | |
| | | ID0012W0013 欢迎大家来体验达摩院推出的语音识别模型 |
| | | ID0012W0014 he tried to think how it could be |
| | |
| | | ID0012W0013 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav |
| | | ID0012W0014 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_en.wav |
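| | | The wav.scp and text files above are plain two-column lists keyed by utterance ID: audio path or URL on one side, transcript on the other. A rough sketch of how the two pair up into jsonl entries (illustrative only; the packaged scp2jsonl tool used in the scripts below does this for you and also fills in source_len/target_len): |
| | | |
| | | import json |
| | | |
| | | def read_two_column(path): |
| | |     """Parse 'key value...' lines into a dict; the value may contain spaces.""" |
| | |     entries = {} |
| | |     with open(path, encoding="utf-8") as f: |
| | |         for line in f: |
| | |             line = line.strip() |
| | |             if line: |
| | |                 key, value = line.split(maxsplit=1) |
| | |                 entries[key] = value |
| | |     return entries |
| | | |
| | | wavs = read_two_column("val_wav.scp") |
| | | texts = read_two_column("val_text.txt") |
| | | with open("val.jsonl", "w", encoding="utf-8") as out: |
| | |     for key, wav in wavs.items(): |
| | |         if key in texts: |
| | |             out.write(json.dumps({"key": key, "source": wav, "target": texts[key]}, ensure_ascii=False) + "\n") |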
| New file |
| | |
| | | # Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. |
| | | # MIT License (https://opensource.org/licenses/MIT) |
| | | |
| | | # method1, finetune from model hub |
| | | |
| | | # which GPU(s) to use for training or finetuning |
| | | export CUDA_VISIBLE_DEVICES="0,1" |
| | | gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') |
| | | |
| | | # model_name from model_hub, or model_dir in local path |
| | | |
| | | ## option 1, download model automatically |
| | | model_name_or_model_dir="iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch" |
| | | model_revision="v2.0.4" |
| | | |
| | | ## option 2, download model by git |
| | | #local_path_root=${workspace}/modelscope_models |
| | | #mkdir -p ${local_path_root}/${model_name_or_model_dir} |
| | | #git clone https://www.modelscope.cn/${model_name_or_model_dir}.git ${local_path_root}/${model_name_or_model_dir} |
| | | #model_name_or_model_dir=${local_path_root}/${model_name_or_model_dir} |
| | | |
| | | |
| | | # data dir, which contains: train.jsonl, val.jsonl |
| | | data_dir="../../../data/list" |
| | | |
| | | train_data="${data_dir}/train.jsonl" |
| | | val_data="${data_dir}/val.jsonl" |
| | | |
| | | # generate train.jsonl and val.jsonl from wav.scp and text.txt |
| | | scp2jsonl \ |
| | | ++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \ |
| | | ++data_type_list='["source", "target"]' \ |
| | | ++jsonl_file_out="${train_data}" |
| | | |
| | | scp2jsonl \ |
| | | ++scp_file_list='["../../../data/list/val_wav.scp", "../../../data/list/val_text.txt"]' \ |
| | | ++data_type_list='["source", "target"]' \ |
| | | ++jsonl_file_out="${val_data}" |
| | | |
| | | |
| | | # exp output dir |
| | | output_dir="./outputs" |
| | | log_file="${output_dir}/log.txt" |
| | | |
| | | |
| | | mkdir -p ${output_dir} |
| | | echo "log_file: ${log_file}" |
| | | |
| | | torchrun \ |
| | | --nnodes 1 \ |
| | | --nproc_per_node ${gpu_num} \ |
| | | ../../../funasr/bin/train.py \ |
| | | ++model="${model_name_or_model_dir}" \ |
| | | ++model_revision="${model_revision}" \ |
| | | ++train_data_set_list="${train_data}" \ |
| | | ++valid_data_set_list="${val_data}" \ |
| | | ++dataset_conf.batch_size=20000 \ |
| | | ++dataset_conf.batch_type="token" \ |
| | | ++dataset_conf.num_workers=4 \ |
| | | ++train_conf.max_epoch=50 \ |
| | | ++train_conf.log_interval=1 \ |
| | | ++train_conf.resume=false \ |
| | | ++train_conf.validate_interval=2000 \ |
| | | ++train_conf.save_checkpoint_interval=2000 \ |
| | | ++train_conf.keep_nbest_models=20 \ |
| | | ++optim_conf.lr=0.0002 \ |
| | | ++output_dir="${output_dir}" &> ${log_file} |
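| | | Once training finishes, the checkpoints and logs land under ./outputs. A heavily hedged sketch of loading the result for inference; whether the experiment directory can be passed to AutoModel directly depends on the FunASR version, so you may instead need to point at a specific checkpoint file inside it: |
| | | |
| | | from funasr import AutoModel |
| | | |
| | | # Assumption: the experiment directory written by train.py is usable as a model path. |
| | | model = AutoModel(model="./outputs") |
| | | res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav") |
| | | print(res) |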
| | |
| | | export CUDA_VISIBLE_DEVICES="0,1" |
| | | gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') |
| | | |
| | | # model_name from model_hub, or model_dir in local path |
| | | |
| | | ## option 1, download model automatically |
| | | model_name_or_model_dir="iic/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404" |
| | | model_revision="v2.0.4" |
| | | |
| | | ## option 2, download model by git |
| | | #local_path_root=${workspace}/modelscope_models |
| | | #mkdir -p ${local_path_root}/${model_name_or_model_dir} |
| | | #git clone https://www.modelscope.cn/${model_name_or_model_dir}.git ${local_path_root}/${model_name_or_model_dir} |
| | | #model_name_or_model_dir=${local_path_root}/${model_name_or_model_dir} |
| | | |
| | | |
| | | # data dir, which contains: train.jsonl, val.jsonl |
| | | data_dir="../../../data/list" |
| | | |
| | | train_data="${data_dir}/train.jsonl" |
| | | val_data="${data_dir}/val.jsonl" |
| | | |
| | | # generate train.jsonl and val.jsonl from wav.scp and text.txt |
| | | scp2jsonl \ |
| | | ++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \ |
| | | ++data_type_list='["source", "target"]' \ |
| | | ++jsonl_file_out="${train_data}" |
| | | |
| | | scp2jsonl \ |
| | | ++scp_file_list='["../../../data/list/val_wav.scp", "../../../data/list/val_text.txt"]' \ |
| | | ++data_type_list='["source", "target"]' \ |
| | | ++jsonl_file_out="${val_data}" |
| | | |
| | | |
| | | # exp output dir |
| | | output_dir="/Users/zhifu/exp" |
| | | output_dir="./outputs" |
| | | log_file="${output_dir}/log.txt" |
| | | |
| | | |
| | |
| | | torchrun \ |
| | | --nnodes 1 \ |
| | | --nproc_per_node ${gpu_num} \ |
| | | ../../../funasr/bin/train.py \ |
| | | ++model="${model_name_or_model_dir}" \ |
| | | ++model_revision="${model_revision}" \ |
| | | ++train_data_set_list="${train_data}" \ |
| | | ++valid_data_set_list="${val_data}" \ |
| | | ++dataset_conf.batch_size=20000 \ |
| | | ++dataset_conf.batch_type="token" \ |
| | | ++dataset_conf.num_workers=4 \ |
| | | ++train_conf.max_epoch=50 \ |
| | | ++train_conf.log_interval=1 \ |
| | | ++train_conf.resume=false \ |
| | | ++train_conf.validate_interval=2000 \ |
| | | ++train_conf.save_checkpoint_interval=2000 \ |
| | | ++train_conf.keep_nbest_models=20 \ |
| | | ++optim_conf.lr=0.0002 \ |
| | | ++output_dir="${output_dir}" &> ${log_file} |
| | |
| | | # Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. |
| | | # MIT License (https://opensource.org/licenses/MIT) |
| | | |
| | | # method1, finetune from model hub |
| | | |
| | | # which GPU(s) to use for training or finetuning |
| | | export CUDA_VISIBLE_DEVICES="0,1" |
| | | gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') |
| | | |
| | | # model_name from model_hub, or model_dir in local path |
| | | |
| | | ## option 1, download model automatically |
| | | model_name_or_model_dir="iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" |
| | | model_revision="v2.0.4" |
| | | |
| | | ## option 2, download model by git |
| | | #local_path_root=${workspace}/modelscope_models |
| | | #mkdir -p ${local_path_root}/${model_name_or_model_dir} |
| | | #git clone https://www.modelscope.cn/${model_name_or_model_dir}.git ${local_path_root}/${model_name_or_model_dir} |
| | | #model_name_or_model_dir=${local_path_root}/${model_name_or_model_dir} |
| | | |
| | | |
| | | # data dir, which contains: train.jsonl, val.jsonl |
| | | data_dir="../../../data/list" |
| | | |
| | | train_data="${data_dir}/train.jsonl" |
| | | val_data="${data_dir}/val.jsonl" |
| | | |
| | | # generate train.jsonl and val.jsonl from wav.scp and text.txt |
| | | scp2jsonl \ |
| | | ++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \ |
| | | ++data_type_list='["source", "target"]' \ |
| | | ++jsonl_file_out="${train_data}" |
| | | |
| | | scp2jsonl \ |
| | | ++scp_file_list='["../../../data/list/val_wav.scp", "../../../data/list/val_text.txt"]' \ |
| | | ++data_type_list='["source", "target"]' \ |
| | | ++jsonl_file_out="${val_data}" |
| | | |
| | | |
| | | # exp output dir |
| | | output_dir="/Users/zhifu/exp" |
| | | output_dir="./outputs" |
| | | log_file="${output_dir}/log.txt" |
| | | |
| | | |
| | |
| | | torchrun \ |
| | | --nnodes 1 \ |
| | | --nproc_per_node ${gpu_num} \ |
| | | ../../../funasr/bin/train.py \ |
| | | ++model="${model_name_or_model_dir}" \ |
| | | ++model_revision="${model_revision}" \ |
| | | ++train_data_set_list="${train_data}" \ |
| | | ++valid_data_set_list="${val_data}" \ |
| | | ++dataset_conf.batch_size=20000 \ |
| | | ++dataset_conf.batch_type="token" \ |
| | | ++dataset_conf.num_workers=4 \ |
| | | ++train_conf.max_epoch=50 \ |
| | | ++train_conf.log_interval=1 \ |
| | | ++train_conf.resume=false \ |
| | | ++train_conf.validate_interval=2000 \ |
| | | ++train_conf.save_checkpoint_interval=2000 \ |
| | | ++train_conf.keep_nbest_models=20 \ |
| | | ++optim_conf.lr=0.0002 \ |
| | | ++output_dir="${output_dir}" &> ${log_file} |
| | |
| | | # Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. |
| | | # MIT License (https://opensource.org/licenses/MIT) |
| | | |
| | | import os |
| | | |
| | | from funasr import AutoModel |
| | | |
| | | chunk_size = [0, 10, 5] #[0, 10, 5] 600ms, [0, 8, 4] 480ms |
| | | encoder_chunk_look_back = 4 #number of chunks to lookback for encoder self-attention |
| | | decoder_chunk_look_back = 1 #number of encoder chunks to lookback for decoder cross-attention |
| | | model = AutoModel(model="iic/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online", model_revision="v2.0.4") |
| | | |
| | | wav_file = os.path.join(model.model_path, "example/asr_example.wav") |
| | | res = model.generate(input=wav_file, |
| | | chunk_size=chunk_size, |
| | | encoder_chunk_look_back=encoder_chunk_look_back, |
| | | decoder_chunk_look_back=decoder_chunk_look_back, |
| | | ) |
| | | print(res) |
| | | |
| | | |
| | | import soundfile |
| | | |
| | | |
| | | wav_file = os.path.join(model.model_path, "example/asr_example.wav") |
| | | speech, sample_rate = soundfile.read(wav_file) |
| | | |
| | | chunk_stride = chunk_size[1] * 960  # 600 ms per chunk for chunk_size [0, 10, 5] (480 ms for [0, 8, 4]) |
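| | | The hunk ends here; the remainder of the demo feeds the audio to the model chunk by chunk. A minimal continuation sketch in line with the standard FunASR streaming example, reusing the variables defined above (the cache dict and is_final flag drive the incremental decoding): |
| | | |
| | | cache = {} |
| | | total_chunk_num = (len(speech) - 1) // chunk_stride + 1 |
| | | for i in range(total_chunk_num): |
| | |     speech_chunk = speech[i * chunk_stride:(i + 1) * chunk_stride] |
| | |     is_final = i == total_chunk_num - 1 |
| | |     res = model.generate(input=speech_chunk, |
| | |                          cache=cache, |
| | |                          is_final=is_final, |
| | |                          chunk_size=chunk_size, |
| | |                          encoder_chunk_look_back=encoder_chunk_look_back, |
| | |                          decoder_chunk_look_back=decoder_chunk_look_back) |
| | |     print(res) |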
| New file |
| | |
| | | # Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. |
| | | # MIT License (https://opensource.org/licenses/MIT) |
| | | |
| | | |
| | | # which GPU(s) to use for training or finetuning |
| | | export CUDA_VISIBLE_DEVICES="0,1" |
| | | gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') |
| | | |
| | | # model_name from model_hub, or model_dir in local path |
| | | |
| | | ## option 1, download model automatically |
| | | model_name_or_model_dir="iic/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online" |
| | | model_revision="v2.0.4" |
| | | |
| | | ## option 2, download model by git |
| | | #local_path_root=${workspace}/modelscope_models |
| | | #mkdir -p ${local_path_root}/${model_name_or_model_dir} |
| | | #git clone https://www.modelscope.cn/${model_name_or_model_dir}.git ${local_path_root}/${model_name_or_model_dir} |
| | | #model_name_or_model_dir=${local_path_root}/${model_name_or_model_dir} |
| | | |
| | | |
| | | # data dir, which contains: train.jsonl, val.jsonl |
| | | data_dir="../../../data/list" |
| | | |
| | | train_data="${data_dir}/train.jsonl" |
| | | val_data="${data_dir}/val.jsonl" |
| | | |
| | | # generate train.jsonl and val.jsonl from wav.scp and text.txt |
| | | scp2jsonl \ |
| | | ++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \ |
| | | ++data_type_list='["source", "target"]' \ |
| | | ++jsonl_file_out="${train_data}" |
| | | |
| | | scp2jsonl \ |
| | | ++scp_file_list='["../../../data/list/val_wav.scp", "../../../data/list/val_text.txt"]' \ |
| | | ++data_type_list='["source", "target"]' \ |
| | | ++jsonl_file_out="${val_data}" |
| | | |
| | | |
| | | # exp output dir |
| | | output_dir="./outputs" |
| | | log_file="${output_dir}/log.txt" |
| | | |
| | | |
| | | mkdir -p ${output_dir} |
| | | echo "log_file: ${log_file}" |
| | | |
| | | torchrun \ |
| | | --nnodes 1 \ |
| | | --nproc_per_node ${gpu_num} \ |
| | | ../../../funasr/bin/train.py \ |
| | | ++model="${model_name_or_model_dir}" \ |
| | | ++model_revision="${model_revision}" \ |
| | | ++train_data_set_list="${train_data}" \ |
| | | ++valid_data_set_list="${val_data}" \ |
| | | ++dataset_conf.batch_size=20000 \ |
| | | ++dataset_conf.batch_type="token" \ |
| | | ++dataset_conf.num_workers=4 \ |
| | | ++train_conf.max_epoch=50 \ |
| | | ++train_conf.log_interval=1 \ |
| | | ++train_conf.resume=false \ |
| | | ++train_conf.validate_interval=2000 \ |
| | | ++train_conf.save_checkpoint_interval=2000 \ |
| | | ++train_conf.keep_nbest_models=20 \ |
| | | ++optim_conf.lr=0.0002 \ |
| | | ++output_dir="${output_dir}" &> ${log_file} |
| | |
| | | # Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. |
| | | # MIT License (https://opensource.org/licenses/MIT) |
| | | |
| | | # method1, finetune from model hub |
| | | |
| | | # which GPU(s) to use for training or finetuning |
| | | export CUDA_VISIBLE_DEVICES="0,1" |
| | | gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') |
| | | |
| | | # model_name from model_hub, or model_dir in local path |
| | | |
| | | ## option 1, download model automatically |
| | | model_name_or_model_dir="iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" |
| | | model_revision="v2.0.4" |
| | | |
| | | ## option 2, download model by git |
| | | #local_path_root=${workspace}/modelscope_models |
| | | #mkdir -p ${local_path_root}/${model_name_or_model_dir} |
| | | #git clone https://www.modelscope.cn/${model_name_or_model_dir}.git ${local_path_root}/${model_name_or_model_dir} |
| | | #model_name_or_model_dir=${local_path_root}/${model_name_or_model_dir} |
| | | |
| | | |
| | | # data dir, which contains: train.jsonl, val.jsonl |
| | | data_dir="../../../data/list" |
| | | |
| | | train_data="${data_dir}/train.jsonl" |
| | | val_data="${data_dir}/val.jsonl" |
| | | |
| | | # generate train.jsonl and val.jsonl from wav.scp and text.txt |
| | | scp2jsonl \ |
| | | ++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \ |
| | | ++data_type_list='["source", "target"]' \ |
| | | ++jsonl_file_out="${train_data}" |
| | | |
| | | scp2jsonl \ |
| | | ++scp_file_list='["../../../data/list/val_wav.scp", "../../../data/list/val_text.txt"]' \ |
| | | ++data_type_list='["source", "target"]' \ |
| | | ++jsonl_file_out="${val_data}" |
| | | |
| | | |
| | | # exp output dir |
| | | output_dir="/Users/zhifu/exp" |
| | | output_dir="./outputs" |
| | | log_file="${output_dir}/log.txt" |
| | | |
| | | |
| | |
| | | torchrun \ |
| | | --nnodes 1 \ |
| | | --nproc_per_node ${gpu_num} \ |
| | | ../../../funasr/bin/train.py \ |
| | | ++model="${model_name_or_model_dir}" \ |
| | | ++model_revision="${model_revision}" \ |
| | | ++train_data_set_list="${train_data}" \ |
| | | ++valid_data_set_list="${val_data}" \ |
| | | ++dataset_conf.batch_size=20000 \ |
| | | ++dataset_conf.batch_type="token" \ |
| | | ++dataset_conf.num_workers=4 \ |
| | | ++train_conf.max_epoch=50 \ |
| | | ++train_conf.log_interval=1 \ |
| | | ++train_conf.resume=false \ |
| | | ++train_conf.validate_interval=2000 \ |
| | | ++train_conf.save_checkpoint_interval=2000 \ |
| | | ++train_conf.keep_nbest_models=20 \ |
| | | ++optim_conf.lr=0.0002 \ |
| | | ++output_dir="${output_dir}" &> ${log_file} |
| | |
| | | data_file_lists = f.readlines() |
| | | lines_for_each_th = (len(data_file_lists)-1)//cpu_cores + 1 |
| | | task_num = cpu_cores if len(data_file_lists) > cpu_cores else 1 |
| | | if task_num > 1: |
| | | with concurrent.futures.ThreadPoolExecutor(max_workers=cpu_cores) as executor: |
| | | |
| | | futures = [executor.submit(parse_context_length, data_file_lists[i*lines_for_each_th:(i+1)*lines_for_each_th], data_type) for i in range(task_num)] |
| | |
| | | for future in concurrent.futures.as_completed(futures): |
| | | |
| | | json_dict[data_type].update(future.result()) |
| | | else: |
| | | res = parse_context_length(data_file_lists, data_type) |
| | | json_dict[data_type].update(res) |
| | | |
| | | with open(jsonl_file_out, "w") as f: |
| | | for key in json_dict[data_type_list[0]].keys(): |
| | |
| | | jsonl_line = json.dumps(jsonl_line, ensure_ascii=False) |
| | | f.write(jsonl_line+"\n") |
| | | f.flush() |
| | | print(f"processed {len(json_dict[data_type_list[0]])} samples") |
| | | |
| | | else: |
| | | pass |
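| | | The thread-pool split above hands each worker roughly ceil(N / cpu_cores) lines via the lines_for_each_th slice bound. A tiny standalone check of that partitioning arithmetic, with made-up numbers: |
| | | |
| | | # Hypothetical sizes, only to illustrate the ceiling-division split used above. |
| | | cpu_cores = 8 |
| | | data_file_lists = [f"line{i}" for i in range(100)] |
| | | |
| | | lines_for_each_th = (len(data_file_lists) - 1) // cpu_cores + 1  # ceil(100 / 8) = 13 |
| | | task_num = cpu_cores if len(data_file_lists) > cpu_cores else 1 |
| | | |
| | | chunks = [data_file_lists[i * lines_for_each_th:(i + 1) * lines_for_each_th] for i in range(task_num)] |
| | | assert sum(len(c) for c in chunks) == len(data_file_lists)  # every line assigned exactly once |
| | | print([len(c) for c in chunks])  # [13, 13, 13, 13, 13, 13, 13, 9] |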
| | |
| | | ], |
| | | entry_points={"console_scripts": [ |
| | | "funasr = funasr.bin.inference:main_hydra", |
| | | "funasr-train = funasr.bin.train:main_hydra", |
| | | "funasr-export = funasr.bin.export:main_hydra", |
| | | "scp2jsonl = funasr.datasets.audio_datasets.scp2jsonl:main_hydra", |
| | | "jsonl2scp = funasr.datasets.audio_datasets.jsonl2scp:main_hydra", |
| | | ]}, |
| | | ) |