| examples/industrial_data_pretraining/llm_asr/demo_speech2text.py | ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史 | |
| examples/industrial_data_pretraining/llm_asr/demo_speech2text.sh | ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史 | |
| funasr/models/llm_asr/model.py | ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史 |
examples/industrial_data_pretraining/llm_asr/demo_speech2text.py
@@ -3,20 +3,37 @@ # Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. # MIT License (https://opensource.org/licenses/MIT) import json import os import sys from funasr import AutoModel model = AutoModel( model="/nfs/beinian.lzr/workspace/GPT-4o/Exp/exp6/4m-8gpu/exp6_speech2text_0607_linear_ddp", ) ckpt_dir = "/nfs/beinian.lzr/workspace/GPT-4o/Exp/exp6/5m-8gpu/exp6_speech2text_linear_ddp_0609" ckpt_id = "model.pt.ep0.90000" jsonl = ( "/nfs/beinian.lzr/workspace/GPT-4o/Data/Speech2Text/TestData/aishell1_test_speech2text.jsonl" ) output_dir = f"{os.path.join(ckpt_dir, ckpt_id)}" ckpt_dir = sys.argv[1] ckpt_id = sys.argv[2] jsonl = sys.argv[3] output_dir = sys.argv[4] device = sys.argv[5] model = AutoModel( model=ckpt_dir, init_param=f"{os.path.join(ckpt_dir, ckpt_id)}", output_dir=output_dir, device=device, ) with open(jsonl, "r") as f: lines = f.readlines() tearchforing = True tearchforing = False for i, line in enumerate(lines): data_dict = json.loads(line.strip()) data = data_dict["messages"] examples/industrial_data_pretraining/llm_asr/demo_speech2text.sh
# (diff view marker) New file @@ -0,0 +1,63 @@
#
# Decode several speech-to-text test sets with one checkpoint and score each
# of them with the FunASR WER/CER tool.
#
# For every data set this script:
#   1. runs ./demo_speech2text.py, which takes five positional arguments:
#      <ckpt_dir> <ckpt_id> <jsonl> <output_dir> <device>
#   2. runs wer.py on <output_dir>/1best_recog/label (reference) and
#      <output_dir>/1best_recog/text_tn (hypothesis), writing text_tn.cer.

ckpt_dir="/nfs/zhifu.gzf/ckpt/saves/qwen_1.5_7b/full/sft/asr_tts_text_exp1_ds_z3/checkpoint-11000"
ckpt_id="model.pt.ep0.90000"
jsonl_dir="/nfs/beinian.lzr/workspace/GPT-4o/Data/Speech2Text/TestData"
wer_script="/mnt/workspace/zhifu.gzf/codebase/FunASR/funasr/metrics/wer.py"

out_dir="${ckpt_dir}/asr"
mkdir -p "${out_dir}"

device="cuda:0"

# run_eval CN_POSTPROCESS DATA_SET...
#   CN_POSTPROCESS  value forwarded to wer.py as ++cn_postprocess (true/false)
#   DATA_SET...     jsonl file names located under ${jsonl_dir}
# Failures are not fatal: like the original loops, the next data set is still
# processed if decoding or scoring of the current one fails.
run_eval() {
    cn_postprocess="$1"
    shift

    for data_set in "$@"; do
        jsonl="${jsonl_dir}/${data_set}"
        output_dir="${out_dir}/${data_set}"
        pred_file="${output_dir}/1best_recog/text_tn"
        ref_file="${output_dir}/1best_recog/label"

        # The device must always be passed: demo_speech2text.py reads
        # sys.argv[5] unconditionally and fails without it.
        python ./demo_speech2text.py "${ckpt_dir}" "${ckpt_id}" "${jsonl}" "${output_dir}" "${device}"

        python "${wer_script}" \
            ++ref_file="${ref_file}" \
            ++hyp_file="${pred_file}" \
            ++cer_file="${pred_file}.cer" \
            ++cn_postprocess="${cn_postprocess}"
    done
}

# English ASR test sets.
run_eval false \
    "librispeech_test_clean_speech2text.jsonl" \
    "librispeech_test_other_speech2text.jsonl"

# Chinese ASR test sets.
# NOTE(review): librispeech_test_other is listed here again, so it is decoded a
# second time and its .cer file is overwritten with cn_postprocess=true.
# Kept as in the original; confirm whether this is intentional.
run_eval true \
    "aishell1_test_speech2text.jsonl" \
    "aishell2_ios_test_speech2text.jsonl" \
    "librispeech_test_other_speech2text.jsonl"

# Speech translation test sets.
run_eval true "s2tt_en2zh.v20240605.test.jsonl"

# NOTE(review): the file name suggests English output, yet the original scores
# it with cn_postprocess=true; kept unchanged - confirm.
run_eval true "s2tt_zh2en.v20240605.test.jsonl"

# (diff view marker) funasr/models/llm_asr/model.py
@@ -700,10 +700,10 @@ generated_ids = self.llm.generate( inputs_embeds=inputs_embeds, max_new_tokens=kwargs.get("max_length", 512) ) generated_ids = [ output_ids[len(input_id) :] for input_id, output_ids in zip(input_ids, generated_ids) ] # generated_ids = [ # output_ids[len(input_id) :] # for input_id, output_ids in zip(input_ids, generated_ids) # ] response = tokenizer.batch_decode( generated_ids, skip_special_tokens=kwargs.get("skip_special_tokens", True) )[0] @@ -733,7 +733,8 @@ ibest_writer = self.writer[f"{0 + 1}best_recog"] results = [] result_i = {"key": key[0], "text": response, "label": label} response_clean = re.sub("[^\w\s\u3000\u4e00-\u9fff]+", "", response) result_i = {"key": key[0], "text": response, "text_tn": response_clean, "label": label} if loss is not None: result_i["loss"] = loss results.append(result_i) @@ -741,5 +742,6 @@ if ibest_writer is not None: ibest_writer["text"][key[0]] = response ibest_writer["label"][key[0]] = label ibest_writer["text_tn"][key[0]] = response_clean return results, meta_data