kongdeqiang
5 天以前 28ccfbfc51068a663a80764e14074df5edf2b5ba
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# coding=utf-8
 
import librosa
import base64
import io
import gradio as gr
import re
 
import numpy as np
import torch
import torchaudio
 
# from modelscope import HubApi
#
# api = HubApi()
#
# api.login('')
 
from funasr import AutoModel
 
# model = "/Users/zhifu/Downloads/modelscope_models/SenseVoiceCTC"
# model = "iic/SenseVoiceCTC"
# model = AutoModel(model=model,
#                   vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
#                   vad_kwargs={"max_single_segment_time": 30000},
#                   trust_remote_code=True,
#                   )
 
import re
import os
import sys
 
# ---------------------------------------------------------------------------
# Runtime configuration.
# CLI usage: script.py CKPT_DIR CKPT_ID JSONL OUTPUT_DIR DEVICE [NEW_SYS]
# Passing any sixth positional argument (its value is ignored) sets new_sys.
# With no CLI args, fall back to hard-coded development paths.
# ---------------------------------------------------------------------------
if len(sys.argv) > 1:
    ckpt_dir = sys.argv[1]
    ckpt_id = sys.argv[2]
    jsonl = sys.argv[3]
    output_dir = sys.argv[4]
    device = sys.argv[5]
    # Presence of an extra argument toggles the flag; its value is not read.
    new_sys = len(sys.argv) > 6
else:
    ckpt_dir = "/nfs/beinian.lzr/workspace/GPT-4o/Exp/exp7/5m-8gpu/exp5-1-0619"
    ckpt_id = "model.pt.ep6"
    jsonl = (
        "/nfs/beinian.lzr/workspace/GPT-4o/Data/Speech2Text/TestData/s2tchat.v20240619.test.jsonl"
    )
    dataset = jsonl.split("/")[-1]
    output_dir = os.path.join(ckpt_dir, f"inference-{ckpt_id}", dataset)
    # BUG FIX: `device` and `new_sys` were never assigned in this branch,
    # so AutoModel(device=device) below raised NameError when the script
    # was run without CLI arguments.  Default device chosen for a GPU
    # inference script — TODO confirm against deployment environment.
    device = "cuda:0"
    new_sys = False
 
 
# Instantiate the funasr model once at module load.
# `init_param` points at the specific checkpoint file inside `ckpt_dir`;
# fp16/bf16 are disabled for the wrapper while the inner LLM runs in bf16
# (per the `llm_dtype` kwarg — semantics defined by funasr, not visible here).
model = AutoModel(
    model=ckpt_dir,
    init_param=f"{os.path.join(ckpt_dir, ckpt_id)}",
    output_dir=output_dir,
    device=device,
    fp16=False,
    bf16=False,
    llm_dtype="bf16",
)
 
 
def model_inference(input_wav, text_inputs, fs=16000):
    """Run one round of speech-to-text chat inference.

    Args:
        input_wav: Either a numpy float32 waveform, or a Gradio audio tuple
            ``(sample_rate, int16_ndarray)`` which is normalized to mono
            float32 at 16 kHz.
        text_inputs: The system prompt string typed by the user.
        fs: Fallback sample rate; overwritten when a tuple is supplied.

    Returns:
        Whatever ``model.generate`` returns (funasr result structure).
    """
    if isinstance(input_wav, tuple):
        # Gradio delivers (sample_rate, int16 samples); scale to [-1, 1].
        fs, input_wav = input_wav
        input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
        if len(input_wav.shape) > 1:
            # Down-mix multi-channel audio to mono by averaging channels.
            input_wav = input_wav.mean(-1)
        if fs != 16000:
            print(f"audio_fs: {fs}")
            resampler = torchaudio.transforms.Resample(fs, 16000)
            input_wav_t = torch.from_numpy(input_wav).to(torch.float32)
            input_wav = resampler(input_wav_t[None, :])[0, :].numpy().astype("float32")

    # The raw PCM bytes are embedded inline between speech sentinels.
    input_wav_byte = input_wav.tobytes()

    contents_i = []
    system_prompt = text_inputs
    user_prompt = f"<|startofspeech|>!!{input_wav_byte}<|endofspeech|>"
    contents_i.append({"role": "system", "content": system_prompt})
    contents_i.append({"role": "user", "content": user_prompt})
    contents_i.append({"role": "assistant", "content": "target_out"})

    res = model.generate(
        input=[contents_i],
        # BUG FIX: `tearchforing` and `key` were undefined names, so every
        # call raised NameError.  Bind concrete values: no teacher forcing
        # for free-running inference, and a fixed sample key for the demo.
        # (keyword spelling "tearchforing" follows the funasr API)
        tearchforing=False,
        cache={},
        key="demo",
    )

    print(res)

    return res
 
 
# Clickable examples shown in the UI: each entry is
# [audio URL, system prompt] matching the (audio_inputs, text_inputs) widgets.
audio_examples = [
    [
        "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0764W0121.wav",
        "You are a helpful assistant.",
    ],
]
 
# Markdown rendered at the top of the demo page.
# FIX: corrected user-facing typo "type te" -> "type the".
description = """
Upload an audio file or input through a microphone, then type the System Prompt.


"""
 
 
def launch():
    """Build the Gradio Blocks UI and start the demo server.

    Layout: description markdown, an audio input plus system-prompt textbox,
    the clickable examples, a Start button, and an HTML results pane.
    Clicking Start routes (audio, prompt) through ``model_inference``.
    NOTE(review): component creation order defines the on-page layout, so
    statement order here is significant.
    """
    with gr.Blocks() as demo:
        gr.Markdown(description)
        with gr.Row():
            with gr.Column():
                audio_inputs = gr.Audio(label="Upload audio or use the microphone")
                text_inputs = gr.Text(label="System Prompt", value="You are a helpful assistant.")

                # with gr.Accordion("Configuration"):
                #     # task_inputs = gr.Radio(choices=["Speech Recognition", "Rich Text Transcription"],
                #     #                        value="Speech Recognition", label="Task")
                #     language_inputs = gr.Dropdown(choices=["auto", "zh", "en", "yue", "ja", "ko", "nospeech"],
                #                                   value="auto",
                #                                   label="Language")
            gr.Examples(examples=audio_examples, inputs=[audio_inputs, text_inputs])

        fn_button = gr.Button("Start")

        text_outputs = gr.HTML(label="Results")

        # Wire the button: inputs map positionally onto model_inference's
        # (input_wav, text_inputs) parameters; the result fills the HTML pane.
        fn_button.click(model_inference, inputs=[audio_inputs, text_inputs], outputs=text_outputs)
        # with gr.Accordion("More examples"):
        #     gr.HTML(centered_table_html)
    demo.launch()
 
 
# Script entry point: start the Gradio demo when run directly.
if __name__ == "__main__":
    # iface.launch()
    launch()