from dataclasses import dataclass
from typing import Dict, Iterable, Optional

import time

import numpy as np
import torch
import torch.nn.functional as F
from torch import Tensor, nn

import whisper

from funasr.models.whisper.utils.decoding import (
    decode as decode_function,
    detect_language as detect_language_function,
)
from funasr.register import tables
from funasr.utils.load_utils import extract_fbank, load_audio_text_image_video


@dataclass
class ModelDimensions:
    n_mels: int
    n_audio_ctx: int
    n_audio_state: int
    n_audio_head: int
    n_audio_layer: int
    n_vocab: int
    n_text_ctx: int
    n_text_state: int
    n_text_head: int
    n_text_layer: int


class LayerNorm(nn.LayerNorm):
    def forward(self, x: Tensor) -> Tensor:
        return super().forward(x.float()).type(x.dtype)


class Linear(nn.Linear):
    def forward(self, x: Tensor) -> Tensor:
        return F.linear(
            x, self.weight.to(x.dtype), None if self.bias is None else self.bias.to(x.dtype)
        )


class Conv1d(nn.Conv1d):
    def _conv_forward(self, x: Tensor, weight: Tensor, bias: Optional[Tensor]) -> Tensor:
        return super()._conv_forward(
            x, weight.to(x.dtype), None if bias is None else bias.to(x.dtype)
        )


def sinusoids(length, channels, max_timescale=10000):
    """Returns sinusoids for positional embedding"""
    assert channels % 2 == 0
    log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
    inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2))
    scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
    return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
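

# A minimal shape sketch (base-model encoder sizes, n_ctx=1500 and n_state=512,
# used here only for illustration): the first half of the channels are sines,
# the second half cosines, so row 0 is [0, ..., 0, 1, ..., 1]:
#
#   pe = sinusoids(1500, 512)                              # -> torch.Size([1500, 512])
#   assert torch.allclose(pe[0, 256:], torch.ones(256))    # cos(0) == 1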


class MultiHeadAttention(nn.Module):
    def __init__(self, n_state: int, n_head: int):
        super().__init__()
        self.n_head = n_head
        self.query = Linear(n_state, n_state)
        self.key = Linear(n_state, n_state, bias=False)
        self.value = Linear(n_state, n_state)
        self.out = Linear(n_state, n_state)

    def forward(
        self,
        x: Tensor,
        xa: Optional[Tensor] = None,
        mask: Optional[Tensor] = None,
        kv_cache: Optional[dict] = None,
    ):
        q = self.query(x)

        if kv_cache is None or xa is None or self.key not in kv_cache:
            # hooks, if installed (i.e. kv_cache is not None), will prepend the cached kv tensors;
            # otherwise, perform key/value projections for self- or cross-attention as usual.
            k = self.key(x if xa is None else xa)
            v = self.value(x if xa is None else xa)
        else:
            # for cross-attention, calculate keys and values once and reuse in subsequent calls.
            k = kv_cache[self.key]
            v = kv_cache[self.value]

        wv, qk = self.qkv_attention(q, k, v, mask)
        return self.out(wv), qk

    def qkv_attention(self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None):
        n_batch, n_ctx, n_state = q.shape
        scale = (n_state // self.n_head) ** -0.25
        q = q.view(*q.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) * scale
        k = k.view(*k.shape[:2], self.n_head, -1).permute(0, 2, 3, 1) * scale
        v = v.view(*v.shape[:2], self.n_head, -1).permute(0, 2, 1, 3)

        qk = q @ k
        if mask is not None:
            qk = qk + mask[:n_ctx, :n_ctx]
        qk = qk.float()

        w = F.softmax(qk, dim=-1).to(q.dtype)
        return (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2), qk.detach()
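

# A minimal shape sketch for the attention module (illustrative sizes,
# n_state=512 split across n_head=8 heads of size 64):
#
#   mha = MultiHeadAttention(n_state=512, n_head=8)
#   wv, qk = mha(torch.randn(1, 10, 512))     # self-attention, no cache
#   assert wv.shape == (1, 10, 512)           # re-flattened head outputs
#   assert qk.shape == (1, 8, 10, 10)         # per-head attention logits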


class ResidualAttentionBlock(nn.Module):
    def __init__(self, n_state: int, n_head: int, cross_attention: bool = False):
        super().__init__()

        self.attn = MultiHeadAttention(n_state, n_head)
        self.attn_ln = LayerNorm(n_state)

        self.cross_attn = MultiHeadAttention(n_state, n_head) if cross_attention else None
        self.cross_attn_ln = LayerNorm(n_state) if cross_attention else None

        n_mlp = n_state * 4
        self.mlp = nn.Sequential(Linear(n_state, n_mlp), nn.GELU(), Linear(n_mlp, n_state))
        self.mlp_ln = LayerNorm(n_state)

    def forward(
        self,
        x: Tensor,
        xa: Optional[Tensor] = None,
        mask: Optional[Tensor] = None,
        kv_cache: Optional[dict] = None,
    ):
        x = x + self.attn(self.attn_ln(x), mask=mask, kv_cache=kv_cache)[0]
        if self.cross_attn:
            x = x + self.cross_attn(self.cross_attn_ln(x), xa, kv_cache=kv_cache)[0]
        x = x + self.mlp(self.mlp_ln(x))
        return x


@tables.register("encoder_classes", "WhisperEncoder")
class AudioEncoder(nn.Module):
    def __init__(self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int):
        super().__init__()
        self.conv1 = Conv1d(n_mels, n_state, kernel_size=3, padding=1)
        self.conv2 = Conv1d(n_state, n_state, kernel_size=3, stride=2, padding=1)
        self.register_buffer("positional_embedding", sinusoids(n_ctx, n_state))

        self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList(
            [ResidualAttentionBlock(n_state, n_head) for _ in range(n_layer)]
        )
        self.ln_post = LayerNorm(n_state)

    def forward(self, x: Tensor):
        """
        x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
            the mel spectrogram of the audio
        """
        x = F.gelu(self.conv1(x))
        x = F.gelu(self.conv2(x))
        x = x.permute(0, 2, 1)

        assert x.shape[1:] == self.positional_embedding.shape, "incorrect audio shape"
        x = (x + self.positional_embedding).to(x.dtype)

        for block in self.blocks:
            x = block(x)

        x = self.ln_post(x)
        return x
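

# A minimal encoder sketch (base-model sizes, for illustration): a 30 s mel
# spectrogram has 3000 frames, and the stride-2 conv halves that to the
# n_ctx=1500 positions the positional embedding expects:
#
#   enc = AudioEncoder(n_mels=80, n_ctx=1500, n_state=512, n_head=8, n_layer=6)
#   feats = enc(torch.randn(1, 80, 3000))     # -> torch.Size([1, 1500, 512])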


@tables.register("decoder_classes", "WhisperDecoder")
class TextDecoder(nn.Module):
    def __init__(self, n_vocab: int, n_ctx: int, n_state: int, n_head: int, n_layer: int):
        super().__init__()

        self.token_embedding = nn.Embedding(n_vocab, n_state)
        self.positional_embedding = nn.Parameter(torch.empty(n_ctx, n_state))

        self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList(
            [ResidualAttentionBlock(n_state, n_head, cross_attention=True) for _ in range(n_layer)]
        )
        self.ln = LayerNorm(n_state)

        mask = torch.empty(n_ctx, n_ctx).fill_(-np.inf).triu_(1)
        self.register_buffer("mask", mask, persistent=False)

    def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None):
        """
        x : torch.LongTensor, shape = (batch_size, <= n_ctx)
            the text tokens
        xa : torch.Tensor, shape = (batch_size, n_audio_ctx, n_audio_state)
            the encoded audio features to be attended on
        """
        offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0
        x = self.token_embedding(x) + self.positional_embedding[offset : offset + x.shape[-1]]
        x = x.to(xa.dtype)

        for block in self.blocks:
            x = block(x, xa, mask=self.mask, kv_cache=kv_cache)

        x = self.ln(x)
        logits = (x @ torch.transpose(self.token_embedding.weight.to(x.dtype), 0, 1)).float()

        return logits
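

# A minimal decoder sketch (toy vocabulary size, for illustration): the output
# logits are tied to the token embedding, so the last axis matches n_vocab.
# Note that `positional_embedding` is created with torch.empty and is expected
# to be filled from a checkpoint:
#
#   dec = TextDecoder(n_vocab=1000, n_ctx=448, n_state=512, n_head=8, n_layer=2)
#   xa = torch.randn(1, 1500, 512)                 # encoder output
#   logits = dec(torch.tensor([[0, 1, 2]]), xa)    # -> torch.Size([1, 3, 1000])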


@tables.register("model_classes", "Whisper")
class Whisper(nn.Module):
    def __init__(self, dims: dict):
        super().__init__()
        dims = ModelDimensions(**dims)
        self.dims = dims
        self.sos = 1
        self.eos = 1
        self.encoder = AudioEncoder(
            self.dims.n_mels,
            self.dims.n_audio_ctx,
            self.dims.n_audio_state,
            self.dims.n_audio_head,
            self.dims.n_audio_layer,
        )
        self.decoder = TextDecoder(
            self.dims.n_vocab,
            self.dims.n_text_ctx,
            self.dims.n_text_state,
            self.dims.n_text_head,
            self.dims.n_text_layer,
        )

    def embed_audio(self, mel: torch.Tensor):
        return self.encoder(mel)

    def logits(self, tokens: torch.Tensor, audio_features: torch.Tensor):
        return self.decoder(tokens, audio_features)

    def forward(self, mel: torch.Tensor, tokens: torch.Tensor) -> Dict[str, torch.Tensor]:
        return self.decoder(tokens, self.encoder(mel))

    @property
    def device(self):
        return next(self.parameters()).device

    @property
    def is_multilingual(self):
        return self.dims.n_vocab == 51865
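
    # A minimal construction sketch (multilingual base-sized dimensions, chosen
    # here for illustration):
    #
    #   dims = dict(n_mels=80, n_audio_ctx=1500, n_audio_state=512,
    #               n_audio_head=8, n_audio_layer=6, n_vocab=51865,
    #               n_text_ctx=448, n_text_state=512, n_text_head=8,
    #               n_text_layer=6)
    #   model = Whisper(dims)                      # model.is_multilingual -> True
    #   logits = model(torch.randn(1, 80, 3000), torch.tensor([[0, 1, 2]]))
    #   # logits.shape -> torch.Size([1, 3, 51865])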

    def install_kv_cache_hooks(self, cache: Optional[dict] = None):
        """
        The `MultiHeadAttention` module optionally accepts `kv_cache` which stores the key and value
        tensors calculated for the previous positions. This method returns a dictionary that stores
        all caches, and the necessary hooks for the key and value projection modules that save the
        intermediate tensors to be reused during later calculations.

        Returns
        -------
        cache : Dict[nn.Module, torch.Tensor]
            A dictionary object mapping the key/value projection modules to their cached tensors
        hooks : List[RemovableHandle]
            List of PyTorch RemovableHandle objects to stop the hooks from being called
        """
        cache = {**cache} if cache is not None else {}
        hooks = []

        def save_to_cache(module, _, output):
            if module not in cache or output.shape[1] > self.decoder.positional_embedding.shape[0]:
                cache[module] = output  # save as-is, for the first token or cross attention
            else:
                cache[module] = torch.cat([cache[module], output], dim=1).detach()
            return cache[module]

        def install_hooks(layer: nn.Module):
            if isinstance(layer, MultiHeadAttention):
                hooks.append(layer.key.register_forward_hook(save_to_cache))
                hooks.append(layer.value.register_forward_hook(save_to_cache))

        self.decoder.apply(install_hooks)
        return cache, hooks
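
    # A minimal sketch of incremental decoding with the cache (hypothetical
    # driver code; `tokens`/`next_token` are LongTensors, `audio_features` is
    # the encoder output):
    #
    #   cache, hooks = model.install_kv_cache_hooks()
    #   logits = model.decoder(tokens, audio_features, kv_cache=cache)      # full prefix
    #   logits = model.decoder(next_token, audio_features, kv_cache=cache)  # one new token
    #   for hook in hooks:
    #       hook.remove()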

    detect_language = detect_language_function
    decode = decode_function


@tables.register("model_classes", "WhisperWarp")
class WhisperWarp(nn.Module):
    def __init__(self, whisper_dims: dict = None, **kwargs):
        super().__init__()
        hub = kwargs.get("hub", "funasr")
        if hub == "openai":
            # load pretrained weights through the openai-whisper package
            init_param_path = kwargs.get("init_param_path", "large-v3")
            model = whisper.load_model(init_param_path)
        else:
            # build an (uninitialized) model from the given dimension dict
            dims = whisper.model.ModelDimensions(**whisper_dims)
            model = whisper.model.Whisper(dims=dims)
        self.model = model

    def forward(self):
        pass

    def inference(
        self,
        data_in,
        data_lengths=None,
        key: list = None,
        tokenizer=None,
        frontend=None,
        **kwargs,
    ):
        if kwargs.get("batch_size", 1) > 1:
            raise NotImplementedError("batch decoding is not implemented")

        meta_data = {}
        if isinstance(data_in, torch.Tensor) and kwargs.get("data_type", "sound") == "fbank":
            # the input is already an fbank feature tensor
            speech, speech_lengths = data_in, data_lengths
            if len(speech.shape) < 3:
                speech = speech[None, :, :]
            if speech_lengths is None:
                speech_lengths = torch.tensor([speech.shape[1]])
        else:
            # load the audio and extract fbank features
            time1 = time.perf_counter()
            audio_sample_list = load_audio_text_image_video(
                data_in,
                fs=frontend.fs,
                audio_fs=kwargs.get("fs", 16000),
                data_type=kwargs.get("data_type", "sound"),
                tokenizer=tokenizer,
            )
            time2 = time.perf_counter()
            meta_data["load_data"] = f"{time2 - time1:0.3f}"
            speech, speech_lengths = extract_fbank(
                audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=frontend
            )
            time3 = time.perf_counter()
            meta_data["extract_feat"] = f"{time3 - time2:0.3f}"
            frame_shift = frontend.frame_shift if hasattr(frontend, "frame_shift") else 10
            lfr_n = frontend.lfr_n if hasattr(frontend, "lfr_n") else 1
            meta_data["batch_data_time"] = speech_lengths.sum().item() * frame_shift * lfr_n / 1000

        speech = speech.to(device=kwargs["device"])[0, :, :]
        speech_lengths = speech_lengths.to(device=kwargs["device"])

        # detect the spoken language
        _, probs = self.model.detect_language(speech)
        print(f"Detected language: {max(probs, key=probs.get)}")

        # decode the audio
        options = whisper.DecodingOptions(language=kwargs.get("language", None), fp16=False)
        result = whisper.decode(self.model, speech, options)

        results = [{"key": key[0], "text": result.text}]
        return results, meta_data
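

# A minimal end-to-end sketch of the warp class (an illustration, assuming the
# openai-whisper "base" checkpoint can be downloaded and that `frontend` is a
# FunASR frontend exposing `.fs`; "example.wav" is a placeholder path):
#
#   model = WhisperWarp(hub="openai", init_param_path="base")
#   results, meta_data = model.inference(
#       "example.wav",
#       key=["example"],
#       frontend=frontend,
#       device="cpu",
#   )
#   print(results[0]["text"])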