游雁
2023-11-24 b5d3df75cf6462aa3bf42fd3c86fa2aa7f1c8a15
funasr/bin/diar_inference_launch.py
@@ -15,7 +15,8 @@
from typing import Union
import numpy as np
import soundfile
# import librosa
import librosa
import torch
from scipy.signal import medfilt
@@ -92,10 +93,7 @@
            embedding_node="resnet1_dense"
        )
        logging.info("speech2xvector_kwargs: {}".format(speech2xvector_kwargs))
        speech2xvector = Speech2Xvector.from_pretrained(
            model_tag=model_tag,
            **speech2xvector_kwargs,
        )
        speech2xvector = Speech2Xvector(**speech2xvector_kwargs)
        speech2xvector.sv_model.eval()
    # 2b. Build speech2diar
@@ -109,10 +107,7 @@
        dur_threshold=dur_threshold,
    )
    logging.info("speech2diarization_kwargs: {}".format(speech2diar_kwargs))
    speech2diar = Speech2DiarizationSOND.from_pretrained(
        model_tag=model_tag,
        **speech2diar_kwargs,
    )
    speech2diar = Speech2DiarizationSOND(**speech2diar_kwargs)
    speech2diar.diar_model.eval()
    def output_results_str(results: dict, uttid: str):
@@ -150,7 +145,9 @@
                        # read waveform file
                        example = [load_bytes(x) if isinstance(x, bytes) else x
                                   for x in example]
                        example = [soundfile.read(x)[0] if isinstance(x, str) else x
                        # example = [librosa.load(x)[0] if isinstance(x, str) else x
                        #            for x in example]
                        example = [librosa.load(x, dtype='float32')[0] if isinstance(x, str) else x
                                   for x in example]
                        # convert torch tensor to numpy array
                        example = [x.numpy() if isinstance(example[0], torch.Tensor) else x
@@ -257,10 +254,7 @@
        dtype=dtype,
    )
    logging.info("speech2diarization_kwargs: {}".format(speech2diar_kwargs))
    speech2diar = Speech2DiarizationEEND.from_pretrained(
        model_tag=model_tag,
        **speech2diar_kwargs,
    )
    speech2diar = Speech2DiarizationEEND(**speech2diar_kwargs)
    speech2diar.diar_model.eval()
    def output_results_str(results: dict, uttid: str):
@@ -462,11 +456,17 @@
        help="The batch size for inference",
    )
    group.add_argument(
        "--diar_smooth_size",
        "--smooth_size",
        type=int,
        default=121,
        help="The smoothing size for post-processing"
    )
    group.add_argument(
        "--dur_threshold",
        type=int,
        default=10,
        help="The threshold of minimum duration"
    )
    return parser