From bf04c4bab811003ff442f6a01f29920b8d4f41ea Mon Sep 17 00:00:00 2001
From: 志浩 <neo.dzh@alibaba-inc.com>
Date: Thu, 27 Apr 2023 17:27:47 +0800
Subject: [PATCH] add docs for speaker verification and diarization
---
/dev/null | 20 -----
docs/modescope_pipeline/sd_pipeline.md | 1
egs_modelscope/speaker_diarization/TEMPLATE/README.md | 81 ++++++++++++++++++++
egs_modelscope/speaker_verification/TEMPLATE/README.md | 121 ++++++++++++++++++++++++++++++
egs_modelscope/speaker_verification/TEMPLATE/infer.py | 15 +++
docs/modescope_pipeline/sv_pipeline.md | 1
6 files changed, 219 insertions(+), 20 deletions(-)
diff --git a/docs/modescope_pipeline/sd_pipeline.md b/docs/modescope_pipeline/sd_pipeline.md
deleted file mode 100644
index 1330fe6..0000000
--- a/docs/modescope_pipeline/sd_pipeline.md
+++ /dev/null
@@ -1,20 +0,0 @@
-# Speaker Diarization
-
-## Inference with pipeline
-
-### Quick start
-
-### Inference with you data
-
-### Inference with multi-threads on CPU
-
-### Inference with multi GPU
-
-## Finetune with pipeline
-
-### Quick start
-
-### Finetune with your data
-
-## Inference with your finetuned model
-
diff --git a/docs/modescope_pipeline/sd_pipeline.md b/docs/modescope_pipeline/sd_pipeline.md
new file mode 120000
index 0000000..9c3ac98
--- /dev/null
+++ b/docs/modescope_pipeline/sd_pipeline.md
@@ -0,0 +1 @@
+../../egs_modelscope/speaker_diarization/TEMPLATE/README.md
\ No newline at end of file
diff --git a/docs/modescope_pipeline/sv_pipeline.md b/docs/modescope_pipeline/sv_pipeline.md
deleted file mode 100644
index c57db38..0000000
--- a/docs/modescope_pipeline/sv_pipeline.md
+++ /dev/null
@@ -1,20 +0,0 @@
-# Speaker Verification
-
-## Inference with pipeline
-
-### Quick start
-
-### Inference with you data
-
-### Inference with multi-threads on CPU
-
-### Inference with multi GPU
-
-## Finetune with pipeline
-
-### Quick start
-
-### Finetune with your data
-
-## Inference with your finetuned model
-
diff --git a/docs/modescope_pipeline/sv_pipeline.md b/docs/modescope_pipeline/sv_pipeline.md
new file mode 120000
index 0000000..3217355
--- /dev/null
+++ b/docs/modescope_pipeline/sv_pipeline.md
@@ -0,0 +1 @@
+../../egs_modelscope/speaker_verification/TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/speaker_diarization/TEMPLATE/README.md b/egs_modelscope/speaker_diarization/TEMPLATE/README.md
new file mode 100644
index 0000000..2cd702c
--- /dev/null
+++ b/egs_modelscope/speaker_diarization/TEMPLATE/README.md
@@ -0,0 +1,81 @@
+# Speaker Diarization
+
+> **Note**:
+> The modelscope pipeline supports inference and finetuning with all the models in the
+> [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope).
+> Here we take the xvector_sv model as an example to demonstrate the usage.
+
+## Inference with pipeline
+### Quick start
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+# initialize pipeline
+inference_diar_pipeline = pipeline(
+ mode="sond_demo",
+ num_workers=0,
+ task=Tasks.speaker_diarization,
+ diar_model_config="sond.yaml",
+ model='damo/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch',
+ reversion="v1.0.5",
+ sv_model="damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch",
+ sv_model_revision="v1.2.2",
+)
+
+# input: a list of audio files, in which the first item is the recording to be diarized
+# and the following wav files are used to extract speaker embeddings.
+audio_list = [
+ "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/speaker_diarization/record.wav",
+ "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/speaker_diarization/spk1.wav",
+ "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/speaker_diarization/spk2.wav",
+ "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/speaker_diarization/spk3.wav",
+ "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/speaker_diarization/spk4.wav",
+]
+
+results = inference_diar_pipeline(audio_in=audio_list)
+print(results)
+```
+
+#### API-reference
+##### Define pipeline
+- `task`: `Tasks.speaker_diarization`
+- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+- `ngpu`: `1` (Default), decoding on GPU; if `ngpu=0`, decoding on CPU
+- `output_dir`: `None` (Default), the output path of results if set
+- `batch_size`: `1` (Default), batch size when decoding
+- `smooth_size`: `83` (Default), the window size to perform smoothing
+- `dur_threshold`: `10` (Default), the minimum duration threshold; with the default value, segments shorter than 100 ms will be dropped
+- `out_format`: `vad` (Default), the output format, choices `["vad", "rttm"]`
+  - vad format: `spk1: [1.0, 3.0], [5.0, 8.0]`
+  - rttm format: `SPEAKER test1 0 1.00 2.00 <NA> <NA> spk1 <NA> <NA>` and `SPEAKER test1 0 5.00 3.00 <NA> <NA> spk1 <NA> <NA>`
+
+##### Infer pipeline
+- `audio_in`: the input to process, which could be:
+  - list of urls: e.g., waveform files hosted on a website
+  - list of local file paths: e.g., path/to/a.wav
+ - ("wav.scp,speech,sound", "profile.scp,profile,kaldi_ark"): a script file of waveform files and another script file of speaker profiles (extracted with the [model](https://www.modelscope.cn/models/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/summary))
+ ```text
+ wav.scp
+ test1 path/to/enroll1.wav
+ test2 path/to/enroll2.wav
+
+ profile.scp
+ test1 path/to/profile.ark:11
+ test2 path/to/profile.ark:234
+ ```
+ The profile.ark file contains speaker embeddings in a kaldi-like style.
+  Please refer to [README.md](../../speaker_verification/TEMPLATE/README.md) for more details.
+
+### Inference with your data
+For a single input, we recommend the "list of local file paths" mode for inference.
+For multiple inputs, we recommend the last mode with pre-organized wav.scp and profile.scp files, as shown in the sketch below.
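+
+A minimal sketch of the scp-based invocation (file names are placeholders for
+your own data; the pipeline arguments mirror the quick-start example above):
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+inference_diar_pipeline = pipeline(
+    mode="sond_demo",
+    num_workers=0,
+    task=Tasks.speaker_diarization,
+    diar_model_config="sond.yaml",
+    model='damo/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch',
+    reversion="v1.0.5",
+    sv_model="damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch",
+    sv_model_revision="v1.2.2",
+    output_dir="./results",  # save the diarization results for all inputs
+)
+
+# wav.scp lists the recordings to diarize; profile.scp lists the pre-extracted
+# speaker profiles (see the input formats above)
+results = inference_diar_pipeline(
+    audio_in=("wav.scp,speech,sound", "profile.scp,profile,kaldi_ark"))
+```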
+
+### Inference with multi-threads on CPU
+We recommend the last mode: split wav.scp and profile.scp into several parts and run inference on each part, as sketched below.
+Please refer to [README.md](../../speaker_verification/TEMPLATE/README.md) for a similar process.
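+
+A minimal sketch of the split step in Python (the four-way split and the
+`splits/` directory are illustrative choices; wav.scp and profile.scp are
+assumed to list the same keys in the same order, so the parts stay paired):
+```python
+import os
+
+def split_scp(scp_path, num_parts, out_dir="splits"):
+    # Distribute "utt_id value" lines round-robin over num_parts files.
+    os.makedirs(out_dir, exist_ok=True)
+    with open(scp_path) as f:
+        lines = f.readlines()
+    for i in range(num_parts):
+        part = os.path.join(out_dir, f"{os.path.basename(scp_path)}.{i}")
+        with open(part, "w") as f:
+            f.writelines(lines[i::num_parts])
+
+split_scp("wav.scp", 4)
+split_scp("profile.scp", 4)
+```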
+
+### Inference with multi GPU
+Similar to inference on CPU, but set `ngpu=1` when defining the pipeline.
+In addition, use `CUDA_VISIBLE_DEVICES` to specify a GPU device for each process.
+Please refer to [README.md](../../speaker_verification/TEMPLATE/README.md) for a similar process.
diff --git a/egs_modelscope/speaker_verification/TEMPLATE/README.md b/egs_modelscope/speaker_verification/TEMPLATE/README.md
new file mode 100644
index 0000000..957da90
--- /dev/null
+++ b/egs_modelscope/speaker_verification/TEMPLATE/README.md
@@ -0,0 +1,121 @@
+# Speaker Verification
+
+> **Note**:
+> The modelscope pipeline supports inference and finetuning with all the models in the
+> [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope).
+> Here we take the xvector_sv model as an example to demonstrate the usage.
+
+## Inference with pipeline
+
+### Quick start
+#### Speaker verification
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+inference_sv_pipeline = pipeline(
+ task=Tasks.speaker_verification,
+ model='damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch'
+)
+
+# The same speaker
+rec_result = inference_sv_pipeline(audio_in=(
+ 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_enroll.wav',
+ 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_same.wav'))
+print("Similarity", rec_result["scores"])
+
+# Different speakers
+rec_result = inference_sv_pipeline(audio_in=(
+ 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_enroll.wav',
+ 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_different.wav'))
+print("Similarity", rec_result["scores"])
+```
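+
+To turn the similarity into a same/different decision, compare it against the
+pipeline's decision threshold. A minimal sketch, assuming `rec_result["scores"]`
+holds a single similarity value in (0, 1) (the exact output shape may vary
+across model versions):
+```python
+SV_THRESHOLD = 0.9465  # the default sv_threshold, see the API reference below
+
+score = rec_result["scores"]  # assumption: a scalar similarity in (0, 1)
+print("Same speaker" if score >= SV_THRESHOLD else "Different speakers")
+```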
+#### Speaker embedding extraction
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+# Define extraction pipeline
+inference_sv_pipeline = pipeline(
+ task=Tasks.speaker_verification,
+ model='damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch'
+)
+# Extract speaker embedding
+rec_result = inference_sv_pipeline(
+ audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_enroll.wav')
+speaker_embedding = rec_result["spk_embedding"]
+```
+For the full demo code, please refer to [infer.py](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/speaker_verification/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/infer.py).
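+
+With embeddings extracted for two utterances, you can also score them yourself.
+A minimal sketch, assuming `spk_embedding` is a 1-D array-like vector and
+`other_result` is a second extraction result obtained the same way (cosine
+similarity is an illustrative scoring choice, not necessarily the pipeline's
+internal scorer):
+```python
+import numpy as np
+
+emb1 = np.asarray(rec_result["spk_embedding"], dtype=np.float32).reshape(-1)
+emb2 = np.asarray(other_result["spk_embedding"], dtype=np.float32).reshape(-1)
+
+# Cosine similarity between the two speaker embeddings, in [-1, 1]
+similarity = float(np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2)))
+print("Cosine similarity:", similarity)
+```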
+
+#### API-reference
+##### Define pipeline
+- `task`: `Tasks.speaker_verification`
+- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+- `ngpu`: `1` (Default), decoding on GPU; if `ngpu=0`, decoding on CPU
+- `output_dir`: `None` (Default), the output path of results if set
+- `batch_size`: `1` (Default), batch size when decoding
+- `sv_threshold`: `0.9465` (Default), the similarity threshold to determine
+whether two utterances belong to the same speaker; it should be in (0, 1)
+
+##### Infer pipeline for speaker embedding extraction
+- `audio_in`: the input to process, which could be:
+  - url (str): e.g., https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_enroll.wav
+  - local path: e.g., path/to/a.wav
+  - wav.scp: e.g., path/to/wav1.scp
+ ```text
+ wav.scp
+ test1 path/to/enroll1.wav
+ test2 path/to/enroll2.wav
+ ```
+  - bytes: e.g., raw bytes data from a microphone
+  - "fbank1.scp,speech,kaldi_ark": e.g., 80-dimensional fbank features
+    extracted with the kaldi toolkit.
+
+##### Infer pipeline for speaker verification
+- `audio_in`: the input to process, which could be:
+  - Tuple(url1, url2): e.g., (https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_enroll.wav, https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_different.wav)
+  - Tuple(local_path1, local_path2): e.g., (path/to/a.wav, path/to/b.wav)
+  - Tuple(wav1.scp, wav2.scp): e.g., (path/to/wav1.scp, path/to/wav2.scp)
+ ```text
+ wav1.scp
+ test1 path/to/enroll1.wav
+ test2 path/to/enroll2.wav
+
+ wav2.scp
+ test1 path/to/same1.wav
+ test2 path/to/diff2.wav
+ ```
+  - Tuple(bytes, bytes): e.g., raw bytes data from a microphone
+  - Tuple("fbank1.scp,speech,kaldi_ark", "fbank2.scp,speech,kaldi_ark"): e.g., 80-dimensional fbank features
+    extracted with the kaldi toolkit.
+
+### Inference with your data
+Use wav.scp or fbank.scp files to organize your own data for speaker embedding extraction or speaker verification.
+In this case, `output_dir` should be set to save all the embeddings or scores, as in the sketch below.
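+
+A minimal sketch of scp-based embedding extraction with results saved to disk
+(file and directory names are placeholders for your own data):
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+inference_sv_pipeline = pipeline(
+    task=Tasks.speaker_verification,
+    model='damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch',
+    output_dir="./outputs",  # embeddings/scores are saved here
+)
+
+# Extract an embedding for every utterance listed in wav.scp
+inference_sv_pipeline(audio_in="path/to/wav.scp")
+```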
+
+### Inference with multi-threads on CPU
+You can run inference with multiple threads on CPU via the following steps:
+1. Set `ngpu=0` while defining the pipeline in `infer.py`.
+2. Split wav.scp into several parts, e.g., 4:
+   ```shell
+   mkdir -p splits
+   split -l $((`wc -l < wav.scp`/4+1)) --numeric-suffixes wav.scp splits/wav.scp.
+   ```
+3. Start to extract embeddings in parallel
+   ```shell
+   for wav_scp in splits/wav.scp.*; do
+       python infer.py ${wav_scp} outputs/$(basename ${wav_scp}) &
+   done
+   wait
+   ```
+4. The embeddings will be saved in `outputs/*`
+
+### Inference with multi GPU
+Similar to inference on CPU, with the following differences:
+
+Step 1. Set `ngpu=1` while defining the pipeline in `infer.py`.
+
+Step 3. Specify a GPU device for each split with `CUDA_VISIBLE_DEVICES`:
+```shell
+gpu=0
+for wav_scp in splits/wav.scp.*; do
+    # assign splits to GPUs in a round-robin fashion (assuming 2 GPUs here)
+    CUDA_VISIBLE_DEVICES=$((gpu % 2)) python infer.py ${wav_scp} outputs/$(basename ${wav_scp}) &
+    gpu=$((gpu+1))
+done
+wait
+```
diff --git a/egs_modelscope/speaker_verification/TEMPLATE/infer.py b/egs_modelscope/speaker_verification/TEMPLATE/infer.py
new file mode 100644
index 0000000..efab097
--- /dev/null
+++ b/egs_modelscope/speaker_verification/TEMPLATE/infer.py
@@ -0,0 +1,15 @@
+import sys
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+# Usage: python infer.py <wav.scp> <output_dir>
+# Define the extraction pipeline; results are saved under output_dir
+inference_sv_pipeline = pipeline(
+    task=Tasks.speaker_verification,
+    model='damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch',
+    output_dir=sys.argv[2],
+)
+# Extract speaker embeddings for all utterances in the input
+rec_result = inference_sv_pipeline(audio_in=sys.argv[1])
--
Gitblit v1.9.1