yhliang
2023-05-11 062eb8ff5ae4fde12d9551c2d6f1b64bf39571bd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
€•£/Œsphinx.addnodes”Œdocument”“”)”}”(Œ    rawsource”Œ”Œchildren”]”Œdocutils.nodes”Œsection”“”)”}”(hhh]”(h    Œtitle”“”)”}”(hŒBaseline”h]”h    ŒText”“”ŒBaseline”…””}”(Œparent”hŒ    _document”hŒsource”NŒline”NubaŒ
attributes”}”(Œids”]”Œclasses”]”Œnames”]”Œdupnames”]”Œbackrefs”]”uŒtagname”hhKhŒ5/mnt/yhliang/workspace/FunASR/docs/m2met2/Baseline.md”hh hhubh )”}”(hhh]”(h)”}”(hŒOverview”h]”hŒOverview”…””}”(hh0hhhNhNubah}”(h!]”h#]”h%]”h']”h)]”uh+hhKhh,hh-hhubh    Œ    paragraph”“”)”}”(hXŠWe will release an E2E SA-ASR baseline conducted on [FunASR](https://github.com/alibaba-damo-academy/FunASR) at the time according to the timeline. The model architecture is shown in Figure 3. The SpeakerEncoder is initialized with a pre-trained speaker verification model from ModelScope. This speaker verification model is also be used to extract the speaker embedding in the speaker profile.”h]”(hŒ4We will release an E2E SA-ASR baseline conducted on ”…””}”(hh@hhhNhNubh    Œ    reference”“”)”}”(hŒFunASR”h]”hŒFunASR”…””}”(hhJhhhNhNubah}”(h!]”h#]”h%]”h']”h)]”Œrefuri”Œ.https://github.com/alibaba-damo-academy/FunASR”uh+hHhKhh,hh@hhubhX at the time according to the timeline. The model architecture is shown in Figure 3. The SpeakerEncoder is initialized with a pre-trained speaker verification model from ModelScope. This speaker verification model is also be used to extract the speaker embedding in the speaker profile.”…””}”(hh@hhhNhNubeh}”(h!]”h#]”h%]”h']”h)]”uh+h>hKhh,hh-hhubh?)”}”(hŒ.![model archietecture](images/sa_asr_arch.png)”h]”h    Œimage”“”)”}”(hŒmodel archietecture”h]”h}”(h!]”h#]”h%]”h']”h)]”Œuri”Œimages/sa_asr_arch.png”Œalt”hlŒ
candidates”}”Œ*”husuh+hhhKhh,hhdhhubah}”(h!]”h#]”h%]”h']”h)]”uh+h>hKhh,hh-hhubeh}”(h!]”Œoverview”ah#]”h%]”Œoverview”ah']”h)]”Œslug”Œoverview”uh+h
hKhh,hh hhubh )”}”(hhh]”(h)”}”(hŒ Quick start”h]”hŒ Quick start”…””}”(hhhhhNhNubah}”(h!]”h#]”h%]”h']”h)]”uh+hhKhh,hhŠhhubh?)”}”(hXtTo run the baseline, first you need to install FunASR and ModelScope. ([installation](https://alibaba-damo-academy.github.io/FunASR/en/installation.html))  
There are two startup scripts, `run.sh` for training and evaluating on the old eval and test sets, and `run_m2met_2023_infer.sh` for inference on the new test set of the Multi-Channel Multi-Party Meeting Transcription 2.0 ([M2MeT2.0](https://alibaba-damo-academy.github.io/FunASR/m2met2/index.html)) Challenge.  
Before running `run.sh`, you must manually download and unpack the [AliMeeting](http://www.openslr.org/119/) corpus and place it in the `./dataset` directory:”h]”(hŒGTo run the baseline, first you need to install FunASR and ModelScope. (”…””}”(hh›hhhNhNubhI)”}”(hŒ installation”h]”hŒ installation”…””}”(hh£hhhNhNubah}”(h!]”h#]”h%]”h']”h)]”hXŒBhttps://alibaba-damo-academy.github.io/FunASR/en/installation.html”uh+hHhKhh,hh›hhubhŒ)”…””}”(hh›hhhNhNubh    Œraw”“”)”}”(hŒ<br />
”h]”hŒ<br />
”…””}”(hh¸hhhNhNubah}”(h!]”h#]”h%]”h']”h)]”Œformat”Œhtml”Œ    xml:space”Œpreserve”uh+h¶hh›hhhh,hKubh·)”}”(hŒ\\
”h]”hŒ\\
”…””}”(hhÊhhhNhNubah}”(h!]”h#]”h%]”h']”h)]”Œformat”Œlatex”hÈhÉuh+h¶hh›hhhh,hKubhŒThere are two startup scripts, ”…””}”(hh›hhhNhNubh    Œliteral”“”)”}”(hŒrun.sh”h]”hŒrun.sh”…””}”(hhàhhhNhNubah}”(h!]”h#]”h%]”h']”h)]”uh+hÞhKhh,hh›hhubhŒ@ for training and evaluating on the old eval and test sets, and ”…””}”(hh›hhhNhNubhß)”}”(hŒrun_m2met_2023_infer.sh”h]”hŒrun_m2met_2023_infer.sh”…””}”(hhòhhhNhNubah}”(h!]”h#]”h%]”h']”h)]”uh+hÞhKhh,hh›hhubhŒ_ for inference on the new test set of the Multi-Channel Multi-Party Meeting Transcription 2.0 (”…””}”(hh›hhhNhNubhI)”}”(hŒM2MeT2.0”h]”hŒM2MeT2.0”…””}”(hjhhhNhNubah}”(h!]”h#]”h%]”h']”h)]”hXŒ?https://alibaba-damo-academy.github.io/FunASR/m2met2/index.html”uh+hHhKhh,hh›hhubhŒ ) Challenge.”…””}”(hh›hhhNhNubh·)”}”(hŒ<br />
”h]”hŒ<br />
”…””}”(hjhhhNhNubah}”(h!]”h#]”h%]”h']”h)]”Œformat”hÇhÈhÉuh+h¶hh›hhhh,hKubh·)”}”(hŒ\\
”h]”hŒ\\
”…””}”(hj&hhhNhNubah}”(h!]”h#]”h%]”h']”h)]”Œformat”hÙhÈhÉuh+h¶hh›hhhh,hKubhŒBefore running ”…””}”(hh›hhhNhNubhß)”}”(hŒrun.sh”h]”hŒrun.sh”…””}”(hj9hhhNhNubah}”(h!]”h#]”h%]”h']”h)]”uh+hÞhKhh,hh›hhubhŒ,, you must manually download and unpack the ”…””}”(hh›hhhNhNubhI)”}”(hŒ
AliMeeting”h]”hŒ
AliMeeting”…””}”(hjKhhhNhNubah}”(h!]”h#]”h%]”h']”h)]”hXŒhttp://www.openslr.org/119/”uh+hHhKhh,hh›hhubhŒ corpus and place it in the ”…””}”(hh›hhhNhNubhß)”}”(hŒ    ./dataset”h]”hŒ    ./dataset”…””}”(hj^hhhNhNubah}”(h!]”h#]”h%]”h']”h)]”uh+hÞhKhh,hh›hhubhŒ  directory:”…””}”(hh›hhhNhNubeh}”(h!]”h#]”h%]”h']”h)]”uh+h>hKhh,hhŠhhubh    Œ literal_block”“”)”}”(hŒ‹dataset
|—— Eval_Ali_far
|—— Eval_Ali_near
|—— Test_Ali_far
|—— Test_Ali_near
|—— Train_Ali_far
|—— Train_Ali_near
”h]”hŒ‹dataset
|—— Eval_Ali_far
|—— Eval_Ali_near
|—— Test_Ali_far
|—— Test_Ali_near
|—— Train_Ali_far
|—— Train_Ali_near
”…””}”hjxsbah}”(h!]”h#]”h%]”h']”h)]”Œlanguage”Œshell”hÈhÉuh+jvhh,hK hhŠhhubh?)”}”(hXHBefore running `run_m2met_2023_infer.sh`, you need to place the new test set `Test_2023_Ali_far` (to be released after the challenge starts) in the `./dataset` directory, which contains only raw audios. Then put the given `wav.scp`, `wav_raw.scp`, `segments`, `utt2spk` and `spk2utt` in the `./data/Test_2023_Ali_far` directory.”h]”(hŒBefore running ”…””}”(hjˆhhhNhNubhß)”}”(hŒrun_m2met_2023_infer.sh”h]”hŒrun_m2met_2023_infer.sh”…””}”(hjhhhNhNubah}”(h!]”h#]”h%]”h']”h)]”uh+hÞhKhh,hjˆhhubhŒ%, you need to place the new test set ”…””}”(hjˆhhhNhNubhß)”}”(hŒTest_2023_Ali_far”h]”hŒTest_2023_Ali_far”…””}”(hj¢hhhNhNubah}”(h!]”h#]”h%]”h']”h)]”uh+hÞhKhh,hjˆhhubhŒ4 (to be released after the challenge starts) in the ”…””}”(hjˆhhhNhNubhß)”}”(hŒ    ./dataset”h]”hŒ    ./dataset”…””}”(hj´hhhNhNubah}”(h!]”h#]”h%]”h']”h)]”uh+hÞhKhh,hjˆhhubhŒ? directory, which contains only raw audios. Then put the given ”…””}”(hjˆhhhNhNubhß)”}”(hŒwav.scp”h]”hŒwav.scp”…””}”(hjÆhhhNhNubah}”(h!]”h#]”h%]”h']”h)]”uh+hÞhKhh,hjˆhhubhŒ, ”…””}”(hjˆhhhNhNubhß)”}”(hŒ wav_raw.scp”h]”hŒ wav_raw.scp”…””}”(hjØhhhNhNubah}”(h!]”h#]”h%]”h']”h)]”uh+hÞhKhh,hjˆhhubhŒ, ”…””}”(hjˆhhhh,hKubhß)”}”(hŒsegments”h]”hŒsegments”…””}”(hjêhhhNhNubah}”(h!]”h#]”h%]”h']”h)]”uh+hÞhKhh,hjˆhhubhŒ, ”…””}”(hjˆhhhh,hKubhß)”}”(hŒutt2spk”h]”hŒutt2spk”…””}”(hjühhhNhNubah}”(h!]”h#]”h%]”h']”h)]”uh+hÞhKhh,hjˆhhubhŒ and ”…””}”(hjˆhhhNhNubhß)”}”(hŒspk2utt”h]”hŒspk2utt”…””}”(hjhhhNhNubah}”(h!]”h#]”h%]”h']”h)]”uh+hÞhKhh,hjˆhhubhŒ in the ”…””}”(hjˆhhhNhNubhß)”}”(hŒ./data/Test_2023_Ali_far”h]”hŒ./data/Test_2023_Ali_far”…””}”(hj hhhNhNubah}”(h!]”h#]”h%]”h']”h)]”uh+hÞhKhh,hjˆhhubhŒ  directory.”…””}”(hjˆhhhNhNubeh}”(h!]”h#]”h%]”h']”h)]”uh+h>hKhh,hhŠhhubjw)”}”(hŒldata/Test_2023_Ali_far
|—— wav.scp
|—— wav_raw.scp
|—— segments
|—— utt2spk
|—— spk2utt
”h]”hŒldata/Test_2023_Ali_far
|—— wav.scp
|—— wav_raw.scp
|—— segments
|—— utt2spk
|—— spk2utt
”…””}”hj8sbah}”(h!]”h#]”h%]”h']”h)]”Œlanguage”Œshell”hÈhÉuh+jvhh,hKhhŠhhubh?)”}”(hŒ}For more details you can see [here](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs/alimeeting/sa-asr/README.md)”h]”(hŒFor more details you can see ”…””}”(hjHhhhNhNubhI)”}”(hŒhere”h]”hŒhere”…””}”(hjPhhhNhNubah}”(h!]”h#]”h%]”h']”h)]”hXŒXhttps://github.com/alibaba-damo-academy/FunASR/blob/main/egs/alimeeting/sa-asr/README.md”uh+hHhKhh,hjHhhubeh}”(h!]”h#]”h%]”h']”h)]”uh+h>hKhh,hhŠhhubeh}”(h!]”Œ quick-start”ah#]”h%]”Œ quick start”ah']”h)]”hˆŒ quick-start”uh+h
hKhh,hh hhubh )”}”(hhh]”(h)”}”(hŒBaseline results”h]”hŒBaseline results”…””}”(hjqhhhNhNubah}”(h!]”h#]”h%]”h']”h)]”uh+hhKhh,hjnhhubh?)”}”(hX¢The results of the baseline system are shown in Table 3. The speaker profile adopts the oracle speaker embedding during training. However, due to the lack of oracle speaker label during evaluation, the speaker profile provided by an additional spectral clustering is used. Meanwhile, the results of using the oracle speaker profile on Eval and Test Set are also provided to show the impact of speaker profile accuracy.”h]”hX¢The results of the baseline system are shown in Table 3. The speaker profile adopts the oracle speaker embedding during training. However, due to the lack of oracle speaker label during evaluation, the speaker profile provided by an additional spectral clustering is used. Meanwhile, the results of using the oracle speaker profile on Eval and Test Set are also provided to show the impact of speaker profile accuracy.”…””}”(hjhhhNhNubah}”(h!]”h#]”h%]”h']”h)]”uh+h>hK hh,hjnhhubh?)”}”(hŒ.![baseline_result](images/baseline_result.png)”h]”hi)”}”(hŒbaseline_result”h]”h}”(h!]”h#]”h%]”h']”h)]”htŒimages/baseline_result.png”hvj“hw}”hyj›suh+hhhK"hh,hjhhubah}”(h!]”h#]”h%]”h']”h)]”uh+h>hK"hh,hjnhhubeh}”(h!]”Œbaseline-results”ah#]”h%]”Œbaseline results”ah']”h)]”hˆŒbaseline-results”uh+h
hKhh,hh hhubeh}”(h!]”Œbaseline”ah#]”h%]”Œbaseline”ah']”h)]”hˆŒbaseline”uh+h
hKhh,hhhhubah}”(h!]”h#]”h%]”h']”h)]”Œsource”h,uh+hŒcurrent_source”NŒ current_line”NŒsettings”Œdocutils.frontend”ŒValues”“”)”}”(hNŒ    generator”NŒ    datestamp”NŒ source_link”NŒ
source_url”NŒ toc_backlinks”Œentry”Œfootnote_backlinks”KŒ sectnum_xform”KŒstrip_comments”NŒstrip_elements_with_classes”NŒ strip_classes”NŒ report_level”KŒ
halt_level”KŒexit_status_level”KŒdebug”NŒwarning_stream”NŒ    traceback”ˆŒinput_encoding”Œ    utf-8-sig”Œinput_encoding_error_handler”Œstrict”Œoutput_encoding”Œutf-8”Œoutput_encoding_error_handler”jØŒerror_encoding”ŒUTF-8”Œerror_encoding_error_handler”Œbackslashreplace”Œ language_code”Œen”Œrecord_dependencies”NŒconfig”NŒ    id_prefix”hŒauto_id_prefix”Œid”Œ dump_settings”NŒdump_internals”NŒdump_transforms”NŒdump_pseudo_xml”NŒexpose_internals”NŒstrict_visitor”NŒ_disable_config”NŒ_source”h,Œ _destination”NŒ _config_files”]”Œfile_insertion_enabled”ˆŒ raw_enabled”KŒline_length_limit”M'Œpep_references”NŒ pep_base_url”Œhttps://peps.python.org/”Œpep_file_url_template”Œpep-%04d”Œrfc_references”NŒ rfc_base_url”Œ&https://datatracker.ietf.org/doc/html/”Œ    tab_width”KŒtrim_footnote_reference_space”‰Œsyntax_highlight”Œlong”Œ smart_quotes”ˆŒsmartquotes_locales”]”Œcharacter_level_inline_markup”‰Œdoctitle_xform”‰Œ docinfo_xform”KŒsectsubtitle_xform”‰Œ image_loading”Œlink”Œembed_stylesheet”‰Œcloak_email_addresses”ˆŒsection_self_link”‰Œenv”NubŒreporter”NŒindirect_targets”]”Œsubstitution_defs”}”(Œwordcount-words”h    Œsubstitution_definition”“”)”}”(hŒ222”h]”hŒ222”…””}”hjsbah}”(h!]”h#]”h%]”Œwordcount-words”ah']”h)]”uh+jhh,ubŒwordcount-minutes”j)”}”(hŒ1”h]”hŒ1”…””}”hj&sbah}”(h!]”h#]”h%]”Œwordcount-minutes”ah']”h)]”uh+jhh,ubuŒsubstitution_names”}”(Œwordcount-words”jŒwordcount-minutes”j%uŒrefnames”}”Œrefids”}”Œnameids”}”(j±j®h…h‚jjjgj¨j¥uŒ    nametypes”}”(j±‰h…‰jj‰j¨‰uh!}”(j®h h‚h-jghŠj¥jnuŒ footnote_refs”}”Œ citation_refs”}”Œ autofootnotes”]”Œautofootnote_refs”]”Œsymbol_footnotes”]”Œsymbol_footnote_refs”]”Œ    footnotes”]”Œ    citations”]”Œautofootnote_start”KŒsymbol_footnote_start”KŒ
id_counter”Œ collections”ŒCounter”“”}”…”R”Œparse_messages”]”Œtransform_messages”]”Œ transformer”NŒ include_log”]”Œ
decoration”NhhŒ
myst_slugs”}”(j´Kj®ŒBaseline”‡”h‰Kh‚ŒOverview”‡”jmKjgŒ Quick start”‡”j«Kj¥ŒBaseline results”‡”uub.