yhliang
2023-04-25 3ef7c5c966bc025088d65212aaafaed2be3ce982
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
€•rŒsphinx.addnodes”Œdocument”“”)”}”(Œ    rawsource”Œ”Œchildren”]”Œdocutils.nodes”Œsection”“”)”}”(hhh]”(h    Œtitle”“”)”}”(hŒBaseline”h]”h    ŒText”“”ŒBaseline”…””}”(Œparent”hŒ    _document”hŒsource”NŒline”NubaŒ
attributes”}”(Œids”]”Œclasses”]”Œnames”]”Œdupnames”]”Œbackrefs”]”uŒtagname”hhKhŒ5/mnt/yhliang/workspace/FunASR/docs/m2met2/Baseline.md”hh hhubh )”}”(hhh]”(h)”}”(hŒOverview”h]”hŒOverview”…””}”(hh0hhhNhNubah}”(h!]”h#]”h%]”h']”h)]”uh+hhKhh,hh-hhubh    Œ    paragraph”“”)”}”(hX¦We will release an E2E SA-ASR~\cite{kanda21b_interspeech} baseline conducted on [FunASR](https://github.com/alibaba-damo-academy/FunASR) at the time according to the timeline. The model architecture is shown in Figure 3. The SpeakerEncoder is initialized with a pre-trained speaker verification model from ModelScope. This speaker verification model is also be used to extract the speaker embedding in the speaker profile.”h]”(hŒPWe will release an E2E SA-ASR~\cite{kanda21b_interspeech} baseline conducted on ”…””}”(hh@hhhNhNubh    Œ    reference”“”)”}”(hŒFunASR”h]”hŒFunASR”…””}”(hhJhhhNhNubah}”(h!]”h#]”h%]”h']”h)]”Œrefuri”Œ.https://github.com/alibaba-damo-academy/FunASR”uh+hHhKhh,hh@hhubhX at the time according to the timeline. The model architecture is shown in Figure 3. The SpeakerEncoder is initialized with a pre-trained speaker verification model from ModelScope. This speaker verification model is also be used to extract the speaker embedding in the speaker profile.”…””}”(hh@hhhNhNubeh}”(h!]”h#]”h%]”h']”h)]”uh+h>hKhh,hh-hhubh?)”}”(hŒ.![model archietecture](images/sa_asr_arch.png)”h]”h    Œimage”“”)”}”(hŒmodel archietecture”h]”h}”(h!]”h#]”h%]”h']”h)]”Œuri”Œimages/sa_asr_arch.png”Œalt”hlŒ
candidates”}”Œ*”husuh+hhhKhh,hhdhhubah}”(h!]”h#]”h%]”h']”h)]”uh+h>hKhh,hh-hhubeh}”(h!]”Œoverview”ah#]”h%]”Œoverview”ah']”h)]”Œslug”Œoverview”uh+h
hKhh,hh hhubh )”}”(hhh]”(h)”}”(hŒ Quick start”h]”hŒ Quick start”…””}”(hhhhhNhNubah}”(h!]”h#]”h%]”h']”h)]”uh+hhKhh,hhŠhhubh?)”}”(hŒ.#TODO: fill with the README.md of the baseline”h]”hŒ.#TODO: fill with the README.md of the baseline”…””}”(hh›hhhNhNubah}”(h!]”h#]”h%]”h']”h)]”uh+h>hKhh,hhŠhhubeh}”(h!]”Œ quick-start”ah#]”h%]”Œ quick start”ah']”h)]”hˆŒ quick-start”uh+h
hKhh,hh hhubh )”}”(hhh]”(h)”}”(hŒBaseline results”h]”hŒBaseline results”…””}”(hhµhhhNhNubah}”(h!]”h#]”h%]”h']”h)]”uh+hhK
hh,hh²hhubh?)”}”(hX¢The results of the baseline system are shown in Table 3. The speaker profile adopts the oracle speaker embedding during training. However, due to the lack of oracle speaker label during evaluation, the speaker profile provided by an additional spectral clustering is used. Meanwhile, the results of using the oracle speaker profile on Eval and Test Set are also provided to show the impact of speaker profile accuracy.”h]”hX¢The results of the baseline system are shown in Table 3. The speaker profile adopts the oracle speaker embedding during training. However, due to the lack of oracle speaker label during evaluation, the speaker profile provided by an additional spectral clustering is used. Meanwhile, the results of using the oracle speaker profile on Eval and Test Set are also provided to show the impact of speaker profile accuracy.”…””}”(hhÃhhhNhNubah}”(h!]”h#]”h%]”h']”h)]”uh+h>hK hh,hh²hhubh?)”}”(hŒ.![baseline result](images/baseline_result.png)”h]”hi)”}”(hŒbaseline result”h]”h}”(h!]”h#]”h%]”h']”h)]”htŒimages/baseline_result.png”hvh×hw}”hyhßsuh+hhhK hh,hhÑhhubah}”(h!]”h#]”h%]”h']”h)]”uh+h>hK hh,hh²hhubeh}”(h!]”Œbaseline-results”ah#]”h%]”Œbaseline results”ah']”h)]”hˆŒbaseline-results”uh+h
hK
hh,hh hhubeh}”(h!]”Œbaseline”ah#]”h%]”Œbaseline”ah']”h)]”hˆŒbaseline”uh+h
hKhh,hhhhubah}”(h!]”h#]”h%]”h']”h)]”Œsource”h,uh+hŒcurrent_source”NŒ current_line”NŒsettings”Œdocutils.frontend”ŒValues”“”)”}”(hNŒ    generator”NŒ    datestamp”NŒ source_link”NŒ
source_url”NŒ toc_backlinks”Œentry”Œfootnote_backlinks”KŒ sectnum_xform”KŒstrip_comments”NŒstrip_elements_with_classes”NŒ strip_classes”NŒ report_level”KŒ
halt_level”KŒexit_status_level”KŒdebug”NŒwarning_stream”NŒ    traceback”ˆŒinput_encoding”Œ    utf-8-sig”Œinput_encoding_error_handler”Œstrict”Œoutput_encoding”Œutf-8”Œoutput_encoding_error_handler”jŒerror_encoding”ŒUTF-8”Œerror_encoding_error_handler”Œbackslashreplace”Œ language_code”Œen”Œrecord_dependencies”NŒconfig”NŒ    id_prefix”hŒauto_id_prefix”Œid”Œ dump_settings”NŒdump_internals”NŒdump_transforms”NŒdump_pseudo_xml”NŒexpose_internals”NŒstrict_visitor”NŒ_disable_config”NŒ_source”h,Œ _destination”NŒ _config_files”]”Œfile_insertion_enabled”ˆŒ raw_enabled”KŒline_length_limit”M'Œpep_references”NŒ pep_base_url”Œhttps://peps.python.org/”Œpep_file_url_template”Œpep-%04d”Œrfc_references”NŒ rfc_base_url”Œ&https://datatracker.ietf.org/doc/html/”Œ    tab_width”KŒtrim_footnote_reference_space”‰Œsyntax_highlight”Œlong”Œ smart_quotes”ˆŒsmartquotes_locales”]”Œcharacter_level_inline_markup”‰Œdoctitle_xform”‰Œ docinfo_xform”KŒsectsubtitle_xform”‰Œ image_loading”Œlink”Œembed_stylesheet”‰Œcloak_email_addresses”ˆŒsection_self_link”‰Œenv”NubŒreporter”NŒindirect_targets”]”Œsubstitution_defs”}”(Œwordcount-words”h    Œsubstitution_definition”“”)”}”(hŒ130”h]”hŒ130”…””}”hjZsbah}”(h!]”h#]”h%]”Œwordcount-words”ah']”h)]”uh+jXhh,ubŒwordcount-minutes”jY)”}”(hŒ1”h]”hŒ1”…””}”hjjsbah}”(h!]”h#]”h%]”Œwordcount-minutes”ah']”h)]”uh+jXhh,ubuŒsubstitution_names”}”(Œwordcount-words”jWŒwordcount-minutes”jiuŒrefnames”}”Œrefids”}”Œnameids”}”(hõhòh…h‚h®h«hìhéuŒ    nametypes”}”(hõ‰h…‰h®‰hì‰uh!}”(hòh h‚h-h«hŠhéh²uŒ footnote_refs”}”Œ citation_refs”}”Œ autofootnotes”]”Œautofootnote_refs”]”Œsymbol_footnotes”]”Œsymbol_footnote_refs”]”Œ    footnotes”]”Œ    citations”]”Œautofootnote_start”KŒsymbol_footnote_start”KŒ
id_counter”Œ collections”ŒCounter”“”}”…”R”Œparse_messages”]”Œtransform_messages”]”Œ transformer”NŒ include_log”]”Œ
decoration”NhhŒ
myst_slugs”}”(høKhòŒBaseline”‡”h‰Kh‚ŒOverview”‡”h±Kh«Œ Quick start”‡”hïK
héŒBaseline results”‡”uub.