python/FunASR-XL.git

parent: 624ba883 | 补丁 | 提交 | ignore whitespace

Merge pull request #525 from alibaba-damo-academy/dev_lyh

yhliang

2023-05-18 0ec7f0aea6a2ef86a607539eedb97334bdea56b7

Merge pull request #525 from alibaba-damo-academy/dev_lyh

fix m2met2 baseline bugs

5个文件已修改

	egs/alimeeting/sa-asr/asr_local.sh	8 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
	egs/alimeeting/sa-asr/local/alimeeting_data_prep.sh	11 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
	egs/alimeeting/sa-asr/local/gen_oracle_profile_padding.py	4 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
	egs/alimeeting/sa-asr/path.sh	2 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
	funasr/models/frontend/default.py	4 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史

 egs/alimeeting/sa-asr/asr_local.sh

@@ -1153,10 +1153,10 @@
         mkdir -p ${sa_asr_exp}/log
         INIT_FILE=${sa_asr_exp}/ddp_init
         
-        if [ ! -f "exp/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pth" ]; then
+        if [ ! -f "exp/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pb" ]; then
             # download xvector extractor model file
             python local/download_xvector_model.py exp
-            log "Successfully download the pretrained xvector extractor to exp/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pth"
+            log "Successfully download the pretrained xvector extractor to exp/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pb"
         fi
         
         if [ -f $INIT_FILE ];then
@@ -1195,8 +1195,8 @@
                     --init_param "${asr_exp}/valid.acc.ave.pb:decoder.decoders.3:decoder.decoder4.2" \
                     --init_param "${asr_exp}/valid.acc.ave.pb:decoder.decoders.4:decoder.decoder4.3" \
                     --init_param "${asr_exp}/valid.acc.ave.pb:decoder.decoders.5:decoder.decoder4.4" \
-                    --init_param "exp/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pth:encoder:spk_encoder"   \
-                    --init_param "exp/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pth:decoder:spk_encoder:decoder.output_dense"   \
+                    --init_param "exp/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pb:encoder:spk_encoder"   \
+                    --init_param "exp/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pb:decoder:spk_encoder:decoder.output_dense"   \
                     --valid_data_path_and_name_and_type "${_asr_valid_dir}/${_scp},speech,${_type}" \
                     --valid_data_path_and_name_and_type "${_asr_valid_dir}/text,text,text" \
                     --valid_data_path_and_name_and_type "${_asr_valid_dir}/oracle_profile_nopadding.scp,profile,npy" \

 egs/alimeeting/sa-asr/local/alimeeting_data_prep.sh

@@ -61,9 +61,9 @@
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then 
     log "stage 1:process alimeeting near dir"
     
-    find -L $near_raw_dir/audio_dir -iname "*.wav" >  $near_dir/wavlist
+    find -L $near_raw_dir/audio_dir -iname "*.wav" | sort >  $near_dir/wavlist
     awk -F '/' '{print $NF}' $near_dir/wavlist | awk -F '.' '{print $1}' > $near_dir/uttid   
-    find -L $near_raw_dir/textgrid_dir  -iname "*.TextGrid" > $near_dir/textgrid.flist
+    find -L $near_raw_dir/textgrid_dir  -iname "*.TextGrid" | sort > $near_dir/textgrid.flist
     n1_wav=$(wc -l < $near_dir/wavlist)
     n2_text=$(wc -l < $near_dir/textgrid.flist)
     log  near file found $n1_wav wav and $n2_text text.
@@ -90,9 +90,9 @@
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     log "stage 2:process alimeeting far dir"
     
-    find -L $far_raw_dir/audio_dir -iname "*.wav" >  $far_dir/wavlist
+    find -L $far_raw_dir/audio_dir -iname "*.wav" | sort >  $far_dir/wavlist
     awk -F '/' '{print $NF}' $far_dir/wavlist | awk -F '.' '{print $1}' > $far_dir/uttid   
-    find -L $far_raw_dir/textgrid_dir  -iname "*.TextGrid" > $far_dir/textgrid.flist
+    find -L $far_raw_dir/textgrid_dir  -iname "*.TextGrid" | sort > $far_dir/textgrid.flist
     n1_wav=$(wc -l < $far_dir/wavlist)
     n2_text=$(wc -l < $far_dir/textgrid.flist)
     log  far file found $n1_wav wav and $n2_text text.
@@ -120,7 +120,8 @@

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    log "stage 3: finali data process"

    local/fix_data_dir.sh $near_dir
    local/fix_data_dir.sh $far_dir
    local/copy_data_dir.sh $near_dir data/${tgt}_Ali_near
    local/copy_data_dir.sh $far_dir data/${tgt}_Ali_far


 egs/alimeeting/sa-asr/local/gen_oracle_profile_padding.py

@@ -42,8 +42,8 @@
             global_spk_list_tmp = global_spk_list[: ]
             for spk in meeting_map_tmp[meeting]:
                 global_spk_list_tmp.remove(spk)
-                padding_spk = random.sample(global_spk_list_tmp, 4 - num)
-                meeting_map_tmp[meeting] = meeting_map_tmp[meeting] + padding_spk
+            padding_spk = random.sample(global_spk_list_tmp, 4 - num)
+            meeting_map_tmp[meeting] = meeting_map_tmp[meeting] + padding_spk
     
     meeting_map = {}
     os.system('mkdir -p ' + path + '/oracle_profile_padding')

 egs/alimeeting/sa-asr/path.sh

@@ -2,4 +2,4 @@

 # NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8
-export PATH=$FUNASR_DIR/funasr/bin:$PATH
+export PATH=$FUNASR_DIR/funasr/bin:./utils:$PATH

 funasr/models/frontend/default.py

@@ -102,8 +102,8 @@
         if input_stft.dim() == 4:
             # h: (B, T, C, F) -> h: (B, T, F)
             if self.training:
-                if self.use_channel == None:
-                    input_stft = input_stft[:, :, 0, :]
+                if self.use_channel is not None:
+                    input_stft = input_stft[:, :, self.use_channel, :]
                 else:
                     # Select 1ch randomly
                     ch = np.random.randint(input_stft.size(2))

			@@ -1153,10 +1153,10 @@
			mkdir -p ${sa_asr_exp}/log
			INIT_FILE=${sa_asr_exp}/ddp_init

			if [ ! -f "exp/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pth" ]; then
			if [ ! -f "exp/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pb" ]; then
			# download xvector extractor model file
			python local/download_xvector_model.py exp
			log "Successfully download the pretrained xvector extractor to exp/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pth"
			log "Successfully download the pretrained xvector extractor to exp/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pb"
			fi

			if [ -f $INIT_FILE ];then
			@@ -1195,8 +1195,8 @@
			--init_param "${asr_exp}/valid.acc.ave.pb:decoder.decoders.3:decoder.decoder4.2" \
			--init_param "${asr_exp}/valid.acc.ave.pb:decoder.decoders.4:decoder.decoder4.3" \
			--init_param "${asr_exp}/valid.acc.ave.pb:decoder.decoders.5:decoder.decoder4.4" \
			--init_param "exp/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pth:encoder:spk_encoder" \
			--init_param "exp/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pth:decoder:spk_encoder:decoder.output_dense" \
			--init_param "exp/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pb:encoder:spk_encoder" \
			--init_param "exp/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pb:decoder:spk_encoder:decoder.output_dense" \
			--valid_data_path_and_name_and_type "${_asr_valid_dir}/${_scp},speech,${_type}" \
			--valid_data_path_and_name_and_type "${_asr_valid_dir}/text,text,text" \
			--valid_data_path_and_name_and_type "${_asr_valid_dir}/oracle_profile_nopadding.scp,profile,npy" \

			@@ -61,9 +61,9 @@
			if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
			log "stage 1:process alimeeting near dir"

			find -L $near_raw_dir/audio_dir -iname "*.wav" > $near_dir/wavlist
			find -L $near_raw_dir/audio_dir -iname "*.wav" | sort > $near_dir/wavlist
			awk -F '/' '{print $NF}' $near_dir/wavlist | awk -F '.' '{print $1}' > $near_dir/uttid
			find -L $near_raw_dir/textgrid_dir -iname "*.TextGrid" > $near_dir/textgrid.flist
			find -L $near_raw_dir/textgrid_dir -iname "*.TextGrid" | sort > $near_dir/textgrid.flist
			n1_wav=$(wc -l < $near_dir/wavlist)
			n2_text=$(wc -l < $near_dir/textgrid.flist)
			log near file found $n1_wav wav and $n2_text text.
			@@ -90,9 +90,9 @@
			if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
			log "stage 2:process alimeeting far dir"

			find -L $far_raw_dir/audio_dir -iname "*.wav" > $far_dir/wavlist
			find -L $far_raw_dir/audio_dir -iname "*.wav" | sort > $far_dir/wavlist
			awk -F '/' '{print $NF}' $far_dir/wavlist | awk -F '.' '{print $1}' > $far_dir/uttid
			find -L $far_raw_dir/textgrid_dir -iname "*.TextGrid" > $far_dir/textgrid.flist
			find -L $far_raw_dir/textgrid_dir -iname "*.TextGrid" | sort > $far_dir/textgrid.flist
			n1_wav=$(wc -l < $far_dir/wavlist)
			n2_text=$(wc -l < $far_dir/textgrid.flist)
			log far file found $n1_wav wav and $n2_text text.
			@@ -120,7 +120,8 @@

			if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
			log "stage 3: finali data process"

			local/fix_data_dir.sh $near_dir
			local/fix_data_dir.sh $far_dir
			local/copy_data_dir.sh $near_dir data/${tgt}_Ali_near
			local/copy_data_dir.sh $far_dir data/${tgt}_Ali_far

			@@ -42,8 +42,8 @@
			global_spk_list_tmp = global_spk_list[: ]
			for spk in meeting_map_tmp[meeting]:
			global_spk_list_tmp.remove(spk)
			padding_spk = random.sample(global_spk_list_tmp, 4 - num)
			meeting_map_tmp[meeting] = meeting_map_tmp[meeting] + padding_spk
			padding_spk = random.sample(global_spk_list_tmp, 4 - num)
			meeting_map_tmp[meeting] = meeting_map_tmp[meeting] + padding_spk

			meeting_map = {}
			os.system('mkdir -p ' + path + '/oracle_profile_padding')

			@@ -2,4 +2,4 @@

			# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
			export PYTHONIOENCODING=UTF-8
			export PATH=$FUNASR_DIR/funasr/bin:$PATH
			export PATH=$FUNASR_DIR/funasr/bin:./utils:$PATH

			@@ -102,8 +102,8 @@
			if input_stft.dim() == 4:
			# h: (B, T, C, F) -> h: (B, T, F)
			if self.training:
			if self.use_channel == None:
			input_stft = input_stft[:, :, 0, :]
			if self.use_channel is not None:
			input_stft = input_stft[:, :, self.use_channel, :]
			else:
			# Select 1ch randomly
			ch = np.random.randint(input_stft.size(2))