python/FunASR-XL.git

parent: 95bed233 | 补丁 | 提交 | ignore whitespace

游雁

2024-05-08 33a9e08dc9b65abc3f3e18d14253f95c79e0f749

dynamic batch

3个文件已修改

	funasr/auto/auto_model.py	20 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/models/sense_voice/model.py	30 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/models/sense_voice/search.py	9 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史

 funasr/auto/auto_model.py

@@ -364,7 +364,6 @@
            if len(sorted_data) > 0 and len(sorted_data[0]) > 0:
                batch_size = max(batch_size, sorted_data[0][0][1] - sorted_data[0][0][0])

            batch_size_ms_cum = 0
            beg_idx = 0
            beg_asr_total = time.time()
            time_speech_total_per_sample = speech_lengths / 16000
@@ -373,19 +372,22 @@
            # pbar_sample = tqdm(colour="blue", total=n, dynamic_ncols=True)

            all_segments = []
            max_len_in_batch = 0
            end_idx = 1
            for j, _ in enumerate(range(0, n)):
                # pbar_sample.update(1)
                batch_size_ms_cum += sorted_data[j][0][1] - sorted_data[j][0][0]
                sample_length = sorted_data[j][0][1] - sorted_data[j][0][0]
                potential_batch_length = max(max_len_in_batch, sample_length) * (j + 1 - beg_idx)
                # batch_size_ms_cum += sorted_data[j][0][1] - sorted_data[j][0][0]
                if (
                    j < n - 1
                    and (batch_size_ms_cum + sorted_data[j + 1][0][1] - sorted_data[j + 1][0][0])
                    < batch_size
                    and (sorted_data[j + 1][0][1] - sorted_data[j + 1][0][0])
                    < batch_size_threshold_ms
                    and sample_length < batch_size_threshold_ms
                    and potential_batch_length < batch_size
                ):
                    max_len_in_batch = max(max_len_in_batch, sample_length)
                    end_idx += 1
                    continue
                batch_size_ms_cum = 0
                end_idx = j + 1

                speech_j, speech_lengths_j = slice_padding_audio_samples(
                    speech, speech_lengths, sorted_data[beg_idx:end_idx]
                )
@@ -410,6 +412,8 @@
                        )
                        results[_b]["spk_embedding"] = spk_res[0]["spk_embedding"]
                beg_idx = end_idx
                end_idx += 1
                max_len_in_batch = sample_length
                if len(results) < 1:
                    continue
                results_sorted.extend(results)

 funasr/models/sense_voice/model.py

@@ -516,16 +516,23 @@

        # Parameters for rich decoding
        self.beam_search.emo_unk = tokenizer.encode(
            DecodingOptions.get("emo_unk_token", "<|SPECIAL_TOKEN_1|>"), allowed_special="all")[0]
            DecodingOptions.get("emo_unk_token", "<|SPECIAL_TOKEN_1|>"), allowed_special="all"
        )[0]
        self.beam_search.emo_unk_score = 1
        self.beam_search.emo_tokens = tokenizer.encode(
            DecodingOptions.get("emo_target_tokens", "<|HAPPY|><|SAD|><|ANGRY|>"), allowed_special="all")
            DecodingOptions.get("emo_target_tokens", "<|HAPPY|><|SAD|><|ANGRY|>"),
            allowed_special="all",
        )
        self.beam_search.emo_scores = DecodingOptions.get("emo_target_threshold", [0.1, 0.1, 0.1])

        self.beam_search.event_bg_token = tokenizer.encode(
            DecodingOptions.get("gain_tokens_bg", "<|Speech|><|BGM|><|Applause|><|Laughter|>"), allowed_special="all")
            DecodingOptions.get("gain_tokens_bg", "<|Speech|><|BGM|><|Applause|><|Laughter|>"),
            allowed_special="all",
        )
        self.beam_search.event_ed_token = tokenizer.encode(
            DecodingOptions.get("gain_tokens_ed", "<|/Speech|><|/BGM|><|/Applause|><|/Laughter|>"), allowed_special="all")
            DecodingOptions.get("gain_tokens_ed", "<|/Speech|><|/BGM|><|/Applause|><|/Laughter|>"),
            allowed_special="all",
        )
        self.beam_search.event_score_ga = DecodingOptions.get("gain_tokens_score", [1, 1, 1, 1])

        encoder_out, encoder_out_lens = self.encode(
@@ -859,16 +866,23 @@

        # Parameters for rich decoding
        self.beam_search.emo_unk = tokenizer.encode(
            DecodingOptions.get("emo_unk_token", "<|SPECIAL_TOKEN_1|>"), allowed_special="all")[0]
            DecodingOptions.get("emo_unk_token", "<|SPECIAL_TOKEN_1|>"), allowed_special="all"
        )[0]
        self.beam_search.emo_unk_score = 1
        self.beam_search.emo_tokens = tokenizer.encode(
            DecodingOptions.get("emo_target_tokens", "<|HAPPY|><|SAD|><|ANGRY|>"), allowed_special="all")
            DecodingOptions.get("emo_target_tokens", "<|HAPPY|><|SAD|><|ANGRY|>"),
            allowed_special="all",
        )
        self.beam_search.emo_scores = DecodingOptions.get("emo_target_threshold", [0.1, 0.1, 0.1])

        self.beam_search.event_bg_token = tokenizer.encode(
            DecodingOptions.get("gain_tokens_bg", "<|Speech|><|BGM|><|Applause|><|Laughter|>"), allowed_special="all")
            DecodingOptions.get("gain_tokens_bg", "<|Speech|><|BGM|><|Applause|><|Laughter|>"),
            allowed_special="all",
        )
        self.beam_search.event_ed_token = tokenizer.encode(
            DecodingOptions.get("gain_tokens_ed", "<|/Speech|><|/BGM|><|/Applause|><|/Laughter|>"), allowed_special="all")
            DecodingOptions.get("gain_tokens_ed", "<|/Speech|><|/BGM|><|/Applause|><|/Laughter|>"),
            allowed_special="all",
        )
        self.beam_search.event_score_ga = DecodingOptions.get("gain_tokens_score", [1, 1, 1, 1])

        encoder_out, encoder_out_lens = self.encode(

 funasr/models/sense_voice/search.py

@@ -54,7 +54,6 @@
        event_bg_token: List[int] = field(default_factory=lambda: [58946, 58948, 58950, 58952]),
        event_ed_token: List[int] = field(default_factory=lambda: [58947, 58949, 58951, 58953]),
        event_score_ga: List[float] = field(default_factory=lambda: [1, 1, 5, 25]),

        token_list: List[str] = None,
        pre_beam_ratio: float = 1.5,
        pre_beam_score_key: str = None,
@@ -209,15 +208,16 @@

            last_token = yseq[-1]
            if last_token in self.emo_tokens + [self.emo_unk]:
                # prevent output event after emotion token 
                # prevent output event after emotion token
                score[self.event_bg_token] = -np.inf

            for eve_bg, eve_ed, eve_ga in zip(self.event_bg_token, self.event_ed_token, self.event_score_ga):
            for eve_bg, eve_ed, eve_ga in zip(
                self.event_bg_token, self.event_ed_token, self.event_score_ga
            ):
                score_offset = get_score(yseq, eve_bg, eve_ed)
                score[eve_bg] += score_offset[0]
                score[eve_ed] += score_offset[1]
                score[eve_bg] += math.log(eve_ga)


            score[self.emo_unk] += math.log(self.emo_unk_score)
            for emo, emo_th in zip(self.emo_tokens, self.emo_scores):
@@ -231,7 +231,6 @@
            scores[k] = struct_score(hyp.yseq, scores[k])

        return scores, states


    def score_partial(
        self, hyp: Hypothesis, ids: torch.Tensor, x: torch.Tensor

			@@ -364,7 +364,6 @@
			if len(sorted_data) > 0 and len(sorted_data[0]) > 0:
			batch_size = max(batch_size, sorted_data[0][0][1] - sorted_data[0][0][0])

			batch_size_ms_cum = 0
			beg_idx = 0
			beg_asr_total = time.time()
			time_speech_total_per_sample = speech_lengths / 16000
			@@ -373,19 +372,22 @@
			# pbar_sample = tqdm(colour="blue", total=n, dynamic_ncols=True)

			all_segments = []
			max_len_in_batch = 0
			end_idx = 1
			for j, _ in enumerate(range(0, n)):
			# pbar_sample.update(1)
			batch_size_ms_cum += sorted_data[j][0][1] - sorted_data[j][0][0]
			sample_length = sorted_data[j][0][1] - sorted_data[j][0][0]
			potential_batch_length = max(max_len_in_batch, sample_length) * (j + 1 - beg_idx)
			# batch_size_ms_cum += sorted_data[j][0][1] - sorted_data[j][0][0]
			if (
			j < n - 1
			and (batch_size_ms_cum + sorted_data[j + 1][0][1] - sorted_data[j + 1][0][0])
			< batch_size
			and (sorted_data[j + 1][0][1] - sorted_data[j + 1][0][0])
			< batch_size_threshold_ms
			and sample_length < batch_size_threshold_ms
			and potential_batch_length < batch_size
			):
			max_len_in_batch = max(max_len_in_batch, sample_length)
			end_idx += 1
			continue
			batch_size_ms_cum = 0
			end_idx = j + 1

			speech_j, speech_lengths_j = slice_padding_audio_samples(
			speech, speech_lengths, sorted_data[beg_idx:end_idx]
			)
			@@ -410,6 +412,8 @@
			)
			results[_b]["spk_embedding"] = spk_res[0]["spk_embedding"]
			beg_idx = end_idx
			end_idx += 1
			max_len_in_batch = sample_length
			if len(results) < 1:
			continue
			results_sorted.extend(results)

			@@ -516,16 +516,23 @@

			# Parameters for rich decoding
			self.beam_search.emo_unk = tokenizer.encode(
			DecodingOptions.get("emo_unk_token", "<\|SPECIAL_TOKEN_1\|>"), allowed_special="all")[0]
			DecodingOptions.get("emo_unk_token", "<\|SPECIAL_TOKEN_1\|>"), allowed_special="all"
			)[0]
			self.beam_search.emo_unk_score = 1
			self.beam_search.emo_tokens = tokenizer.encode(
			DecodingOptions.get("emo_target_tokens", "<\|HAPPY\|><\|SAD\|><\|ANGRY\|>"), allowed_special="all")
			DecodingOptions.get("emo_target_tokens", "<\|HAPPY\|><\|SAD\|><\|ANGRY\|>"),
			allowed_special="all",
			)
			self.beam_search.emo_scores = DecodingOptions.get("emo_target_threshold", [0.1, 0.1, 0.1])

			self.beam_search.event_bg_token = tokenizer.encode(
			DecodingOptions.get("gain_tokens_bg", "<\|Speech\|><\|BGM\|><\|Applause\|><\|Laughter\|>"), allowed_special="all")
			DecodingOptions.get("gain_tokens_bg", "<\|Speech\|><\|BGM\|><\|Applause\|><\|Laughter\|>"),
			allowed_special="all",
			)
			self.beam_search.event_ed_token = tokenizer.encode(
			DecodingOptions.get("gain_tokens_ed", "<\|/Speech\|><\|/BGM\|><\|/Applause\|><\|/Laughter\|>"), allowed_special="all")
			DecodingOptions.get("gain_tokens_ed", "<\|/Speech\|><\|/BGM\|><\|/Applause\|><\|/Laughter\|>"),
			allowed_special="all",
			)
			self.beam_search.event_score_ga = DecodingOptions.get("gain_tokens_score", [1, 1, 1, 1])

			encoder_out, encoder_out_lens = self.encode(
			@@ -859,16 +866,23 @@

			# Parameters for rich decoding
			self.beam_search.emo_unk = tokenizer.encode(
			DecodingOptions.get("emo_unk_token", "<\|SPECIAL_TOKEN_1\|>"), allowed_special="all")[0]
			DecodingOptions.get("emo_unk_token", "<\|SPECIAL_TOKEN_1\|>"), allowed_special="all"
			)[0]
			self.beam_search.emo_unk_score = 1
			self.beam_search.emo_tokens = tokenizer.encode(
			DecodingOptions.get("emo_target_tokens", "<\|HAPPY\|><\|SAD\|><\|ANGRY\|>"), allowed_special="all")
			DecodingOptions.get("emo_target_tokens", "<\|HAPPY\|><\|SAD\|><\|ANGRY\|>"),
			allowed_special="all",
			)
			self.beam_search.emo_scores = DecodingOptions.get("emo_target_threshold", [0.1, 0.1, 0.1])

			self.beam_search.event_bg_token = tokenizer.encode(
			DecodingOptions.get("gain_tokens_bg", "<\|Speech\|><\|BGM\|><\|Applause\|><\|Laughter\|>"), allowed_special="all")
			DecodingOptions.get("gain_tokens_bg", "<\|Speech\|><\|BGM\|><\|Applause\|><\|Laughter\|>"),
			allowed_special="all",
			)
			self.beam_search.event_ed_token = tokenizer.encode(
			DecodingOptions.get("gain_tokens_ed", "<\|/Speech\|><\|/BGM\|><\|/Applause\|><\|/Laughter\|>"), allowed_special="all")
			DecodingOptions.get("gain_tokens_ed", "<\|/Speech\|><\|/BGM\|><\|/Applause\|><\|/Laughter\|>"),
			allowed_special="all",
			)
			self.beam_search.event_score_ga = DecodingOptions.get("gain_tokens_score", [1, 1, 1, 1])

			encoder_out, encoder_out_lens = self.encode(

			@@ -54,7 +54,6 @@
			event_bg_token: List[int] = field(default_factory=lambda: [58946, 58948, 58950, 58952]),
			event_ed_token: List[int] = field(default_factory=lambda: [58947, 58949, 58951, 58953]),
			event_score_ga: List[float] = field(default_factory=lambda: [1, 1, 5, 25]),

			token_list: List[str] = None,
			pre_beam_ratio: float = 1.5,
			pre_beam_score_key: str = None,
			@@ -209,15 +208,16 @@

			last_token = yseq[-1]
			if last_token in self.emo_tokens + [self.emo_unk]:
			# prevent output event after emotion token
			# prevent output event after emotion token
			score[self.event_bg_token] = -np.inf

			for eve_bg, eve_ed, eve_ga in zip(self.event_bg_token, self.event_ed_token, self.event_score_ga):
			for eve_bg, eve_ed, eve_ga in zip(
			self.event_bg_token, self.event_ed_token, self.event_score_ga
			):
			score_offset = get_score(yseq, eve_bg, eve_ed)
			score[eve_bg] += score_offset[0]
			score[eve_ed] += score_offset[1]
			score[eve_bg] += math.log(eve_ga)


			score[self.emo_unk] += math.log(self.emo_unk_score)
			for emo, emo_th in zip(self.emo_tokens, self.emo_scores):
			@@ -231,7 +231,6 @@
			scores[k] = struct_score(hyp.yseq, scores[k])

			return scores, states


			def score_partial(
			self, hyp: Hypothesis, ids: torch.Tensor, x: torch.Tensor