python/FunASR-XL.git - Gitblit

python / FunASR-XL

FUNASR训练

blame | 历史 | 补丁 | 提交 | 提交对比 | ignore whitespace

优化speakid和语句匹配逻辑，部分解决speakid不从0递增问题 (#1870)

wuhongsheng

2024-07-05 3a4281f4959534b1bf5d01acf0085f4f8e6f2ec8

 funasr/datasets/audio_datasets/samplers.py

@@ -392,17 +392,14 @@
            )
            batch = []
            max_len_in_batch = 0
-            count = 0
+            count = 1
            for idx in buffer:
                original_sample_length = self.dataset.get_source_len(idx)
                if original_sample_length > self.max_token_length:
                    continue
                sample_length = 1 if self.batch_type == "example" else original_sample_length
                potential_batch_length = max(max_len_in_batch, sample_length) * (len(batch) + 1)
-                if (
-                    potential_batch_length <= self.batch_size
-                    and count <= self.batch_size_sample_max
-                ):
+                if potential_batch_length <= self.batch_size and count < self.batch_size_sample_max:
                    batch.append(idx)
                    max_len_in_batch = max(max_len_in_batch, sample_length)
                    count += 1
@@ -410,7 +407,7 @@
                    buffer_batches.append(batch)
                    batch = [idx]
                    max_len_in_batch = sample_length
-                    count = 0
+                    count = 1
            if batch:
                buffer_batches.append(batch)