import torch
import random
 
 
from funasr.register import tables
from funasr.utils.load_utils import extract_fbank, load_audio_text_image_video
 
 
@tables.register("dataset_classes", "KwsMTDataset")
class KwsMTDataset(torch.utils.data.Dataset):
    """
    KwsMTDataset, support multi tokenizers
    """
    def __init__(self,
                 path,
                 index_ds: str = None,
                 frontend=None,
                 tokenizer=None,
                 is_training: bool = True,
                 int_pad_value: int = -1,
                 float_pad_value: float = 0.0,
                 **kwargs,
    ):
        super().__init__()
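        # Resolve the index dataset (manifest reader) from the registry and
        # build it over the given path.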
        index_ds_class = tables.index_ds_classes.get(index_ds)
        self.index_ds = index_ds_class(path, **kwargs)
 
        self.preprocessor_speech = None
        self.preprocessor_text = None
 
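        # Optional speech/text preprocessors (e.g., augmentation or text
        # normalization) are looked up in the registry and built only for
        # training.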
        if is_training:
            preprocessor_speech = kwargs.get("preprocessor_speech", None)
            if preprocessor_speech:
                preprocessor_speech_class = tables.preprocessor_classes.get(preprocessor_speech)
                preprocessor_speech = preprocessor_speech_class(
                    **kwargs.get("preprocessor_speech_conf", {})
                )
            self.preprocessor_speech = preprocessor_speech

            preprocessor_text = kwargs.get("preprocessor_text", None)
            if preprocessor_text:
                preprocessor_text_class = tables.preprocessor_classes.get(preprocessor_text)
                preprocessor_text = preprocessor_text_class(**kwargs.get("preprocessor_text_conf", {}))
            self.preprocessor_text = preprocessor_text
 
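        # The frontend determines the expected sampling rate; default to
        # 16 kHz when no frontend is configured.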
        self.frontend = frontend
        self.fs = 16000 if frontend is None else frontend.fs
        self.data_type = "sound"
        self.tokenizer = tokenizer
 
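        # Padding values consumed by collator(): int_pad_value for token-id
        # tensors, float_pad_value for feature tensors.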
        self.int_pad_value = int_pad_value
        self.float_pad_value = float_pad_value
 
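    # Source/target lengths are exposed for the caller (typically a
    # length-aware batch sampler); the exact consumer depends on the
    # surrounding FunASR configuration.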
    def get_source_len(self, index):
        item = self.index_ds[index]
        return self.index_ds.get_source_len(item)
 
    def get_target_len(self, index):
        item = self.index_ds[index]
        return self.index_ds.get_target_len(item)
 
    def __len__(self):
        return len(self.index_ds)
 
    def __getitem__(self, index):
        item = self.index_ds[index]
        source = item["source"]
        # Load the raw waveform from the source path, resampled to self.fs.
        data_src = load_audio_text_image_video(source, fs=self.fs)
        if self.preprocessor_speech:
            data_src = self.preprocessor_speech(data_src, fs=self.fs)
        speech, speech_lengths = extract_fbank(
            data_src, data_type=self.data_type, frontend=self.frontend, is_final=True
        ) # speech: [b, T, d]
 
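        # Optionally normalize the target transcript before tokenization.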
        target = item["target"]
        if self.preprocessor_text:
            target = self.preprocessor_text(target)
 
        # Encode the transcript once per tokenizer; if a tokenizer slot is
        # empty, the raw target passes through unchanged.
        if self.tokenizer[0]:
            ids = self.tokenizer[0].encode(target)
            text = torch.tensor(ids, dtype=torch.int64)
        else:
            ids = target
            text = ids

        if self.tokenizer[1]:
            ids2 = self.tokenizer[1].encode(target)
            text2 = torch.tensor(ids2, dtype=torch.int64)
        else:
            ids2 = target
            text2 = ids2
 
        ids_lengths = len(ids)
        text_lengths = torch.tensor([ids_lengths], dtype=torch.int32)
 
        ids2_lengths = len(ids2)
        text2_lengths = torch.tensor([ids2_lengths], dtype=torch.int32)
 
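        # extract_fbank returns a batched tensor of shape [1, T, D]; strip the
        # batch dimension so collator() can re-batch with pad_sequence.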
        return {"speech": speech[0, :, :],
                "speech_lengths": speech_lengths,
                "text": text,
                "text_lengths": text_lengths,
                "text2": text2,
                "text2_lengths": text2_lengths,
                }
 
 
    def collator(self, samples: list = None):
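        """Collate samples into a batch dict: tensor fields are padded to the
        longest item in the batch, integer tensors with ``int_pad_value`` and
        float tensors with ``float_pad_value``; other fields stay as lists."""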
        outputs = {}
        for sample in samples:
            for key in sample.keys():
                if key not in outputs:
                    outputs[key] = []
                outputs[key].append(sample[key])
 
        for key, data_list in outputs.items():
            if isinstance(data_list[0], torch.Tensor):
                # Integer tensors hold token ids, float tensors hold features;
                # each gets its own padding value.
                if data_list[0].dtype in (torch.int32, torch.int64):
                    pad_value = self.int_pad_value
                else:
                    pad_value = self.float_pad_value

                outputs[key] = torch.nn.utils.rnn.pad_sequence(
                    data_list, batch_first=True, padding_value=pad_value
                )
        return outputs
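

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original file). The manifest path,
# the "IndexDSJsonl" registry name, and the two tokenizer objects below are
# illustrative assumptions; substitute whatever is registered in your FunASR
# configuration. The sketch only shows how the two-tokenizer sample dict and
# the collator fit together.
#
# dataset = KwsMTDataset(
#     path="data/train.jsonl",               # hypothetical manifest
#     index_ds="IndexDSJsonl",               # assumed registered index_ds class
#     frontend=None,                         # falls back to fs=16000
#     tokenizer=[tokenizer_a, tokenizer_b],  # two tokenizers (assumed built)
#     is_training=False,                     # skip preprocessor construction
# )
# sample = dataset[0]
# # keys: speech, speech_lengths, text, text_lengths, text2, text2_lengths
# batch = dataset.collator([dataset[0], dataset[1]])
# ---------------------------------------------------------------------------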