Merge pull request #213 from zhuzizyf/small-datatype-resample
Add a resampling function for fine-tuning with the small data type.
| | |
| | | return value[()] |
| | | |
| | | |
| | | def sound_loader(path, float_dtype=None): |
| | | def sound_loader(path, dest_sample_rate=16000, float_dtype=None): |
| | | # The file is as follows: |
| | | # utterance_id_A /some/where/a.wav |
| | | # utterance_id_B /some/where/a.flac |
| | |
| | | # NOTE(kamo): SoundScpReader doesn't support pipe-fashion |
| | | # like Kaldi e.g. "cat a.wav |". |
| | | # NOTE(kamo): The audio signal is normalized to [-1,1] range. |
| | | loader = SoundScpReader(path, normalize=True, always_2d=False) |
| | | loader = SoundScpReader(path, dest_sample_rate=16000, normalize=True, always_2d=False) |
| | | |
| | | # SoundScpReader.__getitem__() returns Tuple[int, ndarray], |
| | | # but ndarray is desired, so Adapter class is inserted here |
| | |
| | | DATA_TYPES = { |
| | | "sound": dict( |
| | | func=sound_loader, |
| | | kwargs=["float_dtype"], |
| | | kwargs=["dest_sample_rate","float_dtype"], |
| | | help="Audio format types which supported by sndfile wav, flac, etc." |
| | | "\n\n" |
| | | " utterance_id_a a.wav\n" |
| | |
| | | int_dtype: str = "long", |
| | | max_cache_size: Union[float, int, str] = 0.0, |
| | | max_cache_fd: int = 0, |
| | | dest_sample_rate: int = 16000, |
| | | ): |
| | | assert check_argument_types() |
| | | if len(path_name_type_list) == 0: |
| | |
| | | self.float_dtype = float_dtype |
| | | self.int_dtype = int_dtype |
| | | self.max_cache_fd = max_cache_fd |
| | | self.dest_sample_rate = dest_sample_rate |
| | | |
| | | self.loader_dict = {} |
| | | self.debug_info = {} |
| | |
| | | for key2 in dic["kwargs"]: |
| | | if key2 == "loader_type": |
| | | kwargs["loader_type"] = loader_type |
| | | elif key2 == "dest_sample_rate" and loader_type=="sound": |
| | | kwargs["dest_sample_rate"] = self.dest_sample_rate |
| | | elif key2 == "float_dtype": |
| | | kwargs["float_dtype"] = self.float_dtype |
| | | elif key2 == "int_dtype": |
| | |
| | | |
| | | import numpy as np |
| | | import soundfile |
| | | import librosa |
| | | from typeguard import check_argument_types |
| | | |
| | | from funasr.fileio.read_text import read_2column_text |
| | |
| | | dtype=np.int16, |
| | | always_2d: bool = False, |
| | | normalize: bool = False, |
| | | dest_sample_rate: int = 16000, |
| | | ): |
| | | assert check_argument_types() |
| | | self.fname = fname |
| | |
| | | self.always_2d = always_2d |
| | | self.normalize = normalize |
| | | self.data = read_2column_text(fname) |
| | | self.dest_sample_rate = dest_sample_rate |
| | | |
| | | def __getitem__(self, key): |
| | | wav = self.data[key] |
| | | if self.normalize: |
| | | # soundfile.read normalizes data to [-1,1] if dtype is not given |
| | | array, rate = soundfile.read(wav, always_2d=self.always_2d) |
| | | array, rate = librosa.load( |
| | | wav, sr=self.dest_sample_rate, mono=not self.always_2d |
| | | ) |
| | | else: |
| | | array, rate = soundfile.read( |
| | | wav, dtype=self.dtype, always_2d=self.always_2d |
| | | array, rate = librosa.load( |
| | | wav, sr=self.dest_sample_rate, mono=not self.always_2d, dtype=self.dtype |
| | | ) |
| | | |
| | | return rate, array |
| | |
| | | preprocess=iter_options.preprocess_fn, |
| | | max_cache_size=iter_options.max_cache_size, |
| | | max_cache_fd=iter_options.max_cache_fd, |
| | | dest_sample_rate=args.frontend_conf["fs"], |
| | | ) |
| | | cls.check_task_requirements( |
| | | dataset, args.allow_variable_data_keys, train=iter_options.train |