雾聪
2023-06-28 54931dd4e1a099d7d6f144c4e12e5453deb3aa26
egs/aishell/transformer/utils/compute_cmvn.py
@@ -5,6 +5,7 @@
import numpy as np
import torchaudio
import torchaudio.compliance.kaldi as kaldi
import yaml
def get_parser():
@@ -24,6 +25,11 @@
        required=True,
        type=str,
        help="the path of wav scps",
    )
    parser.add_argument(
        "--config_file",
        type=str,
        help="the config file for computing cmvn",
    )
    parser.add_argument(
        "--idx",
@@ -70,11 +76,11 @@
    parser = get_parser()
    args = parser.parse_args()
    wav_scp_file = os.path.join(args.wav_path + "{}/wav.scp".format(args.idx))
    cmvn_file = os.path.join(args.wav_path + "{}/cmvn.json".format(args.idx))
    wav_scp_file = os.path.join(args.wav_path, "wav.{}.scp".format(args.idx))
    cmvn_file = os.path.join(args.wav_path, "cmvn.{}.json".format(args.idx))
    mean_stats = np.zeros(args.dims)
    var_stats = np.zeros(args.dims)
    mean_stats = np.zeros(args.dim)
    var_stats = np.zeros(args.dim)
    total_frames = 0
    # with ReadHelper('ark:{}'.format(ark_file)) as ark_reader:
@@ -82,11 +88,27 @@
    #         mean_stats += np.sum(mat, axis=0)
    #         var_stats += np.sum(np.square(mat), axis=0)
    #         total_frames += mat.shape[0]
    with open(args.config_file) as f:
        configs = yaml.safe_load(f)
        frontend_configs = configs.get("frontend_conf", {})
        num_mel_bins = frontend_configs.get("n_mels", 80)
        frame_length = frontend_configs.get("frame_length", 25)
        frame_shift = frontend_configs.get("frame_shift", 10)
        window_type = frontend_configs.get("window", "hamming")
        resample_rate = frontend_configs.get("fs", 16000)
        assert num_mel_bins == args.dim
    with open(wav_scp_file) as f:
        lines = f.readlines()
        for line in lines:
            _, wav_file = line.strip().split()
            fbank = compute_fbank(wav_file, num_mel_bins=args.dims)
            fbank = compute_fbank(wav_file,
                                  num_mel_bins=args.dim,
                                  frame_length=frame_length,
                                  frame_shift=frame_shift,
                                  resample_rate=resample_rate,
                                  window_type=window_type)
            mean_stats += np.sum(fbank, axis=0)
            var_stats += np.sum(np.square(fbank), axis=0)
            total_frames += fbank.shape[0]