From 33d3d2084403fd34b79c835d2f2fe04f6cd8f738 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: Wed, 13 Sep 2023 09:33:54 +0800
Subject: [PATCH] Merge branch 'main' of github.com:alibaba-damo-academy/FunASR add

---
 egs/aishell/transformer/utils/compute_cmvn.py |   28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/egs/aishell/transformer/utils/compute_cmvn.py b/egs/aishell/transformer/utils/compute_cmvn.py
index 4fefd3a..4986a5a 100755
--- a/egs/aishell/transformer/utils/compute_cmvn.py
+++ b/egs/aishell/transformer/utils/compute_cmvn.py
@@ -5,6 +5,7 @@
 import numpy as np
 import torchaudio
 import torchaudio.compliance.kaldi as kaldi
+import yaml
 
 
 def get_parser():
@@ -24,6 +25,11 @@
         required=True,
         type=str,
         help="the path of wav scps",
+    )
+    parser.add_argument(
+        "--config_file",
+        type=str,
+        help="the config file for computing cmvn",
     )
     parser.add_argument(
         "--idx",
@@ -70,8 +76,8 @@
     parser = get_parser()
     args = parser.parse_args()
 
-    wav_scp_file = os.path.join(args.wav_path + "{}/wav.scp".format(args.idx))
-    cmvn_file = os.path.join(args.wav_path + "{}/cmvn.json".format(args.idx))
+    wav_scp_file = os.path.join(args.wav_path, "wav.{}.scp".format(args.idx))
+    cmvn_file = os.path.join(args.wav_path, "cmvn.{}.json".format(args.idx))
 
     mean_stats = np.zeros(args.dim)
     var_stats = np.zeros(args.dim)
@@ -82,11 +88,27 @@
     #         mean_stats += np.sum(mat, axis=0)
     #         var_stats += np.sum(np.square(mat), axis=0)
     #         total_frames += mat.shape[0]
+
+    with open(args.config_file) as f:
+        configs = yaml.safe_load(f)
+        frontend_configs = configs.get("frontend_conf", {})
+        num_mel_bins = frontend_configs.get("n_mels", 80)
+        frame_length = frontend_configs.get("frame_length", 25)
+        frame_shift = frontend_configs.get("frame_shift", 10)
+        window_type = frontend_configs.get("window", "hamming")
+        resample_rate = frontend_configs.get("fs", 16000)
+        assert num_mel_bins == args.dim
+
     with open(wav_scp_file) as f:
         lines = f.readlines()
         for line in lines:
             _, wav_file = line.strip().split()
-            fbank = compute_fbank(wav_file, num_mel_bins=args.dim)
+            fbank = compute_fbank(wav_file,
+                                  num_mel_bins=args.dim,
+                                  frame_length=frame_length,
+                                  frame_shift=frame_shift,
+                                  resample_rate=resample_rate,
+                                  window_type=window_type)
             mean_stats += np.sum(fbank, axis=0)
             var_stats += np.sum(np.square(fbank), axis=0)
             total_frames += fbank.shape[0]

--
Gitblit v1.9.1