# Repository-viewer header: author kongdeqiang, 5 days ago, commit 28ccfbfc51068a663a80764e14074df5edf2b5ba
import pandas as pd
import os
import uuid as uuid_lib  # 避免和列名冲突
 
def validate_uuid(uuid_str):
    """Return True if *uuid_str* is a string that parses as a UUID.

    Non-string values (for example numeric or missing spreadsheet cells) are
    reported as invalid instead of raising: ``uuid.UUID`` raises
    ``AttributeError``/``TypeError`` for them, not ``ValueError``.

    :param uuid_str: candidate value to check
    :return: True when the value is a well-formed UUID string, else False
    """
    if not isinstance(uuid_str, str):
        return False
    try:
        uuid_lib.UUID(uuid_str)
    except ValueError:
        return False
    return True
 
def process_excel_to_funasr_files(excel_path, output_dir="."):
    """Generate FunASR ``train_text.txt`` and ``train_wav.scp`` from a table.

    The input table must contain the UUID, transcript and audio-path columns
    named below. Rows with missing values, duplicate or malformed UUIDs,
    empty transcripts, or non-``.wav`` audio paths are dropped. A row is
    written to both output files or to neither, so the two files always list
    the same utterance ids in the same order.

    Errors while reading the input are reported with ``print`` and the
    function returns without writing anything.

    :param excel_path: path to the input ``.xlsx`` or ``.csv`` file
        (extension matched case-insensitively)
    :param output_dir: directory for the generated files (default: current
        directory); created if it does not exist
    :return: None
    """
    # 1. Load the table. The extension check is case-insensitive so that
    #    e.g. "DATA.XLSX" is accepted as well.
    path_lower = str(excel_path).lower()
    try:
        if path_lower.endswith(".xlsx"):
            df = pd.read_excel(excel_path, engine="openpyxl")
        elif path_lower.endswith(".csv"):
            df = pd.read_csv(excel_path)
        else:
            raise ValueError("仅支持 .xlsx 和 .csv 格式文件")
    except FileNotFoundError:
        print(f"错误:未找到文件 {excel_path}")
        return
    except Exception as e:
        print(f"读取文件失败:{str(e)}")
        return

    # 2. Column names (adjust these to match the actual spreadsheet header).
    uuid_col = "音频唯一标识 (UUID)"
    text_col = "音频对应的文字内容"
    path_col = "音频保存的路径"

    required_cols = [uuid_col, text_col, path_col]
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        print(f"错误:Excel 中缺少必要列:{missing_cols}")
        return

    # 3. Clean the data.
    df_clean = df.dropna(subset=required_cols).copy()
    # Normalise ids to stripped strings BEFORE de-duplicating and validating,
    # so " id " and "id" count as the same utterance and the value that is
    # validated is exactly the value that gets written.
    df_clean[uuid_col] = df_clean[uuid_col].apply(lambda value: str(value).strip())
    df_clean = df_clean.drop_duplicates(subset=[uuid_col], keep="first")
    # astype(bool) keeps the mask boolean even when no rows are left.
    uuid_ok = df_clean[uuid_col].apply(validate_uuid).astype(bool)
    invalid_uuids = df_clean.loc[~uuid_ok, uuid_col].tolist()
    if invalid_uuids:
        print(f"警告:发现 {len(invalid_uuids)} 行 UUID 格式错误,已过滤:")
        print(invalid_uuids)
    df_clean = df_clean[uuid_ok]

    # 4. Decide once per row whether it is usable, so that both output files
    #    end up with the same set of ids.
    entries = []
    for utt_id, raw_text, raw_path in zip(
        df_clean[uuid_col], df_clean[text_col], df_clean[path_col]
    ):
        # Collapse whitespace runs (including line breaks inside a cell):
        # each utterance must occupy exactly one line of train_text.txt.
        text = " ".join(str(raw_text).split())
        if not text:
            print(f"警告:{utt_id} 的文字内容为空,已跳过")
            continue

        full_path = str(raw_path).strip()
        # os.path.basename only splits on "\" when running on Windows;
        # normalise separators first so Windows-style paths also work on POSIX.
        wav_file = os.path.basename(full_path.replace("\\", "/"))
        if not wav_file.lower().endswith(".wav"):
            print(f"警告:{utt_id} 的音频路径 {full_path} 不是 .wav 文件,已跳过")
            continue

        entries.append((utt_id, text, wav_file))

    # 5. Write the two list files.
    output_dir = output_dir or "."  # os.makedirs("") would raise
    os.makedirs(output_dir, exist_ok=True)
    text_file_path = os.path.join(output_dir, "train_text.txt")
    scp_file_path = os.path.join(output_dir, "train_wav.scp")

    with open(text_file_path, "w", encoding="utf-8") as f_text:
        f_text.writelines(f"{utt_id} {text}\n" for utt_id, text, _ in entries)

    with open(scp_file_path, "w", encoding="utf-8") as f_scp:
        f_scp.writelines(
            f"{utt_id} wav/{wav_file}\n" for utt_id, _, wav_file in entries
        )

    # 6. Summary. The "valid rows" figure is the number of rows actually
    #    written to both files.
    print("\n=== 处理完成 ===")
    print(f"原始数据行数:{len(df)}")
    print(f"清洗后有效行数:{len(entries)}")
    print(f"生成 train_text.txt:{text_file_path}")
    print(f"生成 train_wav.scp:{scp_file_path}")
 
# ===================== Entry point =====================
if __name__ == "__main__":
    # Output directory for the generated list files; an absolute path such as
    # "/home/boying/funasr_data" also works.
    OUTPUT_DIR = "./data/list/"
    # Spreadsheet to convert -- change this to the path of your own file.
    EXCEL_FILE = "音频标注数据.xlsx"

    process_excel_to_funasr_files(excel_path=EXCEL_FILE, output_dir=OUTPUT_DIR)