"""Build FunASR training list files from an annotation spreadsheet.

Reads an ``.xlsx`` or ``.csv`` sheet holding one utterance per row and writes
``train_text.txt`` (``<uuid> <text>``) and ``train_wav.scp``
(``<uuid> wav/<file>.wav``) with exactly the same utterance ids in both files.
"""

import os
import uuid as uuid_lib  # aliased so it cannot clash with local/column names

import pandas as pd

# Default column headers of the annotation sheet; override them through the
# keyword arguments of process_excel_to_funasr_files when your sheet differs.
DEFAULT_UUID_COL = "音频唯一标识 (UUID)"
DEFAULT_TEXT_COL = "音频对应的文字内容"
DEFAULT_PATH_COL = "音频保存的路径"


def validate_uuid(uuid_str):
    """Return True when *uuid_str* parses as a UUID, False otherwise.

    The value is converted to ``str`` and stripped first, so non-string cells
    (numbers, ``None``) read from a spreadsheet yield False instead of raising
    ``AttributeError``/``TypeError`` from ``uuid.UUID``.

    :param uuid_str: candidate value, normally a string
    :return: bool
    """
    try:
        uuid_lib.UUID(str(uuid_str).strip())
    except ValueError:
        return False
    return True


def _load_table(excel_path):
    """Load the annotation sheet; raise ValueError for unsupported extensions."""
    lowered = str(excel_path).lower()  # accept .XLSX / .CSV as well
    if lowered.endswith(".xlsx"):
        return pd.read_excel(excel_path, engine="openpyxl")
    if lowered.endswith(".csv"):
        return pd.read_csv(excel_path)
    raise ValueError("仅支持 .xlsx 和 .csv 格式文件")


def _wav_basename(full_path):
    """Return the last path component, treating both ``/`` and ``\\`` as separators."""
    # os.path.basename only splits on the host separator, so a Windows path
    # processed on Linux would otherwise be returned whole.
    return full_path.replace("\\", "/").rsplit("/", 1)[-1]


def process_excel_to_funasr_files(
    excel_path,
    output_dir=".",
    *,
    uuid_col=DEFAULT_UUID_COL,
    text_col=DEFAULT_TEXT_COL,
    path_col=DEFAULT_PATH_COL,
):
    """Generate ``train_text.txt`` and ``train_wav.scp`` for FunASR training.

    Rows are dropped when any required cell is empty, when the UUID is a
    duplicate (first occurrence wins) or malformed, when the text is blank,
    or when the audio path does not end in ``.wav`` (case-insensitive).
    Every surviving row is written to BOTH files, so the two lists always
    contain the same utterance ids in the same order.

    Problems are reported with ``print`` and the function returns early; it
    does not raise for a missing file, an unreadable file or missing columns.

    :param excel_path: path of the ``.xlsx`` or ``.csv`` annotation file
    :param output_dir: directory for the generated files (created if needed)
    :param uuid_col: header of the column holding the utterance UUID
    :param text_col: header of the column holding the transcript
    :param path_col: header of the column holding the audio file path
    :return: None
    """
    # 1. Read the annotation sheet.
    try:
        df = _load_table(excel_path)
    except FileNotFoundError:
        print(f"错误:未找到文件 {excel_path}")
        return
    except Exception as exc:  # top-level boundary: report any reader failure
        print(f"读取文件失败:{exc}")
        return

    # 2. Make sure every required column is present.
    required_cols = [uuid_col, text_col, path_col]
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        print(f"错误:Excel 中缺少必要列:{missing_cols}")
        return

    # 3. Clean the data.
    df_clean = df.dropna(subset=required_cols).copy()
    # Normalise ids BEFORE de-duplicating/validating so that "<id>" and
    # "<id> " are the same utterance and padded ids are not rejected.
    df_clean[uuid_col] = df_clean[uuid_col].astype(str).str.strip()
    df_clean = df_clean.drop_duplicates(subset=[uuid_col], keep="first")

    # A boolean mask avoids adding a helper column that could clash with a
    # real column name; astype(bool) keeps `~` safe on an empty frame.
    valid_mask = df_clean[uuid_col].apply(validate_uuid).astype(bool)
    invalid_uuids = df_clean.loc[~valid_mask, uuid_col].tolist()
    if invalid_uuids:
        print(f"警告:发现 {len(invalid_uuids)} 行 UUID 格式错误,已过滤:")
        print(invalid_uuids)
    df_clean = df_clean[valid_mask]

    # 4. Decide once which rows are usable, so both files stay in sync.
    entries = []
    for uuid_value, raw_text, raw_path in zip(
        df_clean[uuid_col], df_clean[text_col], df_clean[path_col]
    ):
        # Collapse whitespace runs (incl. line breaks inside a cell) so each
        # utterance occupies exactly one line of train_text.txt.
        text = " ".join(str(raw_text).split())
        if not text:
            print(f"警告:{uuid_value} 的文本内容为空,已跳过")
            continue

        full_path = str(raw_path).strip()
        wav_file = _wav_basename(full_path)
        if not wav_file.lower().endswith(".wav"):
            print(f"警告:{uuid_value} 的音频路径 {full_path} 不是 .wav 文件,已跳过")
            continue

        entries.append((uuid_value, text, wav_file))

    # 5. Write the two list files.
    os.makedirs(output_dir, exist_ok=True)
    text_file_path = os.path.join(output_dir, "train_text.txt")
    scp_file_path = os.path.join(output_dir, "train_wav.scp")

    # newline="\n" keeps LF line endings even when run on Windows.
    with open(text_file_path, "w", encoding="utf-8", newline="\n") as f_text:
        f_text.writelines(f"{uid} {text}\n" for uid, text, _ in entries)

    with open(scp_file_path, "w", encoding="utf-8", newline="\n") as f_scp:
        f_scp.writelines(f"{uid} wav/{wav}\n" for uid, _, wav in entries)

    # 6. Report statistics (the valid count is the number of rows written).
    print("\n=== 处理完成 ===")
    print(f"原始数据行数:{len(df)}")
    print(f"清洗后有效行数:{len(entries)}")
    print(f"生成 train_text.txt:{text_file_path}")
    print(f"生成 train_wav.scp:{scp_file_path}")


# ===================== Entry point =====================
if __name__ == "__main__":
    # Change this to the path of your annotation file.
    EXCEL_FILE = "音频标注数据.xlsx"
    # Output directory (may be an absolute path such as "/home/boying/funasr_data").
    OUTPUT_DIR = "./data/list/"

    process_excel_to_funasr_files(EXCEL_FILE, OUTPUT_DIR)