import pandas as pd
|
import os
|
import uuid as uuid_lib # 避免和列名冲突
|
|
def validate_uuid(uuid_str):
    """Return True if *uuid_str* is a well-formed UUID, otherwise False.

    The function never raises for bad input. Spreadsheet cells are not
    guaranteed to be strings, and ``uuid.UUID`` reports non-string input with
    ``AttributeError`` (numbers, NaN) or ``TypeError`` (``None``, bytes)
    rather than ``ValueError``; all of those are treated as "invalid" so a
    single odd cell cannot abort a column-wide ``apply``.

    :param uuid_str: candidate value; normally a string in any textual form
        accepted by ``uuid.UUID`` (hyphens, braces and a URN prefix are all
        optional).
    :return: ``True`` for a parseable UUID string or a ``uuid.UUID``
        instance, ``False`` for anything else.
    """
    # An already-constructed UUID object is valid by definition, but passing
    # it back through the constructor would fail (it is not a string).
    if isinstance(uuid_str, uuid_lib.UUID):
        return True
    try:
        uuid_lib.UUID(uuid_str)
    except (ValueError, AttributeError, TypeError):
        return False
    return True
|
|
def process_excel_to_funasr_files(excel_path, output_dir="."):
    """Generate FunASR ``train_text.txt`` and ``train_wav.scp`` from a table.

    The input table must contain the three columns named below (UUID,
    transcript, audio path). Rows are cleaned, then every surviving row is
    written to *both* output files so that the two files always list exactly
    the same utterance IDs in the same order:

    * ``train_text.txt``: ``<uuid> <transcript>``
    * ``train_wav.scp``:  ``<uuid> wav/<file name>``

    Cleaning steps, in order: drop rows with a missing required cell, strip
    surrounding whitespace from the three columns, drop duplicate UUIDs
    (first occurrence wins), drop rows whose UUID fails the module-level
    ``validate_uuid``, drop rows with an empty transcript, drop rows whose
    audio file name does not end in ``.wav`` (case-insensitive).

    Problems are reported on stdout and the function returns early; it does
    not raise for an unreadable or malformed input file.

    :param excel_path: path to the ``.xlsx`` or ``.csv`` input (``str`` or
        ``os.PathLike``); the extension check is case-insensitive.
    :param output_dir: directory that receives the two files (created when
        missing; defaults to the current directory).
    :return: ``None``.
    """
    # 1. Load the table. Everything that can go wrong while reading is
    #    reported to the user instead of propagating.
    try:
        source = os.fspath(excel_path)
        suffix = os.path.splitext(source)[1].lower()
        if suffix == ".xlsx":
            df = pd.read_excel(source, engine="openpyxl")
        elif suffix == ".csv":
            df = pd.read_csv(source)
        else:
            raise ValueError("仅支持 .xlsx 和 .csv 格式文件")
    except FileNotFoundError:
        print(f"错误:未找到文件 {excel_path}")
        return
    except Exception as e:
        print(f"读取文件失败:{str(e)}")
        return

    # 2. Column names (adjust to match the actual header row of the table).
    uuid_col = "音频唯一标识 (UUID)"
    text_col = "音频对应的文字内容"
    path_col = "音频保存的路径"

    required_cols = [uuid_col, text_col, path_col]
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        print(f"错误:Excel 中缺少必要列:{missing_cols}")
        return

    # 3. Clean the data.
    df_clean = df.dropna(subset=required_cols).copy()
    # Normalise to stripped strings *before* de-duplicating and validating,
    # so the key that is compared is the key that is eventually written and
    # non-string cells (numbers, etc.) cannot break validation.
    for col in required_cols:
        df_clean[col] = df_clean[col].astype(str).str.strip()
    df_clean = df_clean.drop_duplicates(subset=[uuid_col], keep="first")

    # astype(bool) keeps the mask a real boolean Series even when no rows
    # are left, so it is always usable for boolean indexing.
    valid_mask = df_clean[uuid_col].map(validate_uuid).astype(bool)
    invalid_ids = df_clean.loc[~valid_mask, uuid_col].tolist()
    if invalid_ids:
        print(f"警告:发现 {len(invalid_ids)} 行 UUID 格式错误,已过滤:")
        print(invalid_ids)
    df_clean = df_clean[valid_mask]

    # 4. Decide once which rows are written, so both files stay in sync.
    entries = []
    for utt_id, text, full_path in zip(
        df_clean[uuid_col], df_clean[text_col], df_clean[path_col]
    ):
        # Both output files are line-oriented: fold a multi-line cell into
        # a single line so one record never spans several lines.
        text = " ".join(text.splitlines())
        if not text:
            print(f"警告:{utt_id} 的文本内容为空,已跳过")
            continue
        # os.path.basename only splits on the host OS separator, so a
        # Windows-style path would be kept whole on POSIX; treat both
        # "/" and "\" as separators explicitly.
        wav_file = full_path.replace("\\", "/").rsplit("/", 1)[-1]
        if not wav_file.lower().endswith(".wav"):
            print(f"警告:{utt_id} 的音频路径 {full_path} 不是 .wav 文件,已跳过")
            continue
        entries.append((utt_id, text, wav_file))

    # 5. Write the files.
    out_dir = output_dir or "."  # os.makedirs("") would raise
    os.makedirs(out_dir, exist_ok=True)
    text_file_path = os.path.join(out_dir, "train_text.txt")
    scp_file_path = os.path.join(out_dir, "train_wav.scp")

    with open(text_file_path, "w", encoding="utf-8") as f_text:
        f_text.writelines(f"{utt_id} {text}\n" for utt_id, text, _ in entries)

    with open(scp_file_path, "w", encoding="utf-8") as f_scp:
        f_scp.writelines(
            f"{utt_id} wav/{wav_file}\n" for utt_id, _, wav_file in entries
        )

    # 6. Summary. The "valid rows" figure is the number of records actually
    #    written, not the number of rows that merely passed UUID validation.
    print("\n=== 处理完成 ===")
    print(f"原始数据行数:{len(df)}")
    print(f"清洗后有效行数:{len(entries)}")
    print(f"生成 train_text.txt:{text_file_path}")
    print(f"生成 train_wav.scp:{scp_file_path}")
|
|
# ===================== Script entry point =====================
if __name__ == "__main__":
    # Annotation spreadsheet to convert; change this to your own file.
    EXCEL_FILE = "音频标注数据.xlsx"
    # Directory that receives the generated files; an absolute path such as
    # "/home/boying/funasr_data" may be used instead.
    OUTPUT_DIR = "./data/list/"

    process_excel_to_funasr_files(excel_path=EXCEL_FILE, output_dir=OUTPUT_DIR)
|