Merge pull request #158 from alibaba-damo-academy/dev_lhn
fix data dir filter bug
| | |
| | | wav_dict[sample_name] = wav_path |
| | | text_dict = {} |
| | | for line in text_lines: |
| | | parts = line.strip().split(" ", 1) |
| | | parts = line.strip().split() |
| | | if len(parts) < 2: |
| | | continue |
| | | sample_name, txt = parts |
| | | text_dict[sample_name] = txt |
| | | sample_name = parts[0] |
| | | text_dict[sample_name] = " ".join(parts[1:]) |
| | | filter_count = 0 |
| | | with open(wav_file, "w") as f_wav, open(text_file, "w") as f_text: |
| | | for sample_name, wav_path in wav_dict.items(): |
| | |
| | | f_text.write(sample_name + " " + text_dict[sample_name] + "\n") |
| | | else: |
| | | filter_count += 1 |
| | | print("{}/{} samples in {} are filtered because of the mismatch between wav.scp and text".format(len(wav_lines), filter_count, dataset)) |
| | | print("{}/{} samples in {} are filtered because of the mismatch between wav.scp and text".format(len(wav_lines), filter_count, dataset)) |