| | |
| | | import argparse |
| | | import tqdm |
| | | import codecs |
| | | import textgrid |
| | | try: |
| | | import textgrid |
| | | except: |
| | | raise "Please install textgrid firstly: pip install textgrid" |
| | | import pdb |
| | | |
| | | class Segment(object): |
| | |
| | | import codecs |
| | | from distutils.util import strtobool |
| | | from pathlib import Path |
| | | import textgrid |
| | | try: |
| | | import textgrid |
| | | except: |
| | | raise "Please install textgrid firstly: pip install textgrid" |
| | | import pdb |
| | | |
| | | class Segment(object): |
| | |
| | | import codecs |
| | | from distutils.util import strtobool |
| | | from pathlib import Path |
| | | import textgrid |
| | | try: |
| | | import textgrid |
| | | except: |
| | | raise "Please install textgrid firstly: pip install textgrid" |
| | | import pdb |
| | | |
| | | class Segment(object): |
| | |
| | | import codecs |
| | | from distutils.util import strtobool |
| | | from pathlib import Path |
| | | import textgrid |
| | | try: |
| | | import textgrid |
| | | except: |
| | | raise "Please install textgrid firstly: pip install textgrid" |
| | | import pdb |
| | | |
| | | class Segment(object): |
| | |
| | | import codecs |
| | | from distutils.util import strtobool |
| | | from pathlib import Path |
| | | import textgrid |
| | | try: |
| | | import textgrid |
| | | except: |
| | | raise "Please install textgrid firstly: pip install textgrid" |
| | | import pdb |
| | | |
| | | def get_args(): |
| | |
| | | import codecs |
| | | from distutils.util import strtobool |
| | | from pathlib import Path |
| | | import textgrid |
| | | |
try:
    import textgrid
except ImportError:
    # `raise <str>` is a TypeError in Python 3 and would mask the message;
    # raise a real exception carrying the install hint instead.
    raise ImportError("Please install textgrid firstly: pip install textgrid")
| | | |
| | | import pdb |
| | | import numpy as np |
| | | import sys |
| | |
| | | """Speech2Text class |
| | | |
| | | Examples: |
| | | >>> import soundfile |
| | | >>> import librosa |
| | | >>> speech2text = Speech2Text("asr_config.yml", "asr.pb") |
| | | >>> audio, rate = soundfile.read("speech.wav") |
| | | >>> audio, rate = librosa.load("speech.wav") |
| | | >>> speech2text(audio) |
| | | [(text, token, token_int, hypothesis object), ...] |
| | | |
| | |
| | | """Speech2Text class |
| | | |
| | | Examples: |
| | | >>> import soundfile |
| | | >>> import librosa |
| | | >>> speech2text = Speech2TextParaformer("asr_config.yml", "asr.pb") |
| | | >>> audio, rate = soundfile.read("speech.wav") |
| | | >>> audio, rate = librosa.load("speech.wav") |
| | | >>> speech2text(audio) |
| | | [(text, token, token_int, hypothesis object), ...] |
| | | |
| | |
| | | """Speech2Text class |
| | | |
| | | Examples: |
| | | >>> import soundfile |
| | | >>> import librosa |
| | | >>> speech2text = Speech2TextParaformerOnline("asr_config.yml", "asr.pth") |
| | | >>> audio, rate = soundfile.read("speech.wav") |
| | | >>> audio, rate = librosa.load("speech.wav") |
| | | >>> speech2text(audio) |
| | | [(text, token, token_int, hypothesis object), ...] |
| | | |
| | |
| | | """Speech2Text class |
| | | |
| | | Examples: |
| | | >>> import soundfile |
| | | >>> import librosa |
| | | >>> speech2text = Speech2TextUniASR("asr_config.yml", "asr.pb") |
| | | >>> audio, rate = soundfile.read("speech.wav") |
| | | >>> audio, rate = librosa.load("speech.wav") |
| | | >>> speech2text(audio) |
| | | [(text, token, token_int, hypothesis object), ...] |
| | | |
| | |
| | | """Speech2Text class |
| | | |
| | | Examples: |
| | | >>> import soundfile |
| | | >>> import librosa |
| | | >>> speech2text = Speech2TextMFCCA("asr_config.yml", "asr.pb") |
| | | >>> audio, rate = soundfile.read("speech.wav") |
| | | >>> audio, rate = librosa.load("speech.wav") |
| | | >>> speech2text(audio) |
| | | [(text, token, token_int, hypothesis object), ...] |
| | | |
| | |
| | | """Speech2Text class |
| | | |
| | | Examples: |
| | | >>> import soundfile |
| | | >>> import librosa |
| | | >>> speech2text = Speech2TextSAASR("asr_config.yml", "asr.pb") |
| | | >>> audio, rate = soundfile.read("speech.wav") |
| | | >>> audio, rate = librosa.load("speech.wav") |
| | | >>> speech2text(audio) |
| | | [(text, token, token_int, hypothesis object), ...] |
| | | |
| | |
| | | """Speech2Text class |
| | | |
| | | Examples: |
| | | >>> import soundfile |
| | | >>> import librosa |
| | | >>> speech2text = Speech2Text("asr_config.yml", "asr.pb") |
| | | >>> audio, rate = soundfile.read("speech.wav") |
| | | >>> audio, rate = librosa.load("speech.wav") |
| | | >>> speech2text(audio) |
| | | [(text, token, token_int, hypothesis object), ...] |
| | | |
| | |
| | | import numpy as np |
| | | import torch |
| | | import torchaudio |
| | | import soundfile |
| | | # import librosa |
| | | import librosa |
| | | import yaml |
| | | |
| | | from funasr.bin.asr_infer import Speech2Text |
| | |
| | | try: |
| | | raw_inputs = torchaudio.load(data_path_and_name_and_type[0])[0][0] |
| | | except: |
| | | raw_inputs = soundfile.read(data_path_and_name_and_type[0], dtype='float32')[0] |
| | | # raw_inputs = librosa.load(data_path_and_name_and_type[0], dtype='float32')[0] |
| | | raw_inputs, sr = librosa.load(data_path_and_name_and_type[0], dtype='float32') |
| | | if raw_inputs.ndim == 2: |
| | | raw_inputs = raw_inputs[:, 0] |
| | | raw_inputs = torch.tensor(raw_inputs) |
| | |
| | | """Speech2Diarlization class |
| | | |
| | | Examples: |
| | | >>> import soundfile |
| | | >>> import librosa |
| | | >>> import numpy as np |
| | | >>> speech2diar = Speech2DiarizationEEND("diar_sond_config.yml", "diar_sond.pb") |
| | | >>> profile = np.load("profiles.npy") |
| | | >>> audio, rate = soundfile.read("speech.wav") |
| | | >>> audio, rate = librosa.load("speech.wav") |
| | | >>> speech2diar(audio, profile) |
| | | {"spk1": [(int, int), ...], ...} |
| | | |
| | |
| | | """Speech2Xvector class |
| | | |
| | | Examples: |
| | | >>> import soundfile |
| | | >>> import librosa |
| | | >>> import numpy as np |
| | | >>> speech2diar = Speech2DiarizationSOND("diar_sond_config.yml", "diar_sond.pb") |
| | | >>> profile = np.load("profiles.npy") |
| | | >>> audio, rate = soundfile.read("speech.wav") |
| | | >>> audio, rate = librosa.load("speech.wav") |
| | | >>> speech2diar(audio, profile) |
| | | {"spk1": [(int, int), ...], ...} |
| | | |
| | |
| | | from typing import Union |
| | | |
| | | import numpy as np |
| | | import soundfile |
| | | # import librosa |
| | | import librosa |
| | | import torch |
| | | from scipy.signal import medfilt |
| | | |
| | |
| | | # read waveform file |
| | | example = [load_bytes(x) if isinstance(x, bytes) else x |
| | | for x in example] |
| | | example = [soundfile.read(x)[0] if isinstance(x, str) else x |
| | | # example = [librosa.load(x)[0] if isinstance(x, str) else x |
| | | # for x in example] |
| | | example = [librosa.load(x, dtype='float32')[0] if isinstance(x, str) else x |
| | | for x in example] |
| | | # convert torch tensor to numpy array |
| | | example = [x.numpy() if isinstance(example[0], torch.Tensor) else x |
| | |
| | | """SpeechSeparator class |
| | | |
| | | Examples: |
| | | >>> import soundfile |
| | | >>> import librosa |
| | | >>> speech_separator = MossFormer("ss_config.yml", "ss.pt") |
| | | >>> audio, rate = soundfile.read("speech.wav") |
| | | >>> audio, rate = librosa.load("speech.wav") |
| | | >>> separated_wavs = speech_separator(audio) |
| | | |
| | | """ |
| | |
| | | |
| | | import numpy as np |
| | | import torch |
| | | import soundfile as sf |
| | | import librosa |
| | | from funasr.build_utils.build_streaming_iterator import build_streaming_iterator |
| | | from funasr.torch_utils.set_all_random_seed import set_all_random_seed |
| | | from funasr.utils import config_argparse |
| | |
| | | ss_results = speech_separator(**batch) |
| | | |
| | | for spk in range(num_spks): |
| | | sf.write(os.path.join(output_path, keys[0] + '_s' + str(spk+1)+'.wav'), ss_results[spk], sample_rate) |
| | | # sf.write(os.path.join(output_path, keys[0] + '_s' + str(spk+1)+'.wav'), ss_results[spk], sample_rate) |
| | | try: |
| | | librosa.output.write_wav(os.path.join(output_path, keys[0] + '_s' + str(spk+1)+'.wav'), ss_results[spk], sample_rate) |
| | | except: |
| | | print("To write wav by librosa, you should install librosa<=0.8.0") |
| | | raise |
| | | torch.cuda.empty_cache() |
| | | return ss_results |
| | | |
| | |
| | | """Speech2Xvector class |
| | | |
| | | Examples: |
| | | >>> import soundfile |
| | | >>> import librosa |
| | | >>> speech2xvector = Speech2Xvector("sv_config.yml", "sv.pb") |
| | | >>> audio, rate = soundfile.read("speech.wav") |
| | | >>> audio, rate = librosa.load("speech.wav") |
| | | >>> speech2xvector(audio) |
| | | [(text, token, token_int, hypothesis object), ...] |
| | | |
| | |
| | | """Speech2VadSegment class |
| | | |
| | | Examples: |
| | | >>> import soundfile |
| | | >>> import librosa |
| | | >>> speech2segment = Speech2VadSegment("vad_config.yml", "vad.pt") |
| | | >>> audio, rate = soundfile.read("speech.wav") |
| | | >>> audio, rate = librosa.load("speech.wav") |
| | | >>> speech2segment(audio) |
| | | [[10, 230], [245, 450], ...] |
| | | |
| | |
| | | """Speech2VadSegmentOnline class |
| | | |
| | | Examples: |
| | | >>> import soundfile |
| | | >>> import librosa |
| | | >>> speech2segment = Speech2VadSegmentOnline("vad_config.yml", "vad.pt") |
| | | >>> audio, rate = soundfile.read("speech.wav") |
| | | >>> audio, rate = librosa.load("speech.wav") |
| | | >>> speech2segment(audio) |
| | | [[10, 230], [245, 450], ...] |
| | | |
| | |
| | | for iepoch in range(start_epoch, trainer_options.max_epoch + 1): |
| | | if iepoch != start_epoch: |
| | | logging.info( |
| | | "{}/{}epoch started. Estimated time to finish: {}".format( |
| | | "{}/{}epoch started. Estimated time to finish: {} hours".format( |
| | | iepoch, |
| | | trainer_options.max_epoch, |
| | | humanfriendly.format_timespan( |
| | | (time.perf_counter() - start_time) |
| | | / (iepoch - start_epoch) |
| | | * (trainer_options.max_epoch - iepoch + 1) |
| | | ), |
| | | (time.perf_counter() - start_time) / 3600.0 / (iepoch - start_epoch) * ( |
| | | trainer_options.max_epoch - iepoch + 1), |
| | | ) |
| | | ) |
| | | else: |
| | |
| | | from typing import Mapping |
| | | from typing import Tuple |
| | | from typing import Union |
| | | |
| | | import h5py |
| | | try: |
| | | import h5py |
| | | except: |
| | | print("If you want use h5py dataset, please pip install h5py, and try it again") |
| | | import humanfriendly |
| | | import kaldiio |
| | | import numpy as np |
| | |
| | | import numpy as np |
| | | import torch |
| | | import torchaudio |
| | | import soundfile |
| | | # import librosa |
| | | import librosa |
| | | from torch.utils.data.dataset import IterableDataset |
| | | import os.path |
| | | |
| | |
| | | try: |
| | | return torchaudio.load(input)[0].numpy() |
| | | except: |
| | | waveform, _ = soundfile.read(input, dtype='float32') |
| | | # waveform, _ = librosa.load(input, dtype='float32') |
| | | waveform, _ = librosa.load(input, dtype='float32') |
| | | if waveform.ndim == 2: |
| | | waveform = waveform[:, 0] |
| | | return np.expand_dims(waveform, axis=0) |
| | |
| | | import torch.distributed as dist |
| | | import torchaudio |
| | | import numpy as np |
| | | import soundfile |
| | | # import librosa |
| | | import librosa |
| | | from kaldiio import ReadHelper |
| | | from torch.utils.data import IterableDataset |
| | | |
| | |
| | | try: |
| | | waveform, sampling_rate = torchaudio.load(path) |
| | | except: |
| | | waveform, sampling_rate = soundfile.read(path, dtype='float32') |
| | | # waveform, sampling_rate = librosa.load(path, dtype='float32') |
| | | waveform, sampling_rate = librosa.load(path, dtype='float32') |
| | | if waveform.ndim == 2: |
| | | waveform = waveform[:, 0] |
| | | waveform = np.expand_dims(waveform, axis=0) |
| | |
| | | |
| | | import numpy as np |
| | | import scipy.signal |
| | | import soundfile |
| | | import librosa |
| | | import jieba |
| | | |
| | | from funasr.text.build_tokenizer import build_tokenizer |
| | |
| | | if self.rirs is not None and self.rir_apply_prob >= np.random.random(): |
| | | rir_path = np.random.choice(self.rirs) |
| | | if rir_path is not None: |
| | | rir, _ = soundfile.read( |
| | | rir, _ = librosa.load( |
| | | rir_path, dtype=np.float64, always_2d=True |
| | | ) |
| | | |
| | |
| | | noise_db = np.random.uniform( |
| | | self.noise_db_low, self.noise_db_high |
| | | ) |
| | | with soundfile.SoundFile(noise_path) as f: |
| | | if f.frames == nsamples: |
| | | noise = f.read(dtype=np.float64, always_2d=True) |
| | | elif f.frames < nsamples: |
| | | offset = np.random.randint(0, nsamples - f.frames) |
| | | # noise: (Time, Nmic) |
| | | noise = f.read(dtype=np.float64, always_2d=True) |
| | | # Repeat noise |
| | | noise = np.pad( |
| | | noise, |
| | | [(offset, nsamples - f.frames - offset), (0, 0)], |
| | | mode="wrap", |
| | | ) |
| | | else: |
| | | offset = np.random.randint(0, f.frames - nsamples) |
| | | f.seek(offset) |
| | | # noise: (Time, Nmic) |
| | | noise = f.read( |
| | | nsamples, dtype=np.float64, always_2d=True |
| | | ) |
| | | if len(noise) != nsamples: |
| | | raise RuntimeError(f"Something wrong: {noise_path}") |
| | | |
| | | audio_data = librosa.load(noise_path, dtype='float32')[0][None, :] |
| | | frames = len(audio_data[0]) |
| | | if frames == nsamples: |
| | | noise = audio_data |
| | | elif frames < nsamples: |
| | | offset = np.random.randint(0, nsamples - frames) |
| | | # noise: (Time, Nmic) |
| | | noise = audio_data |
| | | # Repeat noise |
| | | noise = np.pad( |
| | | noise, |
| | | [(offset, nsamples - frames - offset), (0, 0)], |
| | | mode="wrap", |
| | | ) |
| | | else: |
| | | noise = audio_data[:, nsamples] |
| | | # offset = np.random.randint(0, frames - nsamples) |
| | | # f.seek(offset) |
| | | # noise: (Time, Nmic) |
| | | # noise = f.read( |
| | | # nsamples, dtype=np.float64, always_2d=True |
| | | # ) |
| | | # if len(noise) != nsamples: |
| | | # raise RuntimeError(f"Something wrong: {noise_path}") |
| | | # noise: (Nmic, Time) |
| | | noise = noise.T |
| | | |
| | |
| | | |
| | | import numpy as np |
| | | import scipy.signal |
| | | import soundfile |
| | | import librosa |
| | | |
| | | from funasr.text.build_tokenizer import build_tokenizer |
| | | from funasr.text.cleaner import TextCleaner |
| | |
| | | if self.rirs is not None and self.rir_apply_prob >= np.random.random(): |
| | | rir_path = np.random.choice(self.rirs) |
| | | if rir_path is not None: |
| | | rir, _ = soundfile.read( |
| | | rir, _ = librosa.load( |
| | | rir_path, dtype=np.float64, always_2d=True |
| | | ) |
| | | |
| | |
| | | noise_db = np.random.uniform( |
| | | self.noise_db_low, self.noise_db_high |
| | | ) |
| | | with soundfile.SoundFile(noise_path) as f: |
| | | if f.frames == nsamples: |
| | | noise = f.read(dtype=np.float64, always_2d=True) |
| | | elif f.frames < nsamples: |
| | | offset = np.random.randint(0, nsamples - f.frames) |
| | | # noise: (Time, Nmic) |
| | | noise = f.read(dtype=np.float64, always_2d=True) |
| | | # Repeat noise |
| | | noise = np.pad( |
| | | noise, |
| | | [(offset, nsamples - f.frames - offset), (0, 0)], |
| | | mode="wrap", |
| | | ) |
| | | else: |
| | | offset = np.random.randint(0, f.frames - nsamples) |
| | | f.seek(offset) |
| | | # noise: (Time, Nmic) |
| | | noise = f.read( |
| | | nsamples, dtype=np.float64, always_2d=True |
| | | ) |
| | | if len(noise) != nsamples: |
| | | raise RuntimeError(f"Something wrong: {noise_path}") |
| | | audio_data = librosa.load(noise_path, dtype='float32')[0][None, :] |
| | | frames = len(audio_data[0]) |
| | | if frames == nsamples: |
| | | noise = audio_data |
| | | elif frames < nsamples: |
| | | offset = np.random.randint(0, nsamples - frames) |
| | | # noise: (Time, Nmic) |
| | | noise = audio_data |
| | | # Repeat noise |
| | | noise = np.pad( |
| | | noise, |
| | | [(offset, nsamples - frames - offset), (0, 0)], |
| | | mode="wrap", |
| | | ) |
| | | else: |
| | | noise = audio_data[:, nsamples] |
| | | # offset = np.random.randint(0, frames - nsamples) |
| | | # f.seek(offset) |
| | | # noise: (Time, Nmic) |
| | | # noise = f.read( |
| | | # nsamples, dtype=np.float64, always_2d=True |
| | | # ) |
| | | # if len(noise) != nsamples: |
| | | # raise RuntimeError(f"Something wrong: {noise_path}") |
| | | # noise: (Nmic, Time) |
| | | noise = noise.T |
| | | |
| | |
| | | |
| | | import random |
| | | import numpy as np |
| | | import soundfile |
| | | import librosa |
| | | import librosa |
| | | |
| | | import torch |
| | |
| | | def __getitem__(self, key): |
| | | wav = self.data[key] |
| | | if self.normalize: |
| | | # soundfile.read normalizes data to [-1,1] if dtype is not given |
| | | # librosa.load normalizes data to [-1,1] if dtype is not given |
| | | array, rate = librosa.load( |
| | | wav, sr=self.dest_sample_rate, mono=self.always_2d |
| | | ) |
| | |
| | | from typing import Union |
| | | |
| | | import torch |
| | | from torch_complex import functional as FC |
| | | from torch_complex.tensor import ComplexTensor |
| | | try: |
| | | from torch_complex import functional as FC |
| | | from torch_complex.tensor import ComplexTensor |
| | | except: |
| | | raise "Please install torch_complex firstly" |
| | | |
| | | |
| | | |
| | | EPS = torch.finfo(torch.double).eps |
| | |
| | | from typing import Union |
| | | |
| | | import torch |
| | | from torch_complex.tensor import ComplexTensor |
| | | |
try:
    from torch_complex.tensor import ComplexTensor
except ImportError:
    # `raise <str>` is a TypeError in Python 3 and would mask the message;
    # raise a real exception carrying the install hint instead.
    raise ImportError("Please install torch_complex firstly")
| | | from funasr.modules.nets_utils import make_pad_mask |
| | | from funasr.layers.complex_utils import is_complex |
| | | from funasr.layers.inversible_interface import InversibleInterface |
| | |
| | | import torch |
| | | import torch.nn as nn |
| | | import torch.nn.functional as F |
| | | |
| | | from rotary_embedding_torch import RotaryEmbedding |
try:
    from rotary_embedding_torch import RotaryEmbedding
except ImportError:
    # `raise <str>` is a TypeError in Python 3 and would mask the message;
    # raise a real exception carrying the install hint instead.
    raise ImportError("Please install rotary_embedding_torch by: \n pip install -U funasr[all]")
| | | from funasr.modules.layer_norm import GlobalLayerNorm, CumulativeLayerNorm, ScaleNorm |
| | | from funasr.modules.embedding import ScaledSinuEmbedding |
| | | from funasr.modules.mossformer import FLASH_ShareA_FFConvM |
| | |
| | | import humanfriendly |
| | | import numpy as np |
| | | import torch |
| | | from torch_complex.tensor import ComplexTensor |
try:
    from torch_complex.tensor import ComplexTensor
except ImportError:
    # `raise <str>` is a TypeError in Python 3 and would mask the message;
    # raise a real exception carrying the install hint instead.
    raise ImportError("Please install torch_complex firstly")
| | | |
| | | from funasr.layers.log_mel import LogMel |
| | | from funasr.layers.stft import Stft |
| | |
| | | import sys |
| | | import numpy as np |
| | | import subprocess |
| | | import soundfile as sf |
| | | import librosa as sf |
| | | import io |
| | | from functools import lru_cache |
| | | |
| | |
| | | # input piped command |
| | | p = subprocess.Popen(wav_rxfilename[:-1], shell=True, |
| | | stdout=subprocess.PIPE) |
| | | data, samplerate = sf.read(io.BytesIO(p.stdout.read()), |
| | | data, samplerate = sf.load(io.BytesIO(p.stdout.read()), |
| | | dtype='float32') |
| | | # cannot seek |
| | | data = data[start:end] |
| | | elif wav_rxfilename == '-': |
| | | # stdin |
| | | data, samplerate = sf.read(sys.stdin, dtype='float32') |
| | | data, samplerate = sf.load(sys.stdin, dtype='float32') |
| | | # cannot seek |
| | | data = data[start:end] |
| | | else: |
| | | # normal wav file |
| | | data, samplerate = sf.read(wav_rxfilename, start=start, stop=end) |
| | | data, samplerate = sf.load(wav_rxfilename, start=start, stop=end) |
| | | return data, samplerate |
| | | |
| | | |
| | |
| | | for iepoch in range(start_epoch, trainer_options.max_epoch + 1): |
| | | if iepoch != start_epoch: |
| | | logging.info( |
| | | "{}/{}epoch started. Estimated time to finish: {}".format( |
| | | "{}/{}epoch started. Estimated time to finish: {} hours".format( |
| | | iepoch, |
| | | trainer_options.max_epoch, |
| | | humanfriendly.format_timespan( |
| | | (time.perf_counter() - start_time) |
| | | / (iepoch - start_epoch) |
| | | * (trainer_options.max_epoch - iepoch + 1) |
| | | ), |
| | | (time.perf_counter() - start_time) / 3600.0 / (iepoch - start_epoch) * ( |
| | | trainer_options.max_epoch - iepoch + 1), |
| | | ) |
| | | ) |
| | | else: |
| | |
| | | from typing import Any, Dict, List, Union |
| | | |
| | | import torchaudio |
| | | import soundfile |
| | | import librosa |
| | | import numpy as np |
| | | import pkg_resources |
| | | from modelscope.utils.logger import get_logger |
| | |
| | | try: |
| | | audio, fs = torchaudio.load(fname) |
| | | except: |
| | | audio, fs = soundfile.read(fname) |
| | | audio, fs = librosa.load(fname) |
| | | break |
| | | if audio_type.rfind(".scp") >= 0: |
| | | with open(fname, encoding="utf-8") as f: |
| | |
| | | |
| | | import kaldiio |
| | | import numpy as np |
| | | import soundfile |
| | | import librosa |
| | | import torch.distributed as dist |
| | | import torchaudio |
| | | |
| | |
| | | try: |
| | | waveform, sampling_rate = torchaudio.load(wav_path) |
| | | except: |
| | | waveform, sampling_rate = soundfile.read(wav_path) |
| | | waveform, sampling_rate = librosa.load(wav_path) |
| | | waveform = np.expand_dims(waveform, axis=0) |
| | | n_frames = (waveform.shape[1] * 1000.0) / (sampling_rate * frontend_conf["frame_shift"] * frontend_conf["lfr_n"]) |
| | | feature_dim = frontend_conf["n_mels"] * frontend_conf["lfr_m"] |
| | |
| | | from typing import Any, Dict, List, Union |
| | | |
| | | import numpy as np |
| | | import soundfile as sf |
| | | import librosa as sf |
| | | import torch |
| | | import torchaudio |
| | | import logging |
| | |
| | | for i in range(len(inputs)): |
| | | if isinstance(inputs[i], str): |
| | | file_bytes = File.read(inputs[i]) |
| | | data, fs = sf.read(io.BytesIO(file_bytes), dtype='float32') |
| | | data, fs = sf.load(io.BytesIO(file_bytes), dtype='float32') |
| | | if len(data.shape) == 2: |
| | | data = data[:, 0] |
| | | data = torch.from_numpy(data).unsqueeze(0) |
| | |
| | | import logging |
| | | import argparse |
| | | import numpy as np |
| | | import edit_distance |
| | | # import edit_distance |
| | | from itertools import zip_longest |
| | | |
| | | |
| | |
| | | return res |
| | | |
| | | |
class AverageShiftCalculator():
    """Compute the average timestamp shift between two timestamp files.

    Each input file holds one utterance per line in the format::

        <uttid> <char> <start> <end>;<char> <start> <end>;...

    The two files are aligned per utterance; when the transcriptions differ,
    edit-distance alignment keeps only the characters present in both, and the
    average of |start1 - start2| + |end1 - end2| over those characters is
    reported via ``logging``.
    """

    def __init__(self):
        logging.warning("Calculating average shift.")

    def __call__(self, file1, file2):
        """Read both files and log the average shift over shared utterances."""
        uttid_list1, ts_dict1 = self.read_timestamps(file1)
        uttid_list2, ts_dict2 = self.read_timestamps(file2)
        uttid_intersection = self._intersection(uttid_list1, uttid_list2)
        res = self.as_cal(uttid_intersection, ts_dict1, ts_dict2)
        logging.warning("Average shift of {} and {}: {}.".format(file1, file2, str(res)[:8]))
        logging.warning("Following timestamp pair differs most: {}, detail:{}".format(self.max_shift, self.max_shift_uttid))

    def _intersection(self, list1, list2):
        # Utterance ids present in both files; returns the common set/list and
        # logs how many lines differ between the two files.
        set1 = set(list1)
        set2 = set(list2)
        if set1 == set2:
            logging.warning("Uttid same checked.")
            return set1
        itsc = list(set1 & set2)
        logging.warning("Uttid differs: file1 {}, file2 {}, lines same {}.".format(len(list1), len(list2), len(itsc)))
        return itsc

    def read_timestamps(self, file):
        """Parse a timestamp file in the standard format.

        Returns:
            uttid_list: utterance ids in file order.
            ts_dict: uttid -> (comma-joined text, [(start, end), ...]).
        """
        uttid_list = []
        ts_dict = {}
        with codecs.open(file, 'r') as fin:
            for line in fin.readlines():
                text = ''
                ts_list = []
                line = line.rstrip()
                uttid = line.split()[0]
                uttid_list.append(uttid)
                body = " ".join(line.split()[1:])
                for pd in body.split(';'):
                    if not len(pd):
                        continue
                    char, start, end = pd.lstrip(" ").split(' ')
                    text += char + ','
                    ts_list.append((float(start), float(end)))
                ts_dict[uttid] = (text[:-1], ts_list)
        logging.warning("File {} read done.".format(file))
        return uttid_list, ts_dict

    def _shift(self, filtered_timestamp_list1, filtered_timestamp_list2):
        # Sum of |start1 - start2| + |end1 - end2| over aligned token pairs.
        shift_time = 0
        for fts1, fts2 in zip(filtered_timestamp_list1, filtered_timestamp_list2):
            shift_time += abs(fts1[0] - fts2[0]) + abs(fts1[1] - fts2[1])
        num_tokens = len(filtered_timestamp_list1)
        return shift_time, num_tokens

    def as_cal(self, uttid_list, ts_dict1, ts_dict2):
        """Average shift between the two timestamp dicts over ``uttid_list``.

        When characters differ, edit-distance alignment is used and only the
        characters common to both transcriptions are compared.
        """
        self._accumlated_shift = 0
        self._accumlated_tokens = 0
        self.max_shift = 0
        self.max_shift_uttid = None
        for uttid in uttid_list:
            (t1, ts1) = ts_dict1[uttid]
            (t2, ts2) = ts_dict2[uttid]
            # _align masks tokens of t2, _align3 masks tokens of t1;
            # a 1 marks a character shared by both transcriptions.
            # (The original also built a reverse SequenceMatcher and _t1/_t2
            # lists that were never read - dead code, removed.)
            _align, _align3 = [], []
            fts1, fts2 = [], []
            sm = edit_distance.SequenceMatcher(t1.split(','), t2.split(','))
            s = sm.get_opcodes()
            for j in range(len(s)):
                if s[j][0] == "replace" or s[j][0] == "insert":
                    _align.append(0)
                if s[j][0] == "replace" or s[j][0] == "delete":
                    _align3.append(0)
                elif s[j][0] == "equal":
                    _align.append(1)
                    _align3.append(1)
            # keep only the timestamps of shared characters
            for a, ts in zip(_align, ts2):
                if a:
                    fts2.append(ts)
            for a, ts in zip(_align3, ts1):
                if a:
                    fts1.append(ts)
            if len(fts1) == len(fts2):
                shift_time, num_tokens = self._shift(fts1, fts2)
                self._accumlated_shift += shift_time
                self._accumlated_tokens += num_tokens
                # Guard num_tokens == 0 (no character aligned) to avoid
                # ZeroDivisionError; such utterances contribute nothing.
                if num_tokens and shift_time / num_tokens > self.max_shift:
                    self.max_shift = shift_time / num_tokens
                    self.max_shift_uttid = uttid
            else:
                logging.warning("length mismatch")
        # NOTE(review): still raises ZeroDivisionError when no utterance at
        # all could be aligned - kept, as callers treat that as a hard error.
        return self._accumlated_shift / self._accumlated_tokens
| | | # class AverageShiftCalculator(): |
| | | # def __init__(self): |
| | | # logging.warning("Calculating average shift.") |
| | | # def __call__(self, file1, file2): |
| | | # uttid_list1, ts_dict1 = self.read_timestamps(file1) |
| | | # uttid_list2, ts_dict2 = self.read_timestamps(file2) |
| | | # uttid_intersection = self._intersection(uttid_list1, uttid_list2) |
| | | # res = self.as_cal(uttid_intersection, ts_dict1, ts_dict2) |
| | | # logging.warning("Average shift of {} and {}: {}.".format(file1, file2, str(res)[:8])) |
| | | # logging.warning("Following timestamp pair differs most: {}, detail:{}".format(self.max_shift, self.max_shift_uttid)) |
| | | # |
| | | # def _intersection(self, list1, list2): |
| | | # set1 = set(list1) |
| | | # set2 = set(list2) |
| | | # if set1 == set2: |
| | | # logging.warning("Uttid same checked.") |
| | | # return set1 |
| | | # itsc = list(set1 & set2) |
| | | # logging.warning("Uttid differs: file1 {}, file2 {}, lines same {}.".format(len(list1), len(list2), len(itsc))) |
| | | # return itsc |
| | | # |
| | | # def read_timestamps(self, file): |
| | | # # read timestamps file in standard format |
| | | # uttid_list = [] |
| | | # ts_dict = {} |
| | | # with codecs.open(file, 'r') as fin: |
| | | # for line in fin.readlines(): |
| | | # text = '' |
| | | # ts_list = [] |
| | | # line = line.rstrip() |
| | | # uttid = line.split()[0] |
| | | # uttid_list.append(uttid) |
| | | # body = " ".join(line.split()[1:]) |
| | | # for pd in body.split(';'): |
| | | # if not len(pd): continue |
| | | # # pdb.set_trace() |
| | | # char, start, end = pd.lstrip(" ").split(' ') |
| | | # text += char + ',' |
| | | # ts_list.append((float(start), float(end))) |
| | | # # ts_lists.append(ts_list) |
| | | # ts_dict[uttid] = (text[:-1], ts_list) |
| | | # logging.warning("File {} read done.".format(file)) |
| | | # return uttid_list, ts_dict |
| | | # |
| | | # def _shift(self, filtered_timestamp_list1, filtered_timestamp_list2): |
| | | # shift_time = 0 |
| | | # for fts1, fts2 in zip(filtered_timestamp_list1, filtered_timestamp_list2): |
| | | # shift_time += abs(fts1[0] - fts2[0]) + abs(fts1[1] - fts2[1]) |
| | | # num_tokens = len(filtered_timestamp_list1) |
| | | # return shift_time, num_tokens |
| | | # |
| | | # # def as_cal(self, uttid_list, ts_dict1, ts_dict2): |
| | | # # # calculate average shift between timestamp1 and timestamp2 |
| | | # # # when characters differ, use edit distance alignment |
| | | # # # and calculate the error between the same characters |
| | | # # self._accumlated_shift = 0 |
| | | # # self._accumlated_tokens = 0 |
| | | # # self.max_shift = 0 |
| | | # # self.max_shift_uttid = None |
| | | # # for uttid in uttid_list: |
| | | # # (t1, ts1) = ts_dict1[uttid] |
| | | # # (t2, ts2) = ts_dict2[uttid] |
| | | # # _align, _align2, _align3 = [], [], [] |
| | | # # fts1, fts2 = [], [] |
| | | # # _t1, _t2 = [], [] |
| | | # # sm = edit_distance.SequenceMatcher(t1.split(','), t2.split(',')) |
| | | # # s = sm.get_opcodes() |
| | | # # for j in range(len(s)): |
| | | # # if s[j][0] == "replace" or s[j][0] == "insert": |
| | | # # _align.append(0) |
| | | # # if s[j][0] == "replace" or s[j][0] == "delete": |
| | | # # _align3.append(0) |
| | | # # elif s[j][0] == "equal": |
| | | # # _align.append(1) |
| | | # # _align3.append(1) |
| | | # # else: |
| | | # # continue |
| | | # # # use s to index t2 |
| | | # # for a, ts , t in zip(_align, ts2, t2.split(',')): |
| | | # # if a: |
| | | # # fts2.append(ts) |
| | | # # _t2.append(t) |
| | | # # sm2 = edit_distance.SequenceMatcher(t2.split(','), t1.split(',')) |
| | | # # s = sm2.get_opcodes() |
| | | # # for j in range(len(s)): |
| | | # # if s[j][0] == "replace" or s[j][0] == "insert": |
| | | # # _align2.append(0) |
| | | # # elif s[j][0] == "equal": |
| | | # # _align2.append(1) |
| | | # # else: |
| | | # # continue |
| | | # # # use s2 tp index t1 |
| | | # # for a, ts, t in zip(_align3, ts1, t1.split(',')): |
| | | # # if a: |
| | | # # fts1.append(ts) |
| | | # # _t1.append(t) |
| | | # # if len(fts1) == len(fts2): |
| | | # # shift_time, num_tokens = self._shift(fts1, fts2) |
| | | # # self._accumlated_shift += shift_time |
| | | # # self._accumlated_tokens += num_tokens |
| | | # # if shift_time/num_tokens > self.max_shift: |
| | | # # self.max_shift = shift_time/num_tokens |
| | | # # self.max_shift_uttid = uttid |
| | | # # else: |
| | | # # logging.warning("length mismatch") |
| | | # # return self._accumlated_shift / self._accumlated_tokens |
| | | |
| | | |
| | | def convert_external_alphas(alphas_file, text_file, output_file): |
| | |
| | | |
| | | |
def main(args):
    """Dispatch to the tool selected by ``args.mode``.

    Modes:
        cal_aas:         average shift between two timestamp files
                         (``args.input`` vs ``args.input2``).
        read_ext_alphas: convert external alphas (``args.input``) with text
                         (``args.input2``) into timestamps (``args.output``).

    Unknown modes are logged as errors (the unresolved-merge version nested a
    redundant mode check inside the elif, so unknown modes silently did
    nothing).
    """
    if args.mode == 'cal_aas':
        asc = AverageShiftCalculator()
        asc(args.input, args.input2)
    elif args.mode == 'read_ext_alphas':
        convert_external_alphas(args.input, args.input2, args.output)
    else:
        logging.error("Mode {} not in SUPPORTED_MODES: {}.".format(args.mode, SUPPORTED_MODES))
| | |
| | | import numpy as np |
| | | import torch |
| | | import torchaudio |
| | | import soundfile |
| | | import librosa |
| | | import torchaudio.compliance.kaldi as kaldi |
| | | |
| | | |
| | |
| | | try: |
| | | waveform, audio_sr = torchaudio.load(wav_file) |
| | | except: |
| | | waveform, audio_sr = soundfile.read(wav_file, dtype='float32') |
| | | waveform, audio_sr = librosa.load(wav_file, dtype='float32') |
| | | if waveform.ndim == 2: |
| | | waveform = waveform[:, 0] |
| | | waveform = torch.tensor(np.expand_dims(waveform, axis=0)) |
| | |
| | | try: |
| | | waveform, sampling_rate = torchaudio.load(wav_path) |
| | | except: |
| | | waveform, sampling_rate = soundfile.read(wav_path) |
| | | waveform, sampling_rate = librosa.load(wav_path) |
| | | waveform = torch.tensor(np.expand_dims(waveform, axis=0)) |
| | | speech_length = (waveform.shape[1] / sampling_rate) * 1000. |
| | | n_frames = (waveform.shape[1] * 1000.0) / (sampling_rate * frontend_conf["frame_shift"] * frontend_conf["lfr_n"]) |
| | |
| | | import os |
| | | from functools import lru_cache |
| | | from typing import Union |
| | | try: |
| | | import ffmpeg |
| | | except: |
| | | print("Please Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.") |
| | | |
| | | import ffmpeg |
| | | import numpy as np |
| | | import torch |
| | | import torch.nn.functional as F |
| | |
| | | |
| | | requirements = { |
| | | "install": [ |
| | | "setuptools>=38.5.1", |
| | | # "setuptools>=38.5.1", |
| | | "humanfriendly", |
| | | "scipy>=1.4.1", |
| | | "librosa", |
| | | "jamo", # For kss |
| | | # "jamo", # For kss |
| | | "PyYAML>=5.1.2", |
| | | "soundfile>=0.12.1", |
| | | "h5py>=3.1.0", |
| | | # "soundfile>=0.12.1", |
| | | # "h5py>=3.1.0", |
| | | "kaldiio>=2.17.0", |
| | | "torch_complex", |
| | | "nltk>=3.4.5", |
| | | # "torch_complex", |
| | | # "nltk>=3.4.5", |
| | | # ASR |
| | | "sentencepiece", |
| | | "sentencepiece", # train |
| | | "jieba", |
| | | "rotary_embedding_torch", |
| | | "ffmpeg", |
| | | # "rotary_embedding_torch", |
| | | # "ffmpeg-python", |
| | | # TTS |
| | | "pypinyin>=0.44.0", |
| | | "espnet_tts_frontend", |
| | | # "pypinyin>=0.44.0", |
| | | # "espnet_tts_frontend", |
| | | # ENH |
| | | "pytorch_wpe", |
| | | # "pytorch_wpe", |
| | | "editdistance>=0.5.2", |
| | | "tensorboard", |
| | | "g2p", |
| | | "nara_wpe", |
| | | # "g2p", |
| | | # "nara_wpe", |
| | | # PAI |
| | | "oss2", |
| | | "edit-distance", |
| | | "textgrid", |
| | | "protobuf", |
| | | # "edit-distance", |
| | | # "textgrid", |
| | | # "protobuf", |
| | | "tqdm", |
| | | "hdbscan", |
| | | "umap", |
| | |
| | | name="funasr", |
| | | version=version, |
| | | url="https://github.com/alibaba-damo-academy/FunASR.git", |
| | | author="Speech Lab of DAMO Academy, Alibaba Group", |
| | | author="Speech Lab of Alibaba Group", |
| | | author_email="funasr@list.alibaba-inc.com", |
| | | description="FunASR: A Fundamental End-to-End Speech Recognition Toolkit", |
| | | long_description=open(os.path.join(dirname, "README.md"), encoding="utf-8").read(), |