| | |
| | | cmd="funasr/bin/inference.py" |
| | | |
| | | python $cmd \ |
| | | +model="/Users/zhifu/modelscope_models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \ |
| | | +model="/Users/zhifu/Downloads/modelscope_models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \ |
| | | +input="/Users/zhifu/Downloads/asr_example.wav" \ |
| | | +output_dir="/Users/zhifu/Downloads/ckpt/funasr2/exp2" \ |
| | | +device="cpu" \ |
| | | |
| | | python $cmd \ |
| | | +model="/Users/zhifu/modelscope_models/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404" \ |
| | | +model="/Users/zhifu/Downloads/modelscope_models/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404" \ |
| | | +input="/Users/zhifu/Downloads/asr_example.wav" \ |
| | | +output_dir="/Users/zhifu/Downloads/ckpt/funasr2/exp2" \ |
| | | +device="cpu" \ |
| | |
| | | import time |
| | | import random |
| | | import string |
| | | from funasr.utils.register import registry_tables |
| | | from funasr.register import tables |
| | | |
| | | |
| | | def build_iter_for_infer(data_in, input_len=None, data_type="sound"): |
| | |
| | | |
| | | class AutoModel: |
| | | def __init__(self, **kwargs): |
| | | registry_tables.print() |
| | | tables.print() |
| | | assert "model" in kwargs |
| | | if "model_conf" not in kwargs: |
| | | logging.info("download models from model hub: {}".format(kwargs.get("model_hub", "ms"))) |
| | |
| | | # build tokenizer |
| | | tokenizer = kwargs.get("tokenizer", None) |
| | | if tokenizer is not None: |
| | | tokenizer_class = registry_tables.tokenizer_classes.get(tokenizer.lower()) |
| | | tokenizer_class = tables.tokenizer_classes.get(tokenizer.lower()) |
| | | tokenizer = tokenizer_class(**kwargs["tokenizer_conf"]) |
| | | kwargs["tokenizer"] = tokenizer |
| | | kwargs["token_list"] = tokenizer.token_list |
| | |
| | | # build frontend |
| | | frontend = kwargs.get("frontend", None) |
| | | if frontend is not None: |
| | | frontend_class = registry_tables.frontend_classes.get(frontend.lower()) |
| | | frontend_class = tables.frontend_classes.get(frontend.lower()) |
| | | frontend = frontend_class(**kwargs["frontend_conf"]) |
| | | kwargs["frontend"] = frontend |
| | | kwargs["input_size"] = frontend.output_size() |
| | | |
| | | # build model |
| | | model_class = registry_tables.model_classes.get(kwargs["model"].lower()) |
| | | model_class = tables.model_classes.get(kwargs["model"].lower()) |
| | | model = model_class(**kwargs, **kwargs["model_conf"], vocab_size=len(tokenizer.token_list) if tokenizer is not None else -1) |
| | | model.eval() |
| | | model.to(device) |
| | |
| | | from torch.nn.parallel import DistributedDataParallel as DDP |
| | | from torch.distributed.fsdp import FullyShardedDataParallel as FSDP |
| | | from funasr.download.download_from_hub import download_model |
| | | from funasr.utils.register import registry_tables |
| | | from funasr.register import tables |
| | | |
| | | @hydra.main(config_name=None, version_base=None) |
| | | def main_hydra(kwargs: DictConfig): |
| | |
| | | # preprocess_config(kwargs) |
| | | # import pdb; pdb.set_trace() |
| | | # set random seed |
| | | registry_tables.print() |
| | | tables.print() |
| | | set_all_random_seed(kwargs.get("seed", 0)) |
| | | torch.backends.cudnn.enabled = kwargs.get("cudnn_enabled", torch.backends.cudnn.enabled) |
| | | torch.backends.cudnn.benchmark = kwargs.get("cudnn_benchmark", torch.backends.cudnn.benchmark) |
| | |
| | | |
| | | tokenizer = kwargs.get("tokenizer", None) |
| | | if tokenizer is not None: |
| | | tokenizer_class = registry_tables.tokenizer_classes.get(tokenizer.lower()) |
| | | tokenizer_class = tables.tokenizer_classes.get(tokenizer.lower()) |
| | | tokenizer = tokenizer_class(**kwargs["tokenizer_conf"]) |
| | | kwargs["tokenizer"] = tokenizer |
| | | |
| | | # build frontend if frontend is not None |
| | | frontend = kwargs.get("frontend", None) |
| | | if frontend is not None: |
| | | frontend_class = registry_tables.frontend_classes.get(frontend.lower()) |
| | | frontend_class = tables.frontend_classes.get(frontend.lower()) |
| | | frontend = frontend_class(**kwargs["frontend_conf"]) |
| | | kwargs["frontend"] = frontend |
| | | kwargs["input_size"] = frontend.output_size() |
| | |
| | | # import pdb; |
| | | # pdb.set_trace() |
| | | # build model |
| | | model_class = registry_tables.model_classes.get(kwargs["model"].lower()) |
| | | model_class = tables.model_classes.get(kwargs["model"].lower()) |
| | | model = model_class(**kwargs, **kwargs["model_conf"], vocab_size=len(tokenizer.token_list)) |
| | | |
| | | |
| | |
| | | # import pdb; |
| | | # pdb.set_trace() |
| | | # dataset |
| | | dataset_class = registry_tables.dataset_classes.get(kwargs.get("dataset", "AudioDataset").lower()) |
| | | dataset_class = tables.dataset_classes.get(kwargs.get("dataset", "AudioDataset").lower()) |
| | | dataset_tr = dataset_class(kwargs.get("train_data_set_list"), frontend=frontend, tokenizer=tokenizer, **kwargs.get("dataset_conf")) |
| | | |
| | | # dataloader |
| | | batch_sampler = kwargs["dataset_conf"].get("batch_sampler", "DynamicBatchLocalShuffleSampler") |
| | | batch_sampler_class = registry_tables.batch_sampler_classes.get(batch_sampler.lower()) |
| | | batch_sampler_class = tables.batch_sampler_classes.get(batch_sampler.lower()) |
| | | if batch_sampler is not None: |
| | | batch_sampler = batch_sampler_class(dataset_tr, **kwargs.get("dataset_conf")) |
| | | dataloader_tr = torch.utils.data.DataLoader(dataset_tr, |
| | |
| | | import logging |
| | | |
| | | from funasr.datasets.audio_datasets.load_audio_extract_fbank import load_audio, extract_fbank |
| | | from funasr.utils.register import register_class, registry_tables |
| | | from funasr.register import tables |
| | | |
| | | @register_class("dataset_classes", "AudioDataset") |
| | | @tables.register("dataset_classes", "AudioDataset") |
| | | class AudioDataset(torch.utils.data.Dataset): |
| | | def __init__(self, |
| | | path, |
| | |
| | | float_pad_value: float = 0.0, |
| | | **kwargs): |
| | | super().__init__() |
| | | index_ds_class = registry_tables.index_ds_classes.get(index_ds.lower()) |
| | | index_ds_class = tables.index_ds_classes.get(index_ds.lower()) |
| | | self.index_ds = index_ds_class(path) |
| | | preprocessor_speech = kwargs.get("preprocessor_speech", None) |
| | | if preprocessor_speech: |
| | | preprocessor_speech_class = registry_tables.preprocessor_speech_classes.get(preprocessor_speech.lower()) |
| | | preprocessor_speech_class = tables.preprocessor_speech_classes.get(preprocessor_speech.lower()) |
| | | preprocessor_speech = preprocessor_speech_class(**kwargs.get("preprocessor_speech_conf")) |
| | | self.preprocessor_speech = preprocessor_speech |
| | | preprocessor_text = kwargs.get("preprocessor_text", None) |
| | | if preprocessor_text: |
| | | preprocessor_text_class = registry_tables.preprocessor_text_classes.get(preprocessor_text.lower()) |
| | | preprocessor_text_class = tables.preprocessor_text_classes.get(preprocessor_text.lower()) |
| | | preprocessor_text = preprocessor_text_class(**kwargs.get("preprocessor_text_conf")) |
| | | self.preprocessor_text = preprocessor_text |
| | | |
| | |
| | | import time |
| | | import logging |
| | | |
| | | from funasr.utils.register import register_class |
| | | from funasr.register import tables |
| | | |
| | | @register_class("index_ds_classes", "IndexDSJsonl") |
| | | @tables.register("index_ds_classes", "IndexDSJsonl") |
| | | class IndexDSJsonl(torch.utils.data.Dataset): |
| | | |
| | | def __init__(self, path): |
| | |
| | | |
| | | import numpy as np |
| | | |
| | | from funasr.utils.register import register_class |
| | | from funasr.register import tables |
| | | |
| | | @register_class("batch_sampler_classes", "DynamicBatchLocalShuffleSampler") |
| | | @tables.register("batch_sampler_classes", "DynamicBatchLocalShuffleSampler") |
| | | class BatchSampler(torch.utils.data.BatchSampler): |
| | | |
| | | def __init__(self, dataset, |
| | |
| | | from torch.nn.utils.rnn import pad_sequence |
| | | |
| | | import funasr.frontends.eend_ola_feature as eend_ola_feature |
| | | from funasr.utils.register import register_class |
| | | from funasr.register import tables |
| | | |
| | | |
| | | |
| | |
| | | LFR_outputs = torch.vstack(LFR_inputs) |
| | | return LFR_outputs.type(torch.float32) |
| | | |
| | | @register_class("frontend_classes", "WavFrontend") |
| | | @tables.register("frontend_classes", "WavFrontend") |
| | | class WavFrontend(nn.Module): |
| | | """Conventional frontend structure for ASR. |
| | | """ |
| | |
| | | return feats_pad, feats_lens |
| | | |
| | | |
| | | @register_class("frontend_classes", "WavFrontendOnline") |
| | | @tables.register("frontend_classes", "WavFrontendOnline") |
| | | class WavFrontendOnline(nn.Module): |
| | | """Conventional frontend structure for streaming ASR/VAD. |
| | | """ |
| | |
| | | # from funasr.models.scama.utils import sequence_mask
|
| | | # from typing import Optional, Tuple
|
| | | #
|
| | | # from funasr.utils.register import register_class
|
| | | # from funasr.register import tables
|
| | | #
|
| | | # class mae_loss(nn.Module):
|
| | | #
|
| | |
| | | # fires = torch.stack(list_fires, 1)
|
| | | # return fires
|
| | | #
|
| | | # @register_class("predictor_classes", "BATPredictor")
|
| | | # @tables.register("predictor_classes", "BATPredictor")
|
| | | # class BATPredictor(nn.Module):
|
| | | # def __init__(self, idim, l_order, r_order, threshold=1.0, dropout=0.1, smooth_factor=1.0, noise_threshold=0, return_accum=False):
|
| | | # super(BATPredictor, self).__init__()
|
| | |
| | | from funasr.models.transformer.utils.subsampling import TooShortUttError |
| | | from funasr.models.transformer.utils.subsampling import check_short_utt |
| | | from funasr.models.transformer.utils.subsampling import StreamingConvInput |
| | | from funasr.utils.register import register_class |
| | | from funasr.register import tables |
| | | |
| | | |
| | | |
| | |
| | | |
| | | return x, cache |
| | | |
| | | @register_class("encoder_classes", "ConformerChunkEncoder") |
| | | @tables.register("encoder_classes", "ConformerChunkEncoder") |
| | | class ConformerChunkEncoder(nn.Module): |
| | | """Encoder module definition. |
| | | Args: |
| | |
| | | from funasr.models.scama.utils import sequence_mask
|
| | | from typing import Optional, Tuple
|
| | |
|
| | | from funasr.utils.register import register_class
|
| | | from funasr.register import tables
|
| | |
|
| | |
|
| | | class mae_loss(nn.Module):
|
| | |
| | | fires = torch.stack(list_fires, 1)
|
| | | return fires
|
| | |
|
| | | @register_class("predictor_classes", "CifPredictorV3")
|
| | | @tables.register("predictor_classes", "CifPredictorV3")
|
| | | class CifPredictorV3(nn.Module):
|
| | | def __init__(self,
|
| | | idim,
|
| | |
| | | from funasr.utils import postprocess_utils |
| | | from funasr.utils.datadir_writer import DatadirWriter |
| | | from funasr.utils.timestamp_tools import ts_prediction_lfr6_standard |
| | | from funasr.utils.register import register_class, registry_tables |
| | | from funasr.register import tables |
| | | from funasr.models.ctc.ctc import CTC |
| | | |
| | | from funasr.models.paraformer.model import Paraformer |
| | | |
| | | @register_class("model_classes", "BiCifParaformer") |
| | | @tables.register("model_classes", "BiCifParaformer") |
| | | class BiCifParaformer(Paraformer): |
| | | """ |
| | | Author: Speech Lab of DAMO Academy, Alibaba Group |
| | |
| | | check_short_utt, |
| | | ) |
| | | |
| | | from funasr.utils.register import register_class |
| | | from funasr.register import tables |
| | | |
| | | class BranchformerEncoderLayer(torch.nn.Module): |
| | | """Branchformer encoder layer module. |
| | |
| | | |
| | | return x, mask |
| | | |
| | | @register_class("encoder_classes", "BranchformerEncoder") |
| | | @tables.register("encoder_classes", "BranchformerEncoder") |
| | | class BranchformerEncoder(nn.Module): |
| | | """Branchformer encoder module.""" |
| | | |
| | |
| | | import logging |
| | | |
| | | from funasr.models.transformer.model import Transformer |
| | | from funasr.utils.register import register_class |
| | | from funasr.register import tables |
| | | |
| | | @register_class("model_classes", "Branchformer") |
| | | @tables.register("model_classes", "Branchformer") |
| | | class Branchformer(Transformer): |
| | | """CTC-attention hybrid Encoder-Decoder model""" |
| | | |
| | |
| | | from funasr.models.transformer.utils.subsampling import check_short_utt |
| | | from funasr.models.transformer.utils.subsampling import Conv2dSubsamplingPad |
| | | from funasr.models.transformer.utils.subsampling import StreamingConvInput |
| | | from funasr.utils.register import register_class |
| | | from funasr.register import tables |
| | | |
| | | |
| | | class ConvolutionModule(nn.Module): |
| | |
| | | return x, mask |
| | | |
| | | |
| | | @register_class("encoder_classes", "ConformerEncoder") |
| | | @tables.register("encoder_classes", "ConformerEncoder") |
| | | class ConformerEncoder(nn.Module): |
| | | """Conformer encoder module. |
| | | |
| | |
| | | import torch |
| | | |
| | | from funasr.models.transformer.model import Transformer |
| | | from funasr.utils.register import register_class, registry_tables |
| | | from funasr.register import tables |
| | | |
| | | @register_class("model_classes", "Conformer") |
| | | @tables.register("model_classes", "Conformer") |
| | | class Conformer(Transformer): |
| | | """CTC-attention hybrid Encoder-Decoder model""" |
| | | |
| | |
| | | # You can modify the configuration according to your own requirements. |
| | | |
| | | # to print the register_table: |
| | | # from funasr.utils.register import registry_tables |
| | | # registry_tables.print() |
| | | # from funasr.register import tables |
| | | # tables.print() |
| | | |
| | | # network architecture |
| | | #model: funasr.models.paraformer.model:Paraformer |
| | |
| | | |
| | | from funasr.models.ctc.ctc import CTC |
| | | |
| | | from funasr.utils.register import register_class |
| | | from funasr.register import tables |
| | | |
| | | class EncoderLayerSANM(nn.Module): |
| | | def __init__( |
| | |
| | | return x, cache |
| | | |
| | | |
| | | @register_class("encoder_classes", "SANMVadEncoder") |
| | | @tables.register("encoder_classes", "SANMVadEncoder") |
| | | class SANMVadEncoder(nn.Module): |
| | | """ |
| | | Author: Speech Lab of DAMO Academy, Alibaba Group |
| | |
| | | import torch |
| | | import torch.nn as nn |
| | | |
| | | from funasr.utils.register import register_class, registry_tables |
| | | from funasr.register import tables |
| | | |
| | | @register_class("model_classes", "CTTransformer") |
| | | @tables.register("model_classes", "CTTransformer") |
| | | class CTTransformer(nn.Module): |
| | | """ |
| | | Author: Speech Lab of DAMO Academy, Alibaba Group |
| | |
| | | |
| | | |
| | | self.embed = nn.Embedding(vocab_size, embed_unit) |
| | | encoder_class = registry_tables.encoder_classes.get(encoder.lower()) |
| | | encoder_class = tables.encoder_classes.get(encoder.lower()) |
| | | encoder = encoder_class(**encoder_conf) |
| | | |
| | | self.decoder = nn.Linear(att_unit, punc_size) |
| | |
| | | TooShortUttError, |
| | | check_short_utt, |
| | | ) |
| | | from funasr.utils.register import register_class |
| | | from funasr.register import tables |
| | | |
| | | class EBranchformerEncoderLayer(torch.nn.Module): |
| | | """E-Branchformer encoder layer module. |
| | |
| | | |
| | | return x, mask |
| | | |
| | | @register_class("encoder_classes", "EBranchformerEncoder") |
| | | @tables.register("encoder_classes", "EBranchformerEncoder") |
| | | class EBranchformerEncoder(nn.Module): |
| | | """E-Branchformer encoder module.""" |
| | | |
| | |
| | | import logging |
| | | |
| | | from funasr.models.transformer.model import Transformer |
| | | from funasr.utils.register import register_class |
| | | from funasr.register import tables |
| | | |
| | | @register_class("model_classes", "EBranchformer") |
| | | @tables.register("model_classes", "EBranchformer") |
| | | class EBranchformer(Transformer): |
| | | """CTC-attention hybrid Encoder-Decoder model""" |
| | | |
| | |
| | | import torch.nn as nn |
| | | import torch.nn.functional as F |
| | | |
| | | from funasr.utils.register import register_class, registry_tables |
| | | from funasr.register import tables |
| | | |
| | | class LinearTransform(nn.Module): |
| | | |
| | |
| | | fsmn_layers: no. of sequential fsmn layers |
| | | ''' |
| | | |
| | | @register_class("encoder_classes", "FSMN") |
| | | @tables.register("encoder_classes", "FSMN") |
| | | class FSMN(nn.Module): |
| | | def __init__( |
| | | self, |
| | |
| | | rstride: right stride |
| | | ''' |
| | | |
| | | @register_class("encoder_classes", "DFSMN") |
| | | @tables.register("encoder_classes", "DFSMN") |
| | | class DFSMN(nn.Module): |
| | | |
| | | def __init__(self, dimproj=64, dimlinear=128, lorder=20, rorder=1, lstride=1, rstride=1): |
| | |
| | | import math |
| | | from typing import Optional |
| | | import time |
| | | from funasr.utils.register import register_class, registry_tables |
| | | from funasr.register import tables |
| | | from funasr.datasets.audio_datasets.load_audio_extract_fbank import load_audio,extract_fbank |
| | | from funasr.utils.datadir_writer import DatadirWriter |
| | | from torch.nn.utils.rnn import pad_sequence |
| | |
| | | return int(self.frame_size_ms) |
| | | |
| | | |
| | | @register_class("model_classes", "FsmnVAD") |
| | | @tables.register("model_classes", "FsmnVAD") |
| | | class FsmnVAD(nn.Module): |
| | | """ |
| | | Author: Speech Lab of DAMO Academy, Alibaba Group |
| | |
| | | self.vad_opts.speech_to_sil_time_thres, |
| | | self.vad_opts.frame_in_ms) |
| | | |
| | | encoder_class = registry_tables.encoder_classes.get(encoder.lower()) |
| | | encoder_class = tables.encoder_classes.get(encoder.lower()) |
| | | encoder = encoder_class(**encoder_conf) |
| | | self.encoder = encoder |
| | | # init variables |
| New file |
| | |
| | | # This is an example that demonstrates how to configure a model file. |
| | | # You can modify the configuration according to your own requirements. |
| | | |
| | | # to print the register_table: |
| | | # from funasr.register import tables |
| | | # tables.print() |
| | | |
| | | # network architecture |
| | | model: FsmnVAD |
| | | model_conf: |
| | | sample_rate: 16000 |
| | | detect_mode: 1 |
| | | snr_mode: 0 |
| | | max_end_silence_time: 800 |
| | | max_start_silence_time: 3000 |
| | | do_start_point_detection: True |
| | | do_end_point_detection: True |
| | | window_size_ms: 200 |
| | | sil_to_speech_time_thres: 150 |
| | | speech_to_sil_time_thres: 150 |
| | | speech_2_noise_ratio: 1.0 |
| | | do_extend: 1 |
| | | lookback_time_start_point: 200 |
| | | lookahead_time_end_point: 100 |
| | | max_single_segment_time: 60000 |
| | | snr_thres: -100.0 |
| | | noise_frame_num_used_for_snr: 100 |
| | | decibel_thres: -100.0 |
| | | speech_noise_thres: 0.6 |
| | | fe_prior_thres: 0.0001 |
| | | silence_pdf_num: 1 |
| | | sil_pdf_ids: [0] |
| | | speech_noise_thresh_low: -0.1 |
| | | speech_noise_thresh_high: 0.3 |
| | | output_frame_probs: False |
| | | frame_in_ms: 10 |
| | | frame_length_ms: 25 |
| | | |
| | | encoder: FSMN |
| | | encoder_conf: |
| | | input_dim: 400 |
| | | input_affine_dim: 140 |
| | | fsmn_layers: 4 |
| | | linear_dim: 250 |
| | | proj_dim: 128 |
| | | lorder: 20 |
| | | rorder: 0 |
| | | lstride: 1 |
| | | rstride: 0 |
| | | output_affine_dim: 140 |
| | | output_dim: 248 |
| | | |
| | | frontend: WavFrontend |
| | | frontend_conf: |
| | | fs: 16000 |
| | | window: hamming |
| | | n_mels: 80 |
| | | frame_length: 25 |
| | | frame_shift: 10 |
| | | dither: 0.0 |
| | | lfr_m: 5 |
| | | lfr_n: 1 |
| | |
| | | from funasr.models.transformer.utils.repeat import repeat |
| | | from funasr.models.paraformer.decoder import DecoderLayerSANM, ParaformerSANMDecoder |
| | | |
| | | from funasr.utils.register import register_class, registry_tables |
| | | from funasr.register import tables |
| | | |
| | | class ContextualDecoderLayer(nn.Module): |
| | | def __init__( |
| | |
| | | x = self.dropout(self.src_attn(x, memory, memory_mask)) |
| | | return x, tgt_mask, memory, memory_mask, cache |
| | | |
| | | @register_class("decoder_classes", "ContextualParaformerDecoder") |
| | | @tables.register("decoder_classes", "ContextualParaformerDecoder") |
| | | class ContextualParaformerDecoder(ParaformerSANMDecoder): |
| | | """ |
| | | Author: Speech Lab of DAMO Academy, Alibaba Group |
| | |
| | | |
| | | from funasr.models.paraformer.model import Paraformer |
| | | |
| | | from funasr.utils.register import register_class, registry_tables |
| | | from funasr.register import tables |
| | | |
| | | @register_class("model_classes", "NeatContextualParaformer") |
| | | @tables.register("model_classes", "NeatContextualParaformer") |
| | | class NeatContextualParaformer(Paraformer): |
| | | """ |
| | | Author: Speech Lab of DAMO Academy, Alibaba Group |
| | |
| | | # You can modify the configuration according to your own requirements. |
| | | |
| | | # to print the register_table: |
| | | # from funasr.utils.register import registry_tables |
| | | # registry_tables.print() |
| | | # from funasr.register import tables |
| | | # tables.print() |
| | | |
| | | # network architecture |
| | | model: NeatContextualParaformer |
| | |
| | | import torch |
| | | |
| | | from funasr.models.transformer.utils.nets_utils import make_pad_mask |
| | | from funasr.utils.register import register_class, registry_tables |
| | | from funasr.register import tables |
| | | |
| | | @register_class("normalize_classes", "GlobalMVN") |
| | | @tables.register("normalize_classes", "GlobalMVN") |
| | | class GlobalMVN(torch.nn.Module): |
| | | """Apply global mean and variance normalization |
| | | TODO(kamo): Make this class portable somehow |
| | |
| | | import torch |
| | | |
| | | from funasr.models.transformer.utils.nets_utils import make_pad_mask |
| | | from funasr.utils.register import register_class, registry_tables |
| | | from funasr.register import tables |
| | | |
| | | @register_class("normalize_classes", "UtteranceMVN") |
| | | @tables.register("normalize_classes", "UtteranceMVN") |
| | | class UtteranceMVN(torch.nn.Module): |
| | | def __init__( |
| | | self, |
| | |
| | | from funasr.models.scama.utils import sequence_mask
|
| | | from typing import Optional, Tuple
|
| | |
|
| | | from funasr.utils.register import register_class, registry_tables
|
| | | from funasr.register import tables
|
| | |
|
| | | @register_class("predictor_classes", "CifPredictor")
|
| | | @tables.register("predictor_classes", "CifPredictor")
|
| | | class CifPredictor(nn.Module):
|
| | | def __init__(self, idim, l_order, r_order, threshold=1.0, dropout=0.1, smooth_factor=1.0, noise_threshold=0, tail_threshold=0.45):
|
| | | super().__init__()
|
| | |
| | | predictor_alignments_length = predictor_alignments.sum(-1).type(encoder_sequence_length.dtype)
|
| | | return predictor_alignments.detach(), predictor_alignments_length.detach()
|
| | |
|
| | | @register_class("predictor_classes", "CifPredictorV2")
|
| | | @tables.register("predictor_classes", "CifPredictorV2")
|
| | | class CifPredictorV2(nn.Module):
|
| | | def __init__(self,
|
| | | idim,
|
| | |
| | | from funasr.models.transformer.embedding import PositionalEncoding |
| | | from funasr.models.transformer.utils.nets_utils import make_pad_mask |
| | | from funasr.models.transformer.positionwise_feed_forward import PositionwiseFeedForward |
| | | from funasr.utils.register import register_class, registry_tables |
| | | from funasr.register import tables |
| | | |
| | | class DecoderLayerSANM(nn.Module): |
| | | """Single decoder layer module. |
| | |
| | | return x, memory, fsmn_cache, opt_cache |
| | | |
| | | |
| | | @register_class("decoder_classes", "ParaformerSANMDecoder") |
| | | @tables.register("decoder_classes", "ParaformerSANMDecoder") |
| | | class ParaformerSANMDecoder(BaseTransformerDecoder): |
| | | """ |
| | | Author: Speech Lab of DAMO Academy, Alibaba Group |
| | |
| | | return y, new_cache |
| | | |
| | | |
| | | @register_class("decoder_classes", "ParaformerDecoderSAN") |
| | | @tables.register("decoder_classes", "ParaformerDecoderSAN") |
| | | class ParaformerDecoderSAN(BaseTransformerDecoder): |
| | | """ |
| | | Author: Speech Lab of DAMO Academy, Alibaba Group |
| | |
| | | from funasr.datasets.audio_datasets.load_audio_extract_fbank import load_audio, extract_fbank |
| | | from funasr.utils import postprocess_utils |
| | | from funasr.utils.datadir_writer import DatadirWriter |
| | | from funasr.utils.register import register_class, registry_tables |
| | | from funasr.register import tables |
| | | from funasr.models.ctc.ctc import CTC |
| | | |
| | | @register_class("model_classes", "Paraformer") |
| | | @tables.register("model_classes", "Paraformer") |
| | | class Paraformer(nn.Module): |
| | | """ |
| | | Author: Speech Lab of DAMO Academy, Alibaba Group |
| | |
| | | super().__init__() |
| | | |
| | | if specaug is not None: |
| | | specaug_class = registry_tables.specaug_classes.get(specaug.lower()) |
| | | specaug_class = tables.specaug_classes.get(specaug.lower()) |
| | | specaug = specaug_class(**specaug_conf) |
| | | if normalize is not None: |
| | | normalize_class = registry_tables.normalize_classes.get(normalize.lower()) |
| | | normalize_class = tables.normalize_classes.get(normalize.lower()) |
| | | normalize = normalize_class(**normalize_conf) |
| | | encoder_class = registry_tables.encoder_classes.get(encoder.lower()) |
| | | encoder_class = tables.encoder_classes.get(encoder.lower()) |
| | | encoder = encoder_class(input_size=input_size, **encoder_conf) |
| | | encoder_output_size = encoder.output_size() |
| | | |
| | | if decoder is not None: |
| | | decoder_class = registry_tables.decoder_classes.get(decoder.lower()) |
| | | decoder_class = tables.decoder_classes.get(decoder.lower()) |
| | | decoder = decoder_class( |
| | | vocab_size=vocab_size, |
| | | encoder_output_size=encoder_output_size, |
| | |
| | | odim=vocab_size, encoder_output_size=encoder_output_size, **ctc_conf |
| | | ) |
| | | if predictor is not None: |
| | | predictor_class = registry_tables.predictor_classes.get(predictor.lower()) |
| | | predictor_class = tables.predictor_classes.get(predictor.lower()) |
| | | predictor = predictor_class(**predictor_conf) |
| | | |
| | | # note that eos is the same as sos (equivalent ID) |
| | |
| | | # You can modify the configuration according to your own requirements. |
| | | |
| | | # to print the register_table: |
| | | # from funasr.utils.register import registry_tables |
| | | # registry_tables.print() |
| | | # from funasr.register import tables |
| | | # tables.print() |
| | | |
| | | # network architecture |
| | | #model: funasr.models.paraformer.model:Paraformer |
| | |
| | | from funasr.utils import postprocess_utils |
| | | from funasr.utils.datadir_writer import DatadirWriter |
| | | from funasr.utils.timestamp_tools import ts_prediction_lfr6_standard |
| | | from funasr.utils.register import registry_tables |
| | | from funasr.register import tables |
| | | from funasr.models.ctc.ctc import CTC |
| | | |
| | | class Paraformer(nn.Module): |
| | |
| | | # pdb.set_trace() |
| | | |
| | | if frontend is not None: |
| | | frontend_class = registry_tables.frontend_classes.get_class(frontend.lower()) |
| | | frontend_class = tables.frontend_classes.get_class(frontend.lower()) |
| | | frontend = frontend_class(**frontend_conf) |
| | | if specaug is not None: |
| | | specaug_class = registry_tables.specaug_classes.get_class(specaug.lower()) |
| | | specaug_class = tables.specaug_classes.get_class(specaug.lower()) |
| | | specaug = specaug_class(**specaug_conf) |
| | | if normalize is not None: |
| | | normalize_class = registry_tables.normalize_classes.get_class(normalize.lower()) |
| | | normalize_class = tables.normalize_classes.get_class(normalize.lower()) |
| | | normalize = normalize_class(**normalize_conf) |
| | | encoder_class = registry_tables.encoder_classes.get_class(encoder.lower()) |
| | | encoder_class = tables.encoder_classes.get_class(encoder.lower()) |
| | | encoder = encoder_class(input_size=input_size, **encoder_conf) |
| | | encoder_output_size = encoder.output_size() |
| | | if decoder is not None: |
| | | decoder_class = registry_tables.decoder_classes.get_class(decoder.lower()) |
| | | decoder_class = tables.decoder_classes.get_class(decoder.lower()) |
| | | decoder = decoder_class( |
| | | vocab_size=vocab_size, |
| | | encoder_output_size=encoder_output_size, |
| | |
| | | odim=vocab_size, encoder_output_size=encoder_output_size, **ctc_conf |
| | | ) |
| | | if predictor is not None: |
| | | predictor_class = registry_tables.predictor_classes.get_class(predictor.lower()) |
| | | predictor_class = tables.predictor_classes.get_class(predictor.lower()) |
| | | predictor = predictor_class(**predictor_conf) |
| | | |
| | | # note that eos is the same as sos (equivalent ID) |
| | |
| | | from funasr.models.sanm.positionwise_feed_forward import PositionwiseFeedForwardDecoderSANM |
| | | from funasr.models.transformer.utils.repeat import repeat |
| | | |
| | | from funasr.utils.register import register_class, registry_tables |
| | | from funasr.register import tables |
| | | |
| | | class DecoderLayerSANM(nn.Module): |
| | | """Single decoder layer module. |
| | |
| | | return x, memory, fsmn_cache, opt_cache |
| | | |
| | | |
| | | @register_class("decoder_classes", "ParaformerSANMDecoder") |
| | | @tables.register("decoder_classes", "ParaformerSANMDecoder") |
| | | class ParaformerSANMDecoder(BaseTransformerDecoder): |
| | | """ |
| | | Author: Speech Lab of DAMO Academy, Alibaba Group |
| | |
| | | from funasr.models.transformer.utils.repeat import repeat |
| | | from funasr.models.transformer.scorers.scorer_interface import BatchScorerInterface |
| | | |
| | | from funasr.utils.register import register_class, registry_tables |
| | | from funasr.register import tables |
| | | |
| | | class DecoderLayer(nn.Module): |
| | | """Single decoder layer module. |
| | |
| | | state_list = [[states[i][b] for i in range(n_layers)] for b in range(n_batch)] |
| | | return logp, state_list |
| | | |
| | | @register_class("decoder_classes", "TransformerDecoder") |
| | | @tables.register("decoder_classes", "TransformerDecoder") |
| | | class TransformerDecoder(BaseTransformerDecoder): |
| | | def __init__( |
| | | self, |
| | |
| | | ) |
| | | |
| | | |
| | | @register_class("decoder_classes", "ParaformerDecoderSAN") |
| | | @tables.register("decoder_classes", "ParaformerDecoderSAN") |
| | | class ParaformerDecoderSAN(BaseTransformerDecoder): |
| | | """ |
| | | Author: Speech Lab of DAMO Academy, Alibaba Group |
| | |
| | | else: |
| | | return x, olens |
| | | |
| | | @register_class("decoder_classes", "LightweightConvolutionTransformerDecoder") |
| | | @tables.register("decoder_classes", "LightweightConvolutionTransformerDecoder") |
| | | class LightweightConvolutionTransformerDecoder(BaseTransformerDecoder): |
| | | def __init__( |
| | | self, |
| | |
| | | ), |
| | | ) |
| | | |
| | | @register_class("decoder_classes", "LightweightConvolution2DTransformerDecoder") |
| | | @tables.register("decoder_classes", "LightweightConvolution2DTransformerDecoder") |
| | | class LightweightConvolution2DTransformerDecoder(BaseTransformerDecoder): |
| | | def __init__( |
| | | self, |
| | |
| | | ) |
| | | |
| | | |
| | | @register_class("decoder_classes", "DynamicConvolutionTransformerDecoder") |
| | | @tables.register("decoder_classes", "DynamicConvolutionTransformerDecoder") |
| | | class DynamicConvolutionTransformerDecoder(BaseTransformerDecoder): |
| | | def __init__( |
| | | self, |
| | |
| | | ), |
| | | ) |
| | | |
| | | @register_class("decoder_classes", "DynamicConvolution2DTransformerDecoder") |
| | | @tables.register("decoder_classes", "DynamicConvolution2DTransformerDecoder") |
| | | class DynamicConvolution2DTransformerDecoder(BaseTransformerDecoder): |
| | | def __init__( |
| | | self, |
| | |
| | | from funasr.models.sanm.positionwise_feed_forward import PositionwiseFeedForwardDecoderSANM |
| | | from funasr.models.transformer.utils.repeat import repeat |
| | | |
| | | from funasr.utils.register import register_class, registry_tables |
| | | from funasr.register import tables |
| | | |
| | | class DecoderLayerSANM(nn.Module): |
| | | """Single decoder layer module. |
| | |
| | | return x, memory, fsmn_cache, opt_cache |
| | | |
| | | |
| | | @register_class("decoder_classes", "FsmnDecoder") |
| | | @tables.register("decoder_classes", "FsmnDecoder") |
| | | class FsmnDecoder(BaseTransformerDecoder): |
| | | """ |
| | | Author: Speech Lab of DAMO Academy, Alibaba Group |
| | |
| | | |
| | | from funasr.models.ctc.ctc import CTC |
| | | |
| | | from funasr.utils.register import register_class |
| | | from funasr.register import tables |
| | | |
| | | class EncoderLayerSANM(nn.Module): |
| | | def __init__( |
| | |
| | | |
| | | return x, cache |
| | | |
| | | @register_class("encoder_classes", "SANMEncoder") |
| | | @tables.register("encoder_classes", "SANMEncoder") |
| | | class SANMEncoder(nn.Module): |
| | | """ |
| | | Author: Speech Lab of DAMO Academy, Alibaba Group |
| | |
| | | import torch |
| | | |
| | | from funasr.models.transformer.model import Transformer |
| | | from funasr.utils.register import register_class, registry_tables |
| | | from funasr.register import tables |
| | | |
| | | @register_class("model_classes", "SANM") |
| | | @tables.register("model_classes", "SANM") |
| | | class SANM(Transformer): |
| | | """CTC-attention hybrid Encoder-Decoder model""" |
| | | |
| | |
| | | from funasr.models.sanm.positionwise_feed_forward import PositionwiseFeedForwardDecoderSANM |
| | | from funasr.models.transformer.utils.repeat import repeat |
| | | |
| | | from funasr.utils.register import register_class, registry_tables |
| | | from funasr.register import tables |
| | | |
| | | class DecoderLayerSANM(nn.Module): |
| | | """Single decoder layer module. |
| | |
| | | |
| | | return x, memory, fsmn_cache, opt_cache |
| | | |
| | | @register_class("decoder_classes", "FsmnDecoderSCAMAOpt") |
| | | @tables.register("decoder_classes", "FsmnDecoderSCAMAOpt") |
| | | class FsmnDecoderSCAMAOpt(BaseTransformerDecoder): |
| | | """ |
| | | Author: Speech Lab of DAMO Academy, Alibaba Group |
| | |
| | | |
| | | from funasr.models.ctc.ctc import CTC |
| | | |
| | | from funasr.utils.register import register_class, registry_tables |
| | | from funasr.register import tables |
| | | |
| | | class EncoderLayerSANM(nn.Module): |
| | | def __init__( |
| | |
| | | return x, cache |
| | | |
| | | |
| | | @register_class("encoder_classes", "SANMEncoderChunkOpt") |
| | | @tables.register("encoder_classes", "SANMEncoderChunkOpt") |
| | | class SANMEncoderChunkOpt(nn.Module): |
| | | """ |
| | | Author: Speech Lab of DAMO Academy, Alibaba Group |
| | |
| | | from funasr.utils.datadir_writer import DatadirWriter |
| | | |
| | | from funasr.models.paraformer.model import Paraformer |
| | | from funasr.utils.register import register_class, registry_tables |
| | | from funasr.register import tables |
| | | |
| | | |
| | | @register_class("model_classes", "SeacoParaformer") |
| | | @tables.register("model_classes", "SeacoParaformer") |
| | | class SeacoParaformer(Paraformer): |
| | | """ |
| | | Author: Speech Lab of DAMO Academy, Alibaba Group |
| | |
| | | seaco_decoder = kwargs.get("seaco_decoder", None) |
| | | if seaco_decoder is not None: |
| | | seaco_decoder_conf = kwargs.get("seaco_decoder_conf") |
| | | seaco_decoder_class = registry_tables.decoder_classes.get(seaco_decoder.lower()) |
| | | seaco_decoder_class = tables.decoder_classes.get(seaco_decoder.lower()) |
| | | self.seaco_decoder = seaco_decoder_class( |
| | | vocab_size=self.vocab_size, |
| | | encoder_output_size=self.inner_dim, |
| | |
| | | # You can modify the configuration according to your own requirements. |
| | | |
| | | # to print the register_table: |
| | | # from funasr.utils.register import registry_tables |
| | | # registry_tables.print() |
| | | # from funasr.register import tables |
| | | # tables.print() |
| | | |
| | | # network architecture |
| | | model: SeacoParaformer |
| | |
| | | from funasr.models.specaug.mask_along_axis import MaskAlongAxisVariableMaxWidth |
| | | from funasr.models.specaug.mask_along_axis import MaskAlongAxisLFR |
| | | from funasr.models.specaug.time_warp import TimeWarp |
| | | from funasr.utils.register import register_class |
| | | from funasr.register import tables |
| | | |
| | | import torch.nn as nn |
| | | |
| | | @register_class("specaug_classes", "SpecAug") |
| | | @tables.register("specaug_classes", "SpecAug") |
| | | class SpecAug(nn.Module): |
| | | """Implementation of SpecAug. |
| | | |
| | |
| | | x, x_lengths = self.time_mask(x, x_lengths) |
| | | return x, x_lengths |
| | | |
| | | @register_class("specaug_classes", "SpecAugLFR") |
| | | @tables.register("specaug_classes", "SpecAugLFR") |
| | | class SpecAugLFR(nn.Module): |
| | | """Implementation of SpecAug. |
| | | lfr_rate:low frame rate |
| | |
| | | from funasr.models.transformer.utils.repeat import repeat |
| | | from funasr.models.transformer.scorers.scorer_interface import BatchScorerInterface |
| | | |
| | | from funasr.utils.register import register_class, registry_tables |
| | | from funasr.register import tables |
| | | |
| | | class DecoderLayer(nn.Module): |
| | | """Single decoder layer module. |
| | |
| | | state_list = [[states[i][b] for i in range(n_layers)] for b in range(n_batch)] |
| | | return logp, state_list |
| | | |
| | | @register_class("decoder_classes", "TransformerDecoder") |
| | | @tables.register("decoder_classes", "TransformerDecoder") |
| | | class TransformerDecoder(BaseTransformerDecoder): |
| | | def __init__( |
| | | self, |
| | |
| | | ) |
| | | |
| | | |
| | | @register_class("decoder_classes", "LightweightConvolutionTransformerDecoder") |
| | | @tables.register("decoder_classes", "LightweightConvolutionTransformerDecoder") |
| | | class LightweightConvolutionTransformerDecoder(BaseTransformerDecoder): |
| | | def __init__( |
| | | self, |
| | |
| | | ), |
| | | ) |
| | | |
| | | @register_class("decoder_classes", "LightweightConvolution2DTransformerDecoder") |
| | | @tables.register("decoder_classes", "LightweightConvolution2DTransformerDecoder") |
| | | class LightweightConvolution2DTransformerDecoder(BaseTransformerDecoder): |
| | | def __init__( |
| | | self, |
| | |
| | | ) |
| | | |
| | | |
| | | @register_class("decoder_classes", "DynamicConvolutionTransformerDecoder") |
| | | @tables.register("decoder_classes", "DynamicConvolutionTransformerDecoder") |
| | | class DynamicConvolutionTransformerDecoder(BaseTransformerDecoder): |
| | | def __init__( |
| | | self, |
| | |
| | | ), |
| | | ) |
| | | |
| | | @register_class("decoder_classes", "DynamicConvolution2DTransformerDecoder") |
| | | @tables.register("decoder_classes", "DynamicConvolution2DTransformerDecoder") |
| | | class DynamicConvolution2DTransformerDecoder(BaseTransformerDecoder): |
| | | def __init__( |
| | | self, |
| | |
| | | from funasr.models.transformer.utils.subsampling import TooShortUttError |
| | | from funasr.models.transformer.utils.subsampling import check_short_utt |
| | | |
| | | from funasr.utils.register import register_class |
| | | from funasr.register import tables |
| | | |
| | | class EncoderLayer(nn.Module): |
| | | """Encoder layer module. |
| | |
| | | |
| | | return x, mask |
| | | |
| | | @register_class("encoder_classes", "TransformerEncoder") |
| | | @tables.register("encoder_classes", "TransformerEncoder") |
| | | class TransformerEncoder(nn.Module): |
| | | """Transformer encoder module. |
| | | |
| | |
| | | from funasr.datasets.audio_datasets.load_audio_extract_fbank import load_audio, extract_fbank |
| | | from funasr.utils import postprocess_utils |
| | | from funasr.utils.datadir_writer import DatadirWriter |
| | | from funasr.utils.register import register_class, registry_tables |
| | | from funasr.register import tables |
| | | |
| | | @register_class("model_classes", "Transformer") |
| | | @tables.register("model_classes", "Transformer") |
| | | class Transformer(nn.Module): |
| | | """CTC-attention hybrid Encoder-Decoder model""" |
| | | |
| | |
| | | super().__init__() |
| | | |
| | | if frontend is not None: |
| | | frontend_class = registry_tables.frontend_classes.get_class(frontend.lower()) |
| | | frontend_class = tables.frontend_classes.get_class(frontend.lower()) |
| | | frontend = frontend_class(**frontend_conf) |
| | | if specaug is not None: |
| | | specaug_class = registry_tables.specaug_classes.get_class(specaug.lower()) |
| | | specaug_class = tables.specaug_classes.get_class(specaug.lower()) |
| | | specaug = specaug_class(**specaug_conf) |
| | | if normalize is not None: |
| | | normalize_class = registry_tables.normalize_classes.get_class(normalize.lower()) |
| | | normalize_class = tables.normalize_classes.get_class(normalize.lower()) |
| | | normalize = normalize_class(**normalize_conf) |
| | | encoder_class = registry_tables.encoder_classes.get_class(encoder.lower()) |
| | | encoder_class = tables.encoder_classes.get_class(encoder.lower()) |
| | | encoder = encoder_class(input_size=input_size, **encoder_conf) |
| | | encoder_output_size = encoder.output_size() |
| | | if decoder is not None: |
| | | decoder_class = registry_tables.decoder_classes.get_class(decoder.lower()) |
| | | decoder_class = tables.decoder_classes.get_class(decoder.lower()) |
| | | decoder = decoder_class( |
| | | vocab_size=vocab_size, |
| | | encoder_output_size=encoder_output_size, |
| | |
| | | # You can modify the configuration according to your own requirements. |
| | | |
| | | # to print the register_table: |
| | | # from funasr.utils.register import registry_tables |
| | | # registry_tables.print() |
| | | # from funasr.register import tables |
| | | # tables.print() |
| | | |
| | | # network architecture |
| | | #model: funasr.models.paraformer.model:Paraformer |
| New file |
| | |
| | | import logging |
| | | import inspect |
| | | from dataclasses import dataclass |
| | | |
| | | |
@dataclass
class RegisterTables:
    """Name -> class registries used to look up components by string key.

    Each ``*_classes`` attribute maps a lower-cased register name to the
    class registered under it.  For every table that has at least one
    registration, a sibling ``<table>_meta`` dict is created on the instance
    holding ``[class name, register name, class location]`` rows that
    :meth:`print` renders.
    """

    # NOTE: these attributes carry no type annotation, so ``@dataclass`` does
    # not turn them into fields; they are plain class attributes and the dicts
    # are therefore shared by every ``RegisterTables`` instance.
    model_classes = {}
    frontend_classes = {}
    specaug_classes = {}
    normalize_classes = {}
    encoder_classes = {}
    decoder_classes = {}
    joint_network_classes = {}
    predictor_classes = {}
    stride_conv_classes = {}
    tokenizer_classes = {}
    batch_sampler_classes = {}
    dataset_classes = {}
    index_ds_classes = {}

    def print(self):
        """Print one aligned text table per registry that has registrations.

        Only the ``*_meta`` dicts are rendered.  Tables created on the fly by
        :meth:`register` also live in the instance ``__dict__`` but carry no
        printable rows, so they are skipped instead of producing a second,
        empty header.
        """
        meta_suffix = "_meta"
        headers = ["class name", "register name", "class location"]

        print("\ntables: \n")
        for classes_key, classes_dict in vars(self).items():
            if not classes_key.endswith(meta_suffix):
                continue

            # Strip only the trailing suffix so a table whose name happens to
            # contain "_meta" elsewhere is still shown correctly.
            table_name = classes_key[: -len(meta_suffix)]
            print(f"----------- ** {table_name} ** --------------")

            rows = sorted(classes_dict.values(), key=lambda row: row[0])
            data = [headers] + rows
            col_widths = [max(len(str(item)) for item in col) for col in zip(*data)]

            for row in data:
                cells = (str(item).ljust(width) for item, width in zip(row, col_widths))
                print("| " + " | ".join(cells) + " |")
            print("\n")

    def register(self, register_tables_key: str, key=None):
        """Return a class decorator that registers the class in a table.

        Args:
            register_tables_key: Name of the table attribute (for example
                ``"decoder_classes"``).  A table that does not exist yet is
                created on this instance.
            key: Register name; defaults to the class ``__name__``.  The name
                is lower-cased before it is stored.

        Returns:
            A decorator that stores the class and returns it unchanged.

        Raises:
            AssertionError: If the lower-cased key is already present in the
                table.
        """

        def decorator(target_class):
            if not hasattr(self, register_tables_key):
                setattr(self, register_tables_key, {})
                logging.info("new registry table has been added: %s", register_tables_key)

            registry = getattr(self, register_tables_key)
            registry_key = (key if key is not None else target_class.__name__).lower()

            # Raised explicitly (instead of an ``assert`` statement) so the
            # duplicate check still runs under ``python -O`` while callers keep
            # seeing the same exception type.
            if registry_key in registry:
                raise AssertionError(
                    "(key: {} / class: {}) has been registered already,in {}".format(
                        registry_key, target_class, register_tables_key
                    )
                )

            registry[registry_key] = target_class

            # meta row layout: [class name, register name, class location]
            register_tables_key_meta = register_tables_key + "_meta"
            if not hasattr(self, register_tables_key_meta):
                setattr(self, register_tables_key_meta, {})
            registry_meta = getattr(self, register_tables_key_meta)

            # The location is informational only; classes without retrievable
            # source (built with ``type()``/``exec``, defined interactively, or
            # built-in) must not make the registration itself fail.
            try:
                class_file = inspect.getfile(target_class)
            except (TypeError, OSError):
                class_file = "<unknown>"
            try:
                class_line = inspect.getsourcelines(target_class)[1]
            except (TypeError, OSError):
                class_line = "?"

            registry_meta[registry_key] = [
                f"{target_class.__name__}",
                f"{registry_key}",
                f"{class_file}:{class_line}",
            ]
            return target_class

        return decorator
| | | |
| | | |
| | | tables = RegisterTables() |
| | | |
| | | |
| | | import funasr |
| | | |
| | |
| | | import warnings |
| | | |
| | | from funasr.tokenizer.abs_tokenizer import BaseTokenizer |
| | | from funasr.utils.register import register_class |
| | | from funasr.register import tables |
| | | |
| | | @register_class("tokenizer_classes", "CharTokenizer") |
| | | @tables.register("tokenizer_classes", "CharTokenizer") |
| | | class CharTokenizer(BaseTokenizer): |
| | | def __init__( |
| | | self, |