From 37d6930561f4e798e76f81c37d912906c97b6bea Mon Sep 17 00:00:00 2001
From: pointerhacker <145901472+pointerhacker@users.noreply.github.com>
Date: Wed, 20 Nov 2024 18:39:05 +0800
Subject: [PATCH] fix:添加音频码率验证以防止模型异常 (#2219)
---
funasr/tokenizer/build_tokenizer.py | 23 ++++-------------------
1 file changed, 4 insertions(+), 19 deletions(-)
diff --git a/funasr/tokenizer/build_tokenizer.py b/funasr/tokenizer/build_tokenizer.py
index 05db6a6..d6080f7 100644
--- a/funasr/tokenizer/build_tokenizer.py
+++ b/funasr/tokenizer/build_tokenizer.py
@@ -1,17 +1,7 @@
from pathlib import Path
from typing import Iterable
from typing import Union
-from abc import ABC
-from abc import abstractmethod
-from typing import Iterable
-from typing import List
-from pathlib import Path
-from typing import Dict
-from typing import Iterable
-from typing import List
-from typing import Union
-import numpy as np
from funasr.tokenizer.abs_tokenizer import AbsTokenizer
from funasr.tokenizer.char_tokenizer import CharTokenizer
@@ -28,8 +18,7 @@
space_symbol: str = "<space>",
delimiter: str = None,
g2p_type: str = None,
- **kwargs,
-):
+) -> AbsTokenizer:
"""A helper function to instantiate Tokenizer"""
if token_type == "bpe":
if bpemodel is None:
@@ -39,7 +28,7 @@
raise RuntimeError(
"remove_non_linguistic_symbols is not implemented for token_type=bpe"
)
- return SentencepiecesTokenizer(bpemodel, **kwargs)
+ return SentencepiecesTokenizer(bpemodel)
elif token_type == "word":
if remove_non_linguistic_symbols and non_linguistic_symbols is not None:
@@ -49,14 +38,13 @@
remove_non_linguistic_symbols=True,
)
else:
- return WordTokenizer(delimiter=delimiter, **kwargs)
+ return WordTokenizer(delimiter=delimiter)
elif token_type == "char":
return CharTokenizer(
non_linguistic_symbols=non_linguistic_symbols,
space_symbol=space_symbol,
remove_non_linguistic_symbols=remove_non_linguistic_symbols,
- **kwargs
)
elif token_type == "phn":
@@ -65,10 +53,7 @@
non_linguistic_symbols=non_linguistic_symbols,
space_symbol=space_symbol,
remove_non_linguistic_symbols=remove_non_linguistic_symbols,
- **kwargs
)
else:
- raise ValueError(
- f"token_mode must be one of bpe, word, char or phn: " f"{token_type}"
- )
+ raise ValueError(f"token_mode must be one of bpe, word, char or phn: " f"{token_type}")
--
Gitblit v1.9.1