python/FunASR-XL.git

			@@ -36,6 +36,7 @@
			self.remove_non_linguistic_symbols = remove_non_linguistic_symbols
			self.split_with_space = split_with_space
			self.seg_dict = None
			seg_dict = seg_dict if seg_dict is not None else kwargs.get("seg_dict_file", None)
			if seg_dict is not None:
			self.seg_dict = load_seg_dict(seg_dict)

			@@ -92,7 +93,8 @@
			return seg_dict

			def seg_tokenize(txt, seg_dict):
			pattern = re.compile(r'^[\u4E00-\u9FA50-9]+$')
			# pattern = re.compile(r'^[\u4E00-\u9FA50-9]+$')
			pattern = re.compile(r"([\u4E00-\u9FA5A-Za-z0-9])")
			out_txt = ""
			for word in txt:
			word = word.lower()