| | |
| | | self.remove_non_linguistic_symbols = remove_non_linguistic_symbols |
| | | self.split_with_space = split_with_space |
| | | self.seg_dict = None |
| | | seg_dict = seg_dict if seg_dict is not None else kwargs.get("seg_dict_file", None) |
| | | if seg_dict is not None: |
| | | self.seg_dict = load_seg_dict(seg_dict) |
| | | |
| | |
| | | return seg_dict |
| | | |
| | | def seg_tokenize(txt, seg_dict): |
| | | pattern = re.compile(r'^[\u4E00-\u9FA50-9]+$') |
| | | # pattern = re.compile(r'^[\u4E00-\u9FA50-9]+$') |
| | | pattern = re.compile(r"([\u4E00-\u9FA5A-Za-z0-9])") |
| | | out_txt = "" |
| | | for word in txt: |
| | | word = word.lower() |