| | |
| | | self.remove_non_linguistic_symbols = remove_non_linguistic_symbols |
| | | self.split_with_space = split_with_space |
| | | self.seg_dict = None |
| | | seg_dict = seg_dict if seg_dict is not None else kwargs.get("seg_dict_file", None) |
| | | if seg_dict is not None: |
| | | self.seg_dict = load_seg_dict(seg_dict) |
| | | |
| | |
| | | |
def text2tokens(self, line: Union[str, list]) -> List[str]:
    """Split *line* into a list of tokens.

    When ``self.split_with_space`` is true, the input is stripped and
    split on single spaces; if a segmentation dictionary was loaded
    (``self.seg_dict``), the space-split tokens are then re-segmented
    with ``seg_tokenize``.  Otherwise the input is consumed one element
    at a time, with a literal space mapped to the special token
    ``"<space>"``.

    Args:
        line: Input text.  A ``str`` for the space-split branch; the
            character-wise branch only requires ``line[0]`` /
            ``line[1:]`` indexing, so a list of tokens also works.
            (NOTE(review): the original garbled body hinted at a
            skip-space variant via a stray ``continue``; the standard
            behavior — emit ``"<space>"`` — is restored here.)

    Returns:
        List of token strings.
    """
    if self.split_with_space:
        tokens = line.strip().split(" ")
        # Re-segment with the word dictionary only when one was loaded.
        if self.seg_dict is not None:
            tokens = seg_tokenize(tokens, self.seg_dict)
    else:
        tokens = []
        while len(line) != 0:
            t = line[0]
            if t == " ":
                t = "<space>"
            tokens.append(t)
            line = line[1:]
    return tokens