Merge pull request #370 from alibaba-damo-academy/dev_lhn2
update
| | |
| | | def seg_tokenize(txt, seg_dict): |
| | | out_txt = "" |
| | | for word in txt: |
| | | word = word.lower() |
| | | if word in seg_dict: |
| | | out_txt += seg_dict[word] + " " |
| | | else: |
| | |
| | | |
| | | if seg_dict is not None: |
| | | assert isinstance(seg_dict, dict) |
| | | txt = forward_segment("".join(text).lower(), seg_dict) |
| | | text = seg_tokenize(txt, seg_dict) |
| | | text = seg_tokenize(text, seg_dict) |
| | | |
| | | length = len(text) |
| | | for i in range(length): |
| | |
| | | def seg_tokenize(txt, seg_dict): |
| | | out_txt = "" |
| | | for word in txt: |
| | | word = word.lower() |
| | | if word in seg_dict: |
| | | out_txt += seg_dict[word] + " " |
| | | else: |
| | |
| | | if self.split_with_space: |
| | | tokens = text.strip().split(" ") |
| | | if self.seg_dict is not None: |
| | | tokens = forward_segment("".join(tokens), self.seg_dict) |
| | | tokens = seg_tokenize(tokens, self.seg_dict) |
| | | else: |
| | | tokens = self.tokenizer.text2tokens(text) |