1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
| import re
| import argparse
|
|
def load_dict(seg_file):
    """Load a segmentation lexicon from *seg_file*.

    Each line is expected to be: ``<word> <piece> <piece> ...`` (whitespace
    separated). Returns a dict mapping the word to its pieces re-joined with
    single spaces, e.g. ``{"hello": "h e l l o"}``.

    Blank lines are skipped (the original crashed on them with IndexError).
    """
    seg_dict = {}
    # Explicit encoding: the character classes used downstream include CJK,
    # so the file is assumed to be UTF-8 rather than the locale default.
    with open(seg_file, "r", encoding="utf-8") as infile:
        for line in infile:
            fields = line.strip().split()
            if not fields:
                continue  # tolerate empty/whitespace-only lines
            seg_dict[fields[0]] = " ".join(fields[1:])
    return seg_dict
|
|
def forward_segment(text, dic):
    """Segment *text* by greedy forward maximum matching against *dic*.

    Starting at each position, the longest key of *dic* beginning there is
    taken as the next segment; if no key matches, the single character at
    that position is emitted unchanged. Returns the list of segments.
    """
    segments = []
    start, n = 0, len(text)
    while start < n:
        # Default to the single character; a dictionary hit may extend it.
        best = text[start]
        end = start + 1
        while end <= n:
            candidate = text[start:end]
            if candidate in dic and len(candidate) > len(best):
                best = candidate
            end += 1
        segments.append(best)
        start += len(best)
    return segments
|
|
def tokenize(txt, seg_dict):
    """Map each word of *txt* to its segmentation, joined with spaces.

    *txt* is an iterable of words (e.g. the output of ``forward_segment``).
    A word whose first character is CJK (U+4E00–U+9FA5), ASCII alphanumeric
    is looked up in *seg_dict*; misses become ``"<unk>"``. Words starting
    with any other character (punctuation, etc.) are dropped.
    """
    # NOTE: re.match anchors at the start, so only the word's first
    # character is tested against the class — same as the original.
    pattern = re.compile(r"([\u4E00-\u9FA5A-Za-z0-9])")
    pieces = []
    for word in txt:
        if not pattern.match(word):
            continue  # guard clause instead of the old else/continue
        # dict.get replaces the membership-test-then-index double lookup.
        pieces.append(seg_dict.get(word, "<unk>"))
    # str.join instead of repeated +=, which is quadratic in the worst case.
    return " ".join(pieces).strip()
|
|
def get_parser():
    """Build the argument parser for the tokenizer CLI.

    All four options are required, so the contradictory ``default=False`` /
    ``default=1`` values of the original (dead code, and ``False`` is not
    even a valid value for a ``str``/``int`` option) have been removed.
    """
    parser = argparse.ArgumentParser(
        description="text tokenize",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--text-file",
        "-t",
        required=True,
        type=str,
        help="input text",
    )
    parser.add_argument(
        "--seg-file",
        "-s",
        required=True,
        type=str,
        help="seg file",
    )
    parser.add_argument(
        "--txt-index",
        "-i",
        required=True,
        type=int,
        help="txt index",
    )
    parser.add_argument(
        "--output-dir",
        "-o",
        required=True,
        type=str,
        help="output dir",
    )
    return parser
|
|
def main():
    """CLI entry point: segment + tokenize every line of the input text.

    Reads ``--text-file`` lines of the form ``<id> <text...>``, segments the
    (lowercased, concatenated) text with ``forward_segment``, maps segments
    through ``tokenize``, and writes two files into ``--output-dir``:
    ``text.<i>.txt`` (``<id> <tokens>``) and ``len.<i>`` (``<id> <count>``).
    """
    parser = get_parser()
    args = parser.parse_args()

    seg_dict = load_dict(args.seg_file)
    text_path = "{}/text.{}.txt".format(args.output_dir, args.txt_index)
    len_path = "{}/len.{}".format(args.output_dir, args.txt_index)
    # Context managers: the original opened both writers and never closed
    # them, risking truncated output if buffers were not flushed at exit.
    with open(text_path, "w") as txt_writer, open(
        len_path, "w"
    ) as shape_writer, open(args.text_file, "r") as infile:
        for line in infile:
            fields = line.strip().split()
            if not fields:
                continue  # skip blank lines instead of crashing on fields[0]
            text_id = fields[0]
            segments = forward_segment("".join(fields[1:]).lower(), seg_dict)
            text = tokenize(segments, seg_dict)
            lens = len(text.split())
            txt_writer.write(text_id + " " + text + "\n")
            shape_writer.write(text_id + " " + str(lens) + "\n")


if __name__ == "__main__":
    main()
|
|