1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import re
import sys

# Rewrite a transcript file so that every line becomes
# "<id> <c1> <c2> ... <cn>": the first space-separated field is kept as-is
# and the remaining text is cleaned, lower-cased and split into single
# characters separated by spaces.
#
# Usage: python <this script> IN_FILE OUT_FILE

# Literal tokens deleted from the text part of every line, applied one after
# another in this order.  The last entry (a single space) removes all spaces
# so that the text can be re-spaced character by character.
_TOKENS_TO_STRIP = ("</s>", "<s>", "@@", "@", "<unk>", " ")


def normalize_line(line: str) -> str:
    """Return the character-separated form of one input line.

    The line is stripped of surrounding whitespace and split once on the
    first space into an id and a text part.  Every token in
    ``_TOKENS_TO_STRIP`` is removed from the text, the result is lower-cased,
    and its characters are joined with single spaces.

    A line without a text part (no space after stripping, including an empty
    line) yields the id followed by two spaces, exactly as the id-only case
    has always been written out.

    Args:
        line: One raw line from the input file, with or without a newline.

    Returns:
        The formatted line, always terminated by ``"\\n"``.
    """
    fields = line.strip().split(" ", 1)
    if len(fields) == 2:
        idx, text = fields
        # Plain substring removal: none of the tokens needs regex semantics.
        for token in _TOKENS_TO_STRIP:
            text = text.replace(token, "")
        text = text.lower()
    else:
        idx = fields[0]
        # Placeholder text for id-only lines; joined below it stays " ".
        text = " "

    # Joining a string iterates over its characters directly.
    return "{} {}\n".format(idx, " ".join(text))


def main(argv=None):
    """Convert the file named by ``argv[1]`` and write it to ``argv[2]``.

    Args:
        argv: Argument vector in ``sys.argv`` layout; defaults to
            ``sys.argv``.  Arguments after the second path are ignored.

    Raises:
        SystemExit: If fewer than two paths are supplied.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        prog = argv[0] if argv else "script"
        sys.exit("usage: python {} IN_FILE OUT_FILE".format(prog))
    in_f, out_f = argv[1], argv[2]

    # Read the whole input before the output is opened (and truncated), so
    # the same path may be given for both arguments.
    with open(in_f, "r", encoding="utf-8") as f:
        lines = f.readlines()

    with open(out_f, "w", encoding="utf-8") as f:
        f.writelines(normalize_line(line) for line in lines)


if __name__ == "__main__":
    main()