| | |
| | | # sys.argv[3]: lexicon file for corpus.dict |
| | | |
| | | lex_dict = {} |
| | | with open(sys.argv[2], 'r', encoding='utf8') as fin: |
| | | with open(sys.argv[2], "r", encoding="utf8") as fin: |
| | | for line in fin: |
| | | words = line.strip().split('\t') |
| | | words = line.strip().split("\t") |
| | | if len(words) != 2: |
| | | continue |
| | | lex_dict[words[0]] = words[1] |
| | | |
| | | with open(sys.argv[1], 'r', encoding='utf8') as fin, \ |
| | | open(sys.argv[3], 'w', encoding='utf8') as fout: |
| | | with open(sys.argv[1], "r", encoding="utf8") as fin, open( |
| | | sys.argv[3], "w", encoding="utf8" |
| | | ) as fout: |
| | | for line in fin: |
| | | word = line.strip() |
| | | if word == '<s>' or word == '</s>': |
| | | if word == "<s>" or word == "</s>": |
| | | continue |
| | | word_lex = "" |
| | | if word in lex_dict: |
| | |
| | | if word[i] in lex_dict: |
| | | word_lex += " " + lex_dict[word[i]] |
| | | else: |
| | | word_lex += " <unk>" |
| | | |
| | | fout.write('{}\t{}\n'.format(word, word_lex.strip())) |
| | | word_lex += " <unk>" |
| | | |
| | | fout.write("{}\t{}\n".format(word, word_lex.strip())) |