1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
| #!/usr/bin/env python
| import numpy as np
|
def tokenize(data, vocab=None):
    """Replace ``data["text"]`` with an array of integer token ids.

    Each element of ``data["text"]`` (e.g. each character of a string, or
    each item of a list of tokens) is looked up in *vocab*; elements not
    present fall back to the id of the ``'<unk>'`` entry.

    Args:
        data: Mutable mapping with a ``"text"`` key holding an iterable of
            hashable tokens. Mutated in place.
        vocab: dict mapping token -> int id. Must be provided; should
            contain an ``'<unk>'`` key if out-of-vocabulary tokens can occur.

    Returns:
        The same *data* mapping, with ``data["text"]`` replaced by a
        ``np.ndarray`` of token ids.

    Raises:
        KeyError: if ``"text"`` is missing from *data*, or an
            out-of-vocabulary token is seen and *vocab* has no ``'<unk>'``.
        TypeError: if *vocab* is not a dict.
    """
    # Explicit validation instead of `assert`: asserts are stripped
    # under `python -O`, which would let bad input through silently.
    if "text" not in data:
        raise KeyError('data must contain a "text" key')
    if not isinstance(vocab, dict):
        raise TypeError(f"vocab must be a dict, got {type(vocab).__name__}")

    # '<unk>' is looked up lazily (only on an OOV token), matching the
    # original behavior for vocabs that lack an '<unk>' entry.
    data["text"] = np.array(
        [vocab[tok] if tok in vocab else vocab["<unk>"] for tok in data["text"]]
    )
    return data
|
|