| | |
| | | import codecs |
| | | import re |
| | | import sys |
| | | import json |
| | | |
| | | is_python2 = sys.version_info[0] == 2 |
| | | |
| | |
| | | help="number of characters to split, i.e., \ |
| | | aabb -> a a b b with -n 1 and aa bb with -n 2", |
| | | ) |
| | | parser.add_argument( |
| | | "--skip-ncols", "-s", default=0, type=int, help="skip first n columns" |
| | | ) |
| | | parser.add_argument("--skip-ncols", "-s", default=0, type=int, help="skip first n columns") |
| | | parser.add_argument("--space", default="<space>", type=str, help="space symbol") |
| | | parser.add_argument( |
| | | "--non-lang-syms", |
| | |
| | | read from SI1279.PHN file -> "sil b r ih sil k s aa r er n aa l |
| | | sil t er n ih sil t ih v sil" """, |
| | | ) |
| | | parser.add_argument( |
| | | "--text_format", |
| | | default="text", |
| | | type=str, |
| | | help="text, jsonl", |
| | | ) |
| | | return parser |
| | | |
| | | |
| | |
| | | else: |
| | | f = codecs.getreader("utf-8")(sys.stdin if is_python2 else sys.stdin.buffer) |
| | | |
| | | sys.stdout = codecs.getwriter("utf-8")( |
| | | sys.stdout if is_python2 else sys.stdout.buffer |
| | | ) |
| | | sys.stdout = codecs.getwriter("utf-8")(sys.stdout if is_python2 else sys.stdout.buffer) |
| | | line = f.readline() |
| | | n = args.nchar |
| | | while line: |
| | | if args.text_format == "jsonl": |
| | | data = json.loads(line.strip()) |
| | | line = data["target"] |
| | | x = line.split() |
| | | print(" ".join(x[: args.skip_ncols]), end=" ") |
| | | a = " ".join(x[args.skip_ncols :]) |