Bug fix for punctuation (punc) post-processing and the UMAP import/dependency
| | |
| | | self.metric = metric |
| | | |
| | | def __call__(self, X): |
| | | from umap.umap_ import UMAP |
| | | umap_X = umap.UMAP( |
| | | n_neighbors=self.n_neighbors, |
| | | min_dist=0.0, |
| | |
| | | if X.shape[0] < 20: |
| | | return np.zeros(X.shape[0], dtype='int') |
| | | if X.shape[0] < 2048 or k is not None: |
| | | # unexpected corner case |
| | | labels = self.spectral_cluster(X, k) |
| | | else: |
| | | labels = self.umap_hdbscan_cluster(X) |
| | |
| | | elif new_mini_sentence[-1] != "。" and new_mini_sentence[-1] != "?" and len(new_mini_sentence[-1].encode())!=1: |
| | | new_mini_sentence_out = new_mini_sentence + "。" |
| | | new_mini_sentence_punc_out = new_mini_sentence_punc[:-1] + [self.sentence_end_id] |
| | | if len(punctuations): punctuations[-1] = 2 |
| | | elif new_mini_sentence[-1] != "." and new_mini_sentence[-1] != "?" and len(new_mini_sentence[-1].encode())==1: |
| | | new_mini_sentence_out = new_mini_sentence + "." |
| | | new_mini_sentence_punc_out = new_mini_sentence_punc[:-1] + [self.sentence_end_id] |
| | | |
| | | if len(punctuations): punctuations[-1] = 2 |
| | | # keep a punctuations array for punc segment |
| | | if punc_array is None: |
| | | punc_array = punctuations |
| | |
| | | punc_array = torch.cat([punc_array, punctuations], dim=0) |
| | | result_i = {"key": key[0], "text": new_mini_sentence_out, "punc_array": punc_array} |
| | | results.append(result_i) |
| | | |
| | | return results, meta_data |
| | | |
| | |
| | | # "protobuf", |
| | | "tqdm", |
| | | "hdbscan", |
| | | "umap", |
| | | "umap_learn", |
| | | "jaconv", |
| | | "hydra-core>=1.3.2", |
| | | ], |