diff --git a/.gitignore b/.gitignore index b6e4761..c5c1289 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ __pycache__/ *.py[cod] *$py.class +.idea/ # C extensions *.so diff --git a/examples/cluster.py b/examples/cluster.py new file mode 100644 index 0000000..0c6c062 --- /dev/null +++ b/examples/cluster.py @@ -0,0 +1,46 @@ +import time +import sys + +import numpy as np +import matplotlib.pyplot as plt +import tqdm +import torch + +from graphgrove.vec_scc import Cosine_SCC +from graphgrove.graph_builder import unit_norm + +gt = time.time + +np.random.seed(123) +cores = 80 + +num_rounds = 50 +thresholds = np.geomspace(1.0, 0.001, num_rounds).astype(np.float32) + +for idx in tqdm.tqdm(range(50, 10000)): + vectors = np.load('../knnlm-distill/dstore/ids/' + str(idx) + '.npy') + print(vectors.shape) + vectors = unit_norm(vectors) + vectors = vectors.astype(np.float32) + + t = gt() + scc = Cosine_SCC(k=25, num_rounds=num_rounds, thresholds=thresholds, + index_name='cosine_faisshnsw', cores=cores, verbosity=0) + scc.partial_fit(vectors) + b_t = gt() - t + scc = scc.scc + # cos_scc operates on vectors, it's member object, scc + # (https://github.com/nmonath/graphgrove/blob/main/graphgrove/scc.py) operates on the k-nearest neighbor graph. + levels = scc.levels # clustering will store the flat clustering + + cluster_data_save = {'thresholds': thresholds, + 'cluster_levels': []} + + for selected_level in range(num_rounds + 1): + clustering = [] + for node in levels[selected_level].nodes: + clustering.append( + node.descendants().squeeze(-1)) # descendants has the ids of the data points which are the descendant leaves + cluster_data_save['cluster_levels'].append(clustering) + + torch.save(cluster_data_save, '../knnlm-distill/dstore/clusters/' + str(idx) + '.pt') \ No newline at end of file diff --git a/examples/clustering.py b/examples/clustering.py deleted file mode 100644 index 2dda7b8..0000000 --- a/examples/clustering.py +++ /dev/null @@ -1,70 +0,0 @@ -""" -Copyright (c) 2021 The authors of SCC All rights reserved. - -Initially modified from CoverTree -https://github.com/manzilzaheer/CoverTree -Copyright (c) 2017 Manzil Zaheer All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" - -import time -import sys - -import numpy as np - -from graphgrove.vec_scc import Cosine_SCC -from graphgrove.graph_builder import unit_norm - -gt = time.time - -np.random.seed(123) -cores = 1 - -print('======== Building Dataset ==========') -N=100 -K=5 -D=784 -means = 20*np.random.rand(K,D) - 10 -x = np.vstack([np.random.randn(N,D) + means[i] for i in range(K)]) -np.random.shuffle(x) -x = unit_norm(x) -x = x.astype(np.float32) -x = np.require(x, requirements=['A', 'C', 'O', 'W']) -y = np.vstack([np.random.randn(N,D) + means[i] for i in range(K)]) -y = y.astype(np.float32) -y = np.require(y, requirements=['A', 'C', 'O', 'W']) - -print('======== SCC ==========') -t = gt() -num_rounds = 50 -thresholds = np.geomspace(1.0, 0.001, num_rounds).astype(np.float32) -scc = Cosine_SCC(k=5, num_rounds=num_rounds, thresholds=thresholds, index_name='cosine_sgtree', cores=cores, verbosity=1) -scc.partial_fit(x) -b_t = gt() - t -print("Clustering time:", b_t, "seconds") -sys.stdout.flush() - -print('======== MB-SCC ==========') -t = gt() -num_rounds = 50 -thresholds = np.geomspace(1.0, 0.001, num_rounds).astype(np.float32) -scc = Cosine_SCC(k=5, num_rounds=num_rounds, thresholds=thresholds, index_name='cosine_sgtree', cores=cores, verbosity=0) -bs = 1 -for i in range(0, x.shape[0], bs): - # print(i) - scc.partial_fit(x[i:i+bs]) -b_t = gt() - t -print("Clustering time:", b_t, "seconds") -del scc -sys.stdout.flush() diff --git a/examples/dag_clustering.py b/examples/dag_clustering.py deleted file mode 100644 index 289a114..0000000 --- a/examples/dag_clustering.py +++ /dev/null @@ -1,45 +0,0 @@ -""" -Copyright (c) 2021 The authors of LLama All rights reserved. - -Initially modified from CoverTree -https://github.com/manzilzaheer/CoverTree -Copyright (c) 2017 Manzil Zaheer All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" - -from graphgrove.llama import LLAMA -import numpy as np -from scipy.sparse import coo_matrix - -num_nodes = 100 -num_edges = 2500 -r = np.random.choice(num_nodes, size=num_edges) -c = np.random.choice(num_nodes, size=num_edges) -sim = np.random.random_sample(size=num_edges).astype(np.float32) -graph = coo_matrix((sim,(r,c))) - -def make_symmetric(coo_mat): - lil = coo_mat.tolil() - rows, cols = lil.nonzero() - lil[cols, rows] = lil[rows, cols].maximum(lil[cols, rows]) - return lil.tocoo() - -graph = make_symmetric(graph) - -llama = LLAMA.from_graph(graph, num_rounds=10, cores=3, linkage='approx_average') -llama.cluster() - -print(llama.assignments()) -print(llama.structure()) -print(llama.round(2)) \ No newline at end of file diff --git a/examples/nearest_neighbor_search.py b/examples/nearest_neighbor_search.py deleted file mode 100644 index 7ffaad1..0000000 --- a/examples/nearest_neighbor_search.py +++ /dev/null @@ -1,87 +0,0 @@ -""" -Copyright (c) 2021 The authors of SG Tree All rights reserved. - -Initially modified from CoverTree -https://github.com/manzilzaheer/CoverTree -Copyright (c) 2017 Manzil Zaheer All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" - -import time -import numpy as np - -from graphgrove.sgtree import NNS_L2 as SGTree_NNS_L2 -from graphgrove.covertree import NNS_L2 as CoverTree_NNS_L2 - -gt = time.time - -np.random.seed(123) -cores = 4 - -print('======== Building Dataset ==========') -N=1000 -K=10 -D=784 -means = 20*np.random.rand(K,D) - 10 -x = np.vstack([np.random.randn(N,D) + means[i] for i in range(K)]) -np.random.shuffle(x) -x = x.astype(np.float32) -x = np.require(x, requirements=['A', 'C', 'O', 'W']) -y = np.vstack([np.random.randn(N,D) + means[i] for i in range(K)]) -y = y.astype(np.float32) -y = np.require(y, requirements=['A', 'C', 'O', 'W']) - -print('======== Cover Tree ==========') -t = gt() -ct = CoverTree_NNS_L2.from_matrix(x, use_multi_core=cores) -b_t = gt() - t -#ct.display() -print("Building time:", b_t, "seconds") - -print('Test k-Nearest Neighbours - Exact (k=3): ') -t = gt() -idx1, d1 = ct.kNearestNeighbours(y,3, use_multi_core=cores) -b_t = gt() - t -print("Query time - Exact:", b_t, "seconds") - -print('======== SG Tree ==========') -t = gt() -ct = SGTree_NNS_L2.from_matrix(x, use_multi_core=cores) -b_t = gt() - t -#ct.display() -print("Building time:", b_t, "seconds") - -print('Test k-Nearest Neighbours - Exact (k=3): ') -t = gt() -idx1, d1 = ct.kNearestNeighbours(y,3, use_multi_core=cores) -b_t = gt() - t -print("Query time - Exact:", b_t, "seconds") - -print('Test k-Nearest Neighbours - Beam (k=3, beam_size=10): ') -t = gt() -idx1, d1 = ct.kNearestNeighboursBeam(y, 3, use_multi_core=cores, beam_size=10) -b_t = gt() - t -print("Query time - Beam:", b_t, "seconds") - -print('Test Range - cores=0') -t = gt() -idx1, d1 = ct.RangeSearch(y, r=0.5, use_multi_core=0) -b_t = gt() - t -print("Query time - Range:", b_t, "seconds") - -print('Test Range - cores=%s' % cores) -t = gt() -idx1, d1 = ct.RangeSearch(y, r=0.5, use_multi_core=cores) -b_t = gt() - t -print("Query time - Range:", b_t, "seconds") \ No newline at end of file diff --git a/examples/plot_cluster.py b/examples/plot_cluster.py new file mode 100644 index 0000000..5aceea2 --- /dev/null +++ b/examples/plot_cluster.py @@ -0,0 +1,52 @@ +import time +import sys + +import numpy as np +import matplotlib.pyplot as plt +import tqdm + +from graphgrove.vec_scc import Cosine_SCC +from graphgrove.graph_builder import unit_norm +from fairseq.data import Dictionary +dictionary = Dictionary.load('../knnlm-distill/data-bin/wikitext103-bpe/dict.txt') + +to_save = ['bank', 'shore', 'institution', 'beautiful'] + +gt = time.time + +np.random.seed(123) +cores = 80 + +for word in to_save: + plot_x = [] + plot_y = [] + + idx = dictionary.index(word) + print(idx) + vectors = np.load('../knnlm-distill/dstore/ids/' + str(idx) + '.npy') + print(vectors.shape) + vectors = unit_norm(vectors) + vectors = vectors.astype(np.float32) + t = gt() + num_rounds = 50 + thresholds = np.geomspace(1.0, 0.001, num_rounds).astype(np.float32) + scc = Cosine_SCC(k=25, num_rounds=num_rounds, thresholds=thresholds, + index_name='cosine_faisshnsw', cores=cores, verbosity=0) + scc.partial_fit(vectors) + b_t = gt() - t + scc = scc.scc # cos_scc operates on vectors, it's member object, scc (https://github.com/nmonath/graphgrove/blob/main/graphgrove/scc.py) operates on the k-nearest neighbor graph. + levels = scc.levels # clustering will store the flat clustering + for selected_level in range(num_rounds + 1): + clustering = [] + for node in levels[selected_level].nodes: + clustering.append( + node.descendants()) # descendants has the ids of the data points which are the descendant leaves + number_clusters = len(clustering) + print(selected_level, number_clusters) + plot_x.append(selected_level) + plot_y.append(number_clusters) + + plt.scatter(plot_x, plot_y, label=word, s=4) +plt.legend(loc="upper right") +plt.ylim([0, 1000]) +plt.savefig('words.pdf') diff --git a/examples/plot_cluster_num_distribution.py b/examples/plot_cluster_num_distribution.py new file mode 100644 index 0000000..7a73d61 --- /dev/null +++ b/examples/plot_cluster_num_distribution.py @@ -0,0 +1,54 @@ +import time + +import matplotlib.pyplot as plt +import numpy as np +import torch +import tqdm +from multiprocessing import Pool +gt = time.time + +np.random.seed(123) +cores = 80 + +num_rounds = 50 +thresholds = np.geomspace(1.0, 0.001, num_rounds).astype(np.float32) +print(len(thresholds)) +plot_x = [] +plot_y = [] + +plot_detail_x = [] +plot_detail_y = [] + +levels_interested = [40, 45, 50] +histo_data = dict.fromkeys(levels_interested) +for x in histo_data: + histo_data[x] = [] + +# def process_cluster_id(cluster_id): +# cluster_result = torch.load('../knnlm-distill/dstore/clusters/' + str(cluster_id) + '.pt') +# for level, cluster in enumerate(cluster_result['cluster_levels']): +# if level in levels_interested: +# histo_data[level].append(len(cluster)) +# +# with Pool(processes=60) as pool: +bins = np.linspace(0, 40, 20) +print(bins) + +for idx in tqdm.tqdm(range(50, 10000, 10)): + cluster_result = torch.load('../knnlm-distill/dstore/clusters/' + str(idx) + '.pt') + for level, cluster in enumerate(cluster_result['cluster_levels']): + if level in levels_interested: + num_cls = len(cluster) + if num_cls > 40: + num_cls = 40 + histo_data[level].append(num_cls) + +for l in histo_data: + print(np.sum(histo_data[l])*10) + plt.hist(histo_data[l], bins, alpha=0.5, label=thresholds[l-1]) +plt.legend(loc='upper right') +# plt.ylim(0, 1000) +plt.savefig('histogram_num_clusters.pdf') + +plt.clf() + diff --git a/graphgrove/graph_builder.py b/graphgrove/graph_builder.py index 34c7d0e..3bbdd1c 100644 --- a/graphgrove/graph_builder.py +++ b/graphgrove/graph_builder.py @@ -14,6 +14,7 @@ limitations under the License. """ + from absl import logging import numpy as np from scipy.sparse import coo_matrix @@ -256,15 +257,14 @@ def topk(self, query): return results[0].astype(np.float32), results[1].astype(np.int32) class Cosine_FaissHNSW(Index): - def __init__(self, k, max_degree=128, efSearch=128, efConstruction=200, add_noise=True, noise_amount=1e-6, - assume_unit_norm=True): - super(FaissHNSW, self).__init__(k) + assume_unit_normed=True): + super(Cosine_FaissHNSW, self).__init__(k) self.index = None self.row = None self.col = None @@ -276,7 +276,7 @@ def __init__(self, k, self.efSearch = efSearch self.efConstruction = efConstruction self.noise_amount = noise_amount - self.assume_unit_norm = assume_unit_norm + self.assume_unit_normed = assume_unit_normed def build(self, vectors): t0 = time.time() @@ -285,6 +285,7 @@ def build(self, vectors): self.num_points += vectors.shape[0] if not self.assume_unit_normed: vectors = unit_norm(vectors) + import faiss self.index = faiss.IndexHNSWFlat(vectors.shape[1], self.max_degree) self.index.hnsw.efConstruction = self.efConstruction self.index.hnsw.efSearch = self.efSearch @@ -326,5 +327,5 @@ def topk(self, query): if not self.assume_unit_normed: query = unit_norm(query) results = self.index.search(query, min(self.k, self.num_points)) - return (2-results[0].astype(np.float32)**2)/2, results[1].astype(np.int32) + return (2-results[0].astype(np.float32))/2, results[1].astype(np.int32) diff --git a/graphgrove/vec_scc.py b/graphgrove/vec_scc.py index fde13fa..f053573 100644 --- a/graphgrove/vec_scc.py +++ b/graphgrove/vec_scc.py @@ -42,7 +42,6 @@ def __init__(self, k=25, num_rounds=50, thresholds=None, index_name='cosine_sgtr self.hnsw_max_degree = hnsw_max_degree self.hnsw_ef_search = hnsw_ef_search self.hnsw_ef_construction = hnsw_ef_construction - if self.index_name.lower() == 'cosine_covertree': self.index = graph_builder.Cosine_CoverTree(self.k, self.cores) elif self.index_name.lower() == 'cosine_sgtree': @@ -53,7 +52,6 @@ def __init__(self, k=25, num_rounds=50, thresholds=None, index_name='cosine_sgtr self.index = graph_builder.Cosine_FaissFlat(self.k) elif self.index_name.lower() == 'cosine_faisshnsw': self.index = graph_builder.Cosine_FaissHNSW(self.k, self.hnsw_max_degree, self.hnsw_ef_search, self.hnsw_ef_construction) - self.scc = SCC.init(self.thresholds, self.cores, self.cc_alg, self.par_minimum, self.verbosity) # def __del__(self): diff --git a/histogram_num_clusters.pdf b/histogram_num_clusters.pdf new file mode 100644 index 0000000..dcc1007 Binary files /dev/null and b/histogram_num_clusters.pdf differ diff --git a/words.pdf b/words.pdf new file mode 100644 index 0000000..bfbac72 Binary files /dev/null and b/words.pdf differ