nmonath · frankxu2004 · Oct 12, 2021 · Oct 15, 2021 · Oct 21, 2021 · Oct 21, 2021
diff --git a/.gitignore b/.gitignore
@@ -3,6 +3,7 @@ __pycache__/
 *.py[cod]
 *$py.class
 
+.idea/
 # C extensions
 *.so
 

diff --git a/examples/cluster.py b/examples/cluster.py
@@ -0,0 +1,46 @@
+import time
+import sys
+
+import numpy as np
+import matplotlib.pyplot as plt
+import tqdm
+import torch
+
+from graphgrove.vec_scc import Cosine_SCC
+from graphgrove.graph_builder import unit_norm
+
+gt = time.time  # short alias for the wall-clock timer used around each fit below
+# Script body: for every index in the loop below, load a vector file, fit a Cosine_SCC model on it, and save the clusters of every level.
+np.random.seed(123)
+cores = 80  # forwarded to Cosine_SCC as its `cores` argument
+
+num_rounds = 50
+thresholds = np.geomspace(1.0, 0.001, num_rounds).astype(np.float32)  # num_rounds geometrically spaced values from 1.0 down to 0.001, as float32
+
+for idx in tqdm.tqdm(range(50, 10000)):
+    vectors = np.load('../knnlm-distill/dstore/ids/' + str(idx) + '.npy')  # one .npy file per index
+    print(vectors.shape)
+    vectors = unit_norm(vectors)  # presumably rescales each vector to unit length -- confirm against graphgrove.graph_builder
+    vectors = vectors.astype(np.float32)
+    # Time model construction plus fitting; b_t is computed but not read again in this script.
+    t = gt()
+    scc = Cosine_SCC(k=25, num_rounds=num_rounds, thresholds=thresholds,
+                     index_name='cosine_faisshnsw', cores=cores, verbosity=0)
+    scc.partial_fit(vectors)
+    b_t = gt() - t
+    scc = scc.scc  # rebind the name to the inner scc member of the Cosine_SCC object
+    # Cosine_SCC operates on vectors; its member object, scc
+    # (https://github.com/nmonath/graphgrove/blob/main/graphgrove/scc.py), operates on the k-nearest neighbor graph.
+    levels = scc.levels  # indexed by level below; each entry exposes a `.nodes` collection
+
+    cluster_data_save = {'thresholds': thresholds,
+                         'cluster_levels': []}  # 'cluster_levels' receives one list of clusters per level
+    # Visit levels 0..num_rounds inclusive; assumes scc.levels holds num_rounds + 1 entries -- TODO confirm.
+    for selected_level in range(num_rounds + 1):
+        clustering = []
+        for node in levels[selected_level].nodes:
+            clustering.append(
+                node.descendants().squeeze(-1))  # descendants() holds the ids of the data points that are this node's descendant leaves; the last axis is squeezed away
+        cluster_data_save['cluster_levels'].append(clustering)
+    # Persist the thresholds and the per-level clusters for this index.
+    torch.save(cluster_data_save, '../knnlm-distill/dstore/clusters/' + str(idx) + '.pt')
diff --git a/examples/clustering.py b/examples/clustering.py
diff --git a/examples/dag_clustering.py b/examples/dag_clustering.py
diff --git a/examples/nearest_neighbor_search.py b/examples/nearest_neighbor_search.py
diff --git a/examples/plot_cluster.py b/examples/plot_cluster.py
@@ -0,0 +1,52 @@
+import time
+import sys
+
+import numpy as np
+import matplotlib.pyplot as plt
+import tqdm
+
+from graphgrove.vec_scc import Cosine_SCC
+from graphgrove.graph_builder import unit_norm
+from fairseq.data import Dictionary
+dictionary = Dictionary.load('../knnlm-distill/data-bin/wikitext103-bpe/dict.txt')  # fairseq vocabulary used to map each word to an index
+# Script body: for each word below, fit a Cosine_SCC model on that word's vectors and scatter-plot the number of clusters at every level.
+to_save = ['bank', 'shore', 'institution', 'beautiful']  # words that get one scatter series each
+
+gt = time.time  # short alias for the wall-clock timer
+
+np.random.seed(123)
+cores = 80  # forwarded to Cosine_SCC as its `cores` argument
+
+for word in to_save:
+    plot_x = []  # level numbers for this word
+    plot_y = []  # number of clusters at each of those levels
+
+    idx = dictionary.index(word)  # the dictionary index doubles as the vector file name
+    print(idx)
+    vectors = np.load('../knnlm-distill/dstore/ids/' + str(idx) + '.npy')
+    print(vectors.shape)
+    vectors = unit_norm(vectors)  # presumably rescales each vector to unit length -- confirm against graphgrove.graph_builder
+    vectors = vectors.astype(np.float32)
+    t = gt()
+    num_rounds = 50
+    thresholds = np.geomspace(1.0, 0.001, num_rounds).astype(np.float32)  # num_rounds geometrically spaced values from 1.0 down to 0.001
+    scc = Cosine_SCC(k=25, num_rounds=num_rounds, thresholds=thresholds,
+                     index_name='cosine_faisshnsw', cores=cores, verbosity=0)
+    scc.partial_fit(vectors)
+    b_t = gt() - t  # elapsed build-and-fit time; not read again in this script
+    scc = scc.scc  # Cosine_SCC operates on vectors; its member object, scc (https://github.com/nmonath/graphgrove/blob/main/graphgrove/scc.py), operates on the k-nearest neighbor graph.
+    levels = scc.levels  # indexed by level below; each entry exposes a `.nodes` collection
+    for selected_level in range(num_rounds + 1):  # levels 0..num_rounds inclusive; assumes that many levels exist -- TODO confirm
+        clustering = []
+        for node in levels[selected_level].nodes:
+            clustering.append(
+                node.descendants())  # descendants() holds the ids of the data points that are this node's descendant leaves
+        number_clusters = len(clustering)  # one list entry per node at this level
+        print(selected_level, number_clusters)
+        plot_x.append(selected_level)
+        plot_y.append(number_clusters)
+    # One scatter series per word: level on the x axis, cluster count on the y axis.
+    plt.scatter(plot_x, plot_y, label=word, s=4)
+plt.legend(loc="upper right")
+plt.ylim([0, 1000])  # fix the y range to 0..1000
+plt.savefig('words.pdf')
diff --git a/examples/plot_cluster_num_distribution.py b/examples/plot_cluster_num_distribution.py
@@ -0,0 +1,54 @@
+import time
+
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+import tqdm
+from multiprocessing import Pool
+gt = time.time  # timer alias; not called anywhere in the lines below
+# Script body: read saved per-index cluster files and draw overlaid histograms of the number of clusters at a few selected levels.
+np.random.seed(123)
+cores = 80  # not referenced again in the lines below
+
+num_rounds = 50
+thresholds = np.geomspace(1.0, 0.001, num_rounds).astype(np.float32)  # num_rounds geometrically spaced values from 1.0 down to 0.001; used only for legend labels below
+print(len(thresholds))
+plot_x = []  # plot_x / plot_y are not referenced again in the lines below
+plot_y = []
+
+plot_detail_x = []  # plot_detail_x / plot_detail_y are not referenced again in the lines below
+plot_detail_y = []
+
+levels_interested = [40, 45, 50]  # the levels whose cluster counts are histogrammed
+histo_data = dict.fromkeys(levels_interested)
+for x in histo_data:
+    histo_data[x] = []  # give every level its own list (fromkeys alone leaves the values as None)
+
+# Disabled draft of a multiprocessing variant of the loading loop below:
+# def process_cluster_id(cluster_id):
+#     cluster_result = torch.load('../knnlm-distill/dstore/clusters/' + str(cluster_id) + '.pt')
+#     for level, cluster in enumerate(cluster_result['cluster_levels']):
+#         if level in levels_interested:
+#             histo_data[level].append(len(cluster))
+#
+# with Pool(processes=60) as pool:
+bins = np.linspace(0, 40, 20)  # 20 evenly spaced bin edges covering 0..40
+print(bins)
+
+for idx in tqdm.tqdm(range(50, 10000, 10)):  # only every 10th index is read
+    cluster_result = torch.load('../knnlm-distill/dstore/clusters/' + str(idx) + '.pt')
+    for level, cluster in enumerate(cluster_result['cluster_levels']):
+        if level in levels_interested:
+            num_cls = len(cluster)  # length of this level's entry, treated as its cluster count
+            if num_cls > 40:
+                num_cls = 40  # cap at 40 so larger counts land in the last histogram bin
+            histo_data[level].append(num_cls)
+
+for l in histo_data:
+    print(np.sum(histo_data[l])*10)  # NOTE(review): the *10 presumably compensates for the stride-10 sampling above -- confirm
+    plt.hist(histo_data[l], bins, alpha=0.5, label=thresholds[l-1])  # level l is labelled with the threshold at index l - 1
+plt.legend(loc='upper right')
+# plt.ylim(0, 1000)
+plt.savefig('histogram_num_clusters.pdf')
+# Clear the current figure after saving.
+plt.clf()
+
-Original file line number
+Diff line change
@@ Expand Up / @@ -3,6 +3,7 @@ __pycache__/ @@
     *.py[cod]
     *$py.class
+    .idea/
     # C extensions
     *.so
@@ Expand Down @@