Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ __pycache__/
*.py[cod]
*$py.class

.idea/
# C extensions
*.so

Expand Down
46 changes: 46 additions & 0 deletions examples/cluster.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import time
import sys

import numpy as np
import matplotlib.pyplot as plt
import tqdm
import torch

from graphgrove.vec_scc import Cosine_SCC
from graphgrove.graph_builder import unit_norm

# Short alias for the wall-clock timer.
gt = time.time

np.random.seed(123)
cores = 80

# One geometrically spaced threshold per SCC round, from 1.0 down to 0.001.
num_rounds = 50
thresholds = np.geomspace(1.0, 0.001, num_rounds).astype(np.float32)

# For every id in [50, 10000): load its vectors, run SCC on them and save the
# clusters found at every level to a per-id .pt file.
for idx in tqdm.tqdm(range(50, 10000)):
    source_path = '../knnlm-distill/dstore/ids/' + str(idx) + '.npy'
    raw_vectors = np.load(source_path)
    print(raw_vectors.shape)
    vectors = unit_norm(raw_vectors).astype(np.float32)

    started = gt()
    cosine_scc = Cosine_SCC(k=25, num_rounds=num_rounds, thresholds=thresholds,
                            index_name='cosine_faisshnsw', cores=cores, verbosity=0)
    cosine_scc.partial_fit(vectors)
    b_t = gt() - started  # elapsed fit time; measured but never reported below

    # Cosine_SCC operates on vectors; its member object `scc`
    # (https://github.com/nmonath/graphgrove/blob/main/graphgrove/scc.py)
    # operates on the k-nearest neighbor graph.
    levels = cosine_scc.scc.levels

    # For each level, one entry per node: descendants() has the ids of the
    # data points which are the descendant leaves of that node.
    clusters_per_level = [
        [node.descendants().squeeze(-1) for node in levels[level_index].nodes]
        for level_index in range(num_rounds + 1)
    ]

    cluster_data_save = {'thresholds': thresholds,
                         'cluster_levels': clusters_per_level}
    target_path = '../knnlm-distill/dstore/clusters/' + str(idx) + '.pt'
    torch.save(cluster_data_save, target_path)
70 changes: 0 additions & 70 deletions examples/clustering.py

This file was deleted.

45 changes: 0 additions & 45 deletions examples/dag_clustering.py

This file was deleted.

87 changes: 0 additions & 87 deletions examples/nearest_neighbor_search.py

This file was deleted.

52 changes: 52 additions & 0 deletions examples/plot_cluster.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import time
import sys

import numpy as np
import matplotlib.pyplot as plt
import tqdm

from graphgrove.vec_scc import Cosine_SCC
from graphgrove.graph_builder import unit_norm
from fairseq.data import Dictionary
# Vocabulary used to map each word of interest to its integer index.
dictionary = Dictionary.load('../knnlm-distill/data-bin/wikitext103-bpe/dict.txt')

# Words whose cluster-count curves are drawn on the shared figure.
to_save = ['bank', 'shore', 'institution', 'beautiful']

# Short alias for the wall-clock timer.
gt = time.time

np.random.seed(123)
cores = 80

for word in to_save:
    idx = dictionary.index(word)
    print(idx)

    vector_path = '../knnlm-distill/dstore/ids/' + str(idx) + '.npy'
    raw_vectors = np.load(vector_path)
    print(raw_vectors.shape)
    vectors = unit_norm(raw_vectors).astype(np.float32)

    started = gt()
    num_rounds = 50
    thresholds = np.geomspace(1.0, 0.001, num_rounds).astype(np.float32)
    cosine_scc = Cosine_SCC(k=25, num_rounds=num_rounds, thresholds=thresholds,
                            index_name='cosine_faisshnsw', cores=cores, verbosity=0)
    cosine_scc.partial_fit(vectors)
    b_t = gt() - started  # elapsed time; measured but never reported below

    # Cosine_SCC operates on vectors; its member object `scc`
    # (https://github.com/nmonath/graphgrove/blob/main/graphgrove/scc.py)
    # operates on the k-nearest neighbor graph.
    levels = cosine_scc.scc.levels

    # x: level index, y: number of clusters (nodes) at that level.
    plot_x = []
    plot_y = []
    for selected_level in range(num_rounds + 1):
        # descendants() has the ids of the data points which are the
        # descendant leaves of the node.
        clustering = [node.descendants() for node in levels[selected_level].nodes]
        number_clusters = len(clustering)
        print(selected_level, number_clusters)
        plot_x.append(selected_level)
        plot_y.append(number_clusters)

    plt.scatter(plot_x, plot_y, label=word, s=4)

# Finish the shared figure once every word has been plotted.
plt.legend(loc="upper right")
plt.ylim([0, 1000])
plt.savefig('words.pdf')
54 changes: 54 additions & 0 deletions examples/plot_cluster_num_distribution.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import time

import matplotlib.pyplot as plt
import numpy as np
import torch
import tqdm
from multiprocessing import Pool
# Short alias for the wall-clock timer.
gt = time.time

np.random.seed(123)
cores = 80

# Same threshold schedule as the one stored with the saved cluster files.
num_rounds = 50
thresholds = np.geomspace(1.0, 0.001, num_rounds).astype(np.float32)
print(len(thresholds))

# Accumulators that are declared here but not filled by the code below.
plot_x = []
plot_y = []

plot_detail_x = []
plot_detail_y = []

# Levels whose cluster-count distribution is histogrammed; each level gets
# its own independent list of counts.
levels_interested = [40, 45, 50]
histo_data = {level: [] for level in levels_interested}

# NOTE: a multiprocessing Pool-based variant of the loading loop was sketched
# here but is not used; the files are read sequentially below.
bins = np.linspace(0, 40, 20)
print(bins)

# Sample every 10th id and record how many clusters it has at each level of
# interest, capping the count at 40 (the upper edge of `bins`).
for idx in tqdm.tqdm(range(50, 10000, 10)):
    result_path = '../knnlm-distill/dstore/clusters/' + str(idx) + '.pt'
    cluster_result = torch.load(result_path)
    for level, cluster in enumerate(cluster_result['cluster_levels']):
        if level not in levels_interested:
            continue
        histo_data[level].append(min(len(cluster), 40))

# One translucent histogram per level, labelled with thresholds[level - 1].
for lvl, counts in histo_data.items():
    print(np.sum(counts)*10)
    plt.hist(counts, bins, alpha=0.5, label=thresholds[lvl-1])
plt.legend(loc='upper right')
# plt.ylim(0, 1000)
plt.savefig('histogram_num_clusters.pdf')

plt.clf()

Loading