# -*- coding: utf-8 -*-
"""
gmc.py: Gaussian mixture clustering fitted by the EM algorithm.

Created on Tue Jan 28 14:08:00 2025
@author: gjgan
"""
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal
from sklearn.datasets import make_blobs
from ucimlrepo import fetch_ucirepo
from dcutil import createCM
def gmc(X, k=3, tol=1e-8, maxit=100):
    """Fit a k-component Gaussian mixture to X by EM.

    Returns the posterior membership probabilities, the component means,
    the mixing proportions, the final log-likelihood, and the number of
    iterations performed.
    """
    X = np.ascontiguousarray(X, dtype=float)
    n, d = X.shape
    # initialize mixing proportions uniformly, means from k random points,
    # and every covariance matrix as the diagonal matrix of sample variances
    mixingProp = np.ones(k) / k
    ind = np.random.choice(n, k, replace=False)
    clusterCenters = X[ind, :]
    varianceMatrices = [np.diag(np.var(X, axis=0))] * k
    density = np.zeros((n, k))
    for i in range(k):
        density[:, i] = mixingProp[i]*multivariate_normal(mean=clusterCenters[i, :], cov=varianceMatrices[i]).pdf(X)
    conditionalProb = density / np.sum(density, axis=1, keepdims=True)
    logLikelihood = np.sum(np.log(np.sum(density, axis=1))).item()
    numIter = 1
    while numIter < maxit:
        # M-step: update mixing proportions, means, and covariances
        dv = np.sum(conditionalProb, axis=0)
        mixingProp = dv / np.sum(dv)
        for i in range(k):
            clusterCenters[i, :] = np.sum(X*conditionalProb[:, i].reshape((n, 1)), axis=0)/np.sum(conditionalProb[:, i])
            Xc = X - clusterCenters[i, :]
            varianceMatrices[i] = np.matmul(np.transpose(Xc), Xc * conditionalProb[:, i].reshape((n, 1)))/np.sum(conditionalProb[:, i])
        # E-step: recompute the weighted component densities and posteriors
        density = np.zeros((n, k))
        for i in range(k):
            density[:, i] = mixingProp[i]*multivariate_normal(mean=clusterCenters[i, :], cov=varianceMatrices[i]).pdf(X)
        conditionalProb = density / np.sum(density, axis=1, keepdims=True)
        logLikelihood_ = logLikelihood
        logLikelihood = np.sum(np.log(np.sum(density, axis=1))).item()
        numIter += 1
        if np.abs(logLikelihood - logLikelihood_) < tol:
            break
    return conditionalProb, clusterCenters, mixingProp, logLikelihood, numIter
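# Recap of the EM updates implemented in gmc (standard Gaussian-mixture EM),
# with r = conditionalProb, pi = mixingProp, mu = clusterCenters, Sigma = varianceMatrices:
#   E-step:  r[j, i]  = pi_i * N(x_j | mu_i, Sigma_i) / sum_l pi_l * N(x_j | mu_l, Sigma_l)
#   M-step:  pi_i     = sum_j r[j, i] / n
#            mu_i     = sum_j r[j, i] * x_j / sum_j r[j, i]
#            Sigma_i  = sum_j r[j, i] * (x_j - mu_i)(x_j - mu_i)^T / sum_j r[j, i]
# The loop stops when the change in total log-likelihood drops below tol,
# or after maxit iterations.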
def gmc2(X, k=3, numrun=10, maxit=100):
    """Run gmc numrun times and keep the run with the highest log-likelihood."""
    bestCP, bestCC, bestMP, bestLL, bestIters = gmc(X, k, maxit=maxit)
    vLL = np.zeros(numrun)
    vLL[0] = bestLL
    for i in range(numrun-1):
        cp, cc, mp, ll, iters = gmc(X, k, maxit=maxit)
        vLL[i+1] = ll
        if ll > bestLL:
            bestCP, bestCC, bestMP, bestLL, bestIters = cp, cc, mp, ll, iters
    return bestCP, bestCC, bestMP, bestLL, bestIters, vLL
# examples
# synthetic data
centers = [[3, 3], [-3, -3], [3, -3]]
X, y = make_blobs(n_samples=300, centers=centers, cluster_std=1, random_state=1)
bcp, bcc, bmp, bll, biters, vLL = gmc2(X, k=3, numrun=100)
yhat = np.argmax(bcp, axis=1)
cm1 = createCM(y, yhat)
print(cm1)
print([bll, biters])
fig, ax = plt.subplots(1, 1, figsize=(6, 4))
ax.scatter(range(len(vLL)), vLL, color="black")
ax.set_xlabel("Run number")
ax.set_ylabel("log-likelihood")
fig.savefig("300ll.pdf", bbox_inches='tight')
fig, ax = plt.subplots(1, 1, figsize=(6, 6))
markers = ["x", "o", "+"]
for i in range(3):
    members = yhat == i
    center = bcc[i, :]
    ax.plot(X[members, 0], X[members, 1], markers[i], color="black")
    ax.plot(center[0], center[1], "^", markerfacecolor="white",
            markeredgecolor="black", markersize=15)
fig.savefig("gmc1.pdf", bbox_inches='tight')
# iris data
iris = fetch_ucirepo(id=53)
X = iris.data.features
y = iris.data.targets
bcp, bcc, bmp, bll, biters, vLL = gmc2(X, k=3, numrun=100)
yhat = np.argmax(bcp, axis=1)
cm1 = createCM(y, yhat)
print(cm1)
print([bll, biters])
fig, ax = plt.subplots(1, 1, figsize=(6, 4))
ax.scatter(range(len(vLL)), vLL, color="black")
ax.set_xlabel("Run number")
ax.set_ylabel("log-likelihood")
fig.savefig("irisll.pdf", bbox_inches='tight')