谱聚类一般会先计算两两样本之间的相似度,然后根据相似度矩阵求出拉普拉斯矩阵,再将每个样本映射到由拉普拉斯矩阵特征向量张成的空间中,最后使用k-means聚类。
scikit-learn开源包中已经有现成的接口可以使用,具体见 sklearn.cluster.spectral_clustering 的官方文档。
写了一个测试例子
首先构造二维空间样本点,并计算两两样本点之间的相似度(距离的倒数):
#!/usr/bin/env python
import random
import numpy as np
import math
# Build three rectangular grids of 2-D sample points, one per quadrant
# listed below.  Every point gets a running integer id; it is written to
# points.txt as "<id>\t<x>\t<y>" and kept in memory in `pointlist` as
# (id, (x, y)).
index = 0
pointlist = []

# Each entry is ((x_start, x_stop, x_step), (y_start, y_stop, y_step)),
# passed straight to np.arange (stop values are exclusive).
regions = [
    ((0.1, 10., 0.5), (0., 10., 0.1)),        # x > 0, y >= 0
    ((-10.0, -0.1, 0.5), (0., 10., 0.1)),     # x < 0, y >= 0
    ((-10.0, -0.1, 0.5), (-10.0, 0., 0.1)),   # x < 0, y < 0
]

# `with` guarantees the file is flushed and closed even if a write fails;
# fd.write() behaves the same on Python 2 and Python 3 (unlike `print >>`).
with open("points.txt", 'w') as fd:
    for x_range, y_range in regions:
        for x in np.arange(*x_range):
            for y in np.arange(*y_range):
                fd.write(str(index) + '\t' + str(x) + '\t' + str(y) + '\n')
                pointlist.append((index, (x, y)))
                index += 1
def get_dist(pnt1, pnt2):
    """Return the Euclidean distance between two indexed points.

    Both arguments are ``(index, (x, y))`` tuples; only the coordinate
    pair stored at position 1 is used, the index is ignored.
    """
    dx = pnt1[1][0] - pnt2[1][0]
    dy = pnt1[1][1] - pnt2[1][1]
    # math.hypot avoids the intermediate overflow/underflow that squaring
    # the components by hand can cause.
    return math.hypot(dx, dy)
# Write the similarity of every ordered pair of points (including each
# point with itself) to sim_pnts.txt, one "<id1>\t<id2>\t<sim>" per line.
# Similarity is the reciprocal of the Euclidean distance.
with open("sim_pnts.txt", 'w') as simfd:
    for pnt1 in pointlist:
        index1 = pnt1[0]
        for pnt2 in pointlist:
            index2 = pnt2[0]
            dist = get_dist(pnt1, pnt2)
            if dist <= 0.00001:
                # (Near-)coincident points would divide by zero; they are
                # given the fixed similarity "10" instead.
                simfd.write(str(index1) + "\t" + str(index2) + "\t" + "10" + "\n")
                continue
            sim = 1.0 / dist
            simfd.write(str(index1) + "\t" + str(index2) + "\t" + str(sim) + "\n")
使用谱聚类:
#!/usr/bin/env python
# Authors: Emmanuelle Gouillart <emmanuelle.gouillart@normalesup.org>
# Gael Varoquaux <gael.varoquaux@normalesup.org>
# License: BSD 3 clause
import sys
import numpy as np
from sklearn.cluster import spectral_clustering
from scipy.sparse import coo_matrix
###############################################################################
# Load the sample points: each line of points.txt is "<id>\t<x>\t<y>".
# fid2fname maps the integer id to its (x, y) coordinates; if an id occurs
# more than once, the first occurrence is kept (setdefault).
fid2fname = {}
with open("points.txt") as points_file:
    for line in points_file:
        fields = line.strip().split('\t')
        fid2fname.setdefault(int(fields[0]), (float(fields[1]), float(fields[2])))
N = len(fid2fname)

# Collect the (row, col, value) triplets of the similarity matrix from
# sim_pnts.txt, whose lines are "<id1>\t<id2>\t<similarity>".
rowlist = []
collist = []
datalist = []
with open("sim_pnts.txt") as sim_file:
    for line in sim_file:
        fields = line.strip().split('\t')
        if len(fields) < 3:
            # Skip blank or truncated lines.
            continue
        f1, f2, sim = fields[:3]
        rowlist.append(int(f1))
        collist.append(int(f2))
        datalist.append(float(sim))

# Add 1.0 on the diagonal for every known point.
# NOTE(review): if sim_pnts.txt already contains (i, i) entries these become
# duplicate coordinates; scipy sums duplicate COO entries when the matrix is
# converted, so the diagonal ends up as file value + 1.0 -- confirm intended.
for fid in fid2fname:
    rowlist.append(int(fid))
    collist.append(int(fid))
    datalist.append(1.0)

row = np.array(rowlist)
col = np.array(collist)
data = np.array(datalist)
# shape=(N, N) assumes the point ids are exactly 0 .. N-1 -- TODO confirm
# against whatever produced points.txt.
graph = coo_matrix((data, (row, col)), shape=(N, N))
###############################################################################
# Force the solver to be arpack, since amg is numerically
# unstable on this example
labels = spectral_clustering(graph, n_clusters=3, eigen_solver='arpack')

# Group sample ids by their assigned cluster label.  The position of a label
# in `labels` is used as the sample id (it is the key looked up in fid2fname
# below).
cluster2fid = {}
for index, lab in enumerate(labels):
    cluster2fid.setdefault(lab, []).append(index)

# Write one file per cluster, named cluster_<k> where k is the enumeration
# order of the labels in cluster2fid (not the label value itself).  Each line
# is the str() of the point's (x, y) tuple, e.g. "(0.1, 0.0)".
for index, lab in enumerate(cluster2fid):
    # `with` closes (and flushes) every cluster file as soon as it is written.
    with open("cluster_%s" % index, "w") as fd:
        for fid in cluster2fid[lab]:
            fd.write(str(fid2fname[fid]) + "\n")
#!/usr/bin/env python
import matplotlib.pyplot as plt
def _read_cluster(path):
    """Read a cluster file and return its points as ``(xs, ys)`` lists.

    Every line is expected to look like ``(x, y)``: the line is split on
    the comma, the leading "(" is dropped from the first part and the
    trailing ")" from the second, and both parts are parsed as floats.
    """
    xs = []
    ys = []
    with open(path) as cluster_file:
        for line in cluster_file:
            parts = line.strip().split(',')
            xs.append(float(parts[0][1:].strip()))
            ys.append(float(parts[1][:-1].strip()))
    return xs, ys


plt.figure(figsize=(12, 6))

# One marker style per cluster file: red circles, blue crosses, green pluses.
cluster_0_x, cluster_0_y = _read_cluster("cluster_0")
plt.plot(cluster_0_x, cluster_0_y, 'or')

cluster_1_x, cluster_1_y = _read_cluster("cluster_1")
plt.plot(cluster_1_x, cluster_1_y, 'xb')

cluster_2_x, cluster_2_y = _read_cluster("cluster_2")
plt.plot(cluster_2_x, cluster_2_y, '+g')

plt.show()
原文:http://blog.csdn.net/lming_08/article/details/46334949