-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathanalysis.py
More file actions
99 lines (82 loc) · 2.82 KB
/
analysis.py
File metadata and controls
99 lines (82 loc) · 2.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import sys
import copy
import math
import numpy as np
import symnmf
from sklearn.metrics import silhouette_score, pairwise_distances
# User Args
# argv[1] is parsed as an int and used as the number of clusters;
# argv[2] is the path of the data file handed to read_data().
k = int(sys.argv[1])
file_name = sys.argv[2]
# Define variables for k-means
# Movement threshold: convergence() keeps returning False while any centroid
# has moved by EPSILON or more since the previous iteration.
EPSILON = 0.0001
# Upper bound on the number of k-means iterations.
ITER = 300
# Filled later with one [position, members] pair per cluster.
centroids = []
# Placeholder only; rebound to the array returned by read_data(file_name).
vectors = []
# Number of k-means iterations completed so far.
counter = 0
# Snapshot of `centroids` taken before each update step; stays empty until
# the first iteration, which convergence() treats as "not converged".
oldcentroids = []
def read_data(file_name):
    """Load a comma-separated text file of numbers into a NumPy array.

    Every non-blank line becomes one row; the fields on a line are split on
    commas and parsed with ``float``.

    Blank or whitespace-only lines (for example a stray empty line at the
    end of the file) are skipped; previously they reached ``float('')`` and
    aborted the whole load with ``ValueError``.

    Args:
        file_name: Path of the text file to read.

    Returns:
        numpy.ndarray built from the parsed rows, one row per data line
        (an empty array when the file holds no data lines).

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field on a non-blank line is not a valid float.
    """
    data = []
    with open(file_name, 'r') as file:
        # Iterate the file object directly so the file is streamed instead
        # of being materialised as a list of lines first.
        for line in file:
            stripped = line.strip()
            if not stripped:
                continue
            data.append([float(val) for val in stripped.split(',')])
    return np.array(data)
# Load the input points once; the k-means section and the symnmf section
# both read this module-level `vectors`.
vectors = read_data(file_name)
# K-means Area
# Functions for k-means area
def delta(vector1, vector2):
    """Return the Euclidean distance between ``vector1`` and ``vector2``.

    Components are paired by index over the length of ``vector1``: any extra
    trailing components of ``vector2`` are ignored, and a shorter
    ``vector2`` raises ``IndexError``.
    """
    squared_gaps = (
        (vector1[idx] - vector2[idx]) ** 2 for idx in range(len(vector1))
    )
    return math.sqrt(sum(squared_gaps))
# Checks whether every centroid moved less than EPSILON since the previous iteration
def convergence(centroids, oldcentroids):
    """Report whether the centroids have stopped moving.

    Each entry of both arguments is indexed with ``[0]`` to obtain its
    position. Returns ``False`` when ``oldcentroids`` is empty (no previous
    snapshot exists yet) or when any centroid's position lies ``EPSILON`` or
    more away from its position in ``oldcentroids``; otherwise ``True``.
    """
    # Without an earlier snapshot there is nothing to compare against.
    if len(oldcentroids) == 0:
        return False
    any_moved = any(
        delta(centroids[idx][0], oldcentroids[idx][0]) >= EPSILON
        for idx in range(len(centroids))
    )
    return not any_moved
# Finds the centroid closest to the vector and appends the vector to that centroid's member list
def putvectorinmatchingcentroid(vector):
    """Append ``vector`` to the member list of its nearest centroid.

    Reads the module-level ``centroids`` list, whose entries are
    ``[position, members]`` pairs, and measures distance with ``delta``.
    When several centroids are equally close, the first one in the list
    receives the vector.
    """
    def distance_to(entry):
        return delta(vector, entry[0])

    nearest = min(centroids, key=distance_to)
    nearest[1].append(vector)
def update_centroid(centroid):
    """Move a centroid to the mean of the vectors assigned to it.

    ``centroid`` is a two-item list ``[position, members]``. The position is
    replaced in place with a new list holding the per-coordinate mean of
    ``members``.

    When ``members`` is empty the position is left unchanged. Previously an
    empty cluster was reset to the all-zero vector, which moved it to the
    origin regardless of where the data lies.

    Args:
        centroid: Mutable ``[position, members]`` pair; ``members`` is a
            list of vectors indexable over the length of ``position``.

    Returns:
        None. ``centroid[0]`` is updated in place.
    """
    members = centroid[1]
    if len(members) == 0:
        # Nothing was assigned this round: keep the previous position.
        return
    dimension = len(centroid[0])
    centroid[0] = [
        sum(member[axis] for member in members) / len(members)
        for axis in range(dimension)
    ]
# End functions for k-means area
# K-means Algorithm
# Seed the clustering: the first k data points become the starting centroids,
# each stored as a [position, members] pair with no members yet.
centroids.extend([vectors[i], []] for i in range(0, k))

# Repeat assignment and update until convergence() reports no centroid moved
# by EPSILON or more, or until ITER iterations have been performed.
while True:
    if convergence(centroids, oldcentroids) or counter >= ITER:
        break

    # Assignment step: hand every point to its nearest centroid.
    for vector in vectors:
        putvectorinmatchingcentroid(vector)

    # Keep a snapshot of the positions before they are recomputed.
    oldcentroids = copy.deepcopy(centroids)

    # Update step: recompute each position, then empty its member list so
    # the next iteration starts its assignment from scratch.
    for centroid in centroids:
        update_centroid(centroid)
        centroid[1] = []

    counter += 1
def _closest_centroid_index(point):
    """Return the index in ``centroids`` of the centroid nearest to ``point``.

    Ties resolve to the lowest index.
    """
    row = np.array(point).reshape(1, -1)
    gaps = []
    for entry in centroids:
        target = np.array(entry[0]).reshape(1, -1)
        gaps.append(pairwise_distances(row, target)[0][0])
    return gaps.index(min(gaps))


# Label every data point with the index of its nearest final centroid.
cluster_labels_kmeans = [_closest_centroid_index(vector) for vector in vectors]
# Calculate silhouette score for K-means
silhouette_kmeans = silhouette_score(vectors, cluster_labels_kmeans)
# Symnmf Area
# Call the symnmf function and capture the result
# NOTE(review): symnmf is a project module that is not visible here. The data
# is converted to plain nested lists before the call — presumably because the
# module expects lists rather than a NumPy array; confirm against symnmf.
vectors = vectors.tolist()
H = symnmf.symnmf(k, vectors)
# Derive cluster assignments for each data point based on the maximum association score
# (argmax along axis 1 picks the column index of the largest entry in each
# row of H; presumably one row per data point and one column per cluster —
# verify against symnmf.)
cluster_labels_nmf = np.argmax(H, axis=1)
# Calculate silhouette score for nmf
silhouette_nmf = silhouette_score(vectors, cluster_labels_nmf)
# Print the scores
# Each score is printed on its own line with four decimal places, nmf first.
print("nmf: {:.4f}".format(silhouette_nmf))
print("kmeans: {:.4f}".format(silhouette_kmeans))