Learn practical skills, build real-world projects, and advance your career
Created 4 years ago
import pandas as pd
import numpy as np
from pprint import pprint
# Twelve (height, weight) observations, stored column-wise; row i of the
# resulting frame pairs the i-th entry of each list.
data = pd.DataFrame(
    {
        'height': [185, 170, 168, 179, 182, 188, 180, 180, 183, 180, 180, 177],
        'weight': [72, 56, 60, 68, 72, 77, 71, 70, 84, 88, 67, 76],
    }
)
# Bare expression: shows the table when evaluated as the last line of a
# notebook cell; it has no effect when the file runs as a plain script.
data
class KMean2:
    """Two-centroid k-means clustering of 2-D points.

    ``data`` is a ``pandas.DataFrame`` (or anything ``numpy.asarray`` turns
    into a 2-D array) with one point per row.  The first two rows seed the
    centroids; ``run`` then alternates point assignment and centroid updates
    until neither centroid changes.

    Attributes filled in by ``run`` (all ``None`` before it is called):
        cluster1, cluster2: arrays of the rows assigned to each centroid.
        u1, u2: the final centroids.
    """

    def __init__(self, data):
        self.data = data
        self.cluster1 = None
        self.cluster2 = None
        self.u1 = None
        self.u2 = None

    @staticmethod
    def calculate_distance(p1, p2):
        """Return the Euclidean distance between ``p1`` and ``p2``.

        Only elements ``[0]`` and ``[1]`` of each argument are used.  The
        elements may themselves be arrays, in which case the result is
        computed element-wise.
        """
        return (np.square(p1[0] - p2[0]) + np.square(p1[1] - p2[1])) ** 0.5

    @staticmethod
    def assign_clusters(data, u1, u2):
        """Split the rows of ``data`` by nearest centroid.

        A row goes to cluster 2 when it is at least as close to ``u2`` as to
        ``u1`` (ties go to cluster 2), otherwise to cluster 1.  Row order is
        preserved inside each cluster.

        Returns:
            ``(cluster1, cluster2)`` as NumPy arrays of rows.
        """
        # Work on the raw values so the result does not depend on the
        # DataFrame's index labels or on positional indexing of a
        # label-indexed Series.
        points = np.asarray(data)
        # Transposing makes [0] and [1] the coordinate columns, so the
        # distance helper handles every row in one call.
        columns = points.T
        d1 = KMean2.calculate_distance(columns, u1)
        d2 = KMean2.calculate_distance(columns, u2)
        in_second = d2 <= d1
        return points[~in_second], points[in_second]

    @staticmethod
    def recalc_mean(c1, c2):
        """Return the column-wise means of ``c1`` and ``c2`` as a pair."""
        return np.array(c1).mean(axis=0), np.array(c2).mean(axis=0)

    @staticmethod
    def _updated_centroid(cluster, previous):
        """Return the mean of ``cluster``, or ``previous`` if it has no rows."""
        if len(cluster) == 0:
            return previous
        return np.asarray(cluster).mean(axis=0)

    @staticmethod
    def print_clusters(c1, c2):
        """Pretty-print both clusters under "Cluster 1" / "Cluster 2" headers."""
        print("Cluster 1")
        pprint(np.array(c1))
        print("Cluster 2")
        pprint(np.array(c2))

    def run(self):
        """Run the clustering loop and store the outcome on the instance.

        Returns:
            ``(cluster1, cluster2, u1, u2)`` - the same values that are
            stored in the attributes of those names.

        Raises:
            IndexError: if ``data`` has fewer than two rows, since the first
                two rows are used as the initial centroids.
        """
        points = np.asarray(self.data)
        u1 = np.array(points[0])
        u2 = np.array(points[1])
        while True:
            cluster1, cluster2 = self.assign_clusters(points, u1, u2)
            # An empty cluster has no mean (NaN), and NaN never compares
            # equal, so the loop below would never terminate.  Keep the
            # previous centroid for an empty cluster instead.
            u1_updated = self._updated_centroid(cluster1, u1)
            u2_updated = self._updated_centroid(cluster2, u2)
            if np.array_equal(u1, u1_updated) and np.array_equal(u2, u2_updated):
                break
            u1 = u1_updated
            u2 = u2_updated
        self.cluster1 = cluster1
        self.cluster2 = cluster2
        self.u1 = u1
        self.u2 = u2
        return cluster1, cluster2, u1, u2

    def print_result(self):
        """Print the stored clusters and centroids.

        If ``run`` has not produced a result yet, print a hint instead.
        """
        # The attributes always exist (``__init__`` sets them to None), so
        # test for the None placeholder rather than catching AttributeError.
        if getattr(self, "cluster1", None) is None:
            print("Error: run the algorithm with .run() first and then print the result")
            return
        print("Cluster 1", self.cluster1)
        print("Cluster 2", self.cluster2)
        print("u1", self.u1)
        print("u2", self.u2)
# Cluster the sample table, then print the clusters and centroids.
k = KMean2(data)
# The returned tuple is discarded here; run() also stores the clusters and
# centroids on the instance, which is what print_result() reads.
k.run()
k.print_result()
Cluster 1 [[185 72]
[179 68]
[182 72]
[188 77]
[180 71]
[180 70]
[183 84]
[180 88]
[180 67]
[177 76]]
Cluster 2 [[170 56]
[168 60]]
u1 [181.4 74.5]
u2 [169. 58.]