import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import random
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn import metrics
# Data file downloaded from https://archive.ics.uci.edu/ml/datasets/Wholesale+customers
df = pd.read_csv("./Wholesale customers data.csv")
# A bare `df.describe()` expression is discarded when this runs as a script,
# so the summary statistics are printed explicitly.
print(df.describe())
# DataFrame.info() writes its report to stdout itself and returns None;
# wrapping it in print() would emit a stray "None" line.
df.info()
# Columns used as clustering features; change this to test different combinations.
columnsToUse = ('Milk', 'Grocery')
# Scatter plot of the selected columns (fit_reg=False disables the regression line).
# x and y are passed by keyword: recent seaborn releases make them keyword-only,
# so the positional form raises a TypeError there.
sns.lmplot(x=columnsToUse[0], y=columnsToUse[1], data=df, fit_reg=False, height=5)
# Feature frame holding only the selected columns, in the order given above.
df1 = df[list(columnsToUse)]
# Normalize the data (zero mean, unit variance) so both features contribute
# equally to the distances DBSCAN works with.
stscaler = StandardScaler().fit(df1)
nda = stscaler.transform(df1)
# Run DBSCAN on the scaled features.
dbsc = DBSCAN(eps=.2, min_samples=5).fit(nda)
# Get assignments based on DBSCAN and rename them for convenience
# (DBSCAN marks noise points with the label -1).
labels = ['Noise' if x == -1 else 'Cluster ' + str(x) for x in dbsc.labels_]
df["Labels"] = labels
# Plot the original (unscaled) columns coloured by DBSCAN label.
# x and y are passed by keyword: recent seaborn releases make them keyword-only.
sns.lmplot(x=columnsToUse[0], y=columnsToUse[1], data=df, fit_reg=False,
           hue="Labels", height=6)
# Now try KMeans on the same (scaled) data.
dfkm = pd.DataFrame(data=nda, columns=['X', 'Y'])
mat = dfkm.values
km = KMeans(n_clusters=2)
km.fit(mat)
# Get cluster assignment labels and the fitted centroids.
labels = km.labels_
centers = np.array(km.cluster_centers_)
# With k=2 the labels are 0 and 1: cluster 0 is blue, the other is orange.
colors = ['blue' if x == 0 else 'orange' for x in labels]
# Open a fresh figure. An argument-less plt.plot() does not create one, so
# when run as a script the scatter would be drawn on top of the seaborn
# axes that are current at this point.
plt.figure()
plt.title('k-means result(k=2, plotting scaled dataset)')
plt.scatter(dfkm['X'], dfkm['Y'], c=colors)
# Plot centroids as well.
plt.scatter(centers[:, 0], centers[:, 1], marker="x", color='r')
plt.show()
# Now try a synthetic dataset: a doughnut made of two concentric noisy rings.
# Each iteration appends one point on the inner ring (radius 10..12) followed
# by one point on the outer ring (radius 30..32), so even indices of x/y hold
# the inner ring and odd indices the outer ring.
x = []
y = []
for _ in range(1, 500):  # 499 iterations -> 998 points in total
    for base_radius in (10, 30):
        theta = random.random() * 2 * math.pi
        # The radial jitter must be added to the base radius *before* scaling
        # by cos/sin; without the parentheses the y coordinate becomes
        # jitter + radius*sin(theta) and the points no longer lie on a ring.
        x.append((random.random() * 2 + base_radius) * math.cos(theta))
        y.append((random.random() * 2 + base_radius) * math.sin(theta))
dfc = pd.DataFrame()
dfc['X'] = x
dfc['Y'] = y
# Plot the raw synthetic points. x and y are passed by keyword: recent
# seaborn releases make them keyword-only.
sns.lmplot(x='X', y='Y', data=dfc, fit_reg=False, height=4)
# Run DBSCAN first. Only the coordinate columns are used as features, so this
# keeps working if dfc already carries the string "Labels" column added below
# (e.g. when the section is re-run).
dbsc = DBSCAN(eps=4, min_samples=2).fit(dfc[['X', 'Y']])
labels = list(dbsc.labels_)
# Get the number of clusters in labels, ignoring noise (-1) if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = labels.count(-1)
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(dfc[['X', 'Y']], labels))
# Human-readable labels for the plot legend.
labels = ['Noise' if x == -1 else 'Cluster ' + str(x) for x in labels]
dfc["Labels"] = labels
sns.lmplot(x='X', y='Y', data=dfc, fit_reg=False, hue="Labels", height=4)
# Title the axes that lmplot just created (they are the current axes).
ax = plt.gca()
ax.set_title("DBScan result")
# Now try KMeans on the synthetic dataset (coordinate columns only).
dfkm = dfc[['X', 'Y']]
mat = dfkm.values
km = KMeans(n_clusters=2)
km.fit(mat)
# Get cluster assignment labels and the fitted centroids.
labels = km.labels_
centers = np.array(km.cluster_centers_)
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(dfkm, labels))
# With k=2 the labels are 0 and 1: cluster 0 is blue, the other is orange.
colors = ['blue' if x == 0 else 'orange' for x in labels]
# Open a fresh figure. An argument-less plt.plot() does not create one, so
# when run as a script the scatter would be drawn over the current
# "DBScan result" axes and replace their title.
plt.figure()
plt.title('k-means result (k=2)')
plt.scatter(dfc['X'], dfc['Y'], c=colors)
# Plot centroids as well.
plt.scatter(centers[:, 0], centers[:, 1], marker="x", color='r')
plt.show()