In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import random
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn import metrics
In [2]:
#data file downloaded from https://archive.ics.uci.edu/ml/datasets/Wholesale+customers
df = pd.read_csv("./Wholesale customers data.csv");
df.describe()
Out[2]:
Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen
count 440.000000 440.000000 440.000000 440.000000 440.000000 440.000000 440.000000 440.000000
mean 1.322727 2.543182 12000.297727 5796.265909 7951.277273 3071.931818 2881.493182 1524.870455
std 0.468052 0.774272 12647.328865 7380.377175 9503.162829 4854.673333 4767.854448 2820.105937
min 1.000000 1.000000 3.000000 55.000000 3.000000 25.000000 3.000000 3.000000
25% 1.000000 2.000000 3127.750000 1533.000000 2153.000000 742.250000 256.750000 408.250000
50% 1.000000 3.000000 8504.000000 3627.000000 4755.500000 1526.000000 816.500000 965.500000
75% 2.000000 3.000000 16933.750000 7190.250000 10655.750000 3554.250000 3922.000000 1820.250000
max 2.000000 3.000000 112151.000000 73498.000000 92780.000000 60869.000000 40827.000000 47943.000000
In [3]:
print(df.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440 entries, 0 to 439
Data columns (total 8 columns):
Channel             440 non-null int64
Region              440 non-null int64
Fresh               440 non-null int64
Milk                440 non-null int64
Grocery             440 non-null int64
Frozen              440 non-null int64
Detergents_Paper    440 non-null int64
Delicassen          440 non-null int64
dtypes: int64(8)
memory usage: 27.6 KB
None
In [4]:
#define columns to use for clustering data
#change this to test different combinations
columnsToUse=('Milk','Grocery')
#plot selected columns
sns.lmplot(columnsToUse[0],columnsToUse[1],data=df,fit_reg=False,height=5)
Out[4]:
<seaborn.axisgrid.FacetGrid at 0x1196e8f28>
In [5]:
df1 = df[[columnsToUse[0],columnsToUse[1]]]
#normalize data (zero mean, unit variance)
stscaler = StandardScaler().fit(df1)
nda = stscaler.transform(df1)
In [6]:
#run DBSCAN
dbsc = DBSCAN(eps = .2, min_samples = 5).fit(nda)
#get assignmets based on DBSCAN
#rename for convinience (-1 = noise points)
labels=['Noise' if x==-1 else 'Cluster '+str(x) for x in list(dbsc.labels_)]
df["Labels"]=labels
sns.lmplot(columnsToUse[0],columnsToUse[1],data=df,fit_reg=False,hue="Labels",height=6)
Out[6]:
<seaborn.axisgrid.FacetGrid at 0x1196e5ef0>
In [7]:
#now try KMeans on the same (scaled) data
dfkm=pd.DataFrame(data=nda,columns=['X','Y'])
mat = dfkm.values
km = KMeans(n_clusters=2)
km.fit(mat)
# Get cluster assignment labels
labels = km.labels_
centers = np.array(km.cluster_centers_)
colors=['blue' if x==0 else 'orange' for x in labels]
plt.plot()
plt.title('k-means result(k=2, plotting scaled dataset)')
plt.scatter(dfkm['X'],dfkm['Y'],c=colors)
#plot centroids as well
plt.scatter(centers[:,0], centers[:,1], marker="x", color='r')
plt.show()
In [8]:
#now try a synthetic dataset
#lets make a donought
x=[]
y=[]
for i in range(1,500):
    theta = random.random()*2*math.pi
    x.append((random.random()*2+10)*math.cos(theta))
    y.append((random.random()*2+10)*math.sin(theta))
    theta = random.random()*2*math.pi
    x.append((random.random()*2+30)*math.cos(theta))
    y.append(random.random()*2+30*math.sin(theta))
    
dfc=pd.DataFrame()
dfc['X']=x
dfc['Y']=y
sns.lmplot('X','Y',data=dfc,fit_reg=False,height=4)
Out[8]:
<seaborn.axisgrid.FacetGrid at 0x11ba5a240>
In [9]:
#run DBSCAN first
dbsc = DBSCAN(eps =4, min_samples = 2).fit(dfc)
labels=list(dbsc.labels_)
#get number of clusters in labels, ignoring noise (-1) if present
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(dfc, labels))
labels=['Noise' if x==-1 else 'Cluster '+str(x) for x in labels]
dfc["Labels"]=labels
sns.lmplot('X','Y',data=dfc,fit_reg=False,hue="Labels",height=4)
ax = plt.gca()
ax.set_title("DBScan result")
Estimated number of clusters: 2
Estimated number of noise points: 0
Silhouette Coefficient: 0.182
Out[9]:
Text(0.5,1,'DBScan result')
In [10]:
#now try KMeans
dfkm=dfc[['X','Y']]
mat = dfkm.values
km = KMeans(n_clusters=2)
km.fit(mat)
#get cluster assignment labels
labels = km.labels_
centers = np.array(km.cluster_centers_)
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(dfkm, labels))
colors=['blue' if x==0 else 'orange' for x in labels]
plt.plot()
plt.title('k-means result (k=2)')
plt.scatter(dfc['X'],dfc['Y'],c=colors)
plt.scatter(centers[:,0], centers[:,1], marker="x", color='r')
plt.show()
Silhouette Coefficient: 0.338