In [1]:
import itertools

#define jaccard similarity (size of intersection over size of union)
def jaccard_sim(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / union
In [2]:
def print_customers():
    for x in cust:
        print (x,":",cust[x])
In [3]:
def merge():
    sim={}
    print("Computing pair wise similarities")
    for pair in itertools.combinations(cust, r=2):
        s=jaccard_sim(cust[pair[0]],cust[pair[1]])
        print("Jaccard_sim of",*pair,' is ',s)
        sim[tuple([pair[0],pair[1]])]=s
    #find pair with max similarity
    best_pair=max(sim, key=sim.get)
    print("Best pair to merge is",best_pair)
    #new cluster contains union of customers
    new_user_purchases=set(cust[best_pair[0]]).union(cust[best_pair[1]])
    #remove individual users
    cust.pop(best_pair[0])
    cust.pop(best_pair[1])
    #add merged pair
    cust[best_pair[0]+'+'+best_pair[1]]=new_user_purchases
    
In [4]:
def HierarchicalClustering(k):
    while(len(cust)>k):
        print("================================")
        print("There are",len(cust),"clusters")
        print_customers()
        merge()
        
In [5]:
#define some toy customer purchases to play around
cust={
    'user1':['milk','bread','coffee'],
    'user2':['milk','bread','cola'],
    'user3':['cereal','milk','donut'],
    'user4':['donut','cream','cola'],
    'user5':['cola','milk','cereal','tea']
}
In [6]:
HierarchicalClustering(2)
print("================================")
print("Final Clusters:")
print_customers()
================================
There are 5 clusters
user1 : ['milk', 'bread', 'coffee']
user2 : ['milk', 'bread', 'cola']
user3 : ['cereal', 'milk', 'donut']
user4 : ['donut', 'cream', 'cola']
user5 : ['cola', 'milk', 'cereal', 'tea']
Computing pair wise similarities
Jaccard_sim of user1 user2  is  0.5
Jaccard_sim of user1 user3  is  0.2
Jaccard_sim of user1 user4  is  0.0
Jaccard_sim of user1 user5  is  0.16666666666666666
Jaccard_sim of user2 user3  is  0.2
Jaccard_sim of user2 user4  is  0.2
Jaccard_sim of user2 user5  is  0.4
Jaccard_sim of user3 user4  is  0.2
Jaccard_sim of user3 user5  is  0.4
Jaccard_sim of user4 user5  is  0.16666666666666666
Best pair to merge is ('user1', 'user2')
================================
There are 4 clusters
user3 : ['cereal', 'milk', 'donut']
user4 : ['donut', 'cream', 'cola']
user5 : ['cola', 'milk', 'cereal', 'tea']
user1+user2 : {'coffee', 'milk', 'cola', 'bread'}
Computing pair wise similarities
Jaccard_sim of user3 user4  is  0.2
Jaccard_sim of user3 user5  is  0.4
Jaccard_sim of user3 user1+user2  is  0.16666666666666666
Jaccard_sim of user4 user5  is  0.16666666666666666
Jaccard_sim of user4 user1+user2  is  0.16666666666666666
Jaccard_sim of user5 user1+user2  is  0.3333333333333333
Best pair to merge is ('user3', 'user5')
================================
There are 3 clusters
user4 : ['donut', 'cream', 'cola']
user1+user2 : {'coffee', 'milk', 'cola', 'bread'}
user3+user5 : {'donut', 'milk', 'cola', 'tea', 'cereal'}
Computing pair wise similarities
Jaccard_sim of user4 user1+user2  is  0.16666666666666666
Jaccard_sim of user4 user3+user5  is  0.3333333333333333
Jaccard_sim of user1+user2 user3+user5  is  0.2857142857142857
Best pair to merge is ('user4', 'user3+user5')
================================
Final Clusters:
user1+user2 : {'coffee', 'milk', 'cola', 'bread'}
user4+user3+user5 : {'donut', 'cream', 'milk', 'cola', 'tea', 'cereal'}