In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import numpy as np
import matplotlib.pyplot as plt

# Notebook-wide display settings.
# Show up to 100 columns before pandas elides the middle of a wide frame.
pd.set_option('display.max_columns', 100)
# None disables truncation of cell contents so long itemsets print in full.
# (The former sentinel -1 is deprecated since pandas 1.0 and rejected by 2.x.)
pd.set_option('display.max_colwidth', None)
In [2]:
# Load the raw baskets: read_table splits on tabs, so each comma-separated
# line is kept as a single string in column 0 (assumes the file has no tabs).
df = pd.read_table('./groceries.csv', header=None)
# Expand every basket string into a 0/1 indicator row, one column per product;
# this binary matrix is the input handed to apriori further down.
df1 = df[0].str.get_dummies(sep=',')
df1.head()
Out[2]:
Instant food products UHT-milk abrasive cleaner artif. sweetener baby cosmetics baby food bags baking powder bathroom cleaner beef berries beverages bottled beer bottled water brandy brown bread butter butter milk cake bar candles candy canned beer canned fish canned fruit canned vegetables cat food cereals chewing gum chicken chocolate chocolate marshmallow citrus fruit cleaner cling film/bags cocoa drinks coffee condensed milk cooking chocolate cookware cream cream cheese curd curd cheese decalcifier dental care dessert detergent dish cleaner dishes dog food ... ready soups red/blush wine rice roll products rolls/buns root vegetables rubbing alcohol rum salad dressing salt salty snack sauces sausage seasonal products semi-finished bread shopping bags skin care sliced cheese snack products soap soda soft cheese softener sound storage medium soups sparkling wine specialty bar specialty cheese specialty chocolate specialty fat specialty vegetables spices spread cheese sugar sweet spreads syrup tea tidbits toilet cleaner tropical fruit turkey vinegar waffles whipped/sour cream whisky white bread white wine whole milk yogurt zwieback
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0
2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0

5 rows × 169 columns

In [3]:
# Number of transactions = number of rows in the binary basket matrix
df1.shape[0]
Out[3]:
9835
In [4]:
# Summing the 0/1 'beef' indicator column counts the baskets that contain beef
df1.loc[:, 'beef'].sum()
Out[4]:
516
In [5]:
# Basket size: each row of the binary matrix sums to the number of products bought
dfcounts = df1.sum(axis=1)

# Histogram of basket sizes, one bin per size from 1 to 31 items
fig, ax = plt.subplots()
ax.hist(dfcounts, bins=range(1, 32), rwidth=0.8)
ax.set_xlabel('Number of items in a transaction')
ax.set_ylabel('Number of Transactions')
plt.show()
In [6]:
# Build up the frequent itemsets: keep every itemset found in at least 0.2%
# of the transactions. The 0/1 matrix is cast to bool because apriori expects
# a boolean one-hot frame (newer mlxtend versions warn on other dtypes);
# the mined itemsets and supports are unchanged by the cast.
frequent_itemsets = apriori(df1.astype(bool), min_support=0.002, use_colnames=True)
frequent_itemsets
Out[6]:
support itemsets
0 0.008033 (Instant food products)
1 0.033452 (UHT-milk)
2 0.003559 (abrasive cleaner)
3 0.003254 (artif. sweetener)
4 0.017692 (baking powder)
5 0.002745 (bathroom cleaner)
6 0.052466 (beef)
7 0.033249 (berries)
8 0.026029 (beverages)
9 0.080529 (bottled beer)
10 0.110524 (bottled water)
11 0.004169 (brandy)
12 0.064870 (brown bread)
13 0.055414 (butter)
14 0.027961 (butter milk)
15 0.013218 (cake bar)
16 0.008948 (candles)
17 0.029893 (candy)
18 0.077682 (canned beer)
19 0.015048 (canned fish)
20 0.003254 (canned fruit)
21 0.010778 (canned vegetables)
22 0.023284 (cat food)
23 0.005694 (cereals)
24 0.021047 (chewing gum)
25 0.042908 (chicken)
26 0.049619 (chocolate)
27 0.009049 (chocolate marshmallow)
28 0.082766 (citrus fruit)
29 0.005084 (cleaner)
... ... ...
4193 0.004881 (tropical fruit, yogurt, rolls/buns, whole milk)
4194 0.003050 (yogurt, rolls/buns, whipped/sour cream, whole milk)
4195 0.002745 (tropical fruit, sausage, whole milk, root vegetables)
4196 0.003254 (yogurt, sausage, whole milk, root vegetables)
4197 0.002440 (soda, whole milk, yogurt, root vegetables)
4198 0.002745 (tropical fruit, whole milk, whipped/sour cream, root vegetables)
4199 0.002339 (tropical fruit, yogurt, whipped/sour cream, root vegetables)
4200 0.005694 (tropical fruit, yogurt, whole milk, root vegetables)
4201 0.003660 (yogurt, whole milk, whipped/sour cream, root vegetables)
4202 0.002237 (soda, sausage, yogurt, whole milk)
4203 0.003152 (tropical fruit, yogurt, sausage, whole milk)
4204 0.003152 (tropical fruit, soda, whole milk, yogurt)
4205 0.004372 (tropical fruit, yogurt, whole milk, whipped/sour cream)
4206 0.002034 (white bread, tropical fruit, yogurt, whole milk)
4207 0.002034 (bottled water, tropical fruit, other vegetables, whole milk, yogurt)
4208 0.002339 (tropical fruit, other vegetables, whole milk, yogurt, butter)
4209 0.003152 (citrus fruit, tropical fruit, other vegetables, whole milk, root vegetables)
4210 0.002339 (citrus fruit, other vegetables, whole milk, yogurt, root vegetables)
4211 0.002440 (citrus fruit, tropical fruit, other vegetables, whole milk, yogurt)
4212 0.002034 (other vegetables, whole milk, fruit/vegetable juice, yogurt, root vegetables)
4213 0.002440 (tropical fruit, other vegetables, whole milk, pip fruit, root vegetables)
4214 0.002339 (other vegetables, whole milk, pip fruit, yogurt, root vegetables)
4215 0.002339 (tropical fruit, other vegetables, whole milk, pip fruit, yogurt)
4216 0.002034 (rolls/buns, tropical fruit, other vegetables, whole milk, root vegetables)
4217 0.002440 (rolls/buns, other vegetables, whole milk, yogurt, root vegetables)
4218 0.002542 (rolls/buns, tropical fruit, other vegetables, whole milk, yogurt)
4219 0.003559 (tropical fruit, other vegetables, whole milk, yogurt, root vegetables)
4220 0.002339 (other vegetables, whole milk, yogurt, whipped/sour cream, root vegetables)
4221 0.002440 (tropical fruit, other vegetables, whole milk, yogurt, whipped/sour cream)
4222 0.002237 (rolls/buns, tropical fruit, whole milk, yogurt, root vegetables)

4223 rows × 2 columns

In [7]:
# List the itemsets with exactly 3 items that have the highest support values
# (head() keeps the top five).
itemset_size = 3
# apply(len) measures each frozenset directly instead of going through the
# string accessor, which is not meant for set-valued cells.
itemsets_of_size = frequent_itemsets[frequent_itemsets['itemsets'].apply(len) == itemset_size]
itemsets_of_size.sort_values(by='support', ascending=False).head()
Out[7]:
support itemsets
3486 0.023183 (other vegetables, whole milk, root vegetables)
3546 0.022267 (yogurt, whole milk, other vegetables)
3473 0.017895 (other vegetables, rolls/buns, whole milk)
3535 0.017082 (tropical fruit, other vegetables, whole milk)
3698 0.015557 (yogurt, rolls/buns, whole milk)
In [20]:
# Derive association rules from the frequent itemsets. The library defaults
# (metric 'confidence', threshold 0.8) are written out so the cut-off is visible.
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.8)
# Show only the columns of interest
rules.loc[:, ['antecedents', 'consequents', 'support', 'confidence', 'lift']]
Out[20]:
antecedents consequents support confidence lift
0 (curd, hamburger meat) (whole milk) 0.002542 0.806452 3.156169
1 (herbs, rolls/buns) (whole milk) 0.002440 0.800000 3.130919
2 (herbs, tropical fruit) (whole milk) 0.002339 0.821429 3.214783
3 (other vegetables, butter, pork) (whole milk) 0.002237 0.846154 3.311549
4 (other vegetables, curd, domestic eggs) (whole milk) 0.002847 0.823529 3.223005
5 (tropical fruit, grapes, whole milk) (other vegetables) 0.002034 0.800000 4.134524
6 (tropical fruit, whole milk, citrus fruit, root vegetables) (other vegetables) 0.003152 0.885714 4.577509
7 (other vegetables, yogurt, citrus fruit, root vegetables) (whole milk) 0.002339 0.821429 3.214783
8 (other vegetables, yogurt, fruit/vegetable juice, root vegetables) (whole milk) 0.002034 0.833333 3.261374
9 (root vegetables, fruit/vegetable juice, whole milk, yogurt) (other vegetables) 0.002034 0.800000 4.134524
10 (tropical fruit, yogurt, rolls/buns, root vegetables) (whole milk) 0.002237 0.814815 3.188899
In [21]:
# Keep the rules with lift of at least 4 and confidence of at least 0.8
is_strong = (rules['confidence'] >= 0.8) & (rules['lift'] >= 4)
rules.loc[is_strong, ['antecedents', 'consequents', 'support', 'confidence', 'lift']]
Out[21]:
antecedents consequents support confidence lift
5 (tropical fruit, grapes, whole milk) (other vegetables) 0.002034 0.800000 4.134524
6 (tropical fruit, whole milk, citrus fruit, root vegetables) (other vegetables) 0.003152 0.885714 4.577509
9 (root vegetables, fruit/vegetable juice, whole milk, yogurt) (other vegetables) 0.002034 0.800000 4.134524
In [22]:
def draw(rules):
    """Plot association rules as a directed graph of items.

    Every item in a rule's antecedent is connected by an edge to every item
    in that rule's consequent. Edge width grows with the rule's confidence
    (rescaled to the range (0, 5]). Items that occur in at least one
    consequent are drawn blue; all other items are drawn red.

    Parameters
    ----------
    rules : pandas.DataFrame
        Needs 'antecedents' and 'consequents' columns holding iterables of
        item names and a numeric 'confidence' column (presumably the frame
        returned by association_rules).

    Returns
    -------
    None
        The figure is rendered with plt.show(). If `rules` yields no edges
        a short message is printed and nothing is drawn.
    """
    import networkx as nx

    G = nx.DiGraph()
    consequent_items = set()

    for _, row in rules.iterrows():
        consequent_items.update(row['consequents'])
        for c in row['consequents']:
            G.add_node(c)
            for a in row['antecedents']:
                # add_edge creates the antecedent node when missing; if several
                # rules share the same (a, c) pair the last confidence wins.
                G.add_edge(a, c, color='black', weight=row['confidence'])

    if G.number_of_edges() == 0:
        # min()/max() below would fail on an empty list; nothing to plot anyway.
        print('No rules to draw.')
        return

    # Blue for items that participate as a consequent of some rule, red otherwise
    color_map = ['blue' if node in consequent_items else 'red' for node in G]

    edges = list(G.edges())
    colors = [G[u][v]['color'] for u, v in edges]
    weights = [G[u][v]['weight'] for u, v in edges]

    # Rescale confidences to line widths; the 0.01 offset keeps the weakest
    # edge visible and guarantees a non-zero denominator.
    min_weight = min(weights) - 0.01
    max_weight = max(weights)
    weights = [5 * (w - min_weight) / (max_weight - min_weight) for w in weights]

    pos = nx.spring_layout(G, k=10, scale=1)
    # 'edgelist' is the keyword NetworkX accepts for the edges to draw; it
    # keeps edge_color and width aligned with the same edge order.
    nx.draw(G, pos, edgelist=edges, edge_color=colors, node_color=color_map,
            width=weights, font_size=8, with_labels=False)

    # Raise the text positions so labels sit above their nodes, without
    # modifying the layout used for the nodes themselves.
    label_pos = {node: (x, y + 0.18) for node, (x, y) in pos.items()}
    nx.draw_networkx_labels(G, label_pos)

    plt.show()
In [23]:
# Render the rule graph for all generated rules
draw(rules=rules)
In [24]:
# List rules that contain 'citrus fruit' in the LHS (antecedent).
# Exact set membership is tested on each antecedent; a substring search on
# the set's text form could also match other items that merely contain the
# searched text.
has_citrus_fruit = rules['antecedents'].apply(lambda items: 'citrus fruit' in items).astype(bool)
rules.loc[has_citrus_fruit, ['antecedents', 'consequents', 'support', 'confidence', 'lift']]
Out[24]:
antecedents consequents support confidence lift
6 (tropical fruit, whole milk, citrus fruit, root vegetables) (other vegetables) 0.003152 0.885714 4.577509
7 (other vegetables, yogurt, citrus fruit, root vegetables) (whole milk) 0.002339 0.821429 3.214783
In [ ]: