In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [2]:
# Load the raw file without a header row. Only the first column is used below,
# where each value is treated as a comma-separated list of item names.
# NOTE(review): this presumably relies on read_table not splitting on commas
# (its default separator is a tab), so a whole line stays in one column — confirm.
df = pd.read_table('./groceries.csv',header=None)
# Split every comma-separated string into indicator columns, one per item name.
df1= df.iloc[:,0].str.get_dummies(sep=',')
# Column labels of the indicator table; later cells use them to translate
# column positions back into product names (and names into positions).
products=df1.columns
df1.head()
Out[2]:
Instant food products UHT-milk abrasive cleaner artif. sweetener baby cosmetics baby food bags baking powder bathroom cleaner beef ... turkey vinegar waffles whipped/sour cream whisky white bread white wine whole milk yogurt zwieback
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0

5 rows × 169 columns

In [3]:
# Select the rows of df1 in which both 'baby food' and 'whisky' are flagged (> 0).
df1[(df1['baby food']>0) & (df1['whisky']>0)]
Out[3]:
Instant food products UHT-milk abrasive cleaner artif. sweetener baby cosmetics baby food bags baking powder bathroom cleaner beef ... turkey vinegar waffles whipped/sour cream whisky white bread white wine whole milk yogurt zwieback

0 rows × 169 columns

In [4]:
from numpy import array
from numpy import diag
from numpy import dot
from numpy import zeros
from scipy.linalg import svd
import numpy as np
np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})
In [5]:
# Convert the indicator table into a plain NumPy array (rows of df1 x product
# columns) for the linear-algebra cells that follow.
A=np.array(df1)
#print(A)
In [6]:
A.shape
Out[6]:
(9835, 169)
In [7]:
# Thin SVD of A: A = U @ diag(L) @ VT.
# full_matrices=False keeps U at (rows of A) x (number of singular values)
# instead of a square matrix with one row and one column per row of A, which
# is far cheaper in memory and time. L and VT are the same as with the default.
U, L, VT = svd(A, full_matrices=False)

# Squared singular values, in the order returned by svd.
plt.plot(L*L)
plt.xlabel('singular value index')
plt.ylabel('squared singular value')
plt.title('Squared singular values of A');
Out[7]:
[<matplotlib.lines.Line2D at 0x20ecce5b2b0>]
In [8]:
# Let's verify that the sum of the squared singular values matches the sum of
# the squared entries of A.

print("Sum of squares of singular values is",(L*L).sum())
print("Sum of squares of sales is",(A*A).sum())
Sum of squares of singular values is 43367.000000000015
Sum of squares of sales is 43367
In [24]:
-VT[0]
Out[24]:
array([0.013, 0.041, 0.006, 0.005, 0.001, 0.000, 0.000, 0.031, 0.004,
       0.087, 0.053, 0.030, 0.090, 0.162, 0.004, 0.100, 0.102, 0.046,
       0.020, 0.011, 0.041, 0.056, 0.023, 0.005, 0.018, 0.036, 0.010,
       0.023, 0.072, 0.074, 0.013, 0.140, 0.008, 0.016, 0.003, 0.077,
       0.012, 0.004, 0.003, 0.002, 0.067, 0.091, 0.008, 0.003, 0.009,
       0.059, 0.029, 0.014, 0.022, 0.012, 0.115, 0.009, 0.008, 0.003,
       0.031, 0.014, 0.001, 0.091, 0.001, 0.017, 0.019, 0.002, 0.040,
       0.014, 0.082, 0.120, 0.036, 0.001, 0.044, 0.055, 0.041, 0.028,
       0.002, 0.014, 0.052, 0.027, 0.011, 0.010, 0.007, 0.010, 0.001,
       0.005, 0.001, 0.007, 0.008, 0.008, 0.057, 0.001, 0.005, 0.098,
       0.016, 0.041, 0.006, 0.032, 0.020, 0.085, 0.111, 0.004, 0.004,
       0.045, 0.053, 0.003, 0.003, 0.364, 0.017, 0.023, 0.135, 0.010,
       0.007, 0.030, 0.129, 0.010, 0.091, 0.005, 0.023, 0.000, 0.028,
       0.002, 0.004, 0.004, 0.022, 0.016, 0.018, 0.284, 0.206, 0.002,
       0.006, 0.001, 0.016, 0.051, 0.008, 0.151, 0.018, 0.028, 0.133,
       0.006, 0.046, 0.004, 0.003, 0.241, 0.032, 0.008, 0.000, 0.012,
       0.006, 0.031, 0.014, 0.034, 0.006, 0.003, 0.008, 0.017, 0.055,
       0.014, 0.005, 0.007, 0.004, 0.001, 0.190, 0.015, 0.011, 0.057,
       0.129, 0.001, 0.069, 0.016, 0.475, 0.254, 0.008])
In [25]:
# Get the names of the 5 products with the largest loadings in the first row of VT.
# The overall sign of a singular vector is arbitrary (this run happened to return
# all-negative values), so rank by absolute value instead of relying on that sign.
# Negating the magnitudes makes the ascending argsort list the largest first.
products[np.argsort(-np.abs(VT[0]))[:5]]
Out[25]:
Index(['whole milk', 'other vegetables', 'rolls/buns', 'yogurt', 'soda'], dtype='object')
In [11]:
def generate_user(items, weight=100):
    """Build a preference vector over all products for a synthetic customer.

    Parameters
    ----------
    items : str or iterable of str
        Product name(s) the customer likes. Each name is looked up in the
        notebook-level ``products`` index. A bare string is treated as a
        single product name rather than being iterated character by character.
    weight : number, default 100
        Score written at the position of every liked product.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``A.shape[1]`` that is zero everywhere except
        at the positions of ``items``, which hold ``weight``.

    Raises
    ------
    KeyError
        Propagated from ``products.get_loc`` when a name is not in ``products``.
    """
    if isinstance(items, str):
        # A lone string would otherwise be looped over one character at a time.
        items = [items]
    user = np.zeros(A.shape[1])
    for item in items:
        user[products.get_loc(item)] = weight
    return user
In [12]:
#CREATE A CUSTOMER John THAT LIKES tropical fruit
john=generate_user(['tropical fruit'])
print(john)
[0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
 0.000 0.000 100.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
 0.000]
In [13]:
def what_do_you_like(c,products):
    """Return the entry of ``products`` found at the position of the largest score in ``c``.

    ``c`` must provide an ``argmax()`` method (for example a NumPy array) and
    ``products`` must be indexable by the position that method returns.
    """
    favourite_position = c.argmax()
    favourite = products[favourite_position]
    return favourite
    
print('John likes',what_do_you_like(john,products))
John likes tropical fruit
In [14]:
# Product-by-product matrix A^T A. Assuming A holds only the 0/1 indicators
# produced above, entry (i, j) counts the rows of A in which products i and j
# are both flagged.
ATA=A.transpose().dot(A)

# Spread John's scores over all products through the product-by-product matrix,
# then report the product with the highest resulting score.
friends_of_john=ATA.dot(john) 
print('Friends of John like',what_do_you_like(friends_of_john,products))
Friends of John like tropical fruit
In [15]:
#print Friends-of-john vector
print(friends_of_john)
print('Product loc=',products.get_loc(what_do_you_like(friends_of_john,products)))
[1500.000 4900.000 600.000 500.000 100.000 100.000 0.000 3700.000 600.000
 7500.000 6600.000 4200.000 8100.000 18200.000 100.000 10500.000 9800.000
 5400.000 1900.000 1500.000 5300.000 3000.000 2400.000 700.000 2700.000
 4700.000 1000.000 2500.000 6300.000 8000.000 1800.000 19600.000 500.000
 1900.000 200.000 7000.000 1500.000 400.000 400.000 200.000 7100.000
 10100.000 1500.000 400.000 1000.000 6200.000 3300.000 1400.000 1900.000
 1600.000 11200.000 800.000 700.000 300.000 3200.000 1200.000 100.000
 9300.000 100.000 2600.000 2500.000 0.000 5400.000 1100.000 8600.000
 13500.000 6000.000 200.000 5300.000 4200.000 4000.000 2800.000 500.000
 1600.000 6600.000 4000.000 1200.000 1100.000 800.000 1300.000 200.000
 500.000 100.000 400.000 900.000 600.000 6200.000 200.000 300.000 9200.000
 1600.000 3300.000 600.000 4500.000 2600.000 9900.000 11600.000 600.000
 800.000 4600.000 5600.000 600.000 300.000 35300.000 1900.000 2200.000
 13000.000 1300.000 400.000 3700.000 20100.000 1100.000 8400.000 400.000
 3000.000 100.000 4200.000 300.000 400.000 300.000 2500.000 1900.000
 2000.000 24200.000 20700.000 400.000 900.000 0.000 1400.000 5500.000
 1200.000 13700.000 2300.000 4200.000 13300.000 700.000 5200.000 400.000
 300.000 20500.000 3200.000 900.000 0.000 900.000 100.000 3200.000
 1600.000 3300.000 500.000 600.000 1100.000 2000.000 4700.000 1300.000
 900.000 1300.000 300.000 100.000 103200.000 2600.000 900.000 6000.000
 13600.000 0.000 8600.000 1200.000 41600.000 28800.000 1400.000]
Product loc= 158
In [19]:
# Apply ATA a second time to the already-propagated vector and report the
# product with the highest resulting score.
friends_of_friends_of_john=ATA.dot(friends_of_john)
print('Friends of friends of John like',what_do_you_like(friends_of_friends_of_john,products))
Friends of friends of John like whole milk