We have reached an era where the computer often knows more about us than we do ourselves. Our devices are powerful enough to know what we are doing right now and to anticipate what we will do next. One such application of AI is Market Basket Analysis, widely used in retail stores, where the goal is to predict the closely associated items a customer is likely to buy along with the product they just purchased.
Project Detail
In this project, I use the Groceries_dataset, which contains 38765 rows of purchase records from grocery stores. The dataset consists of a single csv
CSV name: Groceries_dataset.csv
In this dataset, I explore the data with visualizations and perform association rule mining with the help of the Apriori algorithm. I had never questioned why certain items are placed close together in the supermarket; I assumed it was for the customer's convenience, but little did I know that it had a business impact.
- Getting and cleaning the data
- Understanding the data using EDA techniques
- Performing association rule mining using the Apriori algorithm
- Visualizing the associations between items
#Basic necessary Libraries
import numpy as np
import pandas as pd
#Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import altair as alt
import plotly.express as px
#import WordCloud as wordcloud
#from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import holoviews as hv
from holoviews import opts
hv.extension('bokeh')
#Apriori libraries
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
groceries=pd.read_csv('Groceries_dataset.csv')
print(f'Groceries_dataset.csv : {groceries.shape}')
groceries.head()
Groceries_dataset.csv : (38765, 3)
Member_number | Date | itemDescription | |
---|---|---|---|
0 | 1808 | 21-07-2015 | tropical fruit |
1 | 2552 | 05-01-2015 | whole milk |
2 | 2300 | 19-09-2015 | pip fruit |
3 | 1187 | 12-12-2015 | other vegetables |
4 | 3037 | 01-02-2015 | whole milk |
According to the dataset information, it has the following features:
- Member_number: a customer id assigned to the customer for each purchase transaction
- Date: the date on which the purchase/transaction was made
- itemDescription: name of the item that was purchased
groceries.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38765 entries, 0 to 38764
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   Member_number    38765 non-null  int64
 1   Date             38765 non-null  object
 2   itemDescription  38765 non-null  object
dtypes: int64(1), object(2)
memory usage: 908.7+ KB
From the information we can identify that
- We don't have any null records in the dataset. BAM !
- The Date column is an object data type and will need converting to datetime. small bam!
#Renaming the columns to simple words
groceries.rename(columns = {'Member_number':'id','itemDescription':'item'}, inplace = True)
groceries
id | Date | item | |
---|---|---|---|
0 | 1808 | 21-07-2015 | tropical fruit |
1 | 2552 | 05-01-2015 | whole milk |
2 | 2300 | 19-09-2015 | pip fruit |
3 | 1187 | 12-12-2015 | other vegetables |
4 | 3037 | 01-02-2015 | whole milk |
... | ... | ... | ... |
38760 | 4471 | 08-10-2014 | sliced cheese |
38761 | 2022 | 23-02-2014 | candy |
38762 | 1097 | 16-04-2014 | cake bar |
38763 | 1510 | 03-12-2014 | fruit/vegetable juice |
38764 | 1521 | 26-12-2014 | cat food |
38765 rows × 3 columns
#Convert the 'Date' column to datetime format
#Note: the raw dates are DD-MM-YYYY; pandas parses ambiguous dates month-first by default,
#so passing dayfirst=True (or format='%d-%m-%Y') would make the parsing unambiguous
groceries['Date']= pd.to_datetime(groceries['Date'])
#Extracting year,month and day
groceries['year'] = groceries['Date'].apply(lambda x : x.year)
groceries['month'] = groceries['Date'].apply(lambda x : x.month)
groceries['day'] = groceries['Date'].apply(lambda x : x.day)
groceries['weekday'] = groceries['Date'].apply(lambda x : x.weekday())
#Rearranging the columns
groceries=groceries[['id', 'Date','year', 'month', 'day','weekday','item']]
groceries.head()
id | Date | year | month | day | weekday | item | |
---|---|---|---|---|---|---|---|
0 | 1808 | 2015-07-21 | 2015 | 7 | 21 | 1 | tropical fruit |
1 | 2552 | 2015-05-01 | 2015 | 5 | 1 | 4 | whole milk |
2 | 2300 | 2015-09-19 | 2015 | 9 | 19 | 5 | pip fruit |
3 | 1187 | 2015-12-12 | 2015 | 12 | 12 | 5 | other vegetables |
4 | 3037 | 2015-01-02 | 2015 | 1 | 2 | 4 | whole milk |
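As a side note, the same date features can be extracted with pandas' vectorized `.dt` accessor instead of `apply`, which is usually faster on large frames; an equivalent sketch:

#Equivalent feature extraction using the vectorized .dt accessor
groceries['year'] = groceries['Date'].dt.year
groceries['month'] = groceries['Date'].dt.month
groceries['day'] = groceries['Date'].dt.day
groceries['weekday'] = groceries['Date'].dt.weekday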
#Reusable function to plot a bar chart of a column's value counts
def bar_plot(df,col):
    counts = df[col].value_counts()
    fig = px.bar(x = counts.index,
                 y = counts.values,
                 color = counts.index
                )
    fig.update_layout(
        xaxis_title= col,
        yaxis_title="Count",
        legend_title=col,
        font_family="Courier New",
        font_color="blue",
        title_font_family="Times New Roman",
        title_font_color="red",
        legend_title_font_color="green"
    )
    fig.show()
#Plotting the distribution of purchased items
bar_plot(groceries,'item')
groceries.to_csv('groceries_out.csv')
#Filtering data by year 2014 and 2015
df1 = groceries[groceries['year'] == 2014]
df2 = groceries[groceries['year'] == 2015]
#Plotting monthly data of number of quantity purchased in 2014 and 2015
sales_2014=hv.Bars(df1.groupby(['month'])['item'].count()).opts(ylabel="# of items", title='# of items sold in 2014')
sales_2015=hv.Bars(df2.groupby(['month'])['item'].count()).opts(ylabel="# of items", title='# of items sold in 2015')
#Merging both plots
(sales_2014 + sales_2015).opts(opts.Bars(width=380, height=300,tools=['hover'],show_grid=True))
Insights
- The average sales in 2015 are higher than in 2014, so we can say the store's revenue is trending upward
- September 2014 and February 2015 were the poorest-performing months of their respective years
- The record sale was in October 2015, when nearly 2000 items were transacted in a single month
#Plotting day-of-month transactions aggregated over 2014 and 2015
sales_day=hv.Curve(groceries.groupby(['day'])['item'].count()).opts(ylabel="# of items", title='Cumulative day transactions-2014 & 2015')
#Line chart
sales_day.opts(opts.Curve(width=800, height=300,tools=['hover'],show_grid=True))
Insights
- It is highly noticeable that the purchase quantity dips at the end of the month
- The purchased quantity is fairly stationary across time, with no obvious seasonality or trend
#Creating temporary data which has quantity purchased column
temp=groceries.copy()
temp['qty_purchased']=groceries['id'].map(groceries['id'].value_counts())
#Slicing the first 5000 rows, since altair's default row limit (MaxRowsError) is 5000
#Taking a copy to avoid SettingWithCopyWarning later
temp1=temp[:5000].copy()
temp1.columns
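If the full data were needed, altair's 5,000-row limit can also be lifted via its data transformers instead of slicing; an optional sketch (this can make the notebook output noticeably larger):

#Optional alternative: lift altair's default 5000-row limit
alt.data_transformers.disable_max_rows()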
#Plotting
brush = alt.selection(type='interval', encodings=['x'])
#Plotting the bar chart
bars = alt.Chart().mark_bar(color="green").encode(
x=alt.X('month(Date):O',title="Month"),
y=alt.Y('mean(qty_purchased):Q',title="Average quantity purchased"),
opacity=alt.condition(brush, alt.OpacityValue(1), alt.OpacityValue(0.7)),
tooltip=['month(Date)','mean(qty_purchased)']
).add_selection(
brush
).properties(height=400,width=600,title="Monthly quantity purchased from grocery store-Drag over bars and find average")
#Plotting average line
line = alt.Chart().mark_rule(color='firebrick').encode(
y='mean(qty_purchased):Q',
size=alt.SizeValue(3),
tooltip=['mean(qty_purchased)']
).transform_filter(
brush
)
#Display plot using sliced data
alt.layer(bars, line, data=temp1)
Insights
- There isn't much change in the average across the months; the largest difference appears when the selection band moves past the Jan-Apr window (a 4-month window)
- The highest averages fall between May and Aug, with June on top, which is roughly the beginning of an academic year for schools/colleges. Is it correlation or causation?
#Converting weekday variable to category
temp1.weekday = temp1.weekday.astype('category')
#Creating a new dataframe which has the frequency of weekdays
weekday_bin=temp1['weekday'].value_counts().to_frame().reset_index().rename(columns={'index':'weekday','weekday':'count'})
#Plotting bar chart
bars = alt.Chart(weekday_bin).mark_bar(color="darkorange").encode(
x='weekday',
y=alt.Y("count",title='Number of purchases')
)
#Adding data labels
text = bars.mark_text(
align='center',
baseline='middle',
dy=-7 ,
size=15,
).encode(
text='count',
tooltip=[alt.Tooltip('weekday'),
alt.Tooltip('count')]
)
#Combining both
(bars + text).properties(
width=800,
height=400,
title="Number of quanityt purchases across weekdays"
)
Insights
- The quantity purchased was highest towards the weekend (weekday 6, Friday), followed by another high on Tuesday
- Sunday has the lowest purchase quantity. Is the store even open on Sundays?
#Setting plot style
plt.figure(figsize = (15, 8))
plt.style.use('seaborn-white')
#Top 10 fast moving products
plt.subplot(1,2,1)
ax=sns.countplot(y="item", hue="year", data=groceries, palette="pastel",
order=groceries.item.value_counts().iloc[:10].index)
ax.tick_params(axis='x', labelsize=11, labelrotation=40)
ax.set_title('Top 10 Fast moving products',fontsize= 22)
ax.set_xlabel('Total # of items purchased',fontsize = 20)
ax.set_ylabel('Top 10 items', fontsize = 20)
plt.tight_layout()
#Bottom 10 fast moving products
plt.subplot(1,2,2)
ax=sns.countplot(y="item", hue="year", data=groceries, palette="pastel",
order=groceries.item.value_counts().iloc[-10:].index)
ax.tick_params(axis='x', labelsize=11, labelrotation=40)
ax.set_title('Bottom 10 Fast moving products',fontsize= 22)
ax.set_xlabel('Total # of items purchased',fontsize = 20)
ax.set_ylabel('Bottom 10 items', fontsize = 20)
plt.tight_layout()
Insights
- Whole milk is the top product purchased in both 2014 and 2015, whereas the lowest is the preservation products category, which nobody purchased in 2015
- Almost all of the top products saw a rise in 2015, except soda and bottled water
- Most of the bottom products never saw a rise in 2015, except whisky, chicken, bags and baby cosmetics
#Getting the top customers based on quantity purchased
top_customers=temp[['id', 'qty_purchased','year']].sort_values(by = 'qty_purchased',ascending = False).head(500)
#Converting the datatype of id and year
top_customers.id = top_customers.id.astype('category')
top_customers.year = top_customers.year.astype('category')
#Plotting
alt.Chart(top_customers).mark_bar(color="darkgreen").encode(
x='qty_purchased',
y=alt.Y('id', sort='-x'),
color='year',
tooltip=['id','qty_purchased']
).properties(height=400,width=600,title="Top Customers")
Insights
- Customer id 3180 tops the list and was a loyal customer in both years
- A few customers appear inconsistent: they purchased a lot in 2014 but not in 2015. Such consistency matters when estimating customer lifetime value, but since we only have two years of data we can't say much about any individual customer's lifetime value
Have you ever wandered around a supermarket and noticed that the sections and racks are laid out so that related products sit together? You can find bread and butter on nearby racks, and brushes and toothpaste on the same rack. These products are associated: if you buy a brush, the likelihood of you buying toothpaste is high. These are merchandising tactics to make you fill your basket with a product plus its associated items, thereby increasing sales revenue. Some businesses introduce a discount on the associated item, or bundle both products at a lower price, in order to make you buy the item together with the item associated with it.
Association rule mining is the technique used to unveil the association between items; a rule over the items we purchase is denoted as X->Y
Here X is the item we buy and Y is the item we are most likely to buy along with it (much like if->then). The two sides are also called
- X - Antecedent
- Y - Consequent
Association rule mining helps in designing the rules for the association of items. These rules are built with the help of three measures:
1.Support: It signifies the popularity of an item; if an item is bought infrequently it will be ignored in the association.
2.Confidence: It tells the likelihood of purchasing Y when X is bought. Sounds like a conditional probability. In fact it is! But it ignores the popularity (frequency) of Y; to overcome that we have lift.
3.Lift: It combines confidence with the consequent's support. A lift greater than 1 suggests that the presence of the antecedent increases the chance that the consequent occurs in the same transaction, while a lift below 1 indicates that purchasing the antecedent reduces the chance of purchasing the consequent in the same transaction.
For example, assume there are 100 customers; 10 of them bought milk, 8 bought butter and 6 bought both. We want to check the association bought milk => bought butter
- support = P(Milk & Butter) = 6/100 = 0.06
- confidence = support/P(Milk) = 0.06/0.10 = 0.6
- lift = confidence/P(Butter) = 0.6/0.08 = 7.5
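To make the arithmetic concrete, here is a minimal sketch that reproduces these three numbers from raw counts (the variable names are purely illustrative):

#Support, confidence and lift for the rule milk => butter, from raw counts
n_customers = 100   # total customers/transactions
n_milk = 10         # transactions containing milk (antecedent X)
n_butter = 8        # transactions containing butter (consequent Y)
n_both = 6          # transactions containing both

support = n_both / n_customers                  # P(X & Y)          -> 0.06
confidence = support / (n_milk / n_customers)   # P(Y | X)          -> 0.6
lift = confidence / (n_butter / n_customers)    # P(Y | X) / P(Y)   -> 7.5
print(support, confidence, lift)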
The Apriori algorithm uses frequent itemsets to derive association rules, relying on the assumptions that
- All subsets of a frequent itemset must also be frequent
- Conversely, if a subset is infrequent, every superset containing it is infrequent too

The algorithm works by setting a minimum support value and iterating level by level over candidate itemsets; itemsets (and all of their supersets) are dropped whenever their support falls below the threshold, until nothing more can be removed. A toy sketch of this pruning follows below.
Afterwards, the lift of the surviving itemsets (rules) is calculated, and rules below the lift threshold are eliminated, since the algorithm may take a long time if we keep every rule.
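Here is a minimal, self-contained sketch of that level-wise search on a toy set of transactions; it only illustrates the principle and is not the mlxtend implementation used later:

#Toy illustration of Apriori's level-wise pruning (not the mlxtend implementation)
from itertools import combinations

transactions = [{'milk', 'bread'}, {'milk', 'butter'},
                {'milk', 'bread', 'butter'}, {'bread', 'butter'}]
min_support = 0.5

def support(itemset):
    '''Fraction of transactions that contain every item in the itemset'''
    return sum(itemset <= t for t in transactions) / len(transactions)

#Level 1: frequent single items
frequent = {frozenset([i]) for t in transactions for i in t}
frequent = {s for s in frequent if support(s) >= min_support}

#Level k: candidates are unions of frequent (k-1)-itemsets; keep a candidate only if
#all of its (k-1)-subsets are frequent (downward closure) and its own support passes
k = 2
while frequent:
    candidates = {a | b for a in frequent for b in frequent if len(a | b) == k}
    next_frequent = {c for c in candidates
                     if all(frozenset(s) in frequent for s in combinations(c, k - 1))
                     and support(c) >= min_support}
    if not next_frequent:
        break
    print(k, next_frequent)
    frequent = next_frequent
    k += 1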
Before proceeding with Apriori we have to prepare the data in a sparse matrix format, where the products are the columns and the customer id is the index. We first group by id and item to get the quantity purchased, and then encode it with 0s and 1s.
#Creating sparse matrix
basket = (temp.groupby(['id', 'item'])['qty_purchased']
.sum().unstack().reset_index().fillna(0)
.set_index('id'))
#Encoding the quantity purchased
def encode(x):
    '''Encoding the quantity of products with 0s and 1s
    0: when qty is less than or equal to 0
    1: when qty is greater than or equal to 1'''
    if x <= 0:
        return 0
    if x >= 1:
        return 1
#Applying on our data
basket_sets = basket.applymap(encode)
basket_sets
item | Instant food products | UHT-milk | abrasive cleaner | artif. sweetener | baby cosmetics | bags | baking powder | bathroom cleaner | beef | berries | ... | turkey | vinegar | waffles | whipped/sour cream | whisky | white bread | white wine | whole milk | yogurt | zwieback |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
id | |||||||||||||||||||||
1000 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 |
1001 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
1002 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
1003 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1004 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4996 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4997 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 |
4998 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4999 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
5000 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3898 rows × 167 columns
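One optional tweak before mining: newer mlxtend versions prefer a boolean one-hot matrix, and converting the 0/1 frame to bool avoids the DeprecationWarning shown further below without changing any result:

#Optional: convert the 0/1 matrix to bool, which mlxtend prefers
basket_sets = basket_sets.astype(bool)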
Here we apply the Apriori algorithm to get all the frequent itemsets (with a minimum support threshold of 7%, i.e. min_support=0.07) and then apply the association_rules function to derive rules using the lift metric
#Apriori - 7% minimum support
frequent_itemsets = apriori(basket_sets, min_support=0.07, use_colnames=True)
#Association rules - using lift
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules.tail()
C:\Users\Yogeshwar\anaconda3\lib\site-packages\mlxtend\frequent_patterns\fpcommon.py:111: DeprecationWarning: DataFrames with non-bool types result in worse computationalperformance and their support might be discontinued in the future.Please use a DataFrame with bool type warnings.warn(
antecedents | consequents | antecedent support | consequent support | support | confidence | lift | leverage | conviction | |
---|---|---|---|---|---|---|---|---|---|
93 | (other vegetables, whole milk) | (yogurt) | 0.191380 | 0.282966 | 0.071832 | 0.375335 | 1.326434 | 0.017678 | 1.147870 |
94 | (yogurt, whole milk) | (other vegetables) | 0.150590 | 0.376603 | 0.071832 | 0.477002 | 1.266589 | 0.015119 | 1.191967 |
95 | (other vegetables) | (yogurt, whole milk) | 0.376603 | 0.150590 | 0.071832 | 0.190736 | 1.266589 | 0.015119 | 1.049608 |
96 | (yogurt) | (other vegetables, whole milk) | 0.282966 | 0.191380 | 0.071832 | 0.253853 | 1.326434 | 0.017678 | 1.083727 |
97 | (whole milk) | (other vegetables, yogurt) | 0.458184 | 0.120318 | 0.071832 | 0.156775 | 1.303003 | 0.016704 | 1.043235 |
rules
antecedents | consequents | antecedent support | consequent support | support | confidence | lift | leverage | conviction | |
---|---|---|---|---|---|---|---|---|---|
0 | (bottled beer) | (whole milk) | 0.158799 | 0.458184 | 0.085428 | 0.537964 | 1.174124 | 0.012669 | 1.172672 |
1 | (whole milk) | (bottled beer) | 0.458184 | 0.158799 | 0.085428 | 0.186450 | 1.174124 | 0.012669 | 1.033988 |
2 | (other vegetables) | (bottled water) | 0.376603 | 0.213699 | 0.093894 | 0.249319 | 1.166680 | 0.013414 | 1.047450 |
3 | (bottled water) | (other vegetables) | 0.213699 | 0.376603 | 0.093894 | 0.439376 | 1.166680 | 0.013414 | 1.111969 |
4 | (bottled water) | (rolls/buns) | 0.213699 | 0.349666 | 0.079271 | 0.370948 | 1.060863 | 0.004548 | 1.033832 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
93 | (other vegetables, whole milk) | (yogurt) | 0.191380 | 0.282966 | 0.071832 | 0.375335 | 1.326434 | 0.017678 | 1.147870 |
94 | (yogurt, whole milk) | (other vegetables) | 0.150590 | 0.376603 | 0.071832 | 0.477002 | 1.266589 | 0.015119 | 1.191967 |
95 | (other vegetables) | (yogurt, whole milk) | 0.376603 | 0.150590 | 0.071832 | 0.190736 | 1.266589 | 0.015119 | 1.049608 |
96 | (yogurt) | (other vegetables, whole milk) | 0.282966 | 0.191380 | 0.071832 | 0.253853 | 1.326434 | 0.017678 | 1.083727 |
97 | (whole milk) | (other vegetables, yogurt) | 0.458184 | 0.120318 | 0.071832 | 0.156775 | 1.303003 | 0.016704 | 1.043235 |
98 rows × 9 columns
rules.to_csv('rules_out.csv')
#Customizable function to change the lift and confidence
def rules_mod(lift,confidence):
    '''rules_mod is a function to control the rules
    based on lift and confidence thresholds'''
    return rules[ (rules['lift'] >= lift) &
                  (rules['confidence'] >= confidence) ]
#Calling function
a=rules_mod(0.7,0.2)
a
antecedents | consequents | antecedent support | consequent support | support | confidence | lift | leverage | conviction | |
---|---|---|---|---|---|---|---|---|---|
0 | (bottled beer) | (whole milk) | 0.158799 | 0.458184 | 0.085428 | 0.537964 | 1.174124 | 0.012669 | 1.172672 |
2 | (other vegetables) | (bottled water) | 0.376603 | 0.213699 | 0.093894 | 0.249319 | 1.166680 | 0.013414 | 1.047450 |
3 | (bottled water) | (other vegetables) | 0.213699 | 0.376603 | 0.093894 | 0.439376 | 1.166680 | 0.013414 | 1.111969 |
4 | (bottled water) | (rolls/buns) | 0.213699 | 0.349666 | 0.079271 | 0.370948 | 1.060863 | 0.004548 | 1.033832 |
5 | (rolls/buns) | (bottled water) | 0.349666 | 0.213699 | 0.079271 | 0.226706 | 1.060863 | 0.004548 | 1.016820 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
90 | (rolls/buns) | (other vegetables, whole milk) | 0.349666 | 0.191380 | 0.082093 | 0.234776 | 1.226753 | 0.015174 | 1.056710 |
92 | (other vegetables, yogurt) | (whole milk) | 0.120318 | 0.458184 | 0.071832 | 0.597015 | 1.303003 | 0.016704 | 1.344507 |
93 | (other vegetables, whole milk) | (yogurt) | 0.191380 | 0.282966 | 0.071832 | 0.375335 | 1.326434 | 0.017678 | 1.147870 |
94 | (yogurt, whole milk) | (other vegetables) | 0.150590 | 0.376603 | 0.071832 | 0.477002 | 1.266589 | 0.015119 | 1.191967 |
96 | (yogurt) | (other vegetables, whole milk) | 0.282966 | 0.191380 | 0.071832 | 0.253853 | 1.326434 | 0.017678 | 1.083727 |
84 rows × 9 columns
a.to_csv('specic_rule_out.csv')
#Setting up the style
plt.figure(figsize = (15, 15))
plt.style.use('seaborn-white')
#Plotting the relationship between the metrics
plt.subplot(221)
sns.scatterplot(x="support", y="confidence",data=rules)
plt.subplot(222)
sns.scatterplot(x="support", y="lift",data=rules)
plt.subplot(223)
sns.scatterplot(x="confidence", y="lift",data=rules)
plt.subplot(224)
sns.scatterplot(x="antecedent support", y="consequent support",data=rules)
<AxesSubplot:xlabel='antecedent support', ylabel='consequent support'>
Insights
- Support and confidence have a weak linear relationship, which means that the most frequent items have some other items associated with them
- When it comes to lift, the relationship with support flattens out once support goes beyond 0.10, and there is no clear relationship with confidence
- Antecedent support and consequent support show no linear relationship; if anything it is inverse: as consequent support increases, antecedent support fades out. Can we read this as: when the purchased quantity of butter increases, the quantity of bread fades?
Here we draw a network diagram for a specified number of rules, where we can see the antecedents and consequents connected to each rule
'''a function to build a network diagram connecting antecedents and consequents'''
def draw_graph(rules, rules_to_show):
    import networkx as nx
    G1 = nx.DiGraph()
    color_map=[]
    N = 50
    colors = np.random.rand(N)
    strs=['R0', 'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'R9', 'R10', 'R11']
    #Add one node per rule, then connect antecedents -> rule -> consequents
    for i in range(rules_to_show):
        G1.add_nodes_from(["R"+str(i)])
        for a in rules.iloc[i]['antecedents']:
            G1.add_nodes_from([a])
            G1.add_edge(a, "R"+str(i), color=colors[i], weight=2)
        for c in rules.iloc[i]['consequents']:
            G1.add_nodes_from([c])
            G1.add_edge("R"+str(i), c, color=colors[i], weight=2)
    #Colour rule nodes yellow and item nodes green
    for node in G1:
        found_a_string = False
        for item in strs:
            if node==item:
                found_a_string = True
        if found_a_string:
            color_map.append('yellow')
        else:
            color_map.append('green')
    edges = G1.edges()
    colors = [G1[u][v]['color'] for u,v in edges]
    weights = [G1[u][v]['weight'] for u,v in edges]
    pos = nx.spring_layout(G1, k=16, scale=1)
    nx.draw(G1, pos, node_color=color_map, edge_color=colors, width=weights, with_labels=False)
    for p in pos:  # raise text positions
        pos[p][1] += 0.07
    nx.draw_networkx_labels(G1, pos)
    plt.show()
#Calling function with 10 rules
draw_graph(rules, 10)
Insights
- It is simpler to visualize the rules with a network diagram than to read them in tabular format
- The arrows coming into a rule (yellow circle) are from its antecedents, and the arrows going out of a rule circle point to its consequents.
I have discovered the associations between items, but what good is that if we don't know the strength of each relationship? Below, the rules are reshaped into a matrix of lift values and plotted as a heatmap.
rules['lhs items'] = rules['antecedents'].apply(lambda x:len(x) )
rules[rules['lhs items']>1].sort_values('lift', ascending=False).head()
# Replace frozen sets with strings
rules['antecedents_'] = rules['antecedents'].apply(lambda a: ','.join(list(a)))
rules['consequents_'] = rules['consequents'].apply(lambda a: ','.join(list(a)))
# Transform the DataFrame of rules into a matrix using the lift metric
pivot = rules[rules['lhs items']>1].pivot(index = 'antecedents_',
columns = 'consequents_', values= 'lift')
# Generate a heatmap with annotations on and the colorbar off
sns.heatmap(pivot, annot = True)
plt.yticks(rotation=0)
plt.xticks(rotation=90)
plt.show()
Insights
- We have a strong relationship between yogurt, whole milk and other vegetables
- Rolls/buns are strongly associated with whole milk.
In this analysis the model wasn't evaluated against any test data; the following viewpoints should be added to the evaluation:
- The model was built with the Apriori algorithm
- The Apriori algorithm is considered effective for association rule mining and gives consistent results on the same data every time
- Detection of anomalies (rare items) can help in pushing such an item together with its associated items
- Balancing the importance of support and confidence with lift
- Model interpretability through visualizations