In [144]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [119]:
# Load the customer orders dataset from the CSV file next to this notebook.
csv_path = 'Customers EDA+stats.csv'
df = pd.read_csv(csv_path)
In [120]:
# Print the schema overview: column names, non-null counts, dtypes and memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   order_id                       500 non-null    object 
 1   customer_id                    500 non-null    object 
 2   date                           500 non-null    object 
 3   nearest_warehouse              500 non-null    object 
 4   shopping_cart                  500 non-null    object 
 5   order_price                    500 non-null    int64  
 6   delivery_charges               500 non-null    float64
 7   customer_lat                   500 non-null    float64
 8   customer_long                  500 non-null    float64
 9   coupon_discount                500 non-null    int64  
 10  order_total                    500 non-null    float64
 11  season                         500 non-null    object 
 12  is_expedited_delivery          500 non-null    bool   
 13  distance_to_nearest_warehouse  500 non-null    float64
 14  latest_customer_review         499 non-null    object 
 15  is_happy_customer              500 non-null    bool   
dtypes: bool(2), float64(5), int64(2), object(7)
memory usage: 55.8+ KB
In [121]:
# Preview the first five rows to get a feel for the raw values.
df.head(n=5)
Out[121]:
order_id customer_id date nearest_warehouse shopping_cart order_price delivery_charges customer_lat customer_long coupon_discount order_total season is_expedited_delivery distance_to_nearest_warehouse latest_customer_review is_happy_customer
0 ORD182494 ID6197211592 2019-06-22 Thompson [('Lucent 330S', 1), ('Thunder line', 2), ('iS... 12200 79.89 -37.815105 144.932843 10 11059.89 Winter True 1.2800 perfect phone and trusted seller. phone itself... True
1 ORD395518 ID0282825849 2019-12-29 Thompson [('Thunder line', 1), ('Universe Note', 2)] 9080 62.71 -37.802736 144.951118 0 9142.71 Summer False 1.1621 it keeps dropping calls the wifi don't work th... False
2 ORD494479 ID0579391891 2019-03-02 Nickolson [('Thunder line', 1), ('pearTV', 2)] 10670 65.87 -37.821302 144.957581 10 9668.87 Autumn False 1.0949 five stars this is a great cheap phone. True
3 ORD019224 ID4544561904 2019-01-12 Nickolson [('Universe Note', 1), ('Alcon 10', 2), ('Oliv... 24800 57.61 -37.811416 144.973073 15 21137.61 Summer False 0.8571 charger did not fit the charger didn't fit. False
4 ORD104032 ID6231506320 2019-11-28 Nickolson [('Universe Note', 1), ('Olivia x460', 1), ('i... 9145 75.54 37.823859 144.969892 25 6934.29 Spring False 0.5867 four stars good True
In [122]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()
Out[122]:
order_price delivery_charges customer_lat customer_long coupon_discount order_total distance_to_nearest_warehouse
count 500.000000 500.000000 500.000000 500.000000 500.000000 5.000000e+02 500.000000
mean 25522.216000 76.658200 -35.835234 144.969494 10.890000 3.920967e+04 2.204224
std 86333.729169 14.481465 12.045393 0.022720 8.649134 2.741940e+05 8.812416
min 585.000000 46.350000 -37.827123 144.924967 0.000000 6.392900e+02 0.107800
25% 7050.000000 65.982500 -37.818222 144.953488 5.000000 6.454735e+03 0.751425
50% 12807.500000 76.310000 -37.812165 144.965357 10.000000 1.129396e+04 1.030100
75% 20360.000000 82.555000 -37.805364 144.983985 15.000000 1.811919e+04 1.408625
max 947691.000000 114.040000 37.826339 145.019837 25.000000 5.688270e+06 94.973400
In [123]:
# Count missing values in every column.
df.isna().sum()
Out[123]:
order_id                         0
customer_id                      0
date                             0
nearest_warehouse                0
shopping_cart                    0
order_price                      0
delivery_charges                 0
customer_lat                     0
customer_long                    0
coupon_discount                  0
order_total                      0
season                           0
is_expedited_delivery            0
distance_to_nearest_warehouse    0
latest_customer_review           1
is_happy_customer                0
dtype: int64
In [124]:
# Replace missing review text with an explicit placeholder so the column has no nulls.
reviews = df['latest_customer_review']
df['latest_customer_review'] = reviews.where(reviews.notna(), 'No feedBack')
In [125]:
# Re-count missing values per column to confirm the fill left no nulls behind.
df.isna().sum()
Out[125]:
order_id                         0
customer_id                      0
date                             0
nearest_warehouse                0
shopping_cart                    0
order_price                      0
delivery_charges                 0
customer_lat                     0
customer_long                    0
coupon_discount                  0
order_total                      0
season                           0
is_expedited_delivery            0
distance_to_nearest_warehouse    0
latest_customer_review           0
is_happy_customer                0
dtype: int64
In [126]:
# List the distinct warehouse names to spot inconsistent spellings or casing.
pd.unique(df['nearest_warehouse'])
Out[126]:
array(['Thompson', 'Nickolson', 'Bakers', 'nickolson', 'thompson'],
      dtype=object)
In [145]:
# Normalise warehouse names to Title Case so 'nickolson' and 'Nickolson' collapse
# into a single category.
df['nearest_warehouse'] = df['nearest_warehouse'].str.title()
In [146]:
# Verify the casing fix: list the distinct warehouse names again.
pd.unique(df['nearest_warehouse'])
Out[146]:
array(['Thompson', 'Nickolson', 'Bakers'], dtype=object)
In [128]:
# Count rows that are exact duplicates of an earlier row.
duplicate_mask = df.duplicated()
duplicate_mask.sum()
Out[128]:
0
In [129]:
# Distribution of order prices.
# order_price is heavily right-skewed (75th percentile ~20k, max ~948k), so 20
# equal-width bins would place almost every order in the first bar. Use 20
# log-spaced bins on a logarithmic x-axis so the bulk of the data is visible.
prices = df['order_price']
positive_prices = prices[prices > 0]  # a log axis cannot represent values <= 0
log_bins = np.geomspace(positive_prices.min(), positive_prices.max(), 21)  # 21 edges -> 20 bins

plt.hist(positive_prices, bins=log_bins)
plt.xscale('log')
plt.title("Order Price distributed ")
plt.xlabel("Order Price (log scale)")
plt.ylabel("Frequency")
plt.show()
In [130]:
# Bar chart of the number of orders per nearest warehouse.
warehouse_counts = df['nearest_warehouse'].value_counts()
ax = warehouse_counts.plot(kind='bar', color=['r','b','g'])
ax.set_title("Quantity of the warehouses")
ax.set_xlabel("Warehouse")
ax.set_ylabel("Counts")
Out[130]:
Text(0, 0.5, 'Counts')
In [131]:
# Pie chart of happy vs. not-happy customers.
# value_counts() orders its result by frequency, so the wedge order depends on
# the data. Derive each label from the index value instead of assuming that
# True (happy) always comes first.
happy_counts = df['is_happy_customer'].value_counts()
label_for_flag = {True: 'happy', False: 'Not happy'}
wedge_labels = [label_for_flag[bool(flag)] for flag in happy_counts.index]

plt.pie(happy_counts, labels=wedge_labels, autopct="%.1f%%")
plt.title("is_happy_customer or not")
plt.show()
In [137]:
# Parse the order date and derive the calendar month of each order.
df['date'] = pd.to_datetime(df['date'], errors='coerce')  # unparseable dates become NaT
df['month'] = df['date'].dt.month

# Share of happy customers per month.
# The flag is cast to float so seaborn's default mean estimator yields the
# fraction of happy customers. The previous estimator (len) only counted orders
# per month and ignored the happiness flag entirely.
monthly_happiness = df.assign(is_happy_customer=df['is_happy_customer'].astype(float))
sns.barplot(x='month', y='is_happy_customer', data=monthly_happiness)
plt.suptitle('is_happy_customer(Across Months)')
# Month is on the x-axis and happiness on the y-axis (the labels were swapped before).
plt.xlabel('Month')
plt.ylabel('Share of happy customers')
plt.show()
In [143]:
# Compare the mean order price between the two values of is_happy_customer.
fig, ax = plt.subplots()
sns.barplot(
    data=df,
    x='is_happy_customer',
    y='order_price',
    estimator=lambda prices: prices.mean(),
    ax=ax,
)
ax.set_title("Avg Order Price by Customer happiness")
ax.set_xlabel("Customer happiness")
ax.set_ylabel("Avg Order Price")
plt.show()
In [ ]: