import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the customer orders CSV into the working DataFrame, then print its
# schema (column names, non-null counts, dtypes, memory usage).
df = pd.read_csv(filepath_or_buffer='Customers EDA+stats.csv')
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 500 entries, 0 to 499 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 order_id 500 non-null object 1 customer_id 500 non-null object 2 date 500 non-null object 3 nearest_warehouse 500 non-null object 4 shopping_cart 500 non-null object 5 order_price 500 non-null int64 6 delivery_charges 500 non-null float64 7 customer_lat 500 non-null float64 8 customer_long 500 non-null float64 9 coupon_discount 500 non-null int64 10 order_total 500 non-null float64 11 season 500 non-null object 12 is_expedited_delivery 500 non-null bool 13 distance_to_nearest_warehouse 500 non-null float64 14 latest_customer_review 499 non-null object 15 is_happy_customer 500 non-null bool dtypes: bool(2), float64(5), int64(2), object(7) memory usage: 55.8+ KB
# Preview the first rows to sanity-check column contents after loading.
df.head()
order_id | customer_id | date | nearest_warehouse | shopping_cart | order_price | delivery_charges | customer_lat | customer_long | coupon_discount | order_total | season | is_expedited_delivery | distance_to_nearest_warehouse | latest_customer_review | is_happy_customer | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | ORD182494 | ID6197211592 | 2019-06-22 | Thompson | [('Lucent 330S', 1), ('Thunder line', 2), ('iS... | 12200 | 79.89 | -37.815105 | 144.932843 | 10 | 11059.89 | Winter | True | 1.2800 | perfect phone and trusted seller. phone itself... | True |
1 | ORD395518 | ID0282825849 | 2019-12-29 | Thompson | [('Thunder line', 1), ('Universe Note', 2)] | 9080 | 62.71 | -37.802736 | 144.951118 | 0 | 9142.71 | Summer | False | 1.1621 | it keeps dropping calls the wifi don't work th... | False |
2 | ORD494479 | ID0579391891 | 2019-03-02 | Nickolson | [('Thunder line', 1), ('pearTV', 2)] | 10670 | 65.87 | -37.821302 | 144.957581 | 10 | 9668.87 | Autumn | False | 1.0949 | five stars this is a great cheap phone. | True |
3 | ORD019224 | ID4544561904 | 2019-01-12 | Nickolson | [('Universe Note', 1), ('Alcon 10', 2), ('Oliv... | 24800 | 57.61 | -37.811416 | 144.973073 | 15 | 21137.61 | Summer | False | 0.8571 | charger did not fit the charger didn't fit. | False |
4 | ORD104032 | ID6231506320 | 2019-11-28 | Nickolson | [('Universe Note', 1), ('Olivia x460', 1), ('i... | 9145 | 75.54 | 37.823859 | 144.969892 | 25 | 6934.29 | Spring | False | 0.5867 | four stars good | True |
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; useful for spotting outliers and implausible values.
df.describe()
order_price | delivery_charges | customer_lat | customer_long | coupon_discount | order_total | distance_to_nearest_warehouse | |
---|---|---|---|---|---|---|---|
count | 500.000000 | 500.000000 | 500.000000 | 500.000000 | 500.000000 | 5.000000e+02 | 500.000000 |
mean | 25522.216000 | 76.658200 | -35.835234 | 144.969494 | 10.890000 | 3.920967e+04 | 2.204224 |
std | 86333.729169 | 14.481465 | 12.045393 | 0.022720 | 8.649134 | 2.741940e+05 | 8.812416 |
min | 585.000000 | 46.350000 | -37.827123 | 144.924967 | 0.000000 | 6.392900e+02 | 0.107800 |
25% | 7050.000000 | 65.982500 | -37.818222 | 144.953488 | 5.000000 | 6.454735e+03 | 0.751425 |
50% | 12807.500000 | 76.310000 | -37.812165 | 144.965357 | 10.000000 | 1.129396e+04 | 1.030100 |
75% | 20360.000000 | 82.555000 | -37.805364 | 144.983985 | 15.000000 | 1.811919e+04 | 1.408625 |
max | 947691.000000 | 114.040000 | 37.826339 | 145.019837 | 25.000000 | 5.688270e+06 | 94.973400 |
# Count missing values per column.
df.isnull().sum()
order_id 0 customer_id 0 date 0 nearest_warehouse 0 shopping_cart 0 order_price 0 delivery_charges 0 customer_lat 0 customer_long 0 coupon_discount 0 order_total 0 season 0 is_expedited_delivery 0 distance_to_nearest_warehouse 0 latest_customer_review 1 is_happy_customer 0 dtype: int64
# Replace missing review text with a placeholder string, then re-count
# missing values per column to confirm none remain.
review_text = df['latest_customer_review']
df['latest_customer_review'] = review_text.fillna('No feedBack')
df.isna().sum()
order_id 0 customer_id 0 date 0 nearest_warehouse 0 shopping_cart 0 order_price 0 delivery_charges 0 customer_lat 0 customer_long 0 coupon_discount 0 order_total 0 season 0 is_expedited_delivery 0 distance_to_nearest_warehouse 0 latest_customer_review 0 is_happy_customer 0 dtype: int64
# List the distinct warehouse names to check for inconsistent spellings/casing.
df['nearest_warehouse'].unique()
array(['Thompson', 'Nickolson', 'Bakers', 'nickolson', 'thompson'], dtype=object)
# Normalize warehouse names to title case so variants that differ only in
# capitalization collapse into one category, then list the distinct names.
df['nearest_warehouse'] = df['nearest_warehouse'].str.title()
df['nearest_warehouse'].unique()
array(['Thompson', 'Nickolson', 'Bakers'], dtype=object)
# Count fully duplicated rows (rows identical to an earlier row in every column).
df.duplicated().sum()
0
# Histogram of order prices using 20 equal-width bins.
price_axes = df['order_price'].plot.hist(bins=20)
price_axes.set_title("Order Price distributed ")
price_axes.set_xlabel("Order Price ")
price_axes.set_ylabel("Frequency")
plt.show()
# Bar chart of how many orders were served by each warehouse.
warehouse_counts = df['nearest_warehouse'].value_counts()
warehouse_counts.plot(kind='bar', color=['r', 'b', 'g'])
plt.title("Quantity of the warehouses")
plt.xlabel("Warehouse")
plt.ylabel("Counts")
# Render and finish this figure here; without it, the next pyplot call
# (the pie chart) would draw onto these same axes when run as a script.
plt.show()
Text(0, 0.5, 'Counts')
# Pie chart of happy vs. not-happy customers.
# value_counts() orders its result by frequency, not by value, so the slice
# labels are derived from the resulting index rather than hard-coded; this
# keeps each label attached to the right slice whichever class is larger,
# and also works when only one class is present.
happy_counts = df['is_happy_customer'].value_counts()
label_for_flag = {True: 'happy', False: 'Not happy'}
slice_labels = [label_for_flag[bool(flag)] for flag in happy_counts.index]
plt.pie(happy_counts, labels=slice_labels, autopct="%.1f%%")
plt.title("is_happy_customer or not")
plt.show()
# Parse order dates and derive the calendar month of each order.
# errors='coerce' turns unparseable dates into NaT; those rows get a NaN
# month and are dropped by the groupby below.
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df['month'] = df['date'].dt.month

# Number of happy customers per month. Summing the boolean flag counts the
# True values; the previous estimator (len) counted every order in the month
# regardless of the flag, so the chart did not reflect happiness at all.
happy_per_month = df.groupby('month', as_index=False)['is_happy_customer'].sum()
sns.barplot(x='month', y='is_happy_customer', data=happy_per_month)
plt.suptitle('is_happy_customer(Across Months)')
# Axis labels now match the plotted variables (they were swapped before).
plt.xlabel('Month')
plt.ylabel('Happy customers (count)')
plt.show()
# Mean order price for happy vs. not-happy customers.
price_by_mood_axes = sns.barplot(
    data=df,
    x='is_happy_customer',
    y='order_price',
    estimator=lambda prices: prices.mean(),
)
price_by_mood_axes.set_title("Avg Order Price by Customer happiness")
price_by_mood_axes.set_xlabel("Customer happiness")
price_by_mood_axes.set_ylabel("Avg Order Price")
plt.show()