import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import seaborn as sb
from matplotlib.cbook import boxplot_stats
from matplotlib import cbook
from matplotlib.pyplot import figure
# Load the input tables from the working directory: three CSV files
# (calendar, reviews, listings) and one Excel workbook.
df_calendar=pd.read_csv('calendar.csv')
df_reviews=pd.read_csv('reviews.csv')
df_listings=pd.read_csv('listings.csv')
df_normal = pd.read_excel('Normal-data.xlsx')
"""Filling the null values of rating by mean"""

df_listings["review_scores_rating"] = df_listings["review_scores_rating"].fillna(df_listings["review_scores_rating"].mean())
"""Function to remove $ sign"""

def remove_sign(x, sign):
    """Strip a currency sign and thousands separators from a price string.

    Args:
        x: Cell value. Strings such as ``'$1,200.00'`` are converted; any
            non-string value (e.g. a float or NaN) is returned unchanged.
        sign: Substring to remove before conversion, e.g. ``'$'``.

    Returns:
        The parsed ``float`` for string input, otherwise ``x`` itself.

    Raises:
        ValueError: If the cleaned string is not a valid float literal.
    """
    # isinstance (not an exact type check) so str subclasses such as
    # numpy.str_ are converted too instead of slipping through as text.
    if isinstance(x, str):
        return float(x.replace(sign, '').replace(',', ''))
    return x
df_listings.price = df_listings.price.apply(remove_sign,sign='$')
"""Boxplot of price across property type"""

sb.boxplot(y='price', x='property_type',data=df_listings)
plt.xticks(rotation=90)
plt.ylabel('Price ($)')
Text(0, 0.5, 'Price ($)')
# Keep only the listings whose property type is exactly 'Apartment'.
is_apartment = df_listings['property_type'] == 'Apartment'
df_listings_apt = df_listings[is_apartment]
df_listings_apt.price
1        65.0
2        65.0
6       100.0
9       229.0
13      150.0
        ...  
3580     69.0
3581    150.0
3582    198.0
3583     65.0
3584     65.0
Name: price, Length: 2612, dtype: float64
# Tukey fences on apartment prices: drop rows outside Q1/Q3 -/+ 1.5 * IQR.
df = df_listings_apt
Q1, Q3 = df.price.quantile(0.25), df.price.quantile(0.75)
IQR = Q3 - Q1
too_low = df.price < (Q1 - 1.5 * IQR)
too_high = df.price > (Q3 + 1.5 * IQR)
# Negated OR (rather than a between-check) so rows with a missing price
# are kept, exactly as a failed comparison on NaN implies.
df_final = df[~(too_low | too_high)]
"""Boxplot of price for apt after IQR implementation"""
figure(figsize=(6, 8), dpi=80)
sb.boxplot(y='price', x='property_type',data=df_final)
plt.xticks(rotation=90)
plt.ylabel('Price ($)')
Text(0, 0.5, 'Price ($)')
stat = boxplot_stats(df_final.price)
stat
[{'mean': 166.03392504930966,
  'iqr': 125.0,
  'cilo': 146.10218983747075,
  'cihi': 153.89781016252925,
  'whishi': 402.0,
  'whislo': 10.0,
  'fliers': array([417.]),
  'q1': 95.0,
  'med': 150.0,
  'q3': 220.0}]
len(stat[0]['fliers'])
1
 

IQR multiplier of 1.7 (fences at roughly 3 sigma for normally distributed data)

# Same fence-based filter as before, but with a 1.7 * IQR margin.
df = df_listings_apt
# Alternative input: df = df_normal
Q1, Q3 = df.price.quantile(0.25), df.price.quantile(0.75)
IQR = Q3 - Q1
too_low = df.price < (Q1 - 1.7 * IQR)
too_high = df.price > (Q3 + 1.7 * IQR)
df_final = df[~(too_low | too_high)]
"""Boxplot of price for apt after IQR implementation"""

sb.boxplot(y='price', x='property_type',data=df_final)
# sb.boxplot(y='price',data=df_final)
plt.xticks(rotation=90)
plt.ylabel('Price ($)')
Text(0, 0.5, 'Price ($)')
stat = boxplot_stats(df_final.price)
stat
[{'mean': 167.74911799294395,
  'iqr': 129.5,
  'cilo': 148.97455227210247,
  'cihi': 157.02544772789753,
  'whishi': 417.0,
  'whislo': 10.0,
  'fliers': array([429., 450., 450., 450., 449., 425., 450., 450., 450., 425., 450.,
         450., 425., 425., 425., 429.]),
  'q1': 95.5,
  'med': 153.0,
  'q3': 225.0}]
len(stat[0]['fliers'])
16

2 sigma and 3 sigma

# Keep apartment prices within two standard deviations of the mean.
df = df_listings_apt
# Alternative input: df = df_normal
two_sd = df.price.std() * 2
three_sd = df.price.std() * 3
below = df.price < (np.mean(df.price) - two_sd)
above = df.price > (np.mean(df.price) + two_sd)
df_final = df[~(below | above)]
# For the 3-sigma variant, build the two masks with three_sd instead.
# Display the upper 3-sigma bound for reference.
np.mean(df.price) + three_sd
644.9362244380891
figure(figsize=(6, 8), dpi=80)
sb.boxplot(y='price',data=df_final)
plt.xticks(rotation=90)
plt.ylabel('Price ($)')
Text(0, 0.5, 'Price ($)')
stat = boxplot_stats(df_final.price)
stat
[{'mean': 168.21174168297455,
  'iqr': 129.0,
  'cilo': 149.9932346400486,
  'cihi': 158.0067653599514,
  'whishi': 417.0,
  'whislo': 10.0,
  'fliers': array([429., 450., 450., 450., 449., 425., 479., 450., 464., 450., 451.,
         450., 425., 450., 450., 425., 425., 425., 429., 459.]),
  'q1': 96.0,
  'med': 154.0,
  'q3': 225.0}]
len(stat[0]['fliers'])
20

Z-score

df = df_listings_apt
df_price = pd.DataFrame(df.price)
df_price
price
1 65.0
2 65.0
6 100.0
9 229.0
13 150.0
... ...
3580 69.0
3581 150.0
3582 198.0
3583 65.0
3584 65.0

2612 rows × 1 columns

# Standard score of each price: (price - mean) / std, using the values
# np.mean and np.std return for the price column.
m, s = np.mean(df_price.price), np.std(df_price.price)
centered = df_price.price - m
df_price['Z-score'] = centered / s
df_price
price Z-score
1 65.0 -0.758079
2 65.0 -0.758079
6 100.0 -0.531238
9 229.0 0.304830
13 150.0 -0.207181
... ... ...
3580 69.0 -0.732154
3581 150.0 -0.207181
3582 198.0 0.103914
3583 65.0 -0.758079
3584 65.0 -0.758079

2612 rows × 2 columns

# Rows whose Z-score magnitude exceeds 3 are treated as outliers.
df = df_price
z_magnitude = df['Z-score'].abs()
df_outlier = df[z_magnitude > 3]
df_outlier
price Z-score
391 725.0 3.519480
408 872.0 4.472209
793 1400.0 7.894255
889 650.0 3.033394
1085 800.0 4.005566
1234 650.0 3.033394
1262 1250.0 6.922083
1280 695.0 3.325045
1399 975.0 5.139767
1764 1000.0 5.301796
1854 769.0 3.804650
1896 1235.0 6.824866
1932 849.0 4.323142
1950 1345.0 7.537792
2204 1372.0 7.712783
2285 4000.0 24.745242
2394 750.0 3.681508
2448 1275.0 7.084112
2485 999.0 5.295314
3055 3000.0 18.264093
3096 1250.0 6.922083
df_outlier.shape
(21, 2)
df = df.drop(index = df_outlier.index)
figure(figsize=(6, 8), dpi=80)
sb.boxplot(y='price',data=df)
plt.xticks(rotation=90)
plt.ylabel('Price ($)')
Text(0, 0.5, 'Price ($)')
# Summarise the Z-score-filtered frame (`df`). `df_final` still holds the
# earlier 2-sigma result, so passing it here only repeated those statistics.
# NOTE: the recorded output that follows was produced from `df_final`;
# re-run this cell to refresh it.
stat = boxplot_stats(df.price)
stat
[{'mean': 168.21174168297455,
  'iqr': 129.0,
  'cilo': 149.9932346400486,
  'cihi': 158.0067653599514,
  'whishi': 417.0,
  'whislo': 10.0,
  'fliers': array([429., 450., 450., 450., 449., 425., 479., 450., 464., 450., 451.,
         450., 425., 450., 450., 425., 425., 425., 429., 459.]),
  'q1': 96.0,
  'med': 154.0,
  'q3': 225.0}]
len(stat[0]['fliers'])
20

Check the Z-score distribution for outliers

figure(figsize=(15, 8), dpi=80)
plt.hist(df_price['Z-score'], bins=50)
(array([579., 846., 637., 359., 113.,  21.,  23.,  13.,   4.,   3.,   2.,
          0.,   3.,   0.,   0.,   4.,   1.,   2.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   1.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   1.]),
 array([-1.11454172, -0.59734605, -0.08015037,  0.4370453 ,  0.95424098,
         1.47143665,  1.98863233,  2.50582801,  3.02302368,  3.54021936,
         4.05741503,  4.57461071,  5.09180639,  5.60900206,  6.12619774,
         6.64339341,  7.16058909,  7.67778476,  8.19498044,  8.71217612,
         9.22937179,  9.74656747, 10.26376314, 10.78095882, 11.2981545 ,
        11.81535017, 12.33254585, 12.84974152, 13.3669372 , 13.88413287,
        14.40132855, 14.91852423, 15.4357199 , 15.95291558, 16.47011125,
        16.98730693, 17.50450261, 18.02169828, 18.53889396, 19.05608963,
        19.57328531, 20.09048098, 20.60767666, 21.12487234, 21.64206801,
        22.15926369, 22.67645936, 23.19365504, 23.71085072, 24.22804639,
        24.74524207]),
 <BarContainer object of 50 artists>)
df_price['Z-score'].mean()
-8.373427865664466e-17

Modified Z-score

df = df_listings_apt
df_price = pd.DataFrame(df.price)
df_price
price
1 65.0
2 65.0
6 100.0
9 229.0
13 150.0
... ...
3580 69.0
3581 150.0
3582 198.0
3583 65.0
3584 65.0

2612 rows × 1 columns

# Recompute the plain Z-score column on the freshly rebuilt price frame.
m, s = np.mean(df_price.price), np.std(df_price.price)
centered = df_price.price - m
df_price['Z-score'] = centered / s
df_price
price Z-score
1 65.0 -0.758079
2 65.0 -0.758079
6 100.0 -0.531238
9 229.0 0.304830
13 150.0 -0.207181
... ... ...
3580 69.0 -0.732154
3581 150.0 -0.207181
3582 198.0 0.103914
3583 65.0 -0.758079
3584 65.0 -0.758079

2612 rows × 2 columns

# Distance of every price from the median price.
m = np.median(df_price.price)
deviation = df_price.price - m
df_price['AD (Absolute Deviation)'] = deviation.abs()
df_price
price Z-score AD (Absolute Deviation)
1 65.0 -0.758079 94.0
2 65.0 -0.758079 94.0
6 100.0 -0.531238 59.0
9 229.0 0.304830 70.0
13 150.0 -0.207181 9.0
... ... ... ...
3580 69.0 -0.732154 90.0
3581 150.0 -0.207181 9.0
3582 198.0 0.103914 39.0
3583 65.0 -0.758079 94.0
3584 65.0 -0.758079 94.0

2612 rows × 3 columns

# The 0.6745 scaling applied to the modified Z-score is defined for the
# *median* absolute deviation; averaging the deviations instead lets the
# extreme prices inflate the scale and mask outliers.
# NOTE: the recorded outputs that follow were produced with the mean-based
# value; re-run the cells to refresh them.
MAD = np.median(df_price['AD (Absolute Deviation)'])
MAD
84.66117917304747
df_price['Modified Z-score'] = (0.6745*df_price['AD (Absolute Deviation)'])/MAD
df_price
price Z-score AD (Absolute Deviation) Modified Z-score
1 65.0 -0.758079 94.0 0.748903
2 65.0 -0.758079 94.0 0.748903
6 100.0 -0.531238 59.0 0.470056
9 229.0 0.304830 70.0 0.557694
13 150.0 -0.207181 9.0 0.071703
... ... ... ... ...
3580 69.0 -0.732154 90.0 0.717035
3581 150.0 -0.207181 9.0 0.071703
3582 198.0 0.103914 39.0 0.310715
3583 65.0 -0.758079 94.0 0.748903
3584 65.0 -0.758079 94.0 0.748903

2612 rows × 4 columns

df_outlier = df_price[df_price['Modified Z-score'] >= 3.5]
df_outlier
price Z-score AD (Absolute Deviation) Modified Z-score
391 725.0 3.519480 566.0 4.509351
408 872.0 4.472209 713.0 5.680508
727 600.0 2.709336 441.0 3.513470
793 1400.0 7.894255 1241.0 9.887111
889 650.0 3.033394 491.0 3.911822
894 625.0 2.871365 466.0 3.712646
982 600.0 2.709336 441.0 3.513470
1085 800.0 4.005566 641.0 5.106880
1234 650.0 3.033394 491.0 3.911822
1262 1250.0 6.922083 1091.0 8.692054
1280 695.0 3.325045 536.0 4.270340
1399 975.0 5.139767 816.0 6.501114
1402 600.0 2.709336 441.0 3.513470
1516 600.0 2.709336 441.0 3.513470
1764 1000.0 5.301796 841.0 6.700291
1854 769.0 3.804650 610.0 4.859902
1896 1235.0 6.824866 1076.0 8.572548
1932 849.0 4.323142 690.0 5.497266
1950 1345.0 7.537792 1186.0 9.448923
1982 600.0 2.709336 441.0 3.513470
2192 600.0 2.709336 441.0 3.513470
2204 1372.0 7.712783 1213.0 9.664034
2285 4000.0 24.745242 3841.0 30.601446
2394 750.0 3.681508 591.0 4.708528
2405 603.0 2.728780 444.0 3.537371
2432 603.0 2.728780 444.0 3.537371
2448 1275.0 7.084112 1116.0 8.891230
2485 999.0 5.295314 840.0 6.692324
3055 3000.0 18.264093 2841.0 22.634394
3062 603.0 2.728780 444.0 3.537371
3064 600.0 2.709336 441.0 3.513470
3096 1250.0 6.922083 1091.0 8.692054
3102 603.0 2.728780 444.0 3.537371
df_outlier.shape
(33, 4)
df = df.drop(index = df_outlier.index)
figure(figsize=(6, 8), dpi=80)
sb.boxplot(y='price',data=df)
plt.xticks(rotation=90)
plt.ylabel('Price ($)')
Text(0, 0.5, 'Price ($)')
# Summarise the frame filtered by the modified Z-score (`df`). `df_final`
# is left over from the 2-sigma section and does not reflect this filter.
# NOTE: the recorded output that follows was produced from `df_final`;
# re-run this cell to refresh it.
stat = boxplot_stats(df.price)
stat
[{'mean': 168.21174168297455,
  'iqr': 129.0,
  'cilo': 149.9932346400486,
  'cihi': 158.0067653599514,
  'whishi': 417.0,
  'whislo': 10.0,
  'fliers': array([429., 450., 450., 450., 449., 425., 479., 450., 464., 450., 451.,
         450., 425., 450., 450., 425., 425., 425., 429., 459.]),
  'q1': 96.0,
  'med': 154.0,
  'q3': 225.0}]
len(stat[0]['fliers'])
20