Outlier Detection (Part 1)
IQR, Standard Deviation, Z-score, and Modified Z-score
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import seaborn as sb
from matplotlib.cbook import boxplot_stats
from matplotlib import cbook
from matplotlib.pyplot import figure
# Read the three CSV tables and the Excel sheet from the working directory.
df_calendar = pd.read_csv('calendar.csv')
df_reviews = pd.read_csv('reviews.csv')
df_listings = pd.read_csv('listings.csv')
df_normal = pd.read_excel('Normal-data.xlsx')

"""Filling the null values of rating by mean"""
# Compute the column mean once, then use it for every missing rating.
rating_mean = df_listings["review_scores_rating"].mean()
df_listings["review_scores_rating"] = df_listings["review_scores_rating"].fillna(rating_mean)
"""Function to remove $ sign"""
def remove_sign(x, sign):
    """Convert a currency-formatted string to a float.

    Args:
        x: Value to clean. For strings, every occurrence of ``sign`` and
            every comma is removed before conversion. Any non-string value
            (a number, NaN, None, ...) is returned unchanged.
        sign: Substring to strip from ``x``, for example ``'$'``.

    Returns:
        The parsed ``float`` when ``x`` is a string, otherwise ``x`` itself.

    Raises:
        ValueError: If the cleaned string cannot be parsed as a float.
    """
    # isinstance (rather than an exact type comparison) also accepts
    # subclasses of str.
    if isinstance(x, str):
        x = float(x.replace(sign, '').replace(',', ''))
    return x
# Run every price through remove_sign so '$'-formatted strings become floats.
df_listings['price'] = df_listings['price'].apply(remove_sign, sign='$')

"""Boxplot of price across property type"""
sb.boxplot(data=df_listings, x='property_type', y='price')
plt.xticks(rotation=90)
plt.ylabel('Price ($)')

# Keep only the rows whose property type is exactly 'Apartment'.
apartment_rows = df_listings['property_type'] == 'Apartment'
df_listings_apt = df_listings[apartment_rows]
df_listings_apt.price  # notebook-style display of the apartment prices
df = df_listings_apt

# Rows priced below Q1 - 1.5*IQR or above Q3 + 1.5*IQR are treated as
# outliers and dropped.
Q1, Q3 = df.price.quantile(0.25), df.price.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier = (df.price < lower_fence) | (df.price > upper_fence)
df_final = df[~is_outlier]

"""Boxplot of price for apt after IQR implementation"""
figure(figsize=(6, 8), dpi=80)
sb.boxplot(data=df_final, x='property_type', y='price')
plt.xticks(rotation=90)
plt.ylabel('Price ($)')

# Recompute the boxplot statistics on the filtered prices and count the
# points that are still reported as fliers.
stat = boxplot_stats(df_final.price)
stat
len(stat[0]['fliers'])
df = df_listings_apt
# df = df_normal

# Quartile-based filter with a 1.7 * IQR margin: rows priced below
# Q1 - 1.7*IQR or above Q3 + 1.7*IQR are dropped.
Q1 = df.price.quantile(0.25)
Q3 = df.price.quantile(0.75)
IQR = Q3 - Q1
df_final = df[~((df.price < (Q1 - 1.7 * IQR)) | (df.price > (Q3 + 1.7 * IQR)))]

"""Boxplot of price for apt after IQR implementation"""
# Open a fresh figure so this boxplot is not drawn onto the axes left over
# from an earlier plot when the file runs as a plain script.
figure(figsize=(6, 8), dpi=80)
sb.boxplot(y='price', x='property_type', data=df_final)
# sb.boxplot(y='price',data=df_final)
plt.xticks(rotation=90)
plt.ylabel('Price ($)')

# Count the points the boxplot rule still reports as fliers after filtering.
stat = boxplot_stats(df_final.price)
stat
len(stat[0]['fliers'])
df = df_listings_apt
# df = df_normal

# Cut-off distances from the mean: two and three standard deviations.
price_sd = df.price.std()
two_sd = price_sd * 2
three_sd = price_sd * 3

# Drop rows whose price lies more than two standard deviations from the mean.
price_mean = np.mean(df.price)
too_low = df.price < (price_mean - two_sd)
too_high = df.price > (price_mean + two_sd)
df_final = df[~(too_low | too_high)]
# df_final = df[~((df.price<(np.mean(df.price) - three_sd)) | (df.price>(np.mean(df.price) + three_sd)))]
price_mean + three_sd  # notebook-style display of the three-SD upper bound

figure(figsize=(6, 8), dpi=80)
sb.boxplot(data=df_final, y='price')
plt.xticks(rotation=90)
plt.ylabel('Price ($)')

# Count the points the boxplot rule still reports as fliers after filtering.
stat = boxplot_stats(df_final.price)
stat
len(stat[0]['fliers'])
df = df_listings_apt
df_price = pd.DataFrame(df.price)
df_price

# Z-score: distance of each price from the mean, in standard deviations.
m = np.mean(df_price.price)
s = np.std(df_price.price)
df_price['Z-score'] = (df_price.price - m) / s
df_price

# Rows with |Z-score| > 3 are treated as outliers and dropped.
df = df_price
df_outlier = df[abs(df['Z-score']) > 3]
df_outlier
df_outlier.shape
df = df.drop(index=df_outlier.index)

figure(figsize=(6, 8), dpi=80)
sb.boxplot(y='price', data=df)
plt.xticks(rotation=90)
plt.ylabel('Price ($)')

# Compute the statistics on the Z-score-filtered frame that was just plotted,
# not on `df_final`, which is left over from an earlier section.
stat = boxplot_stats(df.price)
stat
len(stat[0]['fliers'])

# Histogram of the Z-scores (computed before the outliers were dropped).
figure(figsize=(15, 8), dpi=80)
plt.hist(df_price['Z-score'], bins=50)
df_price['Z-score'].mean()
df = df_listings_apt
df_price = pd.DataFrame(df.price)
df_price

# Ordinary Z-score, recomputed so it sits next to the modified score in
# df_price.
m = np.mean(df_price.price)
s = np.std(df_price.price)
df_price['Z-score'] = (df_price.price - m) / s
df_price

# Absolute deviation of every price from the median price.
m = np.median(df_price.price)
df_price['AD (Absolute Deviation)'] = abs(df_price.price - m)
df_price

# MAD must be the *median* absolute deviation: the 0.6745 scaling constant
# used below is only valid for the median, not the mean, of the deviations.
# NOTE(review): if more than half of the prices equal the median, MAD is 0
# and the division below yields inf/NaN -- confirm this cannot happen here.
MAD = np.median(df_price['AD (Absolute Deviation)'])
MAD

df_price['Modified Z-score'] = (0.6745 * df_price['AD (Absolute Deviation)']) / MAD
df_price

# The score is built from absolute deviations, so it is already non-negative;
# rows scoring 3.5 or more are treated as outliers.
df_outlier = df_price[df_price['Modified Z-score'] >= 3.5]
df_outlier
df_outlier.shape
df = df.drop(index=df_outlier.index)

figure(figsize=(6, 8), dpi=80)
sb.boxplot(y='price', data=df)
plt.xticks(rotation=90)
plt.ylabel('Price ($)')

# Compute the statistics on the frame that was just filtered and plotted,
# not on `df_final`, which is left over from an earlier section.
stat = boxplot_stats(df.price)
stat
len(stat[0]['fliers'])