# Outlier Detection (Part 2)
# Skewed distributions and the adjusted boxplot
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import seaborn as sns
from matplotlib.cbook import boxplot_stats
from matplotlib import cbook
from matplotlib.pyplot import figure
# Load the listings table; the relative path is resolved against the current
# working directory.
df_listings = pd.read_csv('listings.csv')
"""Filling the null values of rating by mean"""
# Missing review ratings are replaced by the mean of the ratings that are
# present (Series.mean skips NaN by default).
rating = df_listings["review_scores_rating"]
df_listings["review_scores_rating"] = rating.fillna(rating.mean())
"""Function to remove $ sign"""
def remove_sign(x, sign):
    """Convert a formatted price string to a float.

    Args:
        x: Value to clean. Any string (including ``str`` subclasses such as
            ``numpy.str_``) is converted; every other value is returned
            unchanged, so missing or already-numeric cells pass through.
        sign: Substring to strip before conversion, e.g. ``'$'``.

    Returns:
        The parsed ``float`` when ``x`` is a string, otherwise ``x`` itself.

    Raises:
        ValueError: If the string is still not a valid float literal after
            removing ``sign`` and the ``','`` thousands separators.
    """
    # isinstance (rather than an exact type check) also accepts str
    # subclasses, which pandas/numpy may hand to the callback.
    if isinstance(x, str):
        return float(x.replace(sign, '').replace(',', ''))
    return x
# Turn the price column into numbers: remove_sign strips the '$' sign from
# string cells and returns non-string cells unchanged.
df_listings["price"] = df_listings["price"].apply(remove_sign, sign='$')
"""Boxplot of price across property type"""
plt.figure(figsize=(10, 8), dpi=80)
sns.boxplot(data=df_listings, x='property_type', y='price')
plt.ylabel('Price ($)')
# Draw the x tick labels rotated by 90 degrees.
plt.xticks(rotation=90)
"""Boxplot of price across for Apartments"""
# Keep only the rows whose property_type equals 'Apartment'.
is_apartment = df_listings['property_type'].eq('Apartment')
df_listings_apt = df_listings.loc[is_apartment]
plt.figure(figsize=(5, 8), dpi=80)
sns.boxplot(data=df_listings_apt, x='property_type', y='price')
plt.ylabel('Price ($)')
plt.xticks(rotation=90)
"""Boxplot of price across for Townhouse to check if it is skewed"""
# NOTE: this rebinds df_listings_apt to the Townhouse rows, replacing the
# Apartment subset assigned earlier; statements that read df_listings_apt
# after this point therefore operate on Townhouse listings despite the name.
is_townhouse = df_listings['property_type'].eq('Townhouse')
df_listings_apt = df_listings.loc[is_townhouse]
plt.figure(figsize=(5, 8), dpi=80)
sns.boxplot(data=df_listings_apt, x='property_type', y='price')
plt.ylabel('Price ($)')
plt.xticks(rotation=90)
# IQR-fence outlier filter: a row is treated as an outlier when its price is
# below Q1 - 1.7*IQR or above Q3 + 1.7*IQR (note the 1.7 multiplier).
df = df_listings_apt
Q1, Q3 = df["price"].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_fence = Q1 - 1.7 * IQR
upper_fence = Q3 + 1.7 * IQR
is_outlier = (df["price"] < lower_fence) | (df["price"] > upper_fence)
# The outlier mask is negated (rather than testing "inside the fences") so
# rows with a missing price are kept: both comparisons are False for NaN.
df_final = df.loc[~is_outlier]
plt.figure(figsize=(5, 8), dpi=80)
sns.boxplot(data=df_final, x='property_type', y='price')
plt.ylabel('Price ($)')
plt.xticks(rotation=90)
# Box-and-whisker statistics matplotlib computes for the filtered prices.
stat = boxplot_stats(df_final["price"])
stat
# Median-centred filter with asymmetric fences: the lower fence is three
# times (Q2 - Q1) below the median and the upper fence is three times
# (Q3 - Q2) above it, so each side uses its own half-spread.
df = df_listings_apt
Q1, Q2, Q3 = df["price"].quantile([0.25, 0.50, 0.75])
lower_fence = Q2 - 3 * (Q2 - Q1)
upper_fence = Q2 + 3 * (Q3 - Q2)
is_outlier = (df["price"] < lower_fence) | (df["price"] > upper_fence)
# Negating the outlier mask keeps rows whose price is NaN, because both
# comparisons evaluate to False for missing values.
df_final = df.loc[~is_outlier]
"""Boxplot of price for apt after SIQR implementation"""
# NOTE: "apt" above refers to the variable df_listings_apt; the plotted rows
# are whatever that variable currently holds.
plt.figure(figsize=(5, 8), dpi=80)
sns.boxplot(data=df_final, x='property_type', y='price')
plt.ylabel('Price ($)')
plt.xticks(rotation=90)
# Box-and-whisker statistics matplotlib computes for the filtered prices.
stat = boxplot_stats(df_final["price"])
stat
# Count of points that the boxplot statistics report as fliers.
len(stat[0]['fliers'])
from statsmodels.stats.stattools import medcouple
# Small hand-written sample used to try out statsmodels' medcouple function.
arr = np.asarray([1, 2, 3, 8, 12, 6, 15])
medcouple(arr)