Practical implementation of outlier detection in Python
IQR, Hampel, and DBSCAN methods
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.pyplot import figure
# Load the listings data set; the relative path means the file is looked up
# in the current working directory.
df=pd.read_csv('listings.csv')
# Preview of the first rows (a bare expression: only rendered in an
# interactive/notebook session).
df.head()
def remove_sign(x, sign):
    """Convert a formatted amount such as ``'$1,234.00'`` to a float.

    Args:
        x: Value to clean. For a string, every occurrence of ``sign`` and
            every comma is removed before the rest is parsed with ``float``.
            Any other type is returned unchanged.
        sign: Substring to strip from ``x`` (for example ``'$'``).

    Returns:
        The parsed float for string input, otherwise ``x`` itself.

    Raises:
        ValueError: If the cleaned string is not a valid float literal.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses such as numpy.str_.
    if isinstance(x, str):
        return float(x.replace(sign, '').replace(',', ''))
    return x
# Notebook-style preview of the raw price column.
df['price']
# Keep only the two columns the plots need. The explicit copy makes the
# subset an independent frame, so the column assignment below acts on it
# directly.
df = df[['price', 'property_type']].copy()
figure(figsize=(12, 8), dpi=80)
# Turn price strings into floats by stripping '$' and thousands separators.
df['price'] = df['price'].apply(remove_sign, sign='$')
# Price distribution per property type, before any outlier removal.
sns.boxplot(y='price', x='property_type', data=df)
plt.xticks(rotation=90)
plt.ylabel('Price ($)')
def remove_outlier_IQR(df, factor=1.5):
    """Drop the values of ``df`` that lie outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of ``df`` and ``IQR = Q3 - Q1``.

    Args:
        df: pandas Series of numeric values (the script passes a price
            column).
        factor: Multiple of the IQR that sets the fence distance.
            Defaults to 1.5.

    Returns:
        The entries of ``df`` that are not strictly below the lower fence or
        strictly above the upper fence, with their original index labels.
        NaN entries are kept, because they compare False against both
        fences.
    """
    q1 = df.quantile(0.25)
    q3 = df.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - factor * iqr
    upper = q3 + factor * iqr
    is_outlier = (df < lower) | (df > upper)
    return df[~is_outlier]
# Apply the IQR rule to the price column; only the surviving index labels
# are used below.
df_outlier_removed = remove_outlier_IQR(df.price)
df_outlier_removed = pd.DataFrame(df_outlier_removed)
# Labels present in df but missing from the filtered prices are the outliers.
ind_diff = df.index.difference(df_outlier_removed.index)
figure(figsize=(12, 8), dpi=80)
# Drop all outlier rows in one call. Compared with dropping one label per
# loop iteration, this avoids copying the frame once per outlier and leaves
# df_final defined even when ind_diff is empty.
df_final = df.drop(ind_diff)
df = df_final
sns.boxplot(y='price', x='property_type', data=df_final)
plt.xticks(rotation=90)
plt.ylabel('Price ($)')
def remove_outlier_Hampel(df, threshold=4.5):
    """Drop values whose distance from the median exceeds a MAD multiple.

    MAD is the median of the absolute deviations ``|value - median|``.

    Args:
        df: pandas Series of numeric values (the script passes a price
            column).
        threshold: Multiple of the MAD beyond which a value counts as an
            outlier. Defaults to 4.5.
            NOTE(review): 4.5 is close to 3 * 1.4826, the usual Hampel
            scaling - presumably where the constant comes from; confirm.

    Returns:
        The entries of ``df`` (the original values, not their deviations)
        with ``|value - median| <= threshold * MAD``, keeping their index
        labels. NaN entries are kept because their deviation compares False
        against the cutoff. When the MAD is 0, every value that differs from
        the median is dropped.
    """
    median = df.median()
    deviation = abs(df - median)
    cutoff = deviation.median() * threshold
    # Index into the data itself so the caller gets filtered values back,
    # consistent with remove_outlier_IQR.
    return df[~(deviation > cutoff)]
# Reload the raw file so the Hampel filter runs on unfiltered data.
df = pd.read_csv('listings.csv')
df['price'] = df['price'].apply(remove_sign, sign='$')
# Only the index labels of the filtered result are used below.
df_outlier_removed = remove_outlier_Hampel(df.price)
df_outlier_removed = pd.DataFrame(df_outlier_removed)
# Labels present in df but missing from the filtered prices are the outliers.
ind_diff = df.index.difference(df_outlier_removed.index)
# Drop all outlier rows in one call. Compared with dropping one label per
# loop iteration, this avoids copying the frame once per outlier and leaves
# df_final defined even when ind_diff is empty.
df_final = df.drop(ind_diff)
df = df_final
# NOTE(review): no figure() call precedes this plot, so it is drawn on
# whatever figure is current - confirm that is intended outside a notebook.
sns.boxplot(y='price', x='property_type', data=df_final)
plt.xticks(rotation=90)
plt.ylabel('Price ($)')
# Number of rows flagged as outliers (notebook-style preview).
len(ind_diff)
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
def remove_outliers_DBSCAN(df, eps, min_samples):
    """Cluster one-dimensional values with DBSCAN and return the labels.

    Despite the name, nothing is removed here: the caller decides which
    labels to discard (this script drops the rows labelled -1).

    Args:
        df: pandas Series of numeric values; it is reshaped into a single
            feature column before clustering.
        eps: Forwarded to ``DBSCAN(eps=...)``.
        min_samples: Forwarded to ``DBSCAN(min_samples=...)``.

    Returns:
        A Series named ``'cluster'`` holding the ``fit_predict`` label of
        every entry, indexed like ``df`` so the labels can be matched back
        to the input rows even when its index is not a default RangeIndex.
    """
    model = DBSCAN(eps=eps, min_samples=min_samples)
    labels = model.fit_predict(df.values.reshape(-1, 1))
    return pd.Series(labels, index=df.index, name='cluster')
# Reload the raw file so DBSCAN runs on unfiltered data.
df = pd.read_csv('listings.csv')
df['price'] = df['price'].apply(remove_sign, sign='$')
# Cluster the prices with eps=0.5 and min_samples=5.
clusters = remove_outliers_DBSCAN(df['price'], 0.5, 5)
# Cluster sizes, largest first (notebook-style preview).
clusters.value_counts().sort_values(ascending=False)
plt.plot(clusters)
df_cluster = pd.DataFrame(clusters)
# Rows labelled -1 are the ones treated as outliers.
ind_outlier = df_cluster.index[df_cluster['cluster'] == -1]
ind_outlier
# Drop all outlier rows in one call. Compared with dropping one label per
# loop iteration, this avoids copying the frame once per outlier and leaves
# df_final defined even when ind_outlier is empty.
df_final = df.drop(ind_outlier)
df = df_final
sns.boxplot(y='price', x='property_type', data=df_final)
plt.xticks(rotation=90)
plt.ylabel('Price ($)')
# Number of rows flagged as outliers (notebook-style preview).
len(ind_outlier)
# Sorted nearest-neighbour distance curve for the prices currently in df
# (presumably inspected to choose eps for DBSCAN - confirm).
neigh = NearestNeighbors(n_neighbors=3)
# Prices as a single-feature column, i.e. a 2-D array with one column.
a=df.price.values.reshape(-1,1)
nbrs = neigh.fit(a)
# Distances to, and positions of, the 3 nearest fitted points for each price.
distances, indices = nbrs.kneighbors(a)
# Sort every neighbour column independently in ascending order.
distances = np.sort(distances, axis=0)
# Keep the second column only.
# NOTE(review): the query points are the fitted points, so column 0 is
# presumably each point's zero distance to itself - confirm.
distances = distances[:,1]
plt.plot(distances)
# Straight-line fit of sepal width against sepal length for one species.
iris = pd.read_csv("Iris.csv")
df = iris[iris['Species'] == 'Iris-virginica']
x = df['SepalLengthCm']
y = df['SepalWidthCm']
plt.scatter(x, y)
# Degree-1 least-squares polynomial; coef holds [slope, intercept].
coef = np.polyfit(x, y, 1)
poly1d_fn = np.poly1d(coef)
plt.plot(x, y, 'yo', x, poly1d_fn(x), '--k')
# Add one extra point at x=20, y=6.08. Series.append was removed in
# pandas 2.0; pd.concat gives the same result and keeps the index labels
# of both pieces.
x = pd.concat([x, pd.Series([20])])
y = pd.concat([y, pd.Series([6.08])])
# Notebook-style preview of the coefficients fitted before the extra point.
coef
# About 6.08: the y value at x=20 on a line with slope 0.23189 and
# intercept 1.446 (presumably the coefficients shown above - confirm).
20*0.23189+1.446
# Refit with the extra point included and redraw.
coef = np.polyfit(x, y, 1)
poly1d_fn = np.poly1d(coef)
plt.plot(x, y, 'yo', x, poly1d_fn(x), '--k')