import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.pyplot import figure
# Load the listings export and preview its first five rows.
df = pd.read_csv('listings.csv')
df.head(n=5)
id listing_url scrape_id last_scraped name summary space description experiences_offered neighborhood_overview ... review_scores_value requires_license license jurisdiction_names instant_bookable cancellation_policy require_guest_profile_picture require_guest_phone_verification calculated_host_listings_count reviews_per_month
0 12147973 https://www.airbnb.com/rooms/12147973 2.016090e+13 9/7/2016 Sunny Bungalow in the City Cozy, sunny, family home. Master bedroom high... The house has an open and cozy feel at the sam... Cozy, sunny, family home. Master bedroom high... none Roslindale is quiet, convenient and friendly. ... ... NaN f NaN NaN f moderate f f 1 NaN
1 3075044 https://www.airbnb.com/rooms/3075044 2.016090e+13 9/7/2016 Charming room in pet friendly apt Charming and quiet room in a second floor 1910... Small but cozy and quite room with a full size... Charming and quiet room in a second floor 1910... none The room is in Roslindale, a diverse and prima... ... 9.0 f NaN NaN t moderate f f 1 1.30
2 6976 https://www.airbnb.com/rooms/6976 2.016090e+13 9/7/2016 Mexican Folk Art Haven in Boston Come stay with a friendly, middle-aged guy in ... Come stay with a friendly, middle-aged guy in ... Come stay with a friendly, middle-aged guy in ... none The LOCATION: Roslindale is a safe and diverse... ... 10.0 f NaN NaN f moderate t f 1 0.47
3 1436513 https://www.airbnb.com/rooms/1436513 2.016090e+13 9/7/2016 Spacious Sunny Bedroom Suite in Historic Home Come experience the comforts of home away from... Most places you find in Boston are small howev... Come experience the comforts of home away from... none Roslindale is a lovely little neighborhood loc... ... 10.0 f NaN NaN f moderate f f 1 1.00
4 7651065 https://www.airbnb.com/rooms/7651065 2.016090e+13 9/7/2016 Come Home to Boston My comfy, clean and relaxing home is one block... Clean, attractive, private room, one block fro... My comfy, clean and relaxing home is one block... none I love the proximity to downtown, the neighbor... ... 10.0 f NaN NaN f flexible f f 1 2.25

5 rows × 95 columns

def remove_sign(x,sign):
    """Convert a currency-formatted string to a float.

    Every occurrence of ``sign`` and every comma (thousands separator) is
    removed from ``x`` before it is passed to ``float``.  Values that are not
    strings (numbers, NaN, ...) are returned unchanged, so the function can
    be applied element-wise to a column that mixes strings and numbers.

    Args:
        x: Value to clean.  Any ``str`` instance, including subclasses, is
            converted.
        sign: Substring to strip, e.g. ``'$'``.

    Returns:
        The parsed ``float`` for string input, otherwise ``x`` itself.

    Raises:
        ValueError: If the cleaned string is not a valid float literal.
    """
    # isinstance instead of an exact type comparison, so str subclasses are
    # converted too rather than silently passed through as text.
    if isinstance(x, str):
        return float(x.replace(sign, '').replace(',', ''))
    return x
# Display the price column as loaded, before any cleaning is applied.
df['price']
0       $250.00 
1        $65.00 
2        $65.00 
3        $75.00 
4        $79.00 
          ...   
3580     $69.00 
3581    $150.00 
3582    $198.00 
3583     $65.00 
3584     $65.00 
Name: price, Length: 3585, dtype: object
# Keep only the two columns used by the plot.  The explicit copy makes the
# subset independent of the full frame, so the column assignment below is
# unambiguous and cannot raise pandas' chained-assignment warning.
df = df[['price', 'property_type']].copy()

figure(figsize=(12, 8), dpi=80)
# Turn price strings into floats by stripping '$' and thousands commas.
df['price'] = df['price'].apply(remove_sign, sign='$')
# One box per property type, prices on the y axis.
sns.boxplot(y='price', x='property_type', data=df)
plt.xticks(rotation=90)  # vertical tick labels
plt.ylabel('Price ($)')
Text(0, 0.5, 'Price ($)')
def remove_outlier_IQR(df, factor=1.5):
    """Filter out values lying outside the interquartile-range fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` where
    ``Q1``/``Q3`` are the 25th/75th percentiles and ``IQR = Q3 - Q1``.
    Values strictly below the lower fence or strictly above the upper fence
    are treated as outliers.

    Args:
        df: Numeric pandas object supporting ``quantile`` and boolean
            indexing; the script passes a single price ``Series``.
        factor: Fence multiplier.  The default of 1.5 reproduces the
            previously hard-coded behaviour.

    Returns:
        ``df`` indexed by the "not an outlier" mask.  For a ``Series`` this
        keeps the original index labels of the surviving values.  NaN entries
        compare as not-outliers and are therefore retained.
    """
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1
    lower_fence = Q1 - factor * IQR
    upper_fence = Q3 + factor * IQR
    is_outlier = (df < lower_fence) | (df > upper_fence)
    return df[~is_outlier]
# Prices that survive the IQR filter, and the labels of the rows that do not.
df_outlier_removed = pd.DataFrame(remove_outlier_IQR(df.price))
ind_diff = df.index.difference(df_outlier_removed.index)

figure(figsize=(12, 8), dpi=80)
# Drop every flagged row in a single call.  Dropping one label per loop
# iteration copies the whole frame each time and leaves df_final unassigned
# when no row is flagged.
df_final = df.drop(ind_diff)
df = df_final

sns.boxplot(y='price', x='property_type', data=df_final)
plt.xticks(rotation=90)
plt.ylabel('Price ($)')
Text(0, 0.5, 'Price ($)')
def remove_outlier_Hampel(df, threshold=4.5):
    """Filter out values far from the median, measured in units of the MAD.

    The absolute deviation of every value from the median is computed; a
    value is an outlier when its deviation exceeds ``threshold`` times the
    median of those deviations (the median absolute deviation, MAD).

    Args:
        df: Numeric pandas object supporting ``median`` and boolean indexing;
            the script passes a single price ``Series``.
        threshold: Multiplier applied to the MAD.  The default of 4.5
            reproduces the previously hard-coded cut-off.

    Returns:
        The original values of ``df`` that are not outliers, with their
        original index labels.  NaN entries compare as not-outliers and are
        retained.
    """
    med = df.median()
    abs_dev = abs(df - med)
    cond = abs_dev.median() * threshold
    # Index the input itself: indexing the deviations would hand back
    # |value - median| instead of the surviving values.  The resulting index
    # is the same either way.
    return df[~(abs_dev > cond)]
# Start again from the raw file so this filter sees every listing.
df = pd.read_csv('listings.csv')
df['price'] = df['price'].apply(remove_sign, sign='$')
# Only the index of the filtered result is used below: it identifies the
# rows that passed the Hampel filter.
df_outlier_removed = pd.DataFrame(remove_outlier_Hampel(df.price))
ind_diff = df.index.difference(df_outlier_removed.index)

# Drop every flagged row in a single call.  Dropping one label per loop
# iteration copies the whole frame each time and leaves df_final unassigned
# when no row is flagged.
df_final = df.drop(ind_diff)
df = df_final

sns.boxplot(y='price', x='property_type', data=df_final)
plt.xticks(rotation=90)
plt.ylabel('Price ($)')
Text(0, 0.5, 'Price ($)')
# Number of row labels the Hampel filter flagged for removal.
len(ind_diff)
95
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
def remove_outliers_DBSCAN(df,eps,min_samples):
    """Cluster one-dimensional data with DBSCAN and return the labels.

    Despite its name, this function removes nothing: it returns one cluster
    label per input value and leaves the filtering to the caller (the script
    treats the label ``-1`` as an outlier).

    Args:
        df: One-dimensional pandas ``Series`` of numeric values; it is
            reshaped to a single-feature column before clustering.
        eps: Forwarded to ``DBSCAN(eps=...)``.
        min_samples: Forwarded to ``DBSCAN(min_samples=...)``.

    Returns:
        A ``Series`` named ``'cluster'`` holding the label of each value.
        It carries the index of ``df`` so labels stay aligned with their
        rows even when the input index is not 0..n-1.
    """
    outlier_detection = DBSCAN(eps=eps, min_samples=min_samples)
    clusters = outlier_detection.fit_predict(df.values.reshape(-1, 1))
    # Re-attach the caller's index; a fresh default index would mislabel rows
    # whenever the input had been filtered or re-indexed beforehand.
    return pd.Series(clusters, index=df.index, name='cluster')
# Reload the raw listings and clean the price column again.
df = pd.read_csv('listings.csv')
df['price'] = df['price'].apply(remove_sign, sign='$')
# Label every price with its DBSCAN cluster.
clusters = remove_outliers_DBSCAN(df['price'], eps=0.5, min_samples=5)
# Cluster sizes, largest first.
clusters.value_counts().sort_values(ascending=False)
-1      384
 9      144
 21     117
 4      101
 0       95
       ... 
 81       6
 56       5
 82       5
 124      5
 8        5
Name: cluster, Length: 127, dtype: int64
# Line plot of the cluster label assigned to each row, against the row index.
plt.plot(clusters)
[<matplotlib.lines.Line2D at 0x282abed7748>]
# Wrap the labels in a frame and collect the index of every row labelled -1.
df_cluster = pd.DataFrame(clusters)
ind_outlier = df_cluster.index[df_cluster['cluster'].eq(-1)]
ind_outlier
Int64Index([  12,   40,   70,   75,   81,   84,   96,   99,  100,  107,
            ...
            3501, 3529, 3532, 3539, 3550, 3552, 3565, 3572, 3576, 3582],
           dtype='int64', length=384)
# Drop every row labelled -1 in a single call.  Dropping one label per loop
# iteration copies the whole frame each time and leaves df_final unassigned
# when no row is flagged.
df_final = df.drop(ind_outlier)
df = df_final

sns.boxplot(y='price', x='property_type', data=df_final)
plt.xticks(rotation=90)
plt.ylabel('Price ($)')
Text(0, 0.5, 'Price ($)')
# Number of rows that received the cluster label -1.
len(ind_outlier)
384
# Sorted nearest-neighbour distance curve for the remaining prices.
a = df.price.values.reshape(-1, 1)  # single-feature column
neigh = NearestNeighbors(n_neighbors=3)
nbrs = neigh.fit(a)
distances, indices = nbrs.kneighbors(a)

# Sort each neighbour-rank column in ascending order, then keep only column 1.
distances = np.sort(distances, axis=0)[:, 1]
plt.plot(distances)
[<matplotlib.lines.Line2D at 0x28297a2c548>]
# Load the iris data and keep only the rows whose Species is Iris-virginica.
iris = pd.read_csv("Iris.csv")
df = iris.loc[iris['Species'] == 'Iris-virginica']
# Sepal length against sepal width.
x, y = df['SepalLengthCm'], df['SepalWidthCm']
plt.scatter(x, y)
<matplotlib.collections.PathCollection at 0x282af0ba9c8>
# Degree-1 least-squares fit of y on x, wrapped as a callable polynomial.
coef = np.polyfit(x, y, deg=1)
poly1d_fn = np.poly1d(coef)

# Data as yellow circles, fitted line as a dashed black line.
plt.plot(
    x, y, 'yo',
    x, poly1d_fn(x), '--k',
)
[<matplotlib.lines.Line2D at 0x282af057f48>,
 <matplotlib.lines.Line2D at 0x282af02bec8>]
# Add one synthetic observation (x=20, y=6.08) to the data.
# Series.append was removed in pandas 2.0; pd.concat builds the same result,
# keeping the existing index labels and giving the new value label 0.
x = pd.concat([x, pd.Series([20])])
y = pd.concat([y, pd.Series([6.08])])
# Coefficients of the fit computed before the extra point was added.
coef
array([0.23161465, 1.44811456])
# Manual evaluation of a fitted line at x = 20.
# NOTE(review): these constants are hand-typed and differ slightly from the
# coef values displayed above; confirm whether poly1d_fn(20) was intended.
20*0.23189+1.446
6.0838
# Fit the straight line again on the current x and y, then redraw.
coef = np.polyfit(x, y, deg=1)
poly1d_fn = np.poly1d(coef)

# Data as yellow circles, fitted line as a dashed black line.
plt.plot(
    x, y, 'yo',
    x, poly1d_fn(x), '--k',
)
[<matplotlib.lines.Line2D at 0x282af0f6bc8>,
 <matplotlib.lines.Line2D at 0x282af165d08>]