# Simple Stepwise and Weighted Regression
# Find the best regression model
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
import statsmodels.api as sm
import pylab as py
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
# Read the Toyota CSV into a DataFrame; the path is relative to the working
# directory.
url = "Car/toyota.csv"
data = pd.read_csv(filepath_or_buffer=url)

# Notebook-style inspection: the first five rows and the (rows, columns)
# shape. As bare expressions these only render when they end a notebook cell.
data.head(n=5)
data.shape
# Forward "stepwise" pass: grow the predictor set one column at a time, in
# the order listed, and print fit statistics for each nested OLS model.
num = ['mileage', 'tax', 'mpg', 'engineSize']
y = data['price']  # the response is the same for every candidate model

for i in range(len(num)):
    predictors = num[:i + 1]
    x = data[predictors]
    # NOTE(review): sm.OLS does not add an intercept on its own and none is
    # added here, so statsmodels reports the uncentered R-squared. Confirm
    # that regression through the origin is intended.
    model = sm.OLS(y, x).fit()
    print("With predictor {}".format(predictors))
    print("R-squared = ", model.rsquared)
    print("Adj R-squared =", model.rsquared_adj)
    print("AIC =", model.aic)
    print("BIC =", model.bic)
    print("\n")

# After the loop, `x` and `model` refer to the largest model (all predictors);
# the in-sample predictions are reused by the AIC/BIC cross-check that follows.
model.summary()
y_pred = model.predict(x)
# Cross-check the information criteria of the final OLS model: RegscorePy's
# helpers versus the closed-form expressions
#   AIC = n*ln(SSR/n) + 2*p        BIC = n*ln(SSR/n) + p*ln(n)
# NOTE(review): the star import is kept as in the original because other
# RegscorePy names may be used outside this section; only `aic` and `bic`
# are needed here.
from RegscorePy import *

# Derive the sample size and parameter count from the fitted objects instead
# of hard-coding them (previously 6738 and 4), so the check stays correct if
# the dataset or the predictor list changes.
n = len(y)
p = len(model.params)

aic.aic(y, y_pred, p)  # using RegscorePy
n*np.log(model.ssr) - n*np.log(n) + 2*p  # using formula

bic.bic(y, y_pred, p)  # using RegscorePy
n*np.log(model.ssr) - n*np.log(n) + p*np.log(n)  # using formula
# Inspect the numeric predictors and the low end of the mpg range
# (notebook-style display expressions).
data[num]
data["mpg"].min()
data[num].sort_values(by="mpg").head(50)

# Shift mpg so that its minimum becomes 0 and store the result as a new
# column; the weighted regression further down uses it as sample weights.
data["mpg_weighted"] = data["mpg"] - data["mpg"].min()
data.sort_values(by="mpg").head(50)
# Unweighted least-squares baseline (scikit-learn) on all numeric predictors.
num = ['mileage', 'tax', 'mpg', 'engineSize']
x = data[num]
y = data['price']

# Fit once on the full predictor set: refitting on every growing prefix of
# `num` would only overwrite `model` each time, and just the final
# (all-predictor) fit is inspected below.
model = linear_model.LinearRegression()
model.fit(x, y)
model.coef_
# Weighted least squares on the same predictors. Each row is weighted by its
# 'mpg_weighted' value (mpg minus the dataset minimum), so rows at the
# minimum mpg carry zero weight.
x = data[num]
y = data['price']

# Fit once on the full predictor set: refitting on every growing prefix of
# `num` would only overwrite `model` each time, and just the final
# (all-predictor) fit is inspected below.
model = linear_model.LinearRegression()
model.fit(x, y, sample_weight=data['mpg_weighted'])
model.coef_