# Heteroscedasticity in Python
## Using statsmodels
from random import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from statsmodels.formula.api import ols
# Build a synthetic sample of 300 points. Each y is 5*x plus a noise term
# 3*c, where x is one uniform [0, 1) draw and c is the sum of two further
# uniform [0, 1) draws (so the noise does not depend on x).
X = []
Y = []
for _ in range(300):
    x = random()
    c = random() + random()
    y = 5 * x + 3 * c
    X.append(x)
    Y.append(y)

# Construct the frame directly from the two lists instead of assigning
# attributes onto an empty frame; X and Y stay available as plain lists.
data = pd.DataFrame({'X': X, 'Y': Y})
data  # notebook display of the generated sample
# Ordinary least squares regression of Y on X for the synthetic sample.
model = ols('Y ~ X', data=data).fit()

# Show the full regression report, then the fitted coefficients.
print(model.summary())
p = model.params
p
p.Intercept, p.X

# Scatter of the sample with the fitted straight line drawn in red.
fitted_line = p.Intercept + p.X * np.array(X)
ax = data.plot(kind='scatter', x='X', y='Y')
ax.plot(X, fitted_line, 'r')
# Load the sample workbook and pull out the two columns used below.
data = pd.read_excel('sample_heteroscedasticity_data.xlsx')
X, Y = data['var1'], data['var2']

# Raw scatter of var2 against var1, drawn as point markers.
plt.plot(X, Y, 'o')
X  # notebook display of the var1 column
# Regress var2 on var1 using the frame's own column names, matching the
# formula used for the White test and the log model further down. The
# original 'Y ~ X' formula only resolved because X and Y happened to exist
# as global variables.
model = ols('var2 ~ var1', data=data).fit()
p = model.params

# Scatter of the data with the fitted line in red.
ax = data.plot(kind='scatter', x='var1', y='var2')
ax.plot(data['var1'], p.Intercept + p.var1 * np.array(data['var1']), 'r')

# The model is fitted once; the same fit feeds both the plot and the report.
print(model.summary())
from statsmodels.stats.diagnostic import het_white
from statsmodels.compat import lzip
from patsy import dmatrices
# White's heteroscedasticity test on the residuals of the fitted model.
# dmatrices builds the response/design frames for the same formula; only the
# design matrix (rebound to X here) is passed to the test.
expr = 'var2~ var1'
y, X = dmatrices(expr, data, return_type='dataframe')

# Labels for the four values het_white returns, in order.
keys = [
    "Lagrange Multiplier statistic:",
    "LM test's p-value:",
    "F-statistic:",
    "F-test's p-value:",
]
results = het_white(model.resid, X)
lzip(keys, results)  # notebook display of (label, value) pairs
# Reload the workbook and take the natural log of the response column.
data = pd.read_excel('sample_heteroscedasticity_data.xlsx')
X = data['var1']
Y = np.log(data['var2'])

# Scatter of log(var2) against var1.
plt.plot(X, Y, 'o')
# NOTE(review): Y is already log-transformed, so this puts logged values on
# a log axis - confirm that is intended.
plt.yscale("log")

# Collect the predictor and the transformed response into one frame.
dataX = pd.DataFrame({'var1': X, 'var2': Y})
dataX  # notebook display
# Keep only rows whose log-transformed var2 exceeds 2.5.
# NOTE(review): the 2.5 cut-off is carried over unchanged; its rationale is
# not visible here - confirm.
dataX = dataX[dataX['var2'] > 2.5]

# Fit once, on the filtered data. The original drew the red line from a fit
# made before filtering (and over the unfiltered var1 values), so the line
# did not belong to the points shown or to the model tested afterwards.
model = ols('var2 ~ var1', data=dataX).fit()
p = model.params

# Scatter of the filtered data with the matching fitted line in red.
ax = dataX.plot(kind='scatter', x='var1', y='var2')
ax.plot(dataX['var1'], p.Intercept + p.var1 * np.array(dataX['var1']), 'r')
# White's heteroscedasticity test on the residuals of the log-model fit.
# dmatrices builds the response/design frames from dataX; only the design
# matrix (rebound to X here) is passed to the test.
expr = 'var2~ var1'
y, X = dmatrices(expr, dataX, return_type='dataframe')

# Labels for the four values het_white returns, in order.
keys = [
    "Lagrange Multiplier statistic:",
    "LM test's p-value:",
    "F-statistic:",
    "F-test's p-value:",
]
results = het_white(model.resid, X)
lzip(keys, results)  # notebook display of (label, value) pairs