from random import random

import matplotlib.pyplot as plt  # used as `plt` below but was never imported
import numpy as np
import pandas as pd
from statsmodels.formula.api import ols  # `ols` is called below but was never imported

Homoscedastic data

# Generate a homoscedastic sample: Y is linear in X plus noise whose
# spread does not depend on X, so the OLS constant-variance assumption holds.
X = []
Y = []
for _ in range(300):
    x = random()             # predictor: uniform on [0, 1)
    c = random() + random()  # noise: sum of two uniforms, i.e. on [0, 2)
    y = 5*x + 3*c
    X.append(x)
    Y.append(y)

# Build the frame from the columns directly; assigning columns as
# attributes on an empty frame is fragile and discouraged by pandas.
data = pd.DataFrame({'X': X, 'Y': Y})
data  # notebook echo of the frame
X Y
0 0.556561 6.726886
1 0.863728 7.319267
2 0.290025 4.498806
3 0.519170 8.242732
4 0.823742 8.299956
... ... ...
295 0.677121 5.114706
296 0.446470 5.468886
297 0.108419 3.728836
298 0.397281 6.823616
299 0.645970 7.452578

300 rows × 2 columns

# Fit an ordinary least squares regression of Y on X and keep the
# estimated coefficients for drawing the fitted line later.
spec = ols('Y ~ X', data=data)
model = spec.fit()
p = model.params

# Inspect fit statistics (R^2, coefficient t-tests, diagnostics).
print(model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      Y   R-squared:                       0.611
Model:                            OLS   Adj. R-squared:                  0.610
Method:                 Least Squares   F-statistic:                     468.2
Date:                Wed, 28 Jul 2021   Prob (F-statistic):           4.56e-63
Time:                        17:11:46   Log-Likelihood:                -461.78
No. Observations:                 300   AIC:                             927.6
Df Residuals:                     298   BIC:                             935.0
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      3.0404      0.130     23.321      0.000       2.784       3.297
X              4.9368      0.228     21.639      0.000       4.488       5.386
==============================================================================
Omnibus:                        1.830   Durbin-Watson:                   2.049
Prob(Omnibus):                  0.401   Jarque-Bera (JB):                1.585
Skew:                           0.033   Prob(JB):                        0.453
Kurtosis:                       2.650   Cond. No.                         4.41
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
p  # notebook echo: the fitted parameters (Intercept, X)
Intercept    3.040375
X            4.936788
dtype: float64
p.Intercept, p.X  # notebook echo: the two coefficients as a tuple
(3.0403754853720484, 4.9367875427951295)
# Scatter the raw points, then overlay the fitted regression line in red.
ax = data.plot(kind='scatter', x='X', y='Y')
fitted_line = p.Intercept + p.X * np.array(X)
ax.plot(X, fitted_line, 'r')
[<matplotlib.lines.Line2D at 0x240cc526220>]

Heteroscedastic data

# Load the heteroscedastic sample and eyeball how the spread of var2
# changes along var1.
data = pd.read_excel('sample_heteroscedasticity_data.xlsx')
X, Y = data['var1'], data['var2']
plt.plot(X, Y, 'o')
[<matplotlib.lines.Line2D at 0x240d13c9280>]
X  # notebook echo: the var1 predictor column
0        1
1        2
2        3
3        4
4        5
      ... 
195     96
196     97
197     98
198     99
199    100
Name: var1, Length: 200, dtype: int64
# Fit OLS on the raw (heteroscedastic) data and overlay the fitted line
# on a scatter of the points.
model = ols('Y ~ X', data=data).fit()
p = model.params

ax = data.plot(kind='scatter', x='var1', y='var2')
predicted = p.Intercept + p.X * np.array(X)
ax.plot(X, predicted, 'r')
[<matplotlib.lines.Line2D at 0x240cc38fc70>]
# NOTE(review): the identical model was already fitted on this exact data
# just above; refitting here only recomputes the same estimates. Print the
# summary of the existing fit instead.
print(model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      Y   R-squared:                       0.799
Model:                            OLS   Adj. R-squared:                  0.798
Method:                 Least Squares   F-statistic:                     786.2
Date:                Wed, 28 Jul 2021   Prob (F-statistic):           7.18e-71
Time:                        17:16:54   Log-Likelihood:                -825.66
No. Observations:                 200   AIC:                             1655.
Df Residuals:                     198   BIC:                             1662.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.6606      2.151     -0.307      0.759      -4.902       3.581
X              1.0368      0.037     28.039      0.000       0.964       1.110
==============================================================================
Omnibus:                       33.905   Durbin-Watson:                   1.995
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              164.943
Skew:                           0.472   Prob(JB):                     1.52e-36
Kurtosis:                       7.348   Cond. No.                         117.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# White's test for heteroscedasticity: regresses the squared OLS residuals
# on the regressors, their squares, and cross-products. A small p-value
# rejects the null hypothesis of constant error variance.
from statsmodels.stats.diagnostic import het_white
from statsmodels.compat import lzip
from patsy import dmatrices

expr = 'var2~ var1'
y, X = dmatrices(expr, data, return_type='dataframe')

keys = [
    'Lagrange Multiplier statistic:',
    "LM test's p-value:",
    'F-statistic:',
    "F-test's p-value:",
]
results = het_white(model.resid, X)
lzip(keys, results)  # notebook echo: pair each label with its statistic
[('Lagrange Multiplier statistic:', 23.080654955447134),
 ("LM test's p-value:", 9.729700165356245e-06),
 ('F-statistic:', 12.850174821408205),
 ("F-test's p-value:", 5.680868506694736e-06)]

Data Transformation

# Data transformation: take the log of var2 to compress its growing spread
# (a variance-stabilising transform for multiplicative noise).
data = pd.read_excel('sample_heteroscedasticity_data.xlsx')
X = data['var1']
Y = np.log(data['var2'])
plt.plot(X, Y, 'o')
plt.yscale("log")

# Build the transformed frame directly from its columns; assigning columns
# as attributes on an empty frame is fragile and discouraged by pandas.
dataX = pd.DataFrame({'var1': X, 'var2': Y})
dataX  # notebook echo of the transformed frame
var1 var2
0 1 -0.473549
1 2 0.107743
2 3 1.254872
3 4 1.388049
4 5 1.511434
... ... ...
195 96 4.740987
196 97 4.555702
197 98 4.659135
198 99 5.164662
199 100 4.515980

200 rows × 2 columns

# Fit on the FULL log-transformed data first, so the parameters `p` below
# describe the unfiltered sample.
model = ols('var2 ~ var1', data=dataX).fit()
p = model.params
# Drop low outliers (log(var2) <= 2.5) AFTER fitting; note this rebinding of
# dataX also determines what the refit in the next cell sees.
dataX = dataX[dataX['var2'] > 2.5]
ax = dataX.plot(kind='scatter', x='var1', y='var2')
# The red line uses the pre-filter parameters over the original X range,
# drawn on top of the filtered scatter.
ax.plot(X, p.Intercept + p.var1*np.array(X),'r')
# plt.yscale("log")
[<matplotlib.lines.Line2D at 0x240cd7fe880>,
 <matplotlib.lines.Line2D at 0x240ceb57cd0>]
# Re-run White's test on the filtered, log-transformed data; large p-values
# now fail to reject the null of homoscedasticity.
model = ols('var2 ~ var1', data=dataX).fit()
expr = 'var2~ var1'
y, X = dmatrices(expr, dataX, return_type='dataframe')

keys = [
    'Lagrange Multiplier statistic:',
    "LM test's p-value:",
    'F-statistic:',
    "F-test's p-value:",
]
results = het_white(model.resid, X)
lzip(keys, results)  # notebook echo: pair each label with its statistic
[('Lagrange Multiplier statistic:', 1.5156231931616286),
 ("LM test's p-value:", 0.46869099048803),
 ('F-statistic:', 0.7510952808302751),
 ("F-test's p-value:", 0.4734461091533566)]