import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import norm

# Load the measurements; the rest of the script reads the `data` column.
df = pd.read_excel('data.xlsx')

# Histogram of the raw values in 30 bins, drawn on a fresh 10x8 figure.
fig, ax = plt.subplots(figsize=(10, 8))
ax.hist(df["data"], bins=30, color='green')

(array([   3.,    4.,   24.,  144.,  594., 1387., 2024., 2040., 1520.,
        1220., 1116., 1099., 1193., 1212., 1214., 1341., 1422., 1770.,
        1927., 1995., 1991., 1709., 1309.,  851.,  484.,  263.,  102.,
          30.,    8.,    4.]),
 array([29.77 , 32.076, 34.382, 36.688, 38.994, 41.3  , 43.606, 45.912,
        48.218, 50.524, 52.83 , 55.136, 57.442, 59.748, 62.054, 64.36 ,
        66.666, 68.972, 71.278, 73.584, 75.89 , 78.196, 80.502, 82.808,
        85.114, 87.42 , 89.726, 92.032, 94.338, 96.644, 98.95 ]),
 <BarContainer object of 30 artists>)

# Mean of the whole `data` column (reference value for the simulations below).
df["data"].mean()

63.30878300000027

# Sampling distribution of the mean: draw 1000 random samples of
# `sample_number` rows (no repeats within a sample) and keep each sample mean.
sample_number = 50
# Index over every row of the frame. The previous hard-coded range(0, 29999)
# could never pick index 29999 and tied the code to one dataset size.
population = range(len(df))
d = [
    np.mean(df.data.iloc[random.sample(population, sample_number)].to_numpy())
    for _ in range(1000)
]
# The mean of the sample means should sit close to the mean of the column.
print(np.mean(d))
fig, ax = plt.subplots(figsize=(10, 8))
plt.hist(d, bins=30, color='green')

63.306354999999996

(array([ 1.,  4.,  5.,  8., 14., 13., 30., 30., 38., 58., 53., 51., 84.,
        78., 83., 68., 83., 51., 62., 41., 35., 43., 25.,  9., 10.,  6.,
        10.,  3.,  3.,  1.]),
 array([57.778     , 58.16018667, 58.54237333, 58.92456   , 59.30674667,
        59.68893333, 60.07112   , 60.45330667, 60.83549333, 61.21768   ,
        61.59986667, 61.98205333, 62.36424   , 62.74642667, 63.12861333,
        63.5108    , 63.89298667, 64.27517333, 64.65736   , 65.03954667,
        65.42173333, 65.80392   , 66.18610667, 66.56829333, 66.95048   ,
        67.33266667, 67.71485333, 68.09704   , 68.47922667, 68.86141333,
        69.2436    ]),
 <BarContainer object of 30 artists>)

# Overlay the sampling distributions of the mean for two sample sizes on a
# single figure so their spreads can be compared directly.
# Index over every row of the frame. The previous hard-coded range(0, 29999)
# could never pick index 29999 and tied the code to one dataset size.
population = range(len(df))
fig, ax = plt.subplots(figsize=(10, 8))
for sample_number, colour in ((50, 'red'), (500, 'green')):
    # 1000 random samples of `sample_number` rows, one mean per sample.
    d = [
        np.mean(df.data.iloc[random.sample(population, sample_number)].to_numpy())
        for _ in range(1000)
    ]
    print(np.mean(d))
    # Both histograms go on the same axes on purpose.
    plt.hist(d, bins=30, color=colour)

63.33670059999999
63.32107628

(array([ 3.,  1.,  3.,  8., 13., 13., 21., 33., 21., 41., 67., 64., 97.,
        91., 83., 97., 77., 67., 55., 39., 31., 23., 17., 13.,  9.,  4.,
         2.,  1.,  4.,  2.]),
 array([61.45406   , 61.58445333, 61.71484667, 61.84524   , 61.97563333,
        62.10602667, 62.23642   , 62.36681333, 62.49720667, 62.6276    ,
        62.75799333, 62.88838667, 63.01878   , 63.14917333, 63.27956667,
        63.40996   , 63.54035333, 63.67074667, 63.80114   , 63.93153333,
        64.06192667, 64.19232   , 64.32271333, 64.45310667, 64.5835    ,
        64.71389333, 64.84428667, 64.97468   , 65.10507333, 65.23546667,
        65.36586   ]),
 <BarContainer object of 30 artists>)

def t_stat(x, mu=63.3):
    """Return the one-sample t statistic of *x* against the mean *mu*.

    Computes ``(mean(x) - mu) / (s / sqrt(n))`` where ``s`` is the sample
    standard deviation (``ddof=1``) and ``n`` is the number of observations.

    Args:
        x: Sequence or array of numeric observations.
        mu: Hypothesised population mean. Defaults to 63.3, the value this
            function previously hard-coded.

    Returns:
        The t statistic as a float. For fewer than two observations, or
        when all observations are equal, the standard error is undefined
        or zero and NumPy yields ``nan``/``inf`` with a runtime warning.
    """
    values = np.asarray(x, dtype=float)
    n = values.size
    # ddof=1: a t statistic needs the unbiased sample standard deviation.
    # NumPy's default (ddof=0) underestimates it, noticeably for small n.
    s = np.std(values, ddof=1)
    return (np.mean(values) - mu) / (s / np.sqrt(n))

# Simulated distribution of the t statistic for small samples:
# 1000 random samples of `sample_number` rows, one t value per sample.
sample_number = 5
# Index over every row of the frame. The previous hard-coded range(0, 29999)
# could never pick index 29999 and tied the code to one dataset size.
population = range(len(df))
d = [
    t_stat(df.data.iloc[random.sample(population, sample_number)].to_numpy())
    for _ in range(1000)
]

fig, ax = plt.subplots(figsize=(10, 8))
plt.hist(d, bins=30, color='yellow')
plt.xlabel("t values")
plt.ylabel("f(t)")

Text(0, 0.5, 'f(t)')

# Simulated distribution of the t statistic for samples of 10 rows:
# 1000 random samples, one t value per sample.
sample_number = 10
# Index over every row of the frame. The previous hard-coded range(0, 29999)
# could never pick index 29999 and tied the code to one dataset size.
population = range(len(df))
d = [
    t_stat(df.data.iloc[random.sample(population, sample_number)].to_numpy())
    for _ in range(1000)
]

fig, ax = plt.subplots(figsize=(10, 8))
plt.hist(d, bins=30, color='blue')
plt.xlabel("t values")
plt.ylabel("f(t)")

Text(0, 0.5, 'f(t)')

# Simulated distribution of the t statistic for samples of 50 rows:
# 1000 random samples, one t value per sample.
sample_number = 50
# Index over every row of the frame. The previous hard-coded range(0, 29999)
# could never pick index 29999 and tied the code to one dataset size.
population = range(len(df))
d = [
    t_stat(df.data.iloc[random.sample(population, sample_number)].to_numpy())
    for _ in range(1000)
]

fig, ax = plt.subplots(figsize=(10, 8))
plt.hist(d, bins=30, color='green')
plt.xlabel("t values")
plt.ylabel("f(t)")

Text(0, 0.5, 'f(t)')

# Overlay the simulated t-statistic histograms for three sample sizes on one
# figure. Each histogram uses 1000 random samples of `sample_number` rows.
# Index over every row of the frame. The previous hard-coded range(0, 29999)
# could never pick index 29999 and tied the code to one dataset size.
population = range(len(df))
fig, ax = plt.subplots(figsize=(10, 8))
for sample_number, colour in ((5, 'yellow'), (10, 'blue'), (50, 'green')):
    d = [
        t_stat(df.data.iloc[random.sample(population, sample_number)].to_numpy())
        for _ in range(1000)
    ]
    # All three histograms share the axes created above on purpose.
    plt.hist(d, bins=30, color=colour)
plt.xlabel("t values")
plt.ylabel("f(t)")

Text(0, 0.5, 'f(t)')

# Simulate the t statistic for three sample sizes, overlay the histograms on
# one figure, and keep each set of t values (d1, d2, d3) for later plotting.
# Index over every row of the frame. The previous hard-coded range(0, 29999)
# could never pick index 29999 and tied the code to one dataset size.
population = range(len(df))
fig, ax = plt.subplots(figsize=(10, 8))
t_values_by_size = {}
for sample_number, colour in ((5, 'yellow'), (10, 'green'), (50, 'blue')):
    # 1000 random samples of `sample_number` rows, one t value per sample.
    t_values_by_size[sample_number] = [
        t_stat(df.data.iloc[random.sample(population, sample_number)].to_numpy())
        for _ in range(1000)
    ]
    # All three histograms share the axes created above on purpose.
    plt.hist(t_values_by_size[sample_number], bins=30, color=colour)
# Names read by the density plot further down the file.
d1, d2, d3 = (t_values_by_size[size] for size in (5, 10, 50))
plt.xlabel("t values")
plt.ylabel("f(t)")

Text(0, 0.5, 'f(t)')

# Compare the simulated t-statistic distributions as kernel density estimates.
# d1, d2 and d3 hold the t values for sample sizes 5, 10 and 50 respectively.
# `sns` is seaborn and must be imported at the top of the file.
fig, ax = plt.subplots(figsize=(10, 8))
for t_values, size in ((d1, 5), (d2, 10), (d3, 50)):
    # Draw on the axes created above rather than relying on the current axes.
    sns.kdeplot(t_values, label=f"sample size = {size}", ax=ax)
# Same x label as the t-value histograms earlier in the file.
ax.set_xlabel("t values")
plt.legend()

<matplotlib.legend.Legend at 0x1408443dfd0>