8: Discriptive statistics, Estimation¶

Back to main page ¶

Box plots¶

Below we display the blox plot of the following data:

$$\{-30,-1, -5, -0.5, 0.5, 0.6, 0, 2, 3, 4.6, 4, 7, 18, 35\}.$$

# nbi:hide_in
import matplotlib.pyplot as plt
import numpy as np

# set up the figure
fig, ax = plt.subplots(1,1, figsize=(7,4))

# data
data = np.array([-30, -5,  9, 10, 13, 20, 40, 500]).astype(float)
# data = np.array([-30,-1, -5, -0.5, 0.5, 0.6, 0, 2, 3, 4.6, 4, 7, 18, 35]).astype(float)


def percentile(data, p):
    """
    Compute the percentiles the way we defined in class 
    data : array of size N 
    p : percentile
    """
    data = np.sort(data, axis=0)
    rank = int(p * (data.shape[0] + 1) - 1) # the rank
    assert rank > 0, "the rank does not exist" 
    alpha = p * (data.shape[0] + 1) - 1 - rank # the fractional part
    return data[rank] + alpha * (data[rank + 1] - data [rank])

def box_plot(ax, data, width=0.4, showout = True, position = np.array([0.4])):
    """
    ax : matplotlib ax
    data : the data 
    width : box width
    showout : show the outliers   
    position: the y axis of the box plot
    """
    # compute the five number summary 
    minim = np.min(data)
    maxim = np.max(data)
    q1 = percentile(data, 0.25)
    q2 = np.median(data)
    q3 = percentile(data, 0.75)

    # interquartile range
    iqr = q3 - q1

    # inner fences
    left_innerfence = q1 - 1.5 * iqr
    right_innerfence = q3 + 1.5 * iqr

    # compute outliers 
    outliers = data[np.logical_or(data <left_innerfence, data >= right_innerfence)]
    
    # whiskers
    if showout==True:
        low_whisker = np.min(data[data >= left_innerfence])
        high_whisker = np.max(data[data <= right_innerfence])
    else:
        low_whisker = np.min(data)
        high_whisker = np.max(data)



    stats = [{'iqr': iqr,
              'whishi': high_whisker,
              'whislo': low_whisker,
              'fliers': outliers,
              'q1': q1,
              'med': q2,
              'q3': q3}]

    # add the box plot
    flierprops = dict(markerfacecolor='black', markersize=5)
    ax.bxp(stats, vert = False, widths=width, positions = position, 
           flierprops=flierprops, showfliers=showout)

    # add Tukey's fences
    if showout==True:
        ax.vlines(q1-1.5*iqr, position-0.2,position+0.2, linestyle="dashed", linewidth=1)
        ax.vlines(q3+1.5*iqr, position-0.2,position+0.2, linestyle="dashed", linewidth=1)

        ax.vlines(q1-3*iqr, position-0.2,position+0.2, linestyle="dashed", linewidth=1)
        ax.vlines(q3+3*iqr, position-0.2,position+0.2, linestyle="dashed", linewidth=1)

    #  
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.set_ylim(-0.1,position+0.3)
    ax.set_yticks([])
    plt.figtext(1,0.8,
                r"$\min={:.4}$".format(minim)+"\n"+
                r"$q_1={:.4}$".format(q1)+"\n"+
                r"med$={:.4}$".format(q2)+"\n"+
                r"$q_3={:.4}$".format(q3)+"\n"+
                r"max$={:.4}$".format(maxim),
                ha="left", va="top",
                backgroundcolor=(0.1, 0.1, 1, 0.15),
                fontsize="large")
    
def disp_data(ax, data):
    ax.scatter(data, np.zeros(data.shape), zorder=2, s=10)
    ax.set_yticks([])
#     ax.set_xticks([])
    mean = np.mean(data)
    ax.scatter(mean, 0, zorder=2, s=20, color="red")
    ax.set_ylim(-0.01,0.1)
    ax.axhline(y=0, color='k', zorder=1, linewidth=0.5)

    
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['bottom'].set_visible(False)

    ax.set_ylim(-0.1,1.5)
            
box_plot(ax, data, width=0.2, showout=True)

plt.show();

Simple least squares regression¶

The following data is taken from here. The dataset contains systolic blood pressure for 30 people of different ages.

import pandas as pd 
from IPython.display import display, HTML
display(HTML("<style>.container { width:61% !important; }</style>"))

pdata=pd.DataFrame.transpose(pd.read_csv('files/blood_pressure.csv').astype(int))
display(HTML(pdata.to_html(header=False)))

Age	39	47	45	47	65	46	67	42	67	56	64	56	59	34	42	48	45	17	20	19	36	50	39	21	44	53	63	29	25	69
SDP	144	220	138	145	162	142	170	124	158	154	162	150	140	110	128	130	135	114	116	124	136	142	120	120	160	158	144	130	125	175

# nbi:hide_in
import numpy as np
import matplotlib.pyplot as plt

# plot setup
plt.figure(figsize=(7,6)) 
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

# load data from a csv file
data = np.genfromtxt('files/blood_pressure.csv', delimiter=',')[1:,:]
age = data[:,0]
sbp = data[:,1]

# the least squares line
meanx = np.mean(age)
meany = np.mean(sbp)
varx = np.var(age)
cov = np.cov(age, sbp)[0,1]

xvals = np.arange(np.min(age)-5, np.max(age)+5, 0.1)
yvals = cov*(xvals - meanx)/varx + meany


plt.xlabel("Age")
plt.ylabel("Systolic Blood Pressure (SDP)")
plt.plot(xvals, yvals, c="r", zorder=1)
plt.scatter(age, sbp, zorder=0);

# nbi:hide_in
import numpy as np
import matplotlib.pyplot as plt

# plot setup
plt.figure(figsize=(7,6)) 
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

# load data from a csv file
data = np.genfromtxt('files/blood_pressure.csv', delimiter=',')[1:,:]
age = data[:,0]
sbp = data[:,1]

# the least squares line
meanx = np.mean(age)
meany = np.mean(sbp)
varx = np.var(age)
cov = np.cov(age, sbp)[0,1]

res = sbp - cov*(age - meanx)/varx - meany

plt.gca().spines['bottom'].set_position(('data',0))
plt.title("Residuals")
plt.scatter(age, res, zorder=0, alpha=0.7);

MLE linear regression for synthetic data¶

# nbi:hide_in
import numpy as np
import matplotlib.pyplot as plt

# plot setup
plt.figure(figsize=(7,6)) 
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

# true parameters
a = -2
b = 1
sigma = 0.2
numData = 100

# generate data
x = np.arange(0,1,1/numData)
y = a*x + b
eps = np.random.randn(x.shape[0])*sigma
y = y + eps

# the least squares line
meanx = np.mean(x)
meany = np.mean(y)
varx = np.var(x)
cov = np.cov(x, y)[0,1]

# MLE 
aMLE = cov / varx
bMLE = meany - cov*meanx / varx
res = y - cov*(x - meanx)/varx - meany
varMLE = np.linalg.norm(res)**2 / numData


# MLE line
xvals = np.arange(np.min(x)-0.3, np.max(x)+0.3, 0.1)
yvals = cov*(xvals - meanx)/varx + meany

plt.xlabel("x")
plt.ylabel("y")
plt.figtext(0.83,0.75, r" $\hat a$={:.4f}".format(aMLE) +
            "\n $\hat b$ = {:.4f}\n".format(bMLE) +
            r" $\hat\sigma^2$ = {:.4f}".format(varMLE),
            ha="left", va="top",
            backgroundcolor=(0.1, 0.1, 1, 0.15), fontsize="large")
plt.plot(xvals, yvals, c="r", zorder=1)
plt.scatter(x, y, zorder=0, alpha=0.7);

Age	39	47	45	47	65	46	67	42	67	56	64	56	59	34	42	48	45	17	20	19	36	50	39	21	44	53	63	29	25	69
SDP	144	220	138	145	162	142	170	124	158	154	162	150	140	110	128	130	135	114	116	124	136	142	120	120	160	158	144	130	125	175

Age	39	47	45	47	65	46	67	42	67	56	64	56	59	34	42	48	45	17	20	19	36	50	39	21	44	53	63	29	25	69
SDP	144	220	138	145	162	142	170	124	158	154	162	150	140	110	128	130	135	114	116	124	136	142	120	120	160	158	144	130	125	175

8: Discriptive statistics, Estimation¶

Back to main page¶

Box plots¶

Simple least squares regression¶

MLE linear regression for synthetic data¶

Back to main page ¶

Age	39	47	45	47	65	46	67	42	67	56	64	56	59	34	42	48	45	17	20	19	36	50	39	21	44	53	63	29	25	69
SDP	144	220	138	145	162	142	170	124	158	154	162	150	140	110	128	130	135	114	116	124	136	142	120	120	160	158	144	130	125	175