Below we display the blox plot of the following data:
$$\{-30,-1, -5, -0.5, 0.5, 0.6, 0, 2, 3, 4.6, 4, 7, 18, 35\}.$$# nbi:hide_in
import matplotlib.pyplot as plt
import numpy as np
# set up the figure
fig, ax = plt.subplots(1,1, figsize=(7,4))
# data
data = np.array([-30, -5, 9, 10, 13, 20, 40, 500]).astype(float)
# data = np.array([-30,-1, -5, -0.5, 0.5, 0.6, 0, 2, 3, 4.6, 4, 7, 18, 35]).astype(float)
def percentile(data, p):
"""
Compute the percentiles the way we defined in class
data : array of size N
p : percentile
"""
data = np.sort(data, axis=0)
rank = int(p * (data.shape[0] + 1) - 1) # the rank
assert rank > 0, "the rank does not exist"
alpha = p * (data.shape[0] + 1) - 1 - rank # the fractional part
return data[rank] + alpha * (data[rank + 1] - data [rank])
def box_plot(ax, data, width=0.4, showout = True, position = np.array([0.4])):
"""
ax : matplotlib ax
data : the data
width : box width
showout : show the outliers
position: the y axis of the box plot
"""
# compute the five number summary
minim = np.min(data)
maxim = np.max(data)
q1 = percentile(data, 0.25)
q2 = np.median(data)
q3 = percentile(data, 0.75)
# interquartile range
iqr = q3 - q1
# inner fences
left_innerfence = q1 - 1.5 * iqr
right_innerfence = q3 + 1.5 * iqr
# compute outliers
outliers = data[np.logical_or(data <left_innerfence, data >= right_innerfence)]
# whiskers
if showout==True:
low_whisker = np.min(data[data >= left_innerfence])
high_whisker = np.max(data[data <= right_innerfence])
else:
low_whisker = np.min(data)
high_whisker = np.max(data)
stats = [{'iqr': iqr,
'whishi': high_whisker,
'whislo': low_whisker,
'fliers': outliers,
'q1': q1,
'med': q2,
'q3': q3}]
# add the box plot
flierprops = dict(markerfacecolor='black', markersize=5)
ax.bxp(stats, vert = False, widths=width, positions = position,
flierprops=flierprops, showfliers=showout)
# add Tukey's fences
if showout==True:
ax.vlines(q1-1.5*iqr, position-0.2,position+0.2, linestyle="dashed", linewidth=1)
ax.vlines(q3+1.5*iqr, position-0.2,position+0.2, linestyle="dashed", linewidth=1)
ax.vlines(q1-3*iqr, position-0.2,position+0.2, linestyle="dashed", linewidth=1)
ax.vlines(q3+3*iqr, position-0.2,position+0.2, linestyle="dashed", linewidth=1)
#
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.set_ylim(-0.1,position+0.3)
ax.set_yticks([])
plt.figtext(1,0.8,
r"$\min={:.4}$".format(minim)+"\n"+
r"$q_1={:.4}$".format(q1)+"\n"+
r"med$={:.4}$".format(q2)+"\n"+
r"$q_3={:.4}$".format(q3)+"\n"+
r"max$={:.4}$".format(maxim),
ha="left", va="top",
backgroundcolor=(0.1, 0.1, 1, 0.15),
fontsize="large")
def disp_data(ax, data):
ax.scatter(data, np.zeros(data.shape), zorder=2, s=10)
ax.set_yticks([])
# ax.set_xticks([])
mean = np.mean(data)
ax.scatter(mean, 0, zorder=2, s=20, color="red")
ax.set_ylim(-0.01,0.1)
ax.axhline(y=0, color='k', zorder=1, linewidth=0.5)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.set_ylim(-0.1,1.5)
box_plot(ax, data, width=0.2, showout=True)
plt.show();
import pandas as pd
from IPython.display import display, HTML
display(HTML("<style>.container { width:61% !important; }</style>"))
pdata=pd.DataFrame.transpose(pd.read_csv('files/blood_pressure.csv').astype(int))
display(HTML(pdata.to_html(header=False)))
# nbi:hide_in
import numpy as np
import matplotlib.pyplot as plt
# plot setup
plt.figure(figsize=(7,6))
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
# load data from a csv file
data = np.genfromtxt('files/blood_pressure.csv', delimiter=',')[1:,:]
age = data[:,0]
sbp = data[:,1]
# the least squares line
meanx = np.mean(age)
meany = np.mean(sbp)
varx = np.var(age)
cov = np.cov(age, sbp)[0,1]
xvals = np.arange(np.min(age)-5, np.max(age)+5, 0.1)
yvals = cov*(xvals - meanx)/varx + meany
plt.xlabel("Age")
plt.ylabel("Systolic Blood Pressure (SDP)")
plt.plot(xvals, yvals, c="r", zorder=1)
plt.scatter(age, sbp, zorder=0);
# nbi:hide_in
import numpy as np
import matplotlib.pyplot as plt
# plot setup
plt.figure(figsize=(7,6))
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
# load data from a csv file
data = np.genfromtxt('files/blood_pressure.csv', delimiter=',')[1:,:]
age = data[:,0]
sbp = data[:,1]
# the least squares line
meanx = np.mean(age)
meany = np.mean(sbp)
varx = np.var(age)
cov = np.cov(age, sbp)[0,1]
res = sbp - cov*(age - meanx)/varx - meany
plt.gca().spines['bottom'].set_position(('data',0))
plt.title("Residuals")
plt.scatter(age, res, zorder=0, alpha=0.7);
# nbi:hide_in
import numpy as np
import matplotlib.pyplot as plt
# plot setup
plt.figure(figsize=(7,6))
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
# true parameters
a = -2
b = 1
sigma = 0.2
numData = 100
# generate data
x = np.arange(0,1,1/numData)
y = a*x + b
eps = np.random.randn(x.shape[0])*sigma
y = y + eps
# the least squares line
meanx = np.mean(x)
meany = np.mean(y)
varx = np.var(x)
cov = np.cov(x, y)[0,1]
# MLE
aMLE = cov / varx
bMLE = meany - cov*meanx / varx
res = y - cov*(x - meanx)/varx - meany
varMLE = np.linalg.norm(res)**2 / numData
# MLE line
xvals = np.arange(np.min(x)-0.3, np.max(x)+0.3, 0.1)
yvals = cov*(xvals - meanx)/varx + meany
plt.xlabel("x")
plt.ylabel("y")
plt.figtext(0.83,0.75, r" $\hat a$={:.4f}".format(aMLE) +
"\n $\hat b$ = {:.4f}\n".format(bMLE) +
r" $\hat\sigma^2$ = {:.4f}".format(varMLE),
ha="left", va="top",
backgroundcolor=(0.1, 0.1, 1, 0.15), fontsize="large")
plt.plot(xvals, yvals, c="r", zorder=1)
plt.scatter(x, y, zorder=0, alpha=0.7);