多维随机变量 ¶

In [1]:

Copied!





## 例：XY的联合分布和边缘分布
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

x = np.random.poisson(5, 10000)
y = [np.random.poisson(i) for i in x]

sns.histplot(x, discrete=True)
sns.histplot(y, discrete=True)
g0 = sns.jointplot(x=x, y=y, kind="hex")
g0.ax_joint.set_title("Joint Distribution of X and Y")
plt.tight_layout()
## 例：XY 的联合分布和边缘分布
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

x = np.random.poisson(5, 10000)
y = [np.random.poisson(i) for i in x]

sns.histplot(x, discrete=True)
sns.histplot(y, discrete=True)
g0 = sns.jointplot(x=x, y=y, kind="hex")
g0.ax_joint.set_title("Joint Distribution of X and Y")
plt.tight_layout()

No description has been provided for this image

若多个随机变量构建新的变量，Z= g(X,Y), 则可以通过联合分布得到 Z 的分布

$$ E(g(X,Y)) = \sum_x\sum_y g(x,y)p(x,y) $$

若 g 是 X,Y 的线性函数时

$$ E(aX+bY+c) = a E(X)+ bE(Y)+c $$

班上有 300 个学生，每人得 A 的概率是 1/3。X 是班上得到 A 的学生的总数，问 X 的期望值？

In [2]:

Copied!





import scipy.stats as st

n = 300
p = 1 / 3
mean, var, skew, kurt = st.binom.stats(n, p, moments="mvsk")
print(mean, var, skew, kurt)
import scipy.stats as st

n = 300
p = 1 / 3
mean, var, skew, kurt = st.binom.stats(n, p, moments="mvsk")
print(mean, var, skew, kurt)

100.0 66.66666666666667 0.04082482904638632 -0.0049999999999999975

多维正态分布 ¶

二维正态随机变量 $\left(\mathbf{X}_1,\mathbf{X}_2\right)$ 的概率密度函数为：

$$ f(x_1, x_2) = \frac{1}{2\pi\sigma_1\sigma_2\sqrt{1-\rho^2}} \exp \left\{ \frac{-1}{2(1-\rho^2)} \left[ \frac{(x_1 - \mu_1)^2}{\sigma_1^2} + \frac{(x_2 - \mu_2)^2}{\sigma_2^2} - 2\rho \frac{(x_1 - \mu_1)(x_2 - \mu_2)}{\sigma_1\sigma_2} \right] \right\} $$

将上式中的协方差矩阵 $\mathbf{C}$ 写成矩阵形式，为：

$$ \mathbf{C} = \begin{pmatrix} c_{11} & c_{12} \\ c_{21} & c_{22} \end{pmatrix} = \begin{pmatrix} \sigma_1^2 & \rho\sigma_1\sigma_2 \\ \rho\sigma_1\sigma_2 & \sigma_2^2 \end{pmatrix} $$

它的行列式 $\det \mathbf{C} = \sigma_1^2\sigma_2^2(1-\rho^2)$，$\mathbf{C}$ 的逆矩阵为：

$$ \mathbf{C}^{-1} = \frac{1}{\det \mathbf{C}} \begin{pmatrix} \sigma_2^2 & -\rho\sigma_1\sigma_2 \\ -\rho\sigma_1\sigma_2 & \sigma_1^2 \end{pmatrix} = \frac{1}{\sigma_1^2\sigma_2^2(1-\rho^2)} \begin{pmatrix} \sigma_2^2 & -\rho\sigma_1\sigma_2 \\ -\rho\sigma_1\sigma_2 & \sigma_1^2 \end{pmatrix} $$

经过计算可知，这里矩阵 $(\mathbf{X}-\mathbf{\mu})^T$ 是 $(\mathbf{X}-\mathbf{\mu})$ 的转置矩阵：

$$ (\mathbf{X}-\mathbf{\mu})^T \mathbf{C}^{-1} (\mathbf{X}-\mathbf{\mu}) = \frac{1}{\det \mathbf{C}} \begin{pmatrix} x_1 - \mu_1 & x_2 - \mu_2 \end{pmatrix} \begin{pmatrix} \sigma_2^2 & -\rho\sigma_1\sigma_2 \\ -\rho\sigma_1\sigma_2 & \sigma_1^2 \end{pmatrix} \begin{pmatrix} x_1 - \mu_1 \\ x_2 - \mu_2 \end{pmatrix} = \frac{1}{\sigma_1^2\sigma_2^2(1-\rho^2)} \left[ (x_1 - \mu_1)^2 \sigma_2^2 + (x_2 - \mu_2)^2 \sigma_1^2 - 2\rho (x_1 - \mu_1)(x_2 - \mu_2) \sigma_1\sigma_2 \right] $$

于是 $(\mathbf{X}_1, \mathbf{X}_2)$ 的概率密度函数可写成：

$$ f(x_1, x_2) = \frac{1}{2\pi\sqrt{\det \mathbf{C}}} \exp \left\{ -\frac{1}{2} (\mathbf{X}-\mathbf{\mu})^T \mathbf{C}^{-1} (\mathbf{X}-\mathbf{\mu}) \right\} $$

二维正态随机变量 $(\mathbf{X}_1, \mathbf{X}_2)$ 的概率密度函数为：

$$ f(x_1, x_2) = \frac{1}{(2\pi)^{2/2} (\det \mathbf{C})^{1/2}} \exp \left\{ -\frac{1}{2} (\mathbf{X}-\mathbf{\mu})^T \mathbf{C}^{-1} (\mathbf{X}-\mathbf{\mu}) \right\} $$

上式容易推广到 $n$ 维正态随机变量 $(\mathbf{X}_1, \mathbf{X}_2, \cdots, \mathbf{X}_n)$ 的情况。引入列矩阵：

$$ \mathbf{X} = \begin{pmatrix}x_1 \\ x_2 \\ \vdots \\ x_n \end{pmatrix} \quad \text{和} \quad \mathbf{\mu} = \begin{pmatrix} \mu_1 \\ \mu_2 \\ \vdots \\ \mu_n \end{pmatrix} = \begin{pmatrix}E(\mathbf{X}_1) \\ E(\mathbf{X}_2) \\ \vdots \\ E(\mathbf{X}_n) \end{pmatrix} $$

$n$ 维正态随机变量 $(\mathbf{X}_1, \mathbf{X}_2, \cdots, \mathbf{X}_n)$ 的概率密度定义为：

$$ f(x_1, x_2, \cdots, x_n) = \frac{1}{(2\pi)^{n/2} (\det \mathbf{C})^{1/2}} \exp \left\{ -\frac{1}{2} (\mathbf{X}-\mathbf{\mu})^T \mathbf{C}^{-1} (\mathbf{X}-\mathbf{\mu}) \right\} $$

其中 $\mathbf{C}$ 是 $(\mathbf{X}_1, \mathbf{X}_2, \cdots, \mathbf{X}_n)$ 的协方差矩阵。

In [3]:

Copied!





import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy.stats import multivariate_normal

xmin = -4
xmax = 6
ymin = -4
ymax = 2
xmean = 1
ymean = -1
cov = np.array([[2.0, 0.3], [0.3, 0.5]])
# cmapV='Set3'
cmapV = "Reds"

##定义网格坐标
x, y = np.mgrid[xmin:xmax:0.1, ymin:ymax:0.06]
pos = np.dstack((x, y))
print(x, y, pos, sep="\n")

##定义二维正态分布的pdf
rv = multivariate_normal([xmean, ymean], cov)

fig = plt.figure(figsize=plt.figaspect(0.5))
ax = fig.add_subplot(1, 2, 1, projection="3d")  # subplot1,2,1,分为1行2列，得到第一列

ax.plot_surface(x, y, rv.pdf(pos), rstride=1, cstride=1, cmap=cmapV, edgecolor="none")
ax.set_title("3D Bivariate Normal PDF")

ax2 = fig.add_subplot(1, 2, 2)

##用颜色表示密度概率函数
pos2 = ax2.contourf(x, y, rv.pdf(pos), 6, cmap=cmapV, alpha=0.5)
##等高线
cset = ax2.contour(x, y, rv.pdf(pos), 6, cmap=cmapV)
##等高线的数值
ax2.clabel(cset, inline=1, fontsize=10)
ax2.set_title("Contour Plot of Bivariate Normal PDF")
##密度概率值与颜色对应图例
fig.colorbar(pos2)
fig.suptitle("Bivariate Normal Distribution Visualization")

plt.show()
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy.stats import multivariate_normal

xmin = -4
xmax = 6
ymin = -4
ymax = 2
xmean = 1
ymean = -1
cov = np.array([[2.0, 0.3], [0.3, 0.5]])
# cmapV='Set3'
cmapV = "Reds"

##定义网格坐标
x, y = np.mgrid[xmin:xmax:0.1, ymin:ymax:0.06]
pos = np.dstack((x, y))
print(x, y, pos, sep="\n")

##定义二维正态分布的pdf
rv = multivariate_normal([xmean, ymean], cov)

fig = plt.figure(figsize=plt.figaspect(0.5))
ax = fig.add_subplot(1, 2, 1, projection="3d")  # subplot1,2,1,分为1行2列，得到第一列

ax.plot_surface(x, y, rv.pdf(pos), rstride=1, cstride=1, cmap=cmapV, edgecolor="none")
ax.set_title("3D Bivariate Normal PDF")

ax2 = fig.add_subplot(1, 2, 2)

##用颜色表示密度概率函数
pos2 = ax2.contourf(x, y, rv.pdf(pos), 6, cmap=cmapV, alpha=0.5)
##等高线
cset = ax2.contour(x, y, rv.pdf(pos), 6, cmap=cmapV)
##等高线的数值
ax2.clabel(cset, inline=1, fontsize=10)
ax2.set_title("Contour Plot of Bivariate Normal PDF")
##密度概率值与颜色对应图例
fig.colorbar(pos2)
fig.suptitle("Bivariate Normal Distribution Visualization")

plt.show()

[[-4.  -4.  -4.  ... -4.  -4.  -4. ]
 [-3.9 -3.9 -3.9 ... -3.9 -3.9 -3.9]
 [-3.8 -3.8 -3.8 ... -3.8 -3.8 -3.8]
 ...
 [ 5.7  5.7  5.7 ...  5.7  5.7  5.7]
 [ 5.8  5.8  5.8 ...  5.8  5.8  5.8]
 [ 5.9  5.9  5.9 ...  5.9  5.9  5.9]]
[[-4.   -3.94 -3.88 ...  1.82  1.88  1.94]
 [-4.   -3.94 -3.88 ...  1.82  1.88  1.94]
 [-4.   -3.94 -3.88 ...  1.82  1.88  1.94]
 ...
 [-4.   -3.94 -3.88 ...  1.82  1.88  1.94]
 [-4.   -3.94 -3.88 ...  1.82  1.88  1.94]
 [-4.   -3.94 -3.88 ...  1.82  1.88  1.94]]
[[[-4.   -4.  ]
  [-4.   -3.94]
  [-4.   -3.88]
  ...
  [-4.    1.82]
  [-4.    1.88]
  [-4.    1.94]]

 [[-3.9  -4.  ]
  [-3.9  -3.94]
  [-3.9  -3.88]
  ...
  [-3.9   1.82]
  [-3.9   1.88]
  [-3.9   1.94]]

 [[-3.8  -4.  ]
  [-3.8  -3.94]
  [-3.8  -3.88]
  ...
  [-3.8   1.82]
  [-3.8   1.88]
  [-3.8   1.94]]

 ...

 [[ 5.7  -4.  ]
  [ 5.7  -3.94]
  [ 5.7  -3.88]
  ...
  [ 5.7   1.82]
  [ 5.7   1.88]
  [ 5.7   1.94]]

 [[ 5.8  -4.  ]
  [ 5.8  -3.94]
  [ 5.8  -3.88]
  ...
  [ 5.8   1.82]
  [ 5.8   1.88]
  [ 5.8   1.94]]

 [[ 5.9  -4.  ]
  [ 5.9  -3.94]
  [ 5.9  -3.88]
  ...
  [ 5.9   1.82]
  [ 5.9   1.88]
  [ 5.9   1.94]]]

边缘分布 ¶

对于给定给定 $\mu_1$ , $\mu_2$ , $\sigma_1$ , $\sigma_2$, $\rho$ 的二维正态分布，不同的 $\rho$ 对应不同的二位正态分布，他们的边缘分布是一样的

In [4]:

Copied!





rv = multivariate_normal([xmean, ymean], cov)

fig = plt.figure(figsize=plt.figaspect(0.5))
ax = fig.add_subplot(111)  ##1,1,1简写

x1d = np.arange(xmin, xmax, 0.1)
y1d = np.arange(-2, -2, 0.06)

for yval in [-2, -1, 0]:
    z = []
    for i in x1d:
        z.append(rv.pdf([i, yval]))
    con = ax.plot(x1d, z / sum(z), label="y={:.1f}".format(yval))
plt.legend()
plt.title("Conditional Distributions for Different Y Values")
plt.xlabel("X values")
plt.ylabel("Conditional probability density")
rv = multivariate_normal([xmean, ymean], cov)

fig = plt.figure(figsize=plt.figaspect(0.5))
ax = fig.add_subplot(111)  ##1,1,1简写

x1d = np.arange(xmin, xmax, 0.1)
y1d = np.arange(-2, -2, 0.06)

for yval in [-2, -1, 0]:
    z = []
    for i in x1d:
        z.append(rv.pdf([i, yval]))
    con = ax.plot(x1d, z / sum(z), label="y={:.1f}".format(yval))
plt.legend()
plt.title("Conditional Distributions for Different Y Values")
plt.xlabel("X values")
plt.ylabel("Conditional probability density")

Out[4]:

Text(0, 0.5, 'Conditional probability density')

条件分布 ¶

In [5]:

Copied!

from scipy.stats.contingency import margins

fig2 = plt.figure(figsize=plt.figaspect(0.5))
ax3 = fig2.add_subplot(111)

x1d = np.arange(xmin, xmax, 0.1)
y1d = np.arange(ymin, ymax, 0.06)

xmargin, ymargin = margins(rv.pdf(pos))

ax3.plot(x1d, xmargin / sum(xmargin), y1d, ymargin.T / sum(ymargin.T))
from scipy.stats.contingency import margins

fig2 = plt.figure(figsize=plt.figaspect(0.5))
ax3 = fig2.add_subplot(111)

x1d = np.arange(xmin, xmax, 0.1)
y1d = np.arange(ymin, ymax, 0.06)

xmargin, ymargin = margins(rv.pdf(pos))

ax3.plot(x1d, xmargin / sum(xmargin), y1d, ymargin.T / sum(ymargin.T))

Out[5]:

[<matplotlib.lines.Line2D at 0x7f4b660d3b30>,
 <matplotlib.lines.Line2D at 0x7f4b660ed040>]

多维随机变量 ¶

In [6]:

Copied!





cov = np.array([[2.0, 0.3], [0.3, 0.5]])

rns = multivariate_normal.rvs(mean=[xmean, ymean], cov=cov, size=1000)
g = sns.jointplot(x=rns[:, 0], y=rns[:, 1])
g.plot_joint(sns.kdeplot, color="r", levels=6)
plt.title("Bivariate Normal Distribution")
plt.xlabel("X values")
plt.ylabel("Y values")
plt.tight_layout()
cov = np.array([[2.0, 0.3], [0.3, 0.5]])

rns = multivariate_normal.rvs(mean=[xmean, ymean], cov=cov, size=1000)
g = sns.jointplot(x=rns[:, 0], y=rns[:, 1])
g.plot_joint(sns.kdeplot, color="r", levels=6)
plt.title("Bivariate Normal Distribution")
plt.xlabel("X values")
plt.ylabel("Y values")
plt.tight_layout()

In [7]:

Copied!





fig2 = plt.figure(figsize=plt.figaspect(1))
ax3 = fig2.add_subplot(111)

rns = multivariate_normal.rvs(mean=[xmean, ymean], cov=cov, size=2000)
ax3.plot(
    rns[:, 0], rns[:, 1], "ro", alpha=0.6, markersize=1, markeredgecolor="k", markeredgewidth=0.5
)
# rns[:,0]：表示所有样本的 x 坐标, rns[:,1]表示所有y坐标

conf = ax3.contourf(x, y, rv.pdf(pos), 6, cmap=cmapV, alpha=0.6)
ax3.contour(x, y, rv.pdf(pos), 6, cmap=cmapV, alpha=0.6)
plt.colorbar(conf)  # 为等高线图添加颜色条
plt.title("multivariate_normal")
fig2 = plt.figure(figsize=plt.figaspect(1))
ax3 = fig2.add_subplot(111)

rns = multivariate_normal.rvs(mean=[xmean, ymean], cov=cov, size=2000)
ax3.plot(
    rns[:, 0], rns[:, 1], "ro", alpha=0.6, markersize=1, markeredgecolor="k", markeredgewidth=0.5
)
# rns[:,0]：表示所有样本的 x 坐标, rns[:,1]表示所有y坐标

conf = ax3.contourf(x, y, rv.pdf(pos), 6, cmap=cmapV, alpha=0.6)
ax3.contour(x, y, rv.pdf(pos), 6, cmap=cmapV, alpha=0.6)
plt.colorbar(conf)  # 为等高线图添加颜色条
plt.title("multivariate_normal")

Out[7]:

Text(0.5, 1.0, 'multivariate_normal')