Multiple Regression Analysis ¶
The multiple linear regression model is written as $$ f(x_i) = \mathbf{w}^T \mathbf{x}_i + b $$ where $\mathbf{w}$ is the weight vector, $b$ is the bias term, $\mathbf{x}_i$ is the input vector, and $f(x_i)$ is the predicted value.
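For instance (numbers made up purely for illustration), with $d = 2$, $\mathbf{w} = (2, -1)^T$, $b = 3$ and $\mathbf{x}_i = (1, 4)^T$, the prediction is $f(x_i) = 2 \cdot 1 - 1 \cdot 4 + 3 = 1$.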
Parameter estimation ¶
To estimate $\mathbf{w}$ and $b$ we can use least squares. Combine $\mathbf{w}$ and $b$ into the vector $\hat{\mathbf{w}} = (\mathbf{w}; b)$, and write the dataset $D$ as a matrix $\mathbf{X}$ in which each row is one sample: the first $d$ elements are the sample's $d$ attribute values and the last element is always 1.
$$ \mathbf{X} = \begin{pmatrix} x_{11} & x_{12} & \ldots & x_{1d} & 1 \\ x_{21} & x_{22} & \ldots & x_{2d} & 1 \\ \vdots & \vdots & \ddots & \vdots & \vdots \\ x_{m1} & x_{m2} & \ldots & x_{md} & 1 \\ \end{pmatrix} = \begin{pmatrix} \mathbf{x}_1^T & 1 \\ \mathbf{x}_2^T & 1 \\ \vdots & \vdots \\ \mathbf{x}_m^T & 1 \\ \end{pmatrix} $$
Writing the label vector as $\mathbf{y} = (y_1; y_2; \ldots; y_m)$, the objective becomes:
$$ \hat{\mathbf{w}}^* = \arg \min_{\hat{\mathbf{w}}} (\mathbf{y} - \mathbf{X}\hat{\mathbf{w}})^T (\mathbf{y} - \mathbf{X}\hat{\mathbf{w}}) $$
Let $E_{\hat{\mathbf{w}}} = (\mathbf{y} - \mathbf{X}\hat{\mathbf{w}})^T (\mathbf{y} - \mathbf{X}\hat{\mathbf{w}})$; taking the derivative with respect to $\hat{\mathbf{w}}$ and setting it to zero gives the form of the optimum:
$$ \frac{\partial E_{\hat{\mathbf{w}}}}{\partial \hat{\mathbf{w}}} = 2 \mathbf{X}^T (\mathbf{X}\hat{\mathbf{w}} - \mathbf{y}) = 0 $$
When $\mathbf{X}^T \mathbf{X}$ is a full-rank matrix or a positive definite matrix, this gives:
$$ \hat{\mathbf{w}}^* = (\mathbf{X}^T \mathbf{X})^{-1} \mathbf{X}^T \mathbf{y} $$
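A minimal NumPy sketch of this closed-form solution (the data and true coefficients are made up; `np.linalg.solve` is used instead of forming the inverse explicitly):
import numpy as np
rng = np.random.default_rng(0)
m, d = 50, 3                                   # 50 samples, 3 features (arbitrary)
X_raw = rng.normal(size=(m, d))
w_true, b_true = np.array([1.5, -2.0, 0.5]), 3.0
y = X_raw @ w_true + b_true + rng.normal(scale=0.1, size=m)
X = np.hstack([X_raw, np.ones((m, 1))])        # append the all-ones column
w_hat = np.linalg.solve(X.T @ X, X.T @ y)      # (X^T X)^{-1} X^T y without an explicit inverse
print(w_hat)                                   # the last entry estimates the bias b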
Parameter intervals ¶
$$ (\hat\beta_j -\beta_j)/(\hat{\sigma}\sqrt{L^{-1}_{jj}}) \sim t_{n-p-1} $$
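A minimal sketch of turning this pivot into 95% confidence intervals for the coefficients (all names and data below are made up for illustration):
import numpy as np
import scipy.stats as st
rng = np.random.default_rng(1)
n, p = 30, 2                                   # illustrative sample size and number of predictors
X = np.hstack([np.ones((n, 1)), rng.normal(size=(n, p))])
y = X @ np.array([1.0, 2.0, -0.5]) + rng.normal(scale=0.3, size=n)
beta_hat = np.linalg.solve(X.T @ X, X.T @ y)
resid = y - X @ beta_hat
sigma2_hat = resid @ resid / (n - p - 1)       # \hat\sigma^2
L_inv = np.linalg.inv(X.T @ X)
se = np.sqrt(sigma2_hat * np.diag(L_inv))      # \hat\sigma \sqrt{L^{-1}_{jj}}
t_crit = st.t.ppf(0.975, n - p - 1)
ci = np.column_stack([beta_hat - t_crit * se, beta_hat + t_crit * se])
print(ci)                                      # 95% interval for each coefficient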
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as st
n = 2  # note: with n = 2 the F distribution below has 0 denominator degrees of freedom, hence the nan
fa = n / 2 / (n - 1)
a = st.f(2, n - 2).ppf(0.95) / fa  # F-based threshold (only defined for n > 2)
achi = st.chi2(2).ppf(0.95)  # chi-square threshold for 2 coefficients, ~5.99
print(a, achi)
nan 5.991464547107979
Errors in linear regression ¶
With $L = \mathbf{X}^T\mathbf{X}$, $\sigma^2 L^{-1}$ is the covariance matrix of $\hat{\mathbf{w}}$.
Hence (these hold when the predictor columns are centred, as in the code below):
- $\text{Var}(\beta_0) = \sigma^2/n$
- $\text{Var}(\beta_i) = \sigma^2 L^{-1}_{ii}$
- $\text{Cov}(\beta_0,\beta_j)=0$
- $\text{Cov}(\beta_i,\beta_j) = \sigma^2 L^{-1}_{ij}$
$\sigma^2$ can still be estimated from the residuals:
$$ \begin{aligned} \delta^2 = (Y - X\beta)^\intercal (Y-X\beta)\\ \hat \sigma^2 =\delta^2/(n-p-1)\\ \delta^2/\sigma^2 \sim \chi^2_{n-p-1} \end{aligned} $$
%matplotlib notebook
import scipy.stats as st
np.random.seed(304)
one = np.ones(20)
x = st.uniform(0, 10).rvs(20)
y = st.uniform(0, 8).rvs(20)
z = np.random.normal(4 * x + 4 * y + 10, 5)  # assumed response for illustration; coefficients match the later example
Y = z
print(x.shape, y.shape)
Xmm = np.vstack(
    [one, x - np.mean(x), y - np.mean(y)]
).T  # stack as 3 rows, then transpose so that each row is one sample with 3 columns
# print(Xmm.shape)
beta = np.linalg.solve(Xmm.T @ Xmm, Xmm.T @ Y)
# print(beta,"\n")
L_inv = np.linalg.inv(Xmm.T @ Xmm)
sigma2 = (Y - Xmm @ beta).T @ (Y - Xmm @ beta) / (20 - 2 - 1)
# print(L_inv,"\n")
err = np.sqrt(sigma2 * np.diag(L_inv))
# print(err)
(20,) (20,)
With two predictors, one typically checks whether the joint statistic computed from $\beta_1$ and $\beta_2$ is below 5.99, the $\chi^2_2(0.95)$ critical value.
The interval for a prediction at a new point $f(x)$ is wider than the interval for the fitted value $f(x_i)$, as the sketch below illustrates.
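A hedged sketch with statsmodels on toy data: `get_prediction(...).summary_frame()` returns both the interval for the fitted mean (`mean_ci_*`) and the wider interval for a new observation (`obs_ci_*`):
import numpy as np
import scipy.stats as st
import statsmodels.api as sm
np.random.seed(0)
x = st.uniform(0, 10).rvs(20)
y = st.uniform(0, 8).rvs(20)
z = np.random.normal(4 * x + 4 * y + 10, 5)    # same kind of toy model as below
Xp = sm.add_constant(np.vstack([x, y]).T)
res = sm.OLS(z, Xp).fit()
pred = res.get_prediction(Xp).summary_frame(alpha=0.05)
# mean_ci_* is the interval for the fitted mean f(x_i);
# obs_ci_* is the interval for a new observation and is always wider
print(pred[["mean_ci_lower", "mean_ci_upper", "obs_ci_lower", "obs_ci_upper"]].head())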
%matplotlib notebook
import numpy as np
import scipy.stats as st
from matplotlib.animation import FuncAnimation
np.random.seed(304)
a = 4
b = 4
c = 10
x = st.uniform(0, 10).rvs(20)
y = st.uniform(0, 8).rvs(20)
z = [np.random.normal(a * i + b * j + c, 5) for i, j in zip(x, y)]
def animate(dfn):
    ax.view_init(elev=32, azim=dfn)
fig = plt.figure(figsize=(4, 4))
ax = fig.add_subplot(111, projection="3d")
ax.plot(x, y, z, color="k", zorder=15, linestyle="none", marker="o", alpha=0.5)
anim = FuncAnimation(fig, animate, frames=range(0, 360, 2))
# anim.save('data.gif',fps=3,dpi=200)
from IPython.display import HTML
HTML(anim.to_jshtml())
fig.show()
# regression with sklearn
from sklearn import linear_model
X = np.vstack((x, y)).T  # stack x and y as columns: one sample per row
ols = linear_model.LinearRegression()  # create the model
model = ols.fit(X, z)  # fit the model
x_pred = np.linspace(0, 10, 50)
y_pred = np.linspace(0, 8, 50)
xx_pred, yy_pred = np.meshgrid(x_pred, y_pred)  # build a 50x50 grid
# flatten() turns each 50x50 grid into a length-2500 vector
model_viz = np.array([xx_pred.flatten(), yy_pred.flatten()]).T
# print(model_viz.shape) ----> (2500, 2)
predicted = model.predict(model_viz)
fig = plt.figure(figsize=(4, 4))
ax = fig.add_subplot(111, projection="3d")
ax.scatter(x, y, z, color="k", alpha=0.5)
ax.plot(xx_pred.flatten(), yy_pred.flatten(), predicted, alpha=0.9)
anim = FuncAnimation(fig, animate, frames=range(0, 360, 3))
fig.show()
# anim.save('fit.gif',fps=3,dpi=200)
from IPython.display import HTML
HTML(anim.to_jshtml())
print(model.intercept_, model.coef_)  # print the intercept and the coefficient of each feature
7.546774866213575 [3.8898734 4.2202737]
Of course, the same calculation can also be done directly with matrices. Note that here the first column must be all ones:
- the (non-zero) intercept is absorbed into the $\beta$ vector, so the $X$ in $X\beta$ gets an extra column of ones.
# compute with the matrix method (normal equations)
one = np.ones(20)
x = st.uniform(0, 10).rvs(20)
y = st.uniform(0, 8).rvs(20)
z = [np.random.normal(a * i + b * j + c, 5) for i, j in zip(x, y)]
Xm = np.vstack([one, x, y]).T
Y = np.array(z).reshape(20, 1)
b = np.linalg.solve(Xm.T @ Xm, Xm.T @ Y)
print(b)
[[6.96602352]
 [4.34440642]
 [4.11604012]]
The OLS model in statsmodels ¶
import statsmodels.api as sm
# Xp=np.vstack((np.ones(20),x, y)).T
Xp = sm.add_constant(X)  # equivalent to the commented line above
# ordinary least squares
model = sm.OLS(z, Xp)
results = model.fit()
print(results.summary())
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.294
Model: OLS Adj. R-squared: 0.211
Method: Least Squares F-statistic: 3.540
Date: Sun, 04 Jan 2026 Prob (F-statistic): 0.0518
Time: 05:53:34 Log-Likelihood: -78.492
No. Observations: 20 AIC: 163.0
Df Residuals: 17 BIC: 166.0
Df Model: 2
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 28.1183 8.023 3.505 0.003 11.192 45.045
x1 0.7903 1.102 0.717 0.483 -1.535 3.115
x2 3.5215 1.388 2.537 0.021 0.593 6.450
==============================================================================
Omnibus: 0.855 Durbin-Watson: 2.811
Prob(Omnibus): 0.652 Jarque-Bera (JB): 0.685
Skew: 0.415 Prob(JB): 0.710
Kurtosis: 2.635 Cond. No. 18.6
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Python's built-in tooling computes all of this:
- the F statistic
- the t values: each amounts to testing the null hypothesis that the corresponding coefficient is zero, i.e. $\beta_{x_1} = 0$ and $\beta_{x_2} = 0$
- the last two columns give the 95% confidence interval of each parameter
The individual numbers can be pulled out through the attributes and methods of the results object, as in the sketch below.
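For example, using the `results` object fitted above (standard attributes of statsmodels' RegressionResults):
print(results.params)      # coefficient estimates
print(results.bse)         # standard errors
print(results.tvalues)     # t statistics
print(results.pvalues)     # two-sided p-values
print(results.conf_int())  # 95% confidence intervals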
The pingouin package ¶
It achieves the same result with a slightly more compact interface.
import pingouin as pg
lm = pg.linear_regression(X, z)
print(lm)
names coef se T pval r2 adj_r2 \
0 Intercept 28.118337 8.022802 3.504803 0.002716 0.294045 0.210992
1 x1 0.790336 1.102023 0.717168 0.483009 0.294045 0.210992
2 x2 3.521503 1.387927 2.537239 0.021260 0.294045 0.210992
CI[2.5%] CI[97.5%]
0 11.191704 45.044970
1 -1.534730 3.115402
2 0.593233 6.449772
Hypothesis testing ¶
Testing each $\beta_i$ ¶
$$ (\hat\beta_j -\beta_j)/(\hat{\sigma}\sqrt{L^{-1}_{jj}}) \sim t_{n-p-1} $$
Testing the coefficients jointly (is there any relationship at all) ¶
Whether a set of coefficients $\beta_1,\beta_2,\beta_3 \dots$ are all zero:
- $H_0: \beta_1 = \cdots = \beta_r = 0$
- $R_2 = \delta'^2$, the residual sum of squares under the constraint $\beta_1 = \cdots = \beta_r = 0$
- $R_1 = \delta^2$, the residual sum of squares of the full model
- $\delta'$ is recomputed by least squares using only the remaining $p-r$ variables
- $(R_2 - R_1)/\sigma^2 \sim \chi^2_r$
- $(n-p-1)\hat{\sigma}^2/\sigma^2 \sim \chi^2_{n-p-1}$
- $(R_2 - R_1)/(r\hat{\sigma}^2) \sim F_{r,n-p-1}$
- reject the hypothesis when $(R_2 - R_1)/(r\hat{\sigma}^2) > F_{r,n-p-1}(0.05)$; a manual sketch of this computation follows below
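A minimal sketch of the $R_2 - R_1$ computation, reusing `z`, `Xp` and the fitted `results` from above and testing $\beta_1 = \beta_2 = 0$ (so $r = 2$, $p = 2$); it should reproduce the F statistic that `f_test` reports in the next cell:
import numpy as np
import scipy.stats as st
import statsmodels.api as sm
n, p, r = 20, 2, 2
restricted = sm.OLS(z, np.ones(n)).fit()   # constrained model: beta_1 = beta_2 = 0 (intercept only)
R1 = results.ssr                           # residual sum of squares of the full model, delta^2
R2 = restricted.ssr                        # residual sum of squares under the constraint
sigma2_hat = R1 / (n - p - 1)
F = (R2 - R1) / (r * sigma2_hat)
print(F, st.f(r, n - p - 1).sf(F))         # should match results.f_test("x1 = x2 = 0") below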
## single beta_i = 0
print(results.tvalues)
## multiple beta_r = 0
## method 1: the overall F test
print(results.f_pvalue)
hypotheses = "(x1 = x2 =0)"
print(results.f_test(hypotheses))
# reject the null hypothesis when the p-value is below 0.05
## method 2: the linearly independent rows of A select the coefficients to test
A = np.identity(len(results.params))
A = A[1:, :]
print(A)
print(results.f_test(A))
[3.50480251 0.71716783 2.53723929]
0.051832539952542715
<F test: F=3.540434571475472, p=0.05183253995254327, df_denom=17, df_num=2>
[[0. 1. 0.]
 [0. 0. 1.]]
<F test: F=3.540434571475474, p=0.0518325399525432, df_denom=17, df_num=2>
%matplotlib inline
from sklearn.preprocessing import PolynomialFeatures
a = 0.3
b = -4
c = 10
d = 8
x2 = st.uniform(0, 10).rvs(40)
x2 = np.sort(x2)
print("x2=", x2.shape)
y2 = np.random.normal(a * x2**3 + b * x2**2 + c * x2 + d, 2)
x2= (40,)
The data must first be arranged so that each column represents one feature and each row one sample.
fig = plt.figure()
plt.scatter(x2, y2)
poly = PolynomialFeatures(degree=5)
X_poly = poly.fit_transform(np.array(x2).reshape(40, 1))
print(X_poly.shape)
model2 = sm.OLS(y2, X_poly)
results2 = model2.fit()
print(x2.shape, y2.shape)
print(results2.params.shape)
ypred = model2.predict(results2.params, X_poly)
plt.plot(x2, ypred, color="C0")
# plt.show()
print(results2.summary())
(40, 6)
(40,) (40,)
(6,)
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.980
Model: OLS Adj. R-squared: 0.977
Method: Least Squares F-statistic: 325.7
Date: Sun, 04 Jan 2026 Prob (F-statistic): 1.09e-27
Time: 05:53:35 Log-Likelihood: -72.219
No. Observations: 40 AIC: 156.4
Df Residuals: 34 BIC: 166.6
Df Model: 5
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 6.8112 1.211 5.624 0.000 4.350 9.272
x1 13.8635 3.163 4.383 0.000 7.436 20.291
x2 -6.7615 2.080 -3.251 0.003 -10.989 -2.534
x3 1.0314 0.555 1.857 0.072 -0.097 2.160
x4 -0.0797 0.065 -1.235 0.225 -0.211 0.051
x5 0.0031 0.003 1.134 0.265 -0.002 0.009
==============================================================================
Omnibus: 1.219 Durbin-Watson: 2.149
Prob(Omnibus): 0.544 Jarque-Bera (JB): 0.414
Skew: 0.047 Prob(JB): 0.813
Kurtosis: 3.489 Cond. No. 2.63e+05
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.63e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
# use F-tests to check which polynomial degrees are needed
print(results2.f_test("x4=x5=0"))
print(results2.f_test("x3=x4=x5=0"))
<F test: F=1.2003554155678247, p=0.3135262379521731, df_denom=34, df_num=2>
<F test: F=115.33656688365886, p=6.8633367520199366e-18, df_denom=34, df_num=3>
for i in (2, 3):
    poly = PolynomialFeatures(degree=i)
    X_poly = poly.fit_transform(np.array(x2).reshape(40, 1))
    model2 = sm.OLS(y2, X_poly)
    results2 = model2.fit()
    print(results2.summary())
    ypred = model2.predict(results2.params, X_poly)
    fig.gca().plot(x2, ypred, color="C{:d}".format(i))
fig
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.771
Model: OLS Adj. R-squared: 0.759
Method: Least Squares F-statistic: 62.42
Date: Sun, 04 Jan 2026 Prob (F-statistic): 1.39e-12
Time: 05:53:36 Log-Likelihood: -120.50
No. Observations: 40 AIC: 247.0
Df Residuals: 37 BIC: 252.1
Df Model: 2
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 15.6061 1.855 8.415 0.000 11.848 19.364
x1 -3.8036 1.029 -3.697 0.001 -5.888 -1.719
x2 0.0469 0.119 0.394 0.696 -0.195 0.289
==============================================================================
Omnibus: 11.658 Durbin-Watson: 0.499
Prob(Omnibus): 0.003 Jarque-Bera (JB): 13.411
Skew: 0.915 Prob(JB): 0.00122
Kurtosis: 5.168 Cond. No. 80.7
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.978
Model: OLS Adj. R-squared: 0.976
Method: Least Squares F-statistic: 536.0
Date: Sun, 04 Jan 2026 Prob (F-statistic): 6.48e-30
Time: 05:53:36 Log-Likelihood: -73.584
No. Observations: 40 AIC: 155.2
Df Residuals: 36 BIC: 161.9
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 8.3412 0.703 11.869 0.000 6.916 9.766
x1 9.1856 0.775 11.852 0.000 7.614 10.757
x2 -3.7585 0.210 -17.915 0.000 -4.184 -3.333
x3 0.2844 0.015 18.434 0.000 0.253 0.316
==============================================================================
Omnibus: 1.220 Durbin-Watson: 2.023
Prob(Omnibus): 0.543 Jarque-Bera (JB): 0.424
Skew: 0.102 Prob(JB): 0.809
Kurtosis: 3.461 Cond. No. 960.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Exponential fitting ¶
$$ \begin{align*} Y = b_0 e^{b_1X}\\ \log Y = \log b_0 + b_1 X\\ \end{align*} $$
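A hedged sketch of fitting this by ordinary least squares on $\log Y$ (synthetic data with multiplicative noise so that $Y > 0$; the parameter values are made up):
import numpy as np
import statsmodels.api as sm
np.random.seed(0)
b0, b1 = 2.0, 0.3                              # true parameters (made up)
X = np.linspace(0, 5, 40)
Y = b0 * np.exp(b1 * X) * np.random.lognormal(0, 0.1, 40)   # multiplicative noise keeps Y > 0
res = sm.OLS(np.log(Y), sm.add_constant(X)).fit()
b0_hat, b1_hat = np.exp(res.params[0]), res.params[1]        # log b0 is the intercept
print(b0_hat, b1_hat)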
Correlation between two variables ¶
- Correlation coefficient
$$ \rho = \frac{\text{Cov}(x, y)}{\sqrt{\text{Var}(x) \text{Var}(y)}} $$
- Estimated from a sample:
$$ r = \frac{\sum (x - \bar{x})(y - \bar{y})}{\sqrt{\sum (x - \bar{x})^2 \sum (y - \bar{y})^2}} $$
- Hypothesis test: the Pearson correlation test
$$ H_0: \rho = 0, \quad H_1: \rho \ne 0 $$
The test statistic is
$$ T = \frac{r\sqrt{n - 2}}{\sqrt{1 - r^2}} \sim t_{n-2} $$
Equivalently, $H_0$ is rejected when $|r|$ exceeds the critical value $$ C = \frac{t_{n-2}(\alpha/2)}{\sqrt{n - 2 + t_{n-2}^2(\alpha/2)}} $$
Note that when judging correlation, the degrees of freedom of the system must be kept in mind.
When the degrees of freedom are very low (very few samples), even a sizeable $|r|$ may not be significant.
As the plot below shows, with only about 10 samples an $|r|$ below roughly 0.63 is already considered uncorrelated; the threshold drops to about 0.44 at a sample size of 20.
## critical value C as a function of sample size
cl = []
nl = []
for n in range(10, 100, 5):
    # print(n, cl)
    tn = st.t.ppf(0.975, n - 2)
    c = tn / np.sqrt(n - 2 + tn**2)
    cl.append(c)
    nl.append(n)
plt.plot(nl, cl)
plt.title("Critical Correlation Coefficient vs Sample Size")
plt.xlabel("Sample Size (n)")
plt.ylabel("Critical r value")
plt.grid(True, alpha=0.3)
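For the test itself, `scipy.stats.pearsonr` returns $r$ and the two-sided p-value directly; a quick sketch on made-up data:
import numpy as np
import scipy.stats as st
rng = np.random.default_rng(2)
u = rng.normal(size=20)
v = 0.5 * u + rng.normal(size=20)              # correlated by construction
r, pval = st.pearsonr(u, v)
print(r, pval)                                 # reject rho = 0 when pval < 0.05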
Partial correlation ¶
When analysing correlations, if $x_1$ and $x_2$ carry roughly the same information, one of them can be dropped first.
Keep the one whose result has the smaller variance.
The partial correlation coefficients can be derived from the full correlation matrix.
A large p-value supports the null hypothesis; a small p-value argues against it.
Within a set of observed variables $X_i$ that are related to each other:
- $X_3$ is a person's income; $X_1, X_2$ are what they spend on food and clothing
- we observe a positive correlation between $X_1$ and $X_2$, $r > 0$
- in reality both $X_1$ and $X_2$ are driven by $X_3$, which is what makes them look positively correlated
- if the influence of $X_3$ can be removed and their intrinsic correlation observed, it may turn out to be negative
Redefine $X_1', X_2'$ as
$$ X_1' = X_1 - L(X_3 \cdots) $$
$$ X_2' = X_2 - L'(X_3 \cdots) $$
with $L, L'$ chosen so that $E(X_1')^2$ and $E(X_2')^2$ are minimised.
It can then be shown that
$$ \rho(X_1', X_2') = (\rho_{12} - \rho_{13} \rho_{23})/[(1 - \rho_{13}^2)(1 - \rho_{23}^2)]^{1/2} $$
Writing the correlation matrix of the original $X$ as
$$ P = \begin{pmatrix} 1 & \cdots & \rho_{1p} \\ \vdots & 1 & \vdots \\ \rho_{p1} & \cdots & 1 \end{pmatrix} $$
Let $P_{ij}$ denote the determinant obtained from $P$ by deleting row $i$ and column $j$; then
- $\rho(X_1', X_2') = P_{12}/\sqrt{P_{11} P_{22}}$
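A hedged sketch of this formula via the inverse of the correlation matrix $\Omega = P^{-1}$ (equivalent to the cofactor expression; the matrix here is the same made-up correlation matrix used in the last example below):
import numpy as np
P = np.array([[1.0, 0.5, 0.9],
              [0.5, 1.0, 0.2],
              [0.9, 0.2, 1.0]])                # a full correlation matrix
Omega = np.linalg.inv(P)
# partial correlation of variables i and j given all the others: -Omega_ij / sqrt(Omega_ii * Omega_jj)
pcorr = -Omega / np.sqrt(np.outer(np.diag(Omega), np.diag(Omega)))
np.fill_diagonal(pcorr, 1.0)
print(pcorr.round(3))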
%matplotlib inline
import pandas as pd
x3 = st.expon(0, 1000).rvs(50) # income
x1 = np.random.normal(x3 * 0.4 + 30, 100) # spent on eating
x2 = np.random.normal(x3 - x1 - 20, 10) # spent on clothes
plt.scatter(x1, x2)
plt.title("Spending on Food vs Clothes")
plt.xlabel("Spending on Food")
plt.ylabel("Spending on Clothes")
plt.show()
xyz = np.vstack((x1, x2, x3)).T
df = pd.DataFrame(xyz, columns=["x", "y", "z"])
print("Correlation\n", df.corr())
print("Partial Correlation\n", df.pcorr())
print(pg.partial_corr(data=df, x="x", y="y", covar="z", method="pearson"))
cor = np.corrcoef(xyz.T)
Correlation
x y z
x 1.000000 0.968299 0.988874
y 0.968299 1.000000 0.994647
z 0.988874 0.994647 1.000000
Partial Correlation
x y z
x 1.000000 -0.994087 0.997907
y -0.994087 1.000000 0.998991
z 0.997907 0.998991 1.000000
n r CI95% p-val
pearson 50 -0.994087 [-1.0, -0.99] 5.593235e-47
# computing it by hand gives the same result
print((cor[0, 1] - cor[0, 2] * cor[1, 2]) / np.sqrt((1 - cor[0, 2] ** 2) * (1 - cor[1, 2] ** 2)))
print((cor[1, 2] - cor[0, 1] * cor[0, 2]) / np.sqrt((1 - cor[0, 1] ** 2) * (1 - cor[0, 2] ** 2)))
print((cor[0, 2] - cor[0, 1] * cor[1, 2]) / np.sqrt((1 - cor[0, 1] ** 2) * (1 - cor[1, 2] ** 2)))
-0.9940869184414207
0.9989905555878712
0.9979069885854778
meana = [23, 34, 51]
covD = np.array((4, 5, 6))
corD = [[1, 0.5, 0.9], [0.5, 1, 0.2], [0.9, 0.2, 1]]
cov = covD * covD.reshape(3, -1) * corD
print(np.array(corD))
mn = st.multivariate_normal(meana, cov)
xyz = mn.rvs(100)
df = pd.DataFrame(xyz, columns=["x", "y", "z"])
print(df.corr())
df.pcorr().round(3)
[[1. 0.5 0.9]
[0.5 1. 0.2]
[0.9 0.2 1. ]]
x y z
x 1.000000 0.503486 0.890283
y 0.503486 1.000000 0.194100
z 0.890283 0.194100 1.000000
|   | x | y | z |
|---|---|---|---|
| x | 1.000 | 0.740 | 0.935 |
| y | 0.740 | 1.000 | -0.646 |
| z | 0.935 | -0.646 | 1.000 |