楚新元 | All in R

Welcome to R Square

利用 R 进行多元回归估计

楚新元 / 2021-08-23


  案例来源于古扎拉蒂《经济计量学精要》(第四版)第二章例 2-5 古董钟与拍卖价格。之所以选择此案例是因为此案例只有 3 个变量,32 个样本,便于手工验证。

读取数据

library(readxl)
library(dplyr)
library(kableExtra)
data = read_excel("data/Table2_14.xls", skip = 5)
data %>% 
  kable(
    caption = "古董钟与拍卖价格",
    format = "html"
  ) %>% 
   kable_styling(
     bootstrap_options = "striped",
     fixed_thead = TRUE,
     font_size = 14,
  ) %>% 
  column_spec(1, width = "80px") %>%
  column_spec(2:4, width = "120px")
古董钟与拍卖价格
Observation Price Age Number of Bidders
1 1235 127 13
2 1080 115 12
3 845 127 7
4 1552 150 9
5 1047 156 6
6 1979 182 11
7 1822 156 12
8 1253 132 10
9 1297 137 9
10 946 113 9
11 1713 137 15
12 1024 117 11
13 2131 170 14
14 1550 182 8
15 1884 162 11
16 2041 184 10
17 854 143 6
18 1483 159 9
19 1055 108 14
20 1545 175 8
21 729 108 6
22 1792 179 9
23 1175 111 15
24 1593 187 8
25 1147 137 8
26 1092 153 6
27 1152 117 13
28 1336 126 10
29 785 111 7
30 744 115 7
31 1356 194 5
32 1262 168 7

变量含义

利用 R 自带函数进行参数估计

fit = lm(Price ~ Age + `Number of Bidders`, data = data)
summary(fit)
## 
## Call:
## lm(formula = Price ~ Age + `Number of Bidders`, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -208.60 -119.04   15.73  101.84  212.54 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -1336.0493   175.2725  -7.623 2.10e-08 ***
## Age                    12.7414     0.9124  13.965 2.09e-14 ***
## `Number of Bidders`    85.7641     8.8020   9.744 1.19e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 134.6 on 29 degrees of freedom
## Multiple R-squared:  0.8906,	Adjusted R-squared:  0.8831 
## F-statistic: 118.1 on 2 and 29 DF,  p-value: 1.161e-14

手动计算参数值和各个统计量(含截距项)

生成矩阵 \(X\)

data %>%
  mutate(X1 = 1) %>%
  select(X1, Age, `Number of Bidders`) %>%
  as.matrix() -> X
print(X)
##       X1 Age Number of Bidders
##  [1,]  1 127                13
##  [2,]  1 115                12
##  [3,]  1 127                 7
##  [4,]  1 150                 9
##  [5,]  1 156                 6
##  [6,]  1 182                11
##  [7,]  1 156                12
##  [8,]  1 132                10
##  [9,]  1 137                 9
## [10,]  1 113                 9
## [11,]  1 137                15
## [12,]  1 117                11
## [13,]  1 170                14
## [14,]  1 182                 8
## [15,]  1 162                11
## [16,]  1 184                10
## [17,]  1 143                 6
## [18,]  1 159                 9
## [19,]  1 108                14
## [20,]  1 175                 8
## [21,]  1 108                 6
## [22,]  1 179                 9
## [23,]  1 111                15
## [24,]  1 187                 8
## [25,]  1 137                 8
## [26,]  1 153                 6
## [27,]  1 117                13
## [28,]  1 126                10
## [29,]  1 111                 7
## [30,]  1 115                 7
## [31,]  1 194                 5
## [32,]  1 168                 7

生成矩阵 \(Y\)

Y = as.matrix(data$Price)
print(Y)
##       [,1]
##  [1,] 1235
##  [2,] 1080
##  [3,]  845
##  [4,] 1552
##  [5,] 1047
##  [6,] 1979
##  [7,] 1822
##  [8,] 1253
##  [9,] 1297
## [10,]  946
## [11,] 1713
## [12,] 1024
## [13,] 2131
## [14,] 1550
## [15,] 1884
## [16,] 2041
## [17,]  854
## [18,] 1483
## [19,] 1055
## [20,] 1545
## [21,]  729
## [22,] 1792
## [23,] 1175
## [24,] 1593
## [25,] 1147
## [26,] 1092
## [27,] 1152
## [28,] 1336
## [29,]  785
## [30,]  744
## [31,] 1356
## [32,] 1262

计算样本容量 \(n\) 和自由度 \(df\)

n = nrow(data)
k = ncol(X)
df = n - k
print(c(n = n, k = k, df = df))
##  n  k df 
## 32  3 29

估计参数 \(\beta\)

Beta_hat = solve(t(X) %*% X) %*% t(X) %*% Y
colnames(Beta_hat) = "Beta_hat"
print(Beta_hat)
##                      Beta_hat
## X1                -1336.04929
## Age                  12.74138
## Number of Bidders    85.76407

计算回归残差 \(e\)

residuals = Y - (X %*% Beta_hat)
colnames(residuals) = "residuals"
print(residuals)
##        residuals
##  [1,] -162.03929
##  [2,]  -78.37862
##  [3,]  -37.45489
##  [4,]  204.96516
##  [5,] -119.19094
##  [6,]   52.71275
##  [7,]  141.22465
##  [8,]   49.54599
##  [9,]  115.60314
## [10,]   70.39635
## [11,]   17.01874
## [12,]  -74.09732
## [13,]  100.31715
## [14,] -118.99505
## [15,]  212.54042
## [16,]  174.99405
## [17,] -146.55296
## [18,]   21.29270
## [19,] -185.71707
## [20,]  -34.80536
## [21,]  174.39547
## [22,]   75.46503
## [23,] -189.70529
## [24,] -139.70197
## [25,]   51.36721
## [26,]  -35.96679
## [27,] -117.62546
## [28,]  208.99429
## [29,]  106.40725
## [30,]   14.44171
## [31,] -208.59945
## [32,] -142.85161

计算回归标准误 \(\hat{\sigma}\)

Sigma_hat = sqrt(sum(residuals^2) / df)
c(Sigma_hat = Sigma_hat)
## Sigma_hat 
##  134.6083

计算 \(\beta\) 估计量的方差与标准误

Beta_hat_var = solve(t(X) %*% X) * (Sigma_hat)^2
Beta1_hat_se = sqrt(Beta_hat_var[1, 1])
Beta2_hat_se = sqrt(Beta_hat_var[2, 2])
Beta3_hat_se = sqrt(Beta_hat_var[3, 3])
c(
  Beta1_hat_se = Beta1_hat_se,
  Beta2_hat_se  = Beta2_hat_se,
  Beta3_hat_se  = Beta3_hat_se
)
## Beta1_hat_se Beta2_hat_se Beta3_hat_se 
##  175.2724965    0.9123559    8.8019949

计算 \(t\) 统计量

t_Beta1_hat = Beta_hat[1, 1] / Beta1_hat_se
t_Beta2_hat = Beta_hat[2, 1] / Beta2_hat_se
t_Beta3_hat = Beta_hat[3, 1] / Beta3_hat_se

c(
  t_Beta1_hat = t_Beta1_hat,
  t_Beta2_hat = t_Beta2_hat,
  t_Beta3_hat = t_Beta3_hat
)
## t_Beta1_hat t_Beta2_hat t_Beta3_hat 
##   -7.622698   13.965366    9.743708

计算拟合优度 \(R^2\)

TSS = sum((Y - mean(Y))^2)
RSS = sum(residuals^2)
ESS = TSS - RSS
R_squared = 1 - RSS/TSS
c(R_squared = R_squared)
## R_squared 
## 0.8906143

计算修正的拟合优度 \(Adjusted\ R^2\)

Adjusted_R_squared  = 1 - (1 - R_squared) * (n - 1) / (n - k)
c(Adjusted_R_squared  = Adjusted_R_squared)
## Adjusted_R_squared 
##          0.8830705

计算 \(F\) 统计量

F_statistic = (R_squared / (k - 1)) /
  ((1 - R_squared) / (n - k))
c(F_statistic = F_statistic)
## F_statistic 
##    118.0585