对于模型,如果出现Var(Ui) = σi^2,即对于不同的样本点,随机误差项的方差不再是常数,而互不相同,则认为出现了异方差性(Heteroscedasticity)。

加载数据

1
2
3
library(tidyverse)
df = read.csv("~/mydata/agricul.csv")
print(df)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
##      aeras product
## 1    907.5   16.31
## 2    873.2   17.14
## 3  13159.2  125.24
## 4   5928.1   42.24
## 5   6834.4   40.28
## 6   5495.5   84.47
## 7   6055.2   70.70
## 8  12694.6  101.67
## 9   1018.5   16.83
## 10 12770.9  211.51
## 11  6542.7  101.00
## 12 12244.3  155.87
## 13  3601.5   49.72
## 14  8158.1   69.70
## 15 16564.5  255.92
## 16 17729.2  183.65
## 17 11061.5  146.79
## 18 11304.7  129.63
## 19  9166.2  154.28
## 20  6821.7   61.24
## 21 17779.6  206.50
## 22  4701.3   44.37
## 23  6036.1   51.79
## 24   316.5    3.53
## 25  7016.5   59.45
## 26  5252.5   37.29
## 27   761.7    6.33
## 28  1235.2   10.07
## 29  4275.1   44.78

建立一元线性回归模型

1
2
agricul_model = lm(product ~ aeras, data=df)
summary(agricul_model)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
## 
## Call:
## lm(formula = product ~ aeras, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -48.924 -21.254   0.527  11.051  59.976 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.6610192  8.9241561  -0.634    0.531    
## aeras        0.0123088  0.0009888  12.449 1.07e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27.06 on 27 degrees of freedom
## Multiple R-squared:  0.8516,	Adjusted R-squared:  0.8461 
## F-statistic:   155 on 1 and 27 DF,  p-value: 1.066e-12

残差的图形检验

1
2
library(ggfortify)
autoplot(agricul_model)

图上残差的走势呈现喇叭状,所以初步判断是异方差。

非参数–方差齐性检验

1
2
3
4
5
6
n = nrow(df)
m = ceiling(n/2)
o = order(df$aeras)  # order把顺序号计算出来
resid1 = agricul_model$residuals[o[1:m]]
resid2 = agricul_model$residuals[o[-1:-m]]
var.test(resid1,resid2)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
## 
## 	F test to compare two variances
## 
## data:  resid1 and resid2
## F = 0.17408, num df = 14, denom df = 13, p-value = 0.002558
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.05648489 0.52430502
## sample estimates:
## ratio of variances 
##          0.1740782

原假设是同方差,p-value = 0.0025576,所以拒绝同方差原假设。

Breusch-Pagan检验

1
2
library(lmtest)
bptest(agricul_model) # 默认是学生化的检验,即studentize = TRUE
1
2
3
4
5
## 
## 	studentized Breusch-Pagan test
## 
## data:  agricul_model
## BP = 7.7339, df = 1, p-value = 0.005419
1
bptest(agricul_model,studentize = FALSE)  
1
2
3
4
5
## 
## 	Breusch-Pagan test
## 
## data:  agricul_model
## BP = 7.8051, df = 1, p-value = 0.00521

p-value很小,说明存在异方差。

White异方差检验

1
2
model_white = lm(I(agricul_model$residuals^2) ~ aeras + I(aeras^2), data=df)
summary(model_white)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
## 
## Call:
## lm(formula = I(agricul_model$residuals^2) ~ aeras + I(aeras^2), 
##     data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1452.92  -392.76   -49.53   194.52  2357.00 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.197e+02  4.018e+02  -0.547    0.589
## aeras        1.595e-01  1.075e-01   1.484    0.150
## I(aeras^2)  -3.539e-06  5.947e-06  -0.595    0.557
## 
## Residual standard error: 870.1 on 26 degrees of freedom
## Multiple R-squared:  0.2765,	Adjusted R-squared:  0.2209 
## F-statistic: 4.969 on 2 and 26 DF,  p-value: 0.01487

此处也可以用bptest函数进行White异方差检验,具体如下:
bptest(agricul_model, ~ aeras + I(aeras^2), data=df)

在5%的置信水平下,以上模型F统计量的p值很小,说明整个模型显著,拒绝同方差原假设,White异方差检验结果表明原模型存在异方差。

异方差的修正(注意R的weights参数设置)

误差方差与aeras^2成比例,原模型两边同除以aeras

1
2
3
4
agricul_model_w1 = glm(product ~ aeras, 
                       weights = 1/aeras^2, 
                       data = df)
summary(agricul_model_w1)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
## 
## Call:
## glm(formula = product ~ aeras, data = df, weights = 1/aeras^2)
## 
## Deviance Residuals: 
##       Min         1Q     Median         3Q        Max  
## -0.005568  -0.002899  -0.001029   0.003880   0.007344  
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.8238808  1.0984633    0.75     0.46    
## aeras       0.0113412  0.0008247   13.75 1.03e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1.413013e-05)
## 
##     Null deviance: 0.00305385  on 28  degrees of freedom
## Residual deviance: 0.00038151  on 27  degrees of freedom
## AIC: 257.31
## 
## Number of Fisher Scoring iterations: 2
1
2
3
4
# agricul_model_w1模型的等价形式
agricul_model_w1_equal = lm(I(product/aeras) ~ I(1/aeras), 
                            data = df)
summary(agricul_model_w1_equal)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
## 
## Call:
## lm(formula = I(product/aeras) ~ I(1/aeras), data = df)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.005568 -0.002899 -0.001029  0.003880  0.007344 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.0113412  0.0008247   13.75 1.03e-13 ***
## I(1/aeras)  0.8238808  1.0984633    0.75     0.46    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.003759 on 27 degrees of freedom
## Multiple R-squared:  0.02041,	Adjusted R-squared:  -0.01587 
## F-statistic: 0.5625 on 1 and 27 DF,  p-value: 0.4597

误差方差与aeras成比例,原模型两边同除以sqrt(aeras)

1
2
3
4
agricul_model_w2 = glm(product ~ aeras, 
                       weights = 1/aeras, 
                       data = df)
summary(agricul_model_w2)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
## 
## Call:
## glm(formula = product ~ aeras, data = df, weights = 1/aeras)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.4681  -0.2301  -0.0315   0.1802   0.5693  
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.4542166  3.2207060   0.141    0.889    
## aeras       0.0114889  0.0007456  15.408 6.71e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.07993051)
## 
##     Null deviance: 21.1347  on 28  degrees of freedom
## Residual deviance:  2.1581  on 27  degrees of freedom
## AIC: 260.42
## 
## Number of Fisher Scoring iterations: 2
1
2
3
4
5
6
# agricul_model_w2模型的等价形式
agricul_model_w2_equal = lm(I(product/sqrt(aeras)) ~ 
                              0 + I(1/sqrt(aeras)) + 
                              I(sqrt(aeras)), 
                            data = df)
summary(agricul_model_w2_equal)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
## 
## Call:
## lm(formula = I(product/sqrt(aeras)) ~ 0 + I(1/sqrt(aeras)) + 
##     I(sqrt(aeras)), data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4681 -0.2301 -0.0315  0.1802  0.5693 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## I(1/sqrt(aeras)) 0.4542166  3.2207060   0.141    0.889    
## I(sqrt(aeras))   0.0114889  0.0007456  15.408 6.71e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2827 on 27 degrees of freedom
## Multiple R-squared:  0.9304,	Adjusted R-squared:  0.9253 
## F-statistic: 180.5 on 2 and 27 DF,  p-value: 2.365e-16

利用原模型计算1/|残差|作为权重来修正

1
2
3
4
agricul_model_w3 = glm(product ~ aeras, 
                       weights = 1/agricul_model$residuals^2, 
                       data = df)
summary(agricul_model_w3)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
## 
## Call:
## glm(formula = product ~ aeras, data = df, weights = 1/agricul_model$residuals^2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9797  -0.9474  -0.3832   1.0227   1.3687  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.5721494  0.6222762  -7.347 6.65e-08 ***
## aeras        0.0120176  0.0002083  57.700  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.9615698)
## 
##     Null deviance: 3227.314  on 28  degrees of freedom
## Residual deviance:   25.962  on 27  degrees of freedom
## AIC: 233.83
## 
## Number of Fisher Scoring iterations: 2
1
2
3
4
5
6
# agricul_model_w3模型的等价形式
agricul_model_w3_equal = lm(I(product/abs(agricul_model$residuals)) ~ 
                              0 + I(1/abs(agricul_model$residuals)) +
                              I(aeras/abs(agricul_model$residuals)), 
                            data=df)
summary(agricul_model_w3_equal)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
## 
## Call:
## lm(formula = I(product/abs(agricul_model$residuals)) ~ 0 + I(1/abs(agricul_model$residuals)) + 
##     I(aeras/abs(agricul_model$residuals)), data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9797 -0.9474 -0.3832  1.0227  1.3687 
## 
## Coefficients:
##                                         Estimate Std. Error t value
## I(1/abs(agricul_model$residuals))     -4.5721494  0.6222762  -7.347
## I(aeras/abs(agricul_model$residuals))  0.0120176  0.0002083  57.700
##                                       Pr(>|t|)    
## I(1/abs(agricul_model$residuals))     6.65e-08 ***
## I(aeras/abs(agricul_model$residuals))  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9806 on 27 degrees of freedom
## Multiple R-squared:  0.9947,	Adjusted R-squared:  0.9943 
## F-statistic:  2533 on 2 and 27 DF,  p-value: < 2.2e-16