收集数据

当光学字符识别软件第一次处理文件时,它将文件划分成一个矩阵,从而网格中的每一个单元包含一个单一的图像字符,当图像字符被扫描到计算机中,它们将转换成像素,并且具有16个统计属性。这里我们使用P. W. Frey和D. J. Slate捐赠给UCI机器学习数据库的一个数据集。该数据集包含了26个大写英文字母的20000个案例,使用随机重塑和扭曲的20种不同的黑白字体印刷。

1
2
3
# Load the letter-recognition data set: one row per scanned glyph,
# with 16 numeric image attributes plus the target column `letter`.
letters <- read.csv("C:/Users/cxy/Documents/MLwR/letterdata.csv")
# Peek at the first few rows to confirm the structure
head(letters)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
##   letter xbox ybox width height onpix xbar ybar x2bar y2bar xybar x2ybar
## 1      T    2    8     3      5     1    8   13     0     6     6     10
## 2      I    5   12     3      7     2   10    5     5     4    13      3
## 3      D    4   11     6      8     6   10    6     2     6    10      3
## 4      N    7   11     6      6     3    5    9     4     6     4      4
## 5      G    2    1     3      1     1    8    6     6     6     6      5
## 6      S    4   11     5      8     3    8    8     6     9     5      6
##   xy2bar xedge xedgey yedge yedgex
## 1      8     0      8     0      8
## 2      9     2      8     4     10
## 3      7     3      7     3      9
## 4     10     6     10     2      8
## 5      9     1      7     5     10
## 6      6     0      8     9      7
1
2
3
4
5
# Split the data into a training set and a validation set
set.seed(1234)  # fix the RNG seed so the split is reproducible
# Sample row indices for 80% of the data to train the model
train <- sample(nrow(letters), 0.80 * nrow(letters))
letters_train <- letters[train, ]      # training set (80%)
letters_validate <- letters[-train, ]  # validation set (remaining 20%)

基于数据训练模型

1
2
3
library(kernlab)  # the e1071 package could be used here instead
# Fit a support vector machine with a linear ("vanilla") kernel,
# predicting `letter` from all 16 image attributes.
letter_classifier <- ksvm(letter ~ ., data = letters_train,
                          kernel = "vanilladot")
1
##  Setting default kernel parameters
1
# Print the fitted model summary (SV type, cost C, support-vector
# count, objective values, and training error — see output below)
letter_classifier
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
## Support Vector Machine object of class "ksvm" 
## 
## SV type: C-svc  (classification) 
##  parameter : cost C = 1 
## 
## Linear (vanilla) kernel function. 
## 
## Number of Support Vectors : 7087 
## 
## Objective Function Value : -17.1625 -21.9459 -33.4735 -6.9901 -10.5295 -37.639 -57.9569 -22.9405 -62.7193 -33.812 -19.0966 -36.5079 -34.2352 -60.7029 -14.99 -38.3104 -33.5718 -19.6905 -17.5157 -37.6273 -28.4013 -9.0914 -11.0804 -40.1353 -14.8104 -7.5027 -151.6477 -47.0951 -63.8892 -123.648 -160.4772 -52.6903 -41.4783 -64.1682 -23.9272 -25.5132 -23.6743 -38.7907 -44.6906 -122.5024 -195.481 -196.9728 -21.6046 -10.3491 -52.5781 -10.3882 -50.3628 -8.787 -18.6375 -9.4933 -106.8694 -24.8972 -240.1406 -68.7374 -7.5178 -4.4809 -139.4837 -75.8127 -17.7 -12.5495 -75.1063 -12.9046 -30.1192 -17.7359 -22.1243 -23.9015 -54.8566 -9.6428 -4.7055 -14.2817 -4.9307 -3.6473 -7.0685 -35.5702 -59.685 -187.5025 -44.6121 -46.1629 -44.212 -15.532 -19.1347 -86.9701 -116.9465 -43.554 -33.6991 -130.3125 -30.243 -22.5265 -37.516 -15.0602 -5.6181 -42.8328 -8.5747 -20.6065 -53.9882 -168.1133 -48.4149 -40.8736 -33.1417 -72.3728 -140.7413 -10.0315 -5.2327 -13.3438 -27.1343 -141.7615 -52.8795 -174.9276 -94.997 -10.4755 -15.4408 -3.2229 -72.5447 -8.3577 -96.2938 -51.6979 -93.5146 -60.3291 -63.775 -20.5539 -11.4034 -7.3782 -26.2084 -11.7299 -256.4183 -31.4 -22.6267 -122.8697 -136.2184 -9.5398 -34.9232 -6.2188 -56.5235 -73.8718 -26.9924 -222.8179 -38.264 -16.3067 -136.4516 -154.9506 -50.612 -26.5532 -160.2191 -88.5747 -367.8869 -149.1929 -159.799 -39.2032 -38.4578 -59.8829 -29.2192 -47.3866 -6.9396 -10.6619 -26.8862 -57.9398 -175.9671 -52.5475 -83.5204 -146.1853 -598.4773 -122.7002 -130.0893 -318.4298 -29.2198 -68.2831 -145.9986 -110.6847 -34.3504 -68.0199 -49.6147 -7.7643 -205.5669 -12.9713 -37.9594 -2.0513 -3.4879 -16.4576 -17.7269 -61.5912 -22.6026 -189.7024 -16.4508 -3.5889 -2.5059 -0.7722 -115.4117 -7.505 -70.1516 -16.9002 -13.294 -4.3679 -14.2365 -28.0992 -21.2094 -79.63 -27.1319 -96.3586 -14.062 -8.8737 -5.5603 -1.4428 -79.6025 -7.3053 -105.4057 -106.4451 -42.469 -23.8829 -58.6698 -21.8172 -54.004 -238.909 -39.6512 -38.9357 -36.9735 -16.4592 -12.4913 -120.4031 -4.8629 -4.8756 -9.27 -12.3598 
-21.2478 -24.7086 -134.1187 -33.9921 -96.5581 -32.9834 -15.7082 -9.0423 -2.9788 -88.3023 -7.1291 -14.1987 -65.9215 -103.0513 -14.2214 -13.7459 -55.3047 -2.5083 -8.2094 -85.4378 -34.4511 -106.5784 -4.5944 -8.8086 -1.2328 -91.3846 -23.4225 -8.1484 -52.1673 -2.5054 -19.8359 -70.1496 -40.8848 -55.1086 -6.0493 -20.6786 -2.1779 -74.8517 -121.3369 -115.4715 -25.963 -18.0631 -54.781 -30.6583 -63.0163 -23.4562 -5.8952 -4.9898 -60.9887 -34.4195 -56.5201 -34.2965 -10.4098 -52.44 -13.9148 -20.8159 -65.3842 -3.8132 -58.299 -239.8772 -14.2543 -12.7388 -17.3769 -9.7719 -66.7161 -16.1958 -43.8982 -45.6504 -25.3446 -16.0018 -42.8316 -17.5277 -65.6143 -5.6351 -5.364 -82.941 -2.7926 -4.8064 -1.0543 -123.8981 -18.7577 -387.5045 -28.8589 -31.421 -4.8537 -74.2339 -143.9392 -78.4175 -21.9282 -42.8139 -10.9232 -26.8554 -1.9197 -58.6673 -7.8522 -161.1745 -1.8109 -1.9068 -11.4779 -0.4895 -25.9059 -33.5916 -5.9703 
## Training error : 0.129938

评估模型的性能

1
2
# Score the held-out validation set with the fitted linear SVM
letter_pred <- predict(letter_classifier, letters_validate)
# Confusion matrix: predicted letter (rows) vs. true letter (columns)
table(letter_pred, letters_validate$letter)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
##            
## letter_pred   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O
##           A 147   2   0   0   0   1   0   1   0   4   1   1   1   2   0
##           B   0 135   0   6   0   1   0   3   2   0   2   0   3   0   0
##           C   0   0 120   0   3   1   5   4   1   0   4   1   0   0   1
##           D   0   4   0 139   0   1   2  13   5   3   2   0   0   2   2
##           E   0   1   5   0 144   4   0   0   0   0   3   4   0   0   0
##           F   0   0   1   1   1 126   0   1   6   2   0   0   0   1   0
##           G   0   4   8   0   3   0 108   3   1   0   1   6   1   0   2
##           H   0   0   1   1   1   0   0 113   0   1   1   1   0   7  13
##           I   0   1   0   0   0   0   0   0 130   5   0   0   0   0   0
##           J   2   0   0   0   0   0   0   4   8 135   0   0   0   0   0
##           K   1   0   2   0   0   1   2   8   0   0 122   0   0   1   0
##           L   0   0   1   0   2   0   3   0   0   0   1 146   0   0   0
##           M   1   0   0   0   0   0   0   0   0   0   0   0 128   3   0
##           N   0   0   0   6   0   1   0   2   0   1   0   0   0 152   0
##           O   0   0   1   1   0   0   4   8   0   0   0   0   1   1 126
##           P   0   0   0   0   0   2   0   1   0   0   0   0   0   0   4
##           Q   1   0   0   0   2   0   3   3   0   0   0   3   0   0   1
##           R   0   6   0   1   1   0   2  13   0   0  10   0   1   0   2
##           S   0   0   0   0   3   2   6   0   2   2   0   1   0   0   0
##           T   0   0   0   0   3   6   0   0   0   0   0   0   0   0   0
##           U   0   0   2   0   0   0   0   4   0   0   1   0   1   1   1
##           V   0   2   0   0   0   0   1   1   0   0   0   0   0   1   0
##           W   0   0   0   0   0   0   1   0   0   0   0   0   4   1   4
##           X   0   2   0   0   1   0   0   2   2   0   4   2   0   0   0
##           Y   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##           Z   0   0   0   0   2   0   0   0   1   1   0   0   0   0   0
##            
## letter_pred   P   Q   R   S   T   U   V   W   X   Y   Z
##           A   0   5   3   0   0   0   0   0   0   1   0
##           B   1   2   6   6   0   0   4   0   0   0   0
##           C   0   0   0   0   0   0   0   0   0   0   0
##           D   1   1   2   0   1   1   0   0   0   2   0
##           E   0   3   0  12   0   0   0   0   2   0   0
##           F   8   1   0   4   2   0   0   0   1   3   0
##           G   2   3   3   5   0   0   1   0   0   0   0
##           H   0   1   1   0   3   2   2   1   0   0   0
##           I   0   0   0   2   0   0   0   0   5   1   0
##           J   0   0   0   0   0   0   0   0   1   0   4
##           K   2   0   9   0   1   0   0   0   4   0   0
##           L   0   2   0   3   0   0   0   0   3   0   0
##           M   0   0   0   0   0   1   1   9   0   0   0
##           N   0   0   2   0   0   0   1   2   0   0   0
##           O   1   3   0   0   0   1   0   0   1   0   0
##           P 127   0   0   0   0   0   0   0   0   1   0
##           Q   0 134   1   1   0   0   0   0   0   1   0
##           R   0   0 143   1   2   0   0   0   0   0   0
##           S   0   8   0 106   3   0   0   0   0   0   9
##           T   0   0   0   2 126   2   0   0   0   2   2
##           U   0   0   0   0   0 146   0   0   2   0   0
##           V   0   0   0   0   0   0 138   0   0   3   0
##           W   0   0   0   0   0   0   3 142   0   0   0
##           X   0   0   0   0   1   0   0   0 137   0   0
##           Y   3   0   0   1   2   0   3   0   2 132   0
##           Z   0   2   0   8   4   0   0   0   0   0 110

对角线值147、135……表示预测值和真实值相匹配的总记录数。同样的,错误的数据也列出来了。例如位于A行B列的2表示有两条记录字母B被模型误认为A。

1
2
# Logical vector: TRUE where the prediction matches the true letter
agreement <- letter_pred == letters_validate$letter
# Counts of correct (TRUE) vs. incorrect (FALSE) predictions
table(agreement)
1
2
3
## agreement
## FALSE  TRUE 
##   588  3412
1
# Same table as proportions — the TRUE share is the overall accuracy
prop.table(table(agreement))
1
2
3
## agreement
## FALSE  TRUE 
## 0.147 0.853

根据统计结果,在4000个测试记录中,分类器正确识别的字母有3412个。准确率为85.3%。

提高模型的性能

之前的支持向量机模型使用简单的线性核函数。通过使用一个更复杂的核函数,我们可以将数据映射到一个更高维的空间。一个流行的惯例就是从高斯RBF核函数开始。

1
2
3
4
5
6
# rbfdot estimates its sigma parameter from the data, which involves
# randomness — seed the RNG so the result is reproducible.
set.seed(1234)
# Retrain with a Gaussian RBF kernel to capture non-linear structure
letter_classifier_rbf <- ksvm(letter ~ ., data = letters_train,
                              kernel = "rbfdot")
# Score the validation set and compare predictions to the true letters
letter_pred_rbf <- predict(letter_classifier_rbf, letters_validate)
agreement_rbf <- letter_pred_rbf == letters_validate$letter
table(agreement_rbf)
1
2
3
## agreement_rbf
## FALSE  TRUE 
##   257  3743
1
# Proportions — the TRUE share is the accuracy with the RBF kernel
prop.table(table(agreement_rbf))
1
2
3
## agreement_rbf
##   FALSE    TRUE 
## 0.06425 0.93575

从上面的结果我们可以看到,利用高斯RBF核函数后,预测精度提高到了93.58%。另外我们还可以尝试调整成本约束参数C,C值越高,对错误的惩罚越大,决策边界越窄。