E029

主成分分析

 

#include "stdafx.h"

#include "orsciJWVCL.h"
#include "orsciVM.h"
using namespace orsci;
using namespace orsci::vmt;


/// Demo: principal component analysis (PCA) with the orsci library.
///
/// Part 1 runs princomp on a hard-coded 30x4 sample matrix, prints every
/// output (coeff, score, latent, tsquared), recomputes the scores by hand as
/// (X - mu) * coeff, and then keeps only the leading components whose
/// cumulative contribution reaches 0.95.
/// Part 2 projects one new sample onto the retained components.
///
/// @return 0 on every path, including the two early exits taken when the
///         intermediate results are unusable.
int main()
{
    cout << " orsci:Principal component analysis (PCA)" << endl
         << " --- http://www.orsci.cn" << endl;
    cout << "Part1:进行主成分分析系数矩阵计算!" << endl;

    // Sample data: one observation per row, rows separated by ';'.
    mdouble X;
    X = "148 41 72 78;"
        "139 34 71 76;"
        "160 49 77 86;"
        "149 36 67 79;"
        "159 45 80 86;"
        "142 31 66 76;"
        "153 43 76 83;"
        "150 43 77 79;"
        "151 42 77 80;"
        "139 31 68 74;"
        "140 29 64 74;"
        "161 47 78 84;"
        "158 49 78 83;"
        "140 33 67 77;"
        "137 31 66 73;"
        "152 35 73 79;"
        "149 47 82 79;"
        "145 35 70 77;"
        "160 47 74 87;"
        "156 44 78 85;"
        "151 42 73 82;"
        "147 38 73 78;"
        "157 39 68 80;"
        "147 30 65 75;"
        "157 48 80 88;"
        "151 36 74 80;"
        "144 36 68 76;"
        "141 30 67 76;"
        "139 32 68 73;"
        "148 38 70 78;";

    cout << "X = " << endl;
    cout << X << endl;
    cout << "sample count = " << X.rowCount() << endl;

    mdouble coeff_out, score_out;
    coldouble latent_out;
    coldouble tsquared_out;

    // The success flag is only reported here; the outputs are printed regardless.
    bool mFlag = princomp(X, coeff_out, score_out, latent_out, tsquared_out);
    cout << "主成分分析成功标记:" << mFlag << endl;
    cout << "coeff = " << endl;
    cout << coeff_out << endl;
    cout << "score = " << endl;
    cout << score_out << endl;
    cout << "latent = " << endl;
    cout << latent_out << endl;
    cout << "tsquared = " << endl;
    cout << tsquared_out << endl;

    // Column means of X (as labelled by the message printed below).
    rowdouble mu;
    mu.assign(vmt::mean(X, 0));
    cout << "X的列均值mu=" << endl;
    cout << mu << endl;

    // Recompute the scores by hand so they can be compared with score_out.
    cout << "=======now display the result: (X - mu) * coeff_out=====" << endl;
    cout << (X - mu) * coeff_out << endl;

    // Contribution of each component = latent / sum(latent).
    cout << "计算贡献值与累积贡献值!" << endl;
    double mSum = vmt::sum(latent_out);
    // Exact comparison is intentional: it only guards the division below.
    if (mSum == 0) {cout << "there is a bug here!" << endl; return 0;}
    coldouble mContribute = latent_out / mSum;
    cout << mContribute << endl;

    // Running (cumulative) sum of the contributions, computed in place.
    coldouble mSumContribute = mContribute;
    for (int k = 1; k < mSumContribute.size(); ++k)
    {
        mSumContribute(k) = mSumContribute(k) + mSumContribute(k - 1);
    }
    cout << "累计贡献:" << endl;
    cout << mSumContribute << endl;

    // Keep components up to the first cumulative contribution >= 0.95.
    const double mThreshold = 0.95;
    const int mIndex = vmt::find_first_of(mSumContribute, vmt::GEQ(mThreshold));
    // Validate the index BEFORE reporting the count; a negative index is
    // treated as "nothing found" and would otherwise print a bogus count.
    if (mIndex < 0) {cout << "there is a bug here!" << endl; return 0;}
    cout << "保留的属性个数为:" << mIndex + 1 << endl;

    mdouble coeff, score;
    coeff = coeff_out.subcol(span(0, mIndex)); // columns 0 through mIndex
    score = score_out.subcol(span(0, mIndex)); // columns 0 through mIndex
    cout << "保留的系数矩阵coeff=" << endl;
    cout << coeff << endl;
    cout << "保留的分数矩阵score=" << endl;
    cout << score << endl;

    cout << "Part2:下面开始测试或应用!" << endl;
    mdouble test = "150 31 70 80";
    cout << "假设新的测试样例test=" << endl;
    cout << test << endl;
    // Project the new sample with the training mean and retained coefficients.
    mdouble test_score = (test - mu) * coeff;
    cout << "新测试样例的特征表示test_score=" << endl;
    cout << test_score << endl;

    // Hold the console open until the user types a character.
    cout << endl;
    cout << "press any key to stop..." << endl;
    char pp;
    cin >> pp;
    return 0;
}


/// Compact variant of the PCA demo; the original note states that it is
/// equivalent to the program in main().
///
/// It calls the princomp overload that also takes the contribution threshold
/// and an output row vector for the mean, then projects one new sample with
/// princomp_test.
/// NOTE(review): presumably this overload already truncates coeff_out and
/// score_out to the retained components and fills mu with the column means
/// of X - confirm against the orsci documentation.
///
/// @return always 0.
int Demo_test()
{
    cout << " orsci:Principal component analysis (PCA)" << endl
         << " --- http://www.orsci.cn" << endl;
    cout << "Part1:进行主成分分析系数矩阵计算!" << endl;

    // Sample data: one observation per row, rows separated by ';'.
    mdouble X;
    X = "148 41 72 78;"
        "139 34 71 76;"
        "160 49 77 86;"
        "149 36 67 79;"
        "159 45 80 86;"
        "142 31 66 76;"
        "153 43 76 83;"
        "150 43 77 79;"
        "151 42 77 80;"
        "139 31 68 74;"
        "140 29 64 74;"
        "161 47 78 84;"
        "158 49 78 83;"
        "140 33 67 77;"
        "137 31 66 73;"
        "152 35 73 79;"
        "149 47 82 79;"
        "145 35 70 77;"
        "160 47 74 87;"
        "156 44 78 85;"
        "151 42 73 82;"
        "147 38 73 78;"
        "157 39 68 80;"
        "147 30 65 75;"
        "157 48 80 88;"
        "151 36 74 80;"
        "144 36 68 76;"
        "141 30 67 76;"
        "139 32 68 73;"
        "148 38 70 78;";

    cout << "X = " << endl;
    cout << X << endl;
    cout << "sample count = " << X.rowCount() << endl;

    mdouble coeff_out, score_out;
    coldouble latent_out;
    rowdouble mu;
    coldouble tsquared_out;

    // Filter components by a cumulative contribution of 0.95.
    const double mThreshold = 0.95;
    bool mFlag = princomp(X, coeff_out, score_out, latent_out, tsquared_out, mThreshold, mu);
    cout << "主成分分析成功标记:" << mFlag << endl;
    cout << "coeff = " << endl;
    cout << coeff_out << endl;
    cout << "score = " << endl;
    cout << score_out << endl;
    cout << "latent = " << endl;
    cout << latent_out << endl;
    cout << "tsquared = " << endl;
    cout << tsquared_out << endl;
    cout << "X的列均值mu=" << endl;
    cout << mu << endl;

    cout << "Part2:下面开始测试或应用!" << endl;
    mdouble test = "150 31 70 80";
    cout << "假设新的测试样例test=" << endl;
    cout << test << endl;
    // Project the new sample using the mean and coefficients from princomp.
    mdouble test_score = princomp_test(test, mu, coeff_out);
    cout << "新测试样例的特征表示test_score=" << endl;
    cout << test_score << endl;
    return 0;
}

 

输出

(一)运行结果

orsci:Principal component analysis (PCA)
--- http://www.orsci.cn
Part1:进行主成分分析系数矩阵计算!
X =
rowCount = 30 colCount = 4
148 41 72 78
139 34 71 76
160 49 77 86
149 36 67 79
159 45 80 86
142 31 66 76
153 43 76 83
150 43 77 79
151 42 77 80
139 31 68 74
140 29 64 74
161 47 78 84
158 49 78 83
140 33 67 77
137 31 66 73
152 35 73 79
149 47 82 79
145 35 70 77
160 47 74 87
156 44 78 85
151 42 73 82
147 38 73 78
157 39 68 80
147 30 65 75
157 48 80 88
151 36 74 80
144 36 68 76
141 30 67 76
139 32 68 73
148 38 70 78

sample count = 30
主成分分析成功标记:1
coeff =
rowCount = 4 colCount = 4
0.624023 0.645564 -0.223638 -0.379248
0.559191 -0.345639 0.745768 -0.10802
0.408334 -0.66047 -0.62447 -0.0841418
0.362166 0.166013 -0.06207 0.915108

score =
rowCount = 30 colCount = 4
0.0718789 -1.51331 2.16944 -1.10021
-10.5913 -4.57546 -0.289584 1.32309
16.9727 1.4941 1.83303 0.384807
-3.77956 4.32881 1.27724 0.396455
15.3369 0.249682 -2.79982 0.943708
-12.4385 1.70049 -0.0754517 0.93011
7.75454 -0.788582 -0.265439 1.02648
4.84214 -4.0498 0.0292841 -1.58035
5.26914 -2.89258 -1.00219 -0.936469
-14.2182 -1.88916 -0.529338 0.0693552
-16.3459 2.08956 0.253368 0.242713
16.1623 1.83844 -0.382479 -1.69276
15.0465 -0.955539 1.84204 -1.68616
-11.7977 -0.776369 1.17682 2.30353
-16.6451 -2.02536 1.22895 0.0810275
-0.0166779 2.64832 -3.88626 -1.13812
8.49655 -9.38027 0.113645 -2.05389
-6.33417 -0.221238 -0.323242 -0.061173
14.9915 4.3328 2.15283 1.76838
11.7268 -0.186444 -1.56366 1.44265
4.36014 0.081326 1.37155 1.23031
-1.82138 -1.78243 -0.468694 -0.481046
3.6607 7.96195 1.03891 -2.13062
-10.6481 5.76841 -1.25287 -1.68908
16.4908 -1.74634 -0.23938 3.20836
0.688991 1.16266 -3.60339 -0.0359259
-7.57784 -0.0575147 1.95717 -0.536769
-13.2134 0.740099 -1.22205 1.33324
-14.0212 -2.40082 0.2785 -0.953772
-2.42236 0.844548 1.18108 -0.607869

latent =
124.814
10.8487
2.49549
1.8634

tsquared =
2.74673
3.8015
3.93966
2.57979
5.50951
1.97266
1.13279
3.04027
1.8668
2.06351
2.6005
4.00079
4.78351
4.57329
3.20664
7.39375
10.958
0.369841
7.0665
3.20167
1.71906
0.531641
8.81938
6.13562
8.00697
5.33226
2.14998
3.00167
2.62566
0.870042

X的列均值mu=
149 38.7 72.2333 79.3667
=======now display the result: (X - mu) * coeff_out=====
rowCount = 30 colCount = 4
0.0718789 -1.51331 2.16944 -1.10021
-10.5913 -4.57546 -0.289584 1.32309
16.9727 1.4941 1.83303 0.384807
-3.77956 4.32881 1.27724 0.396455
15.3369 0.249682 -2.79982 0.943708
-12.4385 1.70049 -0.0754517 0.93011
7.75454 -0.788582 -0.265439 1.02648
4.84214 -4.0498 0.0292841 -1.58035
5.26914 -2.89258 -1.00219 -0.936469
-14.2182 -1.88916 -0.529338 0.0693552
-16.3459 2.08956 0.253368 0.242713
16.1623 1.83844 -0.382479 -1.69276
15.0465 -0.955539 1.84204 -1.68616
-11.7977 -0.776369 1.17682 2.30353
-16.6451 -2.02536 1.22895 0.0810275
-0.0166779 2.64832 -3.88626 -1.13812
8.49655 -9.38027 0.113645 -2.05389
-6.33417 -0.221238 -0.323242 -0.061173
14.9915 4.3328 2.15283 1.76838
11.7268 -0.186444 -1.56366 1.44265
4.36014 0.081326 1.37155 1.23031
-1.82138 -1.78243 -0.468694 -0.481046
3.6607 7.96195 1.03891 -2.13062
-10.6481 5.76841 -1.25287 -1.68908
16.4908 -1.74634 -0.23938 3.20836
0.688991 1.16266 -3.60339 -0.0359259
-7.57784 -0.0575147 1.95717 -0.536769
-13.2134 0.740099 -1.22205 1.33324
-14.0212 -2.40082 0.2785 -0.953772
-2.42236 0.844548 1.18108 -0.607869

计算贡献值与累积贡献值!
0.891391
0.0774788
0.0178222
0.0133079

累计贡献:
0.891391
0.96887
0.986692
1

保留的属性个数为:2
保留的系数矩阵coeff=
rowCount = 4 colCount = 2
0.624023 0.645564
0.559191 -0.345639
0.408334 -0.66047
0.362166 0.166013

保留的分数矩阵score=
rowCount = 30 colCount = 2
0.0718789 -1.51331
-10.5913 -4.57546
16.9727 1.4941
-3.77956 4.32881
15.3369 0.249682
-12.4385 1.70049
7.75454 -0.788582
4.84214 -4.0498
5.26914 -2.89258
-14.2182 -1.88916
-16.3459 2.08956
16.1623 1.83844
15.0465 -0.955539
-11.7977 -0.776369
-16.6451 -2.02536
-0.0166779 2.64832
8.49655 -9.38027
-6.33417 -0.221238
14.9915 4.3328
11.7268 -0.186444
4.36014 0.081326
-1.82138 -1.78243
3.6607 7.96195
-10.6481 5.76841
16.4908 -1.74634
0.688991 1.16266
-7.57784 -0.0575147
-13.2134 0.740099
-14.0212 -2.40082
-2.42236 0.844548

Part2:下面开始测试或应用!
假设新的测试样例test=
rowCount = 1 colCount = 4
150 31 70 80

新测试样例的特征表示test_score=
rowCount = 1 colCount = 2
-4.36432 4.88718

press any key to stop...


(二)主成分运行结果应用

可以使用降维后新空间中的特征表示(即 score 与 test_score)进行训练和分类,从而让分类器直接在降维后的空间中运行。

(三)说明:

(1)主成分的应用非常广,在数据分析和数据挖掘上都有广泛应用。

(2)orsci提供主成分分析支持。

(3)orsci包支持向量和矩阵计算,可下载配套软件orsci应用。

书籍 姜维. 《数据分析与数据挖掘》、《数据分析与数据挖掘建模与工具》、《文本分析与文本挖掘》
软件 orsci开发包(C++语言、Delphi语言和C语言)。