1
2
3 """
4 Principal component analysis
5 """
6 from numpy import abs, array, average, corrcoef, mat, shape, std, sum, transpose, zeros
7 from numpy.linalg import svd
8
9
10 __author__ = "Henning Risvik"
11 __date__ = "February 2008"
12 __version__ = "1.1.02"
13
14
16 """
17
18 @param X: 2-dimensional matrix of number data
19 @type X: numpy array
20
21
22 @return: Mean centered X (always has same dimensions as X)
23
24 """
25 (rows, cols) = shape(X)
26 new_X = zeros((rows, cols), float)
27 _averages = average(X, 0)
28
29 for row in range(rows):
30 new_X[row, 0:cols] = X[row, 0:cols] - _averages[0:cols]
31 return new_X
32
33
35 """
36
37 @param X: 2-dimensional matrix of number data
38 @type X: numpy array
39
40
41 @return: Standardized X (always has same dimensions as X)
42
43 """
44 (rows, cols) = shape(X)
45 new_X = zeros((rows, cols))
46 _STDs = std(X, 0)
47
48 for value in _STDs:
49 if value == 0: raise ZeroDivisionError, 'division by zero, cannot proceed'
50
51 for row in range(rows):
52 new_X[row, 0:cols] = X[row, 0:cols] / _STDs[0:cols]
53 return new_X
54
55
56
57
59 """
60 Get an acceptable column-vector of E.
61
62 @param E: 2-dimensional matrix of number data
63 @type E: numpy array
64
65 @return: a non-zero vector
66 """
67 (rows, cols) = shape(E)
68 for col_ind in range(cols):
69 t = E[:,col_ind]
70 if (vec_inner(t) > 0):
71 return t
72 raise ValueError, 'all column vectors in E are zero vectors'
73
74
76 """
77 NIPALS matrix help function.
78 Get an acceptable column-vector of E.
79
80 @param E: 2-dimensional matrix of number data
81 @type E: numpy matrix
82
83 @return: a non-zero vector
84 """
85 (rows, cols) = shape(E)
86 for col_ind in range(cols):
87 t = E[:,col_ind]
88 eig = transpose(t)*t
89 if (eig > 0):
90 return t
91 raise ValueError, 'all column vectors in E are zero vectors'
92
94 """
95 @param v: Vector of number data.
96 @type v: numpy array
97
98 @return: transpose(v) * v (float or int)
99 """
100 return sum(v * v);
101
103 """
104 @param A: 2-dimensional matrix of number data.
105 @type A: numpy array
106
107 @param x: Vector of number data.
108 @type x: numpy array
109
110 @return: b of (Ax = b). Product of: matrix A (m,n) * vector x (n) = vector b (m)
111 """
112
113
114
115
116
117
118
119 return array(map(lambda a: sum(a[:]*x), A))
120
122 """
123
124 sets: E = E - (t*transpose(p))
125 E: (m, n)-matrix, (t*transpose(p)): (m, n)-matrix
126
127
128 @param E: 2-dimensional matrix of number data.
129 @type E: numpy array
130
131 @param t: Vector of number data. Current Scores (of PC_i).
132 @type t: numpy array
133
134 @param p: Vector of number data. Current Loading (of PC_i).
135 @type p: numpy array
136
137
138 @return: None
139
140
141 """
142
143 m = E.shape[0]
144 for i in range(m):
145 E[i, :] = E[i, :] - (t[i] * p)
146
147
148
149
150
151 """
152 Estimation of PC components with the iterative NIPALS method:
153
154
155 E[0] = mean_center(X) (the E-matrix for the zero-th PC)
156
157 t = E(:, 0) (a column in X (mean centered) is set as starting t vector)
158
159 for i=1 to (PCs):
160
161 1 p=(E[i-1]'t) / (t't) Project X onto t to find the corresponding (improve estimated) loading p
162
163 2 p = p * (p'p)^-0.5 Normalise loading vector p to length 1
164
165 3 t = (E[i-1]p) / (p'p) Project X onto p to find corresponding (improve estimated) score vector t
166
167 4 Check for convergence, if difference between eigenval_new and eigenval_old is larger than threshold*eigenval_new return to step 1
168
169 5 E[i] = E[i-1] - tp' Remove the estimated PC component from E[i-1]
170
171 """
173 """
174
175
176 PCA by NIPALS using numpy matrix
177
178
179 @param X: 2-dimensional matrix of number data.
180 @type X: numpy array
181
182 @param PCs: Number of Principal Components.
183 @type PCs: int
184
185 @param threshold: Convergence check value. For checking on convergence to zero difference (e.g. 0.000001).
186 @type threshold: float
187
188 @param E_matrices: If E-matrices should be retrieved or not. E-matrices (for each PC) or explained_var (explained variance for each PC).
189 @type E_matrices: bool
190
191 @return: (Scores, Loadings, E)
192
193 """
194 (rows, cols) = shape(X)
195
196 maxPCs = min(rows, cols)
197 if maxPCs < PCs: PCs = maxPCs
198
199 Scores = zeros((rows, PCs), float)
200 Loadings = zeros((PCs, cols), float)
201
202 E = mat(X.copy())
203
204
205 if E_matrices:
206 Error_matrices = zeros((PCs, rows, cols), float)
207 else:
208 explained_var = zeros((PCs))
209 tot_explained_var = 0
210
211
212 e_tot0 = 0
213 for k in range(rows):
214 e_k = array(E[k, :])**2
215 e_tot0 += sum(e_k)
216
217
218
219 t = get_column_mat(E)
220
221
222
223
224 for i in range(PCs):
225 convergence = False
226 ready_for_compare = False
227 E_t = transpose(E)
228
229 while not convergence:
230 eigenval = float(transpose(t)*t)
231 p = (E_t*t) / eigenval
232
233 _p = float(transpose(p)*p)
234 p = p * _p**(-0.5)
235 t = (E*p) / (transpose(p)*p)
236
237
238 eigenval_new = float(transpose(t)*t)
239 if not ready_for_compare:
240 ready_for_compare = True
241 else:
242 if (eigenval_new - eigenval_old) < threshold*eigenval_new:
243 convergence = True
244 eigenval_old = eigenval_new;
245
246
247 p = transpose(p)
248 E = E - (t*p)
249
250
251 _t = array(t)
252 Scores[:, i] = _t[:,0]; Loadings[i, :] = p[0,:]
253
254 if E_matrices:
255
256
257
258
259 Error_matrices[i] = E.copy()
260
261 else:
262
263 e_tot = 0
264
265 for k in range(rows):
266 e_ = zeros((cols), float)
267 for k_col in range(cols):
268 e_[k_col] = E[k, k_col]*E[k, k_col]
269 e_tot += sum(e_)
270 tot_obj_residual_var = (e_tot / e_tot0)
271 explained_var[i] = 1 - tot_obj_residual_var - tot_explained_var
272 tot_explained_var += explained_var[i]
273
274
275
276
277 if E_matrices:
278 return Scores, Loadings, Error_matrices
279 return Scores, Loadings, explained_var
280
281
282
284 """
285
286 PCA by NIPALS using numpy array
287
288
289 @param X: 2-dimensional matrix of number data.
290 @type X: numpy array
291
292 @param PCs: Number of Principal Components.
293 @type PCs: int
294
295 @param threshold: Convergence check value. For checking on convergence to zero (e.g. 0.000001).
296 @type threshold: float
297
298 @param E_matrices: If E-matrices should be retrieved or not. E-matrices (for each PC) or explained_var (explained variance for each PC).
299 @type E_matrices: bool
300
301 @return: (Scores, Loadings, E)
302
303
304 """
305
306 (rows, cols) = shape(X)
307 maxPCs = min(rows, cols)
308 if maxPCs < PCs: PCs = maxPCs
309
310 Scores = zeros((rows, PCs), float)
311 Loadings = zeros((PCs, cols), float)
312
313 E = X.copy()
314
315 if E_matrices:
316 Error_matrices = zeros((PCs, rows, cols), float)
317 else:
318 explained_var = zeros((PCs))
319 tot_explained_var = 0
320
321
322 e_tot0 = 0
323 for k in range(rows):
324 e_k = E[k, :]**2
325 e_tot0 += sum(e_k)
326
327
328
329 t = get_column(E)
330 p = zeros((cols), float)
331
332
333 for i in range(PCs):
334 convergence = False
335 ready_for_compare = False
336 E_t = transpose(E)
337
338 while not convergence:
339 _temp = vec_inner(t)
340 p = mat_prod(E_t, t) / _temp
341
342 _temp = vec_inner(p)**(-0.5)
343 p = p * _temp
344
345 _temp = vec_inner(p)
346 t = mat_prod(E, p) / _temp
347
348
349 eigenval_new = vec_inner(t)
350 if not ready_for_compare:
351 ready_for_compare = True
352 else:
353 if (eigenval_new - eigenval_old) < threshold*eigenval_new:
354 convergence = True
355 eigenval_old = eigenval_new;
356
357 remove_tp_prod(E, t, p)
358
359
360 Scores[:, i] = t; Loadings[i, :] = p
361
362
363 if E_matrices:
364
365
366
367
368 Error_matrices[i] = E.copy()
369
370 else:
371
372 e_tot = 0
373 for k in range(rows):
374 e_k = E[k, :]**2
375 e_tot += sum(e_k)
376 tot_obj_residual_var = (e_tot / e_tot0)
377 explained_var[i] = 1 - tot_obj_residual_var - tot_explained_var
378 tot_explained_var += explained_var[i]
379
380 if E_matrices:
381 return Scores, Loadings, Error_matrices
382 else:
383 return Scores, Loadings, explained_var
384
385
386
387
388
389 -def PCA_nipals(X, standardize=True, PCs=10, threshold=0.0001, E_matrices=False):
390 """
391
392 PCA by NIPALS and get Scores, Loadings, E
393
394 @param X: 2-dimensional matrix of number data.
395 @type X: numpy array
396
397 @param standardize: Wheter X should be standardized or not.
398 @type standardize: bool
399
400 @param PCs: Number of Principal Components.
401 @type PCs: int
402
403 @param threshold: Convergence check value. For checking on convergence to zero (e.g. 0.000001).
404 @type threshold: float
405
406 @param E_matrices: If E-matrices should be retrieved or not. E-matrices (for each PC) or explained_var (explained variance for each PC).
407 @type E_matrices: bool
408
409 @return: nipals_mat(X, PCs, threshold, E_matrices)
410
411 """
412
413 X = mean_center(X)
414
415 if standardize:
416 X = standardization(X)
417
418 return nipals_mat(X, PCs, threshold, E_matrices)
419
420
421 -def PCA_nipals2(X, standardize=True, PCs=10, threshold=0.0001, E_matrices=False):
422 """
423
424 PCA by NIPALS and get Scores, Loadings, E
425
426 @param X: 2-dimensional matrix of number data.
427 @type X: numpy array
428
429 @param standardize: Wheter X should be standardized or not.
430 @type standardize: bool
431
432 @param PCs: Number of Principal Components.
433 @type PCs: int
434
435 @param threshold: Convergence check value. For checking on convergence to zero (e.g. 0.000001).
436 @type threshold: float
437
438 @param E_matrices: If E-matrices should be retrieved or not. E-matrices (for each PC) or explained_var (explained variance for each PC).
439 @type E_matrices: bool
440
441 @return: nipals_arr(X, PCs, threshold, E_matrices)
442
443 """
444
445 X = mean_center(X)
446
447 if standardize:
448 X = standardization(X)
449
450 return nipals_arr(X, PCs, threshold, E_matrices)
451
452
453 -def PCA_nipals_c(X, standardize=True, PCs=10, threshold=0.0001, E_matrices=False):
454 """
455
456 PCA by NIPALS and get Scores, Loadings, E
457
458 @param X: 2-dimensional matrix of number data.
459 @type X: numpy array
460
461 @param standardize: Wheter X should be standardized or not.
462 @type standardize: bool
463
464 @param PCs: Number of Principal Components.
465 @type PCs: int
466
467 @param threshold: Convergence check value. For checking on convergence to zero (e.g. 0.000001).
468 @type threshold: float
469
470 @param E_matrices: If E-matrices should be retrieved or not. E-matrices (for each PC) or explained_var (explained variance for each PC).
471 @type E_matrices: bool
472
473 @return: nipals_c(X, PCs, threshold, E_matrices)
474
475 """
476
477 """ USING C PYTHON EXTENSION """
478 X = mean_center(X)
479
480 if standardize:
481 X = standardization(X)
482
483 return nipals_c(X, PCs, threshold, E_matrices)
484
485
486
487
489 """
490 PCA by SVD and get Scores, Loadings, E
491 Remake of method made by Oliver Tomic Ph.D.
492
493 @param X: 2-dimensional matrix of number data.
494 @type X: numpy array
495
496 @param standardize: Wheter X should be standardized or not.
497 @type standardize: bool
498
499 @return: (Scores, Loadings, explained_var)
500
501 """
502
503 X = mean_center(X)
504
505
506 if standardize:
507 X = standardization(X)
508
509 (rows, cols) = shape(X)
510
511
512 [U, S, V] = svd(X)
513
514
515 if shape(S)[0] < shape(U)[0]: U = U[:, 0:shape(S)[0]]
516
517 Scores = U * S
518 Loadings = V
519
520 variances = S**2 / cols
521 variances_sum = sum(variances)
522 explained_var = variances / variances_sum
523
524 return Scores, Loadings, explained_var
525
526
527
528
529
531 """
532 Get correlation loadings matrix based on Scores (T of PCA) and X (original variables, not mean centered).
533 Remake of method made by Oliver Tomic Ph.D.
534
535 @param X: 2-dimensional matrix of number data.
536 @type X: numpy array
537
538 @param Scores: Scores of PCA (T).
539 @type Scores: numpy array
540
541 @return: Returns the correlation loadings matrix
542
543 """
544
545
546 PCs = shape(Scores)[1]
547 rows = shape(X)[1]
548 CorrLoadings = zeros((PCs, rows), float)
549
550
551
552
553
554 for i in range(PCs):
555 Scores_PC_i = Scores[:, i]
556
557
558 for row in range(rows):
559 orig_vars = X[:, row]
560 corrs = corrcoef(Scores_PC_i, orig_vars)
561 CorrLoadings[i, row] = corrs[0,1]
562
563 return CorrLoadings
564
565
566