Package ivs :: Package statistics :: Module pca
[hide private]
[frames] | [no frames]

Source Code for Module ivs.statistics.pca

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  """ 
  4  Principal component analysis 
  5  """ 
  6  from numpy import abs, array, average, corrcoef, mat, shape, std, sum, transpose, zeros 
  7  from numpy.linalg import svd 
  8   
  9        
 10  __author__ = "Henning Risvik" 
 11  __date__ = "February 2008" 
 12  __version__ = "1.1.02" 
 13       
 14  #{  Preprocessing Methods  
def mean_center(X):
    """
    Center X by subtracting each column's mean from every row.

    @param X: 2-dimensional matrix of number data
    @type X: numpy array

    @return: Mean centered X (always has same dimensions as X)
    """
    n_rows, n_cols = shape(X)
    col_means = average(X, 0)  # per-column means
    centered = zeros((n_rows, n_cols), float)

    for r in range(n_rows):
        centered[r, :] = X[r, :] - col_means
    return centered
32 33
def standardization(X):
    """
    Scale each column of X by dividing it by its standard deviation
    (numpy ``std`` with default ddof=0, i.e. population std).

    Note: this only scales the columns; it does not mean-center them.

    @param X: 2-dimensional matrix of number data
    @type X: numpy array

    @return: Standardized X (always has same dimensions as X)

    @raise ZeroDivisionError: if any column has zero standard deviation.
    """
    (rows, cols) = shape(X)
    new_X = zeros((rows, cols))
    _STDs = std(X, 0)  # per-column standard deviations

    # Fail early with a clear error instead of producing inf/nan columns.
    # (Original used Python-2-only `raise E, msg`; this form works on 2 and 3.)
    for value in _STDs:
        if value == 0:
            raise ZeroDivisionError('division by zero, cannot proceed')

    for row in range(rows):
        new_X[row, 0:cols] = X[row, 0:cols] / _STDs[0:cols]
    return new_X

#}

#{ NIPALS array help functions
def get_column(E):
    """
    Get an acceptable column-vector of E: the first column whose
    self inner product is non-zero (i.e. a non-zero vector).

    @param E: 2-dimensional matrix of number data
    @type E: numpy array

    @return: a non-zero column vector of E

    @raise ValueError: if every column of E is a zero vector.
    """
    (rows, cols) = shape(E)
    for col_ind in range(cols):
        t = E[:, col_ind]  # extract a column
        if sum(t * t) > 0:  # non-zero self inner product => usable start vector
            return t
    # error: sum of matrix is 0
    # (Original used Python-2-only `raise E, msg`; this form works on 2 and 3.)
    raise ValueError('all column vectors in E are zero vectors')
73 74
def get_column_mat(E):
    """
    NIPALS matrix help function.
    Get an acceptable column-vector of E: the first column whose
    self inner product (a 1x1 matrix here) is positive.

    @param E: 2-dimensional matrix of number data
    @type E: numpy matrix

    @return: a non-zero column vector of E (numpy matrix column)

    @raise ValueError: if every column of E is a zero vector.
    """
    (rows, cols) = shape(E)
    for col_ind in range(cols):
        t = E[:, col_ind]  # extract a column (matrix of shape (rows, 1))
        eig = transpose(t) * t  # 1x1 matrix: the column's self inner product
        if eig > 0:
            return t
    # error: sum of matrix is 0
    # (Original used Python-2-only `raise E, msg`; this form works on 2 and 3.)
    raise ValueError('all column vectors in E are zero vectors')
92
def vec_inner(v):
    """
    Inner product of a vector with itself.

    @param v: Vector of number data.
    @type v: numpy array

    @return: transpose(v) * v (float or int)
    """
    return (v * v).sum()
101
def mat_prod(A, x):
    """
    Matrix-vector product computed row by row.

    @param A: 2-dimensional matrix of number data.
    @type A: numpy array

    @param x: Vector of number data.
    @type x: numpy array

    @return: b of (Ax = b). Product of: matrix A (m,n) * vector x (n) = vector b (m)
    """
    # The original `array(map(lambda a: sum(a[:]*x), A))` breaks under
    # Python 3, where map() is lazy and array() would wrap the map object
    # itself; a list comprehension behaves identically on Python 2 and 3.
    return array([sum(row * x) for row in A])
120
def remove_tp_prod(E, t, p):
    """
    Deflate E in place: E = E - (t * transpose(p)).

    E: (m, n)-matrix; (t * transpose(p)) is the (m, n) outer product.

    @param E: 2-dimensional matrix of number data. Modified in place.
    @type E: numpy array

    @param t: Vector of number data. Current Scores (of PC_i).
    @type t: numpy array

    @param p: Vector of number data. Current Loading (of PC_i).
    @type p: numpy array

    @return: None
    """
    n_rows = E.shape[0]
    for r in range(n_rows):
        E[r, :] -= t[r] * p
146 147 148 #} 149 150 #{ NIPALS Algorithm 151 """ 152 Estimation of PC components with the iterative NIPALS method: 153 154 155 E[0] = mean_center(X) (the E-matrix for the zero-th PC) 156 157 t = E(:, 0) (a column in X (mean centered) is set as starting t vector) 158 159 for i=1 to (PCs): 160 161 1 p=(E[i-1]'t) / (t't) Project X onto t to find the corresponding (improve estimated) loading p 162 163 2 p = p * (p'p)^-0.5 Normalise loading vector p to length 1 164 165 3 t = (E[i-1]p) / (p'p) Project X onto p to find corresponding (improve estimated) score vector t 166 167 4 Check for convergence, if difference between eigenval_new and eigenval_old is larger than threshold*eigenval_new return to step 1 168 169 5 E[i] = E[i-1] - tp' Remove the estimated PC component from E[i-1] 170 171 """
def nipals_mat(X, PCs, threshold, E_matrices):
    """
    PCA by NIPALS using numpy matrix

    Iteratively estimates one principal component at a time:
    project E onto t to get a loading p (step 1), normalise p (step 2),
    project E onto p to get a new score t (step 3), repeat until the
    eigenvalue estimate t't stabilises (step 4), then deflate E (step 5).

    @param X: 2-dimensional matrix of number data.
    @type X: numpy array

    @param PCs: Number of Principal Components.
    @type PCs: int

    @param threshold: Convergence check value. For checking on convergence to zero difference (e.g. 0.000001).
    @type threshold: float

    @param E_matrices: If E-matrices should be retrieved or not. E-matrices (for each PC) or explained_var (explained variance for each PC).
    @type E_matrices: bool

    @return: (Scores, Loadings, Error_matrices) if E_matrices is true,
             otherwise (Scores, Loadings, explained_var).
    """
    (rows, cols) = shape(X)

    maxPCs = min(rows, cols)  # max number of PCs is min(objects, variables)
    if maxPCs < PCs: PCs = maxPCs  # change to maxPCs if PCs > maxPCs

    Scores = zeros((rows, PCs), float)  # all Scores (T)
    Loadings = zeros((PCs, cols), float)  # all Loadings (P)

    E = mat(X.copy())  # E[0] (should already be mean centered)

    if E_matrices:
        Error_matrices = zeros((PCs, rows, cols), float)  # all Error matrices (E)
    else:
        explained_var = zeros((PCs))
        tot_explained_var = 0

        # total object residual variance for PC[0] (calculating from E[0])
        e_tot0 = 0  # for E[0] the total object residual variance is 100%
        for k in range(rows):
            e_k = array(E[k, :])**2
            e_tot0 += sum(e_k)

    t = get_column_mat(E)  # starting score vector: a non-zero column of E

    # do iterations (0, PCs)
    for i in range(PCs):
        convergence = False
        ready_for_compare = False  # first pass has no previous eigenvalue to compare to
        E_t = transpose(E)

        while not convergence:
            eigenval = float(transpose(t)*t)
            p = (E_t*t) / eigenval  # step 1: project E onto t to improve loading p

            _p = float(transpose(p)*p)
            p = p * _p**(-0.5)  # step 2: normalise loading vector p to length 1
            t = (E*p) / (transpose(p)*p)  # step 3: project E onto p to improve score t

            eigenval_new = float(transpose(t)*t)
            if not ready_for_compare:
                ready_for_compare = True
            else:  # ready for convergence check
                # step 4: stop once the eigenvalue change is below threshold.
                # NOTE(review): no abs() here -- if the estimate ever decreases,
                # the (negative) difference also passes this test; presumably
                # the estimate grows monotonically in practice. Confirm.
                if (eigenval_new - eigenval_old) < threshold*eigenval_new:
                    convergence = True
            eigenval_old = eigenval_new;

        p = transpose(p)
        E = E - (t*p)  # step 5: remove the estimated PC component from E

        # add Scores and Loadings for PC[i] to the collection of all PCs
        _t = array(t)  # NOT optimal: matrix -> array copy just for column assignment
        Scores[:, i] = _t[:,0]; Loadings[i, :] = p[0,:]  # copy PC i into the collections

        if E_matrices:
            # complete error matrix
            # can calculate object residual variance (row-wise) or variable
            # residual variance (column-wise); total residual variance can
            # also be calculated
            Error_matrices[i] = E.copy()

        else:
            # total object residual variance for E[i]
            e_tot = 0

            for k in range(rows):
                e_ = zeros((cols), float)
                for k_col in range(cols):
                    e_[k_col] = E[k, k_col]*E[k, k_col]
                e_tot += sum(e_)
            tot_obj_residual_var = (e_tot / e_tot0)
            # variance explained by PC i alone (cumulative total tracked separately)
            explained_var[i] = 1 - tot_obj_residual_var - tot_explained_var
            tot_explained_var += explained_var[i]

    if E_matrices:
        return Scores, Loadings, Error_matrices
    return Scores, Loadings, explained_var
280 281 282
def nipals_arr(X, PCs, threshold, E_matrices):
    """
    PCA by NIPALS using numpy array

    Same algorithm as the matrix variant: for each PC, alternate
    projections of E onto t (loading, step 1), normalisation (step 2),
    and E onto p (score, step 3) until the eigenvalue estimate t't
    stabilises (step 4), then deflate E in place (step 5).

    @param X: 2-dimensional matrix of number data.
    @type X: numpy array

    @param PCs: Number of Principal Components.
    @type PCs: int

    @param threshold: Convergence check value. For checking on convergence to zero (e.g. 0.000001).
    @type threshold: float

    @param E_matrices: If E-matrices should be retrieved or not. E-matrices (for each PC) or explained_var (explained variance for each PC).
    @type E_matrices: bool

    @return: (Scores, Loadings, Error_matrices) if E_matrices is true,
             otherwise (Scores, Loadings, explained_var).
    """
    (rows, cols) = shape(X)
    maxPCs = min(rows, cols)  # max number of PCs is min(objects, variables)
    if maxPCs < PCs: PCs = maxPCs  # change to maxPCs if PCs > maxPCs

    Scores = zeros((rows, PCs), float)  # all Scores (T)
    Loadings = zeros((PCs, cols), float)  # all Loadings (P)

    E = X.copy()  # E[0] (should already be mean centered)

    if E_matrices:
        Error_matrices = zeros((PCs, rows, cols), float)  # all Error matrices (E)
    else:
        explained_var = zeros((PCs))
        tot_explained_var = 0

        # total object residual variance for PC[0] (calculating from E[0])
        e_tot0 = 0  # for E[0] the total object residual variance is 100%
        for k in range(rows):
            e_k = E[k, :]**2
            e_tot0 += sum(e_k)

    t = get_column(E)  # starting score vector: a non-zero column of E
    p = zeros((cols), float)

    # do iterations (0, PCs)
    for i in range(PCs):
        convergence = False
        ready_for_compare = False  # first pass has no previous eigenvalue to compare to
        E_t = transpose(E)

        while not convergence:
            _temp = vec_inner(t)
            p = mat_prod(E_t, t) / _temp  # step 1: project E onto t to improve loading p

            _temp = vec_inner(p)**(-0.5)
            p = p * _temp  # step 2: normalise loading vector p to length 1

            _temp = vec_inner(p)
            t = mat_prod(E, p) / _temp  # step 3: project E onto p to improve score t

            eigenval_new = vec_inner(t)
            if not ready_for_compare:
                ready_for_compare = True
            else:  # ready for convergence check
                # step 4: stop once the eigenvalue change is below threshold.
                # NOTE(review): no abs() here -- a decreasing estimate also
                # passes; presumably the estimate grows monotonically. Confirm.
                if (eigenval_new - eigenval_old) < threshold*eigenval_new:
                    convergence = True
            eigenval_old = eigenval_new;

        remove_tp_prod(E, t, p)  # step 5: deflate E in place (E -= t p')

        # add Scores and Loadings for PC[i] to the collection of all PCs
        Scores[:, i] = t; Loadings[i, :] = p

        if E_matrices:
            # complete error matrix
            # can calculate object residual variance (row-wise) or variable
            # residual variance (column-wise); total residual variance can
            # also be calculated
            Error_matrices[i] = E.copy()

        else:
            # total object residual variance for E[i]
            e_tot = 0
            for k in range(rows):
                e_k = E[k, :]**2
                e_tot += sum(e_k)
            tot_obj_residual_var = (e_tot / e_tot0)
            # variance explained by PC i alone (cumulative total tracked separately)
            explained_var[i] = 1 - tot_obj_residual_var - tot_explained_var
            tot_explained_var += explained_var[i]

    if E_matrices:
        return Scores, Loadings, Error_matrices
    else:
        return Scores, Loadings, explained_var
384 385 386 #} 387 388 #{ Principal Component Analysis (using NIPALS)
def PCA_nipals(X, standardize=True, PCs=10, threshold=0.0001, E_matrices=False):
    """
    PCA by NIPALS (numpy-matrix implementation); get Scores, Loadings, E.

    @param X: 2-dimensional matrix of number data.
    @type X: numpy array

    @param standardize: Whether X should be standardized or not.
    @type standardize: bool

    @param PCs: Number of Principal Components.
    @type PCs: int

    @param threshold: Convergence check value. For checking on convergence to zero (e.g. 0.000001).
    @type threshold: float

    @param E_matrices: If E-matrices should be retrieved or not. E-matrices (for each PC) or explained_var (explained variance for each PC).
    @type E_matrices: bool

    @return: nipals_mat(X, PCs, threshold, E_matrices)
    """
    # preprocessing: always mean-center, optionally scale by column std
    prepared = mean_center(X)
    if standardize:
        prepared = standardization(prepared)

    return nipals_mat(prepared, PCs, threshold, E_matrices)
419 420
def PCA_nipals2(X, standardize=True, PCs=10, threshold=0.0001, E_matrices=False):
    """
    PCA by NIPALS (numpy-array implementation); get Scores, Loadings, E.

    @param X: 2-dimensional matrix of number data.
    @type X: numpy array

    @param standardize: Whether X should be standardized or not.
    @type standardize: bool

    @param PCs: Number of Principal Components.
    @type PCs: int

    @param threshold: Convergence check value. For checking on convergence to zero (e.g. 0.000001).
    @type threshold: float

    @param E_matrices: If E-matrices should be retrieved or not. E-matrices (for each PC) or explained_var (explained variance for each PC).
    @type E_matrices: bool

    @return: nipals_arr(X, PCs, threshold, E_matrices)
    """
    # preprocessing: always mean-center, optionally scale by column std
    prepared = mean_center(X)
    if standardize:
        prepared = standardization(prepared)

    return nipals_arr(prepared, PCs, threshold, E_matrices)
451 452
def PCA_nipals_c(X, standardize=True, PCs=10, threshold=0.0001, E_matrices=False):
    """
    PCA by NIPALS and get Scores, Loadings, E.

    Delegates the NIPALS iterations to a C Python extension (`nipals_c`).

    NOTE(review): `nipals_c` is not defined or imported anywhere in this
    module as visible here; unless the C extension injects it into this
    namespace, calling this function raises NameError. Verify the
    extension import before relying on this entry point.

    @param X: 2-dimensional matrix of number data.
    @type X: numpy array

    @param standardize: Whether X should be standardized or not.
    @type standardize: bool

    @param PCs: Number of Principal Components.
    @type PCs: int

    @param threshold: Convergence check value. For checking on convergence to zero (e.g. 0.000001).
    @type threshold: float

    @param E_matrices: If E-matrices should be retrieved or not. E-matrices (for each PC) or explained_var (explained variance for each PC).
    @type E_matrices: bool

    @return: nipals_c(X, PCs, threshold, E_matrices)
    """
    # USING C PYTHON EXTENSION.
    # (The original held this note as a stray bare-string statement after
    # the docstring; made it a comment since it had no runtime effect.)
    X = mean_center(X)

    if standardize:
        X = standardization(X)

    return nipals_c(X, PCs, threshold, E_matrices)
484 485 #} 486 487 #{ Principal Component Analysis (using SVD)
def PCA_svd(X, standardize=True):
    """
    PCA by SVD; get Scores, Loadings, explained_var.
    Remake of method made by Oliver Tomic Ph.D.

    @param X: 2-dimensional matrix of number data.
    @type X: numpy array

    @param standardize: Whether X should be standardized or not.
    @type standardize: bool

    @return: (Scores, Loadings, explained_var)
    """
    data = mean_center(X)

    if standardize:
        data = standardization(data)

    n_rows, n_cols = shape(data)

    # Singular Value Decomposition: data = U * diag(S) * V
    U, S, V = svd(data)

    # adjust if matrix shape does not match (full-matrices SVD can give U
    # more columns than there are singular values)
    n_sv = shape(S)[0]
    if n_sv < shape(U)[0]:
        U = U[:, 0:n_sv]

    Scores = U * S  # all Scores (T): each column of U scaled by its singular value
    Loadings = V  # all Loadings (P)

    variances = S**2 / n_cols
    explained_var = variances / sum(variances)

    return Scores, Loadings, explained_var
525 526 527 #} 528 529 #{ Correlation Loadings
def CorrelationLoadings(X, Scores):
    """
    Get correlation loadings matrix based on Scores (T of PCA) and X
    (original variables, not mean centered).
    Remake of method made by Oliver Tomic Ph.D.

    @param X: 2-dimensional matrix of number data.
    @type X: numpy array

    @param Scores: Scores of PCA (T).
    @type Scores: numpy array

    @return: Correlation loadings matrix of shape (PCs, variables).
    """
    # Creates empty matrix for correlation loadings
    PCs = shape(Scores)[1]  # number of PCs
    # FIX: shape(X)[1] counts variables (columns) of X; the original named
    # this `rows` and commented "number of objects (rows)", which was wrong.
    n_vars = shape(X)[1]
    CorrLoadings = zeros((PCs, n_vars), float)

    # Calculates correlation loadings with formula:
    # correlation = cov(x,y)/(std(x)*std(y))  -- via corrcoef

    # For each PC in score matrix
    for i in range(PCs):
        scores_pc_i = Scores[:, i]  # Scores for PC[i]

        # For each variable/attribute in X
        for var_ind in range(n_vars):
            orig_var = X[:, var_ind]  # one variable (column) of X
            corrs = corrcoef(scores_pc_i, orig_var)
            CorrLoadings[i, var_ind] = corrs[0, 1]  # off-diagonal = correlation

    return CorrLoadings
564 565 #} 566