#! /usr/bin/env python """Multivariate Matrix Regression algorithm implemented with qr decomposition in scipy. Input consists of square distance matrix (NxN) and predictor variables Written by Ondrej Libiger and Matt Zapala Based on some of the methods presented in: McArdle, B.H. and Anderson, M.J. (2001). Fitting multivariate models to community data: a comment on distance-based redundancy analysis. Ecology, 82, 290-297. 9/12/2006 Version 1.0.""" # This program was written as a script; it contains no classes and no functions. The commands are executed linearly from the beginning to the end. # Importing modules: from numpy import * #needed for array(), matrixmultiply(), zeros(), ones(), etc. from scipy.linalg import * #needed for qr() from numarray.linear_algebra import * from numarray.random_array import * #needed for permutation() from string import * from sys import * #needed for argv[] # Checking program arguments: if len(argv) < 4: print "Need three arguments: distance matrix file, predictor variables file, number of permutation runs." else: # Reading file containing the N x N distance matrix: distance_file = open(argv[1], "r") #open file list_of_lines = distance_file.readlines() #read file into list_of_lines aux_matrix = [] #create auxiliary matrix for line in list_of_lines: aux_matrix.append(map(float,line.strip().split('\t'))) #input distances into auxiliary matrix distance_file.close() #close distance file distance_matrix = array(aux_matrix,Float) #create numpy supported matrix from auxiliary matrix that contains the distances dist_rows,dist_columns = distance_matrix.shape #get dimensions of distance matrix if dist_rows != dist_columns: #check that matrix is square print 'Error: non square distance matrix!' # Creating centered (Gower's) matrix from the distance matrix: a_matrix = (distance_matrix*distance_matrix)/(-2) #create A matrix to be used for constructing Gower's centered matrix aux_centered_matrix = identity(dist_rows,Float) - (ones([dist_rows,dist_rows],Float)/dist_rows) #create auxiliary centered matrix centered_matrix = matrixmultiply(matrixmultiply(aux_centered_matrix,a_matrix),aux_centered_matrix) #create Gower's centered matrix # Reading file containing N x M predictor variables (design) matrix predictor_file = open(argv[2], "r") #open file with predictor variables list_of_lines = predictor_file.readlines() #read file into list_of_lines aux_matrix = [] #create auxiliary matrix for line in list_of_lines: aux_matrix.append(map(float,line.strip().split('\t'))) #input predictor variables into auxiliary matrix predictor_file.close() #close file with predictor variables predictor_matrix = array(aux_matrix,Float) #create numpy supported matrix using the auxiliary matrix that contains the predictor variables predictor_rows,predictor_columns = predictor_matrix.shape #get matrix dimensions unit_vector = ones((predictor_rows,1)) #create a vector of ones to be used in regression # Calculating single regression print "SINGLE REGRESSION:" for var in range(predictor_columns): #for each predictor variable decomposition_matrix = concatenate((unit_vector,predictor_matrix[:,var,NewAxis]),1) #create matrix that contains a vector of ones in the first column and the pred. variable in the second column decomp_rows,decomp_columns = decomposition_matrix.shape #get the dimensions of the matrix if decomp_rows != dist_rows: #check dimensions print 'Error: predictor matrix does not conform to distance matrix!' q_matrix,r_matrix = qr(decomposition_matrix) #perform economical qr decomposition (as implemented in scipy) qhat_matrix = q_matrix[:,:decomp_columns] hat_matrix = matrixmultiply(qhat_matrix,transpose(qhat_matrix)) #calculate hat matrix using the decomposition id_minus_hat_matrix = identity(dist_rows,Float) - hat_matrix #calculate identity matrix - hat matrix to be used in the calculation of F statistic # calculate the F-statistic f_stat = ((trace(matrixmultiply(matrixmultiply(hat_matrix,centered_matrix),hat_matrix)))/(decomp_columns-1)) / ((trace(matrixmultiply(matrixmultiply(id_minus_hat_matrix,centered_matrix),id_minus_hat_matrix)))/(dist_rows-decomp_columns)) #calculate proportion of variance explained variance_expl = trace(matrixmultiply(matrixmultiply(hat_matrix,centered_matrix),hat_matrix)) / trace(centered_matrix) # Printing single regression analysis output print 'variable:', var+1, ' F-statistic:', f_stat,' PVE:', variance_expl, # Approximating a p-value through permutation analysis # Permutation analysis carried out by simultaneously permuting rows and columns of the Gower's centered matrix number_perm_runs = int(argv[3]) #assess the number of runs from the program arguments greater = 0 #variable contains the count of permutations that yield higher F statistic than actual data centered_permuted_rows_matrix = zeros((dist_rows,dist_rows),Float) #create matrix that will contain cenetered matrix with permuted rows centered_permuted_matrix = zeros((dist_rows,dist_rows),Float) #create matrix that will contain cenetered matrix with permuted rows and columns for run in range(number_perm_runs): #for each run permutation_array = permutation(dist_rows) #create a list of permuted variable indices for i in range(dist_rows): #permute rows using the list of permuted indices centered_permuted_rows_matrix[i] = centered_matrix[permutation_array[i]] centered_permuted_rows_matrix = transpose(centered_permuted_rows_matrix) #transpose matrix with permuted rows for i in range(dist_rows): #permute columns using the same list of permuted indices centered_permuted_matrix[i] = centered_permuted_rows_matrix[permutation_array[i]] centered_permuted_matrix = transpose(centered_permuted_matrix) #transpose matrix back #calculate F-statistic using permuted matrix f_permuted = (trace(matrixmultiply(matrixmultiply(hat_matrix, centered_permuted_matrix), hat_matrix)) / (decomp_columns-1)) / (trace(matrixmultiply(matrixmultiply(id_minus_hat_matrix,centered_permuted_matrix),id_minus_hat_matrix))/(dist_rows-decomp_columns)) if f_permuted >= f_stat: #if greater than F-statistic calculated using actual data then increase the count greater = greater + 1 print " p-value:", float(greater)/number_perm_runs #use the count to approximate a p-value by dividing the count by the number of runs # Performing multiple regression # Ordering predictor variables according to the proportion of variance that each explains print print "MULTIPLE REGRESSION:" decomposition_matrix = unit_vector #create matrix containing a vector of ones (and later a subset of pred. variables) to be used in the decomposition ordered_var = [] #this list will contain the indices of remaining predictor variables for i in range(predictor_columns): #assign the starting order of predictor variable indices ordered_var.append(i+1); result_var = [] #this list will contain the indices of resulting (ordered by proportion of variance explained (p. v. e.)) predictor variables while predictor_columns > 1: #take each pred. variable max = -9999 #set maximum value maxvar = 0 #maxvar will hold the index of pred. variable explaining greatest proportion of variance for var in range(predictor_columns): #for each predictor variable #add the variable to matrix for decomposition decomposition_matrix = concatenate((decomposition_matrix,predictor_matrix[:,var,NewAxis]),1) decomp_rows,decomp_columns = decomposition_matrix.shape #get the dimensions of the matrix for decomposition q_matrix,r_matrix = qr(decomposition_matrix) #perform economical qr decomposition (as implemented in scipy) qhat_matrix = q_matrix[:,:decomp_columns] hat_matrix = matrixmultiply(qhat_matrix,transpose(qhat_matrix)) #calculate the hat matrix using the result of the decomposition #calculate the proportion of variance that the variable explains variance_expl = trace(matrixmultiply(matrixmultiply(hat_matrix,centered_matrix),hat_matrix)) / trace(centered_matrix) if max <= variance_expl: #if this proportion is greater than proportions of variables already calculated, store its index as well as the proportion of variance it explains max = variance_expl maxvar = var decomposition_matrix = decomposition_matrix[:,0:-1] #remove the last added variable from the matrix to be decomposed to make room for next variable result_var.append(ordered_var[maxvar]) #store the index of the variable that yielded maximum p. v. e. explained for output if maxvar < predictor_columns: #delete the index of the variable that yielded maximum p. v. e. from the list of remaining pred. variables to be ordered for i in range(maxvar,predictor_columns-1): ordered_var[i] = ordered_var[i+1] else: ordered_var[maxvar] = None #add the pred. variable that yielded maximum p. v. e. to the matrix used for decomposition with remaining pred. variables decomposition_matrix = concatenate((decomposition_matrix,predictor_matrix[:,maxvar,NewAxis]),1) #delete the pred. variable that yielded maximum p. v. e. from the matrix that contains remaining variables to be ordered if maxvar > 0 and maxvar < predictor_columns-1: predictor_matrix = concatenate((predictor_matrix[:,0:maxvar],predictor_matrix[:,maxvar+1:]),1) elif maxvar == predictor_columns-1: predictor_matrix = predictor_matrix[:,0:-1] else: predictor_matrix = predictor_matrix[:,1:] predictor_rows,predictor_columns = predictor_matrix.shape #get the dimensions of matrix with remaining pred. variables result_var.append(ordered_var[0]) #append the index of the last pred. variable ordered_var = None #delete the list storing remaining variables decomposition_matrix = concatenate((decomposition_matrix,predictor_matrix),1) #add the last variable (that explains the least proportion of variance) to decomposition_matrix decomposition_matrix = decomposition_matrix[:,1:] #take out the first column consisting of a vector of ones in decomposition_matrix decomp_rows,decomp_columns = decomposition_matrix.shape #get the dimensions of decomposition_matrix # Calculating F statistic based on the ordered set of predictor variables ordered_matrix = unit_vector #create a new matrix used for F statistic calculation, the first column consists of a vector of ones f_previous = 0 #scalar used for F-statistic calculation for var in range(decomp_columns): #for each ordered predictor variable ordered_matrix = concatenate((ordered_matrix,decomposition_matrix[:,var,NewAxis]),1) #add variable to ordered_matrix that is used in F-statistic calculation ordered_rows,ordered_columns = ordered_matrix.shape #get the dimension of the ordered_matrix q_matrix,r_matrix = qr(ordered_matrix) #perform economical qr decomposition of ordered_matrix (as implemented in scipy) qhat_matrix = q_matrix[:,:ordered_columns] hat_matrix = matrixmultiply(qhat_matrix,transpose(qhat_matrix)) #calculate hat matrix id_minus_hat_matrix = identity(ordered_rows,Float) - hat_matrix #calculate identity matrix - hat matrix aux_f = trace(matrixmultiply(matrixmultiply(hat_matrix,centered_matrix),hat_matrix)) - f_previous #calculate the F statistic f_stat = aux_f / (((trace(matrixmultiply(matrixmultiply(id_minus_hat_matrix,centered_matrix),id_minus_hat_matrix)))/(ordered_rows - ordered_columns))) f_previous = trace(matrixmultiply(matrixmultiply(hat_matrix,centered_matrix),hat_matrix)) #calculate the proportion of variance explained variance_expl = trace(matrixmultiply(matrixmultiply(hat_matrix,centered_matrix),hat_matrix)) / trace(centered_matrix) #Printing of multiple regression output print 'variable:', result_var[var], ' F-statistic:', f_stat,' PVE:', variance_expl, # Approximating a p-value through permutation analysis # Permutation analysis carried out by simultaneously permuting rows and columns of the Gower's centered matrix # Comments provided on lines 84 - 103 greater = 0 centered_permuted_rows_matrix = zeros((ordered_rows,ordered_rows),Float) centered_permuted_matrix = zeros((ordered_rows,ordered_rows),Float) for run in range(number_perm_runs): permutation_array = permutation(ordered_rows) for i in range(ordered_rows): centered_permuted_rows_matrix[i] = centered_matrix[permutation_array[i]] # transpose centered_permuted_rows_matrix centered_permuted_rows_matrix = transpose(centered_permuted_rows_matrix) for i in range(ordered_rows): centered_permuted_matrix[i] = centered_permuted_rows_matrix[permutation_array[i]] #transpose back centered_permuted_matrix = transpose(centered_permuted_matrix) f_permuted = (trace(matrixmultiply(matrixmultiply(hat_matrix,centered_permuted_matrix),hat_matrix))/(ordered_columns - 1)) / (trace(matrixmultiply(matrixmultiply(id_minus_hat_matrix,centered_permuted_matrix),id_minus_hat_matrix))/(ordered_rows - ordered_columns)) if f_permuted >= f_stat: greater = greater + 1 print " p-value:", float(greater)/number_perm_runs