wranglR.r -- main function

See readme.txt for usage
GabeAl · Jul 29, 2014 · ce21ddd · ce21ddd
1 parent 612479f
commit ce21ddd
Showing 1 changed file with 196 additions and 0 deletions.
diff --git a/wranglR.r b/wranglR.r
@@ -0,0 +1,196 @@
+################ Within-Rank Analysis of Novel Groups by Likeness in R ###############
+###################### 1987 Nei-Saitou-inspired special edition ######################
+
+# Read map and taxonomy files in old QIIME format (no comments), OR provide directly
+read_map_taxa = function(mapPath, taxaPath, GroupCol, taxThres)  {
+	if (length(mapPath) > 1) { map = mapPath; }
+	else if (!length(mapPath)) { map = 0; }
+	else map = read.delim(mapPath,sep='\t',comment='',head=T,row.names=1,check.names=F);
+	if (length(taxaPath) > 1) { taxa = taxaPath; } else taxa = 
+		read.delim(taxaPath,sep='\t',comment='',head=T,row.names=1,check.names=F); 
+	if (map) {
+		# map = map[!is.na(match(rownames(map),colnames(taxa))),]; # asymmetry
+		taxa = as.matrix(taxa[,match(rownames(map),colnames(taxa))]); 
+	}
+	if (length(GroupCol) > 1) { Groups = GroupCol; } else Groups = map[[GroupCol]];
+	return(list(map = map, XT = taxa[rowMeans(taxa) > taxThres,], Groups = factor(Groups)));
+}
+
+# Setup function; this generates initial matrices upon which later algorithms operate
+setup = function(mapPath, taxaPath, GroupCol, taxThres = 0, randSubset = 0, mat.tt = NULL, 
+  A=T, hardCut = F, shift = 1) {
+	data = read_map_taxa(mapPath, taxaPath, GroupCol, taxThres);
+	  map = data$map; XT = data$XT; Groups = data$Groups;
+	num = dim(XT)[1]; area = num * num; mdim = num; whole = seq(1,dim(XT)[2]);
+	G1ix = Groups == levels(Groups)[1]; G2ix = Groups == levels(Groups)[2];
+	if (randSubset) {  # Randomly subset a portion of samples to examine
+		whole = sample(seq(1,dim(XT)[2]),dim(XT)[2]); 
+		part = whole[seq(1,length(whole)*randSubset)]; 
+		  XT = XT[,part]; Groups = Groups[part];
+	}
+
+	# Generate baseline t-test p-values between (binary) groups for each feature
+	baseline.tt = apply(XT,1,function(xx) t.test(xx[G1ix], xx[G2ix])$p.value);
+
+	# Generate matrix of pairwise additive t-test p values for each feature pair
+	if (is.null(mat.tt)) {
+		print("Generating baseline matrix...");
+		mat.tt = matrix(rep(0,area),num); count = 0; timer = 0;
+		for (i in 1:num) { 
+			for (j in i:num) {
+				mat.tt[i,j] = t.test(XT[i,G1ix] + XT[j,G1ix], XT[i,G2ix] + XT[j,G2ix])$p.value;
+				mat.tt[j,i] = mat.tt[i,j];
+				q = ifelse(j == i, 1, 2); 
+				count = count + q; timer = timer + q;
+			}
+			if (timer >= area*0.1) 
+				{ print(paste("Progress:",count/area*100,"%...")); timer = 0; }
+		}
+	}
+	print("Baseline matrix generated.");
+
+	# Calculate pairwise distances based on ratio of improvement
+	mat.tt.dist = matrix(rep(0,area),num);
+	for (i in 1:mdim) for (j in i:mdim) {
+		if (A) { mat.tt.dist[i,j] = mat.tt[i,j] / (.5 * (baseline.tt[i] + baseline.tt[j])); }
+		else mat.tt.dist[i,j] = 0.5*(mat.tt[i,j]/baseline.tt[j]+mat.tt[i,j]/baseline.tt[i]);
+		if (mat.tt.dist[i,j] > 1) mat.tt.dist[i,j] = shift + 
+			ifelse(hardCut, 0, log10(mat.tt.dist[i,j]));
+		mat.tt.dist[j,i] = mat.tt.dist[i,j];
+	}
+	# stor = mat.tt.dist;
+
+	# mat.tt.dist = apply(mat.tt,1,function(xx) xx / baseline.tt)
+	# for (j in 1:mdim) for (k in j:mdim) { 
+	 # mat.tt.dist[j,k] = 0.5*(mat.tt.dist[j,k]+mat.tt.dist[k,j]); 
+	 # if (mat.tt.dist[j,k] > 1) mat.tt.dist[j,k] = 1+log10(mat.tt.dist[j,k]);
+	 # mat.tt.dist[k,j]=mat.tt.dist[j,k]; 
+	# }
+	# print(paste("premat similarity:",sum(mat.tt.dist==stor)/length(stor)))
+
+	# Calculate D, the Nei-Saitou neighbor-finder, based on improvement distances
+	D = matrix(rep(0,area),num);
+	r = rowSums(mat.tt.dist);
+	for (i in 1:mdim) for (j in i:mdim) {
+	  d = mat.tt.dist[i,j];
+	  ri = 1 / (mdim - 2) * r[i] - d;
+	  rj = 1 / (mdim - 2) * r[j] - d;
+	  D[i,j] = d - ri - rj;
+	  D[j,i] = D[i,j];
+	}
+
+	## faster version, only use if not reusing ri, rj in later dist calculation
+	# for (i in 1:mdim) for (j in i:mdim) {
+	 # D[i,j] = (mdim - 2) * mat.tt.dist[i,j] - r[i] - r[j];
+	 # D[j,i] = D[i,j];
+	# }
+
+	for (i in 1:mdim) D[i,i] = Inf; # Set diagonal to infinity to avert self-selection
+	return(list(G1ix = G1ix, G2ix = G2ix, XT = XT, baseline.tt = baseline.tt, mat.tt = mat.tt,
+			mat.tt.dist = mat.tt.dist, D = D, r = r, randSubset = randSubset, whole = whole));
+}
+
+# The main function to be called from R or commandline; only binary groups are supported
+wranglR = function(mapPath, taxaPath, GroupCol = "Group", taxThres = 0, randSubset = 0, dat=NA,
+  A=T, hardCut = F, shift = 1) {
+	if (!is.na(dat)) { dt = dat; }
+	else dt = setup(mapPath, taxaPath, GroupCol, taxThres, randSubset, NULL, A, hardCut, shift);
+	  G1ix = dt$G1ix; G2ix = dt$G2ix; XT = dt$XT; baseline.tt = dt$baseline.tt; D = dt$D;
+	  mat.tt = dt$mat.tt; mat.tt.dist = dt$mat.tt.dist; randSubset = dt$randSubset; r = dt$r;
+	num = dim(XT)[1]; mdim = num;   # Cache the initial number of species/entries
+	  iter = 1; found = 0; 
+
+	numTests = num*(num-1)/2 + num; # The number of t-tests performed thus far
+	while (mdim > 2) {
+		pair = which.min(D);
+		px = ceiling(pair/mdim); py = pair - mdim * (px - 1);
+		low = min(px,py); high = max(px, py); # to shrink matrix later
+		tt.both = mat.tt[low,high]; 
+		if (low == high) { print(paste("match exact at:",low)); break; } # Error condition
+
+		found = found + 1; 
+		XT[low,] = XT[low,] + XT[high,]; # Merge rows into lower ix row
+		# distG = 0.5 * (mat.tt.dist[low,high] + (1 / (mdim - 2) * r[low] - 
+			# mat.tt.dist[low,high]) - (1 / (mdim - 2) * r[high] - mat.tt.dist[low,high]));
+		distG = mat.tt.dist[low,high];
+		print(paste("found:",found,"at:",low,high,"Dist:",mat.tt.dist[low,high], 
+			"New, l, h:",tt.both,baseline.tt[low], baseline.tt[high]));
+		# Assign the new row a Newick name with distances according to Nei-Saitou NJ
+		rownames(XT)[low] = paste('(',rownames(XT)[low],':',distG,',',
+			rownames(XT)[high],':',distG,')',sep='');
+		# Remove the trailing extra row (and column, for the matrices based on XT)
+		XT = XT[-high,];
+		mat.tt = mat.tt[-high,-high];
+		mat.tt.dist = mat.tt.dist[-high,-high];
+		baseline.tt = baseline.tt[-high];
+		mdim = mdim - 1;
+
+		# The baseline of this row becomes this round's combined score
+		baseline.tt[low] = tt.both;
+		for (j in 1:mdim) { 
+		  mat.tt[low,j] = t.test(XT[low,G1ix] + XT[j,G1ix],XT[low,G2ix] + XT[j,G2ix])$p.value; 
+		  if (A) { mat.tt.dist[low,j] = mat.tt[low,j] / (.5 * (baseline.tt[j] + baseline.tt[low])); }
+		  else mat.tt.dist[low,j] = 0.5*(mat.tt[low,j]/baseline.tt[j]+mat.tt[low,j]/baseline.tt[low]);
+		  if (mat.tt.dist[low,j] > 1) mat.tt.dist[low,j] = shift + 
+		  	ifelse(hardCut, 0, log10(mat.tt.dist[low,j]));
+		}
+		mat.tt[,low] = mat.tt[low,] # row and col the same
+		mat.tt.dist[,low] = mat.tt.dist[low,];
+		# stor = mat.tt.dist;
+
+		# mat.tt.dist = matrix(rep(0,mdim*mdim),mdim);
+		# for (i in 1:mdim) for (j in i:mdim) {
+			# if (A) { mat.tt.dist[i,j] = mat.tt[i,j] / (.5 * (baseline.tt[i] + baseline.tt[j])); }
+			# else mat.tt.dist[i,j] = 0.5*(mat.tt[i,j]/baseline.tt[j]+mat.tt[i,j]/baseline.tt[i]);
+			# if (mat.tt.dist[i,j] > 1) mat.tt.dist[i,j] = 1 + log10(mat.tt.dist[i,j]);
+			# mat.tt.dist[j,i] = mat.tt.dist[i,j];
+		# }
+		# print(sum(mat.tt.dist==stor)/length(stor))
+
+		# This bit repeats the D calculation code. Functionalize?
+		D = matrix(rep(0,mdim*mdim),mdim);
+		r = rowSums(mat.tt.dist);
+		for (i in 1:mdim) for (j in i:mdim) {
+			d = mat.tt.dist[i,j];
+			ri = 1 / (mdim - 2) * r[i] - d;
+			rj = 1 / (mdim - 2) * r[j] - d;
+			D[i,j] = d - ri - rj;
+			D[j,i] = D[i,j];
+		}
+		# for (i in 1:mdim) for (j in i:mdim) {
+			# D[i,j] = (mdim - 2) * mat.tt.dist[i,j] - r[i] - r[j];
+			# D[j,i] = D[i,j];
+		# }
+		for (i in 1:mdim) D[i,i] = Inf;  # Set diagonal to infinity
+
+		numTests = numTests + mdim - 1;  # Increment the number of tests performed
+		iter = iter + 1; 
+	}
+	names(baseline.tt) = rownames(XT);   # Assign new names back to tt object
+
+	distG = mat.tt.dist[1,2]; # Set the final distance to the remaining
+	tree_pre = paste('(',rownames(XT)[1],':',distG,',',rownames(XT)[2],':',distG,')',sep='');
+
+	tree = paste(gsub(";","-",tree_pre),";",sep="");
+	tree = gsub('[[]',"<",tree);
+	tree = gsub('[]]',">",tree);
+	tree = gsub(" ","_",tree);
+	return(list(tree = tree, baseline.tt = baseline.tt, combinedData = XT));
+}
+
+# Re-combine any dataset of same row order from given path (w/o branch lengths)
+fitTree = function(YT, path) {
+	endp = length(path) / 2; 
+	for (i in 1:endp) {
+	 high = path[i*2]; low = path[i*2 - 1];
+	 YT[low,] = YT[low,] + YT[high,];
+	 rownames(YT)[low] = paste('(',rownames(YT)[low],',',
+			rownames(XT)[high],')',sep='');
+	 YT = YT[-high,];
+	}
+}
+
+# Cuts the tree at a given branch depth and returns subtrees with paths
+cutTree = function (XT, depth, useDist = F) {
+
+}