Commit: See readme.txt for usage
Showing 1 changed file with 196 additions and 0 deletions.
@@ -0,0 +1,196 @@
################ Within-Rank Analysis of Novel Groups by Likeness in R ###############
###################### 1987 Nei-Saitou-inspired special edition ######################

# Read map and taxonomy files in old QIIME format (no comments), OR provide them directly
read_map_taxa = function(mapPath, taxaPath, GroupCol, taxThres) {
  if (length(mapPath) > 1) { map = mapPath; }    # already a table, use as-is
  else if (!length(mapPath)) { map = 0; }        # no map supplied
  else map = read.delim(mapPath, sep='\t', comment='', head=T, row.names=1, check.names=F);
  if (length(taxaPath) > 1) { taxa = taxaPath; } else taxa =
    read.delim(taxaPath, sep='\t', comment='', head=T, row.names=1, check.names=F);
  if (!identical(map, 0)) { # if (map) errors when map is a data frame
    # map = map[!is.na(match(rownames(map),colnames(taxa))),]; # asymmetry
    taxa = as.matrix(taxa[, match(rownames(map), colnames(taxa))]);
  }
  if (length(GroupCol) > 1) { Groups = GroupCol; } else Groups = map[[GroupCol]];
  return(list(map = map, XT = taxa[rowMeans(taxa) > taxThres, ], Groups = factor(Groups)));
}
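
# A minimal usage sketch (the file names below are hypothetical, not part of this repo):
# the map is a tab-delimited sample metadata table, the taxa table has samples as columns,
# and GroupCol names a binary metadata column (or is a vector of group labels passed directly).
# parsed = read_map_taxa("map.txt", "taxa_table.txt", GroupCol = "Group", taxThres = 0.001);
# dim(parsed$XT); table(parsed$Groups);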

# Setup function; this generates the initial matrices upon which later algorithms operate
setup = function(mapPath, taxaPath, GroupCol, taxThres = 0, randSubset = 0, mat.tt = NULL,
                 A = T, hardCut = F, shift = 1) {
  data = read_map_taxa(mapPath, taxaPath, GroupCol, taxThres);
  map = data$map; XT = data$XT; Groups = data$Groups;
  num = dim(XT)[1]; area = num * num; mdim = num; whole = seq(1, dim(XT)[2]);
  if (randSubset) { # Randomly subset a portion of samples to examine
    whole = sample(seq(1, dim(XT)[2]), dim(XT)[2]);
    part = whole[seq(1, length(whole) * randSubset)];
    XT = XT[, part]; Groups = Groups[part];
  }
  # Group indices are computed after any subsetting so they line up with XT's columns
  G1ix = Groups == levels(Groups)[1]; G2ix = Groups == levels(Groups)[2];

  # Generate baseline t-test p-values between (binary) groups for each feature
  baseline.tt = apply(XT, 1, function(xx) t.test(xx[G1ix], xx[G2ix])$p.value);

  # Generate matrix of pairwise additive t-test p-values for each feature pair
  if (is.null(mat.tt)) {
    print("Generating baseline matrix...");
    mat.tt = matrix(rep(0, area), num); count = 0; timer = 0;
    for (i in 1:num) {
      for (j in i:num) {
        mat.tt[i,j] = t.test(XT[i,G1ix] + XT[j,G1ix], XT[i,G2ix] + XT[j,G2ix])$p.value;
        mat.tt[j,i] = mat.tt[i,j];
        q = ifelse(j == i, 1, 2);
        count = count + q; timer = timer + q;
      }
      if (timer >= area * 0.1)
        { print(paste("Progress:", count/area*100, "%...")); timer = 0; }
    }
  }
  print("Baseline matrix generated.");

  # Calculate pairwise distances based on ratio of improvement
  mat.tt.dist = matrix(rep(0, area), num);
  for (i in 1:mdim) for (j in i:mdim) {
    if (A) { mat.tt.dist[i,j] = mat.tt[i,j] / (.5 * (baseline.tt[i] + baseline.tt[j])); }
    else mat.tt.dist[i,j] = 0.5*(mat.tt[i,j]/baseline.tt[j] + mat.tt[i,j]/baseline.tt[i]);
    if (mat.tt.dist[i,j] > 1) mat.tt.dist[i,j] = shift +
      ifelse(hardCut, 0, log10(mat.tt.dist[i,j]));
    mat.tt.dist[j,i] = mat.tt.dist[i,j];
  }
  # stor = mat.tt.dist;

  # mat.tt.dist = apply(mat.tt,1,function(xx) xx / baseline.tt)
  # for (j in 1:mdim) for (k in j:mdim) {
  #   mat.tt.dist[j,k] = 0.5*(mat.tt.dist[j,k]+mat.tt.dist[k,j]);
  #   if (mat.tt.dist[j,k] > 1) mat.tt.dist[j,k] = 1+log10(mat.tt.dist[j,k]);
  #   mat.tt.dist[k,j] = mat.tt.dist[j,k];
  # }
  # print(paste("premat similarity:",sum(mat.tt.dist==stor)/length(stor)))
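
  # Worked illustration of the A=T distance (hypothetical numbers): if features i and j
  # have baseline p-values 0.04 and 0.08, and their summed abundances test at p = 0.01,
  # the distance is 0.01 / (0.5 * (0.04 + 0.08)) = 1/6; values below 1 mean the pair
  # improved on its average baseline, while values above 1 are compressed by the
  # shift/log10 rule above.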

  # Calculate D, the Nei-Saitou neighbor-finder, based on improvement distances
  D = matrix(rep(0, area), num);
  r = rowSums(mat.tt.dist);
  for (i in 1:mdim) for (j in i:mdim) {
    d = mat.tt.dist[i,j];
    ri = 1 / (mdim - 2) * r[i] - d;
    rj = 1 / (mdim - 2) * r[j] - d;
    D[i,j] = d - ri - rj;
    D[j,i] = D[i,j];
  }

  ## faster version, only use if not reusing ri, rj in later dist calculation
  # for (i in 1:mdim) for (j in i:mdim) {
  #   D[i,j] = (mdim - 2) * mat.tt.dist[i,j] - r[i] - r[j];
  #   D[j,i] = D[i,j];
  # }

  for (i in 1:mdim) D[i,i] = Inf; # Set diagonal to infinity to avert self-selection
  return(list(G1ix = G1ix, G2ix = G2ix, XT = XT, baseline.tt = baseline.tt, mat.tt = mat.tt,
              mat.tt.dist = mat.tt.dist, D = D, r = r, randSubset = randSubset, whole = whole));
}
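
# A minimal reuse sketch (hypothetical file names): setup() can be run once and its result
# handed to wranglR() via 'dat', or its mat.tt reused to try different distance settings
# without redoing the quadratic number of pairwise t-tests.
# base = setup("map.txt", "taxa_table.txt", "Group", taxThres = 0.001);
# alt  = setup("map.txt", "taxa_table.txt", "Group", taxThres = 0.001,
#              mat.tt = base$mat.tt, A = F, hardCut = T);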

# The main function to be called from R or the command line; only binary groups are supported
wranglR = function(mapPath, taxaPath, GroupCol = "Group", taxThres = 0, randSubset = 0, dat = NULL,
                   A = T, hardCut = F, shift = 1) {
  # dat defaults to NULL rather than NA, since is.na() on a setup() list is not a valid test
  if (!is.null(dat)) { dt = dat; }
  else dt = setup(mapPath, taxaPath, GroupCol, taxThres, randSubset, NULL, A, hardCut, shift);
  G1ix = dt$G1ix; G2ix = dt$G2ix; XT = dt$XT; baseline.tt = dt$baseline.tt; D = dt$D;
  mat.tt = dt$mat.tt; mat.tt.dist = dt$mat.tt.dist; randSubset = dt$randSubset; r = dt$r;
  num = dim(XT)[1]; mdim = num; # Cache the initial number of species/entries
  iter = 1; found = 0;

  numTests = num*(num-1)/2 + num; # The number of t-tests performed thus far
  while (mdim > 2) {
    pair = which.min(D);
    px = ceiling(pair/mdim); py = pair - mdim * (px - 1);
    low = min(px, py); high = max(px, py); # to shrink matrix later
    tt.both = mat.tt[low,high];
    if (low == high) { print(paste("match exact at:", low)); break; } # Error condition

    found = found + 1;
    XT[low,] = XT[low,] + XT[high,]; # Merge rows into lower ix row
    # distG = 0.5 * (mat.tt.dist[low,high] + (1 / (mdim - 2) * r[low] -
    #   mat.tt.dist[low,high]) - (1 / (mdim - 2) * r[high] - mat.tt.dist[low,high]));
    distG = mat.tt.dist[low,high];
    print(paste("found:", found, "at:", low, high, "Dist:", mat.tt.dist[low,high],
                "New, l, h:", tt.both, baseline.tt[low], baseline.tt[high]));
    # Assign the new row a Newick name with distances according to Nei-Saitou NJ
    rownames(XT)[low] = paste('(', rownames(XT)[low], ':', distG, ',',
                              rownames(XT)[high], ':', distG, ')', sep='');
    # Remove the trailing extra row (and column, for the matrices based on XT)
    XT = XT[-high,];
    mat.tt = mat.tt[-high,-high];
    mat.tt.dist = mat.tt.dist[-high,-high];
    baseline.tt = baseline.tt[-high];
    mdim = mdim - 1;

    # The baseline of this row becomes this round's combined score
    baseline.tt[low] = tt.both;
    for (j in 1:mdim) {
      mat.tt[low,j] = t.test(XT[low,G1ix] + XT[j,G1ix], XT[low,G2ix] + XT[j,G2ix])$p.value;
      if (A) { mat.tt.dist[low,j] = mat.tt[low,j] / (.5 * (baseline.tt[j] + baseline.tt[low])); }
      else mat.tt.dist[low,j] = 0.5*(mat.tt[low,j]/baseline.tt[j] + mat.tt[low,j]/baseline.tt[low]);
      if (mat.tt.dist[low,j] > 1) mat.tt.dist[low,j] = shift +
        ifelse(hardCut, 0, log10(mat.tt.dist[low,j]));
    }
    mat.tt[,low] = mat.tt[low,];   # row and col the same
    mat.tt.dist[,low] = mat.tt.dist[low,];
    # stor = mat.tt.dist;

    # mat.tt.dist = matrix(rep(0,mdim*mdim),mdim);
    # for (i in 1:mdim) for (j in i:mdim) {
    #   if (A) { mat.tt.dist[i,j] = mat.tt[i,j] / (.5 * (baseline.tt[i] + baseline.tt[j])); }
    #   else mat.tt.dist[i,j] = 0.5*(mat.tt[i,j]/baseline.tt[j]+mat.tt[i,j]/baseline.tt[i]);
    #   if (mat.tt.dist[i,j] > 1) mat.tt.dist[i,j] = 1 + log10(mat.tt.dist[i,j]);
    #   mat.tt.dist[j,i] = mat.tt.dist[i,j];
    # }
    # print(sum(mat.tt.dist==stor)/length(stor))

    # This bit repeats the D calculation from setup(); a candidate for functionalizing
    D = matrix(rep(0, mdim*mdim), mdim);
    r = rowSums(mat.tt.dist);
    for (i in 1:mdim) for (j in i:mdim) {
      d = mat.tt.dist[i,j];
      ri = 1 / (mdim - 2) * r[i] - d;
      rj = 1 / (mdim - 2) * r[j] - d;
      D[i,j] = d - ri - rj;
      D[j,i] = D[i,j];
    }
    # for (i in 1:mdim) for (j in i:mdim) {
    #   D[i,j] = (mdim - 2) * mat.tt.dist[i,j] - r[i] - r[j];
    #   D[j,i] = D[i,j];
    # }
    for (i in 1:mdim) D[i,i] = Inf; # Set diagonal to infinity

    numTests = numTests + mdim - 1; # Increment the number of tests performed
    iter = iter + 1;
  }
  names(baseline.tt) = rownames(XT); # Assign new names back to the tt object

  distG = mat.tt.dist[1,2]; # Set the final distance to that of the remaining pair
  tree_pre = paste('(', rownames(XT)[1], ':', distG, ',', rownames(XT)[2], ':', distG, ')', sep='');

  # Sanitize characters that would break Newick parsing, then terminate the tree string
  tree = paste(gsub(";", "-", tree_pre), ";", sep="");
  tree = gsub('[[]', "<", tree);
  tree = gsub('[]]', ">", tree);
  tree = gsub(" ", "_", tree);
  return(list(tree = tree, baseline.tt = baseline.tt, combinedData = XT));
}
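
# A minimal end-to-end sketch (hypothetical file names; the 'ape' package is an optional
# extra for plotting and is not required by this script):
# res = wranglR("map.txt", "taxa_table.txt", GroupCol = "Group", taxThres = 0.001);
# writeLines(res$tree, "wranglR_tree.nwk");   # Newick string built above
# head(sort(res$baseline.tt));                # p-values of the merged groups
# # plot(ape::read.tree(text = res$tree));    # optional visualization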

# Re-combine any dataset with the same row order from a given merge path (w/o branch lengths)
fitTree = function(YT, path) {
  endp = length(path) / 2;
  for (i in 1:endp) {
    high = path[i*2]; low = path[i*2 - 1];
    YT[low,] = YT[low,] + YT[high,];
    rownames(YT)[low] = paste('(', rownames(YT)[low], ',',
                              rownames(YT)[high], ')', sep='');
    YT = YT[-high,];
  }
  return(YT);
}
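
# A minimal sketch of the expected 'path' format (hypothetical values): flat (low, high)
# index pairs in merge order, e.g. the indices printed by wranglR's "found ... at:" lines.
# mergePath = c(3, 7, 2, 5);          # hypothetical: merge row 7 into 3, then row 5 into 2
# YT.merged = fitTree(YT, mergePath);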

# Cuts the tree at a given branch depth and returns subtrees with paths
cutTree = function (XT, depth, useDist = F) {
  # Placeholder: not yet implemented
}