Added some files I forgot.

richardneal · Jan 6, 2013 · 96637b6 · 96637b6
1 parent c5b48b1
commit 96637b6
Show file tree

Hide file tree

Showing 3 changed files with 444 additions and 0 deletions.
diff --git a/clustr.r b/clustr.r
@@ -0,0 +1,32 @@
+#!/usr/bin/Rscript
+
+source( 'clustr1.r' );
+#source('mycluster.r');
+source('hclusttophylo.r');
+
+# hacky way for now
+args <- commandArgs(trailingOnly=TRUE);
+ifile <-  args[1];
+method <- args[2];
+metric <- args[3];
+output <- args[4];
+title <- args[5];
+p <- args[6];
+type <- args[7];
+labelFile <- args[8];
+scrubtags <- args[9];
+divitags <- args[10];
+
+filename <- paste("/tmp/rcluster",runif(1), sep="" );
+
+if(output == "phyloxml")
+{
+	filename <- paste(filename, ".xml", sep="");
+}
+
+rownames<-myCluster( ifile, method=method, metric=metric, output.type=output,
+        outputfile=filename, main=title, p=p, type=type, labelFile=labelFile,
+	scrubtags=scrubtags, divitags=divitags);
+
+cat(filename,rownames,sep=",");
+
diff --git a/clustr1.r b/clustr1.r
@@ -0,0 +1,124 @@
+# Idea for text labeling input
+# textlabs is a list of character strings for the text part of the the label
+# chunksize is a list of numeric for the number of chunks in each text
+# textlabs and chunksize must have same length and are assumed to correspond elementwise
+# so that first textlabs has first chunksize number of chunks
+
+myCluster <- function(input.file , textlabs = NULL , chunksize = NULL ,
+                      metric = "euclidean" , method = "average" ,
+                      output.type = "pdf", outputfile = "Dendogram" , 
+					  main = "Dendogram",header=T, comment.char="", 
+					  row.names=1, p=2, type='tsv',
+					  labelFile=NULL,scrubtags=" ",divitags=" "){
+
+
+
+
+        ## List of possible distance metrics
+        ## METHODS <- c("euclidean", "maximum", "manhattan", "canberra",
+        ## "binary", "minkowski")
+
+        ## List of possible cluster-distance methods
+        ## METHODS <- c("ward", "single", "complete", "average", "mcquitty",
+        ## "median", "centroid")
+
+
+    library(stats)
+     #change this for the text you'd like to input
+    if (type == 'csv') {
+    	input.data <- read.table(as.character(input.file), header=header, comment.char=comment.char, row.names=row.names, sep=",")
+    }
+    else if (type == 'txt') {
+		input.data <- read.table(as.character(input.file), header=header, comment.char=comment.char, row.names=row.names, sep="")
+    }
+    else {
+    	input.data <- read.table(as.character(input.file), header=header, comment.char=comment.char, row.names=row.names, sep="\t")
+    }
+
+    tTable <- input.data #Transpose is necessary if data file originally has words for rows and chunks for columns
+					# hclust assumes objects/chunks are the rows
+					# if someone passes us data, we should check which direction
+
+    rowSums <- apply(tTable, 1, sum) # getting the total for each chunk/row, we know we are adding acrossing the rows because of the second argument
+	denoms <- matrix(rep(rowSums, dim(tTable)[2]), byrow=F, ncol=dim(tTable)[2]) # repeat the row sums by n times; n= the number of columns. 
+# the matrix is filled by column. 
+
+    relFreq <- tTable/denoms #  the original data divides denoms piece-wise. This gives the proportion of each word in a chunk 
+
+    if( !is.null(textlabs) && !is.null(chunksize)) { # if use textlabs and chunksize, the data must be order with one text followed by the next
+		if(length(textlabs) != length(chunksize)) stop("number of texts and corresponding chunk numbers must match")
+		else {# check that sum(chunksize) == dim(relFreq)[1] , total number of chunks equals number of rows in relFreq
+			L <- length(chunksize)
+			temp <- NULL
+			for(i in 1:L) {
+				for(k in 1:chunksize[i]){
+				temp <- c(temp,paste(textlabs[i],as.character(k),sep=""))
+				}
+			}
+			row.names(relFreq) <- temp
+		}
+	}
+	# else 0
+
+    # change the names of the labels
+    #row.names(relFreq) <- c("a","b","c","d","e","f","g","h","i","j"...)
+	if (file.info(labelFile)$size!=0)
+	{
+	    tempLABELS <- read.csv(as.character(labelFile), sep=",", as.is=TRUE, header=FALSE);
+	    if (length(tempLABELS) == length(row.names(relFreq)))
+	    {
+	        row.names(relFreq) <- tempLABELS;
+	    }	
+    }
+
+
+
+
+    dist.tTable <- dist(relFreq , method = metric, p=p)
+
+    hCluster <- hclust(dist.tTable, method = method)
+
+    if(!is.character(main)) stop("main must be a character string")
+
+    if(output.type=="pdf"){
+    	# dev.control()
+		#outfilename <- paste(outputfile,".pdf",sep="")
+		outfilename<-paste(outputfile,sep="")
+    	pdf(outfilename , onefile = TRUE, width=7.25, height=10)
+		max <- max( nchar( hCluster$labels ) ) # be sure there's room for the labels
+		par( mar=c( 6.1, 2.1, 4.1, ( max / 2.0 ) ) ) # margins
+    	#plot(hCluster, hang = -1, main = main)
+
+		# create bottom lines of tags from scrubber an divitext
+		t.subtitle <- paste("TreeView Options: Distance Metric: ",metric,", Linkage Method: ",method)
+    		s.subtitle <- gsub("_"," ",scrubtags,fixed=T)
+		s.subtitle <- paste(s.subtitle,sep="")
+    		d.subtitle <- gsub("_"," ",divitags,fixed=T)
+		d.subtitle <- paste(d.subtitle,sep="")
+
+		subtitle <- paste(s.subtitle,"\n",d.subtitle,"\n",t.subtitle)
+
+		plot( as.dendrogram(hCluster), main=main, horiz=TRUE, cex=2, axes=FALSE, xlab="", sub=subtitle, cex.sub=.5);
+		# to put the title on top:
+			# change ylab=main to main=main
+			# change font.lab=2 to font.main=2
+			# and adjust margins
+    	junk <- dev.off() # junk catches stdout of dev. (otherwise everything breaks)
+    }
+    else if (output.type=="phyloxml"){
+	outfilename <- paste(outputfile,sep="");
+	hClustToXML(hCluster, outfilename, TRUE, metric, method)
+
+	}
+    else{}
+
+	# return the row labels as a string 
+	str<-"<r>";
+	for (i in row.names(relFreq)) {
+		str<-paste(str,i,sep=",");
+	}	
+
+	return (str);
+
+
+}