Skip to content

Commit

Permalink
Added some files I forgot.
Browse files Browse the repository at this point in the history
  • Loading branch information
Richard Neal committed Jan 6, 2013
1 parent c5b48b1 commit 96637b6
Show file tree
Hide file tree
Showing 3 changed files with 444 additions and 0 deletions.
32 changes: 32 additions & 0 deletions clustr.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/usr/bin/Rscript
# Command-line wrapper around myCluster(): clusters the table named by the
# first argument and prints "<output file>,<row label string>" on stdout.

source("clustr1.r")
#source('mycluster.r');
source("hclusttophylo.r")

# Positional command-line arguments (hacky way for now).
# The global names below are kept as-is in case sourced code relies on them.
args      <- commandArgs(trailingOnly = TRUE)
ifile     <- args[1]   # input table
method    <- args[2]   # linkage method
metric    <- args[3]   # distance metric
output    <- args[4]   # output type
title     <- args[5]   # plot title
p         <- args[6]   # forwarded to myCluster as p
type      <- args[7]   # input file type
labelFile <- args[8]   # file of replacement row labels
scrubtags <- args[9]   # tag text forwarded to myCluster
divitags  <- args[10]  # tag text forwarded to myCluster

# The result is written to a randomly named file under /tmp; phyloxml
# output additionally gets an ".xml" extension.
filename <- paste0("/tmp/rcluster", runif(1))
if (output == "phyloxml") {
  filename <- paste0(filename, ".xml")
}

row_label_string <- myCluster(
  ifile,
  method = method,
  metric = metric,
  output.type = output,
  outputfile = filename,
  main = title,
  p = p,
  type = type,
  labelFile = labelFile,
  scrubtags = scrubtags,
  divitags = divitags
)

# Report the output file and the row labels as one comma-separated line.
cat(filename, row_label_string, sep = ",")

124 changes: 124 additions & 0 deletions clustr1.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
#' Hierarchically cluster the rows of a frequency table
#'
#' Reads a delimited table, converts every row to relative frequencies (each
#' value divided by its row total), computes a distance matrix with `dist()`,
#' clusters it with `hclust()`, and optionally writes the resulting tree as a
#' PDF dendrogram or passes it to `hClustToXML()`.
#'
#' Idea for text labeling input: `textlabs` holds the text part of each label
#' and `chunksize` the number of chunks in each text. They correspond
#' elementwise, so the first text gets the first `chunksize` rows, and the
#' data must be ordered with one text followed by the next.
#'
#' @param input.file Path of the table to read. hclust assumes the
#'   objects/chunks are the rows; no transposition is done here.
#' @param textlabs Optional character vector of text names used to build row
#'   labels of the form `<text><chunk number>`.
#' @param chunksize Optional numeric vector, same length as `textlabs`, giving
#'   the number of chunks (rows) for each text. Its sum must equal the number
#'   of rows in the table.
#' @param metric Distance measure, forwarded to `dist()` as `method`
#'   (e.g. "euclidean", "maximum", "manhattan", "canberra", "binary",
#'   "minkowski").
#' @param method Linkage method, forwarded to `hclust()` (e.g. "ward",
#'   "single", "complete", "average", "mcquitty", "median", "centroid").
#' @param output.type "pdf" writes a dendrogram PDF, "phyloxml" calls
#'   `hClustToXML()`; any other value produces no output file.
#' @param outputfile Path of the file to write.
#' @param main Plot title; must be a character string.
#' @param header,comment.char,row.names Forwarded to `read.table()`.
#' @param p Forwarded to `dist()` as `p` after coercion to numeric
#'   (command-line callers pass it as a string).
#' @param type Input format: "csv" reads comma-separated, "txt" reads
#'   whitespace-separated, anything else reads tab-separated.
#' @param labelFile Optional path of a comma-separated file whose first line
#'   holds replacement row labels. It is ignored when `NULL`, missing, empty,
#'   or when the number of labels differs from the number of rows.
#' @param scrubtags,divitags Strings printed beneath the PDF plot, with
#'   underscores replaced by spaces.
#' @return A single string: "<r>" followed by the row labels, all separated
#'   by commas.
myCluster <- function(input.file , textlabs = NULL , chunksize = NULL ,
                      metric = "euclidean" , method = "average" ,
                      output.type = "pdf", outputfile = "Dendogram" ,
                      main = "Dendogram", header = TRUE, comment.char = "",
                      row.names = 1, p = 2, type = 'tsv',
                      labelFile = NULL, scrubtags = " ", divitags = " ") {

  # Validate cheap preconditions before doing any I/O or clustering.
  if (!is.character(main)) stop("main must be a character string")

  # Pick the field separator from the declared input type (default: tab).
  field_sep <- if (isTRUE(type == "csv")) {
    ","
  } else if (isTRUE(type == "txt")) {
    ""
  } else {
    "\t"
  }
  tTable <- utils::read.table(as.character(input.file), header = header,
                              comment.char = comment.char,
                              row.names = row.names, sep = field_sep)

  # Proportion of each word within its chunk: divide every row by its total.
  # A vector of length nrow is recycled down each column of the data frame.
  relFreq <- tTable / unname(rowSums(tTable))

  # Optional labels built from text names and per-text chunk counts.
  if (!is.null(textlabs) && !is.null(chunksize)) {
    if (length(textlabs) != length(chunksize)) {
      stop("number of texts and corresponding chunk numbers must match")
    }
    if (sum(chunksize) != nrow(relFreq)) {
      stop("sum of chunksize must equal the number of rows in the input table")
    }
    # rep()/sequence() yield text1, text2, ... per text and correctly emit
    # nothing for a text with zero chunks (1:0 would count backwards).
    row.names(relFreq) <- paste0(rep(textlabs, times = chunksize),
                                 sequence(chunksize))
  }

  # Optional replacement labels from a file. Guard against NULL, NA, missing
  # and empty files, all of which previously raised an error.
  use_label_file <- !is.null(labelFile) && length(labelFile) == 1L &&
    !is.na(labelFile) &&
    isTRUE(file.info(as.character(labelFile))$size > 0)
  if (use_label_file) {
    tempLABELS <- utils::read.csv(as.character(labelFile), sep = ",",
                                  as.is = TRUE, header = FALSE)
    # One label per column of the first line; only apply on an exact match.
    if (length(tempLABELS) == nrow(relFreq)) {
      row.names(relFreq) <- vapply(
        tempLABELS,
        function(column) as.character(column[1]),
        character(1),
        USE.NAMES = FALSE
      )
    }
  }

  dist.tTable <- stats::dist(relFreq, method = metric, p = as.numeric(p))
  hCluster <- stats::hclust(dist.tTable, method = method)

  if (output.type == "pdf") {
    grDevices::pdf(as.character(outputfile), onefile = TRUE,
                   width = 7.25, height = 10)
    # Close this device even if plotting fails, so it never leaks.
    pdf_device <- grDevices::dev.cur()
    on.exit(grDevices::dev.off(pdf_device), add = TRUE)

    # Be sure there's room for the labels: widen the right margin with the
    # longest label.
    longest_label <- max(nchar(hCluster$labels))
    graphics::par(mar = c(6.1, 2.1, 4.1, (longest_label / 2.0)))

    # Create bottom lines of tags from scrubber and divitext.
    t.subtitle <- paste("TreeView Options: Distance Metric: ", metric,
                        ", Linkage Method: ", method)
    s.subtitle <- gsub("_", " ", scrubtags, fixed = TRUE)
    d.subtitle <- gsub("_", " ", divitags, fixed = TRUE)
    subtitle <- paste(s.subtitle, "\n", d.subtitle, "\n", t.subtitle)

    plot(stats::as.dendrogram(hCluster), main = main, horiz = TRUE, cex = 2,
         axes = FALSE, xlab = "", sub = subtitle, cex.sub = .5)
  } else if (output.type == "phyloxml") {
    hClustToXML(hCluster, as.character(outputfile), TRUE, metric, method)
  }

  # Return the row labels as a single comma-separated string.
  paste(c("<r>", row.names(relFreq)), collapse = ",")
}
Loading

0 comments on commit 96637b6

Please sign in to comment.