#########################################################
## Copyright (c) 2005
## University of Washington
## Licensed under the terms set forth by University of
## Washington. If you did not sign such a license, you
## are using this software/code illegally and you do not
## have permission to use, modify, or redistribute
## this or any files in this software package.
##
## File : cluster.r
## Code for clustering datasets.
## Similar to R's clust() function, but:
## Uses internal C functions and can run clustering in a
## background thread.
#########################################################

if(!is.loaded("StartClusteringThread"))
  dyn.load(paste("edge", .Platform$dynlib.ext, sep=""))

## Creates an object with all clustering options
## (ordering type, distance type etc)
## that can be passed around easily
new.storeyclust.options <- function(
  ordering="none", 
  distance="euclidean",
  linkage="centroid") {

  options <- list(ordering=ordering, distance=distance, linkage=linkage)
  class(options) <- "storeyclust.options"
  return(options)
}

## Creates an object with all cluster plot options
new.storeyclust.plot.options <- function(
  colorby="value",
  colors=NA,
  showRowLabels=FALSE,
  showColLabels=FALSE) {

  options <- list(colorby=colorby, colors=colors, showRowLabels=showRowLabels,
                  showColLabels=showColLabels)
  class(options) <- "storeyclust.plot.options"
  return(options)
}


#########################################################################
## This function clusters the rows of a matrix of (gene            
## expression) data.
##
## Arguments:       
##  m: the matrix to cluster
##  orderfunc: the function to use for ordering leaves of the 
##     clustering tree. Possible values are 
##     0 (no ordering),
##     1 (heuristic ordering), and 
##     2 (optimal ordering, very slow)
##         
## Return value: an R object of class "hclust" (same structure that
##  is returned by the built-in R function "hclust".
##
## Author(s): Eva Monsen           
#########################################################################
storeyclust.func <- function (m, orderfunc = 0, distance="euclidean")
{

if(is.null(m))
{
  stop("m may not be null")
}

# 0 = euclidean, 1 = correlation
if(distance == "euclidean") distance=0 
if(distance == "correlation") distance=1
if(!is.numeric(distance)) stop(paste("Unknown distance:", distance))

if(!is.numeric(orderfunc))
{
  if("none" == orderfunc || is.na(orderfunc) || is.null(orderfunc)) orderfunc = 0
  if("heuristic" == orderfunc) orderfunc = 1
  if("optimal" == orderfunc) orderfunc = 2
}
if(!is.numeric(orderfunc)) stop(paste("Unknown ordering:", orderfunc))

# allocate arguments for library call
Hmerge.vec <- integer(len=2*(nrow(m)-1))
Hheight <- double(len=nrow(m)-1)
Horder <- integer(len=nrow(m))

result <- .Call("Cluster", 
       m, 
       nrow(m), 
       ncol(m), 
       Hmerge.vec,
       Hheight,
       Horder,
       orderfunc,
       distance)

# row labels (none for now; TODO: gene names, or pass as arg)
#Hlabels <- as.character(1:nrow(m))
Hmethod <- "centroid"

Hmerge <- matrix(Hmerge.vec, ncol=2)

tree    <- list(merge=Hmerge,height=Hheight,order=Horder,labels=NULL,method=Hmethod)
class(tree) <- "hclust"
return(tree)
}

#########################################################################
## This function converts integer Red, Green, and Blue values to a string
## representing the color. E.g. r = 31, g = 32, b = 33 becomes "#1F2021"
#########################################################################
i2rgb <- function(r, g, b)
{
  if(r < 0 || r > 255 || g < 0 || g > 255 || b < 0 || b > 255)
    stop("some value(s) not < 255 and > 0")
  return (sprintf("#%.2X%.2X%.2X", r, g, b));
}

#########################################################################
## This function generates an array of hex color strings ("#FFFFFF")
## ranging from intense red (#FF0000) to white (#FFFFFF) to intense blue
## (#0000FF)
#########################################################################
redwhiteblue <- function(n)
{
	## hold red, vary others from 0 to 255
	red = n %/% 2
	redif = 255 %/% red
	v <- vector(len=0)
	for(i in 0:(red-1))
	{	
		v <- c(v, i2rgb(255, redif*i, redif*i))
	}
	v <- c(v, i2rgb(255,255,255))

	blue = (n - 1) %/% 2
	bluedif = 255 %/% blue
	for(i in (blue-1):0)
	{
		v <- c(v, i2rgb(bluedif*i, bluedif*i, 255))
	}
	return(v)
}

#########################################################################
## This function generates an array of hex color strings ("#FFFFFF")
## ranging from intense blue (#0000FF) to white (#FFFFFF) to intense red
## (#FF0000)
#########################################################################
bluewhitered <- function(n, log=TRUE)
{
  cmax <- 255
  n2 <- (n+1)%/%2 + 1
  if(log)
  {
    x <- c(1:n2)
  
    logx <- log(x)
  
    vx <- as.integer((max(logx)-logx)*cmax/max(logx))
  }
  else
  {
    vx <- as.integer(rev(seq(0, cmax, length=n2)))
  }
  
  v <- NULL

  ## blue
  for(i in (n2):1)
  {
    v <- c(v, i2rgb(vx[i], vx[i], 255))
  }

  ## red
  for(i in 2:(n2-1))
  {
    v <- c(v, i2rgb(255, vx[i], vx[i]))
  }

  return(v)

}

## Truncates all values in a matrix within
## the range [cutoff-around, cutoff+around]
## Arguments:
##    m: the data matrix to cut off
##    cutoff: half the size of the range
##    around: defaults to the mean of m, but
##      may be anything
cutoff.around <- function(m, cutoff, around=NA)
{
  if(is.na(around))
    around <- mean(as.vector(m))

  if(cutoff < 0) 
    cutoff <- -cutoff

  minval <- around - cutoff
  maxval <- around + cutoff

  for(i in 1:length(m))
    if(m[i] < minval) 
      m[i] <- minval
    else if (m[i] > maxval) 
      m[i] <- maxval

  return(m)
}

#########################################################################
## This function plots a matrix of rows and the dendrogram representing
## hierarchical clustering of the rows.
##
## Arguments:                                                      
##   m: The matrix of rows
##   hc: an R object of class "hclust" (contains a dendrogram)
##   numcolors: number of distinct colors in the heatmap
##   rowLabels: Labels for the rows
##   rank: if TRUE, colors the data based on RANK, not on VALUE.
##     this makes a more even plot. (plotting a dataset with one 
##     crazy outlier would result in a plot that is mostly one color.)
##   sd.cutoff: Whether or not to truncate values at a cutoff that
##     is a multiple of the standard deviation
##   sd.coeff: Coefficient for SD cutoff
##
## Return value: none
##
## Author(s): Eva Monsen                                           
#########################################################################
plstoreyclust.func <- function(m, hc, numcolors=50, rowLabels=NA, colLabels=NA, colorby="value",
		               cutoffvalue=NA, scaleby="row", logcolors=TRUE)
{

if(!is.na(cutoffvalue))
{
  m <- cutoff.around(m, cutoffvalue)
}

## get the dendrogram from the hclust object
dd <- as.dendrogram(hc)

## generate colors - 
## blue representing low numbers, white representing near-mean numbers, 
## and red representing high numbers
mycols = bluewhitered(numcolors, log=logcolors)


pm <- m
if ("rank"==colorby)
{
  ## input to plotting is a matrix of the ranks of data values
  ## (for more even plotting)
  pm <- matrix(rank(m), ncol=ncol(m))
}

# heatmap uses a deep recursion, so increase the allowed depth

options(expressions=100000)

## generate heatmap
heatmap(pm, 
	Rowv=dd, 
	Colv=NA, 
	scale=scaleby, 
	col=mycols,
	labRow=rowLabels,
	labCol=colLabels,
	reorderfun=NA,
	keep.dendro=TRUE)
# gah - reduce recursion depth to save core file space
# right after increasing it I find an infinite recursion bug!

options(expressions=1000)

}

storeyclust.msgids <<- c(
		   "Creating distance matrix",
		   "Creating clusters",
		   "(Ordering stage 1 of 3) Computing costs",
		   "(Ordering stage 2 of 3) Getting best pair",
		   "(Ordering stage 3 of 3) Backtracking",
		   "Ordering leaves")

storeyclust.getmessage <- function(msgid)
{
  if(is.null(msgid) || is.na(msgid) || !is.numeric(msgid))
    stop("msgid must be numeric and non-NA")

  if(msgid < 0 || msgid >= length(storeyclust.msgids))
    stop(paste("msgid must be > 0 and <", length(storeyclust.msgids)))

  return (storeyclust.msgids[msgid+1])
}

#########################################################################
### This function gets the progress of a previously started clustering
### thread.
##
### Arguments:       
###  An integer previously returned from storeyclust.start.func
##
### Return value: a list containing the following integer values:
###   min, max, current: Indicators of how far along the clustering process
###    is in the current stage.
###   done: 1 if clustering is finished, 0 otherwise.
###   msgid: Indicates which stage of clustering we are in. Possible values:
###      0 = Creating distance matrix (BCLUST_CREATE_DISTANCE_MATRIX)
###      1 = Creating clusters (BCLUST_CREATE_CLUSTERS)
###      2 = (Optimal ordering only) Computing costs (OPTORD_COMPUTECOSTS)
###      3 = (Optimal ordering only) Getting best pair (OPTORD_GETBESTPAIR)
###      4 = (Optimal ordering only) Backtracking (OPTORD_BACKTRACK)
###      5 = (Heuristic ordering only) Ordering leaves (HEURORD_ORDERLEAVES)
##
### Author(s): Eva Monsen           
#########################################################################
storeyclust.progress.func <- function(tid)
{
  if(is.null(tid) || is.na(tid) || !is.numeric(tid))
    stop("tid must be a non-NA number")

  progress <- .Call("GetProgressValues", tid)
  
  return(progress)
}

#########################################################################
## This function gets the result of a clustering process.
##
## Arguments:       
##  An integer previously returned from storeyclust.start.func
##
## Return value: an "hclust" object
#########################################################################
storeyclust.finish.func <- function(m, tid)
{
  if(is.null(tid) || is.na(tid) || !is.numeric(tid))
    stop("tid must be a non-NA number")

  Hmerge.vec <- integer(len=2*(nrow(m)-1))
  Hheight <- double(len=nrow(m)-1)
  Horder <- integer(len=nrow(m))

  .Call("FinishClusteringThread", tid, Hmerge.vec, Hheight, Horder)

# gah - print clustering end time if debugOn
  if(debugOn) print(cat("Clustering ended",as.character(Sys.time())));

  Hmethod <- "centroid"
  Hmerge <- matrix(Hmerge.vec, ncol=2)
  tree    <- list(merge=Hmerge,height=Hheight,order=Horder,labels=NULL,method=Hmethod)
  class(tree) <- "hclust"
  return(tree)
}

#########################################################################
## This function explicitly deletes information associated with a thread
## from memory.
##
## Arguments:       
##  An integer previously returned from storeyclust.start.func
##
## Return value: none
#########################################################################
storeyclust.delete.func <- function(tid)
{
  .Call("DeleteThread", tid)
}

#########################################################################
## This function stops a clustering thread if it is running.
##
## Arguments:       
##  An integer previously returned from storeyclust.start.func
##
## Return value: none
#########################################################################
storeyclust.stop.func <- function(tid)
{
  status <- .Call("StopThread", tid)
  return(status)
}


#########################################################################
## This function starts a clustering process in a background thread.
##
## Arguments:       
##  Same as those for storeyclust.func.
##
## Return value: a unique integer identifier for the background thread,
##   that may be passed to storeyclust.progress.func and 
##   storeyclust.finish.func
##
## Author(s): Eva Monsen           
#########################################################################
storeyclust.start.func <- function(data)
{

  if(class(data) != "list")
  {
    stop("data must be a list")
  }

  m <- data$m
  orderfunc <- data$orderfunc
  distance <- data$distance
  
  if(is.null(m))
  {
    stop("m may not be null")
  }
  
  ## distance. 0 = euclidean, 1 = correlation
  if(distance == "euclidean") distance=0 
  if(distance == "correlation") distance=1
  if(!is.numeric(distance)) stop(paste("Unknown distance:", distance))
  
  if(!is.numeric(orderfunc))
  {
    if("none" == orderfunc || is.na(orderfunc) || is.null(orderfunc)) orderfunc = 0
    if("heuristic" == orderfunc) orderfunc = 1
    if("optimal" == orderfunc) orderfunc = 2
  }
  if(!is.numeric(orderfunc)) stop(paste("Unknown ordering:", orderfunc))

# gah - report the start time if debugOn
  if(debugOn) print(cat("Clustering started",as.character(Sys.time())));

  tid <- .Call("StartClusteringThread", m, nrow(m), ncol(m), orderfunc, distance)
  return(tid)
}

