#########################################################
## Copyright (c) 2005
## University of Washington
## Licensed under the terms set forth by University of
## Washington. If you did not sign such a license, you
## are using this software/code illegally and you do not
## have permission to use, modify, or redistribute
## this or any files in this software package.
##
## File : util.r
## Utility functions for EDGE, including reading/writing
## expression data, etc.
#########################################################

source("qvalue.r")
source("include.r")

# Reads gene expression data
# Arguments:
#   expr.file: the name of the file to read
#   na.char: string that represents NA (missing) values in the file,
#      default "NA"
#   description.column: boolean indicating whether the input file has a
#     "Description" column, default FALSE
# Returns:
#   a "edge.expr.data" object, with items named exprs, annotate, geneNames
readexpr.func <- function(expr.file, na.char="NA", description.column=FALSE) {
  check.file.readable(expr.file)
  
  expr.input <- tryCatch(
                         read.table(expr.file, sep = "\t",
                           header = FALSE, as.is = TRUE,
                           fill = TRUE, na.strings = na.char),
                         error=function(e) stop(paste("Could not read from file", expr.file, ":", e$message)))
  
  nc <- ncol(expr.input)
  nr <- nrow(expr.input)

  if(description.column) { 
    if (nc < 3)
      stop(paste("Expected the data file to contain at least 3 columns",
                 "(gene names, description, and at least one data column)",
                 "but it actually has", nc, "columns."))
    expr.data <- matrix(as.numeric(as.matrix(expr.input[2:nr, 3:nc])), nrow = nr - 1)
    ret <- list(exprs = expr.data, annotate = as.character(expr.input[2:nr, 2]),
                geneNames = as.matrix(expr.input[2:nr, 1]))
  } else {
    if (nc < 2)
      stop(paste("Expected the data file to contain at least 2 columns",
                 "(gene names column and at least one data column)",
                 "but it actually has", nc, "columns."))
    expr.data <- matrix(as.numeric(as.matrix(expr.input[2:nr, 2:nc])), nrow = nr - 1)
    ret <- list(exprs=expr.data, geneNames= as.matrix(expr.input[2:nr, 1]))
  }
  class(ret) <- "edge.expr.data"
  ret
}

## Writes gene expression data
writeexpr.func <- function(expr.file, exprs, geneNames, annotate=NULL)
{
  m <- cbind(geneNames, annotate, exprs)
  write.table(m, file=expr.file,  quote=FALSE, sep="\t", row.names=FALSE, col.names=TRUE)
}

## Reads covariate data
readcov.func <- function(cov.file) {
  check.file.readable(cov.file)
  cov.input <- read.table(cov.file, fill = T, sep = "\t", as.is = TRUE, header = FALSE)
  nr = dim(cov.input)[1]
  nc = dim(cov.input)[2]
  
  cov.data <- cov.input[2:nr, 2:nc]
  if(length(cov.data) == 1)
    cov.data <- matrix(cov.data, nrow=1)
  
  names(cov.data) <- as.character(cov.input[1, 2:nc])
  rownames(cov.data) <- as.character(cov.input[2:nr, 1])
  
  ret <- list(covData = cov.data, varLabels = rownames(cov.data))
  class(ret) <- "edge.expr.covariates"
  ret
}

# Saves results of a differential expression analysis to a file.
## Arguments:
##   filename : Path of file to save
##   psig.ord : p-values
##   genesig.ord : gene names
##   qsig.ord : q-values
##   headers : Text for the column headers. Columns are in the order
##     rank, gene name, p-value, and q-value
##   nsmall : Number of digits to save to the right of the decimal point
savediffexresults.func <- function(filename, psig.ord, genesig.ord, qsig.ord,
                                   headers = c("Rank", "Gene Name", "P-Value", "Q-Value"),
                                   nsmall=5)
{
  if(length(psig.ord)==0)
    stop("Cannot save empty result set")
  
  debug.params()
  
  if(is.null(filename))
    stop("filename may not be null")
  
  genecount <- length(psig.ord)
  vecs <- c("genesig.ord", "qsig.ord")
  oldWarn = getOption("warn")
  options(warn = -1)
  tryCatch(
           output <- cbind(1:length(psig.ord),
                           flatSQuote(genesig.ord),
                           as.numeric(format(psig.ord, nsmall=nsmall)),
                           as.numeric(format(qsig.ord, nsmall=nsmall))
                           ),
           finally=options(warn = oldWarn)
           )
  
  colnames(output) <- headers
  
  # Sanity check : vector lengths are all the same
  for(vec in vecs) {
    len <- eval(parse(text = paste("length(", vec, ")")))
    if(len != genecount)
      stop(paste("length(", vec, ") =", len, ", different than length(psig.ord) =", genecount))
  }
  
  write.table(output, file = filename, col.names = TRUE, row.names = FALSE, quote = FALSE, sep = "\t")
}

calc.qvalues <- function(p, diffextype, data, pi0.method, lambda, chunks=NULL)
{
  ## Calculate q values.
  if(is.null(p))
    stop("p value array is null")

  if(is.null(data))
    stop("data is null")

  if(!is.matrix(data) && !is.table(data))
    stop("data is not a matrix nor a table")

  if(is.null(lambda))
    stop(paste("got a null value for lambda"))
         
  if(lambda < 0 || lambda >= 1)
    stop(paste("lambda =", lambda, "must be in the interval [0, 1)."))
  
  all.p <- p

  if(diffextype == "static")
  {
    if(is.null(chunks)) {
      chunks <- c(1, nrow(data)+1) ##stop("chunks is null")
    }
  }
  else
  {
    chunks <- c(1, nrow(data)+1)
  }
  pvals <- NULL
  qvals <- NULL

  results <- list()
  results$pi0 <- NULL
  for(i in 1:(length(chunks)-1)) {
    p <- all.p[chunks[i]:(chunks[i+1]-1)]

    err <- NULL
    qobj <- tryCatch(qvalue(p[!is.na(p)], lambda = lambda,
      pi0.method = pi0.method), error = function(e) err <<- e)

    ## Was there an error while calculating q-values?
    if(!is.null(err)) {
      if (class(err) == "idError") {
        if(err$id == qvalue.errors$pi0.lessthanzero) {
          ## If it is a pi0 <= 0 error, set pi0 to 1 and try again
          qobj <- qvalue(p[!is.na(p)], lambda = lambda,
                         pi0.method = "supplied", pi0.supplied = 1)
          results$warning <- paste("pi0 was less than or equal to zero", if (length(chunks) > 1) " (for one or more chunks)", ", so it was set to one for the q-value calculation.", sep="")
        } else {
          stop(err)
        }
      } else {
        print("Stopping")
        ## pass non-idError objects on through
        stop(err)
      }      
    } 
    
    results$pi0 <- c(results$pi0, qobj$pi0)

    # This orders the significant genes according to either p-values or q-values.
    pvals <- c(pvals, qobj$pvalues)
    qvals <- c(qvals, qobj$qvalues)
  }

  results$qobj <- qobj   
  results$q <- qvals

  return(results)
}

###
### Orders the results of a differential expression analysis
### by observed statistics,
### and returns a list of result vectors above the
### alpha p- or q-value cutoff.
###
### If maxnumsig is not NA or 0, adjusts the cutoff
### so that there are no more than maxnumsig
### significant genes.
### Exception: If the first X p- or q-values are
### equivalent, return those X results, regardless
### of maxnumsig
###
order.results <- function(obs.stats, pvals, qvals, data, geneNames, porq, alpha, maxnumsig=1000)
{
  if(missing(obs.stats))
    stop("argument obs.stats is missing")

  debug.params()
  
  results <- list()
  
  check.not.nullna(pvals)
  check.not.nullna(qvals)

  if(is.null(data))
    stop("data is null")

  if(is.null(geneNames))
    stop("geneNames is null")

  ## Everything is ordered by the observed statistics
  results$all.ord <- quick.order(obs.stats, decreasing = T)

  ## Order the p- and q-values
  pvals.ord <- pvals[results$all.ord]
  qvals.ord <- qvals[results$all.ord]

  if(porq == "p") {
    ## Prune by p-value
    prune <- pvals.ord <= alpha
  } else if(porq == "q") {
    ## Prune by q-value
    prune <- qvals.ord <= alpha
  }
  ## Turn NAs into FALSE
  prune <- sapply(prune, function(x) if(is.na(x)) FALSE else x)

  ## Number of significant genes
  nsig <- sum(prune)
  
  ## Check whether number of significant genes exceeds maxnumsig
  if(!is.null(maxnumsig)
     && !is.na(maxnumsig)
     && maxnumsig > 0
     && nsig > maxnumsig)
    {
      ## Results are already ordered by p-value; just take the top maxnumsig
      vals.ord <- (if (porq == "p") pvals.ord else qvals.ord)
      results$newalpha <- vals.ord[maxnumsig]

      ## Ensure that the number of results will be <= maxnumsig
      if(maxnumsig < length(vals.ord))
        {
          newmax = maxnumsig
          while(newmax > 0 && vals.ord[newmax + 1] == results$newalpha)
            {
              newmax <- newmax - 1
            }

          if(newmax != 0)
            {
              results$newalpha <- vals.ord[newmax]
              prune <- (vals.ord <= results$newalpha)
            }
          else
            {
              ## The first maxnumsig p/q-values are all equal!
              ## We did the best we could; just return all
              ## results that have p/q value equal to alpha
              prune <- (vals.ord == results$newalpha)
            }
        }
      
      nsig <- sum(prune)
      if(results$newalpha > 1)
        stop(paste("newalpha =", results$newalpha))
    }

  ## Ordered, significant p-values
  psig.ord <- pvals.ord[prune]
  ## Ordered, significant q-values
  qsig.ord <- qvals.ord[prune]
  ## Ordered, significant gene names
  genesig.ord <- geneNames[results$all.ord][prune]
  ## Ordered rows of significant gene expression data
  datasig.ord <- data[results$all.ord,][prune,] 
  ## Ordered gene names (all)
  geneNames.ord <- geneNames[results$all.ord]
  

  datasig.m <- datasig.ord
  if(is.null(datasig.m))
  {
    results$qerr <- c(results$qerr, simpleError("could not get significant gene data"))
  }

  results$nsig <- nsig
  results$psig.ord <- psig.ord
  results$qsig.ord <- qsig.ord
  results$genesig.ord <- genesig.ord
  results$datasig.ord <- datasig.ord
  results$datasig.m <- datasig.m
  results$prune <- prune
  results$geneNames.ord <- geneNames.ord
  
  return(results)
}

## Concatenates two matrices
## Same as cbind()
concat.matrices <- function(m1, m2)
{
  if(is.null(m1) && is.null(m2))
    return(NULL)
  
  if(!is.null(m1) && !is.matrix(m1))
    stop("m1 is not a matrix")
  if(!is.null(m2) && !is.matrix(m2))
    stop("m2 is not a matrix")

  if(!is.null(m1) && !is.null(m2) && ncol(m1) != ncol(m2))
    stop("matrices do not have same number of columns")
  
  v <- NULL
  nr <- 0
  if(!is.null(m1)) {
    v <- c(as.vector(t(m1)))
    nr <- nrow(m1)
  }
  if(!is.null(m2)) {
    v <- c(v, as.vector(t(m2)))
    nr <- nr + nrow(m2)
  }
    
  m3 <- matrix(v, nrow=nr, byrow = TRUE)
}
