qvalue <- function(p, alpha=NULL, lam=NULL, robust=F)
{
#This is a function for estimating the q-values for a given set of p-values. The
#methodology comes from a series of recent papers on false discovery rates by John
#D. Storey et al. See http://www.stat.berkeley.edu/~storey/ for references to these
#papers. This function was written by John D. Storey. Copyright 2002 by John D. Storey.
#All rights are reserved and no responsibility is assumed for mistakes in or caused by
#the program.
#
#Input
#=============================================================================
#p: a vector of p-values (only necessary input)
#alpha: a level at which to control the FDR (optional)
#lam: the value of the tuning parameter to estimate pi0 (optional)
#robust: an indicator of whether it is desired to make the estimate more robust 
#        for small p-values (optional)
#
#Output
#=============================================================================
#remarks: tells the user what options were used, and gives any relevant warnings
#pi0: an estimate of the proportion of null p-values
#qvalues: a vector of the estimated q-values (the main quantity of interest)
#pvalues: a vector of the original p-values
#significant: if alpha is specified, and indicator of whether the q-value fell below alpha 
#    (taking all such q-values to be significant controls FDR at level alpha)

#This is just some pre-processing
    m <- length(p)
#These next few functions are the various ways to automatically choose lam
#and estimate pi0
    if(!is.null(lam)) {
        pi0 <- mean(p>lam)/(1-lam)
        remark <- "The user prespecified lam in the calculation of pi0."
    }
    else{
        remark <- "A smoothing method was used in the calculation of pi0."
        library(modreg)
        lam <- seq(0,0.95,0.01)
        pi0 <- rep(0,length(lam))
        for(i in 1:length(lam)) {
        pi0[i] <- mean(p>lam[i])/(1-lam[i])
        }
        spi0 <- smooth.spline(lam,pi0,df=3,w=(1-lam))
        pi0 <- predict.smooth.spline(spi0,x=0.95)$y
    }
#The q-values are actually calculated here
    u <- order(p)
    v <- rank(p)
    qvalue <- pi0*m*p/v
    if(robust) {
        qvalue <- pi0*m*p/(v*(1-(1-p)^m))
        remark <- c(remark, "The robust version of the q-value was calculated. See Storey JD (2002) JRSS-B 64: 479-498.")
    }
    qvalue[u[m]] <- min(qvalue[u[m]],1)
    for(i in (m-1):1) {
    qvalue[u[i]] <- min(qvalue[u[i]],qvalue[u[i+1]],1)
    }
#Here the results are returned
    if(!is.null(alpha)) {
        return(remarks=remark, pi0=pi0, qvalue=qvalue, significant=(qvalue <= alpha), pvalue=p)
    }
    else {
        return(remarks=remark, pi0=pi0, qvalue=qvalue, pvalue=p)
    }
}

#read in data, get rid of outliers
dat2 <- read.table("brcadata.txt",header=TRUE,sep="\t")
nam <- dimnames(dat2)[[2]]
dat <- matrix(rep(0,ncol(dat2)*nrow(dat2)),ncol=ncol(dat2))
for(i in 1:ncol(dat)) {
dat[,i] <- as.numeric(dat2[,i])
}
y <- rep(0,length(nam))
y[nam=="BRCA1"] <- 1
y[nam=="BRCA2"] <- 2
dat <- dat[,y>0]
y <- y[y>0]
o <- rep(T,nrow(dat))
for(i in 1:nrow(dat)) {
if(max(dat[i,])>20) {o[i]<-F}
}
dat <- dat[o,]
dat <- log(dat,base=2)
n <- ncol(dat)
m <- nrow(dat)

#makes two-sample t-statistics
ttest <- function(x,y,f=0) {
m1 <- apply(x[,y==1],1,mean)
m2 <- apply(x[,y==2],1,mean)
v1 <- apply(x[,y==1],1,var)
v2 <- apply(x[,y==2],1,var)
s1 <- sqrt(v1/sum(y==1))
s2 <- sqrt(v2/sum(y==2))
tt <- (m2-m1)/(sqrt(s1^2 + s2^2)+f)
return(tt=tt, ss=(sqrt(s1^2 + s2^2)+f))
}

tt <- ttest(dat,y)$tt

#calculates null statistics
tt0 <- 0
set.seed(123)
B <- 100
for(i in 1:B) {
v <- sample(y)
tt0 <- c(tt0,ttest(dat,v)$tt)
}
tt0 <- tt0[-1]

#form p-values
att <- abs(tt)
att0 <- abs(tt0)
v <- c(rep(T,m),rep(F,m*B))
v <- v[rev(order(c(att,att0)))]
u <- 1:length(v)
w <- 1:m
p <- (u[v==TRUE]-w)/(B*m)
p <- p[rank(-att)]

#calculates q-values and pi0 estimate
qobj <- qvalue(p)

#qobj$qval contains q-values
#qobj$pval contains p-values
#qobj$pi0 contains pi0 estimate

#calculate fold-change (on log base 2 scale)
m1 <- apply(dat[,y==1],1,mean)
m2 <- apply(dat[,y==2],1,mean)
fc <- (m2-m1)

#write q-values, p-values, and fold-change into file in same order as brca.txt
qval <- rep(NA,length(o))
pval <- rep(NA,length(o))
fchng <- rep(NA,length(o))
qval[o] <- qobj$q
pval[o] <- qobj$pval
fchng[o] <- fc
for(i in 1:length(o)) {
cat(c(round(qval[i],6),pval[i],round(fchng[i],3),'\n'),file="qvalues.txt",append=T)
}


#Figures:
#Figure 1
hist(p,nclass=20,main=" ",xlab="Density of observed p-values",ylab=" ",freq=F,ylim=c(0,4))
abline(h=1)
abline(h=0.67,lty=2)
#Figure 2a
plot(tt[order(tt)],qobj$q[order(tt)],type="l",xlim=c(-8,6),xlab="t-statistics",ylab="q-values",main="a")
#Figure 2b
p2 <- qobj$pval[order(qobj$pval)]
q2 <- qobj$qval[order(qobj$pval)]
plot(p2,q2,type="l",xlab="p-values",ylab="q-values",main="b")
#Figure 2c
plot(q2,1:3170,type="l",xlim=c(0,0.1),ylim=c(0,350),xlab="q-value",ylab="number of significant genes",main="c")
#Figure 2d
plot(1:500,q2[1:500]*(1:500),type="l",xlab="number of significant genes",ylab="number of expected false positives",main="d")
#Figure 3
library(modreg)
lam <- seq(0,0.95,0.01)
pi0 <- rep(0,length(lam))
for(i in 1:length(lam)) {
pi0[i] <- mean(p>lam[i])/(1-lam[i])
}
spi0 <- smooth.spline(lam,pi0,df=3,w=(1-lam))
plot(lam,pi0,xlab=expression(lambda),ylab=expression(hat(pi)[0](lambda)))
lines(spi0)