qvalue <- function(p, alpha=NULL, lam=NULL, robust=F) { #This is a function for estimating the q-values for a given set of p-values. The #methodology comes from a series of recent papers on false discovery rates by John #D. Storey et al. See http://www.stat.berkeley.edu/~storey/ for references to these #papers. This function was written by John D. Storey. Copyright 2002 by John D. Storey. #All rights are reserved and no responsibility is assumed for mistakes in or caused by #the program. # #Input #============================================================================= #p: a vector of p-values (only necessary input) #alpha: a level at which to control the FDR (optional) #lam: the value of the tuning parameter to estimate pi0 (optional) #robust: an indicator of whether it is desired to make the estimate more robust # for small p-values (optional) # #Output #============================================================================= #remarks: tells the user what options were used, and gives any relevant warnings #pi0: an estimate of the proportion of null p-values #qvalues: a vector of the estimated q-values (the main quantity of interest) #pvalues: a vector of the original p-values #significant: if alpha is specified, and indicator of whether the q-value fell below alpha # (taking all such q-values to be significant controls FDR at level alpha) #This is just some pre-processing m <- length(p) #These next few functions are the various ways to automatically choose lam #and estimate pi0 if(!is.null(lam)) { pi0 <- mean(p>lam)/(1-lam) remark <- "The user prespecified lam in the calculation of pi0." } else{ remark <- "A smoothing method was used in the calculation of pi0." library(modreg) lam <- seq(0,0.95,0.01) pi0 <- rep(0,length(lam)) for(i in 1:length(lam)) { pi0[i] <- mean(p>lam[i])/(1-lam[i]) } spi0 <- smooth.spline(lam,pi0,df=3,w=(1-lam)) pi0 <- predict.smooth.spline(spi0,x=0.95)$y } #The q-values are actually calculated here u <- order(p) v <- rank(p) qvalue <- pi0*m*p/v if(robust) { qvalue <- pi0*m*p/(v*(1-(1-p)^m)) remark <- c(remark, "The robust version of the q-value was calculated. See Storey JD (2002) JRSS-B 64: 479-498.") } qvalue[u[m]] <- min(qvalue[u[m]],1) for(i in (m-1):1) { qvalue[u[i]] <- min(qvalue[u[i]],qvalue[u[i+1]],1) } #Here the results are returned if(!is.null(alpha)) { return(remarks=remark, pi0=pi0, qvalue=qvalue, significant=(qvalue <= alpha), pvalue=p) } else { return(remarks=remark, pi0=pi0, qvalue=qvalue, pvalue=p) } } #read in data, get rid of outliers dat2 <- read.table("brcadata.txt",header=TRUE,sep="\t") nam <- dimnames(dat2)[[2]] dat <- matrix(rep(0,ncol(dat2)*nrow(dat2)),ncol=ncol(dat2)) for(i in 1:ncol(dat)) { dat[,i] <- as.numeric(dat2[,i]) } y <- rep(0,length(nam)) y[nam=="BRCA1"] <- 1 y[nam=="BRCA2"] <- 2 dat <- dat[,y>0] y <- y[y>0] o <- rep(T,nrow(dat)) for(i in 1:nrow(dat)) { if(max(dat[i,])>20) {o[i]<-F} } dat <- dat[o,] dat <- log(dat,base=2) n <- ncol(dat) m <- nrow(dat) #makes two-sample t-statistics ttest <- function(x,y,f=0) { m1 <- apply(x[,y==1],1,mean) m2 <- apply(x[,y==2],1,mean) v1 <- apply(x[,y==1],1,var) v2 <- apply(x[,y==2],1,var) s1 <- sqrt(v1/sum(y==1)) s2 <- sqrt(v2/sum(y==2)) tt <- (m2-m1)/(sqrt(s1^2 + s2^2)+f) return(tt=tt, ss=(sqrt(s1^2 + s2^2)+f)) } tt <- ttest(dat,y)$tt #calculates null statistics tt0 <- 0 set.seed(123) B <- 100 for(i in 1:B) { v <- sample(y) tt0 <- c(tt0,ttest(dat,v)$tt) } tt0 <- tt0[-1] #form p-values att <- abs(tt) att0 <- abs(tt0) v <- c(rep(T,m),rep(F,m*B)) v <- v[rev(order(c(att,att0)))] u <- 1:length(v) w <- 1:m p <- (u[v==TRUE]-w)/(B*m) p <- p[rank(-att)] #calculates q-values and pi0 estimate qobj <- qvalue(p) #qobj$qval contains q-values #qobj$pval contains p-values #qobj$pi0 contains pi0 estimate #calculate fold-change (on log base 2 scale) m1 <- apply(dat[,y==1],1,mean) m2 <- apply(dat[,y==2],1,mean) fc <- (m2-m1) #write q-values, p-values, and fold-change into file in same order as brca.txt qval <- rep(NA,length(o)) pval <- rep(NA,length(o)) fchng <- rep(NA,length(o)) qval[o] <- qobj$q pval[o] <- qobj$pval fchng[o] <- fc for(i in 1:length(o)) { cat(c(round(qval[i],6),pval[i],round(fchng[i],3),'\n'),file="qvalues.txt",append=T) } #Figures: #Figure 1 hist(p,nclass=20,main=" ",xlab="Density of observed p-values",ylab=" ",freq=F,ylim=c(0,4)) abline(h=1) abline(h=0.67,lty=2) #Figure 2a plot(tt[order(tt)],qobj$q[order(tt)],type="l",xlim=c(-8,6),xlab="t-statistics",ylab="q-values",main="a") #Figure 2b p2 <- qobj$pval[order(qobj$pval)] q2 <- qobj$qval[order(qobj$pval)] plot(p2,q2,type="l",xlab="p-values",ylab="q-values",main="b") #Figure 2c plot(q2,1:3170,type="l",xlim=c(0,0.1),ylim=c(0,350),xlab="q-value",ylab="number of significant genes",main="c") #Figure 2d plot(1:500,q2[1:500]*(1:500),type="l",xlab="number of significant genes",ylab="number of expected false positives",main="d") #Figure 3 library(modreg) lam <- seq(0,0.95,0.01) pi0 <- rep(0,length(lam)) for(i in 1:length(lam)) { pi0[i] <- mean(p>lam[i])/(1-lam[i]) } spi0 <- smooth.spline(lam,pi0,df=3,w=(1-lam)) plot(lam,pi0,xlab=expression(lambda),ylab=expression(hat(pi)[0](lambda))) lines(spi0)