/*********************************************************
** Copyright (c) 2005
** University of Washington
** Licensed under the terms set forth by University of
** Washington. If you did not sign such a license, you
** are using this software/code illegally and you do not
** have permission to use, modify, or redistribute
** this or any files in this software package.
**
** File: edge.c  $Revision: 288 $
**
**********************************************************/
#include <assert.h>
#include <stdio.h>
#include "edge.h"
#include "ProgressThreadDef.h"
#include "MsgIds.h"
#include "debug.h"

#ifdef __cplusplus
extern "C" {
#endif

/**************************************************************************
functions for statex:  
  odp?: compute sum of normal densities to be used as numerator 
  or denominator in score nullODPScore: 

  odpScore had previously been replaced by odp, which computed either
  the numerator or denominator.  There was only one line common to both,
  related to updating the status display, so it seemed obvious to
  separate them into separate routines.  Also, the alternative and null
  routines, computing lr and lr0, are separate allowing for changes in
  the array indexing.  So there are now four:

  odp1:  alternative numerator
  odp2:  alternative denominator
  odp3:  null numerator
  odp4:  null denominator

  They currently all have the same arguments, even though they are
  not all used in all routines.  If I move the k loop in, the null
  routines might have different arguments.

***********************************************************************/

/*
  Compute numerators or denominators of ODP statistics

  gah - change to four separate routines for numerator and denominator
        for lr, and numerator and denominator for lr0.  A few extra
        arguments for some, if they are not all needed.

  xx: m-vector of sum of squared data for each gene
  xk: (m x k) matrix of group-specific sums of squares, collapsed by columns
  mu: (m x k) matrix of group-specific means for each gene, collapsed by columns
  sigma: m-vector of pooled sd's
  m: number of genes
  n: number of samples
  k: number of comparison groups
  nGrp: number of samples in each comparison group
  *null: compute numerator (*null = 0) or denominator (*null = 1)
  m0: number of genes to use in denominator
  nii: m0-vector of indices for genes to use in denominator
  scr: output
*/

void odp(double *xx, double *xk, double *mu, double *sigma, long *m, 
     long *n, long *k, int *nGrp, long *null, long *m0, int *nii, 
     double *scr, update_progress_routine* callback, void* callbackArg, 
     double *currentStart, double *maxStart) {
//   assert(0);
}

/* gah - size parameters first, and passed by value */
/* odp?t routines return the proper maxStart value for the
   corresponding routine.  This allows convenient calculation
   of maxStart in the general case 
 */

/* Every ENOUGH cycles do the callback routine to update the
   percent completed display.  A little more than once a second should
   be enough.  */
#define ENOUGH 200000
/* These constants are used to approximate the time needed for each
   odp routine.  For large mi and mj, only the last term will be
   important, but it isn't hard to get all four of them.  The time
   should be proportional to:  ODPA+ODPB*mj+ODPC*mi+ODP1D*mi*mj
   **************/
#define ODPA  1834.
#define ODPB  367.
#define ODPC  10.
#define ODP1D 445.
#define ODP2D 386.
#define ODP3D 445.
#define ODP4D 373.

double  odp1t(long mi, long mj, long n, long k) {
  return ODPA+ODPB*mj+ODPC*mi+ODP1D*(mi)*(mj);
  }
   
void odp1(long mi, long mj, long n, long k,
     double *xx, double *xk, double *mu, double *sigma, 
     int *nGrp, int *nii, 
     double *scr, update_progress_routine* callback, void* callbackArg, 
     double *currentStart, double *maxStart) {
  long i, j, l;
  int iCurrent = 0;
#ifdef EDGE_DEBUG
/* gah - for rdtsc timing */
  long long t, t1, t2, t3, t4, t5, timing=0,rdtsc();
#endif

  /* tracking thread progress: */ 
  double max, current;

  /* temporaries for i and j loops */
  double middlet, lastt, powt, sigma2t; 

#ifdef EDGE_DEBUG
timing= - rdtsc();
#endif
//  max = maxStart ? *maxStart : (double)(mi) * (mj);
  max = *maxStart;
  current = *currentStart+ODPA+ODPC*mi;

/*  gah - I think the following is wrong: increment is outside k loop */
/*  if(!maxStart && *null == 0) max = max * (k);   */

#ifndef EDGE_DEBUG 
  do_callback(callback, callbackArg, max, current, ODP_INIT);
#else
  t1= -timing;
  do_callback(callback, callbackArg, max, current, ODP_INIT);
  t2=rdtsc();
#endif

  /* compute numerator if alternative component */

/* exchange the i and j loops */
  for(i = 0; i < mi; i++) scr[i] = 0;
#ifdef EDGE_DEBUG
  t4=t5=0;
  t3=rdtsc();
#endif
  for(j = 0; j < mj; j++) {
    current += ODPB;
#ifdef EDGE_DEBUG
  t4-=rdtsc();
#endif
    powt = pow(1 / sigma[j], n);
    sigma2t = -0.5/(sigma[j]*sigma[j]);
/* lastt loop moved out of i loop */
    lastt = 0;
    for(l = 0; l < k; l++) {
      lastt += nGrp[l] * mu[j + l * mj] * mu[j + l * mj];
    }
#ifdef EDGE_DEBUG
  t=rdtsc();
  t4+=t;
  t5-=t;
#endif
    for(i = 0; i < mi; i++, iCurrent++) {
      current += ODP1D;

      if (iCurrent > ENOUGH) {
        do_callback(callback, callbackArg, max, current, ODP_COMPUTING_ODP);
        iCurrent = 0;
      }

/* sum over l for each i and j, so zero middlet here */
/* lastt has no i dependence so move it out of the loop */
      middlet=0;
      for(l = 0; l < k; l++) {
        middlet += 2 * xk[i + l * mi] * mu[j + l * mj];
      }

      scr[i] += powt * exp(sigma2t * (xx[i] - middlet + lastt));
    }
#ifdef EDGE_DEBUG
  t5+=rdtsc();
#endif
  }
#ifdef EDGE_DEBUG
  timing += rdtsc();
#endif
#ifdef EDGE_DEBUG
  for(i=0;i<10 && i<mi;i++) fprintf(stderr,"num[%ld]=%g\n",i,scr[i]);
#endif

  *currentStart = current;

#ifdef EDGE_DEBUG
fprintf(stderr,"odp1 timing=%lld current=%.0f\n",timing,current);
fprintf(stderr,"odpx t2-t1=%lld t3-t2=%lld t4=%lld t5=%lld\n",
         t2-t1,t3-t2,t4,t5);
#endif
}

double  odp2t(long mi, long mj, long n, long k) {
  return ODPA+ODPB*mj+ODPC*mi+ODP2D*(mi)*(mj);
  }
   
void odp2(long mi, long mj, long n, long k,
     double *xx, double *xk, double *mu, double *sigma, 
     int *nGrp, int *nii, 
     double *scr, update_progress_routine* callback, void* callbackArg, 
     double *currentStart, double *maxStart) {
  long i, j;
  int iCurrent = 0;
#ifdef EDGE_DEBUG
/* gah - for rdtsc timing */
  long long timing=0,rdtsc();
#endif

  /* tracking thread progress: */ 
  double max, current;

  /* temporaries for i and j loops */
  double powt, sigma2t; 

#ifdef EDGE_DEBUG
timing -= rdtsc();
#endif
//  max = maxStart ? *maxStart : (double)(mi) * (mj);
  max = *maxStart;
  current = *currentStart+ODPA+ODPC*mi;

/*  gah - I think the following is wrong: increment is outside k loop */
/*  if(!maxStart && *null == 0) max = max * (k);   */

  do_callback(callback, callbackArg, max, current, ODP_INIT);

  /* compute denominator if null component */
  for(i = 0; i < mi; i++) scr[i] = 0.0;
  for(j = 0; j < mj; j++) {
    current += ODPB;
    powt = pow(sigma[nii[j]],-n); 
    sigma2t= -0.5 / (sigma[nii[j]] * sigma[nii[j]]);
    for(i = 0; i < mi; i++, iCurrent++) {
      current += ODP2D;

      if (iCurrent > ENOUGH)
        {
        do_callback(callback, callbackArg, max, current, ODP_COMPUTING_ODP);
        iCurrent = 0;
      }

      scr[i] += powt * exp(sigma2t * xx[i]);
    }
  }
#ifdef EDGE_DEBUG
  for(i=0;i<10 && i<mi;i++) fprintf(stderr,"den[%ld]=%g\n",i,scr[i]);
#endif

  *currentStart = current;
#ifdef EDGE_DEBUG
timing += rdtsc();
#endif
#ifdef EDGE_DEBUG
fprintf(stderr,"odp2 timing=%lld current=%.0f\n",timing,current);
#endif
}

double  odp3t(long mi, long mj, long n, long k) {
  return ODPA+ODPB*mj+ODPC*mi+ODP3D*(mi)*(mj);
  }
   
void odp3(long mi, long mj, long n, long k,
     double *xx, double *xk, double *mu, double *sigma, 
     int *nGrp, int *nii, 
     double *scr, update_progress_routine* callback, void* callbackArg, 
     double *currentStart, double *maxStart) {
  long i, j, l;
  int iCurrent = 0;
#ifdef EDGE_DEBUG
/* gah - for rdtsc timing */
  long long timing=0,rdtsc();
#endif

  /* tracking thread progress: */ 
  double max, current;

  /* temporaries for i and j loops */
  double middlet, lastt, powt, sigma2t; 

#ifdef EDGE_DEBUG
timing -= rdtsc();
#endif
//  max = maxStart ? *maxStart : (double)(mi) * (mj);
  max = *maxStart;
  current = *currentStart+ODPA+ODPC*mi;

/*  gah - I think the following is wrong: increment is outside k loop */
/*  if(!maxStart && *null == 0) max = max * (k);   */

  do_callback(callback, callbackArg, max, current, ODP_INIT);

  /* compute numerator if alternative component */

/* exchange the i and j loops */
  for(i = 0; i < mi; i++) scr[i] = 0;
  for(j = 0; j < mj; j++) {
    current += ODPB;
    powt = pow(1 / sigma[j], n);
    sigma2t = -0.5/(sigma[j]*sigma[j]);
/* lastt loop moved out of i loop */
    lastt = 0;
    for(l = 0; l < k; l++) {
      lastt += nGrp[l] * mu[j + l * mj] * mu[j + l * mj];
    }

    for(i = 0; i < mi; i++, iCurrent++) {

      current += ODP3D;
      if (iCurrent > ENOUGH) {
        do_callback(callback, callbackArg, max, current, ODP_COMPUTING_ODP);
        iCurrent = 0;
      }

/* sum over l for each i and j, so zero middlet here */
/* lastt has no i dependence so move it out of the loop */
      middlet=0;
      for(l = 0; l < k; l++) {
        middlet += 2 * xk[i + l * mi] * mu[j + l * mj];
      }

      scr[i] += powt * exp(sigma2t * (xx[i] - middlet + lastt));
    }
  }
#ifdef EDGE_DEBUG
  for(i=0;i<10 && i<mi;i++) fprintf(stderr,"num0[%ld]=%g\n",i,scr[i]);
#endif
  *currentStart = current;

#ifdef EDGE_DEBUG
timing += rdtsc();
#endif
#ifdef EDGE_DEBUG
fprintf(stderr,"odp3 timing=%lld current=%.0f\n",timing,current);
#endif
}

double  odp4t(long mi, long mj, long n, long k) {
  return ODPA+ODPB*mj+ODPC*mi+ODP4D*(mi)*(mj);
  }
   
void odp4(long mi, long mj, long n, long k,
     double *xx, double *xk, double *mu, double *sigma, 
     int *nGrp, int *nii, 
     double *scr, update_progress_routine* callback, void* callbackArg, 
     double *currentStart, double *maxStart) {
  long i, j;
  int iCurrent = 0;
#ifdef EDGE_DEBUG
/* gah - for rdtsc timing */
  long long timing=0,rdtsc();
#endif

  /* tracking thread progress: */ 
  double max, current;

  /* temporaries for i and j loops */
  double powt, sigma2t; 

#ifdef EDGE_DEBUG
timing -= rdtsc();
#endif
//  max = maxStart ? *maxStart : (double)(mi) * (mj);
  max = *maxStart;
  current = *currentStart+ODPA+ODPC*mi;

/*  gah - I think the following is wrong: increment is outside k loop */
/*  if(!maxStart && *null == 0) max = max * (k);   */

  do_callback(callback, callbackArg, max, current, ODP_INIT);

  /* compute denominator if null component */
  for(i = 0; i < mi; i++) scr[i] = 0.0;
  for(j = 0; j < mj; j++) {
    current += ODPB;
#if 0
    powt = pow(sigma[nii[j]],-n); 
    sigma2t= -0.5 / (sigma[nii[j]] * sigma[nii[j]]);
#else
    powt = pow(sigma[j],-n); 
    sigma2t= -0.5 / (sigma[j] * sigma[j]);
#endif
    for(i = 0; i < mi; i++, iCurrent++) {
      current += ODP4D;

      if (iCurrent > ENOUGH)
        {
        do_callback(callback, callbackArg, max, current, ODP_COMPUTING_ODP);
        iCurrent = 0;
      }

      scr[i] += powt * exp(sigma2t * xx[i]);
    }
  }
#ifdef EDGE_DEBUG
  for(i=0;i<10 && i<mi;i++) fprintf(stderr,"den0[%ld]=%g\n",i,scr[i]);
#endif

  *currentStart = current;
#ifdef EDGE_DEBUG
timing += rdtsc();
#endif
#ifdef EDGE_DEBUG
fprintf(stderr,"odp4 timing=%lld current=%.0f\n",timing,current);
#endif
}

/*
  Compute ODP statistics under null hypothesis by bootstrap

  res: (m x n) matrix of residuals, collapsed by columns
  mu: (m x k) matrix of group-specific means, collapsed by columns
  sigma: m-vector of pooled sd's
  sigma0: m-vector of sd's computed under null hypothesis
         (but only m0 of them will be used)
  vv: (B x n) matrix of indices defining bootstrap samples, collapsed by columns
  ID: (n x k) model matrix, collapsed by columns
  m0: number of genes (the subset of genes used in 
        denominators of original ODP statistics)
  n: number of samples
  k: number of comparison groups
  nGrp: number of samples in each comparison group
  B: number of bootstrap samples
  scr: (m x B) matrix of null ODP statistics, collapsed by columns
*/

void nullODP(long m, long m0, long n, long k, long B, double *res, 
     double *mu, double *sigma, double *sigma0, int *vv, int *ID, int *nii, 
      int *nGrp, double *scr, update_progress_routine* callback, 
      void* callbackArg, double *currentStart, double *maxStart) {

//   int ii;
  long i, j, l, h;
  double *num, *den, *dat0, *xx, *xk, *mu0, *sigmasc,jj;
  /* max and current for tracking thread progress: */ 
  double max = maxStart ? *maxStart : (double)(m0) * (m0+m) * (B);
  double current = currentStart ? *currentStart : 0;

  /* allocate memory */
//  pass nii instead of using all
//  nii = intvector(0, m0 - 1);
  num = vector(0, m - 1);
  den = vector(0, m - 1);
  dat0 = vector(0, m * n - 1);
  xx = vector(0, m - 1);
  xk = vector(0, m * k - 1);
  mu0 = vector(0, m0 - 1);
  sigmasc = vector(0, m0 - 1);

//  for(ii = 0; ii < m0; ii++)
//    nii[ii] = ii;

  /* loop over bootstrap samples */
  for(i = 0; i < B; i++) {
    /* initialization */
    for(j = 0; j < m0; j++) {
      xx[j] = 0.0;
      mu0[j] = 0.0;
	  sigmasc[j] = 0.0;
      for(l = 0; l < k; l++)
        xk[j + l * m0] = 0.0;
    }

    /* form bootstrap sample */
    for(j = 0; j < m0; j++) {
      for(l = 0; l < n; l++) {
        dat0[j + l * m0] = res[j + vv[i + l * B] * m0];
        mu0[j] += dat0[j + l * m0] / n;
      }
      if(k > 1){
	    for(l = 0; l < n; l++) 
	      dat0[j + l * m0] -= mu0[j];
      }
    }
	
	/* Scale by the appropriate standard deviations */
	

  for(j = 0; j < m0; j++){
        for(l = 0; l < n; l++){
            if(k > 1){
              sigmasc[j] += pow(dat0[j + l*m0],2);
            }
            if(k <= 1) {
              jj = dat0[j + l*m0] - mu0[j];
              sigmasc[j] += pow(jj, 2);
            }
        }
        sigmasc[j] = sqrt(sigmasc[j]/(n-1));
    }




	for(j = 0; j < m0; j++){
		for(l = 0; l < n; l++){
			dat0[j + l*m0] = (dat0[j + l*m0]) * sigma[nii[j]] / sigmasc[j];
		}
	} 
	

    /* initial computations on bootstrap sample */
    for(j = 0; j < m0; j++) {
      for(l = 0; l < n; l++)
        xx[j] += dat0[j + l * m0] * dat0[j + l * m0];
      for(l = 0; l < k; l++) {
        for(h = 0; h < n; h++)
          xk[j + l * m0] += dat0[j + h * m0] * ID[h + l * n];
      }
    }

    /* compute scores */
    odp3(m0, m , n, k, xx, xk, mu, sigma, nGrp, nii, num, 
          callback, callbackArg, &current, &max);
    odp4(m0, m0, n, k, xx, xk, mu, sigma0, nGrp, nii, den, 
          callback, callbackArg, &current, &max);

    /* prepare output */
    for(j = 0; j < m0; j++)
      scr[j + i * m0] = num[j] / den[j];
  }

  /* free memory */
//  free_intvector(nii, 0, m0 - 1);
  free_vector(num, 0, - 1);
  free_vector(den, 0,  - 1);
  free_vector(dat0, 0,  - 1);
  free_vector(xx, 0, - 1);
  free_vector(xk, 0,  - 1);
  free_vector(mu0, 0,  - 1);

  if(currentStart) *currentStart = current;
}

/****************************************************************************
functions for timex:  
  nullFIndep: simulate null F statistics under independent sampling
  nullFLongNoInt: simulate null F statistics under longitudinal sampling, no intercept
  nullFLongInt: simulate null F statistics under longitudinal sampling, with intercept
  FNoEM: compute F statistics without using EM algorithm
  FEM: compute F statistics using EM algorithm
  EMFit: the EM algorithm.
  EMFitR: R wrapper for calling EMFit directly
*****************************************************************************/

/* simulate null F statistics under independent sampling */
void nullFIndep(double *fitR, double *resR, int *vR, double *H1R, 
        double *H0R, long *m, long *n, long *B, double *F, 
        update_progress_routine* callback, void* callbackArg) {

  long i, j, k, **v;
  double **fit, **res, **H0, **H1, **dat, **fit1, **fit0, *rss1, *rss0, *Ft;
  double max = (double)(*B) * (*m) * (*n + 1) * (*n + 1);
  double current = 0;
  int iCurrent = 0;

  /* allocate memory */
  v = imatrix(0, *B - 1, 0, *n - 1);
  fit = matrix(0, *m - 1, 0, *n - 1);
  res = matrix(0, *m - 1, 0, *n - 1);
  H0 = matrix(0, *n - 1, 0, *n - 1);
  H1 = matrix(0, *n - 1, 0, *n - 1);
  dat = matrix(0, *m - 1, 0, *n - 1);
  fit1 = matrix(0, *m - 1, 0, *n - 1);
  fit0 = matrix(0, *m - 1, 0, *n - 1);
  rss1 = vector(0, *m - 1);
  rss0 = vector(0, *m - 1);
  Ft = vector(0, *m - 1);

  /* reform matrix inputs (n*(B+m+n) calculations, not counted)*/
  for(i = 0; i < *n; i++) {
    for(j = 0; j < *B; j++)
      v[j][i] = vR[j + i * *B];

    for(j = 0; j < *m; j++) {
      fit[j][i] = fitR[j + i * *m];
      res[j][i] = resR[j + i * *m];
    }

    for(j = 0; j < *n; j++) {
      H0[j][i] = H0R[j + i * *n];
      H1[j][i] = H1R[j + i * *n];
    }
  }

  /* simulate null statistics (B*m*(n^2+2n+1) calculations)*/
  for(i = 0; i < *B; i++) {
    /* initializations (m*n, not counted) */
    for(j = 0; j < *m; j++) {
      for(k = 0; k < *n; k++) {
        fit1[j][k] = 0.0;
        fit0[j][k] = 0.0;
      }

      rss1[j] = 0.0;
      rss0[j] = 0.0;
    }

    /* form a null sample (m*n) */
    for(j = 0; j < *m; j++, current++, iCurrent++) 
      for(k = 0; k < *n; k++) 
    {
      if(iCurrent % 1000 == 0) 
        {
          do_callback(callback, callbackArg, max, current, TIMECOURSE);
          iCurrent = 0;
        }
      dat[j][k] = fit[j][k] + res[j][v[i][k]];
    }

    /* compute F statistics (m*(n^2+n+1)) */
    FNoEM(dat, H1, H0, fit1, fit0, rss1, rss0, m, n, Ft, callback, 
        callbackArg, &current, &max);
    for(j = 0; j < *m; j++)
      F[j + i * *m] = Ft[j]; 
  }

  /* free memory */
  free_imatrix(v, 0, *B - 1, 0, *n - 1);
  free_matrix(fit, 0, *m - 1, 0, *n - 1);
  free_matrix(res, 0, *m - 1, 0, *n - 1);
  free_matrix(H0, 0, *n - 1, 0, *n - 1);
  free_matrix(H1, 0, *n - 1, 0, *n - 1);
  free_matrix(dat, 0, *m - 1, 0, *n - 1);
  free_matrix(fit1, 0, *m - 1, 0, *n - 1);
  free_matrix(fit0, 0, *m - 1, 0, *n - 1);
  free_vector(rss1, 0,  - 1);
  free_vector(rss0, 0,  - 1);
  free_vector(Ft, 0,  - 1);
}

/* simulate null F statistics under longitudinal sampling, no intercept */
void nullFLongNoInt(double *fitR, double *resR, int *vR, int *rmv, 
       int *inc, double *H1R, double *H0R, double *wsqR, int *ind, 
       int *idx, long *m, long *n, long *N, int *nGrp, long *B, double *F,
       update_progress_routine* callback, void* callbackArg) {

  long i, j, k, l, ll, **v;
  double **fit, **res, **res0, **H0, **H1, **wsq, **dat, **fit1, 
       **fit0, *rss1, *rss0, *Ft, grpSum;
  double max = ((double)(*B)) * (*m) * ((double)(*n-*N)*(*n-*N) + *N + 
        (double)(*n + 1) * (*n + 1));
  double current = 0;
  int iCurrent = 0;

  /* allocate memory */
  v = imatrix(0, *B - 1, 0, *n - *N - 1);
  fit = matrix(0, *m - 1, 0, *n - 1);
  res = matrix(0, *m - 1, 0, *n - 1);
  res0 = matrix(0, *m - 1, 0, *n - *N - 1);
  H0 = matrix(0, *n - 1, 0, *n - 1);
  H1 = matrix(0, *n - 1, 0, *n - 1);
  wsq = matrix(0, *n - *N - 1, 0, *n - *N - 1);
  dat = matrix(0, *m - 1, 0, *n - 1);
  fit1 = matrix(0, *m - 1, 0, *n - 1);
  fit0 = matrix(0, *m - 1, 0, *n - 1);
  rss1 = vector(0, *m - 1);
  rss0 = vector(0, *m - 1);
  Ft = vector(0, *m - 1);

  /* reform matrix inputs */
  for(i = 0; i < *n - *N; i++) {
    for(j = 0; j < *B; j++)
      v[j][i] = vR[j + i * *B];
    for(j = 0; j < *m; j++)
      res0[j][i] = resR[j + i * *m];
    for(j = 0; j < *n - *N; j++)
      wsq[j][i] = wsqR[j + i * (*n - *N)];
  }

  for(i = 0; i < *n; i++) {
    for(j = 0; j < *m; j++)
      fit[j][i] = fitR[j + i * *m];
    for(j = 0; j < *n; j++) {
      H0[j][i] = H0R[j + i * *n];
      H1[j][i] = H1R[j + i * *n];
    }
  }

  /* simulate null statistics */
  for(i = 0; i < *B; i++) {
    /* initializations */
    for(j = 0; j < *m; j++) {
      for(k = 0; k < *n; k++) {
        fit1[j][k] = 0.0;
        fit0[j][k] = 0.0;
        res[j][k] = 0.0;
      }

      rss1[j] = 0.0;
      rss0[j] = 0.0;
    }

    /* do something mysterious to the residuals (m*((n-N)*(n-N) + N) */
    for(j = 0; j < *m; j++) {
      for(k = 0; k < *n - *N; k++)
        for(l = 0; l < *n - *N; l++, current++, iCurrent++)
      {
        if(iCurrent % 1000 == 0) 
          {
        do_callback(callback, callbackArg, max, current, TIMECOURSE);
        iCurrent = 0;
          }
        res[j][inc[k]] += res0[j][v[i][l]] * wsq[l][k];
      }

      ll = 0;
      for(k = 0; k < *N; k++, current++, iCurrent++) {
    if(iCurrent % 1000 == 0) 
      {
        do_callback(callback, callbackArg, max, current, TIMECOURSE);
        iCurrent = 0;
      }
        grpSum = 0.0;
        for(l = 0; l < nGrp[k]; l++, ll++)
          grpSum += res[j][idx[ll]];
        res[j][rmv[k]] = -grpSum;
      } 
    }

    /* form a null sample (m*n) */
    for(j = 0; j < *m; j++) 
      for(k = 0; k < *n; k++)
    {
      if(iCurrent % 1000 == 0) 
        {
          do_callback(callback, callbackArg, max, current, TIMECOURSE);
          iCurrent = 0;
        }
      dat[j][k] = fit[j][k] + res[j][k];
    }

    /* compute F statistics m*(n^2 + n + 1) */
    FNoEM(dat, H1, H0, fit1, fit0, rss1, rss0, m, n, Ft, callback, callbackArg, &current, &max);
    for(j = 0; j < *m; j++)
      F[j + i * *m] = Ft[j];
  }


  /* free_memory */
  free_imatrix(v, 0, *B - 1, 0, *n - *N - 1);
  free_matrix(fit, 0, *m - 1, 0, *n - 1);
  free_matrix(res, 0, *m - 1, 0, *n - 1);
  free_matrix(res0, 0, *m - 1, 0, *n - *N - 1);
  free_matrix(H0, 0, *n - 1, 0, *n - 1);
  free_matrix(H1, 0, *n - 1, 0, *n - 1);
  free_matrix(dat, 0, *m - 1, 0, *n - 1);
  free_matrix(wsq, 0, *n - *N - 1, 0, *n - *N - 1);
  free_matrix(fit1, 0, *m - 1, 0, *n - 1);
  free_matrix(fit0, 0, *m - 1, 0, *n - 1);
  free_vector(rss1, 0,  - 1);
  free_vector(rss0, 0,  - 1);
  free_vector(Ft, 0,  - 1);
}

/* simulate null F statistics under longitudinal sampling, with intercept */
void nullFLongInt(double *fitR, double *resR, int *vR, int *v1R, double *H1R,
          double *H0R, double *alphaR, int *ind, int *idx, 
          double *eps, long *m, long *n, long *N, int *nGrp, 
          long *B, double *F,
          update_progress_routine* callback, void* callbackArg) {

  long i, j, k, **v, **v1, *EM;
  double **fit, **res, **H0, **H1, **alpha, **alpha1, **alpha0, **dat, 
          **fit1, **fit0, *rss1, *rss0, **datC, *gm, **A0, **AA0, **AA, 
          *lik0, *lik, *tau, *sig, *atmp, *Ft;

  double max = ((double)*B) * 2;
  double current = 0;

  /* allocate memory */
  v = imatrix(0, *B - 1, 0, *n - 1);
  v1 = imatrix(0, *B - 1, 0, *n - 1);
  fit = matrix(0, *m - 1, 0, *n - 1);
  res = matrix(0, *m - 1, 0, *n - 1);
  H0 = matrix(0, *n - 1, 0, *n - 1);
  H1 = matrix(0, *n - 1, 0, *n - 1);
  alpha = matrix(0, *m - 1, 0, *n - 1);
  alpha1 = matrix(0, *m - 1, 0, *n - 1);
  alpha0 = matrix(0, *m - 1, 0, *n - 1);
  dat = matrix(0, *m - 1, 0, *n - 1);
  fit1 = matrix(0, *m - 1, 0, *n - 1);
  fit0 = matrix(0, *m - 1, 0, *n - 1);
  rss1 = vector(0, *m - 1);
  rss0 = vector(0, *m - 1);
  datC = matrix(0, *m - 1, 0, *n - 1);
  gm = vector(0, *m - 1);
  A0 = matrix(0, *m - 1, 0, *N - 1);
  AA0 = matrix(0, *m - 1, 0, *N - 1);
  AA = matrix(0, *m - 1, 0, *N - 1);
  lik0 = vector(0, *m - 1);
  lik = vector(0, *m - 1);
  tau = vector(0, *m - 1);
  sig = vector(0, *m - 1);
  EM = ivector(0, *m - 1);
  atmp = vector(0, *N - 1);
  Ft = vector(0, *m - 1);

  /* debug */
  saveDblArray("fitR.txt", fitR, (*m) * (*n));
  saveDblArray("resR.txt", resR, (*m) * (*n));
  saveDblArray("H1R.txt", H1R, (*n) * (*n));
  saveDblArray("H0R.txt", H0R, (*n) * (*n));
  saveDblArray("alphaR.txt", alphaR, (*m) * (*n));
  saveIntArray("vR.txt", vR, (*B) * (*n));
  saveIntArray("v1R.txt", v1R, (*B) * (*n));
  saveIntArray("ind.txt", ind, (*n));
  saveIntArray("idx.txt", idx, (*n));

  /* reform matrix inputs */
  for(i = 0; i < *n; i++) {
    for(j = 0; j < *B; j++) {
      v[j][i] = vR[j + i * *B];
      v1[j][i] = v1R[j + i * *B];
    }

    for(j = 0; j < *m; j++) {
      fit[j][i] = fitR[j + i * *m];
      res[j][i] = resR[j + i * *m];
      alpha[j][i] = alphaR[j + i * *m];
    }

    for(j = 0; j < *n; j++) {
      H0[j][i] = H0R[j + i * *n];
      H1[j][i] = H1R[j + i * *n];
    }
  }

  /* simulate null statistics */
  for(i = 0; i < *B; i++) {
    /* initializations */
    for(j = 0; j < *m; j++) {
      for(k = 0; k < *n; k++) {
        fit1[j][k] = 0.0;
        fit0[j][k] = 0.0;
      }

      rss1[j] = 0.0;
      rss0[j] = 0.0;
    }

    /* form a null sample */
    for(j = 0; j < *m; j++) 
      for(k = 0; k < *n; k++)
        dat[j][k] = fit[j][k] + res[j][v[i][k]] + alpha[j][v1[i][k]];

    /* compute F statistics */
    FEM(dat, H1, H0, ind, idx, eps, fit1, fit0, alpha1, alpha0, rss1, 
          rss0, datC, gm, A0, AA0, AA, lik0, lik, tau, sig, EM, atmp, 
          m, n, N, nGrp, Ft, callback, callbackArg, &max, &current);

    for(j = 0; j < *m; j++)
      F[j + i * *m] = Ft[j];
  }

  free_imatrix(v, 0, *B - 1, 0, *n - 1);
  free_imatrix(v1, 0, *B - 1, 0, *n - 1);
  free_matrix(fit, 0, *m - 1, 0, *n - 1);
  free_matrix(res, 0, *m - 1, 0, *n - 1);
  free_matrix(alpha, 0, *m - 1, 0, *n - 1);
  free_matrix(alpha1, 0, *m - 1, 0, *n - 1);
  free_matrix(alpha0, 0, *m - 1, 0, *n - 1);
  free_matrix(H0, 0, *n - 1, 0, *n - 1);
  free_matrix(H1, 0, *n - 1, 0, *n - 1);
  free_matrix(dat, 0, *m - 1, 0, *n - 1);
  free_matrix(fit1, 0, *m - 1, 0, *n - 1);
  free_matrix(fit0, 0, *m - 1, 0, *n - 1);
  free_vector(rss1, 0,  - 1);
  free_vector(rss0, 0,  - 1);
  free_matrix(datC, 0, *m - 1, 0, *n - 1);
  free_vector(gm, 0,  - 1);
  free_matrix(A0, 0, *m - 1, 0, *N - 1);
  free_matrix(AA0, 0, *m - 1, 0, *N - 1);
  free_matrix(AA, 0, *m - 1, 0, *N - 1);
  free_vector(lik0, 0,  - 1);
  free_vector(lik, 0,  - 1);
  free_vector(tau, 0,  - 1);
  free_vector(sig, 0,  - 1);
  free_ivector(EM, 0, *m - 1);
  free_vector(atmp, 0,  - 1);
  free_vector(Ft, 0,  - 1);

  saveDblArray("F.txt", F, (*m) * (*B));

}

/* compute F statistics without using EM algorithm */
void FNoEM(double **dat, double **H1, double **H0, double **fit1, 
       double **fit0, double *rss1, double *rss0, long *m, long *n, 
       double *F, update_progress_routine* callback, 
       void* callbackArg, double *pCurrent, double *pMax) {

  long j, k, l;
  double current = pCurrent == NULL ? 0 : *pCurrent;
  double max = pMax == NULL ? ((double)(*m))*((*n)*(*n) + (*n) + 1) : *pMax;
  int iCurrent = 0;

  /* compute fitted values under two models (m * n * n calculations) */
  for(j = 0; j < *m; j++) {
    for(k = 0; k < *n; k++) {
      for(l = 0; l < *n; l++, current++, iCurrent++) {
    if(iCurrent % 1000 == 0) 
      {
        do_callback(callback, callbackArg, max, current, TIMECOURSE);
        iCurrent = 0;
      }
        fit1[j][k] += dat[j][l] * H1[k][l];
        fit0[j][k] += dat[j][l] * H0[k][l];
      }
    }
  }

  /* compute residual sums-of-squares (m * n calculations) */
  for(j = 0; j < *m; j++) {
    for(k = 0; k < *n; k++, current++, iCurrent++) {
      if(iCurrent % 1000 == 0) 
    {
      do_callback(callback, callbackArg, max, current, TIMECOURSE);
      iCurrent = 0;
    }
      rss1[j] += (dat[j][k] - fit1[j][k]) * (dat[j][k] - fit1[j][k]);
      rss0[j] += (dat[j][k] - fit0[j][k]) * (dat[j][k] - fit0[j][k]);
    }
  }

  /* compute F statistics (m calculations) */
  for(j = 0; j < *m; j++)
    F[j] = rss0[j] / rss1[j] - 1.0;

  if(NULL != pCurrent)
    *pCurrent = current;
}

/* compute F statistics using EM algorithm */
void FEM(double **dat, double **H1, double **H0, int *ind, int *idx, 
     double *eps, double **fit1, double **fit0, double **alpha1, 
     double **alpha0, double *rss1, double *rss0, double **datC, 
     double *gm, double **A0, double **AA0, double **AA, 
     double *lik0, double *lik, double *tau, double *sig, 
     long *EM, double *atmp, long *m, long *n, long *N, int *nGrp, 
     double *F, update_progress_routine* callback, void* callbackArg, 
     double *pMax, double *pCurrent) {

  long j, k, iter;

  double max = pMax == NULL ? 2 : *pMax;
  double current = pCurrent == NULL ? 0 : *pCurrent;

// gah - max, current are passed to EMFit, but not used there.

  /* compute fitted values under two models */
  EMFit(dat, H1, ind, idx, eps, fit1, alpha1, datC, gm, A0, AA0, AA, 
    lik0, lik, tau, sig, EM, atmp, m, n, N, nGrp, &iter,
    callback, callbackArg, &max, &current);
  current++;
  if(NULL != callback) do_callback(callback, callbackArg, max, current, 
        TIMECOURSE);

  EMFit(dat, H0, ind, idx, eps, fit0, alpha0, datC, gm, A0, AA0, 
        AA, lik0, lik, tau, sig, EM, atmp, m, n, N, nGrp, &iter,
        callback, callbackArg, &max, &current);
  current++;
  if(NULL != callback) do_callback(callback, callbackArg, max, current, 
         TIMECOURSE);

  /* compute residual sums-of-squares */
  for(j = 0; j < *m; j++) {
    for(k = 0; k < *n; k++) {
      rss1[j] += (dat[j][k] - fit1[j][k] - alpha1[j][k]) * 
          (dat[j][k] - fit1[j][k] - alpha1[j][k]);
      rss0[j] += (dat[j][k] - fit0[j][k] - alpha0[j][k]) * 
          (dat[j][k] - fit0[j][k] - alpha0[j][k]);
    }
  }

  /* compute F statistics */
  for(j = 0; j < *m; j++)
    F[j] = rss0[j] / rss1[j] - 1.0;

  if(NULL != pCurrent)
    *pCurrent = current;
}

/* the EM algorithm */
/* note: ind argument not used */
void EMFit(double **dat, double **H, int *ind, int *idx, double *eps, 
       double **fit, double **alpha, double **datC, double *gm, 
       double **A0, double **AA0, double **AA, double *lik0, double *lik, 
       double *tau, double *sig, long *EM, double *atmp, long *m, long *n, 
       long *N, int *nGrp, long *iter, update_progress_routine* callback, 
       void* callbackArg, double *pMax, double *pCurrent) {
  
  long i, j, k, l, ll, nEM = *m;

// gah - the callback arguments callback, callbackArg, pMax, pCurrent
//       are not used at all.  This could be changed.

  /* initialization */
  (*iter) = 1;
  for(i = 0; i < *m; i++) {
    gm[i] = 0.0;
    lik0[i] = 0.0;
    lik[i] = 1.0;
    EM[i] = i;
    for(j = 0; j < *N; j++)
      A0[i][j] = 0.0;
  }

  for(i = 0; i < *m; i++) {
    for(j = 0; j < *n; j++)
      gm[i] += dat[i][j];
    gm[i] /= *n;
  }
  
  for(i = 0; i < *m; i++)
    for(j = 0; j < *n; j++)
      datC[i][j] = dat[i][j] - gm[i];

  for(i = 0; i < *m; i++) {
    ll = 0;
    for(j = 0; j < *N; j++) {
      for(k = 0; k < nGrp[j]; k++, ll++)
        A0[i][j] += datC[i][idx[ll]];
      A0[i][j] /= nGrp[j];
      AA0[i][j] = A0[i][j] * A0[i][j];
      alpha[i][j] = A0[i][j];
      AA[i][j] = AA0[i][j];
    }
  }

  /* begin EM algorithm */
  for(; nEM > 1; (*iter)++) {
    for(i = 0; i < nEM; i++) {
      /* initializations */
      for(j = 0; j < *N; j++) {
        alpha[EM[i]][j] = A0[EM[i]][j];
        AA[EM[i]][j] = AA0[EM[i]][j];
        A0[EM[i]][j] = 0.0;
        AA0[EM[i]][j] = 0.0;
      } for(j = 0; j < *n; j++) {
        fit[EM[i]][j] = 0.0;
      }
      tau[EM[i]] = 0.0;
      sig[EM[i]] = 0.0;
    }

    for(i = 0; i < nEM; i++) {
      for(j = 0; j < *n; j++) {
        ll = 0;
        for(k = 0; k < *N; k++) 
          for(l = 0; l < nGrp[k]; l++, ll++) 
            fit[EM[i]][j] += H[j][ll] * (dat[EM[i]][ll] - alpha[EM[i]][k]);
      }

      for(j = 0; j < *N; j++)
        tau[EM[i]] += AA[EM[i]][j];
      tau[EM[i]] /= *N;

      ll = 0;
      for(j = 0; j < *N; j++)
        for(k = 0; k < nGrp[j]; k++, ll++)
          sig[EM[i]] += (dat[EM[i]][ll] - fit[EM[i]][ll]) * 
          (dat[EM[i]][ll] - fit[EM[i]][ll]) - 
            2 * (dat[EM[i]][ll] - fit[EM[i]][ll]) * alpha[EM[i]][j] + 
            AA[EM[i]][j];
      sig[EM[i]] /= *n;

      ll = 0;
      for(j = 0; j < *N; j++) {
        for(k = 0; k < nGrp[j]; k++, ll++)
          A0[EM[i]][j] += dat[EM[i]][idx[ll]] - fit[EM[i]][idx[ll]];
        A0[EM[i]][j] /= nGrp[j];
        A0[EM[i]][j] *= tau[EM[i]] * nGrp[j] / 
            (sig[EM[i]] + tau[EM[i]] * nGrp[j]);
      }

      for(j = 0; j < *N; j++) 
        AA0[EM[i]][j] = tau[EM[i]] * (1.0 - (tau[EM[i]] * nGrp[j] / 
         (sig[EM[i]] + tau[EM[i]] * nGrp[j]))) + A0[EM[i]][j] * A0[EM[i]][j];
    }

    for(i = 0; i < *m; i++)
      lik0[i] = lik[i];

    for(i = 0; i < nEM; i++)
      lik[EM[i]] = -*N / 2 * log(tau[EM[i]]) - *m / 2 * log(sig[EM[i]]);

    nEM = 0;
    for(i = 0; i < *m; i++) 
      if(fabs(lik[i] - lik0[i]) > *eps) 
        EM[nEM++] = i;
  }

  /* expand alpha to m x n */
  for(i = 0; i < *m; i++) {
    ll = 0;
    for(j = 0; j < *N; j++)
      atmp[j] = alpha[i][j];
    for(j = 0; j < *N; j++)
      for(k = 0; k < nGrp[j]; k++, ll++)
        alpha[i][idx[ll]] = atmp[j];
  }

}

/* R wrapper for calling EMFit directly */
void EMFitR(double *datR, double *HR, int *ind, int *idx, double *eps, 
  double *fitR, double *alphaR, 
  long *m, long *n, long *N, int *nGrp, long *iter) {

  long i, j, *EM;
  double **dat, **H, **fit, **alpha, **datC, *gm, **A0, **AA0, **AA, 
        *lik0, *lik, *tau, *sig, *atmp;

  /* allocate memory */
  dat = matrix(0, *m - 1, 0, *n - 1);
  H = matrix(0, *n - 1, 0, *n - 1);
  fit = matrix(0, *m - 1, 0, *n - 1);
  alpha = matrix(0, *m - 1, 0, *n - 1);
  datC = matrix(0, *m - 1, 0, *n - 1);
  gm = vector(0, *m - 1);
  A0 = matrix(0, *m - 1, 0, *N - 1);
  AA0 = matrix(0, *m - 1, 0, *N - 1);
  AA = matrix(0, *m - 1, 0, *N - 1);
  lik0 = vector(0, *m - 1);
  lik = vector(0, *m - 1);
  tau = vector(0, *m - 1);
  sig = vector(0, *m - 1);
  EM = ivector(0, *m - 1);
  atmp = vector(0, *N - 1);

  /* reform matrix inputs */
  for(i = 0; i < *n; i++) {
    for(j = 0; j < *m; j++) 
      dat[j][i] = datR[j + i * *m];

    for(j = 0; j < *n; j++) 
      H[j][i] = HR[j + i * *n];
  }

  /* call EM algorithm */
  EMFit(dat, H, ind, idx, eps, fit, alpha, datC, gm, A0, AA0, AA, lik0, 
    lik, tau, sig, EM, atmp, m, n, N, nGrp, iter, NULL, NULL, NULL, NULL);

  /* outputs */
  for(i = 0; i < *m; i++) {
    for(j = 0; j < *n; j++) {
      fitR[i + j * *m] = fit[i][j];
      alphaR[i + j * *m] = alpha[i][j];
    }
  }

  /* free memory */
  free_matrix(dat, 0, *m - 1, 0, *n - 1);
  free_matrix(H, 0, *n - 1, 0, *n - 1);
  free_matrix(fit, 0, *m - 1, 0, *n - 1);
  free_matrix(alpha, 0, *m - 1, 0, *n - 1);
  free_matrix(datC, 0, *m - 1, 0, *n - 1);
  free_vector(gm, 0,  - 1);
  free_matrix(A0, 0, *m - 1, 0, *N - 1);
  free_matrix(AA0, 0, *m - 1, 0, *N - 1);
  free_matrix(AA, 0, *m - 1, 0, *N - 1);
  free_vector(lik0, 0,  - 1);
  free_vector(lik, 0,  - 1);
  free_vector(tau, 0,  - 1);
  free_vector(sig, 0,  - 1);
  free_ivector(EM, 0, *m - 1);
  free_vector(atmp, 0,  - 1);
}

/***************************************************************************
utility functions:  adapted from Numerical Recipes in C
****************************************************************************/

/* quicksort routine */
void sortQK(long low, long high, long n, double *w) {
  if(low < high) {
    long lo = low, hi = high + 1;
    double elem = w[low];
    for (;;) {
      while ((lo < n) && (w[++lo] < elem));
      while ((hi >= 0) && (w[--hi] > elem));
      if (lo < hi) swapQK(lo, hi, w);
      else break;
    }

    swapQK(low, hi, w);
    sortQK(low, hi - 1, n, w);
    sortQK(hi + 1, high, n, w);
  }
}

/* swap function for use with sortQK() */
void swapQK(long i, long j, double *w) {
  double tmp = w[i];
  
  w[i] = w[j];
  w[j] = tmp;
}

/* allocate a int vector with subscript range v[nl...nh] */
int *intvector(long nl, long nh) {
  int *v;

  v = (int *) malloc((size_t)((nh - nl + 1 + NR_END) * sizeof(int)));
  if(!v) printf("\n allocation failure in intvector()\n");
  return v - nl + NR_END;
}

/* free a int vector allocated with ivector() */
void free_intvector(int *v, long nl, long nh) {
  free((FREE_ARG) (v + nl - NR_END));
}

/* allocate a long vector with subscript range v[nl...nh] */
long *ivector(long nl, long nh) {
  long *v;

  v = (long *) malloc((size_t)((nh - nl + 1 + NR_END) * sizeof(long)));
  if(!v) printf("\n allocation failure in ivector()\n");
  return v - nl + NR_END;
}

/* free a long vector allocated with ivector() */
void free_ivector(long *v, long nl, long nh) {
  free((FREE_ARG) (v + nl - NR_END));
}

/* allocate a long matrix with subscript ranges m[nrl...nrh][ncl...nch] */
long **imatrix(long nrl, long nrh, long ncl, long nch) {
  long i, nrow = nrh - nrl + 1, ncol = nch - ncl + 1;
  long **m;

  /* allocate pointers to rows */
  m = (long **) malloc((size_t)((nrow + NR_END) * sizeof(long*)));
  if(!m) printf("%s", "allocation fialure\n");

  m += NR_END;
  m -= nrl;

  /* set pointer to rows */
  m[nrl] = (long *) malloc((size_t)((nrow * ncol + NR_END) * sizeof(long)));
  if(!m[nrl]) printf("%s", "allocation fialure\n");
  m[nrl] += NR_END;
  m[nrl] -= ncl;

  for(i = nrl + 1; i <= nrh; i++) m[i] = m[i - 1] + ncol;
  return m;
}

/* free long matrix allocated with imatrix() */
void free_imatrix(long **m, long nrl, long nrh, long ncl, long nch) {
  free((FREE_ARG) (m[nrl] + ncl - NR_END));
  free((FREE_ARG) (m + nrl - NR_END));
}

/* allocate a double matrix with subscript ranges m[nrl...nrh][ncl...nch] */
double **matrix(long nrl, long nrh, long ncl, long nch) {
  long i, nrow = nrh - nrl + 1, ncol = nch - ncl + 1;
  double **m;

  /* allocate pointers to rows */
  m = (double **) malloc((size_t)((nrow + NR_END) * sizeof(double*)));
  if(!m) printf("%s", "allocation fialure\n");

  m += NR_END;
  m -= nrl;

  /* set pointer to rows */
  m[nrl] = (double *) malloc((size_t)((nrow * ncol + NR_END) * sizeof(double)));
  if(!m[nrl]) printf("%s", "allocation fialure\n");
  m[nrl] += NR_END;
  m[nrl] -= ncl;

  for(i = nrl + 1; i <= nrh; i++) m[i] = m[i - 1] + ncol;
  return m;
}

/* free double matrix allocated with matrix() */
void free_matrix(double **m, long nrl, long nrh, long ncl, long nch) {
  free((FREE_ARG) (m[nrl] + ncl - NR_END));
  free((FREE_ARG) (m + nrl - NR_END));
}

/* allocate a double vector with subscript range v[nl...nh] */
double *vector(long nl, long nh) {
  double *v;

  v = (double *) malloc((size_t) ((nh - nl + 1 + NR_END) * sizeof(double)));
  if(!v) printf("\n allocation failure in vector()\n");
  return v - nl + NR_END;
}

/* free double vector allocated with vector() */
void free_vector(double *v, long nl, long nh) {
  free((FREE_ARG) (v + nl - NR_END));
}
#ifdef __cplusplus
}
#endif
