9 Helpers for simulations

There are two baseline alternatives to consider when gating points in a series of cytograms; a small usage sketch follows the two function definitions below.

  • The first alternative (underfit_gmm()) assumes the clustering is identical across all time points. This is equivalent to collapsing all cytograms into one pooled cytogram and clustering that.
  • The second alternative (overfit_gmm()) estimates clusters separately in each cytogram, then connects the cytograms as well as it can, e.g., by matching similar clusters across adjacent time points.

9.1 Two alternative clusterings

#' Pool the entire series of cytograms, fit a single GMM, and re-aggregate the
#' parameters so that every time point shares the same estimates.
#'
#' @param ylist List of one-column matrices, one per time point.
#' @param numclust Number of clusters (currently only 2 is supported).
#'
#' @return A list containing the long-format cluster table (tab_long), the
#'   per-time parameter table (param_mat), the means (mu), probabilities
#'   (prob), variances (sigma, sigma_list), memberships (memlist),
#'   responsibilities (resp_list), and numclust.
#' @export
underfit_gmm <- function(ylist, numclust){

  ## Basic check: the code below is hard-coded to two clusters
  stopifnot(numclust == 2)
  ## Pool all data
  TT <- length(ylist)
  nt <- sapply(ylist, nrow)
  ylist_pooled <- do.call(rbind, ylist) %>% as_tibble()
  colnames(ylist_pooled) = "y"

  ## Fit the GMM model
  gmm_pooled <- mclust::Mclust(data = ylist_pooled, G = numclust,
                                modelNames = "V", verbose = FALSE)

  ## Memberships: hard (argmax of responsibilities) and soft-gated (drawn from responsibilities)
  hard_mem = gmm_pooled$z  %>% apply(1, which.max)
  soft_mem = gmm_pooled$z  %>%
    apply(1, function(myrow){
      sample(1:2, size = 1, replace = FALSE, prob=myrow)
    })

  ## Put together in a table
  tab <- tibble(y = ylist_pooled$y,
                soft_cluster = factor(soft_mem, levels = c(2,1)),
                hard_cluster = factor(hard_mem, levels = c(2,1)),
                time = rep(1:TT, times = nt)) 
  tab_long = tab %>%  pivot_longer(-c("time", "y"), values_to = "cluster",
                                   names_to = "type")

  ## That's it! Return the results
  param_mat = tibble(time = 1:TT,
                     mn1 = gmm_pooled$parameters$mean[1],
                     mn2 = gmm_pooled$parameters$mean[2],
                     ## mclust returns variances (sigmasq), so take square roots
                     sd1 = sqrt(gmm_pooled$parameters$variance$sigmasq[1]),
                     sd2 = sqrt(gmm_pooled$parameters$variance$sigmasq[2]),
                     prob1 = gmm_pooled$parameters$pro[1],
                     prob2 = gmm_pooled$parameters$pro[2])
  mu = param_mat[,c("mn1", "mn2")] %>% as.matrix()
  prob = param_mat[,c("prob1", "prob2")] %>% as.matrix()
  sigma = param_mat[,c("sd1", "sd2")] %>% .^2 %>% as.matrix()  ## variances, matching overfit_gmm()
  
  ## Soft membership
  memlist = tab_long %>% subset(type == "soft_cluster") %>% group_by(time) %>% 
    group_split() %>% lapply(function(a)pull(a, cluster))

  ## Responsibilities
  soft_mem = gmm_pooled$z %>% as_tibble()
  colnames(soft_mem) = c("clust1", "clust2")
  resp_list = soft_mem %>% add_column( time = rep(1:TT, times = nt))  %>% group_by(time) %>% group_split() %>%
    lapply(select, c("clust1", "clust2")) %>% lapply(as.matrix)

  ## List of sigmas
  TT = nrow(sigma)
  sigma_list = lapply(1:TT, function(tt){
    one_row = sigma[tt,] %>% as.numeric()
    one_sigma = array(NA, dim = c(2,1,1))
    one_sigma[,1,1] = one_row
    return(one_sigma)
  })

  return(list(tab_long = tab_long,
              param_mat = param_mat,
              mu = mu,
              prob = prob,
              sigma = sigma,
              sigma_list = sigma_list,
              memlist = memlist,
              resp_list = resp_list,
              numclust = numclust))
}
#' Fit a separate GMM to each cytogram in the series, then (optionally) match
#' the clusters across adjacent time points.
#'
#' @param ylist List of one-column matrices, one per time point.
#' @param numclust Number of clusters (only works for 2).
#' @param reorder If TRUE, sequentially match cluster labels across time points
#'   using \code{match_clusters_gmm()}.
#'
#' @return A list in the same format as \code{underfit_gmm()}.
#' @export
overfit_gmm <- function(ylist, numclust = 2, reorder = TRUE){ 

  ## Basic checks
  stopifnot(numclust == 2)

  ## Estimate individual GMM models
  gmm_list <- lapply(ylist, gmm_each, numclust = numclust)

  ## Reorder the cluster memberships sequentially
  if(reorder){
    gmm_list = match_clusters_gmm(gmm_list, numclust = 2)
  }

  ## Obtain the resulting data in long format
  tab_list <- lapply(gmm_list, function(a) a$tab)
  TT = length(ylist)
  names(tab_list) = 1:TT
  tab_long = tab_list %>% bind_rows(.id = "time") %>%
    mutate(time = as.integer(time)) %>%  ## keep time numeric so that groups sort correctly
    pivot_longer(-c("time", "y"), values_to = "cluster",
                 names_to = "type")
  memlist = tab_long %>% subset(type == "soft_cluster") %>% group_by(time) %>%
    group_split() %>% lapply(function(a)pull(a, cluster))

  ## Get the Gaussian distribution parameters
  param_list <- lapply(gmm_list, function(a) a$param)
  param_mat = param_list %>% bind_rows(.id = "time")
  mu = param_mat[,c("mn1", "mn2")] %>% as.matrix()
  prob = param_mat[,c("prob1", "prob2")] %>% as.matrix()
  sigma = param_mat[,c("sd1", "sd2")] %>% .^2 %>% as.matrix()

  ## Responsibilities
  resp_list = gmm_list %>% lapply(function(a) a$resp)

  ## Make a list of each time points' sigmas
  TT = nrow(sigma)
  sigma_list = lapply(1:TT, function(tt){
    one_row = sigma[tt,] %>% as.numeric()
    one_sigma = array(NA, dim = c(2,1,1))
    one_sigma[,1,1] = one_row
    return(one_sigma)
  })

  return(list(tab_long = tab_long,
              memlist = memlist,
              param_mat = param_mat,
              mu = mu,
              prob = prob,
              sigma = sigma,
              sigma_list = sigma_list,
              resp_list = resp_list,
              numclust = numclust))
}
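
To make the intended usage concrete, here is a minimal sketch under assumed conditions: a hypothetical two-cluster, 1-dimensional simulated series, with the package (including its internal my_mclust() helper) loaded.

## A hypothetical simulated series: two drifting 1-d clusters at each time point
set.seed(1)
TT <- 10
ylist <- lapply(1:TT, function(tt){
  y <- c(rnorm(100, mean = 0 + tt/10), rnorm(50, mean = 3))
  matrix(y, ncol = 1)
})

## Alternative 1: one pooled clustering shared by all time points
fit_under <- underfit_gmm(ylist, numclust = 2)

## Alternative 2: separate clusterings, matched sequentially across time points
fit_over <- overfit_gmm(ylist, numclust = 2, reorder = TRUE)

## Both return, among other things, a TT x 2 matrix of cluster means
dim(fit_under$mu)
dim(fit_over$mu)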
#' Reordering function for GMM results.
#'
#' @param obj Result of \code{overfit_gmm()} or \code{underfit_gmm()}.
#' @param new_order New ordering of clusters to use.
#'
#' @return Reordered object.
#' @export
reorder_gmm_fit <- function(obj, new_order){

  ## Setup
  numclust = max(new_order)

  ## Reorder everything
  obj$prob = obj$prob[,new_order]
  obj$mu = obj$mu[,new_order]
  obj$sigma = obj$sigma[,new_order]
  obj$memlist =  lapply(obj$memlist, function(a){
    a_copy = a
    for(iclust in 1:numclust){
      a_copy[which(a == iclust)] = new_order[iclust]
    }
    return(a_copy)
  })

  ## Reorder the responsibilities
  obj$resp_list =  lapply(obj$resp_list, function(oneresp){
    return(oneresp[,new_order])
  })

  return(obj)
}
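
For instance, swapping the two cluster labels of a fitted object (a small sketch, reusing the hypothetical fit_under from the earlier usage sketch) is:

## Swap labels 1 and 2 in prob, mu, sigma, memlist, and resp_list
fit_swapped <- reorder_gmm_fit(fit_under, new_order = c(2, 1))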
#' Sequentially permute a time series of GMMs.
#'
#' @param gmm_list Output of an lapply of gmm_each over ylist.
#' @param numclust Number of clusters.
#'
#' @return Same format as |gmm_list|
match_clusters_gmm <- function(gmm_list, numclust = 2){

  ## Initialize some objects
  TT <- length(gmm_list)
  
  ## Helper to permute one GMM
  my_reorder <- function(one_gmm, new_order){
    one_gmm_reordered = one_gmm
    one_gmm_reordered$tab = one_gmm_reordered$tab %>%
      mutate(hard_cluster = hard_cluster %>%
               plyr::revalue(replace=c("1"=new_order[1],"2"=new_order[2]))) %>%
      mutate(soft_cluster = soft_cluster %>%
               plyr::revalue(replace=c("1"=new_order[1],"2"=new_order[2])))

    ## reorder the parameters
    param = one_gmm$param
    new_mns = param[,c("mn1", "mn2")][new_order] %>% as.numeric()
    new_sds = param[,c("sd1", "sd2")][new_order] %>% as.numeric()
    new_probs = param[,c("prob1", "prob2")][new_order] %>% as.numeric()
    one_gmm_reordered$param <- data.frame(mn1 = new_mns[1], mn2 = new_mns[2],
                                          sd1 = new_sds[1], sd2 = new_sds[2],
                                          prob1 = new_probs[1], prob2 = new_probs[2])

    ## reorder the responsibilities
    one_gmm_reordered$resp = one_gmm$resp[,new_order]
    return(one_gmm_reordered)
  }
  
  ## Fill in the rest of the rows by sequentially matching clusters from t-1 to
  ## t, using the Hungarian Algorithm
  new_orders = matrix(NA, ncol = 2, nrow = TT)
  new_orders[1,] = c(1, 2)
  gmm_list_copy = gmm_list
  for(tt in 2:TT){

    ## Find order
    param1 = gmm_list_copy[[tt-1]] %>% .$param
    param2 = gmm_list[[tt]] %>% .$param
    klmat <- symmetric_kl_between_gaussians(param1 = param1, param2 = param2)
    new_order <- RcppHungarian::HungarianSolver(klmat) %>%
      .$pairs %>% data.frame() %>% arrange(.[,2]) %>% .[,1]
    new_orders[tt,] = new_order

    ## Perform the reordering
    gmm_list_copy[[tt]] <- my_reorder(gmm_list[[tt]], new_order)
  }
  return(gmm_list_copy)
}

#' Symmetric KL divergence matrix between two sets of Gaussian mean/sd
#' parameters.
#'
#' @param param1 List (or one-row data frame) with entries mn1, mn2, sd1, sd2.
#' @param param2 Another such list.
#' @param numclust Number of clusters.
#'
#' @return (numclust x numclust) distance matrix.
symmetric_kl_between_gaussians <- function(param1, param2, numclust = 2){

  ## First model's parameters
  mn_model1 = c(param1$mn1, param1$mn2)
  sd_model1 = c(param1$sd1, param1$sd2)

  ## Second model's parameters
  mn_model2 = c(param2$mn1, param2$mn2)
  sd_model2 = c(param2$sd1, param2$sd2)

  ## Fill out distance matrix
  dist_cs <- matrix(0, ncol = numclust, nrow = numclust)
  for(iclust_1 in 1:numclust){
    for(iclust_2 in 1:numclust){
      dist_cs[iclust_2, iclust_1] <-
        one_symmetric_kl(mu1 = mn_model1[iclust_1], mu2 = mn_model2[iclust_2],
                         sd1 = sd_model1[iclust_1], sd2 = sd_model2[iclust_2])
    }
  }
  rownames(dist_cs) <- rep("C1", numclust)
  colnames(dist_cs) <- rep("C2", numclust)
  return(dist_cs)
}

#' Symmetric KL divergence between two Gaussian distributions.
#' @param mu1 Mean for distribution 1.
#' @param mu2 Mean for distribution 2.
#' @param sd1 Standard deviation for distribution 1.
#' @param sd2 Standard deviation for distribution 2.
one_symmetric_kl <- function(mu1, mu2, sd1, sd2){
  stopifnot(length(mu1)==1 & length(mu2) == 1 &
            length(sd1) == 1 & length(sd2) == 1)
  kl12 <- log(sd2/sd1) + (sd1^2 + (mu1 - mu2)^2)/(2*sd2^2) - 1/2
  kl21 <- log(sd1/sd2) + (sd2^2 + (mu2 - mu1)^2)/(2*sd1^2) - 1/2
  kl_sym <- 0.5*(kl12 + kl21)
  return(kl_sym)
}
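
As a quick sanity check (a sketch, not part of the package code): the symmetric KL between N(0,1) and N(1,1) is 0.5, and feeding a pairwise KL matrix to the Hungarian solver recovers a deliberately swapped cluster ordering.

one_symmetric_kl(mu1 = 0, mu2 = 1, sd1 = 1, sd2 = 1)  ## 0.5

## Two parameter sets whose clusters are swapped relative to each other
param1 <- data.frame(mn1 = 0, mn2 = 5, sd1 = 1, sd2 = 1)
param2 <- data.frame(mn1 = 5, mn2 = 0, sd1 = 1, sd2 = 1)
klmat <- symmetric_kl_between_gaussians(param1, param2)
RcppHungarian::HungarianSolver(klmat)$pairs  ## pairs (1,2) and (2,1): the swap is recovered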


#' Apply a GMM to each 1-dimensional cytogram.
#'
#' @param one_y One-column matrix.
#' @param numclust Number of clusters (currently only 2 is supported).
#'
#' @return A list with the per-point cluster table (tab), the estimated
#'   parameters (param), and the responsibility matrix (resp).
#' @export
gmm_each <- function(one_y, numclust){
  
  ## Basic check
  stopifnot(ncol(one_y) == 1)

  ## Fit the Gaussian mixture model, via my_mclust() (used in place of a direct
  ## call to mclust::Mclust(data = one_y, G = numclust, modelNames = "V"))
  obj <- my_mclust(one_y, numclust, FALSE)

  ## Memberships: hard (argmax of responsibilities) and soft-gated (drawn from responsibilities)
  hard_mem = obj$z  %>% apply(1, which.max)
  soft_mem = obj$z  %>%
    apply(1, function(myrow){
      sample(1:2, size = 1, replace = FALSE, prob=myrow)
    })
  tab <- tibble(y = as.numeric(one_y),
                soft_cluster = factor(soft_mem, levels = c(1,2)),
                hard_cluster = factor(hard_mem, levels = c(1,2)))

  ## parameter table
  mn1 = obj$parameters$mean[[1]]
  mn2 = obj$parameters$mean[[2]]
  sds = obj %>% .$parameters %>% .$variance %>% .$sigmasq %>% sqrt()
  sd1 = sds[1]
  sd2 = sds[2]
  prob1 = obj$parameters$pro[1]
  prob2 = obj$parameters$pro[2]

  param = tibble(mn1 = mn1,
                 mn2 = mn2,
                 sd1 = sd1,
                 sd2 = sd2,
                 prob1 = prob1,
                 prob2 = prob2)
  resp = obj$z

  return(list(tab = tab, param = param, resp = resp))
}

Here’s a function for soft-gating given a 2-column responsibility matrix.

#' Soft-gates a responsibility matrix by a Bernoulli (or multinoulli) draw for
#' each particle.
#'
#' @param oneresp A 2-column responsibility matrix.
#'
#' @return A 0-1 matrix of the same dimensions, with exactly one 1 per row.
#' @export
soft_gate_one_responsibility_matrix <- function(oneresp){

  ## Setup
  numclust = ncol(oneresp)
  vec = rep(0,numclust)

  ## Draw the 0-1 memberships
  zero_one_mat = apply(oneresp, 1, function(myrow){
    draw = sample(1:numclust, size=1, prob=myrow)
    vec[draw]= 1
    vec
  }) %>% t()

  ## Check dimensions and return
  stopifnot(all(dim(zero_one_mat) == dim(oneresp)))
  return(zero_one_mat)
}
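
A tiny illustration of its behavior on a hand-made responsibility matrix (a sketch; the numbers are arbitrary):

set.seed(1)
oneresp <- rbind(c(0.9, 0.1),
                 c(0.2, 0.8),
                 c(0.5, 0.5))
## Each row becomes a one-hot vector, drawn with that row's probabilities
soft_gate_one_responsibility_matrix(oneresp)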

Here’s a useful function to create a flowtrend-like object but containing the “oracle” information about the cluster means, variances, and mixture probabilities.

#' Reformat |datobj| to create a flowtrend-like object that contains the
#' "oracle" information of the model that generated the simulated
#' pseudo-real data.
#'
#' @param datobj A data object (with entries mns, prob, sd1, and sd2).
#'
#' @return A flowtrend-like object containing mn, sigma, prob, and numclust.
#'
#' @export
create_oracle <- function(datobj){

  TT = nrow(datobj$mns)
  dimdat = 1
  numclust = ncol(datobj$mns)

  ## Make a fake object in a similar format as a |flowtrend| object
  fake_obj = list()
  fake_obj$mn = array(NA, dim = c(TT, dimdat,numclust))
  fake_obj$mn[,1,] = datobj$mns
  fake_obj$prob = datobj$prob
  fake_obj$sigma = array(NA, dim = c(numclust, dimdat, dimdat))
  fake_obj$sigma[1,1,1] = datobj$sd1^2
  fake_obj$sigma[2,1,1] = datobj$sd2^2
  fake_obj$numclust = numclust
  return(fake_obj)
}
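
Here is a sketch of the expected input, using a hypothetical two-cluster datobj with TT = 5 time points; only the fields that create_oracle() reads (mns, prob, sd1, sd2) are filled in.

TT <- 5
datobj <- list(mns  = cbind(seq(0, 1, length.out = TT), rep(3, TT)),  ## cluster means over time
               prob = cbind(rep(0.6, TT), rep(0.4, TT)),              ## mixture probabilities
               sd1  = 0.5,                                            ## cluster 1 standard deviation
               sd2  = 1)                                              ## cluster 2 standard deviation
oracle <- create_oracle(datobj)
dim(oracle$mn)       ## 5 x 1 x 2
oracle$sigma[, 1, 1] ## 0.25 and 1, the two cluster variances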

9.2 Evaluating performance: soft Rand index

We will use a “soft” Rand index in place of the regular Rand index. For two hard clusterings \(\hat C\) and \(C^*\), consider the count

\[ \sum_{i \neq i'} 1\{ \hat C_i = \hat C_{i'},\ C_i^* \neq C_{i'}^*\},\]

which measures, over every pair of points \(i\) and \(i'\), how often the estimated clustering \(\hat C\) places the pair together while the reference clustering \(C^*\) separates it, i.e., one type of disagreement between the two clusterings.

Now, suppose the two mechanisms instead give membership probabilities:

\[\hat \gamma_{ik} = P(\hat C_i = k), \qquad \gamma^*_{ik} = P(C_i^* = k).\]

Then, treating the assignments of \(i\) and \(i'\) as independent, the probability that the reference clustering places the pair \(i\) and \(i'\) together is

\[ (\gamma_i^*)^\top \gamma_{i'}^* = \sum_{k} P(C_i^* = k)\, P(C_{i'}^* = k) = P(C_i^* = C_{i'}^*),\]

and the probability that it separates them is

\[ 1 - (\gamma_i^*)^\top \gamma_{i'}^* = P(C_i^* \neq C_{i'}^*).\]

So, we can measure this disagreement between the two (soft) clusterings as:

\[\sum_{i \neq i'} \left(\hat \gamma_i^\top \hat \gamma_{i'}\right)\left(1 - (\gamma^*_i)^\top \gamma^*_{i'}\right). \]

And when one of the clusterings is hard, we can still use a 0-1 indicator vector as its \(\gamma\). The code below computes all four soft pairwise counts, i.e., pairs placed together by both clusterings (a), apart by both (b), and the two kinds of disagreement (c and d), and returns the soft Rand index \((a + b)/(a + b + c + d)\).
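
For a single pair of points, the pieces look like this (a toy numerical sketch):

## Soft memberships of points i and i' under the estimated clustering ...
gamma_i      <- c(0.9, 0.1)
gamma_iprime <- c(0.6, 0.4)
## ... and a hard reference clustering, encoded as 0-1 indicator vectors
gammastar_i      <- c(1, 0)
gammastar_iprime <- c(0, 1)

same_hat   <- sum(gamma_i * gamma_iprime)              ## pair together under the estimate: 0.58
apart_star <- 1 - sum(gammastar_i * gammastar_iprime)  ## pair apart under the reference: 1
same_hat * apart_star                                  ## this pair's disagreement contribution: 0.58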

#' A "soft" version of a rand index between two sets of responsibility
#' (membership probability) matrices. Measures the /disagreement/ between the two clusterings.
#' 
#' @param resp_list1 One list of responsibility matrices.
#' @param resp_list2 Another list of responsibility matrices.
#' @param times Optional; if you would like to isolate your attention to some specific times.
#'
#' @return A single soft rand index number
#' @export
rand_old <- function(resp_list1, resp_list2, times = NULL, prop = .25){

  if(!is.null(times)) resp_list1 = resp_list1[times]
  if(!is.null(times)) resp_list2 = resp_list2[times]

  rand_onetime <- function(resp1, resp2){
    stopifnot(nrow(resp1) == nrow(resp2))
    stopifnot(ncol(resp1) == ncol(resp2))
    nt = nrow(resp1)
    
    ## Form the score
    mat11 = resp1 %*% t(resp1)
    mat22 = resp2 %*% t(resp2)
    mat11not = (1-mat11)
    mat22not = (1-mat22)

    ## Make the diagonals not matter anywhere
    diag(mat11) = 0
    diag(mat22) = 0
    diag(mat11not) = 0
    diag(mat22not) = 0

    a = mat11 * mat22
    b = mat11not * mat22not
    c = mat11 * mat22not
    d = mat11not * mat22

    return(c(a = sum(a), b = sum(b), c = sum(c), d = sum(d)))
  }
  abcd_over_time = mapply(rand_onetime, resp_list1, resp_list2)
  abcdmat = abcd_over_time %>% t()
  abcd = abcdmat %>% colSums()
  score = (abcd["a"] + abcd["b"])/  (sum(abcd))
  return(score)
}

#' Rand index between responsibilities.
#'
#' @param resp_list1 One list.
#' @param resp_list2 Another list.
#' @param times A subset of the time points (out of 1:length(resp_list1)) to
#'   examine.
#' @param smaller If TRUE, use only a small (sampled) subset of the particles'
#'   responsibilities for calculating the Rand index.
#' @param prop Proportion of particles to keep when downsampling; defaults to 0.1.
#' @export
rand <- function(resp_list1, resp_list2, times = NULL, smaller = TRUE, prop = 0.1){
 
  ## Basic checks
  stopifnot(all(sapply(resp_list1, nrow) == sapply(resp_list2, nrow)))

  ## Subset the times if needed
  if(!is.null(times)) resp_list1 = resp_list1[times]
  if(!is.null(times)) resp_list2 = resp_list2[times]

  if(smaller){
    indslist = lapply(resp_list1, function(oneresp){
      inds = sample(x=1:nrow(oneresp), size=ceiling(nrow(oneresp)*prop), replace=FALSE) %>% sort()
    })
    resp_list1 = mapply(function(a,b)a[b,,drop=FALSE], resp_list1, indslist, SIMPLIFY=FALSE)
    resp_list2 = mapply(function(a,b)a[b,,drop=FALSE], resp_list2, indslist, SIMPLIFY=FALSE)
  }

  ## Make each list into one long matrix
  resp1 = Reduce(rbind, resp_list1)
  resp2 = Reduce(rbind, resp_list2)
  
  stopifnot(nrow(resp1) == nrow(resp2))
  stopifnot(ncol(resp1) == ncol(resp2))
  nt = nrow(resp1)
  
  ## Form the score
  mat11 = resp1 %*% t(resp1)
  mat22 = resp2 %*% t(resp2)
  mat11not = (1-mat11)
  mat22not = (1-mat22)

  ## Make the diagonals not matter anywhere
  diag(mat11) = 0
  diag(mat22) = 0
  diag(mat11not) = 0
  diag(mat22not) = 0

  a = mat11 * mat22
  b = mat11not * mat22not
  c = mat11 * mat22not
  d = mat11not * mat22

  abcd = c(a = sum(a), b = sum(b), c = sum(c), d = sum(d))
  score = (abcd["a"] + abcd["b"])/  (sum(abcd))
  return(score)
}



#' 2 x 2 contingency table from membership vectors.
#'
#' @param mem1 Membership vector.
#' @param mem2 Another membership vector.
#'
#' @return A 3 x 3 matrix containing (1) the 2 x 2 contingency table in the
#'   [1:2,1:2] entries, and (2) the row sums, column sums, and total sum in the
#'   [,3] and [3,] entries.
#' @export
make_contingency_table<-function(mem1, mem2){
  tab = table(mem1, mem2)
  tab = cbind(tab, rowSums(tab))
  tab = rbind(tab, colSums(tab))
  return(tab)
}


#' Calculate the Rand index from a 2 x 2 contingency table.
#'
#' @param tab A 2 x 2 contingency table (without the margin sums), e.g., the
#'   [1:2,1:2] block of the output of make_contingency_table().
#'
#' @return Rand index.
#' @export
get_rand_from_table <- function(tab){
  numer = sum(sapply(as.numeric(tab), function(nij) choose(nij, 2))) +
    tab[1,1] * tab[2,2] + tab[1,2] * tab[2,1]
  denom = (choose(sum(tab), 2))
  ri = numer/denom
  return(ri)
}


#' Faster Rand index calculation from membership vectors.
#'
#' @param mem1 Membership vector.
#' @param mem2 Another membership vector.
#'
#' @return Rand index.
#' @export
rand_from_mems <- function(mem1, mem2){
  make_contingency_table(mem1, mem2) %>%
    .[1:2,1:2] %>%
    get_rand_from_table()
}
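
A small worked example: with four points, three of the six pairs are treated consistently by the two clusterings, so the Rand index is 0.5.

mem1 <- c(1, 1, 2, 2)
mem2 <- c(1, 2, 2, 2)
rand_from_mems(mem1, mem2)  ## 0.5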

It’s also helpful to have a function that takes a list of discrete memberships (integers 1,…,K) and converts them into a list of responsibility matrices, similar in format to obj$resp of a flowtrend object obj.

#' Convert a list of memberships into a list of responsibility (one-hot) matrices.
#'
#' @param memlist List of membership vectors (integers 1, ..., K).
#'
#' @return List of 0-1 responsibility matrices, one per time point.
#' @export
memlist_to_respmat <- function(memlist){
  numclust = max(sapply(memlist, max))
  respmats = lapply(memlist, function(mem){
    respmat = lapply(mem, function(onemem){
      onerow = rep(0, numclust)
      onerow[onemem] = 1
      return(onerow)
    }) %>% do.call(rbind, .)
    return(respmat)
  })
  return(respmats)
}
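
This makes it easy to feed hard clusterings into rand(); a small sketch with two time points of made-up memberships:

memlist <- list(c(1, 1, 2, 2), c(1, 2, 2, 1))
resp_list_hard <- memlist_to_respmat(memlist)  ## list of one-hot matrices
resp_list_hard[[1]]

## Comparing a clustering against itself gives a soft Rand index of 1
rand(resp_list_hard, resp_list_hard, smaller = FALSE)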

9.3 Helpers in generating pseudo-real data

Two clusters will be taken from an estimated flowtrend model fit on real data, downloaded from https://zenodo.org/records/6471995 into ./inst/data.

9.4 Miscellaneous helpers

Lastly, there are some miscellaneous helpers that are useful when running actual jobs on a server.

The first one is create_destin(), which creates a directory if it doesn’t already exist.

#' Creates a directory \code{destin}, if it doesn't already exist.
#'
#' @param destin Destination directory.
#'
#' @return Nothing.
#' @export
create_destin <- function(destin){
  if(!dir.exists(destin)){
    dir.create(destin, recursive = TRUE)
    cat("Creating destin: ", destin, fill=TRUE)
  } else {
    cat("All output goes out to destin: ", destin, fill = TRUE)
  }
}

Another helper, parse_args(), lets an R script read in trailing arguments from the command line, like this:

parse_args(args = commandArgs(trailingOnly = TRUE), verbose=TRUE)

so that one can run jobs using a SLURM command such as:

sbatch --export=summ=0 --array=1 run-3dreal.slurm

from which the R script can access the variable summ (here, with value 0).

#' Parse command-line arguments of the form name=value and assign each value to
#' a variable of that name. |args| is meant to be the additional (trailing)
#' command-line arguments.
#'
#' @param args Character vector of arguments, e.g., from
#'   \code{commandArgs(trailingOnly = TRUE)}.
#' @param verbose If TRUE, print each assignment.
#'
#' @return Nothing.
#' @export
parse_args <- function(args, verbose=FALSE){
  args = sapply(args, strsplit, "=")
  if(verbose) print(args)
  for(arg in args){

    ## Check whether the value is all digits (i.e., a number)
    all_numbers = stringr::str_detect(arg[2], "^[:digit:]+$")

    ## Assign the variable (numeric if possible, character otherwise)
    if(all_numbers){
      assign(arg[1], as.numeric(arg[2]), inherits = TRUE)
    } else {
      assign(arg[1], arg[2], inherits = TRUE)
    }

    if(verbose){
      cat(arg[1], "takes the value of", arg[2],
          "from command line input", fill=TRUE)
      print("===============================")
    }
  }
}
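
For instance, calling it with a hand-made argument vector (a small sketch) assigns summ and datadir, typically in the global environment, since assign() is called with inherits = TRUE:

parse_args(args = c("summ=0", "datadir=mydir"), verbose = TRUE)
summ     ## numeric 0
datadir  ## character "mydir"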