9 Helpers for simulations
There are two alternatives to consider when gating points in a series of cytograms.
- The first alternative (
underfit_gmm()) is to estimate the clusterings to be identical across all time points. This is basically equivalent to collapsing all cytograms into one and clustering them. - The second alternative (
overfit_gmm()) is to estimate clusters in each cytogram, then connect the cytograms as much as one can, e.g., by finding similar clusters across time points and combining them.
9.1 Two alternative clusterings
#' Pool the entire series of cytograms, fit a GMM model, and re-aggregate parameters.
#'
#' @param ylist Data.
#' @param numclust Number of clusters.
#' @return
#'
#' @export
underfit_gmm <- function(ylist, numclust){
## Pool all data
TT <- length(ylist)
nt <- sapply(ylist, nrow)
ylist_pooled <- do.call(rbind, ylist) %>% as_tibble()
colnames(ylist_pooled) = "y"
## Fit the GMM model
gmm_pooled <- mclust::Mclust(data = ylist_pooled, G = numclust,
modelNames = "V", verbose = FALSE)
## Memberships (soft- and hard-clustered)
hard_mem = gmm_pooled$z %>% apply(1, which.max)
soft_mem = gmm_pooled$z %>%
apply(1, function(myrow){
sample(1:2, size = 1, replace = FALSE, prob=myrow)
})
## Put together in a table
tab <- tibble(y = ylist_pooled$y,
soft_cluster = factor(soft_mem, levels = c(2,1)),
hard_cluster = factor(hard_mem, levels = c(2,1)),
time = rep(1:TT, times = nt))
tab_long = tab %>% pivot_longer(-c("time", "y"), values_to = "cluster",
names_to = "type")
## That's it! Return the results
param_mat = tibble(time=1:TT,
mn1 = gmm_pooled$parameters$mean[1],
mn2 = gmm_pooled$parameters$mean[2],
sd1 = gmm_pooled$parameters$variance$sigmasq[1],
sd2 = gmm_pooled$parameters$variance$sigmasq[2],
prob1 = gmm_pooled$parameters$pro[1],
prob2 = gmm_pooled$parameters$pro[2])
mu = param_mat[,c("mn1", "mn2")] %>% as.matrix()
prob = param_mat[,c("prob1", "prob2")] %>% as.matrix()
sigma = param_mat[,c("sd1", "sd2")] %>% as.matrix()
## Soft membership
memlist = tab_long %>% subset(type == "soft_cluster") %>% group_by(time) %>%
group_split() %>% lapply(function(a)pull(a, cluster))
## Responsibilities
soft_mem = gmm_pooled$z %>% as_tibble()
colnames(soft_mem) = c("clust1", "clust2")
resp_list = soft_mem %>% add_column( time = rep(1:TT, times = nt)) %>% group_by(time) %>% group_split() %>%
lapply(select, c("clust1", "clust2")) %>% lapply(as.matrix)
## List of sigmas
TT = nrow(sigma)
sigma_list = lapply(1:TT, function(tt){
one_row = sigma[tt,] %>% as.numeric()
one_sigma = array(NA, dim = c(2,1,1))
one_sigma[,1,1] = one_row
return(one_sigma)
})
return(list(tab_long = tab_long,
param_mat = param_mat,
mu = mu,
prob = prob,
sigma = sigma,
sigma_list = sigma_list,
memlist = memlist,
resp_list = resp_list,
numclust = numclust))
}#' Pool the entire series of cytograms, fit a GMM model, and re-aggregate parameters.
#'
#' @param ylist Data.
#' @param numclust Number of clusters (only works for 2).
#' @return
#' @export
overfit_gmm <- function(ylist, numclust = 2, reorder = TRUE){
## Basic checks
stopifnot(numclust == 2)
## Estimate individual GMM models
gmm_list <- lapply(ylist, gmm_each, numclust = numclust)
## Reorder the cluster memberships sequentially
if(reorder){
gmm_list = match_clusters_gmm(gmm_list, numclust = 2)
}
## Obtain the resulting data in long format
tab_list <- lapply(gmm_list, function(a) a$tab)
TT = length(ylist)
names(tab_list) = 1:TT
tab_long = tab_list %>% bind_rows(.id = "time") %>%
pivot_longer(-c("time", "y"), values_to = "cluster",
names_to = "type")
memlist = tab_long %>% subset(type == "soft_cluster") %>% group_by(time) %>%
group_split() %>% lapply(function(a)pull(a, cluster))
## Get the Gaussian distribution parameters
param_list <- lapply(gmm_list, function(a) a$param)
param_mat = param_list %>% bind_rows(.id = "time")
mu = param_mat[,c("mn1", "mn2")] %>% as.matrix()
prob = param_mat[,c("prob1", "prob2")] %>% as.matrix()
sigma = param_mat[,c("sd1", "sd2")] %>% .^2 %>% as.matrix()
## Responsibilities
resp_list = gmm_list %>% lapply(function(a) a$resp)
## Make a list of each time points' sigmas
TT = nrow(sigma)
sigma_list = lapply(1:TT, function(tt){
one_row = sigma[tt,] %>% as.numeric()
one_sigma = array(NA, dim = c(2,1,1))
one_sigma[,1,1] = one_row
return(one_sigma)
})
return(list(tab_long = tab_long,
memlist = memlist,
param_mat = param_mat,
mu = mu,
prob = prob,
sigma = sigma,
sigma_list = sigma_list,
resp_list = resp_list,
numclust = numclust))
}#' Reordering function for gmm results.
#'
#' @param obj result of \code{gmm_overfit()} or \code{gmm_underfit()}.
#' @param new_order new ordering of clusters to use.
#'
#' @return reordered object.
#' @export
reorder_gmm_fit <- function(obj, new_order){
## Setup
numclust = max(new_order)
## Reorder everything
obj$prob = obj$prob[,new_order]
obj$mu = obj$mu[,new_order]
obj$sigma = obj$sigma[,new_order]
obj$memlist = lapply(obj$memlist, function(a){
a_copy = a
for(iclust in 1:numclust){
a_copy[which(a == iclust)] = new_order[iclust]
}
return(a_copy)
})
## Reorder the responsibilities
obj$resp_list = lapply(obj$resp_list, function(oneresp){
return(oneresp[,new_order])
})
return(obj)
}#' Sequentially permute a time series of GMMs.
#'
#' @param gmm_list Output of an lapply of gmm_each over ylist.
#' @param numclust Number of clusters.
#'
#' @return Same format as |gmm_list|
match_clusters_gmm <- function(gmm_list, numclust = 2){
## Initialize some objects
TT <- length(gmm_list)
## Helper to permute one GMM
my_reorder <- function(one_gmm, new_order){
one_gmm_reordered = one_gmm
one_gmm_reordered$tab = one_gmm_reordered$tab %>%
mutate(hard_cluster = hard_cluster %>%
plyr::revalue(replace=c("1"=new_order[1],"2"=new_order[2]))) %>%
mutate(soft_cluster = soft_cluster %>%
plyr::revalue(replace=c("1"=new_order[1],"2"=new_order[2])))
## reorder the parameters
param = one_gmm$param
new_mns = param[,c("mn1", "mn2")][new_order] %>% as.numeric()
new_sds = param[,c("sd1", "sd2")][new_order] %>% as.numeric()
new_probs = param[,c("prob1", "prob2")][new_order] %>% as.numeric()
one_gmm_reordered$param <- data.frame(mn1 = new_mns[1], mn2 = new_mns[2],
sd1 = new_sds[1], sd2 = new_sds[2],
prob1 = new_probs[1], prob2 = new_probs[2])
## if(all(new_order == 2:1)) browser()
## reorder the responsibilities
one_gmm_reordered$resp = one_gmm$resp[,new_order]
return(one_gmm_reordered)
}
## Fill in the rest of the rows by sequentially matching clusters from t-1 to
## t, using the Hungarian Algorithm
new_orders = matrix(NA, ncol = 2, nrow = TT)
new_orders[1,] = c(1, 2)
gmm_list_copy = gmm_list
for(tt in 2:TT){
## Find order
param1 = gmm_list_copy[[tt-1]] %>% .$param
param2 = gmm_list[[tt]] %>% .$param
klmat <- symmetric_kl_between_gaussians(param1 = param1, param2 = param2)
new_order <- RcppHungarian::HungarianSolver(klmat) %>%
.$pairs %>% data.frame() %>% arrange(.[,2]) %>% .[,1]
new_orders[tt,] = new_order
## Perform the reordering
gmm_list_copy[[tt]] <- my_reorder(gmm_list[[tt]], new_order)
}
return(gmm_list_copy)
}
#' Between two sets of Gaussian mean/sd parameters, what is the symmetric KL
#' distance.
#' @param param1 list of mn1, mn2, sd1, sd2
#' @param param2 another such list.
#'
#' @return (numclust x numclust) distance matrix.
symmetric_kl_between_gaussians <- function(param1, param2, numclust = 2){
## First model's parameters
mn_model1 = c(param1$mn1, param1$mn2)
sd_model1 = c(param1$sd1, param1$sd2)
## Second model's parameters
mn_model2 = c(param2$mn1, param2$mn2)
sd_model2 = c(param2$sd1, param2$sd2)
## Fill out distance matrix
dist_cs <- matrix(0, ncol = numclust, nrow = numclust)
for(iclust_1 in 1:numclust){
for(iclust_2 in 1:numclust){
dist_cs[iclust_2, iclust_1] <-
one_symmetric_kl(mu1 = mn_model1[iclust_1], mu2 = mn_model2[iclust_2],
sd1 = sd_model1[iclust_1], sd2 = sd_model2[iclust_2])
}
}
rownames(dist_cs) <- rep("C1", numclust)
colnames(dist_cs) <- rep("C2", numclust)
return(dist_cs)
}
#' Symmetric KL divergence between two Gaussian distributions.
#' @param mu1 Mean for distribution 1.
#' @param mu2 Mean for distribution 2.
#' @param sd1 Standard deviation for distribution 1.
#' @param sd2 Standard deviation for distribution 2.
one_symmetric_kl <- function(mu1, mu2, sd1, sd2){
stopifnot(length(mu1)==1 & length(mu2) == 1 &
length(sd1) == 1 & length(sd2) == 1)
kl12 <- log(sd2/sd1) + (sd1^2 + (mu1 - mu2)^2)/(2*sd2^2) - 1/2
kl21 <- log(sd1/sd2) + (sd2^2 + (mu2 - mu1)^2)/(2*sd1^2) - 1/2
kl_sym <- 0.5*(kl12 + kl21)
return(kl_sym)
}
#' Apply a gmm in each 1-dimensional cytogram.
#'
#' @param one_y One-column matrix.
#' @param numclust Number of clusters.
#' @return
#' @export
gmm_each <- function(one_y, numclust){
## Basic check
stopifnot(ncol(one_y) == 1)
## Do Gaussian Mixture model
## obj <- mclust::Mclust(data = one_y, G = numclust,
## modelNames = "V", verbose = FALSE)
obj <- my_mclust(one_y, numclust, FALSE)
## Memberships (soft- and hard-clustered)
hard_mem = obj$z %>% apply(1, which.max)
soft_mem = obj$z %>%
apply(1, function(myrow){
sample(1:2, size = 1, replace = FALSE, prob=myrow)
})
tab <- tibble(y = as.numeric(one_y),
soft_cluster = factor(soft_mem, levels = c(1,2)),
hard_cluster = factor(hard_mem, levels = c(1,2)))
## parameter table
mn1 = obj$parameters$mean[[1]]
mn2 = obj$parameters$mean[[2]]
sds = obj %>% .$parameters %>% .$variance %>% .$sigmasq %>% sqrt()
sd1 = sds[1]
sd2 = sds[2]
prob1 = obj$parameters$pro[1]
prob2 = obj$parameters$pro[2]
## ggplot(data.frame(x=as.numeric(one_y))) +
## geom_histogram(aes(x=x)) +
## geom_vline(xintercept = mn1, col = 'blue') +
## geom_vline(xintercept = mn1 + 2*sd1, col = 'skyblue') +
## geom_vline(xintercept = mn1 - 2*sd1, col = 'skyblue') +
## geom_vline(xintercept = mn2, col = 'red') +
## geom_vline(xintercept = mn2 - 2*sd2, col = 'orange') +
## geom_vline(xintercept = mn2 + 2*sd2, col = 'orange')
param = tibble(mn1 = mn1,
mn2 = mn2,
sd1 = sd1,
sd2 = sd2,
prob1 = prob1,
prob2 = prob2)
resp = obj$z
return(list(tab = tab, param = param, resp = resp))##, sigma_list = sigma_list))
}Here’s a function for soft-gating given a 2-column responsibility matrix.
#' Soft-gates a responsibility matrix by a bernoulli (or multinoulli) draw.
#'
#' @param oneresp A 2-column responsibility matrix
#'
#' @return
#' @export
soft_gate_one_responsibility_matrix <- function(oneresp){
## Setup
numclust = ncol(oneresp)
vec = rep(0,numclust)
## Draw the 0-1 memberships
zero_one_mat = apply(oneresp, 1, function(myrow){
draw = sample(1:numclust, size=1, prob=myrow)
vec[draw]= 1
vec
}) %>% t()
## Check dimensions and return
stopifnot(all(dim(zero_one_mat) == dim(oneresp)))
return(zero_one_mat)
}Here’s a useful function to create a flowtrend-like object but containing the “oracle” information about the cluster means, variances, and mixture probabilities.
#' Reformatting |datobj| to create a flowtrend-like object that contains
#' "oracle" information of the model that
#' generates the simulated pseudo-real data.
#'
#' @param datobj A data object.
#'
#' @return A flowtrend-like object containing mn, sigma, and prob.
#'
#' @export
create_oracle <- function(datobj){
TT = nrow(datobj$mns)
dimdat = 1
numclust = ncol(datobj$mns)
## Make a fake object in a similar format as a |flowtrend| object
fake_obj = list()
fake_obj$mn = array(NA, dim = c(TT, dimdat,numclust))
fake_obj$mn[,1,] = datobj$mns
fake_obj$prob = datobj$prob
fake_obj$sigma = array(NA, dim = c(numclust, dimdat, dimdat))
fake_obj$sigma[1,1,1] = datobj$sd1^2
fake_obj$sigma[2,1,1] = datobj$sd2^2
fake_obj$numclust = numclust
return(fake_obj)
}9.2 Evaluating performance: soft RAND index
We will use a “soft” Rand index that replaces the regular Rand index:
\[ \sum_{i, i'} 1\{ \hat C_i = \hat C_{i'}, C_i^* \neq C_{i'}^*\}.\]
which measures, for every pair of points \(i\) and \(i'\), the number of times that the two clustering mechanisms disagree.
Now, let’s say that the two mechanisms give probabilities:
\[\hat \gamma_{ik} = \hat P(\hat C_i = k),\] \[\hat \gamma^*_{ik} = \hat P(\hat C_i^* = k).\]
Then, the probability that the clustering is the same \(P(C_i^* = C_{i'}^*)\) for the pair of points \(i\) and \(i'\) is:
\[ (\gamma_i^*)^T (\gamma_{i'}^*) = \sum_{k=1} P(C_i = k) P(C_i^* = k) = P(C_i^* = C_{i'}^*).\]
and the probaility they are different is:
\[ (\gamma_i^*)^T (\gamma_{i'}^*) = \sum_{k=1} P(C_i = k) P(C_i^* = k) = P(C_i^* = C_{i'}^*).\]
So, we can measure the difference as:
\[\sum_{i,i'} (\hat \gamma_i^T \hat \gamma_{i'})\cdot(1- \gamma^*_i^T \gamma^*_{i'}) \]
And when one of the clusterings is soft, we can still use a 0-1 vector as \(\gamma\).
#' A "soft" version of a rand index between two sets of responsibility
#' (membership probability) matrices. Measures the /disagreement/ between the two clusterings.
#'
#' @param resp_list1 One list of responsibility matrices.
#' @param resp_list2 Another list of responsibility matrices.
#' @param times Optional; if you would like to isolate your attention to some specific times.
#'
#' @return A single soft rand index number
#' @export
rand_old <- function(resp_list1, resp_list2, times = NULL, prop = .25){
if(!is.null(times)) resp_list1 = resp_list1[times]
if(!is.null(times)) resp_list2 = resp_list2[times]
rand_onetime <- function(resp1, resp2){
stopifnot(nrow(resp1) == nrow(resp2))
stopifnot(ncol(resp1) == ncol(resp2))
nt = nrow(resp1)
## Form the score
mat11 = resp1 %*% t(resp1)
mat22 = resp2 %*% t(resp2)
mat11not = (1-mat11)
mat22not = (1-mat22)
## Make the diagonals not matter anywhere
diag(mat11) = 0
diag(mat22) = 0
diag(mat11not) = 0
diag(mat22not) = 0
a = mat11 * mat22
b = mat11not * mat22not
c = mat11 * mat22not
d = mat11not * mat22
return(c(a = sum(a), b = sum(b), c = sum(c), d = sum(d)))
}
abcd_over_time = mapply(rand_onetime, resp_list1, resp_list2)
abcdmat = abcd_over_time %>% t()
abcd = abcdmat %>% colSums()
score = (abcd["a"] + abcd["b"])/ (sum(abcd))
return(score)
}
#' Rand index between responsibilities.
#'
#' @param resp_list1 One list.
#' @param resp_list2 Another list.
#' @param times A subset of the time points (out of 1:length(resp_list1)) to
#' examine.
#' @param smaller If TRUE, use only a small (sampled) subset of the particles'
#' responsibilities for calculating RAND.
#' @param prop How much to downsample; defaults to 0.1.
#' @export
rand <- function(resp_list1, resp_list2, times = NULL, smaller = TRUE, prop = 0.1){
## Basic checks
stopifnot(all(sapply(resp_list1, nrow) == sapply(resp_list2, nrow)))
## Subset the times if needed
if(!is.null(times)) resp_list1 = resp_list1[times]
if(!is.null(times)) resp_list2 = resp_list2[times]
## names(everything)
## list2env(everything,envir = environment())
## resp_list1 = resp_oracle
## resp_list2 = true_resp
if(smaller){
indslist = lapply(resp_list1, function(oneresp){
inds = sample(x=1:nrow(oneresp), size=ceiling(nrow(oneresp)*prop), replace=FALSE) %>% sort()
})
resp_list1 = mapply(function(a,b)a[b,,drop=FALSE], resp_list1, indslist, SIMPLIFY=FALSE)
resp_list2 = mapply(function(a,b)a[b,,drop=FALSE], resp_list2, indslist, SIMPLIFY=FALSE)
}
## Make each list into one long matrix
resp1 = Reduce(rbind, resp_list1)
resp2 = Reduce(rbind, resp_list2)
## resp1 = do.call(rbind, resp_list1)## %>% bind_rows()
## resp2 = do.call(rbind, resp_list2)## %>% bind_rows()
stopifnot(nrow(resp1) == nrow(resp2))
stopifnot(ncol(resp1) == ncol(resp2))
nt = nrow(resp1)
## Form the score
mat11 = resp1 %*% t(resp1)
mat22 = resp2 %*% t(resp2)
mat11not = (1-mat11)
mat22not = (1-mat22)
## Make the diagonals not matter anywhere
diag(mat11) = 0
diag(mat22) = 0
diag(mat11not) = 0
diag(mat22not) = 0
a = mat11 * mat22
b = mat11not * mat22not
c = mat11 * mat22not
d = mat11not * mat22
abcd = c(a = sum(a), b = sum(b), c = sum(c), d = sum(d))
score = (abcd["a"] + abcd["b"])/ (sum(abcd))
return(score)
}
#' 2 x 2 contigency table from membership vectors.
#'
#' @param mem1 Membership vector.
#' @param mem2 Another membership vector.
#'
#' @return A 3x3 matrix containing (1) a 2 x 2 table in the first [1:2,1:2]
#' entries, and (2) row sums and column sums and total sums in the [,3], [3,]
#' entries.
#' @export
make_contingency_table<-function(mem1, mem2){
tab = table(mem1, mem2)
tab = cbind(tab, rowSums(tab))
tab = rbind(tab, colSums(tab))
return(tab)
}
#' Calculate RAND index from a contingency table.
#'
#' @param tab A matrix containing a 2 x 2 table in the first [1:2,1:2]
#' entries; e.g., from make_contingency_table().
#'
#' @return Rand index.
#' @export
get_rand_from_table <- function(tab){
numer = sum(sapply(as.numeric(tab), function(nij) choose(nij, 2))) +
tab[1,1] * tab[2,2] + tab[1,2] * tab[2,1]
denom = (choose(sum(tab), 2))
ri = numer/denom
return(ri)
}
#' Faster rand index calculation from membership vectors.
#'
#' @param mem1
#' @param mem2
#'
#' @return RAND index.
#' @export
rand_from_mems <- function(mem1, mem2){
make_contingency_table(mem1, mem2) %>%
.[1:2,1:2] %>%
get_rand_from_table()
}It’s also helpful to have a function that takes a list of discrete memberships
(integers 1,..,K) and convert it to a list of responsibility matrices similar in
format to obj$resp of a flowtrend object obj.
#' Convert list of memberships into a list of responsibilities.
#'
#' @param memlist List of memberships
#'
#' @return
#' @export
memlist_to_respmat <- function(memlist){
numclust = max(sapply(memlist, max))
respmats = lapply(memlist, function(mem){
respmat = lapply(mem, function(onemem){
onerow = rep(0, numclust)##c(0,0)
onerow[onemem] = 1
return(onerow)
}) %>% do.call(rbind, .)
return(respmat)
})
return(respmats)
}9.3 Helpers in generating pseudo-real data
Two clusters will be taken from an estimated flowtrend model fit on real data, downloaded from here:
https://zenodo.org/records/6471995
to ./inst/data.
9.4 Miscellaneous helpers
Lastly, there are some miscellaneous helpers that are useful when running actual jobs on a server.
The first one is create_destin(), which creates a directory if it doesn’t already exist.
#' Creates a directory \code{destin}, if it doesn't already exist.
#'
#' @param destin Destination directory.
#'
#' @return Nothing.
#' @export
create_destin <- function(destin){
if(!dir.exists(destin)){
dir.create(destin, recursive = TRUE)
cat("Creating destin: ", destin, fill=TRUE)
} else {
cat("All output goes out to destin: ", destin, fill = TRUE)
}
}Another helper parse_args() helps R read in trailing arguments from the command line, like this:
parse_args(args = commandArgs(trailingOnly = TRUE), verbose=TRUE)
so that one can run jobs using a SLURM command such as:
sbatch --export=summ=0 --array=1 run-3dreal.slurm
from which the R script can access the summ=0 variable.
#' Parse command line arguments and assigns the values of them. |args| is meant
#' to just be additional command line arguments.
#'
#' @param args Argument.
#'
#' @return Nothing.
#' @export
parse_args <- function(args, verbose=FALSE){
args = sapply(args, strsplit, "=")
print(args)
for(arg in args){
## Check if the thing is integer
all_numbers = str_detect(arg[2], "^[:digit:]+$")
## Assign the variable
if(all_numbers){
assign(arg[1], as.numeric(arg[2]), inherits = TRUE)
} else {
assign(arg[1], arg[2], inherits = TRUE)
}
if(verbose){
cat(arg[1], "takes the value of", arg[2],
"from command line input", fill=TRUE)
}
print("===============================")
}
}