6 Reordering clusters

It’s useful to be able to reorder, or permute, one model’s cluster labels (cluster 1,2,.. of newres which are arbitrary) to that of another model origres. The function reorder_kl() does this by (1) taking the posterior probabilities of the particles in ylist_particle (row-binded to be a \(\sum_t n_t \times K\) matrix), and then (2) using a Hungarian algorithm [@kuhn-hungarian] to best match the two elongated matrices \(A\) and \(B\) by measuring the symmetric KL divergence between all permutations of the two matrices’ \(K\) columns.

#' Reorder the cluster numbers for a new flowtrend object \code{newres}; the best
#' permutation (reordering) is to match the original flowtrend object
#' \code{origres}. Also works for flowmix objects.
#'
#' @param newres New flowtrend object to reorder.
#' @param origres Original flowtrend object.
#' @param ylist_particle The particle-level data.
#' @param fac Defaults to 100, to take 1/100'th of the particles from each time point.
#' @param verbose Loud or not?
#'
#' @return Reordered flowtrend object.
#'
#' @export
reorder_kl <- function(newres, origres, ylist_particle, fac = 100, verbose = FALSE){

  ## Randomly sample  1/100 of the original particles (mainly for memory reasons)
  TT = length(ylist_particle)
  N = sapply(ylist_particle, nrow) %>% sum()
  ntlist = sapply(ylist_particle, nrow)
  indlist = lapply(1:TT, function(tt){
    nt = ntlist[[tt]]
    ind = sample(1:nt, round(nt / fac), replace=FALSE)
  })

  ## Sample responsibilities
  ylist_particle_small = Map(function(ind, y){ y[ind,,drop = FALSE]  }, indlist, ylist_particle)

  ## Calculate new responsibilities
  resp_orig_small <- Estep(origres$mn, origres$sigma, origres$prob,
                           ylist = ylist_particle_small,
                           numclust = origres$numclust, first_iter = TRUE)
  resp_new_small <- Estep(newres$mn, newres$sigma, newres$prob,
                          ylist = ylist_particle_small,
                          numclust = newres$numclust, first_iter = TRUE)
  assertthat::assert_that(all(sapply(resp_orig_small, dim) == sapply(resp_new_small, dim)))

  ## Get best ordering (using symm. KL divergence and Hungarian algorithm for
  ## matching)
  best_ord <- get_best_match_from_kl(resp_new_small, resp_orig_small)

  if(verbose) cat("New order is", best_ord, fill=TRUE)
  newres_reordered_kl = newres %>% reorder_clust(ord = best_ord)

  ## Return the reordered object
  return(newres_reordered_kl)
}

This function uses get_best_match_from_kl(), which takes two lists containing responsibilities (posterior probabilities of particles) – one from each model – and returns the cluster ordering to apply to the model that produced resp_new. We define this function and a couple of helper functions next.

#' Compute KL divergence from responsibilities between two models'
#' responsibilities \code{resp_new} and \code{resp_old}.
#'
#' @param resp_new New responsibilities
#' @param resp_orig Original responsiblities.
#'
#' @return Calculate reordering \code{o} of the clusters in model represented
#'   by \code{resp_new}. To be clear, \code{o[i]} of new model is the best
#'   match with the i'th cluster of the original model.
#'
#' @export
#' @importFrom clue solve_LSAP
get_best_match_from_kl <- function(resp_new, resp_orig){

  ## Basic checks
  . = NULL ## Fixing check()
  assertthat::assert_that(all(sapply(resp_new, dim) == sapply(resp_orig , dim)))

  ## Row-bind all the responsibilities to make a long matrix
  distmat = form_symmetric_kl_distmat(resp_orig %>% do.call(rbind,.),
                                      resp_new %>% do.call(rbind,.))

  ## Use Hungarian algorithm to solve.
  fit <- clue::solve_LSAP(distmat)
  o <- as.numeric(fit)

  ## Return the ordering
  return(o)
}

#' From two probability matrices, form a (K x K) distance matrix of the
#' (n)-vectors. The distance between the vectors is the symmetric KL
#' divergence.
#'
#' @param mat1 Matrix 1 of size (n x K).
#' @param mat2 Matrix 2 of size (n x K).
#'
#' @return K x K matrix containing symmetric KL divergence of each column of
#'   \code{mat1} and \code{mat2}.
form_symmetric_kl_distmat <- function(mat1, mat2){

  ## Manually add some small, in case some columns are all zero
  mat1 = (mat1 + 1E-10) %>% pmin(1)
  mat2 = (mat2 + 1E-10) %>% pmin(1)

  ## Calculate and return distance matrix.
  KK1 = ncol(mat1)
  KK2 = ncol(mat2)
  distmat = matrix(NA, ncol=KK2, nrow=KK1)
  for(kk1 in 1:KK1){
    for(kk2 in 1:KK2){
      mydist = symmetric_kl(mat1[,kk1, drop=TRUE], mat2[,kk2, drop=TRUE])
      distmat[kk1, kk2] = mydist
    }
  }
  stopifnot(all(!is.na(distmat)))
  return(distmat)
}

#' Symmetric KL divergence, of two probability vectors.
#'
#' @param vec1 First probability vector.
#' @param vec2 Second prbability vector.
#'
#' @return Symmetric KL divergence (scalar).
symmetric_kl <- function(vec1, vec2){
  stopifnot(all(vec1 <= 1) & all(vec1 >= 0))
  stopifnot(all(vec2 <= 1) & all(vec2 >= 0))
  kl <- function(vec1, vec2){
    sum(vec1 * log(vec1 / vec2))
  }
  return((kl(vec1, vec2) + kl(vec2, vec1))/2)
}

Finally, the function that actually performs the manual reordering the clusters of an estimated model obj is reorder_clust().

#' Reorder the results of one object so that cluster 1 through
#' \code{numclust} is in a particular order. The default is decreasing order of
#' the averages (over time) of the cluster means.
#'
#' @param res Model object.
#' @param ord Defaults to NULL. Use if you have an ordering in mind.
#'
#' @return Same object, but with clusters reordered.
#'
#' @export
reorder_clust <- function(res, ord = NULL){

  ## Find an order by sums (averages)
  stopifnot(class(res) == "flowtrend")
  if(is.null(ord)) ord = res$mn[,1,] %>% colSums() %>% order(decreasing = TRUE)
  if(!is.null(ord)) all(sort(ord) == 1:res$numclust)

  ## Reorder mean
  res$mn = res$mn[,,ord, drop=FALSE]

  ## Reorder sigma
  res$sigma = res$sigma[ord,,,drop=FALSE]

  ## Reorder prob
  res$prob = res$prob[,ord, drop=FALSE]

  ## Reorder the responsibilities
  ## if('resp' %in% res){
    resp_temp = res$resp
    for(tt in 1:res$TT){
      resp_temp[[tt]] = res$resp[[tt]][,ord]
    }
  ## }
  res$resp = resp_temp
  return(res)
}

Here’s an example of how to use this.

devtools::load_all("~/repos/FlowTF")
set.seed(100)
dt       <- gendat_1d(100, rep(100, 100), die_off_time = 0.45)
dt_model <- gendat_1d(100, rep(100, 100), die_off_time = 0.45, return_model = TRUE)
ylist = dt %>% dt2ylist()
x = dt %>% pull(time) %>% unique()

## Fit model twice.
set.seed(2)
objlist <- lapply(1:2, function(isim){
  flowtrend(ylist = ylist,
             x = x,
             maxdev = 5,
             numclust = 3,
             lambda = 0.02,
             l = 1,
             l_prob = 2,
             lambda_prob = .005,
             nrestart = 1, verbose = TRUE)})

## Perform the reordering, make three plots
newres = objlist[[1]]
origres = objlist[[2]]
newres_reordered = reorder_kl(newres, origres, ylist, fac = 100, verbose = FALSE)
plot_1d(ylist, newres, x = x) + ggtitle("before reordering, model 1")
plot_1d(ylist, origres, x = x) + ggtitle("model 2")
plot_1d(ylist, newres_reordered, x = x) + ggtitle("reordered model 1")