8 Testing the flowtrend method

8.1 1d example

First, we’ll generate data.

set.seed(100)
dt       <- gendat_1d(100, rep(100, 100), offset = 4)
dt_model <- gendat_1d(100, rep(100, 100), return_model = TRUE, offset=4)
ylist = dt %>% dt2ylist()
x = dt %>% pull(time) %>% unique()
plot_1d(ylist)

dt_model %>% select(time, cluster, prob) %>% ggplot() + geom_line(aes(x=time, y=prob, group=cluster, col = cluster))

Next, we fit a model (with hand-picked lambda values).

set.seed(18)
obj <- flowtrend(ylist = ylist,
                 x = x,
                 maxdev = 1,
                 numclust = 3,
                 l = 2,
                 l_prob = 1,
                 lambda = .1,
                 lambda_prob = .1,
                 nrestart = 5)
 
## Also reorder the cluster labels of the truth, to match the fitted model.
ord = obj$mn[,1,] %>% colSums() %>% order(decreasing=TRUE)
lookup <- setNames(c(1:obj$numclust), ord)
dt_model$cluster = lookup[as.numeric(dt_model$cluster)] %>% as.factor()

## Reorder the cluster lables of the fitted model.
obj = reorder_clust(obj)

The data and estimated model are shown here, along with the true means in dashed black lines.

plot_1d(ylist = ylist, obj = obj, x = x) +
  geom_line(aes(x = time, y = mean, group = cluster),
            data = dt_model,## %>% subset(time %ni% held_out),
            linetype = "dashed", size=2, alpha = .7)

Also, the estimated probabilities are shown here.

plot_prob(obj=obj, x=x) +
  geom_line(aes(x = time, y = prob, group = cluster, color = cluster),
            data = dt_model, linetype = "dashed")  +
  facet_wrap(~cluster)

8.2 Testing monotonicity of objective values

The objective value (that is, the penalized log likelihood) should be monotone decreasing across EM algorithm iterations.

testthat::test_that("Objective value decreases over EM iterations.",{
  
  glist = list()
  for(iseed in 1:5){

    ## Generate synthetic data
    set.seed(iseed*100)
    dt       <- gendat_1d(100, rep(10, 100))
    dt_model       <- gendat_1d(100, rep(10, 100), return_model = TRUE)
    ylist = dt %>% dt2ylist()
    x = dt %>% pull(time) %>% unique()
    
    ## Fit model
    obj <- flowtrend_once(ylist = ylist,
                          x = x,
                          maxdev = 5,
                          numclust = 3,
                          lambda = 0.02,
                          l = 1,
                          l_prob = 2,
                          lambda_prob = 0.05)

    ## Test objective monotonicity
    niter_end = length(obj$objective)
    testthat::expect_true(all(diff(obj$objective) < 1E-3))
  

    ## Make a plot
    g = ggplot(tibble(iter=1:niter_end, objective=obj$objectives)) +
      geom_point(aes(x=iter, y=objective)) +
      geom_line(aes(x=iter, y=objective)) +
      ggtitle(paste0("Seed=", iseed*100)) +
      xlab("EM iteration")
    glist[[iseed]] = g
  }
  title = cowplot::ggdraw() + cowplot::draw_label("Objective values over EM iterations", fontface='bold')
  main_plot = cowplot::plot_grid(plotlist = glist, ncol=5, nrow=1) 
  cowplot::plot_grid(title, main_plot, ncol=1, rel_heights=c(0.1, 1)) %>% print()
})

## Test passed 🎉

8.3 2d example

Next, we try out flowtrend on a synthetic 2d data example.

set.seed(100) 
TT = 100
dt <- gendat_2d(TT, rep(100, times = TT))
x = 1:TT
set.seed(10)
obj <- flowtrend(ylist = dt$ylist,
                 x = x,
                 maxdev = 3,
                 numclust = 3,
                 l = 2,
                 l_prob = 2,
                 lambda = 0.01,
                 lambda_prob = .01,
                 rho_init = 0.01, nrestart = 1)

These are some snapshots at time \(t=10\) and \(t=20\).

plot_2d(dt$ylist, obj = obj, tt = 10, bin = FALSE) + coord_fixed() + ylim(-6, 3) + xlim(-6, 3)

plot_2d(dt$ylist, obj = obj, tt = 20, bin = FALSE) + coord_fixed() + ylim(-6, 3) + xlim(-6, 3)

8.4 Working with “binned” dataset

Recall that “binning” means we will use binned frequency histogram estimates of the original particle-level dataset; this is useful when there are too many particles.

## library(flowmix)
set.seed(10232)
TT = 100
dt <- gendat_2d(TT, rep(100, times = TT))
manual_grid = flowmix::make_grid(dt$ylist, gridsize = 20)
## Warning: replacing previous import 'RcppArmadillo::fastLmPure' by 'RcppEigen::fastLmPure' when loading 'flowmix'
## Warning: replacing previous import 'RcppArmadillo::fastLm' by 'RcppEigen::fastLm' when loading 'flowmix'
binres = flowmix::bin_many_cytograms(dt$ylist, manual.grid = manual_grid)
set.seed(100)
obj = flowtrend(ylist = binres$ybin_list,
                countslist = binres$counts_list,
                maxdev = 2,
                numclust = 3,
                l = 2,
                l_prob = 2,
                lambda = .01,
                lambda_prob = .005,
                rho_init = .01,
                nrestart = 1)

plot_2d(ylist = binres$ybin_list,
       countslist = binres$counts_list,
       obj = obj, tt = 10, bin = TRUE) + coord_fixed()
## Warning: Raster pixels are placed at uneven horizontal intervals and will be shifted
## ℹ Consider using `geom_tile()` instead.
## Warning: Raster pixels are placed at uneven horizontal intervals and will be shifted
## ℹ Consider using `geom_tile()` instead.

plot_2d(ylist = binres$ybin_list,
       countslist = binres$counts_list,
       obj = obj, tt = 100, bin = TRUE) + coord_fixed()
## Warning: Raster pixels are placed at uneven horizontal intervals and will be shifted
## ℹ Consider using `geom_tile()` instead.

Also plot the cluster probabilities:

true_prob_long = dt$probs %>% as_tibble() %>% add_column(time = 1:TT) %>% pivot_longer(-time)
plot_prob(obj) +
  geom_line(aes(x = time, y = value, group = name), data = true_prob_long, linetype = "dashed") +
  ylim(c(0,1)) + ylab("")

8.5 Unevenly spaced inputs (x)

Let’s try the EM algorithm out with unevenly spaced inputs.

set.seed(100)
dt       <- gendat_1d(TT=100, rep(10, times=100))
dt_model       <- gendat_1d(TT=100, rep(100, times=100), return_model = TRUE)
ylist_orig = dt %>% dt2ylist()

## Two ways of removing some time points
ind_rm_list = list(seq(from=10, to=100, by=10),
                   ind_rm = 30:50)

## Try both ways, and see that the objective values
objlist = list()
for(ii in 1:2){
  ind_rm = ind_rm_list[[ii]]
  x = (1:100)[-ind_rm]
  ylist = ylist_orig[x]
  set.seed(100)
  obj <- flowtrend_once(ylist = ylist, 
                        x = x,
                        maxdev = 100,
                        numclust = 3,
                        l = 2,
                        l_prob = 2,
                        lambda = 0.1,
                        lambda_prob = 0.1, 
                        admm_local_adapt = TRUE,
                        rho_init = 0.01,
                        verbose = FALSE)
  objlist[[ii]] = obj
}

## Plot both results (and objectives)
ii = 1
ind_rm = ind_rm_list[[ii]]
x = (1:100)[-ind_rm]
gg1 = plot_1d(obj = objlist[[1]], ylist = ylist_orig[x], x = x) +
  ggtitle('Randomly missing time points')
all_objectives = objlist[[1]]$objectives 
gg2 = data.frame(iter = 1:length(all_objectives), objectives = all_objectives) %>%
  ggplot(aes(x = iter, y = objectives)) +
  geom_point() + geom_line() + theme_minimal() +
  ggtitle("EM objectives")


ii=2
ind_rm = ind_rm_list[[ii]]
x = (1:100)[-ind_rm]
gg3 = plot_1d(obj = objlist[[2]], ylist = ylist_orig[x], x = x) +
  ggtitle('Chunks of missing time points')
all_objectives = objlist[[2]]$objectives 
gg4 = data.frame(iter = 1:length(all_objectives), objectives = all_objectives) %>%
  ggplot(aes(x = iter, y = objectives)) +
  geom_point() + geom_line() + theme_minimal() +
  ggtitle("EM objectives")

cowplot::plot_grid(gg1, gg2, gg3, gg4)

## %>%  ggsave(filename = file.path("inst",
##                               "uneven-spaced-data-objectives.png"), width = 5, height = 3)

(The dashed black lines are the truth, and the solid colored lines are the estimated means.)