4 Generating and plotting data

4.1 1d data

4.1.1 Generating 1d data

This function generates synthetic 1-dimensional data, and returns it in a “long” format matrix, with columns time, y, mu, and cluster. The latter two are the true underlying parameters.

#' Generates some synthetic 1-dimensional data with three clusters. Returns a
#' data frame with (1) time, (2) Y (3) mu (4) cluster assignments.
#'
#' @param TT Number of time points.
#' @param nt Number of particles at each time.
#' @param offset Defaults to 0. How much to push up cluster 1.
#' @param return_model If true, return true cluster means and probabilities
#'   instead of data.
#'
#' @return long matrix with data or model.
#' @export
gendat_1d <- function(TT, ntlist, offset = 0, return_model = FALSE,
                      sd3=NULL){

  ## Basic checks
  stopifnot(length(ntlist) == TT)
  prob_link1 = rep(NA, TT)
  prob_link1[1] = 2
  for(tt in 2:floor(TT/2)){
    prob_link1[tt] = prob_link1[tt-1] + (1/TT)
  }
  for(tt in (floor(TT/2)+1):TT){
    prob_link1[tt] = prob_link1[tt-1] - .5*(1/TT)
  }
  prob_link2 = sapply(1:TT, function(tt) 3 - 0.25*(tt/TT))
  prob_link3 = sapply(1:TT, function(tt) 2.5)
  linkmat = cbind(prob_link1, prob_link2, prob_link3) ##%>% matplot()
  mysum = exp(linkmat) %>% rowSums()
  cluster_prob1 = exp(prob_link1) / mysum
  cluster_prob2 = exp(prob_link2) / mysum
  cluster_prob3 = exp(prob_link3) / mysum
  probs = cbind(cluster_prob1, cluster_prob2, cluster_prob3)
  colnames(probs) = 1:3
  sd1 = 0.4
  sd2 = 0.5
  if(is.null(sd3)) sd3 = 0.35
  ## if(!is.null(sd3)) sd3 = 1/1.5 ## The ratio of amplitude:data-standard-deviation is about
  ##             ## $1.50695$.
  probs_long = as_tibble(probs) %>%
    add_column(time = 1:TT) %>%
    pivot_longer(-"time", names_to = "cluster", values_to = "prob")
  probs_long %>% ggplot() + geom_line(aes(x=time,y=prob, group=cluster, col=cluster)) + ylim(c(0,1))

  ## Make cluster means, by time
  means <- matrix(NA, TT, 3)
  for(ii in 1:3){
    for(tt in 1:TT){
      means[tt, 1] <- offset + tt/TT - 1.5
      means[tt, 2] <- sin(seq(-1, 1, length.out = TT)[tt] * 3.1415)
      means[tt, 3] <- -3+sin(seq(-1, 1, length.out = TT)[tt] * 6.282)
    }
  }
  colnames(means) = c(1:3)
  means_long = as_tibble(means) %>%
    add_column(time = 1:TT) %>%
    pivot_longer(-"time", names_to = "cluster", values_to = "mean")
  model = full_join(means_long, probs_long, by = c("time", "cluster"))
  model = model %>% left_join(tibble(cluster = c("1", "2", "3"),
                                     sd = c(sd1, sd2, sd3)),
                              by = "cluster") ## Does this work?
  if(return_model) return(model)

  ys <- lapply(1:TT, FUN = function(tt){

    Y <- vector(mode = "list", length = 2)
    mu <- vector(mode = "list", length = 2)
    clusters_count <- rmultinom(n = 1, size = ntlist[tt], prob = probs[tt,])

    for(ii in 1:3){
      if(ii == 1){
        mn = means[tt,ii, drop=TRUE]
        Y[[ii]] <- replicate(clusters_count[ii,1], mn + rnorm(1, mean=0, sd = sd1))
        mu[[ii]] <- rep(mn, clusters_count[ii,1])
      }
      if(ii == 2){
        mn = means[tt,ii, drop=TRUE]
        Y[[ii]] <- replicate(clusters_count[ii,1], mn + rnorm(1, mean=0, sd = sd2))
        mu[[ii]] <- rep(mn, clusters_count[ii,1])
      }
      if(ii == 3){
        mn = means[tt,ii, drop=TRUE]
        Y[[ii]] <- replicate(clusters_count[ii,1], mn + rnorm(1, mean=0, sd = sd3))
        mu[[ii]] <- rep(mn, clusters_count[ii,1])
      }
    }
    Y <- unlist(Y)
    mu <- unlist(mu)
    cluster <- rep(1:3, times = clusters_count)
    one_df = tibble(time = tt, Y = Y, mu = mu, cluster = cluster)
    return(one_df)
  }) %>% bind_rows()
  return(ys)
}

dt2ylist() is a helper that takes the output generated from gendat_1d(), and splits it by the time column to create a ylist object, which is a \(T\)-length list of \(n_t \times d\) matrices.

#' Converting to a list of matrices, \code{ylist}, to input to \code{flowtrend()}.
#'
#' @param dt Output from \code{gendat_1d()}.
#'
#' @return List of matrices
#' @export
dt2ylist <- function(dt){
  dt%>% dplyr::select(time, Y) %>% arrange(time) %>%
    group_by(time) %>%
    group_split(.keep = FALSE) %>%
    lapply(as.matrix)
}

Let’s generate some data using these functions.

dt = gendat_1d(TT = 100, ntlist =rep(100,100))
print(dt)

## # A tibble: 10,000 × 4
##     time      Y    mu cluster
##    <int>  <dbl> <dbl>   <int>
##  1     1 -1.52  -1.49       1
##  2     1 -1.31  -1.49       1
##  3     1 -1.34  -1.49       1
##  4     1 -2.41  -1.49       1
##  5     1 -1.55  -1.49       1
##  6     1 -2.03  -1.49       1
##  7     1 -0.731 -1.49       1
##  8     1 -1.47  -1.49       1
##  9     1 -1.54  -1.49       1
## 10     1 -2.19  -1.49       1
## # ℹ 9,990 more rows

ylist = dt2ylist(dt)
print(head(str(ylist[1:5])))

## List of 5
##  $ : num [1:100, 1] -1.52 -1.31 -1.34 -2.41 -1.55 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr "Y"
##  $ : num [1:100, 1] -1.3 -1.94 -1.43 -1.31 -1.72 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr "Y"
##  $ : num [1:100, 1] -1.419 -1.794 -0.978 -1.436 -1.919 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr "Y"
##  $ : num [1:100, 1] -1.085 -1.552 -1.91 -1.012 -0.703 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr "Y"
##  $ : num [1:100, 1] -2.06 -1.01 -1.73 -1.94 -1.84 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr "Y"
## NULL

print(head(ylist[[1]]))

##              Y
## [1,] -1.520855
## [2,] -1.308549
## [3,] -1.336187
## [4,] -2.413723
## [5,] -1.553342
## [6,] -2.033201

Next, we’ll make some plotting functions 1d model and data.

4.1.2 Plotting 1d data

Given 1d data ylist and an estimated model flowtrend object obj, we want to visualize these in a single plot. plot_1d() lets you do this.

#' Makes 1d plot of data and model
#'
#' @param ylist Data. A list of (|nt| by |dimdat|) matrices
#' @param obj A flowtrend (or flowmix) object. Defaults to NULL.
#' @param x Time points. Defaults to NULL.
#' @param alpha Between 0 and 1, how transparent to plot the data
#'   points. Defaults to 0.1.
#' @param bin If TRUE, the data is binned.
#'
#' @return ggplot object with data, and optionally, a flowtrend model overlaid.
#' @export
plot_1d <- function(ylist, countslist=NULL, obj=NULL, x = NULL, alpha = .1, bin = FALSE,
                    plot_band = TRUE){

  ## Basic checks
  if(!is.null(obj))stopifnot(class(obj) %in% c("flowmix", "flowtrend"))
  if(!is.null(x)){
    stopifnot(length(x) == length(ylist))
    times = x
  } else {
    times = 1:length(ylist)
  }
  dimdat = ncol(ylist[[1]])
  assertthat::assert_that(dimdat == 1)

  ## If countslist is not provided, make a dummy
  if(is.null(countslist)) countslist = lapply(ylist, function(y) rep(1, nrow(y)))


  ## Make data into long matrix
  ymat <- lapply(1:length(ylist), FUN = function(tt){
    data.frame(time = times[tt], Y = ylist[[tt]], counts = countslist[[tt]])
  }) %>% bind_rows() %>% as_tibble()
  colnames(ymat) = c("time", "Y", "counts") ## when ylist[[tt]] already has a column name, this is needed.
  
  if(bin){
    gg = ymat %>% ggplot() +
      geom_raster(aes(x = time, y = Y, fill = counts)) +
      theme_bw() + ylab("Data") + xlab("Time")  +
      scale_fill_gradientn(colours = c("white", "black"))
  }
  if(!bin){
    gg = ymat %>% ggplot() +
      geom_point(aes(x = time, y = Y), alpha = alpha) +
      theme_bw() + ylab("Data") + xlab("Time") 
  }

  ## If there is a flowtrend object to plot, do it.
  if(is.null(obj)){
    return(gg)
  } else {
  
    ## Add the model
    numclust = obj$numclust
    mnmat = obj$mn %>% .[,1,] %>%  `colnames<-`(1:numclust) %>% as_tibble() %>% 
      add_column(time = times)
    probmat = obj$prob %>% as_tibble() %>% setNames(1:numclust) %>% add_column(time = times)
    mn_long = mnmat %>% pivot_longer(-time, names_to = "cluster", values_to = "mean")
    prob_long = probmat %>% pivot_longer(-time, names_to = "cluster", values_to = "prob")
    est_long = full_join(mn_long, prob_long, by = c("time","cluster"))
    gg = gg + geom_path(aes(x = time, y = mean, linewidth = prob, group = cluster, color = cluster),
                        data = est_long,
                        lineend = "round", linejoin="mitre") +
      scale_linewidth(range = c(0.05, 5), limits = c(0, 1))

    if(plot_band){
      ## Add the estimated 95% probability regions for data.
      stdev = obj$sigma %>% .[,,1] %>% sqrt()
      mn_long_by_clust = mn_long %>% group_by(cluster) %>% group_split()
      band_long_by_clust = lapply(1:numclust, function(iclust){
        mn_long_by_clust[[iclust]] %>% mutate(upper = mean + 1.96 * stdev[iclust]) %>% mutate(lower = mean - 1.96 * stdev[iclust])
      })
      band_long = band_long_by_clust %>% bind_rows()
      gg = gg + geom_line(aes(x = time, y = upper, group = cluster, color = cluster),
                     data = band_long, size = rel(.7), alpha = .5) +
        geom_line(aes(x = time, y = lower, group = cluster, color = cluster),
                  data = band_long, size = rel(.7), alpha = .5) +
      guides(size = "none") # To turn off line size from legend
    }
    return(gg)
  }
}

The plotting function plot_1d() will be even more useful when we have a model, but can also simply plot the data ylist. Let’s try this out.

set.seed(100)
dt       <- gendat_1d(100, rep(100, 100))
dt_model <- gendat_1d(100, rep(100, 100), return_model = TRUE)
ylist = dt %>% dt2ylist()
x = dt %>% pull(time) %>% unique()

plot_1d(ylist = ylist, x = x) +
  geom_line(aes(x = time, y = mean, group = cluster),
            data = dt_model,
            linetype = "dashed", size=2, alpha = .7)

Voilà!

Adding a model is easy, you can use:

plot_1d(ylist, countslist) %>% plot_1d_add_model(obj = obj, idim = 1)

#' Add a model to an existing plot from \code{plot_1d()}.
#' This actually can take a model \code{obj}, and just plot the 1-dimensional projections of the model in the \code{idim} dimension.
#' 
#' @param gg0 ggplot object from running \code{plot_1d}.
#' @param obj Model
#' @param idim dimension
#' @param plot_band If TRUE, plot a +1.96 standard deviation band.
#'
#' @return ggplot object
#'
#' @export
plot_1d_add_model <- function(gg0, obj, idim, plot_band = TRUE){

  times = obj$x
  numclust = obj$numclust
  mnmat = obj$mn %>% .[,idim,] %>%  `colnames<-`(1:numclust) %>% as_tibble() %>% 
    add_column(time = times)
  probmat = obj$prob %>% as_tibble() %>% setNames(1:numclust) %>% add_column(time = times)
  mn_long = mnmat %>% pivot_longer(-time, names_to = "cluster", values_to = "mean") %>%
    mutate(cluster = factor(cluster, levels=sapply(1:10, toString)))
  prob_long = probmat %>% pivot_longer(-time, names_to = "cluster", values_to = "prob") %>% 
    mutate(cluster = factor(cluster, levels=sapply(1:10, toString)))
  est_long = full_join(mn_long, prob_long, by = c("time","cluster"))
  gg = gg0 + geom_path(aes(x = time, y = mean, linewidth = prob, group = cluster, color = cluster),
                       data = est_long, lineend = "round", linejoin="mitre") +
    scale_linewidth(range = c(0.05, 5), limits = c(0, 1))

  ## Add the estimated 95% probability regions for data.
  if(plot_band){
    stdev = obj$sigma %>% .[,idim,idim] %>% sqrt()
    names(stdev) = sapply(1:numclust, toString)
    mn_long_by_clust = mn_long %>% group_by(cluster) %>% group_split()
    band_long_by_clust = lapply(1:numclust, function(iclust){
      mn_long_by_clust[[iclust]] %>%
        mutate(upper = mean + 1.96 * stdev[iclust]) %>%
        mutate(lower = mean - 1.96 * stdev[iclust])
    })
    band_long = band_long_by_clust %>% bind_rows()
    gg = gg + geom_line(aes(x = time, y = upper, group = cluster, color = cluster),
                        data = band_long, size = rel(.7), alpha = .5) +
      geom_line(aes(x = time, y = lower, group = cluster, color = cluster),
                data = band_long, size = rel(.7), alpha = .5) +
      guides(size = "none") 
  }
  return(gg)
}

Also, we will want to plot the estimated cluster probabilities of a model obj of class flowtrend.

#' Makes cluster probability plot (lines over time).
#'
#' @param obj Estimated model (from e.g. \code{flowtrend()})
#'
#' @export
plot_prob <- function(obj, x = NULL){

  ## Basic checks
  if(!is.null(obj)) stopifnot(class(obj) %in% c("flowmix", "flowtrend"))
  if(!is.null(x)){
    times = x
  } else {
    ##stop("must provide x")
    times = 1:(obj$TT)
  }

  numclust = obj$numclust
  probmat = obj$prob %>% as_tibble() %>% setNames(1:numclust) %>% add_column(time = times)
  prob_long = probmat %>% pivot_longer(-time, names_to = "cluster", values_to = "prob")
  prob_long %>% ggplot() +
    geom_line(aes(x=time, y = prob, group = cluster, col = cluster), size = rel(1)) +
    ggtitle("Estimated cluster probability")
}

We can’t test it out now, but we’ll use it later in 1d-example.

Here is a 1d plotting function if you have cluster memberships memlist.

#' Makes 1d plot of data when
#'
#' @param ylist Particlel-level data. A list of (|nt| by |dimdat|) matrices.
#' @param memlist List of memberships for the particles.
#' @param x Time points. Defaults to NULL.
#' @param alpha Between 0 and 1, how transparent to plot the data
#'   points. Defaults to 0.1.
#'
#' @return ggplot object with data, and optionally, a flowtrend model overlaid.
#' @export
plot_1d_with_membership <- function(ylist, memlist, countslist = NULL, x = NULL, alpha = .01){

  ## Basic checks
  if(!is.null(x)){
    stopifnot(length(x) == length(ylist))
    times = x
  } else {
    times = 1:length(ylist)
  }
  dimdat = ncol(ylist[[1]])

  ## If countslist is not provided, make a dummy
  if(is.null(countslist)) countslist = lapply(ylist, function(y) rep(1, nrow(y)))

  ## Make data into long matrix
  ymat <- lapply(1:length(ylist), FUN = function(tt){
    data.frame(time = times[tt], Y = ylist[[tt]],
               counts = countslist[[tt]], cluster = memlist[[tt]])
  }) %>% bind_rows() %>% as_tibble()
  ymat = ymat %>% mutate(cluster = as.factor(cluster))
  colnames(ymat) = c("time", "Y", "counts", "cluster") ## when ylist[[tt]] already has a column name, this is needed.

  gg =
    ymat %>% ggplot() +
    geom_point(aes(x = time, y = Y, group=cluster, col = cluster),##, shape = cluster),
               alpha = alpha) + ##facet_wrap(~cluster)  +
    theme_bw() + ylab("Data") + xlab("Time") 
  return(gg)
}

4.2 2d data

4.2.1 Generating 2d data

#' Generates some synthetic 2-dimensional data with three clusters. 
#'
#' @param TT Number of time points.
#' @param nt Number of particles at each time.
#'
#' @return List containing (1) ylist, (2) mnlist, (3) clusterlist.
#' @export
gendat_2d <- function(TT, ntlist){


  ## Basic checks
  stopifnot(length(ntlist) == TT)

  ## Make cluster probabilities, by time
  cluster_prob1 = sapply(1:TT, function(tt) sin(tt/24 * 2 * pi)/3 + 1 + (tt/TT)*5)
  cluster_prob2 = sapply(1:TT, function(tt) cos(tt/24 * 2 * pi)/3 + 8 - (tt/TT)*5)
  cluster_prob3 = rep(3, TT)
  probs = cbind(cluster_prob1, cluster_prob2, cluster_prob3)
  probs = probs/rowSums(probs)
  colnames(probs) = 1:3
  probs_long = as_tibble(probs) %>%
    add_column(time = 1:TT) %>%
    pivot_longer(-"time", names_to = "cluster", values_to = "prob")
  probs_long %>% ggplot() + geom_line(aes(x=time,y=prob, group=cluster, col=cluster)) + ylim(c(0,1))

  ## Make cluster means, by time
  means <- array(NA, dim = c(TT, 3, 2))
  for(ii in 1:3){
    for(tt in 1:TT){
      means[tt, 1, 1] = means[tt, 1, 2] = tt/TT + 0.5
      means[tt, 2, 1]  = sin(tt/24 * 2 * pi)##seq(-1, 1, length.out = TT)[tt]*3.1415)
      means[tt, 2, 2] = 0
      means[tt, 3, 1] = means[tt, 3, 2] = -3+cos(tt/24 * 2 * pi)##seq(-1, 1, length.out = TT)[tt]*6.282)
    }
  }
  dimnames(means)[[2]]  = c(1:3)
  means_long = as_tibble(means) %>%
    add_column(time = 1:TT) %>%
    pivot_longer(-"time", names_to = "cluster", values_to = "mean")
  model = full_join(means_long, probs_long, by = c("time", "cluster"))

  ylist = list()
  mulist = list()
  clusterlist = list()
  for(tt in 1:TT){
    Y <- vector(mode = "list", length = 2)
    mu <- vector(mode = "list", length = 2)
    clusters_count <- rmultinom(n = 1, size = ntlist[tt], prob = probs[tt,])

    for(ii in 1:3){
      if(ii == 1){
        mn = means[tt,ii,,drop=TRUE]
        Sigma1 = matrix(c(0.4, 0.3, 0.3, 0.4), ncol = 2)
        Y[[ii]] <- replicate(clusters_count[ii,1], mn + MASS::mvrnorm(1, mu=c(0,0), Sigma= Sigma1)) %>% t()
        mu[[ii]] <- replicate(clusters_count[ii,1], mn) %>% t()
      }
      if(ii == 2){
        mn = means[tt,ii,, drop=TRUE]
        Y[[ii]] <- replicate(clusters_count[ii,1], mn + MASS::mvrnorm(1, mu=c(0,0), Sigma = diag(c(0.5, 0.1)))) %>% t()
        mu[[ii]] <- replicate(clusters_count[ii,1], mn) %>% t()
      }
      if(ii == 3){
        mn = means[tt,ii,, drop=TRUE]
        Y[[ii]] <- replicate(clusters_count[ii,1], mn + MASS::mvrnorm(1, mu=c(0,0), Sigma = diag(c(0.35, 0.35)))) %>% t()
        mu[[ii]] <- replicate(clusters_count[ii,1], mn) %>% t()
      }
    }

    Y <- Y %>% purrr::compact() %>% do.call(rbind, .)
    mu <- mu %>% purrr::compact() %>% do.call(rbind, .)
    cluster <- rep(1:3, times = clusters_count)
    ylist[[tt]] = Y
    mulist[[tt]] = mu
    clusterlist[[tt]] = cluster
  }
  return(list(ylist = ylist, mulist = mulist,
              clusterlist = clusterlist,
              probs = probs, means = means))
}

4.2.2 Plotting 2d data

Here’s a simpler plotting function for 2d data at the particle level.

#' Simple plotter for 2d particle data.
#' 
#' @param ylist Data. A list of (|nt| by |dimdat|) matrices
#' @param countslist Count data.
#' @param obj flowtrend (or flowmix) object.
#' @param time Out of 1 through \code{lengthy(list)}, which time point to plot.
#' @param zero_one_list_censored A list of zeros and 1s marking the censored
#'   points.
#' 
#' @export
#' @return ggplot object.
plot_2d <- function(ylist, countslist = NULL, obj = NULL, tt, bin = TRUE,
                    point_color = "blue", raster_colours = c("white", "blue"),
                    zero_one_list_censored = NULL){

  ## Basic checks
  if(!is.null(obj)) stopifnot(class(obj) %in% c("flowmix", "flowtrend"))
  stopifnot(ncol(ylist[[1]]) == 2)
  if(!is.null(obj)) stopifnot(obj$dimdat == 2)
  ##  if(!bin) stop("2d plotting for particle level data isn't supported.")
  

  ## Take data from one time point
  y = ylist %>% .[[tt]]
  if(is.null(colnames(y))){ colnames(y) = paste0("dim", c(1,2)) }
  y = y %>% as_tibble()


  ## Handle counts
  if(is.null(countslist)){
    counts = rep(1, nrow(ylist[[1]]))
  }
  if(!is.null(countslist)){
    counts = countslist[[tt]]
  }

  ## Get variable names
  varnames = y %>% colnames()
  varname1 = varnames[1]
  varname2 = varnames[2]

  ## Get data from one timepoint
  y = y %>% add_column(counts = counts)

  ## If a list of 0's and 1's (marking censored points) is provided, plot it
  if(!is.null(zero_one_list_censored)){
    zero_one = zero_one_list_censored[[tt]]
    y = y %>% add_column(censored = factor(zero_one, levels = c(FALSE, TRUE)))##c(0,1)))
  }

  if(!bin){
    p = y %>% ggplot() +
      geom_point(aes(x = !!sym(varname1), y=!!sym(varname2)), size = rel(counts),
                 alpha = .2, col = point_color) +
      theme_minimal() +
      theme(legend.position = "none")  + 
      ## scale_size()
      scale_size_area()
    if(!is.null(zero_one_list_censored)){
      p =
        y %>% ggplot() + geom_point(aes(x = !!sym(varname1), 
                                        y = !!sym(varname2), col = zero_one), size = rel(counts), alpha = 0.2) +
        theme_minimal() + theme(legend.position = "none") + 
        ## scale_size() +
        scale_size_area() +
        scale_color_manual(values=c("blue", "greenyellow"))
    }

  }
  if(bin){
    p =
      y %>% ggplot() +
      geom_raster(aes(x = !!sym(varname1), y=!!sym(varname2), fill = counts))  +
      scale_fill_gradientn(guide="none", colours = raster_colours) 
  }

  p = p + ggtitle(paste0("Time=", tt))

  ## Adding visualizations of the model |obj|
  if(is.null(obj)){
    return(p)
  } else {
    mnlist = lapply(1:obj$numclust, function(iclust){
      one_mnmat = obj$mn[,,iclust]
      colnames(one_mnmat) = paste0("dim", 1:2)
      one_mnmat %>% as_tibble() %>% add_column(cluster = iclust)
    })
    mnmat = do.call(rbind, mnlist)

    mn_colours = rep("red", obj$numclust)
    for(iclust in 1:obj$numclust){

      ## Add ellipse
      el = ellipse::ellipse(x = obj$sigma[iclust,,],
                            centre = obj$mn[tt,,iclust]) %>% as_tibble()
      p = p + geom_path(aes(x = x, y = y), data = el,
                        colour = mn_colours[iclust], lty = 2,
                        linewidth = pmin(obj$prob[tt,iclust] * 8, 0.8))

      ## Add mean
      p = p + geom_point(aes(x = dim1, y = dim2),
                         data = mnmat %>% subset(cluster == iclust) %>% .[tt,],
                         colour = mn_colours[iclust],
                         ## size = rel(3))
                         size = obj$prob[tt,iclust] * 10)
    }
  }
  return(p)
}




## This is from the many-cruises 02-helpers.R file; come back to this.
if(FALSE){
  p = datobj_2d %>%
    ggplot() +
    theme_minimal() +
    geom_raster(aes(x = !!sym(varname1), y=!!sym(varname2), fill = counts)) +
    scale_fill_gradientn(colours = colours, guide="colorbar")+
    xlim(c(0,8)) + ylim(c(0, 8)) +
    theme(legend.position = "none")
}

Let’s try it out.

datobj = gendat_2d(2, c(1000,1000))
plot_2d(datobj$ylist, tt=1, bin = FALSE)

4.3 3d data

Given 3d data ylist and an estimated flowtrend model object obj, we want to plot both in three two-dimensional plot. plot_3d() lets you do this.

4.3.1 Plotting 3d data

First, here’s a plotting helper.

#' Combine list of ggplots
#' @export
my_mfrow <- function(glist, ncol = NULL, nrow = NULL){
  if(is.null(ncol)) ncol = 1
  if(is.null(nrow)) ncol = 3
  do.call(ggpubr::ggarrange, c(glist, ncol=ncol, nrow=nrow))
}

#' Makes three 2-dimensional plots of the 3d data
#' 
#' @param ylist Data. A list of (|nt| by |dimdat|) matrices
#' @param countslist Count data.
#' @param obj flowmix or flowtrend object
#' @param tt Time point
#' @param return_list_of_plots If TRUE, return the list of three plots instead
#'   of the combined plot
#' @param zero_one_list_censored A list of zeros and 1s marking the censored
#'   points.
#' 
#' @export
#' @return
#'   
plot_3d <- function(ylist, obj = NULL, tt, countslist = NULL,
                    mn_colours = NULL,
                    labels = NULL,
                    bin = TRUE,
                    plot_title = NULL,
                    return_list_of_plots = FALSE,
                    zero_one_list_censored = NULL){

  ## Basic checks
  if(!is.null(obj)) stopifnot(class(obj) %in% c("flowmix", "flowtrend"))
  stopifnot(ncol(ylist[[1]]) == 3)
  if(!is.null(labels)) assertthat::assert_that(length(labels) == obj$numclust)
##  if(!bin) stop("This function is only for binned 3d data!")

  ## Extract data
  ## y = ylist[[tt]][,dims]
  labs = colnames(ylist[[1]])
  if(is.null(labs))  labs = paste0("dim", 1:3)
  if(is.null(countslist)){
    counts = rep(1, nrow(ylist[[tt]]))
  }
  if(!is.null(countslist)){
    counts = countslist[[tt]]
  }


  ## Aggregate counts into the two dimensions
  y2d_list = list()
  counts2d_list = list()
  for(ii in 1:3){
    dims = list(c(1:2), c(2:3), c(3,1))[[ii]]
    if(bin){
      yy = flowmix::collapse_3d_to_2d(ylist[[tt]], counts, dims)
      y2d = yy[,1:2]
      colnames(y2d) = labs[dims]
      one_counts = yy[,3]
    } else {
      y2d = ylist[[tt]][,dims]
      colnames(y2d) = labs[dims]
      one_counts = counts
    }
    y2d_list[[ii]] = y2d
    counts2d_list[[ii]] = one_counts
  }
  total_range = sapply(counts2d_list, range) %>% range()


  if(bin) zero_one = NULL
  if(!bin & !is.null(zero_one_list_censored)) zero_one = list(zero_one_list_censored[[tt]])
  if(!bin & is.null(zero_one_list_censored)) zero_one = NULL 

  ## Create three 2d plots
  plotlist = list()
  for(ii in 1:3){
    dims = list(c(1:2), c(2:3), c(3,1))[[ii]]

    ## Make data plot 
    ##one_countslist = (if(!is.null(countslist)) list(counts2d_list[[ii]]) else NULL)
    p = plot_2d(list(y2d_list[[ii]]),
                           list(counts2d_list[[ii]]), obj = NULL, tt = 1, bin = bin,
                           zero_one_list_censored = zero_one)
    if(bin){
      p$scales$scales <- list()
      p = p + scale_fill_gradientn(
                  colors = c("white", "blue", "yellow"), limits = total_range,
                  guide = "none")
    }

    ## Adding visualizations of the model |obj| at time tt
    if(!is.null(obj)){
      if(is.null(mn_colours)){
        mn_colours = rep("red", obj$numclust)
      }
      
      ## Make data matrix
      mnlist = lapply(1:obj$numclust, function(iclust){
        one_mnmat = obj$mn[tt,dims,iclust] %>% t()
        colnames(one_mnmat) = paste0("dim", 1:2)
        one_mnmat %>% as_tibble() %>% add_column(cluster = iclust)
      })
      mnmat = do.call(rbind, mnlist)

      for(iclust in 1:obj$numclust){
        
        ## Add ellipse
        el = ellipse::ellipse(x = obj$sigma[iclust,dims,dims],
                              centre = obj$mn[tt,dims,iclust]) %>% as_tibble()
        p = p + geom_path(aes(x = x, y = y), data = el,
                          colour = mn_colours[iclust], lty = 2,
                          linewidth = pmin(obj$prob[tt,iclust] * 8, 0.8))
        
        ## Add mean
        p = p + geom_point(aes(x = dim1, y = dim2),
                           data = mnmat %>% subset(cluster == iclust),
                           colour = mn_colours[iclust],
                           size = obj$prob[tt,iclust] * 10)

        ## Add cluster number as a label
        cex = rel(3)
        fac = 10
        if(is.null(labels)){
          labels = 1:(obj$numclust)
        }
        dt = data.frame(dim1 = mnmat[,1], dim2 = mnmat[,2], prob = obj$prob[tt,] * fac)
        p = p + ggrepel::geom_text_repel(aes(x = dim1, y = dim2, label = labels,
                                             point.size = sqrt(prob)),
                                         col = mn_colours,
                                         cex = cex,
                                         bg.color = "white",
                                         bg.r = 0.1,
                                         fontface = "bold",
                                         force_pull   = 5, # do not pull toward data points
                                         ## data = mnmat,
                                         data = dt,
                                         seed = 1)

      }
    }

    ## Format a bit more and save
    if(ii == 1){
      if(is.null(plot_title)) plot_title = paste0("Time=", tt)
      p = p + ggtitle(plot_title)
    } else {
      p = p + ggtitle("")
    }
    p = p + theme(legend.position = "none") 
    plotlist[[ii]] = p
  }

  p_combined = my_mfrow(plotlist)

  ## Return the plots
  if(return_list_of_plots) return(plotlist)
  return(p_combined)
}

4.3.2 Generating 3d data

#' Generates some synthetic 3-dimensional data with three clusters. 
#'
#' @param TT Number of time points.
#' @param nt Number of particles at each time.
#'
#' @return List containing (1) ylist, (2) mnlist, (3) clusterlist.
#' @export
gendat_3d <- function(TT, ntlist){

  ## Basic checks
  stopifnot(length(ntlist) == TT)

  ## Make cluster probabilities, by time
  cluster_prob1 = sapply(1:TT, function(tt) sin(tt/24 * 2 * pi)/3 + 1 + (tt/TT)*5)
  cluster_prob2 = sapply(1:TT, function(tt) cos(tt/24 * 2 * pi)/3 + 8 - (tt/TT)*5)
  cluster_prob3 = rep(3, TT)
  probs = cbind(cluster_prob1, cluster_prob2, cluster_prob3)
  probs = probs/rowSums(probs)
  colnames(probs) = 1:3
  probs_long = as_tibble(probs) %>%
    add_column(time = 1:TT) %>%
    pivot_longer(-"time", names_to = "cluster", values_to = "prob")
  probs_long %>% ggplot() + geom_line(aes(x=time,y=prob, group=cluster, col=cluster)) + ylim(c(0,1))

  ## Make cluster means, by time
  numclust = 3
  dimdat = 3
  ## means <- array(NA, dim = c(TT, 3, 2))
  means <- array(NA, dim = c(TT, numclust, dimdat))
  for(ii in 1:3){
    for(tt in 1:TT){

      ## First cluster 
      means[tt, 1, 1] = means[tt, 1, 2] = means[tt, 1, 3] = tt/TT + 0.5

      ## Second cluster 
      means[tt, 2, 1]  = sin(tt/24 * 2 * pi)##seq(-1, 1, length.out = TT)[tt]*3.1415)
      means[tt, 2, 2] = 0
      means[tt, 2, 3] = 0

      ## Third cluster
      means[tt, 3, 1] = means[tt, 3, 2] = means[tt, 3, 3] =
        -3+cos(tt/24 * 2 * pi)##seq(-1, 1, length.out = TT)[tt]*6.282)
    }
  }
  dimnames(means)[[2]]  = c(1:3)
  means_long = as_tibble(means) %>%
    add_column(time = 1:TT) %>%
    pivot_longer(-"time", names_to = "cluster", values_to = "mean")
  model = full_join(means_long, probs_long, by = c("time", "cluster"))

  ylist = list()
  mulist = list()
  clusterlist = list()
  for(tt in 1:TT){
    Y <- vector(mode = "list", length = numclust)
    mu <- vector(mode = "list", length = numclust)
    clusters_count <- rmultinom(n = 1, size = ntlist[tt], prob = probs[tt,])

    for(ii in 1:3){
      if(ii == 1){
        mn = means[tt,ii,,drop=TRUE]
        Sigma1 = matrix(rep(0.3, 9), ncol = 3)
        diag(Sigma1) = 0.4
        ## Sigma1 = matrix(c(0.4, 0.3, 0.3, 0.4), ncol = 2)
        Y[[ii]] <- replicate(clusters_count[ii,1], mn + MASS::mvrnorm(1, mu=c(0,0,0), Sigma= Sigma1)) %>% t()
        mu[[ii]] <- replicate(clusters_count[ii,1], mn) %>% t()
      }
      if(ii == 2){
        mn = means[tt,ii,, drop=TRUE]
        Y[[ii]] <- replicate(clusters_count[ii,1], mn + MASS::mvrnorm(1, mu=c(0,0,0), Sigma = diag(c(0.5, 0.1, 0.2)))) %>% t()
        mu[[ii]] <- replicate(clusters_count[ii,1], mn) %>% t()
      }
      if(ii == 3){
        mn = means[tt,ii,, drop=TRUE]
        Y[[ii]] <- replicate(clusters_count[ii,1], mn + MASS::mvrnorm(1, mu=c(0,0,0), Sigma = diag(c(0.35, 0.35, 0.35)))) %>% t()
        mu[[ii]] <- replicate(clusters_count[ii,1], mn) %>% t()
      }
    }

    Y <- Y %>% purrr::compact() %>% do.call(rbind, .)
    mu <- mu %>% purrr::compact() %>% do.call(rbind, .)
    cluster <- rep(1:3, times = clusters_count)
    colnames(Y) = paste0("dim", 1:3)
    ylist[[tt]] = Y
    mulist[[tt]] = mu
    clusterlist[[tt]] = cluster
  }
  return(list(ylist = ylist, mulist = mulist,
              clusterlist = clusterlist,
              probs = probs, means = means))
}

Now, let’s try using plot_3d on some synthetic data.

set.seed(2321)
datobj = gendat_3d(100, rep(30,100))
plot_3d(ylist = datobj$ylist, tt = 1, bin = FALSE)