5 Building the model/algorithm
5.1 Trend filtering
Trend filtering is a non-parametric regression technique for a sequence of output points \(y = (y_1,..., y_T)\) observed at locations \(x = (x_1, ..., x_T)\). It is usually assumed that \(x_1, ..., x_T\) are evenly spaced points, though this can be relaxed. The trend filtering estimate of order \(l\) of the time series \(\mu_t = \mathbb{E}(y_t)\), \(t = 1, \dots, T\), is obtained by calculating \[\hat{\mu} = \mathop{\mathrm{argmin}}_{\mu \in \mathbb{R}^T} \frac{1}{2}\| \mu - y\|_2^2 + \lambda \| D^{(l+1)} \mu\|_1,\] where \(\lambda\) is a tuning parameter and \(D^{(l+1)} \in \mathbb{R}^{(T-l-1) \times T}\) is the \((l+1)^\text{th}\) order discrete differencing matrix.
We first need to be able to construct the trend filtering “differencing matrix” used for smoothing the mixture parameters over time. The general idea of trend filtering is explained masterfully in Tibshirani (2014), Section 6 and equation (41).
The differencing matrix is formed by recursion, starting with \(D^{(1)}\).
\[\begin{equation*} D^{(1)} = \begin{bmatrix} -1 & 1 & 0 & \cdots & 0 & 0 \\ 0 & -1 & 1 & \cdots & 0 & 0\\ \vdots & & & & \\ 0 & 0 & 0 & \cdots & -1 & 1 \end{bmatrix}. \end{equation*}\]
For \(l \geq 1\), the differencing matrix \(D^{(l+1)}\) is defined recursively as \(D^{(l+1)} = D^{(1)} D^{(l)}\), where the leading \(D^{(1)}\) is the first-order differencing matrix of the appropriate (smaller) dimension.
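As a quick illustration of the recursion (a sketch of my own using base R only; diff(diag(T)) happens to be exactly \(D^{(1)}\)):

D1 <- diff(diag(5))        ## 4 x 5 first-difference matrix, i.e. D^{(1)}
D1_small <- diff(diag(4))  ## The 3 x 4 version of D^{(1)}
D2 <- D1_small %*% D1      ## D^{(2)}; rows are (1, -2, 1, 0, 0), etc.
print(D2)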
#' Generating Difference Matrix of Specified Order
#'
#' @param n length of vector to be differenced
#' @param l order of differencing
#' @param x optional. Spacing of input points.
#'
#' @return An (n-l) by n matrix.
#' @export
#'
#' @examples
gen_diff_mat <- function(n, l, x = NULL){
  ## Basic checks
  if(!is.null(x)){
    stopifnot(length(x) == n)
    if(is.unsorted(x)) stop("x must be in increasing order!")
  }
  ## First-order differencing matrix of size (t-1) x t.
  get_D1 <- function(t){ do.call(rbind, lapply(1:(t-1), FUN = function(ii){
    v <- rep(0, t)
    v[ii] <- -1
    v[ii+1] <- 1
    v
  }))}
  if(is.null(x)){
    if(l == 0) return(diag(rep(1, n)))
    if(l == 1) return(get_D1(n))
    if(l > 1){
      D <- get_D1(n)
      for(k in 1:(l-1)){
        D <- get_D1(n-k) %*% D
      }
      return(D)
    }
  } else {
    if(l == 0) return(diag(rep(1, n)))
    if(l == 1) return(get_D1(n))
    if(l > 1){
      D <- get_D1(n)
      for(k in 1:(l-1)){
        ## The diagonal factor k/(x_{i+k} - x_i) adjusts for uneven spacing.
        D1 = get_D1(n-k)
        facmat = diag(k / diff(x, lag = k))
        D <- D1 %*% facmat %*% D
      }
      return(D)
    }
  }
}

For equally spaced inputs with \(l=1\) and \(l=2\):
l = 1
Dl = gen_diff_mat(n = 10, l = l+1, x = NULL)
print(Dl)

## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 1 -2 1 0 0 0 0 0 0 0
## [2,] 0 1 -2 1 0 0 0 0 0 0
## [3,] 0 0 1 -2 1 0 0 0 0 0
## [4,] 0 0 0 1 -2 1 0 0 0 0
## [5,] 0 0 0 0 1 -2 1 0 0 0
## [6,] 0 0 0 0 0 1 -2 1 0 0
## [7,] 0 0 0 0 0 0 1 -2 1 0
## [8,] 0 0 0 0 0 0 0 1 -2 1
l = 2
Dl = gen_diff_mat(n = 10, l = l+1, x = NULL)
print(Dl)

## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] -1 3 -3 1 0 0 0 0 0 0
## [2,] 0 -1 3 -3 1 0 0 0 0 0
## [3,] 0 0 -1 3 -3 1 0 0 0 0
## [4,] 0 0 0 -1 3 -3 1 0 0 0
## [5,] 0 0 0 0 -1 3 -3 1 0 0
## [6,] 0 0 0 0 0 -1 3 -3 1 0
## [7,] 0 0 0 0 0 0 -1 3 -3 1
When the inputs have a gap in them:
## See what a l=2 difference matrix looks like:
x = (1:10)[-(3)]
l = 2
TT = length(x)
Dl = gen_diff_mat(n = TT, l = l+1, x = x)
print(Dl)

## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
## [1,] -0.6666667 1.3333333 -1.333333 0.6666667 0 0 0 0 0
## [2,] 0.0000000 -0.3333333 2.000000 -2.6666667 1 0 0 0 0
## [3,] 0.0000000 0.0000000 -1.000000 3.0000000 -3 1 0 0 0
## [4,] 0.0000000 0.0000000 0.000000 -1.0000000 3 -3 1 0 0
## [5,] 0.0000000 0.0000000 0.000000 0.0000000 -1 3 -3 1 0
## [6,] 0.0000000 0.0000000 0.000000 0.0000000 0 -1 3 -3 1
## Formally test it
d1 = gen_diff_mat(n = 10, l = l+1, x = 1:10)
d2 = gen_diff_mat(n = 10, l = l+1, x = (1:10)*2)
print(d1)

## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] -1 3 -3 1 0 0 0 0 0 0
## [2,] 0 -1 3 -3 1 0 0 0 0 0
## [3,] 0 0 -1 3 -3 1 0 0 0 0
## [4,] 0 0 0 -1 3 -3 1 0 0 0
## [5,] 0 0 0 0 -1 3 -3 1 0 0
## [6,] 0 0 0 0 0 -1 3 -3 1 0
## [7,] 0 0 0 0 0 0 -1 3 -3 1
print(d2)

## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] -0.25 0.75 -0.75 0.25 0.00 0.00 0.00 0.00 0.00 0.00
## [2,] 0.00 -0.25 0.75 -0.75 0.25 0.00 0.00 0.00 0.00 0.00
## [3,] 0.00 0.00 -0.25 0.75 -0.75 0.25 0.00 0.00 0.00 0.00
## [4,] 0.00 0.00 0.00 -0.25 0.75 -0.75 0.25 0.00 0.00 0.00
## [5,] 0.00 0.00 0.00 0.00 -0.25 0.75 -0.75 0.25 0.00 0.00
## [6,] 0.00 0.00 0.00 0.00 0.00 -0.25 0.75 -0.75 0.25 0.00
## [7,] 0.00 0.00 0.00 0.00 0.00 0.00 -0.25 0.75 -0.75 0.25
d1_d2_ratio = as.numeric(d1/d2) %>% na.omit() %>% as.numeric()
testthat::expect_true(all(d1_d2_ratio == d1_d2_ratio[1])) ## correct

A formal test for unevenly spaced inputs will come soon, once we’ve defined gen_tf_mat, next.
We now build a function that constructs a lasso regressor matrix \(H\), which can be used to solve a problem equivalent to trend filtering of the \(l\)’th degree.
(This is stated in Lemma 4, equation (25) from Tibshirani (2014).)
#' A lasso regressor matrix H that can be used to solve an equivalent problem as the trend filtering of the \code{k}'th degree.
#'
#' @param n Total number of time points.
#' @param k Degree of trend filtering for cluster probabilities. $k=0$ is fused lasso, $k=1$ is linear trend filtering, and so on.
#' @param x Time points
#'
#' @return $n$ by $n$ matrix.
#'
#' @export
gen_tf_mat <- function(n, k, x = NULL){
if(is.null(x) ){
x = (1:n)/n
}
if(!is.null(x)){
stopifnot(length(x) == n)
}
## For every i,j'th entry, use this helper function (from eq 25 of Tibshirani
## (2014)).
gen_ij <- function(x, i, j, k){
xi <- x[i]
if(j %in% 1:(k+1)){
return(xi^(j-1))
}
if(j %in% (k+2):n){
## Special handling for k==0, See lemma 4 eq 25
if(k == 0){
prd = 1
ind = j
}
if(k >= 1){
ind = j - (1:k)
prd = prod(xi - x[ind])
}
return(prd * ifelse(xi >= x[max(ind)], 1, 0))
}
}
## Generate the H matrix, entry-wise.
H <- matrix(nrow = n, ncol = n)
for(i in 1:n){
for(j in 1:n){
H[i,j] <- gen_ij(x, i,j, k)
}
}
return(H)
}

Here’s a simple test of gen_tf_mat(), against an alternative function built for equally spaced data.
#' Creates trend filtering regression matrix using Lemma 2 of Ryan Tibshirani's
#' trendfilter paper (2014); works on equally spaced data only.
#'
#' @param n Number of data points.
#' @param k Order of trend filter. 0 is fused lasso, and so on.
#'
#' @examples
#' ord = 1
#' H_tf <- gen_tf_mat_equalspace(n = 100, k = ord)
#' H_tf[,1] * 100
#' H_tf[,2] * 100
#' H_tf[,3] * 100
#' H_tf[,4] * 100
#' H_tf[,99] * 100
#' H_tf[,100] * 100
#'
#' @return n by n matrix.
gen_tf_mat_equalspace <- function(n, k){
nn = n
kk = k
##' Connects kk to kk-1.
sigm <- function(ii, kk){
if(kk == 0) return(rep(1, ii))
cumsum(sigm(ii, kk-1))
}
mat = matrix(NA, ncol = nn, nrow = nn)
for(ii in 1:nn){
for(jj in 1:nn){
if(jj <= kk+1) mat[ii,jj] = ii^(jj-1) / nn^(jj-1)
if(ii <= jj-1 & jj >= kk+2) mat[ii, jj] = 0
if(ii > jj-1 & jj >= kk+2){
mat[ii, jj] = (sigm(ii-jj+1, kk) %>% tail(1)) * factorial(kk) / nn^kk
}
}
}
return(mat)
}testthat::test_that("Trend filtering regression matrix is created correctly on equally spaced data.",
{
## Check that equally spaced data creates same trendfilter regression matrix
## Degree 1
H1 <- gen_tf_mat(10, 1)
H1_other <- gen_tf_mat(10, 1, x=(1:10)/10)
testthat::expect_equal(H1, H1_other)
## Degree 2
H2 <- gen_tf_mat(10, 2)
H2_other <- gen_tf_mat(10, 2, x=(1:10)/10)
testthat::expect_equal(H2, H2_other)
## Check the dimension
testthat::expect_equal(dim(H1), c(10,10))
## Check against an alternative function.
for(ord in c(0,1,2,3,4)){
H <- gen_tf_mat(10, ord)
H_eq = gen_tf_mat_equalspace(10, ord)
testthat::expect_true(max(abs(H_eq- H)) < 1E-10)
}
})

## Test passed 😀
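As a small aside (a sketch of my own, not one of the package tests): for \(k=0\), \(H\) is the lower-triangular matrix of ones, so \(H\theta\) with a sparse \(\theta\) is a piecewise-constant signal, and the lasso on \(H\) mimics the fused lasso.

H0 <- gen_tf_mat(n = 8, k = 0)
theta <- c(1, 0, 0, 2, 0, 0, 0, 0) ## One jump, of size 2, at t = 4
print(as.numeric(H0 %*% theta))    ## 1 1 1 3 3 3 3 3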
Finally, let’s test the difference matrix \(D\) formed using unevenly spaced \(x\)’s.
testthat::test_that("Uneven spaced D matrix is formed correctly", {
x = (1:6)[-(2)]
## k = 0 is piecewise constant, k=1 is piecewise linear, etc.
for(k in 0:3){
## Temporarily restrict to k = 1; the other orders remain to be verified.
k=1
l = k+1
## Form Dl using our function
Dl <- flowtrend::gen_diff_mat(n=5, l = l, x=x)
## Compare it to rows of H matrix, per section 6 of Tibshirani (2014)
gen_tf_mat(n = length(x), k = k, x = x) %>%
solve() %>%
`*`(factorial(k)) %>% ## This part is missing in Tibshirani (2014)
tail(length(x)-(k+1)) -> Hx
ratiomat = Hx/Dl
## Process the ratios of each entry, and check that they're equal to 1
ratios = ratiomat[!is.nan(ratiomat)]
ratios = ratios[is.finite(ratios)]
testthat::expect_true(all.equal(ratios, rep(1, length(ratios))))
}
})

## Test passed 🎉
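One more sanity check worth recording here (again a sketch, not one of the formal tests): the order-\((l+1)\) difference matrix should annihilate polynomials of degree \(\leq l\) evaluated at the uneven inputs, up to numerical error.

x = sort(c(1, 2, 4, 5, 7, 8, 10))
Dlp1 = gen_diff_mat(n = length(x), l = 3, x = x) ## D^{(l+1)} with l = 2
print(max(abs(Dlp1 %*% cbind(1, x, x^2))))       ## ~ 0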
5.2 Objective (data log-likelihood)
The function loglik_tt() calculates the log-likelihood of one cytogram, which is:
\[\sum_{i=1}^{n_t} C_i^{(t)} \log\left( \sum_{k=1}^K \pi_{kt} \phi(y_i^{(t)}; \mu_{kt}, \Sigma_k) \right). \]
This is the data-fidelity portion of the objective function of the flowtrend model.
#' Log likelihood for a single time point's data.
#'
#' @param mu Cluster means.
#' @param prob Cluster probabilities.
#' @param sigma Cluster variances.
#' @param ylist Data.
#' @param tt Time point.
loglik_tt <- function(ylist, tt, mu, sigma, prob, dimdat = NULL, countslist = NULL, numclust){
## One particle's log likelihood (weighted density)
weighted.densities = sapply(1:numclust, function(iclust){
if(dimdat == 1){
return(prob[tt,iclust] * dnorm(ylist[[tt]], mu[tt,,iclust], sqrt(sigma[iclust,,])))
}
if(dimdat > 1){
return(prob[tt,iclust] * dmvnorm_arma_fast(ylist[[tt]], mu[tt,,iclust], as.matrix(sigma[iclust,,]), FALSE))
}
})
nt = nrow(ylist[[tt]])
counts = (if(!is.null(countslist)) countslist[[tt]] else rep(1, nt))
sum_wt_dens = rowSums(weighted.densities)
sum_wt_dens = sum_wt_dens %>% pmax(1E-100)
return(sum(log(sum_wt_dens) * counts))
}

Next, here is the function that calculates the entire objective from all cytograms, given the model parameters mu, prob, and sigma:
\[\sum_{t=1}^T\sum_{i=1}^{n_t} C_i^{(t)} \log\left( \sum_{k=1}^K \pi_{kt} \phi(y_i^{(t)}; \mu_{kt}, \Sigma_k) \right). \]
#' Evaluating the penalized negative log-likelihood on all data \code{ylist} given parameters \code{mu}, \code{prob}, and \code{sigma}.
#'
#' @param mu Cluster means; a (T x dimdat x K) array.
#' @param prob Cluster probabilities; a (T x K) matrix.
#' @param prob_link Link values (alphas) for the cluster probabilities; a (T x K) matrix.
#' @param sigma Cluster covariances; a (K x dimdat x dimdat) array.
#' @param ylist Data.
#' @param Dlp1 Differencing matrix of order (l+1), for the means.
#' @param l Trend filtering degree for the means.
#' @param lambda Regularization parameter for the means.
#' @param l_prob Trend filtering degree for the probabilities.
#' @param Dlp1_prob Differencing matrix of order (l_prob+1), for the probabilities.
#' @param lambda_prob Regularization parameter for the probabilities.
#' @param alpha Unused, for now.
#' @param beta Unused, for now.
#' @param countslist Particle multiplicities.
#' @param unpenalized if TRUE, return the unpenalized out-of-sample fit.
#'
#' @return Objective value: the penalized (or unpenalized) negative log-likelihood, divided by N.
#' @export
#'
#' @examples
objective <- function(mu, prob, prob_link = NULL, sigma,
ylist,
Dlp1 = NULL, l = NULL,
lambda = 0,
l_prob = NULL,
Dlp1_prob = NULL,
lambda_prob = 0,
alpha = NULL, beta = NULL,
countslist = NULL,
unpenalized = FALSE){
## Set some important variables
TT = dim(mu)[1]
numclust = dim(mu)[3]
if(is.null(countslist)){
ntlist = sapply(ylist, nrow)
} else {
ntlist = sapply(countslist, sum)
}
N = sum(ntlist)
dimdat = ncol(ylist[[1]])
## Calculate the log likelihood
loglik = sapply(1:TT, function(tt){
return(loglik_tt(ylist, tt, mu, sigma, prob, countslist, numclust = numclust, dimdat = dimdat))
})
if(unpenalized){
obj = -1/N * sum(unlist(loglik))
return(obj)
} else {
## Return penalized likelihood
mu.splt <- asplit(mu, MARGIN = 2) ## This was 3, which I think produces the same result.
diff_mu <- sum(unlist(lapply(mu.splt, FUN = function(m) sum(abs(Dlp1 %*% m)))))
diff_prob <- sum(abs(Dlp1_prob %*% prob_link))
obj = -1/N * sum(unlist(loglik)) + lambda * diff_mu + lambda_prob * diff_prob
return(obj)
}
}

Here’s a helper to check numerical convergence of the EM algorithm.
#' Checks numerical improvement in objective value. Returns TRUE if |old-new|/|old| is smaller than tol.
#'
#' @param old Objective value from previous iteration.
#' @param new Objective value from current iteration.
#' @param tol Numerical tolerance.
check_converge_rel <- function(old, new, tol=1E-6){
return(abs((old-new)/old) < tol )
}

Here’s also a helper function to do the softmax of \(\alpha_t \in \mathbb{R}^K\).
#' Softmax function.
#'
#' @param prob_link alpha, which is a (T x K) matrix.
#'
#' @return exp(alpha)/rowSum(exp(alpha)). A (T x K) matrix.
softmax <- function(prob_link){
  exp_prob_link = exp(prob_link)
  prob = exp_prob_link / rowSums(exp_prob_link)
  return(prob)
}

testthat::test_that("Test for softmax",{
link = runif(100, min = -10, max = 10) %>% matrix(nrow = 10, ncol = 10)
testthat::expect_true(all(abs(rowSums(softmax(link)) - 1) < 1E-13))
})

## Test passed 🎉
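One caveat: exp() can overflow when the links are large in magnitude. A standard stabilization (shown here as a sketch; it is not what softmax() above does) subtracts the row maximum first, which leaves the result mathematically unchanged:

softmax_stable <- function(prob_link){
  shifted = prob_link - apply(prob_link, 1, max) ## Recycles row-wise
  exp(shifted) / rowSums(exp(shifted))
}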
5.3 Initial parameters for EM algorithm
The EM algorithm requires some initial values for \(\mu\), \(\pi\) and \(\Sigma\).
Initializing \(\pi\) is done in one line, prob = matrix(1/numclust, nrow = TT, ncol = numclust), which sets everything to \(1/K\).
For \(\mu\) and \(\Sigma\), we write some functions. Essentially, the initial means \(\mu\) are jittered versions of \(K\) means drawn from a downsampled version of \(ylist\) (downsampling is done because \(ylist\) can have a large number of particles). Each \(\Sigma_k\) is a \(d\times d\) identity matrix, scaled by fac=1 on the diagonal by default.
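Before writing them, here is the shape contract the two functions below should satisfy (a sketch with fake data, runnable once init_mn() and init_sigma() are defined):

ylist_toy = lapply(1:5, function(tt) matrix(rnorm(100), ncol = 1))
mn_toy = init_mn(ylist_toy, numclust = 3, TT = 5, dimdat = 1)
dim(mn_toy)                   ## 5 x 1 x 3, i.e. (T x dimdat x K)
dim(init_sigma(ylist_toy, 3)) ## 3 x 1 x 1, i.e. (K x dimdat x dimdat)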
#' Initialize the cluster centers.
#'
#' @param ylist A T-length list of (nt by dimdat) datasets.
#' @param numclust Number of clusters (M).
#' @param TT total number of (training) time points.
#'
#' @return An array of dimension (T x dimdat x M).
#' @export
init_mn <- function(ylist, numclust, TT, dimdat, countslist = NULL, seed=NULL){
if(!is.null(seed)){
assertthat::assert_that(all((seed %>% sapply(., class)) == "integer"))
assertthat::assert_that(length(seed) == 7)
RNGkind("L'Ecuyer-CMRG")
.Random.seed <<- seed
}
if(is.null(countslist)){
ntlist = sapply(ylist, nrow)
countslist = lapply(ntlist, FUN = function(nt) rep(1, nt))
}
## Initialize the means by (1) collapsing to one cytogram (2) random
## sampling from this distribution, after truncation,
TT = length(ylist)
ylist_downsampled <- lapply(1:TT, function(tt){
y = ylist[[tt]]
counts = countslist[[tt]]
## Sample so that, in total, we get mean(nt) * 30 sized sample. In the case
## of binned data, nt is the number of bins.
if(nrow(y) > 500) nsize = pmin(nrow(y) / TT * 30, nrow(y)) else nsize = nrow(y)
some_rows = sample(1:nrow(y),
size = nsize,
prob = counts/sum(counts))
y[some_rows,, drop=FALSE]
})
## Jitter the means a bit
yy = do.call(rbind, ylist_downsampled)
new_means = yy[sample(1:nrow(yy), numclust),, drop=FALSE]
jitter_sd = apply(yy, 2, sd) / 100
jitter_means = MASS::mvrnorm(n = nrow(new_means),
mu = rep(0, dimdat),
Sigma = diag(jitter_sd, ncol = dimdat))
new_means = new_means + jitter_means
## Repeat TT times == flat/constant initial means across time.
mulist = lapply(1:TT, function(tt){ new_means })
## New (T x dimdat x numclust) array is created.
muarray = array(NA, dim=c(TT, dimdat, numclust))
for(tt in 1:TT){
muarray[tt,,] = as.matrix(mulist[[tt]])
}
return(muarray)
}
#' Initialize the covariances.
#'
#' @param data A list of (nt by dimdat) datasets. There should be T of them.
#' @param numclust Number of clusters.
#' @param fac Value to use for the diagonal of the (dimdat x dimdat) covariance
#' matrix.
#'
#' @return A (K x dimdat x dimdat) array containing the K (dimdat by dimdat)
#'   covariances.
#' @export
init_sigma <- function(data, numclust, fac = 1){
ndat = nrow(data[[1]])
pdat = ncol(data[[1]])
sigmas = lapply(1:numclust, function(iclust){
onesigma = diag(fac * rep(1, pdat))
if(pdat==1) onesigma = as.matrix(fac)
colnames(onesigma) = paste0("datcol", 1:pdat)
rownames(onesigma) = paste0("datcol", 1:pdat)
return(onesigma)
})
sigmas = abind::abind(sigmas, along=0)
return(sigmas)
}

Here’s a helper function for printing the progress.
#' A helper function to print the progress of a loop or simulation.
#'
#' @param isim Replicate number.
#' @param nsim Total number of replicates.
#' @param type Type of job you're running. Defaults to "simulation".
#' @param lapsetime Lapsed time, in seconds (by default).
#' @param lapsetimeunit "second".
#' @param start.time start time.
#' @param fill Whether or not to fill the line.
#'
#' @return No return
print_progress <- function(isim, nsim,
type = "simulation", lapsetime = NULL,
lapsetimeunit = "seconds", start.time = NULL,
fill = FALSE){
  ## If lapsed time or a start time is present, use it; otherwise just print counts.
  if(fill) cat(fill = TRUE)
  if(is.null(lapsetime) & is.null(start.time)){
    cat("\r", type, " ", isim, "out of", nsim)
  } else {
    if(!is.null(start.time)){
      lapsetime = round(difftime(Sys.time(), start.time,
                                 units = "secs"), 0)
    }
    ## Estimate remaining time from the per-replicate pace so far.
    remainingtime = round(lapsetime * (nsim-isim)/isim, 0)
    endtime = Sys.time() + remainingtime
    cat("\r", type, " ", isim, "out of", nsim, "with lapsed time",
        lapsetime, lapsetimeunit, "and remaining time", remainingtime,
        lapsetimeunit, "and will finish at", strftime(endtime))
  }
  if(fill) cat(fill = TRUE)
}

5.4 E step
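The E step computes the responsibilities (posterior membership probabilities)
\[\gamma_{itk} = \frac{\pi_{kt}\, \phi(y_i^{(t)}; \mu_{kt}, \Sigma_k)}{\sum_{k'=1}^K \pi_{k't}\, \phi(y_i^{(t)}; \mu_{k't}, \Sigma_{k'})},\]
and, when countslist is provided, multiplies \(\gamma_{itk}\) by the multiplicity \(C_i^{(t)}\).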
#' E step, which updates the "responsibilities", which are posterior membership probabilities of each particle.
#'
#' @param mn Cluster means; a (T x dimdat x K) array.
#' @param sigma Cluster covariances; a (K x dimdat x dimdat) array.
#' @param prob Cluster probabilities; a (T x K) matrix.
#' @param ylist Data.
#' @param numclust Number of clusters (K).
#' @param denslist_by_clust Pre-computed densities, if any.
#' @param first_iter Whether this is the first EM iteration.
#' @param countslist Particle multiplicities.
#' @param padding A small amount of padding to add to the weighted
#' densities. Note, a very small value (like 1E-20) should be used, otherwise
#' the probabilitistic (soft) gating can be affected quite noticably.
#'
#' @return A T-length list of (nt x K) responsibility matrices.
#' @export
#'
Estep <- function(mn, sigma, prob, ylist = NULL, numclust, denslist_by_clust = NULL,
first_iter = FALSE, countslist = NULL, padding = 1E-20){
## Basic setup
TT = length(ylist)
ntlist = sapply(ylist, nrow)
resp = list()
dimdat = dim(mn)[2]
assertthat::assert_that(dim(mn)[1] == length(ylist))
## Helper to calculate Gaussian density for each \code{N(y_{t,k},mu_{t,k} and
## Sigma_k)}.
calculate_dens <- function(iclust, tt, y, mn, sigma, denslist_by_clust,
first_iter) {
mu <- mn[tt, , iclust]
if (dimdat == 1) {
dens = dnorm(y, mu, sd = sqrt(sigma[iclust, , ]))
} else {
dens = dmvnorm_arma_fast(y, mu, sigma[iclust,,], FALSE)
}
return(dens)
}
## Calculate posterior probability of membership of $y_{it}$.
ncol.prob = ncol(prob)
empty_row = rbind(1:numclust)[-1,] ## A (0 x numclust) matrix, for empty time points
for (tt in 1:TT) {
ylist_tt = ylist[[tt]]
if(nrow(ylist_tt) == 0){
resp[[tt]] <- empty_row
} else {
densmat <- sapply(1:numclust, calculate_dens, tt, ylist_tt,
mn, sigma, denslist_by_clust, first_iter)
wt.densmat <- matrix(prob[tt, ], nrow = ntlist[tt],
ncol = ncol.prob, byrow = TRUE) * densmat
wt.densmat = wt.densmat + padding##1e-20
wt.densmat <- wt.densmat/rowSums(wt.densmat)
resp[[tt]] <- wt.densmat
}
}
## Weight the responsibilities by $C_{it}$.
if (!is.null(countslist)) {
resp <- Map(function(myresp, mycount) {
myresp * mycount
}, resp, countslist)
}
return(resp)
}

The E step should return a list whose elements are matrices with the same number of rows as the corresponding elements of ylist (that is, a \(T\)-length list of matrices of size \(n_t \times K\) for \(t=1,\cdots, T\)).
testthat::test_that("E step returns appropriately sized responsibilities.",{
## Generate some fake data
TT = 10
dimdat = 1
ylist = lapply(1:TT, function(tt){ runif(30*dimdat) %>% matrix(ncol = dimdat, nrow = 30)})
numclust = 3
## Initialize a few parameters, not carefully
sigma = init_sigma(ylist, numclust) ## (K x dimdat x dimdat)
mn = init_mn(ylist, numclust, TT, dimdat)##, countslist = countslist)
prob = matrix(1/numclust, nrow = TT, ncol = numclust) ## Initialize to all 1/K.
## Calculate responsibility
resp = Estep(mn = mn, sigma = sigma, prob = prob, ylist = ylist, numclust = numclust)
## Check these things
testthat::expect_equal(length(resp), length(ylist))
testthat::expect_equal(sapply(resp, nrow), sapply(ylist, nrow))
testthat::expect_true(all(sapply(resp, ncol) == numclust))
})

## Test passed 🥇
5.5 M step
The M step of the EM algorithm has three steps – one each for \(\mu\), \(\pi\), and \(\Sigma\).
5.5.1 M step for \(\pi\)
#' The M step for the cluster probabilities
#'
#' @param resp Responsibilities.
#' @param H_tf Trend filtering matrix.
#' @param countslist Particle multiplicities.
#' @param lambda_prob Regularization.
#' @param l_prob Trend filtering degree.
#'
#' @return (T x K) matrix containing the alphas, for \code{prob = exp(alpha)/
#' rowSums(exp(alpha))}.
#' @export
#'
Mstep_prob <- function(resp, H_tf, countslist = NULL,
lambda_prob = NULL, l_prob = NULL, x = NULL){
## Basic setup
TT <- length(resp)
## Basic checks
stopifnot(is.null(l_prob) == is.null(lambda_prob))
## If glmnet isn't actually needed, don't use it.
if(is.null(l_prob) & is.null(lambda_prob)){
## Calculate the average responsibilities, per time point.
if(is.null(countslist)){
resp.avg <- lapply(resp, colMeans) %>% do.call(rbind, .)
} else {
resp.avg <- lapply(1:TT, FUN = function(ii){
colSums(resp[[ii]])/sum(countslist[[ii]])
}) %>% do.call(rbind, .)
}
return(resp.avg)
## If glmnet is needed, use it.
} else {
lambda_range <- function(lam, nlam = 50, lam.max = max(1, 5*lam)){
return(exp(seq(log(lam.max), log(lam), length.out = nlam)))
}
penalty.facs <- c(rep(0, l_prob+1), rep(1, nrow(H_tf) - l_prob - 1))
resp.predict <- do.call(rbind, lapply(resp, colSums))
fac = resp.predict %>% apply(2, function(mycol){max(mycol)/min(mycol)}) %>% max()
if(FALSE){ ## Disabled fallback, kept for reference.
## Run GLMNET but if it throws an error, reduce the dynamic range |fac| of
## each column |resp.predict| until it's okay.
resp.predict0 = resp.predict
error_means_null = NULL
fac = 1024
while(is.null(error_means_null)){
print("fac is")
print(fac)
error_means_null <- tryCatch({
resp.predict = resp.predict0 %>% apply(2, function(mycol){mycol = pmax(mycol, max(mycol)/fac)})
glmnet_obj <- glmnet::glmnet(x = H_tf, y = resp.predict, family = "multinomial",
penalty.factor = penalty.facs, maxit = 1e7,
lambda = mean(penalty.facs)*lambda_range(lambda_prob),
standardize = F, intercept = FALSE)
pred_link <- predict(glmnet_obj, newx = H_tf, type = "link", s = mean(penalty.facs) * lambda_prob)[,,1]
}, error = function(e){
return(NULL)
})
if(is.null(error_means_null)){
fac = fac/2
resp.predict = resp.predict0 %>% apply(2, function(mycol){mycol = pmax(mycol, max(mycol)/fac)})
}
}
} else {
glmnet_obj <- glmnet::glmnet(x = H_tf, y = resp.predict, family = "multinomial",
penalty.factor = penalty.facs, maxit = 1e7,
lambda = mean(penalty.facs)*lambda_range(lambda_prob),
standardize = F, intercept = FALSE)
}
pred_link <- predict(glmnet_obj, newx = H_tf, type = "link", s = mean(penalty.facs) * lambda_prob)[,,1]
return(pred_link)
}
}

This should return a \(T\) by \(K\) matrix, which we’ll test here:
testthat::test_that("Mstep of pi returns a (T x K) matrix.", {
## Generate some fake responsibilities and trend filtering matrix
TT = 100
numclust = 3
nt = 10
resp = lapply(1:TT, function(tt){
oneresp = runif(nt*numclust) %>% matrix(ncol=numclust)
oneresp = oneresp/rowSums(oneresp)
})
H_tf <- gen_tf_mat(n = TT, k = 0)
## Check the size
pred_link = Mstep_prob(resp, H_tf, l_prob = 0, lambda_prob = 1E-3)
testthat::expect_equal(dim(pred_link), c(TT, numclust))
pred_link = Mstep_prob(resp, H_tf)
testthat::expect_equal(dim(pred_link), c(TT, numclust))
## Check the correctness
pred_link = Mstep_prob(resp, H_tf)
})

Each row \(t\) of this matrix should contain the fitted values \((\alpha_{kt})_{k=1}^K\), where \(\alpha_{kt} = h_t^T w_{k}\), for:
- \(h_t\), the rows of the trend filtering matrix \(H \in \mathbb{R}^{T \times T}\), and
- \(w_k \in \mathbb{R}^{T}\), the regression coefficients estimated by glmnet().
Here is a test for the correctness of the M step for \(\pi\).
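A minimal version of such a check (a sketch, reusing resp and H_tf as constructed in the previous code chunk): with no penalty, Mstep_prob() should simply return the per-time-point average responsibilities.

resp_avg = do.call(rbind, lapply(resp, colMeans))
testthat::expect_equal(Mstep_prob(resp, H_tf), resp_avg)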
5.5.2 M step for \(\Sigma\)
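The update is the standard responsibility-weighted sample covariance,
\[\hat{\Sigma}_k = \frac{\sum_{t=1}^T\sum_{i=1}^{n_t} \gamma_{itk}\, (y_i^{(t)} - \mu_{kt})(y_i^{(t)} - \mu_{kt})^\top}{\sum_{t=1}^T\sum_{i=1}^{n_t} \gamma_{itk}},\]
which the long-format reshaping below computes.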
#' M step for cluster covariance (sigma).
#'
#' @param resp Responsibility.
#' @param ylist Data.
#' @param mn Means
#' @param numclust Number of clusters.
#'
#' @return (K x d x d) array containing K (d x d) covariance matrices.
#' @export
#'
#' @examples
Mstep_sigma <- function(resp, ylist, mn, numclust){
## Find some sizes
TT = length(ylist)
ntlist = sapply(ylist, nrow)
dimdat = ncol(ylist[[1]])
cs = c(0, cumsum(ntlist))
vars <- vector(mode = "list", numclust)
ylong = do.call(rbind, ylist)
irows = rep(1:nrow(mn), times = ntlist)
for(iclust in 1:numclust){
resp.thisclust = lapply(resp, function(myresp) myresp[,iclust, drop = TRUE])
resp.long = do.call(c, resp.thisclust)
mnlong = mn[irows,,iclust]
if(is.vector(mnlong)) mnlong = mnlong %>% cbind()
resid <- ylong - mnlong
resid_weighted <- resp.long * resid
sig_temp <- t(resid_weighted) %*% resid/sum(resp.long)
vars[[iclust]] <- sig_temp
}
## Make into an array
sigma_array = array(NA, dim=c(numclust, dimdat, dimdat))
for(iclust in 1:numclust){
sigma_array[iclust,,] = vars[[iclust]]
}
## Basic check
stopifnot(all(dim(sigma_array) == c(numclust, dimdat, dimdat)))
return(sigma_array)
}

5.5.3 M step for \(\mu\)
This is a big one. It uses the ADMM algorithm as written in section 2 of the paper, reproduced briefly here.
We need a convergence checker for the outer layer of LA-ADMM, which checks whether the objective values have plateaued.
Next, we define the main function Mstep_mu(). This uses a “locally adaptive” ADMM (LA-ADMM). Mstep_mu() calls la_admm_oneclust() on each cluster \(k=1,\cdots, K\); this function will be introduced shortly.
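In brief (my paraphrase of the splitting that the code below implements): for each cluster \(k\), the M step solves
\[\min_{\mu, Z, W} \; \frac{1}{2N}\sum_{t=1}^T\sum_{i=1}^{n_t} \gamma_{itk}\, (y_i^{(t)}-\mu_t)^\top \Sigma_k^{-1} (y_i^{(t)}-\mu_t) + \lambda \|D^{(1)} W\|_1 \quad \text{s.t.}\quad W = D^{(l)}\mu,\;\; Z = \mu - \bar{\mu},\;\; \|Z_t\|_2 \le r \;\;\forall t,\]
with scaled dual variables uw and uz for the two constraints; note that \(\|D^{(1)} W\|_1 = \|D^{(l+1)}\mu\|_1\) whenever \(W = D^{(l)}\mu\).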
#' Computes the M step for mu.
#'
#' @param resp Responsibilities of each particle.
#' @param ylist Data.
#' @param lambda Regularization parameter for the means.
#' @param l Trend filtering degree for the means.
#' @param sigma Cluster covariances; a (K x dimdat x dimdat) array.
#' @param sigma_eig_by_clust Pre-computed eigendecompositions of the covariances.
#' @param Dlsqrd The matrix \code{t(Dl) \%*\% Dl}.
#' @param Dl Differencing matrix of order l.
#' @param tDl Transpose of \code{Dl}.
#' @param Dlp1 Differencing matrix of order (l+1).
#' @param TT Number of time points.
#' @param N Total number of particles (sum of responsibilities).
#' @param dimdat Dimension of the data.
#' @param first_iter Whether this is the first EM iteration.
#' @param e_mat Centering matrix, from \code{etilde_mat()}.
#' @param mus,Zs,Ws,uws,uzs Warm-startable ADMM variables from the previous EM iteration.
#' @param maxdev Maximum deviation of cluster means across time (radius of the ball constraint).
#' @param x Time points.
#' @param niter Maximum number of inner ADMM iterations.
#' @param err_rel Relative convergence tolerance for the ADMM.
#' @param err_abs Absolute convergence tolerance for the ADMM.
#' @param zerothresh Threshold below which values are treated as zero.
#' @param local_adapt Whether to use locally adaptive ADMM.
#' @param local_adapt_niter Number of outer (step-size-doubling) LA-ADMM iterations.
#' @param rho_init Initial ADMM step size.
#' @param iter Current EM iteration number (for record-keeping).
#'
#' @return A list containing the estimated means (\code{mns}) and ADMM variables for warm starts.
#' @export
#'
#' @examples
Mstep_mu <- function(resp,
ylist,
lambda = 0.5,
l = 3,
sigma,
sigma_eig_by_clust = NULL,
Dlsqrd,
Dl, tDl, Dlp1, TT, N, dimdat,
first_iter = TRUE,
e_mat,
## Warm startable variables
mus = NULL,
Zs = NULL,
Ws = NULL,
uws = NULL,
uzs = NULL,
## End of warm startable variables
maxdev = NULL,
x = NULL,
niter = (if(local_adapt) 1e2 else 1e3),
err_rel = 1E-3,
err_abs = 0,
zerothresh = 1E-6,
local_adapt = FALSE,
local_adapt_niter = 10,
rho_init = 0.01,
iter = NULL){
####################
## Preliminaries ###
####################
TT = length(ylist)
numclust = ncol(resp[[1]])
dimdat = ncol(ylist[[1]])
ntlist = sapply(ylist, nrow)
resp.sum = lapply(resp, colSums) %>% do.call(rbind, .)
N = sum(unlist(resp.sum))
## Other preliminaries
schur_syl_A_by_clust = schur_syl_B_by_clust = term3list = list()
ybarlist = list()
ycentered_list = Xcentered_list = yXcentered_list = list()
Qlist = list()
sigmainv_list = list()
for(iclust in 1:numclust){
## Retrieve sigma inverse from pre-computed SVD, if necessary
if(is.null(sigma_eig_by_clust)){
sigmainv = solve(sigma[iclust,,])
} else {
sigmainv = sigma_eig_by_clust[[iclust]]$sigma_inv
}
resp.iclust <- lapply(resp, FUN = function(r) matrix(r[,iclust]))
AB <- get_AB_mats(y = ylist, resp = resp.iclust, Sigma_inv = sigmainv,
                  e_mat = e_mat, N = N, Dlp1 = Dlp1, Dl = Dl,
                  Dlsqrd = Dlsqrd, rho = rho_init, z = NULL, w = NULL,
                  uz = NULL, uw = NULL)
## Store the Schur decomposition
schur_syl_A_by_clust[[iclust]] = myschur(AB$A)
schur_syl_B_by_clust[[iclust]] = myschur(AB$B)
ycentered <- NULL
ycentered_list[[iclust]] = ycentered
sigmainv_list[[iclust]] = sigmainv
}
##########################################
## Run ADMM separately on each cluster ##
#########################################
admm_niters = admm_inner_iters = vector(length = numclust, mode = "list")
if(first_iter){ mus = vector(length = numclust, mode = "list") }
Zs <- lapply(1:numclust, function(x) matrix(0, nrow = TT, ncol = dimdat))
Ws <- lapply(1:numclust, function(x) matrix(0, nrow = TT - l, ncol = dimdat))
uzs <- lapply(1:numclust, function(x) matrix(0, nrow = TT, ncol = dimdat))
uws <- lapply(1:numclust, function(x) matrix(0, nrow = TT - l, ncol = dimdat))
## For every cluster, run LA-ADMM
start.time = Sys.time()
for(iclust in 1:numclust){
resp.iclust <- lapply(resp, FUN = function(r) matrix(r[,iclust]))
## Possibly locally adaptive ADMM, for now just running with rho == lambda
res = la_admm_oneclust(K = (if(local_adapt) local_adapt_niter else 1),
local_adapt = local_adapt,
iclust = iclust,
niter = niter,
TT = TT, N = N, dimdat = dimdat, maxdev = maxdev,
schurA = schur_syl_A_by_clust[[iclust]],
schurB = schur_syl_B_by_clust[[iclust]],
sigmainv = sigmainv_list[[iclust]],
rho = rho_init,
rhoinit = rho_init,
## rho = (if(iclust == 1) rho_init else res$rho/2),
## rhoinit = (if(iclust == 1) rho_init else res$rho/2),
sigma = sigma,
lambda = lambda,
resp = resp.iclust,
resp_sum = resp.sum[,iclust],
l = l,
Dlp1 = Dlp1,
Dl = Dl,
tDl = tDl,
y = ylist,
err_rel = err_rel,
err_abs = err_abs,
zerothresh = zerothresh,
sigma_eig_by_clust = sigma_eig_by_clust,
em_iter = iter,
## Warm starts from previous *EM* iteration
first_iter = first_iter,
uw = uws[[iclust]],
uz = uzs[[iclust]],
z = Zs[[iclust]],
w = Ws[[iclust]])
## Store the results
mus[[iclust]] = res$mu
admm_niters[[iclust]] = res$kk
admm_inner_iters[[iclust]] = res$inner.iter
## Store other things for for warmstart
Zs[[iclust]] = res$Z
uzs[[iclust]] = res$uz
uws[[iclust]] = res$uw
Ws[[iclust]] = res$W
}
## Aggregate the yhats into one array
mu_array = array(NA, dim = c(TT, dimdat, numclust))
for(iclust in 1:numclust){ mu_array[,,iclust] = mus[[iclust]] }
## Each are lists of length |numclust|.
return(list(mns = mu_array,
admm_niters = admm_niters,
admm_inner_iters = admm_inner_iters,
## For warmstarts
Zs = Zs,
Ws = Ws,
uws = uws,
uzs = uzs,
N = N,
## Return the column
rho = res$rho,
## For using in the Sigma M step
ycentered_list = ycentered_list,
Xcentered_list = Xcentered_list,
yXcentered_list = yXcentered_list,
Qlist = Qlist
))
}

The locally adaptive ADMM for a single cluster involves an inner and an outer loop. The inner loop is an ADMM with a fixed step size \(\rho\). The outer loop, written here in la_admm_oneclust, repeatedly runs the inner ADMM, doubling \(\rho\) between outer iterations.
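The resulting step-size schedule over outer iterations is just geometric doubling; a tiny illustration of the schedule:

rho_init = 0.01
sapply(1:5, function(kk) rho_init * 2^(kk - 1)) ## 0.01 0.02 0.04 0.08 0.16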
#' Locally adaptive ADMM; wrapper to \code{admm_oneclust()}.
#'
#' @param K
#' @param ...
#'
#' @return
#'
#' @examples
la_admm_oneclust <- function(K, ...){
## Initialize arguments for ADMM.
args <- list(...)
p = args$p
l = args$l
TT = args$TT
dimdat = args$dimdat
rhoinit = args$rhoinit
## This initialization can come from the previous *EM* iteration.
if(args$first_iter){
mu = matrix(0, nrow = TT, ncol = dimdat)
Z <- matrix(0, nrow = TT, ncol = dimdat)
W <- matrix(0, nrow = TT-l, ncol = dimdat)
uz <- matrix(0, nrow = TT, ncol = dimdat)
uw <- matrix(0, nrow = TT - l, ncol = dimdat)
args[['mu']] <- mu
args[['z']] <- Z
args[['w']] <- W
args[['uz']] <- uz
args[['uw']] <- uw
}
some_admm_objectives = c()
## Run ADMM repeatedly with (1) double rho, and (2) previous b
for(kk in 1:K){
if(kk > 1){
## These ensure warm starts are true
args[['mu']] <- mu
args[['z']] <- Z
args[['w']] <- W
args[['uz']] <- uz
args[['uw']] <- uw
args[['rho']] <- rho
}
## Run ADMM
args[['outer_iter']] <- kk
## Call main function
argn <- lapply(names(args), as.name)
names(argn) <- names(args)
call <- as.call(c(list(as.name("admm_oneclust")), argn))
res = eval(call, args)
if(any(abs(res$mu)>1E2)){
stop("mu is blowing up! Probably because the initial ADMM step size (rho) is too large (and possibly the ball constraint on the means is large).")
}
some_admm_objectives = c(some_admm_objectives, res$single_admm_objective)
## Handling the scenario where the objectives are all zero
padding = 1E-12
some_admm_objectives = some_admm_objectives + padding
## See if outer iterations should terminate
if(res$converge){
res$converge <- T
break
}
## Update some parameters; double the rho value, and update the B matrix
rho = 2 * args$rho
mu = res$mu
Z = res$Z
W = res$W
uz = res$uz
uw = res$uw
}
if(!res$converge) {
## warning("ADMM didn't converge for one cluster.") ## Disabling this because it's not really something to fix. The only thing change a user would make is to make |rho_init| smaller
}
## Record how long the admm took; in terms of # iterations.
res$kk = kk
## Record the final rho
res$rho = args$rho
return(res)
}

Next, the inner loop. The workhorse admm_oneclust() actually performs the ADMM update for one cluster, for a fixed step size \(\rho\), across optimization iterations \(1, \dots, niter\).
This admm_oneclust() uses the following helpers:
- W_update_fused(): this calls the fused lasso solver prox_dp(), written in Rcpp.
- Z_update()
- U_update_Z()
- U_update_W()
A prox_dp function, written in C, will be used.
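Once compiled (the Rcpp source appears in Section 5.5.5 below), it can be exercised on its own. As a sketch: prox_dp(z, lam) solves \(\min_b \frac{1}{2}\|z - b\|_2^2 + \mathrm{lam} \sum_i |b_{i+1} - b_i|\), so a noisy step function should come back approximately piecewise constant.

set.seed(1)
z = c(rep(0, 20), rep(2, 20)) + rnorm(40, sd = 0.3)
b = prox_dp(z, lam = 1)
plot(z); lines(b, lwd = 2)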
#' Solve a fused lasso problem for the W update. Internally, a fused lasso dynamic
#' programming solver \code{prox()} (which calls \code{prox_dp()} written in C)
#' is used.
#'
W_update_fused <- function(l, TT, mu, uw, rho, lambda, Dl){
# modified lambda for fused lasso routine
mod_lam <- lambda/rho
# generating pseudo response xi
if( l < 0 ){
stop("l should never be /below/ zero!")
} else if( l == 0 ){
xi <- mu + 1/rho * uw ## This is faster
} else {
xi <- Dl %*% mu + 1/rho * uw
if(any(is.nan(xi))) stop("NaN values in the pseudo-response xi.")
## l = 2 is quadratic trend filtering
## l = 1 is linear trend filtering
## l = 0 is fused lasso
## D^{(1)} is first differences, so it corresponds to l=0
## Dl = gen_diff_mat(n = TT, l = l, x = x) <--- (T-l) x T matrix
}
## Running the fused LASSO
## which solves min_zhat 1/2 |z-zhat|_2^2 + lambda |D^{(1)}zhat|
fit <- prox_dp(z = xi, lam = mod_lam)
return(fit)
}
#' Z update: project the rows of (m + Uz/rho) onto the maxdev-sized ball.
Z_update <- function(m, Uz, C, rho){
mat = m + Uz/rho
Z = projCmat(mat, C)
return(Z)
}
#' Project rows of matrix \code{mat} into a ball of size \code{C}.
#'
#' @param mat Matrix whose rows will be projected into a C-sized ball.
#' @param C radius
#'
#' @return Projected matrix.
projCmat <- function(mat, C){
if(!is.null(C)){
vlens = sqrt(rowSums(mat * mat))
inds = which(vlens > C)
if(length(inds) > 0){
mat[inds,] = mat[inds,] * C / vlens[inds]
}
}
return(mat)
}

We will test the projCmat function.
testthat::test_that("Test the ball projection", {
set.seed(100)
mat = matrix(rnorm(100), ncol=2)
projected_mat = flowtrend:::projCmat(mat, 1)
testthat::expect_true(all(projected_mat %>% apply(1, function(myrow)sum(myrow*myrow)) < 1+1E-8))
})

## Test passed 🎉
The other parameter updates are done with these functions.
#' @param U (T x dimdat) matrix.
U_update_Z <- function(U, rho, mu, Z, TT){
# return(U + rho * (scale(mu, scale = F) - Z))
stopifnot(nrow(U) == TT)
centered_mu = sweep(mu, 2, colMeans(mu))
## stopifnot(all(abs(colMeans(centered_mu))<1E-8))
Unew = U + rho * (centered_mu - Z)
## Expect a (T x dimdat) matrix.
stopifnot(all(dim(U) == dim(Unew)))
stopifnot(nrow(U) == TT)
return(Unew)
}

#' @param U ((T-l) x dimdat) matrix.
U_update_W <- function(U, rho, mu, W, l, Dl, TT){
# l = 2 is quadratic trend filtering
# l = 1 is linear trend filtering
# l = 0 is fused lasso
# D^{(1)} is first differences, so it corresponds to l=0
# D^{(l+1)} is used for order-l trend filtering.
stopifnot(nrow(W) == TT - l)
Unew <- U + rho * (Dl %*% mu - W)
## Expect a (T-l) x dimdat matrix.
stopifnot(all(dim(U) == dim(Unew)))
stopifnot(nrow(U) == TT-l)
return(Unew)
}

Here is that main workhorse admm_oneclust().
#' One cluster's ADMM for a fixed step size (\code{rho}).
#'
#'
#' @export
#' @param iclust
#' @param niter
#' @param y
#' @param Dl
#' @param tDl
#' @param Dlp1
#' @param l Default: NULL
#' @param TT
#' @param N
#' @param dimdat
#' @param maxdev
#' @param rho
#' @param rhoinit Default: rho
#' @param Xinv
#' @param schurA
#' @param schurB
#' @param sigmainv
#' @param lambda
#' @param resp
#' @param resp_sum
#' @param ylist
#' @param err_rel Default: 1e-3
#' @param err_abs Default: 0
#' @param zerothresh
#' @param mu
#' @param z
#' @param w
#' @param uw
#' @param uz
#' @param first_iter
#' @param em_iter
#' @param outer_iter
#' @param local_adapt
#' @param sigma
#' @param sigma_eig_by_clust
#'
#' @return A list containing the updated parameters for the cluster.
#' @seealso \code{\link{la_admm_oneclust}} which is a wrapper over this function.
admm_oneclust <- function(iclust = 1, niter, y,
Dl, tDl, Dlp1, l = NULL,
TT, N, dimdat, maxdev,
rho,
rhoinit = rho,
Xinv,
schurA,
schurB,
sigmainv,
lambda,
resp,
resp_sum,
ylist, err_rel = 1e-3, err_abs = 0,
zerothresh,
mu,
z,
w,
uw,
uz,
first_iter,## Not used
em_iter,
outer_iter,
local_adapt,
sigma,
sigma_eig_by_clust){
## Initialize the variables ###
## resid_mat = matrix(NA, nrow = ceiling(niter/5), ncol = 4)
## colnames(resid_mat) = c("primresid", "primerr", "dualresid", "dualerr")
resid_mat = matrix(NA, nrow = ceiling(niter/5), ncol = 6)
colnames(resid_mat) = c("prim1", "prim2", "primresid", "primerr", "dualresid", "dualerr")
rhofac = rho / rhoinit
## This doesn't change over iterations
schurB = myschur(schurB$orig * rhofac) ## In flowmix, this is done on A. Here, it's done on B (in AX + XB + C = 0).
TA = schurA$T
TB = schurB$T
UA = schurA$Q
UB = schurB$Q
tUA = schurA$tQ
tUB = schurB$tQ
## This also doesn't change over iterations
C1 <- do.call(cbind, lapply(1:TT, FUN = function(tt){
multmat <- apply(y[[tt]], FUN = function(yy) yy * resp[[tt]], MARGIN = 2)
sigmainv %*% colSums(multmat)
}))
for(iter in 1:niter){
syl_C <- get_C_mat(C1 = C1, resp_sum = resp_sum, TT = TT, dimdat = dimdat,
Sigma_inv = sigmainv, N = N, Dl = Dl, rho = rho,
z = z, w = w, uz = uz, uw = uw, l=l)
FF = (-1) * tUA %*% syl_C %*% UB
mu = UA %*% matrix_function_solve_triangular_sylvester_barebonesC2(TA, TB, FF) %*% tUB
mu = t(mu)
stopifnot(nrow(mu) == TT)
stopifnot(ncol(mu) == dimdat)
centered_mu = sweep(mu, 2, colMeans(mu))
z <- Z_update(centered_mu, Uz = uz, C = maxdev, rho = rho)
if(any(abs(mu)>1E2)){
stop("mu is blowing up!")
## break
}
wlist = lapply(1:dimdat, function(j){
W_update_fused(l = l, TT = TT, mu = mu[, j, drop = TRUE],
rho = rho, lambda = lambda,
uw = uw[,j,drop=TRUE],
Dl = Dl)})
w <- do.call(cbind, wlist)
stopifnot(nrow(w) == TT-l)
stopifnot(ncol(w) == dimdat)
uz = U_update_Z(uz, rho, mu, z, TT)
## uw = U_update_W(uw, rho, mu, w, l, TT)
uw = U_update_W(uw, rho, mu, w, l, Dl, TT)
## Check convergence
if( iter > 1 & iter %% 5 == 0){## & !local_adapt){
## Calculate convergence criterion
obj = check_converge(mu, rho,
w, z,
w_prev, z_prev,
uw, uz,
Dl, tDl, err_rel = err_rel, err_abs = err_abs)
jj = (iter/ 5)
resid_mat[jj,] = c(
norm(obj$primal_resid1, "F"), ## Temp
norm(obj$primal_resid2, "F"), ## Temp
norm(obj$primal_resid, "F"),
obj$primal_err,
norm(obj$dual_resid,"F"),
obj$dual_err)
if(is.na(obj$converge)){
obj$converge = converge = FALSE
warning("Convergence was NA")
}
if(obj$converge){
converge = TRUE
break
} else {
converge = FALSE
}
}
w_prev = w
z_prev = z
}
if(FALSE){
## Calculate optimization objective values for this cluster.
obj.value <- objective_per_cluster(y = y, mu = mu, resp = resp,
Sigma_inv = sigmainv, TT = TT,
d = dimdat, Dlp1 = Dlp1, Dl = Dl, l = l,
maxdev = maxdev, lambda = lambda,
rho = rho, N = N)
## This is very expensive to do within each ADMM iteration, so it's commented out for now.
}
obj.value = NA
return(list(mu = mu,
resid_mat = resid_mat,
converge = converge,
## Other variables to return.
Z = z,
W = w,
uz = uz,
uw = uw,
inner.iter = iter,
single_admm_objective = obj.value))
}

5.5.4 Helpers for M step (\(\mu\))
First, let’s start with a few helper functions.
#' Builds the (T x T) centering matrix \(\sum_t (e_t - 1/T)(e_t - 1/T)^\top\).
#'
#' @param TT Number of time points.
etilde_mat <- function(TT){
mats <- lapply(1:TT, FUN = function(t){
e_vec <- rep(0, TT)
e_vec[t] <- 1
(e_vec - 1/TT) %*% t(e_vec - 1/TT)
})
Reduce('+', mats)
}

#' Compute the A and B matrices of the Sylvester equation for the mu update.
#'
#' @param y Data (currently unused; kept for a uniform interface).
get_AB_mats <- function(y, resp, Sigma_inv, e_mat, Dlsqrd, N, Dlp1, Dl, rho, z, w, uz, uw){
# A matrix
A <- 1/N * Sigma_inv
# B matrix
sum_resp <- sapply(resp, sum)
#B <- rho*(t(Dl)%*%Dl + e_mat)%*% diag(1/unlist(sum_resp))
B <- rho*(Dlsqrd + e_mat)
B <- B/sum_resp[col(B)]
#B <- rho*(Dlsqrd + e_mat) %*% diag(1/unlist(sum_resp))
return(list(A = A, B = B))
}

#' Compute the C matrix of the Sylvester equation for the mu update.
#' @param C1 First (data-driven) component of the C matrix.
get_C_mat <- function(C1, resp_sum, TT, dimdat, Sigma_inv, e_mat, N, Dlp1, Dl,
rho, z, w, uz, uw, l){
C2 <- t(uz - rho*z)
# averaging
C2 <- C2 - rowMeans(C2)
# third component
C3 <- do.call(rbind, lapply(1:dimdat, FUN = function(j){
((uw[,j, drop=TRUE] - rho * w[,j,drop=TRUE]) %*% Dl) %>% as.numeric()
}))
# combining
C <- (-1/N * C1 + C2 + C3)
C <- C/resp_sum[col(C)]
return(C)
}

#' Schur decomposition, augmented with the transpose of Q and the original matrix.
#' @param mat Matrix to Schur-decompose.
myschur <- function(mat){
stopifnot(nrow(mat) == ncol(mat))
if(is.numeric(mat) & length(mat)==1) mat = mat %>% as.matrix()
obj = Matrix::Schur(mat)
obj$tQ = t(obj$Q)
obj$orig = mat
return(obj)
}

Here’s a function to check convergence of the ADMM with a fixed step size.
#' Check convergence of ADMM with a fixed step size (rho).
check_converge <- function(mu, rho, w, z, w_prev, z_prev, uw, uz, Dl, tDl,
err_rel = 1E-4,
err_abs = 0
){
## Constraints are: Ax + Bz =c, where x = mu, z =(w, z)
prim1 = rbind(Dl %*% mu, mu - colMeans(mu))
prim2 = rbind(-w, -z)
primal_resid = prim1 + prim2 ## Ax + Bz - c
change_z = z - z_prev
change_w = w - w_prev
dual_resid = rho * (-(change_z - colMeans(change_z)) - tDl %*% change_w)
tAU = (uz - colMeans(uz)) + tDl %*% uw
## Form primal and dual tolerances.
primal_err = sqrt(length(primal_resid)) * err_abs +
err_rel * max(norm(prim1, "F"), norm(prim2, "F"))
dual_err = sqrt(length(dual_resid)) * err_abs +
err_rel * norm(tAU, "F")
## Check convergence.
primal_resid_size = norm(primal_resid, "F")
dual_resid_size = norm(dual_resid, "F")
primal_converge = ( primal_resid_size <= primal_err )
dual_converge = ( dual_resid_size <= dual_err )
## Some checks (trying to address problems with |converge|).
assertthat::assert_that(is.numeric(primal_resid_size))
assertthat::assert_that(is.numeric(primal_err))
assertthat::assert_that(is.numeric(dual_resid_size))
assertthat::assert_that(is.numeric(dual_err))
## return(primal_converge & dual_converge)
converge = primal_converge & dual_converge
return(list(
primal_resid1 = prim1,
primal_resid2 = prim2,
primal_resid = primal_resid,
primal_err = primal_err,
dual_resid = dual_resid,
dual_err = dual_err,
converge = converge))
}

Here’s a function to compute the augmented Lagrangian of the M-step (this is not used for now).
#' computes the Augmented lagrangian.
aug_lagr <- function(y, TT, d, z, w, l, uz, uw, mu, resp, Sigma_inv, Dlp1, Dl, maxdev, lambda, rho, N){
mu_dd <- rowMeans(mu)
# Check the Z's for ball constraint, up to a tolerance of 1e-4
znorms <- apply(z, FUN = function(zz) sqrt(sum(zz^2)), MARGIN = 1)
if(any(is.na(znorms))) stop("NA values in the norms of z.")
if(any(znorms > (maxdev + 1e-4))){
warning("||z||_2 > maxdev, current iterate not feasible.")
return(Inf)
}
aug1 <- sum(do.call(cbind, lapply(1:TT, FUN = function(t){
multmat <- apply(y[[t]], FUN = function(yy){
t(yy - mu[,t]) %*% Sigma_inv %*% (yy - mu[,t])}, MARGIN = 1)
sum(1/(2*N) * resp[[t]]*multmat)
})))
aug2 <- lambda*sum(do.call(rbind, lapply(1:d, FUN = function(j)
sum(abs(diff(w[j,], differences = 1))))))
aug3 <- sum(do.call(cbind, lapply(1:TT, FUN = function(t){
uz[t,]%*%(mu[,t] - mu_dd - z[t,]) + rho/2 * sum((mu[,t] - mu_dd - z[t,])^2)
})))
aug4 <- sum(do.call(rbind, lapply(1:d, FUN = function(j){
uw[,j] %*% (Dl %*% mu[j,] - w[j,]) + rho/2 * sum((Dl %*% mu[j,] - w[j,])^2)
})))
total <- aug1 + aug2 + aug3 + aug4
return(total)
}

Here’s a function to calculate the objective value for the ADMM.
\[\frac{1}{2N} \sum_{t=1}^T \sum_{i=1}^{n_t} \hat{\gamma}_{it} (y_i^{(t)} - \mu_{\cdot t})^T \hat{\Sigma}^{-1} ( y_i^{(t)} - \mu_{\cdot t}) + \lambda \sum_{j=1}^d \|D^{(l+1)}\mu_{j\cdot }\|_1\]
(This is the penalized surrogate objective \(Q_{\tilde \theta}(\mu, \Sigma, \pi) + \lambda \sum_{j=1}^d \|D^{(l+1)} \mu_{j \cdot}\|_1\), keeping only the parts that depend on \(\mu\), and leaving out the constant \(C=-\frac{d}{2}\log(2\pi) - \frac{1}{2} \log \det(\Sigma_k)\), since \(C\) doesn’t change over ADMM iterations.)
#' computes the ADMM objective for one cluster
objective_per_cluster <- function(y, TT, d, l, mu, resp, Sigma_inv, Dlp1, Dl,
maxdev, lambda, rho, N){
aug1 <- sum(do.call(cbind, lapply(1:TT, FUN = function(tt){
multmat <- apply(y[[tt]], FUN = function(yy){
t(yy - mu[tt,]) %*% Sigma_inv %*% (yy - mu[tt,])}, MARGIN = 1)
sum(1/(2*N) * resp[[tt]] * multmat)
})))
aug2 <- lambda*sum(do.call(rbind, lapply(1:d, FUN = function(j)
sum(abs(diff(mu[,j], differences = l+1)))))) ## (l+1)-th differences, matching the penalty above
total <- aug1 + aug2
return(total)
}

5.5.5 All Rcpp functions
The main function we write in Rcpp is the “barebones” Sylvester equation solver that takes upper-triangular coefficient matrices.
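Once compiled, it can be sanity-checked against a direct residual computation (a sketch using myschur() from above; symmetric positive-definite inputs keep the real Schur factors triangular and the diagonal sums nonzero):

set.seed(1)
A = crossprod(matrix(rnorm(9), 3, 3)) + diag(3)
B = crossprod(matrix(rnorm(16), 4, 4)) + diag(4)
sA = myschur(A); sB = myschur(B)
C = matrix(rnorm(12), 3, 4)
FF = sA$tQ %*% C %*% sB$Q
X = sA$Q %*% matrix_function_solve_triangular_sylvester_barebonesC2(sA$T, sB$T, FF) %*% sB$tQ
max(abs(A %*% X + X %*% B - C)) ## ~ 0, since the solver handles TA X + X TB = C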
// [[Rcpp::depends(RcppArmadillo)]]
// [[Rcpp::depends(RcppEigen)]]
#include <RcppArmadillo.h>
#include <RcppEigen.h>
#include <numeric>
using namespace arma;
using namespace Eigen;
using Eigen::Map; // 'maps' rather than copies
using Eigen::MatrixXd; // variable size matrix, double precision
//' Solve "barebones" sylvester equation that takes upper triangular matrices as coefficients.
//'
//' @param TA Upper-triangular matrix
//' @param TB Upper-triangular matrix
//' @param C matrix
//' @export
// [[Rcpp::export]]
Eigen::MatrixXd matrix_function_solve_triangular_sylvester_barebonesC2(const Eigen::MatrixXd & TA,
const Eigen::MatrixXd & TB,
const Eigen::MatrixXd & C){
// Eigen::eigen_assert(TA.rows() == TA.cols());
// Eigen::eigen_assert(TA.Eigen::isUpperTriangular());
// Eigen::eigen_assert(TB.rows() == TB.cols());
// Eigen::eigen_assert(TB.Eigen::isUpperTriangular());
// Eigen::eigen_assert(C.rows() == TA.rows());
// Eigen::eigen_assert(C.cols() == TB.rows());
// typedef typename MatrixType::Index Index;
// typedef typename MatrixType::Scalar Scalar;
int m = TA.rows();
int n = TB.rows();
Eigen::MatrixXd X(m, n);
for (int i = m - 1; i >= 0; --i) {
for (int j = 0; j < n; ++j) {
// Compute T_A X = \sum_{k=i+1}^m T_A_{ik} X_{kj}
double TAX;
if (i == m - 1) {
TAX = 0;
} else {
MatrixXd TAXmatrix = TA.row(i).tail(m-1-i) * X.col(j).tail(m-1-i);
TAX = TAXmatrix(0,0);
}
// Compute X T_B = \sum_{k=1}^{j-1} X_{ik} T_B_{kj}
double XTB;
if (j == 0) {
XTB = 0;
} else {
MatrixXd XTBmatrix = X.row(i).head(j) * TB.col(j).head(j);
XTB = XTBmatrix(0,0);
}
X(i,j) = (C(i,j) - TAX - XTB) / (TA(i,i) + TB(j,j));
}
}
return X;
}

Also, we define a faster dmvnorm function written in C++.
// [[Rcpp::depends(RcppArmadillo)]]
#include <RcppArmadillo.h>
static double const log2pi = std::log(2.0 * M_PI);
/* C++ version of the dtrmv BLAS function */
void inplace_tri_mat_mult(arma::rowvec &x, arma::mat const &trimat){
arma::uword const n = trimat.n_cols;
for(unsigned j = n; j-- > 0;){
double tmp(0.);
for(unsigned i = 0; i <= j; ++i)
tmp += trimat.at(i, j) * x[i];
x[j] = tmp;
}
}
// [[Rcpp::export]]
arma::vec dmvnorm_arma_fast(arma::mat const &x,
arma::rowvec const &mean,
arma::mat const &sigma,
bool const logd = false) {
using arma::uword;
uword const n = x.n_rows,
xdim = x.n_cols;
arma::vec out(n);
arma::mat const rooti = arma::inv(trimatu(arma::chol(sigma)));
double const rootisum = arma::sum(log(rooti.diag())),
constants = -(double)xdim/2.0 * log2pi,
other_terms = rootisum + constants;
arma::rowvec z;
for (uword i = 0; i < n; i++) {
z = (x.row(i) - mean);
inplace_tri_mat_mult(z, rooti);
out(i) = other_terms - 0.5 * arma::dot(z, z);
}
if (logd)
return out;
return exp(out);
}
// All credit goes to https://gallery.rcpp.org/articles/dmvnorm_arma/

// [[Rcpp::depends(RcppArmadillo)]]
#include <RcppArmadillo.h>
#include <vector>
#include <cmath> // For standard C++ functions if needed, though not strictly required here
using namespace std;
// Dynamic programming algorithm for the 1d fused lasso problem
// (Ryan Tibshirani's implementation of Nick Johnson's algorithm)
void prox_dp_core_cpp(
int n,
const double *y_ptr, // Use const since input y is not modified
double lam,
double *beta_ptr)
{
// Take care of a few trivial cases
if (n == 0) return;
if (n == 1 || lam == 0) {
for (int i = 0; i < n; i++) beta_ptr[i] = y_ptr[i];
return;
}
// Use std::vector for automatic memory management (RAII) instead of malloc/free
// These store the derivative knots (x) and coefficients (a, b)
std::vector<double> x(2 * n);
std::vector<double> a(2 * n);
std::vector<double> b(2 * n);
int l, r;
// These are the knots of the back-pointers
std::vector<double> tm(n - 1);
std::vector<double> tp(n - 1);
// Aliases to make the original logic clearer (using ptrs to vector data)
double *x_data = x.data();
double *a_data = a.data();
double *b_data = b.data();
double *tm_data = tm.data();
double *tp_data = tp.data();
double afirst, alast, bfirst, blast;
// We step through the first iteration manually
tm_data[0] = -lam + y_ptr[0];
tp_data[0] = lam + y_ptr[0];
l = n - 1;
r = n;
x_data[l] = tm_data[0];
x_data[r] = tp_data[0];
a_data[l] = 1;
b_data[l] = -y_ptr[0] + lam;
a_data[r] = -1;
b_data[r] = y_ptr[0] + lam;
afirst = 1;
bfirst = -lam - y_ptr[1];
alast = -1;
blast = -lam + y_ptr[1];
// Now iterations 2 through n-1
int lo, hi;
double alo, blo, ahi, bhi;
for (int k = 1; k < n - 1; k++) {
// Compute lo: step up from l until the
// derivative is greater than -lam
alo = afirst;
blo = bfirst;
for (lo = l; lo <= r; lo++) {
if (alo * x_data[lo] + blo > -lam) break;
alo += a_data[lo];
blo += b_data[lo];
}
// Compute the negative knot
tm_data[k] = (-lam - blo) / alo;
l = lo - 1;
x_data[l] = tm_data[k];
// Compute hi: step down from r until the
// derivative is less than lam
ahi = alast;
bhi = blast;
for (hi = r; hi >= l; hi--) {
if (-ahi * x_data[hi] - bhi < lam) break;
ahi += a_data[hi];
bhi += b_data[hi];
}
// Compute the positive knot
tp_data[k] = (lam + bhi) / (-ahi);
r = hi + 1;
x_data[r] = tp_data[k];
// Update a and b
a_data[l] = alo;
b_data[l] = blo + lam;
a_data[r] = ahi;
b_data[r] = bhi + lam;
afirst = 1;
bfirst = -lam - y_ptr[k + 1];
alast = -1;
blast = -lam + y_ptr[k + 1];
}
// Compute the last coefficient: this is where
// the function has zero derivative
alo = afirst;
blo = bfirst;
for (lo = l; lo <= r; lo++) {
if (alo * x_data[lo] + blo > 0) break;
alo += a_data[lo];
blo += b_data[lo];
}
beta_ptr[n - 1] = -blo / alo;
// Compute the rest of the coefficients, by the
// back-pointers
for (int k = n - 2; k >= 0; k--) {
if (beta_ptr[k + 1] > tp_data[k]) beta_ptr[k] = tp_data[k];
else if (beta_ptr[k + 1] < tm_data[k]) beta_ptr[k] = tm_data[k];
else beta_ptr[k] = beta_ptr[k + 1];
}
// Memory cleanup is now automatic because std::vector is used.
}
//' Implements the dynamic programming algorithm for the 1D Fused Lasso proximal operator.
//'
//' @param z A numeric vector of input values ($\mathbf{y}$).
//' @param lam A numeric scalar, the regularization parameter ($\lambda$).
//' @return A numeric vector, the result of the proximal operation ($\boldsymbol{\beta}$).
//' @export
// [[Rcpp::export]]
arma::vec prox_dp(arma::vec z, double lam) {
// Set up the output vector
int n = z.n_elem;
arma::vec beta = arma::zeros<arma::vec>(n);
// Call the core C++ logic
// z.memptr() provides a pointer to the internal data (const for input)
// beta.memptr() provides a pointer to the internal data (non-const for output)
prox_dp_core_cpp(n, z.memptr(), lam, beta.memptr());
return beta;
}

We need this blob (from https://github.com/jacobbien/litr-project/blob/main/examples/make-an-r-package-with-armadillo/create-witharmadillo.Rmd ):
## ℹ Leaving 'src/code.cpp' unchanged.
## ☐ Edit 'src/code.cpp'.
## ✔ Adding RcppArmadillo to 'LinkingTo' field in DESCRIPTION.
## ✔ Created 'src/Makevars' and 'src/Makevars.win' with requested compilation settings.
## ℹ Leaving 'src/code.cpp' unchanged.
## ☐ Edit 'src/code.cpp'.
## ✔ Adding RcppEigen to 'LinkingTo' field in DESCRIPTION.
## ✔ Adding "@import RcppEigen" to 'R/flowtrend-package.R'.
## ☐ Run `devtools::document()` to update 'NAMESPACE'.
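Before moving on, here is a quick sanity check of the fast density against mvtnorm::dmvnorm() (a sketch; it assumes the mvtnorm package is available):

set.seed(1)
y = matrix(rnorm(20), ncol = 2)
mu = c(0, 0); S = diag(2)
max(abs(dmvnorm_arma_fast(y, mu, S, FALSE) - mvtnorm::dmvnorm(y, mu, S))) ## ~ 0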
#' Used as a fallback or to test against \code{Mstep_mu()}.
#'
#' @param ylist Data.
#' @param resp Responsibilities for one cluster.
#' @param lambda Regularization parameter for the means.
#' @param l Trend filtering degree.
#' @param Sigma_inv inverse of Sigma
#' @param x covariates
Mstep_mu_cvxr <- function(ylist,
resp,
lambda,
l,
Sigma_inv,
x = NULL,
thresh = 1E-8,
maxdev = NULL,
dimdat,
N,
ecos_thresh = 1E-8,
scs_eps = 1E-5){
## Define dimensions
TT = length(ylist)
## Responsibility Weighted Data
ytildes <- lapply(1:TT, FUN = function(tt){
yy <- ylist[[tt]]
g <- resp[[tt]]
yy <- apply(yy, MARGIN = 2, FUN = function(x) x * g)
yrow = matrix(c(NA, NA), nrow=1, ncol=dimdat)
yrow[1,] = colSums(yy)
yrow
}) %>% do.call(rbind, .)
## Auxiliary term, needed to make the objective interpretable
aux.y <- Reduce("+", lapply(1:TT, FUN = function(tt){
yy <- ylist[[tt]]
g <- sqrt(resp[[tt]])
yy <- apply(yy, MARGIN = 2, FUN = function(x) x * g)
sum(diag(yy %*% Sigma_inv %*% t(yy)))
}))
## Mu, d x T matrix
mumat <- CVXR::Variable(cols=dimdat, rows=TT)
## Summed sqrt responsibilities - needed in the objective.
resp.sum.sqrt <- lapply(resp, FUN = function(x) sqrt(sum(x)))
## Differencing Matrix, (TT-(l+1)) x TT
Dlp1 <- gen_diff_mat(n = TT, l = l+1, x = x)
# l = 2 is quadratic trend filtering
# l = 1 is linear trend filtering
# l = 0 is fused lasso
## Forming the objective
obj = 1/(2*N) *( Reduce("+", lapply(1:TT, FUN = function(tt) CVXR::quad_form(t(resp.sum.sqrt[[tt]]*mumat[tt,]), Sigma_inv))) -2 * Reduce("+", lapply(1:TT, FUN = function(tt) t(ytildes[tt,]) %*% Sigma_inv %*% t(mumat[tt,]))) + aux.y) + lambda * sum(CVXR::sum_entries(abs(Dlp1 %*% mumat), axis = 1))
## Putting together the ball constraint
rowmns <- matrix(rep(1, TT^2), nrow = TT)/TT
mu_dotdot <- rowmns %*% mumat
constraints = list()
if(!is.null(maxdev)){
constraints = list(CVXR::sum_entries(CVXR::square(mumat - mu_dotdot), axis = 2) <= rep(maxdev^2, TT) )
}
## Try the two CVXR solvers in turn.
prob <- CVXR::Problem(CVXR::Minimize(obj), constraints)
result = NULL
result <- tryCatch({
CVXR::solve(prob, solver="ECOS",
FEASTOL = ecos_thresh, RELTOL = ecos_thresh, ABSTOL = ecos_thresh)
}, error=function(err){
err$message = paste(err$message,
"\n", "Lasso solver using ECOS has failed." ,sep="")
cat(err$message, fill=TRUE)
return(NULL)
})
## If anything is wrong, flag to use SCS solver
scs = FALSE
if(is.null(result)){
scs = TRUE
} else {
if(result$status != "optimal") scs = TRUE
}
## Use the SCS solver
if(scs){
result = CVXR::solve(prob, solver="SCS", eps = scs_eps)
if(any(is.na(result$getValue(mumat)))){ ## A clumsy way to check.
stop("Lasso solver using both ECOS and SCS has failed.", sep="")
}
}
## Record Interesting Parameters
num_iters <- result$num_iters
status <- result$status
mumat <- result$getValue(mumat)
val <- result$value
return(list(mu = mumat, value = val, status = status, num_iters = num_iters))
}

This function solves the following problem:
\[ \begin{align*} &\text{minimize}_{\mu} {\frac{1}{2N} \sum_{t=1}^T \sum_{i=1}^{n_t} \hat{\gamma}_{it} (y_i^{(t)} - \mu_{\cdot t})^\top \hat{\Sigma}^{-1} ( y_i^{(t)} - \mu_{\cdot t}) + \lambda \sum_{j=1}^d \|D^{(l+1)}\mu_{j\cdot }\|_1}\\ &\text{subject to}\;\; {\| \mu_{\cdot t} - \bar{\mu}_{\cdot \cdot}\|_2 \le r \;\;\forall t=1,\cdots, T, } \end{align*} \]
and is directly equivalent to Mstep_mu(). The resulting solution and the
objective value should be the same. Let’s check that.
First, set up some objects to run Mstep_mu() and Mstep_mu_cvxr().
numclust = 3
TT = 100
dimdat = 1
set.seed(0)
dt = gendat_1d(TT = TT, ntlist = rep(TT, 100))
ylist = dt %>% dt2ylist()
sigma = init_sigma(ylist, numclust) ## (K x dimdat x dimdat)
mn = init_mn(ylist, numclust, TT, dimdat)##, countslist = countslist)
prob = matrix(1/numclust, nrow = TT, ncol = numclust) ## Initialize to all 1/K.
resp = Estep(mn, sigma, prob, ylist = ylist, numclust)
lambda = .01
l = 1
x = 1:TT
Dlp1 = gen_diff_mat(n = TT, l = l+1, x = x)
Dl = gen_diff_mat(n = TT, l = l, x = x)
Dlsqrd <- t(Dl) %*% Dl
maxdev = NULL

Then, compare the result of the two implementations. They should look identical.
## overall ADMM
N = sum(sapply(ylist, nrow)) ## Total number of particles
res1 = Mstep_mu(resp, ylist, lambda, l=l, sigma=sigma, Dlsqrd = Dlsqrd, Dl=Dl, Dlp1=Dlp1, TT=TT, N=N, dimdat=dimdat, e_mat=etilde_mat(TT = TT),
maxdev = maxdev)
mn1 = res1$mns
## CVXR just ONE cluster
res2list = lapply(1:numclust, function(iclust){
Sigma_inv_oneclust = solve(sigma[iclust,,])
resp_oneclust = lapply(resp, function(resp_onetime){resp_onetime[,iclust, drop=FALSE]})
N = sum(unlist(resp))
res2 = Mstep_mu_cvxr(ylist, resp_oneclust,
lambda, l, Sigma_inv_oneclust,
thresh = 1E-8, maxdev = maxdev, dimdat, N)
res2$mu
})
mn2 = array(NA, dim=c(100, dimdat, 3))
for(iclust in 1:numclust){
mn2[,,iclust] = res2list[[iclust]] %>% as.matrix()
}
## Plot them
plot(x = dt$time, y=dt$Y, col = rgb(0,0,0,0.04),
main = 'admm (solid) vs cvxr (dashed)',
ylab = "", xlab = "time")
mn1[,1,] %>% matlines(lwd = 1, lty = 1)
mn2[,1,] %>% matlines(lwd = 3, lty = 3)

Next, do this for uneven inputs.
## Setup
numclust = 3
TT = 100
dimdat = 1
set.seed(0)
dt = gendat_1d(TT = TT, ntlist = rep(TT, 100))
ylist = dt %>% dt2ylist()
## Subset them
ylist = ylist[-seq(from=10,to=100,by=10)]
x = (1:100)[-seq(from=10,to=100,by=10)]
TT = length(ylist) ## Update TT after subsetting
sigma = init_sigma(ylist, numclust) ## (K x dimdat x dimdat)
mn = init_mn(ylist, numclust, TT, dimdat)##, countslist = countslist)
prob = matrix(1/numclust, nrow = TT, ncol = numclust) ## Initialize to all 1/K.
resp = Estep(mn, sigma, prob, ylist = ylist, numclust)
lambda = .1
l = 2
##x = 1:TT
Dlp1 = gen_diff_mat(n = length(x), l = l+1, x = x)
Dl = gen_diff_mat(n = length(x), l = l, x = x)
Dlsqrd <- t(Dl) %*% Dl
maxdev = NULL
## Try the algorithm itelf.
set.seed(100)
obj = flowtrend_once(ylist = ylist, x = x, lambda = .1, lambda_prob = .1,
l = 2, l_prob = 2, maxdev = 5, numclust = 3,
rho_init = 0.01, verbose = TRUE)
plot_1d(ylist=ylist, obj=obj)
plot(obj$objectives, type ='l')
mn1 = obj$mn
## Third differences of the fitted means; trend filtering with l = 2 should
## make these nearly sparse.
mn1[,1,] %>% diff() %>% diff() %>% diff() %>% matplot(type='l')
## For comparison, the initial means and their third differences, which are
## not at all sparse.
mn[,1,] %>% diff() %>% diff() %>% diff() %>% matplot(type='l')
mn[,1,] %>% matplot(type='l')
## The initial means are quite irregular (cluster 3 especially).
## Plot the data along with the fitted cluster means.
plot(x = dt$time, y = dt$Y, col = rgb(0,0,0,0.04),
main = "flowtrend fit on unevenly spaced inputs",
ylab = "", xlab = "time")
mn1[,1,] %>% matlines(x = x, lwd = 2, lty = 1)
5.6 The main “flowtrend” function
Now that we have assembled all of the ingredients, we can build the main function
flowtrend_once(), which estimates a flowtrend model from a single initialization. Here goes:
#' Estimate flowtrend model once.
#'
#' @param ylist Data.
#' @param countslist Counts corresponding to multiplicities.
#' @param x Times at which the data are observed, if points are not evenly
#'   spaced. Defaults to NULL, in which case it is set to \code{1:T}, where
#'   \code{T = length(ylist)}.
#' @param numclust Number of clusters.
#' @param niter Maximum number of EM iterations.
#' @param l Degree of differencing for the mean trend filtering. \code{l=0}
#'   gives piecewise-constant means; \code{l=1} gives piecewise-linear means,
#'   and so forth.
#' @param l_prob Degree of differencing for the probability trend filtering.
#' @param mn Initial value for cluster means. Defaults to NULL, in which case
#' initial values are randomly chosen from the data.
#' @param lambda Smoothing parameter for means
#' @param lambda_prob Smoothing parameter for probabilities
#' @param verbose Loud or not? EM iteration progress is printed.
#' @param tol_em Relative numerical improvement of the objective value at which
#' to stop the EM algorithm
#' @param maxdev Maximum deviation of cluster means across time.
#' @param countslist_overwrite Optional replacement for \code{countslist};
#'   used only for blocked cross-validation.
#' @param admm Use the ADMM-based M step for the cluster means?
#' @param admm_err_rel Relative convergence tolerance for the ADMM.
#' @param admm_err_abs Absolute convergence tolerance for the ADMM.
#' @param admm_local_adapt Use the locally adaptive ADMM?
#' @param admm_local_adapt_niter Number of outer iterations for the locally
#'   adaptive ADMM.
#' @param rho_init Initial value of the ADMM penalty parameter \code{rho}.
#' @param check_convergence Stop the EM early once the relative change in the
#'   objective falls below \code{tol_em}?
#' @param seed Optional random seed: a length-7 integer vector in the style of
#'   \code{.Random.seed}, used when initializing cluster means.
#'
#' @return List object with flowtrend model estimates.
#' @export
#'
#' @examples
flowtrend_once <- function(ylist,
countslist = NULL,
x = NULL,
numclust, niter = 1000, l, l_prob = NULL,
mn = NULL, lambda = 0, lambda_prob = NULL, verbose = FALSE,
tol_em = 1E-4,
maxdev = NULL,
countslist_overwrite = NULL,
## beta Mstep (ADMM) settings
admm = TRUE,
admm_err_rel = 1E-3,
admm_err_abs = 1E-4,
## Mean M step (Locally Adaptive ADMM) settings
admm_local_adapt = TRUE,
admm_local_adapt_niter = if(admm_local_adapt) 10 else 1,
rho_init = 0.1,
## Other options
check_convergence = TRUE,
## Random seed
seed = NULL){
## Basic checks
if(!is.null(maxdev)){
assertthat::assert_that(maxdev!=0)
} else {
maxdev = 1E10
}
assertthat::assert_that(numclust > 1)
assertthat::assert_that(niter > 1)
if(is.null(countslist)){
ntlist = sapply(ylist, nrow)
countslist = lapply(ntlist, FUN = function(nt) rep(1, nt))
}
if(!is.null(seed)){
assertthat::assert_that(is.integer(seed))
assertthat::assert_that(length(seed) == 7)
}
## Setup for EM algorithm
TT = length(ylist)
dimdat = ncol(ylist[[1]])
if(is.null(x)) x <- 1:TT
if(is.unsorted(x)) stop("x must be ordered!")
## l = 2 is quadratic trend filtering,
## l = 1 is linear trend filtering,
## l = 0 is the fused lasso;
## D^{(1)} is the first-difference matrix, so it corresponds to l = 0.
Dlp1 = gen_diff_mat(n = TT, l = l+1, x = x)
if(l > 1){
## For unevenly spaced inputs, rescale the l-th differences by
## l / (x_{i+l} - x_i), following the standard trend filtering recursion.
facmat = diag(l / diff(x, lag = l))
} else {
facmat = diag(rep(1, TT-l))
}
Dl = facmat %*% gen_diff_mat(n = TT, l = l, x = x)
tDl = t(Dl)
Dlsqrd <- t(Dl) %*% Dl
e_mat <- etilde_mat(TT = TT) # needed to generate B
Dlp1_prob = gen_diff_mat(n = TT, l = l_prob+1, x = x)
H_tf <- gen_tf_mat(n = TT, k = l_prob, x = x)
if(is.null(mn)){
mn = init_mn(ylist, numclust, TT, dimdat,
countslist = countslist, seed = seed)
}
ntlist = sapply(ylist, nrow)
N = sum(ntlist)
## Initialize some objects
prob = matrix(1/numclust, nrow = TT, ncol = numclust) ## Initialize to all 1/K.
denslist_by_clust <- NULL
objectives = c(+1E20, rep(NA, niter-1))
sigma_fac <- diff(range(do.call(rbind, ylist)))/8
sigma = init_sigma(ylist, numclust, sigma_fac) ## (T x numclust x (dimdat x dimdat))
sigma_eig_by_clust = NULL
zero.betas = zero.alphas = list()
## A workaround, used only for blocked cross-validation.
if(!is.null(countslist_overwrite)) countslist = countslist_overwrite
latest_rho = NA
start.time = Sys.time()
for(iter in 2:niter){
if(verbose){
print_progress(iter-1, niter-1, "EM iterations.", start.time = start.time, fill = FALSE)
}
resp <- Estep(mn, sigma, prob, ylist = ylist, numclust = numclust,
denslist_by_clust = denslist_by_clust,
first_iter = (iter == 2), countslist = countslist,
padding = 1E-8)
## M step (three parts)
## 1. Means
res_mu = Mstep_mu(resp, ylist,
lambda = lambda,
first_iter = (iter == 2),
l = l, Dlp1 = Dlp1, Dl = Dl,
tDl = tDl,
Dlsqrd = Dlsqrd,
sigma_eig_by_clust = sigma_eig_by_clust,
sigma = sigma, maxdev = maxdev,
e_mat = e_mat,
Zs = NULL,
Ws = NULL,
uws = NULL,
uzs = NULL,
x = x,
err_rel = admm_err_rel,
err_abs = admm_err_abs,
local_adapt = admm_local_adapt,
local_adapt_niter = admm_local_adapt_niter,
rho_init = rho_init,
iter = iter)
## rho_init = (if(iter == 2) rho_init else latest_rho))
## latest_rho = res_mu$rho
mn = res_mu$mns
## 2. Sigma
sigma = Mstep_sigma(resp, ylist, mn, numclust)
## 3. Probabilities
prob_link = Mstep_prob(resp, countslist = countslist, H_tf = H_tf,
lambda_prob = lambda_prob, l_prob = l_prob, x = x)
prob = softmax(prob_link) ## Map link values back onto the probability simplex, row-wise.
objectives[iter] = objective(ylist = ylist, mu = mn, sigma = sigma, prob = prob, prob_link = prob_link,
lambda = lambda, Dlp1 = Dlp1, l = l, countslist = countslist,
Dlp1_prob = Dlp1_prob,
l_prob = l_prob,
lambda_prob = lambda_prob)
## Check convergence
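## (Stop once the relative improvement of the objective, as measured by
## check_converge_rel(), has stayed below tol_em for three consecutive
## iterations; requiring three flat steps guards against stopping early
## on a single slow iteration.)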
if(iter > 10){
if(check_convergence &&
check_converge_rel(objectives[iter-1], objectives[iter], tol = tol_em) &&
check_converge_rel(objectives[iter-2], objectives[iter-1], tol = tol_em) &&
check_converge_rel(objectives[iter-3], objectives[iter-2], tol = tol_em)){
## check_converge_rel(objectives[iter-4], objectives[iter-3], tol = tol_em)){
break
}
}
}
return(structure(list(mn = mn,
prob = prob,
prob_link = prob_link,
sigma = sigma,
objectives = objectives[2:iter],
final.iter = iter,
## Above is output, below are data/algorithm settings.
dimdat = dimdat,
TT = TT,
N = N,
l = l,
l_prob = l_prob,
x = x,
numclust = numclust,
lambda = lambda,
lambda_prob = lambda_prob,
maxdev = maxdev,
niter = niter
), class = "flowtrend"))
}
Next, flowtrend() is the main user-facing function.
#' Main function. Repeats the EM algorithm (\code{flowtrend_once()}) over \code{nrestart} random restarts (5 by default) and returns the best fit.
#'
#' @param ... Arguments for \code{flowtrend_once()}.
#' @param nrestart Number of random restarts.
#'
#' @return The model (among the \code{nrestart} fits) with the lowest final
#'   objective value, with the objective traces of all restarts attached as
#'   \code{all_objectives}.
#' @export
#'
#' @examples
flowtrend <- function(..., nrestart = 5){
args = list(...)
verbose = isTRUE(args[["verbose"]])
if(verbose) cat("EM will restart", nrestart, "times", fill=TRUE)
out_models <- lapply(1:nrestart, FUN = function(irestart){
if("verbose" %in% names(args)){
if(args$verbose){
cat("EM restart:", irestart, fill=TRUE)
}
}
model_temp <- flowtrend_once(...)
model_obj <- tail(model_temp$objectives, n = 1)
if("verbose" %in% names(args)){
if(args$verbose){
cat(fill=TRUE)
}
}
return(list(model = model_temp, final_objective = model_obj))
})
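## Pick the restart whose final penalized objective is lowest.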
final_objectives <- sapply(out_models, FUN = function(x) x$final_objective)
best_model <- which.min(final_objectives)
final_model = out_models[[best_model]][["model"]]
## Add the objectives
final_model$all_objectives =
lapply(1:nrestart, function(irestart){
one_model = out_models[[irestart]]
data.frame(objective = one_model$model$objectives) %>%
mutate(iter = row_number(), irestart = irestart) %>%
select(irestart, iter, objective)
}) %>% bind_rows()
return(final_model)
}
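To close this section, here is a minimal usage sketch of flowtrend(), reusing the data generator and plotting helper from earlier; the tuning parameter values are arbitrary and chosen only for illustration:
set.seed(1)
dt = gendat_1d(TT = 50, ntlist = rep(50, 50))
ylist = dt %>% dt2ylist()
fit = flowtrend(ylist = ylist, numclust = 3, l = 1, l_prob = 1,
lambda = 0.1, lambda_prob = 0.1, maxdev = 5,
nrestart = 3, verbose = TRUE)
## Inspect the objective traces across restarts, then plot the chosen model.
head(fit$all_objectives)
plot_1d(ylist = ylist, obj = fit)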