5 Building the model/algorithm
5.1 Trend filtering
Trend filtering is a non-parametric regression technique for a sequence of output points \(y = (y_1,..., y_T)\) observed at locations \(x = (x_1, ..., x_T)\). It is usually assumed that \(x_1, ..., x_T\) are evenly spaced points, though this can be relaxed. The trend filtering estimate of order \(l\) of the time series \(\mu_t = \mathbb{E}(y_t)\), \(t = 1, \dots, T\), is obtained by calculating \[\hat{\mu} = \mathop{\mathrm{argmin}}_{\mu \in \mathbb{R}^T} \frac{1}{2}\| \mu - y\|_2^2 + \lambda \| D^{(l+1)} \mu\|_1,\] where \(\lambda\) is a tuning parameter and \(D^{(l+1)} \in \mathbb{R}^{(T-l-1) \times T}\) is the \((l+1)^\text{th}\) order discrete differencing matrix.
We first need to be able to construct the trend filtering “differencing matrix” used for smoothing the mixture parameters over time. The general idea of trend filtering is explained masterfully in Tibshirani (2014), Section 6 and equation (41).
The differencing matrix is formed by recursion, starting with \(D^{(1)}\).
\[\begin{equation*} D^{(1)} = \begin{bmatrix} -1 & 1 & 0 & \cdots & 0 & 0 \\ 0 & -1 & 1 & \cdots & 0 & 0\\ \vdots & & & & \\ 0 & 0 & 0 & \cdots & -1 & 1 \end{bmatrix}. \end{equation*}\]
For \(l \geq 1\), the differencing matrix \(D^{(l+1)}\) is defined recursively as \(D^{(l+1)} = D^{(1)} D^{(l)}\), where the leading \(D^{(1)}\) is the first-order differencing matrix of the appropriate (smaller) dimension.
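As a quick illustration of the recursion (a sketch of my own using base R only; diff(diag(T)) happens to be exactly \(D^{(1)}\)):

D1 <- diff(diag(5))        ## 4 x 5 first-difference matrix, i.e. D^{(1)}
D1_small <- diff(diag(4))  ## The 3 x 4 version of D^{(1)}
D2 <- D1_small %*% D1      ## D^{(2)}; rows are (1, -2, 1, 0, 0), etc.
print(D2)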
#' Generating Difference Matrix of Specified Order
#'
#' @param n length of vector to be differenced
#' @param l order of differencing
#' @param x optional. Spacing of input points.
#'
#' @return An (n-l) by n matrix.
#' @export
#'
#' @examples
gen_diff_mat <- function(n, l, x = NULL){
  ## Basic checks
  if(!is.null(x)){
    stopifnot(length(x) == n)
    if(is.unsorted(x)) stop("x must be in increasing order!")
  }
  ## First-order differencing matrix of size (t-1) x t.
  get_D1 <- function(t){ do.call(rbind, lapply(1:(t-1), FUN = function(ii){
    v <- rep(0, t)
    v[ii] <- -1
    v[ii+1] <- 1
    v
  }))}
  if(is.null(x)){
    if(l == 0) return(diag(rep(1, n)))
    if(l == 1) return(get_D1(n))
    if(l > 1){
      D <- get_D1(n)
      for(k in 1:(l-1)){
        D <- get_D1(n-k) %*% D
      }
      return(D)
    }
  } else {
    if(l == 0) return(diag(rep(1, n)))
    if(l == 1) return(get_D1(n))
    if(l > 1){
      D <- get_D1(n)
      for(k in 1:(l-1)){
        ## The diagonal factor k/(x_{i+k} - x_i) adjusts for uneven spacing.
        D1 = get_D1(n-k)
        facmat = diag(k / diff(x, lag = k))
        D <- D1 %*% facmat %*% D
      }
      return(D)
    }
  }
}

For equally spaced inputs with \(l=1\) and \(l=2\):
l = 1
Dl = gen_diff_mat(n = 10, l = l+1, x = NULL)
print(Dl)

## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 1 -2 1 0 0 0 0 0 0 0
## [2,] 0 1 -2 1 0 0 0 0 0 0
## [3,] 0 0 1 -2 1 0 0 0 0 0
## [4,] 0 0 0 1 -2 1 0 0 0 0
## [5,] 0 0 0 0 1 -2 1 0 0 0
## [6,] 0 0 0 0 0 1 -2 1 0 0
## [7,] 0 0 0 0 0 0 1 -2 1 0
## [8,] 0 0 0 0 0 0 0 1 -2 1
l = 2
Dl = gen_diff_mat(n = 10, l = l+1, x = NULL)
print(Dl)

## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] -1 3 -3 1 0 0 0 0 0 0
## [2,] 0 -1 3 -3 1 0 0 0 0 0
## [3,] 0 0 -1 3 -3 1 0 0 0 0
## [4,] 0 0 0 -1 3 -3 1 0 0 0
## [5,] 0 0 0 0 -1 3 -3 1 0 0
## [6,] 0 0 0 0 0 -1 3 -3 1 0
## [7,] 0 0 0 0 0 0 -1 3 -3 1
When the inputs have a gap in them:
## See what a l=2 difference matrix looks like:
x = (1:10)[-(3)]
l = 2
TT = length(x)
Dl = gen_diff_mat(n = TT, l = l+1, x = x)
print(Dl)

## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
## [1,] -0.6666667 1.3333333 -1.333333 0.6666667 0 0 0 0 0
## [2,] 0.0000000 -0.3333333 2.000000 -2.6666667 1 0 0 0 0
## [3,] 0.0000000 0.0000000 -1.000000 3.0000000 -3 1 0 0 0
## [4,] 0.0000000 0.0000000 0.000000 -1.0000000 3 -3 1 0 0
## [5,] 0.0000000 0.0000000 0.000000 0.0000000 -1 3 -3 1 0
## [6,] 0.0000000 0.0000000 0.000000 0.0000000 0 -1 3 -3 1
## Formally test it
d1 = gen_diff_mat(n = 10, l = l+1, x = 1:10)
d2 = gen_diff_mat(n = 10, l = l+1, x = (1:10)*2)
print(d1)

## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] -1 3 -3 1 0 0 0 0 0 0
## [2,] 0 -1 3 -3 1 0 0 0 0 0
## [3,] 0 0 -1 3 -3 1 0 0 0 0
## [4,] 0 0 0 -1 3 -3 1 0 0 0
## [5,] 0 0 0 0 -1 3 -3 1 0 0
## [6,] 0 0 0 0 0 -1 3 -3 1 0
## [7,] 0 0 0 0 0 0 -1 3 -3 1
print(d2)

## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] -0.25 0.75 -0.75 0.25 0.00 0.00 0.00 0.00 0.00 0.00
## [2,] 0.00 -0.25 0.75 -0.75 0.25 0.00 0.00 0.00 0.00 0.00
## [3,] 0.00 0.00 -0.25 0.75 -0.75 0.25 0.00 0.00 0.00 0.00
## [4,] 0.00 0.00 0.00 -0.25 0.75 -0.75 0.25 0.00 0.00 0.00
## [5,] 0.00 0.00 0.00 0.00 -0.25 0.75 -0.75 0.25 0.00 0.00
## [6,] 0.00 0.00 0.00 0.00 0.00 -0.25 0.75 -0.75 0.25 0.00
## [7,] 0.00 0.00 0.00 0.00 0.00 0.00 -0.25 0.75 -0.75 0.25
d1_d2_ratio = as.numeric(d1/d2) %>% na.omit() %>% as.numeric()
testthat::expect_true(all(d1_d2_ratio == d1_d2_ratio[1])) ## correct

A formal test for unevenly spaced inputs will come soon, once we’ve defined gen_tf_mat, next.
We now build a function that constructs a lasso regressor matrix \(H\), which can be used to solve a problem equivalent to trend filtering of the \(l\)’th degree.
(This is stated in Lemma 4, equation (25) from Tibshirani (2014).)
#' A lasso regressor matrix H that can be used to solve an equivalent problem as the trend filtering of the \code{k}'th degree.
#'
#' @param n Total number of time points.
#' @param k Degree of trend filtering for cluster probabilities. $k=0$ is fused lasso, $k=1$ is linear trend filtering, and so on.
#' @param x Time points
#'
#' @return $n$ by $n$ matrix.
#'
#' @export
gen_tf_mat <- function(n, k, x = NULL){
if(is.null(x) ){
x = (1:n)/n
}
if(!is.null(x)){
stopifnot(length(x) == n)
}
## For every i,j'th entry, use this helper function (from eq 25 of Tibshirani
## (2014)).
gen_ij <- function(x, i, j, k){
xi <- x[i]
if(j %in% 1:(k+1)){
return(xi^(j-1))
}
if(j %in% (k+2):n){
## Special handling for k==0, See lemma 4 eq 25
if(k == 0){
prd = 1
ind = j
}
if(k >= 1){
ind = j - (1:k)
prd = prod(xi - x[ind])
}
return(prd * ifelse(xi >= x[max(ind)], 1, 0))
}
}
## Generate the H matrix, entry-wise.
H <- matrix(nrow = n, ncol = n)
for(i in 1:n){
for(j in 1:n){
H[i,j] <- gen_ij(x, i,j, k)
}
}
return(H)
}

Here’s a simple test of gen_tf_mat(), against an alternative function built for equally spaced data.
#' Creates trend filtering regression matrix using Lemma 2 of Ryan Tibshirani's
#' trendfilter paper (2014); works on equally spaced data only.
#'
#' @param n Number of data points.
#' @param k Order of trend filter. 0 is fused lasso, and so on.
#'
#' @examples
#' ord = 1
#' H_tf <- gen_tf_mat_equalspace(n = 100, k = ord)
#' H_tf[,1] * 100
#' H_tf[,2] * 100
#' H_tf[,3] * 100
#' H_tf[,4] * 100
#' H_tf[,99] * 100
#' H_tf[,100] * 100
#'
#' @return n by n matrix.
gen_tf_mat_equalspace <- function(n, k){
nn = n
kk = k
##' Connects kk to kk-1.
sigm <- function(ii, kk){
if(kk == 0) return(rep(1, ii))
cumsum(sigm(ii, kk-1))
}
mat = matrix(NA, ncol = nn, nrow = nn)
for(ii in 1:nn){
for(jj in 1:nn){
if(jj <= kk+1) mat[ii,jj] = ii^(jj-1) / nn^(jj-1)
if(ii <= jj-1 & jj >= kk+2) mat[ii, jj] = 0
if(ii > jj-1 & jj >= kk+2){
mat[ii, jj] = (sigm(ii-jj+1, kk) %>% tail(1)) * factorial(kk) / nn^kk
}
}
}
return(mat)
}testthat::test_that("Trend filtering regression matrix is created correctly on equally spaced data.",
{
## Check that equally spaced data creates same trendfilter regression matrix
## Degree 1
H1 <- gen_tf_mat(10, 1)
H1_other <- gen_tf_mat(10, 1, x=(1:10)/10)
testthat::expect_equal(H1, H1_other)
## Degree 2
H2 <- gen_tf_mat(10, 2)
H2_other <- gen_tf_mat(10, 2, x=(1:10)/10)
testthat::expect_equal(H2, H2_other)
## Check the dimension
testthat::expect_equal(dim(H1), c(10,10))
## Check against an alternative function.
for(ord in c(0,1,2,3,4)){
H <- gen_tf_mat(10, ord)
H_eq = gen_tf_mat_equalspace(10, ord)
testthat::expect_true(max(abs(H_eq- H)) < 1E-10)
}
})

## Test passed 😀
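As a small aside (a sketch of my own, not one of the package tests): for \(k=0\), \(H\) is the lower-triangular matrix of ones, so \(H\theta\) with a sparse \(\theta\) is a piecewise-constant signal, and the lasso on \(H\) mimics the fused lasso.

H0 <- gen_tf_mat(n = 8, k = 0)
theta <- c(1, 0, 0, 2, 0, 0, 0, 0) ## One jump, of size 2, at t = 4
print(as.numeric(H0 %*% theta))    ## 1 1 1 3 3 3 3 3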
Finally, let’s test the difference matrix \(D\) formed using unevenly spaced \(x\)’s.
testthat::test_that("Uneven spaced D matrix is formed correctly", {
x = (1:6)[-(2)]
## k = 0 is piecewise constant, k=1 is piecewise linear, etc.
for(k in 0:3){
## Temporarily restrict to k = 1; the other orders remain to be verified.
k=1
l = k+1
## Form Dl using our function
Dl <- flowtrend::gen_diff_mat(n=5, l = l, x=x)
## Compare it to rows of H matrix, per section 6 of Tibshirani (2014)
gen_tf_mat(n = length(x), k = k, x = x) %>%
solve() %>%
`*`(factorial(k)) %>% ## This part is missing in Tibshirani (2014)
tail(length(x)-(k+1)) -> Hx
ratiomat = Hx/Dl
## Process the ratios of each entry, and check that they're equal to 1
ratios = ratiomat[!is.nan(ratiomat)]
ratios = ratios[is.finite(ratios)]
testthat::expect_true(all.equal(ratios, rep(1, length(ratios))))
}
})

## Test passed 🎉
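One more sanity check worth recording here (again a sketch, not one of the formal tests): the order-\((l+1)\) difference matrix should annihilate polynomials of degree \(\leq l\) evaluated at the uneven inputs, up to numerical error.

x = sort(c(1, 2, 4, 5, 7, 8, 10))
Dlp1 = gen_diff_mat(n = length(x), l = 3, x = x) ## D^{(l+1)} with l = 2
print(max(abs(Dlp1 %*% cbind(1, x, x^2))))       ## ~ 0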
5.2 Objective (data log-likelihood)
The function loglik_tt() calculates the log-likelihood of one cytogram, which is:
\[\sum_{i=1}^{n_t} C_i^{(t)} \log\left( \sum_{k=1}^K \pi_{kt} \phi(y_i^{(t)}; \mu_{kt}, \Sigma_k) \right). \]
This is the data-fidelity portion of the objective function of the flowtrend model.
#' Log likelihood for a single time point's data.
#'
#' @param mu Cluster means.
#' @param prob Cluster probabilities.
#' @param sigma Cluster variances.
#' @param ylist Data.
#' @param tt Time point.
loglik_tt <- function(ylist, tt, mu, sigma, prob, dimdat = NULL, countslist = NULL, numclust){
## One particle's log likelihood (weighted density)
weighted.densities = sapply(1:numclust, function(iclust){
if(dimdat == 1){
return(prob[tt,iclust] * dnorm(ylist[[tt]], mu[tt,,iclust], sqrt(sigma[iclust,,])))
}
if(dimdat > 1){
return(prob[tt,iclust] * dmvnorm_arma_fast(ylist[[tt]], mu[tt,,iclust], as.matrix(sigma[iclust,,]), FALSE))
}
})
nt = nrow(ylist[[tt]])
counts = (if(!is.null(countslist)) countslist[[tt]] else rep(1, nt))
sum_wt_dens = rowSums(weighted.densities)
sum_wt_dens = sum_wt_dens %>% pmax(1E-100)
return(sum(log(sum_wt_dens) * counts))
}

Next, here is the function that calculates the entire objective from all cytograms, given the model parameters mu, prob, and sigma:
\[\sum_{t=1}^T\sum_{i=1}^{n_t} C_i^{(t)} \log\left( \sum_{k=1}^K \pi_{kt} \phi(y_i^{(t)}; \mu_{kt}, \Sigma_k) \right). \]
#' Evaluating the penalized negative log-likelihood on all data \code{ylist} given parameters \code{mu}, \code{prob}, and \code{sigma}.
#'
#' @param mu Cluster means; a (T x dimdat x K) array.
#' @param prob Cluster probabilities; a (T x K) matrix.
#' @param prob_link Link values (alphas) for the cluster probabilities; a (T x K) matrix.
#' @param sigma Cluster covariances; a (K x dimdat x dimdat) array.
#' @param ylist Data.
#' @param Dlp1 Differencing matrix of order (l+1), for the means.
#' @param l Trend filtering degree for the means.
#' @param lambda Regularization parameter for the means.
#' @param l_prob Trend filtering degree for the probabilities.
#' @param Dlp1_prob Differencing matrix of order (l_prob+1), for the probabilities.
#' @param lambda_prob Regularization parameter for the probabilities.
#' @param alpha Unused, for now.
#' @param beta Unused, for now.
#' @param countslist Particle multiplicities.
#' @param unpenalized if TRUE, return the unpenalized out-of-sample fit.
#'
#' @return Objective value: the penalized (or unpenalized) negative log-likelihood, divided by N.
#' @export
#'
#' @examples
objective <- function(mu, prob, prob_link = NULL, sigma,
ylist,
Dlp1 = NULL, l = NULL,
lambda = 0,
l_prob = NULL,
Dlp1_prob = NULL,
lambda_prob = 0,
alpha = NULL, beta = NULL,
countslist = NULL,
unpenalized = FALSE){
## Set some important variables
TT = dim(mu)[1]
numclust = dim(mu)[3]
if(is.null(countslist)){
ntlist = sapply(ylist, nrow)
} else {
ntlist = sapply(countslist, sum)
}
N = sum(ntlist)
dimdat = ncol(ylist[[1]])
## Calculate the log likelihood
loglik = sapply(1:TT, function(tt){
return(loglik_tt(ylist, tt, mu, sigma, prob, countslist, numclust = numclust, dimdat = dimdat))
})
if(unpenalized){
obj = -1/N * sum(unlist(loglik))
return(obj)
} else {
## Return penalized likelihood
mu.splt <- asplit(mu, MARGIN = 2) ## This was 3, which I think produces the same result.
diff_mu <- sum(unlist(lapply(mu.splt, FUN = function(m) sum(abs(Dlp1 %*% m)))))
diff_prob <- sum(abs(Dlp1_prob %*% prob_link))
obj = -1/N * sum(unlist(loglik)) + lambda * diff_mu + lambda_prob * diff_prob
return(obj)
}
}

Here’s a helper to check numerical convergence of the EM algorithm.
#' Checks numerical improvement in objective value. Returns TRUE if |old-new|/|old| is smaller than tol.
#'
#' @param old Objective value from previous iteration.
#' @param new Objective value from current iteration.
#' @param tol Numerical tolerance.
check_converge_rel <- function(old, new, tol=1E-6){
return(abs((old-new)/old) < tol )
}

Here’s also a helper function to do the softmax of \(\alpha_t \in \mathbb{R}^K\).
#' Softmax function.
#'
#' @param prob_link alpha, which is a (T x K) matrix.
#'
#' @return exp(alpha)/rowSum(exp(alpha)). A (T x K) matrix.
softmax <- function(prob_link){
  exp_prob_link = exp(prob_link)
  prob = exp_prob_link / rowSums(exp_prob_link)
  return(prob)
}

testthat::test_that("Test for softmax",{
link = runif(100, min = -10, max = 10) %>% matrix(nrow = 10, ncol = 10)
testthat::expect_true(all(abs(rowSums(softmax(link)) - 1) < 1E-13))
})

## Test passed 🎉
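One caveat: exp() can overflow when the links are large in magnitude. A standard stabilization (shown here as a sketch; it is not what softmax() above does) subtracts the row maximum first, which leaves the result mathematically unchanged:

softmax_stable <- function(prob_link){
  shifted = prob_link - apply(prob_link, 1, max) ## Recycles row-wise
  exp(shifted) / rowSums(exp(shifted))
}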
5.3 Initial parameters for EM algorithm
The EM algorithm requires some initial values for \(\mu\), \(\pi\) and \(\Sigma\).
Initializing \(\pi\) is done in one line, prob = matrix(1/numclust, nrow = TT, ncol = numclust), which sets everything to \(1/K\).
For \(\mu\) and \(\Sigma\), we write some functions. Essentially, the initial means \(\mu\) are jittered versions of \(K\) means drawn from a downsampled version of \(ylist\) (downsampling is done because \(ylist\) can have a large number of particles). Each \(\Sigma_k\) is a \(d\times d\) identity matrix, scaled by fac=1 on the diagonal by default.
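Before writing them, here is the shape contract the two functions below should satisfy (a sketch with fake data, runnable once init_mn() and init_sigma() are defined):

ylist_toy = lapply(1:5, function(tt) matrix(rnorm(100), ncol = 1))
mn_toy = init_mn(ylist_toy, numclust = 3, TT = 5, dimdat = 1)
dim(mn_toy)                   ## 5 x 1 x 3, i.e. (T x dimdat x K)
dim(init_sigma(ylist_toy, 3)) ## 3 x 1 x 1, i.e. (K x dimdat x dimdat)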
#' Initialize the cluster centers.
#'
#' @param ylist A T-length list of (nt by dimdat) datasets.
#' @param numclust Number of clusters (M).
#' @param TT total number of (training) time points.
#'
#' @return An array of dimension (T x dimdat x M).
#' @export
init_mn <- function(ylist, numclust, TT, dimdat, countslist = NULL, seed=NULL){
if(!is.null(seed)){
assertthat::assert_that(all((seed %>% sapply(., class)) == "integer"))
assertthat::assert_that(length(seed) == 7)
RNGkind("L'Ecuyer-CMRG")
.Random.seed <<- seed
}
if(is.null(countslist)){
ntlist = sapply(ylist, nrow)
countslist = lapply(ntlist, FUN = function(nt) rep(1, nt))
}
## Initialize the means by (1) collapsing to one cytogram (2) random
## sampling from this distribution, after truncation,
TT = length(ylist)
ylist_downsampled <- lapply(1:TT, function(tt){
y = ylist[[tt]]
counts = countslist[[tt]]
## Sample so that, in total, we get mean(nt) * 30 sized sample. In the case
## of binned data, nt is the number of bins.
if(nrow(y) > 500) nsize = pmin(nrow(y) / TT * 30, nrow(y)) else nsize = nrow(y)
some_rows = sample(1:nrow(y),
size = nsize,
prob = counts/sum(counts))
y[some_rows,, drop=FALSE]
})
## Jitter the means a bit
yy = do.call(rbind, ylist_downsampled)
new_means = yy[sample(1:nrow(yy), numclust),, drop=FALSE]
jitter_sd = apply(yy, 2, sd) / 100
jitter_means = MASS::mvrnorm(n = nrow(new_means),
mu = rep(0, dimdat),
Sigma = diag(jitter_sd, ncol = dimdat))
new_means = new_means + jitter_means
## Repeat TT times == flat/constant initial means across time.
mulist = lapply(1:TT, function(tt){ new_means })
## New (T x dimdat x numclust) array is created.
muarray = array(NA, dim=c(TT, dimdat, numclust))
for(tt in 1:TT){
muarray[tt,,] = as.matrix(mulist[[tt]])
}
return(muarray)
}
#' Initialize the covariances.
#'
#' @param data A list of (nt by dimdat) datasets. There should be T of them.
#' @param numclust Number of clusters.
#' @param fac Value to use for the diagonal of the (dimdat x dimdat) covariance
#' matrix.
#'
#' @return A (K x dimdat x dimdat) array containing the K (dimdat by dimdat)
#'   covariances.
#' @export
init_sigma <- function(data, numclust, fac = 1){
ndat = nrow(data[[1]])
pdat = ncol(data[[1]])
sigmas = lapply(1:numclust, function(iclust){
onesigma = diag(fac * rep(1, pdat))
if(pdat==1) onesigma = as.matrix(fac)
colnames(onesigma) = paste0("datcol", 1:pdat)
rownames(onesigma) = paste0("datcol", 1:pdat)
return(onesigma)
})
sigmas = abind::abind(sigmas, along=0)
return(sigmas)
}

Here’s a helper function for printing the progress.
#' A helper function to print the progress of a loop or simulation.
#'
#' @param isim Replicate number.
#' @param nsim Total number of replicates.
#' @param type Type of job you're running. Defaults to "simulation".
#' @param lapsetime Lapsed time, in seconds (by default).
#' @param lapsetimeunit "second".
#' @param start.time start time.
#' @param fill Whether or not to fill the line.
#'
#' @return No return
print_progress <- function(isim, nsim,
type = "simulation", lapsetime = NULL,
lapsetimeunit = "seconds", start.time = NULL,
fill = FALSE){
  ## If lapsed time or a start time is present, use it; otherwise just print counts.
  if(fill) cat(fill = TRUE)
  if(is.null(lapsetime) & is.null(start.time)){
    cat("\r", type, " ", isim, "out of", nsim)
  } else {
    if(!is.null(start.time)){
      lapsetime = round(difftime(Sys.time(), start.time,
                                 units = "secs"), 0)
    }
    ## Estimate remaining time from the per-replicate pace so far.
    remainingtime = round(lapsetime * (nsim-isim)/isim, 0)
    endtime = Sys.time() + remainingtime
    cat("\r", type, " ", isim, "out of", nsim, "with lapsed time",
        lapsetime, lapsetimeunit, "and remaining time", remainingtime,
        lapsetimeunit, "and will finish at", strftime(endtime))
  }
  if(fill) cat(fill = TRUE)
}

5.4 E step
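The E step computes the responsibilities (posterior membership probabilities)
\[\gamma_{itk} = \frac{\pi_{kt}\, \phi(y_i^{(t)}; \mu_{kt}, \Sigma_k)}{\sum_{k'=1}^K \pi_{k't}\, \phi(y_i^{(t)}; \mu_{k't}, \Sigma_{k'})},\]
and, when countslist is provided, multiplies \(\gamma_{itk}\) by the multiplicity \(C_i^{(t)}\).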
#' E step, which updates the "responsibilities", which are posterior membership probabilities of each particle.
#'
#' @param mn Cluster means; a (T x dimdat x K) array.
#' @param sigma Cluster covariances; a (K x dimdat x dimdat) array.
#' @param prob Cluster probabilities; a (T x K) matrix.
#' @param ylist Data.
#' @param numclust Number of clusters (K).
#' @param denslist_by_clust Pre-computed densities, if any.
#' @param first_iter Whether this is the first EM iteration.
#' @param countslist Particle multiplicities.
#' @param padding A small amount of padding to add to the weighted
#' densities. Note, a very small value (like 1E-20) should be used, otherwise
#' the probabilitistic (soft) gating can be affected quite noticably.
#'
#' @return A T-length list of (nt x K) responsibility matrices.
#' @export
#'
Estep <- function(mn, sigma, prob, ylist = NULL, numclust, denslist_by_clust = NULL,
first_iter = FALSE, countslist = NULL, padding = 1E-20){
## Basic setup
TT = length(ylist)
ntlist = sapply(ylist, nrow)
resp = list()
dimdat = dim(mn)[2]
assertthat::assert_that(dim(mn)[1] == length(ylist))
## Helper to calculate Gaussian density for each \code{N(y_{t,k},mu_{t,k} and
## Sigma_k)}.
calculate_dens <- function(iclust, tt, y, mn, sigma, denslist_by_clust,
first_iter) {
mu <- mn[tt, , iclust]
if (dimdat == 1) {
dens = dnorm(y, mu, sd = sqrt(sigma[iclust, , ]))
} else {
dens = dmvnorm_arma_fast(y, mu, sigma[iclust,,], FALSE)
}
return(dens)
}
## Calculate posterior probability of membership of $y_{it}$.
ncol.prob = ncol(prob)
empty_row = rbind(1:numclust)[-1,] ## A (0 x numclust) matrix, for empty time points
for (tt in 1:TT) {
ylist_tt = ylist[[tt]]
if(nrow(ylist_tt) == 0){
resp[[tt]] <- empty_row
} else {
densmat <- sapply(1:numclust, calculate_dens, tt, ylist_tt,
mn, sigma, denslist_by_clust, first_iter)
wt.densmat <- matrix(prob[tt, ], nrow = ntlist[tt],
ncol = ncol.prob, byrow = TRUE) * densmat
wt.densmat = wt.densmat + padding##1e-20
wt.densmat <- wt.densmat/rowSums(wt.densmat)
resp[[tt]] <- wt.densmat
}
}
## Weight the responsibilities by $C_{it}$.
if (!is.null(countslist)) {
resp <- Map(function(myresp, mycount) {
myresp * mycount
}, resp, countslist)
}
return(resp)
}

The E step should return a list whose elements are matrices with the same number of rows as the corresponding elements of ylist (that is, a \(T\)-length list of matrices of size \(n_t \times K\) for \(t=1,\cdots, T\)).
testthat::test_that("E step returns appropriately sized responsibilities.",{
## Generate some fake data
TT = 10
dimdat = 1
ylist = lapply(1:TT, function(tt){ runif(30*dimdat) %>% matrix(ncol = dimdat, nrow = 30)})
numclust = 3
## Initialize a few parameters, not carefully
sigma = init_sigma(ylist, numclust) ## (K x dimdat x dimdat)
mn = init_mn(ylist, numclust, TT, dimdat)##, countslist = countslist)
prob = matrix(1/numclust, nrow = TT, ncol = numclust) ## Initialize to all 1/K.
## Calculate responsibility
resp = Estep(mn = mn, sigma = sigma, prob = prob, ylist = ylist, numclust = numclust)
## Check these things
testthat::expect_equal(length(resp), length(ylist))
testthat::expect_equal(sapply(resp, nrow), sapply(ylist, nrow))
testthat::expect_true(all(sapply(resp, ncol) == numclust))
})

## Test passed 🥇
5.5 M step
The M step of the EM algorithm has three steps – one each for \(\mu\), \(\pi\), and \(\Sigma\).
5.5.1 M step for \(\pi\)
#' The M step for the cluster probabilities
#'
#' @param resp Responsibilities.
#' @param H_tf Trend filtering matrix.
#' @param countslist Particle multiplicities.
#' @param lambda_prob Regularization.
#' @param l_prob Trend filtering degree.
#'
#' @return (T x K) matrix containing the alphas, for \code{prob = exp(alpha)/
#' rowSums(exp(alpha))}.
#' @export
#'
Mstep_prob <- function(resp, H_tf, countslist = NULL,
lambda_prob = NULL, l_prob = NULL, x = NULL){
## Basic setup
TT <- length(resp)
## Basic checks
stopifnot(is.null(l_prob) == is.null(lambda_prob))
## If glmnet isn't actually needed, don't use it.
if(is.null(l_prob) & is.null(lambda_prob)){
## Calculate the average responsibilities, per time point.
if(is.null(countslist)){
resp.avg <- lapply(resp, colMeans) %>% do.call(rbind, .)
} else {
resp.avg <- lapply(1:TT, FUN = function(ii){
colSums(resp[[ii]])/sum(countslist[[ii]])
}) %>% do.call(rbind, .)
}
return(resp.avg)
## If glmnet is needed, use it.
} else {
lambda_range <- function(lam, nlam = 50, lam.max = max(1, 5*lam)){
return(exp(seq(log(lam.max), log(lam), length.out = nlam)))
}
penalty.facs <- c(rep(0, l_prob+1), rep(1, nrow(H_tf) - l_prob - 1))
resp.predict <- do.call(rbind, lapply(resp, colSums))
fac = resp.predict %>% apply(2, function(mycol){max(mycol)/min(mycol)}) %>% max()
if(FALSE){ ## Disabled fallback, kept for reference.
## Run GLMNET but if it throws an error, reduce the dynamic range |fac| of
## each column |resp.predict| until it's okay.
resp.predict0 = resp.predict
error_means_null = NULL
fac = 1024
while(is.null(error_means_null)){
print("fac is")
print(fac)
error_means_null <- tryCatch({
resp.predict = resp.predict0 %>% apply(2, function(mycol){mycol = pmax(mycol, max(mycol)/fac)})
glmnet_obj <- glmnet::glmnet(x = H_tf, y = resp.predict, family = "multinomial",
penalty.factor = penalty.facs, maxit = 1e7,
lambda = mean(penalty.facs)*lambda_range(lambda_prob),
standardize = F, intercept = FALSE)
pred_link <- predict(glmnet_obj, newx = H_tf, type = "link", s = mean(penalty.facs) * lambda_prob)[,,1]
}, error = function(e){
return(NULL)
})
if(is.null(error_means_null)){
fac = fac/2
resp.predict = resp.predict0 %>% apply(2, function(mycol){mycol = pmax(mycol, max(mycol)/fac)})
}
}
} else {
glmnet_obj <- glmnet::glmnet(x = H_tf, y = resp.predict, family = "multinomial",
penalty.factor = penalty.facs, maxit = 1e7,
lambda = mean(penalty.facs)*lambda_range(lambda_prob),
standardize = F, intercept = FALSE)
}
pred_link <- predict(glmnet_obj, newx = H_tf, type = "link", s = mean(penalty.facs) * lambda_prob)[,,1]
return(pred_link)
}
}

This should return a \(T\) by \(K\) matrix, which we’ll test here:
testthat::test_that("Mstep of pi returns a (T x K) matrix.", {
## Generate some fake responsibilities and trend filtering matrix
TT = 100
numclust = 3
nt = 10
resp = lapply(1:TT, function(tt){
oneresp = runif(nt*numclust) %>% matrix(ncol=numclust)
oneresp = oneresp/rowSums(oneresp)
})
H_tf <- gen_tf_mat(n = TT, k = 0)
## Check the size
pred_link = Mstep_prob(resp, H_tf, l_prob = 0, lambda_prob = 1E-3)
testthat::expect_equal(dim(pred_link), c(TT, numclust))
pred_link = Mstep_prob(resp, H_tf)
testthat::expect_equal(dim(pred_link), c(TT, numclust))
## Check the correctness
pred_link = Mstep_prob(resp, H_tf)
})

Each row \(t\) of this matrix should contain the fitted values \((\alpha_{kt})_{k=1}^K\), where \(\alpha_{kt} = h_t^T w_{k}\), for:
- \(h_t\), the rows of the trend filtering matrix \(H \in \mathbb{R}^{T \times T}\), and
- \(w_k \in \mathbb{R}^{T}\), the regression coefficients estimated by glmnet().
Here is a test for the correctness of the M step for \(\pi\).
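A minimal version of such a check (a sketch, reusing resp and H_tf as constructed in the previous code chunk): with no penalty, Mstep_prob() should simply return the per-time-point average responsibilities.

resp_avg = do.call(rbind, lapply(resp, colMeans))
testthat::expect_equal(Mstep_prob(resp, H_tf), resp_avg)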
5.5.2 M step for \(\Sigma\)
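The update is the standard responsibility-weighted sample covariance,
\[\hat{\Sigma}_k = \frac{\sum_{t=1}^T\sum_{i=1}^{n_t} \gamma_{itk}\, (y_i^{(t)} - \mu_{kt})(y_i^{(t)} - \mu_{kt})^\top}{\sum_{t=1}^T\sum_{i=1}^{n_t} \gamma_{itk}},\]
which the long-format reshaping below computes.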
#' M step for cluster covariance (sigma).
#'
#' @param resp Responsibility.
#' @param ylist Data.
#' @param mn Means
#' @param numclust Number of clusters.
#'
#' @return (K x d x d) array containing K (d x d) covariance matrices.
#' @export
#'
#' @examples
Mstep_sigma <- function(resp, ylist, mn, numclust){
## Find some sizes
TT = length(ylist)
ntlist = sapply(ylist, nrow)
dimdat = ncol(ylist[[1]])
cs = c(0, cumsum(ntlist))
vars <- vector(mode = "list", numclust)
ylong = do.call(rbind, ylist)
irows = rep(1:nrow(mn), times = ntlist)
for(iclust in 1:numclust){
resp.thisclust = lapply(resp, function(myresp) myresp[,iclust, drop = TRUE])
resp.long = do.call(c, resp.thisclust)
mnlong = mn[irows,,iclust]
if(is.vector(mnlong)) mnlong = mnlong %>% cbind()
resid <- ylong - mnlong
resid_weighted <- resp.long * resid
sig_temp <- t(resid_weighted) %*% resid/sum(resp.long)
vars[[iclust]] <- sig_temp
}
## Make into an array
sigma_array = array(NA, dim=c(numclust, dimdat, dimdat))
for(iclust in 1:numclust){
sigma_array[iclust,,] = vars[[iclust]]
}
## Basic check
stopifnot(all(dim(sigma_array) == c(numclust, dimdat, dimdat)))
return(sigma_array)
}

5.5.3 M step for \(\mu\)
This is a big one. It uses the ADMM algorithm as written in section 2 of the paper, reproduced briefly here.
We need a convergence checker for the outer layer of LA-ADMM, which checks whether the objective values have plateaued.
Next, we define the main function Mstep_mu(). This uses a “locally adaptive” ADMM (LA-ADMM). Mstep_mu() calls la_admm_oneclust() on each cluster \(k=1,\cdots, K\); this function will be introduced shortly.
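In brief (my paraphrase of the splitting that the code below implements): for each cluster \(k\), the M step solves
\[\min_{\mu, Z, W} \; \frac{1}{2N}\sum_{t=1}^T\sum_{i=1}^{n_t} \gamma_{itk}\, (y_i^{(t)}-\mu_t)^\top \Sigma_k^{-1} (y_i^{(t)}-\mu_t) + \lambda \|D^{(1)} W\|_1 \quad \text{s.t.}\quad W = D^{(l)}\mu,\;\; Z = \mu - \bar{\mu},\;\; \|Z_t\|_2 \le r \;\;\forall t,\]
with scaled dual variables uw and uz for the two constraints; note that \(\|D^{(1)} W\|_1 = \|D^{(l+1)}\mu\|_1\) whenever \(W = D^{(l)}\mu\).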
#' Computes the M step for mu.
#'
#' @param resp Responsibilities of each particle.
#' @param ylist Data.
#' @param lambda Regularization parameter for the means.
#' @param l Trend filtering degree for the means.
#' @param sigma Cluster covariances; a (K x dimdat x dimdat) array.
#' @param sigma_eig_by_clust Pre-computed eigendecompositions of the covariances.
#' @param Dlsqrd The matrix \code{t(Dl) \%*\% Dl}.
#' @param Dl Differencing matrix of order l.
#' @param tDl Transpose of \code{Dl}.
#' @param Dlp1 Differencing matrix of order (l+1).
#' @param TT Number of time points.
#' @param N Total number of particles (sum of responsibilities).
#' @param dimdat Dimension of the data.
#' @param first_iter Whether this is the first EM iteration.
#' @param e_mat Centering matrix, from \code{etilde_mat()}.
#' @param mus,Zs,Ws,uws,uzs Warm-startable ADMM variables from the previous EM iteration.
#' @param maxdev Maximum deviation of cluster means across time (radius of the ball constraint).
#' @param x Time points.
#' @param niter Maximum number of inner ADMM iterations.
#' @param err_rel Relative convergence tolerance for the ADMM.
#' @param err_abs Absolute convergence tolerance for the ADMM.
#' @param zerothresh Threshold below which values are treated as zero.
#' @param local_adapt Whether to use locally adaptive ADMM.
#' @param local_adapt_niter Number of outer (step-size-doubling) LA-ADMM iterations.
#' @param rho_init Initial ADMM step size.
#' @param iter Current EM iteration number (for record-keeping).
#'
#' @return A list containing the estimated means (\code{mns}) and ADMM variables for warm starts.
#' @export
#'
#' @examples
Mstep_mu <- function(resp,
ylist,
lambda = 0.5,
l = 3,
sigma,
sigma_eig_by_clust = NULL,
Dlsqrd,
Dl, tDl, Dlp1, TT, N, dimdat,
first_iter = TRUE,
e_mat,
## Warm startable variables
mus = NULL,
Zs = NULL,
Ws = NULL,
uws = NULL,
uzs = NULL,
## End of warm startable variables
maxdev = NULL,
x = NULL,
niter = (if(local_adapt) 1e2 else 1e3),
err_rel = 1E-3,
err_abs = 0,
zerothresh = 1E-6,
local_adapt = FALSE,
local_adapt_niter = 10,
rho_init = 0.01,
iter = NULL){
####################
## Preliminaries ###
####################
TT = length(ylist)
numclust = ncol(resp[[1]])
dimdat = ncol(ylist[[1]])
ntlist = sapply(ylist, nrow)
resp.sum = lapply(resp, colSums) %>% do.call(rbind, .)
N = sum(unlist(resp.sum))
## Other preliminaries
schur_syl_A_by_clust = schur_syl_B_by_clust = term3list = list()
ybarlist = list()
ycentered_list = Xcentered_list = yXcentered_list = list()
Qlist = list()
sigmainv_list = list()
for(iclust in 1:numclust){
## Retrieve sigma inverse from pre-computed SVD, if necessary
if(is.null(sigma_eig_by_clust)){
sigmainv = solve(sigma[iclust,,])
} else {
sigmainv = sigma_eig_by_clust[[iclust]]$sigma_inv
}
resp.iclust <- lapply(resp, FUN = function(r) matrix(r[,iclust]))
AB <- get_AB_mats(y = ylist, resp = resp.iclust, Sigma_inv = sigmainv,
                  e_mat = e_mat, N = N, Dlp1 = Dlp1, Dl = Dl,
                  Dlsqrd = Dlsqrd, rho = rho_init, z = NULL, w = NULL,
                  uz = NULL, uw = NULL)
## Store the Schur decomposition
schur_syl_A_by_clust[[iclust]] = myschur(AB$A)
schur_syl_B_by_clust[[iclust]] = myschur(AB$B)
ycentered <- NULL
ycentered_list[[iclust]] = ycentered
sigmainv_list[[iclust]] = sigmainv
}
##########################################
## Run ADMM separately on each cluster ##
#########################################
admm_niters = admm_inner_iters = vector(length = numclust, mode = "list")
if(first_iter){ mus = vector(length = numclust, mode = "list") }
Zs <- lapply(1:numclust, function(x) matrix(0, nrow = TT, ncol = dimdat))
Ws <- lapply(1:numclust, function(x) matrix(0, nrow = TT - l, ncol = dimdat))
uzs <- lapply(1:numclust, function(x) matrix(0, nrow = TT, ncol = dimdat))
uws <- lapply(1:numclust, function(x) matrix(0, nrow = TT - l, ncol = dimdat))
## For every cluster, run LA-ADMM
start.time = Sys.time()
for(iclust in 1:numclust){
resp.iclust <- lapply(resp, FUN = function(r) matrix(r[,iclust]))
## Possibly locally adaptive ADMM, for now just running with rho == lambda
res = la_admm_oneclust(K = (if(local_adapt) local_adapt_niter else 1),
local_adapt = local_adapt,
iclust = iclust,
niter = niter,
TT = TT, N = N, dimdat = dimdat, maxdev = maxdev,
schurA = schur_syl_A_by_clust[[iclust]],
schurB = schur_syl_B_by_clust[[iclust]],
sigmainv = sigmainv_list[[iclust]],
rho = rho_init,
rhoinit = rho_init,
## rho = (if(iclust == 1) rho_init else res$rho/2),
## rhoinit = (if(iclust == 1) rho_init else res$rho/2),
sigma = sigma,
lambda = lambda,
resp = resp.iclust,
resp_sum = resp.sum[,iclust],
l = l,
Dlp1 = Dlp1,
Dl = Dl,
tDl = tDl,
y = ylist,
err_rel = err_rel,
err_abs = err_abs,
zerothresh = zerothresh,
sigma_eig_by_clust = sigma_eig_by_clust,
em_iter = iter,
## Warm starts from previous *EM* iteration
first_iter = first_iter,
uw = uws[[iclust]],
uz = uzs[[iclust]],
z = Zs[[iclust]],
w = Ws[[iclust]])
## Store the results
mus[[iclust]] = res$mu
admm_niters[[iclust]] = res$kk
admm_inner_iters[[iclust]] = res$inner.iter
## Store other things for for warmstart
Zs[[iclust]] = res$Z
uzs[[iclust]] = res$uz
uws[[iclust]] = res$uw
Ws[[iclust]] = res$W
}
## Aggregate the yhats into one array
mu_array = array(NA, dim = c(TT, dimdat, numclust))
for(iclust in 1:numclust){ mu_array[,,iclust] = mus[[iclust]] }
## Each are lists of length |numclust|.
return(list(mns = mu_array,
admm_niters = admm_niters,
admm_inner_iters = admm_inner_iters,
## For warmstarts
Zs = Zs,
Ws = Ws,
uws = uws,
uzs = uzs,
N = N,
## Return the column
rho = res$rho,
## For using in the Sigma M step
ycentered_list = ycentered_list,
Xcentered_list = Xcentered_list,
yXcentered_list = yXcentered_list,
Qlist = Qlist
))
}

The locally adaptive ADMM for a single cluster involves an inner and an outer loop. The inner loop is an ADMM with a fixed step size \(\rho\). The outer loop, written here in la_admm_oneclust, repeatedly runs the inner ADMM, doubling \(\rho\) between outer iterations.
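The resulting step-size schedule over outer iterations is just geometric doubling; a tiny illustration of the schedule:

rho_init = 0.01
sapply(1:5, function(kk) rho_init * 2^(kk - 1)) ## 0.01 0.02 0.04 0.08 0.16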
#' Locally adaptive ADMM; wrapper to \code{admm_oneclust()}.
#'
#' @param K
#' @param ...
#'
#' @return
#'
#' @examples
la_admm_oneclust <- function(K, ...){
## Initialize arguments for ADMM.
args <- list(...)
p = args$p
l = args$l
TT = args$TT
dimdat = args$dimdat
rhoinit = args$rhoinit
## This initialization can come from the previous *EM* iteration.
if(args$first_iter){
mu = matrix(0, nrow = TT, ncol = dimdat)
Z <- matrix(0, nrow = TT, ncol = dimdat)
W <- matrix(0, nrow = TT-l, ncol = dimdat)
uz <- matrix(0, nrow = TT, ncol = dimdat)
uw <- matrix(0, nrow = TT - l, ncol = dimdat)
args[['mu']] <- mu
args[['z']] <- Z
args[['w']] <- W
args[['uz']] <- uz
args[['uw']] <- uw
}
some_admm_objectives = c()
## Run ADMM repeatedly with (1) double rho, and (2) previous b
for(kk in 1:K){
if(kk > 1){
## These ensure warm starts are true
args[['mu']] <- mu
args[['z']] <- Z
args[['w']] <- W
args[['uz']] <- uz
args[['uw']] <- uw
args[['rho']] <- rho
}
## Run ADMM
args[['outer_iter']] <- kk
## Call main function
argn <- lapply(names(args), as.name)
names(argn) <- names(args)
call <- as.call(c(list(as.name("admm_oneclust")), argn))
res = eval(call, args)
if(any(abs(res$mu)>1E2)){
stop("mu is blowing up! Probably because the initial ADMM step size (rho) is too large (and possibly the ball constraint on the means is large).")
}
some_admm_objectives = c(some_admm_objectives, res$single_admm_objective)
## Handling the scenario where the objectives are all zero
padding = 1E-12
some_admm_objectives = some_admm_objectives + padding
## See if outer iterations should terminate
if(res$converge){
res$converge <- T
break
}
## Update some parameters; double the rho value, and update the B matrix
rho = 2 * args$rho
mu = res$mu
Z = res$Z
W = res$W
uz = res$uz
uw = res$uw
}
if(!res$converge) {
## warning("ADMM didn't converge for one cluster.") ## Disabling this because it's not really something to fix. The only thing change a user would make is to make |rho_init| smaller
}
## Record how long the admm took; in terms of # iterations.
res$kk = kk
## Record the final rho
res$rho = args$rho
return(res)
}

Next, the inner loop. The workhorse admm_oneclust() actually performs the ADMM update for one cluster, for a fixed step size \(\rho\), across optimization iterations \(1, \dots, niter\).
This admm_oneclust() uses the following helpers:
- W_update_fused(): this calls the fused lasso solver prox_dp(), written in Rcpp.
- Z_update()
- U_update_Z()
- U_update_W()
A prox_dp function, written in C, will be used.
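Once compiled (the Rcpp source appears in Section 5.5.5 below), it can be exercised on its own. As a sketch: prox_dp(z, lam) solves \(\min_b \frac{1}{2}\|z - b\|_2^2 + \mathrm{lam} \sum_i |b_{i+1} - b_i|\), so a noisy step function should come back approximately piecewise constant.

set.seed(1)
z = c(rep(0, 20), rep(2, 20)) + rnorm(40, sd = 0.3)
b = prox_dp(z, lam = 1)
plot(z); lines(b, lwd = 2)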
#' Solve a fused lasso problem for the W update. Internally, a fused lasso dynamic
#' programming solver \code{prox()} (which calls \code{prox_dp()} written in C)
#' is used.
#'
W_update_fused <- function(l, TT, mu, uw, rho, lambda, Dl){
# modified lambda for fused lasso routine
mod_lam <- lambda/rho
# generating pseudo response xi
if( l < 0 ){
stop("l should never be /below/ zero!")
} else if( l == 0 ){
xi <- mu + 1/rho * uw ## This is faster
} else {
xi <- Dl %*% mu + 1/rho * uw
if(any(is.nan(xi))) stop("NaN values in the pseudo-response xi.")
## l = 2 is quadratic trend filtering
## l = 1 is linear trend filtering
## l = 0 is fused lasso
## D^{(1)} is first differences, so it corresponds to l=0
## Dl = gen_diff_mat(n = TT, l = l, x = x) <--- (T-l) x T matrix
}
## Running the fused LASSO
## which solves min_zhat 1/2 |z-zhat|_2^2 + lambda |D^{(1)}zhat|
fit <- prox_dp(z = xi, lam = mod_lam)
return(fit)
}
#' Z update: project the rows of (m + Uz/rho) onto the maxdev-sized ball.
Z_update <- function(m, Uz, C, rho){
mat = m + Uz/rho
Z = projCmat(mat, C)
return(Z)
}
#' Project rows of matrix \code{mat} into a ball of size \code{C}.
#'
#' @param mat Matrix whose rows will be projected into a C-sized ball.
#' @param C radius
#'
#' @return Projected matrix.
projCmat <- function(mat, C){
if(!is.null(C)){
vlens = sqrt(rowSums(mat * mat))
inds = which(vlens > C)
if(length(inds) > 0){
mat[inds,] = mat[inds,] * C / vlens[inds]
}
}
return(mat)
}

We will test the projCmat function.
testthat::test_that("Test the ball projection", {
set.seed(100)
mat = matrix(rnorm(100), ncol=2)
projected_mat = flowtrend:::projCmat(mat, 1)
testthat::expect_true(all(projected_mat %>% apply(1, function(myrow)sum(myrow*myrow)) < 1+1E-8))
})

## Test passed 🎉
The other parameter updates are done with these functions.
#' @param U (T x dimdat) matrix.
U_update_Z <- function(U, rho, mu, Z, TT){
# return(U + rho * (scale(mu, scale = F) - Z))
stopifnot(nrow(U) == TT)
centered_mu = sweep(mu, 2, colMeans(mu))
## stopifnot(all(abs(colMeans(centered_mu))<1E-8))
Unew = U + rho * (centered_mu - Z)
## Expect a (T x dimdat) matrix.
stopifnot(all(dim(U) == dim(Unew)))
stopifnot(nrow(U) == TT)
return(Unew)
}

#' @param U ((T-l) x dimdat) matrix.
U_update_W <- function(U, rho, mu, W, l, Dl, TT){
# l = 2 is quadratic trend filtering
# l = 1 is linear trend filtering
# l = 0 is fused lasso
# D^{(1)} is first differences, so it corresponds to l=0
# D^{(l+1)} is used for order-l trend filtering.
stopifnot(nrow(W) == TT - l)
Unew <- U + rho * (Dl %*% mu - W)
## Expect a (T-l) x dimdat matrix.
stopifnot(all(dim(U) == dim(Unew)))
stopifnot(nrow(U) == TT-l)
return(Unew)
}

Here is that main workhorse admm_oneclust().
#' One cluster's ADMM for a fixed step size (\code{rho}).
#'
#'
#' @export
#' @param iclust
#' @param niter
#' @param y
#' @param Dl
#' @param tDl
#' @param Dlp1
#' @param l Default: NULL
#' @param TT
#' @param N
#' @param dimdat
#' @param maxdev
#' @param rho
#' @param rhoinit Default: rho
#' @param Xinv
#' @param schurA
#' @param schurB
#' @param sigmainv
#' @param lambda
#' @param resp
#' @param resp_sum
#' @param ylist
#' @param err_rel Default: 1e-3
#' @param err_abs Default: 0
#' @param zerothresh
#' @param mu
#' @param z
#' @param w
#' @param uw
#' @param uz
#' @param first_iter
#' @param em_iter
#' @param outer_iter
#' @param local_adapt
#' @param sigma
#' @param sigma_eig_by_clust
#'
#' @return A list containing the updated parameters for the cluster.
#' @seealso \code{\link{la_admm_oneclust}} which is a wrapper over this function.
admm_oneclust <- function(iclust = 1, niter, y,
Dl, tDl, Dlp1, l = NULL,
TT, N, dimdat, maxdev,
rho,
rhoinit = rho,
Xinv,
schurA,
schurB,
sigmainv,
lambda,
resp,
resp_sum,
ylist, err_rel = 1e-3, err_abs = 0,
zerothresh,
mu,
z,
w,
uw,
uz,
first_iter,## Not used
em_iter,
outer_iter,
local_adapt,
sigma,
sigma_eig_by_clust){
## Initialize the variables ###
## resid_mat = matrix(NA, nrow = ceiling(niter/5), ncol = 4)
## colnames(resid_mat) = c("primresid", "primerr", "dualresid", "dualerr")
resid_mat = matrix(NA, nrow = ceiling(niter/5), ncol = 6)
colnames(resid_mat) = c("prim1", "prim2", "primresid", "primerr", "dualresid", "dualerr")
rhofac = rho / rhoinit
## This doesn't change over iterations
schurB = myschur(schurB$orig * rhofac) ## In flowmix, this is done on A. Here, it's done on B (in AX + XB + C = 0).
TA = schurA$T
TB = schurB$T
UA = schurA$Q
UB = schurB$Q
tUA = schurA$tQ
tUB = schurB$tQ
## This also doesn't change over iterations
C1 <- do.call(cbind, lapply(1:TT, FUN = function(tt){
multmat <- apply(y[[tt]], FUN = function(yy) yy * resp[[tt]], MARGIN = 2)
sigmainv %*% colSums(multmat)
}))
for(iter in 1:niter){
syl_C <- get_C_mat(C1 = C1, resp_sum = resp_sum, TT = TT, dimdat = dimdat,
Sigma_inv = sigmainv, N = N, Dl = Dl, rho = rho,
z = z, w = w, uz = uz, uw = uw, l=l)
FF = (-1) * tUA %*% syl_C %*% UB
mu = UA %*% matrix_function_solve_triangular_sylvester_barebonesC2(TA, TB, FF) %*% tUB
mu = t(mu)
stopifnot(nrow(mu) == TT)
stopifnot(ncol(mu) == dimdat)
centered_mu = sweep(mu, 2, colMeans(mu))
z <- Z_update(centered_mu, Uz = uz, C = maxdev, rho = rho)
if(any(abs(mu)>1E2)){
stop("mu is blowing up!")
## break
}
wlist = lapply(1:dimdat, function(j){
W_update_fused(l = l, TT = TT, mu = mu[, j, drop = TRUE],
rho = rho, lambda = lambda,
uw = uw[,j,drop=TRUE],
Dl = Dl)})
w <- do.call(cbind, wlist)
stopifnot(nrow(w) == TT-l)
stopifnot(ncol(w) == dimdat)
uz = U_update_Z(uz, rho, mu, z, TT)
## uw = U_update_W(uw, rho, mu, w, l, TT)
uw = U_update_W(uw, rho, mu, w, l, Dl, TT)
## Check convergence
if( iter > 1 & iter %% 5 == 0){## & !local_adapt){
## Calculate convergence criterion
obj = check_converge(mu, rho,
w, z,
w_prev, z_prev,
uw, uz,
Dl, tDl, err_rel = err_rel, err_abs = err_abs)
jj = (iter/ 5)
resid_mat[jj,] = c(
norm(obj$primal_resid1, "F"), ## Temp
norm(obj$primal_resid2, "F"), ## Temp
norm(obj$primal_resid, "F"),
obj$primal_err,
norm(obj$dual_resid,"F"),
obj$dual_err)
if(is.na(obj$converge)){
obj$converge = converge = FALSE
warning("Convergence was NA")
}
if(obj$converge){
converge = TRUE
break
} else {
converge = FALSE
}
}
w_prev = w
z_prev = z
}
if(FALSE){
## Calculate optimization objective values for this cluster.
obj.value <- objective_per_cluster(y = y, mu = mu, resp = resp,
Sigma_inv = sigmainv, TT = TT,
d = dimdat, Dlp1 = Dlp1, Dl = Dl, l = l,
maxdev = maxdev, lambda = lambda,
rho = rho, N = N)
## This is very expensive to do within each ADMM iteration, so it's commented out for now.
}
obj.value = NA
return(list(mu = mu,
resid_mat = resid_mat,
converge = converge,
## Other variables to return.
Z = z,
W = w,
uz = uz,
uw = uw,
inner.iter = iter,
single_admm_objective = obj.value))
}

5.5.4 Helpers for M step (\(\mu\))
First, let’s start with a few helper functions.
#' Builds the (T x T) centering matrix \(\sum_t (e_t - 1/T)(e_t - 1/T)^\top\).
#'
#' @param TT Number of time points.
etilde_mat <- function(TT){
mats <- lapply(1:TT, FUN = function(t){
e_vec <- rep(0, TT)
e_vec[t] <- 1
(e_vec - 1/TT) %*% t(e_vec - 1/TT)
})
Reduce('+', mats)
}

#' Compute the A and B matrices of the Sylvester equation for the mu update.
#'
#' @param y Data (currently unused; kept for a uniform interface).
get_AB_mats <- function(y, resp, Sigma_inv, e_mat, Dlsqrd, N, Dlp1, Dl, rho, z, w, uz, uw){
# A matrix
A <- 1/N * Sigma_inv
# B matrix
sum_resp <- sapply(resp, sum)
#B <- rho*(t(Dl)%*%Dl + e_mat)%*% diag(1/unlist(sum_resp))
B <- rho*(Dlsqrd + e_mat)
B <- B/sum_resp[col(B)]
#B <- rho*(Dlsqrd + e_mat) %*% diag(1/unlist(sum_resp))
return(list(A = A, B = B))
}

#' Compute the C matrix of the Sylvester equation for the mu update.
#' @param C1 First (data-driven) component of the C matrix.
get_C_mat <- function(C1, resp_sum, TT, dimdat, Sigma_inv, e_mat, N, Dlp1, Dl,
rho, z, w, uz, uw, l){
C2 <- t(uz - rho*z)
# averaging
C2 <- C2 - rowMeans(C2)
# third component
C3 <- do.call(rbind, lapply(1:dimdat, FUN = function(j){
((uw[,j, drop=TRUE] - rho * w[,j,drop=TRUE]) %*% Dl) %>% as.numeric()
}))
# combining
C <- (-1/N * C1 + C2 + C3)
C <- C/resp_sum[col(C)]
return(C)
}

#' Schur decomposition, augmented with the transpose of Q and the original matrix.
#' @param mat Matrix to Schur-decompose.
myschur <- function(mat){
stopifnot(nrow(mat) == ncol(mat))
if(is.numeric(mat) & length(mat)==1) mat = mat %>% as.matrix()
obj = Matrix::Schur(mat)
obj$tQ = t(obj$Q)
obj$orig = mat
return(obj)
}

Here’s a function to check convergence of the ADMM with a fixed step size.
#' Check convergence of ADMM with a fixed step size (rho).
check_converge <- function(mu, rho, w, z, w_prev, z_prev, uw, uz, Dl, tDl,
err_rel = 1E-4,
err_abs = 0
){
## Constraints are: Ax + Bz =c, where x = mu, z =(w, z)
prim1 = rbind(Dl %*% mu, mu - colMeans(mu))
prim2 = rbind(-w, -z)
primal_resid = prim1 + prim2 ## Ax + Bz - c
change_z = z - z_prev
change_w = w - w_prev
dual_resid = rho * (-(change_z - colMeans(change_z)) - tDl %*% change_w)
tAU = (uz - colMeans(uz)) + tDl %*% uw
## Form primal and dual tolerances.
primal_err = sqrt(length(primal_resid)) * err_abs +
err_rel * max(norm(prim1, "F"), norm(prim2, "F"))
dual_err = sqrt(length(dual_resid)) * err_abs +
err_rel * norm(tAU, "F")
## Check convergence.
primal_resid_size = norm(primal_resid, "F")
dual_resid_size = norm(dual_resid, "F")
primal_converge = ( primal_resid_size <= primal_err )
dual_converge = ( dual_resid_size <= dual_err )
## Some checks (trying to address problems with |converge|).
assertthat::assert_that(is.numeric(primal_resid_size))
assertthat::assert_that(is.numeric(primal_err))
assertthat::assert_that(is.numeric(dual_resid_size))
assertthat::assert_that(is.numeric(dual_err))
## return(primal_converge & dual_converge)
converge = primal_converge & dual_converge
return(list(
primal_resid1 = prim1,
primal_resid2 = prim2,
primal_resid = primal_resid,
primal_err = primal_err,
dual_resid = dual_resid,
dual_err = dual_err,
converge = converge))
}

Here’s a function to compute the augmented Lagrangian of the M-step (this is not used for now).
#' computes the Augmented lagrangian.
aug_lagr <- function(y, TT, d, z, w, l, uz, uw, mu, resp, Sigma_inv, Dlp1, Dl, maxdev, lambda, rho, N){
mu_dd <- rowMeans(mu)
# Check the Z's for ball constraint, up to a tolerance of 1e-4
znorms <- apply(z, FUN = function(zz) sqrt(sum(zz^2)), MARGIN = 1)
if(any(is.na(znorms))) stop("NA values in the norms of z.")
if(any(znorms > (maxdev + 1e-4))){
warning("||z||_2 > maxdev, current iterate not feasible.")
return(Inf)
}
aug1 <- sum(do.call(cbind, lapply(1:TT, FUN = function(t){
multmat <- apply(y[[t]], FUN = function(yy){
t(yy - mu[,t]) %*% Sigma_inv %*% (yy - mu[,t])}, MARGIN = 1)
sum(1/(2*N) * resp[[t]]*multmat)
})))
aug2 <- lambda*sum(do.call(rbind, lapply(1:d, FUN = function(j)
sum(abs(diff(w[j,], differences = 1))))))
aug3 <- sum(do.call(cbind, lapply(1:TT, FUN = function(t){
uz[t,]%*%(mu[,t] - mu_dd - z[t,]) + rho/2 * sum((mu[,t] - mu_dd - z[t,])^2)
})))
aug4 <- sum(do.call(rbind, lapply(1:d, FUN = function(j){
uw[,j] %*% (Dl %*% mu[j,] - w[j,]) + rho/2 * sum((Dl %*% mu[j,] - w[j,])^2)
})))
total <- aug1 + aug2 + aug3 + aug4
return(total)
}

Here’s a function to calculate the objective value for the ADMM.
\[\frac{1}{2N} \sum_{t=1}^T \sum_{i=1}^{n_t} \hat{\gamma}_{it} (y_i^{(t)} - \mu_{\cdot t})^T \hat{\Sigma}^{-1} ( y_i^{(t)} - \mu_{\cdot t}) + \lambda \sum_{j=1}^d \|D^{(l+1)}\mu_{j\cdot }\|_1\]
(This is the penalized surrogate objective \(Q_{\tilde \theta}(\mu, \Sigma, \pi) + \lambda \sum_{j=1}^d \|D^{(l+1)} \mu_{j \cdot}\|_1\), keeping only the parts that depend on \(\mu\), and leaving out the constant \(C=-\frac{d}{2}\log(2\pi) - \frac{1}{2} \log \det(\Sigma_k)\), since \(C\) doesn’t change over ADMM iterations.)
#' computes the ADMM objective for one cluster
objective_per_cluster <- function(y, TT, d, l, mu, resp, Sigma_inv, Dlp1, Dl,
maxdev, lambda, rho, N){
aug1 <- sum(do.call(cbind, lapply(1:TT, FUN = function(tt){
multmat <- apply(y[[tt]], FUN = function(yy){
t(yy - mu[tt,]) %*% Sigma_inv %*% (yy - mu[tt,])}, MARGIN = 1)
sum(1/(2*N) * resp[[tt]] * multmat)
})))
aug2 <- lambda*sum(do.call(rbind, lapply(1:d, FUN = function(j)
sum(abs(diff(mu[,j], differences = l+1)))))) ## (l+1)-th differences, matching the penalty above
total <- aug1 + aug2
return(total)
}

5.5.5 All Rcpp functions
The main function we write in Rcpp is the “barebones” Sylvester equation solver that takes upper-triangular coefficient matrices.
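Once compiled, it can be sanity-checked against a direct residual computation (a sketch using myschur() from above; symmetric positive-definite inputs keep the real Schur factors triangular and the diagonal sums nonzero):

set.seed(1)
A = crossprod(matrix(rnorm(9), 3, 3)) + diag(3)
B = crossprod(matrix(rnorm(16), 4, 4)) + diag(4)
sA = myschur(A); sB = myschur(B)
C = matrix(rnorm(12), 3, 4)
FF = sA$tQ %*% C %*% sB$Q
X = sA$Q %*% matrix_function_solve_triangular_sylvester_barebonesC2(sA$T, sB$T, FF) %*% sB$tQ
max(abs(A %*% X + X %*% B - C)) ## ~ 0, since the solver handles TA X + X TB = C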
// [[Rcpp::depends(RcppArmadillo)]]
// [[Rcpp::depends(RcppEigen)]]
#include <RcppArmadillo.h>
#include <RcppEigen.h>
#include <numeric>
using namespace arma;
using namespace Eigen;
using Eigen::Map; // 'maps' rather than copies
using Eigen::MatrixXd; // variable size matrix, double precision
//' Solve "barebones" sylvester equation that takes upper triangular matrices as coefficients.
//'
//' @param TA Upper-triangular matrix
//' @param TB Upper-triangular matrix
//' @param C matrix
//' @export
// [[Rcpp::export]]
Eigen::MatrixXd matrix_function_solve_triangular_sylvester_barebonesC2(const Eigen::MatrixXd & TA,
const Eigen::MatrixXd & TB,
const Eigen::MatrixXd & C){
// Eigen::eigen_assert(TA.rows() == TA.cols());
// Eigen::eigen_assert(TA.Eigen::isUpperTriangular());
// Eigen::eigen_assert(TB.rows() == TB.cols());
// Eigen::eigen_assert(TB.Eigen::isUpperTriangular());
// Eigen::eigen_assert(C.rows() == TA.rows());
// Eigen::eigen_assert(C.cols() == TB.rows());
// typedef typename MatrixType::Index Index;
// typedef typename MatrixType::Scalar Scalar;
int m = TA.rows();
int n = TB.rows();
Eigen::MatrixXd X(m, n);
for (int i = m - 1; i >= 0; --i) {
for (int j = 0; j < n; ++j) {
// Compute T_A X = \sum_{k=i+1}^m T_A_{ik} X_{kj}
double TAX;
if (i == m - 1) {
TAX = 0;
} else {
MatrixXd TAXmatrix = TA.row(i).tail(m-1-i) * X.col(j).tail(m-1-i);
TAX = TAXmatrix(0,0);
}
// Compute X T_B = \sum_{k=1}^{j-1} X_{ik} T_B_{kj}
double XTB;
if (j == 0) {
XTB = 0;
} else {
MatrixXd XTBmatrix = X.row(i).head(j) * TB.col(j).head(j);
XTB = XTBmatrix(0,0);
}
X(i,j) = (C(i,j) - TAX - XTB) / (TA(i,i) + TB(j,j));
}
}
return X;
}

Also, we define a faster dmvnorm function written in C++.
// [[Rcpp::depends(RcppArmadillo)]]
#include <RcppArmadillo.h>
static double const log2pi = std::log(2.0 * M_PI);
/* C++ version of the dtrmv BLAS function */
void inplace_tri_mat_mult(arma::rowvec &x, arma::mat const &trimat){
arma::uword const n = trimat.n_cols;
for(unsigned j = n; j-- > 0;){
double tmp(0.);
for(unsigned i = 0; i <= j; ++i)
tmp += trimat.at(i, j) * x[i];
x[j] = tmp;
}
}
// [[Rcpp::export]]
arma::vec dmvnorm_arma_fast(arma::mat const &x,
arma::rowvec const &mean,
arma::mat const &sigma,
bool const logd = false) {
using arma::uword;
uword const n = x.n_rows,
xdim = x.n_cols;
arma::vec out(n);
arma::mat const rooti = arma::inv(trimatu(arma::chol(sigma)));
double const rootisum = arma::sum(log(rooti.diag())),
constants = -(double)xdim/2.0 * log2pi,
other_terms = rootisum + constants;
arma::rowvec z;
for (uword i = 0; i < n; i++) {
z = (x.row(i) - mean);
inplace_tri_mat_mult(z, rooti);
out(i) = other_terms - 0.5 * arma::dot(z, z);
}
if (logd)
return out;
return exp(out);
}
// All credit goes to https://gallery.rcpp.org/articles/dmvnorm_arma/

// [[Rcpp::depends(RcppArmadillo)]]
#include <RcppArmadillo.h>
#include <vector>
#include <cmath> // For standard C++ functions if needed, though not strictly required here
using namespace std;
// Dynamic programming algorithm for the 1d fused lasso problem
// (Ryan Tibshirani's implementation of Nick Johnson's algorithm)
void prox_dp_core_cpp(
int n,
const double *y_ptr, // Use const since input y is not modified
double lam,
double *beta_ptr)
{
// Take care of a few trivial cases
if (n == 0) return;
if (n == 1 || lam == 0) {
for (int i = 0; i < n; i++) beta_ptr[i] = y_ptr[i];
return;
}
// Use std::vector for automatic memory management (RAII) instead of malloc/free
// These store the derivative knots (x) and coefficients (a, b)
std::vector<double> x(2 * n);
std::vector<double> a(2 * n);
std::vector<double> b(2 * n);
int l, r;
// These are the knots of the back-pointers
std::vector<double> tm(n - 1);
std::vector<double> tp(n - 1);
// Aliases to make the original logic clearer (using ptrs to vector data)
double *x_data = x.data();
double *a_data = a.data();
double *b_data = b.data();
double *tm_data = tm.data();
double *tp_data = tp.data();
double afirst, alast, bfirst, blast;
// We step through the first iteration manually
tm_data[0] = -lam + y_ptr[0];
tp_data[0] = lam + y_ptr[0];
l = n - 1;
r = n;
x_data[l] = tm_data[0];
x_data[r] = tp_data[0];
a_data[l] = 1;
b_data[l] = -y_ptr[0] + lam;
a_data[r] = -1;
b_data[r] = y_ptr[0] + lam;
afirst = 1;
bfirst = -lam - y_ptr[1];
alast = -1;
blast = -lam + y_ptr[1];
// Now iterations 2 through n-1
int lo, hi;
double alo, blo, ahi, bhi;
for (int k = 1; k < n - 1; k++) {
// Compute lo: step up from l until the
// derivative is greater than -lam
alo = afirst;
blo = bfirst;
for (lo = l; lo <= r; lo++) {
if (alo * x_data[lo] + blo > -lam) break;
alo += a_data[lo];
blo += b_data[lo];
}
// Compute the negative knot
tm_data[k] = (-lam - blo) / alo;
l = lo - 1;
x_data[l] = tm_data[k];
// Compute hi: step down from r until the
// derivative is less than lam
ahi = alast;
bhi = blast;
for (hi = r; hi >= l; hi--) {
if (-ahi * x_data[hi] - bhi < lam) break;
ahi += a_data[hi];
bhi += b_data[hi];
}
// Compute the positive knot
tp_data[k] = (lam + bhi) / (-ahi);
r = hi + 1;
x_data[r] = tp_data[k];
// Update a and b
a_data[l] = alo;
b_data[l] = blo + lam;
a_data[r] = ahi;
b_data[r] = bhi + lam;
afirst = 1;
bfirst = -lam - y_ptr[k + 1];
alast = -1;
blast = -lam + y_ptr[k + 1];
}
// Compute the last coefficient: this is where
// the function has zero derivative
alo = afirst;
blo = bfirst;
for (lo = l; lo <= r; lo++) {
if (alo * x_data[lo] + blo > 0) break;
alo += a_data[lo];
blo += b_data[lo];
}
beta_ptr[n - 1] = -blo / alo;
// Compute the rest of the coefficients, by the
// back-pointers
for (int k = n - 2; k >= 0; k--) {
if (beta_ptr[k + 1] > tp_data[k]) beta_ptr[k] = tp_data[k];
else if (beta_ptr[k + 1] < tm_data[k]) beta_ptr[k] = tm_data[k];
else beta_ptr[k] = beta_ptr[k + 1];
}
// Memory cleanup is now automatic because std::vector is used.
}
//' Implements the dynamic programming algorithm for the 1D Fused Lasso proximal operator.
//'
//' @param z A numeric vector of input values ($\mathbf{y}$).
//' @param lam A numeric scalar, the regularization parameter ($\lambda$).
//' @return A numeric vector, the result of the proximal operation ($\boldsymbol{\beta}$).
//' @export
// [[Rcpp::export]]
arma::vec prox_dp(arma::vec z, double lam) {
// Set up the output vector
int n = z.n_elem;
arma::vec beta = arma::zeros<arma::vec>(n);
// Call the core C++ logic
// z.memptr() provides a pointer to the internal data (const for input)
// beta.memptr() provides a pointer to the internal data (non-const for output)
prox_dp_core_cpp(n, z.memptr(), lam, beta.memptr());
return beta;
}

We need this blob (from https://github.com/jacobbien/litr-project/blob/main/examples/make-an-r-package-with-armadillo/create-witharmadillo.Rmd ):
## ℹ Leaving 'src/code.cpp' unchanged.
## ☐ Edit 'src/code.cpp'.
## ✔ Adding RcppArmadillo to 'LinkingTo' field in DESCRIPTION.
## ✔ Created 'src/Makevars' and 'src/Makevars.win' with requested compilation settings.
## ℹ Leaving 'src/code.cpp' unchanged.
## ☐ Edit 'src/code.cpp'.
## ✔ Adding RcppEigen to 'LinkingTo' field in DESCRIPTION.
## ✔ Adding "@import RcppEigen" to 'R/flowtrend-package.R'.
## ☐ Run `devtools::document()` to update 'NAMESPACE'.
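Before moving on, here is a quick sanity check of the fast density against mvtnorm::dmvnorm() (a sketch; it assumes the mvtnorm package is available):

set.seed(1)
y = matrix(rnorm(20), ncol = 2)
mu = c(0, 0); S = diag(2)
max(abs(dmvnorm_arma_fast(y, mu, S, FALSE) - mvtnorm::dmvnorm(y, mu, S))) ## ~ 0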
#' Used as a fallback or to test against \code{Mstep_mu()}.
#'
#' @param ylist Data.
#' @param resp Responsibilities for one cluster.
#' @param lambda Regularization parameter for the means.
#' @param l Trend filtering degree.
#' @param Sigma_inv inverse of Sigma
#' @param x covariates
Mstep_mu_cvxr <- function(ylist,
resp,
lambda,
l,
Sigma_inv,
x = NULL,
thresh = 1E-8,
maxdev = NULL,
dimdat,
N,
ecos_thresh = 1E-8,
scs_eps = 1E-5){
## Define dimensions
TT = length(ylist)
## Responsibility Weighted Data
ytildes <- lapply(1:TT, FUN = function(tt){
yy <- ylist[[tt]]
g <- resp[[tt]]
yy <- apply(yy, MARGIN = 2, FUN = function(x) x * g)
yrow = matrix(c(NA, NA), nrow=1, ncol=dimdat)
yrow[1,] = colSums(yy)
yrow
}) %>% do.call(rbind, .)
## Auxiliary term, needed to make the objective interpretable
aux.y <- Reduce("+", lapply(1:TT, FUN = function(tt){
yy <- ylist[[tt]]
g <- sqrt(resp[[tt]])
yy <- apply(yy, MARGIN = 2, FUN = function(x) x * g)
sum(diag(yy %*% Sigma_inv %*% t(yy)))
}))
## Mu, d x T matrix
mumat <- CVXR::Variable(cols=dimdat, rows=TT)
## Summed sqrt responsibilities - needed in the objective.
resp.sum.sqrt <- lapply(resp, FUN = function(x) sqrt(sum(x)))
## Differencing Matrix, (TT-(l+1)) x TT
Dlp1 <- gen_diff_mat(n = TT, l = l+1, x = x)
# l = 2 is quadratic trend filtering
# l = 1 is linear trend filtering
# l = 0 is fused lasso
## Forming the objective
obj = 1/(2*N) *( Reduce("+", lapply(1:TT, FUN = function(tt) CVXR::quad_form(t(resp.sum.sqrt[[tt]]*mumat[tt,]), Sigma_inv))) -2 * Reduce("+", lapply(1:TT, FUN = function(tt) t(ytildes[tt,]) %*% Sigma_inv %*% t(mumat[tt,]))) + aux.y) + lambda * sum(CVXR::sum_entries(abs(Dlp1 %*% mumat), axis = 1))
## Putting together the ball constraint
rowmns <- matrix(rep(1, TT^2), nrow = TT)/TT
mu_dotdot <- rowmns %*% mumat
constraints = list()
if(!is.null(maxdev)){
constraints = list(CVXR::sum_entries(CVXR::square(mumat - mu_dotdot), axis = 2) <= rep(maxdev^2, TT) )
}
## Try the two CVXR solvers in turn.
prob <- CVXR::Problem(CVXR::Minimize(obj), constraints)
result = NULL
result <- tryCatch({
CVXR::solve(prob, solver="ECOS",
FEASTOL = ecos_thresh, RELTOL = ecos_thresh, ABSTOL = ecos_thresh)
}, error=function(err){
err$message = paste(err$message,
"\n", "Lasso solver using ECOS has failed." ,sep="")
cat(err$message, fill=TRUE)
return(NULL)
})
## If anything is wrong, flag to use SCS solver
scs = FALSE
if(is.null(result)){
scs = TRUE
} else {
if(result$status != "optimal") scs = TRUE
}
## Use the SCS solver
if(scs){
result = CVXR::solve(prob, solver="SCS", eps = scs_eps)
if(any(is.na(result$getValue(mumat)))){ ## A clumsy way to check.
stop("Lasso solver using both ECOS and SCS has failed.", sep="")
}
}
## Record Interesting Parameters
num_iters <- result$num_iters
status <- result$status
mumat <- result$getValue(mumat)
val <- result$value
return(list(mu = mumat, value = val, status = status, num_iters = num_iters))
}

This function solves the following problem:
\[ \begin{align*} &\text{minimize}_{\mu} {\frac{1}{2N} \sum_{t=1}^T \sum_{i=1}^{n_t} \hat{\gamma}_{it} (y_i^{(t)} - \mu_{\cdot t})^\top \hat{\Sigma}^{-1} ( y_i^{(t)} - \mu_{\cdot t}) + \lambda \sum_{j=1}^d \|D^{(l+1)}\mu_{j\cdot }\|_1}\\ &\text{subject to}\;\; {\| \mu_{\cdot t} - \bar{\mu}_{\cdot \cdot}\|_2 \le r \;\;\forall t=1,\cdots, T, } \end{align*} \]
and is directly equivalent to Mstep_mu(). The resulting solution and the
objective value should be the same. Let’s check that.
First, set up some objects to run Mstep_mu() and Mstep_mu_cvxr().
numclust = 3
TT = 100
dimdat = 1
set.seed(0)
dt = gendat_1d(TT = TT, ntlist = rep(TT, 100))
ylist = dt %>% dt2ylist()
sigma = init_sigma(ylist, numclust) ## (K x dimdat x dimdat)
mn = init_mn(ylist, numclust, TT, dimdat)##, countslist = countslist)
prob = matrix(1/numclust, nrow = TT, ncol = numclust) ## Initialize to all 1/K.
resp = Estep(mn, sigma, prob, ylist = ylist, numclust)
lambda = .01
l = 1
x = 1:TT
Dlp1 = gen_diff_mat(n = TT, l = l+1, x = x)
Dl = gen_diff_mat(n = TT, l = l, x = x)
Dlsqrd <- t(Dl) %*% Dl
maxdev = NULL

Then, compare the result of the two implementations. They should look identical.
## overall ADMM
N = sum(sapply(ylist, nrow)) ## Total number of particles
res1 = Mstep_mu(resp, ylist, lambda, l=l, sigma=sigma, Dlsqrd = Dlsqrd, Dl=Dl, Dlp1=Dlp1, TT=TT, N=N, dimdat=dimdat, e_mat=etilde_mat(TT = TT),
maxdev = maxdev)
mn1 = res1$mns
## CVXR just ONE cluster
res2list = lapply(1:numclust, function(iclust){
Sigma_inv_oneclust = solve(sigma[iclust,,])
resp_oneclust = lapply(resp, function(resp_onetime){resp_onetime[,iclust, drop=FALSE]})
N = sum(unlist(resp))
res2 = Mstep_mu_cvxr(ylist, resp_oneclust,
lambda, l, Sigma_inv_oneclust,
thresh = 1E-8, maxdev = maxdev, dimdat, N)
res2$mu
})
mn2 = array(NA, dim=c(100, dimdat, 3))
for(iclust in 1:numclust){
mn2[,,iclust] = res2list[[iclust]] %>% as.matrix()
}
## Plot them
plot(x = dt$time, y=dt$Y, col = rgb(0,0,0,0.04),
main = 'admm (solid) vs cvxr (dashed)',
ylab = "", xlab = "time")
mn1[,1,] %>% matlines(lwd = 1, lty = 1)
mn2[,1,] %>% matlines(lwd = 3, lty = 3)

Next, do this for uneven inputs.
## Setup
numclust = 3
TT = 100
dimdat = 1
set.seed(0)
dt = gendat_1d(TT = TT, ntlist = rep(TT, 100))
ylist = dt %>% dt2ylist()
## Subset them
ylist = ylist[-seq(from=10,to=100,by=10)]
x = (1:100)[-seq(from=10,to=100,by=10)]
TT = length(ylist) ## Update TT after subsetting
sigma = init_sigma(ylist, numclust) ## (K x dimdat x dimdat)
mn = init_mn(ylist, numclust, TT, dimdat)##, countslist = countslist)
prob = matrix(1/numclust, nrow = TT, ncol = numclust) ## Initialize to all 1/K.
resp = Estep(mn, sigma, prob, ylist = ylist, numclust)
lambda = .1
l = 2
##x = 1:TT
Dlp1 = gen_diff_mat(n = length(x), l = l+1, x = x)
Dl = gen_diff_mat(n = length(x), l = l, x = x)
Dlsqrd <- t(Dl) %*% Dl
maxdev = NULL
## Try the algorithm itelf.
set.seed(100)
obj = flowtrend_once(ylist = ylist, x = x, lambda = .1, lambda_prob = .1,
l = 2, l_prob = 2, maxdev = 5, numclust = 3,
rho_init = 0.01, verbose = TRUE)
plot_1d(ylist=ylist, obj=obj)
plot(obj$objectives, type ='l')
mn1 = obj$mn
## Third differences of the fitted means; trend filtering with l = 2 should
## make these nearly sparse.
mn1[,1,] %>% diff() %>% diff() %>% diff() %>% matplot(type='l')
## For comparison, the initial means and their third differences, which are
## not at all sparse.
mn[,1,] %>% diff() %>% diff() %>% diff() %>% matplot(type='l')
mn[,1,] %>% matplot(type='l')
## The initial means are quite irregular (cluster 3 especially).
## Plot the data along with the fitted cluster means.
plot(x = dt$time, y = dt$Y, col = rgb(0,0,0,0.04),
main = "flowtrend fit on unevenly spaced inputs",
ylab = "", xlab = "time")
mn1[,1,] %>% matlines(x = x, lwd = 2, lty = 1)
5.6 The main “flowtrend” function
Now that we have assembled all of the ingredients, we can build the main function
flowtrend_once(), which estimates a flowtrend model from a single initialization. Here goes:
#' Estimate flowtrend model once.
#'
#' @param ylist Data.
#' @param countslist Counts corresponding to multiplicities.
#' @param x Times at which the data are observed, if points are not evenly
#'   spaced. Defaults to NULL, in which case it is set to \code{1:T}, where
#'   \code{T = length(ylist)}.
#' @param numclust Number of clusters.
#' @param niter Maximum number of EM iterations.
#' @param l Degree of differencing for the mean trend filtering. \code{l=0}
#'   gives piecewise-constant means; \code{l=1} gives piecewise-linear means,
#'   and so forth.
#' @param l_prob Degree of differencing for the probability trend filtering.
#' @param mn Initial value for cluster means. Defaults to NULL, in which case
#' initial values are randomly chosen from the data.
#' @param lambda Smoothing parameter for means
#' @param lambda_prob Smoothing parameter for probabilities
#' @param verbose Loud or not? EM iteration progress is printed.
#' @param tol_em Relative numerical improvement of the objective value at which
#' to stop the EM algorithm
#' @param maxdev Maximum deviation of cluster means across time.
#' @param countslist_overwrite Optional replacement for \code{countslist};
#'   used only for blocked cross-validation.
#' @param admm Use the ADMM-based M step for the cluster means?
#' @param admm_err_rel Relative convergence tolerance for the ADMM.
#' @param admm_err_abs Absolute convergence tolerance for the ADMM.
#' @param admm_local_adapt Use the locally adaptive ADMM?
#' @param admm_local_adapt_niter Number of outer iterations for the locally
#'   adaptive ADMM.
#' @param rho_init Initial value of the ADMM penalty parameter \code{rho}.
#' @param check_convergence Stop the EM early once the relative change in the
#'   objective falls below \code{tol_em}?
#' @param seed Optional random seed: a length-7 integer vector in the style of
#'   \code{.Random.seed}, used when initializing cluster means.
#'
#' @return List object with flowtrend model estimates.
#' @export
#'
#' @examples
flowtrend_once <- function(ylist,
countslist = NULL,
x = NULL,
numclust, niter = 1000, l, l_prob = NULL,
mn = NULL, lambda = 0, lambda_prob = NULL, verbose = FALSE,
tol_em = 1E-4,
maxdev = NULL,
countslist_overwrite = NULL,
## beta Mstep (ADMM) settings
admm = TRUE,
admm_err_rel = 1E-3,
admm_err_abs = 1E-4,
## Mean M step (Locally Adaptive ADMM) settings
admm_local_adapt = TRUE,
admm_local_adapt_niter = if(admm_local_adapt) 10 else 1,
rho_init = 0.1,
## Other options
check_convergence = TRUE,
## Random seed
seed = NULL){
## Basic checks
if(!is.null(maxdev)){
assertthat::assert_that(maxdev!=0)
} else {
maxdev = 1E10
}
assertthat::assert_that(numclust > 1)
assertthat::assert_that(niter > 1)
if(is.null(countslist)){
ntlist = sapply(ylist, nrow)
countslist = lapply(ntlist, FUN = function(nt) rep(1, nt))
}
if(!is.null(seed)){
assertthat::assert_that(is.integer(seed))
assertthat::assert_that(length(seed) == 7)
}
## Setup for EM algorithm
TT = length(ylist)
dimdat = ncol(ylist[[1]])
if(is.null(x)) x <- 1:TT
if(is.unsorted(x)) stop("x must be ordered!")
## l = 2 is quadratic trend filtering,
## l = 1 is linear trend filtering,
## l = 0 is the fused lasso;
## D^{(1)} is the first-difference matrix, so it corresponds to l = 0.
Dlp1 = gen_diff_mat(n = TT, l = l+1, x = x)
if(l > 1){
## For unevenly spaced inputs, rescale the l-th differences by
## l / (x_{i+l} - x_i), following the standard trend filtering recursion.
facmat = diag(l / diff(x, lag = l))
} else {
facmat = diag(rep(1, TT-l))
}
Dl = facmat %*% gen_diff_mat(n = TT, l = l, x = x)
tDl = t(Dl)
Dlsqrd <- t(Dl) %*% Dl
e_mat <- etilde_mat(TT = TT) # needed to generate B
Dlp1_prob = gen_diff_mat(n = TT, l = l_prob+1, x = x)
H_tf <- gen_tf_mat(n = TT, k = l_prob, x = x)
if(is.null(mn)){
mn = init_mn(ylist, numclust, TT, dimdat,
countslist = countslist, seed = seed)
}
ntlist = sapply(ylist, nrow)
N = sum(ntlist)
## Initialize some objects
prob = matrix(1/numclust, nrow = TT, ncol = numclust) ## Initialize to all 1/K.
denslist_by_clust <- NULL
objectives = c(+1E20, rep(NA, niter-1))
sigma_fac <- diff(range(do.call(rbind, ylist)))/8
sigma = init_sigma(ylist, numclust, sigma_fac) ## (T x numclust x (dimdat x dimdat))
sigma_eig_by_clust = NULL
zero.betas = zero.alphas = list()
## A workaround, used only for blocked cross-validation.
if(!is.null(countslist_overwrite)) countslist = countslist_overwrite
latest_rho = NA
start.time = Sys.time()
for(iter in 2:niter){
if(verbose){
print_progress(iter-1, niter-1, "EM iterations.", start.time = start.time, fill = FALSE)
}
resp <- Estep(mn, sigma, prob, ylist = ylist, numclust = numclust,
denslist_by_clust = denslist_by_clust,
first_iter = (iter == 2), countslist = countslist,
padding = 1E-8)
## M step (three parts)
## 1. Means
res_mu = Mstep_mu(resp, ylist,
lambda = lambda,
first_iter = (iter == 2),
l = l, Dlp1 = Dlp1, Dl = Dl,
tDl = tDl,
Dlsqrd = Dlsqrd,
sigma_eig_by_clust = sigma_eig_by_clust,
sigma = sigma, maxdev = maxdev,
e_mat = e_mat,
Zs = NULL,
Ws = NULL,
uws = NULL,
uzs = NULL,
x = x,
err_rel = admm_err_rel,
err_abs = admm_err_abs,
local_adapt = admm_local_adapt,
local_adapt_niter = admm_local_adapt_niter,
rho_init = rho_init,
iter = iter)
## rho_init = (if(iter == 2) rho_init else latest_rho))
## latest_rho = res_mu$rho
mn = res_mu$mns
## 2. Sigma
sigma = Mstep_sigma(resp, ylist, mn, numclust)
## 3. Probabilities
prob_link = Mstep_prob(resp, countslist = countslist, H_tf = H_tf,
lambda_prob = lambda_prob, l_prob = l_prob, x = x)
prob = softmax(prob_link) ## Map link values back onto the probability simplex, row-wise.
objectives[iter] = objective(ylist = ylist, mu = mn, sigma = sigma, prob = prob, prob_link = prob_link,
lambda = lambda, Dlp1 = Dlp1, l = l, countslist = countslist,
Dlp1_prob = Dlp1_prob,
l_prob = l_prob,
lambda_prob = lambda_prob)
## Check convergence
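## (Stop once the relative improvement of the objective, as measured by
## check_converge_rel(), has stayed below tol_em for three consecutive
## iterations; requiring three flat steps guards against stopping early
## on a single slow iteration.)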
if(iter > 10){
if(check_convergence &&
check_converge_rel(objectives[iter-1], objectives[iter], tol = tol_em) &&
check_converge_rel(objectives[iter-2], objectives[iter-1], tol = tol_em) &&
check_converge_rel(objectives[iter-3], objectives[iter-2], tol = tol_em)){
## check_converge_rel(objectives[iter-4], objectives[iter-3], tol = tol_em)){
break
}
}
}
return(structure(list(mn = mn,
prob = prob,
prob_link = prob_link,
sigma = sigma,
objectives = objectives[2:iter],
final.iter = iter,
## Above is output, below are data/algorithm settings.
dimdat = dimdat,
TT = TT,
N = N,
l = l,
l_prob = l_prob,
x = x,
numclust = numclust,
lambda = lambda,
lambda_prob = lambda_prob,
maxdev = maxdev,
niter = niter
), class = "flowtrend"))
}
Next, flowtrend() is the main user-facing function.
#' Main function. Repeats the EM algorithm (\code{flowtrend_once()}) over \code{nrestart} random restarts (5 by default) and returns the best fit.
#'
#' @param ... Arguments for \code{flowtrend_once()}.
#' @param nrestart Number of random restarts.
#'
#' @return The model (among the \code{nrestart} fits) with the lowest final
#'   objective value, with the objective traces of all restarts attached as
#'   \code{all_objectives}.
#' @export
#'
#' @examples
flowtrend <- function(..., nrestart = 5){
args = list(...)
verbose = isTRUE(args[["verbose"]])
if(verbose) cat("EM will restart", nrestart, "times", fill=TRUE)
out_models <- lapply(1:nrestart, FUN = function(irestart){
if("verbose" %in% names(args)){
if(args$verbose){
cat("EM restart:", irestart, fill=TRUE)
}
}
model_temp <- flowtrend_once(...)
model_obj <- tail(model_temp$objectives, n = 1)
if("verbose" %in% names(args)){
if(args$verbose){
cat(fill=TRUE)
}
}
return(list(model = model_temp, final_objective = model_obj))
})
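## Pick the restart whose final penalized objective is lowest.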
final_objectives <- sapply(out_models, FUN = function(x) x$final_objective)
best_model <- which.min(final_objectives)
final_model = out_models[[best_model]][["model"]]
## Add the objectives
final_model$all_objectives =
lapply(1:nrestart, function(irestart){
one_model = out_models[[irestart]]
data.frame(objective = one_model$model$objectives) %>%
mutate(iter = row_number(), irestart = irestart) %>%
select(irestart, iter, objective)
}) %>% bind_rows()
return(final_model)
}
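To close this section, here is a minimal usage sketch of flowtrend(), reusing the data generator and plotting helper from earlier; the tuning parameter values are arbitrary and chosen only for illustration:
set.seed(1)
dt = gendat_1d(TT = 50, ntlist = rep(50, 50))
ylist = dt %>% dt2ylist()
fit = flowtrend(ylist = ylist, numclust = 3, l = 1, l_prob = 1,
lambda = 0.1, lambda_prob = 0.1, maxdev = 5,
nrestart = 3, verbose = TRUE)
## Inspect the objective traces across restarts, then plot the chosen model.
head(fit$all_objectives)
plot_1d(ylist = ylist, obj = fit)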