#
# em.S
#	
# main routine
#
# Splus code of the EM algorithm for mixture models
# 	copyright by Shotaro Akaho <s.akaho@aist.go.jp>
#                    AIST Neuroscience Research Instutute, Japan 
# Notice:
#  1. Only noncommercial use is allowed
#  2. Attach this header in the case of redistribution
#  3. If you find any bugs, it is strongly recommended to inform them to me.

estimate <- function(model, ...) {
  UseMethod("estimate")
}

dprob <- function(model, ...) {
  UseMethod("dprob")
}

rprob <- function(model, ...) {
  UseMethod("rprob")
}

### template for an example class "temp"
## constructor :
#   temp <- function(...) { 
#     tmp <- ...
#     class(tmp) <- "temp"
#     tmp
#   }
## estimator
#   estimate.temp <- function(model, dat, weight = 1) {
#   }
## density function
#   dprob.temp <- function(model, dat) {
#   }
## random number generator
#   rprob.temp <- function(model, n = 1) {
#   }
##
## print form (optional)
#   print.temp <- function(model, fullform=F) {
#   }
###
### generic functions
## likelihood
#  like.em <- function(model, dat)
## plot form
#   plot.em <- function(model, dat)

# typeI class (any complex shape kernel with a scale and a shift parameters)

typeI <- function(model, scale = rep(1, length(model$models[[1]]$mu)), 
  shift = rep(0, length(scale))) {
  if(class(model) != "mixture") {
    stop("The first argument must be mixture class")
  }
  k <- length(model$models)
  for(i in 1:k) {
    if(class(model$models[[i]]) != "gaussdiag")
	stop("typeI : model must be mixture of gaussdiag class")
  }
  model$scale <- scale
  model$shift <- shift
  unclass(model)
  class(model) <- "typeI"
  model
}

estimate.typeI <- function(model, dat, weight = 1) {
  dat <- as.matrix(dat)
  n <- nrow(dat)
  wt <- rep(weight, length = n)  
  wt <- wt / sum(wt)
  k <- length(model$models)
  d <- length(model$scale)
  rep1k <- rep(1, k)

  dat2 <- sweep(dat, 2, model$scale, "*")
  dat2 <- sweep(dat2, 2, model$shift, "+")
  pr <- prod(model$scale) * condprob.mixture(model, dat2, k)
  vars <- mus <- matrix(0, k, d)
  for(j in 1:k) {
    mus[j, ] <- model$models[[j]]$mu
    vars[j, ] <- (model$models[[j]]$sd)^2
  }

  valZ <- sum(wt * apply(pr, 1, sum))
#  valZ <- sum(wt * c(pr %*% rep1k))
  for(i in 1:d) {
    pr2 <- sweep(pr, 2, vars[, i], "/")
#    wt2 <- apply(pr2, 1, sum)
    wt2 <- c(pr2 %*% rep1k)
    mupr2 <- sweep(pr2, 2, mus[, i], "*")
#    muwt2 <- apply(mupr2, 1, sum)
    muwt2 <- c(mupr2 %*% rep1k)
    # shift parameter
    valU <- sum(wt * (muwt2 - model$scale[i] * dat[, i] * wt2))
    valV <- sum(wt * wt2)
    model$shift[i] <- valU / valV
    # scale parameter
    valX <- sum(wt * dat[, i]^2 * wt2)
    valY <- sum(wt * dat[, i] * (model$shift[i] * wt2 - muwt2))
    model$scale[i] <- (sqrt(valY^2 + 4 * valX * valZ) - valY) / valX / 2
  }
  model
}

dprob.typeI <- function(model, dat) {
  dat <- as.matrix(dat)
  n <- nrow(dat)
  dat <- sweep(dat, 2, model$scale, "*")
  dat <- sweep(dat, 2, model$shift, "+")
  prod(model$scale) * dprob.mixture(model, dat)
}

rprob.typeI <- function(model, n = 1) {
  tmp <- rprob.mixture(model, n)
  tmp <- sweep(tmp, 2, model$shift, "-")
  tmp <- sweep(tmp, 2, model$scale, "/")
  tmp
}

# typeII class (any shape of kernels with a shift, scale and rotation
#               parameter, but only 2-dim)

typeII <- function(model, sclrot = c(1, 0), shift = rep(0, 2)) {
  if(class(model) != "mixture") {
    stop("The first argument must be mixture class")
  }
  k <- length(model$models)
  for(i in 1:k) {
    if(class(model$models[[i]]) != "gaussconst")
	stop("typeII : model must be mixture of gaussconst class")
  }
  model$sclrot <- sclrot[1:2]
  model$shift <- shift[1:2]
  unclass(model)
  class(model) <- "typeII"
  model
}

estimate.typeII <- function(model, dat, weight = 1) {
  dat <- as.matrix(dat)
  n <- nrow(dat)
  wt <- rep(weight, length = n)  
  wt <- wt / sum(wt)
  k <- length(model$models)
  rep1k <- rep(1, k)

  h <- model$sclrot
  tH <- matrix(c(h[1], h[2], -h[2], h[1]), 2, 2)
  dat2 <- dat %*% tH
  dat2 <- sweep(dat2, 2, model$shift, "+")
  pr <- (h[1]^2 + h[2]^2) * condprob.mixture(model, dat2, k)

  var <- rep(0, k)
  mus <- matrix(0, k, 2)
  for(j in 1:k) {
    mus[j, ] <- model$models[[j]]$mu
    var[j] <- (model$models[[j]]$sd)^2
  }

#  valZ <- sum(wt * apply(pr, 1, sum))
  valZ <- sum(wt * c(pr %*% rep1k))
  pr2 <- sweep(pr, 2, var, "/")
#  wt2 <- apply(pr2, 1, sum)
  wt2 <- c(pr2 %*% rep1k)
  mu1pr2 <- sweep(pr2, 2, mus[, 1], "*")
#  mu1wt2 <- apply(mu1pr2, 1, sum)
  mu1wt2 <- c(mu1pr2 %*% rep1k)
  mu2pr2 <- sweep(pr2, 2, mus[, 2], "*")
#  mu2wt2 <- apply(mu2pr2, 1, sum)
  mu2wt2 <- c(mu2pr2 %*% rep1k)
  # shift parameter
  valU1 <- sum(wt * (mu1wt2 - (h[1] * dat[, 1] + h[2] * dat[, 2]) * wt2))
  valU2 <- sum(wt * (mu2wt2 - (h[1] * dat[, 2] - h[2] * dat[, 1]) * wt2))
  valV <- sum(wt * wt2)
  model$shift[1] <- valU1 / valV
  model$shift[2] <- valU2 / valV
  # scale parameter
  valX <- sum(wt * (dat[, 1]^2  + dat[, 2]^2) * wt2)
  tmp1 <- mu1wt2 - model$shift[1] * wt2
  tmp2 <- mu2wt2 - model$shift[2] * wt2
  valY1 <- sum(wt * (tmp1 * dat[, 1] + tmp2 * dat[, 2]))
  valY2 <- sum(wt * (tmp1 * dat[, 2] - tmp2 * dat[, 1]))
  tmp <- (1 + sqrt(1 + 8 * valZ * valX / (valY1^2 + valY2^2))) / valX / 2
  model$sclrot[1] <- valY1 * tmp
  model$sclrot[2] <- valY2 * tmp
  model
}

dprob.typeII <- function(model, dat) {
  dat <- as.matrix(dat)
  n <- nrow(dat)
  h <- model$sclrot
  tH <- matrix(c(h[1], h[2], -h[2], h[1]), 2, 2)
  dat <- dat %*% tH
  dat <- sweep(dat, 2, model$shift, "+")
  (h[1]^2 + h[2]^2) * dprob.mixture(model, dat)
}

rprob.typeII <- function(model, n = 1) {
  tmp <- rprob.mixture(model, n)
  tmp <- sweep(tmp, 2, model$shift, "-")
  h <- model$sclrot
  tHinv <- matrix(c(h[1], -h[2], h[2], h[1]), 2, 2) / (h[1]^2 + h[2]^2)
  tmp <- tmp %*% tHinv
  tmp
}

plot.typeII <- function(model, dat) {
  d <- length(model$shift)
  plot.em(model, dat)
}

# mixture class (general mixture class)

mixture <- function(models, prob = rep(1 / length(models), length(models)), 
  modelonly = F) {
  if(!is.null(class(models)) && class(models) != "list") 
	stop("the first argument must be list of models")
  tmp <- list(prob = prob, models = models, modelonly = modelonly)
  class(tmp) <- "mixture"
  tmp
}

estimate.mixture <- function(model, dat, weight = 1) {
  dat <- as.matrix(dat)
  n <- nrow(dat)
  k <- length(model$models)
  condpr <- condprob.mixture(model, dat, k) 
  if(!model$modelonly) {
#    model$prob <- apply(condpr, 2, sum) / n
    model$prob <- c(rep(1, n) %*% condpr) / n
  }
  for(i in 1:k) {
    model$models[[i]] <- 
      estimate(model$models[[i]], dat, weight * condpr[, i])
  }
  model
}

dprob.mixture <- function(model, dat) {
  dat <- as.matrix(dat)
  n <- nrow(dat)
  pr <- rep(0, n)
  k <- length(model$models)
  for(i in 1:k) {
    pr <- pr + dprob(model$models[[i]], dat) * model$prob[i]
  }
  pr
}

rprob.mixture <- function(model, n = 1) {
  k <- length(model$models)
  lab <- sample(1:k, n, replace = T, prob = model$prob)
  tmp <- NULL
  for(i in 1:k) {
    nk <- sum(lab == i)
    if(nk > 0) tmp <- rbind(tmp, rprob(model$models[[i]], nk))
  }
  tmp
}

condprob.mixture <- function(model, dat, k = length(model$models)) {
  dat <- as.matrix(dat)
  n <- nrow(dat)
  pr <- matrix(0, n, k)
  for(i in 1:k) {
    pr[, i] <- dprob(model$models[[i]], dat) * model$prob[i]
  }
#  sweep(pr, 1, apply(pr, 1, sum), "/")
  sweep(pr, 1, c(pr %*% rep(1, k)), "/")
}

plot.mixture <- function(model, dat, d = ncol(dat), noplotpoints = F) {
  n <- nrow(dat)
  k <- length(model$models)
  lab <- clust.mixture(model, dat, n, k)
  if(d == 1) {
    dat <- cbind(dat, rep(1, n))
  } else if(d > 2) { 
    dat <- prcomp(dat)$x[, 1:2]
  }
  xlim <- c(min(dat[, 1]), max(dat[, 1]))
  ylim <- c(min(dat[, 2]), max(dat[, 2]))
  if(noplotpoints && d <= 2) {
    plot(xlim, ylim, type="n")
    for(i in 1:k) {
      plot.em(model$models[[i]], dat, noplotpoints = F, add = F, col=i)
    }
    return()
  }
  plot(xlim, ylim, xlim = xlim, ylim = ylim, type = "n", 
    xlab = "x1", ylab = "x2")
  for(i in 1:k) {
    idx <- lab == i
    points(dat[idx, 1], dat[idx, 2], pch=i)
  }
}

clust.mixture <- function(model, dat, k=length(model$models)) {
  condpr <- condprob.mixture(model, dat, k)
  argmax.mat(condpr, 1)
}

# gauss class (general gaussian)

gauss <- function(mu = 0, var = diag(rep(1,length(mu))), zeromean = F) {
  d <- length(mu)
  if(zeromean) mu <- rep(0, d)
  if(!is.matrix(var)) var <- as.matrix(var)
  r <- eigen(var, T)
  norm <- sqrt((2 * pi)^d * prod(r$values))
  tmp <- list(mu = mu, var = var, norm = norm, zeromean = zeromean)
  class(tmp) <- "gauss"
  tmp
}

estimate.gauss <- function(model, dat, weight = 1) {
  dat <- as.matrix(dat)
  n <- nrow(dat)
  wt <- rep(weight, length = n)
  tmp <- cov.wt(dat, wt, center = !model$zeromean)
  gauss(tmp$center, tmp$cov)
}

dprob.gauss <- function(model, dat) {
  dat <- as.matrix(dat)
  n <- nrow(dat)
  exp(-mahalanobis(dat, model$mu, model$var) / 2) / model$norm
}

rprob.gauss <- function(model, n = 1) {
  d <- length(model$mu)
  tmp <- matrix(rnorm(d * n), ncol = d)
  tmp <- tmp %*% chol(model$var)
  if(!model$zeromean) sweep(tmp, 2, model$mu, "+") else tmp
## the following is a routine which appeares in S-PLUS manual (modified)
#  d <- length(model$mu)
#  vs <- svd(model$var)
#  vsqrt <- t(vs$v %*% (t(vs$u) * sqrt(vs$d)))
#  tmp <- matrix(rnorm(d * n), nrow = n) %*% vsqrt
#  sweep(tmp, 2, model$mu, "+")
}

# gaussdiag class (elliptic gaussian)

gaussdiag <- function(mu = 0, sd = rep(1,length(mu))) {
  tmp <- list(mu = mu, sd = rep(c(sd),length=length(mu)))
  class(tmp) <- "gaussdiag"
  tmp
}

estimate.gaussdiag <- function(model, dat, weight = 1) {
  dat <- as.matrix(dat)
  n <- nrow(dat)
  wt <- rep(weight, length = n)
  wt <- wt / sum(wt)
  dat.wt <- sweep(dat, 1, wt, "*")
  dat.sqwt <- sweep(dat, 1, sqrt(wt), "*")
#  model$mu <- apply(dat.wt, 2, sum)
  model$mu <- c(rep(1, n) %*% dat.wt)
#  model$sd <- sqrt(apply(dat.sqwt^2, 2, sum) - mu^2)
  model$sd <- sqrt(c(rep(1, n) %*% dat.sqwt^2) - mu^2)
  model
}

dprob.gaussdiag <- function(model, dat) {
  dat <- as.matrix(dat)
  n <- nrow(dat)
  d <- length(model$mu)
  tmp <- rep(1, n)
  for(i in 1:d) {
    tmp <- tmp * dnorm(dat[, i], model$mu[i], model$sd[i])
  }
  tmp
}

rprob.gaussdiag <- function(model, n = 1) {
  d <- length(model$mu)
  tmp <- NULL
  for(i in 1:d) {
    tmp <- cbind(tmp, rnorm(n, model$mu[i], model$sd[i]))
  }
  tmp
}

# gaussconst class (circular gaussian)

gaussconst <- function(mu = 0, sd = rep(1,length(mu)), muonly = F) {
  tmp <- list(mu = mu, sd = sd[1], muonly = muonly)
  class(tmp) <- "gaussconst"
  tmp
}

estimate.gaussconst <- function(model, dat, weight = 1) {
  dat <- as.matrix(dat)
  n <- nrow(dat)
  wt <- rep(weight, length = n)
  wt <- wt / sum(wt)
  dat.wt <- sweep(dat, 1, wt, "*")
#  model$mu <- apply(dat.wt, 2, sum)
  model$mu <- c(rep(1, n) %*% dat.wt)
  if(!model$muonly) {
    dat.sqwt <- sweep(dat, 1, sqrt(wt), "*")
#    model$sd <- sqrt(mean(apply(dat.sqwt^2, 2, sum) - model$mu^2))
    model$sd <- sqrt(mean(c(rep(1, n) %*% dat.sqwt^2) - model$mu^2))
  }
  model
}

dprob.gaussconst <- function(model, dat) {
  dat <- as.matrix(dat)
  n <- nrow(dat)
  d <- length(model$mu)
  dat <- sweep(dat, 2, model$mu, "-")
  tmp <- matrix(log(dnorm(dat, 0, model$sd)), ncol = d)
  matrix(exp(tmp %*% rep(1, d)), ncol = d)
}

rprob.gaussconst <- function(model, n = 1) {
  d <- length(model$mu)
  tmp <- matrix(rnorm(n * d), ncol = d)
  sweep(tmp, 2, model$mu, "+")
}

# uniform class (uniform distribution)

uniform <- function(datmin = 0, datmax = 1) {
  dmin <- min(datmin, datmax)
  dmax <- max(datmin, datmax)
  tmp <- list(val = 1 / prod(dmax - dmin),
    min = dmin, max = dmax)
  class(tmp) <- "uniform"
  tmp
}

estimate.uniform <- function(model, dat, k) {
  dat <- as.matrix(dat)
  n <- nrow(dat)
  model
}

dprob.uniform <- function(model, dat) {
  dat <- as.matrix(dat)
  n <- nrow(dat)
  rep(model$val, n)
}

rprob.uniform <- function(model, n = 1) {
  d <- length(model$min)
  matrix(runif(n * d, model$min, model$max), ncol = d, byrow = T)
}

# General functions

# likelihood for the distribution
like.em <- function(model, dat) {
  tmp <- dprob(model, dat)
  mean(log(tmp))
}

# argmax function
argmax <- function(x) {
  ((1:length(x))[x == max(x)])[1]
}

argmax.mat <- function(mat, margin) {
  if(margin == 2) mat <- t(mat)
  tmp <- rep(1, nrow(mat))
  mx <- mat[, 1]
  if(ncol(mat) == 1) return(tmp) 
  for(i in 2:ncol(mat)) {                      
    tmp2 <- mx < mat[, i]
    mx[tmp2] <- mat[tmp2, i]
    tmp[tmp2] <- i
  }
  tmp
}

plot.em <- function(model, dat, num = 100, 
    noplotpoints = F, add = F, col = 1) {
  dat <- as.matrix(dat)
  d <- ncol(dat)
  if(d == 1) {
    x <- dat[, 1]
    xmin <- min(x)
    xmax <- max(x)
    xseq <- seq(xmin, xmax, length = num)
    dst2 <- dprob(model, xseq)
    ymax <- max(dst2)
    if(!noplotpoints) {
      dst1 <- density(x)
      ymax <- max(max(dst1$y), max(dst2))
    }
    if(!add)
      plot(c(xmin, xmax), c(0, ymax), type="n", xlab="x", ylab="density")
    lines(xseq, dst2, col = col)
    if(!noplotpoints) {
      lines(dst1, lty = 2, col = col)
    }
  } else if(d == 2) {
    x <- dat[, 1]
    y <- dat[, 2]
    xmin <- min(x)
    xmax <- max(x)
    ymin <- min(y)
    ymax <- max(y)
    if(!add) 
      plot(c(xmin, xmax),c(ymin, ymax), type="n", xlab="x", ylab="y")
    if(!noplotpoints) points(x, y, pch=".", col = col)
    dx <- seq(xmin, xmax, length=num)
    dy <- seq(ymin, ymax, length=num)
    dz <- matrix(0, num, num)
    dxy <- matrix(dx, num, 2)
    for(i in 1:num) {
        dxy[, 2] <- dy[i]
        dz[, i] <- dprob(model, dxy)
    }
    contour(dx, dy, dz, add=T, nint=5, lty=1, col=col)
  } else if(class(model) == "mixture") {
    plot.mixture(model, dat, d)
  } else {
    stop("sorry unsupported")
  }
}