# # em.S # # main routine # # Splus code of the EM algorithm for mixture models # copyright by Shotaro Akaho # AIST Neuroscience Research Instutute, Japan # Notice: # 1. Only noncommercial use is allowed # 2. Attach this header in the case of redistribution # 3. If you find any bugs, it is strongly recommended to inform them to me. estimate <- function(model, ...) { UseMethod("estimate") } dprob <- function(model, ...) { UseMethod("dprob") } rprob <- function(model, ...) { UseMethod("rprob") } ### template for an example class "temp" ## constructor : # temp <- function(...) { # tmp <- ... # class(tmp) <- "temp" # tmp # } ## estimator # estimate.temp <- function(model, dat, weight = 1) { # } ## density function # dprob.temp <- function(model, dat) { # } ## random number generator # rprob.temp <- function(model, n = 1) { # } ## ## print form (optional) # print.temp <- function(model, fullform=F) { # } ### ### generic functions ## likelihood # like.em <- function(model, dat) ## plot form # plot.em <- function(model, dat) # typeI class (any complex shape kernel with a scale and a shift parameters) typeI <- function(model, scale = rep(1, length(model\$models[[1]]\$mu)), shift = rep(0, length(scale))) { if(class(model) != "mixture") { stop("The first argument must be mixture class") } k <- length(model\$models) for(i in 1:k) { if(class(model\$models[[i]]) != "gaussdiag") stop("typeI : model must be mixture of gaussdiag class") } model\$scale <- scale model\$shift <- shift unclass(model) class(model) <- "typeI" model } estimate.typeI <- function(model, dat, weight = 1) { dat <- as.matrix(dat) n <- nrow(dat) wt <- rep(weight, length = n) wt <- wt / sum(wt) k <- length(model\$models) d <- length(model\$scale) rep1k <- rep(1, k) dat2 <- sweep(dat, 2, model\$scale, "*") dat2 <- sweep(dat2, 2, model\$shift, "+") pr <- prod(model\$scale) * condprob.mixture(model, dat2, k) vars <- mus <- matrix(0, k, d) for(j in 1:k) { mus[j, ] <- model\$models[[j]]\$mu vars[j, ] <- (model\$models[[j]]\$sd)^2 } valZ <- sum(wt * apply(pr, 1, sum)) # valZ <- sum(wt * c(pr %*% rep1k)) for(i in 1:d) { pr2 <- sweep(pr, 2, vars[, i], "/") # wt2 <- apply(pr2, 1, sum) wt2 <- c(pr2 %*% rep1k) mupr2 <- sweep(pr2, 2, mus[, i], "*") # muwt2 <- apply(mupr2, 1, sum) muwt2 <- c(mupr2 %*% rep1k) # shift parameter valU <- sum(wt * (muwt2 - model\$scale[i] * dat[, i] * wt2)) valV <- sum(wt * wt2) model\$shift[i] <- valU / valV # scale parameter valX <- sum(wt * dat[, i]^2 * wt2) valY <- sum(wt * dat[, i] * (model\$shift[i] * wt2 - muwt2)) model\$scale[i] <- (sqrt(valY^2 + 4 * valX * valZ) - valY) / valX / 2 } model } dprob.typeI <- function(model, dat) { dat <- as.matrix(dat) n <- nrow(dat) dat <- sweep(dat, 2, model\$scale, "*") dat <- sweep(dat, 2, model\$shift, "+") prod(model\$scale) * dprob.mixture(model, dat) } rprob.typeI <- function(model, n = 1) { tmp <- rprob.mixture(model, n) tmp <- sweep(tmp, 2, model\$shift, "-") tmp <- sweep(tmp, 2, model\$scale, "/") tmp } # typeII class (any shape of kernels with a shift, scale and rotation # parameter, but only 2-dim) typeII <- function(model, sclrot = c(1, 0), shift = rep(0, 2)) { if(class(model) != "mixture") { stop("The first argument must be mixture class") } k <- length(model\$models) for(i in 1:k) { if(class(model\$models[[i]]) != "gaussconst") stop("typeII : model must be mixture of gaussconst class") } model\$sclrot <- sclrot[1:2] model\$shift <- shift[1:2] unclass(model) class(model) <- "typeII" model } estimate.typeII <- function(model, dat, weight = 1) { dat <- as.matrix(dat) n <- nrow(dat) wt <- rep(weight, length = n) wt <- wt / sum(wt) k <- length(model\$models) rep1k <- rep(1, k) h <- model\$sclrot tH <- matrix(c(h[1], h[2], -h[2], h[1]), 2, 2) dat2 <- dat %*% tH dat2 <- sweep(dat2, 2, model\$shift, "+") pr <- (h[1]^2 + h[2]^2) * condprob.mixture(model, dat2, k) var <- rep(0, k) mus <- matrix(0, k, 2) for(j in 1:k) { mus[j, ] <- model\$models[[j]]\$mu var[j] <- (model\$models[[j]]\$sd)^2 } # valZ <- sum(wt * apply(pr, 1, sum)) valZ <- sum(wt * c(pr %*% rep1k)) pr2 <- sweep(pr, 2, var, "/") # wt2 <- apply(pr2, 1, sum) wt2 <- c(pr2 %*% rep1k) mu1pr2 <- sweep(pr2, 2, mus[, 1], "*") # mu1wt2 <- apply(mu1pr2, 1, sum) mu1wt2 <- c(mu1pr2 %*% rep1k) mu2pr2 <- sweep(pr2, 2, mus[, 2], "*") # mu2wt2 <- apply(mu2pr2, 1, sum) mu2wt2 <- c(mu2pr2 %*% rep1k) # shift parameter valU1 <- sum(wt * (mu1wt2 - (h[1] * dat[, 1] + h[2] * dat[, 2]) * wt2)) valU2 <- sum(wt * (mu2wt2 - (h[1] * dat[, 2] - h[2] * dat[, 1]) * wt2)) valV <- sum(wt * wt2) model\$shift[1] <- valU1 / valV model\$shift[2] <- valU2 / valV # scale parameter valX <- sum(wt * (dat[, 1]^2 + dat[, 2]^2) * wt2) tmp1 <- mu1wt2 - model\$shift[1] * wt2 tmp2 <- mu2wt2 - model\$shift[2] * wt2 valY1 <- sum(wt * (tmp1 * dat[, 1] + tmp2 * dat[, 2])) valY2 <- sum(wt * (tmp1 * dat[, 2] - tmp2 * dat[, 1])) tmp <- (1 + sqrt(1 + 8 * valZ * valX / (valY1^2 + valY2^2))) / valX / 2 model\$sclrot[1] <- valY1 * tmp model\$sclrot[2] <- valY2 * tmp model } dprob.typeII <- function(model, dat) { dat <- as.matrix(dat) n <- nrow(dat) h <- model\$sclrot tH <- matrix(c(h[1], h[2], -h[2], h[1]), 2, 2) dat <- dat %*% tH dat <- sweep(dat, 2, model\$shift, "+") (h[1]^2 + h[2]^2) * dprob.mixture(model, dat) } rprob.typeII <- function(model, n = 1) { tmp <- rprob.mixture(model, n) tmp <- sweep(tmp, 2, model\$shift, "-") h <- model\$sclrot tHinv <- matrix(c(h[1], -h[2], h[2], h[1]), 2, 2) / (h[1]^2 + h[2]^2) tmp <- tmp %*% tHinv tmp } plot.typeII <- function(model, dat) { d <- length(model\$shift) plot.em(model, dat) } # mixture class (general mixture class) mixture <- function(models, prob = rep(1 / length(models), length(models)), modelonly = F) { if(!is.null(class(models)) && class(models) != "list") stop("the first argument must be list of models") tmp <- list(prob = prob, models = models, modelonly = modelonly) class(tmp) <- "mixture" tmp } estimate.mixture <- function(model, dat, weight = 1) { dat <- as.matrix(dat) n <- nrow(dat) k <- length(model\$models) condpr <- condprob.mixture(model, dat, k) if(!model\$modelonly) { # model\$prob <- apply(condpr, 2, sum) / n model\$prob <- c(rep(1, n) %*% condpr) / n } for(i in 1:k) { model\$models[[i]] <- estimate(model\$models[[i]], dat, weight * condpr[, i]) } model } dprob.mixture <- function(model, dat) { dat <- as.matrix(dat) n <- nrow(dat) pr <- rep(0, n) k <- length(model\$models) for(i in 1:k) { pr <- pr + dprob(model\$models[[i]], dat) * model\$prob[i] } pr } rprob.mixture <- function(model, n = 1) { k <- length(model\$models) lab <- sample(1:k, n, replace = T, prob = model\$prob) tmp <- NULL for(i in 1:k) { nk <- sum(lab == i) if(nk > 0) tmp <- rbind(tmp, rprob(model\$models[[i]], nk)) } tmp } condprob.mixture <- function(model, dat, k = length(model\$models)) { dat <- as.matrix(dat) n <- nrow(dat) pr <- matrix(0, n, k) for(i in 1:k) { pr[, i] <- dprob(model\$models[[i]], dat) * model\$prob[i] } # sweep(pr, 1, apply(pr, 1, sum), "/") sweep(pr, 1, c(pr %*% rep(1, k)), "/") } plot.mixture <- function(model, dat, d = ncol(dat), noplotpoints = F) { n <- nrow(dat) k <- length(model\$models) lab <- clust.mixture(model, dat, n, k) if(d == 1) { dat <- cbind(dat, rep(1, n)) } else if(d > 2) { dat <- prcomp(dat)\$x[, 1:2] } xlim <- c(min(dat[, 1]), max(dat[, 1])) ylim <- c(min(dat[, 2]), max(dat[, 2])) if(noplotpoints && d <= 2) { plot(xlim, ylim, type="n") for(i in 1:k) { plot.em(model\$models[[i]], dat, noplotpoints = F, add = F, col=i) } return() } plot(xlim, ylim, xlim = xlim, ylim = ylim, type = "n", xlab = "x1", ylab = "x2") for(i in 1:k) { idx <- lab == i points(dat[idx, 1], dat[idx, 2], pch=i) } } clust.mixture <- function(model, dat, k=length(model\$models)) { condpr <- condprob.mixture(model, dat, k) argmax.mat(condpr, 1) } # gauss class (general gaussian) gauss <- function(mu = 0, var = diag(rep(1,length(mu))), zeromean = F) { d <- length(mu) if(zeromean) mu <- rep(0, d) if(!is.matrix(var)) var <- as.matrix(var) r <- eigen(var, T) norm <- sqrt((2 * pi)^d * prod(r\$values)) tmp <- list(mu = mu, var = var, norm = norm, zeromean = zeromean) class(tmp) <- "gauss" tmp } estimate.gauss <- function(model, dat, weight = 1) { dat <- as.matrix(dat) n <- nrow(dat) wt <- rep(weight, length = n) tmp <- cov.wt(dat, wt, center = !model\$zeromean) gauss(tmp\$center, tmp\$cov) } dprob.gauss <- function(model, dat) { dat <- as.matrix(dat) n <- nrow(dat) exp(-mahalanobis(dat, model\$mu, model\$var) / 2) / model\$norm } rprob.gauss <- function(model, n = 1) { d <- length(model\$mu) tmp <- matrix(rnorm(d * n), ncol = d) tmp <- tmp %*% chol(model\$var) if(!model\$zeromean) sweep(tmp, 2, model\$mu, "+") else tmp ## the following is a routine which appeares in S-PLUS manual (modified) # d <- length(model\$mu) # vs <- svd(model\$var) # vsqrt <- t(vs\$v %*% (t(vs\$u) * sqrt(vs\$d))) # tmp <- matrix(rnorm(d * n), nrow = n) %*% vsqrt # sweep(tmp, 2, model\$mu, "+") } # gaussdiag class (elliptic gaussian) gaussdiag <- function(mu = 0, sd = rep(1,length(mu))) { tmp <- list(mu = mu, sd = rep(c(sd),length=length(mu))) class(tmp) <- "gaussdiag" tmp } estimate.gaussdiag <- function(model, dat, weight = 1) { dat <- as.matrix(dat) n <- nrow(dat) wt <- rep(weight, length = n) wt <- wt / sum(wt) dat.wt <- sweep(dat, 1, wt, "*") dat.sqwt <- sweep(dat, 1, sqrt(wt), "*") # model\$mu <- apply(dat.wt, 2, sum) model\$mu <- c(rep(1, n) %*% dat.wt) # model\$sd <- sqrt(apply(dat.sqwt^2, 2, sum) - mu^2) model\$sd <- sqrt(c(rep(1, n) %*% dat.sqwt^2) - mu^2) model } dprob.gaussdiag <- function(model, dat) { dat <- as.matrix(dat) n <- nrow(dat) d <- length(model\$mu) tmp <- rep(1, n) for(i in 1:d) { tmp <- tmp * dnorm(dat[, i], model\$mu[i], model\$sd[i]) } tmp } rprob.gaussdiag <- function(model, n = 1) { d <- length(model\$mu) tmp <- NULL for(i in 1:d) { tmp <- cbind(tmp, rnorm(n, model\$mu[i], model\$sd[i])) } tmp } # gaussconst class (circular gaussian) gaussconst <- function(mu = 0, sd = rep(1,length(mu)), muonly = F) { tmp <- list(mu = mu, sd = sd[1], muonly = muonly) class(tmp) <- "gaussconst" tmp } estimate.gaussconst <- function(model, dat, weight = 1) { dat <- as.matrix(dat) n <- nrow(dat) wt <- rep(weight, length = n) wt <- wt / sum(wt) dat.wt <- sweep(dat, 1, wt, "*") # model\$mu <- apply(dat.wt, 2, sum) model\$mu <- c(rep(1, n) %*% dat.wt) if(!model\$muonly) { dat.sqwt <- sweep(dat, 1, sqrt(wt), "*") # model\$sd <- sqrt(mean(apply(dat.sqwt^2, 2, sum) - model\$mu^2)) model\$sd <- sqrt(mean(c(rep(1, n) %*% dat.sqwt^2) - model\$mu^2)) } model } dprob.gaussconst <- function(model, dat) { dat <- as.matrix(dat) n <- nrow(dat) d <- length(model\$mu) dat <- sweep(dat, 2, model\$mu, "-") tmp <- matrix(log(dnorm(dat, 0, model\$sd)), ncol = d) matrix(exp(tmp %*% rep(1, d)), ncol = d) } rprob.gaussconst <- function(model, n = 1) { d <- length(model\$mu) tmp <- matrix(rnorm(n * d), ncol = d) sweep(tmp, 2, model\$mu, "+") } # uniform class (uniform distribution) uniform <- function(datmin = 0, datmax = 1) { dmin <- min(datmin, datmax) dmax <- max(datmin, datmax) tmp <- list(val = 1 / prod(dmax - dmin), min = dmin, max = dmax) class(tmp) <- "uniform" tmp } estimate.uniform <- function(model, dat, k) { dat <- as.matrix(dat) n <- nrow(dat) model } dprob.uniform <- function(model, dat) { dat <- as.matrix(dat) n <- nrow(dat) rep(model\$val, n) } rprob.uniform <- function(model, n = 1) { d <- length(model\$min) matrix(runif(n * d, model\$min, model\$max), ncol = d, byrow = T) } # General functions # likelihood for the distribution like.em <- function(model, dat) { tmp <- dprob(model, dat) mean(log(tmp)) } # argmax function argmax <- function(x) { ((1:length(x))[x == max(x)])[1] } argmax.mat <- function(mat, margin) { if(margin == 2) mat <- t(mat) tmp <- rep(1, nrow(mat)) mx <- mat[, 1] if(ncol(mat) == 1) return(tmp) for(i in 2:ncol(mat)) { tmp2 <- mx < mat[, i] mx[tmp2] <- mat[tmp2, i] tmp[tmp2] <- i } tmp } plot.em <- function(model, dat, num = 100, noplotpoints = F, add = F, col = 1) { dat <- as.matrix(dat) d <- ncol(dat) if(d == 1) { x <- dat[, 1] xmin <- min(x) xmax <- max(x) xseq <- seq(xmin, xmax, length = num) dst2 <- dprob(model, xseq) ymax <- max(dst2) if(!noplotpoints) { dst1 <- density(x) ymax <- max(max(dst1\$y), max(dst2)) } if(!add) plot(c(xmin, xmax), c(0, ymax), type="n", xlab="x", ylab="density") lines(xseq, dst2, col = col) if(!noplotpoints) { lines(dst1, lty = 2, col = col) } } else if(d == 2) { x <- dat[, 1] y <- dat[, 2] xmin <- min(x) xmax <- max(x) ymin <- min(y) ymax <- max(y) if(!add) plot(c(xmin, xmax),c(ymin, ymax), type="n", xlab="x", ylab="y") if(!noplotpoints) points(x, y, pch=".", col = col) dx <- seq(xmin, xmax, length=num) dy <- seq(ymin, ymax, length=num) dz <- matrix(0, num, num) dxy <- matrix(dx, num, 2) for(i in 1:num) { dxy[, 2] <- dy[i] dz[, i] <- dprob(model, dxy) } contour(dx, dy, dz, add=T, nint=5, lty=1, col=col) } else if(class(model) == "mixture") { plot.mixture(model, dat, d) } else { stop("sorry unsupported") } }