# R code for fitting a Finite Mixture Model to survival data

The following R code fits a Finite Mixture Model to survival (or reliability) data. The survival/reliability data may contain (right / interval) censored observations. However, it is also possible to fit the Finite Mixture Model to complete (uncensored) survival/reliability data.

The Finite Mixture Model is fitted by using the Expectation-Maximization (EM) algorithm. The fitted model assumes that the lifetime observations follow either a Weibull, lognormal, or Gaussian distribution.

See this page for an overview of all of Stefan’s R code blog posts.

R code

```library(SPREDA)
library(LearnBayes)

##RIGHT CENSORED DATA

#function for fitting a finite mixture model to RIGHT censored or complete data
mixcensored <- function (y, d=rep(1, length(y)), wt=rep(1, length(y)),
xformula=~1, xdata=NULL,
dist="weibull", n, cluster=NULL, classify="EM",
maxiter=100, tol=1e-6) {
#d is the censoring indicator (1=failure; 0=censored)
#wt are the weights for the observations
#xformula specifies the right hand side of the regression model to be fitted
#xdata is a data frame containing the covariates in the xformula
#dist: either the "weibull", "lognormal", or "gaussian" distribution
#n is the number of components
# a matrix with n columns of initial posterior probabilities for the observations
#classify: "EM", "CEM", or "SEM" strategy
#maxiter is the maximum number of iterations
#tol is the convergence criterion

nobs <- sum(wt) #number of observations

if (xformula==~1) {
termFormula <- 1
parms <- matrix(NA, ncol=3, nrow=n)
colnames(parms) <- c("logLik", "mu", "sigma")
stdErr <- matrix(NA, ncol=2, nrow=n)
colnames(stdErr) <- c("mu", "log(sigma)")
dm <- data.frame(y,d,1)}
else {
termsXformula <- terms(xformula)
nterm <- ncol(model.matrix(xformula, xdata))
covNames <- colnames(model.matrix(xformula, xdata))
termNames <- attr(termsXformula, "term.labels")
termFormula <- paste(termNames, collapse="+")
parms <- matrix(NA, ncol=nterm+2, nrow=n)
colnames(parms) <- c("logLik", covNames, "sigma")
stdErr <- matrix(NA, ncol=nterm+1, nrow=n)
colnames(stdErr) <- c(covNames, "log(sigma)")
dm <- data.frame(y,d,xdata)}

mus <- matrix(NA, ncol=n, nrow=length(y))
posteriorP <- matrix(NA, ncol=n, nrow=length(y))
P <- matrix(NA, ncol=n, nrow=length(y))
posSigma <- ncol(parms)

iteration <- 0 #initialize iteration counter
loglikInit <- logLikC <- 0 #initialize log-likelihood estimates

#initialize posterior probabilities
if (is.null(cluster)) {
alpha <- rep(.1, n)
posteriorP <- rdirichlet(length(y), alpha)}
else {posteriorP <- cluster}

while (iteration < maxiter) {
#estimate prior probabilities
priorP <- apply(wt*posteriorP, 2, sum)/nobs

#estimate distribution parameters for each component
if (classify=="EM"){
for (i in 1:n) {
wtp <- ifelse(wt*posteriorP[,i]<=0, 1e-15, wt*posteriorP[,i])
mixform <- formula(paste("Surv(y,d)~", termFormula, sep=""))
cp <- survreg(mixform, weights=wtp, dist=dist, data=dm)
parms[i,] <- c(logLik(cp)[1], coef(cp), cp\$scale)
stdErr[i,] <- sqrt(diag(vcov(cp)))
mus[,i] <- predict(cp, type="lp")}
}

else if (classify=="CEM"){
compPost <- apply(posteriorP, 1, which.max)
for (i in 1:n) {
mixform <- formula(paste("Surv(y,d)~", termFormula, sep=""))
cp <- survreg(mixform, weights=wt, dist=dist, data=dm, subset=(compPost==i))
parms[i,] <- c(logLik(cp)[1], coef(cp), cp\$scale)
stdErr[i,] <- sqrt(diag(vcov(cp)))
if (xformula==~1) {
mus[,i] <- predict(cp, newdata=data.frame(rep(1,length(y))), type="lp")}
else mus[,i] <- predict(cp, newdata=xdata, type="lp")}
}

else if (classify=="SEM"){
compPost <- apply(posteriorP, 1, function (p) sample(x=1:n, size=1, prob=p))
for (i in 1:n) {
mixform <- formula(paste("Surv(y,d)~", termFormula, sep=""))
cp <- survreg(mixform, weights=wt, dist=dist, data=dm, subset=(compPost==i))
parms[i,] <- c(logLik(cp)[1], coef(cp), cp\$scale)
stdErr[i,] <- sqrt(diag(vcov(cp)))
if (xformula==~1) {
mus[,i] <- predict(cp, newdata=data.frame(rep(1,length(y))), type="lp")}
else mus[,i] <- predict(cp, newdata=xdata, type="lp")}
}

#compute the (complete) log-likelihood value
logLikC <- sum(parms[,1]) + sum(wt*(posteriorP%*%log(priorP)))
logLikC <- ifelse(is.na(logLikC), 0, logLikC)

#estimate posterior probabilities
if (dist=="weibull") {
for (j in 1:n) {
mu <- mus[,j]; sigma <- parms[j, posSigma]
z <- (log(y)-mu)/sigma
P[,j] <- priorP[j]*((1/(sigma*y)*dsev(z))^d)*((1-psev(z))^(1-d))}
}

else if (dist=="lognormal") {
for (j in 1:n) {
mu <- mus[,j]; sigma <- parms[j, posSigma]
z <- (log(y)-mu)/sigma
P[,j] <- priorP[j]*((1/(sigma*y)*dnorm(z))^d)*((1-pnorm(z))^(1-d))}
}

else if (dist=="gaussian") {
for (j in 1:n) {
mu <- mus[,j]; sigma <- parms[j, posSigma]
z <- (y-mu)/sigma
P[,j] <- priorP[j]*((1/(sigma)*dnorm(z))^d)*((1-pnorm(z))^(1-d))}
}

posteriorP <- P/rowSums(P)

#check convergence criterion
if (( abs(logLikC-loglikInit) / (abs(loglikInit)+.001*tol) ) < tol) break
loglikInit <- logLikC #update log-likelihood
iteration <- iteration + 1 #increment counter
}

if (iteration == maxiter) warning("Maximum number of iterations exceeded")
list(components=parms[,2:ncol(parms)], prior=priorP, loglik=logLikC,
AIC=-2*logLikC + 2*(n-1+n*(ncol(parms)-1)),
BIC=-2*logLikC + (n-1+n*(ncol(parms)-1))*log(nobs),
strategy=classify, distribution=dist, iterations=iteration,
standardError=stdErr, posterior=posteriorP)
}

##mixture of WEIBULL distributions

#generate mixture of Weibull distributions
N <- 400 #number of observations
probs <- c(.3, .7) #mixing proportions
n <- length(probs) #number of components
component <- sample(1:n, prob=probs, size=N, replace=TRUE)

#covariates
x1 <- rnorm(N, 10, 2)
x2 <- rnorm(N, 5, 1)

dataT <- rep(NA, N)

for(i in 1:N){
if(component[i]==1){
dataT[i] <- exp( (2 + .2*x1[i] - 1.5*x2[i]) + rsev(1)*.7)
}else if(component[i]==2){
dataT[i] <- exp( (1 + .5*x1[i] - 0.5*x2[i]) + rsev(1)*.35)}
}

plot(density(dataT)) #plot mixture of distributions

#apply censoring (here: Type I censoring)
#failure=1, censored=0
Cens <- ifelse(dataT>100, 0, 1)
N-sum(Cens) #number of censored observations
dataT <- ifelse(Cens==0, 100, dataT)

#fit parametric survival model to components
(comp1 <- survreg(Surv(dataT, Cens)~x1+x2, subset=(component==1), dist="weibull"))
(comp2 <- survreg(Surv(dataT, Cens)~x1+x2, subset=(component==2), dist="weibull"))

#fit Finite Mixture Model to the data

#run mixcensored repeatedly (applying the EM strategy)
#that is, fit repeatedly a mixture distribution to the data,
#and start each fit with a different random initialization of
#the posterior probabilities
mixmult <- replicate(5, mixcensored(y=dataT, d=Cens,
xformula=~x1+x2,
xdata=data.frame(x1=x1, x2=x2),
dist="weibull", n=2, classify="EM"))
#discard models that did not converge
mixmult[3,] <- ifelse(mixmult[3,]==0, NA, mixmult[3,])
length(which(is.na(mixmult[3,])))
#select the solution with the maximum value of the log-likelihood
mixEM <- mixmult[, which.max(mixmult[3,])]
mixEM[1:6]

#use the standard errors for computing normal-approximation confidence intervals
mixEM[9]
#95% confidence interval for the intercept of component 1
mixEM\$components[1,1] + c(1,-1)*qnorm(.05/2)*mixEM\$standardError[1,1]
#95% confidence interval for sigma of component 2
exp(log(mixEM\$components[2,4]) + c(1,-1)*qnorm(.05/2)*mixEM\$standardError[2,4])

#run mixcensored repeatedly (applying the CEM strategy)
mixmult <- replicate(5, mixcensored(y=dataT, d=Cens,
xformula=~x1+x2,
xdata=data.frame(x1=x1, x2=x2),
dist="weibull", n=2, classify="CEM"))
#discard models that did not converge
mixmult[3,] <- ifelse(mixmult[3,]==0, NA, mixmult[3,])
length(which(is.na(mixmult[3,])))
#select the solution with the maximum value of the log-likelihood
mixCEM <- mixmult[, which.max(mixmult[3,])]
mixCEM[1:6]

#compare EM with CEM
mixEM[1:6]
mixCEM[1:6]

#compare with the models fitted to the original components
comp1
comp2

##Finite Mixture model with INTERACTION TERM included

#generate data with interaction term included
for(i in 1:N){
if(component[i]==1){
dataT[i] <- exp( (2 + .2*x1[i] - 1.5*x2[i] + .005*x1[i]*x2[i]) + rsev(1)*.7)
}else if(component[i]==2){
dataT[i] <- exp( (1 + .5*x1[i] - 0.5*x2[i] + .001*x1[i]*x2[i]) + rsev(1)*.35)}
}

plot(density(dataT)) #plot mixture of distributions

#apply censoring (here: Type I censoring)
#failure=1, censored=0
Cens <- ifelse(dataT>100, 0, 1)
N-sum(Cens) #number of censored observations
dataT <- ifelse(Cens==0, 100, dataT)

#fit parametric survival model to components
(comp1 <- survreg(Surv(dataT, Cens)~x1+x2+x1:x2, subset=(component==1), dist="weibull"))
(comp2 <- survreg(Surv(dataT, Cens)~x1+x2+x1:x2, subset=(component==2), dist="weibull"))

#fit Finite Mixture Model to the data

#run mixcensored repeatedly (applying the CEM strategy)
mixmult <- replicate(5, mixcensored(y=dataT, d=Cens,
xformula=~x1+x2+x1:x2,
xdata=data.frame(x1=x1, x2=x2),
dist="weibull", n=2, classify="CEM"))
#discard models that did not converge
mixmult[3,] <- ifelse(mixmult[3,]==0, NA, mixmult[3,])
length(which(is.na(mixmult[3,])))
#select the solution with the maximum value of the log-likelihood
mixCEM <- mixmult[, which.max(mixmult[3,])]
mixCEM[1:6]

#compare with the models fitted to the original components
comp1
comp2

##INTERVAL CENSORED DATA

library(SPREDA)
library(LearnBayes)

#function for fitting a finite mixture model to interval censored data

#NOTE: this function employs the type="interval" coding scheme as used
#in the "Surv" function (from the survival package) for interval censored data

mixcensoredInt <- function (y1, y2, d=rep(1, length(y1)), wt=rep(1, length(y1)),
xformula=~1, xdata=NULL,
dist="weibull", n, cluster=NULL, classify="EM",
maxiter=100, tol=1e-6) {
#y1 is the right/left censored value, the exact lifetime observation, or for
# interval censoring the lower value of the censoring interval
#y2 is the upper value of the censoring interval
#d is the censoring indicator (0=right censored, 1=event at time,
# 2=left censored, 3=interval censored)
#wt are the weights for the observations
#xformula specifies the right hand side of the regression model to be fitted
#xdata is a data frame containing the covariates in the xformula
#dist: either the "weibull", "lognormal", or "gaussian" distribution
#n is the number of components
# a matrix with n columns of initial posterior probabilities for the observations
#classify: "EM", "CEM", or "SEM" strategy
#maxiter is the maximum number of iterations
#tol is the convergence criterion

nobs <- sum(wt) #number of observations

if (xformula==~1) {
termFormula <- 1
parms <- matrix(NA, ncol=3, nrow=n)
colnames(parms) <- c("logLik", "mu", "sigma")
stdErr <- matrix(NA, ncol=2, nrow=n)
colnames(stdErr) <- c("mu", "log(sigma)")
dm <- data.frame(y1,y2,d,1)}
else {
termsXformula <- terms(xformula)
nterm <- ncol(model.matrix(xformula, xdata))
covNames <- colnames(model.matrix(xformula, xdata))
termNames <- attr(termsXformula, "term.labels")
termFormula <- paste(termNames, collapse="+")
parms <- matrix(NA, ncol=nterm+2, nrow=n)
colnames(parms) <- c("logLik", covNames, "sigma")
stdErr <- matrix(NA, ncol=nterm+1, nrow=n)
colnames(stdErr) <- c(covNames, "log(sigma)")
dm <- data.frame(y1,y2,d,xdata)}

mus <- matrix(NA, ncol=n, nrow=length(y1))
posteriorP <- matrix(NA, ncol=n, nrow=length(y1))
P <- matrix(NA, ncol=n, nrow=length(y1))
posSigma <- ncol(parms)

iteration <- 0 #initialize iteration counter
loglikInit <- logLikC <- 0 #initialize log-likelihood estimates

#initialize posterior probabilities
if (is.null(cluster)) {
alpha <- rep(.1, n)
posteriorP <- rdirichlet(length(y1), alpha)}
else {posteriorP <- cluster}

while (iteration < maxiter) {
#estimate prior probabilities
priorP <- apply(wt*posteriorP, 2, sum)/nobs

#estimate distribution parameters for each component
if (classify=="EM"){
for (i in 1:n) {
wtp <- ifelse(wt*posteriorP[,i]<=0, 1e-15, wt*posteriorP[,i])
mixform <- formula(paste("Surv(y1,y2,d, type='interval')~",
termFormula, sep=""))
cp <- survreg(mixform, weights=wtp, dist=dist, data=dm)
parms[i,] <- c(logLik(cp)[1], coef(cp), cp\$scale)
stdErr[i,] <- sqrt(diag(vcov(cp)))
mus[,i] <- predict(cp, type="lp")}
}

else if (classify=="CEM"){
compPost <- apply(posteriorP, 1, which.max)
for (i in 1:n) {
mixform <- formula(paste("Surv(y1,y2,d, type='interval')~",
termFormula, sep=""))
cp <- survreg(mixform, weights=wt, dist=dist, data=dm, subset=(compPost==i))
parms[i,] <- c(logLik(cp)[1], coef(cp), cp\$scale)
stdErr[i,] <- sqrt(diag(vcov(cp)))
if (xformula==~1) {
mus[,i] <- predict(cp, newdata=data.frame(rep(1,length(y1))), type="lp")}
else mus[,i] <- predict(cp, newdata=xdata, type="lp")}
}

else if (classify=="SEM"){
compPost <- apply(posteriorP, 1, function (p) sample(x=1:n, size=1, prob=p))
for (i in 1:n) {
mixform <- formula(paste("Surv(y1,y2,d, type='interval')~",
termFormula, sep=""))
cp <- survreg(mixform, weights=wt, dist=dist, data=dm, subset=(compPost==i))
parms[i,] <- c(logLik(cp)[1], coef(cp), cp\$scale)
stdErr[i,] <- sqrt(diag(vcov(cp)))
if (xformula==~1) {
mus[,i] <- predict(cp, newdata=data.frame(rep(1,length(y1))), type="lp")}
else mus[,i] <- predict(cp, newdata=xdata, type="lp")}
}

#compute the (complete) log-likelihood value
logLikC <- sum(parms[,1]) + sum(wt*(posteriorP%*%log(priorP)))
logLikC <- ifelse(is.na(logLikC), 0, logLikC)

#estimate posterior probabilities
if (dist=="weibull") {
for (j in 1:n) {
mu <- mus[,j]; sigma <- parms[j, posSigma]

z1 <- (log(y1)-mu)/sigma
z2 <- (log(y2)-mu)/sigma

cp <- ifelse(d==0, 1-psev(z1), #right censoring
ifelse(d==1, 1/(sigma*y1) * dsev(z1), #exact observations
ifelse(d==2, psev(z1), #left censoring
psev(z2)-psev(z1) ))) #interval censoring
P[,j] <- priorP[j]*cp}
}

else if (dist=="lognormal") {
for (j in 1:n) {
mu <- mus[,j]; sigma <- parms[j, posSigma]

z1 <- (log(y1)-mu)/sigma
z2 <- (log(y2)-mu)/sigma

cp <- ifelse(d==0, 1-pnorm(z1), #right censoring
ifelse(d==1, 1/(sigma*y1) * dnorm(z1), #exact observations
ifelse(d==2, pnorm(z1), #left censoring
pnorm(z2)-pnorm(z1) )))
P[,j] <- priorP[j]*cp}
}

else if (dist=="gaussian") {
for (j in 1:n) {
mu <- mus[,j]; sigma <- parms[j, posSigma]

z1 <- (y1-mu)/sigma
z2 <- (y2-mu)/sigma

cp <- ifelse(d==0, 1-pnorm(z1), #right censoring
ifelse(d==1, 1/(sigma) * dnorm(z1), #exact observations
ifelse(d==2, pnorm(z1), #left censoring
pnorm(z2)-pnorm(z1) )))
P[,j] <- priorP[j]*cp}
}

posteriorP <- P/rowSums(P)

#check convergence criterion
if (( abs(logLikC-loglikInit) / (abs(loglikInit)+.001*tol) ) < tol) break
loglikInit <- logLikC #update log-likelihood
iteration <- iteration + 1 #increment counter
}

if (iteration == maxiter) warning("Maximum number of iterations exceeded")
list(components=parms[,2:ncol(parms)], prior=priorP, loglik=logLikC,
AIC=-2*logLikC + 2*(n-1+n*(ncol(parms)-1)),
BIC=-2*logLikC + (n-1+n*(ncol(parms)-1))*log(nobs),
strategy=classify, distribution=dist, iterations=iteration,
standardError=stdErr, posterior=posteriorP)
}

##mixture of LOGNORMAL distributions

#generate mixture of lognormal distributions
N <- 600 #number of observations
probs <- c(.3, .7) #mixing proportions
n <- length(probs) #number of components
component <- sample(1:n, prob=probs, size=N, replace=TRUE)

#covariates
x1 <- rnorm(N, 2, .5)
x2 <- rnorm(N, 1, .1)

dataT <- rep(NA, N)

for(i in 1:N){
if(component[i]==1){
dataT[i] <- exp( (6 + .3*x1[i] - .2*x2[i]) + rnorm(1)*.7)
}else if(component[i]==2){
dataT[i] <- exp( (8 + .1*x1[i] + .3*x2[i]) + rnorm(1)*.35)}
}

#generate intervals for censoring
intCens <- seq(0,5000,100)

#assign observations to censoring intervals
assignCens <- findInterval(dataT, intCens)

#observations with >5000 are right censored and get value NA for y2
intCensRC <- c(intCens, NA)

#lower bound of censoring interval
L <- sapply(assignCens, function (k) intCensRC[k])

#upper bound of censoring interval
U <- sapply(assignCens, function (k) intCensRC[k+1])

#indicator for type of censoring
Cens <- ifelse(is.na(U), 0, 3)

#censoring in the interval [0,100] is identical to left censoring at t=100
Cens <- ifelse(L==0, 2, Cens) #set censoring indicator to 2 (=left censoring)
#for these left censored observations, set L to 100, and U to NA
L <- ifelse(Cens==2, U, L)
U <- ifelse(Cens==2, NA, U)

#inspect data
head(cbind(L, U, dataT, Cens, component), n=15)

#censoring distribution
table(Cens)

#fit parametric survival model to components
(comp1 <- survreg(Surv(L,U,Cens, type="interval")~x1+x2,
subset=(component==1), dist="lognormal"))
(comp2 <- survreg(Surv(L,U,Cens, type="interval")~x1+x2,
subset=(component==2), dist="lognormal"))

#fit a Finite Mixture Model to the interval censored data
mixmult <- replicate(5, mixcensoredInt(y1=L, y2=U, d=Cens,
xformula=~x1+x2,
xdata=data.frame(x1=x1, x2=x2),
dist="lognormal", n=2,
classify="CEM", maxiter=200))
#log-likelihood values for the different solutions
mixmult[3,]
#discard models that did not converge
mixmult[3,] <- ifelse(mixmult[3,]==0, NA, mixmult[3,])
length(which(is.na(mixmult[3,])))
#select the solution with the maximum value of the log-likelihood
mixCEM <- mixmult[, which.max(mixmult[3,])]
mixCEM[1:6]

#compare with the models fitted to the original components
comp1
comp2

#some runs might generate an error
#this is a way of handing these errors
mixmult <- replicate(5, try(mixcensoredInt(y1=L, y2=U, d=Cens,
xformula=~x1+x2,
xdata=data.frame(x1=x1, x2=x2),
dist="lognormal", n=2,
classify="CEM", maxiter=200),
silent=TRUE),
simplify=FALSE)
#number of runs that generated an error
length(which(lapply(mixmult, length)==1))
#discard runs that generated an error
mixmult <- simplify2array(mixmult[which(lapply(mixmult, length)>1)])
#log-likelihood values for the different solutions
mixmult[3,]
#discard models that did not converge
mixmult[3,] <- ifelse(mixmult[3,]==0, NA, mixmult[3,])