R code for fitting a Finite Mixture Model to survival data

The following R code fits a Finite Mixture Model to survival (or reliability) data. The survival/reliability data may contain (right / interval) censored observations. However, it is also possible to fit the Finite Mixture Model to complete (uncensored) survival/reliability data.

The Finite Mixture Model is fitted by using the Expectation-Maximization (EM) algorithm. The fitted model assumes that the lifetime observations follow either a Weibull, lognormal, or Gaussian distribution.

Suggestions and/or questions? Please contact Stefan Gelissen (email: info at datall-analyse.nl).
See this page for an overview of all of Stefan’s R code blog posts.

 

R code

library(SPREDA)
library(LearnBayes)

##RIGHT CENSORED DATA

#function for fitting a finite mixture model to RIGHT censored or complete data
mixcensored <- function (y, d=rep(1, length(y)), wt=rep(1, length(y)),
                         xformula=~1, xdata=NULL,
                         dist="weibull", n, cluster=NULL, classify="EM",
                         maxiter=100, tol=1e-6) {
  #y are the lifetime observations
  #d is the censoring indicator (1=failure; 0=censored)
  #wt are the weights for the observations
  #xformula specifies the right hand side of the regression model to be fitted
  #xdata is a data frame containing the covariates in the xformula
  #dist: either the "weibull", "lognormal", or "gaussian" distribution
  #n is the number of components
  #cluster: start with random initialization of posterior probabilities (=NULL), or
  # a matrix with n columns of initial posterior probabilities for the observations
  #classify: "EM", "CEM", or "SEM" strategy
  #maxiter is the maximum number of iterations
  #tol is the convergence criterion
  
  nobs <- sum(wt) #number of observations
  
  if (xformula==~1) {
    termFormula <- 1
    parms <- matrix(NA, ncol=3, nrow=n)
    colnames(parms) <- c("logLik", "mu", "sigma")
    stdErr <- matrix(NA, ncol=2, nrow=n)
    colnames(stdErr) <- c("mu", "log(sigma)")
    dm <- data.frame(y,d,1)}
  else {
    termsXformula <- terms(xformula)
    nterm <- ncol(model.matrix(xformula, xdata))
    covNames <- colnames(model.matrix(xformula, xdata))
    termNames <- attr(termsXformula, "term.labels")
    termFormula <- paste(termNames, collapse="+")
    parms <- matrix(NA, ncol=nterm+2, nrow=n)
    colnames(parms) <- c("logLik", covNames, "sigma")
    stdErr <- matrix(NA, ncol=nterm+1, nrow=n)
    colnames(stdErr) <- c(covNames, "log(sigma)")
    dm <- data.frame(y,d,xdata)}
  
  mus <- matrix(NA, ncol=n, nrow=length(y))
  posteriorP <- matrix(NA, ncol=n, nrow=length(y))
  P <- matrix(NA, ncol=n, nrow=length(y))
  posSigma <- ncol(parms)
  
  iteration <- 0 #initialize iteration counter
  loglikInit <- logLikC <- 0 #initialize log-likelihood estimates
  
  #initialize posterior probabilities
  if (is.null(cluster)) {
    alpha <- rep(.1, n)
    posteriorP <- rdirichlet(length(y), alpha)}
  else {posteriorP <- cluster}
  
  while (iteration < maxiter) {
    #estimate prior probabilities
    priorP <- apply(wt*posteriorP, 2, sum)/nobs
    
    #estimate distribution parameters for each component
    if (classify=="EM"){
      for (i in 1:n) {
        wtp <- ifelse(wt*posteriorP[,i]<=0, 1e-15, wt*posteriorP[,i])
        mixform <- formula(paste("Surv(y,d)~", termFormula, sep=""))
        cp <- survreg(mixform, weights=wtp, dist=dist, data=dm)
        parms[i,] <- c(logLik(cp)[1], coef(cp), cp$scale)
        stdErr[i,] <- sqrt(diag(vcov(cp)))
        mus[,i] <- predict(cp, type="lp")}
    }
    
    else if (classify=="CEM"){
      compPost <- apply(posteriorP, 1, which.max)
      for (i in 1:n) {
        mixform <- formula(paste("Surv(y,d)~", termFormula, sep=""))
        cp <- survreg(mixform, weights=wt, dist=dist, data=dm, subset=(compPost==i))
        parms[i,] <- c(logLik(cp)[1], coef(cp), cp$scale)
        stdErr[i,] <- sqrt(diag(vcov(cp)))
        if (xformula==~1) {
          mus[,i] <- predict(cp, newdata=data.frame(rep(1,length(y))), type="lp")}
        else mus[,i] <- predict(cp, newdata=xdata, type="lp")}
    }
    
    else if (classify=="SEM"){
      compPost <- apply(posteriorP, 1, function (p) sample(x=1:n, size=1, prob=p))
      for (i in 1:n) {
        mixform <- formula(paste("Surv(y,d)~", termFormula, sep=""))
        cp <- survreg(mixform, weights=wt, dist=dist, data=dm, subset=(compPost==i))
        parms[i,] <- c(logLik(cp)[1], coef(cp), cp$scale)
        stdErr[i,] <- sqrt(diag(vcov(cp)))
        if (xformula==~1) {
          mus[,i] <- predict(cp, newdata=data.frame(rep(1,length(y))), type="lp")}
        else mus[,i] <- predict(cp, newdata=xdata, type="lp")}
    }
    
    #compute the (complete) log-likelihood value
    logLikC <- sum(parms[,1]) + sum(wt*(posteriorP%*%log(priorP)))
    logLikC <- ifelse(is.na(logLikC), 0, logLikC)
    
    #estimate posterior probabilities
    if (dist=="weibull") {
      for (j in 1:n) {
        mu <- mus[,j]; sigma <- parms[j, posSigma]
        z <- (log(y)-mu)/sigma
        P[,j] <- priorP[j]*((1/(sigma*y)*dsev(z))^d)*((1-psev(z))^(1-d))}
    }
    
    else if (dist=="lognormal") {
      for (j in 1:n) {
        mu <- mus[,j]; sigma <- parms[j, posSigma]
        z <- (log(y)-mu)/sigma
        P[,j] <- priorP[j]*((1/(sigma*y)*dnorm(z))^d)*((1-pnorm(z))^(1-d))}
    }
    
    else if (dist=="gaussian") {
      for (j in 1:n) {
        mu <- mus[,j]; sigma <- parms[j, posSigma]
        z <- (y-mu)/sigma
        P[,j] <- priorP[j]*((1/(sigma)*dnorm(z))^d)*((1-pnorm(z))^(1-d))}
    }
    
    posteriorP <- P/rowSums(P)
    
    #check convergence criterion
    if (( abs(logLikC-loglikInit) / (abs(loglikInit)+.001*tol) ) < tol) break
    loglikInit <- logLikC #update log-likelihood
    iteration <- iteration + 1 #increment counter
  }
  
  if (iteration == maxiter) warning("Maximum number of iterations exceeded")
  list(components=parms[,2:ncol(parms)], prior=priorP, loglik=logLikC,
       AIC=-2*logLikC + 2*(n-1+n*(ncol(parms)-1)),
       BIC=-2*logLikC + (n-1+n*(ncol(parms)-1))*log(nobs),
       strategy=classify, distribution=dist, iterations=iteration,
       standardError=stdErr, posterior=posteriorP)
}



##mixture of WEIBULL distributions

#generate mixture of Weibull distributions
N <- 400 #number of observations
probs <- c(.3, .7) #mixing proportions
n <- length(probs) #number of components
component <- sample(1:n, prob=probs, size=N, replace=TRUE)

#covariates
x1 <- rnorm(N, 10, 2)
x2 <- rnorm(N, 5, 1)

dataT <- rep(NA, N)

for(i in 1:N){
  if(component[i]==1){
    dataT[i] <- exp( (2 + .2*x1[i] - 1.5*x2[i]) + rsev(1)*.7)
  }else if(component[i]==2){
    dataT[i] <- exp( (1 + .5*x1[i] - 0.5*x2[i]) + rsev(1)*.35)}
}

plot(density(dataT)) #plot mixture of distributions

#apply censoring (here: Type I censoring)
#failure=1, censored=0
Cens <- ifelse(dataT>100, 0, 1)
N-sum(Cens) #number of censored observations
dataT <- ifelse(Cens==0, 100, dataT)



#fit parametric survival model to components
(comp1 <- survreg(Surv(dataT, Cens)~x1+x2, subset=(component==1), dist="weibull"))
(comp2 <- survreg(Surv(dataT, Cens)~x1+x2, subset=(component==2), dist="weibull"))



#fit Finite Mixture Model to the data

#run mixcensored repeatedly (applying the EM strategy)
#that is, fit repeatedly a mixture distribution to the data,
#and start each fit with a different random initialization of
#the posterior probabilities
mixmult <- replicate(5, mixcensored(y=dataT, d=Cens,
                                    xformula=~x1+x2,
                                    xdata=data.frame(x1=x1, x2=x2),
                                    dist="weibull", n=2, classify="EM"))
#discard models that did not converge
mixmult[3,] <- ifelse(mixmult[3,]==0, NA, mixmult[3,])
#number of discarded models
length(which(is.na(mixmult[3,])))
#select the solution with the maximum value of the log-likelihood
mixEM <- mixmult[, which.max(mixmult[3,])]
mixEM[1:6]

#use the standard errors for computing normal-approximation confidence intervals
mixEM[9]
#95% confidence interval for the intercept of component 1
mixEM$components[1,1] + c(1,-1)*qnorm(.05/2)*mixEM$standardError[1,1]
#95% confidence interval for sigma of component 2
exp(log(mixEM$components[2,4]) + c(1,-1)*qnorm(.05/2)*mixEM$standardError[2,4])



#run mixcensored repeatedly (applying the CEM strategy)
mixmult <- replicate(5, mixcensored(y=dataT, d=Cens,
                                    xformula=~x1+x2,
                                    xdata=data.frame(x1=x1, x2=x2),
                                    dist="weibull", n=2, classify="CEM"))
#discard models that did not converge
mixmult[3,] <- ifelse(mixmult[3,]==0, NA, mixmult[3,])
#number of discarded models
length(which(is.na(mixmult[3,])))
#select the solution with the maximum value of the log-likelihood
mixCEM <- mixmult[, which.max(mixmult[3,])]
mixCEM[1:6]



#compare EM with CEM
mixEM[1:6]
mixCEM[1:6]

#compare with the models fitted to the original components
comp1
comp2





##Finite Mixture model with INTERACTION TERM included

#generate data with interaction term included
for(i in 1:N){
  if(component[i]==1){
    dataT[i] <- exp( (2 + .2*x1[i] - 1.5*x2[i] + .005*x1[i]*x2[i]) + rsev(1)*.7)
  }else if(component[i]==2){
    dataT[i] <- exp( (1 + .5*x1[i] - 0.5*x2[i] + .001*x1[i]*x2[i]) + rsev(1)*.35)}
}

plot(density(dataT)) #plot mixture of distributions

#apply censoring (here: Type I censoring)
#failure=1, censored=0
Cens <- ifelse(dataT>100, 0, 1)
N-sum(Cens) #number of censored observations
dataT <- ifelse(Cens==0, 100, dataT)

#fit parametric survival model to components
(comp1 <- survreg(Surv(dataT, Cens)~x1+x2+x1:x2, subset=(component==1), dist="weibull"))
(comp2 <- survreg(Surv(dataT, Cens)~x1+x2+x1:x2, subset=(component==2), dist="weibull"))



#fit Finite Mixture Model to the data

#run mixcensored repeatedly (applying the CEM strategy)
mixmult <- replicate(5, mixcensored(y=dataT, d=Cens,
                                    xformula=~x1+x2+x1:x2,
                                    xdata=data.frame(x1=x1, x2=x2),
                                    dist="weibull", n=2, classify="CEM"))
#discard models that did not converge
mixmult[3,] <- ifelse(mixmult[3,]==0, NA, mixmult[3,])
#number of discarded models
length(which(is.na(mixmult[3,])))
#select the solution with the maximum value of the log-likelihood
mixCEM <- mixmult[, which.max(mixmult[3,])]
mixCEM[1:6]

#compare with the models fitted to the original components
comp1
comp2





##INTERVAL CENSORED DATA

library(SPREDA)
library(LearnBayes)

#function for fitting a finite mixture model to interval censored data

#NOTE: this function employs the type="interval" coding scheme as used
#in the "Surv" function (from the survival package) for interval censored data

mixcensoredInt <- function (y1, y2, d=rep(1, length(y1)), wt=rep(1, length(y1)),
                            xformula=~1, xdata=NULL,
                            dist="weibull", n, cluster=NULL, classify="EM",
                            maxiter=100, tol=1e-6) {
  #y1 is the right/left censored value, the exact lifetime observation, or for
  # interval censoring the lower value of the censoring interval
  #y2 is the upper value of the censoring interval
  #d is the censoring indicator (0=right censored, 1=event at time,
  # 2=left censored, 3=interval censored)
  #wt are the weights for the observations
  #xformula specifies the right hand side of the regression model to be fitted
  #xdata is a data frame containing the covariates in the xformula
  #dist: either the "weibull", "lognormal", or "gaussian" distribution
  #n is the number of components
  #cluster: start with random initialization of posterior probabilities (=NULL), or
  # a matrix with n columns of initial posterior probabilities for the observations
  #classify: "EM", "CEM", or "SEM" strategy
  #maxiter is the maximum number of iterations
  #tol is the convergence criterion
  
  nobs <- sum(wt) #number of observations
  
  if (xformula==~1) {
    termFormula <- 1
    parms <- matrix(NA, ncol=3, nrow=n)
    colnames(parms) <- c("logLik", "mu", "sigma")
    stdErr <- matrix(NA, ncol=2, nrow=n)
    colnames(stdErr) <- c("mu", "log(sigma)")
    dm <- data.frame(y1,y2,d,1)}
  else {
    termsXformula <- terms(xformula)
    nterm <- ncol(model.matrix(xformula, xdata))
    covNames <- colnames(model.matrix(xformula, xdata))
    termNames <- attr(termsXformula, "term.labels")
    termFormula <- paste(termNames, collapse="+")
    parms <- matrix(NA, ncol=nterm+2, nrow=n)
    colnames(parms) <- c("logLik", covNames, "sigma")
    stdErr <- matrix(NA, ncol=nterm+1, nrow=n)
    colnames(stdErr) <- c(covNames, "log(sigma)")
    dm <- data.frame(y1,y2,d,xdata)}
  
  mus <- matrix(NA, ncol=n, nrow=length(y1))
  posteriorP <- matrix(NA, ncol=n, nrow=length(y1))
  P <- matrix(NA, ncol=n, nrow=length(y1))
  posSigma <- ncol(parms)
  
  iteration <- 0 #initialize iteration counter
  loglikInit <- logLikC <- 0 #initialize log-likelihood estimates
  
  #initialize posterior probabilities
  if (is.null(cluster)) {
    alpha <- rep(.1, n)
    posteriorP <- rdirichlet(length(y1), alpha)}
  else {posteriorP <- cluster}
  
  while (iteration < maxiter) {
    #estimate prior probabilities
    priorP <- apply(wt*posteriorP, 2, sum)/nobs
    
    #estimate distribution parameters for each component
    if (classify=="EM"){
      for (i in 1:n) {
        wtp <- ifelse(wt*posteriorP[,i]<=0, 1e-15, wt*posteriorP[,i])
        mixform <- formula(paste("Surv(y1,y2,d, type='interval')~",
                                 termFormula, sep=""))
        cp <- survreg(mixform, weights=wtp, dist=dist, data=dm)
        parms[i,] <- c(logLik(cp)[1], coef(cp), cp$scale)
        stdErr[i,] <- sqrt(diag(vcov(cp)))
        mus[,i] <- predict(cp, type="lp")}
    }
    
    else if (classify=="CEM"){
      compPost <- apply(posteriorP, 1, which.max)
      for (i in 1:n) {
        mixform <- formula(paste("Surv(y1,y2,d, type='interval')~",
                                 termFormula, sep=""))
        cp <- survreg(mixform, weights=wt, dist=dist, data=dm, subset=(compPost==i))
        parms[i,] <- c(logLik(cp)[1], coef(cp), cp$scale)
        stdErr[i,] <- sqrt(diag(vcov(cp)))
        if (xformula==~1) {
          mus[,i] <- predict(cp, newdata=data.frame(rep(1,length(y1))), type="lp")}
        else mus[,i] <- predict(cp, newdata=xdata, type="lp")}
    }
    
    else if (classify=="SEM"){
      compPost <- apply(posteriorP, 1, function (p) sample(x=1:n, size=1, prob=p))
      for (i in 1:n) {
        mixform <- formula(paste("Surv(y1,y2,d, type='interval')~",
                                 termFormula, sep=""))
        cp <- survreg(mixform, weights=wt, dist=dist, data=dm, subset=(compPost==i))
        parms[i,] <- c(logLik(cp)[1], coef(cp), cp$scale)
        stdErr[i,] <- sqrt(diag(vcov(cp)))
        if (xformula==~1) {
          mus[,i] <- predict(cp, newdata=data.frame(rep(1,length(y1))), type="lp")}
        else mus[,i] <- predict(cp, newdata=xdata, type="lp")}
    }
    
    #compute the (complete) log-likelihood value
    logLikC <- sum(parms[,1]) + sum(wt*(posteriorP%*%log(priorP)))
    logLikC <- ifelse(is.na(logLikC), 0, logLikC)
    
    #estimate posterior probabilities
    if (dist=="weibull") {
      for (j in 1:n) {
        mu <- mus[,j]; sigma <- parms[j, posSigma]
        
        z1 <- (log(y1)-mu)/sigma
        z2 <- (log(y2)-mu)/sigma
        
        cp <- ifelse(d==0, 1-psev(z1), #right censoring
                     ifelse(d==1, 1/(sigma*y1) * dsev(z1), #exact observations
                            ifelse(d==2, psev(z1), #left censoring
                                   psev(z2)-psev(z1) ))) #interval censoring
        P[,j] <- priorP[j]*cp}
    }
    
    else if (dist=="lognormal") {
      for (j in 1:n) {
        mu <- mus[,j]; sigma <- parms[j, posSigma]
        
        z1 <- (log(y1)-mu)/sigma
        z2 <- (log(y2)-mu)/sigma
        
        cp <- ifelse(d==0, 1-pnorm(z1), #right censoring
                     ifelse(d==1, 1/(sigma*y1) * dnorm(z1), #exact observations
                            ifelse(d==2, pnorm(z1), #left censoring
                                   pnorm(z2)-pnorm(z1) )))
        P[,j] <- priorP[j]*cp}
    }
    
    else if (dist=="gaussian") {
      for (j in 1:n) {
        mu <- mus[,j]; sigma <- parms[j, posSigma]
        
        z1 <- (y1-mu)/sigma
        z2 <- (y2-mu)/sigma
        
        cp <- ifelse(d==0, 1-pnorm(z1), #right censoring
                     ifelse(d==1, 1/(sigma) * dnorm(z1), #exact observations
                            ifelse(d==2, pnorm(z1), #left censoring
                                   pnorm(z2)-pnorm(z1) )))
        P[,j] <- priorP[j]*cp}
    }
    
    posteriorP <- P/rowSums(P)
    
    #check convergence criterion
    if (( abs(logLikC-loglikInit) / (abs(loglikInit)+.001*tol) ) < tol) break
    loglikInit <- logLikC #update log-likelihood
    iteration <- iteration + 1 #increment counter
  }
  
  if (iteration == maxiter) warning("Maximum number of iterations exceeded")
  list(components=parms[,2:ncol(parms)], prior=priorP, loglik=logLikC,
       AIC=-2*logLikC + 2*(n-1+n*(ncol(parms)-1)),
       BIC=-2*logLikC + (n-1+n*(ncol(parms)-1))*log(nobs),
       strategy=classify, distribution=dist, iterations=iteration,
       standardError=stdErr, posterior=posteriorP)
}



##mixture of LOGNORMAL distributions

#generate mixture of lognormal distributions
N <- 600 #number of observations
probs <- c(.3, .7) #mixing proportions
n <- length(probs) #number of components
component <- sample(1:n, prob=probs, size=N, replace=TRUE)

#covariates
x1 <- rnorm(N, 2, .5)
x2 <- rnorm(N, 1, .1)

dataT <- rep(NA, N)

for(i in 1:N){
  if(component[i]==1){
    dataT[i] <- exp( (6 + .3*x1[i] - .2*x2[i]) + rnorm(1)*.7)
  }else if(component[i]==2){
    dataT[i] <- exp( (8 + .1*x1[i] + .3*x2[i]) + rnorm(1)*.35)}
}

#generate intervals for censoring
intCens <- seq(0,5000,100)

#assign observations to censoring intervals
assignCens <- findInterval(dataT, intCens)

#observations with >5000 are right censored and get value NA for y2
intCensRC <- c(intCens, NA)

#lower bound of censoring interval
L <- sapply(assignCens, function (k) intCensRC[k])

#upper bound of censoring interval
U <- sapply(assignCens, function (k) intCensRC[k+1])

#indicator for type of censoring
Cens <- ifelse(is.na(U), 0, 3)

#censoring in the interval [0,100] is identical to left censoring at t=100
Cens <- ifelse(L==0, 2, Cens) #set censoring indicator to 2 (=left censoring)
#for these left censored observations, set L to 100, and U to NA
L <- ifelse(Cens==2, U, L)
U <- ifelse(Cens==2, NA, U)

#inspect data
head(cbind(L, U, dataT, Cens, component), n=15)

#censoring distribution
table(Cens)



#fit parametric survival model to components
(comp1 <- survreg(Surv(L,U,Cens, type="interval")~x1+x2,
                  subset=(component==1), dist="lognormal"))
(comp2 <- survreg(Surv(L,U,Cens, type="interval")~x1+x2,
                  subset=(component==2), dist="lognormal"))



#fit a Finite Mixture Model to the interval censored data
mixmult <- replicate(5, mixcensoredInt(y1=L, y2=U, d=Cens,
                                       xformula=~x1+x2,
                                       xdata=data.frame(x1=x1, x2=x2),
                                       dist="lognormal", n=2,
                                       classify="CEM", maxiter=200))
#log-likelihood values for the different solutions
mixmult[3,]
#discard models that did not converge
mixmult[3,] <- ifelse(mixmult[3,]==0, NA, mixmult[3,])
#number of discarded models
length(which(is.na(mixmult[3,])))
#select the solution with the maximum value of the log-likelihood
mixCEM <- mixmult[, which.max(mixmult[3,])]
mixCEM[1:6]

#compare with the models fitted to the original components
comp1
comp2





#some runs might generate an error
#this is a way of handing these errors
mixmult <- replicate(5, try(mixcensoredInt(y1=L, y2=U, d=Cens,
                                           xformula=~x1+x2,
                                           xdata=data.frame(x1=x1, x2=x2),
                                           dist="lognormal", n=2,
                                           classify="CEM", maxiter=200),
                            silent=TRUE),
                     simplify=FALSE)
#number of runs that generated an error
length(which(lapply(mixmult, length)==1))
#discard runs that generated an error
mixmult <- simplify2array(mixmult[which(lapply(mixmult, length)>1)])
#log-likelihood values for the different solutions
mixmult[3,]
#discard models that did not converge
mixmult[3,] <- ifelse(mixmult[3,]==0, NA, mixmult[3,])
#number of discarded models
length(which(is.na(mixmult[3,])))
#select the solution with the maximum value of the log-likelihood
mixmultCEM <- mixmult[, which.max(mixmult[3,])]
mixmultCEM[1:6]