R code for fitting a quantile regression model to censored data by means of a copula

In my previous blog post I showed how to fit a copula to censored data. For the ease of use, I’m going to call these fitted copulas censored copulas.

The following R code demonstrates how these censored copulas in turn can be used for fitting a quantile regression model to censored data.

A more detailed description of the method employed for fitting the quantile regression model can be found in this blog post.

Suggestions and/or questions? Please contact Stefan Gelissen (email: info at datall-analyse.nl).
See this page for an overview of all of Stefan’s
R code blog posts.

R code

library(copula)
library(MASS)
library(survival)
library(SPREDA)

#R functions for fitting a copula to censored data (using the
#two-stage parametric ML estimation method), and fitting a quantile regression
#model by means of a copula can be downloaded from: 
source("http://www.datall-analyse.nl/R/quantile_regression_censored_copula.R")

#Have a look at the functions loglikCensored (used for fitting the copula
#to censored data) and qrCopulaCensored (used for fitting a quantile regression
#model to censored data), and notice that at the beginning of the scripts you
#will find a description of the functions' arguments.
loglikCensored
qrCopulaCensored



##EXAMPLE 1: bivariate Gaussian copula with normally and Weibull distributed marginals

##Generate some artificial data using a bivariate Gaussian copula

#Bivariate Gaussian copula
myCop <- normalCopula(param=-.7, dim=2)

#Marginals:
#the first marginal is a normal distribution, and
#the second marginal is a Weibull distribution
myMvdc <- mvdc(copula=myCop, c("norm", "myWeibull"),
               list(list(mean=5, sd=.8),
                    list(location=4.5, scale=.7)))

#Visualize the copula
persp(myMvdc, dMvdc, n.grid=50, xlim=c(2, 8), ylim=c(0, 300))

#Draw sample (this sample will subsequently be used as the artificial data)
ns <- 200 #sample size
xys <- rMvdc(ns, myMvdc)

#Inspect the generated data
plot(xys[,1], xys[,2], xlab="x", ylab="y")

#Apply random censoring to the observations of variable y. The censored
#observations will subsequently be treated as right censored cases.
censored <- rmyWeibull(ns, location=5.2, scale=.8)
events <- ifelse(xys[,2] > censored, 0, 1)
xys[,2] <- pmin(xys[,2], censored)

#Plot the censored data
plot(xys[,1], xys[,2], xlab="x", ylab="y", col=events+1)
legend("topright", pch=c(1, 1), col=c(1, 2),
       legend=c("censored", "uncensored"),
       bty="n", y.intersp=1.2, cex=.7)

#Censoring distribution: 0=right censored, 1=uncensored
table(events)



##Fit a bivariate Gaussian copula to the artificial data
#by means of the two-stage parametric ML estimation method, also referred
#to as the method of Inference Functions for Margins (IFM).

#Stage 1: fit the marginals, and obtain the marginal parameters
(normMLE <- fitdistr(xys[,1], densfun="normal"))
normMean1 <- normMLE$estimate[1]
normSD1 <- normMLE$estimate[2]

(weibullMLE <- survreg(Surv(time=xys[,2], events) ~ 1, dist="weibull"))
weibullLocation2 <- coef(weibullMLE)
weibullScale2 <- weibullMLE$scale

#Compute the cumulative probabilities for the marginals
udat <- cbind(pnorm(xys[,1], mean=normMean1, sd=normSD1),
              pmyWeibull(xys[,2], location=weibullLocation2, scale=weibullScale2))


#Stage 2: fix the parameters of the marginals and estimate the copula parameter

#Starting value for the copula parameter
(tauEst <- sin(cor(xys[events==1, 1], xys[events==1, 2], method="kendall")*pi/2))

fitGaussian <- optim(tauEst, loglikCensored, method="BFGS",
                     hessian=TRUE,
                     control=list(fnscale=-1), copula=myCop,
                     u=udat, which.censored=2, event=events)

#Did the optimization converge? 
fitGaussian$convergence # 0 indicates successful convergence
#MLE for the copula parameter (remember: the true value for the parameter was -.7)
fitGaussian$par
#Standard error for MLE copula parameter
SEparms <- sqrt(diag(solve(-1*fitGaussian$hessian)))
#Construct 95% confidence interval for the copula parameter
(ci <- fitGaussian$par + c(1,-1)*qnorm(.05/2)*SEparms)

#When using the two-stage parametric ML method, the standard error of the MLE
#for the copula parameter is usually underestimated (i.e., too small).
#The reason for this underestimation is that the marginal MLEs were fixed at
#the second stage of the fitting process. As a consequence, when estimating
#the copula parameter at this second stage, variation in the MLEs of the
#marginals is not taken into account.
#You may use a resampling method (such as a nonparametric bootstrap method)
#for estimating standard errors that include the variability of marginal MLEs.


##Fit a quantile regression model

#We will employ a mvdc object as template
myMvdc2 <- mvdc(copula=normalCopula(param=1, dim=2), c("norm", "myWeibull"),
                list(list(mean=0, sd=1),
                     list(location=0, scale=1)))

#Combine the MLEs in a vector
params <- c(normMean1, normSD1, weibullLocation2, weibullScale2, fitGaussian$par)

#Set the parameters of the template
myMvdc2 <- copula:::setMvdcPar(myMvdc2, params)


##Quantile regression: regress y on x
xpoints <- seq(3, 8, length.out=100)
#Compute the median
ymed <- sapply(xpoints, function (k) qrCopulaCensored(copula=myMvdc2, p=.5,
                                                      marginal=2, x=k))
#Compute the .05th-quantile
y05 <- sapply(xpoints, function (k) qrCopulaCensored(copula=myMvdc2, p=.05,
                                                     marginal=2, x=k))
#Compute the .95th-quantile
y95 <- sapply(xpoints, function (k) qrCopulaCensored(copula=myMvdc2, p=.95,
                                                     marginal=2, x=k))


#Plot the results
plot(xys[,1], xys[,2], xlab="x", ylab="y", pch=events, col=gray(.2,.5))
#Add the median
lines(xpoints, ymed, lty=1, col="red")
#Add the 90% prediction interval
lines(xpoints, y05, lty=2, col="blue")
lines(xpoints, y95, lty=2, col="blue")
#Add a legend
legend("topright", lty=c(1, 2, NA, NA), pch=c(NA, NA, 0, 1),
       col=c("red", "blue", gray(.2,.5), gray(.2,.5)),
       legend=c("median", "90% prediction interval", "censored", "uncensored"),
       bty="n", y.intersp=1.2, cex=.7)


#Compute the true quantile curves
ymedT <- sapply(xpoints, function (k) qrCopulaCensored(copula=myMvdc, p=.5,
                                                       marginal=2, x=k))
y05T <- sapply(xpoints, function (k) qrCopulaCensored(copula=myMvdc, p=.05,
                                                      marginal=2, x=k))
y95T <- sapply(xpoints, function (k) qrCopulaCensored(copula=myMvdc, p=.95,
                                                      marginal=2, x=k))

#Add the true quantile curves to the plot
lines(xpoints, ymedT, lty=2, lwd=2, col="purple")
lines(xpoints, y05T, lty=2, lwd=2, col="purple")
lines(xpoints, y95T, lty=2, lwd=2, col="purple")


#As a comparison, we will use the crq function from the quantreg package.
#Similar to qrCopulaCensored(), crq() also fits quantile regression
#models to censored data.
library(quantreg)

#Fit a quantile regression model
crqData <- data.frame(y=xys[,2], x1=xys[,1])
crqmod <- crq(Surv(log(y), event=events, type="right") ~ x1,
              data=crqData, method="Portnoy")

#Corresponding quantile curves
ypointsmed <- exp(cbind(1, xpoints)%*%coef(crqmod, taus=.5))
ypoints05 <- exp(cbind(1, xpoints)%*%coef(crqmod, taus=.05))
ypoints95 <- exp(cbind(1, xpoints)%*%coef(crqmod, taus=.95))

#Add these quantile curves to the plot
lines(xpoints, ypointsmed, lty=2, col="darkgreen")
lines(xpoints, ypoints05, lty=2, col="darkgreen")
lines(xpoints, ypoints95, lty=2, col="darkgreen")





##EXAMPLE 2: Gaussian copula with a 3-parameter Weibull as marginal

##Generate some artificial data using a bivariate Gaussian copula

#Bivariate Gaussian copula
myCop <- normalCopula(param=-.7, dim=2)

#Marginals:
#the first marginal is a normal distribution, and
#the second marginal is a 3-parameter Weibull distribution
myMvdc <- mvdc(copula=myCop, c("norm", "myWeibull"),
               list(list(mean=5, sd=.8),
                    list(location=4.5, scale=.7, threshold=90)))

#Visualize the copula
persp(myMvdc, dMvdc, n.grid=50, xlim=c(2, 8), ylim=c(90, 500))

#Draw sample (this sample will subsequently be used as the artificial data)
ns <- 200 #sample size
xys <- rMvdc(ns, myMvdc)

#Inspect the generated data
plot(xys[,1], xys[,2], xlab="x", ylab="y", ylim=c(85, max(xys[,2])))
abline(h=90, lty=2, col="red") #threshold of 3-parameter Weibull

#Apply random censoring to the observations of variable y. The censored
#observations will subsequently be treated as right censored cases.
censored <- rmyWeibull(ns, location=5.2, scale=.8, threshold=110)
events <- ifelse(xys[,2] > censored, 0, 1)
xys[,2] <- pmin(xys[,2], censored)

#Plot the censored data
plot(xys[,1], xys[,2], xlab="x", ylab="y", col=events+1)
legend("topright", pch=c(1, 1), col=c(1, 2),
       legend=c("censored", "uncensored"),
       bty="n", y.intersp=1.2, cex=.7)

#Censoring distribution: 0=right censored, 1=uncensored
table(events)



##Fit a bivariate Gaussian copula to the artificial data
#by means of the two-stage parametric ML estimation method, also referred
#to as the method of Inference Functions for Margins (IFM).

#Stage 1: fit the marginals, and obtain the marginal parameters

#Fit a normal distribution to the observations of variable x
(normMLE <- fitdistr(xys[,1], densfun="normal"))
normMean1 <- normMLE$estimate[1]
normSD1 <- normMLE$estimate[2]

#Fit a 3-parameter Weibull distribution to the observations of variable y

#Profile function for the threshold (=gamma) parameter
profile.t <- function(gammahat, x, event) {
  x <- x-gammahat
  fit <- survreg(Surv(x, event) ~ 1, dist="weibull", subset=(x>0))
  fit$loglik[1]
}

#Optimization
gammaMLE <- optim(par=0, profile.t, method="L-BFGS-B",
                  upper=(1-(1/1e+6))*min(xys[events==1, 2]),
                  control=list(fnscale=-1),
                  x=xys[,2], event=events)

gamma <- gammaMLE$par #MLE for the threshold parameter
names(gamma) <- "gamma"

#MLEs for the location (=mu) and scale (=sigma) parameters
sgamma <- Surv(xys[,2]-gamma, events)
musigma <- survreg(sgamma ~ 1, dist="weibull", subset=(xys[,2]-gamma>0))

mu <- coef(musigma); names(mu) <- "mu"
sigma <- musigma$scale; names(sigma) <- "sigma"
c(mu, sigma, gamma) #summary

weibullLocation2 <- mu
weibullScale2 <- sigma
weibullThreshold2 <- gamma

#Compute the cumulative probabilities for the marginals
udat <- cbind(pnorm(xys[,1], mean=normMean1, sd=normSD1),
              pmyWeibull(xys[,2], location=weibullLocation2, scale=weibullScale2,
                         threshold=weibullThreshold2))


#Stage 2: fix the parameters of the marginals and estimate the copula parameter

#Starting value for the copula parameter
(tauEst <- sin(cor(xys[events==1, 1], xys[events==1, 2], method="kendall")*pi/2))

fitGaussian <- optim(tauEst, loglikCensored, method="BFGS",
                     hessian=TRUE,
                     control=list(fnscale=-1), copula=myCop,
                     u=udat, which.censored=2, event=events)

#Did the optimization converge? 
fitGaussian$convergence # 0 indicates successful convergence
#MLE for the copula parameter (remember: the true value for the parameter was -.7)
fitGaussian$par
#Standard error for MLE copula parameter
SEparms <- sqrt(diag(solve(-1*fitGaussian$hessian)))
#Construct 95% confidence interval for the copula parameter
(ci <- fitGaussian$par + c(1,-1)*qnorm(.05/2)*SEparms)

#Remember that when using the two-stage parametric ML method, the standard error
#of the MLE for the copula parameter is usually underestimated (i.e., too small).



##Fit a quantile regression model

#Construct a template
myMvdc2 <- mvdc(copula=normalCopula(param=1, dim=2), c("norm", "myWeibull"),
                list(list(mean=0, sd=1),
                     list(location=0, scale=1, threshold=0)))

#Combine the MLEs
params <- c(normMean1, normSD1, weibullLocation2, weibullScale2, weibullThreshold2,
            fitGaussian$par)

#Set the parameters of the template
myMvdc2 <- copula:::setMvdcPar(myMvdc2, params)


##Quantile regression: regress y on x
xpoints <- seq(3, 8, length.out=100)
#Compute the median
ymed <- sapply(xpoints, function (k) qrCopulaCensored(copula=myMvdc2, p=.5,
                                                      marginal=2, x=k))
#Compute the .05th-quantile
y05 <- sapply(xpoints, function (k) qrCopulaCensored(copula=myMvdc2, p=.05,
                                                     marginal=2, x=k))
#Compute the .95th-quantile
y95 <- sapply(xpoints, function (k) qrCopulaCensored(copula=myMvdc2, p=.95,
                                                     marginal=2, x=k))


#Plot the results
plot(xys[,1], xys[,2], xlab="x", ylab="y", pch=events, col=gray(.2,.5))
#Add the median
lines(xpoints, ymed, lty=1, col="red")
#Add the 90% prediction interval
lines(xpoints, y05, lty=2, col="blue")
lines(xpoints, y95, lty=2, col="blue")
#Add a legend
legend("topright", lty=c(1, 2, NA, NA), pch=c(NA, NA, 0, 1),
       col=c("red", "blue", gray(.2,.5), gray(.2,.5)),
       legend=c("median", "90% prediction interval", "censored", "uncensored"),
       bty="n", y.intersp=1.2, cex=.7)


#Compute the true quantile curves.
ymedT <- sapply(xpoints, function (k) qrCopulaCensored(copula=myMvdc, p=.5,
                                                       marginal=2, x=k))
y05T <- sapply(xpoints, function (k) qrCopulaCensored(copula=myMvdc, p=.05,
                                                      marginal=2, x=k))
y95T <- sapply(xpoints, function (k) qrCopulaCensored(copula=myMvdc, p=.95,
                                                      marginal=2, x=k))

#Add the true quantile curves to the plot
lines(xpoints, ymedT, lty=2, lwd=2, col="purple")
lines(xpoints, y05T, lty=2, lwd=2, col="purple")
lines(xpoints, y95T, lty=2, lwd=2, col="purple")



#As a final comparison, we will fit a parametric survival model.

#Profile function for the threshold (=gamma) parameter of the parametric
#survival model
profile.t2 <- function(gammahat, y, cov, event) {
  y <- y-gammahat
  fit <- survreg(Surv(y, event) ~ cov, dist="weibull", subset=(y>0))
  fit$loglik[2]
}

#Optimization
gammaMLE <- optim(par=0, profile.t2, method="L-BFGS-B",
                  upper=(1-(1/1e+6))*min(xys[events==1, 2]),
                  control=list(fnscale=-1),
                  y=xys[,2], cov=xys[,1], event=events)

gamma <- gammaMLE$par #MLE for the threshold parameter
names(gamma) <- "gamma"

#MLEs for the remaining parameters of the survival model
sgamma <- Surv(xys[,2]-gamma, events)
musigma <- survreg(sgamma ~ xys[,1], dist="weibull", subset=(xys[,2]-gamma>0))

mu <- coef(musigma); names(mu) <- c("intercept", "x")
sigma <- musigma$scale; names(sigma) <- "sigma"
c(mu, sigma, gamma) #summary of parameters

#Compute the quantile curves for the fitted survival model
mus <- cbind(1, xpoints)%*%coef(musigma)
ypointsmed <- qmyWeibull(.5, location=mus, scale=sigma, threshold=gamma)
ypoints05 <- qmyWeibull(.05, location=mus, scale=sigma, threshold=gamma)
ypoints95 <- qmyWeibull(.95, location=mus, scale=sigma, threshold=gamma)

#Add the quantile curves to the plot
lines(xpoints, ypointsmed, lty=2, col="darkgreen")
lines(xpoints, ypoints05, lty=2, col="darkgreen")
lines(xpoints, ypoints95, lty=2, col="darkgreen")