In my previous blog post I showed how to fit a copula to censored data. For the ease of use, I’m going to call these fitted copulas censored copulas.
The following R code demonstrates how these censored copulas in turn can be used for fitting a quantile regression model to censored data.
A more detailed description of the method employed for fitting the quantile regression model can be found in this blog post.
Suggestions and/or questions? Please contact Stefan Gelissen (email: info at datall-analyse.nl).
See this page for an overview of all of Stefan’s R code blog posts.
R code
library(copula)
library(MASS)
library(survival)
library(SPREDA)
#R functions for fitting a copula to censored data (using the
#two-stage parametric ML estimation method), and fitting a quantile regression
#model by means of a copula can be downloaded from:
source("http://www.datall-analyse.nl/R/quantile_regression_censored_copula.R")
#Have a look at the functions loglikCensored (used for fitting the copula
#to censored data) and qrCopulaCensored (used for fitting a quantile regression
#model to censored data), and notice that at the beginning of the scripts you
#will find a description of the functions' arguments.
loglikCensored
qrCopulaCensored
##EXAMPLE 1: bivariate Gaussian copula with normally and Weibull distributed marginals
##Generate some artificial data using a bivariate Gaussian copula
#Bivariate Gaussian copula
myCop <- normalCopula(param=-.7, dim=2)
#Marginals:
#the first marginal is a normal distribution, and
#the second marginal is a Weibull distribution
myMvdc <- mvdc(copula=myCop, c("norm", "myWeibull"),
list(list(mean=5, sd=.8),
list(location=4.5, scale=.7)))
#Visualize the copula
persp(myMvdc, dMvdc, n.grid=50, xlim=c(2, 8), ylim=c(0, 300))
#Draw sample (this sample will subsequently be used as the artificial data)
ns <- 200 #sample size
xys <- rMvdc(ns, myMvdc)
#Inspect the generated data
plot(xys[,1], xys[,2], xlab="x", ylab="y")
#Apply random censoring to the observations of variable y. The censored
#observations will subsequently be treated as right censored cases.
censored <- rmyWeibull(ns, location=5.2, scale=.8)
events <- ifelse(xys[,2] > censored, 0, 1)
xys[,2] <- pmin(xys[,2], censored)
#Plot the censored data
plot(xys[,1], xys[,2], xlab="x", ylab="y", col=events+1)
legend("topright", pch=c(1, 1), col=c(1, 2),
legend=c("censored", "uncensored"),
bty="n", y.intersp=1.2, cex=.7)
#Censoring distribution: 0=right censored, 1=uncensored
table(events)
##Fit a bivariate Gaussian copula to the artificial data
#by means of the two-stage parametric ML estimation method, also referred
#to as the method of Inference Functions for Margins (IFM).
#Stage 1: fit the marginals, and obtain the marginal parameters
(normMLE <- fitdistr(xys[,1], densfun="normal"))
normMean1 <- normMLE$estimate[1]
normSD1 <- normMLE$estimate[2]
(weibullMLE <- survreg(Surv(time=xys[,2], events) ~ 1, dist="weibull"))
weibullLocation2 <- coef(weibullMLE)
weibullScale2 <- weibullMLE$scale
#Compute the cumulative probabilities for the marginals
udat <- cbind(pnorm(xys[,1], mean=normMean1, sd=normSD1),
pmyWeibull(xys[,2], location=weibullLocation2, scale=weibullScale2))
#Stage 2: fix the parameters of the marginals and estimate the copula parameter
#Starting value for the copula parameter
(tauEst <- sin(cor(xys[events==1, 1], xys[events==1, 2], method="kendall")*pi/2))
fitGaussian <- optim(tauEst, loglikCensored, method="BFGS",
hessian=TRUE,
control=list(fnscale=-1), copula=myCop,
u=udat, which.censored=2, event=events)
#Did the optimization converge?
fitGaussian$convergence # 0 indicates successful convergence
#MLE for the copula parameter (remember: the true value for the parameter was -.7)
fitGaussian$par
#Standard error for MLE copula parameter
SEparms <- sqrt(diag(solve(-1*fitGaussian$hessian)))
#Construct 95% confidence interval for the copula parameter
(ci <- fitGaussian$par + c(1,-1)*qnorm(.05/2)*SEparms)
#When using the two-stage parametric ML method, the standard error of the MLE
#for the copula parameter is usually underestimated (i.e., too small).
#The reason for this underestimation is that the marginal MLEs were fixed at
#the second stage of the fitting process. As a consequence, when estimating
#the copula parameter at this second stage, variation in the MLEs of the
#marginals is not taken into account.
#You may use a resampling method (such as a nonparametric bootstrap method)
#for estimating standard errors that include the variability of marginal MLEs.
##Fit a quantile regression model
#We will employ a mvdc object as template
myMvdc2 <- mvdc(copula=normalCopula(param=1, dim=2), c("norm", "myWeibull"),
list(list(mean=0, sd=1),
list(location=0, scale=1)))
#Combine the MLEs in a vector
params <- c(normMean1, normSD1, weibullLocation2, weibullScale2, fitGaussian$par)
#Set the parameters of the template
myMvdc2 <- copula:::setMvdcPar(myMvdc2, params)
##Quantile regression: regress y on x
xpoints <- seq(3, 8, length.out=100)
#Compute the median
ymed <- sapply(xpoints, function (k) qrCopulaCensored(copula=myMvdc2, p=.5,
marginal=2, x=k))
#Compute the .05th-quantile
y05 <- sapply(xpoints, function (k) qrCopulaCensored(copula=myMvdc2, p=.05,
marginal=2, x=k))
#Compute the .95th-quantile
y95 <- sapply(xpoints, function (k) qrCopulaCensored(copula=myMvdc2, p=.95,
marginal=2, x=k))
#Plot the results
plot(xys[,1], xys[,2], xlab="x", ylab="y", pch=events, col=gray(.2,.5))
#Add the median
lines(xpoints, ymed, lty=1, col="red")
#Add the 90% prediction interval
lines(xpoints, y05, lty=2, col="blue")
lines(xpoints, y95, lty=2, col="blue")
#Add a legend
legend("topright", lty=c(1, 2, NA, NA), pch=c(NA, NA, 0, 1),
col=c("red", "blue", gray(.2,.5), gray(.2,.5)),
legend=c("median", "90% prediction interval", "censored", "uncensored"),
bty="n", y.intersp=1.2, cex=.7)
#Compute the true quantile curves
ymedT <- sapply(xpoints, function (k) qrCopulaCensored(copula=myMvdc, p=.5,
marginal=2, x=k))
y05T <- sapply(xpoints, function (k) qrCopulaCensored(copula=myMvdc, p=.05,
marginal=2, x=k))
y95T <- sapply(xpoints, function (k) qrCopulaCensored(copula=myMvdc, p=.95,
marginal=2, x=k))
#Add the true quantile curves to the plot
lines(xpoints, ymedT, lty=2, lwd=2, col="purple")
lines(xpoints, y05T, lty=2, lwd=2, col="purple")
lines(xpoints, y95T, lty=2, lwd=2, col="purple")
#As a comparison, we will use the crq function from the quantreg package.
#Similar to qrCopulaCensored(), crq() also fits quantile regression
#models to censored data.
library(quantreg)
#Fit a quantile regression model
crqData <- data.frame(y=xys[,2], x1=xys[,1])
crqmod <- crq(Surv(log(y), event=events, type="right") ~ x1,
data=crqData, method="Portnoy")
#Corresponding quantile curves
ypointsmed <- exp(cbind(1, xpoints)%*%coef(crqmod, taus=.5))
ypoints05 <- exp(cbind(1, xpoints)%*%coef(crqmod, taus=.05))
ypoints95 <- exp(cbind(1, xpoints)%*%coef(crqmod, taus=.95))
#Add these quantile curves to the plot
lines(xpoints, ypointsmed, lty=2, col="darkgreen")
lines(xpoints, ypoints05, lty=2, col="darkgreen")
lines(xpoints, ypoints95, lty=2, col="darkgreen")
##EXAMPLE 2: Gaussian copula with a 3-parameter Weibull as marginal
##Generate some artificial data using a bivariate Gaussian copula
#Bivariate Gaussian copula
myCop <- normalCopula(param=-.7, dim=2)
#Marginals:
#the first marginal is a normal distribution, and
#the second marginal is a 3-parameter Weibull distribution
myMvdc <- mvdc(copula=myCop, c("norm", "myWeibull"),
list(list(mean=5, sd=.8),
list(location=4.5, scale=.7, threshold=90)))
#Visualize the copula
persp(myMvdc, dMvdc, n.grid=50, xlim=c(2, 8), ylim=c(90, 500))
#Draw sample (this sample will subsequently be used as the artificial data)
ns <- 200 #sample size
xys <- rMvdc(ns, myMvdc)
#Inspect the generated data
plot(xys[,1], xys[,2], xlab="x", ylab="y", ylim=c(85, max(xys[,2])))
abline(h=90, lty=2, col="red") #threshold of 3-parameter Weibull
#Apply random censoring to the observations of variable y. The censored
#observations will subsequently be treated as right censored cases.
censored <- rmyWeibull(ns, location=5.2, scale=.8, threshold=110)
events <- ifelse(xys[,2] > censored, 0, 1)
xys[,2] <- pmin(xys[,2], censored)
#Plot the censored data
plot(xys[,1], xys[,2], xlab="x", ylab="y", col=events+1)
legend("topright", pch=c(1, 1), col=c(1, 2),
legend=c("censored", "uncensored"),
bty="n", y.intersp=1.2, cex=.7)
#Censoring distribution: 0=right censored, 1=uncensored
table(events)
##Fit a bivariate Gaussian copula to the artificial data
#by means of the two-stage parametric ML estimation method, also referred
#to as the method of Inference Functions for Margins (IFM).
#Stage 1: fit the marginals, and obtain the marginal parameters
#Fit a normal distribution to the observations of variable x
(normMLE <- fitdistr(xys[,1], densfun="normal"))
normMean1 <- normMLE$estimate[1]
normSD1 <- normMLE$estimate[2]
#Fit a 3-parameter Weibull distribution to the observations of variable y
#Profile function for the threshold (=gamma) parameter
profile.t <- function(gammahat, x, event) {
x <- x-gammahat
fit <- survreg(Surv(x, event) ~ 1, dist="weibull", subset=(x>0))
fit$loglik[1]
}
#Optimization
gammaMLE <- optim(par=0, profile.t, method="L-BFGS-B",
upper=(1-(1/1e+6))*min(xys[events==1, 2]),
control=list(fnscale=-1),
x=xys[,2], event=events)
gamma <- gammaMLE$par #MLE for the threshold parameter
names(gamma) <- "gamma"
#MLEs for the location (=mu) and scale (=sigma) parameters
sgamma <- Surv(xys[,2]-gamma, events)
musigma <- survreg(sgamma ~ 1, dist="weibull", subset=(xys[,2]-gamma>0))
mu <- coef(musigma); names(mu) <- "mu"
sigma <- musigma$scale; names(sigma) <- "sigma"
c(mu, sigma, gamma) #summary
weibullLocation2 <- mu
weibullScale2 <- sigma
weibullThreshold2 <- gamma
#Compute the cumulative probabilities for the marginals
udat <- cbind(pnorm(xys[,1], mean=normMean1, sd=normSD1),
pmyWeibull(xys[,2], location=weibullLocation2, scale=weibullScale2,
threshold=weibullThreshold2))
#Stage 2: fix the parameters of the marginals and estimate the copula parameter
#Starting value for the copula parameter
(tauEst <- sin(cor(xys[events==1, 1], xys[events==1, 2], method="kendall")*pi/2))
fitGaussian <- optim(tauEst, loglikCensored, method="BFGS",
hessian=TRUE,
control=list(fnscale=-1), copula=myCop,
u=udat, which.censored=2, event=events)
#Did the optimization converge?
fitGaussian$convergence # 0 indicates successful convergence
#MLE for the copula parameter (remember: the true value for the parameter was -.7)
fitGaussian$par
#Standard error for MLE copula parameter
SEparms <- sqrt(diag(solve(-1*fitGaussian$hessian)))
#Construct 95% confidence interval for the copula parameter
(ci <- fitGaussian$par + c(1,-1)*qnorm(.05/2)*SEparms)
#Remember that when using the two-stage parametric ML method, the standard error
#of the MLE for the copula parameter is usually underestimated (i.e., too small).
##Fit a quantile regression model
#Construct a template
myMvdc2 <- mvdc(copula=normalCopula(param=1, dim=2), c("norm", "myWeibull"),
list(list(mean=0, sd=1),
list(location=0, scale=1, threshold=0)))
#Combine the MLEs
params <- c(normMean1, normSD1, weibullLocation2, weibullScale2, weibullThreshold2,
fitGaussian$par)
#Set the parameters of the template
myMvdc2 <- copula:::setMvdcPar(myMvdc2, params)
##Quantile regression: regress y on x
xpoints <- seq(3, 8, length.out=100)
#Compute the median
ymed <- sapply(xpoints, function (k) qrCopulaCensored(copula=myMvdc2, p=.5,
marginal=2, x=k))
#Compute the .05th-quantile
y05 <- sapply(xpoints, function (k) qrCopulaCensored(copula=myMvdc2, p=.05,
marginal=2, x=k))
#Compute the .95th-quantile
y95 <- sapply(xpoints, function (k) qrCopulaCensored(copula=myMvdc2, p=.95,
marginal=2, x=k))
#Plot the results
plot(xys[,1], xys[,2], xlab="x", ylab="y", pch=events, col=gray(.2,.5))
#Add the median
lines(xpoints, ymed, lty=1, col="red")
#Add the 90% prediction interval
lines(xpoints, y05, lty=2, col="blue")
lines(xpoints, y95, lty=2, col="blue")
#Add a legend
legend("topright", lty=c(1, 2, NA, NA), pch=c(NA, NA, 0, 1),
col=c("red", "blue", gray(.2,.5), gray(.2,.5)),
legend=c("median", "90% prediction interval", "censored", "uncensored"),
bty="n", y.intersp=1.2, cex=.7)
#Compute the true quantile curves.
ymedT <- sapply(xpoints, function (k) qrCopulaCensored(copula=myMvdc, p=.5,
marginal=2, x=k))
y05T <- sapply(xpoints, function (k) qrCopulaCensored(copula=myMvdc, p=.05,
marginal=2, x=k))
y95T <- sapply(xpoints, function (k) qrCopulaCensored(copula=myMvdc, p=.95,
marginal=2, x=k))
#Add the true quantile curves to the plot
lines(xpoints, ymedT, lty=2, lwd=2, col="purple")
lines(xpoints, y05T, lty=2, lwd=2, col="purple")
lines(xpoints, y95T, lty=2, lwd=2, col="purple")
#As a final comparison, we will fit a parametric survival model.
#Profile function for the threshold (=gamma) parameter of the parametric
#survival model
profile.t2 <- function(gammahat, y, cov, event) {
y <- y-gammahat
fit <- survreg(Surv(y, event) ~ cov, dist="weibull", subset=(y>0))
fit$loglik[2]
}
#Optimization
gammaMLE <- optim(par=0, profile.t2, method="L-BFGS-B",
upper=(1-(1/1e+6))*min(xys[events==1, 2]),
control=list(fnscale=-1),
y=xys[,2], cov=xys[,1], event=events)
gamma <- gammaMLE$par #MLE for the threshold parameter
names(gamma) <- "gamma"
#MLEs for the remaining parameters of the survival model
sgamma <- Surv(xys[,2]-gamma, events)
musigma <- survreg(sgamma ~ xys[,1], dist="weibull", subset=(xys[,2]-gamma>0))
mu <- coef(musigma); names(mu) <- c("intercept", "x")
sigma <- musigma$scale; names(sigma) <- "sigma"
c(mu, sigma, gamma) #summary of parameters
#Compute the quantile curves for the fitted survival model
mus <- cbind(1, xpoints)%*%coef(musigma)
ypointsmed <- qmyWeibull(.5, location=mus, scale=sigma, threshold=gamma)
ypoints05 <- qmyWeibull(.05, location=mus, scale=sigma, threshold=gamma)
ypoints95 <- qmyWeibull(.95, location=mus, scale=sigma, threshold=gamma)
#Add the quantile curves to the plot
lines(xpoints, ypointsmed, lty=2, col="darkgreen")
lines(xpoints, ypoints05, lty=2, col="darkgreen")
lines(xpoints, ypoints95, lty=2, col="darkgreen")