Error propagation in R: Monte Carlo simulations using copulas

Physicists and engineers often have to calculate the uncertainty in a derived quantity.

For instance, a test engineer repeatedly measures two separate angles. The uncertainty (or error) in the measurement of each of these angles appears to be +/- 1 degree. Subsequently, the engineer calculates the sum of the two angles. This sum is a derived quantity. Since the derived quantity is composed of two measured quantities, each with its own uncertainty, what is the uncertainty (or error) in the derived quantity? In other words, how does the uncertainty propagate from the measured quantities (the two angles) to the derived quantity (the sum of the two angles)?
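
For this angle example the question can already be answered with a quick Monte Carlo simulation, provided the two angles are measured independently. The sketch below assumes that the +/- 1 degree uncertainty corresponds to one standard deviation of a normal measurement error, and uses hypothetical angle values of 30 and 45 degrees:

#minimal sketch: error propagation for the sum of two independent angles
#(assumption: +/- 1 degree is interpreted as 1 sd of a normal measurement error)
nsimAngle <- 100000
angle1 <- rnorm(nsimAngle, mean=30, sd=1) #hypothetical first angle of 30 degrees
angle2 <- rnorm(nsimAngle, mean=45, sd=1) #hypothetical second angle of 45 degrees
angleSum <- angle1 + angle2 #derived quantity (sum of the two angles)
sd(angleSum) #approximately sqrt(1^2+1^2) = 1.4 degrees

For independent normal errors this simply reproduces the familiar root-sum-of-squares rule.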

The following R code calculates the uncertainty in a derived quantity using a Monte Carlo simulation. Moreover, the Monte Carlo simulation employs copulas that fall into the Archimedean class of copulas. This class consists of members such as the Clayton and Gumbel copula. These Archimedean copulas make it possible to calculate the uncertainty in a derived quantity when 1) the measured quantities are dependent (correlated), and 2) the measured quantities follow a normal or some asymmetric distribution (e.g., an exponential, lognormal, or Weibull distribution).

R code that uses a Gaussian copula for calculating the uncertainty in a derived quantity can be found in this blog post.

A comprehensive introduction to Archimedean copulas is given by Salvadori et al. in their 2007 book Extremes in nature: an approach using copulas.

Suggestions and/or questions? Please contact Stefan Gelissen (email: info at datall-analyse.nl).
See this page for an overview of all of Stefan’s R code blog posts.

R code (last update: 2016-10-20)

library(copula)
library(survival)

##generate artificial data
#- denote the measured quantities by l and b
#- the derived quantity is equal to l*b
#- assume that l follows a normal and b an exponential distribution
#- the measured quantities l and b are dependent
#- use a Clayton copula for simulating the dependence between l and b

#Clayton copula
myCop <- claytonCopula(param=.7, dim=2)

#margins: normal and exponential distributions
myMvdc <- mvdc(copula=myCop, c("norm", "exp"),
               list(list(mean=25, sd=.02), list(rate=.05)))

#draw sample
ns <- 50 #sample size
lb <- rMvdc(ns, myMvdc)

#inspect data
plot(lb[,1], lb[,2], xlab="l", ylab="b")



##fit Archimedean copula to data
#fitted are a Clayton and Gumbel copula
#for other copulas see Salvadori et al., Appendix C, pp. 233-264

#obtain start values for fitting the copulas
expRate <- exp(-coef(survreg(Surv(lb[,2])~1, dist="exponential")))
normMean <- mean(lb[,1])
nNorm <- length(lb[,1])
sdNorm <- sqrt((nNorm-1)/nNorm)*sd(lb[,1])

#fit Clayton copula to data
myClayton <- mvdc(copula=claytonCopula(param=1, dim=2), c("norm", "exp"),
                  list(list(mean=0, sd=1), list(rate=1)))

tauEst <- cor(lb[,1], lb[,2], method="kendall")
#for the Clayton copula Kendall's tau = theta/(theta+2), hence theta = 2*tau/(1-tau)
thetaClayton <- (2*tauEst)/(1-tauEst)

start <- c(normMean, sdNorm, expRate, thetaClayton)

fitClayton <- fitMvdc(lb, myClayton, start=start)
fitClayton

#estimate of the derived quantity (which is equal to l*b)
fitClayton@estimate[1]*(1/fitClayton@estimate[3])



#fit Gumbel copula to data
myGumbel <- mvdc(copula=gumbelCopula(param=1.5, dim=2), c("norm", "exp"),
                 list(list(mean=0, sd=1), list(rate=1)))

tauEst <- cor(lb[,1], lb[,2], method="kendall")
#for the Gumbel copula Kendall's tau = 1 - 1/theta, hence theta = 1/(1-tau)
thetaGumbel <- 1/(1-tauEst)

start <- c(normMean, sdNorm, expRate, thetaGumbel)

fitGumbel <- fitMvdc(lb, myGumbel, start=start)
fitGumbel

#estimate of the derived quantity (which is equal to l*b)
fitGumbel@estimate[1]*(1/fitGumbel@estimate[3])



#compare log-likelihoods of the fitted Clayton and Gumbel copulas
fitClayton@loglik
fitGumbel@loglik



##diagnostics
#Kendall plot and level curves are used as diagnostic tools

#generate sequence of cdf values for u and v
u <- seq(0, 1, length.out=100)
v <- seq(0, 1, length.out=100)
uv <- as.matrix(expand.grid(u,v))

#observations
uobs <- lb[,1]
vobs <- lb[,2]
nobs <- length(uobs) #sample size

#1) Kendall plot
#Kendall's measure (non-parametric)
kn <- sapply(1:nobs, function(i) {
  sum(as.numeric(uobs < uobs[i] & vobs < vobs[i])) / (nobs-1)})

#Kendall's measure (parametric)
#(see Salvadori et al., pp. 147-148)

#Clayton copula
myCopClayton <- copClayton
myCopClayton@theta <- fitClayton@mvdc@copula@parameters
KuClayton <- pK(u, cop=myCopClayton, d=2)

#Gumbel copula
myCopGumbel <- copGumbel
myCopGumbel@theta <- fitGumbel@mvdc@copula@parameters
KuGumbel <- pK(u, cop=myCopGumbel, d=2)

#Kendall plot
plot(sort(kn), (1:nobs)/nobs, type="p", xlim=c(0,1), log="y",
     ylab="Kc", xlab="t", col=gray(.5))
lines(u, KuClayton, col="blue")
lines(u, KuGumbel, col="darkgreen")
legend("bottomright", lty=1, col=c("blue", "darkgreen"), legend=c("Clayton", "Gumbel"))



#2) compare the level curves of the theoretical and empirical copula
#see Salvadori et al., pp. 140-142

#empirical copula (Salvadori et al., p.140, eq. (3.31))
ec <- C.n(u=uv, X=pobs(cbind(uobs,vobs)))
ec <- matrix(ec, nrow=length(u))
cL <- contourLines(u, v, z=ec, levels=seq(.1,.9,.1))

#Clayton copula
contour(claytonCopula(param=fitClayton@mvdc@copula@parameters), pCopula)
invisible(lapply(cL, lines, lwd=1, lty=2, col="red"))

#Gumbel copula
contour(gumbelCopula(param=fitGumbel@mvdc@copula@parameters), pCopula)
invisible(lapply(cL, lines, lwd=1, lty=2, col="red"))



##Monte Carlo simulation
#proceed with the fitted Clayton copula for running the Monte Carlo simulation

#set up a Clayton copula with Student t and exponential margins
corLBfit <- fitClayton@mvdc@copula@parameters
meanLfit <- fitClayton@estimate[1]
nl <- length(lb[,1])
sdLfit <- sqrt(nl/(nl-1))*fitClayton@estimate[2] #bias-corrected (sample) sd
rateBfit <- fitClayton@estimate[3]

mcClayton <- mvdc(copula=claytonCopula(param=corLBfit, dim=2),
                  margins=c("t", "exp"),
                  paramMargins=list(list(df=nl-1), list(rate=rateBfit)))

#Monte Carlo simulation
nsim <- 100000
blSim <- rMvdc(nsim, mcClayton)
blSim[,1] <- meanLfit + blSim[,1]*sdLfit #shift and scale the t margin using the fitted mean and sd of l
prodSim <- blSim[,1]*blSim[,2]
#results Monte Carlo simulation
hist(prodSim) #skewed/asymmetric distribution
mean(prodSim)
sd(prodSim)

#percentile confidence intervals
alpha <- .05
quantile(prodSim, probs=c(alpha/2, 1-alpha/2))

#function for calculating the shortest coverage interval
sci <- function (values, alpha=.05){
  sortedSim <- sort(values)
  nsim <- length(values)
  #widths of all intervals that contain (1-alpha)*100% of the sorted values
  covInt <- sapply(1:(nsim-round((1-alpha)*nsim)), function(i) {
    sortedSim[1+round((1-alpha)*nsim)+(i-1)]-sortedSim[1+(i-1)]})
  #lower and upper limit of the narrowest (i.e., shortest) of these intervals
  lcl <- sortedSim[which(covInt==min(covInt))]
  ucl <- sortedSim[1+round((1-alpha)*nsim)+(which(covInt==min(covInt))-1)]
  c(lcl, ucl)
}

#shortest 95% coverage interval
(simInt <- round(sci(prodSim, alpha=.05), 4))
#draw limits
hist(prodSim)
abline(v=simInt, col="blue", lty=2, lwd=2)



##confidence interval estimation when taking the sample size into account
#the above procedure for calculating the confidence interval does not take the
#sample size (i.e., n=50) into account, whereas the following procedure does

n <- nrow(lb) #original sample size
Nmc <- 50000 #number of samples
meanNmc <- numeric(Nmc)

#sample with replacement from skewed distribution of l*b
meanNmc <- replicate(Nmc, mean(sample(prodSim, n, replace=TRUE)))

#the central limit theorem states that the sampling distribution of the mean
#approaches a normal distribution as the sample size gets sufficiently large,
#regardless of the shape of the parent distribution
#thus, repeatedly taking a sample from the above skewed distribution (of l*b)
#and computing its mean should yield a sampling distribution that is approximately
#normal, assuming the original sample size (of n=50) was sufficiently large
hist(meanNmc) #approaches a normal distribution
mean(meanNmc)
sd(meanNmc)

#percentile confidence intervals
alpha <- .05
quantile(meanNmc, probs=c(alpha/2, 1-alpha/2))

#shortest 95% coverage interval
(simInt <- round(sci(meanNmc, alpha=.05), 4))
#draw limits
hist(meanNmc)
abline(v=simInt, col="blue", lty=2, lwd=2)




##parametric bootstrap method
#compare the latter Monte Carlo simulation confidence intervals (i.e., when the sample size
#of n=50 was taken into account) with those obtained by a parametric bootstrap method

#initialize bootstrap
B <- 5000 #number of bootstrap samples
estimateB <- numeric(B)
n <- nrow(lb)

#perform bootstrap
for (i in 1:B) {
  xyb <- rMvdc(n, mcClayton)
  xyb[,1] <- meanLfit + xyb[,1]*sdLfit
  mxb <- mean(xyb[,1])
  myb <- 1/exp(-coef(survreg(Surv(xyb[,2])~1, dist="exponential")))
  estimateB[i] <- mxb*myb
}

#check results bootstrap
hist(estimateB)
mean(estimateB)
sd(estimateB)

#percentile confidence intervals
alpha <- .05
quantile(estimateB, probs=c(alpha/2, 1-alpha/2))

#shortest 95% coverage interval
(simIntB <- round(sci(estimateB, alpha=.05), 4))
#draw limits
hist(estimateB)
abline(v=simIntB, col="blue", lty=2, lwd=2)





##coverage probability of the confidence intervals when taking sample size into account
#coverage probability estimation of the Monte Carlo confidence intervals
#when repeatedly sampling from the skewed distribution of l*b

#the following code demonstrates that the coverage probability of these latter
#confidence intervals is close to the nominal level

#note: running this code may take some time

#true (population) values
#assume that l and b follow a normal and exponential distribution, respectively
meanLT <- 25 #mean of l
sdLT <- .02 #standard deviation of l
rateBT <- .05 #mean of b is 20, thus rate is 1/20=.05
XYt <- meanLT*(1/rateBT) #true value of the derived quantity (i.e., l*b)

#initialize Monte Carlo simulation
nsim <- 500 #number of simulations
alpha <- .05 #nominal coverage probability is equal to 1-alpha
m <- 50 #sample size

#population Clayton copula
myMvdc <- mvdc(copula=claytonCopula(param=.7, dim=2), c("norm", "exp"),
               list(list(mean=25, sd=.02), list(rate=.05)))
#fitted Clayton
myClayton <- mvdc(copula=claytonCopula(param=1, dim=2), c("norm", "exp"),
                  list(list(mean=0, sd=1), list(rate=1)))

#estimate the coverage probability of the Monte Carlo simulation
nMC <- 100000
nSS <- 50000
uclB <- numeric(nsim)
lclB <- numeric(nsim)
meanNmcXY <- numeric(nMC)

for (j in 1:nsim) {
  #sample from population copula
  xyb <- rMvdc(m, myMvdc)
  
  #fit Clayton copula to sample
  expRate <- exp(-coef(survreg(Surv(xyb[,2])~1, dist="exponential")))
  normMean <- mean(xyb[,1])
  sdNorm <- sqrt((m-1)/m)*sd(xyb[,1])
  tauEst <- cor(xyb[,1], xyb[,2], method="kendall")
  thetaClayton <- (2*tauEst)/(1-tauEst)
  fitClayton <- fitMvdc(xyb, myClayton, start=c(normMean, sdNorm, expRate, thetaClayton))
  
  #Monte Carlo simulation using fitted Clayton copula
  corXYfit <- fitClayton@mvdc@copula@parameters
  meanXfit <- fitClayton@estimate[1]
  sdXfit <- sqrt(m/(m-1))*fitClayton@estimate[2]
  rateYfit <- fitClayton@estimate[3]
  
  mcClayton <- mvdc(copula=claytonCopula(param=corXYfit, dim=2),
                    margins=c("t", "exp"),
                    paramMargins=list(list(df=m-1), list(rate=rateYfit)))
  
  xySim <- rMvdc(nMC, mcClayton)
  xySim[,1] <- meanXfit + xySim[,1]*sdXfit
  
  hatXYsim <- xySim[,1]*xySim[,2]
  
  #sample with replacement from skewed distribution
  meanNmcXY <- replicate(nSS, mean(sample(hatXYsim, m, replace=TRUE)))
  
  #shortest coverage interval
  sortedSim <- sort(meanNmcXY)
  covInt <- sapply(1:(nSS-round((1-alpha)*nSS)), function(i) {
    sortedSim[1+round((1-alpha)*nSS)+(i-1)]-sortedSim[1+(i-1)]})
  
  lclB[j] <- sortedSim[which(covInt==min(covInt))]
  uclB[j] <- sortedSim[1+round((1-alpha)*nSS)+(which(covInt==min(covInt))-1)]
}


#coverage probability
mean(lclB < XYt & uclB > XYt) #result will be 94% (approximately)
#average width of confidence intervals
mean(uclB-lclB) #result will be 275 (approximately)