Contents

1 Consistency of the estimator in \(\texttt{ebmstate::CoxRFX}\)

This section shows the results of a simulation study to assess the consistency of the estimator in \(\texttt{CoxRFX}\) (an estimator of the regression coefficients in a Cox model).

In the following grid of plots, each column of the grid is based on a different batch of 500 data sets simulated from the same illness-death model. This model is a clock-reset Cox model with 10 regression coefficients for each transition (one for each of 10 covariates), and a Gompertz baseline hazard. The (prior) distribution of the parameters in each transition was assumed to be Gaussian (with undetermined mean and variance). The only difference between batches/columns is the number of observations (i.e. patients) per data set: 250 observations in the first column, 1000 in the second, 2000 in the third and 4000 in the fourth.

The top row plots compare the true coefficient values with the mean estimate over the 500 data sets in a batch; they give strong indication that the bias vanishes as the sample size increases. At the same time, the distribution of the estimator is concentrated inside increasingly smaller neighbourhoods around the true parameter value. This is strongly suggested by the middle row plots, which show a vanishing sample variance for each individual parameter, and also the bottom row ones, which show the empirical density of the sum of squared errors becoming concentrated in smaller and smaller neighbourhoods around zero.


consistency of the CoxRFX estimator.

Figure 1.1: consistency of the CoxRFX estimator.


Code to perform the simulation


Set the parameters of the simulation

set.seed(20078)
library(mvtnorm)
library(ebmstate)

n<-4000 # number of patients
covariate_names<-paste0("Cov",1:10) #number of covariates (for each transition)
nGroups<-1 #number of groups per transition
nParam<-3*length(covariate_names) #total number of parameters (regression coefficients)
nr_simulated_data_sets<-500
param<-runif(n=nParam,min = -0.5,max = 1) #simulation of parameters
file1<-"../data/coxph_vs_coxrfx_sim_illness_death_4000obs.Rdata"

Generate data by simulation

coefficient_estimates<-matrix(nrow = nr_simulated_data_sets,ncol = 3*length(covariate_names))
mu_estimates<-matrix(nrow = nr_simulated_data_sets,ncol=3*nGroups)
sigma2_estimates<-matrix(nrow = nr_simulated_data_sets,ncol = 3*nGroups)

coefficient_estimates_coxph<-matrix(nrow = nr_simulated_data_sets,ncol = 3*length(covariate_names))


for (j in 1:nr_simulated_data_sets){
  #covariates
  if(length(covariate_names)>1){
    covariate_matrix<-t(sapply(rep(length(covariate_names),n),function(x) rbinom(n=x,size = 1,prob = 0.5)))
  }else{
    covariate_matrix<-matrix(rbinom(n,size = 1,prob = 0.5),ncol=1)
  }
  
  colnames(covariate_matrix)<-covariate_names

  #relative risks (relative hazards)
  rel.risk_trans1<-exp(covariate_matrix%*%param[(1+length(covariate_names)*0):(length(covariate_names)*1)])
  rel.risk_trans2<-exp(covariate_matrix%*%param[(1+length(covariate_names)*1):(length(covariate_names)*2)])
  rel.risk_trans3<-exp(covariate_matrix%*%param[(1+length(covariate_names)*2):(length(covariate_names)*3)])
  
  #Generate a transition history for each patient. Clock-reset Cox model. Baseline hazard is Gompertz for all transitions. 

  m<-matrix(c(flexsurv::rgompertz(n, shape=0.1, rate = rel.risk_trans1*exp(-4.5)),flexsurv::rgompertz(n, shape=0.1, rate = rel.risk_trans2*exp(-4.65))),ncol = 2)
  v1<-apply(m,1,which.min)
  m<-cbind(sapply(1:nrow(m),function(x) m[x,v1[x]]),v1)
  m<-cbind(m,sapply(1:nrow(m), function(x) ifelse(m[x,2]==1,flexsurv::rgompertz(1,shape = 0.15,rate = rel.risk_trans3[x]*exp(-5.5)),NA)))
  m<-cbind(m,apply(m[,c(1,3)],1,sum,na.rm=T))
  m<-cbind(m,rexp(n,0.03))
  m<-cbind(m,(m[,5]<m[,4]))
  colnames(m)<-c("state1_duration","transition","state2_duration","total_time", "cens_time","cens=1")
  m<-as.data.frame(m)

  #convert the data to long format
  mstate.data<-data.frame()

  for(i in 1:nrow(m)){
    id<-rep(i,2)
    from<-c(1,1)
    to<-c(2,3)
    trans<-c(1,2)
    Tstart<-c(0,0)
    Tstop<-rep(min(m$state1_duration[i],m$cens_time[i]),2)
    time<-Tstop-Tstart
    status<-as.numeric(c(m$transition[i]==1 & m$cens_time[i]>m$state1_duration[i],m$transition[i]==2 & m$cens_time[i]>m$state1_duration[i]))
    mstate.data<-rbind(mstate.data,data.frame(id=id,from=from,to=to,                                             trans=trans,Tstart=Tstart,Tstop=Tstop,time=time,status=status)) 
    if(status[1]==1){
      id<-i
      from<-2
      to<-4
      trans<-3
      Tstart<-Tstop[1]
      Tstop<-min(m$state1_duration[i]+m$state2_duration[i],m$cens_time[i])
      time<-Tstop-Tstart
      status<-as.numeric(m$state1_duration[i]+m$state2_duration[i]<m$cens_time[i])
      mstate.data<-rbind(mstate.data,data.frame(id=id,from=from,to=to,trans=trans,
                                                Tstart=Tstart,Tstop=Tstop,time=time,status=status))
    }
  }
  
  #add covariates
  mstate.data<-cbind(mstate.data,covariate_matrix[mstate.data$id,])
  
  #attributes and class
  tmat<-mstate::transMat(x=list(c(2,3),c(4),c(),c()),names=c("health","illness","death","death_after_illness"))
  class(mstate.data)<-c("data.frame","msdata")
  attr(mstate.data,"trans")<-tmat
  
  #expand covariates
  mstate.data<-mstate::expand.covs(mstate.data,covs =names(mstate.data)[-(1:8)])
  
  #Fit empirical Bayes clock-reset Cox model. 
  
  #argument 'Z' of coxrfx
  Z<-mstate.data[,-(1:(8+length(covariate_names)))]
  Z$strata<-Z$trans<-mstate.data$trans
  class(Z) <- c("data.frame")
  
  #argument 'surv' of coxrfx
  surv<-survival::Surv(mstate.data$time,mstate.data$status)
  
  #argument 'groups' of coxrfx
  groups<-as.vector(sapply(c("trans1","trans2","trans3"),function(x) rep(x,nParam/3)))
  
  #fit random effects model
  coxrfx_object<-CoxRFX(Z,surv,groups,max.iter = 600,tol = 0.0001,sigma.hat = "df")

  coefficient_estimates[j,]<-coxrfx_object$coefficients
  mu_estimates[j,]<-coxrfx_object$mu
  sigma2_estimates[j,]<-coxrfx_object$sigma2
  
 
  if(j %%10==0){
    save(coefficient_estimates,mu_estimates,sigma2_estimates,param,file = file1)
  }

  print(j)
}

Code to generate grid of plots

file_paths<-c("../data/coxph_vs_coxrfx_sim_illness_death_250obs.Rdata","../data/coxph_vs_coxrfx_sim_illness_death_1000obs.Rdata","../data/coxph_vs_coxrfx_sim_illness_death_2000obs.Rdata","../data/coxph_vs_coxrfx_sim_illness_death_4000obs.Rdata")

pdf("./plots/coxrfx_vs_coxph_plot_grid_10covs_per_trans_one_prior_per_trans.pdf",height=8,width = 9)

par(mfcol=c(3,4),mar=c(5.1, 4.1, 3.1, 1.1))
for(file1 in file_paths){
#mse comparison 
load(file1)
#reorder matrix of estimates
coefficient_estimates<-coefficient_estimates[,c(seq(1,ncol(coefficient_estimates),3),seq(2,ncol(coefficient_estimates),3),seq(3,ncol(coefficient_estimates),3))]
  
#errors
errors_coxrfx<-t(coefficient_estimates)-param
errors_coxph<-t(coefficient_estimates_coxph)-param

#sse
sse_coxrfx<-apply(errors_coxrfx^2,2,sum,na.rm=T)

file2<-strsplit(file1,"/data/")[[1]][2]
file3<-strsplit(file2,".Rdata")

plot(0,type="n",xlim=c(-0.5,1),ylim=c(-0.5,1),ylab="average point estimate",xlab="true parameter",cex.lab=0.8,cex.axis=1,las=1,cex.lab=1.3)
points(param,apply(coefficient_estimates,2,mean,na.rm=T),cex=0.8,col="red")
abline(a=0,b=1)

var_coef_estimates<-apply(coefficient_estimates,2,var,na.rm=T)
var_coef_estimates_coxph<-apply(coefficient_estimates_coxph,2,var,na.rm=T)
barplot(var_coef_estimates,col=c("red"),beside=T,las=1,ylim=c(0,0.18),ylab = "sample variance",cex.lab=1.1,mgp=c(3,1,0))
mtext("regression coefficient",side = 1,line = 1,cex=0.8)

plot(density(sse_coxrfx,from = 0,kernel = "gaussian",na.rm = T),main="",ylab ="estimated density",xlab="sum of squared errors",xlim=c(0,5.5),ylim=c(0,7.5),col="red",lty=1,las=1,cex.lab=1.3)

}

dev.off()

2 Computation of state occupation probabilities for clock-reset models

In \(\texttt{ebmstate}\), the estimation of state occupation probabilities under clock-reset models, implemented in the functions \(\texttt{probtrans_ebmstate}\) and \(\texttt{probtrans_fft}\), relies on the estimators defined in (3) to (5) of section 3.2 of the paper. Substituting \(:=\) for \(\approx\) and removing the ‘hats’ in definitions (3) to (5), we obtain the approximate equalities that justify the estimators. We justify these approximate equalities as follows.

Approximate equality justifying the estimator in definition (3), section 3.2: \[\begin{align*} q_{j-1,j}\left(k\right)&:=\mathrm{P}\left[X_{j}=j, \tau_{j-1,j}\in \left[t_{k},t_{k+1}\right)\,|\,X_{j-1}=j-1\right]\\ &\approx f\left[X_{j}=j,\tau_{j-1,j}=t_{k}\,|\, X_{j-1}=j-1\right] \Delta t\\ &=\lim_{h \downarrow 0}\frac{\mathrm{P}\left[X_{j}=j,\tau_{j-1,j}\in \left[t_{k},t_{k}+h\right)\,|\,X_{j-1}=j-1\right]}{h}\Delta t\\ &=\lim_{h \downarrow 0}\mathrm{P} \left[\tau_{j-1,j} \geq t_{k}\,|\,X_{j-1}=j-1\right]\frac{\mathrm{P}\left[X_{j}=j,\tau_{j-1,j}\in \left[t_{k},t_{k}+h\right)|\tau_{j-1,j} \geq t_{k},X_{j-1}=j-1\right]}{h} \Delta t\\ &=\mathrm{P} \left[\tau_{j-1,j} \geq t_{k}\,|\,X_{j-1}=j-1\right]\lambda_{j-1,j}\left(t_{k}\right)\,\Delta t\\ &\approx \exp\left[-\Lambda_{j-1}\left(t_{k}\right)\right]\,\Delta\Lambda_{j-1,j}\left(t_{k}\right)\;. \end{align*}\]
Approximate equality justifying the estimator in definition (4), section 3.2:

\[\begin{align*} q_{0j}\left(k\right)&:=\mathrm{P}\left[X_{j}=j,\tau_{0,j}\in\left[t_{k},t_{k+1}\right)\,|\,X_{0}=0\right]\\ &\approx f\left[X_{j}=j,\tau_{0,j}=t_{k}\,|\,X_{0}=0\right]\Delta t\\ &=\int_{0}^{t_{k}}f\left[X_{j}=j,\tau_{0,j}=t_{k},X_{j-1}=j-1,\tau_{0,j-1}=u\,|\,X_{0}=0\right]\mathrm{d}u\Delta t\\ &=\int_{0}^{t_{k}}f\left[X_{j}=j,\tau_{0,j}=t_{k}\,|\,X_{j-1}=j-1,\tau_{0,j-1}=u ,X_{0}=0\right]f\left[X_{j-1}=j-1,\tau_{0,j-1}=u\,|\,X_{0}=0\right]\mathrm{d}u\Delta t\\ &=\int_{0}^{t_{k}}f\left[X_{j}=j,\tau_{0,j}=t_{k}\,|\,X_{j-1}=j-1,\tau_{0,j-1}=u \right]f\left[X_{j-1}=j-1,\tau_{0,j-1}=u\,|\,X_{0}=0\right]\mathrm{d}u\Delta t\\ &=\int_{0}^{t_{k}}\lim_{h \downarrow 0} \frac{\mathrm{P}\left[X_{j}=j,\tau_{0,j}\in \left[t_{k},t_{k}+h\right)\,|\, X_{j-1}=j-1,\tau_{0,j-1}=u \right]}{h}\,f\left[X_{j-1}=j-1,\tau_{0,j-1}=u\,|\,X_{0}=0\right]\mathrm{d}u\Delta t\\ &=\int_{0}^{t_{k}}\lim_{h \downarrow 0} \mathrm{P}\left[\tau_{0,j}\geq t_{k} \,|\,X_{j-1}=j-1,\tau_{0,j-1}=u\right] \frac{\mathrm{P}\left[X_{j}=j,\tau_{0,j}\in \left[t_{k},t_{k}+h\right)\,|\, \tau_{0,j}\geq t_{k},X_{j-1}=j-1,\tau_{0,j-1}=u \right]}{h}\,f\left[X_{j-1}=j-1,\tau_{0,j-1}=u\,|\,X_{0}=0\right]\mathrm{d}u\Delta t\\ &=\int_{0}^{t_{k}}\lim_{h \downarrow 0} \mathrm{P}\left[\tau_{j-1,j}\geq t_{k}-u \,|\,X_{j-1}=j-1\right] \frac{\mathrm{P}\left[X_{j}=j,\tau_{j-1,j}\in \left[t_{k}-u,t_{k}-u+h\right)\,|\, \tau_{j-1,j}\geq t_{k}-u,X_{j-1}=j-1\right]}{h}\,f\left[X_{j-1}=j-1,\tau_{0,j-1}=u\,|\,X_{0}=0\right]\mathrm{d}u\Delta t\\ &=\int_{0}^{t_{k}}\exp\left[-\Lambda_{j-1}\left(t_{k}-u\right)\right]\,\lambda_{j-1,j}\left(t_{k}-u\right) f\left[X_{j-1}=j-1,\tau_{0,j-1}=u\,|\,X_{0}=0\right]\mathrm{d}u\Delta t\\ &\approx \int_{0}^{t_{k}}\exp\left[-\Lambda_{j-1}\left(t_{k}-u\right)\right]\,\Delta\Lambda_{j-1,j}\left(t_{k}-u\right) f\left[X_{j-1}=j-1,\tau_{0,j-1}=u\,|\,X_{0}=0\right]\mathrm{d}u\\ &\approx \sum_{l=0}^{k-1} q_{j-1,j}\left(k-l-1\right)q_{0,j-1}\left(l\right) \;. \end{align*}\]
Approximate equality justifying the estimator in definition (5), section 3.2:

\[\begin{align*} p_{0n}\left(k\right)&=\mathrm{P}\left[X_{n}=n,\tau_{ 0, n}< t_{k} , \tau_{ n, n+1}\geq t_{k}-\tau_{ 0, n}\,|\, X_{0}=0\right]\\ &=\int_{0}^{t_{k}}\int_{t_{k}-u}^{\infty}f\left[X_{n}=n,\tau_{ 0, n}=u,\tau_{ n, n+1}=v\,|\, X_{0}=0\right]\mathrm{d}v\,\mathrm{d}u\\ &=\int_{0}^{t_{k}}\int_{t_{k}-u}^{\infty}f\left[\tau_{ n, n+1}=v|X_{n}=n,\tau_{ 0, n}=u,X_{0}=0\right]f\left[X_{n}=n,\tau_{ 0, n}=u\,|\, X_{0}=0\right]\mathrm{d}v \,\mathrm{d}u\\ &=\int_{0}^{t_{k}}\int_{t_{k}-u}^{\infty}f\left[\tau_{ n, n+1}=v|X_{n}=n,\tau_{ 0, n}=u\right]f\left[X_{n}=n,\tau_{ 0, n}=u\,|\, X_{0}=0\right]\mathrm{d}v \,\mathrm{d}u\\ &=\int_{0}^{t_{k}} \mathrm{P}\left[\tau_{ n, n+1}>t_{k}-u|X_{n}=n,\tau_{ 0, n}=u\right]f\left[X_{n}=n,\tau_{ 0, n}=u\,|\, X_{0}=0\right]\mathrm{d}u\\ &=\int_{0}^{t_{k}} \mathrm{P}\left[\tau_{ n, n+1}>t_{k}-u|X_{n}=n \right]f\left[X_{n}=n,\tau_{ 0, n}=u\,|\, X_{0}=0\right]\mathrm{d}u\\ &=\int_{0}^{t_{k}} \exp\left[-\Lambda_{n}\left(t_{k}-u\right)\right]\, f\left[X_{n}=n,\tau_{ 0, n}=u\,|\, X_{0}=0\right]\mathrm{d}u\\ &\approx \sum_{l=0}^{k-1}r_{n}\left(k-l-1\right)q_{0n}\left(l\right)\quad. \end{align*}\]

3 Testing \(\texttt{probtrans_ebmstate}\)

The following plot shows that probtrans_ebmstate can accurately compute state occupation probabilities under clock-reset Cox models, when it is given the vectors of cumulative hazards for each transition. The dashed red lines were computed using a data set of 100,000 simulated disease histories for the same patient (or, equivalently, 100,000 patients with the same vector of covariates). For any time t, these lines give the relative frequencies of each state. They are superimposed on solid black lines which represent the state occupation probabilities as computed by probtrans_ebmstate, when the true (Gompertz) cumulative hazards are given to it as input.

accuracy of the estimator of state occupation probabilities in the function probtrans_ebmstate.

Figure 3.1: accuracy of the estimator of state occupation probabilities in the function probtrans_ebmstate.


Code used to generate the previous plot.

# Load packages and set working directory
set.seed(9873)
library(ebmstate)
library(flexsurv)

#generate a vector of covariates for the patient
nCovs<-50
covariate_vector<-rbinom(nCovs,size = 1,prob = 0.1)

#compute the relative risk (better said: relative hazard) of the patient for each transition
param<-runif(n=150,min = -0.5,max = 0.5)
rel.risk_trans1<-exp(sum(covariate_vector*param[(1+nCovs*0):(nCovs*1)]))
rel.risk_trans2<-exp(sum(covariate_vector*param[(1+nCovs*1):(nCovs*2)]))
rel.risk_trans3<-exp(sum(covariate_vector*param[(1+nCovs*2):(nCovs*3)]))

#generate 100,000 uncensored observations for the same patient
n<-100000
m<-matrix(c(rgompertz(n, shape=0.1, rate = rel.risk_trans1*exp(-4.5)),rgompertz(n, shape=0.1, rate = rel.risk_trans2*exp(-4.65))),ncol = 2)
v1<-apply(m,1,which.min)
m<-cbind(sapply(1:nrow(m),function(x) m[x,v1[x]]),v1)
m<-cbind(m,sapply(1:nrow(m), function(x) ifelse(m[x,2]==1,rgompertz(1,shape = 0.15,rate = rel.risk_trans3*exp(-5.5)),NA)))
m<-cbind(m,apply(m[,c(1,3)],1,sum,na.rm=T))
colnames(m)<-c("state1_duration","transition","state2_duration","total_time")
m<-as.data.frame(m)


#Build a function that computes relative frequencies of each state at some time t
rel_freq<-function(state,t){
  if(state==1){
    sum(m[,1]>t)/nrow(m)
  }else if(state==2){
    sum(m[,1]<t & m[,2]==1 & m[,4]>t)/nrow(m)
  }else if(state==3){
    sum(m[,1]<t & m[,2]==2)/nrow(m)
  }else if(state==4){
    sum(m[,2]==1 & m[,4]<t)/nrow(m)
  }
}

#Vectorise the cumulative hazards of the patient
time_vector<-seq(0,80,0.01)
cumhaz1<-Hgompertz(time_vector, shape=0.1, rate = rel.risk_trans1*exp(-4.5))
cumhaz2<-Hgompertz(time_vector, shape=0.1, rate = rel.risk_trans2*exp(-4.65))
cumhaz3<-Hgompertz(time_vector, shape=0.15, rate = rel.risk_trans3*exp(-5.5))
cumhaz1<-data.frame(time=time_vector,Haz=cumhaz1,trans=1)
cumhaz2<-data.frame(time=time_vector,Haz=cumhaz2,trans=2)
cumhaz3<-data.frame(time=time_vector,Haz=cumhaz3,trans=3)

#build an msfit object
tmat<-mstate::transMat(x=list(c(2,3),c(4),c(),c()),names=c("health","illness","death","death_illness"))
msfit_object<-list(Haz=rbind(cumhaz1,cumhaz2,cumhaz3),trans=tmat)
class(msfit_object)<-c("msfit","coxrfx")

#Calculate state occupation probabilities
probtrans_object<-probtrans_ebmstate("health",msfit_object,'clockreset')
rel_freq_1<-sapply(time_vector,rel_freq,state=1)
rel_freq_12<-rel_freq_1+sapply(time_vector,rel_freq,state=2)
rel_freq_123<-rel_freq_12+sapply(time_vector,rel_freq,state=3)
rel_freq_1234<-rel_freq_123+sapply(time_vector,rel_freq,state=4)
save(probtrans_object,rel_freq_1,rel_freq_12,rel_freq_123,rel_freq_1234,time_vector,file = "../data/testing_probtrans_ebmstate.Rdata")

#compare estimates obtained by simulation with the probabilities computed by probtrans_ebmstate
plot(probtrans_object,legend = c("","","",""),lwd = 2)
lines(time_vector,rel_freq_1,lwd=2,col="red",lty=3)
lines(time_vector,rel_freq_12,lwd=2,col="red",lty=3)
lines(time_vector,rel_freq_123,lwd=2,col="red",lty=3)
lines(time_vector,rel_freq_1234,lwd=2,col="red",lty=3)
text(10,0.3,"health")
text(37,0.4,"illness")
text(78,0.07,"death")
text(67,0.6,"death_after_illness")
legend("bottomleft", legend = c("using probtrans_ebmstate","by simulation"),cex = 0.7,lty = c(1,3),col = c(1,"red"))

4 Testing \(\texttt{probtrans_fft}\)

The following plot shows that probtrans_fft can accurately compute state occupation probabilities under clock-reset Cox models, when it is given the vectors of cumulative hazards for each transition. The dashed red lines were computed using a data set of 100,000 simulated disease histories for the same patient (or, equivalently, 100,000 patients with the same vector of covariates). For any time t, these lines give the relative frequencies of each state. They are superimposed on solid black lines which represent the state occupation probabilities as computed by probtrans_fft, when the true (Gompertz) cumulative hazards are given to it as input.

accuracy of the estimator of state occupation probabilities in the function probtrans_fft.

Figure 4.1: accuracy of the estimator of state occupation probabilities in the function probtrans_fft.


Code used to generate the previous plot.

# Load packages and set working directory
set.seed(981735)
library(ebmstate)
library(flexsurv)

#generate a vector of covariates for the patient
nCovs<-50
covariate_vector<-rbinom(nCovs,size = 1,prob = 0.1)

#compute the relative risk (better said: relative hazard) of the patient for each transition
param<-runif(n=150,min = -0.5,max = 0.5)
rel.risk_trans1<-exp(sum(covariate_vector*param[(1+nCovs*0):(nCovs*1)]))
rel.risk_trans2<-exp(sum(covariate_vector*param[(1+nCovs*1):(nCovs*2)]))
rel.risk_trans3<-exp(sum(covariate_vector*param[(1+nCovs*2):(nCovs*3)]))

#generate 100,000 uncensored observations for the same patient
n<-100000
m<-matrix(c(rgompertz(n, shape=0.1, rate = rel.risk_trans1*exp(-4.5)),rgompertz(n, shape=0.1, rate = rel.risk_trans2*exp(-4.65))),ncol = 2)
v1<-apply(m,1,which.min)
m<-cbind(sapply(1:nrow(m),function(x) m[x,v1[x]]),v1)
m<-cbind(m,sapply(1:nrow(m), function(x) ifelse(m[x,2]==1,rgompertz(1,shape = 0.15,rate = rel.risk_trans3*exp(-5.5)),NA)))
m<-cbind(m,apply(m[,c(1,3)],1,sum,na.rm=T))
colnames(m)<-c("state1_duration","transition","state2_duration","total_time")
m<-as.data.frame(m)


#Build a function that computes relative frequencies of each state at some time t
rel_freq<-function(state,t){
  if(state==1){
    sum(m[,1]>t)/nrow(m)
  }else if(state==2){
    sum(m[,1]<t & m[,2]==1 & m[,4]>t)/nrow(m)
  }else if(state==3){
    sum(m[,1]<t & m[,2]==2)/nrow(m)
  }else if(state==4){
    sum(m[,2]==1 & m[,4]<t)/nrow(m)
  }
}

#Vectorise the cumulative hazards of the patient
time_vector<-seq(0,80,0.01)
cumhaz1<-Hgompertz(time_vector, shape=0.1, rate = rel.risk_trans1*exp(-4.5))
cumhaz2<-Hgompertz(time_vector, shape=0.1, rate = rel.risk_trans2*exp(-4.65))
cumhaz3<-Hgompertz(time_vector, shape=0.15, rate = rel.risk_trans3*exp(-5.5))
cumhaz1<-data.frame(time=time_vector,Haz=cumhaz1,trans=1)
cumhaz2<-data.frame(time=time_vector,Haz=cumhaz2,trans=2)
cumhaz3<-data.frame(time=time_vector,Haz=cumhaz3,trans=3)

#build an msfit object
tmat<-mstate::transMat(x=list(c(2,3),c(4),c(),c()),names=c("health","illness","death","death_illness"))
msfit_object<-list(Haz=rbind(cumhaz1,cumhaz2,cumhaz3),trans=tmat)
class(msfit_object)<-c("msfit","coxrfx")

#Calculate state occupation probabilities
probtrans_object<-probtrans_fft("health",msfit_object,max_time=80,nr_steps = 40000)
rel_freq_1<-sapply(time_vector,rel_freq,state=1)
rel_freq_12<-rel_freq_1+sapply(time_vector,rel_freq,state=2)
rel_freq_123<-rel_freq_12+sapply(time_vector,rel_freq,state=3)
rel_freq_1234<-rel_freq_123+sapply(time_vector,rel_freq,state=4)
save(probtrans_object,rel_freq_1,rel_freq_12,rel_freq_123,rel_freq_1234,time_vector,file = "../data/testing_probtrans_fft.Rdata")

#compare estimates obtained by simulation with the probabilities computed by probtrans_ebmstate
plot(probtrans_object,legend = c("","","",""),lwd = 2)
lines(time_vector,rel_freq_1,lwd=2,col="red",lty=3)
lines(time_vector,rel_freq_12,lwd=2,col="red",lty=3)
lines(time_vector,rel_freq_123,lwd=2,col="red",lty=3)
lines(time_vector,rel_freq_1234,lwd=2,col="red",lty=3)
text(10,0.3,"health")
text(27,0.4,"illness")
text(78,0.07,"death")
text(67,0.6,"death_after_illness")
legend("bottomleft", legend = c("using probtrans_fft","by simulation"),cex = 0.7,lty = c(1,3),col = c(1,"red"))

5 \(\texttt{mssample}\) and \(\texttt{probtrans_fft}\): running time and estimation accuracy

The following script generates plots comparing the running time and the estimation accuracy of \(\texttt{mstate::mssample}\) and \(\texttt{ebmstate::probtrans_fft}\).

#Generate some data from which cumulative hazards are to be estimated

set.seed(89910225)
library(parallel)
library(flexsurv)
library(ebmstate)
library(bindata)

n<-1000 # number of patients
covariate_names<-paste0("Cov",1:10) #number of covariates (for each transition)
nGroups<-1 #overall number of priors
nTrans<-3 # number of transitions
nParam<-nTrans*length(covariate_names) #total number of parameters (regression coefficients)
nr_simulated_data_sets<-100
param_fun<-function(nParam){
  out<-sqrt(10/length(covariate_names))*rnorm(n = nParam,mean = 0,sd = 0.65)
  out
}
TimePoints<-seq(10,70,length.out=7) #time points at which state occupation probability estimates are to be retrieved
nTimePoints<-length(TimePoints)

tmat<-mstate::transMat(x=list(c(2),c(3),c(4),c()),names=c("state1","state2","state3","state4"))

shape1<-0.1
rate1<-exp(-4.5)
shape2<-0.1
rate2<-exp(-2.7)
shape3<-0.15
rate3<-exp(-3.5)


param<-param_fun(nParam)
marg_probs<-runif(n = length(covariate_names),min = 0.05,max = 0.3)

covariate_matrix<-rmvbin(n,margprob = marg_probs)

colnames(covariate_matrix)<-covariate_names


#relative risks (relative hazards)
rel.risk_trans1<-exp(covariate_matrix%*%param[(1+length(covariate_names)*0):(length(covariate_names)*1)])
rel.risk_trans2<-exp(covariate_matrix%*%param[(1+length(covariate_names)*1):(length(covariate_names)*2)])
rel.risk_trans3<-exp(covariate_matrix%*%param[(1+length(covariate_names)*2):(length(covariate_names)*3)])


#Generate a transition history for each patient. Clock-reset model. Baseline hazard is Gompertz for all transitions. 

m<-matrix(c(flexsurv::rgompertz(n, shape=shape1, rate = rel.risk_trans1*rate1),flexsurv::rgompertz(n, shape=shape2, rate = rel.risk_trans2*rate2),flexsurv::rgompertz(n, shape=shape3, rate = rel.risk_trans3*rate3)),ncol = 3)
m<-cbind(m,apply(m,1,sum,na.rm=T))
m<-cbind(m,rexp(n,0.008))
m<-cbind(m,(m[,5]<m[,4]))
colnames(m)<-c("state1_duration","state2_duration","state3_duration","total_time", "cens_time","cens=1")
m<-as.data.frame(m)
print(sum(m$`cens=1`)/nrow(m))

#convert the data to long format
mstate.data<-data.frame()

for(i in 1:nrow(m)){
  id<-i
  from<-1
  to<-2
  trans<-1
  Tstart<-0
  Tstop<-min(m$state1_duration[i],m$cens_time[i])
  time<-Tstop-Tstart
  status<-as.numeric(m$cens_time[i]>m$state1_duration[i])
  mstate.data<-rbind(mstate.data,data.frame(id=id,from=from,to=to,trans=trans,Tstart=Tstart,
                                            Tstop=Tstop,time=time,status=status)) 
  if(status==1){
    id<-i
    from<-2
    to<-3
    trans<-2
    Tstart<-Tstop
    Tstop<-min(Tstart[1]+m$state2_duration[i],m$cens_time[i])
    time<-Tstop-Tstart
    status<-as.numeric(m$cens_time[i]>m$state1_duration[i]+m$state2_duration[i])
    mstate.data<-rbind(mstate.data,data.frame(id=id,from=from,to=to,trans=trans,Tstart=Tstart,
                                              Tstop=Tstop,time=time,status=status))
    if(status==1){
      id<-i
      from<-3
      to<-4
      trans<-3
      Tstart<-Tstop
      Tstop<-min(m$total_time[i],m$cens_time[i])
      time<-Tstop-Tstart
      status<-as.numeric(m$total_time[i]<m$cens_time[i])
      mstate.data<-rbind(mstate.data,data.frame(id=id,from=from,to=to,trans=trans,Tstart=Tstart,
                                                Tstop=Tstop,time=time,status=status))
    }
  }
  
}

#add covariates
mstate.data<-cbind(mstate.data,covariate_matrix[mstate.data$id,])

#attributes and class
class(mstate.data)<-c("data.frame","msdata")
attr(mstate.data,"trans")<-tmat

#expand covariates
mstate.data<-mstate::expand.covs(mstate.data,covs =names(mstate.data)[-(1:8)])

#Fit non-parametric model and estimate cumulative hazards
surv0<-survival::Surv(mstate.data$time,mstate.data$status)
coxph_object0<-coxph(as.formula("surv0~strata(trans)"),data = mstate.data)
msfit_object0<-msfit(coxph_object0,trans = tmat)

#State occupation probabilities by probtrans_fft

system.time(probtrans_object0<-probtrans_fft("state1",msfit_object0,time = seq(0,150,length.out=100000)))

tmat<-mstate::transMat(x=list(c(2),c(3),c(4),c())
)

#State occupation probabilities by mssample

system.time(probtrans_object_100<-mssample(msfit_object0$Haz
         ,tmat
         ,history = list(state=1,time=0,tstate=NULL)
         ,clock="reset"
         ,tvec=seq(0,100,length.out=5000)
         ,output = "state"
         ,M=100)
)

system.time(probtrans_object_1000<-mssample(msfit_object0$Haz
                                           ,tmat
                                           ,history = list(state=1,time=0,tstate=NULL)
                                           ,clock="reset"
                                           ,tvec=seq(0,100,length.out=5000)
                                           ,output = "state"
                                           ,M=1000)
)

system.time(probtrans_object_10000<-mssample(msfit_object0$Haz
                                           ,tmat
                                           ,history = list(state=1,time=0,tstate=NULL)
                                           ,clock="reset"
                                           ,tvec=seq(0,100,length.out=5000)
                                           ,output = "state"
                                           ,M=10000)
)

system.time(probtrans_object_100000<-mssample(msfit_object0$Haz
                                           ,tmat
                                           ,history = list(state=1,time=0,tstate=NULL)
                                           ,clock="reset"
                                           ,tvec=seq(0,100,length.out=5000)
                                           ,output = "state"
                                           ,M=100000)
)


#100 000: 1026 secs
#10 000: 93 secs
# 1000: 9.3 secs
# 100: 0.92 secs
# fft: 0.94 secs

#Plot code

probtrans_objects<-list(probtrans_object_100
                        ,probtrans_object_1000
                        ,probtrans_object_10000
                        ,probtrans_object0[[1]])
pdf(file ="./mssample_and_probtrans_fft.pdf"
    ,width = 6
    ,height = 4)
par(mfcol=c(4,4),mar=c(0,0,0.2,0),mgp=c(3,0.5,0),oma=c(3,3,1,1),tck=-0.05)
for(j in 1:4){
  for(i in 2:5){
    ifelse(j==1,yaxt_value<-"s",yaxt_value<-"n")
    plot(probtrans_object_100000$time,probtrans_object_100000[[i]]
         ,type="l"
         ,ylab="",xlab="",xaxt="n"
         ,yaxt=yaxt_value,cex.axis=0.7,las=1
    )
    if(i==5){
      axis(1,at=seq(0,90,15),cex.axis=0.75,mgp=c(3,0.2,0))
      mtext("time",side = 1,line = 1.2,cex = 0.6)
    }
    if(j==1){
      mtext("probability",side = 2,line = 2,cex = 0.6)
    }

    points(probtrans_objects[[j]]$time,probtrans_objects[[j]][[i]],type="l"
           ,col="red")
  }
  
}
dev.off()

6 Simulation study

Supplementary figures

The following figures are supplementary figures to the simulation study reported in section 4 of the main text. They show proportions of valid, infinite and missing (‘NA’) estimates produced by the empirical Bayes Cox estimators implemented in \(\texttt{ebmstate}\) and the fully non-parametric (null) estimators.



estimation of regression coefficients, relative hazards, and state occupation probabilities with empirical Bayes Cox estimators: proportions of valid, infinite and missing ('NA') values for data sets with 100 patients.

Figure 6.1: estimation of regression coefficients, relative hazards, and state occupation probabilities with empirical Bayes Cox estimators: proportions of valid, infinite and missing (‘NA’) values for data sets with 100 patients.



estimation of regression coefficients, relative hazards, and state occupation probabilities with empirical Bayes Cox estimators: proportions of valid, infinite and missing ('NA') values for data sets with 1000 patients.

Figure 6.2: estimation of regression coefficients, relative hazards, and state occupation probabilities with empirical Bayes Cox estimators: proportions of valid, infinite and missing (‘NA’) values for data sets with 1000 patients.





estimation of state occupation probabilities with non-parametric estimators: proportions of valid, infinite and missing ('NA') values.

Figure 6.3: estimation of state occupation probabilities with non-parametric estimators: proportions of valid, infinite and missing (‘NA’) values.

Sample script

The sample script below performs the following tasks: a) generates data sets of 1000 patients under a Cox model with a ‘linear’ transition structure and 500 covariates/coefficients per transition (non-sparse but scaled regression coefficients); b) fits the fixed effects Cox model, an empirical Bayes Cox model and a fully non-parametric model; c) for each data set, estimates state occupation probabilities under these three models; d) for each data set, estimates true occupation probabilities by simulating from the true model.


set.seed(0840350)
library(parallel)
library(flexsurv)
library(ebmstate)
library(bindata)

n<-1000 # number of patients
covariate_names<-paste0("Cov",1:500) #number of covariates (for each transition)
nGroups<-1 #overall number of priors
nTrans<-3 # number of transitions
nParam<-nTrans*length(covariate_names) #total number of parameters (regression coefficients)
nr_simulated_data_sets<-100
param_fun<-function(nParam){
  out<-sqrt(10/length(covariate_names))*rnorm(n = nParam,mean = 0,sd = 0.65)
  out
}
TimePoints<-seq(10,70,10) #time points at which state occupation probability estimates are to be retrieved
nTimePoints<-length(TimePoints)

tmat<-mstate::transMat(x=list(c(2),c(3),c(4),c()),names=c("state1","state2","state3","state4"))

shape1<-0.1
rate1<-exp(-4.5)
shape2<-0.1
rate2<-exp(-2.7)
shape3<-0.15
rate3<-exp(-3.5)

file1<-"../data/dense_sim_1000obs_1group_for_all_trans_500vars_LinearStructure.Rdata"


#A function that computes relative frequencies of each state at some time t
rel_freq<-function(state,t,m){
  if(state=="state1"){
    sum(t<=m[,1])/nrow(m)
  }else if(state=="state2"){
    sum(t>m[,1]&t<=m[,1]+m[,2])/nrow(m)
  }else if(state=="state3"){
    sum(t>m[,1]+m[,2]&t<=m[,1]+m[,2]+m[,3])/nrow(m)
  }else if(state=="state4"){
    sum(t>m[,1]+m[,2]+m[,3])/nrow(m)
  }
}


simfun<-function(j){
  param<-param_fun(nParam)
  marg_probs<-runif(n = length(covariate_names),min = 0.05,max = 0.3)
  
  true_rh_fun<-function(trans){
    exp(covariate_vector%*%param[(1+length(covariate_names)*(trans-1)):(length(covariate_names)*trans)])
  }
  
  rh_fun_coxrfx<-function(trans){
    exp(covariate_vector%*%coxrfx_object$coefficients[seq(trans,length(coxrfx_object$coefficients),nTrans)])
  }
  
  rh_fun_coxph<-function(trans){
    exp(covariate_vector%*%coxph_object$coefficients[seq(trans,length(coxph_object$coefficients),nTrans)])
  }
  
  covariate_matrix<-rmvbin(n,margprob = marg_probs)
  
  colnames(covariate_matrix)<-covariate_names
  
  
  #relative risks (relative hazards)
  rel.risk_trans1<-exp(covariate_matrix%*%param[(1+length(covariate_names)*0):(length(covariate_names)*1)])
  rel.risk_trans2<-exp(covariate_matrix%*%param[(1+length(covariate_names)*1):(length(covariate_names)*2)])
  rel.risk_trans3<-exp(covariate_matrix%*%param[(1+length(covariate_names)*2):(length(covariate_names)*3)])
  
  
  #Generate a transition history for each patient. Clock-reset model. Baseline hazard is Gompertz for all transitions. 
  
  m<-matrix(c(flexsurv::rgompertz(n, shape=shape1, rate = rel.risk_trans1*rate1),flexsurv::rgompertz(n, shape=shape2, rate = rel.risk_trans2*rate2),flexsurv::rgompertz(n, shape=shape3, rate = rel.risk_trans3*rate3)),ncol = 3)
  m<-cbind(m,apply(m,1,sum,na.rm=T))
  m<-cbind(m,rexp(n,0.008))
  m<-cbind(m,(m[,5]<m[,4]))
  colnames(m)<-c("state1_duration","state2_duration","state3_duration","total_time", "cens_time","cens=1")
  m<-as.data.frame(m)
  print(sum(m$`cens=1`)/nrow(m))
  
  #convert the data to long format
  mstate.data<-data.frame()
  
  for(i in 1:nrow(m)){
    id<-i
    from<-1
    to<-2
    trans<-1
    Tstart<-0
    Tstop<-min(m$state1_duration[i],m$cens_time[i])
    time<-Tstop-Tstart
    status<-as.numeric(m$cens_time[i]>m$state1_duration[i])
    mstate.data<-rbind(mstate.data,data.frame(id=id,from=from,to=to,                                             trans=trans,Tstart=Tstart,Tstop=Tstop,time=time,status=status)) 
    if(status==1){
      id<-i
      from<-2
      to<-3
      trans<-2
      Tstart<-Tstop
      Tstop<-min(Tstart[1]+m$state2_duration[i],m$cens_time[i])
      time<-Tstop-Tstart
      status<-as.numeric(m$cens_time[i]>m$state1_duration[i]+m$state2_duration[i])
      mstate.data<-rbind(mstate.data,data.frame(id=id,from=from,to=to,                                             trans=trans,Tstart=Tstart,Tstop=Tstop,time=time,status=status))
      if(status==1){
        id<-i
        from<-3
        to<-4
        trans<-3
        Tstart<-Tstop
        Tstop<-min(m$total_time[i],m$cens_time[i])
        time<-Tstop-Tstart
        status<-as.numeric(m$total_time[i]<m$cens_time[i])
        mstate.data<-rbind(mstate.data,data.frame(id=id,from=from,to=to,trans=trans,                                     Tstart=Tstart,Tstop=Tstop,time=time,status=status))
      }
    }
    
  }
  
  #add covariates
  mstate.data<-cbind(mstate.data,covariate_matrix[mstate.data$id,])
  
  #attributes and class
  class(mstate.data)<-c("data.frame","msdata")
  attr(mstate.data,"trans")<-tmat
  
  
  #expand covariates
  mstate.data<-mstate::expand.covs(mstate.data,covs =names(mstate.data)[-(1:8)])
  
  #Fit non-parametric model
  surv0<-survival::Surv(mstate.data$Tstart,mstate.data$Tstop,mstate.data$status)
  coxph_object0<-coxph(as.formula("surv0~strata(trans)"),data = mstate.data)
  msfit_object0<-msfit_generic(coxph_object0,trans = tmat)
  probtrans_object0<-probtrans_mstate(msfit_object0,predt = 0)
  
  #retrieve state occupation prob estimates at times 10,20,...,50
  time_indices<-sapply(TimePoints,function(x) which.min(abs(probtrans_object0[[1]]$time-x)))
  probtrans_object0_reduced<-probtrans_object0[[1]][time_indices,1:(ncol(tmat)+1)]
  state_occup_estimates0<-as.matrix(probtrans_object0_reduced)
  
  
  #Fit homogeneous clock-reset empirical Bayes model. 
  
  #argument 'Z' of coxrfx
  Z<-mstate.data[,-(1:(8+length(covariate_names)))]
  Z$strata<-Z$trans<-mstate.data$trans
  
  #argument 'surv' of coxrfx
  surv<-survival::Surv(mstate.data$time,mstate.data$status)

  #argument 'groups' of coxrfx
  groups<-rep("unique_group",length(param))
  
  #fit random effects model
  coxrfx_object<-CoxRFX(Z,surv,groups,max.iter = 600,tol = 0.0001,sigma.hat = "df")
  cat("coxrfx_concordance:",concordance(coxrfx_object)$concordance)
  
  #fit fixed effects model
  
  model_formula<-as.formula(paste0("surv~",paste(head(names(Z),length(names(Z))-2),collapse = "+"),"+strata(strata)"))
  try(coxph_object<-ebmstate:::coxph(formula = model_formula,data=Z,control = coxph.control(iter.max = 100)),silent=TRUE)
  try(cat("coxph_concordance:",concordance(coxph_object)$concordance),silent=TRUE)
  
  #simulate single patient covariate data for computing relative and cumulative hazards 
  covariate_vector<-rmvbin(1,marg_probs)
  
  #single patient true relative hazards
  true_rel_risk_sampled_patient<-sapply(1:nTrans,true_rh_fun)
  
  #estimated relative hazard for patient (coxrfx)
  rel_risk_estimates_coxrfx<-sapply(1:nTrans,rh_fun_coxrfx)
  
  #estimated relative risks for patient (coxph)
  try(rel_risk_estimates_coxph<-sapply(1:nTrans,rh_fun_coxph),silent=TRUE)
  
  #put patient covariate vector in long format
  newdata<-data.frame(matrix(rep(covariate_vector,length(na.exclude(as.vector(tmat)))),nrow=length(na.exclude(as.vector(tmat))),byrow = T))
  names(newdata)<-covariate_names
  newdata$strata<-newdata$trans<-1:nTrans
  attr(newdata,"trans")<-tmat
  class(newdata)<-c("data.frame","msdata")
  newdata<-mstate::expand.covs(newdata,covs=names(newdata)[!names(newdata)%in%c("trans","strata")],append=TRUE)
  newdata<-newdata[-(1:length(covariate_names))]
  
  
  #compute cumulative hazards and state occupation probs (coxrfx)
  msfit_object_coxrfx<-msfit_generic(coxrfx_object,newdata,trans = tmat)
  probtrans_object_coxrfx<-probtrans_fft("state1",msfit_object_coxrfx,time = seq(0,80,length.out=50000))
  
  #retrieve state occupation prob estimates at times 10,20,...,50
  time_indices<-sapply(TimePoints,function(x) which.min(abs(probtrans_object_coxrfx[[1]]$time-x)))
  probtrans_object_coxrfx_reduced<-probtrans_object_coxrfx[[1]][time_indices,]
  state_occup_estimates_coxrfx<-as.matrix(probtrans_object_coxrfx_reduced)
  
  #compute cumulative hazards and state occupation probs (coxph)
  try(msfit_object_coxph<-msfit_generic(coxph_object,newdata,trans = tmat),silent = TRUE)
  try(probtrans_object_coxph<-probtrans_fft("state1",msfit_object_coxph,time = seq(0,80,length.out=50000)),silent = TRUE)
  
  #retrieve state occupation prob estimates at times 10,20,...,50
  try(time_indices<-sapply(TimePoints,function(x) which.min(abs(probtrans_object_coxph[[1]]$time-x))),silent = TRUE)
  try(probtrans_object_coxph_reduced<-probtrans_object_coxph[[1]][time_indices,],silent = TRUE)
  try(state_occup_estimates_coxph<-as.matrix(probtrans_object_coxph_reduced),silent = TRUE)
  
  
  #Get very precise estimates of state occupation probabilities for particular patient
  rel.risk_trans1<-exp(covariate_vector%*%param[(1+length(covariate_names)*0):(length(covariate_names)*1)])
  rel.risk_trans2<-exp(covariate_vector%*%param[(1+length(covariate_names)*1):(length(covariate_names)*2)])
  rel.risk_trans3<-exp(covariate_vector%*%param[(1+length(covariate_names)*2):(length(covariate_names)*3)])
  
  #generate 100,000 uncensored observations for the same patient
  n2<-100000
  
  m<-matrix(c(flexsurv::rgompertz(n2, shape=shape1, rate = rel.risk_trans1*rate1),flexsurv::rgompertz(n2, shape=shape2, rate = rel.risk_trans2*rate2),flexsurv::rgompertz(n2, shape=shape3, rate = rel.risk_trans3*rate3)),ncol = 3)
  m<-cbind(m,rowSums(m,na.rm=TRUE))
  colnames(m)<-c("state1_duration","state2_duration","state3_duration","total_time")
  m<-as.data.frame(m)
  
  time_vector<-TimePoints
  rel_freq_1<-sapply(time_vector,rel_freq,state="state1",m=m)
  rel_freq_2<-sapply(time_vector,rel_freq,state="state2",m=m)
  rel_freq_3<-sapply(time_vector,rel_freq,state="state3",m=m)
  rel_freq_4<-sapply(time_vector,rel_freq,state="state4",m=m)
  
  true_probs_matrix<-cbind(time_vector,rel_freq_1,rel_freq_2,rel_freq_3,rel_freq_4)
  if(!'coxph_object'%in%ls()) coxph_object<-list(coefficients=NA)
  if(!'rel_risk_estimates_coxph'%in%ls()) rel_risk_estimates_coxph<-NA 
  if(!'state_occup_estimates_coxph'%in%ls()) state_occup_estimates_coxph<-NA 
  
  
  list(n=n,nGroups=nGroups,nTrans=nTrans,nParam=nParam,param=param,
       TimePoints=TimePoints,tmat=tmat,
       coxrfx_coefficients=coxrfx_object$coefficients,
       coxrfx_mu=coxrfx_object$mu,
       coxrfx_sigma2=coxrfx_object$sigma2,
       coxph_coefficients=coxph_object$coefficients,
       covariate_vector=covariate_vector,
       true_rel_risk_sampled_patient=true_rel_risk_sampled_patient,
       rel_risk_estimates_coxph=rel_risk_estimates_coxph,
       rel_risk_estimates_coxrfx=rel_risk_estimates_coxrfx,
       state_occup_estimates_coxph=state_occup_estimates_coxph,
       state_occup_estimates_coxrfx=state_occup_estimates_coxrfx,
       state_occup_estimates0=state_occup_estimates0,
       true_probs_matrix=true_probs_matrix,
       marg_probs=marg_probs)
  
  
}

simfun_object<-mclapply(1:300,simfun,mc.cores = 50)
save(simfun_object,file=file1)


Example script to generate boxplots of average absolute errors and plots of proportions of failed estimates. It takes as input a set of objects with model estimates, such as the one generated by the previous block of code.

######### IMPORTING SIMULATED DATA
#Choose one of the following batches of data to import

### Batch 1: each simulated data set has 1000 patients
rdata_names<-vector("list",3)
rdata_names[[1]]<-c(
  "../data/Linear_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_10vars_LinearStructure.Rdata"
  ,  "../data/Linear_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_100vars_LinearStructure.Rdata"
  ,"../data/Linear_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_200vars_LinearStructure.Rdata"
  ,"../data/Linear_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_300vars_LinearStructure.Rdata"
  ,"../data/Linear_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_400vars_LinearStructure.Rdata"
  ,"../data/Linear_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_500vars_LinearStructure.Rdata"
)
rdata_names[[2]]<-c(
  "../data/Comp_risks_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_10vars_CompetingRisksTransStructure.Rdata"
  ,"../data/Comp_risks_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_100vars_CompetingRisksTransStructure.Rdata"
  ,"../data/Comp_risks_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_200vars_CompetingRisksTransStructure.Rdata"
  ,"../data/Comp_risks_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_300vars_CompetingRisksTransStructure.Rdata"
  ,"../data/Comp_risks_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_400vars_CompetingRisksTransStructure.Rdata"
  ,"../data/Comp_risks_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_500vars_CompetingRisksTransStructure.Rdata"
)
rdata_names[[3]]<-c(
  "../data/M_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_10vars_MtransStructure.Rdata"
  ,"../data/M_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_100vars_MtransStructure.Rdata"
  ,"../data/M_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_200vars_MtransStructure.Rdata"
  ,"../data/M_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_300vars_MtransStructure.Rdata"
  ,"../data/M_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_400vars_MtransStructure.Rdata"
  ,"../data/M_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_500vars_MtransStructure.Rdata"
)

### Batch 2: each simulated data set has 100 patients
rdata_names<-vector("list",3)
rdata_names[[1]]<-c(
  "../data/Linear_structure_100patients/dense_sim_100obs_1group_for_all_trans_10vars_LinearStructure.Rdata"
  ,"../data/Linear_structure_100patients/dense_sim_100obs_1group_for_all_trans_40vars_LinearStructure.Rdata"
  ,"../data/Linear_structure_100patients/dense_sim_100obs_1group_for_all_trans_70vars_LinearStructure.Rdata"
  ,  "../data/Linear_structure_100patients/dense_sim_100obs_1group_for_all_trans_100vars_LinearStructure.Rdata"
)
rdata_names[[2]]<-c(
  "../data/Comp_risks_structure_100patients/dense_sim_100obs_1group_for_all_trans_10vars_CompetingRisksTransStructure.Rdata"
  ,"../data/Comp_risks_structure_100patients/dense_sim_100obs_1group_for_all_trans_40vars_CompetingRisksTransStructure.Rdata"
  ,"../data/Comp_risks_structure_100patients/dense_sim_100obs_1group_for_all_trans_70vars_CompetingRisksTransStructure.Rdata"
  ,"../data/Comp_risks_structure_100patients/dense_sim_100obs_1group_for_all_trans_100vars_CompetingRisksTransStructure.Rdata"
)
rdata_names[[3]]<-c(
  "../data/M_structure_100patients/dense_sim_100obs_1group_for_all_trans_10vars_MtransStructure.Rdata"
  ,"../data/M_structure_100patients/dense_sim_100obs_1group_for_all_trans_40vars_MtransStructure.Rdata"
  ,"../data/M_structure_100patients/dense_sim_100obs_1group_for_all_trans_70vars_MtransStructure.Rdata"
  ,"../data/M_structure_100patients/dense_sim_100obs_1group_for_all_trans_100vars_MtransStructure.Rdata"
)

## load the batch of simulated data chosen

x_values<-vector("list",3)
simfun_object_lists<-vector("list",length(rdata_names))

for(i in 1:length(rdata_names)){
  x_values[[i]]<-rep(NA,length(rdata_names[[i]]))
  for(j in 1:length(x_values[[i]])){
    load(rdata_names[[i]][j])
    simfun_object_lists[[i]][[j]]<-simfun_object
    x_values[[i]][j]<-length(simfun_object[[1]]$covariate_vector)
  }
}
rm(simfun_object)

unwrap_fun<-function(simfun_object,name){
  if(name%in%c("n","nGroups","nTrans","nParam","param","TimePoints"
               ,"tmat","coxrfx_mu","coxrfx_sigma2","covariate_vector"
               ,"true_rel_risk_sampled_patient","true_probs_matrix"
               ,"marg_probs")){
    sapply(simfun_object,function(x){
      return(x[[name]])
    } 
    )
  }else if(name%in%c("coxph_coefficients","coxrfx_coefficients")){
    sapply(simfun_object,function(x){
      if(length(x[[name]])==x$nParam){
        return(x[[name]])
      }else{
        return(rep(NA,x$nParam))
      }
    } 
    )   
  }else if(grepl("rel_risk_estimates",name)){
    sapply(simfun_object,function(x){
      if(length(x[[name]])==x$nTrans){
        return(x[[name]])
      }else{
        return(rep(NA,x$nTrans))
      }
    } 
    )
  }else if(grepl("state_occup_estimates",name)){
    out<-sapply(simfun_object,function(x){
      if(length(as.vector(x[[name]]))==prod(dim(x$true_probs_matrix))){
        return(as.vector(x[[name]])[-(1:dim(x$true_probs_matrix)[1])])
      }else{
        return(rep(NA,prod(dim(x$true_probs_matrix))-dim(x$true_probs_matrix)[1]))
      }
    } 
    )
    invalid_estimates<-which(abs(colSums(out)-nrow(simfun_object[[1]]$true_probs_matrix))>1)
    out[,invalid_estimates]<-NA
    out
  }
  
}

plot_abs3<-function(simfun_object){
  list1<-sapply(names(simfun_object[[1]]),unwrap_fun,simfun_object=simfun_object,USE.NAMES = TRUE)
  names(list1)<-names(simfun_object[[1]])
  list1$nr_simulated_data_sets<-length(simfun_object)
  param_order_fun<-function(nTrans,nParam){
    as.vector(sapply(1:(nParam/nTrans),function(x) seq(x,nParam,nParam/nTrans)))
  }
  param_order<-param_order_fun(list1$nTrans[1],list1$nParam[1])
  list1$param<-list1$param[param_order,]

  abs_errors_coef0<-abs(list1$param)
  abs_errors_coef_coxph<-abs(list1$coxph_coefficients-list1$param)
  abs_errors_coef_coxrfx<-abs(list1$coxrfx_coefficients-list1$param)
  
  abs_errors_rel_risk_coxph<-abs(list1$rel_risk_estimates_coxph-list1$true_rel_risk_sampled_patient)
  abs_errors_rel_risk_coxrfx<-abs(list1$rel_risk_estimates_coxrfx-list1$true_rel_risk_sampled_patient)
  abs_errors_rel_risk0<-abs(1-list1$true_rel_risk_sampled_patient)
  
  abs_errors_pred_coxph<-abs(list1$state_occup_estimates_coxph-list1$true_probs_matrix[-(1:dim(simfun_object[[1]]$true_probs_matrix)[1]),])
  abs_errors_pred_coxrfx<-abs(list1$state_occup_estimates_coxrfx-list1$true_probs_matrix[-(1:dim(simfun_object[[1]]$true_probs_matrix)[1]),])
  abs_errors_pred0<-abs(list1$state_occup_estimates0-list1$true_probs_matrix[-(1:dim(simfun_object[[1]]$true_probs_matrix)[1]),])
  
  out<-list(coef_errors=list(coxph=abs_errors_coef_coxph
                             ,coxrfx=abs_errors_coef_coxrfx
                             ,null=abs_errors_coef0
                             )
            ,rel_risk_errors=list(coxph=abs_errors_rel_risk_coxph
                           ,coxrfx=abs_errors_rel_risk_coxrfx
                           ,null=abs_errors_rel_risk0)
            ,pred_errors=list(coxph=abs_errors_pred_coxph
                       ,coxrfx=abs_errors_pred_coxrfx
                       ,null=abs_errors_pred0
            )
            ,true_values=list(coef=list1$param
                              ,rel_risk=list1$true_rel_risk_sampled_patient
                              ,pred=list1$true_probs_matrix
                              )
            ,pred_estimates=list(coxph=list1$state_occup_estimates_coxph
                            ,coxrfx=list1$state_occup_estimates_coxrfx
                            ,null=list1$state_occup_estimates0
                            )
            ,coef_estimates=list(coxph=list1$coxph_coefficients
                                 ,coxrfx=list1$coxrfx_coefficients
                                 )
            ,rel_risk_estimates=list(coxph=list1$rel_risk_estimates_coxph
                                     ,coxrfx=list1$rel_risk_estimates_coxrfx
                                     )
  )
  out
}

plot_object_lists<-vector("list",length = length(rdata_names))
for(i in 1:length(simfun_object_lists)){
  plot_object_lists[[i]]<-lapply(simfun_object_lists[[i]],plot_abs3)
}


#### PLOTS

##boxplots of average absolute error for each simulated data set

boxplot_fun<-function(plot_obj_list,target){
  limits<-list(coef_errors=0.8,rel_risk_errors=10,pred_errors=0.5)
  for(i in 1:length(plot_obj_list)){
    obj1<-apply(plot_obj_list[[i]][[target]][["coxph"]],2,mean)
    obj2<-apply(plot_obj_list[[i]][[target]][["coxrfx"]],2,mean)
    obj3<-apply(plot_obj_list[[i]][[target]][["null"]],2,mean)
    obj1<-sapply(obj1,function(x) ifelse(x<limits[[target]]
                                         ,x,limits[[target]]))
    obj2<-sapply(obj2,function(x) ifelse(x<limits[[target]]
                                         ,x,limits[[target]]))
    obj3<-sapply(obj3,function(x) ifelse(x<limits[[target]]
                                         ,x,limits[[target]]))
    try(boxplot(obj1,add=TRUE, at = i-0.22,boxwex=0.35,pars=list(outcex=0.2,boxfill="white",lwd=0.7,medlwd=0.75,whisklty=1),yaxt="n",axes=FALSE))
    try(boxplot(obj2,add=TRUE,at = i,boxwex=0.35,pars=list(outcol="red",outcex=0.2,boxfill="white",lwd=0.7,medlwd=0.75,whisklty=1),yaxt="n",axes=FALSE,border="red"))
    try(boxplot(obj3,add=TRUE,at = i+0.22,boxwex=0.35,pars=list(outcex=0.2,boxfill="white",lwd=0.7,medlwd=0.75,whisklty=1),yaxt="n",axes=FALSE,border="blue"))
  }
  
}



pdf(file ="./plots/rplots.pdf"
    ,width = 6
    ,height = 4)
par(mfrow=c(3,3),mar=c(2,2,1,0.5))
ylims<-c(0.8,10,0.5)
names(ylims)<-c("coef_errors","rel_risk_errors","pred_errors")
y_at<-list(coef_errors=seq(0,0.8,0.2)
           ,rel_risk_errors=seq(0,10,2)
           ,pred_errors=seq(0,0.5,0.25)
           )
y_labels<-list(coef_errors=c(0,0.2,0.4,0.6,expression(NULL>=0.8))
           ,rel_risk_errors=c(0,2,4,6,8,expression(NULL>=10))
           ,pred_errors=c(0,0.25,expression(NULL>=0.5))
)
for (i in 1:length(plot_object_lists)){
  for(j in c("coef_errors","rel_risk_errors","pred_errors")){
    plot(NA,ylim=c(0,ylims[j])
            ,xlim=c(0.5,length(x_values[[i]])+0.5)
            ,xlab=""
            ,main=""
            ,yaxt="n"
            ,xaxt="n"
            ,lwd=0.5
            ,bty="n"
            
    )

    axis(1
         ,at=1:length(x_values[[i]])
         ,labels = x_values[[i]]
         ,tick=FALSE
         ,mgp=c(3,0.075,0)
         ,cex.axis=0.8
         ,lwd = 0.75
         )
    axis(2
         ,tick=TRUE
         ,tck=-0.05
         ,mgp=c(3,0.4,0)
         ,cex.axis=0.7
         ,lwd=0.75
         ,at=y_at[[j]]
         ,labels = y_labels[[j]]
    )
    mtext("covariates per transition"
          ,side=1
          ,line=1
          ,cex=0.5
          )
    mtext("average absolute error"
          ,side=2
          ,line=1.3
          ,cex=0.5
    )
    
    boxplot_fun(plot_object_lists[[i]],j)
    box(lwd=0.75)
    
  }
}
dev.off()



par(mfrow=c(1,1))
plot(NA,ylim=c(0,1),bty="n",ylab="",xlab="",xaxt="n",yaxt="n")
legend(x=0.8,y=0.5,legend = c("Cox","EBCox","null"),fill = c("white"),border = c("black","red","blue"))



## plots of proportions of failed estimates
na_function<-function(object,target,estimator){
  if(!target=="coef_estimates"){
    na_prp<-sum(is.na(as.vector(object[[target]][[estimator]])))/length(as.vector(object[[target]][[estimator]]))
    inf_prp<-sum(is.infinite(as.vector(object[[target]][[estimator]])))/length(as.vector(object[[target]][[estimator]]))
    c(na_prp,inf_prp,1-na_prp-inf_prp)
  }else{
    na_prp<-sum(is.na(unlist(object[[target]][[estimator]])))/length(unlist(object[[target]][[estimator]]))
    inf_prp<-sum(is.infinite(unlist(object[[target]][[estimator]])))/length(unlist(object[[target]][[estimator]]))
    out<-c(na_prp,inf_prp,1-na_prp-inf_prp)
    names(out)<-c("NA","Inf","valid")
    out
  }
}

# batch with 1000 patients per data set
pdf(file ="./plots/na_props.pdf"
    ,width = 6
    ,height = 4)
par(mfrow=c(3,3),mar=c(3,3.5,1,0),mgp=c(3,0.75,0))
for(i in 1:3){
  for(j in c("coef_estimates","rel_risk_estimates","pred_estimates")){
    barplot_matrix<-sapply(plot_object_lists[[i]],na_function,target=j,estimator="coxph")
    barplot(barplot_matrix,border=NA
            ,col = c(1,2,4)
            ,width=1,xlim = c(0,7),cex.axis =0.8,las=2,xaxt="n")
    axis(1,at=c(0.7,1.9,3.1,4.3,5.5,6.7)
         ,labels=c(10,100,200,300,400,500)
         ,tick = FALSE
         ,mgp=c(3,0.3,0)
         ,las=1
         ,cex.axis=0.8)
    mtext("covariates per trans",cex=0.5,line=1.3,side=1)
    mtext("proportion",cex=0.5,line=1.9,side=2,las=3)
  }
}
dev.off()

#batch with 100 patients per data set
pdf(file ="./plots/na_props_100.pdf"
    ,width = 6
    ,height = 4)
par(mfrow=c(3,3),mar=c(3,3.5,1,0),mgp=c(3,0.75,0))
for(i in 1:3){
  for(j in c("coef_estimates","rel_risk_estimates","pred_estimates")){
    barplot_matrix<-sapply(plot_object_lists[[i]],na_function,target=j,estimator="coxph")
    barplot(barplot_matrix,border=NA
            ,col = c(1,2,4)
            ,width=1,xlim = c(0,5),cex.axis =0.8,las=2,xaxt="n")
    axis(1,at=c(0.7,1.9,3.1,4.3)
         ,labels=c(10,40,70,100)
         ,tick = FALSE
         ,mgp=c(3,0.4,0)
         ,las=1
         ,cex.axis=0.8)
    mtext("covariates per trans",cex=0.5,line=1.4,side=1)
    mtext("proportion",cex=0.5,line=1.9,side=2,las=3)
  }
}
dev.off()

#legend
pdf(file ="./plots/na_props_100_legend.pdf"
    ,width = 6
    ,height = 4)
par(mfrow=c(1,1))
plot(NA,ylim=c(0,1),bty="n",ylab="",xaxt="n",yaxt="n")
legend(x=0.8,y=0.8
       ,legend = c("valid","infinite","NA")
       ,fill = c(4,2,1),)
dev.off()

  1. European Bioinformatics Institute (EMBL-EBI), ↩︎

  2. Genome Biology Unit, EMBL↩︎

  3. German Cancer Research Center (DKFZ)↩︎