This section shows the results of a simulation study to assess the consistency of the estimator in \(\texttt{CoxRFX}\) (an estimator of the regression coefficients in a Cox model).
In the following grid of plots, each column of the grid is based on a different batch of 500 data sets simulated from the same illness-death model. This model is a clock-reset Cox model with 10 regression coefficients for each transition (one for each of 10 covariates), and a Gompertz baseline hazard. The (prior) distribution of the parameters in each transition was assumed to be Gaussian (with undetermined mean and variance). The only difference between batches/columns is the number of observations (i.e. patients) per data set: 250 observations in the first column, 1000 in the second, 2000 in the third and 4000 in the fourth.
The top row plots compare the true coefficient values with the mean estimate over the 500 data sets in a batch; they give strong indication that the bias vanishes as the sample size increases. At the same time, the distribution of the estimator is concentrated inside increasingly smaller neighbourhoods around the true parameter value. This is strongly suggested by the middle row plots, which show a vanishing sample variance for each individual parameter, and also the bottom row ones, which show the empirical density of the sum of squared errors becoming concentrated in smaller and smaller neighbourhoods around zero.
Figure 1.1: consistency of the CoxRFX estimator.
Set the parameters of the simulation
set.seed(20078)
library(mvtnorm)
library(ebmstate)
n<-4000 # number of patients
covariate_names<-paste0("Cov",1:10) #number of covariates (for each transition)
nGroups<-1 #number of groups per transition
nParam<-3*length(covariate_names) #total number of parameters (regression coefficients)
nr_simulated_data_sets<-500
param<-runif(n=nParam,min = -0.5,max = 1) #simulation of parameters
file1<-"../data/coxph_vs_coxrfx_sim_illness_death_4000obs.Rdata"
Generate data by simulation
coefficient_estimates<-matrix(nrow = nr_simulated_data_sets,ncol = 3*length(covariate_names))
mu_estimates<-matrix(nrow = nr_simulated_data_sets,ncol=3*nGroups)
sigma2_estimates<-matrix(nrow = nr_simulated_data_sets,ncol = 3*nGroups)
coefficient_estimates_coxph<-matrix(nrow = nr_simulated_data_sets,ncol = 3*length(covariate_names))
for (j in 1:nr_simulated_data_sets){
#covariates
if(length(covariate_names)>1){
covariate_matrix<-t(sapply(rep(length(covariate_names),n),function(x) rbinom(n=x,size = 1,prob = 0.5)))
}else{
covariate_matrix<-matrix(rbinom(n,size = 1,prob = 0.5),ncol=1)
}
colnames(covariate_matrix)<-covariate_names
#relative risks (relative hazards)
rel.risk_trans1<-exp(covariate_matrix%*%param[(1+length(covariate_names)*0):(length(covariate_names)*1)])
rel.risk_trans2<-exp(covariate_matrix%*%param[(1+length(covariate_names)*1):(length(covariate_names)*2)])
rel.risk_trans3<-exp(covariate_matrix%*%param[(1+length(covariate_names)*2):(length(covariate_names)*3)])
#Generate a transition history for each patient. Clock-reset Cox model. Baseline hazard is Gompertz for all transitions.
m<-matrix(c(flexsurv::rgompertz(n, shape=0.1, rate = rel.risk_trans1*exp(-4.5)),flexsurv::rgompertz(n, shape=0.1, rate = rel.risk_trans2*exp(-4.65))),ncol = 2)
v1<-apply(m,1,which.min)
m<-cbind(sapply(1:nrow(m),function(x) m[x,v1[x]]),v1)
m<-cbind(m,sapply(1:nrow(m), function(x) ifelse(m[x,2]==1,flexsurv::rgompertz(1,shape = 0.15,rate = rel.risk_trans3[x]*exp(-5.5)),NA)))
m<-cbind(m,apply(m[,c(1,3)],1,sum,na.rm=T))
m<-cbind(m,rexp(n,0.03))
m<-cbind(m,(m[,5]<m[,4]))
colnames(m)<-c("state1_duration","transition","state2_duration","total_time", "cens_time","cens=1")
m<-as.data.frame(m)
#convert the data to long format
mstate.data<-data.frame()
for(i in 1:nrow(m)){
id<-rep(i,2)
from<-c(1,1)
to<-c(2,3)
trans<-c(1,2)
Tstart<-c(0,0)
Tstop<-rep(min(m$state1_duration[i],m$cens_time[i]),2)
time<-Tstop-Tstart
status<-as.numeric(c(m$transition[i]==1 & m$cens_time[i]>m$state1_duration[i],m$transition[i]==2 & m$cens_time[i]>m$state1_duration[i]))
mstate.data<-rbind(mstate.data,data.frame(id=id,from=from,to=to, trans=trans,Tstart=Tstart,Tstop=Tstop,time=time,status=status))
if(status[1]==1){
id<-i
from<-2
to<-4
trans<-3
Tstart<-Tstop[1]
Tstop<-min(m$state1_duration[i]+m$state2_duration[i],m$cens_time[i])
time<-Tstop-Tstart
status<-as.numeric(m$state1_duration[i]+m$state2_duration[i]<m$cens_time[i])
mstate.data<-rbind(mstate.data,data.frame(id=id,from=from,to=to,trans=trans,
Tstart=Tstart,Tstop=Tstop,time=time,status=status))
}
}
#add covariates
mstate.data<-cbind(mstate.data,covariate_matrix[mstate.data$id,])
#attributes and class
tmat<-mstate::transMat(x=list(c(2,3),c(4),c(),c()),names=c("health","illness","death","death_after_illness"))
class(mstate.data)<-c("data.frame","msdata")
attr(mstate.data,"trans")<-tmat
#expand covariates
mstate.data<-mstate::expand.covs(mstate.data,covs =names(mstate.data)[-(1:8)])
#Fit empirical Bayes clock-reset Cox model.
#argument 'Z' of coxrfx
Z<-mstate.data[,-(1:(8+length(covariate_names)))]
Z$strata<-Z$trans<-mstate.data$trans
class(Z) <- c("data.frame")
#argument 'surv' of coxrfx
surv<-survival::Surv(mstate.data$time,mstate.data$status)
#argument 'groups' of coxrfx
groups<-as.vector(sapply(c("trans1","trans2","trans3"),function(x) rep(x,nParam/3)))
#fit random effects model
coxrfx_object<-CoxRFX(Z,surv,groups,max.iter = 600,tol = 0.0001,sigma.hat = "df")
coefficient_estimates[j,]<-coxrfx_object$coefficients
mu_estimates[j,]<-coxrfx_object$mu
sigma2_estimates[j,]<-coxrfx_object$sigma2
if(j %%10==0){
save(coefficient_estimates,mu_estimates,sigma2_estimates,param,file = file1)
}
print(j)
}
Code to generate grid of plots
file_paths<-c("../data/coxph_vs_coxrfx_sim_illness_death_250obs.Rdata","../data/coxph_vs_coxrfx_sim_illness_death_1000obs.Rdata","../data/coxph_vs_coxrfx_sim_illness_death_2000obs.Rdata","../data/coxph_vs_coxrfx_sim_illness_death_4000obs.Rdata")
pdf("./plots/coxrfx_vs_coxph_plot_grid_10covs_per_trans_one_prior_per_trans.pdf",height=8,width = 9)
par(mfcol=c(3,4),mar=c(5.1, 4.1, 3.1, 1.1))
for(file1 in file_paths){
#mse comparison
load(file1)
#reorder matrix of estimates
coefficient_estimates<-coefficient_estimates[,c(seq(1,ncol(coefficient_estimates),3),seq(2,ncol(coefficient_estimates),3),seq(3,ncol(coefficient_estimates),3))]
#errors
errors_coxrfx<-t(coefficient_estimates)-param
errors_coxph<-t(coefficient_estimates_coxph)-param
#sse
sse_coxrfx<-apply(errors_coxrfx^2,2,sum,na.rm=T)
file2<-strsplit(file1,"/data/")[[1]][2]
file3<-strsplit(file2,".Rdata")
plot(0,type="n",xlim=c(-0.5,1),ylim=c(-0.5,1),ylab="average point estimate",xlab="true parameter",cex.lab=0.8,cex.axis=1,las=1,cex.lab=1.3)
points(param,apply(coefficient_estimates,2,mean,na.rm=T),cex=0.8,col="red")
abline(a=0,b=1)
var_coef_estimates<-apply(coefficient_estimates,2,var,na.rm=T)
var_coef_estimates_coxph<-apply(coefficient_estimates_coxph,2,var,na.rm=T)
barplot(var_coef_estimates,col=c("red"),beside=T,las=1,ylim=c(0,0.18),ylab = "sample variance",cex.lab=1.1,mgp=c(3,1,0))
mtext("regression coefficient",side = 1,line = 1,cex=0.8)
plot(density(sse_coxrfx,from = 0,kernel = "gaussian",na.rm = T),main="",ylab ="estimated density",xlab="sum of squared errors",xlim=c(0,5.5),ylim=c(0,7.5),col="red",lty=1,las=1,cex.lab=1.3)
}
dev.off()
In \(\texttt{ebmstate}\), the estimation of state occupation probabilities under clock-reset models, implemented in the functions \(\texttt{probtrans_ebmstate}\) and \(\texttt{probtrans_fft}\), relies on the estimators defined in (3) to (5) of section 3.2 of the paper. Substituting \(:=\) for \(\approx\) and removing the ‘hats’ in definitions (3) to (5), we obtain the approximate equalities that justify the estimators. We justify these approximate equalities as follows.
Approximate equality justifying the estimator in definition (3), section 3.2:
\[\begin{align*}
q_{j-1,j}\left(k\right)&:=\mathrm{P}\left[X_{j}=j, \tau_{j-1,j}\in \left[t_{k},t_{k+1}\right)\,|\,X_{j-1}=j-1\right]\\
&\approx f\left[X_{j}=j,\tau_{j-1,j}=t_{k}\,|\, X_{j-1}=j-1\right] \Delta t\\
&=\lim_{h \downarrow 0}\frac{\mathrm{P}\left[X_{j}=j,\tau_{j-1,j}\in \left[t_{k},t_{k}+h\right)\,|\,X_{j-1}=j-1\right]}{h}\Delta t\\
&=\lim_{h \downarrow 0}\mathrm{P} \left[\tau_{j-1,j} \geq t_{k}\,|\,X_{j-1}=j-1\right]\frac{\mathrm{P}\left[X_{j}=j,\tau_{j-1,j}\in \left[t_{k},t_{k}+h\right)|\tau_{j-1,j} \geq t_{k},X_{j-1}=j-1\right]}{h} \Delta t\\
&=\mathrm{P} \left[\tau_{j-1,j} \geq t_{k}\,|\,X_{j-1}=j-1\right]\lambda_{j-1,j}\left(t_{k}\right)\,\Delta t\\
&\approx \exp\left[-\Lambda_{j-1}\left(t_{k}\right)\right]\,\Delta\Lambda_{j-1,j}\left(t_{k}\right)\;.
\end{align*}\]
Approximate equality justifying the estimator in definition (4), section 3.2:
\[\begin{align*}
q_{0j}\left(k\right)&:=\mathrm{P}\left[X_{j}=j,\tau_{0,j}\in\left[t_{k},t_{k+1}\right)\,|\,X_{0}=0\right]\\
&\approx f\left[X_{j}=j,\tau_{0,j}=t_{k}\,|\,X_{0}=0\right]\Delta t\\
&=\int_{0}^{t_{k}}f\left[X_{j}=j,\tau_{0,j}=t_{k},X_{j-1}=j-1,\tau_{0,j-1}=u\,|\,X_{0}=0\right]\mathrm{d}u\Delta t\\
&=\int_{0}^{t_{k}}f\left[X_{j}=j,\tau_{0,j}=t_{k}\,|\,X_{j-1}=j-1,\tau_{0,j-1}=u ,X_{0}=0\right]f\left[X_{j-1}=j-1,\tau_{0,j-1}=u\,|\,X_{0}=0\right]\mathrm{d}u\Delta t\\
&=\int_{0}^{t_{k}}f\left[X_{j}=j,\tau_{0,j}=t_{k}\,|\,X_{j-1}=j-1,\tau_{0,j-1}=u \right]f\left[X_{j-1}=j-1,\tau_{0,j-1}=u\,|\,X_{0}=0\right]\mathrm{d}u\Delta t\\
&=\int_{0}^{t_{k}}\lim_{h \downarrow 0} \frac{\mathrm{P}\left[X_{j}=j,\tau_{0,j}\in \left[t_{k},t_{k}+h\right)\,|\, X_{j-1}=j-1,\tau_{0,j-1}=u \right]}{h}\,f\left[X_{j-1}=j-1,\tau_{0,j-1}=u\,|\,X_{0}=0\right]\mathrm{d}u\Delta t\\
&=\int_{0}^{t_{k}}\lim_{h \downarrow 0} \mathrm{P}\left[\tau_{0,j}\geq t_{k} \,|\,X_{j-1}=j-1,\tau_{0,j-1}=u\right] \frac{\mathrm{P}\left[X_{j}=j,\tau_{0,j}\in \left[t_{k},t_{k}+h\right)\,|\, \tau_{0,j}\geq t_{k},X_{j-1}=j-1,\tau_{0,j-1}=u \right]}{h}\,f\left[X_{j-1}=j-1,\tau_{0,j-1}=u\,|\,X_{0}=0\right]\mathrm{d}u\Delta t\\
&=\int_{0}^{t_{k}}\lim_{h \downarrow 0} \mathrm{P}\left[\tau_{j-1,j}\geq t_{k}-u \,|\,X_{j-1}=j-1\right] \frac{\mathrm{P}\left[X_{j}=j,\tau_{j-1,j}\in \left[t_{k}-u,t_{k}-u+h\right)\,|\, \tau_{j-1,j}\geq t_{k}-u,X_{j-1}=j-1\right]}{h}\,f\left[X_{j-1}=j-1,\tau_{0,j-1}=u\,|\,X_{0}=0\right]\mathrm{d}u\Delta t\\
&=\int_{0}^{t_{k}}\exp\left[-\Lambda_{j-1}\left(t_{k}-u\right)\right]\,\lambda_{j-1,j}\left(t_{k}-u\right) f\left[X_{j-1}=j-1,\tau_{0,j-1}=u\,|\,X_{0}=0\right]\mathrm{d}u\Delta t\\
&\approx \int_{0}^{t_{k}}\exp\left[-\Lambda_{j-1}\left(t_{k}-u\right)\right]\,\Delta\Lambda_{j-1,j}\left(t_{k}-u\right) f\left[X_{j-1}=j-1,\tau_{0,j-1}=u\,|\,X_{0}=0\right]\mathrm{d}u\\
&\approx \sum_{l=0}^{k-1} q_{j-1,j}\left(k-l-1\right)q_{0,j-1}\left(l\right) \;.
\end{align*}\]
Approximate equality justifying the estimator in definition (5), section 3.2:
\[\begin{align*} p_{0n}\left(k\right)&=\mathrm{P}\left[X_{n}=n,\tau_{ 0, n}< t_{k} , \tau_{ n, n+1}\geq t_{k}-\tau_{ 0, n}\,|\, X_{0}=0\right]\\ &=\int_{0}^{t_{k}}\int_{t_{k}-u}^{\infty}f\left[X_{n}=n,\tau_{ 0, n}=u,\tau_{ n, n+1}=v\,|\, X_{0}=0\right]\mathrm{d}v\,\mathrm{d}u\\ &=\int_{0}^{t_{k}}\int_{t_{k}-u}^{\infty}f\left[\tau_{ n, n+1}=v|X_{n}=n,\tau_{ 0, n}=u,X_{0}=0\right]f\left[X_{n}=n,\tau_{ 0, n}=u\,|\, X_{0}=0\right]\mathrm{d}v \,\mathrm{d}u\\ &=\int_{0}^{t_{k}}\int_{t_{k}-u}^{\infty}f\left[\tau_{ n, n+1}=v|X_{n}=n,\tau_{ 0, n}=u\right]f\left[X_{n}=n,\tau_{ 0, n}=u\,|\, X_{0}=0\right]\mathrm{d}v \,\mathrm{d}u\\ &=\int_{0}^{t_{k}} \mathrm{P}\left[\tau_{ n, n+1}>t_{k}-u|X_{n}=n,\tau_{ 0, n}=u\right]f\left[X_{n}=n,\tau_{ 0, n}=u\,|\, X_{0}=0\right]\mathrm{d}u\\ &=\int_{0}^{t_{k}} \mathrm{P}\left[\tau_{ n, n+1}>t_{k}-u|X_{n}=n \right]f\left[X_{n}=n,\tau_{ 0, n}=u\,|\, X_{0}=0\right]\mathrm{d}u\\ &=\int_{0}^{t_{k}} \exp\left[-\Lambda_{n}\left(t_{k}-u\right)\right]\, f\left[X_{n}=n,\tau_{ 0, n}=u\,|\, X_{0}=0\right]\mathrm{d}u\\ &\approx \sum_{l=0}^{k-1}r_{n}\left(k-l-1\right)q_{0n}\left(l\right)\quad. \end{align*}\]
The following plot shows that probtrans_ebmstate
can accurately compute state occupation probabilities under clock-reset Cox models, when it is given the vectors of cumulative hazards for each transition. The dashed red lines were computed using a data set of 100,000 simulated disease histories for the same patient (or, equivalently, 100,000 patients with the same vector of covariates). For any time t, these lines give the relative frequencies of each state. They are superimposed on solid black lines which represent the state occupation probabilities as computed by probtrans_ebmstate
, when the true (Gompertz) cumulative hazards are given to it as input.
Figure 3.1: accuracy of the estimator of state occupation probabilities in the function probtrans_ebmstate.
Code used to generate the previous plot.
# Load packages and set working directory
set.seed(9873)
library(ebmstate)
library(flexsurv)
#generate a vector of covariates for the patient
nCovs<-50
covariate_vector<-rbinom(nCovs,size = 1,prob = 0.1)
#compute the relative risk (better said: relative hazard) of the patient for each transition
param<-runif(n=150,min = -0.5,max = 0.5)
rel.risk_trans1<-exp(sum(covariate_vector*param[(1+nCovs*0):(nCovs*1)]))
rel.risk_trans2<-exp(sum(covariate_vector*param[(1+nCovs*1):(nCovs*2)]))
rel.risk_trans3<-exp(sum(covariate_vector*param[(1+nCovs*2):(nCovs*3)]))
#generate 100,000 uncensored observations for the same patient
n<-100000
m<-matrix(c(rgompertz(n, shape=0.1, rate = rel.risk_trans1*exp(-4.5)),rgompertz(n, shape=0.1, rate = rel.risk_trans2*exp(-4.65))),ncol = 2)
v1<-apply(m,1,which.min)
m<-cbind(sapply(1:nrow(m),function(x) m[x,v1[x]]),v1)
m<-cbind(m,sapply(1:nrow(m), function(x) ifelse(m[x,2]==1,rgompertz(1,shape = 0.15,rate = rel.risk_trans3*exp(-5.5)),NA)))
m<-cbind(m,apply(m[,c(1,3)],1,sum,na.rm=T))
colnames(m)<-c("state1_duration","transition","state2_duration","total_time")
m<-as.data.frame(m)
#Build a function that computes relative frequencies of each state at some time t
rel_freq<-function(state,t){
if(state==1){
sum(m[,1]>t)/nrow(m)
}else if(state==2){
sum(m[,1]<t & m[,2]==1 & m[,4]>t)/nrow(m)
}else if(state==3){
sum(m[,1]<t & m[,2]==2)/nrow(m)
}else if(state==4){
sum(m[,2]==1 & m[,4]<t)/nrow(m)
}
}
#Vectorise the cumulative hazards of the patient
time_vector<-seq(0,80,0.01)
cumhaz1<-Hgompertz(time_vector, shape=0.1, rate = rel.risk_trans1*exp(-4.5))
cumhaz2<-Hgompertz(time_vector, shape=0.1, rate = rel.risk_trans2*exp(-4.65))
cumhaz3<-Hgompertz(time_vector, shape=0.15, rate = rel.risk_trans3*exp(-5.5))
cumhaz1<-data.frame(time=time_vector,Haz=cumhaz1,trans=1)
cumhaz2<-data.frame(time=time_vector,Haz=cumhaz2,trans=2)
cumhaz3<-data.frame(time=time_vector,Haz=cumhaz3,trans=3)
#build an msfit object
tmat<-mstate::transMat(x=list(c(2,3),c(4),c(),c()),names=c("health","illness","death","death_illness"))
msfit_object<-list(Haz=rbind(cumhaz1,cumhaz2,cumhaz3),trans=tmat)
class(msfit_object)<-c("msfit","coxrfx")
#Calculate state occupation probabilities
probtrans_object<-probtrans_ebmstate("health",msfit_object,'clockreset')
rel_freq_1<-sapply(time_vector,rel_freq,state=1)
rel_freq_12<-rel_freq_1+sapply(time_vector,rel_freq,state=2)
rel_freq_123<-rel_freq_12+sapply(time_vector,rel_freq,state=3)
rel_freq_1234<-rel_freq_123+sapply(time_vector,rel_freq,state=4)
save(probtrans_object,rel_freq_1,rel_freq_12,rel_freq_123,rel_freq_1234,time_vector,file = "../data/testing_probtrans_ebmstate.Rdata")
#compare estimates obtained by simulation with the probabilities computed by probtrans_ebmstate
plot(probtrans_object,legend = c("","","",""),lwd = 2)
lines(time_vector,rel_freq_1,lwd=2,col="red",lty=3)
lines(time_vector,rel_freq_12,lwd=2,col="red",lty=3)
lines(time_vector,rel_freq_123,lwd=2,col="red",lty=3)
lines(time_vector,rel_freq_1234,lwd=2,col="red",lty=3)
text(10,0.3,"health")
text(37,0.4,"illness")
text(78,0.07,"death")
text(67,0.6,"death_after_illness")
legend("bottomleft", legend = c("using probtrans_ebmstate","by simulation"),cex = 0.7,lty = c(1,3),col = c(1,"red"))
The following plot shows that probtrans_fft
can accurately compute state occupation probabilities under clock-reset Cox models, when it is given the vectors of cumulative hazards for each transition. The dashed red lines were computed using a data set of 100,000 simulated disease histories for the same patient (or, equivalently, 100,000 patients with the same vector of covariates). For any time t, these lines give the relative frequencies of each state. They are superimposed on solid black lines which represent the state occupation probabilities as computed by probtrans_fft
, when the true (Gompertz) cumulative hazards are given to it as input.
Figure 4.1: accuracy of the estimator of state occupation probabilities in the function probtrans_fft.
Code used to generate the previous plot.
# Load packages and set working directory
set.seed(981735)
library(ebmstate)
library(flexsurv)
#generate a vector of covariates for the patient
nCovs<-50
covariate_vector<-rbinom(nCovs,size = 1,prob = 0.1)
#compute the relative risk (better said: relative hazard) of the patient for each transition
param<-runif(n=150,min = -0.5,max = 0.5)
rel.risk_trans1<-exp(sum(covariate_vector*param[(1+nCovs*0):(nCovs*1)]))
rel.risk_trans2<-exp(sum(covariate_vector*param[(1+nCovs*1):(nCovs*2)]))
rel.risk_trans3<-exp(sum(covariate_vector*param[(1+nCovs*2):(nCovs*3)]))
#generate 100,000 uncensored observations for the same patient
n<-100000
m<-matrix(c(rgompertz(n, shape=0.1, rate = rel.risk_trans1*exp(-4.5)),rgompertz(n, shape=0.1, rate = rel.risk_trans2*exp(-4.65))),ncol = 2)
v1<-apply(m,1,which.min)
m<-cbind(sapply(1:nrow(m),function(x) m[x,v1[x]]),v1)
m<-cbind(m,sapply(1:nrow(m), function(x) ifelse(m[x,2]==1,rgompertz(1,shape = 0.15,rate = rel.risk_trans3*exp(-5.5)),NA)))
m<-cbind(m,apply(m[,c(1,3)],1,sum,na.rm=T))
colnames(m)<-c("state1_duration","transition","state2_duration","total_time")
m<-as.data.frame(m)
#Build a function that computes relative frequencies of each state at some time t
rel_freq<-function(state,t){
if(state==1){
sum(m[,1]>t)/nrow(m)
}else if(state==2){
sum(m[,1]<t & m[,2]==1 & m[,4]>t)/nrow(m)
}else if(state==3){
sum(m[,1]<t & m[,2]==2)/nrow(m)
}else if(state==4){
sum(m[,2]==1 & m[,4]<t)/nrow(m)
}
}
#Vectorise the cumulative hazards of the patient
time_vector<-seq(0,80,0.01)
cumhaz1<-Hgompertz(time_vector, shape=0.1, rate = rel.risk_trans1*exp(-4.5))
cumhaz2<-Hgompertz(time_vector, shape=0.1, rate = rel.risk_trans2*exp(-4.65))
cumhaz3<-Hgompertz(time_vector, shape=0.15, rate = rel.risk_trans3*exp(-5.5))
cumhaz1<-data.frame(time=time_vector,Haz=cumhaz1,trans=1)
cumhaz2<-data.frame(time=time_vector,Haz=cumhaz2,trans=2)
cumhaz3<-data.frame(time=time_vector,Haz=cumhaz3,trans=3)
#build an msfit object
tmat<-mstate::transMat(x=list(c(2,3),c(4),c(),c()),names=c("health","illness","death","death_illness"))
msfit_object<-list(Haz=rbind(cumhaz1,cumhaz2,cumhaz3),trans=tmat)
class(msfit_object)<-c("msfit","coxrfx")
#Calculate state occupation probabilities
probtrans_object<-probtrans_fft("health",msfit_object,max_time=80,nr_steps = 40000)
rel_freq_1<-sapply(time_vector,rel_freq,state=1)
rel_freq_12<-rel_freq_1+sapply(time_vector,rel_freq,state=2)
rel_freq_123<-rel_freq_12+sapply(time_vector,rel_freq,state=3)
rel_freq_1234<-rel_freq_123+sapply(time_vector,rel_freq,state=4)
save(probtrans_object,rel_freq_1,rel_freq_12,rel_freq_123,rel_freq_1234,time_vector,file = "../data/testing_probtrans_fft.Rdata")
#compare estimates obtained by simulation with the probabilities computed by probtrans_ebmstate
plot(probtrans_object,legend = c("","","",""),lwd = 2)
lines(time_vector,rel_freq_1,lwd=2,col="red",lty=3)
lines(time_vector,rel_freq_12,lwd=2,col="red",lty=3)
lines(time_vector,rel_freq_123,lwd=2,col="red",lty=3)
lines(time_vector,rel_freq_1234,lwd=2,col="red",lty=3)
text(10,0.3,"health")
text(27,0.4,"illness")
text(78,0.07,"death")
text(67,0.6,"death_after_illness")
legend("bottomleft", legend = c("using probtrans_fft","by simulation"),cex = 0.7,lty = c(1,3),col = c(1,"red"))
The following script generates plots comparing the running time and the estimation accuracy of \(\texttt{mstate::mssample}\) and \(\texttt{ebmstate::probtrans_fft}\).
#Generate some data from which cumulative hazards are to be estimated
set.seed(89910225)
library(parallel)
library(flexsurv)
library(ebmstate)
library(bindata)
n<-1000 # number of patients
covariate_names<-paste0("Cov",1:10) #number of covariates (for each transition)
nGroups<-1 #overall number of priors
nTrans<-3 # number of transitions
nParam<-nTrans*length(covariate_names) #total number of parameters (regression coefficients)
nr_simulated_data_sets<-100
param_fun<-function(nParam){
out<-sqrt(10/length(covariate_names))*rnorm(n = nParam,mean = 0,sd = 0.65)
out
}
TimePoints<-seq(10,70,length.out=7) #time points at which state occupation probability estimates are to be retrieved
nTimePoints<-length(TimePoints)
tmat<-mstate::transMat(x=list(c(2),c(3),c(4),c()),names=c("state1","state2","state3","state4"))
shape1<-0.1
rate1<-exp(-4.5)
shape2<-0.1
rate2<-exp(-2.7)
shape3<-0.15
rate3<-exp(-3.5)
param<-param_fun(nParam)
marg_probs<-runif(n = length(covariate_names),min = 0.05,max = 0.3)
covariate_matrix<-rmvbin(n,margprob = marg_probs)
colnames(covariate_matrix)<-covariate_names
#relative risks (relative hazards)
rel.risk_trans1<-exp(covariate_matrix%*%param[(1+length(covariate_names)*0):(length(covariate_names)*1)])
rel.risk_trans2<-exp(covariate_matrix%*%param[(1+length(covariate_names)*1):(length(covariate_names)*2)])
rel.risk_trans3<-exp(covariate_matrix%*%param[(1+length(covariate_names)*2):(length(covariate_names)*3)])
#Generate a transition history for each patient. Clock-reset model. Baseline hazard is Gompertz for all transitions.
m<-matrix(c(flexsurv::rgompertz(n, shape=shape1, rate = rel.risk_trans1*rate1),flexsurv::rgompertz(n, shape=shape2, rate = rel.risk_trans2*rate2),flexsurv::rgompertz(n, shape=shape3, rate = rel.risk_trans3*rate3)),ncol = 3)
m<-cbind(m,apply(m,1,sum,na.rm=T))
m<-cbind(m,rexp(n,0.008))
m<-cbind(m,(m[,5]<m[,4]))
colnames(m)<-c("state1_duration","state2_duration","state3_duration","total_time", "cens_time","cens=1")
m<-as.data.frame(m)
print(sum(m$`cens=1`)/nrow(m))
#convert the data to long format
mstate.data<-data.frame()
for(i in 1:nrow(m)){
id<-i
from<-1
to<-2
trans<-1
Tstart<-0
Tstop<-min(m$state1_duration[i],m$cens_time[i])
time<-Tstop-Tstart
status<-as.numeric(m$cens_time[i]>m$state1_duration[i])
mstate.data<-rbind(mstate.data,data.frame(id=id,from=from,to=to,trans=trans,Tstart=Tstart,
Tstop=Tstop,time=time,status=status))
if(status==1){
id<-i
from<-2
to<-3
trans<-2
Tstart<-Tstop
Tstop<-min(Tstart[1]+m$state2_duration[i],m$cens_time[i])
time<-Tstop-Tstart
status<-as.numeric(m$cens_time[i]>m$state1_duration[i]+m$state2_duration[i])
mstate.data<-rbind(mstate.data,data.frame(id=id,from=from,to=to,trans=trans,Tstart=Tstart,
Tstop=Tstop,time=time,status=status))
if(status==1){
id<-i
from<-3
to<-4
trans<-3
Tstart<-Tstop
Tstop<-min(m$total_time[i],m$cens_time[i])
time<-Tstop-Tstart
status<-as.numeric(m$total_time[i]<m$cens_time[i])
mstate.data<-rbind(mstate.data,data.frame(id=id,from=from,to=to,trans=trans,Tstart=Tstart,
Tstop=Tstop,time=time,status=status))
}
}
}
#add covariates
mstate.data<-cbind(mstate.data,covariate_matrix[mstate.data$id,])
#attributes and class
class(mstate.data)<-c("data.frame","msdata")
attr(mstate.data,"trans")<-tmat
#expand covariates
mstate.data<-mstate::expand.covs(mstate.data,covs =names(mstate.data)[-(1:8)])
#Fit non-parametric model and estimate cumulative hazards
surv0<-survival::Surv(mstate.data$time,mstate.data$status)
coxph_object0<-coxph(as.formula("surv0~strata(trans)"),data = mstate.data)
msfit_object0<-msfit(coxph_object0,trans = tmat)
#State occupation probabilities by probtrans_fft
system.time(probtrans_object0<-probtrans_fft("state1",msfit_object0,time = seq(0,150,length.out=100000)))
tmat<-mstate::transMat(x=list(c(2),c(3),c(4),c())
)
#State occupation probabilities by mssample
system.time(probtrans_object_100<-mssample(msfit_object0$Haz
,tmat
,history = list(state=1,time=0,tstate=NULL)
,clock="reset"
,tvec=seq(0,100,length.out=5000)
,output = "state"
,M=100)
)
system.time(probtrans_object_1000<-mssample(msfit_object0$Haz
,tmat
,history = list(state=1,time=0,tstate=NULL)
,clock="reset"
,tvec=seq(0,100,length.out=5000)
,output = "state"
,M=1000)
)
system.time(probtrans_object_10000<-mssample(msfit_object0$Haz
,tmat
,history = list(state=1,time=0,tstate=NULL)
,clock="reset"
,tvec=seq(0,100,length.out=5000)
,output = "state"
,M=10000)
)
system.time(probtrans_object_100000<-mssample(msfit_object0$Haz
,tmat
,history = list(state=1,time=0,tstate=NULL)
,clock="reset"
,tvec=seq(0,100,length.out=5000)
,output = "state"
,M=100000)
)
#100 000: 1026 secs
#10 000: 93 secs
# 1000: 9.3 secs
# 100: 0.92 secs
# fft: 0.94 secs
#Plot code
probtrans_objects<-list(probtrans_object_100
,probtrans_object_1000
,probtrans_object_10000
,probtrans_object0[[1]])
pdf(file ="./mssample_and_probtrans_fft.pdf"
,width = 6
,height = 4)
par(mfcol=c(4,4),mar=c(0,0,0.2,0),mgp=c(3,0.5,0),oma=c(3,3,1,1),tck=-0.05)
for(j in 1:4){
for(i in 2:5){
ifelse(j==1,yaxt_value<-"s",yaxt_value<-"n")
plot(probtrans_object_100000$time,probtrans_object_100000[[i]]
,type="l"
,ylab="",xlab="",xaxt="n"
,yaxt=yaxt_value,cex.axis=0.7,las=1
)
if(i==5){
axis(1,at=seq(0,90,15),cex.axis=0.75,mgp=c(3,0.2,0))
mtext("time",side = 1,line = 1.2,cex = 0.6)
}
if(j==1){
mtext("probability",side = 2,line = 2,cex = 0.6)
}
points(probtrans_objects[[j]]$time,probtrans_objects[[j]][[i]],type="l"
,col="red")
}
}
dev.off()
The following figures are supplementary figures to the simulation study reported in section 4 of the main text. They show proportions of valid, infinite and missing (‘NA’) estimates produced by the empirical Bayes Cox estimators implemented in \(\texttt{ebmstate}\) and the fully non-parametric (null) estimators.
Figure 6.1: estimation of regression coefficients, relative hazards, and state occupation probabilities with empirical Bayes Cox estimators: proportions of valid, infinite and missing (‘NA’) values for data sets with 100 patients.
Figure 6.2: estimation of regression coefficients, relative hazards, and state occupation probabilities with empirical Bayes Cox estimators: proportions of valid, infinite and missing (‘NA’) values for data sets with 1000 patients.
Figure 6.3: estimation of state occupation probabilities with non-parametric estimators: proportions of valid, infinite and missing (‘NA’) values.
The sample script below performs the following tasks: a) generates data sets of 1000 patients under a Cox model with a ‘linear’ transition structure and 500 covariates/coefficients per transition (non-sparse but scaled regression coefficients); b) fits the fixed effects Cox model, an empirical Bayes Cox model and a fully non-parametric model; c) for each data set, estimates state occupation probabilities under these three models; d) for each data set, estimates true occupation probabilities by simulating from the true model.
set.seed(0840350)
library(parallel)
library(flexsurv)
library(ebmstate)
library(bindata)
n<-1000 # number of patients
covariate_names<-paste0("Cov",1:500) #number of covariates (for each transition)
nGroups<-1 #overall number of priors
nTrans<-3 # number of transitions
nParam<-nTrans*length(covariate_names) #total number of parameters (regression coefficients)
nr_simulated_data_sets<-100
param_fun<-function(nParam){
out<-sqrt(10/length(covariate_names))*rnorm(n = nParam,mean = 0,sd = 0.65)
out
}
TimePoints<-seq(10,70,10) #time points at which state occupation probability estimates are to be retrieved
nTimePoints<-length(TimePoints)
tmat<-mstate::transMat(x=list(c(2),c(3),c(4),c()),names=c("state1","state2","state3","state4"))
shape1<-0.1
rate1<-exp(-4.5)
shape2<-0.1
rate2<-exp(-2.7)
shape3<-0.15
rate3<-exp(-3.5)
file1<-"../data/dense_sim_1000obs_1group_for_all_trans_500vars_LinearStructure.Rdata"
#A function that computes relative frequencies of each state at some time t
rel_freq<-function(state,t,m){
if(state=="state1"){
sum(t<=m[,1])/nrow(m)
}else if(state=="state2"){
sum(t>m[,1]&t<=m[,1]+m[,2])/nrow(m)
}else if(state=="state3"){
sum(t>m[,1]+m[,2]&t<=m[,1]+m[,2]+m[,3])/nrow(m)
}else if(state=="state4"){
sum(t>m[,1]+m[,2]+m[,3])/nrow(m)
}
}
simfun<-function(j){
param<-param_fun(nParam)
marg_probs<-runif(n = length(covariate_names),min = 0.05,max = 0.3)
true_rh_fun<-function(trans){
exp(covariate_vector%*%param[(1+length(covariate_names)*(trans-1)):(length(covariate_names)*trans)])
}
rh_fun_coxrfx<-function(trans){
exp(covariate_vector%*%coxrfx_object$coefficients[seq(trans,length(coxrfx_object$coefficients),nTrans)])
}
rh_fun_coxph<-function(trans){
exp(covariate_vector%*%coxph_object$coefficients[seq(trans,length(coxph_object$coefficients),nTrans)])
}
covariate_matrix<-rmvbin(n,margprob = marg_probs)
colnames(covariate_matrix)<-covariate_names
#relative risks (relative hazards)
rel.risk_trans1<-exp(covariate_matrix%*%param[(1+length(covariate_names)*0):(length(covariate_names)*1)])
rel.risk_trans2<-exp(covariate_matrix%*%param[(1+length(covariate_names)*1):(length(covariate_names)*2)])
rel.risk_trans3<-exp(covariate_matrix%*%param[(1+length(covariate_names)*2):(length(covariate_names)*3)])
#Generate a transition history for each patient. Clock-reset model. Baseline hazard is Gompertz for all transitions.
m<-matrix(c(flexsurv::rgompertz(n, shape=shape1, rate = rel.risk_trans1*rate1),flexsurv::rgompertz(n, shape=shape2, rate = rel.risk_trans2*rate2),flexsurv::rgompertz(n, shape=shape3, rate = rel.risk_trans3*rate3)),ncol = 3)
m<-cbind(m,apply(m,1,sum,na.rm=T))
m<-cbind(m,rexp(n,0.008))
m<-cbind(m,(m[,5]<m[,4]))
colnames(m)<-c("state1_duration","state2_duration","state3_duration","total_time", "cens_time","cens=1")
m<-as.data.frame(m)
print(sum(m$`cens=1`)/nrow(m))
#convert the data to long format
mstate.data<-data.frame()
for(i in 1:nrow(m)){
id<-i
from<-1
to<-2
trans<-1
Tstart<-0
Tstop<-min(m$state1_duration[i],m$cens_time[i])
time<-Tstop-Tstart
status<-as.numeric(m$cens_time[i]>m$state1_duration[i])
mstate.data<-rbind(mstate.data,data.frame(id=id,from=from,to=to, trans=trans,Tstart=Tstart,Tstop=Tstop,time=time,status=status))
if(status==1){
id<-i
from<-2
to<-3
trans<-2
Tstart<-Tstop
Tstop<-min(Tstart[1]+m$state2_duration[i],m$cens_time[i])
time<-Tstop-Tstart
status<-as.numeric(m$cens_time[i]>m$state1_duration[i]+m$state2_duration[i])
mstate.data<-rbind(mstate.data,data.frame(id=id,from=from,to=to, trans=trans,Tstart=Tstart,Tstop=Tstop,time=time,status=status))
if(status==1){
id<-i
from<-3
to<-4
trans<-3
Tstart<-Tstop
Tstop<-min(m$total_time[i],m$cens_time[i])
time<-Tstop-Tstart
status<-as.numeric(m$total_time[i]<m$cens_time[i])
mstate.data<-rbind(mstate.data,data.frame(id=id,from=from,to=to,trans=trans, Tstart=Tstart,Tstop=Tstop,time=time,status=status))
}
}
}
#add covariates
mstate.data<-cbind(mstate.data,covariate_matrix[mstate.data$id,])
#attributes and class
class(mstate.data)<-c("data.frame","msdata")
attr(mstate.data,"trans")<-tmat
#expand covariates
mstate.data<-mstate::expand.covs(mstate.data,covs =names(mstate.data)[-(1:8)])
#Fit non-parametric model
surv0<-survival::Surv(mstate.data$Tstart,mstate.data$Tstop,mstate.data$status)
coxph_object0<-coxph(as.formula("surv0~strata(trans)"),data = mstate.data)
msfit_object0<-msfit_generic(coxph_object0,trans = tmat)
probtrans_object0<-probtrans_mstate(msfit_object0,predt = 0)
#retrieve state occupation prob estimates at times 10,20,...,50
time_indices<-sapply(TimePoints,function(x) which.min(abs(probtrans_object0[[1]]$time-x)))
probtrans_object0_reduced<-probtrans_object0[[1]][time_indices,1:(ncol(tmat)+1)]
state_occup_estimates0<-as.matrix(probtrans_object0_reduced)
#Fit homogeneous clock-reset empirical Bayes model.
#argument 'Z' of coxrfx
Z<-mstate.data[,-(1:(8+length(covariate_names)))]
Z$strata<-Z$trans<-mstate.data$trans
#argument 'surv' of coxrfx
surv<-survival::Surv(mstate.data$time,mstate.data$status)
#argument 'groups' of coxrfx
groups<-rep("unique_group",length(param))
#fit random effects model
coxrfx_object<-CoxRFX(Z,surv,groups,max.iter = 600,tol = 0.0001,sigma.hat = "df")
cat("coxrfx_concordance:",concordance(coxrfx_object)$concordance)
#fit fixed effects model
model_formula<-as.formula(paste0("surv~",paste(head(names(Z),length(names(Z))-2),collapse = "+"),"+strata(strata)"))
try(coxph_object<-ebmstate:::coxph(formula = model_formula,data=Z,control = coxph.control(iter.max = 100)),silent=TRUE)
try(cat("coxph_concordance:",concordance(coxph_object)$concordance),silent=TRUE)
#simulate single patient covariate data for computing relative and cumulative hazards
covariate_vector<-rmvbin(1,marg_probs)
#single patient true relative hazards
true_rel_risk_sampled_patient<-sapply(1:nTrans,true_rh_fun)
#estimated relative hazard for patient (coxrfx)
rel_risk_estimates_coxrfx<-sapply(1:nTrans,rh_fun_coxrfx)
#estimated relative risks for patient (coxph)
try(rel_risk_estimates_coxph<-sapply(1:nTrans,rh_fun_coxph),silent=TRUE)
#put patient covariate vector in long format
newdata<-data.frame(matrix(rep(covariate_vector,length(na.exclude(as.vector(tmat)))),nrow=length(na.exclude(as.vector(tmat))),byrow = T))
names(newdata)<-covariate_names
newdata$strata<-newdata$trans<-1:nTrans
attr(newdata,"trans")<-tmat
class(newdata)<-c("data.frame","msdata")
newdata<-mstate::expand.covs(newdata,covs=names(newdata)[!names(newdata)%in%c("trans","strata")],append=TRUE)
newdata<-newdata[-(1:length(covariate_names))]
#compute cumulative hazards and state occupation probs (coxrfx)
msfit_object_coxrfx<-msfit_generic(coxrfx_object,newdata,trans = tmat)
probtrans_object_coxrfx<-probtrans_fft("state1",msfit_object_coxrfx,time = seq(0,80,length.out=50000))
#retrieve state occupation prob estimates at times 10,20,...,50
time_indices<-sapply(TimePoints,function(x) which.min(abs(probtrans_object_coxrfx[[1]]$time-x)))
probtrans_object_coxrfx_reduced<-probtrans_object_coxrfx[[1]][time_indices,]
state_occup_estimates_coxrfx<-as.matrix(probtrans_object_coxrfx_reduced)
#compute cumulative hazards and state occupation probs (coxph)
try(msfit_object_coxph<-msfit_generic(coxph_object,newdata,trans = tmat),silent = TRUE)
try(probtrans_object_coxph<-probtrans_fft("state1",msfit_object_coxph,time = seq(0,80,length.out=50000)),silent = TRUE)
#retrieve state occupation prob estimates at times 10,20,...,50
try(time_indices<-sapply(TimePoints,function(x) which.min(abs(probtrans_object_coxph[[1]]$time-x))),silent = TRUE)
try(probtrans_object_coxph_reduced<-probtrans_object_coxph[[1]][time_indices,],silent = TRUE)
try(state_occup_estimates_coxph<-as.matrix(probtrans_object_coxph_reduced),silent = TRUE)
#Get very precise estimates of state occupation probabilities for particular patient
rel.risk_trans1<-exp(covariate_vector%*%param[(1+length(covariate_names)*0):(length(covariate_names)*1)])
rel.risk_trans2<-exp(covariate_vector%*%param[(1+length(covariate_names)*1):(length(covariate_names)*2)])
rel.risk_trans3<-exp(covariate_vector%*%param[(1+length(covariate_names)*2):(length(covariate_names)*3)])
#generate 100,000 uncensored observations for the same patient
n2<-100000
m<-matrix(c(flexsurv::rgompertz(n2, shape=shape1, rate = rel.risk_trans1*rate1),flexsurv::rgompertz(n2, shape=shape2, rate = rel.risk_trans2*rate2),flexsurv::rgompertz(n2, shape=shape3, rate = rel.risk_trans3*rate3)),ncol = 3)
m<-cbind(m,rowSums(m,na.rm=TRUE))
colnames(m)<-c("state1_duration","state2_duration","state3_duration","total_time")
m<-as.data.frame(m)
time_vector<-TimePoints
rel_freq_1<-sapply(time_vector,rel_freq,state="state1",m=m)
rel_freq_2<-sapply(time_vector,rel_freq,state="state2",m=m)
rel_freq_3<-sapply(time_vector,rel_freq,state="state3",m=m)
rel_freq_4<-sapply(time_vector,rel_freq,state="state4",m=m)
true_probs_matrix<-cbind(time_vector,rel_freq_1,rel_freq_2,rel_freq_3,rel_freq_4)
if(!'coxph_object'%in%ls()) coxph_object<-list(coefficients=NA)
if(!'rel_risk_estimates_coxph'%in%ls()) rel_risk_estimates_coxph<-NA
if(!'state_occup_estimates_coxph'%in%ls()) state_occup_estimates_coxph<-NA
list(n=n,nGroups=nGroups,nTrans=nTrans,nParam=nParam,param=param,
TimePoints=TimePoints,tmat=tmat,
coxrfx_coefficients=coxrfx_object$coefficients,
coxrfx_mu=coxrfx_object$mu,
coxrfx_sigma2=coxrfx_object$sigma2,
coxph_coefficients=coxph_object$coefficients,
covariate_vector=covariate_vector,
true_rel_risk_sampled_patient=true_rel_risk_sampled_patient,
rel_risk_estimates_coxph=rel_risk_estimates_coxph,
rel_risk_estimates_coxrfx=rel_risk_estimates_coxrfx,
state_occup_estimates_coxph=state_occup_estimates_coxph,
state_occup_estimates_coxrfx=state_occup_estimates_coxrfx,
state_occup_estimates0=state_occup_estimates0,
true_probs_matrix=true_probs_matrix,
marg_probs=marg_probs)
}
simfun_object<-mclapply(1:300,simfun,mc.cores = 50)
save(simfun_object,file=file1)
Example script to generate boxplots of average absolute errors and plots of proportions of failed estimates. It takes as input a set of objects with model estimates, such as the one generated by the previous block of code.
######### IMPORTING SIMULATED DATA
#Choose one of the following batches of data to import
### Batch 1: each simulated data set has 1000 patients
rdata_names<-vector("list",3)
rdata_names[[1]]<-c(
"../data/Linear_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_10vars_LinearStructure.Rdata"
, "../data/Linear_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_100vars_LinearStructure.Rdata"
,"../data/Linear_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_200vars_LinearStructure.Rdata"
,"../data/Linear_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_300vars_LinearStructure.Rdata"
,"../data/Linear_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_400vars_LinearStructure.Rdata"
,"../data/Linear_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_500vars_LinearStructure.Rdata"
)
rdata_names[[2]]<-c(
"../data/Comp_risks_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_10vars_CompetingRisksTransStructure.Rdata"
,"../data/Comp_risks_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_100vars_CompetingRisksTransStructure.Rdata"
,"../data/Comp_risks_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_200vars_CompetingRisksTransStructure.Rdata"
,"../data/Comp_risks_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_300vars_CompetingRisksTransStructure.Rdata"
,"../data/Comp_risks_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_400vars_CompetingRisksTransStructure.Rdata"
,"../data/Comp_risks_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_500vars_CompetingRisksTransStructure.Rdata"
)
rdata_names[[3]]<-c(
"../data/M_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_10vars_MtransStructure.Rdata"
,"../data/M_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_100vars_MtransStructure.Rdata"
,"../data/M_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_200vars_MtransStructure.Rdata"
,"../data/M_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_300vars_MtransStructure.Rdata"
,"../data/M_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_400vars_MtransStructure.Rdata"
,"../data/M_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_500vars_MtransStructure.Rdata"
)
### Batch 2: each simulated data set has 100 patients
rdata_names<-vector("list",3)
rdata_names[[1]]<-c(
"../data/Linear_structure_100patients/dense_sim_100obs_1group_for_all_trans_10vars_LinearStructure.Rdata"
,"../data/Linear_structure_100patients/dense_sim_100obs_1group_for_all_trans_40vars_LinearStructure.Rdata"
,"../data/Linear_structure_100patients/dense_sim_100obs_1group_for_all_trans_70vars_LinearStructure.Rdata"
, "../data/Linear_structure_100patients/dense_sim_100obs_1group_for_all_trans_100vars_LinearStructure.Rdata"
)
rdata_names[[2]]<-c(
"../data/Comp_risks_structure_100patients/dense_sim_100obs_1group_for_all_trans_10vars_CompetingRisksTransStructure.Rdata"
,"../data/Comp_risks_structure_100patients/dense_sim_100obs_1group_for_all_trans_40vars_CompetingRisksTransStructure.Rdata"
,"../data/Comp_risks_structure_100patients/dense_sim_100obs_1group_for_all_trans_70vars_CompetingRisksTransStructure.Rdata"
,"../data/Comp_risks_structure_100patients/dense_sim_100obs_1group_for_all_trans_100vars_CompetingRisksTransStructure.Rdata"
)
rdata_names[[3]]<-c(
"../data/M_structure_100patients/dense_sim_100obs_1group_for_all_trans_10vars_MtransStructure.Rdata"
,"../data/M_structure_100patients/dense_sim_100obs_1group_for_all_trans_40vars_MtransStructure.Rdata"
,"../data/M_structure_100patients/dense_sim_100obs_1group_for_all_trans_70vars_MtransStructure.Rdata"
,"../data/M_structure_100patients/dense_sim_100obs_1group_for_all_trans_100vars_MtransStructure.Rdata"
)
## load the batch of simulated data chosen
x_values<-vector("list",3)
simfun_object_lists<-vector("list",length(rdata_names))
for(i in 1:length(rdata_names)){
x_values[[i]]<-rep(NA,length(rdata_names[[i]]))
for(j in 1:length(x_values[[i]])){
load(rdata_names[[i]][j])
simfun_object_lists[[i]][[j]]<-simfun_object
x_values[[i]][j]<-length(simfun_object[[1]]$covariate_vector)
}
}
rm(simfun_object)
unwrap_fun<-function(simfun_object,name){
if(name%in%c("n","nGroups","nTrans","nParam","param","TimePoints"
,"tmat","coxrfx_mu","coxrfx_sigma2","covariate_vector"
,"true_rel_risk_sampled_patient","true_probs_matrix"
,"marg_probs")){
sapply(simfun_object,function(x){
return(x[[name]])
}
)
}else if(name%in%c("coxph_coefficients","coxrfx_coefficients")){
sapply(simfun_object,function(x){
if(length(x[[name]])==x$nParam){
return(x[[name]])
}else{
return(rep(NA,x$nParam))
}
}
)
}else if(grepl("rel_risk_estimates",name)){
sapply(simfun_object,function(x){
if(length(x[[name]])==x$nTrans){
return(x[[name]])
}else{
return(rep(NA,x$nTrans))
}
}
)
}else if(grepl("state_occup_estimates",name)){
out<-sapply(simfun_object,function(x){
if(length(as.vector(x[[name]]))==prod(dim(x$true_probs_matrix))){
return(as.vector(x[[name]])[-(1:dim(x$true_probs_matrix)[1])])
}else{
return(rep(NA,prod(dim(x$true_probs_matrix))-dim(x$true_probs_matrix)[1]))
}
}
)
invalid_estimates<-which(abs(colSums(out)-nrow(simfun_object[[1]]$true_probs_matrix))>1)
out[,invalid_estimates]<-NA
out
}
}
plot_abs3<-function(simfun_object){
list1<-sapply(names(simfun_object[[1]]),unwrap_fun,simfun_object=simfun_object,USE.NAMES = TRUE)
names(list1)<-names(simfun_object[[1]])
list1$nr_simulated_data_sets<-length(simfun_object)
param_order_fun<-function(nTrans,nParam){
as.vector(sapply(1:(nParam/nTrans),function(x) seq(x,nParam,nParam/nTrans)))
}
param_order<-param_order_fun(list1$nTrans[1],list1$nParam[1])
list1$param<-list1$param[param_order,]
abs_errors_coef0<-abs(list1$param)
abs_errors_coef_coxph<-abs(list1$coxph_coefficients-list1$param)
abs_errors_coef_coxrfx<-abs(list1$coxrfx_coefficients-list1$param)
abs_errors_rel_risk_coxph<-abs(list1$rel_risk_estimates_coxph-list1$true_rel_risk_sampled_patient)
abs_errors_rel_risk_coxrfx<-abs(list1$rel_risk_estimates_coxrfx-list1$true_rel_risk_sampled_patient)
abs_errors_rel_risk0<-abs(1-list1$true_rel_risk_sampled_patient)
abs_errors_pred_coxph<-abs(list1$state_occup_estimates_coxph-list1$true_probs_matrix[-(1:dim(simfun_object[[1]]$true_probs_matrix)[1]),])
abs_errors_pred_coxrfx<-abs(list1$state_occup_estimates_coxrfx-list1$true_probs_matrix[-(1:dim(simfun_object[[1]]$true_probs_matrix)[1]),])
abs_errors_pred0<-abs(list1$state_occup_estimates0-list1$true_probs_matrix[-(1:dim(simfun_object[[1]]$true_probs_matrix)[1]),])
out<-list(coef_errors=list(coxph=abs_errors_coef_coxph
,coxrfx=abs_errors_coef_coxrfx
,null=abs_errors_coef0
)
,rel_risk_errors=list(coxph=abs_errors_rel_risk_coxph
,coxrfx=abs_errors_rel_risk_coxrfx
,null=abs_errors_rel_risk0)
,pred_errors=list(coxph=abs_errors_pred_coxph
,coxrfx=abs_errors_pred_coxrfx
,null=abs_errors_pred0
)
,true_values=list(coef=list1$param
,rel_risk=list1$true_rel_risk_sampled_patient
,pred=list1$true_probs_matrix
)
,pred_estimates=list(coxph=list1$state_occup_estimates_coxph
,coxrfx=list1$state_occup_estimates_coxrfx
,null=list1$state_occup_estimates0
)
,coef_estimates=list(coxph=list1$coxph_coefficients
,coxrfx=list1$coxrfx_coefficients
)
,rel_risk_estimates=list(coxph=list1$rel_risk_estimates_coxph
,coxrfx=list1$rel_risk_estimates_coxrfx
)
)
out
}
plot_object_lists<-vector("list",length = length(rdata_names))
for(i in 1:length(simfun_object_lists)){
plot_object_lists[[i]]<-lapply(simfun_object_lists[[i]],plot_abs3)
}
#### PLOTS
##boxplots of average absolute error for each simulated data set
boxplot_fun<-function(plot_obj_list,target){
limits<-list(coef_errors=0.8,rel_risk_errors=10,pred_errors=0.5)
for(i in 1:length(plot_obj_list)){
obj1<-apply(plot_obj_list[[i]][[target]][["coxph"]],2,mean)
obj2<-apply(plot_obj_list[[i]][[target]][["coxrfx"]],2,mean)
obj3<-apply(plot_obj_list[[i]][[target]][["null"]],2,mean)
obj1<-sapply(obj1,function(x) ifelse(x<limits[[target]]
,x,limits[[target]]))
obj2<-sapply(obj2,function(x) ifelse(x<limits[[target]]
,x,limits[[target]]))
obj3<-sapply(obj3,function(x) ifelse(x<limits[[target]]
,x,limits[[target]]))
try(boxplot(obj1,add=TRUE, at = i-0.22,boxwex=0.35,pars=list(outcex=0.2,boxfill="white",lwd=0.7,medlwd=0.75,whisklty=1),yaxt="n",axes=FALSE))
try(boxplot(obj2,add=TRUE,at = i,boxwex=0.35,pars=list(outcol="red",outcex=0.2,boxfill="white",lwd=0.7,medlwd=0.75,whisklty=1),yaxt="n",axes=FALSE,border="red"))
try(boxplot(obj3,add=TRUE,at = i+0.22,boxwex=0.35,pars=list(outcex=0.2,boxfill="white",lwd=0.7,medlwd=0.75,whisklty=1),yaxt="n",axes=FALSE,border="blue"))
}
}
pdf(file ="./plots/rplots.pdf"
,width = 6
,height = 4)
par(mfrow=c(3,3),mar=c(2,2,1,0.5))
ylims<-c(0.8,10,0.5)
names(ylims)<-c("coef_errors","rel_risk_errors","pred_errors")
y_at<-list(coef_errors=seq(0,0.8,0.2)
,rel_risk_errors=seq(0,10,2)
,pred_errors=seq(0,0.5,0.25)
)
y_labels<-list(coef_errors=c(0,0.2,0.4,0.6,expression(NULL>=0.8))
,rel_risk_errors=c(0,2,4,6,8,expression(NULL>=10))
,pred_errors=c(0,0.25,expression(NULL>=0.5))
)
for (i in 1:length(plot_object_lists)){
for(j in c("coef_errors","rel_risk_errors","pred_errors")){
plot(NA,ylim=c(0,ylims[j])
,xlim=c(0.5,length(x_values[[i]])+0.5)
,xlab=""
,main=""
,yaxt="n"
,xaxt="n"
,lwd=0.5
,bty="n"
)
axis(1
,at=1:length(x_values[[i]])
,labels = x_values[[i]]
,tick=FALSE
,mgp=c(3,0.075,0)
,cex.axis=0.8
,lwd = 0.75
)
axis(2
,tick=TRUE
,tck=-0.05
,mgp=c(3,0.4,0)
,cex.axis=0.7
,lwd=0.75
,at=y_at[[j]]
,labels = y_labels[[j]]
)
mtext("covariates per transition"
,side=1
,line=1
,cex=0.5
)
mtext("average absolute error"
,side=2
,line=1.3
,cex=0.5
)
boxplot_fun(plot_object_lists[[i]],j)
box(lwd=0.75)
}
}
dev.off()
par(mfrow=c(1,1))
plot(NA,ylim=c(0,1),bty="n",ylab="",xlab="",xaxt="n",yaxt="n")
legend(x=0.8,y=0.5,legend = c("Cox","EBCox","null"),fill = c("white"),border = c("black","red","blue"))
## plots of proportions of failed estimates
na_function<-function(object,target,estimator){
if(!target=="coef_estimates"){
na_prp<-sum(is.na(as.vector(object[[target]][[estimator]])))/length(as.vector(object[[target]][[estimator]]))
inf_prp<-sum(is.infinite(as.vector(object[[target]][[estimator]])))/length(as.vector(object[[target]][[estimator]]))
c(na_prp,inf_prp,1-na_prp-inf_prp)
}else{
na_prp<-sum(is.na(unlist(object[[target]][[estimator]])))/length(unlist(object[[target]][[estimator]]))
inf_prp<-sum(is.infinite(unlist(object[[target]][[estimator]])))/length(unlist(object[[target]][[estimator]]))
out<-c(na_prp,inf_prp,1-na_prp-inf_prp)
names(out)<-c("NA","Inf","valid")
out
}
}
# batch with 1000 patients per data set
pdf(file ="./plots/na_props.pdf"
,width = 6
,height = 4)
par(mfrow=c(3,3),mar=c(3,3.5,1,0),mgp=c(3,0.75,0))
for(i in 1:3){
for(j in c("coef_estimates","rel_risk_estimates","pred_estimates")){
barplot_matrix<-sapply(plot_object_lists[[i]],na_function,target=j,estimator="coxph")
barplot(barplot_matrix,border=NA
,col = c(1,2,4)
,width=1,xlim = c(0,7),cex.axis =0.8,las=2,xaxt="n")
axis(1,at=c(0.7,1.9,3.1,4.3,5.5,6.7)
,labels=c(10,100,200,300,400,500)
,tick = FALSE
,mgp=c(3,0.3,0)
,las=1
,cex.axis=0.8)
mtext("covariates per trans",cex=0.5,line=1.3,side=1)
mtext("proportion",cex=0.5,line=1.9,side=2,las=3)
}
}
dev.off()
#batch with 100 patients per data set
pdf(file ="./plots/na_props_100.pdf"
,width = 6
,height = 4)
par(mfrow=c(3,3),mar=c(3,3.5,1,0),mgp=c(3,0.75,0))
for(i in 1:3){
for(j in c("coef_estimates","rel_risk_estimates","pred_estimates")){
barplot_matrix<-sapply(plot_object_lists[[i]],na_function,target=j,estimator="coxph")
barplot(barplot_matrix,border=NA
,col = c(1,2,4)
,width=1,xlim = c(0,5),cex.axis =0.8,las=2,xaxt="n")
axis(1,at=c(0.7,1.9,3.1,4.3)
,labels=c(10,40,70,100)
,tick = FALSE
,mgp=c(3,0.4,0)
,las=1
,cex.axis=0.8)
mtext("covariates per trans",cex=0.5,line=1.4,side=1)
mtext("proportion",cex=0.5,line=1.9,side=2,las=3)
}
}
dev.off()
#legend
pdf(file ="./plots/na_props_100_legend.pdf"
,width = 6
,height = 4)
par(mfrow=c(1,1))
plot(NA,ylim=c(0,1),bty="n",ylab="",xaxt="n",yaxt="n")
legend(x=0.8,y=0.8
,legend = c("valid","infinite","NA")
,fill = c(4,2,1),)
dev.off()
European Bioinformatics Institute (EMBL-EBI), ruibarrigana@hotmail.com↩︎
Genome Biology Unit, EMBL↩︎
German Cancer Research Center (DKFZ)↩︎