# R packages
library("dplyr")
library("ecmeta")
library("gam")
library("ggplot2")
library("magrittr")
library("stringr")
library("survival")
library("tidyr")

# Settings
theme_set(theme_bw())

# Functions for simulation
source("functions.R")

# Monitor output
OUTFILE <- "output/sim-out.txt"
cat("Starting simulation: \n", file = OUTFILE)

# Cache
CACHE <- FALSE
if (!CACHE) {
  f <- list.files("cache", include.dirs = F, full.names = T, recursive = T)
  file.remove(f)
}
## [1] TRUE TRUE
# Globals
N_STUDIES <- 10000

1 Simulation data and scenarios

Survival data for n_studies is first simulated and hazard ratios are estimated for each study. Six different scenarios are used to simulate the survival data:

  • S1: Toy example where every study has the same number of events and has the same true outcomes, i.e. no between study heterogeneity

  • S2: Toy example with variability in the numbers of events in studies, but no between study heterogeneity in true outcomes

  • S3: Example with between study variability in both the numbers of events and outcomes in all three arms

  • S4: A more realistic null-hypothesis situation where the true trt v ic HR is fixed at one for all studies, with varying outcome for the external control and varying numbers of events

  • S5: A version of res4 assuming an alternative hypothesis of HR = 0.5

  • S6: Similar to res3, with the underlying simulation truth being distributed around an alternative hypothesis of HR > 1

sim_scenarios <- function(n_studies = 10000) {
  ptm <- proc.time()
  
  s1 <- sim_survdata(
    nstudies = n_studies, 
    cv_trt = 0 , cv_ic = 0, cv_ec=0, 
    ne_trt = 100, ne_ec = 50, ne_ic = 70
  ) %>%
    estimate_hrs()
  cat("Simulated data for scenario 1 \n", file = OUTFILE, append = TRUE)
  
  s2 <- sim_survdata(
    nstudies = n_studies, 
    cv_trt = 0, cv_ic = 0, cv_ec = 0, 
    ne_trt = 250, ne_ec = 250, ne_ic = 250, 
    cvne_ic = 0.2, cvne_ec = 0.2 , cvne_trt = 0.2, 
    med_trt = 24, med_ic = 24, med_ec = 18
  ) %>%
    estimate_hrs()
  cat("Simulated data for scenario 2 \n", file = OUTFILE, append = TRUE)
  
  s3 <- sim_survdata(
    nstudies = n_studies, 
    cv_trt = 0.4, cv_ic = 0.2, cv_ec = 0.2, 
    ne_trt = 250, ne_ec = 250, ne_ic = 250, 
    cvne_ic = 0.2, cvne_ec = 0.2, cvne_trt = 0.2, 
    med_trt = 24, med_ic = 24, med_ec = 18
  ) %>%
    estimate_hrs()
  cat("Simulated data for scenario 3 \n", file = OUTFILE, append = TRUE)
  
  s4 <- sim_survdata(
    nstudies = n_studies, 
    cv_trt = 0.4, cv_ic = 0.2, cv_ec = 0.2, 
    ne_trt = 250, ne_ec = 250, ne_ic = 150, 
    cvne_ic = 0.2, cvne_ec = 0.2, cvne_trt = 0.2, 
    med_trt = 24, med_ic = 24 , med_ec = 18 ,
    randomisation_ratio = 1 , true_hr = 1 ) %>%
    estimate_hrs()
  cat("Simulated data for scenario 4 \n", file = OUTFILE, append = TRUE)
  
  s5 <- sim_survdata(
    nstudies = n_studies,
    cv_trt = 0.4, cv_ic = 0.2, cv_ec = 0.2, 
    ne_trt = 250, ne_ec = 250, ne_ic = 150, 
    cvne_ic = 0.2, cvne_ec = 0.2, cvne_trt = 0.2, 
    med_trt = 24, med_ic = 24, med_ec = 18,
    randomisation_ratio = 1, true_hr = 0.5 
  ) %>%
    estimate_hrs()
  cat("Simulated data for scenario 5 \n", file = OUTFILE, append = TRUE)
  
  s6 <- sim_survdata(
    nstudies = n_studies,
    cv_trt = 0.4, cv_ic = 0.2, cv_ec = 0.2, 
    ne_trt = 250, ne_ec = 250, ne_ic = 250, 
    cvne_ic = 0.2, cvne_ec = 0.2, cvne_trt = 0.2, 
    med_trt = 35, med_ic = 24, med_ec = 18,
    randomisation_ratio = 1
  ) %>%
    estimate_hrs()
  cat("Simulated data for scenario 6 \n", file = OUTFILE, append = TRUE)
  
  out <- list(s1 = s1, s2 = s2, s3 = s3, 
       s4 = s4, s5 = s5, s6 = s6)
  
  run_time <- proc.time() - ptm
  cat(
    paste0("Time to simulate data for all scenarios: ", run_time["elapsed"], 
           " seconds\n"), 
           file = OUTFILE, append = TRUE
  )
  attr(out, "run_time") <- run_time
  out
}
if (file.exists("cache/scenarios.RData")) {
  load("cache/scenarios.RData")
} else {
  scenarios <- sim_scenarios(n_studies = N_STUDIES)
  save(scenarios, file = "cache/scenarios.RData")
}

For each of the 6 scenarios, we visualize the variability in the number of events and hazard ratios across the simulated studies.

lapply(1:length(scenarios) , function(i) plot_scenarios(scenarios[[i]], names(scenarios)[i]))
## [[1]]
## [[1]]$nes

## 
## [[1]]$hrs

## 
## 
## [[2]]
## [[2]]$nes

## 
## [[2]]$hrs

## 
## 
## [[3]]
## [[3]]$nes

## 
## [[3]]$hrs

## 
## 
## [[4]]
## [[4]]$nes

## 
## [[4]]$hrs

## 
## 
## [[5]]
## [[5]]$nes

## 
## [[5]]$hrs

## 
## 
## [[6]]
## [[6]]$nes

## 
## [[6]]$hrs

2 Running the simulation

run_sims <- function() {
  ptm <- proc.time()
  
  out <- lapply(1:length(scenarios) , function(i) {
    cat(paste0("Starting simulation for scenario", i, " \n"), 
        file = OUTFILE, append = TRUE)
    lapply(4:9, run_sim, .res = scenarios[[i]]) %>%
      bind_rows() %>%
      mutate(
        scenario = names(scenarios)[i],
        constant_hr = sd(truth) < 1e-15
      )
    }) %>%
    bind_rows() 
  
  run_time <- proc.time() - ptm
    cat(
    paste0("Time to run simulation for all scenarios: ", run_time["elapsed"], 
           " seconds\n"), 
           file = OUTFILE, append = TRUE
  )
  attr(out, "run_time") <- run_time
  out
}
if (file.exists("cache/sims.rds")) {
  sims <- readRDS("cache/sims.rds")
} else {
  #sims_old <- run_sims_old()
  sims <- run_sims()
  saveRDS(sims, file = "cache/sims.rds")
}

3 Bias

plot_bias <- function(x) {
  x %>%
    ggplot() + 
    geom_boxplot(aes(x = n_ref , y = bias , group = n_ref , fill = n_ref)) + 
    geom_hline(aes(yintercept = 0)) +
    facet_wrap(~scenario) + 
    xlab("Number of reference studies") + 
    ylab("Bias of log HR") +
    theme(legend.position = "none")
}

3.1 Bayesian (half-Cauchy prior)

plot_bias(sims[sims$prior == "Half-Cauchy", ])

3.2 Comparison

plot_bias(sims[sims$prior == "Inverse gamma", ])

plot_bias(sims[sims$method == "Maximum likelihood", ])

4 Coverage

plot_coverage <- function(x) {
  x %>%
    group_by(n_ref, scenario, method) %>%
    summarise(covered95 = mean(covered95)) %>%
    ggplot(aes(x = n_ref, y = covered95, colour = scenario)) + 
    geom_hline(aes(yintercept = .95), lty = 2) +
    geom_line() +
    xlab("Number Of reference studies") + 
    ylab("Coverage (95%)") +
    scale_colour_discrete(name = "Scenario") 
}

4.1 Bayesian (half-Cauchy)

plot_coverage(sims[sims$prior == "Half-Cauchy", ])
## `summarise()` has grouped output by 'n_ref', 'scenario'. You can override using the `.groups` argument.

4.2 Comparison

plot_coverage(sims) + 
  facet_wrap(~method, nrow = 2)
## `summarise()` has grouped output by 'n_ref', 'scenario'. You can override using the `.groups` argument.

5 Power

plot_power <- function(x) {
  x %>%
    group_by(scenario, n_ref, method) %>%
    summarise(power = mean(sig)) %>%
    ggplot(aes(x = n_ref, y = power, colour = scenario)) + 
    geom_line() +
    xlab("Number of reference studies") + 
    ylab("Power") +
    geom_hline(yintercept = 0.025 , lty = 2) + 
    scale_colour_discrete("Scenario")
}
plot_power_by_hr <- function(x) {
  smooth_power <- x %>%
    filter(!constant_hr) %>%
    group_split(scenario, n_ref) %>%
    lapply(function(.sim) {
      power <- gam(sig ~ lo(truth) , family = "binomial" , data = .sim) %>%
        predict(type = "response")
      cbind(.sim , power)
    }) %>%
    bind_rows() %>%
    arrange(scenario , n_ref , truth) 

  ggplot(smooth_power, aes(x = truth, y = power, colour = factor(n_ref))) + 
    geom_line() +
    xlab("True Log HR (TRTvIC)") +
    ylab("Power") + 
    ylim(c(0, 1)) +
    geom_hline(yintercept = 0.025 , lty = 2) + 
    facet_wrap(~scenario) + 
    scale_colour_discrete("Number of\nreference\nstudies") 
}

5.1 Bayesian (half-Cauchy)

plot_power(sims[sims$prior == "Half-Cauchy", ])
## `summarise()` has grouped output by 'scenario', 'n_ref'. You can override using the `.groups` argument.

plot_power_by_hr(sims[sims$prior == "Half-Cauchy", ])

5.2 Comparison

plot_power(sims) + 
  facet_wrap(~method, nrow = 2)
## `summarise()` has grouped output by 'scenario', 'n_ref'. You can override using the `.groups` argument.