Generate Propensity Score Matches

This file was compiled on 2022-04-27 17:24:06 by jsimmeri on argon-itf-bx47-28.hpc.

library(tidyverse)
library(parallel)
library(Rcpp)
library(RcppProgress)

Overview of Matching Approach

Now that I have estimated propensity scores, I need to generate the matches. I am impatient (recurring theme) and want to use parallel evaluation with parLapply() to speed this up. While each individual matching job is single threaded, there is no reason I cannot run multiple threads at the same time for the different jobs.

First, I load the function included in match.cpp. This simply loops over the treated candidates and finds the nearest reference candidate with similar duration of follow-up (+/- 90 days). If the difference in the log odds is less than 20% of the standard deviation of the pooled log odds, we keep this pairing. This is done in C++ as opposed to R just because it is way faster.

sourceCpp("~/projects/tz-5ari-final/match.cpp")

I then define an R function ps_match() that does the matching for us. We pass the function values for:

treatment a string indicating the treated cohort (must be one of tz, tam or 5ari)
control a string indicating the name of the control cohort (must be one of tam, 5ari)
seed the seed to set the PRNG to in the cluster (we shuffle the data to handle tie breaking)

ps_match <- function(treatment, control, seed) {
  log_path <- glue::glue("~/matching_logs/{treatment}_{control}.txt")

  if (file.exists(log_path)) {
    file.remove(log_path)
  }

  cat(
    glue::glue("{lubridate::now()} | Starting matching for {treatment} to {control} using seed = {seed}\n"), 
    file = log_path,
    append = TRUE,
    sep = "\n"
  )

  candidates <- read_rds(glue::glue("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/matching_candidates/{treatment}_{control}.rds"))
  
  set.seed(seed)
  # shuffle since we will take the lowest index row in the event of a tie
  # shuffling the data means that the lowest index row will be = to selecting
  # at random from all ties (but much faster)
  candidates <- candidates %>%
    select(enrolid, treatment, follow_up_time, ptx) %>%
    sample_frac(1)

  cases <- candidates[candidates$treatment == 1, ]
  controls <- candidates[candidates$treatment == 0, ]

  cat(
    glue::glue("{lubridate::now()} | Tx = {nrow(cases)} | Ref = {nrow(controls)}\n"), 
    file = log_path,
    append = TRUE,
    sep = "\n"
  )
  
  # we want to do our matching on the log odds, so convert from prob tx to
  # the log odds of tx
  case_ps <- log(cases$ptx / (1 - cases$ptx))
  case_fu <- cases$follow_up_time
  control_ps <- log(controls$ptx / (1 - controls$ptx))
  control_fu <- controls$follow_up_time

  caliper_std <- 0.2 * sd(c(case_ps, control_ps))

  cat(
    glue::glue("{lubridate::now()} | Matching Started \n"), 
    file = log_path,
    append = TRUE,
    sep = "\n"
  )
  pairs <- matchCpp(case_ps, case_fu, control_ps, control_fu, caliper_std)
  cat(
    glue::glue("{lubridate::now()} | Matching Complete \n"), 
    file = log_path,
    append = TRUE,
    sep = "\n"
  )
  # replace failure to match code (-1) with NA
  pairs[pairs == -1] <- NA
  
  matches <- tibble(
    case_id = cases$enrolid,
    control_id = controls$enrolid[pairs],
    pair_id = seq(1, nrow(cases))
  )

  
  # remove failure to match
  matches <- matches %>%
    filter(!is.na(case_id), !is.na(control_id))
  
  pct_matched <- round(nrow(matches) / nrow(cases) * 100)

  cat(
    glue::glue("{lubridate::now()} | Matched {nrow(matches)} ({pct_matched}%) \n"), 
    file = log_path,
    append = TRUE,
    sep = "\n"
  )

  matches <- matches %>%
    gather(status, enrolid, -pair_id) %>%
    mutate(treatment = ifelse(status == "case_id", 1, 0)) %>%
    select(enrolid, treatment, pair_id)
  
  write_rds(
    matches,
    glue::glue("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/matches/{treatment}_{control}.rds")
  )
  cat(
    glue::glue("{lubridate::now()} | Complete \n"), 
    file = log_path,
    append = TRUE,
    sep = "\n"
  )
  return(0)
}

Parallelization

Wrapper

This function needs a wrapper to embed all the arguments we want to pass to it in a single list:

par_match <- function(args) {
  treatment <- args[[1]]
  control <- args[[2]]
  seed <- args[[3]]
  ps_match(treatment, control, seed)
  return(0)
}

And the list we want to iterate over

seeds <- sample(1:10000000, 3)

args_list <- list(
  list("tz", "tam", seeds[1]),
  list("tz", "5ari", seeds[2]),
  list("tam", "5ari", seeds[3])
)

Define Cluster

Start the cluster and initalize it with the required functions and packages:

cl <- makeCluster(length(args_list))
clusterEvalQ(cl, library(tidyverse))

[[1]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[[2]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[[3]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"

clusterEvalQ(cl, library(Rcpp))

[[1]]
 [1] "Rcpp"      "forcats"   "stringr"   "dplyr"     "purrr"    
 [6] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse"
[11] "stats"     "graphics"  "grDevices" "utils"     "datasets" 
[16] "methods"   "base"     

[[2]]
 [1] "Rcpp"      "forcats"   "stringr"   "dplyr"     "purrr"    
 [6] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse"
[11] "stats"     "graphics"  "grDevices" "utils"     "datasets" 
[16] "methods"   "base"     

[[3]]
 [1] "Rcpp"      "forcats"   "stringr"   "dplyr"     "purrr"    
 [6] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse"
[11] "stats"     "graphics"  "grDevices" "utils"     "datasets" 
[16] "methods"   "base"

clusterEvalQ(cl, library(RcppProgress))

[[1]]
 [1] "RcppProgress" "Rcpp"         "forcats"      "stringr"     
 [5] "dplyr"        "purrr"        "readr"        "tidyr"       
 [9] "tibble"       "ggplot2"      "tidyverse"    "stats"       
[13] "graphics"     "grDevices"    "utils"        "datasets"    
[17] "methods"      "base"        

[[2]]
 [1] "RcppProgress" "Rcpp"         "forcats"      "stringr"     
 [5] "dplyr"        "purrr"        "readr"        "tidyr"       
 [9] "tibble"       "ggplot2"      "tidyverse"    "stats"       
[13] "graphics"     "grDevices"    "utils"        "datasets"    
[17] "methods"      "base"        

[[3]]
 [1] "RcppProgress" "Rcpp"         "forcats"      "stringr"     
 [5] "dplyr"        "purrr"        "readr"        "tidyr"       
 [9] "tibble"       "ggplot2"      "tidyverse"    "stats"       
[13] "graphics"     "grDevices"    "utils"        "datasets"    
[17] "methods"      "base"

clusterEvalQ(cl, sourceCpp("~/projects/tz-5ari-final/match.cpp"))

[[1]]
[[1]]$functions
[1] "matchCpp"

[[1]]$modules
character(0)

[[1]]$cppSourcePath
[1] "/Users/jsimmeri/projects/tz-5ari-final/match.cpp"

[[1]]$buildDirectory
[1] "/localscratch/8196067.1.UI/RtmpP6Hq28/sourceCpp-x86_64-pc-linux-gnu-1.0.6/sourcecpp_31e5824d5a4e5"


[[2]]
[[2]]$functions
[1] "matchCpp"

[[2]]$modules
character(0)

[[2]]$cppSourcePath
[1] "/Users/jsimmeri/projects/tz-5ari-final/match.cpp"

[[2]]$buildDirectory
[1] "/localscratch/8196067.1.UI/RtmppDu3ag/sourceCpp-x86_64-pc-linux-gnu-1.0.6/sourcecpp_31e592e1b27f2"


[[3]]
[[3]]$functions
[1] "matchCpp"

[[3]]$modules
character(0)

[[3]]$cppSourcePath
[1] "/Users/jsimmeri/projects/tz-5ari-final/match.cpp"

[[3]]$buildDirectory
[1] "/localscratch/8196067.1.UI/RtmpDio1al/sourceCpp-x86_64-pc-linux-gnu-1.0.6/sourcecpp_31e5a427aeaee"

clusterExport(cl, "ps_match")

Generate Matches

Then do that parLapply() magic:

x <- parLapply(
  cl,
  args_list,
  par_match
)

stopCluster(cl)

Session Info

sessionInfo()

R version 4.0.4 (2021-02-15)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04 LTS

Matrix products: default
BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C             
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets 
[7] methods   base     

other attached packages:
 [1] RcppProgress_0.4.2 Rcpp_1.0.6         forcats_0.5.1     
 [4] stringr_1.4.0      dplyr_1.0.4        purrr_0.3.4       
 [7] readr_1.4.0        tidyr_1.1.2        tibble_3.0.6      
[10] ggplot2_3.3.3      tidyverse_1.3.0   

loaded via a namespace (and not attached):
 [1] tidyselect_1.1.0  xfun_0.21         haven_2.3.1      
 [4] colorspace_2.0-0  vctrs_0.3.6       generics_0.1.0   
 [7] htmltools_0.5.1.1 yaml_2.2.1        rlang_0.4.10     
[10] pillar_1.4.7      withr_2.4.1       glue_1.4.2       
[13] DBI_1.1.1         dbplyr_2.1.0      modelr_0.1.8     
[16] readxl_1.3.1      lifecycle_1.0.0   munsell_0.5.0    
[19] gtable_0.3.0      cellranger_1.1.0  rvest_0.3.6      
[22] evaluate_0.14     knitr_1.31        ps_1.5.0         
[25] fansi_0.4.2       broom_0.7.4       backports_1.2.1  
[28] scales_1.1.1      jsonlite_1.7.2    fs_1.5.0         
[31] distill_1.2       hms_1.0.0         digest_0.6.27    
[34] stringi_1.5.3     grid_4.0.4        cli_2.3.0        
[37] tools_4.0.4       magrittr_2.0.1    crayon_1.4.1     
[40] pkgconfig_2.0.3   downlit_0.2.1     ellipsis_0.3.1   
[43] xml2_1.3.2        reprex_1.0.0      lubridate_1.7.9.2
[46] assertthat_0.2.1  rmarkdown_2.6     httr_1.4.2       
[49] rstudioapi_0.13   R6_2.5.0          compiler_4.0.4