Generate Propensity Score Matches

This file was compiled on 2022-05-27 16:33:53 by jsimmeri on argon-lc-g14-35.hpc.

library(tidyverse)
library(parallel)
library(Rcpp)
library(RcppProgress)

Overview of Matching Approach

Now that I have estimated propensity scores, I need to generate the matches. I am impatient (recurring theme) and want to use parallel evaluation with parLapply() to speed this up. While each individual matching job is single threaded, there is no reason I cannot run multiple threads at the same time for the different jobs.

The matching uses the function in match.cpp. This simply loops over the treated candidates and finds the nearest reference candidate with similar duration of disease (+/- 180 days). If the difference in the log odds is less than 20% of the standard deviation of the pooled log odds, we keep this pairing. This is done in C++ as opposed to R just because it is way faster.

Here I define an R function ps_match() that does the matching for us. We pass the function values for:

ps_match <- function(treatment, control, seed, fu_width, pd_width) {
  log_path <- glue::glue("~/pdd_matching_logs/{treatment}_{control}.txt")

  if (file.exists(log_path)) {
    file.remove(log_path)
  }

  cat(
    glue::glue("{lubridate::now()} | Starting matching for {treatment} to {control} using seed = {seed}\n"), 
    file = log_path,
    append = TRUE,
    sep = "\n"
  )

  candidates <- read_rds(glue::glue("/Shared/lss_jsimmeri_backup/data/pdd/fitted_values/{treatment}_{control}.rds"))
  
  set.seed(seed)
  # shuffle since we will take the lowest index row in the event of a tie
  # shuffling the data means that the lowest index row will be = to selecting
  # at random from all ties (but much faster)
  candidates <- candidates %>%
    select(enrolid, treatment, follow_up_time, pd_days, prob) %>%
    sample_frac(1)

  cases <- candidates[candidates$treatment == 1, ]
  controls <- candidates[candidates$treatment == 0, ]

  cat(
    glue::glue("{lubridate::now()} | Tx = {nrow(cases)} | Ref = {nrow(controls)}\n"), 
    file = log_path,
    append = TRUE,
    sep = "\n"
  )
  
  # we want to do our matching on the log odds, so convert from prob tx to
  # the log odds of tx
  case_ps <- log(cases$prob / (1 - cases$prob))
  case_fu <- cases$follow_up_time
  case_pd_days <- cases$pd_days
  control_ps <- log(controls$prob / (1 - controls$prob))
  control_fu <- controls$follow_up_time
  control_pd_days <- controls$pd_days

  caliper_std <- 0.2 * sd(c(case_ps, control_ps))

  cat(
    glue::glue("{lubridate::now()} | Matching Started \n"), 
    file = log_path,
    append = TRUE,
    sep = "\n"
  )
  pairs <- matchCpp(case_ps, case_fu, case_pd_days, 
                    control_ps, control_fu, control_pd_days, 
                    fu_width, pd_width, caliper_std)
  cat(
    glue::glue("{lubridate::now()} | Matching Complete \n"), 
    file = log_path,
    append = TRUE,
    sep = "\n"
  )
  # replace failure to match code (-1) with NA
  pairs[pairs == -1] <- NA
  
  matches <- tibble(
    case_id = cases$enrolid,
    control_id = controls$enrolid[pairs],
    pair_id = seq(1, nrow(cases))
  )

  # remove failure to match
  matches <- matches %>%
    filter(!is.na(case_id), !is.na(control_id))
  
  pct_matched <- round(nrow(matches) / nrow(cases) * 100)

  cat(
    glue::glue("{lubridate::now()} | Matched {nrow(matches)} ({pct_matched}%) \n"), 
    file = log_path,
    append = TRUE,
    sep = "\n"
  )

  matches <- matches %>%
    gather(status, enrolid, -pair_id) %>%
    mutate(treatment = ifelse(status == "case_id", 1, 0)) %>%
    select(enrolid, treatment, pair_id)
  
  write_rds(
    matches,
    glue::glue("/Shared/lss_jsimmeri_backup/data/pdd/matches/{treatment}_{control}.rds")
  )
  cat(
    glue::glue("{lubridate::now()} | Complete \n"), 
    file = log_path,
    append = TRUE,
    sep = "\n"
  )
  return(matches)
}

Parallelization

Wrapper

This function needs a wrapper to embed all the arguments we want to pass to it in a single list:

par_match <- function(args, fu_width, pd_width) { treatment <- args[[1]] control <- args[[2]] seed <- args[[3]] matches <- ps_match(treatment, control, seed, fu_width, pd_width) return(matches) }

[[1]] [1] "forcats" "stringr" "dplyr" "purrr" "readr" [6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" [11] "graphics" "grDevices" "utils" "datasets" "methods" [16] "base" [[2]] [1] "forcats" "stringr" "dplyr" "purrr" "readr" [6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" [11] "graphics" "grDevices" "utils" "datasets" "methods" [16] "base" [[3]] [1] "forcats" "stringr" "dplyr" "purrr" "readr" [6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" [11] "graphics" "grDevices" "utils" "datasets" "methods" [16] "base"

[[1]] [1] "Rcpp" "forcats" "stringr" "dplyr" "purrr" [6] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" [11] "stats" "graphics" "grDevices" "utils" "datasets" [16] "methods" "base" [[2]] [1] "Rcpp" "forcats" "stringr" "dplyr" "purrr" [6] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" [11] "stats" "graphics" "grDevices" "utils" "datasets" [16] "methods" "base" [[3]] [1] "Rcpp" "forcats" "stringr" "dplyr" "purrr" [6] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" [11] "stats" "graphics" "grDevices" "utils" "datasets" [16] "methods" "base"

[[1]] [1] "RcppProgress" "Rcpp" "forcats" "stringr" [5] "dplyr" "purrr" "readr" "tidyr" [9] "tibble" "ggplot2" "tidyverse" "stats" [13] "graphics" "grDevices" "utils" "datasets" [17] "methods" "base" [[2]] [1] "RcppProgress" "Rcpp" "forcats" "stringr" [5] "dplyr" "purrr" "readr" "tidyr" [9] "tibble" "ggplot2" "tidyverse" "stats" [13] "graphics" "grDevices" "utils" "datasets" [17] "methods" "base" [[3]] [1] "RcppProgress" "Rcpp" "forcats" "stringr" [5] "dplyr" "purrr" "readr" "tidyr" [9] "tibble" "ggplot2" "tidyverse" "stats" [13] "graphics" "grDevices" "utils" "datasets" [17] "methods" "base"

[[1]] [[1]]$functions [1] "matchCpp" [[1]]$modules character(0) [[1]]$cppSourcePath [1] "/Users/jsimmeri/projects/pdd/matching/match.cpp" [[1]]$buildDirectory [1] "/localscratch/8370878.1.CCOM/RtmpLx4rZE/sourceCpp-x86_64-pc-linux-gnu-1.0.6/sourcecpp_b80969ee2c16" [[2]] [[2]]$functions [1] "matchCpp" [[2]]$modules character(0) [[2]]$cppSourcePath [1] "/Users/jsimmeri/projects/pdd/matching/match.cpp" [[2]]$buildDirectory [1] "/localscratch/8370878.1.CCOM/RtmpvDwYuQ/sourceCpp-x86_64-pc-linux-gnu-1.0.6/sourcecpp_b80b4b9b3fef" [[3]] [[3]]$functions [1] "matchCpp" [[3]]$modules character(0) [[3]]$cppSourcePath [1] "/Users/jsimmeri/projects/pdd/matching/match.cpp" [[3]]$buildDirectory [1] "/localscratch/8370878.1.CCOM/Rtmpnh1pAI/sourceCpp-x86_64-pc-linux-gnu-1.0.6/sourcecpp_b80a33396dff"

[[1]] # A tibble: 1,508 x 3 enrolid treatment pair_id <chr> <dbl> <int> 1 25581256101 1 3 2 973299001 1 6 3 208611401 1 7 4 627318901 1 8 5 1314653901 1 9 6 1559407501 1 10 7 1258245901 1 11 8 1847703901 1 12 9 2243496902 1 13 10 889533701 1 14 # … with 1,498 more rows [[2]] # A tibble: 950 x 3 enrolid treatment pair_id <chr> <dbl> <int> 1 266928401 1 1 2 1843615701 1 2 3 626846801 1 4 4 730838702 1 5 5 1503240801 1 6 6 267096901 1 7 7 670730001 1 9 8 27392525901 1 10 9 955825101 1 12 10 25409621301 1 13 # … with 940 more rows [[3]] # A tibble: 1,224 x 3 enrolid treatment pair_id <chr> <dbl> <int> 1 1121550301 1 1 2 602607701 1 3 3 27068777201 1 4 4 1825891301 1 5 5 659821601 1 7 6 3170679701 1 8 7 877553104 1 9 8 25635055701 1 10 9 893522101 1 11 10 568071401 1 12 # … with 1,214 more rows

Session Info

R version 4.0.4 (2021-02-15) Platform: x86_64-pc-linux-gnu (64-bit) Running under: Ubuntu 20.04 LTS Matrix products: default BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so locale: [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C [7] LC_PAPER=en_US.UTF-8 LC_NAME=C [9] LC_ADDRESS=C LC_TELEPHONE=C [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C attached base packages: [1] parallel stats graphics grDevices utils datasets [7] methods base other attached packages: [1] RcppProgress_0.4.2 Rcpp_1.0.6 forcats_0.5.1 [4] stringr_1.4.0 dplyr_1.0.4 purrr_0.3.4 [7] readr_1.4.0 tidyr_1.1.2 tibble_3.0.6 [10] ggplot2_3.3.3 tidyverse_1.3.0 loaded via a namespace (and not attached): [1] tidyselect_1.1.0 xfun_0.21 haven_2.3.1 [4] colorspace_2.0-0 vctrs_0.3.6 generics_0.1.0 [7] htmltools_0.5.1.1 yaml_2.2.1 utf8_1.1.4 [10] rlang_0.4.10 pillar_1.4.7 withr_2.4.1 [13] glue_1.4.2 DBI_1.1.1 dbplyr_2.1.0 [16] modelr_0.1.8 readxl_1.3.1 lifecycle_1.0.0 [19] munsell_0.5.0 gtable_0.3.0 cellranger_1.1.0 [22] rvest_0.3.6 evaluate_0.14 knitr_1.31 [25] ps_1.5.0 fansi_0.4.2 broom_0.7.4 [28] backports_1.2.1 scales_1.1.1 jsonlite_1.7.2 [31] fs_1.5.0 distill_1.2 hms_1.0.0 [34] digest_0.6.27 stringi_1.5.3 grid_4.0.4 [37] cli_2.3.0 tools_4.0.4 magrittr_2.0.1 [40] crayon_1.4.1 pkgconfig_2.0.3 downlit_0.2.1 [43] ellipsis_0.3.1 xml2_1.3.2 reprex_1.0.0 [46] lubridate_1.7.9.2 assertthat_0.2.1 rmarkdown_2.6 [49] httr_1.4.2 rstudioapi_0.13 R6_2.5.0 [52] compiler_4.0.4

Generate Propensity Score Matches

Author

Affiliation

Published

DOI

Overview of Matching Approach

Parallelization

Wrapper

Define Cluster

Generate Matches

Session Info

Footnotes