Use the matching candidate data from fit_psm.Rmd
to find propensity score matches for the cohort of interest.
This file was compiled on 2022-04-27 17:24:06 by jsimmeri
on argon-itf-bx47-28.hpc.
Now that I have estimated propensity scores, I need to generate the matches. I am impatient (recurring theme) and want to use parallel evaluation with parLapply()
to speed this up. While each individual matching job is single threaded, there is no reason I cannot run multiple threads at the same time for the different jobs.
First, I load the function included in match.cpp
. This simply loops over the treated candidates and finds the nearest reference candidate with similar duration of follow-up (+/- 90 days). If the difference in the log odds is less than 20% of the standard deviation of the pooled log odds, we keep this pairing. This is done in C++ as opposed to R just because it is way faster.
sourceCpp("~/projects/tz-5ari-final/match.cpp")
I then define an R function ps_match()
that does the matching for us. We pass the function values for:
treatment
a string indicating the treated cohort (must be one of tz
, tam
or 5ari
)control
a string indicating the name of the control cohort (must be one of tam
, 5ari
)seed
the seed to set the PRNG to in the cluster (we shuffle the data to handle tie breaking)ps_match <- function(treatment, control, seed) {
log_path <- glue::glue("~/matching_logs/{treatment}_{control}.txt")
if (file.exists(log_path)) {
file.remove(log_path)
}
cat(
glue::glue("{lubridate::now()} | Starting matching for {treatment} to {control} using seed = {seed}\n"),
file = log_path,
append = TRUE,
sep = "\n"
)
candidates <- read_rds(glue::glue("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/matching_candidates/{treatment}_{control}.rds"))
set.seed(seed)
# shuffle since we will take the lowest index row in the event of a tie
# shuffling the data means that the lowest index row will be = to selecting
# at random from all ties (but much faster)
candidates <- candidates %>%
select(enrolid, treatment, follow_up_time, ptx) %>%
sample_frac(1)
cases <- candidates[candidates$treatment == 1, ]
controls <- candidates[candidates$treatment == 0, ]
cat(
glue::glue("{lubridate::now()} | Tx = {nrow(cases)} | Ref = {nrow(controls)}\n"),
file = log_path,
append = TRUE,
sep = "\n"
)
# we want to do our matching on the log odds, so convert from prob tx to
# the log odds of tx
case_ps <- log(cases$ptx / (1 - cases$ptx))
case_fu <- cases$follow_up_time
control_ps <- log(controls$ptx / (1 - controls$ptx))
control_fu <- controls$follow_up_time
caliper_std <- 0.2 * sd(c(case_ps, control_ps))
cat(
glue::glue("{lubridate::now()} | Matching Started \n"),
file = log_path,
append = TRUE,
sep = "\n"
)
pairs <- matchCpp(case_ps, case_fu, control_ps, control_fu, caliper_std)
cat(
glue::glue("{lubridate::now()} | Matching Complete \n"),
file = log_path,
append = TRUE,
sep = "\n"
)
# replace failure to match code (-1) with NA
pairs[pairs == -1] <- NA
matches <- tibble(
case_id = cases$enrolid,
control_id = controls$enrolid[pairs],
pair_id = seq(1, nrow(cases))
)
# remove failure to match
matches <- matches %>%
filter(!is.na(case_id), !is.na(control_id))
pct_matched <- round(nrow(matches) / nrow(cases) * 100)
cat(
glue::glue("{lubridate::now()} | Matched {nrow(matches)} ({pct_matched}%) \n"),
file = log_path,
append = TRUE,
sep = "\n"
)
matches <- matches %>%
gather(status, enrolid, -pair_id) %>%
mutate(treatment = ifelse(status == "case_id", 1, 0)) %>%
select(enrolid, treatment, pair_id)
write_rds(
matches,
glue::glue("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/matches/{treatment}_{control}.rds")
)
cat(
glue::glue("{lubridate::now()} | Complete \n"),
file = log_path,
append = TRUE,
sep = "\n"
)
return(0)
}
This function needs a wrapper to embed all the arguments we want to pass to it in a single list:
par_match <- function(args) {
treatment <- args[[1]]
control <- args[[2]]
seed <- args[[3]]
ps_match(treatment, control, seed)
return(0)
}
And the list we want to iterate over
Start the cluster and initalize it with the required functions and packages:
cl <- makeCluster(length(args_list))
clusterEvalQ(cl, library(tidyverse))
[[1]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[2]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[3]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
clusterEvalQ(cl, library(Rcpp))
[[1]]
[1] "Rcpp" "forcats" "stringr" "dplyr" "purrr"
[6] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
[11] "stats" "graphics" "grDevices" "utils" "datasets"
[16] "methods" "base"
[[2]]
[1] "Rcpp" "forcats" "stringr" "dplyr" "purrr"
[6] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
[11] "stats" "graphics" "grDevices" "utils" "datasets"
[16] "methods" "base"
[[3]]
[1] "Rcpp" "forcats" "stringr" "dplyr" "purrr"
[6] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
[11] "stats" "graphics" "grDevices" "utils" "datasets"
[16] "methods" "base"
clusterEvalQ(cl, library(RcppProgress))
[[1]]
[1] "RcppProgress" "Rcpp" "forcats" "stringr"
[5] "dplyr" "purrr" "readr" "tidyr"
[9] "tibble" "ggplot2" "tidyverse" "stats"
[13] "graphics" "grDevices" "utils" "datasets"
[17] "methods" "base"
[[2]]
[1] "RcppProgress" "Rcpp" "forcats" "stringr"
[5] "dplyr" "purrr" "readr" "tidyr"
[9] "tibble" "ggplot2" "tidyverse" "stats"
[13] "graphics" "grDevices" "utils" "datasets"
[17] "methods" "base"
[[3]]
[1] "RcppProgress" "Rcpp" "forcats" "stringr"
[5] "dplyr" "purrr" "readr" "tidyr"
[9] "tibble" "ggplot2" "tidyverse" "stats"
[13] "graphics" "grDevices" "utils" "datasets"
[17] "methods" "base"
clusterEvalQ(cl, sourceCpp("~/projects/tz-5ari-final/match.cpp"))
[[1]]
[[1]]$functions
[1] "matchCpp"
[[1]]$modules
character(0)
[[1]]$cppSourcePath
[1] "/Users/jsimmeri/projects/tz-5ari-final/match.cpp"
[[1]]$buildDirectory
[1] "/localscratch/8196067.1.UI/RtmpP6Hq28/sourceCpp-x86_64-pc-linux-gnu-1.0.6/sourcecpp_31e5824d5a4e5"
[[2]]
[[2]]$functions
[1] "matchCpp"
[[2]]$modules
character(0)
[[2]]$cppSourcePath
[1] "/Users/jsimmeri/projects/tz-5ari-final/match.cpp"
[[2]]$buildDirectory
[1] "/localscratch/8196067.1.UI/RtmppDu3ag/sourceCpp-x86_64-pc-linux-gnu-1.0.6/sourcecpp_31e592e1b27f2"
[[3]]
[[3]]$functions
[1] "matchCpp"
[[3]]$modules
character(0)
[[3]]$cppSourcePath
[1] "/Users/jsimmeri/projects/tz-5ari-final/match.cpp"
[[3]]$buildDirectory
[1] "/localscratch/8196067.1.UI/RtmpDio1al/sourceCpp-x86_64-pc-linux-gnu-1.0.6/sourcecpp_31e5a427aeaee"
clusterExport(cl, "ps_match")
Then do that parLapply()
magic:
x <- parLapply(
cl,
args_list,
par_match
)
stopCluster(cl)
R version 4.0.4 (2021-02-15)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04 LTS
Matrix products: default
BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] parallel stats graphics grDevices utils datasets
[7] methods base
other attached packages:
[1] RcppProgress_0.4.2 Rcpp_1.0.6 forcats_0.5.1
[4] stringr_1.4.0 dplyr_1.0.4 purrr_0.3.4
[7] readr_1.4.0 tidyr_1.1.2 tibble_3.0.6
[10] ggplot2_3.3.3 tidyverse_1.3.0
loaded via a namespace (and not attached):
[1] tidyselect_1.1.0 xfun_0.21 haven_2.3.1
[4] colorspace_2.0-0 vctrs_0.3.6 generics_0.1.0
[7] htmltools_0.5.1.1 yaml_2.2.1 rlang_0.4.10
[10] pillar_1.4.7 withr_2.4.1 glue_1.4.2
[13] DBI_1.1.1 dbplyr_2.1.0 modelr_0.1.8
[16] readxl_1.3.1 lifecycle_1.0.0 munsell_0.5.0
[19] gtable_0.3.0 cellranger_1.1.0 rvest_0.3.6
[22] evaluate_0.14 knitr_1.31 ps_1.5.0
[25] fansi_0.4.2 broom_0.7.4 backports_1.2.1
[28] scales_1.1.1 jsonlite_1.7.2 fs_1.5.0
[31] distill_1.2 hms_1.0.0 digest_0.6.27
[34] stringi_1.5.3 grid_4.0.4 cli_2.3.0
[37] tools_4.0.4 magrittr_2.0.1 crayon_1.4.1
[40] pkgconfig_2.0.3 downlit_0.2.1 ellipsis_0.3.1
[43] xml2_1.3.2 reprex_1.0.0 lubridate_1.7.9.2
[46] assertthat_0.2.1 rmarkdown_2.6 httr_1.4.2
[49] rstudioapi_0.13 R6_2.5.0 compiler_4.0.4