Use the matching candidate data from estimate_propensity_scores.Rmd to find propensity score matches for the cohort of interest.
This file was compiled on 2022-05-27 16:33:53 by jsimmeri on argon-lc-g14-35.hpc.
Now that I have estimated propensity scores, I need to generate the matches. I am impatient (recurring theme) and want to use parallel evaluation with parLapply() to speed this up. While each individual matching job is single threaded, there is no reason I cannot run multiple threads at the same time for the different jobs.
The matching uses the function in match.cpp. This simply loops over the treated candidates and finds the nearest reference candidate with similar duration of disease (+/- 180 days). If the difference in the log odds is less than 20% of the standard deviation of the pooled log odds, we keep this pairing. This is done in C++ as opposed to R just because it is way faster.
Here I define an R function ps_match() that does the matching for us. We pass the function values for:
treatment a string indicating the treated cohort (must be one of tz, tam, 5ari, statin)control a string indicating the name of the control cohort (must be one of tam, 5ari, statin or untx (for untreated))seed the seed to set the PRNG to in the cluster (we shuffle the data to handle tie breaking)ps_match <- function(treatment, control, seed, fu_width, pd_width) {
log_path <- glue::glue("~/pdd_matching_logs/{treatment}_{control}.txt")
if (file.exists(log_path)) {
file.remove(log_path)
}
cat(
glue::glue("{lubridate::now()} | Starting matching for {treatment} to {control} using seed = {seed}\n"),
file = log_path,
append = TRUE,
sep = "\n"
)
candidates <- read_rds(glue::glue("/Shared/lss_jsimmeri_backup/data/pdd/fitted_values/{treatment}_{control}.rds"))
set.seed(seed)
# shuffle since we will take the lowest index row in the event of a tie
# shuffling the data means that the lowest index row will be = to selecting
# at random from all ties (but much faster)
candidates <- candidates %>%
select(enrolid, treatment, follow_up_time, pd_days, prob) %>%
sample_frac(1)
cases <- candidates[candidates$treatment == 1, ]
controls <- candidates[candidates$treatment == 0, ]
cat(
glue::glue("{lubridate::now()} | Tx = {nrow(cases)} | Ref = {nrow(controls)}\n"),
file = log_path,
append = TRUE,
sep = "\n"
)
# we want to do our matching on the log odds, so convert from prob tx to
# the log odds of tx
case_ps <- log(cases$prob / (1 - cases$prob))
case_fu <- cases$follow_up_time
case_pd_days <- cases$pd_days
control_ps <- log(controls$prob / (1 - controls$prob))
control_fu <- controls$follow_up_time
control_pd_days <- controls$pd_days
caliper_std <- 0.2 * sd(c(case_ps, control_ps))
cat(
glue::glue("{lubridate::now()} | Matching Started \n"),
file = log_path,
append = TRUE,
sep = "\n"
)
pairs <- matchCpp(case_ps, case_fu, case_pd_days,
control_ps, control_fu, control_pd_days,
fu_width, pd_width, caliper_std)
cat(
glue::glue("{lubridate::now()} | Matching Complete \n"),
file = log_path,
append = TRUE,
sep = "\n"
)
# replace failure to match code (-1) with NA
pairs[pairs == -1] <- NA
matches <- tibble(
case_id = cases$enrolid,
control_id = controls$enrolid[pairs],
pair_id = seq(1, nrow(cases))
)
# remove failure to match
matches <- matches %>%
filter(!is.na(case_id), !is.na(control_id))
pct_matched <- round(nrow(matches) / nrow(cases) * 100)
cat(
glue::glue("{lubridate::now()} | Matched {nrow(matches)} ({pct_matched}%) \n"),
file = log_path,
append = TRUE,
sep = "\n"
)
matches <- matches %>%
gather(status, enrolid, -pair_id) %>%
mutate(treatment = ifelse(status == "case_id", 1, 0)) %>%
select(enrolid, treatment, pair_id)
write_rds(
matches,
glue::glue("/Shared/lss_jsimmeri_backup/data/pdd/matches/{treatment}_{control}.rds")
)
cat(
glue::glue("{lubridate::now()} | Complete \n"),
file = log_path,
append = TRUE,
sep = "\n"
)
return(matches)
}
This function needs a wrapper to embed all the arguments we want to pass to it in a single list:
par_match <- function(args, fu_width, pd_width) {
treatment <- args[[1]]
control <- args[[2]]
seed <- args[[3]]
matches <- ps_match(treatment, control, seed, fu_width, pd_width)
return(matches)
}
And the list we want to iterate over
set.seed(8625696)
seeds <- sample(1:10000000, 3)
args_list <- list(
list("tz", "tam", seeds[1]),
list("tz", "5ari", seeds[2]),
list("tam", "5ari", seeds[3])
)
Start the cluster and initialize it with the required functions and packages:
cl <- makeCluster(length(args_list))
clusterEvalQ(cl, library(tidyverse))
[[1]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[2]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[3]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
clusterEvalQ(cl, library(Rcpp))
[[1]]
[1] "Rcpp" "forcats" "stringr" "dplyr" "purrr"
[6] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
[11] "stats" "graphics" "grDevices" "utils" "datasets"
[16] "methods" "base"
[[2]]
[1] "Rcpp" "forcats" "stringr" "dplyr" "purrr"
[6] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
[11] "stats" "graphics" "grDevices" "utils" "datasets"
[16] "methods" "base"
[[3]]
[1] "Rcpp" "forcats" "stringr" "dplyr" "purrr"
[6] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
[11] "stats" "graphics" "grDevices" "utils" "datasets"
[16] "methods" "base"
clusterEvalQ(cl, library(RcppProgress))
[[1]]
[1] "RcppProgress" "Rcpp" "forcats" "stringr"
[5] "dplyr" "purrr" "readr" "tidyr"
[9] "tibble" "ggplot2" "tidyverse" "stats"
[13] "graphics" "grDevices" "utils" "datasets"
[17] "methods" "base"
[[2]]
[1] "RcppProgress" "Rcpp" "forcats" "stringr"
[5] "dplyr" "purrr" "readr" "tidyr"
[9] "tibble" "ggplot2" "tidyverse" "stats"
[13] "graphics" "grDevices" "utils" "datasets"
[17] "methods" "base"
[[3]]
[1] "RcppProgress" "Rcpp" "forcats" "stringr"
[5] "dplyr" "purrr" "readr" "tidyr"
[9] "tibble" "ggplot2" "tidyverse" "stats"
[13] "graphics" "grDevices" "utils" "datasets"
[17] "methods" "base"
clusterEvalQ(cl, sourceCpp("~/projects/pdd/matching/match.cpp"))
[[1]]
[[1]]$functions
[1] "matchCpp"
[[1]]$modules
character(0)
[[1]]$cppSourcePath
[1] "/Users/jsimmeri/projects/pdd/matching/match.cpp"
[[1]]$buildDirectory
[1] "/localscratch/8370878.1.CCOM/RtmpLx4rZE/sourceCpp-x86_64-pc-linux-gnu-1.0.6/sourcecpp_b80969ee2c16"
[[2]]
[[2]]$functions
[1] "matchCpp"
[[2]]$modules
character(0)
[[2]]$cppSourcePath
[1] "/Users/jsimmeri/projects/pdd/matching/match.cpp"
[[2]]$buildDirectory
[1] "/localscratch/8370878.1.CCOM/RtmpvDwYuQ/sourceCpp-x86_64-pc-linux-gnu-1.0.6/sourcecpp_b80b4b9b3fef"
[[3]]
[[3]]$functions
[1] "matchCpp"
[[3]]$modules
character(0)
[[3]]$cppSourcePath
[1] "/Users/jsimmeri/projects/pdd/matching/match.cpp"
[[3]]$buildDirectory
[1] "/localscratch/8370878.1.CCOM/Rtmpnh1pAI/sourceCpp-x86_64-pc-linux-gnu-1.0.6/sourcecpp_b80a33396dff"
clusterExport(cl, "ps_match")
Then do that parLapply() magic:
parLapply(
cl,
args_list,
par_match,
fu_width = 365 * 100,
pd_width = 180
)
[[1]]
# A tibble: 1,508 x 3
enrolid treatment pair_id
<chr> <dbl> <int>
1 25581256101 1 3
2 973299001 1 6
3 208611401 1 7
4 627318901 1 8
5 1314653901 1 9
6 1559407501 1 10
7 1258245901 1 11
8 1847703901 1 12
9 2243496902 1 13
10 889533701 1 14
# … with 1,498 more rows
[[2]]
# A tibble: 950 x 3
enrolid treatment pair_id
<chr> <dbl> <int>
1 266928401 1 1
2 1843615701 1 2
3 626846801 1 4
4 730838702 1 5
5 1503240801 1 6
6 267096901 1 7
7 670730001 1 9
8 27392525901 1 10
9 955825101 1 12
10 25409621301 1 13
# … with 940 more rows
[[3]]
# A tibble: 1,224 x 3
enrolid treatment pair_id
<chr> <dbl> <int>
1 1121550301 1 1
2 602607701 1 3
3 27068777201 1 4
4 1825891301 1 5
5 659821601 1 7
6 3170679701 1 8
7 877553104 1 9
8 25635055701 1 10
9 893522101 1 11
10 568071401 1 12
# … with 1,214 more rows
stopCluster(cl)
R version 4.0.4 (2021-02-15)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04 LTS
Matrix products: default
BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] parallel stats graphics grDevices utils datasets
[7] methods base
other attached packages:
[1] RcppProgress_0.4.2 Rcpp_1.0.6 forcats_0.5.1
[4] stringr_1.4.0 dplyr_1.0.4 purrr_0.3.4
[7] readr_1.4.0 tidyr_1.1.2 tibble_3.0.6
[10] ggplot2_3.3.3 tidyverse_1.3.0
loaded via a namespace (and not attached):
[1] tidyselect_1.1.0 xfun_0.21 haven_2.3.1
[4] colorspace_2.0-0 vctrs_0.3.6 generics_0.1.0
[7] htmltools_0.5.1.1 yaml_2.2.1 utf8_1.1.4
[10] rlang_0.4.10 pillar_1.4.7 withr_2.4.1
[13] glue_1.4.2 DBI_1.1.1 dbplyr_2.1.0
[16] modelr_0.1.8 readxl_1.3.1 lifecycle_1.0.0
[19] munsell_0.5.0 gtable_0.3.0 cellranger_1.1.0
[22] rvest_0.3.6 evaluate_0.14 knitr_1.31
[25] ps_1.5.0 fansi_0.4.2 broom_0.7.4
[28] backports_1.2.1 scales_1.1.1 jsonlite_1.7.2
[31] fs_1.5.0 distill_1.2 hms_1.0.0
[34] digest_0.6.27 stringi_1.5.3 grid_4.0.4
[37] cli_2.3.0 tools_4.0.4 magrittr_2.0.1
[40] crayon_1.4.1 pkgconfig_2.0.3 downlit_0.2.1
[43] ellipsis_0.3.1 xml2_1.3.2 reprex_1.0.0
[46] lubridate_1.7.9.2 assertthat_0.2.1 rmarkdown_2.6
[49] httr_1.4.2 rstudioapi_0.13 R6_2.5.0
[52] compiler_4.0.4