Use the matching candidate data from estimate_propensity_scores.Rmd
to find propensity score matches for the cohort of interest.
This file was compiled on 2022-05-27 16:33:53 by jsimmeri
on argon-lc-g14-35.hpc.
Now that I have estimated propensity scores, I need to generate the matches. I am impatient (recurring theme) and want to use parallel evaluation with parLapply()
to speed this up. While each individual matching job is single threaded, there is no reason I cannot run multiple threads at the same time for the different jobs.
The matching uses the function in match.cpp
. This simply loops over the treated candidates and finds the nearest reference candidate with similar duration of disease (+/- 180 days). If the difference in the log odds is less than 20% of the standard deviation of the pooled log odds, we keep this pairing. This is done in C++ as opposed to R just because it is way faster.
Here I define an R function ps_match()
that does the matching for us. We pass the function values for:
treatment
a string indicating the treated cohort (must be one of tz
, tam
, 5ari
, statin
)control
a string indicating the name of the control cohort (must be one of tam
, 5ari
, statin
or untx
(for untreated))seed
the seed to set the PRNG to in the cluster (we shuffle the data to handle tie breaking)ps_match <- function(treatment, control, seed, fu_width, pd_width) {
log_path <- glue::glue("~/pdd_matching_logs/{treatment}_{control}.txt")
if (file.exists(log_path)) {
file.remove(log_path)
}
cat(
glue::glue("{lubridate::now()} | Starting matching for {treatment} to {control} using seed = {seed}\n"),
file = log_path,
append = TRUE,
sep = "\n"
)
candidates <- read_rds(glue::glue("/Shared/lss_jsimmeri_backup/data/pdd/fitted_values/{treatment}_{control}.rds"))
set.seed(seed)
# shuffle since we will take the lowest index row in the event of a tie
# shuffling the data means that the lowest index row will be = to selecting
# at random from all ties (but much faster)
candidates <- candidates %>%
select(enrolid, treatment, follow_up_time, pd_days, prob) %>%
sample_frac(1)
cases <- candidates[candidates$treatment == 1, ]
controls <- candidates[candidates$treatment == 0, ]
cat(
glue::glue("{lubridate::now()} | Tx = {nrow(cases)} | Ref = {nrow(controls)}\n"),
file = log_path,
append = TRUE,
sep = "\n"
)
# we want to do our matching on the log odds, so convert from prob tx to
# the log odds of tx
case_ps <- log(cases$prob / (1 - cases$prob))
case_fu <- cases$follow_up_time
case_pd_days <- cases$pd_days
control_ps <- log(controls$prob / (1 - controls$prob))
control_fu <- controls$follow_up_time
control_pd_days <- controls$pd_days
caliper_std <- 0.2 * sd(c(case_ps, control_ps))
cat(
glue::glue("{lubridate::now()} | Matching Started \n"),
file = log_path,
append = TRUE,
sep = "\n"
)
pairs <- matchCpp(case_ps, case_fu, case_pd_days,
control_ps, control_fu, control_pd_days,
fu_width, pd_width, caliper_std)
cat(
glue::glue("{lubridate::now()} | Matching Complete \n"),
file = log_path,
append = TRUE,
sep = "\n"
)
# replace failure to match code (-1) with NA
pairs[pairs == -1] <- NA
matches <- tibble(
case_id = cases$enrolid,
control_id = controls$enrolid[pairs],
pair_id = seq(1, nrow(cases))
)
# remove failure to match
matches <- matches %>%
filter(!is.na(case_id), !is.na(control_id))
pct_matched <- round(nrow(matches) / nrow(cases) * 100)
cat(
glue::glue("{lubridate::now()} | Matched {nrow(matches)} ({pct_matched}%) \n"),
file = log_path,
append = TRUE,
sep = "\n"
)
matches <- matches %>%
gather(status, enrolid, -pair_id) %>%
mutate(treatment = ifelse(status == "case_id", 1, 0)) %>%
select(enrolid, treatment, pair_id)
write_rds(
matches,
glue::glue("/Shared/lss_jsimmeri_backup/data/pdd/matches/{treatment}_{control}.rds")
)
cat(
glue::glue("{lubridate::now()} | Complete \n"),
file = log_path,
append = TRUE,
sep = "\n"
)
return(matches)
}
This function needs a wrapper to embed all the arguments we want to pass to it in a single list:
par_match <- function(args, fu_width, pd_width) {
treatment <- args[[1]]
control <- args[[2]]
seed <- args[[3]]
matches <- ps_match(treatment, control, seed, fu_width, pd_width)
return(matches)
}
And the list we want to iterate over
Start the cluster and initialize it with the required functions and packages:
cl <- makeCluster(length(args_list))
clusterEvalQ(cl, library(tidyverse))
[[1]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[2]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[3]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
clusterEvalQ(cl, library(Rcpp))
[[1]]
[1] "Rcpp" "forcats" "stringr" "dplyr" "purrr"
[6] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
[11] "stats" "graphics" "grDevices" "utils" "datasets"
[16] "methods" "base"
[[2]]
[1] "Rcpp" "forcats" "stringr" "dplyr" "purrr"
[6] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
[11] "stats" "graphics" "grDevices" "utils" "datasets"
[16] "methods" "base"
[[3]]
[1] "Rcpp" "forcats" "stringr" "dplyr" "purrr"
[6] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
[11] "stats" "graphics" "grDevices" "utils" "datasets"
[16] "methods" "base"
clusterEvalQ(cl, library(RcppProgress))
[[1]]
[1] "RcppProgress" "Rcpp" "forcats" "stringr"
[5] "dplyr" "purrr" "readr" "tidyr"
[9] "tibble" "ggplot2" "tidyverse" "stats"
[13] "graphics" "grDevices" "utils" "datasets"
[17] "methods" "base"
[[2]]
[1] "RcppProgress" "Rcpp" "forcats" "stringr"
[5] "dplyr" "purrr" "readr" "tidyr"
[9] "tibble" "ggplot2" "tidyverse" "stats"
[13] "graphics" "grDevices" "utils" "datasets"
[17] "methods" "base"
[[3]]
[1] "RcppProgress" "Rcpp" "forcats" "stringr"
[5] "dplyr" "purrr" "readr" "tidyr"
[9] "tibble" "ggplot2" "tidyverse" "stats"
[13] "graphics" "grDevices" "utils" "datasets"
[17] "methods" "base"
clusterEvalQ(cl, sourceCpp("~/projects/pdd/matching/match.cpp"))
[[1]]
[[1]]$functions
[1] "matchCpp"
[[1]]$modules
character(0)
[[1]]$cppSourcePath
[1] "/Users/jsimmeri/projects/pdd/matching/match.cpp"
[[1]]$buildDirectory
[1] "/localscratch/8370878.1.CCOM/RtmpLx4rZE/sourceCpp-x86_64-pc-linux-gnu-1.0.6/sourcecpp_b80969ee2c16"
[[2]]
[[2]]$functions
[1] "matchCpp"
[[2]]$modules
character(0)
[[2]]$cppSourcePath
[1] "/Users/jsimmeri/projects/pdd/matching/match.cpp"
[[2]]$buildDirectory
[1] "/localscratch/8370878.1.CCOM/RtmpvDwYuQ/sourceCpp-x86_64-pc-linux-gnu-1.0.6/sourcecpp_b80b4b9b3fef"
[[3]]
[[3]]$functions
[1] "matchCpp"
[[3]]$modules
character(0)
[[3]]$cppSourcePath
[1] "/Users/jsimmeri/projects/pdd/matching/match.cpp"
[[3]]$buildDirectory
[1] "/localscratch/8370878.1.CCOM/Rtmpnh1pAI/sourceCpp-x86_64-pc-linux-gnu-1.0.6/sourcecpp_b80a33396dff"
clusterExport(cl, "ps_match")
Then do that parLapply()
magic:
parLapply(
cl,
args_list,
par_match,
fu_width = 365 * 100,
pd_width = 180
)
[[1]]
# A tibble: 1,508 x 3
enrolid treatment pair_id
<chr> <dbl> <int>
1 25581256101 1 3
2 973299001 1 6
3 208611401 1 7
4 627318901 1 8
5 1314653901 1 9
6 1559407501 1 10
7 1258245901 1 11
8 1847703901 1 12
9 2243496902 1 13
10 889533701 1 14
# … with 1,498 more rows
[[2]]
# A tibble: 950 x 3
enrolid treatment pair_id
<chr> <dbl> <int>
1 266928401 1 1
2 1843615701 1 2
3 626846801 1 4
4 730838702 1 5
5 1503240801 1 6
6 267096901 1 7
7 670730001 1 9
8 27392525901 1 10
9 955825101 1 12
10 25409621301 1 13
# … with 940 more rows
[[3]]
# A tibble: 1,224 x 3
enrolid treatment pair_id
<chr> <dbl> <int>
1 1121550301 1 1
2 602607701 1 3
3 27068777201 1 4
4 1825891301 1 5
5 659821601 1 7
6 3170679701 1 8
7 877553104 1 9
8 25635055701 1 10
9 893522101 1 11
10 568071401 1 12
# … with 1,214 more rows
stopCluster(cl)
R version 4.0.4 (2021-02-15)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04 LTS
Matrix products: default
BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] parallel stats graphics grDevices utils datasets
[7] methods base
other attached packages:
[1] RcppProgress_0.4.2 Rcpp_1.0.6 forcats_0.5.1
[4] stringr_1.4.0 dplyr_1.0.4 purrr_0.3.4
[7] readr_1.4.0 tidyr_1.1.2 tibble_3.0.6
[10] ggplot2_3.3.3 tidyverse_1.3.0
loaded via a namespace (and not attached):
[1] tidyselect_1.1.0 xfun_0.21 haven_2.3.1
[4] colorspace_2.0-0 vctrs_0.3.6 generics_0.1.0
[7] htmltools_0.5.1.1 yaml_2.2.1 utf8_1.1.4
[10] rlang_0.4.10 pillar_1.4.7 withr_2.4.1
[13] glue_1.4.2 DBI_1.1.1 dbplyr_2.1.0
[16] modelr_0.1.8 readxl_1.3.1 lifecycle_1.0.0
[19] munsell_0.5.0 gtable_0.3.0 cellranger_1.1.0
[22] rvest_0.3.6 evaluate_0.14 knitr_1.31
[25] ps_1.5.0 fansi_0.4.2 broom_0.7.4
[28] backports_1.2.1 scales_1.1.1 jsonlite_1.7.2
[31] fs_1.5.0 distill_1.2 hms_1.0.0
[34] digest_0.6.27 stringi_1.5.3 grid_4.0.4
[37] cli_2.3.0 tools_4.0.4 magrittr_2.0.1
[40] crayon_1.4.1 pkgconfig_2.0.3 downlit_0.2.1
[43] ellipsis_0.3.1 xml2_1.3.2 reprex_1.0.0
[46] lubridate_1.7.9.2 assertthat_0.2.1 rmarkdown_2.6
[49] httr_1.4.2 rstudioapi_0.13 R6_2.5.0
[52] compiler_4.0.4