Put together the dataset for estimating the propensity score model to construct the matched cohorts. This includes removing combination users, those without lookback, those with PD at the index date, and calculating the variables used in the propensity score models.
This file was compiled on 2022-03-30 15:54:10 by jsimmeri
on argon-lc-f14-25.hpc.
Procedures can appear in three places in the Truven database: the facility procedure tables, the inpatient procedure tables, and the outpatient core tables.
I start by defining three functions, find_procs_facility(), find_procs_inpatient(), and find_procs_outpatient(), to find procedures in these three places.
#' Find facility claims carrying any of a set of procedure codes
#'
#' Opens the SQLite facilities database for `year`, keeps the rows of
#' `facility_proc_{source}_{year}` whose `proc` value is in `icd_codes`, and
#' returns the enrollee ID and service date of every matching row.
#'
#' @param source Source label interpolated into the table name (the argument
#'   list built later in this script uses "ccae" and "mdcr").
#' @param year Year suffix interpolated into both the database file name and
#'   the table name (this script uses two-character strings such as "05").
#' @param icd_codes Vector of procedure codes matched against `proc`.
#' @return A tibble with columns `enrolid` (cast to character) and `date`
#'   (the row's `svcdate`), one row per matching claim row.
find_procs_facility <- function(source, year, icd_codes) {
  facility_db <- DBI::dbConnect(
    RSQLite::SQLite(),
    glue::glue("/Shared/Statepi_Marketscan/databases/Truven/facilities_dbs/facilities_{year}.db")
  )
  # Register the disconnect immediately so the connection is released even
  # when the query below fails (e.g. the table does not exist).
  on.exit(DBI::dbDisconnect(facility_db), add = TRUE)

  tbl(facility_db, glue::glue("facility_proc_{source}_{year}")) %>%
    filter(proc %in% icd_codes) %>%
    select(enrolid, svcdate) %>%
    mutate(enrolid = as.character(enrolid)) %>%
    # collect() pulls the result into memory before on.exit() closes the DB
    collect() %>%
    rename(date = svcdate)
}
#' Find inpatient admissions carrying any of a set of procedure codes
#'
#' Works in two steps against the SQLite database for `year`:
#' 1. collect the `caseid` of every row in `inpatient_proc_{source}_{year}`
#'    whose `proc` is in `icd_codes` or `cpt_codes`;
#' 2. look those case IDs up in `inpatient_core_{source}_{year}` to get the
#'    enrollee ID and admission date.
#'
#' @param source Source label interpolated into the table names (the
#'   argument list built later in this script uses "ccae" and "mdcr").
#' @param year Year suffix interpolated into the database file name and the
#'   table names (this script uses two-character strings such as "05").
#' @param icd_codes,cpt_codes Vectors of procedure codes; they are combined
#'   and matched against `proc`.
#' @return A tibble with columns `enrolid` (cast to character) and `date`.
#'   Note that `date` is the admission date (`admdate`) of the matching case,
#'   not a procedure-level date.
find_procs_inpatient <- function(source, year, icd_codes, cpt_codes) {
  core_db <- DBI::dbConnect(
    RSQLite::SQLite(),
    glue::glue("/Shared/Statepi_Marketscan/databases/Truven/truven_{year}.db")
  )
  # Register the disconnect immediately so the connection is released even
  # when either query below fails.
  on.exit(DBI::dbDisconnect(core_db), add = TRUE)

  # Step 1: case IDs of inpatient stays with a matching procedure code
  cases <- tbl(core_db, glue::glue("inpatient_proc_{source}_{year}")) %>%
    filter(
      proc %in% c(icd_codes, cpt_codes)
    ) %>%
    select(caseid) %>%
    collect()

  # Step 2: enrollee and admission date for those cases. local() makes dbplyr
  # evaluate cases$caseid in R and inline the values into the query.
  tbl(core_db, glue::glue("inpatient_core_{source}_{year}")) %>%
    filter(caseid %in% local(cases$caseid)) %>%
    select(enrolid, admdate) %>%
    mutate(enrolid = as.character(enrolid)) %>%
    collect() %>%
    rename(date = admdate)
}
#' Find outpatient claims carrying any of a set of procedure codes
#'
#' Opens the SQLite database for `year`, keeps the rows of
#' `outpatient_core_{source}_{year}` whose `proc1` value is in `icd_codes` or
#' `cpt_codes`, and returns the enrollee ID and service date of each match.
#' Only the `proc1` column is inspected.
#'
#' @param source Source label interpolated into the table name (the argument
#'   list built later in this script uses "ccae" and "mdcr").
#' @param year Year suffix interpolated into the database file name and the
#'   table name (this script uses two-character strings such as "05").
#' @param icd_codes,cpt_codes Vectors of procedure codes; they are combined
#'   and matched against `proc1`.
#' @return A tibble with columns `enrolid` (cast to character) and `date`
#'   (the row's `svcdate`), one row per matching claim row.
find_procs_outpatient <- function(source, year, icd_codes, cpt_codes) {
  core_db <- DBI::dbConnect(
    RSQLite::SQLite(),
    glue::glue("/Shared/Statepi_Marketscan/databases/Truven/truven_{year}.db")
  )
  # Register the disconnect immediately so the connection is released even
  # when the query below fails.
  on.exit(DBI::dbDisconnect(core_db), add = TRUE)

  tbl(core_db, glue::glue("outpatient_core_{source}_{year}")) %>%
    filter(
      proc1 %in% c(icd_codes, cpt_codes)
    ) %>%
    select(enrolid, svcdate) %>%
    mutate(enrolid = as.character(enrolid)) %>%
    # collect() pulls the result into memory before on.exit() closes the DB
    collect() %>%
    rename(date = svcdate)
}
And then create a wrapper function that we can easily call with parLapply()
to handle the actual extraction:
#' Dispatch one extraction job to the matching table-specific finder
#'
#' @param args A length-3 vector or list: element 1 is the source, element 2
#'   the year, element 3 the table kind.
#' @param icd_codes Procedure codes forwarded to every finder.
#' @param cpt_codes Procedure codes forwarded to the inpatient and outpatient
#'   finders; the facility finder does not take them.
#' @return Whatever the selected finder returns. Any table kind other than
#'   "facility" or "inpatient" is handled by the outpatient finder.
find_procs <- function(args, icd_codes, cpt_codes) {
  data_source <- args[[1]]
  data_year <- args[[2]]
  table_kind <- args[[3]]

  if (table_kind == "facility") {
    return(find_procs_facility(data_source, data_year, icd_codes))
  }
  if (table_kind == "inpatient") {
    return(find_procs_inpatient(data_source, data_year, icd_codes, cpt_codes))
  }
  # Everything else falls through to the outpatient tables
  find_procs_outpatient(data_source, data_year, icd_codes, cpt_codes)
}
Start a cluster and export the functions:
# Launch 34 worker processes and attach the tidyverse on each of them so the
# exported extraction functions can use tbl(), filter(), %>% and friends.
cluster <- parallel::makeCluster(34)
parallel::clusterEvalQ(cluster, library(tidyverse))
[[1]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[2]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[3]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[4]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[5]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[6]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[7]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[8]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[9]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[10]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[11]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[12]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[13]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[14]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[15]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[16]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[17]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[18]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[19]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[20]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[21]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[22]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[23]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[24]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[25]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[26]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[27]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[28]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[29]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[30]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[31]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[32]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[33]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[34]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
# Copy the three table-specific finder functions to every worker
parallel::clusterExport(
  cl = cluster,
  varlist = c(
    "find_procs_facility",
    "find_procs_inpatient",
    "find_procs_outpatient"
  )
)
We then create a list with one element per combination of source, year, and table from which we want to extract. Note that there is no facilities table for 2001, so that combination is skipped for both sources.
# Build one c(source, year, table) triple per table to query.
# 17 years x 2 sources x 3 tables, minus the two year-"01" facility tables
# (one per source) that are skipped below.
args <- vector("list", 17 * 2 * 3 - 2)
i <- 1
# Years are zero-padded two-character strings: "01", "02", ..., "17"
for (year in sprintf("%02d", 1:17)) {
  for (source in c("ccae", "mdcr")) {
    for (table in c("facility", "inpatient", "outpatient")) {
      # Scalar condition, so use the short-circuiting && rather than &
      if (!(year == "01" && table == "facility")) {
        args[[i]] <- c(source, year, table)
        i <- i + 1
      }
    }
  }
}
Then using this list and find_procs()
, we want to find PSA measurement claims:
Uroflow measurement claims:
Cystometrogram claims:
Release the cluster’s resources and convert the lists returned by the cluster into tibbles:
# Release the worker processes
stopCluster(cluster)

# Collapse a list of result tibbles into a single tibble. Zero-row tibbles
# must be removed before binding, so each element's row count is computed and
# empty elements are filtered out prior to unnesting. enrolid is converted to
# numeric at the end.
stack_event_tibbles <- function(event_list) {
  nested <- enframe(event_list)
  nested <- mutate(nested, rows = purrr::map_dbl(value, nrow))
  non_empty <- filter(nested, rows > 0)
  stacked <- unnest(select(non_empty, value), cols = value)
  mutate(stacked, enrolid = as.numeric(enrolid))
}

psa_events <- stack_event_tibbles(psa_events)
uroflow_events <- stack_event_tibbles(uroflow_events)
cystometrogram_events <- stack_event_tibbles(cystometrogram_events)
Then write out the data for use in the propensity score fitting:
# Save each event table as its own .rds file
psa_path <- "/Shared/lss_jsimmeri_backup/data/tz-5ari-final/psa_procedures.rds"
readr::write_rds(psa_events, psa_path)

uroflow_path <- "/Shared/lss_jsimmeri_backup/data/tz-5ari-final/uroflow_procedures.rds"
readr::write_rds(uroflow_events, uroflow_path)

cystometrogram_path <- "/Shared/lss_jsimmeri_backup/data/tz-5ari-final/cystometrogram_procedures.rds"
readr::write_rds(cystometrogram_events, cystometrogram_path)
R version 4.0.4 (2021-02-15)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04 LTS
Matrix products: default
BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] parallel stats graphics grDevices utils datasets
[7] methods base
other attached packages:
[1] forcats_0.5.1 stringr_1.4.0 dplyr_1.0.4 purrr_0.3.4
[5] readr_1.4.0 tidyr_1.1.2 tibble_3.0.6 ggplot2_3.3.3
[9] tidyverse_1.3.0
loaded via a namespace (and not attached):
[1] tidyselect_1.1.0 xfun_0.21 haven_2.3.1
[4] colorspace_2.0-0 vctrs_0.3.6 generics_0.1.0
[7] htmltools_0.5.1.1 yaml_2.2.1 rlang_0.4.10
[10] pillar_1.4.7 withr_2.4.1 glue_1.4.2
[13] DBI_1.1.1 dbplyr_2.1.0 modelr_0.1.8
[16] readxl_1.3.1 lifecycle_1.0.0 munsell_0.5.0
[19] gtable_0.3.0 cellranger_1.1.0 rvest_0.3.6
[22] evaluate_0.14 knitr_1.31 ps_1.5.0
[25] fansi_0.4.2 broom_0.7.4 Rcpp_1.0.6
[28] backports_1.2.1 scales_1.1.1 jsonlite_1.7.2
[31] fs_1.5.0 distill_1.2 hms_1.0.0
[34] digest_0.6.27 stringi_1.5.3 grid_4.0.4
[37] cli_2.3.0 tools_4.0.4 magrittr_2.0.1
[40] crayon_1.4.1 pkgconfig_2.0.3 downlit_0.2.1
[43] ellipsis_0.3.1 xml2_1.3.2 reprex_1.0.0
[46] lubridate_1.7.9.2 assertthat_0.2.1 rmarkdown_2.6
[49] httr_1.4.2 rstudioapi_0.13 R6_2.5.0
[52] compiler_4.0.4