Build Dataset for Propensity Score Matching for Treated Cohort

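Put together the dataset for estimating the propensity score model to construct the matched cohorts. This includes removing combination users, those without lookback, those with PD at the index date, and calculating the variables used in the propensity score models. This file handles one piece of that work: extracting PSA, uroflow, and cystometrogram procedure claims from the Truven database and saving them for use in the propensity score fitting.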

Jacob Simmering, PhD (https://jacobsimmering.com), University of Iowa (https://uiowa.edu)
2022-03-30

This file was compiled on 2022-03-30 15:54:10 by jsimmeri on argon-lc-f14-25.hpc.

Procedures can appear in three places in the Truven database:

  1. The facilities table: This reflects bills from facilities (e.g., hospitals) for service. Procedures are encoded using ICD-9 and ICD-10 procedure codes.
  2. The inpatient procedures table: These are largely encoded using either ICD-9, ICD-10, or CPT procedure codes.
  3. The outpatients table: Like the inpatient table, these are either ICD-9, ICD-10, or CPT procedure codes.

I start by defining three functions, find_procs_facility(), find_procs_inpatient(), and find_procs_outpatient(), to find procedures in these three places.

# Find facility-table records whose procedure code is one of `icd_codes`.
#
# Arguments:
#   source     Label interpolated into the table name
#              (`facility_proc_{source}_{year}`).
#   year       Year label interpolated into both the database file name and
#              the table name.
#   icd_codes  Vector of ICD procedure codes matched against the `proc` column.
#
# Returns a tibble with one row per matching record: `enrolid` (cast to
# character) and `date` (the record's `svcdate`).
find_procs_facility <- function(source, year, icd_codes) {
  facility_db <- DBI::dbConnect(
    RSQLite::SQLite(),
    glue::glue("/Shared/Statepi_Marketscan/databases/Truven/facilities_dbs/facilities_{year}.db")
  )
  # Close the connection even if the query below raises an error; otherwise a
  # failed job would leave the connection open on the worker.
  on.exit(DBI::dbDisconnect(facility_db), add = TRUE)

  tbl(facility_db, glue::glue("facility_proc_{source}_{year}")) %>%
    filter(proc %in% icd_codes) %>%
    select(enrolid, svcdate) %>%
    mutate(enrolid = as.character(enrolid)) %>%
    collect() %>%
    rename(date = svcdate)
}
# Find inpatient stays with a procedure code in `icd_codes` or `cpt_codes`.
#
# The procedure table only identifies the stay (`caseid`), so this is a
# two-step lookup: first find the matching case ids in
# `inpatient_proc_{source}_{year}`, then pull the enrollee and admission date
# for those cases from `inpatient_core_{source}_{year}`.
#
# Arguments:
#   source     Label interpolated into the table names.
#   year       Year label interpolated into the database file name and the
#              table names.
#   icd_codes  Vector of ICD procedure codes (may be NULL).
#   cpt_codes  Vector of CPT procedure codes (may be NULL).
#
# Returns a tibble with `enrolid` (cast to character) and `date`. Note that
# `date` is the admission date (`admdate`) of the stay, not the date of the
# procedure itself.
find_procs_inpatient <- function(source, year, icd_codes, cpt_codes) {
  core_db <- DBI::dbConnect(
    RSQLite::SQLite(),
    glue::glue("/Shared/Statepi_Marketscan/databases/Truven/truven_{year}.db")
  )
  # Close the connection even if one of the queries below raises an error.
  on.exit(DBI::dbDisconnect(core_db), add = TRUE)

  # A stay can carry several matching procedure rows. Only the set of case
  # ids matters for the lookup below, so de-duplicate in the database to keep
  # the inlined IN (...) list as short as possible.
  cases <- tbl(core_db, glue::glue("inpatient_proc_{source}_{year}")) %>%
    filter(proc %in% c(icd_codes, cpt_codes)) %>%
    select(caseid) %>%
    distinct() %>%
    collect()

  # local() forces cases$caseid to be evaluated in R and sent as literal
  # values rather than being translated to SQL.
  tbl(core_db, glue::glue("inpatient_core_{source}_{year}")) %>%
    filter(caseid %in% local(cases$caseid)) %>%
    select(enrolid, admdate) %>%
    mutate(enrolid = as.character(enrolid)) %>%
    collect() %>%
    rename(date = admdate)
}
# Find outpatient claims whose procedure code is in `icd_codes` or `cpt_codes`.
#
# Only the `proc1` column of `outpatient_core_{source}_{year}` is checked.
#
# Arguments:
#   source     Label interpolated into the table name.
#   year       Year label interpolated into the database file name and the
#              table name.
#   icd_codes  Vector of ICD procedure codes (may be NULL).
#   cpt_codes  Vector of CPT procedure codes (may be NULL).
#
# Returns a tibble with one row per matching claim: `enrolid` (cast to
# character) and `date` (the claim's `svcdate`).
find_procs_outpatient <- function(source, year, icd_codes, cpt_codes) {
  core_db <- DBI::dbConnect(
    RSQLite::SQLite(),
    glue::glue("/Shared/Statepi_Marketscan/databases/Truven/truven_{year}.db")
  )
  # Close the connection even if the query below raises an error.
  on.exit(DBI::dbDisconnect(core_db), add = TRUE)

  tbl(core_db, glue::glue("outpatient_core_{source}_{year}")) %>%
    filter(proc1 %in% c(icd_codes, cpt_codes)) %>%
    select(enrolid, svcdate) %>%
    mutate(enrolid = as.character(enrolid)) %>%
    collect() %>%
    rename(date = svcdate)
}

And then create a wrapper function that we can easily call with parLapply() to handle the actual extraction:

# Dispatch one extraction job to the table-specific finder.
#
# Arguments:
#   args       Length-3 vector/list: source label, year label, and table type
#              ("facility", "inpatient" or "outpatient"), in that order.
#   icd_codes  ICD procedure codes to search for.
#   cpt_codes  CPT procedure codes to search for. Not used for the facility
#              table, which is searched with `icd_codes` only.
#
# Returns whatever the selected finder returns. An unrecognised table type is
# an error instead of being silently treated as an outpatient query.
find_procs <- function(args, icd_codes, cpt_codes) {
  source <- args[[1]]
  year <- args[[2]]
  table <- args[[3]]

  switch(
    table,
    facility = find_procs_facility(source, year, icd_codes),
    inpatient = find_procs_inpatient(source, year, icd_codes, cpt_codes),
    outpatient = find_procs_outpatient(source, year, icd_codes, cpt_codes),
    stop("Unknown table type: ", table, call. = FALSE)
  )
}

Start a cluster and export the functions:

# Launch 34 worker processes and attach the tidyverse on each of them so the
# dplyr verbs used inside the find_procs_*() functions are available there.
cluster <- parallel::makeCluster(34)
parallel::clusterEvalQ(cl = cluster, expr = library(tidyverse))
[[1]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[[2]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[[3]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[[4]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[[5]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[[6]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[[7]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[[8]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[[9]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[[10]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[[11]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[[12]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[[13]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[[14]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[[15]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[[16]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[[17]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[[18]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[[19]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[[20]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[[21]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[[22]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[[23]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[[24]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[[25]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[[26]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[[27]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[[28]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[[29]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[[30]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[[31]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[[32]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[[33]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[[34]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     
# Copy the three table-specific finders to every worker; find_procs() looks
# them up by name when it runs there.
parallel::clusterExport(
  cl = cluster,
  varlist = c(
    "find_procs_facility",
    "find_procs_inpatient",
    "find_procs_outpatient"
  )
)

Next, we build a list with one element per combination of source, year, and table from which we want to extract. Note that there is no facilities table for 2001, so those combinations are skipped.

# One job per (source, year, table) combination: 17 years x 2 sources x 3
# tables, minus the two year-"01" facility combinations, which are skipped.
args <- vector("list", 17 * 2 * 3 - 2)
i <- 1
# sprintf("%02d", ...) gives the zero-padded two-character year labels
# "01", ..., "17".
for (year in sprintf("%02d", 1:17)) {
  for (source in c("ccae", "mdcr")) {
    for (table in c("facility", "inpatient", "outpatient")) {
      # Both operands are scalars inside `if`, so use the short-circuit `&&`.
      if (!(year == "01" && table == "facility")) {
        args[[i]] <- c(source, year, table)
        i <- i + 1
      }
    }
  }
}

Then using this list and find_procs(), we want to find PSA measurement claims:

# PSA measurement: searched by CPT code only (no ICD procedure codes given).
# Returns one result per element of `args`.
psa_events <- parallel::parLapply(
  cl = cluster,
  X = args,
  fun = find_procs,
  icd_codes = NULL,
  cpt_codes = c("84152", "84153", "84154")
)

Uroflow measurement claims:

# Uroflow measurement: searched by both ICD procedure codes and CPT codes.
# Returns one result per element of `args`.
uroflow_events <- parallel::parLapply(
  cl = cluster,
  X = args,
  fun = find_procs,
  icd_codes = c("8924", "4A1D75Z"),
  cpt_codes = c("51736", "51741")
)

Cystometrogram claims:

# Cystometrogram: searched by both ICD procedure codes and CPT codes.
# Returns one result per element of `args`.
cystometrogram_events <- parallel::parLapply(
  cl = cluster,
  X = args,
  fun = find_procs,
  icd_codes = c("8922", "4A0D7BZ", "4A0D8BZ", "4A1D7BZ", "4A1D8BZ"),
  cpt_codes = c("51725", "51726")
)

Release the cluster’s resources and convert the lists returned by the cluster into tibbles:

# All extraction jobs have returned, so shut the worker processes down.
stopCluster(cluster)

# Collapse a list of per-job result tibbles into a single tibble.
#
# `events` is a list as returned by parLapply() above: one tibble per job,
# each with `enrolid` (character) and `date` columns. Zero-row tibbles are
# dropped first (must remove 0 length tibbles before binding), the rest are
# stacked, and `enrolid` is converted to numeric.
bind_proc_events <- function(events) {
  events %>%
    enframe() %>%
    mutate(rows = purrr::map_dbl(value, nrow)) %>%
    filter(rows > 0) %>%
    select(value) %>%
    unnest(cols = value) %>%
    mutate(enrolid = as.numeric(enrolid))
}

# Same conversion for each of the three procedure types.
psa_events <- bind_proc_events(psa_events)
uroflow_events <- bind_proc_events(uroflow_events)
cystometrogram_events <- bind_proc_events(cystometrogram_events)

Then write out the data for use in the propensity score fitting:

# Save each event table as its own .rds file.

# PSA measurement claims
readr::write_rds(
  psa_events,
  "/Shared/lss_jsimmeri_backup/data/tz-5ari-final/psa_procedures.rds"
)

# Uroflow measurement claims
readr::write_rds(
  uroflow_events,
  "/Shared/lss_jsimmeri_backup/data/tz-5ari-final/uroflow_procedures.rds"
)

# Cystometrogram claims
readr::write_rds(
  cystometrogram_events,
  "/Shared/lss_jsimmeri_backup/data/tz-5ari-final/cystometrogram_procedures.rds"
)

Session Info

R version 4.0.4 (2021-02-15)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04 LTS

Matrix products: default
BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C             
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets 
[7] methods   base     

other attached packages:
[1] forcats_0.5.1   stringr_1.4.0   dplyr_1.0.4     purrr_0.3.4    
[5] readr_1.4.0     tidyr_1.1.2     tibble_3.0.6    ggplot2_3.3.3  
[9] tidyverse_1.3.0

loaded via a namespace (and not attached):
 [1] tidyselect_1.1.0  xfun_0.21         haven_2.3.1      
 [4] colorspace_2.0-0  vctrs_0.3.6       generics_0.1.0   
 [7] htmltools_0.5.1.1 yaml_2.2.1        rlang_0.4.10     
[10] pillar_1.4.7      withr_2.4.1       glue_1.4.2       
[13] DBI_1.1.1         dbplyr_2.1.0      modelr_0.1.8     
[16] readxl_1.3.1      lifecycle_1.0.0   munsell_0.5.0    
[19] gtable_0.3.0      cellranger_1.1.0  rvest_0.3.6      
[22] evaluate_0.14     knitr_1.31        ps_1.5.0         
[25] fansi_0.4.2       broom_0.7.4       Rcpp_1.0.6       
[28] backports_1.2.1   scales_1.1.1      jsonlite_1.7.2   
[31] fs_1.5.0          distill_1.2       hms_1.0.0        
[34] digest_0.6.27     stringi_1.5.3     grid_4.0.4       
[37] cli_2.3.0         tools_4.0.4       magrittr_2.0.1   
[40] crayon_1.4.1      pkgconfig_2.0.3   downlit_0.2.1    
[43] ellipsis_0.3.1    xml2_1.3.2        reprex_1.0.0     
[46] lubridate_1.7.9.2 assertthat_0.2.1  rmarkdown_2.6    
[49] httr_1.4.2        rstudioapi_0.13   R6_2.5.0         
[52] compiler_4.0.4