Estimate Propensity Scores for Matching

Jacob Simmering, PhD https://jacobsimmering.com (University of Iowa)https://uiowa.edu
2022-05-27

This file was compiled on 2022-05-27 16:33:40 by jsimmeri on argon-lc-g14-35.hpc.

The propensity score we estimate is given by:

ps_formula <- treatment ~ age +
  vol_inpatient_events + vol_inpatient_days +
  vol_outpatient_events + vol_ndx + vol_outpatient_dx_events +
  index_year +
  cm_elix_Alcohol + cm_elix_Anemia + cm_elix_BloodLoss +
  cm_elix_CHF + cm_elix_Coagulopathy + cm_elix_Depression +
  cm_elix_DM + cm_elix_DMcx + cm_elix_Drugs +
  cm_elix_FluidsLytes + cm_elix_HIV + cm_elix_HTN + cm_elix_HTNcx +
  cm_elix_Hypothyroid + cm_elix_Liver + cm_elix_Lymphoma +
  cm_elix_Mets + cm_elix_NeuroOther + cm_elix_Obesity +
  cm_elix_Paralysis + cm_elix_PHTN + cm_elix_Psychoses +
  cm_elix_PUD + cm_elix_Pulmonary + cm_elix_PVD +
  cm_elix_Renal + cm_elix_Rheumatic + cm_elix_Tumor +
  cm_elix_Valvular + cm_elix_WeightLoss +
  cm_bph + cm_slow_stream + cm_abnormal_psa +
  pr_psa + pr_uroflow + pr_cystometrogram +
  cm_orthostatic + cm_other_hypo

We want to fit the model using a XGBoost estimated model. We define a function to handle the model fitting: fit_xgboost_model(). This does a few things:

  1. Split the data, stratified by drug, into a 75% training set and a 25% test set
  2. Using the split data, create sparse dgCMatrix objects for faster evaluation with xgb.train()
  3. Make a watchlist object of the train and test matrix objects to aid in early stopping/model selection
  4. Starting with a max depth of 3, fit a model with up to 500 rounds. Stop if test set AUC does not improve for 5 rounds.
  5. Increase the max depth by 1 and re-fit. Continue this process until the test set’s best AUC is not improved for 3 consecutive values of max depth.
  6. Use the best max depth and best number of iterations to fit the final model with the entire data set (using the matrix dtotal)
  7. Report the variable importance (gain) for each candidate feature to std out
  8. Return the fitted values
fit_xgboost_model <- function(data, formula = ps_formula, k = 28) {
  data <- data %>%
    mutate(index_year = as.factor(index_year))
  train <- data %>%
    group_by(drug) %>% 
    sample_frac(0.75) %>%
    ungroup()
  test <- data %>% 
    anti_join(train, by = "enrolid")

  dtrain <- xgb.DMatrix(
    data = as(model.matrix(formula, data = train, sparse = TRUE)[, -1], "dgCMatrix"), 
    label = train$treatment == 1
  )
  dtest <- xgb.DMatrix(
    data = as(model.matrix(formula, data = test, sparse = TRUE)[, -1], "dgCMatrix"), 
    label = test$treatment == 1
  )
  watchlist <- list(train = dtrain, test = dtest)

  best_max_depth <- 3
  max_depth <- 3
  incr_since_last_improved_auc <- 0
  best_auc <- 0.5
  while (incr_since_last_improved_auc < 3) {
    early_stopping_model <- xgb.train(
      data = dtrain,
      max.depth = max_depth,
      eta = 0.1,
      nthread = k,
      nrounds = 500,
      objective = "binary:logistic",
      verbose = 0,
      watchlist = watchlist,
      eval_metric = "auc",
      early_stopping_rounds = 5
    )
    model_auc <- as_tibble(early_stopping_model$evaluation_log)$test_auc[early_stopping_model$best_iteration]
    if (model_auc > best_auc) {
      incr_since_last_improved_auc <- 0
      best_auc <- model_auc
      best_max_depth <- max_depth
      best_iteration <- early_stopping_model$best_iteration
    } else {
      incr_since_last_improved_auc <- incr_since_last_improved_auc + 1
    }
    max_depth <- max_depth + 1
  }

  dtotal <- xgb.DMatrix(
    data = as(model.matrix(formula, data = data, sparse = TRUE)[, -1], "dgCMatrix"), 
    label = data$treatment == 1
  )
  final_model <- xgboost(
    data = dtotal,
    max.depth = best_max_depth,
    eta = 0.1, 
    nthread = k,
    nrounds = best_iteration,
    objective = "binary:logistic",
    verbose = 0,
    eval_metric = "auc"
  )

  fitted_values <- data %>%
    mutate(prob = predict(final_model, dtotal)) %>%
    mutate(follow_up_time = end_date - index_date) %>%
    select(enrolid, drug, treatment, follow_up_time, pd_days, prob)

  return(fitted_values)
}

Now load the data, estimate the models, and write out the data for use in matching.

if (Sys.info()["sysname"] == "Darwin") {
  dir <- "Volumes"
} else {
  dir <- "Shared"
}
bph_rx <- read_rds(glue::glue("/{dir}/lss_jsimmeri_backup/data/pdd/bph_rx/model_data.rds"))

We want only incident cases of PD, so impose a 365 day lookback on that date. Additionally, we want the PD date to be during the lookback period.

bph_rx <- bph_rx %>%
  filter(pd_date >= (start_date + 365)) %>%
  filter(pd_date <= index_date)
bph_rx <- bph_rx %>%
  mutate(drug = forcats::fct_relevel(drug, "tz/dz/az", "tamsulosin", "5ari"))

Generate our comparison data sets:

tz_tam <- bph_rx %>%
  filter(drug == "tz/dz/az" | drug == "tamsulosin") %>%
  mutate(treatment = drug == "tz/dz/az")
tz_5ari <- bph_rx %>%
  filter(drug == "tz/dz/az" | drug == "5ari") %>%
  mutate(treatment = drug == "tz/dz/az")
tam_5ari <- bph_rx %>%
  filter(drug == "tamsulosin" | drug == "5ari") %>%
  mutate(treatment = drug == "tamsulosin")
set.seed(969037654)
tz_tam_fitted_values <- fit_xgboost_model(tz_tam)
write_rds(tz_tam,
  "/Shared/lss_jsimmeri_backup/data/pdd/model_data/tz_tam.rds")
write_rds(tz_tam_fitted_values,
  "/Shared/lss_jsimmeri_backup/data/pdd/fitted_values/tz_tam.rds")
tz_5ari_fitted_values <- fit_xgboost_model(tz_5ari)
write_rds(tz_5ari,
  "/Shared/lss_jsimmeri_backup/data/pdd/model_data/tz_5ari.rds")
write_rds(tz_5ari_fitted_values,
  "/Shared/lss_jsimmeri_backup/data/pdd/fitted_values/tz_5ari.rds")
tam_5ari_fitted_values <- fit_xgboost_model(tam_5ari)
write_rds(tam_5ari,
  "/Shared/lss_jsimmeri_backup/data/pdd/model_data/tam_5ari.rds")
write_rds(tam_5ari_fitted_values,
  "/Shared/lss_jsimmeri_backup/data/pdd/fitted_values/tam_5ari.rds")
R version 4.0.4 (2021-02-15)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04 LTS

Matrix products: default
BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C             
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods  
[7] base     

other attached packages:
 [1] xgboost_1.5.0.2 forcats_0.5.1   stringr_1.4.0   dplyr_1.0.4    
 [5] purrr_0.3.4     readr_1.4.0     tidyr_1.1.2     tibble_3.0.6   
 [9] ggplot2_3.3.3   tidyverse_1.3.0

loaded via a namespace (and not attached):
 [1] tidyselect_1.1.0  xfun_0.21         lattice_0.20-41  
 [4] haven_2.3.1       colorspace_2.0-0  vctrs_0.3.6      
 [7] generics_0.1.0    htmltools_0.5.1.1 yaml_2.2.1       
[10] rlang_0.4.10      pillar_1.4.7      withr_2.4.1      
[13] glue_1.4.2        DBI_1.1.1         dbplyr_2.1.0     
[16] modelr_0.1.8      readxl_1.3.1      lifecycle_1.0.0  
[19] munsell_0.5.0     gtable_0.3.0      cellranger_1.1.0 
[22] rvest_0.3.6       evaluate_0.14     knitr_1.31       
[25] ps_1.5.0          fansi_0.4.2       broom_0.7.4      
[28] Rcpp_1.0.6        backports_1.2.1   scales_1.1.1     
[31] jsonlite_1.7.2    fs_1.5.0          distill_1.2      
[34] hms_1.0.0         digest_0.6.27     stringi_1.5.3    
[37] grid_4.0.4        cli_2.3.0         tools_4.0.4      
[40] magrittr_2.0.1    crayon_1.4.1      pkgconfig_2.0.3  
[43] downlit_0.2.1     Matrix_1.3-2      ellipsis_0.3.1   
[46] data.table_1.13.6 xml2_1.3.2        reprex_1.0.0     
[49] lubridate_1.7.9.2 assertthat_0.2.1  rmarkdown_2.6    
[52] httr_1.4.2        rstudioapi_0.13   R6_2.5.0         
[55] compiler_4.0.4