This file was compiled on 2022-05-27 16:33:40 by jsimmeri
on argon-lc-g14-35.hpc.
The propensity score we estimate is given by:
ps_formula <- treatment ~ age +
vol_inpatient_events + vol_inpatient_days +
vol_outpatient_events + vol_ndx + vol_outpatient_dx_events +
index_year +
cm_elix_Alcohol + cm_elix_Anemia + cm_elix_BloodLoss +
cm_elix_CHF + cm_elix_Coagulopathy + cm_elix_Depression +
cm_elix_DM + cm_elix_DMcx + cm_elix_Drugs +
cm_elix_FluidsLytes + cm_elix_HIV + cm_elix_HTN + cm_elix_HTNcx +
cm_elix_Hypothyroid + cm_elix_Liver + cm_elix_Lymphoma +
cm_elix_Mets + cm_elix_NeuroOther + cm_elix_Obesity +
cm_elix_Paralysis + cm_elix_PHTN + cm_elix_Psychoses +
cm_elix_PUD + cm_elix_Pulmonary + cm_elix_PVD +
cm_elix_Renal + cm_elix_Rheumatic + cm_elix_Tumor +
cm_elix_Valvular + cm_elix_WeightLoss +
cm_bph + cm_slow_stream + cm_abnormal_psa +
pr_psa + pr_uroflow + pr_cystometrogram +
cm_orthostatic + cm_other_hypo
We want to fit the model using a XGBoost estimated model. We define a function to handle the model fitting: fit_xgboost_model()
. This does a few things:
dgCMatrix
objects for faster evaluation with xgb.train()
watchlist
object of the train and test matrix objects to aid in early stopping/model selectiondtotal
)fit_xgboost_model <- function(data, formula = ps_formula, k = 28) {
data <- data %>%
mutate(index_year = as.factor(index_year))
train <- data %>%
group_by(drug) %>%
sample_frac(0.75) %>%
ungroup()
test <- data %>%
anti_join(train, by = "enrolid")
dtrain <- xgb.DMatrix(
data = as(model.matrix(formula, data = train, sparse = TRUE)[, -1], "dgCMatrix"),
label = train$treatment == 1
)
dtest <- xgb.DMatrix(
data = as(model.matrix(formula, data = test, sparse = TRUE)[, -1], "dgCMatrix"),
label = test$treatment == 1
)
watchlist <- list(train = dtrain, test = dtest)
best_max_depth <- 3
max_depth <- 3
incr_since_last_improved_auc <- 0
best_auc <- 0.5
while (incr_since_last_improved_auc < 3) {
early_stopping_model <- xgb.train(
data = dtrain,
max.depth = max_depth,
eta = 0.1,
nthread = k,
nrounds = 500,
objective = "binary:logistic",
verbose = 0,
watchlist = watchlist,
eval_metric = "auc",
early_stopping_rounds = 5
)
model_auc <- as_tibble(early_stopping_model$evaluation_log)$test_auc[early_stopping_model$best_iteration]
if (model_auc > best_auc) {
incr_since_last_improved_auc <- 0
best_auc <- model_auc
best_max_depth <- max_depth
best_iteration <- early_stopping_model$best_iteration
} else {
incr_since_last_improved_auc <- incr_since_last_improved_auc + 1
}
max_depth <- max_depth + 1
}
dtotal <- xgb.DMatrix(
data = as(model.matrix(formula, data = data, sparse = TRUE)[, -1], "dgCMatrix"),
label = data$treatment == 1
)
final_model <- xgboost(
data = dtotal,
max.depth = best_max_depth,
eta = 0.1,
nthread = k,
nrounds = best_iteration,
objective = "binary:logistic",
verbose = 0,
eval_metric = "auc"
)
fitted_values <- data %>%
mutate(prob = predict(final_model, dtotal)) %>%
mutate(follow_up_time = end_date - index_date) %>%
select(enrolid, drug, treatment, follow_up_time, pd_days, prob)
return(fitted_values)
}
Now load the data, estimate the models, and write out the data for use in matching.
We want only incident cases of PD, so impose a 365 day lookback on that date. Additionally, we want the PD date to be during the lookback period.
bph_rx <- bph_rx %>%
filter(pd_date >= (start_date + 365)) %>%
filter(pd_date <= index_date)
bph_rx <- bph_rx %>%
mutate(drug = forcats::fct_relevel(drug, "tz/dz/az", "tamsulosin", "5ari"))
Generate our comparison data sets:
tz_tam <- bph_rx %>%
filter(drug == "tz/dz/az" | drug == "tamsulosin") %>%
mutate(treatment = drug == "tz/dz/az")
tz_5ari <- bph_rx %>%
filter(drug == "tz/dz/az" | drug == "5ari") %>%
mutate(treatment = drug == "tz/dz/az")
tam_5ari <- bph_rx %>%
filter(drug == "tamsulosin" | drug == "5ari") %>%
mutate(treatment = drug == "tamsulosin")
set.seed(969037654)
tz_tam_fitted_values <- fit_xgboost_model(tz_tam)
write_rds(tz_tam,
"/Shared/lss_jsimmeri_backup/data/pdd/model_data/tz_tam.rds")
write_rds(tz_tam_fitted_values,
"/Shared/lss_jsimmeri_backup/data/pdd/fitted_values/tz_tam.rds")
tz_5ari_fitted_values <- fit_xgboost_model(tz_5ari)
write_rds(tz_5ari,
"/Shared/lss_jsimmeri_backup/data/pdd/model_data/tz_5ari.rds")
write_rds(tz_5ari_fitted_values,
"/Shared/lss_jsimmeri_backup/data/pdd/fitted_values/tz_5ari.rds")
tam_5ari_fitted_values <- fit_xgboost_model(tam_5ari)
write_rds(tam_5ari,
"/Shared/lss_jsimmeri_backup/data/pdd/model_data/tam_5ari.rds")
write_rds(tam_5ari_fitted_values,
"/Shared/lss_jsimmeri_backup/data/pdd/fitted_values/tam_5ari.rds")
R version 4.0.4 (2021-02-15)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04 LTS
Matrix products: default
BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods
[7] base
other attached packages:
[1] xgboost_1.5.0.2 forcats_0.5.1 stringr_1.4.0 dplyr_1.0.4
[5] purrr_0.3.4 readr_1.4.0 tidyr_1.1.2 tibble_3.0.6
[9] ggplot2_3.3.3 tidyverse_1.3.0
loaded via a namespace (and not attached):
[1] tidyselect_1.1.0 xfun_0.21 lattice_0.20-41
[4] haven_2.3.1 colorspace_2.0-0 vctrs_0.3.6
[7] generics_0.1.0 htmltools_0.5.1.1 yaml_2.2.1
[10] rlang_0.4.10 pillar_1.4.7 withr_2.4.1
[13] glue_1.4.2 DBI_1.1.1 dbplyr_2.1.0
[16] modelr_0.1.8 readxl_1.3.1 lifecycle_1.0.0
[19] munsell_0.5.0 gtable_0.3.0 cellranger_1.1.0
[22] rvest_0.3.6 evaluate_0.14 knitr_1.31
[25] ps_1.5.0 fansi_0.4.2 broom_0.7.4
[28] Rcpp_1.0.6 backports_1.2.1 scales_1.1.1
[31] jsonlite_1.7.2 fs_1.5.0 distill_1.2
[34] hms_1.0.0 digest_0.6.27 stringi_1.5.3
[37] grid_4.0.4 cli_2.3.0 tools_4.0.4
[40] magrittr_2.0.1 crayon_1.4.1 pkgconfig_2.0.3
[43] downlit_0.2.1 Matrix_1.3-2 ellipsis_0.3.1
[46] data.table_1.13.6 xml2_1.3.2 reprex_1.0.0
[49] lubridate_1.7.9.2 assertthat_0.2.1 rmarkdown_2.6
[52] httr_1.4.2 rstudioapi_0.13 R6_2.5.0
[55] compiler_4.0.4