Put together the dataset for estimating the propensity score model to construct the matched cohorts. This includes removing combination users, those without lookback, those with PD at the index date, and calculating the variables used in the propensity score models.
This file was compiled on 2022-04-27 16:49:52 by jsimmeri
on argon-itf-bx47-28.hpc.
We load the enrollments previously computed, demographics, and the PD event dates.
enrollments <- read_rds("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/candidate_treated.rds")
demographics <- read_rds("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/treated_demographics.rds")
To compute the comorbidities, we need to know all the diagnoses that had occurred previously. More precisely, we need to know when was the first occurrence of the diagnosis.
I am impatient and want to use parLapply()
to accelerate this process. First, I define a function to find all outpatient events for a set of enrolid
values. find_outpatient()
does this returning all the outpatient events for a set of required_enrolid
.
find_outpatient <- function(source, year, required_enrolid) {
db <- DBI::dbConnect(RSQLite::SQLite(),
glue::glue("/Shared/Statepi_Marketscan/databases/Truven/truven_{year}.db"))
if (as.numeric(year) <= 14) {
outpatient <- tbl(db, glue::glue("outpatient_dx_{source}_{year}")) %>%
filter(enrolid %in% required_enrolid) %>%
select(enrolid, svcdate, dx) %>%
mutate(icd_version = 9,
enrolid = as.character(enrolid)) %>%
collect()
} else {
outpatient_9 <- tbl(db, glue::glue("outpatient_dx9_{source}_{year}")) %>%
filter(enrolid %in% required_enrolid) %>%
select(enrolid, svcdate, dx) %>%
mutate(icd_version = 9,
enrolid = as.character(enrolid)) %>%
collect()
outpatient_10 <- tbl(db, glue::glue("outpatient_dx10_{source}_{year}")) %>%
filter(enrolid %in% required_enrolid) %>%
select(enrolid, svcdate, dx) %>%
mutate(icd_version = 10,
enrolid = as.character(enrolid)) %>%
collect()
if (nrow(outpatient_9) > 0 & nrow(outpatient_10) > 0) {
outpatient <- bind_rows(outpatient_9, outpatient_10)
} else if (nrow(outpatient_10) > 0) {
outpatient <- outpatient_10
} else {
outpatient <- outpatient_9
}
}
outpatient <- outpatient %>%
distinct()
DBI::dbDisconnect(db)
return(outpatient)
}
Inpatient data:
find_inpatient <- function(source, year, required_enrolid) {
db <- DBI::dbConnect(RSQLite::SQLite(),
glue::glue("/Shared/Statepi_Marketscan/databases/Truven/truven_{year}.db"))
inpatient_core <- tbl(db, glue::glue("inpatient_core_{source}_{year}")) %>%
filter(enrolid %in% required_enrolid) %>%
mutate(enrolid = as.character(enrolid)) %>%
select(caseid, enrolid, admdate, los) %>%
collect() %>%
mutate(source = source, year = year)
if (as.numeric(year) <= 14) {
inpatient_dx <- tbl(db, glue::glue("inpatient_dx_{source}_{year}")) %>%
filter(caseid %in% local(inpatient_core$caseid)) %>%
select(caseid, dx) %>%
collect() %>%
mutate(icd_version = 9)
} else {
inpatient_9 <- tbl(db, glue::glue("inpatient_dx9_{source}_{year}")) %>%
filter(caseid %in% local(inpatient_core$caseid)) %>%
select(caseid, dx) %>%
collect() %>%
mutate(icd_version = 9)
inpatient_10 <- tbl(db, glue::glue("inpatient_dx10_{source}_{year}")) %>%
filter(caseid %in% local(inpatient_core$caseid)) %>%
select(caseid, dx) %>%
collect() %>%
mutate(icd_version = 10)
if (nrow(inpatient_9) > 0 & nrow(inpatient_10) > 0) {
inpatient_dx <- bind_rows(inpatient_9, inpatient_10)
} else if (nrow(inpatient_10) > 0) {
inpatient_dx <- inpatient_10
} else {
inpatient_dx <- inpatient_9
}
}
inpatient_dx <- inpatient_dx %>%
inner_join(inpatient_core, by = "caseid") %>%
select(caseid, enrolid, admdate, icd_version, dx)
DBI::dbDisconnect(db)
return(list(core = inpatient_core, dx = inpatient_dx))
}
A third function, reduce_dx()
, does much of the heavy lifting. find_outpatient()
only returns the outpatient claims for enrolid
values in required_enrolid
in source
/year
. We want to further reduce that within source
/year
to the first observed DX in either setting (outpatient or inpatient) and also find the volume of inpatient and outpatient visits. Finally, we also want to count the number of diagnosis codes recorded on the typical visit. The function reduce_dx()
does this.
Additionally, it is structured to take only one argument args
that is a list of length 3 with elements source
, year
, and required_enrolid
(as a vector). This makes it suitable for easy application with parLapply()
.
reduce_dx <- function(args) {
source <- args[[1]]
year <- args[[2]]
required_enrolid <- args[[3]]
log_path <- glue::glue("~/extraction_logs/{source}_{year}.txt")
if (file.exists(log_path)) {
file.remove(log_path)
}
cat(
glue::glue("{lubridate::now()} | {source}/{year} has {NROW(required_enrolid)} enrolid values\n"),
file = log_path,
append = TRUE,
sep = "\n"
)
cat(
glue::glue("{lubridate::now()} | Starting outpatient extraction\n"),
file = log_path,
append = TRUE,
sep = "\n"
)
outpatient <- find_outpatient(source, year, required_enrolid)
cat(
glue::glue("{lubridate::now()} | Starting inpatient extraction\n"),
file = log_path,
append = TRUE,
sep = "\n"
)
inpatient <- find_inpatient(source, year, required_enrolid)
cat(
glue::glue("{lubridate::now()} | Pooling DX\n"),
file = log_path,
append = TRUE,
sep = "\n"
)
dx <- bind_rows(
outpatient,
inpatient[["dx"]] %>%
rename(svcdate = admdate)
) %>%
select(enrolid, svcdate, icd_version, dx)
cat(
glue::glue("{lubridate::now()} | Finding first DX date\n"),
file = log_path,
append = TRUE,
sep = "\n"
)
first_dx <- dx %>%
group_by(enrolid, icd_version, dx) %>%
summarize(
first_dx_date = min(svcdate),
.groups = "drop"
)
cat(
glue::glue("{lubridate::now()} | Getting outpatient volume\n"),
file = log_path,
append = TRUE,
sep = "\n"
)
outpatient_volume <- outpatient %>%
select(enrolid, svcdate) %>%
distinct()
outpatient_dx_volume <- outpatient %>%
select(enrolid, svcdate, dx) %>%
distinct()
cat(
glue::glue("{lubridate::now()} | Getting outpatient dx by visit date\n"),
file = log_path,
append = TRUE,
sep = "\n"
)
outpatient_ndx <- outpatient %>%
select(enrolid, svcdate, dx) %>%
distinct() %>%
group_by(enrolid, svcdate) %>%
summarize(ndx = n()) %>%
ungroup()
cat(
glue::glue("{lubridate::now()} | Getting inpatient volume\n"),
file = log_path,
append = TRUE,
sep = "\n"
)
inpatient_volume <- inpatient[["core"]] %>%
select(enrolid, admdate, los)
cat(
glue::glue("{lubridate::now()} | Writing out\n"),
file = log_path,
append = TRUE,
sep = "\n"
)
write_rds(first_dx, glue::glue("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/first_dx/{source}_{year}.rds"))
write_rds(inpatient_volume, glue::glue("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/volume/{source}_{year}_inpatient.rds"))
write_rds(outpatient_volume, glue::glue("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/volume/{source}_{year}_outpatient.rds"))
write_rds(outpatient_dx_volume, glue::glue("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/volume/{source}_{year}_outpatient_dx.rds"))
write_rds(outpatient_ndx, glue::glue("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/volume/{source}_{year}_outpatient_ndx.rds"))
cat(
glue::glue("{lubridate::now()} | Complete with {source}/{year}\n"),
file = log_path,
append = TRUE,
sep = "\n"
)
return(0)
}
We need to make the list for parLapply()
. We have source of ccae
and mdcr
, years 01
to 17
. We also want to (for speed reasons) restrict the enrolid
values submitted in the query to only those with enrollment in year year
.
i <- 1
args_list <- vector("list", 17 * 2)
p <- progress::progress_bar$new(
total = 17 * 2,
format = "[:bar] :percent in :elapsed with :eta remaining"
)
for (source in c("mdcr", "ccae")) {
for (year in 2001:2017) {
year_start_date <- lubridate::ymd(glue::glue("{year}-01-01"))
year_end_date <- lubridate::ymd(glue::glue("{year}-12-31"))
required_enrolid <- enrollments %>%
filter(start_date <= year_end_date,
end_date >= year_start_date)
args_list[[i]] <- list(
source, stringr::str_sub(year, 3, 4), required_enrolid$enrolid
)
i <- i + 1
p$tick()
}
}
We want to start a cluster of the same length as the list, load the tidyverse packages, and send find_inpatient()
and find_outpatient()
to the workers.
cluster <- makeCluster(min(56, length(args_list)))
clusterEvalQ(cluster, library(tidyverse))
[[1]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[2]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[3]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[4]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[5]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[6]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[7]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[8]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[9]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[10]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[11]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[12]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[13]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[14]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[15]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[16]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[17]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[18]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[19]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[20]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[21]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[22]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[23]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[24]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[25]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[26]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[27]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[28]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[29]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[30]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[31]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[32]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[33]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
[[34]]
[1] "forcats" "stringr" "dplyr" "purrr" "readr"
[6] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
[11] "graphics" "grDevices" "utils" "datasets" "methods"
[16] "base"
clusterExport(cluster, c("find_inpatient", "find_outpatient"))
And then apply reduce_dx()
to args_list
using cluster
and parLapply()
:
events <- parLapply(
cluster,
args_list,
reduce_dx
)
stopCluster(cluster)
We want to convert the first diagnosis date by source
/year
into a table of global first diagnosis dates. Additionally, we’d like to summarize the utilization data.
A simple way is just to load them all into lists:
dx_list <- vector("list", 17 * 2)
inpatient_volume_list <- vector("list", 17 * 2)
outpatient_volume_list <- vector("list", 17 * 2)
ndx_list <- vector("list", 17 * 2)
outpatient_dx_volume_list <- vector("list", 17 * 2)
i <- 1
p <- progress::progress_bar$new(
total = 17 * 2,
format = "[:bar] :percent in :elapsed with :eta remaining"
)
for (source in c("ccae", "mdcr")) {
for (year in stringr::str_pad(1:17, width = 2, pad = "0")) {
dx_list[[i]] <- read_rds(glue::glue("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/first_dx/{source}_{year}.rds"))
inpatient_volume_list[[i]] <- read_rds(glue::glue("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/volume/{source}_{year}_inpatient.rds"))
outpatient_volume_list[[i]] <- read_rds(glue::glue("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/volume/{source}_{year}_outpatient.rds"))
outpatient_dx_volume_list[[i]] <- read_rds(glue::glue("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/volume/{source}_{year}_outpatient_dx.rds"))
ndx_list[[i]] <- read_rds(glue::glue("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/volume/{source}_{year}_outpatient_ndx.rds"))
i <- i + 1
p$tick()
}
}
We want to count the number of inpatient events and total inpatient days for each person during the lookback period. We will normalize by the duration of the lookback period, yielding in_rate
- the number of inpatient events per year during the lookback - and in_days_rate
- the number of days hospitalized per year.
inpatient_volume <- inpatient_volume_list %>%
bind_rows() %>%
mutate(enrolid = as.numeric(enrolid)) %>%
inner_join(
enrollments %>%
select(enrolid, start_date, index_date),
by = "enrolid") %>%
filter(admdate <= index_date,
admdate >= start_date) %>%
group_by(enrolid, duration = index_date - start_date + 1) %>%
summarize(
n_inpatient_events = n(),
n_inpatient_days = sum(los)
) %>%
ungroup() %>%
mutate(
in_rate = n_inpatient_events / duration * 365,
in_days_rate = n_inpatient_days / duration * 365
)
For outpatient, we just want the total number of outpatient encounters normalized by lookback duration.
outpatient_volume <- outpatient_volume_list %>%
bind_rows() %>%
mutate(enrolid = as.numeric(enrolid)) %>%
inner_join(
enrollments %>%
select(enrolid, start_date, index_date),
by = "enrolid") %>%
filter(svcdate <= index_date,
svcdate >= start_date) %>%
select(enrolid, svcdate, index_date, start_date) %>%
distinct() %>%
group_by(enrolid, duration = index_date - start_date + 1) %>%
summarize(
n_outpatient = n()
) %>%
ungroup() %>%
mutate(
out_rate = n_outpatient / duration * 365
)
What was the mean number of diagnoses made on days with outpatient visits?
ndx <- ndx_list %>%
bind_rows() %>%
mutate(enrolid = as.numeric(enrolid)) %>%
inner_join(
enrollments %>%
select(enrolid, start_date, index_date),
by = "enrolid") %>%
filter(svcdate <= index_date,
svcdate >= start_date) %>%
group_by(enrolid) %>%
summarize(
mean_ndx = mean(ndx)
) %>%
ungroup()
And incidence of diagnosis codes:
outpatient_dx_volume <- outpatient_dx_volume_list %>%
bind_rows() %>%
mutate(enrolid = as.numeric(enrolid)) %>%
inner_join(
enrollments %>%
select(enrolid, start_date, index_date),
by = "enrolid") %>%
filter(svcdate <= index_date,
svcdate >= start_date) %>%
group_by(enrolid, duration = index_date - start_date + 1) %>%
summarize(
out_dx_n = n()
) %>%
ungroup() %>%
mutate(out_dx_rate = out_dx_n / duration) %>%
select(enrolid, out_dx_rate)
The CM scores only only consider historical diagnosis - data before the index date.
dx_tbl <- bind_rows(dx_list)
lookback_dx <- dx_tbl %>%
mutate(enrolid = as.numeric(enrolid)) %>%
inner_join(enrollments %>%
select(enrolid, start_date, index_date),
by = "enrolid") %>%
filter(first_dx_date >= start_date, first_dx_date <= index_date) %>%
select(enrolid, icd_version, dx) %>%
distinct()
The package icd
has great tools for computing the comorbidity scores but can only handle ICD-9 or ICD-10 codes in one tibble. So we want to split our lookback_dx
into ICD-9 and ICD-10 tables.
And now use the mapping icd9_map_ahrq
and icd10_map_ahrq
to compute the values. Unlike with CCI, we are leaving these as flags and just need to stack, aggregate, and pivot.
elix9 <- comorbid_ahrq(
lookback_dx9,
visit_name = "enrolid",
icd_name = "dx",
return_df = TRUE,
hierarchy = FALSE
) %>%
as_tibble() %>%
gather(cm, value, -enrolid) %>%
filter(value == TRUE) %>%
select(enrolid, cm)
elix10 <- comorbid_ahrq(
lookback_dx10,
visit_name = "enrolid",
icd_name = "dx",
return_df = TRUE,
hierarchy = FALSE
) %>%
as_tibble() %>%
gather(cm, value, -enrolid) %>%
filter(value == TRUE) %>%
select(enrolid, cm)
We then pool across the elix9
and elix10
tibbles, selecting the unique enrolid
, cm
pairs.
elix <- bind_rows(elix9, elix10) %>%
distinct()
Which we then make “wide” to use in the future joins with the model data.
elix_wide <- elix %>%
mutate(cm = paste("cm_elix", cm, sep = "_"),
value = 1) %>%
spread(cm, value)
We will use the general purpose icd::comorbid()
to do this work for us. We need to start by defining a mapping using a list. The named elements of the list will be:
adhoc_9 <- list(
"cm_bph" = c("60000", "60001", "60010", "60011", "60020", "60021", "6003",
"60090", "60091"),
"cm_orthostatic" = c("4580"),
"cm_other_hypo" = c("4581", "45821", "45829", "4588", "4589"),
"cm_abnormal_psa" = c("79093"),
"cm_slow_stream" = c("78862"),
"cm_anxiety" = c("30000", "30001", "30002", "30009"),
"cm_ed" = c("60784")
)
adhoc_10 <- list(
"cm_bph" = c("N400", "N401", "N402", "N403", "N404"),
"cm_orthostatic" = c("I951"),
"cm_other_hypo" = c("I950", "I952", "I953"),
"cm_abnormal_psa" = c("R972", "R9720", "R9721"),
"cm_slow_stream" = c("R3912"),
"cm_anxiety" = c("F410", "F411", "F413", "F418", "F419"),
"cm_ed" = c("N5201", "N5202", "N5203", "N521", "N528", "N529")
)
Which we then pull back:
adhoc <- bind_rows(adhoc_flags_9, adhoc_flags_10) %>%
distinct()
adhoc <- adhoc %>%
gather(cm, flag, -enrolid) %>%
filter(flag == TRUE) %>%
distinct() %>%
spread(cm, flag)
We also need to pull back procedure events for
These are all pre-computed by find_luts_procs.R
and saved.
Load the set of all PSA measurement claims, restrict to those that happened before the index date, find the unique enrolid
values. Set psa_measured
to TRUE
.
psa_events <- read_rds("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/psa_procedures.rds")
psa_flag <- psa_events %>%
rename(date = date) %>%
inner_join(enrollments, by = "enrolid") %>%
filter(date >= start_date, date <= index_date) %>%
select(enrolid) %>%
distinct() %>%
mutate(psa_measured = TRUE)
Repeat this process for uroflow:
uroflow_events <- read_rds("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/uroflow_procedures.rds")
uroflow_flag <- uroflow_events %>%
rename(date = date) %>%
inner_join(enrollments, by = "enrolid") %>%
filter(date >= start_date, date <= index_date) %>%
select(enrolid) %>%
distinct() %>%
mutate(uroflow = TRUE)
And cystometrograms:
cystometrogram_events <- read_rds("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/cystometrogram_procedures.rds")
cystometrogram_flag <- cystometrogram_events %>%
rename(date = date) %>%
inner_join(enrollments, by = "enrolid") %>%
filter(date >= start_date, date <= index_date) %>%
select(enrolid) %>%
distinct() %>%
mutate(cystometrogram = TRUE)
We start by putting together the elements of the model data set constructed so far.
Our base is enrollments
which has 1 row per enrollee in our sample.
model_data <- enrollments
We want to include age at medication start. We add dobyr
from demographics
and use that and the index_date
to compute the age.
model_data <- model_data %>%
inner_join(
demographics %>%
mutate(enrolid = as.numeric(enrolid)),
by = "enrolid"
) %>%
mutate(
age = year(as_date(index_date)) - dobyr
) %>%
select(-dobyr)
We want to include start_year
of the index date as a factor to account for changes in the Truven database structure and time-trends in drug use and PD diagnosis rates.
Then we want to add our Elixhauser CM flags. The same process of left_join()
and replace_na()
applies but we pair it with across()
to do all 29 CM flags at once.
model_data <- left_join(
model_data,
elix_wide %>%
mutate(enrolid = as.numeric(enrolid)),
by = "enrolid") %>%
mutate(across(starts_with("cm_elix"), function(x) replace_na(x, 0)))
And then our adhoc CM codes, same process:
model_data <- left_join(
model_data,
adhoc %>%
mutate(enrolid = as.numeric(enrolid)),
by = "enrolid") %>%
mutate(across(starts_with("cm"), function(x) replace_na(x, 0)))
And procedures:
model_data <- model_data %>%
left_join(psa_flag, by = "enrolid") %>%
left_join(uroflow_flag, by = "enrolid") %>%
left_join(cystometrogram_flag, by = "enrolid") %>%
mutate(
psa_measured = replace_na(psa_measured, FALSE),
uroflow = replace_na(uroflow, FALSE),
cystometrogram = replace_na(cystometrogram, FALSE)
)
We also want to include volume of inpatient and outpatient events.
model_data <- model_data %>%
left_join(
inpatient_volume %>%
select(enrolid, in_rate, in_days_rate),
by = "enrolid"
) %>%
left_join(
outpatient_volume %>%
select(enrolid, out_rate),
by = "enrolid"
) %>%
left_join(
ndx,
by = "enrolid"
) %>%
left_join(
outpatient_dx_volume,
by = "enrolid"
) %>%
mutate(
mean_ndx = replace_na(mean_ndx, 0),
in_rate = replace_na(in_rate, 0),
in_days_rate = replace_na(in_days_rate, 0),
out_rate = replace_na(out_rate, 0),
out_dx_rate = replace_na(out_dx_rate, 0)
)
Finally, we want to add in our outcome and compute the survival time and outcome here. Also, compute duration of follow-up.
pd_dates <- read_rds("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/first_pd_date.rds")
model_data <- model_data %>%
left_join(
pd_dates %>%
mutate(enrolid = as.numeric(enrolid)),
by = "enrolid") %>%
mutate(
follow_up_time = end_date - index_date + 1,
develops_pd = ifelse(is.na(pd_date), FALSE, pd_date <= end_date),
survival_time = ifelse(
is.na(pd_date),
end_date - index_date,
ifelse(
pd_date <= end_date,
pd_date - index_date,
end_date - index_date
)
)
)
And a crude summary:
model_data %>%
group_by(drug) %>%
summarize(
n = n(),
cases = sum(develops_pd),
py = sum(follow_up_time) / 365,
incidence = cases / py * 10000
)
# A tibble: 3 x 5
drug n cases py incidence
* <chr> <int> <int> <dbl> <dbl>
1 5ari 79133 765 245871. 31.1
2 tamsulosin 429741 4867 1179929. 41.2
3 tz/dz/az 124905 1022 371051. 27.5
write_rds(model_data,
"/Shared/lss_jsimmeri_backup/data/tz-5ari-final/treated_model_data.rds")
R version 4.0.4 (2021-02-15)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04 LTS
Matrix products: default
BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] parallel stats graphics grDevices utils datasets
[7] methods base
other attached packages:
[1] icd_4.0.9.9000 lubridate_1.7.9.2 forcats_0.5.1
[4] stringr_1.4.0 dplyr_1.0.4 purrr_0.3.4
[7] readr_1.4.0 tidyr_1.1.2 tibble_3.0.6
[10] ggplot2_3.3.3 tidyverse_1.3.0
loaded via a namespace (and not attached):
[1] progress_1.2.2 tidyselect_1.1.0 xfun_0.21
[4] haven_2.3.1 colorspace_2.0-0 vctrs_0.3.6
[7] generics_0.1.0 htmltools_0.5.1.1 yaml_2.2.1
[10] utf8_1.1.4 rlang_0.4.10 pillar_1.4.7
[13] withr_2.4.1 glue_1.4.2 DBI_1.1.1
[16] dbplyr_2.1.0 modelr_0.1.8 readxl_1.3.1
[19] lifecycle_1.0.0 munsell_0.5.0 gtable_0.3.0
[22] cellranger_1.1.0 rvest_0.3.6 evaluate_0.14
[25] knitr_1.31 ps_1.5.0 fansi_0.4.2
[28] broom_0.7.4 Rcpp_1.0.6 backports_1.2.1
[31] scales_1.1.1 jsonlite_1.7.2 fs_1.5.0
[34] distill_1.2 hms_1.0.0 digest_0.6.27
[37] stringi_1.5.3 grid_4.0.4 cli_2.3.0
[40] tools_4.0.4 magrittr_2.0.1 crayon_1.4.1
[43] pkgconfig_2.0.3 downlit_0.2.1 ellipsis_0.3.1
[46] prettyunits_1.1.1 xml2_1.3.2 reprex_1.0.0
[49] assertthat_0.2.1 rmarkdown_2.6 httr_1.4.2
[52] rstudioapi_0.13 R6_2.5.0 compiler_4.0.4