Build Dataset for Propensity Score Matching for Treated Cohort

Put together the data set for estimating the propensity score model used to construct the matched cohorts. This includes removing combination users, those without lookback, and those with PD at the index date, as well as calculating the variables used in the propensity score models.

Jacob Simmering, PhD (https://jacobsimmering.com), University of Iowa (https://uiowa.edu)
2022-04-27

This file was compiled on 2022-04-27 16:49:52 by jsimmeri on argon-itf-bx47-28.hpc.

Data Load

We load the previously computed enrollments and demographics; the PD event dates are loaded later, when the model data set is assembled.

enrollments <- read_rds("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/candidate_treated.rds")
demographics <- read_rds("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/treated_demographics.rds")

Find All Diagnosis Codes

Function Definitions

To compute the comorbidities, we need to know all the diagnoses that occurred previously. More precisely, we need to know the date of the first occurrence of each diagnosis.

I am impatient and want to use parLapply() to accelerate this process. First, I define a function, find_outpatient(), that returns all outpatient events for a set of enrolid values (required_enrolid).

find_outpatient <- function(source, year, required_enrolid) {
  db <- DBI::dbConnect(RSQLite::SQLite(),
                       glue::glue("/Shared/Statepi_Marketscan/databases/Truven/truven_{year}.db"))
  if (as.numeric(year) <= 14) {
    outpatient <- tbl(db, glue::glue("outpatient_dx_{source}_{year}")) %>%
      filter(enrolid %in% required_enrolid) %>%
      select(enrolid, svcdate, dx) %>%
      mutate(icd_version = 9,
             enrolid = as.character(enrolid)) %>%
      collect()
  } else {
    outpatient_9 <- tbl(db, glue::glue("outpatient_dx9_{source}_{year}")) %>%
      filter(enrolid %in% required_enrolid) %>%
      select(enrolid, svcdate, dx) %>%
      mutate(icd_version = 9,
             enrolid = as.character(enrolid)) %>%
      collect()
    outpatient_10 <- tbl(db, glue::glue("outpatient_dx10_{source}_{year}")) %>%
      filter(enrolid %in% required_enrolid) %>%
      select(enrolid, svcdate, dx) %>%
      mutate(icd_version = 10,
             enrolid = as.character(enrolid)) %>%
      collect()
    if (nrow(outpatient_9) > 0 & nrow(outpatient_10) > 0) {
      outpatient <- bind_rows(outpatient_9, outpatient_10)
    } else if (nrow(outpatient_10) > 0) {
      outpatient <- outpatient_10
    } else {
      outpatient <- outpatient_9
    }
  }

  outpatient <- outpatient %>%
    distinct()
  DBI::dbDisconnect(db)
  return(outpatient)
}

A companion function, find_inpatient(), does the same for inpatient data, returning both the core admission records and their diagnoses:

find_inpatient <- function(source, year, required_enrolid) {
  db <- DBI::dbConnect(RSQLite::SQLite(),
                       glue::glue("/Shared/Statepi_Marketscan/databases/Truven/truven_{year}.db"))
  inpatient_core <- tbl(db, glue::glue("inpatient_core_{source}_{year}")) %>%
    filter(enrolid %in% required_enrolid) %>%
    mutate(enrolid = as.character(enrolid)) %>%
    select(caseid, enrolid, admdate, los) %>%
    collect() %>%
    mutate(source = source, year = year)
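  # As with outpatient claims, years through 2014 have a single (ICD-9)
  # diagnosis table; later years split ICD-9 and ICD-10 into separate tables.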
  if (as.numeric(year) <= 14) {
    inpatient_dx <- tbl(db, glue::glue("inpatient_dx_{source}_{year}")) %>%
      filter(caseid %in% local(inpatient_core$caseid)) %>%
      select(caseid, dx) %>%
      collect() %>%
      mutate(icd_version = 9)
  } else {
    inpatient_9 <- tbl(db, glue::glue("inpatient_dx9_{source}_{year}")) %>%
      filter(caseid %in% local(inpatient_core$caseid)) %>%
      select(caseid, dx) %>%
      collect() %>%
      mutate(icd_version = 9)
    inpatient_10 <- tbl(db, glue::glue("inpatient_dx10_{source}_{year}")) %>%
      filter(caseid %in% local(inpatient_core$caseid)) %>%
      select(caseid, dx) %>%
      collect() %>%
      mutate(icd_version = 10)
    if (nrow(inpatient_9) > 0 & nrow(inpatient_10) > 0) {
      inpatient_dx <- bind_rows(inpatient_9, inpatient_10)
    } else if (nrow(inpatient_10) > 0) {
      inpatient_dx <- inpatient_10
    } else {
      inpatient_dx <- inpatient_9
    }
  }
  
  inpatient_dx <- inpatient_dx %>%
    inner_join(inpatient_core, by = "caseid") %>%
    select(caseid, enrolid, admdate, icd_version, dx)

  DBI::dbDisconnect(db)
  return(list(core = inpatient_core, dx = inpatient_dx))
}

A third function, reduce_dx(), does much of the heavy lifting. find_outpatient() and find_inpatient() only return the raw claims for the enrolid values in required_enrolid in a given source/year. Within each source/year, we want to reduce those claims to the first observed date of each DX in either setting (outpatient or inpatient), find the volume of inpatient and outpatient visits, and count the number of diagnosis codes recorded on the typical visit. reduce_dx() does all of this.

Additionally, it takes a single argument, args, a list of length 3 with elements source, year, and required_enrolid (as a vector). This makes it suitable for easy application with parLapply().

reduce_dx <- function(args) {
  source <- args[[1]] 
  year <- args[[2]]
  required_enrolid <- args[[3]]

  log_path <- glue::glue("~/extraction_logs/{source}_{year}.txt")

  if (file.exists(log_path)) {
    file.remove(log_path)
  }

  cat(
    glue::glue("{lubridate::now()} | {source}/{year} has {NROW(required_enrolid)} enrolid values\n"), 
    file = log_path,
    append = TRUE,
    sep = "\n"
  )
  cat(
    glue::glue("{lubridate::now()} | Starting outpatient extraction\n"), 
    file = log_path,
    append = TRUE,
    sep = "\n"
  )
  outpatient <- find_outpatient(source, year, required_enrolid)

  cat(
    glue::glue("{lubridate::now()} | Starting inpatient extraction\n"), 
    file = log_path,
    append = TRUE,
    sep = "\n"
  )
  inpatient <- find_inpatient(source, year, required_enrolid)

  cat(
    glue::glue("{lubridate::now()} | Pooling DX\n"), 
    file = log_path,
    append = TRUE,
    sep = "\n"
  )
  dx <- bind_rows(
    outpatient, 
    inpatient[["dx"]] %>%
      rename(svcdate = admdate)
    ) %>%
    select(enrolid, svcdate, icd_version, dx)

  cat(
    glue::glue("{lubridate::now()} | Finding first DX date\n"), 
    file = log_path,
    append = TRUE,
    sep = "\n"
  )
  first_dx <- dx %>%
    group_by(enrolid, icd_version, dx) %>%
    summarize(
      first_dx_date = min(svcdate),
      .groups = "drop"
    )

  cat(
    glue::glue("{lubridate::now()} | Getting outpatient volume\n"), 
    file = log_path,
    append = TRUE,
    sep = "\n"
  )
  outpatient_volume <- outpatient %>%
    select(enrolid, svcdate) %>%
    distinct()
  outpatient_dx_volume <- outpatient %>%
    select(enrolid, svcdate, dx) %>%
    distinct()
  cat(
    glue::glue("{lubridate::now()} | Getting outpatient dx by visit date\n"), 
    file = log_path,
    append = TRUE,
    sep = "\n"
  )
  outpatient_ndx <- outpatient %>%
    select(enrolid, svcdate, dx) %>%
    distinct() %>%
    group_by(enrolid, svcdate) %>%
    summarize(ndx = n()) %>%
    ungroup()
  cat(
    glue::glue("{lubridate::now()} | Getting inpatient volume\n"), 
    file = log_path,
    append = TRUE,
    sep = "\n"
  )
  inpatient_volume <- inpatient[["core"]] %>%
    select(enrolid, admdate, los)

  cat(
    glue::glue("{lubridate::now()} | Writing out\n"), 
    file = log_path,
    append = TRUE,
    sep = "\n"
  )
  write_rds(first_dx, glue::glue("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/first_dx/{source}_{year}.rds"))
  write_rds(inpatient_volume, glue::glue("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/volume/{source}_{year}_inpatient.rds"))
  write_rds(outpatient_volume, glue::glue("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/volume/{source}_{year}_outpatient.rds"))
  write_rds(outpatient_dx_volume, glue::glue("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/volume/{source}_{year}_outpatient_dx.rds"))
  write_rds(outpatient_ndx, glue::glue("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/volume/{source}_{year}_outpatient_ndx.rds"))

  cat(
    glue::glue("{lubridate::now()} | Complete with {source}/{year}\n"), 
    file = log_path,
    append = TRUE,
    sep = "\n"
  )
  
  return(0)
}
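For example, a single source/year can be run serially, which is handy for debugging before launching the full parallel job. The call below is purely illustrative; the enrolid values are placeholders, not real enrollees.

test_args <- list(
  "ccae",              # source
  "10",                # two-digit year (2010)
  c(1001, 1002, 1003)  # placeholder enrolid values
)
reduce_dx(test_args)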

Extraction

We need to make the list for parLapply(). We have sources ccae and mdcr and years 2001 to 2017 (coded 01 to 17). For speed, we also restrict the enrolid values submitted in each query to those with enrollment overlapping that year.

i <- 1
args_list <- vector("list", 17 * 2)
p <- progress::progress_bar$new(
  total = 17 * 2,
  format = "[:bar] :percent in :elapsed with :eta remaining"
)
for (source in c("mdcr", "ccae")) {
  for (year in 2001:2017) {
    year_start_date <- lubridate::ymd(glue::glue("{year}-01-01"))
    year_end_date <- lubridate::ymd(glue::glue("{year}-12-31"))

    required_enrolid <- enrollments %>%
      filter(start_date <= year_end_date,
             end_date >= year_start_date)

    args_list[[i]] <- list(
      source, stringr::str_sub(year, 3, 4), required_enrolid$enrolid
    )
    i <- i + 1
    p$tick()
  }
}

We start a cluster with one worker per element of args_list (capped at 56 workers), load the tidyverse packages on each worker, and send find_inpatient() and find_outpatient() to the workers.

cluster <- makeCluster(min(56, length(args_list)))
clusterEvalQ(cluster, library(tidyverse))
[[1]]
 [1] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [6] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
[11] "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[16] "base"     

[ output for workers 2 through 34 omitted; each returns the same list of attached packages ]

clusterExport(cluster, c("find_inpatient", "find_outpatient"))

And then apply reduce_dx() to args_list using cluster and parLapply():

events <- parLapply(
  cluster,
  args_list,
  reduce_dx
)
stopCluster(cluster)

Summarization

We want to convert the first diagnosis date by source/year into a table of global first diagnosis dates. Additionally, we’d like to summarize the utilization data.

A simple way is just to load them all into lists:

dx_list <- vector("list", 17 * 2)
inpatient_volume_list <- vector("list", 17 * 2)
outpatient_volume_list <- vector("list", 17 * 2)
ndx_list <- vector("list", 17 * 2)
outpatient_dx_volume_list <- vector("list", 17 * 2)

i <- 1
p <- progress::progress_bar$new(
  total = 17 * 2,
  format = "[:bar] :percent in :elapsed with :eta remaining"
)
for (source in c("ccae", "mdcr")) {
  for (year in stringr::str_pad(1:17, width = 2, pad = "0")) {
    dx_list[[i]] <- read_rds(glue::glue("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/first_dx/{source}_{year}.rds"))
    inpatient_volume_list[[i]] <- read_rds(glue::glue("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/volume/{source}_{year}_inpatient.rds"))
    outpatient_volume_list[[i]] <- read_rds(glue::glue("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/volume/{source}_{year}_outpatient.rds"))
    outpatient_dx_volume_list[[i]] <- read_rds(glue::glue("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/volume/{source}_{year}_outpatient_dx.rds"))
    ndx_list[[i]] <- read_rds(glue::glue("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/volume/{source}_{year}_outpatient_ndx.rds"))
    i <- i + 1
    p$tick()
  }
}

Generate Covariates

Normalize Volumes

We want to count the number of inpatient events and total inpatient days for each person during the lookback period. We normalize by the duration of the lookback period, yielding in_rate (the number of inpatient events per year during the lookback) and in_days_rate (the number of days hospitalized per year).
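For example, an enrollee with two admissions totaling six hospital days over a 730-day lookback would have in_rate = 2 / 730 * 365 = 1 admission per year and in_days_rate = 6 / 730 * 365 = 3 hospital days per year.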

inpatient_volume <- inpatient_volume_list %>%
  bind_rows() %>%
  mutate(enrolid = as.numeric(enrolid)) %>%
  inner_join(
    enrollments %>%
      select(enrolid, start_date, index_date),
    by = "enrolid") %>%
  filter(admdate <= index_date,
         admdate >= start_date) %>%
  group_by(enrolid, duration = index_date - start_date + 1) %>%
  summarize(
    n_inpatient_events = n(),
    n_inpatient_days = sum(los)
  ) %>%
  ungroup() %>%
  mutate(
    in_rate = n_inpatient_events / duration * 365,
    in_days_rate = n_inpatient_days / duration * 365
  )

For outpatient care, we want the number of distinct days with outpatient encounters, normalized by lookback duration.

outpatient_volume <- outpatient_volume_list %>%
  bind_rows() %>%
  mutate(enrolid = as.numeric(enrolid)) %>%
  inner_join(
    enrollments %>%
      select(enrolid, start_date, index_date),
    by = "enrolid") %>%
  filter(svcdate <= index_date,
         svcdate >= start_date) %>%
  select(enrolid, svcdate, index_date, start_date) %>%
  distinct() %>%
  group_by(enrolid, duration = index_date - start_date + 1) %>%
  summarize(
    n_outpatient = n()
  ) %>%
  ungroup() %>%
  mutate(
    out_rate = n_outpatient / duration * 365
  )

What was the mean number of diagnoses made on days with outpatient visits?

ndx <- ndx_list %>%
  bind_rows() %>%
  mutate(enrolid = as.numeric(enrolid)) %>%
  inner_join(
    enrollments %>%
      select(enrolid, start_date, index_date),
    by = "enrolid") %>%
  filter(svcdate <= index_date,
         svcdate >= start_date) %>%
  group_by(enrolid) %>%
  summarize(
    mean_ndx = mean(ndx)
  ) %>%
  ungroup()

And the rate of distinct outpatient diagnosis codes recorded per day of lookback:

outpatient_dx_volume <- outpatient_dx_volume_list %>%
  bind_rows() %>%
  mutate(enrolid = as.numeric(enrolid)) %>%
  inner_join(
    enrollments %>%
      select(enrolid, start_date, index_date),
    by = "enrolid") %>%
  filter(svcdate <= index_date,
         svcdate >= start_date) %>%
  group_by(enrolid, duration = index_date - start_date + 1) %>%
  summarize(
    out_dx_n = n()
  ) %>%
  ungroup() %>%
  mutate(out_dx_rate = out_dx_n / duration) %>%
  select(enrolid, out_dx_rate)

Comorbidity Scores

The CM scores only consider historical diagnoses, that is, diagnoses recorded during the lookback period on or before the index date.

dx_tbl <- bind_rows(dx_list)

lookback_dx <- dx_tbl %>%
  mutate(enrolid = as.numeric(enrolid)) %>%
  inner_join(enrollments %>%
               select(enrolid, start_date, index_date),
             by = "enrolid") %>%
  filter(first_dx_date >= start_date, first_dx_date <= index_date) %>%
  select(enrolid, icd_version, dx) %>%
  distinct()

The icd package has great tools for computing the comorbidity scores but can only handle one ICD version per tibble. So we split lookback_dx into separate ICD-9 and ICD-10 tables.

lookback_dx9 <- lookback_dx %>%
  filter(icd_version == 9)

lookback_dx10 <- lookback_dx %>%
  filter(icd_version == 10)

Elixhauser Flags

We now use the mappings icd9_map_ahrq and icd10_map_ahrq to compute the values. Unlike with the CCI, we are leaving these as flags and just need to stack, aggregate, and pivot.

elix9 <- comorbid_ahrq(
  lookback_dx9,
  visit_name = "enrolid",
  icd_name = "dx",
  return_df = TRUE,
  hierarchy = FALSE
) %>%
  as_tibble() %>%
  gather(cm, value, -enrolid) %>%
  filter(value == TRUE) %>%
  select(enrolid, cm)

elix10 <- comorbid_ahrq(
  lookback_dx10,
  visit_name = "enrolid",
  icd_name = "dx",
  return_df = TRUE,
  hierarchy = FALSE
) %>%
  as_tibble() %>%
  gather(cm, value, -enrolid) %>%
  filter(value == TRUE) %>%
  select(enrolid, cm)

We then pool across the elix9 and elix10 tibbles, selecting the unique enrolid, cm pairs.

elix <- bind_rows(elix9, elix10) %>%
  distinct()

We then make this “wide” for use in later joins with the model data.

elix_wide <- elix %>%
  mutate(cm = paste("cm_elix", cm, sep = "_"),
         value = 1) %>%
  spread(cm, value)

Other Ad-Hoc Comorbidities

We will use the general purpose icd::comorbid() to do this work for us. We need to start by defining a mapping using a list. The named elements of the list will be:

  1. BPH
  2. Orthostatic hypotension
  3. Other non-orthostatic hypotension
  4. Abnormal PSA value
  5. Slow urinary stream
  6. Anxiety
  7. Erectile dysfunction

adhoc_9 <- list(
  "cm_bph" = c("60000", "60001", "60010", "60011", "60020", "60021", "6003",
                     "60090", "60091"),
  "cm_orthostatic" = c("4580"),
  "cm_other_hypo" = c("4581", "45821", "45829", "4588", "4589"),
  "cm_abnormal_psa" = c("79093"),
  "cm_slow_stream" = c("78862"),
  "cm_anxiety" = c("30000", "30001", "30002", "30009"),
  "cm_ed" = c("60784")
)

adhoc_10 <- list(
  "cm_bph" = c("N400", "N401", "N402", "N403", "N404"),
  "cm_orthostatic" = c("I951"),
  "cm_other_hypo" = c("I950", "I952", "I953"),
  "cm_abnormal_psa" = c("R972", "R9720", "R9721"),
  "cm_slow_stream" = c("R3912"),
  "cm_anxiety" = c("F410", "F411", "F413", "F418", "F419"),
  "cm_ed" = c("N5201", "N5202", "N5203", "N521", "N528", "N529")
)

We then pull these flags back:

adhoc_flags_9 <- comorbid(
  lookback_dx9,
  adhoc_9,
  visit_name = "enrolid",
  icd_name = "dx",
  return_df = TRUE
) %>%
  as_tibble()

adhoc_flags_10 <- comorbid(
  lookback_dx10,
  adhoc_10,
  visit_name = "enrolid",
  icd_name = "dx",
  return_df = TRUE
) %>%
  as_tibble()

adhoc <- bind_rows(adhoc_flags_9, adhoc_flags_10) %>%
  distinct()

adhoc <- adhoc %>%
  gather(cm, flag, -enrolid) %>%
  filter(flag == TRUE) %>%
  distinct() %>%
  spread(cm, flag)

We also need to pull back procedure events for

  1. Measuring PSA
  2. Uroflow studies
  3. Cystometrogram studies

These are all pre-computed by find_luts_procs.R and saved.

Load the set of all PSA measurement claims, restrict to those that happened during the lookback period (on or before the index date), find the unique enrolid values, and set psa_measured to TRUE.

psa_events <- read_rds("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/psa_procedures.rds")

psa_flag <- psa_events %>%
  inner_join(enrollments, by = "enrolid") %>%
  filter(date >= start_date, date <= index_date) %>%
  select(enrolid) %>%
  distinct() %>%
  mutate(psa_measured = TRUE)

Repeat this process for uroflow:

uroflow_events <- read_rds("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/uroflow_procedures.rds")

uroflow_flag <- uroflow_events %>%
  inner_join(enrollments, by = "enrolid") %>%
  filter(date >= start_date, date <= index_date) %>%
  select(enrolid) %>%
  distinct() %>%
  mutate(uroflow = TRUE)

And cystometrograms:

cystometrogram_events <- read_rds("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/cystometrogram_procedures.rds")

cystometrogram_flag <- cystometrogram_events %>%
  inner_join(enrollments, by = "enrolid") %>%
  filter(date >= start_date, date <= index_date) %>%
  select(enrolid) %>%
  distinct() %>%
  mutate(cystometrogram = TRUE)

Building Model Data Set

We start by putting together the elements of the model data set constructed so far.

Our base is enrollments, which has 1 row per enrollee in our sample.

model_data <- enrollments

We want to include age at medication start. We add dobyr from demographics and use that and the index_date to compute the age.

model_data <- model_data %>%
  inner_join(
    demographics %>%
      mutate(enrolid = as.numeric(enrolid)), 
    by = "enrolid"
  ) %>%
  mutate(
    age = year(as_date(index_date)) - dobyr
  ) %>%
  select(-dobyr)

We want to include the year of the index date (start_year) as a factor to account for changes in the Truven database structure and time trends in drug use and PD diagnosis rates.

model_data <- model_data %>%
  mutate(start_year = as.factor(year(as_date(index_date))))

Then we want to add our Elixhauser CM flags. We use left_join() and replace_na(), paired with across(), to fill in all 29 CM flags at once (setting absent flags to 0).

model_data <- left_join(
  model_data, 
  elix_wide %>%
    mutate(enrolid = as.numeric(enrolid)), 
  by = "enrolid") %>%
  mutate(across(starts_with("cm_elix"), function(x) replace_na(x, 0)))

And then our adhoc CM codes, same process:

model_data <- left_join(
  model_data, 
  adhoc %>%
    mutate(enrolid = as.numeric(enrolid)), 
  by = "enrolid") %>%
  mutate(across(starts_with("cm"), function(x) replace_na(x, 0)))

And procedures:

model_data <- model_data %>%
  left_join(psa_flag, by = "enrolid") %>%
  left_join(uroflow_flag, by = "enrolid") %>%
  left_join(cystometrogram_flag, by = "enrolid") %>%
  mutate(
    psa_measured = replace_na(psa_measured, FALSE),
    uroflow = replace_na(uroflow, FALSE),
    cystometrogram = replace_na(cystometrogram, FALSE)
  )

We also want to include volume of inpatient and outpatient events.

model_data <- model_data %>%
  left_join(
    inpatient_volume %>%
      select(enrolid, in_rate, in_days_rate), 
    by = "enrolid"
  ) %>%
  left_join(
    outpatient_volume %>%
      select(enrolid, out_rate),
    by = "enrolid"
  ) %>%
  left_join(
    ndx,
    by = "enrolid"
  ) %>%
  left_join(
    outpatient_dx_volume,
    by = "enrolid"
  ) %>%
  mutate(
    mean_ndx = replace_na(mean_ndx, 0),
    in_rate = replace_na(in_rate, 0),
    in_days_rate = replace_na(in_days_rate, 0),
    out_rate = replace_na(out_rate, 0),
    out_dx_rate = replace_na(out_dx_rate, 0)
  )

Finally, we want to add in our outcome, computing the follow-up duration, the PD indicator, and the survival time.

pd_dates <- read_rds("/Shared/lss_jsimmeri_backup/data/tz-5ari-final/first_pd_date.rds")
model_data <- model_data %>%
  left_join(
    pd_dates %>%
      mutate(enrolid = as.numeric(enrolid)),
    by = "enrolid") %>%
  mutate(
    follow_up_time = end_date - index_date + 1,
    develops_pd = ifelse(is.na(pd_date), FALSE, pd_date <= end_date),
    survival_time = ifelse(
      is.na(pd_date), 
      end_date - index_date,
      ifelse(
        pd_date <= end_date,
        pd_date - index_date,
        end_date - index_date
      )
    )
  )
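For reference, the same censoring logic can be written more compactly with pmin() and a logical test. This is an equivalent sketch, not the code that was run:

model_data %>%
  mutate(
    develops_pd = !is.na(pd_date) & pd_date <= end_date,
    # pmin(..., na.rm = TRUE) falls back to end_date when pd_date is missing
    survival_time = pmin(pd_date, end_date, na.rm = TRUE) - index_date
  )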

And a crude summary of cases and incidence (per 10,000 person-years) by drug:

model_data %>%
  group_by(drug) %>%
  summarize(
    n = n(),
    cases = sum(develops_pd),
    py = sum(follow_up_time) / 365,
    incidence = cases / py * 10000
  )
# A tibble: 3 x 5
  drug            n cases       py incidence
* <chr>       <int> <int>    <dbl>     <dbl>
1 5ari        79133   765  245871.      31.1
2 tamsulosin 429741  4867 1179929.      41.2
3 tz/dz/az   124905  1022  371051.      27.5
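As a check on the units, the incidence for the 5ARI group is 765 / 245,871 * 10,000 ≈ 31.1 cases per 10,000 person-years, matching the table.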

Write Out Data

write_rds(model_data,
          "/Shared/lss_jsimmeri_backup/data/tz-5ari-final/treated_model_data.rds")

Session Info

R version 4.0.4 (2021-02-15)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04 LTS

Matrix products: default
BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C             
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets 
[7] methods   base     

other attached packages:
 [1] icd_4.0.9.9000    lubridate_1.7.9.2 forcats_0.5.1    
 [4] stringr_1.4.0     dplyr_1.0.4       purrr_0.3.4      
 [7] readr_1.4.0       tidyr_1.1.2       tibble_3.0.6     
[10] ggplot2_3.3.3     tidyverse_1.3.0  

loaded via a namespace (and not attached):
 [1] progress_1.2.2    tidyselect_1.1.0  xfun_0.21        
 [4] haven_2.3.1       colorspace_2.0-0  vctrs_0.3.6      
 [7] generics_0.1.0    htmltools_0.5.1.1 yaml_2.2.1       
[10] utf8_1.1.4        rlang_0.4.10      pillar_1.4.7     
[13] withr_2.4.1       glue_1.4.2        DBI_1.1.1        
[16] dbplyr_2.1.0      modelr_0.1.8      readxl_1.3.1     
[19] lifecycle_1.0.0   munsell_0.5.0     gtable_0.3.0     
[22] cellranger_1.1.0  rvest_0.3.6       evaluate_0.14    
[25] knitr_1.31        ps_1.5.0          fansi_0.4.2      
[28] broom_0.7.4       Rcpp_1.0.6        backports_1.2.1  
[31] scales_1.1.1      jsonlite_1.7.2    fs_1.5.0         
[34] distill_1.2       hms_1.0.0         digest_0.6.27    
[37] stringi_1.5.3     grid_4.0.4        cli_2.3.0        
[40] tools_4.0.4       magrittr_2.0.1    crayon_1.4.1     
[43] pkgconfig_2.0.3   downlit_0.2.1     ellipsis_0.3.1   
[46] prettyunits_1.1.1 xml2_1.3.2        reprex_1.0.0     
[49] assertthat_0.2.1  rmarkdown_2.6     httr_1.4.2       
[52] rstudioapi_0.13   R6_2.5.0          compiler_4.0.4