Build Cohort of People with ALS and Healthy Age, Sex, Time Matched Controls

Author

Jacob Simmering

library(tidyverse)
-- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
v dplyr     1.1.2     v readr     2.1.4
v forcats   1.0.0     v stringr   1.5.0
v ggplot2   3.4.2     v tibble    3.2.1
v lubridate 1.9.2     v tidyr     1.3.0
v purrr     1.0.1     
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(parallel)

Find People with ALS

We classify a person as having ALS if they have at least one of:

  1. A diagnosis of ALS with ICD-9-CM 335.20
  2. A diagnosis of ALS with ICD-10-CM of G12.21
  3. A pharmacy claim with a dispensing of edaravone
  4. A pharmacy claim with a dispensing of riluzole

To aid in the extraction, define functions to find outpatient, inpatient, and facility claims with one of the diagnosis codes:

#' Find outpatient claims carrying one of the requested diagnosis codes
#'
#' Years up to "14" keep diagnoses in a single table that is searched with the
#' ICD-9 codes only; later years have separate `dx9` and `dx10` tables, which
#' are searched with the ICD-9 and ICD-10 codes respectively and stacked.
#'
#' @param source Data source label used in the table names (this script passes
#'   "ccae" and "mdcr").
#' @param year Two-digit year string (e.g. "07") selecting the yearly database
#'   file and the table suffix.
#' @param icd_9 Character vector of ICD-9-CM codes to match against `dx`.
#' @param icd_10 Character vector of ICD-10-CM codes to match against `dx`.
#' @return A tibble with columns `enrolid` (character) and `date` (the claim's
#'   `svcdate`). Rows are distinct within each table queried.
find_outpatient_dx <- function(source, year, icd_9, icd_10) {
  db <- DBI::dbConnect(
    RSQLite::SQLite(),
    glue::glue("/Shared/Statepi_Marketscan/databases/Truven/truven_{year}.db")
  )
  # Release the connection even when one of the queries below fails.
  on.exit(DBI::dbDisconnect(db), add = TRUE)

  # Distinct (enrolid, svcdate) pairs in one diagnosis table matching `codes`.
  pull_matches <- function(table_name, codes) {
    tbl(db, table_name) |>
      filter(dx %in% !!codes) |>
      select(enrolid, svcdate) |>
      mutate(enrolid = as.character(enrolid)) |>
      collect() |>
      distinct()
  }

  if (as.numeric(year) <= 14) {
    events <- pull_matches(
      glue::glue("outpatient_dx_{source}_{year}"),
      icd_9
    )
  } else {
    events9 <- pull_matches(
      glue::glue("outpatient_dx9_{source}_{year}"),
      icd_9
    )
    events10 <- pull_matches(
      glue::glue("outpatient_dx10_{source}_{year}"),
      icd_10
    )
    events <- rbind(events9, events10)
  }

  events |>
    select(enrolid, date = svcdate)
}

#' Find inpatient admissions carrying one of the requested diagnosis codes
#'
#' Diagnosis tables are keyed by `caseid`, so matching case ids are looked up
#' first and then resolved to an enrollee and admission date through the
#' `inpatient_core` table. Years up to "14" have a single diagnosis table
#' (searched with the ICD-9 codes only); later years have separate `dx9` and
#' `dx10` tables.
#'
#' @param source Data source label used in the table names (this script passes
#'   "ccae" and "mdcr").
#' @param year Two-digit year string (e.g. "07") selecting the yearly database
#'   file and the table suffix.
#' @param icd_9 Character vector of ICD-9-CM codes to match against `dx`.
#' @param icd_10 Character vector of ICD-10-CM codes to match against `dx`.
#' @return A tibble with columns `enrolid` (character) and `date` (the
#'   admission's `admdate`), one row per matching row of the core table.
find_inpatient_dx <- function(source, year, icd_9, icd_10) {
  db <- DBI::dbConnect(
    RSQLite::SQLite(),
    glue::glue("/Shared/Statepi_Marketscan/databases/Truven/truven_{year}.db")
  )
  # Release the connection even when one of the queries below fails.
  on.exit(DBI::dbDisconnect(db), add = TRUE)

  # Case ids in one diagnosis table whose `dx` matches `codes`.
  pull_cases <- function(table_name, codes) {
    tbl(db, table_name) |>
      filter(dx %in% !!codes) |>
      select(caseid) |>
      collect()
  }

  if (as.numeric(year) <= 14) {
    cases <- pull_cases(glue::glue("inpatient_dx_{source}_{year}"), icd_9)
  } else {
    cases9 <- pull_cases(glue::glue("inpatient_dx9_{source}_{year}"), icd_9)
    cases10 <- pull_cases(glue::glue("inpatient_dx10_{source}_{year}"), icd_10)
    cases <- rbind(cases9, cases10)
  }

  # A case with several matching diagnosis rows appears several times in
  # `cases`; de-duplicate so each admission is looked up (and returned) once
  # instead of being repeated by a join back onto the diagnosis rows.
  case_ids <- unique(cases$caseid)

  tbl(db, glue::glue("inpatient_core_{source}_{year}")) |>
    filter(caseid %in% !!case_ids) |>
    select(enrolid, date = admdate) |>
    mutate(enrolid = as.character(enrolid)) |>
    collect()
}

#' Find facility claims carrying one of the requested diagnosis codes
#'
#' The facility diagnosis table holds both code systems in one `dx` column,
#' distinguished by `dx_ver`: rows with `dx_ver == 9` are matched against the
#' ICD-9 codes and rows with `dx_ver == 0` against the ICD-10 codes.
#'
#' @param source Data source label used in the table name (this script passes
#'   "ccae" and "mdcr").
#' @param year Two-digit year string (e.g. "07") selecting the yearly
#'   facilities database file and the table suffix.
#' @param icd_9 Character vector of ICD-9-CM codes to match against `dx`.
#' @param icd_10 Character vector of ICD-10-CM codes to match against `dx`.
#' @return A tibble of distinct rows with columns `enrolid` (character) and
#'   `date` (the claim's `svcdate`).
find_facility_dx <- function(source, year, icd_9, icd_10) {
  facility_db <- DBI::dbConnect(
    RSQLite::SQLite(),
    glue::glue("/Shared/Statepi_Marketscan/databases/Truven/facilities_dbs/facilities_{year}.db")
  )
  # Release the connection even when the query below fails.
  on.exit(DBI::dbDisconnect(facility_db), add = TRUE)

  tbl(facility_db, glue::glue("facility_dx_{source}_{year}")) |>
    filter(
      (dx %in% !!icd_9 & dx_ver == 9) | (dx %in% !!icd_10 & dx_ver == 0)
    ) |>
    select(enrolid, date = svcdate) |>
    mutate(enrolid = as.character(enrolid)) |>
    collect() |>
    distinct()
}

We’ll also want to find the RX events:

#' Find pharmacy claims dispensing one of the requested products
#'
#' @param source Data source label used in the table name (this script passes
#'   "ccae" and "mdcr").
#' @param year Two-digit year string (e.g. "07") selecting the yearly database
#'   file and the table suffix.
#' @param ndc_num Vector of NDC numbers to match against the `ndcnum` column.
#' @return A tibble with columns `enrolid` (character) and `date` (the claim's
#'   `svcdate`), one row per matching claim row (not de-duplicated).
find_rx_events <- function(source, year, ndc_num) {
  db <- DBI::dbConnect(
    RSQLite::SQLite(),
    glue::glue("/Shared/Statepi_Marketscan/databases/Truven/truven_{year}.db")
  )
  # Release the connection even when the query below fails.
  on.exit(DBI::dbDisconnect(db), add = TRUE)

  tbl(db, glue::glue("rx_core_{source}_{year}")) |>
    filter(ndcnum %in% !!ndc_num) |>
    select(enrolid, date = svcdate) |>
    mutate(enrolid = as.character(enrolid)) |>
    collect()
}

We’ll want to apply these in parallel, so define a single dispatcher that takes one (source, year, table) combination and calls the matching function:

#' Run the finder that matches one (source, year, table) combination
#'
#' @param args Vector or list of length three: the data source label, the
#'   two-digit year string, and the table type ("inpatient", "outpatient",
#'   "facility", or "rx").
#' @param icd_9 Character vector of ICD-9-CM codes (used by the diagnosis
#'   finders).
#' @param icd_10 Character vector of ICD-10-CM codes (used by the diagnosis
#'   finders).
#' @param ndc_num Vector of NDC numbers (used by the "rx" finder).
#' @return Whatever the selected finder returns.
find_events <- function(args, icd_9, icd_10, ndc_num) {
  source <- args[[1]]
  year <- args[[2]]
  table <- args[[3]]

  switch(
    table,
    inpatient = find_inpatient_dx(source, year, icd_9, icd_10),
    outpatient = find_outpatient_dx(source, year, icd_9, icd_10),
    facility = find_facility_dx(source, year, icd_9, icd_10),
    rx = find_rx_events(source, year, ndc_num),
    # Fail with a clear message instead of an "object not found" error when
    # the table type is not one of the four handled above.
    stop("Unknown table type: ", table, call. = FALSE)
  )
}

We want to pull data from 2001 to 2021. Note that the facilities table does not exist for 2001:

# Build one (source, year, table) triple per query to run.
# 2 sources x (21 years x 4 tables - 1 skipped facility table) = 166 entries,
# so preallocate exactly that many slots.
conditions <- vector("list", length = 2 * (21 * 4 - 1))
i <- 1
for (source in c("ccae", "mdcr")) {
  for (year in sprintf("%02d", 1:21)) {
    for (table in c("inpatient", "outpatient", "facility", "rx")) {
      # The facility table does not exist for 2001, so skip that combination.
      if (!(year == "01" && table == "facility")) {
        conditions[[i]] <- c(source, year, table)
        i <- i + 1
      }
    }
  }
}

Next, start the cluster and load packages and functions:

# Launch 56 worker processes and attach the tidyverse on each of them.
cluster <- parallel::makeCluster(spec = 56)
parallel::clusterEvalQ(cl = cluster, expr = library(tidyverse))
[[1]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[2]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[3]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[4]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[5]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[6]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[7]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[8]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[9]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[10]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[11]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[12]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[13]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[14]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[15]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[16]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[17]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[18]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[19]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[20]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[21]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[22]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[23]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[24]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[25]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[26]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[27]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[28]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[29]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[30]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[31]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[32]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[33]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[34]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[35]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[36]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[37]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[38]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[39]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[40]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[41]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[42]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[43]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[44]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[45]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[46]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[47]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[48]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[49]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[50]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[51]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[52]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[53]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[54]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[55]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[56]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     
# Copy the four finder functions to every worker so the dispatcher can call
# them there.
parallel::clusterExport(
  cl = cluster,
  varlist = c(
    "find_inpatient_dx",
    "find_outpatient_dx",
    "find_facility_dx",
    "find_rx_events"
  )
)

Define the desired values of icd_9, icd_10, and ndc_num:

# ALS diagnosis codes (335.20 and G12.21), written without the decimal point.
icd_9 <- "33520"
icd_10 <- "G1221"

# Load the Redbook drug reference file; it supplies the NDC numbers and generic
# names used to identify the ALS medications.
redbook <- readr::read_csv(
  file = "/Shared/Statepi_Marketscan/databases/Truven/redbook.csv"
)
Rows: 386531 Columns: 34
-- Column specification --------------------------------------------------------
Delimiter: ","
chr (26): NDCNUM, ORGBKCD, SIGLSRC, ORGBKFG, DESIDRG, MASTFRM, PKQTYCD, EXCL...
dbl  (8): DEACLAS, GENERID, MAINTIN, PRODCAT, GENIND, THERCLS, PKSIZE, THERDTL

i Use `spec()` to retrieve the full column specification for this data.
i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# NDC numbers of every product whose generic name mentions edaravone or
# riluzole (case-insensitive match on GENNME).
ndc_num <- redbook |>
  filter(stringr::str_detect(tolower(GENNME), "edaravone|riluzole")) |>
  pull(NDCNUM)
# Run every (source, year, table) query on the cluster with load balancing;
# the result is a list with one table of events per condition.
als_events <- parLapplyLB(
  cl = cluster,
  X = conditions,
  fun = find_events,
  icd_9 = icd_9,
  icd_10 = icd_10,
  ndc_num = ndc_num
)

We are done with the cluster and we release those resources:

# Shut down the worker processes.
parallel::stopCluster(cl = cluster)

Reduce the als_events list to the first observed date for each enrollee:

# Drop the queries that returned nothing, stack the remaining event tables,
# and keep each enrollee's earliest event date.
first_als_date <- als_events |>
  purrr::keep(\(events) nrow(events) > 0) |>
  bind_rows() |>
  group_by(enrolid) |>
  summarise(als_date = min(date))

Which we then save for later use.

# Persist the per-enrollee first ALS date for later steps.
readr::write_rds(
  x = first_als_date,
  file = "/Shared/lss_jsimmeri/als/first_als_date.rds"
)

Find non-ALS Matches

Connect to the enrollment database to find enrollment summaries for everyone in Truven:

# Open the SQLite database holding the enrollment summaries.
enrollment_db <- DBI::dbConnect(
  drv = RSQLite::SQLite(),
  dbname = "/Shared/Statepi_Marketscan/databases/Truven/enrollment_dbs/all_enroll_01_21q4.db"
)

And then pull the enrollment summaries for everyone in Truven:

# Read the whole enrollment summary table into memory.
enrollments <- tbl(enrollment_db, "all_enrollees") |>
  collect()

# The rest of the script works from the in-memory copy, so close the
# connection here instead of leaving it open until the session ends.
DBI::dbDisconnect(enrollment_db)

Pull out the cases:

# Cases: enrollees who have a first ALS date on record.
als_enrollments <- enrollments |>
  mutate(enrolid = as.character(enrolid)) |>
  semi_join(first_als_date, by = "enrolid")

For cases, substitute in the ALS diagnosis year for the last observed year and also remove anyone with less than a year of lookback:

# Attach each case's first ALS date, overwrite last_year with the year of that
# date, and keep only cases enrolled at least 365 days before it.
als_enrollments <- als_enrollments |>
  inner_join(first_als_date, by = "enrolid") |>
  mutate(last_year = year(as_date(als_date))) |>
  filter(first_date <= (als_date - 365)) |>
  select(enrolid, dobyr, sex, first_year, last_year)

And controls:

# Candidate controls: enrollees with no ALS event at all.
control_enrollments <- enrollments |>
  mutate(enrolid = as.character(enrolid)) |>
  anti_join(first_als_date, by = "enrolid")

Next, we want to match 5 controls to each case based on

  1. Age
  2. Sex
  3. Enrollment start year
  4. Enrollment end year

To do this, first group the ALS data and count the number of cases per stratum:

# Number of cases in each (birth year, sex, first year, last year) stratum.
n_als <- als_enrollments |>
  count(dobyr, sex, first_year, last_year, name = "n_als")

And then join with the control_enrollments tibble and sample up to 5 * n_als controls per stratum:

# Fix the RNG seed so the control draw is reproducible.
set.seed(4287452)

# Nest the candidate controls by stratum, keep only strata that contain cases,
# and draw the controls for each stratum.
control_sample <- control_enrollments |>
  group_by(dobyr, sex, first_year, last_year) |>
  nest() |>
  ungroup() |>
  inner_join(n_als, by = c("dobyr", "sex", "first_year", "last_year")) |>
  mutate(
    n_control = map_int(data, nrow),
    # Sample size per stratum:
    #   * enough controls for 5:1  -> 5 per case
    #   * no more controls than cases -> every available control
    #   * otherwise -> 1 per case
    # The comparisons are inclusive so every stratum gets a size. With strict
    # inequalities, n_control == n_als matched no branch (NA sample size) and
    # n_control == 5 * n_als dropped to 1:1 although 5:1 was attainable.
    # NOTE(review): strata with exactly 5 * n_als controls now draw more rows,
    # so the sampled cohort can differ from one produced by earlier runs.
    n_to_sample = case_when(
      n_als * 5 <= n_control ~ n_als * 5,
      n_als >= n_control ~ n_control,
      .default = n_als
    )
  ) |>
  mutate(sample = map2(data, n_to_sample, sample_n)) |>
  select(dobyr, sex, first_year, last_year, starts_with("n"), sample)

And then expand that back out:

# Flatten the per-stratum samples back to one row per sampled control.
control_sample <- control_sample |>
  unnest(cols = sample) |>
  select(all_of(c("enrolid", "dobyr", "sex", "first_year", "last_year")))

And then save the full cohort:

# Stack cases and sampled controls, flagging which is which, and save the
# combined cohort.
cohort <- bind_rows(
  mutate(als_enrollments, als = TRUE),
  mutate(control_sample, als = FALSE)
)

readr::write_rds(x = cohort, file = "/Shared/lss_jsimmeri/als/cohort.rds")

Session Info

# Record the R version and package versions used for this run.
utils::sessionInfo()
R version 4.1.3 (2022-03-10)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)

Matrix products: default
BLAS/LAPACK: /cvmfs/argon.hpc.uiowa.edu/2022.1/apps/linux-centos7-x86_64/gcc-9.4.0/intel-oneapi-mkl-2022.0.2-s35g6hp/mkl/2022.0.2/lib/intel64/libmkl_gf_lp64.so.2

locale:
[1] C

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] lubridate_1.9.2 forcats_1.0.0   stringr_1.5.0   dplyr_1.1.2    
 [5] purrr_1.0.1     readr_2.1.4     tidyr_1.3.0     tibble_3.2.1   
 [9] ggplot2_3.4.2   tidyverse_2.0.0

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.10       dbplyr_2.3.3      pillar_1.9.0      compiler_4.1.3   
 [5] tools_4.1.3       bit_4.0.4         digest_0.6.29     memoise_2.0.1    
 [9] RSQLite_2.2.18    timechange_0.1.1  jsonlite_1.8.4    evaluate_0.16    
[13] lifecycle_1.0.3   gtable_0.3.1      pkgconfig_2.0.3   rlang_1.1.0      
[17] DBI_1.1.3         cli_3.6.1         yaml_2.3.5        xfun_0.32        
[21] fastmap_1.1.0     withr_2.5.0       knitr_1.40        generics_0.1.3   
[25] vctrs_0.6.2       htmlwidgets_1.5.4 hms_1.1.2         bit64_4.0.5      
[29] grid_4.1.3        tidyselect_1.2.0  glue_1.6.2        R6_2.5.1         
[33] fansi_1.0.4       vroom_1.6.3       rmarkdown_2.16    blob_1.2.3       
[37] tzdb_0.3.0        magrittr_2.0.3    ellipsis_0.3.2    scales_1.2.1     
[41] htmltools_0.5.3   colorspace_2.0-3  utf8_1.2.3        stringi_1.7.12   
[45] munsell_0.5.0     cachem_1.0.6      crayon_1.5.1