SENDAs Agreement 1 Update 2010-2022 (step 3)

Third step of deduplication process. It focuses on resolving inconsistencies in key “user-invariant” variables (like sex, nationality, and age of substance use onset) by triangulating data from multiple sources (C1-C6 agreements, hospital records, mortality registry, etc.) and applying logical rules or imputation.

Author

Andrés González Santa Cruz

Published

September 30, 2025

Data Loading and Exploration

Loading Packages and uniting databases

Proceed to load the necessary packages.

Code

# invisible("Only run from Ubuntu")
# if (!(Sys.getenv("RSTUDIO_SESSION_TYPE") == "server" || file.exists("/.dockerenv"))) {
#   if(Sys.info()["sysname"]!="Windows"){
#     Sys.setenv(RETICULATE_PYTHON = "/home/fondecytacc/.pyenv/versions/3.11.5/bin/python")
#   }
# }

#clean enviroment
rm(list = ls()); gc()

time_before_dedup2<-Sys.time()

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# --- Bootstrap reticulate con ruta relativa a getwd() ---
suppressPackageStartupMessages(library(reticulate))

# Busca .mamba_root/envs/py311/python.exe desde getwd() hacia padres
find_python_rel <- function(start = getwd(),
                            rel = file.path(".mamba_root","envs","py311","python.exe")) {
  cur <- normalizePath(start, winslash = "/", mustWork = FALSE)
  repeat {
    cand <- normalizePath(file.path(cur, rel), winslash = "/", mustWork = FALSE)
    if (file.exists(cand)) return(cand)
    parent <- dirname(cur)
    if (identical(parent, cur)) return(NA_character_)  # llegó a la raíz
    cur <- parent
  }
}

py <- find_python_rel()

if (is.na(py)) {
  stop("No se encontró Python relativo a getwd() (buscando '.mamba_root/envs/py311/python.exe').\n",
       "Directorio actual: ", getwd())
}

# Forzar ese intérprete
Sys.unsetenv(c("RETICULATE_CONDAENV","RETICULATE_PYTHON_FALLBACK"))
Sys.setenv(RETICULATE_PYTHON = py)
reticulate::use_python(py, required=T)

py_config()  # verificación

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

# tidy, robust, and commented
load_ndp <- function(date_tag,
                     base_name,
                     input_subdir,
                     out_subdir,
                     load_into    = .GlobalEnv) {

  # Are we in RStudio Server or Docker?
  is_server <- Sys.getenv("RSTUDIO_SESSION_TYPE") == "server" || file.exists("/.dockerenv")

  # Project root = current WD without a trailing "/cons"
  # Safer than gsub everywhere
  wd <- getwd()
  project_root <- sub("(/)?cons/?$", "", wd)

  # Build dirs
  out_dir  <- file.path(project_root, out_subdir)
  in_dir   <- if (is_server) file.path(getwd(), input_subdir) else out_dir

  # Filenames (choose one canonical extension spelling)
  rdata_file   <- sprintf("%s_%s.Rdata", base_name, date_tag)
  seven_z_part <- sprintf("%s_%s.Rdata.7z.001", base_name, date_tag)
  enc_file     <- sprintf("%s_%s.Rdata.enc",  base_name, date_tag)  # only if you actually encrypt to .enc

  # Optional: Windows drive-based Google Drive/E: fallback (only on Windows)
  envpath <- NULL
  if (.Platform$OS.type == "windows") {
    drv <- toupper(substr(normalizePath(project_root, winslash = "\\", mustWork = FALSE), 1, 1))
    envpath <- if (identical(drv, "G")) {
      "G:/Mi unidad/Alvacast/SISTRAT 2023/"
    } else {
      "E:/Mi unidad/Alvacast/SISTRAT 2023/"
    }
  }
  # message("envpath: ", envpath %||% "<none>")

  # Ensure dirs exist (won't error if already present)
  dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)

  # Helper: load Rdata into the specified environment
  load_rdata <- function(path) {
    stopifnot(file.exists(path))
    loaded <- load(path, envir = load_into)
    message("Loaded objects: ", paste(loaded, collapse = ", "))
    invisible(loaded)
  }

  if (!is_server) {
    # Desktop workflow: expect plain .Rdata in out_dir
    local_rdata <- file.path(out_dir, rdata_file)
    if (!file.exists(local_rdata)) {
      stop("Rdata file not found: ", local_rdata)
    }
    return(load_rdata(local_rdata))

  } else {
    # Server/Docker workflow: expect compressed/encrypted parts in _input
    seven_z_path <- file.path(in_dir, seven_z_part)
    enc_path     <- file.path(in_dir, enc_file)
    out_rdata    <- file.path(out_dir, rdata_file)

    if (file.exists(seven_z_path)) {
      # Extract 7z multi-part to out_dir using password
      pass <- Sys.getenv("PASS_PPIO", unset = NA_character_)
      if (is.na(pass) || pass == "") stop("Missing PASS_PPIO env var for 7z decryption.")
      # 7z command: x (extract), -p<pass>, -o<outdir>
      args <- c("x", shQuote(seven_z_path), sprintf("-p%s", pass), paste0("-o", shQuote(out_dir)))
      status <- system2("7z", args = args, stdout = TRUE, stderr = TRUE)
      # Optional: check extraction result
      if (!file.exists(out_rdata)) {
        stop("Extraction finished but target not found: ", out_rdata, "\n7z output:\n", paste(status, collapse = "\n"))
      }
      return(load_rdata(out_rdata))

    } else if (file.exists(enc_path)) {
      # If you truly have a raw .enc, you need a decryption step here (not loadable as-is).
      stop("Found encrypted file but no extractor defined for .enc: ", enc_path)

    } else if (file.exists(out_rdata)) {
      # Already extracted earlier
      return(load_rdata(out_rdata))

    } else {
      stop("No input found in: ", in_dir,
           "\nTried: ", seven_z_path, " and ", enc_path,
           "\nAlso looked for already-extracted: ", out_rdata)
    }
  }
}

load_ndp(date_tag = "2025_09_27",
                     base_name = "24_ndp",
                     input_subdir = "_input",
                     out_subdir   = file.path("data", "20241015_out"),
                     load_into    = .GlobalEnv)

          used (Mb) gc trigger (Mb) max used (Mb)
Ncells  605943 32.4    1288181 68.8  1046203 55.9
Vcells 1206591  9.3    8388608 64.0  1915507 14.7
python:         G:/My Drive/Alvacast/SISTRAT 2023/.mamba_root/envs/py311/python.exe
libpython:      G:/My Drive/Alvacast/SISTRAT 2023/.mamba_root/envs/py311/python311.dll
pythonhome:     G:/My Drive/Alvacast/SISTRAT 2023/.mamba_root/envs/py311
version:        3.11.5 | packaged by conda-forge | (main, Aug 27 2023, 03:23:48) [MSC v.1936 64 bit (AMD64)]
Architecture:   64bit
numpy:           [NOT FOUND]

NOTE: Python version was forced by RETICULATE_PYTHON

Code

#https://github.com/rstudio/renv/issues/544
#renv falls back to copying rather than symlinking, which is evidently very slow in this configuration.
renv::settings$use.cache(FALSE)

#only use explicit dependencies (in DESCRIPTION)
renv::settings$snapshot.type("implicit")

#check if rstools is installed
try(installr::install.Rtools(check_r_update=F))

Installing package into ‘G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32’ (as ‘lib’ is unspecified)

Code

check_quarto_version <- function(required = "1.7.29", comparator = c("ge","gt","le","lt","eq")) {
  comparator <- match.arg(comparator)
  current <- package_version(paste(unlist(quarto::quarto_version()), collapse = "."))
  req     <- package_version(required)

  ok <- switch(comparator,
               ge = current >= req,
               gt = current >  req,
               le = current <= req,
               lt = current <  req,
               eq = current == req)

  if (!ok) {
    stop(sprintf("Quarto version check failed: need %s %s (installed: %s).",
                 comparator, required, current), call. = FALSE)
  }
  invisible(TRUE)
}

check_quarto_version("1.7.29", "ge") 

#change repository to CL
local({
  r <- getOption("repos")
  r["CRAN"] <- "https://cran.dcc.uchile.cl/"
  options(repos=r)
})

if(!require(pacman)){install.packages("pacman");require(pacman)}

Cargando paquete requerido: pacman

Code

if(!require(pak)){install.packages("pak");require(pak)}

Cargando paquete requerido: pak

Code

pacman::p_unlock(lib.loc = .libPaths()) #para no tener problemas reinstalando paquetes

No 00LOCK detected in: G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32 No 00LOCK detected in: C:/Program Files/R/R-4.4.1/library

Code

if(Sys.info()["sysname"]=="Windows"){
if (getRversion() != "4.4.1") { stop("Requires R version 4.4.1; Actual: ", getRversion()) }
}

#check docker
check_docker_running <- function() {
  # Try running 'docker info' to check if Docker is running
  system("docker info", intern = TRUE, ignore.stderr = TRUE)
}

install_docker <- function() {
  # Open the Docker Desktop download page in the browser for installation
  browseURL("https://www.docker.com/products/docker-desktop")
}

# Main logic
if (inherits(try(check_docker_running(), silent = TRUE), "try-error")) {
  liftr::install_docker()
} else {
  message("Docker is running.")
}

Warning in system(“docker info”, intern = TRUE, ignore.stderr = TRUE): el comando ejecutado ‘docker info’ tiene el estatus 1

Docker is running.

Code

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#PACKAGES#######################################################################
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

unlink("*_cache", recursive=T)

# ----------------------------------------------------------------------
# 2. Use a single pak::pkg_install() call for most CRAN packages
# ----------------------------------------------------------------------

paks <-
  c(#"git", 
    # To connect to github
    "gh", #interface for  GitHub API from R
    #
    "gitcreds", # manages Git credentials (usernames, passwords, tokens)
    #
    "usethis", # simplifies common project setup tasks for R developers
    # Package to bring packages in development
    "devtools",
    # Package administration
    "renv",
    # To manipulate data
    "knitr", "pander", "DT",
    # Join
    "fuzzyjoin", "RecordLinkage",
    # For tables
    "tidyverse", "janitor",
    # For contingency tables
    "kableExtra",
    # For connections with python
    "reticulate",
    # To manipulate big data
    "polars", "sqldf",
    # To bring big databases
    "nanoparquet",
    # Interface for R and RStudio in R
    "installr", "rmarkdown", "quarto", "yaml", #"rstudioapi",
    # Time handling
    "clock",
    # Combine plots
    "ggpubr",
    # Parallelized iterative processing
    "furrr",
    # Work like a tibble with a data.table database
    "tidytable",
    # Split database into training and testing
    "caret",
    # Impute missing data
    "missRanger", "mice",
    # To modularize tasks
    "job",
    # For PhantomJS install checks
    "webshot"
  )

# dplyr
# janitor
# reshape2
# tidytable
# arrow
# boot
# broom
# car
# caret
# data.table
# DiagrammeR
# DiagrammeRsvg
# dplyr
# epiR
# epitools
# ggplot2
# glue
# htmlwidgets
# knitr
# lubridate
# naniar
# parallel
# polycor
# pROC
# psych
# readr
# rio
# rsvg
# scales
# stringr
# tableone
# rmarkdown
# biostat3
# codebook
# finalfit
# Hmisc
# kableExtra
# knitr
# devtools
# tidyr
# stringi
# stringr
# muhaz
# sqldf
# compareGroups
# survminer
# lubridate
# ggfortify
# car
# fuzzyjoin
# compareGroups
# caret
# job
# htmltools
# nanoparquet
# ggpubr
# polars
# installr
# clock
# pander
# reshape
# mice
# missRanger
# VIM
# withr
# biostat3
# broom
# glue
# finalfit
# purrr
# sf


# pak::pkg_install(paks)

pak::pak_sitrep()
# pak::sysreqs_check_installed(unique(unlist(paks)))
#pak::lockfile_create(unique(unlist(paks)),  "dependencies_duplicates24.lock", dependencies=T)
#pak::lockfile_install("dependencies_duplicates24.lock")
#https://rdrr.io/cran/pak/man/faq.html
#pak::cache_delete()

library(tidytable)

Adjuntando el paquete: ‘tidytable’

The following objects are masked from ‘package:stats’:

dt, filter, lag

The following object is masked from ‘package:base’:

%in%

Code

library(ggplot2)
library(readr)

Adjuntando el paquete: ‘readr’

The following object is masked by ‘.GlobalEnv’:

parse_date

Code

# ----------------------------------------------------------------------
# 3. Activate polars code completion (safe to try even if it fails)
# ----------------------------------------------------------------------
#try(polars_code_completion_activate())

# ----------------------------------------------------------------------
# 4. BPMN from GitHub (not on CRAN, so install via devtools if missing)
# ----------------------------------------------------------------------
if (!requireNamespace("bpmn", quietly = TRUE)) {
  devtools::install_github("bergant/bpmn")
}

# ----------------------------------------------------------------------
# 5. PhantomJS Check (use webshot if PhantomJS is missing)
# ----------------------------------------------------------------------
# if (!webshot::is_phantomjs_installed()) {
#   webshot::install_phantomjs()
# }

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#FUNCTIONS######################################################################
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#
# NO MORE DEBUGS
options(error = NULL)        # si antes tenías options(error = recover) o browser)
options(browserNLdisabled = FALSE)


#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#
#NAs are replaced with "" in knitr kable
options(knitr.kable.NA = '')

pander::panderOptions('big.mark', ',')
pander::panderOptions('decimal.mark', '.')

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#

#to format rows in bold
format_cells <- function(df, rows ,cols, value = c("italics", "bold", "strikethrough")){

  # select the correct markup
  # one * for italics, two ** for bold
  map <- setNames(c("*", "**", "~~"), c("italics", "bold", "strikethrough"))
  markup <- map[value]  

  for (r in rows){
    for(c in cols){

      # Make sure values are not factors
      df[[c]] <- as.character( df[[c]])

      # Update formatting
      df[r, c] <- ifelse(nchar(df[r, c])==0,"",paste0(markup, gsub(" ", "", df[r, c]), markup))
    }
  }

  return(df)
}
#To produce line breaks in messages and warnings
knitr::knit_hooks$set(
   error = function(x, options) {
     paste('\n\n<div class="alert alert-danger" style="font-size: small !important;">',
           gsub('##', '\n', gsub('^##\ Error', '**Error**', x)),
           '</div>', sep = '\n')
   },
   warning = function(x, options) {
     paste('\n\n<div class="alert alert-warning" style="font-size: small !important;">',
           gsub('##', '\n', gsub('^##\ Warning:', '**Warning**', x)),
           '</div>', sep = '\n')
   },
   message = function(x, options) {
     paste('<div class="message" style="font-size: small !important;">',
           gsub('##', '\n', x),
           '</div>', sep = '\n')
   }
)

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#

sum_dates <- function(x){
  cbind.data.frame(
    min= as.Date(min(unclass(as.Date(x)), na.rm=T), origin = "1970-01-01"),
    p001= as.Date(quantile(unclass(as.Date(x)), .001, na.rm=T), origin = "1970-01-01"),
    p005= as.Date(quantile(unclass(as.Date(x)), .005, na.rm=T), origin = "1970-01-01"),
    p025= as.Date(quantile(unclass(as.Date(x)), .025, na.rm=T), origin = "1970-01-01"),
    p25= as.Date(quantile(unclass(as.Date(x)), .25, na.rm=T), origin = "1970-01-01"),
    p50= as.Date(quantile(unclass(as.Date(x)), .5, na.rm=T), origin = "1970-01-01"),
    p75= as.Date(quantile(unclass(as.Date(x)), .75, na.rm=T), origin = "1970-01-01"),
    p975= as.Date(quantile(unclass(as.Date(x)), .975, na.rm=T), origin = "1970-01-01"),
    p995= as.Date(quantile(unclass(as.Date(x)), .995, na.rm=T), origin = "1970-01-01"),
    p999= as.Date(quantile(unclass(as.Date(x)), .999, na.rm=T), origin = "1970-01-01"),
    max= as.Date(max(unclass(as.Date(x)), na.rm=T), origin = "1970-01-01")
  )
}

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

# Define the function adapted for Polars
sum_dates_polars <- function(df, date_col) {
  # Create the list of quantiles
  quantiles <- c(0.001, 0.005, 0.025, 0.25, 0.5, 0.75, 0.975, 0.995, 0.999)
  # Create expressions to calculate min and max
  expr_list <- list(
    pl$col(date_col)$min()$alias("min"),
    pl$col(date_col)$max()$alias("max")
  )
  # Add expressions for quantiles
  for (q in quantiles) {
    expr_list <- append(expr_list, pl$col(date_col)$quantile(q)$alias(paste0("p", sub("\\.", "", as.character(q)))))
  }
  # Apply the expressions and return a DataFrame with the results
  df$select(expr_list)
}

# Custom function for sampling with a seed
sample_n_with_seed <- function(data, size, seed) {
  set.seed(seed)
  dplyr::sample_n(data, size)
}

# Function to get the most frequent value 
most_frequent <- function(x) { 
  uniq_vals <- unique(x)
  freq_vals <- sapply(uniq_vals, function(val) sum(x == val))
  most_freq <- uniq_vals[which(freq_vals == max(freq_vals))]
  
  if (length(most_freq) == 1) {
    return(most_freq)
  } else {
    return(NA)
  }
}
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#CONFIG #######################################################################
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

options(scipen=2) #display numbers rather scientific number

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

# Define the function first
#joins these values with semicolons and optionally truncates the result if it exceeds a specified width.
toString2 <- function(x, width = NULL, ...) {
    string <- paste(x, collapse = "; ")
    if (missing(width) || is.null(width) || width == 0) 
        return(string)
    if (width < 0) 
        stop("'width' must be positive")
    if (nchar(string, type = "w") > width) {
        width <- max(6, width)
        string <- paste0(substr(string, 1, width - 3), "...")
    }
    string
}
normalize_txt <- function(x) {
  x|>
    stringi::stri_trans_general("Latin-ASCII")|>
    tolower()|>
    trimws()
}

icd10_broad <- function(x) {
x <- tolower(x)
tidytable::case_when(
stringr::str_detect(x, "organic|organico|demenc|alzheimer|parkinson|delirium|cerebral") ~ "F0 Organic",
stringr::str_detect(x, "psicoactiva|alcohol|marihuana|canabis|cannabis|cocain|opio|opiace|benzodiazep|sustancias") ~ "F1 Substance use",
stringr::str_detect(x, "esquizofren|psicotip|delirant|psicosis") ~ "F2 Psychotic",
stringr::str_detect(x, "estado de animo|afectiv|depres|bipolar|maniaco|distimia|hipoman") ~ "F3 Mood",
stringr::str_detect(x, "neurotic|ansiedad|fobi|panico|obsesivo|compulsiv|estres|adaptaci|somatoform|somatiz") ~ "F4 Anxiety/Stress/Somatoform",
stringr::str_detect(x, "comportamiento.*fisiolog|alimentari|anorex|bulim|sueñ|insomni|disfuncion sexual") ~ "F5 Physio/Eating/Sleep/Sexual",
stringr::str_detect(x, stringr::regex("personalidad|comportamiento del adulto|antisocial|limite|evitativ|habit|habitos|impuls|control de los impulsos|control\\s+de\\s+impulsos", ignore_case = TRUE)) ~ "F6 Personality/Adult behaviour",
stringr::str_detect(x, "retraso mental|discapacidad intelectual|intelectual") ~ "F7 Intellectual disability",
stringr::str_detect(x, "desarrollo psicolog|autism|asperger|lenguaje|aprendizaje|espectro autista|tdah|t\\s*d\\s*a\\s*h") ~ "F8/9 Neurodevelopment/Child",
TRUE ~ "Other/unspecified"
)
}
dsmiv_broad <- function(x) {
x <- tolower(x)
tidytable::case_when(
stringr::str_detect(x, "sustancia|alcohol|marihuana|cannabis|cocain|opio|opiace|benzodiazep") ~ "Substance-related",
stringr::str_detect(x, "esquizofren|psicosis|psicot") ~ "Psychotic",
stringr::str_detect(x, "estado de animo|afectiv|depres|bipolar|maniaco|distimia|hipoman") ~ "Mood",
stringr::str_detect(x, "ansiedad|fobi|panico|obsesivo|compulsiv|estres|adaptaci") ~ "Anxiety/Stress/Adjustment",
stringr::str_detect(x, "somatoform|somatiz|disociativ|conversion") ~ "Somatoform/Dissociative",
stringr::str_detect(x, "alimentari|anorex|bulim|sueñ|insomni|disfuncion sexual|para[f|f]ilia|sexual") ~ "Eating/Sleep/Sexual",
stringr::str_detect(x, "personalidad|antisocial|limite|borderline|paranoide|evitativ|dependient|narcis") ~ "Personality",
stringr::str_detect(x, "retraso mental|discapacidad intelectual|intelectual") ~ "Intellectual disability",
stringr::str_detect(x, "desarrollo|autism|asperger|infancia|tdah|t\\s*d\\s*a\\s*h|lenguaje|aprendizaje") ~ "Neurodevelopment/Childhood-onset",
TRUE ~ "Other/unspecified"
)
}

pair_str <- function(main, sub) {
  main <- as.character(main)
  sub  <- as.character(sub)
  both_na <- is.na(main) & is.na(sub)

  out <- paste0(
    tidytable::coalesce(main, "NA"),
    "::",
    tidytable::coalesce(sub,  "NA")
  )

  out[both_na] <- NA_character_
  out
}

# ── Helpers ────────────────────────────────────────────────────────────
mode_pick_int <- function(x){
  x <- x[!is.na(x)]
  if(length(x)==0) return(NA_integer_)
  tx <- sort(table(x), decreasing = TRUE)
  as.integer(names(tx)[1L])
}

subkey_to_label <- function(x){
  y <- gsub("_"," ", tolower(x))
  y <- gsub("amphetamine type stimulants","amphetamine-type stimulants", y)
  y <- gsub("tranquilizers hypnotics","tranquilizers/hypnotics", y)
  y
}

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# ── Encryption ────────────────────────────────────────────────────────

# Pack: [24-byte nonce][ciphertext] -> hex
encrypt_chr <- function(x, key) {
    vapply(x, function(v) {
        if (is.na(v)) return(NA_character_)
        ct <- sodium::data_encrypt(charToRaw(as.character(v)), key)
        nonce <- attr(ct, "nonce")                 # 24 bytes
        sodium::bin2hex(c(nonce, ct))              # pack nonce + ct
    }, FUN.VALUE = character(1))
}

# Unpack: hex -> [nonce|ciphertext], restore attr(nonce), then decrypt
decrypt_chr <- function(x, key) {
    vapply(x, function(v) {
        if (is.na(v)) return(NA_character_)
        buf <- sodium::hex2bin(v)
        if (length(buf) < 25L) stop("Ciphertext too short or corrupted.")
        nonce <- buf[1:24]
        ct    <- buf[-(1:24)]
        attr(ct, "nonce") <- nonce
        rawToChar(sodium::data_decrypt(ct, key))
    }, FUN.VALUE = character(1))
}

# Function to encrypt a character vector
encrypt_column <- function(x, key) {
  sapply(x, function(item) {
    if (is.na(item) || item == "") {
      return(NA_character_)
    }
    encrypted_raw <- sodium::data_encrypt(charToRaw(item), key)
    base64enc::base64encode(encrypted_raw)  # Convert to base64 for storage
  }, USE.NAMES = FALSE)
}

decrypt_column <- function(x, key) {
  sapply(x, function(item) {
    if (is.na(item)) return(NA_character_)
    encrypted_raw <- base64enc::base64decode(item)
    rawToChar(sodium::data_decrypt(encrypted_raw, key))
  }, USE.NAMES = FALSE)
}

Error in contrib.url(repos, "source") : 
  trying to use CRAN without setting a mirror
* pak version:
- 0.8.0.1
* Version information:
- pak platform: x86_64-w64-mingw32 (current: x86_64-w64-mingw32, compatible)
- pak repository: - (local install?)
* Optional packages installed:
- pillar
* Library path:
- G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32
- C:/Program Files/R/R-4.4.1/library
* pak is installed at G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32/pak.
* Dependency versions:
- callr      3.7.6      
- cli        3.6.2      
- curl       5.2.1      
- desc       1.4.3      
- filelock   1.0.3      
- jsonlite   1.8.8      
- lpSolve    5.6.23.9000
- pkgbuild   1.4.4      
- pkgcache   2.2.2.9000 
- pkgdepends 0.7.2.9000 
- pkgsearch  3.1.3.9000 
- processx   3.8.4      
- ps         1.7.6      
- R6         2.5.1      
- zip        2.3.1      
* Dependencies can be loaded

Note

The previous document (Step 2) resolved temporal inconsistencies by correcting overlapping treatment dates, imputing missing discharge dates, and handling implausible negative treatment durations.

This document (Step 3) focuses on resolving inconsistencies in user-invariant characteristics—attributes that should remain constant for an individual across all their treatment episodes. We triangulated data from multiple sources (C1-C6 agreements, hospital records, mortality registry, etc.) to deduce the most accurate values for: * Sex (sex_rec): Resolved conflicts using external data, pregnancy status, program type, ICD-10 diagnoses, gender identity, and probabilistic imputation. * Nationality (nationality_cons): Consolidated all reported nationalities for users with conflicting records. * Starting Substance (sus_ini_mod_mvv): Determined the most vulnerable starting substance (e.g., cocaine paste > cocaine powder > alcohol) for users with multiple reported substances. * Age of Onset (age_subs_onset_rec2, age_primary_onset_rec2): Developed a complex algorithm using external data to resolve conflicting or missing ages for the onset of any substance use and the primary substance use, establishing logical bounds (e.g., onset must precede admission age). * To protect privacy, we encrypted SENDA ID, center name and municipality using the sodium package (v 1.4.0). * We also built a basic longitudinal structure: Sorted by adm_date within each patient and use lead() to link to the next episode; Converted dates to numeric (days since 1970, with 2025-05-28 for ongoing cases) to compute diff_bet_treat; Flagged short gaps (less_60d_diff, less_45d_diff) and referral episodes; Summarized between-episode changes; and applied a helper to shorten and ASCII-normalize names for Stata 16 compatibility.

Although presented as distinct stages, these cleaning steps are interdependent. For instance, resolving a user’s sex was necessary to validate their participation in gender-specific programs. Throughout this document, we use the terms “rows”, “cases”, “observations”, or “treatment episodes” interchangeably to refer to entries in the dataset.

1. Data Editing / Deductive Imputation

1.1. DSM/ICD-10

Some cases did not have a primary diagnosis in DSM-IV notation but have a secondary (n= 603) or tertiary but no secondary (n= 20).

The data uses a nested structure for main and sub-diagnostic categories. When an episode had a main diagnosis but a missing sub-diagnosis, we inserted ‘NA’ as a placeholder (“NA_placeholder_”). We then removed duplicate entries among the second or third pair of main and sub-diagnoses. After this cleaning step, the diagnoses for each episode were concatenated.

The replacement of DSM-IV diagnoses with ICD-10 codes is not recommended for our analysis yet. We lack documentation on the source of any intersection between these classification systems, and 31 sub-diagnoses have no direct equivalents between the two systems. This inconsistency would compromise the validity of our diagnostic categorization and subsequent analyses..

The main diagnoses and sub-diagnoses for ICD-10 and DSM-IV classification systems were combined into the mod_psiq_cie_10 and mod_psiq_dsm_iv columns, respectively. In the future (step 4), they should be separated by column.

Additionally, the columns with suffixes _instudy (detects any “in study”), _no_dg (detects any “no disorder”), and _dg (detects any valid diagnostic) enable the identification of records where categories such as “sin trastorno” (no disorder) and “en estudio” (under study) can be removed, as these designations provide no clinical value when they occur alongside established diagnoses _dg).

Code

# - Count how often a sub-diagnosis exists while the main diagnosis is missing (or marked “en estudio”/“sin trastorno”), for both DSM-IV and ICD-10.
# - Learn a “lookup” from sub-diagnosis to main-diagnosis by observing valid main–sub pairs in your data (for DSM-IV and ICD-10 separately).
# - Use that lookup to “repair/standardize” main diagnoses when the sub-diagnosis is informative (not NA, not “en estudio”, not “sin trastorno”).
# - Insert a placeholder (“NA_placeholder”) when a main diagnosis exists but the sub-diagnosis is missing, to preserve the pair structure.
# - Remove duplicate diagnoses across the 2nd and 3rd diagnosis pairs so that repeated main–sub pairs do not get double-counted when concatenated.
# - Concatenate (main::sub) pairs into a single per-episode string for ICD-10 (mod_psiq_cie_10) and DSM-IV (mod_psiq_dsm_iv).
# - Create helper flags to identify rows with any “en estudio”, “sin trastorno”, or at least one valid diagnosis.

names_dg_dsmiv<-
c("diagnostico_trs_psiquiatrico_dsm_iv", "diagnostico_trs_psiquiatrico_sub_dsm_iv", 
"x2_diagnostico_trs_psiquiatrico_dsm_iv", "x2_diagnostico_trs_psiquiatrico_sub_dsm_iv", 
"x3_diagnostico_trs_psiquiatrico_dsm_iv", "x3_diagnostico_trs_psiquiatrico_sub_dsm_iv")
names_dg_icd10<- c("diagnostico_trs_psiquiatrico_cie_10", "diagnostico_trs_psiquiatrico_sub_cie_10", 
"x2_diagnostico_trs_psiquiatrico_cie_10", "x2_diagnostico_trs_psiquiatrico_sub_cie_10", 
"x3_diagnostico_trs_psiquiatrico_cie_10", "x3_diagnostico_trs_psiquiatrico_sub_cie_10", 
"diagnostico_trastorno_psiquiatrico_cie_10_al_egreso")

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  #dg_trs_psiq_sub_cie_10 x2_dg_trs_psiq_sub_cie_10 x3_dg_trs_psiq_sub_cie_10
cat(paste0("Cases with sub-diagnostics but without the main, DSM-IV: ",
SISTRAT23_c1_2010_2024_df_prev1n|> filter(is.na(diagnostico_trs_psiquiatrico_dsm_iv) & !is.na(diagnostico_trs_psiquiatrico_sub_dsm_iv))|> select(c("hash_key",any_of(names_dg_dsmiv)))|> nrow()))
cat(paste0("Cases with sub-diagnostics but without the main, second dg., DSM-IV: ",
SISTRAT23_c1_2010_2024_df_prev1n|> filter(is.na(x2_diagnostico_trs_psiquiatrico_dsm_iv) & !is.na(x2_diagnostico_trs_psiquiatrico_sub_dsm_iv))|> select(c("hash_key",any_of(names_dg_dsmiv)))|> nrow()))
cat(paste0("Cases with sub-diagnostics but without the main, third dg., DSM-IV: ",
SISTRAT23_c1_2010_2024_df_prev1n|> filter(is.na(x3_diagnostico_trs_psiquiatrico_dsm_iv) & !is.na(x3_diagnostico_trs_psiquiatrico_sub_dsm_iv))|> select(c("hash_key",any_of(names_dg_dsmiv)))|> nrow()))

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat(paste0("Cases with sub-diagnostics but without the main, ICD-10: ",
SISTRAT23_c1_2010_2024_df_prev1n|> filter(is.na(diagnostico_trs_psiquiatrico_cie_10) & !is.na(diagnostico_trs_psiquiatrico_sub_cie_10))|> select(c("hash_key",any_of(names_dg_icd10)))|> nrow()))
cat(paste0("Cases with sub-diagnostics but without the main, second dg., ICD-10: ",
SISTRAT23_c1_2010_2024_df_prev1n|> filter(is.na(x2_diagnostico_trs_psiquiatrico_cie_10) & !is.na(x2_diagnostico_trs_psiquiatrico_sub_cie_10))|> select(c("hash_key",any_of(names_dg_icd10)))|> nrow()))
cat(paste0("Cases with sub-diagnostics but without the main, third dg., ICD-10: ",
SISTRAT23_c1_2010_2024_df_prev1n|> filter(is.na(x3_diagnostico_trs_psiquiatrico_cie_10) & !is.na(x3_diagnostico_trs_psiquiatrico_sub_cie_10))|> select(c("hash_key",any_of(names_dg_icd10)))|> nrow()))

# 3 cases with sub-diagnostics
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  #dg_trs_psiq_sub_cie_10 x2_dg_trs_psiq_sub_cie_10 x3_dg_trs_psiq_sub_cie_10
cat(paste0("Cases with sub-diagnostics but without the main, DSM-IV: ",
SISTRAT23_c1_2010_2024_df_prev1n|> filter(is.na(diagnostico_trs_psiquiatrico_dsm_iv) & !is.na(diagnostico_trs_psiquiatrico_sub_dsm_iv))|> select(c("hash_key",any_of(names_dg_dsmiv)))|> nrow()))
cat(paste0("Cases with sub-diagnostics but without the main, second dg., DSM-IV: ",
SISTRAT23_c1_2010_2024_df_prev1n|> filter(is.na(x2_diagnostico_trs_psiquiatrico_dsm_iv) & !is.na(x2_diagnostico_trs_psiquiatrico_sub_dsm_iv))|> select(c("hash_key",any_of(names_dg_dsmiv)))|> nrow()))
cat(paste0("Cases with sub-diagnostics but without the main, third dg., DSM-IV: ",
SISTRAT23_c1_2010_2024_df_prev1n|> filter(is.na(x3_diagnostico_trs_psiquiatrico_dsm_iv) & !is.na(x3_diagnostico_trs_psiquiatrico_sub_dsm_iv))|> select(c("hash_key",any_of(names_dg_dsmiv)))|> nrow()))
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat(paste0("Cases with sub-diagnostics but without the main, or the main in study or with explicit non-classification (sin trastorno), ICD-10: ",
SISTRAT23_c1_2010_2024_df_prev1n|> filter((is.na(diagnostico_trs_psiquiatrico_cie_10) |
    diagnostico_trs_psiquiatrico_cie_10 == "en estudio" |
    diagnostico_trs_psiquiatrico_cie_10 == "sin trastorno") & !is.na(diagnostico_trs_psiquiatrico_sub_cie_10))|> select(c("hash_key",any_of(names_dg_icd10)))|> 
  nrow()))
cat(paste0("Cases with sub-diagnostics but without the main, or the main in study or with explicit non-classification (sin trastorno), second dg., ICD-10: ",
SISTRAT23_c1_2010_2024_df_prev1n|> filter((is.na(x2_diagnostico_trs_psiquiatrico_cie_10) |
    x2_diagnostico_trs_psiquiatrico_cie_10 == "en estudio" |
    x2_diagnostico_trs_psiquiatrico_cie_10 == "sin trastorno") & !is.na(x2_diagnostico_trs_psiquiatrico_sub_cie_10))|> select(c("hash_key",any_of(names_dg_icd10)))|> 
  nrow()))
cat(paste0("Cases with sub-diagnostics but without the main, or the main in study or with explicit non-classification (sin trastorno), third dg., ICD-10: ",
SISTRAT23_c1_2010_2024_df_prev1n|> filter((is.na(x3_diagnostico_trs_psiquiatrico_cie_10) |
    x3_diagnostico_trs_psiquiatrico_cie_10 == "en estudio" |
    x3_diagnostico_trs_psiquiatrico_cie_10 == "sin trastorno") & !is.na(x3_diagnostico_trs_psiquiatrico_sub_cie_10))|> select(c("hash_key",any_of(names_dg_icd10)))|> 
  nrow()))

cat(paste0("Cases with sub-diagnostics but without the main, or the main in study or with explicit non-classification (sin trastorno), DSM-IV: ",
SISTRAT23_c1_2010_2024_df_prev1n|> filter((is.na(diagnostico_trs_psiquiatrico_dsm_iv) |
    diagnostico_trs_psiquiatrico_dsm_iv == "en estudio" |
    diagnostico_trs_psiquiatrico_dsm_iv == "sin trastorno") & !is.na(diagnostico_trs_psiquiatrico_sub_dsm_iv))|> select(c("hash_key",any_of(names_dg_dsmiv)))|> 
  nrow()))
cat(paste0("Cases with sub-diagnostics but without the main, or the main in study or with explicit non-classification (sin trastorno), second dg., DSM-IV: ",
SISTRAT23_c1_2010_2024_df_prev1n|> filter((is.na(x2_diagnostico_trs_psiquiatrico_dsm_iv) |
    x2_diagnostico_trs_psiquiatrico_dsm_iv == "en estudio" |
    x2_diagnostico_trs_psiquiatrico_dsm_iv == "sin trastorno") & !is.na(x2_diagnostico_trs_psiquiatrico_sub_dsm_iv))|> select(c("hash_key",any_of(names_dg_dsmiv)))|> 
  nrow()))
cat(paste0("Cases with sub-diagnostics but without the main, or the main in study or with explicit non-classification (sin trastorno), third dg., DSM-IV: ",
SISTRAT23_c1_2010_2024_df_prev1n|> filter((is.na(x3_diagnostico_trs_psiquiatrico_dsm_iv) |
    x3_diagnostico_trs_psiquiatrico_dsm_iv == "en estudio" |
    x3_diagnostico_trs_psiquiatrico_dsm_iv == "sin trastorno") & !is.na(x3_diagnostico_trs_psiquiatrico_sub_dsm_iv))|> select(c("hash_key",any_of(names_dg_dsmiv)))|> 
  nrow())) 
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

invisible("Primero solucionar el problema de arriba: clasificaciones con en estudio, pero la subclasificación con diagnóstico (agregarle como condición que la subclas tenga también no_NAs")

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
cat("to standardize the main category with the DSM-IV subcategory\n")
dg_trs_psiq_dsm_iv_sub_tab<-
  SISTRAT23_c1_2010_2024_df_prev1n|>
  tidytable::mutate(dsm_concat= paste0(diagnostico_trs_psiquiatrico_dsm_iv, "_", diagnostico_trs_psiquiatrico_sub_dsm_iv))|> 
  tidytable::mutate(x2_dsm_concat= paste0(x2_diagnostico_trs_psiquiatrico_dsm_iv, "_", x2_diagnostico_trs_psiquiatrico_sub_dsm_iv))|> 
  tidytable::mutate(x3_dsm_concat= paste0(x3_diagnostico_trs_psiquiatrico_dsm_iv, "_", x3_diagnostico_trs_psiquiatrico_sub_dsm_iv))|> 
  tidytable::select(ends_with("dsm_concat"))|> 
  tidytable::pivot_longer(
    cols = everything(),
    names_to = "concat_type",
    values_to = "concat_value"
  )|>
  tidytable::select(-concat_type)|> 
  janitor::tabyl(concat_value)|> 
  data.frame()|>
  tidytable::arrange(desc(n))|> 
  tidytable::select(concat_value)|> 
  #only useful links
  tidytable::filter(!grepl("_NA$", concat_value))|>
  tidytable::filter(!grepl("^NA_", concat_value))|>
  tidytable::filter(!grepl("en estudio", concat_value))|>
  tidytable::filter(!grepl("sin trastorno", concat_value))|> 
  tidytable::separate(concat_value, into = c("main", "sub"), sep = "_", extra = "merge")

  ambiguous_subs <- dg_trs_psiq_dsm_iv_sub_tab|>
    tidytable::count(sub, name = "n_mains")|>
    tidytable::filter(n_mains > 1)
  
  if (nrow(ambiguous_subs) > 0) {
    stop("DSM-IV: Ambiguous sub-diagnoses found:\n", 
         paste(ambiguous_subs$sub, collapse = "\n"),
         call. = FALSE)
  }
dg_trs_psiq_dsm_iv_sub_tab <- dg_trs_psiq_dsm_iv_sub_tab|>
  tidytable::add_count(sub, main, name = "n")|>
  tidytable::arrange(sub, tidytable::desc(n))|>
  tidytable::distinct(sub, .keep_all = TRUE)|>
  tidytable::select(-n)

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
cat("to standardize the main category with the ICD-10 subcategory\n")

# Given errors (SEP 2025), ambiguous mappings in ICD10
canonical_mapping <- tribble(
  ~sub, ~correct_main,
  "episodios depresivos", "trastornos del humor (afectivos)",
  "otros trastornos de la personalidad y del comportamiento del adulto", "trastornos de la personalidad y del comportamiento del adulto",
  "reacciones a estres grave y trastornos de adaptacion", "trastornos neuroticos, secundarios a situaciones estresantes y somatomorfos",
  "trastornos de los habitos y del control de los impulsos", "trastornos de los habitos y del control de los impulsos"
)
dg_trs_psiq_icd10_sub_tab <-
  SISTRAT23_c1_2010_2024_df_prev1n|>
  tidytable::mutate(
    icd_concat = paste0(diagnostico_trs_psiquiatrico_cie_10, "_", diagnostico_trs_psiquiatrico_sub_cie_10),
    x2_icd_concat = paste0(x2_diagnostico_trs_psiquiatrico_cie_10, "_", x2_diagnostico_trs_psiquiatrico_sub_cie_10),
    x3_icd_concat = paste0(x3_diagnostico_trs_psiquiatrico_cie_10, "_", x3_diagnostico_trs_psiquiatrico_sub_cie_10)
  )|>
  tidytable::select(ends_with("icd_concat"))|>
  tidytable::pivot_longer(cols = everything(), names_to = "concat_type", values_to = "concat_value")|>
  tidytable::select(concat_value)|>
  janitor::tabyl(concat_value)|>  # gives: concat_value, n, percent
  tidytable::filter(
    !grepl("_NA$|^NA_|en estudio|sin trastorno", concat_value, ignore.case = TRUE),
    !is.na(concat_value)
  )|>
  tidytable::separate(concat_value, into = c("main", "sub"), sep = "_", extra = "merge")|>
  tidytable::left_join(canonical_mapping, by = "sub")|>
  tidytable::mutate(main = ifelse(!is.na(correct_main), correct_main, main))|>
  tidytable::select(-correct_main)|>
  # 👇 CRITICAL: Re-aggregate by standardized (main, sub)
  tidytable::group_by(main, sub)|>
  tidytable::summarise(n = sum(n), .groups = "drop")|>
  tidytable::mutate(percent = n / sum(n))

# Check ambiguous codes
  ambiguous_icd_10_subs <- dg_trs_psiq_icd10_sub_tab|>
      tidytable::count(sub, name = "n_mains")|>
      tidytable::filter(n_mains > 1)
    
    if (nrow(ambiguous_icd_10_subs) > 0) {
      stop("ICD-10: Ambiguous sub-diagnoses found:\n", 
           paste(ambiguous_icd_10_subs$sub, collapse = "\n"),
           call. = FALSE)
    }

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
cat("remove redundancies and duplicates in diagnoses\n")
#_______________#_____________________#______________#
#_______________#_____________________#______________#
#_______________#_____________________#______________#

invisible("Put placeholder to make replacements")
SISTRAT23_c1_2010_2024_df_prev1n_mod1<-
SISTRAT23_c1_2010_2024_df_prev1n|> 
  # SEP 2025:
  # If the diagnostic is not in study or confirmed not having any and is not an empty cell, then 
  # it will look to the main diagnostic given the sub-dg.
  # It is all about constructing from SUB to MAIN
    tidytable::mutate(dg_trs_psiq_dsm_iv= tidytable::case_when(
    !is.na(diagnostico_trs_psiquiatrico_sub_dsm_iv) &
      !diagnostico_trs_psiquiatrico_sub_dsm_iv %in% c("en estudio", "sin trastorno") ~
      # Perform the lookup here
      dg_trs_psiq_dsm_iv_sub_tab$main[match(diagnostico_trs_psiquiatrico_sub_dsm_iv, dg_trs_psiq_dsm_iv_sub_tab$sub)],
    TRUE ~ diagnostico_trs_psiquiatrico_dsm_iv # Keep the original value otherwise
    #to explore differences and origin
  ))|> #filter(is.na(dg_trs_psiq_dsm_iv), !is.na(diagnostico_trs_psiquiatrico_dsm_iv))|> select(c("hash_key","dg_trs_psiq_dsm_iv","dg_trs_psiq_sub_dsm_iv", any_of(names_dg_dsmiv)))|> View()
  tidytable::mutate(x2_dg_trs_psiq_dsm_iv= tidytable::case_when(
    !is.na(x2_diagnostico_trs_psiquiatrico_sub_dsm_iv) &
    !x2_diagnostico_trs_psiquiatrico_sub_dsm_iv %in% c("en estudio", "sin trastorno") ~
      dg_trs_psiq_dsm_iv_sub_tab$main[match(x2_diagnostico_trs_psiquiatrico_sub_dsm_iv, dg_trs_psiq_dsm_iv_sub_tab$sub)],
    TRUE ~ x2_diagnostico_trs_psiquiatrico_dsm_iv 
  ))|> #filter(x2_dg_trs_psiq_dsm_iv!=x2_diagnostico_trs_psiquiatrico_dsm_iv)|> select(c("hash_key","x2_dg_trs_psiq_dsm_iv", any_of(names_dg_dsmiv)))|> glimpse()
  tidytable::mutate(x3_dg_trs_psiq_dsm_iv= tidytable::case_when(
    !is.na(x3_diagnostico_trs_psiquiatrico_sub_dsm_iv) &
    !x3_diagnostico_trs_psiquiatrico_sub_dsm_iv %in% c("en estudio", "sin trastorno") ~
      dg_trs_psiq_dsm_iv_sub_tab$main[match(x3_diagnostico_trs_psiquiatrico_sub_dsm_iv, dg_trs_psiq_dsm_iv_sub_tab$sub)],
    TRUE ~ x3_diagnostico_trs_psiquiatrico_dsm_iv 
  ))|> #filter(x3_dg_trs_psiq_dsm_iv!=x3_diagnostico_trs_psiquiatrico_dsm_iv)|> select(c("hash_key","x3_dg_trs_psiq_dsm_iv", any_of(names_dg_dsmiv)))|> glimpse()
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  #now with ICD-10 classifications
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  tidytable::mutate(dg_trs_psiq_cie_10= tidytable::case_when(
    !is.na(diagnostico_trs_psiquiatrico_sub_cie_10) &
    !diagnostico_trs_psiquiatrico_sub_cie_10 %in% c("en estudio", "sin trastorno") ~
      coalesce(
          dg_trs_psiq_icd10_sub_tab$main[match(diagnostico_trs_psiquiatrico_sub_cie_10, dg_trs_psiq_icd10_sub_tab$sub)],
          diagnostico_trs_psiquiatrico_cie_10
        ),
      #dg_trs_psiq_icd10_sub_tab$main[match(diagnostico_trs_psiquiatrico_sub_cie_10, dg_trs_psiq_icd10_sub_tab$sub)],
    TRUE ~ diagnostico_trs_psiquiatrico_cie_10 
  ))|> #filter(dg_trs_psiq_cie_10!=diagnostico_trs_psiquiatrico_cie_10)|> select(c("hash_key","dg_trs_psiq_cie_10", any_of(names_dg_icd10)))|> glimpse()
  tidytable::mutate(x2_dg_trs_psiq_cie_10= tidytable::case_when(
    !is.na(x2_diagnostico_trs_psiquiatrico_sub_cie_10) &
    !x2_diagnostico_trs_psiquiatrico_sub_cie_10 %in% c("en estudio", "sin trastorno") ~
      dg_trs_psiq_icd10_sub_tab$main[match(x2_diagnostico_trs_psiquiatrico_sub_cie_10, dg_trs_psiq_icd10_sub_tab$sub)],
    TRUE ~ x2_diagnostico_trs_psiquiatrico_cie_10 
  ))|> #filter(x2_dg_trs_psiq_cie_10!=x2_diagnostico_trs_psiquiatrico_cie_10)|> select(c("hash_key","x2_dg_trs_psiq_cie_10", any_of(names_dg_icd10)))|> glimpse()
  tidytable::mutate(x3_dg_trs_psiq_cie_10= tidytable::case_when(
    !is.na(x3_diagnostico_trs_psiquiatrico_sub_cie_10) &
    !x3_diagnostico_trs_psiquiatrico_sub_cie_10 %in% c("en estudio", "sin trastorno") ~
      dg_trs_psiq_icd10_sub_tab$main[match(x3_diagnostico_trs_psiquiatrico_sub_cie_10, dg_trs_psiq_icd10_sub_tab$sub)],
    TRUE ~ x3_diagnostico_trs_psiquiatrico_cie_10 
  ))|> #filter(x3_dg_trs_psiq_cie_10!=x3_diagnostico_trs_psiquiatrico_cie_10)|> select(c("hash_key","x3_dg_trs_psiq_cie_10", any_of(names_dg_icd10)))|> glimpse()
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  #in case of main dg but no sub-dg, DSM-IV: this empty field will be respected in the future in case replacing diagnoses
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  tidytable::mutate(dg_trs_psiq_sub_dsm_iv= tidytable::case_when(
    is.na(diagnostico_trs_psiquiatrico_sub_dsm_iv) &
    !is.na(dg_trs_psiq_dsm_iv) &
    !diagnostico_trs_psiquiatrico_dsm_iv %in% c("en estudio", "sin trastorno") ~ 
      "NA_placeholder", # Or whatever value you want to assign
    TRUE ~ diagnostico_trs_psiquiatrico_sub_dsm_iv # Keep the original value otherwise
  ))|> #filter(dg_trs_psiq_sub_dsm_iv=="NA_placeholder")|> select(c("hash_key","dg_trs_psiq_sub_dsm_iv", any_of(names_dg_dsmiv)))|> glimpse()
  tidytable::mutate(x2_dg_trs_psiq_sub_dsm_iv= tidytable::case_when(
    is.na(x2_diagnostico_trs_psiquiatrico_sub_dsm_iv) &
    !is.na(x2_dg_trs_psiq_dsm_iv) &
    !x2_diagnostico_trs_psiquiatrico_dsm_iv %in% c("en estudio", "sin trastorno") ~ 
    "NA_placeholder", # Or whatever value you want to assign
    TRUE ~ x2_diagnostico_trs_psiquiatrico_sub_dsm_iv # Keep the original value otherwise
  ))|> #filter(x2_dg_trs_psiq_sub_dsm_iv=="NA_placeholder")|> select(c("hash_key","x2_dg_trs_psiq_sub_dsm_iv", any_of(names_dg_dsmiv)))|> glimpse()
  tidytable::mutate(x3_dg_trs_psiq_sub_dsm_iv= tidytable::case_when(
    is.na(x3_diagnostico_trs_psiquiatrico_sub_dsm_iv) &
    !is.na(x3_dg_trs_psiq_dsm_iv) &
    !x3_diagnostico_trs_psiquiatrico_dsm_iv %in% c("en estudio", "sin trastorno") ~ 
    "NA_placeholder", # Or whatever value you want to assign
    TRUE ~ x3_diagnostico_trs_psiquiatrico_sub_dsm_iv # Keep the original value otherwise
  ))|> #filter(x3_dg_trs_psiq_sub_dsm_iv=="NA_placeholder")|> select(c("hash_key","x3_dg_trs_psiq_sub_dsm_iv", any_of(names_dg_dsmiv)))|> glimpse()  
  tidytable::mutate(dg_trs_psiq_sub_cie_10= tidytable::case_when(
    is.na(diagnostico_trs_psiquiatrico_sub_cie_10) &
    !is.na(dg_trs_psiq_cie_10) &
    !diagnostico_trs_psiquiatrico_cie_10 %in% c("en estudio", "sin trastorno") ~ 
    "NA_placeholder", # Or whatever value you want to assign
    TRUE ~ diagnostico_trs_psiquiatrico_sub_cie_10 # Keep the original value otherwise
  ))|> 
  tidytable::mutate(x2_dg_trs_psiq_sub_cie_10= tidytable::case_when(
    is.na(x2_diagnostico_trs_psiquiatrico_sub_cie_10) &
    !is.na(x2_dg_trs_psiq_cie_10) &
    !x2_diagnostico_trs_psiquiatrico_cie_10 %in% c("en estudio", "sin trastorno") ~
    "NA_placeholder", # Or whatever value you want to assign
    TRUE ~ x2_diagnostico_trs_psiquiatrico_sub_cie_10 # Keep the original value otherwise
  ))|> 
  tidytable::mutate(x3_dg_trs_psiq_sub_cie_10= tidytable::case_when(
    is.na(x3_diagnostico_trs_psiquiatrico_sub_cie_10) &
    !is.na(x3_dg_trs_psiq_cie_10) &
    !x3_diagnostico_trs_psiquiatrico_cie_10 %in% c("en estudio", "sin trastorno") ~
    "NA_placeholder", # Or whatever value you want to assign
    TRUE ~ x3_diagnostico_trs_psiquiatrico_sub_cie_10 # Keep the original value otherwise
  ))|> #filter(x3_dg_trs_psiq_sub_cie_10=="NA_placeholder")|> select(c("hash_key","x3_dg_trs_psiq_cie_10", any_of(names_dg_icd10)))|> glimpse()  
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  # Collapse and separate main (ICD-10)
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  # First create combined diagnosis pairs for comparison
  tidytable::mutate(
    # Create diagnosis pairs for comparison
    diag_pair_1 = paste(dg_trs_psiq_cie_10, 
                        dg_trs_psiq_sub_cie_10, sep = "::"),
    diag_pair_2 = paste(x2_dg_trs_psiq_cie_10, 
                        x2_dg_trs_psiq_sub_cie_10, sep = "::"),
    diag_pair_3 = paste(x3_dg_trs_psiq_cie_10, 
                        x3_dg_trs_psiq_sub_cie_10, sep = "::"),
    # Now flag duplicates for removal
    # SEP2025: If second pair exists and its different than pair 1, keep it
    keep_pair_2 = !is.na(diag_pair_2) & (is.na(diag_pair_1) | diag_pair_2 != diag_pair_1),
    # SEP2025: If third pair exists and its different than pair 1 and 2 or anyone of these two do not exist, keep it
    keep_pair_3 = !is.na(diag_pair_3) & 
      (is.na(diag_pair_1) | diag_pair_3 != diag_pair_1) & 
      (is.na(diag_pair_2) | diag_pair_3 != diag_pair_2)
  )|>
  # Apply the duplicate filtering
  tidytable::mutate(
    # Set the filtered columns based on duplicate flags
    # prevents the same diagnosis from being counted multiple times when we combine them into the final concatenated field
    x2_diag_filtered = tidytable::if_else(keep_pair_2, x2_dg_trs_psiq_cie_10, NA_character_),
    x2_subdiag_filtered = tidytable::if_else(keep_pair_2, x2_dg_trs_psiq_sub_cie_10, NA_character_),
    x3_diag_filtered = tidytable::if_else(keep_pair_3, x3_dg_trs_psiq_cie_10, NA_character_),
    x3_subdiag_filtered = tidytable::if_else(keep_pair_3, x3_dg_trs_psiq_sub_cie_10, NA_character_)
  )|> 
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  # Collapse and separate main (NOW FOR DSM-IV DIAGNOSES)
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  # First create combined diagnosis pairs for comparison
  mutate(
    # Create diagnosis pairs for comparison
    dsm_diag_pair_1 = paste(dg_trs_psiq_dsm_iv, 
                        dg_trs_psiq_sub_dsm_iv, sep = "::"),
    dsm_diag_pair_2 = paste(x2_dg_trs_psiq_dsm_iv, 
                        x2_dg_trs_psiq_sub_dsm_iv, sep = "::"),
    dsm_diag_pair_3 = paste(x3_dg_trs_psiq_dsm_iv, 
                        x3_dg_trs_psiq_sub_dsm_iv, sep = "::"),
    # Now flag duplicates for removal
    # SEP2025: If second pair exists and its different than pair 1, keep it
    dsm_keep_pair_2 = !is.na(dsm_diag_pair_2) & (is.na(dsm_diag_pair_1) | dsm_diag_pair_2 != dsm_diag_pair_1),
    # SEP2025: If third pair exists and its different than pair 1 and 2 or anyone of these two do not exist, keep it
    dsm_keep_pair_3 = !is.na(dsm_diag_pair_3) & 
      (is.na(dsm_diag_pair_1) | dsm_diag_pair_3 != dsm_diag_pair_1) & 
      (is.na(dsm_diag_pair_2) | dsm_diag_pair_3 != dsm_diag_pair_2)
  )|>
  # Apply the duplicate filtering
  mutate(
    # Set the filtered columns based on duplicate flags
    # prevents the same diagnosis from being counted multiple times when we combine them into the final concatenated field
    dsm_x2_diag_filtered = if_else(dsm_keep_pair_2, x2_dg_trs_psiq_dsm_iv, NA_character_),
    dsm_x2_subdiag_filtered = if_else(dsm_keep_pair_2, x2_dg_trs_psiq_sub_dsm_iv, NA_character_),
    dsm_x3_diag_filtered = if_else(dsm_keep_pair_3, x3_dg_trs_psiq_dsm_iv, NA_character_),
    dsm_x3_subdiag_filtered = if_else(dsm_keep_pair_3, x3_dg_trs_psiq_sub_dsm_iv, NA_character_)
  )

invisible("To check inconsistencies so far")
# SISTRAT23_c1_2010_2024_df_prev1n_mod1|> filter(!is.na(x2_diag_filtered))|> select("diagnostico_trs_psiquiatrico_dsm_iv", "diagnostico_trs_psiquiatrico_sub_dsm_iv", "x2_diagnostico_trs_psiquiatrico_dsm_iv", "x2_diagnostico_trs_psiquiatrico_sub_dsm_iv", "x3_diagnostico_trs_psiquiatrico_dsm_iv", "x3_diagnostico_trs_psiquiatrico_sub_dsm_iv", "diagnostico_trs_psiquiatrico_cie_10", "diagnostico_trs_psiquiatrico_sub_cie_10", "x2_diagnostico_trs_psiquiatrico_cie_10", "x2_diagnostico_trs_psiquiatrico_sub_cie_10", "x3_diagnostico_trs_psiquiatrico_cie_10", "x3_diagnostico_trs_psiquiatrico_sub_cie_10", "diagnostico_trastorno_psiquiatrico_cie_10_al_egreso","dg_trs_psiq_dsm_iv", "x2_dg_trs_psiq_dsm_iv", "x3_dg_trs_psiq_dsm_iv", "dg_trs_psiq_cie_10", "x2_dg_trs_psiq_cie_10", "x3_dg_trs_psiq_cie_10", "dg_trs_psiq_sub_dsm_iv", "x2_dg_trs_psiq_sub_dsm_iv", "x3_dg_trs_psiq_sub_dsm_iv", "dg_trs_psiq_sub_cie_10", "x2_dg_trs_psiq_sub_cie_10", "x3_dg_trs_psiq_sub_cie_10", "diag_pair_1", "diag_pair_2", "diag_pair_3", "keep_pair_2", "keep_pair_3", "x2_diag_filtered", "x2_subdiag_filtered", "x3_diag_filtered", "x3_subdiag_filtered")|> sample_n_with_seed(50, seed=2125)|> data.frame()|>  rio::export("_out/test_icd_10.xlsx")


#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
sub_dsm_iv_to_cie_10_comp_table <- rio::import(paste0(wdpath,"cons/_input/sub_dsm_iv_to_cie_10_comp_table.xlsx"))|> 
  # minusc, we changed tildes
  tidytable::mutate(across(where(is.character), 
                ~stringi::stri_trans_general(., "Latin-ASCII")))  
invisible("Is not very useful to replace DSM-IV for ICD-10 codes. We dont know the source of the homologation and 31 sub-diagnostics are not homologued")
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
cat("displace empty diagnoses\n")
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

invisible("to avoid printing the output")

#SEP 2025
#-Builds up to three “main::sub” strings (diag1–diag3) for ICD-10 using the already-filtered x2_/x3_ fields. Only build pairs when the main is informative (not NA, not “en estudio”/“sin trastorno”
#-Collapses them into a single semicolon-separated field mod_psiq_cie_10.
#-Cleans up temporary columns.

exclude_main <- c("en estudio", "sin trastorno")
exclude_sub  <- c("en estudio", "sin trastorno", "NA_placeholder")
sub_for_pair <- function(x) ifelse(is.na(x) | x %in% exclude_sub, "NA", x)

# First operation for mod_psiq_cie_10
SISTRAT23_c1_2010_2024_df_prev1n_mod1 <- SISTRAT23_c1_2010_2024_df_prev1n_mod1|>
  mutate(
    p1 = if_else(!is.na(dg_trs_psiq_cie_10) & !(dg_trs_psiq_cie_10 %in% exclude_main),
                paste0(dg_trs_psiq_cie_10, "::", sub_for_pair(dg_trs_psiq_sub_cie_10)),
                NA_character_),
    p2 = if_else(!is.na(x2_diag_filtered) & !(x2_diag_filtered %in% exclude_main),
                paste0(x2_diag_filtered, "::", sub_for_pair(x2_subdiag_filtered)),
                NA_character_),
    p3 = if_else(!is.na(x3_diag_filtered) & !(x3_diag_filtered %in% exclude_main),
                paste0(x3_diag_filtered, "::", sub_for_pair(x3_subdiag_filtered)),
                NA_character_)
  )|>
  mutate(
    mod_psiq_cie_10 = pmap_chr(
      list(p1, p2, p3),
      ~ {
        tmp <- c(...)
        tmp <- tmp[!is.na(tmp)]
        if (length(tmp)) paste(tmp, collapse = "; ") else NA_character_
      }
    )
  )|>
  select(-p1, -p2, -p3)

# Second operation for mod_psiq_dsm_iv
SISTRAT23_c1_2010_2024_df_prev1n_mod1 <- SISTRAT23_c1_2010_2024_df_prev1n_mod1|>
  mutate(
    p1 = if_else(!is.na(dg_trs_psiq_dsm_iv) & !(dg_trs_psiq_dsm_iv %in% exclude_main),
                paste0(dg_trs_psiq_dsm_iv, "::", sub_for_pair(dg_trs_psiq_sub_dsm_iv)),
                NA_character_),
    p2 = if_else(!is.na(dsm_x2_diag_filtered) & !(dsm_x2_diag_filtered %in% exclude_main),
                paste0(x2_diag_filtered, "::", sub_for_pair(x2_subdiag_filtered)),
                NA_character_),
    p3 = if_else(!is.na(dsm_x3_diag_filtered) & !(dsm_x3_diag_filtered %in% exclude_main),
                paste0(dsm_x3_diag_filtered, "::", sub_for_pair(dsm_x3_subdiag_filtered)),
                NA_character_)
  )|>
  mutate(
    mod_psiq_dsm_iv = pmap_chr(
      list(p1, p2, p3),
      ~ {
        tmp <- c(...)
        tmp <- tmp[!is.na(tmp)]
        if (length(tmp)) paste(tmp, collapse = "; ") else NA_character_
      }
    )
  )|>
  select(-p1, -p2, -p3)
# Remove memory
gc()


#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
cat("merge clean diagnoses\n")
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

# Columns to scan
cie_cols <- c("dg_trs_psiq_cie_10", "x2_diag_filtered", "x3_diag_filtered")
dsm_cols <- c("dg_trs_psiq_dsm_iv", "dsm_x2_diag_filtered", "dsm_x3_diag_filtered")
exclude_vals <- c("en estudio", "sin trastorno")
norm_diag <- function(x) {
  x <- normalize_txt(x)                # your normalizer (lowercase, accents, trim)
  ifelse(is.na(x) | trimws(x) == "", NA_character_, x)
}

SISTRAT23_c1_2010_2024_df_prev1n_mod2 <-
  SISTRAT23_c1_2010_2024_df_prev1n_mod1|>
  # 0) normalize once to compare cleanly
  tidytable::mutate(
    across(tidytable::all_of(c(cie_cols, dsm_cols)), norm_diag)
  )|>
  # 1) compute raw "any" signals
  tidytable::mutate(
    # ICD-10 signals
    cie_any_valid   = rowSums(tidytable::across(
      tidytable::all_of(cie_cols),
      ~ tidytable::coalesce(!(.x %in% exclude_vals) & !is.na(.x), FALSE)
    )) > 0,
    cie_any_instudy = rowSums(tidytable::across(
      tidytable::all_of(cie_cols),
      ~ tidytable::coalesce(.x == "en estudio", FALSE)
    )) > 0,

    # DSM-IV signals
    dsm_any_valid   = rowSums(tidytable::across(
      tidytable::all_of(dsm_cols),
      ~ tidytable::coalesce(!(.x %in% exclude_vals) & !is.na(.x), FALSE)
    )) > 0,
    dsm_any_instudy = rowSums(tidytable::across(
      tidytable::all_of(dsm_cols),
      ~ tidytable::coalesce(.x == "en estudio", FALSE)
    )) > 0
  )|>
  # 2) TRIAGE (mutually exclusive per system)
  # TRIAGE (mutually exclusive):
  # 1) If there is any valid dx -> dg=TRUE, instudy=FALSE, no_dg=FALSE
  # 2) else if any "en estudio" -> instudy=TRUE, dg=FALSE, no_dg=FALSE
  # 3) else -> no_dg=TRUE, dg=FALSE, instudy=FALSE
  tidytable::mutate(
    # ICD-10
    dg_psiq_cie_10_dg       = cie_any_valid,
    dg_psiq_cie_10_instudy  = tidytable::if_else(cie_any_valid, FALSE, cie_any_instudy),
    dg_psiq_cie_10_no_dg    = !(dg_psiq_cie_10_dg | dg_psiq_cie_10_instudy),

    # DSM-IV
    dg_psiq_dsm_iv_dg       = dsm_any_valid,
    dg_psiq_dsm_iv_instudy  = tidytable::if_else(dsm_any_valid, FALSE, dsm_any_instudy),
    dg_psiq_dsm_iv_no_dg    = !(dg_psiq_dsm_iv_dg | dg_psiq_dsm_iv_instudy)
  )|>
  # 3) clean helpers
  tidytable::select(-cie_any_valid, -cie_any_instudy, -dsm_any_valid, -dsm_any_instudy)|>  
  # drop helper columns only if you’re done with them
  tidytable::select(-tidytable::contains("_pair_"), -tidytable::ends_with("_filtered"))|>
  # broad categories (normalize both)
  tidytable::mutate(
    cie10_main_norm = normalize_txt(dg_trs_psiq_cie_10),
    dsmiv_main_norm = normalize_txt(dg_trs_psiq_dsm_iv),
    cie10_broad_cat = tidytable::if_else(
      !is.na(cie10_main_norm) & !(cie10_main_norm %in% exclude_vals),
      icd10_broad(cie10_main_norm), NA_character_),
    dsmiv_broad_cat = tidytable::if_else(
      !is.na(dsmiv_main_norm) & !(dsmiv_main_norm %in% exclude_vals),
      dsmiv_broad(dsmiv_main_norm), NA_character_)
  )

#Clean memory
gc()

SISTRAT23_c1_2010_2024_df_prev1n_mod2|> select(-(1:30))|> sample_n_with_seed(50, seed=2125)|> data.frame()|>  rio::export("_out/test_icd_10_2.xlsx")

Cases with sub-diagnostics but without the main, DSM-IV: 0Cases with sub-diagnostics but without the main, second dg., DSM-IV: 0Cases with sub-diagnostics but without the main, third dg., DSM-IV: 3Cases with sub-diagnostics but without the main, ICD-10: 0Cases with sub-diagnostics but without the main, second dg., ICD-10: 0Cases with sub-diagnostics but without the main, third dg., ICD-10: 0Cases with sub-diagnostics but without the main, DSM-IV: 0Cases with sub-diagnostics but without the main, second dg., DSM-IV: 0Cases with sub-diagnostics but without the main, third dg., DSM-IV: 3Cases with sub-diagnostics but without the main, or the main in study or with explicit non-classification (sin trastorno), ICD-10: 3Cases with sub-diagnostics but without the main, or the main in study or with explicit non-classification (sin trastorno), second dg., ICD-10: 0Cases with sub-diagnostics but without the main, or the main in study or with explicit non-classification (sin trastorno), third dg., ICD-10: 0Cases with sub-diagnostics but without the main, or the main in study or with explicit non-classification (sin trastorno), DSM-IV: 2Cases with sub-diagnostics but without the main, or the main in study or with explicit non-classification (sin trastorno), second dg., DSM-IV: 0Cases with sub-diagnostics but without the main, or the main in study or with explicit non-classification (sin trastorno), third dg., DSM-IV: 4to standardize the main category with the DSM-IV subcategory
to standardize the main category with the ICD-10 subcategory
remove redundancies and duplicates in diagnoses
displace empty diagnoses
             used    (Mb) gc trigger    (Mb)   max used    (Mb)
Ncells    5214344   278.5    9072005   484.5    9072005   484.5
Vcells 1847332020 14094.1 2793554712 21313.2 1873238641 14291.7
merge clean diagnoses
             used    (Mb) gc trigger    (Mb)   max used    (Mb)
Ncells    5215882   278.6    9072005   484.5    9072005   484.5
Vcells 1848927589 14106.2 2793554712 21313.2 1876015956 14312.9

1.2. Ethnicity

To generate a more inclusive approach to ethnic identification and assuming that ethnicity is invariable per person, all ethnicity records associated with each individual were consolidated from the original dataset (SISTRAT23_c1_2010_2024_df), preserving the diversity of self-identifications through semicolon-separated values. We excluded non-reported ethnicity data (inclusive_historical_ethnicity_by_run). We also added ethnicity data from C2 to C6. This variable is called ethnicity_c1_c6_historic.

Code

inclusive_historical_ethnicity_by_run<-
SISTRAT23_c1_2010_2024_df|> 
  tidytable::filter(etnia!="no pertenece", !is.na(etnia))|>
  tidytable::group_by(hash_key)|>
  tidytable::summarise(etnias_distinct = paste(unique(etnia), collapse = "; "))|>
  tidytable::ungroup() #|>filter(grepl(";",etnias_distinct))

c2_inclusive_historical_ethnicity_by_run<-
CONS_C2_25_df|> 
  tidytable::filter(etnia!="no pertenece", !is.na(etnia))|> 
  tidytable::group_by(HASH_KEY)|>
  tidytable::summarise(etnias_distinct = paste(unique(etnia), collapse = "; "))|>
  tidytable::ungroup() #|> filter(grepl(";",etnias_distinct))

c3_inclusive_historical_ethnicity_by_run<-
CONS_C3_25_df|> 
  tidytable::filter(etnia!="no pertenece", !is.na(etnia))|> 
  tidytable::group_by(HASH_KEY)|>
  tidytable::summarise(etnias_distinct = paste(unique(etnia), collapse = "; "))|>
  tidytable::ungroup() #|> filter(grepl(";",etnias_distinct))

c4_inclusive_historical_ethnicity_by_run<-
CONS_C4_25_df|> 
  tidytable::filter(etnia!="no pertenece", !is.na(etnia))|> 
  tidytable::group_by(HASH_KEY)|>
  tidytable::summarise(etnias_distinct = paste(unique(etnia), collapse = "; "))|>
  tidytable::ungroup() #|> filter(grepl(";",etnias_distinct))

c5_inclusive_historical_ethnicity_by_run<-
CONS_C5_25_df|> 
  tidytable::filter(etnia!="no pertenece", !is.na(etnia))|> 
  tidytable::group_by(HASH_KEY)|>
  tidytable::summarise(etnias_distinct = paste(unique(etnia), collapse = "; "))|>
  tidytable::ungroup() #|> filter(grepl(";",etnias_distinct))

c6_inclusive_historical_ethnicity_by_run<-
  if(tidytable::filter(CONS_C6_25_df, pais_nacimiento=="no pertenece")|> nrow()>0){
CONS_C6_25_df|>
  tidytable::filter(pais_nacimiento!="no pertenece", !is.na(pais_nacimiento), pais_nacimiento!="chile")|>
  tidytable::group_by(HASH_KEY)|>
  tidytable::summarise(etnias_distinct = paste(unique(pais_nacimiento), collapse = "; "))|>
  tidytable::ungroup() #|> filter(grepl(";",etnias_distinct))
  } else {
  CONS_C6_25_df|>
  tidytable::filter(etnia!="no pertenece", !is.na(etnia), etnia!="chile")|>
  tidytable::group_by(HASH_KEY)|>
  tidytable::summarise(etnias_distinct = paste(unique(etnia), collapse = "; "))|>
  tidytable::ungroup() #|> filter(grepl(";",etnias_distinct))
}

SISTRAT23_c1_2010_2024_df_prev1o<-
SISTRAT23_c1_2010_2024_df_prev1n_mod2|> 
    (\(df) {
    cat(paste0("5.Number of cases after normalization of data editing: ", formatC(nrow(df), big.mark=",")),"\n")
    cat(paste0("5.Number of patients after normalization of data editing: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
    df
  })()|>
  tidytable::left_join(inclusive_historical_ethnicity_by_run, by="hash_key", multiple="first")|>
  tidytable::rename("ethnicity_inclusive"="etnias_distinct")|> 
  tidytable::left_join(c2_inclusive_historical_ethnicity_by_run, by=c("hash_key"="HASH_KEY"), multiple="first")|>
  tidytable::rename("ethnicity_inclusive_c2"="etnias_distinct")|> 
  tidytable::left_join(c3_inclusive_historical_ethnicity_by_run, by=c("hash_key"="HASH_KEY"), multiple="first")|>
  tidytable::rename("ethnicity_inclusive_c3"="etnias_distinct")|> 
  tidytable::left_join(c4_inclusive_historical_ethnicity_by_run, by=c("hash_key"="HASH_KEY"), multiple="first")|>
  tidytable::rename("ethnicity_inclusive_c4"="etnias_distinct")|> 
  tidytable::left_join(c5_inclusive_historical_ethnicity_by_run, by=c("hash_key"="HASH_KEY"), multiple="first")|>
  tidytable::rename("ethnicity_inclusive_c5"="etnias_distinct")|> 
  tidytable::left_join(c6_inclusive_historical_ethnicity_by_run, by=c("hash_key"="HASH_KEY"), multiple="first")|>
  tidytable::rename("ethnicity_inclusive_c6"="etnias_distinct")|> 
    (\(df) {
    cat(paste0("5. After normalization and data editing, obs.: ", formatC(nrow(df), big.mark=",")),"\n")
    cat(paste0("5. After normalization and data editing, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
    if (nrow(df) > nrow(SISTRAT23_c1_2010_2024_df_prev1n_mod2))stop("Error: Added treatment episodes in the process")
    df
  })()


SISTRAT23_c1_2010_2024_df_prev1o<-
# First, split each ethnicity column by "; ", extract unique values, and combine into a new column
SISTRAT23_c1_2010_2024_df_prev1o|>
  tidytable::rowwise()|>
  tidytable::mutate(
    ethnicity_c1_c6_historic = {
      # Get all columns starting with "ethnicity_inclusive_"
      # Use the names of the dataframe directly
      all_cols <- names(SISTRAT23_c1_2010_2024_df_prev1o)
      eth_cols <- c("ethnicity_inclusive", 
                   grep("^ethnicity_inclusive_", all_cols, value = TRUE))
      
      # Extract values from these columns that exist
      eth_values <- c()
      for (col in eth_cols) {
        if (col %in% all_cols) {
          val <- get(col)
          if (!is.na(val)) eth_values <- c(eth_values, val)
        }
      }
      
      # Split each value by semicolon and flatten
      if (length(eth_values) > 0) {
        all_eth <- unlist(strsplit(eth_values, "\\s*;\\s*"))
        paste(unique(all_eth), collapse = "; ")
      } else {
        NA_character_
      }
    }
  )|>
  tidytable::ungroup()|>
  tidytable::select(-any_of((tidytable::starts_with("ethnicity_inclusive"))))

5.Number of cases after normalization of data editing: 173,728 
5.Number of patients after normalization of data editing: 121,299 
5. After normalization and data editing, obs.: 173,728 
5. After normalization and data editing, RUNs: 121,299

2. More than One Value within User, Concerning User-Invariant Variables

We need to obtain sociodemographic categories that are usually invariant for a given individual. Although this assumption is highly debatable, it allows us to detect inequalities stemming from these distinctions and their associations with social roles and stigmatization. For this purpose, we used external databases linked to SENDA agreements 2 through 6, together with hospitalization and Prosecutor’s Office databases.

2.1. Sex

Sex (sexo_2) (patients= 562). Among patients with conflicting sex values, we assigned administrative sex = female if there was strong evidence such as participation in a women-only program or recorded pregnancy; otherwise we relied on concordant external records, and left cases ‘undetermined’ when evidence was inconclusive.

The primary approach differs depending on the availability of external data (count_not_na). The system first checks for clear agreement between internal (c1_perc_mujer) and external (perc_fem_ext) data. If both sources indicate a strong majority (>50%) for the same sex, that sex is assigned (Cases 6.a.a.1, 6.a.a.2). In cases of disagreement or ambiguity (e.g., one source shows a tie at 50%, or sources point to different sexes), the decision relies on comparing the quantity of records in each source (total vs. count_not_na). Generally, the source with more records is given higher weight.

When Only Internal Data is Available (count_not_na == 0), the decision relies solely on the internal data proportion (c1_perc_mujer) and the total number of internal records (total), and consider ties.

Code

wdpath<-paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
bpmn::bpmn(paste0(wdpath, "cons/_input/diagram_inconsistent_sex.bpmn"))

Code

invalid_sex_by_patient<-
SISTRAT23_c1_2010_2024_df_prev1o|>  group_by(hash_key)|> summarise(sexo_por_hash = n_distinct(sexo), miss_sexo = sum(is.na(sexo), na.rm=T), tot_obs = n())|> ungroup()|> mutate(perc_miss_sexo = miss_sexo/tot_obs)|> filter(sexo_por_hash>1|perc_miss_sexo==1)|> pull(hash_key)

invisible("======================================================")  
invalid_sex_hashs_hosp<-  
HOSP_filter_df|> 
    filter(run %in% invalid_sex_by_patient)|>
    (\(df) {
        message(paste0("Hospital, Entries: ", nrow(df)))
        message(paste0("Hospital, RUNs: ", distinct(df, run)|> nrow()))
        df
    })()|>
    group_by(run)|>
    summarise(femenino = sum(sexo==2, na.rm=T),masculino = sum(sexo==1, na.rm=T), total=n())|>
    ungroup()|>
    mutate(hosp_perc_fem = femenino / total, hosp_perc_masc = masculino / total)|>
  (\(df) {
    message(paste0("Unambiguous values: ", nrow(filter(df, hosp_perc_masc>.5|hosp_perc_fem>.5)|>  distinct(run))))
    select(df, -any_of(c("hosp_perc_masc", "hosp_perc_fem", "total")))
  })()

Hospital, Entries: 1773

Hospital, RUNs: 408

Unambiguous values: 406

Code

# Hospital, Entries: 1656
# Hospital, RUNs: 371
#SEP 2025
# Hospital, Entries: 1773
# Hospital, RUNs: 408
# Unambiguous values: 395


invisible("======================================================")
invalid_sex_top<-  
SISTRAT23_top_2015_2024_df|>
    filter(HASH_KEY %in% invalid_sex_by_patient)|>
    (\(df) {
        message(paste0("TOP, Entries: ", nrow(df)))
        message(paste0("TOP, RUNs: ", distinct(df, HASH_KEY)|> nrow()))
        df
    })()|>
    ungroup()|> 
    group_by(HASH_KEY)|>
    summarise(femenino = sum(sexo=="mujer", na.rm=T),masculino = sum(sexo=="hombre", na.rm=T), total=n())|>
    ungroup()|>
    mutate(top_perc_fem = femenino / total, top_perc_masc = masculino / total)|>
  (\(df) {
    message(paste0("Unambiguous values: ", nrow(filter(df, top_perc_masc>.5|top_perc_fem>.5)|>  distinct(HASH_KEY))))
    select(df, -any_of(c("top_perc_masc", "top_perc_fem", "total")))
  })()

TOP, Entries: 1769

TOP, RUNs: 354

Unambiguous values: 325

Code

# TOP, Entries: 1518
# TOP, RUNs: 310
#SEP 2025
# TOP, Entries: 1769
# TOP, RUNs: 354

invisible("======================================================")
invalid_sex_c2<-  
CONS_C2_25_df|>
    filter(HASH_KEY %in% invalid_sex_by_patient)|>
    (\(df) {
        message(paste0("C2, Entries: ", nrow(df)))
        message(paste0("C2, RUNs: ", distinct(df, HASH_KEY)|> nrow()))
        df
    })()|>  
    ungroup()|>
    group_by(HASH_KEY)|>
    summarise(femenino = sum(sexo=="femenino", na.rm=T),masculino = sum(sexo=="masculino", na.rm=T), total=n())|>
    ungroup()|>
    mutate(c2_perc_fem = femenino / total, c2_perc_masc = masculino / total)|>
  (\(df) {
    message(paste0("Unambiguous values: ", nrow(filter(df, c2_perc_masc>.5|c2_perc_fem>.5)|>  distinct(HASH_KEY))))
    select(df, -any_of(c("c2_perc_masc", "c2_perc_fem", "total")))
  })()

C2, Entries: 2

C2, RUNs: 2

Unambiguous values: 2

Code

# C2, Entries: 0
# C2, RUNs: 0
# SEP 2025
# C2, Entries: 2
# C2, RUNs: 2
# Unambiguous values: 2

invisible("======================================================")
invalid_sex_c3<-  
CONS_C3_25_df|>
    filter(HASH_KEY %in% invalid_sex_by_patient)|>
    (\(df) {
        message(paste0("C3, Entries: ", nrow(df)))
        message(paste0("C3, RUNs: ", distinct(df, HASH_KEY)|> nrow()))
        df
    })()|>  
    ungroup()|>
    group_by(HASH_KEY)|>
    summarise(femenino = sum(sexo=="mujer", na.rm=T),masculino = sum(sexo=="hombre", na.rm=T), total=n())|>
    ungroup()|>
    mutate(c3_perc_fem = femenino / total, c3_perc_masc = masculino / total)|>
  (\(df) {
    message(paste0("Unambiguous values: ", nrow(filter(df, c3_perc_masc>.5|c3_perc_fem>.5)|>  distinct(HASH_KEY))))
    select(df, -any_of(c("c3_perc_masc", "c3_perc_fem", "total")))
  })()

C3, Entries: 8

C3, RUNs: 4

Unambiguous values: 4

Code

# C3, Entries: 4
# C3, RUNs: 4
# SEP 2025
# C3, Entries: 8
# C3, RUNs: 4
# Unambiguous values: 4

invisible("======================================================")
invalid_sex_c4<-  
CONS_C4_25_df|>
    filter(HASH_KEY %in% invalid_sex_by_patient)|>
    (\(df) {
        (message(paste0("C4, Entries: ", nrow(df))))
        (message(paste0("C4, RUNs: ", distinct(df, HASH_KEY)|> nrow())))
        df
    })()|>  
    ungroup()|>
    group_by(HASH_KEY)|>
    summarise(femenino = sum(sexo=="mujer", na.rm=T),masculino = sum(sexo=="hombre", na.rm=T), total=n())|>
      mutate(c4_perc_fem = femenino / total, c4_perc_masc = masculino / total)|>
    (\(df) {
      message(paste0("Unambiguous values: ", nrow(filter(df, c4_perc_masc>.5|c4_perc_fem>.5)|>  distinct(HASH_KEY))))
      select(df, -any_of(c("c4_perc_masc", "c4_perc_fem", "total")))
    })()

C4, Entries: 4

C4, RUNs: 2

Unambiguous values: 2

Code

# C4, Entries: 2
# C4, RUNs: 1
#SEP2025
# C4, Entries: 4
# C4, RUNs: 2
# Unambiguous values: 2

invisible("======================================================")
invalid_sex_c5<-  
CONS_C5_25_df|>
    filter(HASH_KEY %in% invalid_sex_by_patient)|>
    (\(df) {
        (message(paste0("C5, Entries: ", nrow(df))))
        (message(paste0("C5, RUNs: ", distinct(df, HASH_KEY)|> nrow())))
        df
    })()|>
    ungroup()|>
    group_by(HASH_KEY)|>
    summarise(femenino = sum(sexo=="femenino", na.rm=T),masculino = sum(sexo=="masculino", na.rm=T), total=n())|>
      mutate(c5_perc_fem = femenino / total, c5_perc_masc = masculino / total)|>
    (\(df) {
      message(paste0("Unambiguous values: ", nrow(filter(df, c5_perc_masc>.5|c5_perc_fem>.5)|>  distinct(HASH_KEY))))
      select(df, -any_of(c("c5_perc_masc", "c5_perc_fem", "total")))
    })()

C5, Entries: 2

C5, RUNs: 2

Unambiguous values: 2

Code

# C5, Entries: 1
# C5, RUNs: 1
# SEP 2025
# C5, Entries: 2
# C5, RUNs: 2
# Unambiguous values: 2

invisible("======================================================")
invalid_sex_c6<-  
CONS_C6_25_df|>
    filter(HASH_KEY %in% invalid_sex_by_patient)|>
    (\(df) {
        (message(paste0("C6, Entries: ", nrow(df))))
        (message(paste0("C6, RUNs: ", distinct(df, HASH_KEY)|> nrow())))
        df
    })()|>
    ungroup()|>
    group_by(HASH_KEY)|>
    summarise(femenino = sum(sexo=="mujer", na.rm=T),masculino = sum(sexo=="hombre", na.rm=T), total=n())|>
      mutate(c6_perc_fem = femenino / total, c6_perc_masc = masculino / total)|>
    (\(df) {
      message(paste0("Unambiguous values: ", nrow(filter(df, c6_perc_masc>.5|c6_perc_fem>.5)|>  distinct(HASH_KEY))))
      select(df, -any_of(c("c6_perc_masc", "c6_perc_fem", "total")))
    })()

C6, Entries: 4

C6, RUNs: 4

Unambiguous values: 4

Code

# C6, Entries: 1
# C6, RUNs: 1
# SEP 2025
# C6, Entries: 4
# C6, RUNs: 4
# Unambiguous values: 4

invisible("======================================================")
invalid_sex_mortality<-  
mortality|>
    filter(hashkey %in% invalid_sex_by_patient)|>
    (\(df) {
        (message(paste0("Mortality, Entries: ", nrow(df))))
        (message(paste0("Mortality, RUNs: ", distinct(df, hashkey)|> nrow())))
        df
    })()|>  
    distinct(hashkey, sexo)|>
    ungroup()|> 
  rename("m_sexo"="sexo")

Mortality, Entries: 14

Mortality, RUNs: 14

Code

# Mortality, Entries: 15
# Mortality, RUNs: 15
# SEP 2025
# Mortality, Entries: 14
# Mortality, RUNs: 14

invisible("======================================================")
invalid_sex_may23_PO_office<-  
OLD_NEW_SISTRAT23_c1_2010_2024_df2|>
  tidylog::right_join(Base_fiscalia_v2, by=c("HASH_KEY.y"="rut_enc_saf"))|> 
  select("HASH_KEY.x","HASH_KEY.y", "sexo.y")|> 
  filter(HASH_KEY.x %in% invalid_sex_by_patient)|>
    (\(df) {
        (message(paste0("PO Office, Entries: ", nrow(df))))
        (message(paste0("PO Office, RUNs: ", distinct(df, HASH_KEY.x)|> nrow())))
        df
    })()|>  
    group_by(HASH_KEY.x)|>
    summarise(femenino = sum(grepl("FEM", sexo.y)),masculino = sum(grepl("MASC", sexo.y)), total=n())|> 
    ungroup()|> 
    mutate(po_perc_fem = femenino / total, po_perc_masc = masculino / total)|> 
    filter(po_perc_masc>.5|po_perc_fem>.5)|>
    (\(df) {
        (message(paste0("PO Office, only clear sexes, RUNs: ", distinct(df, HASH_KEY.x)|> nrow())))
        df
    })()

right_join: added 5 columns (sexo.x, fec_nacimiento_simple, sexo.y, avg_birth_date_po, n_dis_birth_date_po)

        > rows only in OLD_NEW_SISTRAT23_c1_20.. (   56,867)
        > rows only in Base_fiscalia_v2              30,256
        > matched rows                            1,164,249    (includes duplicates)
        >                                        ===========
        > rows total                              1,194,505

PO Office, Entries: 18515

PO Office, RUNs: 483

PO Office, only clear sexes, RUNs: 480

Code

# PO Office, Entries: 18515
# NULL
# PO Office, RUNs: 483
# NULL    
# PO Office, only clear sexes, RUNs: 294
# NULL

invisible("Given that there may be many sex records by hash, we got a percentage by HASH")
invisible("We got only the unambiguous records")
invalid_sex_may23_PO_office_alt<-  
OLD_NEW_SISTRAT23_c1_2010_2024_df2_alt|>
  #discard overlappings in HASHs
  tidytable::filter(!HASH_KEY %in% OLD_NEW_SISTRAT23_c1_2010_2024_df2$HASH_KEY.x)|> 
  #join with PO Office
  tidylog::right_join(Base_fiscalia_v2, by=c("HASH_KEY_target"="rut_enc_saf"), multiple="first")|> 
  #select variables of interest
  tidytable::select("HASH_KEY","HASH_KEY_target", "sexo.y","avg_birth_date_po")|> 
  #filter incosistent birth dates only
  tidytable::filter(HASH_KEY %in% invalid_sex_by_patient)|>
    (\(df) {
        (message(paste0("PO Office (alt., Aug 2025, not deterministically matched), Entries: ", nrow(df))))
        (message(paste0("PO Office (alt., Aug 2025, not deterministically matched), RUNs: ", tidytable::distinct(df, HASH_KEY)|> nrow())))
        df
    })()|> 
    tidytable::group_by(HASH_KEY)|>
    tidytable::summarise(femenino = sum(grepl("FEM", sexo.y)),masculino = sum(grepl("MASC", sexo.y)), total=n())|> 
    tidytable::ungroup()|> 
    tidytable::mutate(po_perc_fem = femenino / total, po_perc_masc = masculino / total)|> 
    tidytable::filter(po_perc_masc>.5|po_perc_fem>.5)|>
    (\(df) {
        (message(paste0("PO Office (alt), only clear sexes, RUNs: ", tidytable::distinct(df, HASH_KEY)|> nrow())))
        df
    })()

right_join: added 5 columns (sexo.x, fec_nacimiento_simple, sexo.y, avg_birth_date_po, n_dis_birth_date_po) > rows only in tidytable::filter(OLD_N.. ( 1,858) > rows only in Base_fiscalia_v2 524,344 > matched rows 30,694 (includes duplicates) > ========= > rows total 555,038 PO Office (alt., Aug 2025, not deterministically matched), Entries: 15

PO Office (alt., Aug 2025, not deterministically matched), RUNs: 7

PO Office (alt), only clear sexes, RUNs: 7

Code

# PO Office (alt., Aug 2025, not deterministically matched), Entries: 15
# PO Office (alt., Aug 2025, not deterministically matched), RUNs: 7

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

invalid_sex_ext_info<-
SISTRAT23_c1_2010_2024_df_prev1o|>
  tidytable::filter(hash_key %in% invalid_sex_by_patient)|>
  tidytable::select(hash_key, sexo)|> 
  tidytable::group_by(hash_key)|>
  tidytable::summarise(c1_hombre = sum(grepl("hom", sexo), na.rm=T),c1_mujer = sum(grepl("muj", sexo), na.rm=T), total=n())|> 
  tidytable::ungroup()|> 
  tidytable::select(hash_key, c1_hombre, c1_mujer, total)|> 
  tidylog::left_join(invalid_sex_hashs_hosp, by=c("hash_key"="run"), multiple="first")|>
  tidytable::select(hash_key, c1_hombre, c1_mujer, total, femenino, masculino)|>
  tidytable::rename("h_masc"="masculino", "h_fem"="femenino")|> 
  tidylog::left_join(invalid_sex_top, by=c("hash_key"="HASH_KEY"), multiple="first")|>
  tidytable::rename("top_masc"="masculino", "top_fem"="femenino")|>
  tidylog::left_join(invalid_sex_c2, by=c("hash_key"="HASH_KEY"), multiple="first")|>
  tidytable::rename("c2_masc"="masculino", "c2_fem"="femenino")|>
  tidylog::left_join(invalid_sex_c3, by=c("hash_key"="HASH_KEY"), multiple="first")|>
  tidytable::rename("c3_masc"="masculino", "c3_fem"="femenino")|>
  tidylog::left_join(invalid_sex_c4, by=c("hash_key"="HASH_KEY"), multiple="first")|>
  tidytable::rename("c4_masc"="masculino", "c4_fem"="femenino")|>
  tidylog::left_join(invalid_sex_c5, by=c("hash_key"="HASH_KEY"), multiple="first")|>
  tidytable::rename("c5_masc"="masculino", "c5_fem"="femenino")|>
  tidylog::left_join(invalid_sex_c6, by=c("hash_key"="HASH_KEY"), multiple="first")|>
  tidytable::rename("c6_masc"="masculino", "c6_fem"="femenino")|>
  tidylog::left_join(invalid_sex_mortality, by=c("hash_key"="hashkey"), multiple="first")|>
  tidytable::mutate(mort_fem= if_else(m_sexo==2,1,0))|>
  tidytable::select(-m_sexo)|> 
  tidylog::left_join(invalid_sex_may23_PO_office[,c("HASH_KEY.x","femenino", "masculino")], by=c("hash_key"="HASH_KEY.x"), multiple="first")|>
  tidytable::rename("po_masc"="masculino", "po_fem"="femenino")|>
  tidylog::left_join(invalid_sex_may23_PO_office_alt[,c("HASH_KEY","femenino", "masculino")], by=c("hash_key"="HASH_KEY"), multiple="first")|> 
  tidytable::select(-total)|>
  tidytable::rename("po_alt_masc"="masculino", "po_alt_fem"="femenino")|>
      (\(df) {
        (message(paste0("Invalid sex that have at least one external sex, Entries: ", nrow(df))))
        (message(paste0("Invalid sex that have at least one external sex, RUNs: ", tidytable::distinct(df, hash_key)|> nrow())))
        df
    })()|> 
  (\(df) {
    tidytable::mutate(df, count_not_na= rowSums(select(df, 2:ncol(df)), na.rm=T))#|> 
    #tidytable::mutate(count_fem= rowSums(select(df, c("c1_mujer", "", "", )))))|>
    #tidytable::mutate(count_masc= rowSums(!is.na(select(df, c("c1_mujer", "", "", )))))
  })()|> 
  tidytable::select(hash_key, count_not_na, everything())|> #count_fem, count_masc
  tidytable::rowwise()|>
  tidytable::mutate(count_fem = sum(c1_mujer, c2_fem, c3_fem, c4_fem, c5_fem, c6_fem, po_fem, po_alt_fem, h_fem, top_fem, na.rm=T))|>
  tidytable::mutate(count_masc = sum(c1_hombre, c2_masc, c3_masc, c4_masc, c5_masc, c6_masc, po_masc, po_alt_masc, h_masc, top_masc, na.rm=T))|>
  tidytable::ungroup()|>
  tidytable::mutate(perc_fem_ext= count_fem/count_not_na, perc_masc_ext= count_masc/count_not_na)

left_join: added 2 columns (femenino, masculino) > rows only in tidytable::select(tidyt.. 154 > rows only in invalid_sex_hashs_hosp ( 0) > matched rows 408 > ===== > rows total 562 left_join: added 2 columns (femenino, masculino) > rows only in tidytable::rename(tidyt.. 208 > rows only in invalid_sex_top ( 0) > matched rows 354 > ===== > rows total 562 left_join: added 2 columns (femenino, masculino) > rows only in tidytable::rename(tidyl.. 560 > rows only in invalid_sex_c2 ( 0) > matched rows 2 > ===== > rows total 562 left_join: added 2 columns (femenino, masculino) > rows only in tidytable::rename(tidyl.. 558 > rows only in invalid_sex_c3 ( 0) > matched rows 4 > ===== > rows total 562 left_join: added 2 columns (femenino, masculino) > rows only in tidytable::rename(tidyl.. 560 > rows only in invalid_sex_c4 ( 0) > matched rows 2 > ===== > rows total 562 left_join: added 2 columns (femenino, masculino) > rows only in tidytable::rename(tidyl.. 560 > rows only in invalid_sex_c5 ( 0) > matched rows 2 > ===== > rows total 562 left_join: added 2 columns (femenino, masculino) > rows only in tidytable::rename(tidyl.. 558 > rows only in invalid_sex_c6 ( 0) > matched rows 4 > ===== > rows total 562 left_join: added one column (m_sexo) > rows only in tidytable::rename(tidyl.. 548 > rows only in invalid_sex_mortality ( 0) > matched rows 14 > ===== > rows total 562 left_join: added 2 columns (femenino, masculino) > rows only in tidytable::select(tidyt.. 82 > rows only in invalid_sex_may23_PO_of.. ( 0) > matched rows 480 > ===== > rows total 562 left_join: added 2 columns (femenino, masculino) > rows only in tidytable::rename(tidyl.. 555 > rows only in invalid_sex_may23_PO_of.. ( 0) > matched rows 7 > ===== > rows total 562 Invalid sex that have at least one external sex, Entries: 562

Invalid sex that have at least one external sex, RUNs: 562

Code

stopifnot(invalid_sex_ext_info|> group_by(hash_key)|> summarise(n= n())|> filter(n>1)|> nrow()==0)

invalid_sex_ext_info[, c("hash_key", "count_not_na", "count_fem", "count_masc", "perc_fem_ext", "perc_masc_ext")]|> mutate(hash_key= as.numeric(factor(hash_key)))|>  rio::export(paste0(getwd(),"/_out/invalid_sex_ext_info.xlsx"))

message(paste0("Inconsistent sex: ", length(invalid_sex_by_patient)))

Inconsistent sex: 562

Code

message(paste0("HASHs with predominant sex: ",
invalid_sex_ext_info[, c("hash_key", "count_not_na", "count_fem", "count_masc", "perc_fem_ext", "perc_masc_ext")]|> filter(perc_fem_ext>0.5| perc_masc_ext>.5)|> nrow()," (",
scales::percent(invalid_sex_ext_info[, c("hash_key", "count_not_na", "count_fem", "count_masc", "perc_fem_ext", "perc_masc_ext")]|> filter(perc_fem_ext>0.5| perc_masc_ext>.5)|> nrow()/length(invalid_sex_by_patient)),")"
))

HASHs with predominant sex: 542 (96%)

Code

#HASHs with predominant sex: 542 (96%)


invalid_sex_ext_info_nondet<- 
invalid_sex_ext_info[, c("hash_key", "count_not_na", "count_fem", "count_masc", "perc_fem_ext", "perc_masc_ext")]|> filter(perc_fem_ext<=0.5& perc_masc_ext<=.5)
#00788d541c22e1960d361c4dc856d47d0b566be564313df58b32e1149d05ab31

invalid_sex_ext_info$decision <- ifelse(invalid_sex_ext_info$perc_fem_ext >.5 | invalid_sex_ext_info$perc_masc_ext >.5, "det","nondet")

# Hospital, Entries: 1773
# Hospital, RUNs: 408
# Unambiguous values: 406
# TOP, Entries: 1769
# TOP, RUNs: 354
# Unambiguous values: 325
# C2, Entries: 2
# C2, RUNs: 2
# Unambiguous values: 2
# C3, Entries: 8
# C3, RUNs: 4
# Unambiguous values: 4
# C4, Entries: 4
# C4, RUNs: 2
# Unambiguous values: 2
# C5, Entries: 2
# C5, RUNs: 2
# Unambiguous values: 2
# C6, Entries: 4
# C6, RUNs: 4
# Unambiguous values: 4
# Mortality, Entries: 14
# Mortality, RUNs: 14

For patients whose sex remained undetermined (n= 20) after the initial classifications, we used pregnancy status and program type information to aid in the final determination. We also checked information of this kind in C1 to C6 databases (about pregnancy status). At last, we also used gender identity as a proxy.

Where conflicting sex values existed, we derived an administrative sex field using concordant external records; women-only program participation and recorded pregnancy were treated as strong evidence; ICD-10 sex-specific codes were considered supportive only. Records with inconclusive evidence remained ‘undetermined’. Gender identity was recorded separately and never used to overwrite administrative sex.

Code

c1_6_sex_ext_data<-
  #group by RUN and count records of pregnancy and type of program=women
group_by(subset(SISTRAT23_c1_2010_2024_df_prev1o, hash_key %in% c(invalid_sex_ext_info_nondet$hash_key)), hash_key)|> summarise(n_embarazada= sum(pregnant=="si", na.rm=T), n_emb_egr= sum(pregnant_disch=="si", na.rm=T), n_prog_mujeres= sum(grepl("^m",plan_type),na.rm=T),.groups="drop_last")|>
  #
  mutate(pregnancy_c2= ifelse(hash_key %in% (filter(CONS_C2_25_df, a_setratadeunamujerembaraza=="si" | ha_estado_embarazada_egreso=="si")|> pull(HASH_KEY)),1,0))|>
  #
  mutate(pregnancy_c3= ifelse(hash_key %in% (filter(CONS_C3_25_df, se_trata_de_una_mujer_embarazada=="si" | ha_estado_embarazada_egreso=="si")|> pull(HASH_KEY)),1,0))|>
  #
  mutate(pregnancy_c4= ifelse(hash_key %in% (filter(CONS_C4_25_df, se_trata_de_una_mujer_embarazada=="si" | ha_estado_embarazada_egreso=="si")|> pull(HASH_KEY)),1,0))|>
  #
  mutate(pregnancy_c5= ifelse(hash_key %in% (filter(CONS_C5_25_df, embarazo=="si" | ha_estado_embarazada_egreso=="si")|> pull(HASH_KEY)),1,0))|>
  #
  mutate(pregnancy_c6= ifelse(hash_key %in% (filter(CONS_C6_25_df, se_trata_de_una_mujer_embarazada=="si" | ha_estado_embarazada_egreso=="si")|> pull(HASH_KEY)),1,0))
#c2 to c6 didnt add info

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

invalid_sex_ext_info_nondet[, c("hash_key", "count_not_na", "count_fem", "count_masc", "perc_fem_ext", "perc_masc_ext")]|>
left_join(c1_6_sex_ext_data, by="hash_key")|> 
  mutate(ext_data_woman= ifelse(n_embarazada>0|n_emb_egr>0|n_prog_mujeres>0,1,0))|> 
  mutate(ext_data_woman2= ifelse(pregnancy_c2>0|pregnancy_c3>0|pregnancy_c4>0|pregnancy_c5>0|pregnancy_c6>0,1,0))|> 
    (\(df) {
    message(paste0("Non-determined sex with pregnancy status: ", filter(df, ext_data_woman==1|ext_data_woman2==1)|> nrow()))
      filter(df, ext_data_woman==1|ext_data_woman2==1)|> pull(hash_key) ->> hashs_invalid_sex_nondet_pregnant
  })()

Non-determined sex with pregnancy status: 4

Code

#Non-determined sex with pregnancy status: 4


#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
infer_sex_icd10 <- function(icd_codes) {
  # Enhanced female-specific ICD-10 patterns
  female_patterns <- c(
    "^O",              # Pregnancy/childbirth (O00-O99)
    "^C5[1-8]",        # Female genital cancers (C51-C58)
    "^D0[6-7]",        # CIS female genital (D06-D07)
    "^D2[4-8]",        # Benign female neoplasms (D24-D28)
    "^N7[0-7]",        # PID (N70-N77)
    "^N8[0-9]|^N9[0-8]", # Non-inflammatory disorders (N80-N98)
    "^Q5[0-2]",        # Congenital female anomalies (Q50-Q52)
    "^Z12\\.4",        # Female cancer screening
    "^Z3[0-9]"         # Reproductive health encounters (Z30-Z39)
  )
  
  # Enhanced male-specific ICD-10 patterns
  male_patterns <- c(
    "^N[4-5][0-9]",    # Male genital disorders (N40-N51)
    "^C6[0-3]",        # Male genital cancers (C60-C63)
    "^D29",            # Benign male neoplasms
    "^Q5[3-5]",        # Congenital male anomalies (Q53-Q55)
    "^Z12\\.5",        # Prostate screening
    "^Z41\\.2",        # Vasectomy
    "^Z90\\.7"         # Acquired absence of male genital
  )
  
  # Check matches
  is_female <- map_lgl(icd_codes, ~ any(stringr::str_detect(.x, female_patterns)))
  is_male <- map_lgl(icd_codes, ~ any(stringr::str_detect(.x, male_patterns)))
  
  case_when(
    is_female & !is_male ~ "Female",
    is_male & !is_female ~ "Male",
    is_female & is_male ~ "Conflict",
    TRUE ~ "nondet"
  )
}

cat("Classification based on ICD-10 diagnoses in hospitalizations")
HOSP_filter_df|> 
  mutate(sex= infer_sex_icd10(diag1))|> 
  janitor::tabyl(sex)
#   sex      n     percent
# Female  59075 0.252376375
#  Male   2030 0.008672434
# nondet 172970 0.738951191

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
HOSP_filter_df|> 
    filter(run %in% setdiff(invalid_sex_ext_info_nondet$hash_key, hashs_invalid_sex_nondet_pregnant))|>
    mutate(sex= infer_sex_icd10(diag1))|> 
    (\(df) {
        (message(paste0("Hospital, suggested sex (w/o pregnancy status), Entries: ", nrow(df))))
        (message(paste0("Hospital, suggested sex (w/o pregnancy status), RUNs: ", distinct(df, run)|> nrow())))
        df
    })()|>
    select(run, sex)|>
    summarise(female_icd10= sum(sex=="Female", na.rm=T), male_icd10= sum(sex=="Male", na.rm=T), .by= run,.groups="drop_last")|>
    filter(female_icd10>0|male_icd10>0)|> 
      (\(df) {
    cat(paste0("Cases with undetermined sex, lacking pregnancy information, but having available hospitalization records with ICD-10 diagnoses: ", nrow(df),"\n"))
    filter(df, female_icd10>0)|> pull(run) ->> hashs_invalid_sex_woman_ask_non_pregnant_but_icd10
    filter(df, male_icd10>0)|> pull(run) ->> hashs_invalid_sex_man_ask_not_pregnant_but_icd10
  })()

Hospital, suggested sex (w/o pregnancy status), Entries: 16

Hospital, suggested sex (w/o pregnancy status), RUNs: 7

Code

# Hospital, suggested sex (w/o pregnancy status), Entries: 16
# Hospital, suggested sex (w/o pregnancy status), RUNs: 7
#Cases with undetermined sex, lacking pregnancy information, but having available hospitalization records with ICD-10 diagnoses: 4

Classification based on ICD-10 diagnoses in hospitalizations    sex      n     percent
 Female  59075 0.252376375
   Male   2030 0.008672434
 nondet 172970 0.738951191
Cases with undetermined sex, lacking pregnancy information, but having available hospitalization records with ICD-10 diagnoses: 4

Undetermined sex classifications were retained, recognizing that for these records, inferring sex from external data might be unreliable.

We retained ‘undetermined’ where evidence was inconclusive (n= 12). Discrepancies may reflect data entry differences, changes in recorded gender identity, or linkage uncertainty. The remaining cases were classified using probabilistic imputation using missRanger (predictive mean matching).

Code

#for HASHs with undetermined sex, we used records
c1_inconsistent_sex_genid_10_24<- 
SISTRAT23_c1_2010_2024_df_prev1o|> 
  filter(hash_key %in% setdiff(invalid_sex_ext_info_nondet$hash_key, c(hashs_invalid_sex_nondet_pregnant, hashs_invalid_sex_woman_ask_non_pregnant_but_icd10, hashs_invalid_sex_man_ask_not_pregnant_but_icd10)))|> 
  select(hash_key, identidad_de_genero)|> 
  group_by(hash_key)|> 
  summarise(prop_fem_genid=sum(grepl("fem",identidad_de_genero),na.rm=T)/sum(!is.na(identidad_de_genero), na.rm=T), prop_masc_genid=sum(grepl("masc",identidad_de_genero),na.rm=T)/sum(!is.na(identidad_de_genero), na.rm=T))|>
  ungroup()|> 
  mutate(prop_fem_genid= ifelse(is.nan(prop_fem_genid),0, prop_fem_genid), prop_masc_genid= ifelse(is.nan(prop_masc_genid),0, prop_masc_genid))|> 
  filter(prop_fem_genid>0.5|prop_masc_genid>0.5)|> 
    (\(df) {
        cat(paste0("Undetermined records in sex, with defined gender id: ", formatC(nrow(df), big.mark=",")),"\n")
        df
    })()
  #Undetermined records in sex, with defined gender id: 6 

message(paste0("Candidate hashs for probabilistic imputation: ",
setdiff(invalid_sex_ext_info_nondet$hash_key, 
    c(hashs_invalid_sex_nondet_pregnant, 
    hashs_invalid_sex_woman_ask_non_pregnant_but_icd10, 
    hashs_invalid_sex_man_ask_not_pregnant_but_icd10, 
    c1_inconsistent_sex_genid_10_24$hash_key))|> 
length()))

Candidate hashs for probabilistic imputation: 6

Code

#Candidate hashs for probabilistic imputation: 6

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

# 0.a) Pick holdout rows (exclude your truncated set)
set.seed(2125)
holdout_sex_rn <-
  SISTRAT23_c1_2010_2024_df_prev1o|>
  #sep 2025, dont add imputations here, because this are the "clean" candidates for accuracy indexes
  #corrected to get processed days in treatment also valid
  tidytable::filter(!is.na(sexo), 
  !hash_key %in% setdiff(invalid_sex_ext_info_nondet$hash_key, c(hashs_invalid_sex_nondet_pregnant, hashs_invalid_sex_woman_ask_non_pregnant_but_icd10, hashs_invalid_sex_man_ask_not_pregnant_but_icd10, c1_inconsistent_sex_genid_10_24$hash_key)))|>
  tidytable::slice_sample(n = 1000)|>
  tidytable::pull(rn)

# 0.b) Build mr_in and MASK the holdout on the log target
mr_in_sex <-
  SISTRAT23_c1_2010_2024_df_prev1o|>
  tidytable::mutate(sexo_imp= ifelse(hash_key %in% 
        setdiff(invalid_sex_ext_info_nondet$hash_key, 
        c(hashs_invalid_sex_nondet_pregnant, 
          hashs_invalid_sex_woman_ask_non_pregnant_but_icd10, 
          hashs_invalid_sex_man_ask_not_pregnant_but_icd10, 
          c1_inconsistent_sex_genid_10_24$hash_key)),NA, sexo))|>
  tidytable::mutate(sexo_imp= ifelse(rn %in% holdout_sex_rn, NA, sexo_imp))

# 0.c) Impute through missranger
set.seed(2125)
mr_fit_sex <- missRanger::missRanger(
  data      = mr_in_sex,
  formula   = sexo_imp ~ . -rn -hash_key -sexo,
  pmm.k     = 9,
  num.trees = 2e2,#3e3
  keep_forests = F,#too heavy
  maxiter   = 5,
  mtry      = function(p) max(5L, floor(p/3L)),
  seed      = 2125,
  returnOOB = TRUE,
  verbose   = 1
)

Missing value imputation by random forests

Code

# Estimated remaining time: 51 seconds.

# 0.d) Predictive performance: OOB
oob_err_sex <- attr(mr_fit_sex, "oob")[["sexo_imp"]]   # misclassification rate
#misclassification error rate
cat(sprintf("OOB accuracy (sexo_imp): %.1f%%\n", 100 * (1 - oob_err_sex)))
#OOB accuracy (sexo_imp): 89.4%

eval_tbl_sex <- mr_fit_sex|>
  dplyr::filter(rn %in% holdout_sex_rn)|>
  dplyr::mutate(
    sexo     = factor(sexo, levels = c("hombre","mujer")),
    sexo_imp = factor(sexo_imp, levels = c("hombre","mujer"))
  )

# 0.e) Holdout
# --- Holdout evaluation (days) ---
eval_tbl_sex <-
  mr_fit_sex|>
  filter(rn %in% holdout_sex_rn)|> 
  mutate(
  sexo      = factor(sexo, levels = c("hombre","mujer"), labels = c("hombre","mujer")),
  sexo_imp  = factor(sexo_imp, levels = c(1,2), labels = c("hombre","mujer"))
)

tab <- table(truth = eval_tbl_sex$sexo, pred = eval_tbl_sex$sexo_imp)

# 0.f) Metrics and perfomance
metrics_from_table <- function(tab) {
  TP <- tab["mujer","mujer"]
  FN <- tab["mujer","hombre"]
  FP <- tab["hombre","mujer"]
  TN <- tab["hombre","hombre"]
  N  <- sum(tab)
  
  accuracy <- (TP + TN) / N
  sensitivity <- if ((TP+FN)>0) TP / (TP + FN) else NA_real_
  specificity <- if ((TN+FP)>0) TN / (TN + FP) else NA_real_
  precision   <- if ((TP+FP)>0) TP / (TP + FP) else NA_real_
  f1 <- if (!is.na(precision) && !is.na(sensitivity) && (precision + sensitivity)>0) 2*precision*sensitivity/(precision+sensitivity) else NA_real_
  balanced <- mean(c(sensitivity, specificity), na.rm = TRUE)
  
  c(accuracy = accuracy, sensitivity = sensitivity, specificity = specificity,
    precision = precision, F1 = f1, balanced_accuracy = balanced)
}

set.seed(2125)
R <- 2000
boots <- replicate(R, {
  samp <- eval_tbl_sex[sample.int(nrow(eval_tbl_sex), replace = TRUE), , drop = FALSE]
  tab <- table(truth = samp$sexo, pred = samp$sexo_imp)
  metrics_from_table(tab)
}, simplify = "array")

# 95% bootstrap CIs (percentile)
ci <- apply(boots, 1, function(x) quantile(x, probs = c(0.025, 0.975), na.rm = TRUE))
res <- tibble::tibble(
  metric = names(metrics_from_table(table(truth = eval_tbl_sex$sexo, pred = eval_tbl_sex$sexo_imp))),
  estimate = unname(metrics_from_table(table(truth = eval_tbl_sex$sexo, pred = eval_tbl_sex$sexo_imp))),
  ci_lower = ci[1,],
  ci_upper = ci[2,]
)

knitr::kable(res, "markdown", caption= "Predictive performance", digits=2)
#Reasonable discrimination but relatively low positive predictive value for the mujer class.

prob_imputation_sex<- 
mr_fit_sex|>
filter(hash_key %in% setdiff(invalid_sex_ext_info_nondet$hash_key, 
      c(hashs_invalid_sex_nondet_pregnant, 
        hashs_invalid_sex_woman_ask_non_pregnant_but_icd10, 
        hashs_invalid_sex_man_ask_not_pregnant_but_icd10, 
        c1_inconsistent_sex_genid_10_24$hash_key)))|>
group_by(hash_key)|> 
summarise(perc_fem_imp= sum(sexo_imp==2, na.rm=T)/ n())|>
ungroup()|>
filter(!perc_fem_imp==.5)|>
mutate(sexo_imp= ifelse(perc_fem_imp>.5, "mujer", "hombre"))

#Remove imputation databases
rm(list = intersect(ls(), c("mr_fit_sex","mr_fit")))

Undetermined records in sex, with defined gender id: 6 

Skip constant features for imputation:  nombre_usuario
Variables to impute:        sexo_imp
Variables used to impute:   TABLE, codigo_identificacion, tipo_de_programa, tipo_de_plan, senda, n_meses_en_tratamiento, n_meses_en_senda, origen_de_ingreso, nacionalidad, estado_conyugal, numero_de_hijos, condicion_ocupacional, con_quien_vive, sustancia_principal, edad_inicio_sustancia_principal, via_administracion_sustancia_principal, diagnostico_trs_consumo_sustancia, fecha_ingreso_a_tratamiento, TABLE_rec, birth_date, adm_date, OBS, adm_date_num, adm_date_rec, adm_date_rec_num, adm_age, birth_date_rec, adm_age_rec, plan_type, dit_earl_drop, tr_compliance, occupation_condition, sub_dep_icd10_status, adm_motive, macrozone_center, yr_block, dit_earl_drop_rec0, tr_compliance_rec0, tr_compliance_rec, tr_compliance_rec5, adm_date_rec2, tr_compliance_rec6, adm_age_rec2, adm_date_num_rec2, dit_earl_drop_rec, adm_age_rec3, dg_psiq_cie_10_dg, dg_psiq_cie_10_instudy, dg_psiq_cie_10_no_dg, dg_psiq_dsm_iv_dg, dg_psiq_dsm_iv_instudy, dg_psiq_dsm_iv_no_dg, sexo_imp

iter 1 

  |                                                                            
  |                                                                      |   0%Growing trees.. Progress: 40%. Estimated remaining time: 47 seconds.
Growing trees.. Progress: 78%. Estimated remaining time: 17 seconds.

  |                                                                            
  |======================================================================| 100%
iter 2 

  |                                                                            
  |                                                                      |   0%Growing trees.. Progress: 39%. Estimated remaining time: 49 seconds.
Growing trees.. Progress: 79%. Estimated remaining time: 16 seconds.

  |                                                                            
  |======================================================================| 100%
iter 3 

  |                                                                            
  |                                                                      |   0%Growing trees.. Progress: 40%. Estimated remaining time: 46 seconds.
Growing trees.. Progress: 81%. Estimated remaining time: 15 seconds.

  |                                                                            
  |======================================================================| 100%
OOB accuracy (sexo_imp): 89.3%

Predictive performance
metric	estimate	ci_lower	ci_upper
accuracy	0.75	0.72	0.77
sensitivity	0.65	0.59	0.70
specificity	0.78	0.75	0.81
precision	0.50	0.45	0.56
F1	0.57	0.52	0.62
balanced_accuracy	0.71	0.68	0.75

The imputation obtained a reasonable discrimination but relatively low positive predictive value for the “mujer” category. However, we must consider that the target of imputed values are only 4 HASHes.

Code

# 1) Key sets by source (female / male) ---
ext_fem_keys <- subset(invalid_sex_ext_info, decision == "det"& perc_fem_ext>.5)$hash_key
ext_masc_keys <- subset(invalid_sex_ext_info, decision == "det" & perc_masc_ext>.5)$hash_key  

preg_keys      <- hashs_invalid_sex_nondet_pregnant

icd10_fem_keys <- hashs_invalid_sex_woman_ask_non_pregnant_but_icd10
icd10_masc_keys<- hashs_invalid_sex_man_ask_not_pregnant_but_icd10  # hombre

genid_masc_keys <- subset(c1_inconsistent_sex_genid_10_24, prop_masc_genid > .5)$hash_key
genid_fem_keys  <- subset(c1_inconsistent_sex_genid_10_24, prop_fem_genid > .5)$hash_key

imp_masc_keys <- subset(prob_imputation_sex, sexo_imp == "hombre")$hash_key
imp_fem_keys  <- subset(prob_imputation_sex, sexo_imp == "mujer")$hash_key

# 2) Integrate into main table: create sex_rec + annotate OBS ---
SISTRAT23_c1_2010_2024_df_prev1p <-
  SISTRAT23_c1_2010_2024_df_prev1o|>
  (\(df) {
    cat(paste0("6.Number of cases before resolving inconsistencies in sex: ",
               formatC(nrow(df), big.mark=",")),"\n")
    cat(paste0("6.Number of patients before resolving inconsistencies in sex: ",
               formatC(nrow(tidytable::distinct(df, hash_key)), big.mark=",")),"\n")
    df
  })()|>
  # Start with original value; OBS as character to safely append text
  tidytable::mutate(
    sex_rec = sexo,
    OBS     = as.character(OBS)
  )|>
  # 6.1 External majority vote (C1–C6/Hosp/Mort/PO/Top)
  (\(df) {
    cond_ext <- (df$hash_key %in% c(ext_fem_keys, ext_masc_keys))
      tidytable::mutate(df,
        sex_rec = tidytable::case_when(
          cond_ext & hash_key %in% ext_fem_keys  ~ "mujer",
          cond_ext & hash_key %in% ext_masc_keys ~ "hombre",
          TRUE ~ sex_rec
        ),
        OBS = ifelse(cond_ext,
                     paste0(dplyr::coalesce(OBS, ""),
                            ";6.1.sex_rec via external majority"),OBS))
  })()|>
  # 6.2 Pregnancy signals → mujer
  (\(df) {
    cond_preg <- (df$hash_key %in% preg_keys)
      tidytable::mutate(df,
        sex_rec = ifelse(cond_preg, "mujer", sex_rec),
        OBS     = ifelse(cond_preg,
                         paste0(dplyr::coalesce(OBS, ""),
                                ";6.2.sex_rec via pregnancy"),OBS))
  })()|>
  # 6.3 ICD-10 inference from hospitalizations
  (\(df) {
    cond_icd <- (df$hash_key %in% c(icd10_fem_keys, icd10_masc_keys))
      tidytable::mutate(df,
        sex_rec = dplyr::case_when(
          cond_icd & hash_key %in% icd10_fem_keys   ~ "mujer",
          cond_icd & hash_key %in% icd10_masc_keys  ~ "hombre",
          TRUE ~ sex_rec
        ),
        OBS = ifelse(cond_icd,
                     paste0(dplyr::coalesce(OBS, ""),
                            ";6.3.sex_rec via ICD-10 (hospitalization)"),OBS))
  })()|>
  # 6.4 Gender identity majority
  (\(df) {
    cond_genid <- (df$hash_key %in% c(genid_fem_keys, genid_masc_keys))
      tidytable::mutate(df,
        sex_rec = dplyr::case_when(
          cond_genid & hash_key %in% genid_fem_keys   ~ "mujer",
          cond_genid & hash_key %in% genid_masc_keys  ~ "hombre",
          TRUE ~ sex_rec
        ),
        OBS = ifelse(cond_genid,
                     paste0(dplyr::coalesce(OBS, ""),
                            ";6.4.sex_rec via gender identity"),OBS))
  })()|>
  # 6.5 Probabilistic imputation (leftovers only)
  (\(df) {
    cond_imp <- (df$hash_key %in% c(imp_fem_keys, imp_masc_keys))
      tidytable::mutate(df,
        sex_rec = dplyr::case_when(
          cond_imp & hash_key %in% imp_fem_keys   ~ "mujer",
          cond_imp & hash_key %in% imp_masc_keys  ~ "hombre",
          TRUE ~ sex_rec
        ),
        OBS = ifelse(cond_imp,
                     paste0(dplyr::coalesce(OBS, ""),
                            ";6.5.sex_rec via probabilistic imputation"),OBS))
  })()|>
  # Final sanity log
  (\(df) {
    cat(paste0("6. After resolving inconsistencies in sex, obs.: ",
               formatC(nrow(df), big.mark=",")),"\n")
    cat(paste0("6. After resolving inconsistencies in sex, RUNs: ",
               formatC(nrow(tidytable::distinct(df, hash_key)), big.mark=",")),"\n")
    if (nrow(df) > nrow(SISTRAT23_c1_2010_2024_df_prev1o))
      stop("Error: Added treatment episodes in the process")
    df
  })()

SISTRAT23_c1_2010_2024_df_prev1p$OBS <- gsub(";;+", ";", SISTRAT23_c1_2010_2024_df_prev1p$OBS)

6.Number of cases before resolving inconsistencies in sex: 173,728 
6.Number of patients before resolving inconsistencies in sex: 121,299 
6. After resolving inconsistencies in sex, obs.: 173,728 
6. After resolving inconsistencies in sex, RUNs: 121,299

The database resulting from these changes was named SISTRAT23_c1_2010_2024_df_prev1p, and the new variable containing the recoded sex information is called sex_rec.

2.2. Nationality

Nationality (nacionalidad) (n= 162). We created a column called nationality_cons showing what those inconsistent nationalities are for each affected patient.

Code

invisible("no tiene perdidos, la mayoría son de chile.  Por tanto, si es distinto a Chile, reemplazarlo")

invalid_nationality_by_patient<-
SISTRAT23_c1_2010_2024_df_prev1p|>  group_by(hash_key)|> summarise(nacionalidades_por_hash = n_distinct(nacionalidad), distinto_chile = sum(nacionalidad!="chile", na.rm=T), tot_obs = n())|> ungroup()|> mutate(perc_extranjero = distinto_chile/tot_obs)|> filter(nacionalidades_por_hash>1)|> pull(hash_key)

multiple_nationalities<- 
SISTRAT23_c1_2010_2024_df_prev1p|> select(hash_key, nacionalidad)|>  filter(hash_key %in% invalid_nationality_by_patient)|> 
  summarise(nacionalidad_distinct = paste(sort(unique(nacionalidad)), collapse = "; "), .by="hash_key")|>
  ungroup()

2.3. Starting substance

Starting Substance (first_sub_used) (n= 16,106). For users that had only two treatments but a different starting substance, or in cases or users that had ties within most recent database or within the most recent value, we added a second and a third variable called sus_ini_2 and sus_ini_3 that contains a second starting substance. We also made sus_ini_mvv for starting substances of the most vulnerable, or higher-risk (operational ranking in our context) value reported (Cocaine paste base > Cocaine hydrochloride > Alcohol > Marijuana > Other).

Code

invalid_start_subs_hash_key<- 
  SISTRAT23_c1_2010_2024_df_prev1p|> summarise(sus_ini_por_hash = n_distinct(first_sub_used),.by=hash_key, .groups="drop_last")|> filter(sus_ini_por_hash>1)|> pull(hash_key)

message(paste0("Patients with more than one starting substance: ",length(invalid_start_subs_hash_key)))

Patients with more than one starting substance: 16106

Code

#Patients with more than one starting substance: 16106

cat("Number of distinct starting substances\n")
SISTRAT23_c1_2010_2024_df_prev1p|> summarise(sus_ini_por_hash = n_distinct(first_sub_used),.by=hash_key, .groups="drop_last")|> filter(sus_ini_por_hash>1)|> pull(sus_ini_por_hash)|> summary()
#  Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 2.000   2.000   2.000   2.147   2.000   5.00
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:  
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:  
invisible("======================================================")
invalid_start_subs_c2<-  
    CONS_C2_25_df|>
    tidytable::filter(HASH_KEY %in% invalid_start_subs_hash_key)|>
    (\(df) {
        (message(paste0("C2, Entries: ", nrow(df))))
        (message(paste0("C2, RUNs: ", distinct(df, HASH_KEY)|> nrow())))
        df
    })()|>  
    tidytable::select(HASH_KEY, sustancia_de_inicio)|>
    tidytable::rename("start_sub"=2)|> 
    tidytable::mutate(start_sub= tidytable::case_when(grepl("coca",start_sub)~ "cocaine powder", grepl("crack|pasta",start_sub)~ "cocaine paste", grepl("marihuana",start_sub)~ "marijuana", grepl("anfeta|extasis|fenil|estimul",start_sub)~ "amphetamine-type stimulants", grepl("alucin|lsd|hongos",start_sub)~ "hallucinogens", grepl("opi|hero|metadona",start_sub)~ "opioids", grepl("sedante|hipnotico|tranquiliz",start_sub)~ "tranquilizers/hypnotics", grepl("inhalable",start_sub)~ "inhalants", grepl("esteroid|otros",start_sub)~"others", grepl("especif|cip-crc|sin consumo",start_sub)~ NA_character_, TRUE~start_sub))|>
    tidytable::ungroup()|> 
    tidytable::group_by(HASH_KEY)|>
    tidytable::summarise(
    alcohol                     = sum(start_sub == "alcohol", na.rm=T),
    amphetamine_type_stimulants = sum(start_sub == "amphetamine-type stimulants", na.rm=T),
    cocaine_paste               = sum(start_sub == "cocaine paste", na.rm=T),
    cocaine_powder              = sum(start_sub == "cocaine powder", na.rm=T),
    hallucinogens               = sum(start_sub == "hallucinogens", na.rm=T),
    inhalants                   = sum(start_sub == "inhalants", na.rm=T),
    marijuana                   = sum(start_sub == "marijuana", na.rm=T),
    opioids                     = sum(start_sub == "opioids", na.rm=T),
    others                      = sum(start_sub == "others", na.rm=T),
    tranquilizers_hypnotics     = sum(start_sub == "tranquilizers/hypnotics", na.rm=T),
    total                       = n(),       # total records per hash_key
    .groups = "drop"
  )

C2, Entries: 253

C2, RUNs: 112

Code

invisible("======================================================")
invalid_start_subs_c3<-  
    CONS_C3_25_df|>
    tidytable::filter(HASH_KEY %in% invalid_start_subs_hash_key)|>
    (\(df) {
        (message(paste0("C3, Entries: ", nrow(df))))
        (message(paste0("C3, RUNs: ", distinct(df, HASH_KEY)|> nrow())))
        df
    })()|>  
    tidytable::select(HASH_KEY, sustancia_de_inicio)|>
    tidytable::rename("start_sub"=2)|> 
    tidytable::mutate(start_sub= tidytable::case_when(grepl("coca",start_sub)~ "cocaine powder", grepl("crack|pasta",start_sub)~ "cocaine paste", grepl("marihuana",start_sub)~ "marijuana", grepl("anfeta|extasis|fenil|estimul",start_sub)~ "amphetamine-type stimulants", grepl("alucin|lsd|hongos",start_sub)~ "hallucinogens", grepl("opi|hero|metadona",start_sub)~ "opioids", grepl("sedante|hipnotico|tranquiliz",start_sub)~ "tranquilizers/hypnotics", grepl("inhalable",start_sub)~ "inhalants", grepl("esteroid|otros",start_sub)~"others", grepl("especif|cip-crc|sin consumo",start_sub)~ NA_character_, TRUE~start_sub))|>
    tidytable::ungroup()|> 
    tidytable::group_by(HASH_KEY)|>
    tidytable::summarise(
    alcohol                     = sum(start_sub == "alcohol", na.rm=T),
    amphetamine_type_stimulants = sum(start_sub == "amphetamine-type stimulants", na.rm=T),
    cocaine_paste               = sum(start_sub == "cocaine paste", na.rm=T),
    cocaine_powder              = sum(start_sub == "cocaine powder", na.rm=T),
    hallucinogens               = sum(start_sub == "hallucinogens", na.rm=T),
    inhalants                   = sum(start_sub == "inhalants", na.rm=T),
    marijuana                   = sum(start_sub == "marijuana", na.rm=T),
    opioids                     = sum(start_sub == "opioids", na.rm=T),
    others                      = sum(start_sub == "others", na.rm=T),
    tranquilizers_hypnotics     = sum(start_sub == "tranquilizers/hypnotics", na.rm=T),
    total                       = n(),       # total records per hash_key
    .groups = "drop"
  )

C3, Entries: 288

C3, RUNs: 187

Code

invisible("======================================================")
invalid_start_subs_c4<-  
    CONS_C4_25_df|>
    tidytable::filter(HASH_KEY %in% invalid_start_subs_hash_key)|>
    (\(df) {
        (message(paste0("C4, Entries: ", nrow(df))))
        (message(paste0("C4, RUNs: ", distinct(df, HASH_KEY)|> nrow())))
        df
    })()|>  
    tidytable::select(HASH_KEY, sustancia_de_inicio)|>
    tidytable::rename("start_sub"=2)|> 
    tidytable::mutate(start_sub= tidytable::case_when(grepl("coca",start_sub)~ "cocaine powder", grepl("crack|pasta",start_sub)~ "cocaine paste", grepl("marihuana",start_sub)~ "marijuana", grepl("anfeta|extasis|fenil|estimul",start_sub)~ "amphetamine-type stimulants", grepl("alucin|lsd|hongos",start_sub)~ "hallucinogens", grepl("opi|hero|metadona",start_sub)~ "opioids", grepl("sedante|hipnotico|tranquiliz",start_sub)~ "tranquilizers/hypnotics", grepl("inhalable",start_sub)~ "inhalants", grepl("esteroid|otros",start_sub)~"others", grepl("especif|cip-crc|sin consumo",start_sub)~ NA_character_, TRUE~start_sub))|>
    tidytable::ungroup()|> 
    tidytable::group_by(HASH_KEY)|>
    tidytable::summarise(
    alcohol                     = sum(start_sub == "alcohol", na.rm=T),
    amphetamine_type_stimulants = sum(start_sub == "amphetamine-type stimulants", na.rm=T),
    cocaine_paste               = sum(start_sub == "cocaine paste", na.rm=T),
    cocaine_powder              = sum(start_sub == "cocaine powder", na.rm=T),
    hallucinogens               = sum(start_sub == "hallucinogens", na.rm=T),
    inhalants                   = sum(start_sub == "inhalants", na.rm=T),
    marijuana                   = sum(start_sub == "marijuana", na.rm=T),
    opioids                     = sum(start_sub == "opioids", na.rm=T),
    others                      = sum(start_sub == "others", na.rm=T),
    tranquilizers_hypnotics     = sum(start_sub == "tranquilizers/hypnotics", na.rm=T),
    total                       = n(),       # total records per hash_key
    .groups = "drop"
  )

C4, Entries: 149

C4, RUNs: 106

Code

invisible("======================================================")
invalid_start_subs_c5<-  
    CONS_C5_25_df|>
    tidytable::filter(HASH_KEY %in% invalid_start_subs_hash_key)|>
    (\(df) {
        (message(paste0("C5, Entries: ", nrow(df))))
        (message(paste0("C5, RUNs: ", distinct(df, HASH_KEY)|> nrow())))
        df
    })()|>  
    tidytable::select(HASH_KEY, sustancia_inicial)|>
    tidytable::rename("start_sub"=2)|> 
    tidytable::mutate(start_sub= tidytable::case_when(grepl("coca",start_sub)~ "cocaine powder", grepl("crack|pasta",start_sub)~ "cocaine paste", grepl("marihuana",start_sub)~ "marijuana", grepl("anfeta|extasis|fenil|estimul",start_sub)~ "amphetamine-type stimulants", grepl("alucin|lsd|hongos",start_sub)~ "hallucinogens", grepl("opi|hero|metadona",start_sub)~ "opioids", grepl("sedante|hipnotico|tranquiliz",start_sub)~ "tranquilizers/hypnotics", grepl("inhalable",start_sub)~ "inhalants", grepl("esteroid|otros",start_sub)~"others", grepl("especif|cip-crc|sin consumo",start_sub)~ NA_character_, TRUE~start_sub))|>
    tidytable::ungroup()|> 
    tidytable::group_by(HASH_KEY)|>
    tidytable::summarise(
    alcohol                     = sum(start_sub == "alcohol", na.rm=T),
    amphetamine_type_stimulants = sum(start_sub == "amphetamine-type stimulants", na.rm=T),
    cocaine_paste               = sum(start_sub == "cocaine paste", na.rm=T),
    cocaine_powder              = sum(start_sub == "cocaine powder", na.rm=T),
    hallucinogens               = sum(start_sub == "hallucinogens", na.rm=T),
    inhalants                   = sum(start_sub == "inhalants", na.rm=T),
    marijuana                   = sum(start_sub == "marijuana", na.rm=T),
    opioids                     = sum(start_sub == "opioids", na.rm=T),
    others                      = sum(start_sub == "others", na.rm=T),
    tranquilizers_hypnotics     = sum(start_sub == "tranquilizers/hypnotics", na.rm=T),
    total                       = n(),       # total records per hash_key
    .groups = "drop"
  )

C5, Entries: 73

C5, RUNs: 53

Code

invisible("======================================================")
invalid_start_subs_c6<-  
CONS_C6_25_df|>
    tidytable::filter(HASH_KEY %in% invalid_start_subs_hash_key)|>
    (\(df) {
        (message(paste0("C6, Entries: ", nrow(df))))
        (message(paste0("C6, RUNs: ", distinct(df, HASH_KEY)|> nrow())))
        df
    })()|>  
    tidytable::select(HASH_KEY, sustancia_de_inicio)|>
    tidytable::rename("start_sub"=2)|> 
    tidytable::mutate(start_sub= tidytable::case_when(grepl("coca",start_sub)~ "cocaine powder", grepl("crack|pasta",start_sub)~ "cocaine paste", grepl("marihuana",start_sub)~ "marijuana", grepl("anfeta|extasis|fenil|estimul",start_sub)~ "amphetamine-type stimulants", grepl("alucin|lsd|hongos",start_sub)~ "hallucinogens", grepl("opi|hero|metadona",start_sub)~ "opioids", grepl("sedante|hipnotico|tranquiliz",start_sub)~ "tranquilizers/hypnotics", grepl("inhalable",start_sub)~ "inhalants", grepl("esteroid|otros",start_sub)~"others", grepl("especif|cip-crc|sin consumo",start_sub)~ NA_character_, TRUE~start_sub))|>
    tidytable::ungroup()|> 
    tidytable::group_by(HASH_KEY)|>
    tidytable::summarise(
    alcohol                     = sum(start_sub == "alcohol", na.rm=T),
    amphetamine_type_stimulants = sum(start_sub == "amphetamine-type stimulants", na.rm=T),
    cocaine_paste               = sum(start_sub == "cocaine paste", na.rm=T),
    cocaine_powder              = sum(start_sub == "cocaine powder", na.rm=T),
    hallucinogens               = sum(start_sub == "hallucinogens", na.rm=T),
    inhalants                   = sum(start_sub == "inhalants", na.rm=T),
    marijuana                   = sum(start_sub == "marijuana", na.rm=T),
    opioids                     = sum(start_sub == "opioids", na.rm=T),
    others                      = sum(start_sub == "others", na.rm=T),
    tranquilizers_hypnotics     = sum(start_sub == "tranquilizers/hypnotics", na.rm=T),
    total                       = n(),       # total records per hash_key
    .groups = "drop"
  )

C6, Entries: 73

C6, RUNs: 61

Code

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:  
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:  
invisible("Generar jerarquías de sustancias de inicio")
#attr(table(SISTRAT23_c1_2010_2024_df_prev1p$first_sub_used),"names")|> dput()

substances <- c(
  "alcohol",
  "amphetamine_type_stimulants",
  "cocaine_paste",
  "cocaine_powder",
  "hallucinogens",
  "inhalants",
  "marijuana",
  "opioids",
  "others",
  "tranquilizers_hypnotics"
)

cat("Make counts by RUN and top substances in case of more than one initial substance\n")

invalid_start_subs_ext_info <-
  SISTRAT23_c1_2010_2024_df_prev1p|>
  tidytable::filter(hash_key %in% invalid_start_subs_hash_key)|>
  tidytable::select(hash_key, first_sub_used)|>
  tidytable::group_by(hash_key)|>
  tidytable::summarise(
    alcohol                     = sum(first_sub_used == "alcohol", na.rm = TRUE),
    amphetamine_type_stimulants = sum(first_sub_used == "amphetamine-type stimulants", na.rm = TRUE),
    cocaine_paste               = sum(first_sub_used == "cocaine paste", na.rm = TRUE),
    cocaine_powder              = sum(first_sub_used == "cocaine powder", na.rm = TRUE),
    hallucinogens               = sum(first_sub_used == "hallucinogens", na.rm = TRUE),
    inhalants                   = sum(first_sub_used == "inhalants", na.rm = TRUE),
    marijuana                   = sum(first_sub_used == "marijuana", na.rm = TRUE),
    opioids                     = sum(first_sub_used == "opioids", na.rm = TRUE),
    others                      = sum(first_sub_used == "others", na.rm = TRUE),
    tranquilizers_hypnotics     = sum(first_sub_used == "tranquilizers/hypnotics", na.rm = TRUE),
    total                       = dplyr::n(),
    .groups = "drop"
  )|>
  tidylog::left_join(invalid_start_subs_c2, by = c("hash_key" = "HASH_KEY"), multiple = "first", suffix = c("", "_c2"))|>
  tidylog::left_join(invalid_start_subs_c3, by = c("hash_key" = "HASH_KEY"), multiple = "first", suffix = c("", "_c3"))|>
  tidylog::left_join(invalid_start_subs_c4, by = c("hash_key" = "HASH_KEY"), multiple = "first", suffix = c("", "_c4"))|>
  tidylog::left_join(invalid_start_subs_c5, by = c("hash_key" = "HASH_KEY"), multiple = "first", suffix = c("", "_c5"))|>
  tidylog::left_join(invalid_start_subs_c6, by = c("hash_key" = "HASH_KEY"), multiple = "first", suffix = c("", "_c6"))|>
  (\(df) {
    # Canonical labels
    substances <- c(
      "alcohol",
      "amphetamine_type_stimulants",
      "cocaine_paste",
      "cocaine_powder",
      "hallucinogens",
      "inhalants",
      "marijuana",
      "opioids",
      "others",
      "tranquilizers_hypnotics"
    )

    df2 <- as.data.frame(df, stringsAsFactors = FALSE)

    # Detect any susini text columns like c2_susini_1 … c6_susini_k
    susini_cols <- grep("^c[2-6]_susini_\\d+$", names(df2), value = TRUE)
    if (length(susini_cols) > 0) {
      df2[susini_cols] <- lapply(df2[susini_cols], function(x) tolower(trimws(x)))
    }

    # n_<substance> from susini text columns (0 if none exist)
    for (sub in substances) {
      safe_nm <- paste0("n_", make.names(sub))
      if (length(susini_cols) > 0) {
        df2[[safe_nm]] <- rowSums(df2[susini_cols] == sub, na.rm = TRUE)
      } else {
        df2[[safe_nm]] <- 0L
      }
    }

    # tot_<substance> = base + susini counts (base may be missing; treat as 0)
    tot_cols <- character(0)
    for (sub in substances) {
      base_nm <- sub
      susi_nm <- paste0("n_", make.names(sub))
      tot_nm  <- paste0("tot_", make.names(sub))
      base_vec <- if (base_nm %in% names(df2)) df2[[base_nm]] else 0L
      df2[[tot_nm]] <- base_vec + df2[[susi_nm]]
      tot_cols <- c(tot_cols, tot_nm)
    }

    # Map total column names -> human labels
    name_map <- setNames(substances, tot_cols)

    # Top 3 labels by totals with stable tie-break (count desc, then label asc)
    top3 <- t(apply(df2[tot_cols], 1, function(x) {
      pos_idx <- which(x > 0)
      if (length(pos_idx) == 0) return(rep(NA_character_, 3))
      pos_names <- names(x)[pos_idx]
      labels    <- name_map[pos_names]
      ord <- order(-x[pos_idx], labels)  # desc by count, then A–Z
      pick_names <- pos_names[ord][seq_len(min(3, length(pos_idx)))]
      c(name_map[pick_names], rep(NA_character_, 3 - length(pick_names)))
    }))
    colnames(top3) <- paste0("sus_ini_", 1:3)

    # Corresponding counts
    top3_n <- t(apply(df2[tot_cols], 1, function(x) {
      pos_idx <- which(x > 0)
      if (length(pos_idx) == 0) return(rep(NA_integer_, 3))
      pos_names <- names(x)[pos_idx]
      labels    <- name_map[pos_names]
      ord <- order(-x[pos_idx], labels)
      pick_names <- pos_names[ord][seq_len(min(3, length(pos_idx)))]
      c(as.integer(x[pick_names]), rep(NA_integer_, 3 - length(pick_names)))
    }))
    colnames(top3_n) <- paste0("sus_ini_", 1:3, "_n")

    # Bind the results and return
    df2 <- cbind(df2,
                 as.data.frame(top3,  stringsAsFactors = FALSE),
                 as.data.frame(top3_n, stringsAsFactors = FALSE))
    df2
  })()

left_join: added 11 columns (alcohol_c2, amphetamine_type_stimulants_c2, cocaine_paste_c2, cocaine_powder_c2, hallucinogens_c2, …)

       > rows only in tidytable::summarise(ti..  15,994
       > rows only in invalid_start_subs_c2     (     0)
       > matched rows                               112
       >                                        ========
       > rows total                              16,106

left_join: added 11 columns (alcohol_c3, amphetamine_type_stimulants_c3, cocaine_paste_c3, cocaine_powder_c3, hallucinogens_c3, …) > rows only in tidylog::left_join(tidy.. 15,919 > rows only in invalid_start_subs_c3 ( 0) > matched rows 187 > ======== > rows total 16,106 left_join: added 11 columns (alcohol_c4, amphetamine_type_stimulants_c4, cocaine_paste_c4, cocaine_powder_c4, hallucinogens_c4, …) > rows only in tidylog::left_join(tidy.. 16,000 > rows only in invalid_start_subs_c4 ( 0) > matched rows 106 > ======== > rows total 16,106 left_join: added 11 columns (alcohol_c5, amphetamine_type_stimulants_c5, cocaine_paste_c5, cocaine_powder_c5, hallucinogens_c5, …) > rows only in tidylog::left_join(tidy.. 16,053 > rows only in invalid_start_subs_c5 ( 0) > matched rows 53 > ======== > rows total 16,106 left_join: added 11 columns (alcohol_c6, amphetamine_type_stimulants_c6, cocaine_paste_c6, cocaine_powder_c6, hallucinogens_c6, …) > rows only in tidylog::left_join(tidy.. 16,045 > rows only in invalid_start_subs_c6 ( 0) > matched rows 61 > ======== > rows total 16,106

Number of distinct starting substances
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  2.000   2.000   2.000   2.147   2.000   5.000 
Make counts by RUN and top substances in case of more than one initial substance

Now we generated the criteria of the most vulnerable variable.

Code

invisible("buscar criterio de mvv")
#sus_ini_1, sus_ini_2, sus_ini_3, sus_ini_1_n, sus_ini_2_n, sus_ini_3_n

SISTRAT23_c1_2010_2024_df_prev1q <-
  SISTRAT23_c1_2010_2024_df_prev1p|>
  (\(df) {
    cat(paste0(
      "7.Number of cases before resolving inconsistencies in nationality & starting substance: ",
      formatC(nrow(df), big.mark = ",")
    ), "\n")
    cat(paste0(
      "7.Number of patients before resolving inconsistencies in nationality & starting substance: ",
      formatC(nrow(tidytable::distinct(df, hash_key)), big.mark = ",")
    ), "\n")
    df
  })()|>
  tidytable::left_join(multiple_nationalities, multiple = "first")|>
  tidytable::mutate(
    nationality_cons = tidytable::case_when(
      !is.na(nacionalidad_distinct) ~ nacionalidad_distinct,
      TRUE ~ nacionalidad
    )
  )|>
  tidytable::select(-nacionalidad_distinct)|>
  (\(df) {
    cat(paste0(
      "7.Number of cases before resolving inconsistencies in starting substance: ",
      formatC(nrow(df), big.mark = ",")
    ), "\n")
    cat(paste0(
      "7.Number of patients before resolving inconsistencies in starting substance: ",
      formatC(nrow(tidytable::distinct(df, hash_key)), big.mark = ",")
    ), "\n")
    df
  })()|>
  # Join only hash_key + sus_ini_1/2/3 from invalid_start_subs_ext_info
  tidytable::left_join(
    tidytable::select(
      invalid_start_subs_ext_info,
      hash_key,
      tidytable::all_of(paste0("sus_ini_", 1:3))
    ),
    multiple = "first"
  )|>
  # Normalize to lower case (and keep NAs as NA)
  tidytable::mutate(
    first_sub_used = tolower(first_sub_used),
    sus_ini_1      = tolower(gsub("_"," ", sus_ini_1)),
    sus_ini_2      = tolower(gsub("_"," ", sus_ini_2)),
    sus_ini_3      = tolower(gsub("_"," ", sus_ini_3))
  )|>
  # Row-wise flags (NA-safe) across first_sub_used & sus_ini_1/2/3
  tidytable::mutate(
    has_paste =
      (!is.na(first_sub_used) & grepl("past",   first_sub_used)) |
      (!is.na(sus_ini_1)      & sus_ini_1 == "cocaine paste")   |
      (!is.na(sus_ini_2)      & sus_ini_2 == "cocaine paste")   |
      (!is.na(sus_ini_3)      & sus_ini_3 == "cocaine paste"),
    has_powder =
      (!is.na(first_sub_used) & grepl("powder", first_sub_used)) |
      (!is.na(sus_ini_1)      & sus_ini_1 == "cocaine powder")  |
      (!is.na(sus_ini_2)      & sus_ini_2 == "cocaine powder")  |
      (!is.na(sus_ini_3)      & sus_ini_3 == "cocaine powder"),
    has_alcohol =
      (!is.na(first_sub_used) & grepl("alcohol", first_sub_used)) |
      (!is.na(sus_ini_1)      & sus_ini_1 == "alcohol")          |
      (!is.na(sus_ini_2)      & sus_ini_2 == "alcohol")          |
      (!is.na(sus_ini_3)      & sus_ini_3 == "alcohol"),
    has_marijuana =
      (!is.na(first_sub_used) & grepl("marij", first_sub_used))   |
      (!is.na(sus_ini_1)      & sus_ini_1 == "marijuana")         |
      (!is.na(sus_ini_2)      & sus_ini_2 == "marijuana")         |
      (!is.na(sus_ini_3)      & sus_ini_3 == "marijuana"),
    has_others =
      (!is.na(first_sub_used) &
         !grepl("alcohol|past|powder|marij", first_sub_used))     |
      (!is.na(sus_ini_1)      & sus_ini_1 == "others")            |
      (!is.na(sus_ini_2)      & sus_ini_2 == "others")            |
      (!is.na(sus_ini_3)      & sus_ini_3 == "others")
  )|>
  # Pick the most vulnerable reported substance (paste > powder > alcohol > marijuana > others)
  tidytable::mutate(
    sus_ini_mod_mvv = tidytable::case_when(
      has_paste     ~ "cocaine paste",
      has_powder    ~ "cocaine powder",
      has_alcohol   ~ "alcohol",
      has_marijuana ~ "marijuana",
      has_others    ~ "others",
      TRUE ~ NA_character_
    ),
    sus_ini_mod_mvv = factor(
      sus_ini_mod_mvv,
      levels = c("cocaine paste", "cocaine powder", "alcohol", "marijuana", "others")
    )
  )|>
  # Safety checks + message
  (\(df) {
    cat(paste0(
      "7. After resolving inconsistencies in nationality & starting substance, obs.: ",
      formatC(nrow(df), big.mark = ",")
    ), "\n")
    cat(paste0(
      "7. After resolving inconsistencies in nationality & starting substance, RUNs: ",
      formatC(nrow(tidytable::distinct(df, hash_key)), big.mark = ",")
    ), "\n")
    if (nrow(df) > nrow(SISTRAT23_c1_2010_2024_df_prev1p))
      stop("Error: Added treatment episodes in the process")
    df
  })()|>
  # Drop helpers and the sus_ini_* columns if you don't want them downstream
  tidytable::select(
    -tidytable::any_of(c("has_paste", "has_powder", "has_alcohol", "has_marijuana", "has_others"))
    #-tidytable::any_of(c("sus_ini_1", "sus_ini_2", "sus_ini_3"))
  )
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:  
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:  
#starting substance
#Segunda Sustancia de Inicio(Sólo más frecuentes) Second Starting Substance
#Tercera Sustancia de Inicio(Sólo más frecuentes) Third Starting Substance
#Sustancia de Inicio (Sólo más frecuentes)    Starting Substance (Only more frequent)

if (!"adm_age_rec3" %in% colnames(SISTRAT23_c1_2010_2024_df_prev1q)) {
  SISTRAT23_c1_2010_2024_df_prev1q$adm_age_rec3 <- round(
    lubridate::time_length(
      lubridate::interval(
        SISTRAT23_c1_2010_2024_df_prev1q$birth_date_rec, 
        SISTRAT23_c1_2010_2024_df_prev1q$adm_date_rec2
      ), 
      "years"
    ),
    3
  )
}

7.Number of cases before resolving inconsistencies in nationality & starting substance: 173,728 
7.Number of patients before resolving inconsistencies in nationality & starting substance: 121,299 
7.Number of cases before resolving inconsistencies in starting substance: 173,728 
7.Number of patients before resolving inconsistencies in starting substance: 121,299 
7. After resolving inconsistencies in nationality & starting substance, obs.: 173,728 
7. After resolving inconsistencies in nationality & starting substance, RUNs: 121,299

3. Age of onset of substance use (user-invariant variables)

Several respondent-invariant variables showed multiple values for the same person across duplicated records, so we triangulated sources to resolve ambiguities. Recall bias—especially among individuals with sustained substance use—likely contributes to imprecision in reported ages.

Age at onset of any drug use (age_subs_onset) (records = 62,142), had too many distinct values per patient (n = 21,950).
Age at onset of primary substance (age_prim_subs_onset) (records = 69,284), had too many distinct values per user (n = 25,100).

First, we flagged and excluded from analysis cases with ages of onset occurring at or after the age at admission (age_subs_onset n = 1). We also discarded invalid ages of onset for primary substance (age_prim_subs_onset n = 6).

Cases with a primary substance age of onset that preceded the age of substance use onset were discarded.

For context, summary statistics:

age_subs_onset= median: 15 (IQR: 14–18), mean: 16.37, range: 5–74, missing: 10,896.
age_prim_subs_onset= median: 18 (IQR: 15–23), mean: 20.24, range: 5–75, missing: 601.

Code

SISTRAT23_c1_2010_2024_df_prev1q|>
  tidytable::group_by(hash_key)|>
  tidytable::mutate(min_adm_age_rec3= min(adm_age_rec3, na.rm=T))|>
  tidytable::ungroup()|>
  tidytable::mutate(
    age_subs_onset_rec= tidytable::case_when(
      min_adm_age_rec3 <= age_subs_onset~ NA_integer_,
      TRUE~ as.integer(age_subs_onset)
    )
  )|>
  tidytable::mutate(
    age_prim_subs_onset_rec= tidytable::case_when(
      min_adm_age_rec3 <= age_prim_subs_onset~ NA_integer_,
      TRUE~ as.integer(age_prim_subs_onset)
    )
  )|>
  tidytable::mutate(
    age_prim_subs_onset_rec= tidytable::case_when(
      age_prim_subs_onset < age_subs_onset~ NA_integer_,
      TRUE~ as.integer(age_prim_subs_onset)
    )
  )|>
  (\(df) {
    assign(x= "SISTRAT23_c1_2010_2024_df_prev1r", value= df, envir= .GlobalEnv)
    })()

We created the variables age_subs_onset_rec and age_prim_subs_onset_rec in the SISTRAT23_c1_2010_2024_df_prev1r database. We replaced missing values with admission ages for records where the age of first substance use was less than or equal to the minimum admission age by HASH. Additionally, we corrected cases where the age of primary substance use onset was lower than the age of first substance use. The following table shows the amount of ages that varies within each user.

Code

prueba_age_subs_onset <- SISTRAT23_c1_2010_2024_df_prev1r|>
  tidytable::group_by(hash_key)|>
  tidytable::mutate(age_subs_onset_por_hash= tidytable::n_distinct(age_subs_onset))|>
  tidytable::ungroup()|>
  tidytable::filter(age_subs_onset_por_hash>1)|>
  tidytable::distinct(hash_key)

prueba_age_prim_subs_onset <- SISTRAT23_c1_2010_2024_df_prev1r|>
  tidytable::group_by(hash_key)|>
  tidytable::mutate(age_prim_subs_onset_por_hash= tidytable::n_distinct(age_prim_subs_onset))|>
  tidytable::ungroup()|>
  tidytable::filter(age_prim_subs_onset_por_hash>1)|>
  tidytable::distinct(hash_key)

age_subs_onset_summ <-
  SISTRAT23_c1_2010_2024_df_prev1r|>
   tidytable::filter(hash_key %in% tidytable::pull(prueba_age_subs_onset))|>
    tidytable::group_by(hash_key)|>
    tidytable::add_count()|>
    tidytable::summarise(
      n = mean(n),
      age_subs_onset_p25 = quantile(age_subs_onset, 0.25, na.rm = TRUE),
      age_subs_onset_p50 = quantile(age_subs_onset, 0.50, na.rm = TRUE),
      age_subs_onset_p75 = quantile(age_subs_onset, 0.75, na.rm = TRUE),
      age_subs_onset_sd  = sd(age_subs_onset, na.rm = TRUE),
      min_val   = min(age_subs_onset, na.rm = TRUE),
      max_val   = max(age_subs_onset, na.rm = TRUE),
      mean_val  = mean(age_subs_onset, na.rm = TRUE)
    )|>
    tidytable::mutate(ranges = abs(max_val - min_val))|>
    tidytable::ungroup()|>
    tidytable::mutate(
      diff_p25_p50 = abs(age_subs_onset_p50 - age_subs_onset_p25),
      diff_p75_p50 = abs(age_subs_onset_p50 - age_subs_onset_p75),
      min_mean     = abs(min_val - mean_val),
      max_mean     = abs(max_val - mean_val)
    )|>
    tidytable::summarise(
      avg_n        = mean(n),
      sd_n         = sd(n),
      avg_p25_p50  = mean(diff_p25_p50),
      avg_p75_p50  = mean(diff_p75_p50),
      avg_sd       = mean(age_subs_onset_sd, na.rm = TRUE),
      avg_min_mean = mean(min_mean, na.rm = TRUE),
      avg_max_mean = mean(max_mean, na.rm = TRUE),
      avg_ranges   = mean(ranges, na.rm = TRUE),
      sd_ranges    = sd(ranges, na.rm = TRUE),
      p75_ranges   = quantile(ranges, 0.75, na.rm = TRUE),
      p90_ranges   = quantile(ranges, 0.90, na.rm = TRUE)
    )|>
    (\(df) round(df, 2))()

age_prim_subs_onset_summ <-
  SISTRAT23_c1_2010_2024_df_prev1r|>
   tidytable::filter(hash_key %in% tidytable::pull(prueba_age_prim_subs_onset))|>
    tidytable::group_by(hash_key)|>
    tidytable::add_count()|>
    tidytable::summarise(
      n = mean(n),
      age_prim_subs_onset_p25 = quantile(age_prim_subs_onset, 0.25, na.rm = TRUE),
      age_prim_subs_onset_p50 = quantile(age_prim_subs_onset, 0.50, na.rm = TRUE),
      age_prim_subs_onset_p75 = quantile(age_prim_subs_onset, 0.75, na.rm = TRUE),
      age_prim_subs_onset_sd  = sd(age_prim_subs_onset, na.rm = TRUE),
      min_val   = min(age_prim_subs_onset, na.rm = TRUE),
      max_val   = max(age_prim_subs_onset, na.rm = TRUE),
      mean_val  = mean(age_prim_subs_onset, na.rm = TRUE)
    )|>
    tidytable::mutate(ranges = abs(max_val - min_val))|>
    tidytable::ungroup()|>
    tidytable::mutate(
      diff_p25_p50 = abs(age_prim_subs_onset_p50 - age_prim_subs_onset_p25),
      diff_p75_p50 = abs(age_prim_subs_onset_p50 - age_prim_subs_onset_p75),
      min_mean     = abs(min_val - mean_val),
      max_mean     = abs(max_val - mean_val)
    )|>
    tidytable::summarise(
      avg_n        = mean(n),
      sd_n         = sd(n),
      avg_p25_p50  = mean(diff_p25_p50),
      avg_p75_p50  = mean(diff_p75_p50),
      avg_sd       = mean(age_prim_subs_onset_sd, na.rm = TRUE),
      avg_min_mean = mean(min_mean, na.rm = TRUE),
      avg_max_mean = mean(max_mean, na.rm = TRUE),
      avg_ranges   = mean(ranges, na.rm = TRUE),
      sd_ranges    = sd(ranges, na.rm = TRUE),
      p75_ranges   = quantile(ranges, 0.75, na.rm = TRUE),
      p90_ranges   = quantile(ranges, 0.90, na.rm = TRUE)
    )|>
    (\(df) round(df, 2))()

rbind(cbind(Variable="Age of the Onset of Drug Use", age_subs_onset_summ), 
      cbind(Variable="Age of the Onset of Drug Use Primary Substance", age_prim_subs_onset_summ))|>
    DT::datatable(
      caption= "Table 2. Distribution of differences in years within users with inconsistent ages of onset",
      colnames= c("Variables","Avg. Cases per User", "Std.Dev. Cases per User","Avg. Diff. Perc 25-50", "Avg. Diff. Perc 75-50","Avg. Std.Dev","Avg. Diff. Min-Mean","Avg. Diff. Max-Mean","Avg. Range", "Std.Dev. Range","Perc 75 Range","Perc 90 Range"),
      options= list(
        pageLength= 25,
        scrollX= TRUE,
        scrollY= "150px",
        dom= 'Brt')
    )

The data presented in the table reveals a notable variability in the age of onset for both drug use and primary substance use. The significant standard deviations in the “Age of the Onset of Drug Use” and “Age of the Onset of Drug Use Primary Substance” , combined with the wide ranges shown in “Avg. Range” and “Std.Dev. Range”, indicate that the ages reported by users were not consistent. To address this lack of consistency and establish a more reliable and stable value for the age of onset, we decided to use external databases (Agreements C2 to C6) as a source for this critical variable. This approach ensures that our analysis is based on more dependable and standardized data.

HASHes with inconsistnt onset ages were grouped in invalid_age_sub_and_prim_sub_onset, and miss_age_sub_and_prim_sub_onset, for missing values (or eliminated due to inconsistencies with admission age).

Code

# cx_n: Number of CX records for this HASH_KEY (all rows after filtering).
# cx_age_subs_onset: Mean of edad_inicio_consumo across this HASH_KEY’s CX records (years; numeric).
# cx_age_prim_subs_onset: Mean of edad_inicio_sustancia_principal across this HASH_KEY’s CX records (years; numeric).
# cx_mode_prim_sub: The most frequent normalized primary substance label for this HASH_KEY (after recoding prim_sub). Tie-breaker: alphabetical by label.
# cx_mode_prim_sub_n: Count of rows for this HASH_KEY with cx_mode_prim_sub.
# cx_mode_prim_sub_prop: Proportion of that mode within all non-missing prim_sub rows for this HASH_KEY (cx_mode_prim_sub_n / total non-missing prim_sub).
# cx_age_prim_subs_onset_mean_: Mean edad_inicio_sustancia_principal for rows of that specific normalized primary substance within this HASH_KEY (years; numeric)

invisible("Include inconsistent onset ages")
inconsistent_age_subs_onset <- SISTRAT23_c1_2010_2024_df_prev1r|>
  tidytable::group_by(hash_key)|>
  tidytable::mutate(age_subs_onset_by_hash= tidytable::n_distinct(age_subs_onset_rec))|>
  tidytable::ungroup()|>
  tidytable::filter(age_subs_onset_by_hash>1)|>
  tidytable::distinct(hash_key)

inconsistent_age_prim_subs_onset <- SISTRAT23_c1_2010_2024_df_prev1r|>
  tidytable::group_by(hash_key, primary_sub)|>
  tidytable::mutate(age_prim_subs_onset_by_hash_primsub= tidytable::n_distinct(age_prim_subs_onset_rec))|>
  tidytable::ungroup()|>
  tidytable::filter(age_prim_subs_onset_by_hash_primsub>1)|>
  tidytable::distinct(hash_key, rn)
#27835

invalid_age_sub_and_prim_sub_onset <- unique(c(inconsistent_age_subs_onset$hash_key, inconsistent_age_prim_subs_onset$hash_key))

invisible("Include missing onset ages")
prueba_age_prim_subs_onset_miss <-SISTRAT23_c1_2010_2024_df_prev1r|>
  tidytable::group_by(hash_key)|>
  tidytable::summarise(na_per_hash = sum(is.na(age_prim_subs_onset_rec)),
    total = n())|>
  tidytable::mutate(perc_miss= na_per_hash/total)|>
  tidytable::filter(perc_miss==1)|> 
  tidytable::ungroup()
prueba_age_subs_onset_miss <-SISTRAT23_c1_2010_2024_df_prev1r|>
  tidytable::group_by(hash_key)|>
  tidytable::summarise(na_per_hash = sum(is.na(age_subs_onset_rec)),
    total = n())|>
  tidytable::mutate(perc_miss= na_per_hash/total)|>
  tidytable::filter(perc_miss==1)|> 
  tidytable::ungroup()

miss_age_sub_and_prim_sub_onset<- 
unique(c(prueba_age_subs_onset_miss$hash_key,prueba_age_prim_subs_onset_miss$hash_key))

invisible("======================================================")
invalid_age_sub_onset_c2<-
    CONS_C2_25_df|>
    tidytable::filter(HASH_KEY%in%unique(c(invalid_age_sub_and_prim_sub_onset, miss_age_sub_and_prim_sub_onset)))|>
    (\(df){
        message(paste0("C2, Entries: ", nrow(df)))
        message(paste0("C2, RUNs: ", tidytable::distinct(df, HASH_KEY)|> nrow()))
        df})()|>
    tidytable::rename(prim_sub=sustancia_principal)|>
    tidytable::mutate(
      prim_sub= tidytable::case_when(
        grepl("crack|pasta",prim_sub, ignore.case=TRUE)~"cocaine paste",
        grepl("coca",prim_sub, ignore.case=TRUE)~"cocaine powder",
        grepl("marihuana|marijuana",prim_sub, ignore.case=TRUE)~"marijuana",
        grepl("anfeta|extasis|fenil|estimul",prim_sub, ignore.case=TRUE)~"amphetamine-type stimulants",
        grepl("alucin|lsd|hongos",prim_sub, ignore.case=TRUE)~"hallucinogens",
        grepl("opi|hero|metadona",prim_sub, ignore.case=TRUE)~"opioids",
        grepl("sedante|hipnotico|tranquiliz",prim_sub, ignore.case=TRUE)~"tranquilizers/hypnotics",
        grepl("inhalable",prim_sub, ignore.case=TRUE)~"inhalants",
        grepl("esteroid|otros",prim_sub, ignore.case=TRUE)~"others",
        grepl("especif|cip-crc|sin consumo",prim_sub, ignore.case=TRUE)~NA_character_,
        TRUE~prim_sub
      ))|>
    (\(df){
      # Overall per-HASH_KEY summary
      tidytable::group_by(df, HASH_KEY)|>
          tidytable::summarise(
            c2_age_subs_onset= mean(as.numeric(edad_inicio_consumo), na.rm=TRUE),
            c2_age_prim_subs_onset= mean(as.numeric(edad_inicio_sustancia_principal), na.rm=TRUE),
            c2_n= tidytable::n())|>
          tidytable::ungroup()|>
          # Join the mode (most frequent primary substance) with its count and proportion
          tidytable::left_join(
            tidytable::filter(df, !is.na(prim_sub))|>
            tidytable::group_by(HASH_KEY, prim_sub)|>
            tidytable::summarise(
              c2_age_prim_subs_onset_mean= mean(as.numeric(edad_inicio_sustancia_principal), na.rm=TRUE),
              c2_prim_sub_n= tidytable::n())|>
            tidytable::group_by(HASH_KEY)|>
            tidytable::mutate(c2_total_n= sum(c2_prim_sub_n))|>
            tidytable::arrange(HASH_KEY, -c2_prim_sub_n, prim_sub)|>
            tidytable::slice_head(n=1)|>
            tidytable::transmute(
                HASH_KEY,
                c2_mode_prim_sub= prim_sub,
                c2_mode_prim_sub_n= c2_prim_sub_n,
                c2_mode_prim_sub_prop= c2_prim_sub_n/c2_total_n),
              by="HASH_KEY")|>
          # Join wide columns: per-primary-substance mean onset age
          tidytable::left_join(
            tidytable::filter(df, !is.na(prim_sub))|>
            tidytable::group_by(HASH_KEY, prim_sub)|>
            tidytable::summarise(
                c2_age_prim_subs_onset_mean= mean(as.numeric(edad_inicio_sustancia_principal), na.rm=TRUE))|>
            tidytable::mutate(prim_sub_key= gsub("[^A-Za-z0-9]+","_", tolower(prim_sub)))|>
            tidytable::mutate(col_name= paste0("c2_age_prim_subs_onset_mean_", prim_sub_key))|>
            tidytable::select(HASH_KEY, col_name, c2_age_prim_subs_onset_mean)|>
            tidytable::pivot_wider(names_from= col_name, values_from= c2_age_prim_subs_onset_mean),
              by="HASH_KEY")})()

C2, Entries: 418

C2, RUNs: 168

Code

# C2, Entries: 418
# C2, RUNs: 168

invisible("======================================================")
invalid_age_sub_onset_c3<-  
CONS_C3_25_df|>
    filter(HASH_KEY%in%unique(c(invalid_age_sub_and_prim_sub_onset,miss_age_sub_and_prim_sub_onset)))|>
    (\(df){
      message(paste0("C3, Entries: ", nrow(df)))
      message(paste0("C3, RUNs: ", tidytable::distinct(df, HASH_KEY)|> nrow()))
      df})()|>
  tidytable::rename(prim_sub=sustancia_principal)|>
  tidytable::mutate(
    prim_sub= tidytable::case_when(
      grepl("crack|pasta",prim_sub, ignore.case=TRUE)~"cocaine paste",
      grepl("coca",prim_sub, ignore.case=TRUE)~"cocaine powder",
      grepl("marihuana|marijuana",prim_sub, ignore.case=TRUE)~"marijuana",
      grepl("anfeta|extasis|fenil|estimul",prim_sub, ignore.case=TRUE)~"amphetamine-type stimulants",
      grepl("alucin|lsd|hongos",prim_sub, ignore.case=TRUE)~"hallucinogens",
      grepl("opi|hero|metadona",prim_sub, ignore.case=TRUE)~"opioids",
      grepl("sedante|hipnotico|tranquiliz",prim_sub, ignore.case=TRUE)~"tranquilizers/hypnotics",
      grepl("inhalable",prim_sub, ignore.case=TRUE)~"inhalants",
      grepl("esteroid|otros",prim_sub, ignore.case=TRUE)~"others",
      grepl("especif|cip-crc|sin consumo",prim_sub, ignore.case=TRUE)~NA_character_,
      TRUE~prim_sub))|>
    (\(df){
      # Overall per-HASH_KEY summary
      tidytable::group_by(df, HASH_KEY)|>
        tidytable::summarise(
          c3_age_subs_onset= mean(as.numeric(edad_inicio_consumo), na.rm=TRUE),
          c3_age_prim_subs_onset= mean(as.numeric(edad_inicio_sustancia_principal), na.rm=TRUE),
          c3_n= tidytable::n())|>
        tidytable::ungroup()|>
        # Join the mode (most frequent primary substance) with its count and proportion
        tidytable::left_join(
          tidytable::filter(df, !is.na(prim_sub))|>
          tidytable::group_by(HASH_KEY, prim_sub)|>
          tidytable::summarise(
            c3_age_prim_subs_onset_mean= mean(as.numeric(edad_inicio_sustancia_principal), na.rm=TRUE),
            c3_prim_sub_n= tidytable::n())|>
          tidytable::group_by(HASH_KEY)|>
          tidytable::mutate(c3_total_n= sum(c3_prim_sub_n))|>
          tidytable::arrange(HASH_KEY, -c3_prim_sub_n, prim_sub)|>
          tidytable::slice_head(n=1)|>
          tidytable::transmute(
            HASH_KEY,
            c3_mode_prim_sub= prim_sub,
            c3_mode_prim_sub_n= c3_prim_sub_n,
            c3_mode_prim_sub_prop= c3_prim_sub_n/c3_total_n
          ), by="HASH_KEY")|>
        # Join wide columns: per-primary-substance mean onset age
        tidytable::left_join(
          tidytable::filter(df, !is.na(prim_sub))|>
          tidytable::group_by(HASH_KEY, prim_sub)|>
          tidytable::summarise(
            c3_age_prim_subs_onset_mean= mean(as.numeric(edad_inicio_sustancia_principal), na.rm=TRUE))|>
          tidytable::mutate(prim_sub_key= gsub("[^A-Za-z0-9]+","_", tolower(prim_sub)))|>
          tidytable::mutate(col_name= paste0("c3_age_prim_subs_onset_mean_", prim_sub_key))|>
          tidytable::select(HASH_KEY, col_name, c3_age_prim_subs_onset_mean)|>
          tidytable::pivot_wider(names_from= col_name, values_from= c3_age_prim_subs_onset_mean),
            by="HASH_KEY")})()

C3, Entries: 495

C3, RUNs: 326

Code

# C3, Entries: 495
# C3, RUNs: 326

invisible("======================================================")
invalid_age_sub_onset_c4<-  
CONS_C4_25_df|>
    filter(HASH_KEY%in%unique(c(invalid_age_sub_and_prim_sub_onset,miss_age_sub_and_prim_sub_onset)))|>
    (\(df) {
      (message(paste0("C4, Entries: ", nrow(df))))
      (message(paste0("C4, RUNs: ", distinct(df, HASH_KEY)|> nrow())))
      df
    })()|>  
    tidytable::rename(prim_sub=sustancia_principal)|>
    tidytable::mutate(
      prim_sub= tidytable::case_when(
        grepl("crack|pasta",prim_sub, ignore.case=TRUE)~"cocaine paste",
        grepl("coca",prim_sub, ignore.case=TRUE)~"cocaine powder",
        grepl("marihuana|marijuana",prim_sub, ignore.case=TRUE)~"marijuana",
        grepl("anfeta|extasis|fenil|estimul",prim_sub, ignore.case=TRUE)~"amphetamine-type stimulants",
        grepl("alucin|lsd|hongos",prim_sub, ignore.case=TRUE)~"hallucinogens",
        grepl("opi|hero|metadona",prim_sub, ignore.case=TRUE)~"opioids",
        grepl("sedante|hipnotico|tranquiliz",prim_sub, ignore.case=TRUE)~"tranquilizers/hypnotics",
        grepl("inhalable",prim_sub, ignore.case=TRUE)~"inhalants",
        grepl("esteroid|otros",prim_sub, ignore.case=TRUE)~"others",
        grepl("especif|cip-crc|sin consumo",prim_sub, ignore.case=TRUE)~NA_character_,
        TRUE~prim_sub))|>
    (\(df){
      # Overall per-HASH_KEY summary
      tidytable::group_by(df, HASH_KEY)|>
        tidytable::summarise(
          c4_age_subs_onset= mean(as.numeric(edad_inicio_consumo), na.rm=TRUE),
          c4_age_prim_subs_onset= mean(as.numeric(edad_inicio_sustancia_principal), na.rm=TRUE),
          c4_n= tidytable::n())|>
        tidytable::ungroup()|>
        # Join the mode (most frequent primary substance) with its count and proportion
        tidytable::left_join(
          tidytable::filter(df, !is.na(prim_sub))|>
          tidytable::group_by(HASH_KEY, prim_sub)|>
          tidytable::summarise(
            c4_age_prim_subs_onset_mean= mean(as.numeric(edad_inicio_sustancia_principal), na.rm=TRUE),
            c4_prim_sub_n= tidytable::n())|>
          tidytable::group_by(HASH_KEY)|>
          tidytable::mutate(c4_total_n= sum(c4_prim_sub_n))|>
          tidytable::arrange(HASH_KEY, -c4_prim_sub_n, prim_sub)|>
          tidytable::slice_head(n=1)|>
          tidytable::transmute(
            HASH_KEY,
            c4_mode_prim_sub= prim_sub,
            c4_mode_prim_sub_n= c4_prim_sub_n,
            c4_mode_prim_sub_prop= c4_prim_sub_n/c4_total_n
          ), by="HASH_KEY")|>
        # Join wide columns: per-primary-substance mean onset age
        tidytable::left_join(
          tidytable::filter(df, !is.na(prim_sub))|>
          tidytable::group_by(HASH_KEY, prim_sub)|>
          tidytable::summarise(
            c4_age_prim_subs_onset_mean= mean(as.numeric(edad_inicio_sustancia_principal), na.rm=TRUE))|>
          tidytable::mutate(prim_sub_key= gsub("[^A-Za-z0-9]+","_", tolower(prim_sub)))|>
          tidytable::mutate(col_name= paste0("c4_age_prim_subs_onset_mean_", prim_sub_key))|>
          tidytable::select(HASH_KEY, col_name, c4_age_prim_subs_onset_mean)|>
          tidytable::pivot_wider(names_from= col_name, values_from= c4_age_prim_subs_onset_mean),
            by="HASH_KEY")})()

C4, Entries: 236

C4, RUNs: 162

Code

# C4, Entries: 236
# C4, RUNs: 162

invisible("======================================================")
invalid_age_sub_onset_c5<-  
CONS_C5_25_df|>
    filter(HASH_KEY%in%unique(c(invalid_age_sub_and_prim_sub_onset,miss_age_sub_and_prim_sub_onset)))|>
    (\(df) {
        (message(paste0("C5, Entries: ", nrow(df))))
        (message(paste0("C5, RUNs: ", distinct(df, HASH_KEY)|> nrow())))
        df
    })()|>  
    tidytable::rename(prim_sub=sustancia_principal)|>
    tidytable::mutate(
      prim_sub= tidytable::case_when(
        grepl("crack|pasta",prim_sub, ignore.case=TRUE)~"cocaine paste",
        grepl("coca",prim_sub, ignore.case=TRUE)~"cocaine powder",
        grepl("marihuana|marijuana",prim_sub, ignore.case=TRUE)~"marijuana",
        grepl("anfeta|extasis|fenil|estimul",prim_sub, ignore.case=TRUE)~"amphetamine-type stimulants",
        grepl("alucin|lsd|hongos",prim_sub, ignore.case=TRUE)~"hallucinogens",
        grepl("opi|hero|metadona",prim_sub, ignore.case=TRUE)~"opioids",
        grepl("sedante|hipnotico|tranquiliz",prim_sub, ignore.case=TRUE)~"tranquilizers/hypnotics",
        grepl("inhalable",prim_sub, ignore.case=TRUE)~"inhalants",
        grepl("esteroid|otros",prim_sub, ignore.case=TRUE)~"others",
        grepl("especif|cip-crc|sin consumo",prim_sub, ignore.case=TRUE)~NA_character_,
        TRUE~prim_sub))|>
    (\(df){
      # Overall per-HASH_KEY summary
      tidytable::group_by(df, HASH_KEY)|>
        tidytable::summarise(
          c5_age_subs_onset= mean(as.numeric(edad_inicio_sustancia_inicial), na.rm=TRUE),
          c5_age_prim_subs_onset= mean(as.numeric(edad_inicio_sustancia_principal), na.rm=TRUE),
          c5_n= tidytable::n())|>
        tidytable::ungroup()|>
        # Join the mode (most frequent primary substance) with its count and proportion
        tidytable::left_join(
          tidytable::filter(df, !is.na(prim_sub))|>
          tidytable::group_by(HASH_KEY, prim_sub)|>
          tidytable::summarise(
            c5_age_prim_subs_onset_mean= mean(as.numeric(edad_inicio_sustancia_principal), na.rm=TRUE),
            c5_prim_sub_n= tidytable::n())|>
          tidytable::group_by(HASH_KEY)|>
          tidytable::mutate(c5_total_n= sum(c5_prim_sub_n))|>
          tidytable::arrange(HASH_KEY, -c5_prim_sub_n, prim_sub)|>
          tidytable::slice_head(n=1)|>
          tidytable::transmute(
            HASH_KEY,
            c5_mode_prim_sub= prim_sub,
            c5_mode_prim_sub_n= c5_prim_sub_n,
            c5_mode_prim_sub_prop= c5_prim_sub_n/c5_total_n
          ), by="HASH_KEY")|>
        # Join wide columns: per-primary-substance mean onset age
        tidytable::left_join(
          tidytable::filter(df, !is.na(prim_sub))|>
          tidytable::group_by(HASH_KEY, prim_sub)|>
          tidytable::summarise(
            c5_age_prim_subs_onset_mean= mean(as.numeric(edad_inicio_sustancia_principal), na.rm=TRUE))|>
          tidytable::mutate(prim_sub_key= gsub("[^A-Za-z0-9]+","_", tolower(prim_sub)))|>
          tidytable::mutate(col_name= paste0("c5_age_prim_subs_onset_mean_", prim_sub_key))|>
          tidytable::select(HASH_KEY, col_name, c5_age_prim_subs_onset_mean)|>
          tidytable::pivot_wider(names_from= col_name, values_from= c5_age_prim_subs_onset_mean),
            by="HASH_KEY")})()

C5, Entries: 129

C5, RUNs: 90

Code

# C5, Entries: 129
# C5, RUNs: 90

invisible("======================================================")
invalid_age_sub_onset_c6<-  
CONS_C6_25_df|>
    filter(HASH_KEY%in%unique(c(invalid_age_sub_and_prim_sub_onset,miss_age_sub_and_prim_sub_onset)))|>
    (\(df) {
        (message(paste0("C6, Entries: ", nrow(df))))
        (message(paste0("C6, RUNs: ", distinct(df, HASH_KEY)|> nrow())))
        df
    })()|>  
    tidytable::rename(prim_sub=sustancia_principal)|>
    tidytable::mutate(
      prim_sub= tidytable::case_when(
        grepl("crack|pasta",prim_sub, ignore.case=TRUE)~"cocaine paste",
        grepl("coca",prim_sub, ignore.case=TRUE)~"cocaine powder",
        grepl("marihuana|marijuana",prim_sub, ignore.case=TRUE)~"marijuana",
        grepl("anfeta|extasis|fenil|estimul",prim_sub, ignore.case=TRUE)~"amphetamine-type stimulants",
        grepl("alucin|lsd|hongos",prim_sub, ignore.case=TRUE)~"hallucinogens",
        grepl("opi|hero|metadona",prim_sub, ignore.case=TRUE)~"opioids",
        grepl("sedante|hipnotico|tranquiliz",prim_sub, ignore.case=TRUE)~"tranquilizers/hypnotics",
        grepl("inhalable",prim_sub, ignore.case=TRUE)~"inhalants",
        grepl("esteroid|otros",prim_sub, ignore.case=TRUE)~"others",
        grepl("especif|cip-crc|sin consumo",prim_sub, ignore.case=TRUE)~NA_character_,
        TRUE~prim_sub))|>
    (\(df){
      # Overall per-HASH_KEY summary
      tidytable::group_by(df, HASH_KEY)|>
        tidytable::summarise(
          c6_age_subs_onset= mean(as.numeric(edad_inicio_consumo), na.rm=TRUE),
          c6_age_prim_subs_onset= mean(as.numeric(edad_inicio_sustancia_principal), na.rm=TRUE),
          c6_n= tidytable::n())|>
        tidytable::ungroup()|>
        # Join the mode (most frequent primary substance) with its count and proportion
        tidytable::left_join(
          tidytable::filter(df, !is.na(prim_sub))|>
          tidytable::group_by(HASH_KEY, prim_sub)|>
          tidytable::summarise(
            c6_age_prim_subs_onset_mean= mean(as.numeric(edad_inicio_sustancia_principal), na.rm=TRUE),
            c6_prim_sub_n= tidytable::n())|>
          tidytable::group_by(HASH_KEY)|>
          tidytable::mutate(c6_total_n= sum(c6_prim_sub_n))|>
          tidytable::arrange(HASH_KEY, -c6_prim_sub_n, prim_sub)|>
          tidytable::slice_head(n=1)|>
          tidytable::transmute(
            HASH_KEY,
            c6_mode_prim_sub= prim_sub,
            c6_mode_prim_sub_n= c6_prim_sub_n,
            c6_mode_prim_sub_prop= c6_prim_sub_n/c6_total_n
          ), by="HASH_KEY")|>
        # Join wide columns: per-primary-substance mean onset age
        tidytable::left_join(
          tidytable::filter(df, !is.na(prim_sub))|>
          tidytable::group_by(HASH_KEY, prim_sub)|>
          tidytable::summarise(
            c6_age_prim_subs_onset_mean= mean(as.numeric(edad_inicio_sustancia_principal), na.rm=TRUE))|>
          tidytable::mutate(prim_sub_key= gsub("[^A-Za-z0-9]+","_", tolower(prim_sub)))|>
          tidytable::mutate(col_name= paste0("c6_age_prim_subs_onset_mean_", prim_sub_key))|>
          tidytable::select(HASH_KEY, col_name, c6_age_prim_subs_onset_mean)|>
          tidytable::pivot_wider(names_from= col_name, values_from= c6_age_prim_subs_onset_mean),
            by="HASH_KEY")})()

C6, Entries: 148

C6, RUNs: 112

Code

# C6, Entries: 148
# C6, RUNs: 112

For missing age of onset of substance use or invalid, we contrasted with external sources.

Code

invisible("handle the edge case of groups with only missing data, ensuring your final data frame is clean and doesn't contain unexpected Inf or -Inf values.")
invalid_age_onset_ext_info<-
  SISTRAT23_c1_2010_2024_df_prev1r|>
  tidytable::filter(hash_key%in%unique(c(invalid_age_sub_and_prim_sub_onset,miss_age_sub_and_prim_sub_onset)))|>
  (\(df){
    # Overall per-hash_key stats (NA-safe)
    # age_subs_onset stats (C1)
    #handle the edge case of groups with only missing data, ensuring your final data frame is clean and doesn't contain unexpected Inf or -Inf values.
    overall<- df|>
      tidytable::group_by(hash_key)|>
      tidytable::summarise(
        c1_n= tidytable::n(),
        c1_age_subs_onset_mean= if(all(is.na(age_subs_onset_rec))) NA_real_ else mean(as.numeric(age_subs_onset_rec), na.rm=TRUE),
        c1_age_subs_onset_min = if(all(is.na(age_subs_onset_rec))) NA_real_ else min(as.numeric(age_subs_onset_rec),  na.rm=TRUE),
        c1_age_subs_onset_max = if(all(is.na(age_subs_onset_rec))) NA_real_ else max(as.numeric(age_subs_onset_rec),  na.rm=TRUE),
        c1_age_subs_onset_ndist= tidytable::n_distinct(age_subs_onset_rec, na.rm=TRUE),

        c1_age_prim_subs_onset_mean= if(all(is.na(age_prim_subs_onset_rec))) NA_real_ else mean(as.numeric(age_prim_subs_onset_rec), na.rm=TRUE),
        c1_age_prim_subs_onset_min = if(all(is.na(age_prim_subs_onset_rec))) NA_real_ else min(as.numeric(age_prim_subs_onset_rec),  na.rm=TRUE),
        c1_age_prim_subs_onset_max = if(all(is.na(age_prim_subs_onset_rec))) NA_real_ else max(as.numeric(age_prim_subs_onset_rec),  na.rm=TRUE),
        c1_age_prim_subs_onset_ndist= tidytable::n_distinct(age_prim_subs_onset_rec, na.rm=TRUE)
      )|>
      tidytable::ungroup()

    # Per-primary_sub stats per hash_key
    by_sub<- df|>
      tidytable::filter(!is.na(primary_sub))|>
      tidytable::group_by(hash_key, primary_sub)|>
      tidytable::summarise(
        c1_age_prim_subs_onset_rec_mean= if(all(is.na(age_prim_subs_onset_rec))) NA_real_ else mean(as.numeric(age_prim_subs_onset_rec), na.rm=TRUE),
        c1_min_adm_age_by_substance= min(as.numeric(adm_age_rec3), na.rm=TRUE),
        c1_prim_sub_n= tidytable::n()
      )|>
      tidytable::ungroup()|>
      # Safe key for column names
      tidytable::mutate(primary_sub_key= gsub("[^A-Za-z0-9]+","_", tolower(primary_sub)))

    # Wide: per-substance mean
    wide_means<- by_sub|>
      tidytable::mutate(col_name= paste0("c1_age_prim_subs_onset_rec_mean_", primary_sub_key))|>
      tidytable::select(hash_key, col_name, c1_age_prim_subs_onset_rec_mean)|>
      tidytable::pivot_wider(names_from= col_name, values_from= c1_age_prim_subs_onset_rec_mean)

    # Wide: per-substance counts
    wide_counts<- by_sub|>
      tidytable::mutate(col_name= paste0("c1_prim_sub_n_", primary_sub_key))|>
      tidytable::select(hash_key, col_name, c1_prim_sub_n)|>
      tidytable::pivot_wider(names_from= col_name, values_from= c1_prim_sub_n)

    # Wide: per-substance minimum admission age
    wide_min_adm_age<- by_sub|>
      tidytable::mutate(col_name= paste0("c1_min_adm_age_by_subs_", primary_sub_key))|>
      tidytable::select(hash_key, col_name, c1_min_adm_age_by_substance)|>
      tidytable::pivot_wider(names_from= col_name, values_from= c1_min_adm_age_by_substance)

    overall|>
      tidytable::left_join(wide_min_adm_age, by="hash_key")|>
      tidytable::left_join(wide_means, by="hash_key")|>
      tidytable::left_join(wide_counts, by="hash_key")
  })()|> 
# add a count to check if theres more than one primary substance
 tidytable::rowwise()|>
  # Create a new column 'total_substance_use'
  tidytable::mutate(total_substance_use = sum(
    tidytable::c_across(c(
      c1_prim_sub_n_cocaine_paste,
      c1_prim_sub_n_alcohol,
      c1_prim_sub_n_cocaine_powder,
      c1_prim_sub_n_opioids,
      c1_prim_sub_n_marijuana,
      c1_prim_sub_n_amphetamine_type_stimulants,
      c1_prim_sub_n_tranquilizers_hypnotics,
      c1_prim_sub_n_inhalants,
      c1_prim_sub_n_dissociatives,
      c1_prim_sub_n_others,
      c1_prim_sub_n_hallucinogens
    )),na.rm = TRUE
  ))|>
  # Ungroup the data frame to return it to a normal data frame format
  tidytable::ungroup()|>
  tidytable::mutate(multiple_prim_sub= ifelse(total_substance_use>1,1,0))



message(paste0("We calculated summaries for inconsistent or missing ages (patients= ", formatC(nrow(invalid_age_onset_ext_info), big.mark=","),")"))

We calculated summaries for inconsistent or missing ages (patients= 33,899)

Code

#We calculated summaries for inconsistent or missing ages (patients= 33,898)


#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

invalid_age_onset_ext_info_w_c2c6 <- 
  invalid_age_onset_ext_info|>
  tidytable::left_join(tidytable::distinct(SISTRAT23_c1_2010_2024_df_prev1r[, c("hash_key", "min_adm_age_rec3")], .keep_all = TRUE), by="hash_key", multiple = "first")|>
  tidytable::left_join(invalid_age_sub_onset_c2, by = c("hash_key" = "HASH_KEY"))|>
  tidytable::left_join(invalid_age_sub_onset_c3, by = c("hash_key" = "HASH_KEY"))|>
  tidytable::left_join(invalid_age_sub_onset_c4, by = c("hash_key" = "HASH_KEY"))|>
  tidytable::left_join(invalid_age_sub_onset_c5, by = c("hash_key" = "HASH_KEY"))|>
  tidytable::left_join(invalid_age_sub_onset_c6, by = c("hash_key" = "HASH_KEY"))|>
  (\(df) {
      stopifnot(nrow(df) == nrow(invalid_age_onset_ext_info))
      df
  })()

invalid_age_onset_ext_info_w_c2c6|>
  tidytable::mutate(hash_key= as.numeric(factor(hash_key)))|>
  rio::export(paste0(getwd(),"/_out/inconsistent_subs_use_onset_any_prim_sub_w_c2c6.xlsx"))

We implemented an algorithm to resolve inconsistencies in substance use onset age:

Code

wdpath<-paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
bpmn::bpmn(paste0(wdpath, "cons/_input/diagram_age_onset.bpmn"))

Figure 1. Decision Tree for the Users with more than one value in a User-Invariant Variable

1. Pooled Potential Candidates - For each individual, we gathered all potential onset ages from: - Any substance: Columns labeled _age_subs_onset - Primary substance: Columns labeled _age_prim_subs_onset_mean_[substance]

2. Established Validity Boundaries - Universal Rules: - Minimum age: 5 years - Maximum age: 80 years - Substance-Specific Rules: - Any substance: Onset age < (min_adm_age_rec3 - 1) - Primary substance: - Onset age < Minimum treatment admission age for that substance - Onset age ≥ Onset age of all other substances (must be same or later)

3. Filter Valid Candidates - Discarded values violating: - Universal age boundaries [5-80] - Substance-specific rules above - Missing values

4. Resolve Conflicts - Selected final value from remaining candidates: 1. Mode: Most frequently occurring value. Example: If ages [12, 12, 14] remain → select 12 2. Tiebreaker: Median if no clear mode. Example: If ages [12, 14, 14] remain → select 14 3. Unresolved: If no valid candidates remain

5. Label Outcomes - Created two tracking columns: - PROC (Resolution Method): - ANY=mode[value] - PRIMARY=median[value] - UNRESOLVED - FLAG (Rejection Reason): - UB_missing: Upper bound missing - LB>UB: Lower bound > upper bound - conflict(primary<any): Primary substance violates “≥ all others” rule - no_candidate_in_range: No values in valid range

Code

#To test chunk
# pre) Remove intermediate objects created during your pipelines
objects_to_remove <-
    c(
        # main ext + resolved blocks
        "ext",
        "any_cols","any_long","any_counts","any_mode","any_median","any_sources",
        "any_bounds","any_resolved",
        # primary-by-substance blocks
        "sub_ub_long","prim_cols","prim_long","prim_has_cand","subcount_cols",
        "subs_used","valid_subs","primary_pool","prim_counts","prim_mode",
        "prim_median","prim_sources","prim_bounds","primary_resolved",
        "onset_resolved_from_ext","onset_resolved_from_ext_collapsed"
    )

rm(list = intersect(ls(envir = .GlobalEnv), objects_to_remove), envir = .GlobalEnv)
gc()

ext<-
invalid_age_onset_ext_info_w_c2c6
# ── ANY candidates (C1–C6) ────────────────────────────────────────────
# pick columns that look like *_age_subs_onset (but not the per-primary ones)
any_cols <- names(ext)
any_cols <- any_cols[grepl("(?:^|_)age_subs_onset(?:$|_)", any_cols, perl=TRUE) &
                     !grepl("prim", any_cols, ignore.case=TRUE)]

any_long <- ext|>
  tidytable::select(tidytable::any_of(c("hash_key","min_adm_age_rec3", any_cols)))|>
  tidytable::pivot_longer(cols=tidytable::all_of(any_cols),
                          names_to="src_any",
                          values_to="cand_any")|>
  tidytable::mutate(cand_any=as.numeric(cand_any))|>
  tidytable::filter(!is.na(cand_any))|>
  tidytable::mutate(LB_any=5L,
                    UB_any=ifelse(!is.na(min_adm_age_rec3), as.integer(min_adm_age_rec3)-1L, NA_integer_))|>
  tidytable::filter(cand_any>=5 & cand_any<=80)|>
  tidytable::filter(!is.na(UB_any) & cand_any<=UB_any)

# mode, median, sources, counts
any_counts <- any_long|>
  tidytable::group_by(hash_key, cand_any)|>
  tidytable::summarise(n_any=tidytable::n())|>
  tidytable::ungroup()

any_mode <- any_counts|>
  tidytable::arrange(hash_key, tidytable::desc(n_any), cand_any)|>
  tidytable::slice_head(n=1, by="hash_key")|>
  tidytable::rename(onset_any_mode=cand_any)

any_median <- any_long|>
  tidytable::summarise(onset_any_median=as.integer(median(cand_any)),
                       .by="hash_key")

any_sources <- any_long|>
  tidytable::summarise(sources_any=paste0(sort(unique(src_any)), collapse="+"),
                       n_any_cands=tidytable::n(), .by="hash_key")

any_bounds <- ext|>
  tidytable::select(hash_key, min_adm_age_rec3)|>
  tidytable::distinct()|>
  tidytable::mutate(LB_any=5L,
                    UB_any=ifelse(!is.na(min_adm_age_rec3), as.integer(min_adm_age_rec3)-1L, NA_integer_))

any_resolved <- any_bounds|>
  tidytable::left_join(any_mode, by="hash_key")|>
  tidytable::left_join(any_median, by="hash_key")|>
  tidytable::left_join(any_sources, by="hash_key")|>
  tidytable::mutate(onset_any_final=tidytable::coalesce(onset_any_mode, onset_any_median),
                    PROC_ANY=tidytable::case_when(
                      !is.na(onset_any_mode)~paste0("ANY=mode[",tidytable::coalesce(sources_any,"cands"),"]"),
                      is.na(onset_any_mode) & !is.na(onset_any_median)~paste0("ANY=median[",tidytable::coalesce(sources_any,"cands"),"]"),
                      TRUE~"ANY=unresolved"
                    ),
                    FLAG_ANY=tidytable::case_when(
                      is.na(UB_any)~"ANY_UB_missing",
                      LB_any>UB_any~"ANY_unsatisfiable(LB>UB)",
                      is.na(onset_any_final)~"ANY_no_candidate_in_range",
                      TRUE~NA_character_
                    ))|>
  tidytable::select(hash_key, LB_any, UB_any, onset_any_final, PROC_ANY, FLAG_ANY, n_any_cands, sources_any)

# ── PRIMARY-sub candidates (C1–C6) ─────────────────────────────────────
# Per-(hash × sub) UB from c1_min_adm_age_by_subs_* (wide -> long)
sub_ub_long <- ext|>
  tidytable::select(hash_key, min_adm_age_rec3, tidytable::starts_with("c1_min_adm_age_by_subs_"))|>
  tidytable::pivot_longer(cols=tidytable::starts_with("c1_min_adm_age_by_subs_"),
                          names_to="sub_col",
                          values_to="min_adm_age_by_substance")|>
  tidytable::mutate(sub_key=gsub("^c1_min_adm_age_by_subs_","", sub_col),
                    primary_sub_std=subkey_to_label(sub_key))|>
  tidytable::select(hash_key, primary_sub_std, min_adm_age_by_substance, min_adm_age_rec3)|>
  tidytable::filter(!is.na(primary_sub_std))

# Gather per-substance onset candidates from any source columns:
# pattern examples in your file:
#   c1_age_prim_subs_onset_rec_mean_alcohol
#   c2_age_prim_subs_onset_mean_cocaine_paste
#   c5_age_prim_subs_onset_mean_opioids
prim_cols <- names(ext)
prim_cols <- prim_cols[grepl("^c[1-6]_age_prim_subs_onset.*_mean_", prim_cols)]

prim_long <- ext|>
  tidytable::select(hash_key, tidytable::all_of(prim_cols))|>
  tidytable::pivot_longer(cols=tidytable::all_of(prim_cols),
                          names_to="raw_name",
                          values_to="cand_primary")|>
  tidytable::mutate(cand_primary=as.numeric(cand_primary))|>
  tidytable::filter(!is.na(cand_primary))|>
  # split "raw_name" into "src_primary" and "sub_key"
  tidytable::mutate(src_primary=sub("^((c[1-6]_age_prim_subs_onset.*_mean))_.*$","\\1", raw_name),
                    sub_key=sub("^c[1-6]_age_prim_subs_onset.*_mean_","", raw_name),
                    primary_sub_std=subkey_to_label(sub_key))|>
  tidytable::select(hash_key, primary_sub_std, cand_primary, src_primary)|>
  tidytable::filter(!is.na(primary_sub_std))

# 1) Substances with at least one valid candidate value (from c[1-6]_age_prim_subs_onset*_mean_*)
prim_has_cand <- prim_long|>
  tidytable::distinct(hash_key, primary_sub_std)

# 2) Substances really used by a patient (if c1_prim_sub_n_* columns exist)
subcount_cols <- names(ext)
subcount_cols <- subcount_cols[grepl("^c1_prim_sub_n_", subcount_cols)]

subs_used <- NULL
if(length(subcount_cols)>0){
  subs_used <- ext|>
    tidytable::select(hash_key, tidytable::all_of(subcount_cols))|>
    tidytable::pivot_longer(cols=tidytable::all_of(subcount_cols),
                            names_to="sub_col",
                            values_to="n_sub")|>
    tidytable::mutate(sub_key=gsub("^c1_prim_sub_n_","", sub_col),
                      primary_sub_std=subkey_to_label(sub_key))|>
    # If you don't have subkey_to_label, use:
    # tidytable::mutate(primary_sub_std=gsub("_"," ", sub_key))|>
    tidytable::filter(!is.na(n_sub) & n_sub>0)|>
    tidytable::distinct(hash_key, primary_sub_std)
} else {
  # no count columns present → empty table with same columns
  subs_used <- prim_has_cand[0,]
}

# 3) Valid set of substances per patient = (with candidates) ∪ (used ≥ 1)
valid_subs <- tidytable::bind_rows(prim_has_cand, subs_used)|>
  tidytable::distinct(hash_key, primary_sub_std)

# 4) Restrict UB-by-substance bounds to valid substances (filter AFTER they exist)
sub_ub_long <- sub_ub_long|>
  tidytable::semi_join(valid_subs, by=c("hash_key","primary_sub_std"))

# 5) Build the primary_pool (now it exists, so you can semi_join if you still want)
primary_pool <- prim_long|>
  tidytable::left_join(sub_ub_long, by=c("hash_key","primary_sub_std"))|>
  tidytable::left_join(
    any_resolved|> tidytable::select(hash_key, onset_any_final),
    by="hash_key"
  )|>
  # Safe LB/UB: avoid pmin(..., na.rm=TRUE) → Inf
  tidytable::mutate(
    LB_primary = tidytable::coalesce(pmax(5L, as.integer(onset_any_final)), 5L),
    UB_primary = tidytable::case_when(
      !is.na(min_adm_age_by_substance) & !is.na(min_adm_age_rec3) ~
        pmin(as.integer(min_adm_age_by_substance)-1L, as.integer(min_adm_age_rec3)-1L),
      !is.na(min_adm_age_by_substance) ~ as.integer(min_adm_age_by_substance)-1L,
      !is.na(min_adm_age_rec3) ~ as.integer(min_adm_age_rec3)-1L,
      TRUE ~ NA_integer_
    )
  )|>
  tidytable::filter(cand_primary>=5 & cand_primary<=80)|>
  tidytable::filter(!is.na(UB_primary) & LB_primary<=UB_primary)|>
  tidytable::filter(cand_primary>=LB_primary & cand_primary<=UB_primary)|>
  tidytable::semi_join(valid_subs, by=c("hash_key","primary_sub_std"))

# 6) Mode/median/sources on the filtered pool
prim_counts <- primary_pool|>
  tidytable::group_by(hash_key, primary_sub_std, cand_primary)|>
  tidytable::summarise(n_primary=tidytable::n())|>
  tidytable::ungroup()

prim_mode <- prim_counts|>
  tidytable::arrange(hash_key, primary_sub_std, tidytable::desc(n_primary), cand_primary)|>
  tidytable::slice_head(n=1, by=c("hash_key","primary_sub_std"))|>
  tidytable::rename(onset_primary_mode=cand_primary)

prim_median <- primary_pool|>
  tidytable::summarise(
    onset_primary_median=as.integer(median(cand_primary)),
    .by=c("hash_key","primary_sub_std")
  )

prim_sources <- primary_pool|>
  tidytable::summarise(
    sources_primary=paste0(sort(unique(src_primary)), collapse="+"),
    n_primary_cands=tidytable::n(),
    .by=c("hash_key","primary_sub_std")
  )

# 7) Primary bounds (recompute cleanly; no use of undefined objects)
prim_bounds <- sub_ub_long|>
  tidytable::left_join(
    any_resolved|> tidytable::select(hash_key, onset_any_final),
    by="hash_key"
  )|>
  tidytable::mutate(
    LB_primary = tidytable::coalesce(pmax(5L, as.integer(onset_any_final)), 5L),
    UB_primary = tidytable::case_when(
      !is.na(min_adm_age_by_substance) & !is.na(min_adm_age_rec3) ~
        pmin(as.integer(min_adm_age_by_substance)-1L, as.integer(min_adm_age_rec3)-1L),
      !is.na(min_adm_age_by_substance) ~ as.integer(min_adm_age_by_substance)-1L,
      !is.na(min_adm_age_rec3) ~ as.integer(min_adm_age_rec3)-1L,
      TRUE ~ NA_integer_
    )
  )|>
  tidytable::select(hash_key, primary_sub_std, LB_primary, UB_primary)

# 8) Resolve primary per (hash × sub)
primary_resolved <- prim_bounds|>
  tidytable::left_join(prim_mode, by=c("hash_key","primary_sub_std"))|>
  tidytable::left_join(prim_median, by=c("hash_key","primary_sub_std"))|>
  tidytable::left_join(prim_sources, by=c("hash_key","primary_sub_std"))|>
  tidytable::left_join(
    any_resolved|> tidytable::select(hash_key, onset_any_final),
    by="hash_key"
  )|>
  tidytable::mutate(
    onset_primary_final = tidytable::coalesce(onset_primary_mode, onset_primary_median),
    # enforce primary ≥ ANY (when BOTH exist)
    onset_primary_final = tidytable::case_when(
      !is.na(onset_any_final) & !is.na(onset_primary_final) & onset_primary_final < onset_any_final ~ NA_integer_,
      TRUE ~ onset_primary_final
    ),
    PROC_PRIMARY = tidytable::case_when(
      !is.na(onset_primary_mode) ~ paste0("PRIMARY=mode[", tidytable::coalesce(sources_primary,"cands"), "]"),
      is.na(onset_primary_mode) & !is.na(onset_primary_median) ~ paste0("PRIMARY=median[", tidytable::coalesce(sources_primary,"cands"), "]"),
      TRUE ~ "PRIMARY=unresolved"
    ),
    FLAG_PRIMARY = tidytable::case_when(
      is.na(UB_primary) ~ "PRIM_UB_missing",
      LB_primary > UB_primary ~ "PRIM_unsatisfiable(LB>UB)",
      !is.na(onset_any_final) & !is.na(onset_primary_mode) & onset_primary_mode < onset_any_final ~ "PRIM_conflict(primary<any)",
      !is.na(onset_any_final) & !is.na(onset_primary_median) & onset_primary_median < onset_any_final ~ "PRIM_conflict(primary<any)",
      is.na(onset_primary_final) ~ "PRIM_no_candidate_in_range",
      TRUE ~ NA_character_
    )
  )|>
  tidytable::select(hash_key, primary_sub_std, LB_primary, UB_primary,
                    onset_primary_final, PROC_PRIMARY, FLAG_PRIMARY,
                    n_primary_cands, sources_primary)

# Join bounds and ANY result to enforce precedence
primary_pool <- prim_long|>
  tidytable::left_join(sub_ub_long, by=c("hash_key","primary_sub_std"))|>
  tidytable::left_join(any_resolved|>
                         tidytable::select(hash_key, onset_any_final),
                       by="hash_key")|>
  tidytable::mutate(LB_primary=tidytable::coalesce(pmax(5L, as.integer(onset_any_final)), 5L),
                    UB_primary=pmin(as.integer(min_adm_age_by_substance)-1L,
                                    as.integer(min_adm_age_rec3)-1L, na.rm=TRUE))|>
  tidytable::filter(cand_primary>=5 & cand_primary<=80)|>
  tidytable::filter(!is.na(UB_primary) & LB_primary<=UB_primary)|>
  tidytable::filter(cand_primary>=LB_primary & cand_primary<=UB_primary)

# Pick mode → median per (hash × sub)
prim_counts <- primary_pool|>
  tidytable::group_by(hash_key, primary_sub_std, cand_primary)|>
  tidytable::summarise(n_primary=tidytable::n())|>
  tidytable::ungroup()

prim_mode <- prim_counts|>
  tidytable::arrange(hash_key, primary_sub_std, tidytable::desc(n_primary), cand_primary)|>
  tidytable::slice_head(n=1, by=c("hash_key","primary_sub_std"))|>
  tidytable::rename(onset_primary_mode=cand_primary)

prim_median <- primary_pool|>
  tidytable::summarise(onset_primary_median=as.integer(median(cand_primary)),
                       .by=c("hash_key","primary_sub_std"))

prim_sources <- primary_pool|>
  tidytable::summarise(sources_primary=paste0(sort(unique(src_primary)), collapse="+"),
                       n_primary_cands=tidytable::n(),
                       .by=c("hash_key","primary_sub_std"))

prim_bounds <- sub_ub_long|>
  tidytable::mutate(LB_primary=tidytable::coalesce(5L, 5L),  # placeholder before join with ANY
                    UB_primary=pmin(as.integer(min_adm_age_by_substance)-1L,
                                    as.integer(min_adm_age_rec3)-1L, na.rm=TRUE))|>
  tidytable::left_join(any_resolved|>
                         tidytable::select(hash_key, onset_any_final),
                       by="hash_key")|>
  tidytable::mutate(LB_primary=tidytable::coalesce(pmax(5L, as.integer(onset_any_final)), 5L))|>
  tidytable::select(hash_key, primary_sub_std, LB_primary, UB_primary)

primary_resolved <- prim_bounds|>
  tidytable::left_join(prim_mode, by=c("hash_key","primary_sub_std"))|>
  tidytable::left_join(prim_median, by=c("hash_key","primary_sub_std"))|>
  tidytable::left_join(prim_sources, by=c("hash_key","primary_sub_std"))|>
  tidytable::left_join(any_resolved|>
                         tidytable::select(hash_key, onset_any_final),
                       by="hash_key")|>
  tidytable::mutate(onset_primary_final=tidytable::coalesce(onset_primary_mode, onset_primary_median),
                    # enforce primary >= any (if both exist)
                    onset_primary_final=tidytable::case_when(
                      !is.na(onset_any_final) & !is.na(onset_primary_final) & onset_primary_final<onset_any_final ~ NA_integer_,
                      TRUE ~ onset_primary_final
                    ),
                    PROC_PRIMARY=tidytable::case_when(
                      !is.na(onset_primary_mode)~paste0("PRIMARY=mode[",tidytable::coalesce(sources_primary,"cands"),"]"),
                      is.na(onset_primary_mode) & !is.na(onset_primary_median)~paste0("PRIMARY=median[",tidytable::coalesce(sources_primary,"cands"),"]"),
                      TRUE~"PRIMARY=unresolved"
                    ),
                    FLAG_PRIMARY=tidytable::case_when(
                      is.na(UB_primary)~"PRIM_UB_missing",
                      LB_primary>UB_primary~"PRIM_unsatisfiable(LB>UB)",
                      !is.na(onset_any_final) & !is.na(onset_primary_mode) & onset_primary_mode<onset_any_final~"PRIM_conflict(primary<any)",
                      !is.na(onset_any_final) & !is.na(onset_primary_median) & onset_primary_median<onset_any_final~"PRIM_conflict(primary<any)",
                      is.na(onset_primary_final)~"PRIM_no_candidate_in_range",
                      TRUE~NA_character_
                    ))|>
  tidytable::select(hash_key, primary_sub_std, LB_primary, UB_primary,
                    onset_primary_final, PROC_PRIMARY, FLAG_PRIMARY,
                    n_primary_cands, sources_primary)

# ── Combined outputs (long & compact) ─────────────────────────────────
onset_resolved_from_ext <- primary_resolved|>
  tidytable::left_join(any_resolved, by="hash_key")|>
  tidytable::mutate(PROC=paste0(tidytable::coalesce(PROC_ANY,"ANY=NA")," | ",
                                tidytable::coalesce(PROC_PRIMARY,"PRIMARY=NA")),
                    FLAG=paste0(tidytable::coalesce(FLAG_ANY,""),
                                ifelse(!is.na(FLAG_ANY) & !is.na(FLAG_PRIMARY),"; ",""),
                                tidytable::coalesce(FLAG_PRIMARY,"")))|>
  tidytable::select(hash_key, primary_sub_std,
                    onset_any_final, LB_any, UB_any,
                    onset_primary_final, LB_primary, UB_primary,
                    PROC, FLAG)

#One row per patient and per actual primary substance—just filtering and collapsing did the job.
onset_resolved_from_ext_collapsed <- onset_resolved_from_ext|>
  tidytable::filter(!is.na(primary_sub_std))|>   # only declared substances
  tidytable::distinct(hash_key, primary_sub_std, .keep_all = TRUE)


onset_resolved_from_ext|>
  tidytable::summarise(
    any_resolved=sum(!is.na(onset_any_final)),
    primary_resolved=sum(!is.na(onset_primary_final)),
    any_unresolved=sum(is.na(onset_any_final)),
    primary_unresolved=sum(is.na(onset_primary_final))
  )|>
knitr::kable("markdown", caption="Results of the algorithm")

             used    (Mb) gc trigger    (Mb)   max used    (Mb)
Ncells    5350372   285.8    9072005   484.5    9072005   484.5
Vcells 1955808765 14921.7 3352345654 25576.4 2364978772 18043.4

Results of the algorithm
any_resolved	primary_resolved	any_unresolved	primary_unresolved
35668	39293	6557	2932

We reincorporated the resolved cases into the main database of C1 treatments.

Code

# Make sure you have an standardized key of substance in C1 database
stopifnot(
identical(attr(table(SISTRAT23_c1_2010_2024_df_prev1r$primary_sub),"name"),
attr(table(onset_resolved_from_ext $primary_sub_std),"name"))
)
library(tidytable)

# -- helper: integer mode with tie -> smallest
mode_pick_int <- function(x){
  x <- x[!is.na(x)]
  if(length(x)==0) return(NA_integer_)
  tx <- sort(table(x), decreasing = TRUE)
  as.integer(names(tx)[1L])
}

# 1) Build a STRICT one-row-per-hash ANY table -------------------------
any_by_hash <-
  onset_resolved_from_ext|>
  tidytable::select(hash_key, onset_any_final, LB_any, UB_any)|>
  # collapse to single row per hash_key; if multiple ANY values exist, take mode
  tidytable::summarise(
    onset_any_final = mode_pick_int(onset_any_final),
    LB_any = suppressWarnings(min(LB_any, na.rm=TRUE)),
    UB_any = suppressWarnings(max(UB_any, na.rm=TRUE)),
    .by = hash_key
  )|>
  # replace Inf/-Inf if group had all NA
  tidytable::mutate(
    LB_any = ifelse(is.infinite(LB_any), NA_integer_, as.integer(LB_any)),
    UB_any = ifelse(is.infinite(UB_any), NA_integer_, as.integer(UB_any))
  )

# sanity: exactly one row per hash
if(nrow(any_by_hash)!=nrow(tidytable::distinct(onset_resolved_from_ext, hash_key))){
  stop("ANY table is not 1-row-per-hash; aborting to prevent row blow-up.")
}

# 2) PRIMARY table is already unique on (hash_key × primary_sub_std) ----
primary_by <-
  onset_resolved_from_ext|>
  tidytable::select(
    hash_key, primary_sub_std,
    onset_primary_final, LB_primary, UB_primary,
    PROC, FLAG
  )

# sanity: uniqueness of PRIMARY keys
dup_ks <- primary_by|>tidytable::count(hash_key, primary_sub_std)|>tidytable::filter(n>1)
if(nrow(dup_ks)>0) stop("PRIMARY table has duplicate (hash × substance).")

# 3) Reintegration to episode level ------------------------------------
SISTRAT23_c1_2010_2024_df_prev1s <-
  SISTRAT23_c1_2010_2024_df_prev1r|>
  (\(df){
    cat(paste0("5.0. Before reintegration, rows: ", formatC(nrow(df), big.mark=",")),"\n")
    cat(paste0("5.0. Before reintegration, RUNs : ", formatC(nrow(tidytable::distinct(df, hash_key)), big.mark=",")),"\n")
    df
  })()|>
  # standardize episode primary_sub for the key
  tidytable::mutate(primary_sub_std = tolower(primary_sub))|>
  # join ANY (one-to-many by hash_key only; row count must not change)
  tidytable::left_join(any_by_hash, by="hash_key")|>
  # join PRIMARY (many-to-one on (hash_key, primary_sub_std))
  tidytable::left_join(primary_by, by=c("hash_key","primary_sub_std"),
                       suffix=c("", "_PRIMARY"))|>
  # produce final columns
  tidytable::mutate(
    age_subs_onset_rec2 = tidytable::coalesce(onset_any_final, age_subs_onset_rec),
    age_primary_onset_rec2 = tidytable::coalesce(onset_primary_final, age_prim_subs_onset_rec),
    # keep PROC/FLAG as provided by primary_by (already contains "ANY=... | PRIMARY=...")
    PROC_onset = PROC,
    FLAG_onset = FLAG,
    # annotate OBS
    OBS = tidytable::case_when(
      !is.na(age_subs_onset_rec2) & !is.na(age_primary_onset_rec2)~ paste0(as.character(OBS), ";", "5.0.Onset resolution integrated (ANY+PRIMARY)"),
      !is.na(age_subs_onset_rec2) &  is.na(age_primary_onset_rec2)~ paste0(as.character(OBS), ";", "5.0.Onset resolution integrated (ANY only)"),
       is.na(age_subs_onset_rec2) & !is.na(age_primary_onset_rec2)~ paste0(as.character(OBS), ";", "5.0.Onset resolution integrated (PRIMARY only)"),
      TRUE ~ as.character(OBS)
    ))|>
  # drop intermediate duplicates of original PROC/FLAG if you wish
  tidytable::select(-PROC, -FLAG)|>
  tidytable::select(-any_of(c("primary_sub_std", "onset_any_final", "onset_primary_final")))|># "LB_any", "UB_any", "LB_primary", "UB_primary"
  tidytable::rename("LB_age_subs_onset_rec2"="LB_any", "UB_age_subs_onset_rec2"="UB_any", "LB_age_primary_onset_rec2"="LB_primary", "UB_age_primary_onset_rec2"="UB_primary")|>
  (\(df){
    cat(paste0("5.0. After reintegration, rows: ", formatC(nrow(df), big.mark=",")),"\n")
    cat(paste0("5.0. After reintegration, RUNs : ", formatC(nrow(tidytable::distinct(df, hash_key)), big.mark=",")),"\n")
    if(nrow(df)!=nrow(SISTRAT23_c1_2010_2024_df_prev1r)) stop("Error: row count changed after reintegration.")
    df
  })()

# 4) QA: episodes must be unique by rn
qa_dup <- SISTRAT23_c1_2010_2024_df_prev1s|>tidytable::count(rn)|>tidytable::filter(n>1)
if(nrow(qa_dup)>0){
  warning(paste0("Duplicated episodes after reintegration: ", nrow(qa_dup)))
}else{
  message("OK: no duplicated episodes by 'rn' after reintegration.")
}

OK: no duplicated episodes by ‘rn’ after reintegration.

Code

if(SISTRAT23_c1_2010_2024_df_prev1s|> group_by(rn)|> summarise(n=n())|> filter(n>1)|> nrow()>0) stop("Error: row count changed after reintegration.")

#ANY_no_candidate_in_range; PRIM_no_candidate_in_range (324) are the hard cases with neither resolvable.
#PRIM_conflict(primary<any) (7) are tiny; enforce PRIMARY ≥ ANY post-hoc or clamp.

5.0. Before reintegration, rows: 173,728 
5.0. Before reintegration, RUNs : 121,299 
5.0. After reintegration, rows: 173,728 
5.0. After reintegration, RUNs : 121,299

Columns recording admissible age of onset of substance use and primary use reintegrated to the main database called SISTRAT23_c1_2010_2024_df_prev1s, with the names age_subs_onset_rec2 and age_primary_onset_rec2. We kept LB and UB variables as boundaries in case of imputation. We also added the columns PROC_onset and FLAG_onset to designated the criteria used to replace values in age of onset of substance use variables and reasons for inconsistency.

Information about municipality of residence

First, we corrected CUT for Algarrobo (5602) which was bad written (6502).

We standardized and integrated commune-level poverty data for Chile across the period 2007-2024, combining rural classification information with annual poverty estimates. We addressed two critical data harmonization challenges: commune code standardization and temporal data integration.

The processing pipeline begins by reading rural classification data from the PNDR (National Rural Development Policy) classification file, which contained commune-level rural/urban designations. Simultaneously, we processed separate Excel files containing annual poverty estimates, each requiring specific handling due to variations in file structure and naming conventions across different years.

A key transformation involved standardizing commune codes using a mapping table that converts legacy codes (e.g., 16101, 16102) to standardized formats (8401, 8402). This standardization ensured consistent matching between rural classification data and poverty estimates.

After processing individual annual datasets, the code aggregated all years into a unified longitudinal dataset (pobr_mult_2010_2020) containing standardized commune codes, poverty percentages, and temporal identifiers. This created a comprehensive foundation for analyzing poverty trends across Chilean communes while maintaining data consistency and enabling rural-urban comparisons over the 14-year period.

The final output provides researchers with a clean, standardized dataset suitable for longitudinal analysis of poverty patterns, with particular value for examining relationships between rural classification and poverty dynamics across Chilean municipalities.

Code

# ===============================
# Commune poverty (income) by admission year + IPM 2022 (multidimensional)
# Relative paths via `wdpath`, no library(), package::-qualified, pipes |>, no spaces.
# ===============================

# --- helpers ---
standardize_comuna_code <- function(x){
  x <- as.character(x)
  map <- c(
    "16101"="8401","16102"="8402","16103"="8406","16104"="8407","16105"="8410",
    "16106"="8411","16107"="8413","16108"="8418","16109"="8421","16201"="8414",
    "16202"="8403","16203"="8404","16204"="8408","16205"="8412","16206"="8415",
    "16207"="8420","16301"="8416","16302"="8405","16303"="8409","16304"="8417",
    "16305"="8419"
  )
  y <- unname(map[x]); x[!is.na(y)] <- y[!is.na(y)]; x
}
nearest_year_fill <- function(p_all){
  yrs <- 2007:2024
  cods <- unique(p_all$cod)
  grid <- data.frame(cod=rep(cods,each=length(yrs)), anio=rep(yrs,times=length(cods)), stringsAsFactors=FALSE)

  df <- tidytable::left_join(grid, p_all, by=c("cod","anio"))

  obs  <- df|>tidytable::filter(!is.na(porc_pobr))
  miss <- df|>tidytable::filter(is.na(porc_pobr))
  if(nrow(miss)==0) return(obs|>tidytable::arrange(cod,anio)|>tidytable::distinct(cod,anio,.keep_all=TRUE))

  cand <- tidytable::inner_join(
            miss|>tidytable::rename(anio_miss=anio),
            obs |>tidytable::rename(anio_obs=anio, porc_obs=porc_pobr),
            by="cod"
          ) |>
          tidytable::mutate(dist=abs(anio_obs-anio_miss)) |>
          tidytable::arrange(cod, anio_miss, dist, tidytable::desc(anio_obs)) |>
          tidytable::distinct(cod, anio_miss, .keep_all=TRUE) |>
          tidytable::transmute(cod, anio=anio_miss, porc_pobr=porc_obs)

  tidytable::bind_rows(
    obs|>tidytable::rename(porc_pobr=porc_pobr, anio=anio),
    cand
  ) |>
  tidytable::arrange(cod,anio) |>
  tidytable::distinct(cod,anio,.keep_all=TRUE)
}
read_poverty_year <- function(anio,fname,skip_rows,inp_path){
  x <- readxl::read_excel(file.path(inp_path,fname), skip=skip_rows)|>
    tidytable::mutate(anio=anio)
  nm <- names(x)
  nm <- gsub("Código","Codigo",nm,fixed=TRUE)
  nm <- gsub("Nombre comuna","NombreComuna",nm,fixed=TRUE)
  names(x) <- nm
  measure <- ifelse(
    grepl("multidimensional|índice|indice",tolower(paste(names(x),collapse=" ")))||
    grepl("multidimensional|índice|indice",tolower(fname)),
    "multidimensional","income")
  x|>
    tidytable::rename_with(~"porc_pobr",tidyselect::matches("(?i)porcentaje.*pobreza"))|>
    tidytable::rename(
      ci_lo       = tidyselect::matches("(?i)l[ií]mite.*inferior"),
      ci_hi       = tidyselect::matches("(?i)l[ií]mite.*superior"),
      in_casen    = tidyselect::matches("(?i)presencia.*casen"),
      sae_tipo    = tidyselect::matches("(?i)tipo.*estimaci[oó]n"),
      n_pobres    = tidyselect::matches("(?i)n[uú]mero.*pobreza"),
      n_poblacion = tidyselect::matches("(?i)proyecciones.*poblaci[oó]n")
    )|>
    (\(df){
      if(!"Codigo"%in%names(df)) df <- tidytable::rename(df,Codigo=tidyselect::all_of(names(df)[2]))
      if(!"NombreComuna"%in%names(df)) df <- tidytable::rename(df,NombreComuna=tidyselect::all_of(names(df)[3]))
      # create missing optional cols
      opt_num <- c("ci_lo","ci_hi","n_poblacion","n_pobres")
      opt_chr <- c("in_casen","sae_tipo")
      for(nm in opt_num) if(!nm%in%names(df)) df[[nm]] <- NA_real_
      for(nm in opt_chr) if(!nm%in%names(df)) df[[nm]] <- NA_character_
      df
    })()|>
    tidytable::mutate(
      porc_pobr=as.numeric(gsub(",",".",gsub("%","",as.character(porc_pobr),fixed=TRUE),fixed=TRUE)),
      ci_lo=as.numeric(gsub(",",".",gsub("%","",as.character(ci_lo),fixed=TRUE),fixed=TRUE)),
      ci_hi=as.numeric(gsub(",",".",gsub("%","",as.character(ci_hi),fixed=TRUE),fixed=TRUE)),
      n_poblacion=as.numeric(gsub("\\.","",as.character(n_poblacion))),
      n_pobres=as.numeric(gsub("\\.","",as.character(n_pobres))),
      Codigo=as.character(Codigo),
      cod=standardize_comuna_code(gsub("^0+","",Codigo,perl=TRUE)),
      measure=measure
    )|>
    tidytable::select(anio,cod,measure,porc_pobr,ci_lo,ci_hi,in_casen,sae_tipo,n_poblacion,n_pobres)
}
# --- paths ---
inp_path <- file.path(wdpath,"cons/_input")
# --- rurality (optional, for later join if you want) ---
Comunas_PNDR <- readxl::read_excel(file.path(inp_path,"Clasificacion-comunas-PNDR.xlsx"))|>
  tidytable::mutate(cod=standardize_comuna_code(as.character(cod_com)))
# --- table-driven file list (income 2010–2020 + IPM 2022) ---
files_tbl <- tidytable::tribble(
  ~anio, ~fname, ~skip_rows,
  2022,"Estimaciones_Indice_Pobreza_Multidimensional_Comunas_2022.xlsx",1,
  2020,"Estimaciones_de_Tasa_de_Pobreza_por_Ingresos_por_Comunas_2020_revisada2022_09.xlsx",1,
  2019,"Estimaciones_de_Tasa_de_Pobreza_por_Ingresos_por_Comunas_2020_revisada2022_09.xlsx",1,
  2018,"PLANILLA_Estimaciones_comunales_tasa_pobreza_por_ingresos_multidimensional_2017.xlsx",1,
  2017,"PLANILLA_Estimaciones_comunales_tasa_pobreza_por_ingresos_multidimensional_2017.xlsx",1,
  2016,"PLANILLA_Estimaciones_comunales_tasa_pobreza_por_ingresos_multidimensional_2015.xlsx",1,
  2015,"PLANILLA_Estimaciones_comunales_tasa_pobreza_por_ingresos_multidimensional_2015.xlsx",1,
  2014,"PLANILLA_Estimaciones_comunales_tasa_pobreza_por_ingresos_2013.xlsx",1,
  2013,"PLANILLA_Estimaciones_comunales_tasa_pobreza_por_ingresos_2013.xlsx",1,
  2012,"Estimacion_tasa_de_pobreza_comunal_2011_(nueva _metodologia).xlsx",1,
  2011,"Estimacion_tasa_de_pobreza_comunal_2011_(nueva _metodologia).xlsx",1,
  2010,"Estimacion_tasa_de_pobreza_comunal_2011_(nueva _metodologia).xlsx",1
)
# --- read all files once ---
poverty_all <- purrr::pmap_dfr(
    list(files_tbl$anio,files_tbl$fname,files_tbl$skip_rows),
    ~read_poverty_year(..1,..2,..3,inp_path)
)