SENDAs Agreement 1 Update 2010-2022 (Prediction, step 1)

The dataset is enriched with derived variables, clinically meaningful recodings, and grouped diagnostic categories, while excluding patients still in treatment or with truncated admission dates. This results in a person-level dataset suitable for semi-competing risks modeling, where each row represents a unique index treatment episode with associated covariates and event times. This step involves mainly Data Pre-processing and Feature Engineering.

Author

Andrés González Santa Cruz

Published

January 5, 2026

Data Loading and Exploration

Loading Packages and uniting databases

Code

# invisible("Only run from Ubuntu")
# if (!(Sys.getenv("RSTUDIO_SESSION_TYPE") == "server" || file.exists("/.dockerenv"))) {
#   if(Sys.info()["sysname"]!="Windows"){
#     Sys.setenv(RETICULATE_PYTHON = "/home/fondecytacc/.pyenv/versions/3.11.5/bin/python")
#   }
# }

#clean enviroment
rm(list = ls()); gc()

time_before_dedup2<-Sys.time()

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

# Busca .mamba_root/envs/py311/python.exe desde getwd() hacia padres
find_python_rel <- function(start = getwd(),
                            rel = file.path(".mamba_root","envs","py311","python.exe")) {
  cur <- normalizePath(start, winslash = "/", mustWork = FALSE)
  repeat {
    cand <- normalizePath(file.path(cur, rel), winslash = "/", mustWork = FALSE)
    if (file.exists(cand)) return(cand)
    parent <- dirname(cur)
    if (identical(parent, cur)) return(NA_character_)  # llegó a la raíz
    cur <- parent
  }
}
# --- Bootstrap reticulate con ruta relativa a getwd() ---
if(Sys.info()["sysname"]!="Windows"){
  Sys.setenv(RETICULATE_PYTHON = "usr/bin/python3")
  reticulate::py_config()
} else {
  py <- find_python_rel()
  if (is.na(py)) {
    stop("No se encontró Python relativo a getwd() (buscando '.mamba_root/envs/py311/python.exe').\n",
         "Directorio actual: ", getwd())
  }
  # Forzar ese intérprete
  Sys.unsetenv(c("RETICULATE_CONDAENV","RETICULATE_PYTHON_FALLBACK"))
  Sys.setenv(RETICULATE_PYTHON = py)
  reticulate::use_python(py, required=T)
  reticulate::py_config()  # verificación
}
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

# --- Load 250927_c1_1025_df_prev1y.rds (simple, project-root aware, no functions) ---

# 1) Locate the file anywhere under the project root (current WD with a trailing "/cons" trimmed)
project_root <- sub("(/)?cons/?$", "", normalizePath(getwd(), winslash = "/", mustWork = FALSE))
fname        <- "251001_c1_1025_df_prev1y.rds"

hits <- list.files(project_root, pattern = paste0("^", fname, "$"),
                   recursive = TRUE, full.names = TRUE)

if (!length(hits)) stop("File not found: ", fname, "\nSearched under: ", project_root)
path <- hits[1]  # take first match

message("Loading: ", path)

# 2) Read into a dedicated environment (keeps .GlobalEnv clean)
env_prev1y <- new.env(parent = emptyenv())
obj        <- readRDS(path)
assign("SISTRAT23_c1_2010_2024_df_prev1y", obj, envir = env_prev1y)
rm(obj)

# 3) Quick sanity check
with(env_prev1y$SISTRAT23_c1_2010_2024_df_prev1y, { message("Rows: ", nrow(env_prev1y$SISTRAT23_c1_2010_2024_df_prev1y), " | Cols: ", ncol(env_prev1y$SISTRAT23_c1_2010_2024_df_prev1y));  print(str(env_prev1y$SISTRAT23_c1_2010_2024_df_prev1y))})
SISTRAT23_c1_2010_2024_df_prev1y<- 
env_prev1y$SISTRAT23_c1_2010_2024_df_prev1y

rm(env_prev1y)
# (Optional) If you want it in the global env, uncomment:
# list2env(env_prev1y, envir = .GlobalEnv)

          used (Mb) gc trigger (Mb) max used (Mb)
Ncells  604456 32.3    1317361 70.4   961116 51.4
Vcells 1160482  8.9    8388608 64.0  1876213 14.4
python:         G:/My Drive/Alvacast/SISTRAT 2023/.mamba_root/envs/py311/python.exe
libpython:      G:/My Drive/Alvacast/SISTRAT 2023/.mamba_root/envs/py311/python311.dll
pythonhome:     G:/My Drive/Alvacast/SISTRAT 2023/.mamba_root/envs/py311
version:        3.11.5 | packaged by conda-forge | (main, Aug 27 2023, 03:23:48) [MSC v.1936 64 bit (AMD64)]
Architecture:   64bit
numpy:           [NOT FOUND]

NOTE: Python version was forced by RETICULATE_PYTHON
Classes 'tidytable', 'tbl', 'data.table' and 'data.frame':  162897 obs. of  125 variables:
 $ TABLE                        : chr  "2023" "2015" "2023" "2013" ...
 $ TABLE_rec_series             : chr  "||20232||" "||20151||" "||20231||" "||2013||" ...
 $ rn                           : int  240474 66234 225781 44305 69194 111875 235652 213374 14494 27258 ...
 $ rn_series                    : chr  NA NA NA NA ...
 $ num_trat_ant                 : num  0 1 0 1 2 2 0 0 0 1 ...
 $ fecha_ultimo_tratamiento     : chr  NA "3 a 4 anos" NA "1 a 2 anos" ...
 $ hash_key                     : chr  "00002d6e6b95298ec21341639f62bbf9a98cd51e1b8eb858c7df1e9680e5b22b" "000068521f2408c3c6d49b287428d89bad9c198726cea177017d649d8a66c001" "0000773f8535cc999b521b1d6c9784df95dd7bf19b68e1ae94da41114d45bc59" "00015ab15b7747293d16142f7cd40abc73495f0a7390a71c14fe37265f5efde2" ...
 $ min_adm_age_rec3             : num  57.9 31.5 38.9 20.6 20.6 ...
 $ adm_age_rec3                 : num  57.9 31.5 38.9 20.6 21.7 ...
 $ birth_date_rec               : Date, format: "1965-12-18" "1982-05-23" ...
 $ adm_date_num_rec2            : num  19675 16041 19187 15974 16364 ...
 $ adm_date_rec2                : Date, format: "2023-11-14" "2013-12-02" ...
 $ dit_rec6                     : num  20 485 365 176 199 36 211 302 15 217 ...
 $ disch_date_rec6              : Date, format: "2023-12-04" "2015-04-01" ...
 $ disch_date_num_rec6_trans    : num  19695 16526 19552 16150 16563 ...
 $ def_date                     : Date, format: NA NA ...
 $ adm_motive                   : chr  "spontaneous consultation" "sanitary sector" "other" "spontaneous consultation" ...
 $ tr_compliance_rec7           : chr  "early dropout" "referral" "completion" "late dropout" ...
 $ adm_disch_reason             : chr  NA NA NA NA ...
 $ referral_type                : chr  NA "other facility" NA NA ...
 $ plan_type                    : chr  "pg-pab" "pg-pab" "pg-pai" "pg-pab" ...
 $ id_centro                    : num  255 330 501 489 489 489 409 841 341 212 ...
 $ senda                        : chr  "si" "si" "si" "si" ...
 $ pub_center                   : Factor w/ 2 levels "FALSE","TRUE": 1 2 2 2 2 2 2 2 1 2 ...
 $ primary_sub                  : chr  "alcohol" "alcohol" "marijuana" "cocaine paste" ...
 $ second_sub1                  : chr  NA NA NA "marijuana" ...
 $ second_sub2                  : chr  NA NA NA "alcohol" ...
 $ second_sub3                  : chr  NA NA NA "tranquilizers/hypnotics" ...
 $ prim_sub_freq                : chr  "3. 2 to 3 days a week" "3. 2 to 3 days a week" "3. 2 to 3 days a week" "5. Daily" ...
 $ prim_sub_route               : chr  "Oral (drunk or eaten)" "Oral (drunk or eaten)" "Smoked or pulmonary aspiration" "Smoked or pulmonary aspiration" ...
 $ LB_age_primary_onset_rec2    : int  NA NA NA 10 10 10 NA NA 9 9 ...
 $ UB_age_primary_onset_rec2    : int  NA NA NA 19 19 19 NA NA 41 41 ...
 $ age_primary_onset_rec2       : num  14 13 13 17.7 17.7 ...
 $ first_sub_used               : chr  "alcohol" "alcohol" "marijuana" "alcohol" ...
 $ sus_ini_mod_mvv              : Factor w/ 5 levels "cocaine paste",..: 3 3 4 3 3 3 4 3 3 3 ...
 $ sus_ini_1                    : chr  NA NA NA NA ...
 $ sus_ini_2                    : chr  NA NA NA NA ...
 $ sus_ini_3                    : chr  NA NA NA NA ...
 $ LB_age_subs_onset_rec2       : int  NA NA NA 5 5 5 NA NA 5 5 ...
 $ UB_age_subs_onset_rec2       : int  NA NA NA 19 19 19 NA NA 41 41 ...
 $ age_subs_onset_rec2          : int  14 13 13 10 10 10 36 13 9 9 ...
 $ biopsych_comp                : chr  "2-moderate" "2-moderate" "1-mild" "2-moderate" ...
 $ mod_psiq_cie_10              : chr  NA "retraso mental::NA" "trastornos neuroticos, secundarios a situaciones estresantes y somatomorfos::NA" NA ...
 $ mod_psiq_dsm_iv              : chr  NA NA NA NA ...
 $ diagnostico_trs_fisico       : chr  "en estudio" "en estudio" "en estudio" "en estudio" ...
 $ otros_probl_at_sm_or         : chr  NA "otros" "abuso sexual" "sin otros problemas de salud mental" ...
 $ sub_dep_icd10_status         : chr  "hazardous consumption" "drug dependence" "hazardous consumption" "hazardous consumption" ...
 $ evaluacindelprocesoteraputico: chr  "logro minimo" "logro alto" "logro alto" "logro minimo" ...
 $ eva_consumo                  : chr  "logro minimo" "logro alto" "logro alto" "logro minimo" ...
 $ eva_fam                      : chr  "logro minimo" "logro intermedio" "logro intermedio" "logro minimo" ...
 $ eva_relinterp                : chr  "logro minimo" "logro alto" "logro intermedio" "logro minimo" ...
 $ eva_ocupacion                : chr  "logro minimo" "logro alto" "logro alto" "logro minimo" ...
 $ eva_sm                       : chr  "logro minimo" "logro intermedio" "logro alto" "logro minimo" ...
 $ eva_fisica                   : chr  "logro minimo" "logro alto" "logro alto" "logro minimo" ...
 $ eva_transgnorma              : chr  "logro minimo" "logro alto" "logro alto" "logro minimo" ...
 $ dg_global_nec_int_soc_or     : chr  "altas" NA "bajas" NA ...
 $ dg_nec_int_soc_cap_hum_or    : chr  "altas" NA "bajas" NA ...
 $ dg_nec_int_soc_cap_fis_or    : chr  "altas" NA "bajas" NA ...
 $ dg_nec_int_soc_cap_soc_or    : chr  "altas" NA "bajas" NA ...
 $ dg_global_nec_int_soc_egr_or : chr  "altas" "medias" NA NA ...
 $ dg_nec_int_soc_cap_hum_egr_or: chr  "altas" "medias" "bajas" NA ...
 $ dg_nec_int_soc_cap_fis_egr_or: chr  "altas" "medias" "bajas" NA ...
 $ dg_nec_int_soc_cap_soc_egr_or: chr  "altas" "bajas" "medias" NA ...
 $ usuario_tribunal_trat_droga  : chr  "no" "no" "no" "no" ...
 $ nationality_cons             : chr  "chile" "chile" "chile" "chile" ...
 $ ethnicity_c1_c6_historic     : chr  NA NA NA NA ...
 $ discapacidad                 : chr  "no" NA "no" NA ...
 $ opcion_discapacidad          : chr  NA NA NA NA ...
 $ sex_rec                      : chr  "hombre" "hombre" "mujer" "hombre" ...
 $ identidad_de_genero          : chr  "masculino" NA "femenino" NA ...
 $ orientacion_sexual           : chr  "heterosexual" NA "heterosexual" NA ...
 $ pregnant                     : chr  "no" NA "no" NA ...
 $ pregnant_disch               : chr  "no" NA "no" NA ...
 $ marital_status               : chr  "married/cohabiting" "single" "married/cohabiting" "single" ...
 $ tiene_menores_de_edad_a_cargo: chr  "no" NA "si" NA ...
 $ num_hijos_trat_res           : num  0 0 0 0 0 0 0 0 1 0 ...
 $ numero_de_hijos              : num  2 0 2 0 0 0 4 2 1 0 ...
 $ con_quien_vive               : chr  "con la pareja, hijos y padres o familia de origen" "solo" "unicamente con la pareja e hijos" "unicamente con padres o familia de origen" ...
 $ tipo_de_vivienda             : chr  "casa" "mediagua" "casa" "casa" ...
 $ precariedad_vivienda         : chr  "reside en una vivienda en buen estado" NA "reside en una vivienda en buen estado" NA ...
 $ tenure_status_household      : chr  "owner/transferred dwellings/pays dividends" "stays temporarily with a relative" "illegal settlement" "owner/transferred dwellings/pays dividends" ...
 $ servicios_basicos_95         : chr  "reside en una vivienda con servicios sanitarios basicos" NA "reside en una vivienda con servicios sanitarios basicos" NA ...
 $ perso_dormitorio_vivienda    : chr  "menor a 2,5 personas" NA "menor a 2,5 personas" NA ...
 $ ed_attainment                : chr  "3-Completed primary school or less" "2-Completed high school or less" "2-Completed high school or less" "3-Completed primary school or less" ...
 $ occupation_condition         : chr  "employed" "unemployed" "employed" "employed" ...
 $ occupation_status            : chr  NA NA NA "salaried" ...
 $ rubro_trabaja                : chr  NA NA NA "trabajadores no calificados (ocupaciones elementales)" ...
 $ laboral_ingresos             : chr  "300.001 a 400.000" NA "400.001 a 500.000" NA ...
 $ yr_block                     : num  1965 1980 1980 1990 1990 ...
 $ OBS_series                   : chr  NA NA NA NA ...
 $ senda_series                 : chr  NA NA NA NA ...
 $ pub_center_series            : chr  NA NA NA NA ...
 $ id_centro_series             : chr  NA NA NA NA ...
 $ disch_date_num_rec6          : num  19695 16526 19552 16150 16563 ...
 $ dg_psiq_cie_10_instudy       : logi  FALSE FALSE FALSE FALSE FALSE TRUE ...
 $ dg_psiq_cie_10_dg            : logi  FALSE TRUE TRUE FALSE FALSE FALSE ...
 $ dg_psiq_dsm_iv_instudy       : logi  FALSE FALSE FALSE TRUE TRUE TRUE ...
 $ dg_psiq_dsm_iv_dg            : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ plan_type_series             : chr  "||pg-pab||" "||pg-pab||" "||pg-pai||" "||pg-pab||" ...
  [list output truncated]
 - attr(*, ".internal.selfref")=<externalptr> 
NULL

Code

#https://github.com/rstudio/renv/issues/544
#renv falls back to copying rather than symlinking, which is evidently very slow in this configuration.
renv::settings$use.cache(FALSE)

#only use explicit dependencies (in DESCRIPTION)
renv::settings$snapshot.type("implicit")

#check if rstools is installed
if(Sys.info()["sysname"]=="Windows"){
try(installr::install.Rtools(check_r_update=F))
}

Installing package into ‘G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32’ (as ‘lib’ is unspecified)

Code

check_quarto_version <- function(required = "1.7.29", comparator = c("ge","gt","le","lt","eq")) {
  comparator <- match.arg(comparator)
  current <- package_version(paste(unlist(quarto::quarto_version()), collapse = "."))
  req     <- package_version(required)

  ok <- switch(comparator,
               ge = current >= req,
               gt = current >  req,
               le = current <= req,
               lt = current <  req,
               eq = current == req)

  if (!ok) {
    stop(sprintf("Quarto version check failed: need %s %s (installed: %s).",
                 comparator, required, current), call. = FALSE)
  }
  invisible(TRUE)
}

check_quarto_version("1.7.29", "ge") 

#change repository to CL
local({
  r <- getOption("repos")
  r["CRAN"] <- "https://cran.dcc.uchile.cl/"
  options(repos=r)
})

if(!require(pacman)){install.packages("pacman");require(pacman)}

Cargando paquete requerido: pacman

Code

if(!require(pak)){install.packages("pak");require(pak)}

Cargando paquete requerido: pak

Code

pacman::p_unlock(lib.loc = .libPaths()) #para no tener problemas reinstalando paquetes

No 00LOCK detected in: G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32 No 00LOCK detected in: C:/Program Files/R/R-4.4.1/library

Code

if(Sys.info()["sysname"]=="Windows"){
if (getRversion() != "4.4.1") { stop("Requires R version 4.4.1; Actual: ", getRversion()) }
}

#check docker
check_docker_running <- function() {
  # Try running 'docker info' to check if Docker is running
  system("docker info", intern = TRUE, ignore.stderr = TRUE)
}
if(Sys.info()["sysname"]=="Windows"){
  install_docker <- function() {
    # Open the Docker Desktop download page in the browser for installation
    browseURL("https://www.docker.com/products/docker-desktop")
  }
  # Main logic
  if (inherits(try(check_docker_running(), silent = TRUE), "try-error")) {
    liftr::install_docker()
  } else {
    message("Docker is running.")
  }
}

Warning in system(“docker info”, intern = TRUE, ignore.stderr = TRUE): el comando ejecutado ‘docker info’ tiene el estatus 1

Docker is running.

Code

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#PACKAGES#######################################################################
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

unlink("*_cache", recursive=T)

# ----------------------------------------------------------------------
# 2. Use a single pak::pkg_install() call for most CRAN packages
# ----------------------------------------------------------------------

paks <-
  c(#"git", 
    # To connect to github
    "gh", #interface for  GitHub API from R
    #
    "gitcreds", # manages Git credentials (usernames, passwords, tokens)
    #
    "usethis", # simplifies common project setup tasks for R developers
    # Package to bring packages in development
    "devtools",
    # Package administration
    "renv",
    # To manipulate data
    "knitr", "pander", "DT",
    # Join
    "fuzzyjoin", "RecordLinkage",
    # For tables
    "tidyverse", "janitor",
    # For contingency tables
    "kableExtra",
    # For connections with python
    "reticulate",
    # To manipulate big data
    "polars", "sqldf",
    # To bring big databases
    "nanoparquet",
    # Interface for R and RStudio in R
    "installr", "rmarkdown", "quarto", "yaml", #"rstudioapi",
    # Time handling
    "clock",
    # Combine plots
    "ggpubr",
    # Parallelized iterative processing
    "furrr",
    # Work like a tibble with a data.table database
    "tidytable",
    # Split database into training and testing
    "caret",
    # Impute missing data
    "missRanger", "mice",
    # To modularize tasks
    "job",
    # For PhantomJS install checks
    "webshot"
  )

# dplyr
# janitor
# reshape2
# tidytable
# arrow
# boot
# broom
# car
# caret
# data.table
# DiagrammeR
# DiagrammeRsvg
# dplyr
# epiR
# epitools
# ggplot2
# glue
# htmlwidgets
# knitr
# lubridate
# naniar
# parallel
# polycor
# pROC
# psych
# readr
# rio
# rsvg
# scales
# stringr
# tableone
# rmarkdown
# biostat3
# codebook
# finalfit
# Hmisc
# kableExtra
# knitr
# devtools
# tidyr
# stringi
# stringr
# muhaz
# sqldf
# compareGroups
# survminer
# lubridate
# ggfortify
# car
# fuzzyjoin
# compareGroups
# caret
# job
# htmltools
# nanoparquet
# ggpubr
# polars
# installr
# clock
# pander
# reshape
# mice
# missRanger
# VIM
# withr
# biostat3
# broom
# glue
# finalfit
# purrr
# sf


# pak::pkg_install(paks)

pak::pak_sitrep()
# pak::sysreqs_check_installed(unique(unlist(paks)))
#pak::lockfile_create(unique(unlist(paks)),  "dependencies_duplicates24.lock", dependencies=T)
#pak::lockfile_install("dependencies_duplicates24.lock")
#https://rdrr.io/cran/pak/man/faq.html
#pak::cache_delete()

library(tidytable)

Adjuntando el paquete: ‘tidytable’

The following objects are masked from ‘package:stats’:

dt, filter, lag

The following object is masked from ‘package:base’:

%in%

Code

library(ggplot2)
library(readr)
library(glmulti)

Cargando paquete requerido: rJava

Cargando paquete requerido: leaps

Code

library(tableone)
library(survivalmodels)
#renv::install("patchwork@1.2.0")
library(survex)
library(SemiMarkov)

Cargando paquete requerido: numDeriv

Cargando paquete requerido: MASS

Adjuntando el paquete: ‘MASS’

The following object is masked from ‘package:tidytable’:

select

Cargando paquete requerido: Rsolnp

Code

library(flexsurv)

Cargando paquete requerido: survival

Code

library(rms)

Cargando paquete requerido: Hmisc

Adjuntando el paquete: ‘Hmisc’

The following object is masked from ‘package:tidytable’:

summarize

The following objects are masked from ‘package:base’:

format.pval, units

Code

library(survidm)
library(caret)

Cargando paquete requerido: lattice

Adjuntando el paquete: ‘caret’

The following object is masked from ‘package:survival’:

cluster

Code

# library(shapr)#https://norskregnesentral.github.io/shapr/
# library(survidm)#https://sci-hub.ee/10.1002/bimj.201700200
# library(SemiMarkov)#https://www.degruyterbrill.com/document/doi/10.1515/ijb-2020-0083/html?lang=en
# #https://www.jclinepi.com/article/S0895-4356(24)00142-2/fulltext

# ----------------------------------------------------------------------
# 3. Activate polars code completion (safe to try even if it fails)
# ----------------------------------------------------------------------
#try(polars_code_completion_activate())

# ----------------------------------------------------------------------
# 4. BPMN from GitHub (not on CRAN, so install via devtools if missing)
# ----------------------------------------------------------------------
if (!requireNamespace("bpmn", quietly = TRUE)) {
  devtools::install_github("bergant/bpmn")
}

# ----------------------------------------------------------------------
# 5. PhantomJS Check (use webshot if PhantomJS is missing)
# ----------------------------------------------------------------------
# if (!webshot::is_phantomjs_installed()) {
#   webshot::install_phantomjs()
# }

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#FUNCTIONS######################################################################
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#
# NO MORE DEBUGS
options(error = NULL)        # si antes tenías options(error = recover) o browser)
options(browserNLdisabled = FALSE)


#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#
#NAs are replaced with "" in knitr kable
options(knitr.kable.NA = '')

pander::panderOptions('big.mark', ',')
pander::panderOptions('decimal.mark', '.')

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#

#to format rows in bold
format_cells <- function(df, rows ,cols, value = c("italics", "bold", "strikethrough")){

  # select the correct markup
  # one * for italics, two ** for bold
  map <- setNames(c("*", "**", "~~"), c("italics", "bold", "strikethrough"))
  markup <- map[value]  

  for (r in rows){
    for(c in cols){

      # Make sure values are not factors
      df[[c]] <- as.character( df[[c]])

      # Update formatting
      df[r, c] <- ifelse(nchar(df[r, c])==0,"",paste0(markup, gsub(" ", "", df[r, c]), markup))
    }
  }

  return(df)
}
#To produce line breaks in messages and warnings
knitr::knit_hooks$set(
   error = function(x, options) {
     paste('\n\n<div class="alert alert-danger" style="font-size: small !important;">',
           gsub('##', '\n', gsub('^##\ Error', '**Error**', x)),
           '</div>', sep = '\n')
   },
   warning = function(x, options) {
     paste('\n\n<div class="alert alert-warning" style="font-size: small !important;">',
           gsub('##', '\n', gsub('^##\ Warning:', '**Warning**', x)),
           '</div>', sep = '\n')
   },
   message = function(x, options) {
     paste('<div class="message" style="font-size: small !important;">',
           gsub('##', '\n', x),
           '</div>', sep = '\n')
   }
)

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#

sum_dates <- function(x){
  cbind.data.frame(
    min= as.Date(min(unclass(as.Date(x)), na.rm=T), origin = "1970-01-01"),
    p001= as.Date(quantile(unclass(as.Date(x)), .001, na.rm=T), origin = "1970-01-01"),
    p005= as.Date(quantile(unclass(as.Date(x)), .005, na.rm=T), origin = "1970-01-01"),
    p025= as.Date(quantile(unclass(as.Date(x)), .025, na.rm=T), origin = "1970-01-01"),
    p25= as.Date(quantile(unclass(as.Date(x)), .25, na.rm=T), origin = "1970-01-01"),
    p50= as.Date(quantile(unclass(as.Date(x)), .5, na.rm=T), origin = "1970-01-01"),
    p75= as.Date(quantile(unclass(as.Date(x)), .75, na.rm=T), origin = "1970-01-01"),
    p975= as.Date(quantile(unclass(as.Date(x)), .975, na.rm=T), origin = "1970-01-01"),
    p995= as.Date(quantile(unclass(as.Date(x)), .995, na.rm=T), origin = "1970-01-01"),
    p999= as.Date(quantile(unclass(as.Date(x)), .999, na.rm=T), origin = "1970-01-01"),
    max= as.Date(max(unclass(as.Date(x)), na.rm=T), origin = "1970-01-01")
  )
}

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

# Define the function adapted for Polars
sum_dates_polars <- function(df, date_col) {
  # Create the list of quantiles
  quantiles <- c(0.001, 0.005, 0.025, 0.25, 0.5, 0.75, 0.975, 0.995, 0.999)
  # Create expressions to calculate min and max
  expr_list <- list(
    pl$col(date_col)$min()$alias("min"),
    pl$col(date_col)$max()$alias("max")
  )
  # Add expressions for quantiles
  for (q in quantiles) {
    expr_list <- append(expr_list, pl$col(date_col)$quantile(q)$alias(paste0("p", sub("\\.", "", as.character(q)))))
  }
  # Apply the expressions and return a DataFrame with the results
  df$select(expr_list)
}

# Custom function for sampling with a seed
sample_n_with_seed <- function(data, size, seed) {
  set.seed(seed)
  dplyr::sample_n(data, size)
}

# Function to get the most frequent value 
most_frequent <- function(x) { 
  uniq_vals <- unique(x)
  freq_vals <- sapply(uniq_vals, function(val) sum(x == val))
  most_freq <- uniq_vals[which(freq_vals == max(freq_vals))]
  
  if (length(most_freq) == 1) {
    return(most_freq)
  } else {
    return(NA)
  }
}
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#CONFIG #######################################################################
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

options(scipen=2) #display numbers rather scientific number

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

# Define the function first
#joins these values with semicolons and optionally truncates the result if it exceeds a specified width.
toString2 <- function(x, width = NULL, ...) {
    string <- paste(x, collapse = "; ")
    if (missing(width) || is.null(width) || width == 0) 
        return(string)
    if (width < 0) 
        stop("'width' must be positive")
    if (nchar(string, type = "w") > width) {
        width <- max(6, width)
        string <- paste0(substr(string, 1, width - 3), "...")
    }
    string
}
normalize_txt <- function(x) {
  x|>
    stringi::stri_trans_general("Latin-ASCII")|>
    tolower()|>
    trimws()
}

pair_str <- function(main, sub) {
  main <- as.character(main)
  sub  <- as.character(sub)
  both_na <- is.na(main) & is.na(sub)

  out <- paste0(
    tidytable::coalesce(main, "NA"),
    "::",
    tidytable::coalesce(sub,  "NA")
  )

  out[both_na] <- NA_character_
  out
}

# ── Helpers ────────────────────────────────────────────────────────────
mode_pick_int <- function(x){
  x <- x[!is.na(x)]
  if(length(x)==0) return(NA_integer_)
  tx <- sort(table(x), decreasing = TRUE)
  as.integer(names(tx)[1L])
}

subkey_to_label <- function(x){
  y <- gsub("_"," ", tolower(x))
  y <- gsub("amphetamine type stimulants","amphetamine-type stimulants", y)
  y <- gsub("tranquilizers hypnotics","tranquilizers/hypnotics", y)
  y
}

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# ── Encryption ────────────────────────────────────────────────────────

# Pack: [24-byte nonce][ciphertext] -> hex
encrypt_chr <- function(x, key) {
    vapply(x, function(v) {
        if (is.na(v)) return(NA_character_)
        ct <- sodium::data_encrypt(charToRaw(as.character(v)), key)
        nonce <- attr(ct, "nonce")                 # 24 bytes
        sodium::bin2hex(c(nonce, ct))              # pack nonce + ct
    }, FUN.VALUE = character(1))
}

# Unpack: hex -> [nonce|ciphertext], restore attr(nonce), then decrypt
decrypt_chr <- function(x, key) {
    vapply(x, function(v) {
        if (is.na(v)) return(NA_character_)
        buf <- sodium::hex2bin(v)
        if (length(buf) < 25L) stop("Ciphertext too short or corrupted.")
        nonce <- buf[1:24]
        ct    <- buf[-(1:24)]
        attr(ct, "nonce") <- nonce
        rawToChar(sodium::data_decrypt(ct, key))
    }, FUN.VALUE = character(1))
}

# Function to encrypt a character vector
encrypt_column <- function(x, key) {
  sapply(x, function(item) {
    if (is.na(item) || item == "") {
      return(NA_character_)
    }
    encrypted_raw <- sodium::data_encrypt(charToRaw(item), key)
    base64enc::base64encode(encrypted_raw)  # Convert to base64 for storage
  }, USE.NAMES = FALSE)
}

decrypt_column <- function(x, key) {
  sapply(x, function(item) {
    if (is.na(item)) return(NA_character_)
    encrypted_raw <- base64enc::base64decode(item)
    rawToChar(sodium::data_decrypt(encrypted_raw, key))
  }, USE.NAMES = FALSE)
}


is_stata_ok <- function(x) {
  nchar(x) <= 32 & grepl("^[A-Za-z][A-Za-z0-9_]*$", x)
}


#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# ── Tidy RMS ────────────────────────────────────────────────────────

tidy_cph <- function(model) {
  if (!inherits(model, "cph")) stop("Model must be an rms::cph object")
  
  # run summary
  s <- summary(model)
  df_s <- as.data.frame(unclass(s))
  df_s$rown <- rownames(s)
  
  # find rows that are "Hazard Ratio"
  hr_rows <- trimws(df_s$rown) == "Hazard Ratio"
  
  # build tidy tibble
  tibble::tibble(
    term     = trimws(rownames(s)[which(hr_rows) - 1L]), # previous row = variable name
    estimate = df_s$Effect[hr_rows],                     # Hazard Ratio
    conf.low = df_s$`Lower 0.95`[hr_rows],
    conf.high= df_s$`Upper 0.95`[hr_rows],
    p.value  = df_s$P[hr_rows]
  )
}

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# --- Step 1: Define File Paths ---

# SET THIS to the correct path of your secret text file
secret_file_path <- "C:/Users/andre/Documents/secret.txt"
# This locates the user-level .Renviron file automatically
renviron_path <- file.path(Sys.getenv("HOME"), ".Renviron")
# --- Step 2: Read the Secret Key from the .txt file ---
# Read the first line of the file and trim any whitespace
api_key <- trimws(readLines(secret_file_path, n = 1, warn = FALSE))
# --- Step 3: Check if the key is already in .Renviron ---
# Read all existing lines from .Renviron (if it exists)
if (file.exists(renviron_path)) {
  existing_lines <- readLines(renviron_path, warn = FALSE)
} else {
  existing_lines <- c()
}
# Check if a line for OPENAI_API_KEY already exists
key_exists <- any(grepl("^OPENAI_API_KEY=", existing_lines))
# --- Step 4: Write to .Renviron if the key isn't already there ---
if (!key_exists) {
  # Format the line to be added
  line_to_add <- sprintf('OPENAI_API_KEY="%s"', api_key)
  # Append the line to the .Renviron file, adding a newline before it if the file isn't empty
  cat(line_to_add, "\n", file = renviron_path, append = TRUE)
  message("✅ Success! OPENAI_API_KEY has been added to your .Renviron file.")
  message("Please restart your R session for the change to take effect.")
} else {
  message("ℹ️ The OPENAI_API_KEY already exists in your .Renviron file. No changes were made.")
}

ℹ️ The OPENAI_API_KEY already exists in your .Renviron file. No changes were made.

Code

options(.gander_chat = ellmer::chat_openai(model = "o1-mini"))

Error in contrib.url(repos, "source") : 
  trying to use CRAN without setting a mirror
* pak version:
- 0.8.0.1
* Version information:
- pak platform: x86_64-w64-mingw32 (current: x86_64-w64-mingw32, compatible)
- pak repository: - (local install?)
* Optional packages installed:
- pillar
* Library path:
- G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32
- C:/Program Files/R/R-4.4.1/library
* pak is installed at G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32/pak.
* Dependency versions:
- callr      3.7.6      
- cli        3.6.2      
- curl       5.2.1      
- desc       1.4.3      
- filelock   1.0.3      
- jsonlite   1.8.8      
- lpSolve    5.6.23.9000
- pkgbuild   1.4.4      
- pkgcache   2.2.2.9000 
- pkgdepends 0.7.2.9000 
- pkgsearch  3.1.3.9000 
- processx   3.8.4      
- ps         1.7.6      
- R6         2.5.1      
- zip        2.3.1      
* Dependencies can be loaded

Note

Data Pre-processing and Feature Engineering

Outliers in continuous variables will be managed using Tukey’s method, where values beyond 1.5 times the interquartile range (IQR) from the first or third quartile are flagged for review.

The analysis will incorporate 14 predictor variables, including three continuous variables. To model non-linear relationships, we will generate transformed variables, including logarithmic (log(x)), quadratic (x²), and cubic (x^3) terms. Additionally, some continuous variables may be discretized into categorical features.

If required, continuous predictors will be zero-centered and scaled by a factor to improve the model’s computational performance and numerical stability.

Aims

To identify key prognostic and predictive factors associated with the risk of, and time to, the first readmission to treatment among patients aged 18-64 undergoing initial treatment for Substance Use Disorder (SUD) in Chile between 2010 and 2020.
To identify key prognostic and predictive factors associated with the risk of, and time to, all-cause mortality among the same cohort of patients undergoing initial treatment for SUD in Chile.
To predict the risk and timing of first treatment readmission and all-cause mortality simultaneously, treating these outcomes within a semi-competing risks framework. This will allow for the estimation of cumulative incidence for each event in the presence of the other for patients aged 18-64 who initiated SUD treatment in Chile between 2010 and 2020.

This study has three primary objectives. First, we aim to identify key prognostic and predictive factors associated with the risk and timing of two critical outcomes: (i) first readmission to treatment and (ii) all-cause mortality. Second, using a semi-competing risks framework, we will develop a model to simultaneously predict these two outcomes, accounting for the fact that death precludes the possibility of readmission. Finally, this model will be used to estimate the probability of various patient trajectories over time following initial treatment. The study population includes all patients aged 18-64 who began treatment for Substance Use Disorder (SUD) in Chile between 2010 and 2020.

1. Structure of treatments and rules to collapse continuous entries

The variables involved in the analysis are the following:.

Time to first readmission from admission readmit_time_from_adm_m / from discharge readmit_time_from_disch_m in months, and event (readmit_event)
Time to mortality death_time_from_adm_m / from discharge death_time_from_disch_m in months, and event (death_event) Predisposing factors:
Admission age adm_age_rec3 [log1p(x)=adm_age_log, centered= adm_age_c,^2=adm_age_pow2, ^3=adm_age_pow3, three groups= adm_age_rec3_cat]
Sex (often used for biological categorization) sex_rec
Housing situation tenure_status_household_rec
Employment status occupation_condition_corr24
Marital status marital_status_rec
Poverty index of the commune of residence porc_pobr [log1p(x)=porc_pobr_log, centered= porc_pobr_c, six quantile groups= porc_pobr_c_cat6]
Urbanization level of the commune of residence urbanicity_cat
Educational attainment ed_attainment_corr
Living arrangement/Cohabitation status cohabitation

Need factors: * Psychiatric comorbidity dg_psiq_cie_10_instudy + dg_psiq_cie_10_dg * Severity of Substance Use Disorder (SUD) sub_dep_icd10_status * Primary substance of use primary_sub_mod * Frequency of primary substance use at admission prim_sub_freq (recoded: prim_sub_freq_rec) * Polysubstance use polysubstance_strict

Enabling factors: * Length of stay in treatment (months) dit_rec6 [log1p(x)=treat_log, ^2=treat_days_pow2, ^3=treat_days_pow3, <90days=treat_lt_90] * Treatment outcome tr_outcome * Admission motive adm_motive

Suitable for a research context:.

Court-referred to drug treatment usuario_tribunal_trat_droga (very unbalanced, ~1% yes)
Type of housing tipo_de_vivienda_rec

Stratification variables:.

Treatment modality plan_type_corr

1.1. Cohort Construction for Time-to-Event Analysis

First, we flagged all qualifying treatment episodes by filtering for patients aged 18–65 with an admission date between 2010 and 2020. To ensure each patient served as their own baseline, we isolated only the first qualifying treatment for each individual, removing any second or subsequent qualifying treatments from the dataset. Using this first episode as the index event, we then built time-to-event variables in months for our outcomes of interest: death and readmission. All observations were administratively censored as of December 31, 2020, to all observations. The final analytical dataset included only the first qualifying treatment per patient, carrying forward the time-to-event information and a counter of any discarded prior treatments.

What can I do with adm truncated (0.2%), and currently in (0.1%)??

We created a formatted database in wide format for multistate analysis (SISTRAT23_c1_2010_2024_df_prev1z).

Code

# --- Safe helpers (no warnings) ---
safe_min_where_int  <- function(x, flag) {
  idx <- which(flag %in% TRUE & !is.na(x))
  if (length(idx) == 0L) NA_integer_ else as.integer(min(x[idx]))
}
safe_min_where_date <- function(x, flag) {
  idx <- which(flag %in% TRUE & !is.na(x))
  if (length(idx) == 0L) as.Date(NA) else min(x[idx])
}
safe_min_date <- function(x) {
  x <- x[!is.na(x)]
  if (length(x) == 0L) as.Date(NA) else min(x)
}

# Windows of interest
.start_date  <- as.Date("2010-01-01")
.end_date    <- as.Date("2020-12-31")
.censor_date <- as.Date("2020-12-31")

# 0) Start & canonical order + treatment index
df0 <- SISTRAT23_c1_2010_2024_df_prev1y|>
  tidytable::ungroup()|>
  tidytable::arrange(hash_key, adm_date_rec2, disch_date_rec6)|>
  tidytable::mutate(treatment = tidytable::row_number(), .by = hash_key)

n0_cases <- nrow(df0)
n0_runs  <- nrow(tidytable::distinct(df0, hash_key))

message("0. INITIAL database, cases: ", formatC(nrow(df0), big.mark=","), sep = "")

INITIAL database, cases: 162,897

Code

message("0. INITIAL database, RUNs: " , formatC(nrow(tidytable::distinct(df0, hash_key)), big.mark=","), sep = "")

INITIAL database, RUNs: 121,299

Code

# 1) Qualifying flag (age 18–65 & admission in 2010–2020 inclusive)
df1 <- df0|>
  tidytable::mutate(
    meets_crit = !is.na(adm_age_rec3)  & adm_age_rec3 >= 18 & adm_age_rec3 <= 64 &
                 !is.na(adm_date_rec2) & adm_date_rec2 >= .start_date & adm_date_rec2 <= .end_date
  )

# First qualifying treatment per patient + counter of discarded priors (no warnings)
first_keep_tbl <- df1|>
  tidytable::summarise(
    first_keep_treat = safe_min_where_int(treatment, meets_crit),
    .by = hash_key
  )|>
  tidytable::mutate(
    n_discarded_before = ifelse(is.na(first_keep_treat), NA_integer_,
                                as.integer(first_keep_treat - 1L))
  )

n_patients_meet <- first_keep_tbl|> tidytable::filter(!is.na(first_keep_treat))|> nrow()
message("1. Patients with ≥1 qualifying admission (age 18–64 & 2010–2020): ",
        formatC(n_patients_meet, big.mark=","), sep = "")

Patients with ≥1 qualifying admission (age 18–64 & 2010–2020): 88,632

Code

# 2) Keep ONLY qualifying episodes; rank; drop 3rd+
df_q <- df1|>
  tidytable::filter(meets_crit)|>
  tidytable::arrange(hash_key, adm_date_rec2, disch_date_rec6)|>
  tidytable::mutate(q_rank = tidytable::row_number(), .by = hash_key)|>
  tidytable::filter(q_rank <= 2L)

n1b_cases <- nrow(df_q)
n1b_patients <- nrow(tidytable::distinct(df_q, hash_key))
message("1b. AFTER filtering to qualifying episodes, cases: ",
        formatC(nrow(df_q), big.mark=","), sep = "")

1b. AFTER filtering to qualifying episodes, cases: 107,702

Code

message("1b. AFTER filtering to qualifying episodes, RUNs: ",
        formatC(nrow(tidytable::distinct(df_q, hash_key)), big.mark=","), sep = "")

1b. AFTER filtering to qualifying episodes, RUNs: 88,632

Code

message("2. AFTER dropping ≥3rd qualifying treatments per patient, cases: ",
        formatC(nrow(df_q), big.mark=","), sep = "")

AFTER dropping ≥3rd qualifying treatments per patient, cases: 107,702

Code

message("2. AFTER dropping ≥3rd qualifying treatments per patient, RUNs: ",
        formatC(nrow(tidytable::distinct(df_q, hash_key)), big.mark=","), sep = "")

AFTER dropping ≥3rd qualifying treatments per patient, RUNs: 88,632

Code

# 3) Anchors per patient (warning-free)
anchors <- df_q|>
  tidytable::summarise(
    first_adm   = safe_min_where_date(adm_date_rec2,   q_rank == 1L),
    first_disch = safe_min_where_date(disch_date_rec6, q_rank == 1L),
    second_adm  = safe_min_where_date(adm_date_rec2,   q_rank == 2L),  # NA if no second qualifying
    .by = hash_key
  )

# Death date per patient (from full data, not only qualifying rows)
death_tbl <- df0|>
  tidytable::summarise(death_date = safe_min_date(def_date), .by = hash_key)

# 4) Build per-person time-to-event (no min() calls here)
perperson <- df_q|>
  tidytable::distinct(hash_key)|>
  tidytable::left_join(anchors,       by = "hash_key")|>
  tidytable::left_join(death_tbl,     by = "hash_key")|>
  tidytable::left_join(first_keep_tbl, by = "hash_key")|>
  tidytable::mutate(
    # Event indicators with censoring at 2020-12-31
    readmit_event = as.integer(!is.na(second_adm) & second_adm <= .censor_date),
    death_event   = as.integer(!is.na(death_date)  & death_date  <= .censor_date),

    # End times for intervals (choose event time or censor date)
    readmit_end_from_adm   = ifelse(readmit_event == 1L, second_adm, .censor_date),
    readmit_end_from_disch = ifelse(readmit_event == 1L, second_adm, .censor_date),
    death_end_from_adm     = ifelse(death_event   == 1L, death_date, .censor_date),
    death_end_from_disch   = ifelse(death_event   == 1L, death_date, .censor_date),

    # Durations (months) from first qualifying admission/discharge
    readmit_time_from_adm_m =
      ifelse(!is.na(first_adm),
        as.numeric(lubridate::time_length(lubridate::interval(first_adm,   readmit_end_from_adm),   "months")),
        NA_real_),
    readmit_time_from_disch_m =
      ifelse(!is.na(first_disch),
        as.numeric(lubridate::time_length(lubridate::interval(first_disch, readmit_end_from_disch), "months")),
        NA_real_),
    death_time_from_adm_m =
      ifelse(!is.na(first_adm),
        as.numeric(lubridate::time_length(lubridate::interval(first_adm,   death_end_from_adm),     "months")),
        NA_real_),
    death_time_from_disch_m =
      ifelse(!is.na(first_disch),
        as.numeric(lubridate::time_length(lubridate::interval(first_disch, death_end_from_disch),   "months")),
        NA_real_)
  )|>
  # Clamp negatives (dirty dates safeguard)
  tidytable::mutate(
    readmit_time_from_adm_m   = ifelse(!is.na(readmit_time_from_adm_m),   pmax(readmit_time_from_adm_m,   0), NA_real_),
    readmit_time_from_disch_m = ifelse(!is.na(readmit_time_from_disch_m), pmax(readmit_time_from_disch_m, 0), NA_real_),
    death_time_from_adm_m     = ifelse(!is.na(death_time_from_adm_m),     pmax(death_time_from_adm_m,     0), NA_real_),
    death_time_from_disch_m   = ifelse(!is.na(death_time_from_disch_m),   pmax(death_time_from_disch_m,   0), NA_real_)
  )|>
  tidytable::mutate(discarded_before_flag = as.integer(n_discarded_before > 0L))

# 5) FINAL: keep only the first qualifying treatment row per patient, carry times/counters
SISTRAT23_c1_2010_2024_df_prev1z <-
  df_q|>
  tidytable::filter(q_rank == 1L)|>
  tidytable::left_join(perperson, by = "hash_key")|>
  tidytable::select(-meets_crit, -q_rank)|>
  (\(df){
  
    n_cases   <- nrow(df)
    n_runs    <- tidytable::distinct(df, hash_key) |> nrow()
    red_cases <- nrow(df0) - n_cases
    red_runs  <- (tidytable::distinct(df0, hash_key) |> nrow()) - n_runs
    readmit   <- sum(df$readmit_event)
  
    # store (step 3)
    n_cases   ->> m3_cases
    n_runs    ->> m3_runs
    red_cases ->> m3_cases_red
    red_runs  ->> m3_runs_red
    list(cases=n_cases, runs=n_runs, readmit=readmit, cases_red=red_cases, runs_red=red_runs) ->> m3
  
    message("3. FINAL: first qualifying treatment per patient (with time-to-event), cases: ",
            formatC(n_cases, big.mark=","), sep = "")
    message("3. FINAL: RUNs: ", formatC(n_runs, big.mark=","), sep = "")
    message("   CASES reduction vs initial(prev1y): ", formatC(red_cases, big.mark=","), sep = "")
    message("   RUNs  reduction vs initial(prev1y): ", formatC(red_runs,  big.mark=","), sep = "")
  
    if (n_cases > nrow(df0)) stop("Error: Added treatment episodes in the process")
    df
  })() |>
  tidytable::filter(!tr_compliance_rec7 == "currently in") |>
  (\(df){
  
    n_cases   <- nrow(df)
    n_runs    <- tidytable::distinct(df, hash_key) |> nrow()
    red_cases <- nrow(df0) - n_cases
    red_runs  <- (tidytable::distinct(df0, hash_key) |> nrow()) - n_runs
  
    # store (step 4)
    n_cases   ->> m4_cases
    n_runs    ->> m4_runs
    red_cases ->> m4_cases_red
    red_runs  ->> m4_runs_red
    list(cases=n_cases, runs=n_runs, cases_red=red_cases, runs_red=red_runs) ->> m4
  
    message("4. FINAL: leaving out patients currently in tr., cases: ",
            formatC(n_cases, big.mark=","), sep = "")
    message("4. FINAL: RUNs: ", formatC(n_runs, big.mark=","), sep = "")
    message("   CASES reduction vs initial(prev1y): ", formatC(red_cases, big.mark=","), sep = "")
    message("   RUNs  reduction vs initial(prev1y): ", formatC(red_runs,  big.mark=","), sep = "")
  
    df
  })()

FINAL: first qualifying treatment per patient (with time-to-event), cases: 88,632

FINAL: RUNs: 88,632

CASES reduction vs initial(prev1y): 74,265

RUNs reduction vs initial(prev1y): 32,667

FINAL: leaving out patients currently in tr., cases: 88,504

FINAL: RUNs: 88,504

CASES reduction vs initial(prev1y): 74,393

RUNs reduction vs initial(prev1y): 32,795

Code

#SISTRAT23_c1_2010_2024_df_prev1z |> filter(is.na(dit_m)) |> janitor::tabyl(tr_compliance_rec7, show_na = T)
SISTRAT23_c1_2010_2024_df_prev1z$porc_pobr_c_cat6<- cut2(SISTRAT23_c1_2010_2024_df_prev1z$porc_pobr_c,g=6)

cat("Remove intermediate databases\n")
rm(df0); rm(df1); rm(df_q); rm(anchors); rm(death_tbl); rm(first_keep_tbl)
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_


# --- choose your time origin for the multistate ---
.use_discharge_origin <- FALSE   # TRUE = origin at first_disch; FALSE = origin at first_adm

# Add origin_date
if (.use_discharge_origin) {
  perperson <- tidytable::mutate(SISTRAT23_c1_2010_2024_df_prev1z, origin_date = first_disch)
} else {
  perperson <- tidytable::mutate(SISTRAT23_c1_2010_2024_df_prev1z, origin_date = first_adm)
}

# 1) Drop patients who die before the origin (they never enter the process)
n_drop_predeath <- tidytable::filter(perperson, !is.na(death_date) & !is.na(origin_date) & death_date < origin_date) |>
  nrow()
message("A. Excluding patients who died before origin: ",
        formatC(n_drop_predeath, big.mark=","), sep = "")

A. Excluding patients who died before origin: 0

Code

stopifnot(
nrow(tidytable::filter(perperson, is.na(death_date) | is.na(origin_date) | death_date >= origin_date))== nrow(SISTRAT23_c1_2010_2024_df_prev1z)
)
# 2) Keep only those whose origin is observed and within study window (optional but recommended)
n_before <- nrow(SISTRAT23_c1_2010_2024_df_prev1z)
perperson2 <- tidytable::filter(perperson, !is.na(origin_date) & origin_date <= .censor_date)
message("B. Excluding missing/after-censor origins: ",
        formatC(n_before - nrow(perperson2), big.mark=","), sep = "")

B. Excluding missing/after-censor origins: 0

Code

# 3) Ensure monotone times: second_adm must be after origin (else reclassify as 'no readmission')
# perperson <- tidytable::mutate(perperson,
#     invalid_readmit = !is.na(second_adm) & second_adm <= origin_date,
#     second_adm      = ifelse(invalid_readmit, as.Date(NA), second_adm),
#     readmit_event   = ifelse(invalid_readmit, 0L, readmit_event)
#   )

rm(perperson); rm(perperson2)

Remove intermediate databases

1.1.* Flowchart

Code

#For SISTRAT23_c1_2010_2024_df_prev1t
try(load(paste0(dirname(path),"/SISTRAT_database_sep25_prevt.RData")))

#n_patients_meet;n1b_cases; n1b_patients; m4; m3

df23_ndp_20250824_SISTRAT23_c1_1024<- 
 readRDS(
   paste0(dirname(path),"/psu/23_ndp_20250824_SISTRAT23_c1_1024_df2.rds")
 )

# Labels
lab_orig <- paste0(
  "Original C1 Dataset \n(n = ",
  formatC(nrow(df23_ndp_20250824_SISTRAT23_c1_1024), format = "f", big.mark = ",", digits = 0),
  "; Patients = ",
  formatC(dplyr::n_distinct(df23_ndp_20250824_SISTRAT23_c1_1024$hash_key), format = "f", big.mark = ",", digits = 0),
  ")"
)

lab_orig_clean <- paste(
  "Pre-processing & Quality Checks",
  "&#8226; Remove exact duplicate records",
  "&#8226; Resolve overlapping episodes (keep longest / merge)",
  "&#8226; Fix inconsistent admission/discharge dates",
  "&#8226; Correct implausible birth dates/ages",
  "&#8226; Remove negative/implausible days in treatment",
"",
  sep = "\\l"
)
lab_post_orig  <- paste0('C1 Dataset \n(n = ', formatC(nrow(SISTRAT23_c1_2010_2024_df_prev1t), format='f', big.mark=',', digits=0), '; Patients = ',formatC(dplyr::distinct(SISTRAT23_c1_2010_2024_df_prev1t, hash_key)|> nrow(), format='f', big.mark=',', digits=0),')')
lab_post_orig_clean <- paste("&#8226; Collapse consecutive linked", "treatments (≤45d gap, referral=yes)", sep = "\\l")


lab_proc  <- paste0('C1 Cleaned & Collapsed\n(n = ', formatC(nrow(SISTRAT23_c1_2010_2024_df_prev1y), format='f', big.mark=',', digits=0), '; Patients = ',formatC(dplyr::distinct(SISTRAT23_c1_2010_2024_df_prev1y, hash_key)|> nrow(), format='f', big.mark=',', digits=0),')')
lab_flag  <- "Sorted by Admission Date\n+ First Treatment Flag"


lab_discard_first <- paste(
  "&#8226; Qualifying flag (age 18–64 & admission in 2010–2020 inclusive)",
  "&#8226; Dropping ≥3rd qualifying treatments per patient",
  paste0("n= ", formatC(nrow(SISTRAT23_c1_2010_2024_df_prev1y)-n1b_cases, format = "f", digits = 0, big.mark = ","),"; Patients= ", formatC(dplyr::distinct(SISTRAT23_c1_2010_2024_df_prev1t, hash_key)|> nrow()- n1b_patients, format = "f", digits = 0, big.mark = ",")," put aside"),
"",sep = "\\l")
##CAMBIAR

lab_after <- paste0("After First-Treatment Rule\nn= ",formatC(m3$readmit+m3$cases, format= "f", big.mark=",", digits=0),"; Patients= ", formatC(m3$cases, format= "f", big.mark=",", digits=0))  # example counts
#Frpm 3. FINAL: first qualifying treatment per patient (with time-to-event), cases: 
lab_discard_single <- paste( 
  "1st qualifying treatment per patient (with time-to-event)",
  paste0("Patients = ", formatC(m3$cases-nrow(SISTRAT23_c1_2010_2024_df_prev1z), format = "f", big.mark = ",", digits = 0)), sep="\\l")

lab_final <- paste0("Final C1 Dataset\nn = ",
                    formatC(nrow(SISTRAT23_c1_2010_2024_df_prev1z)+sum(SISTRAT23_c1_2010_2024_df_prev1z$readmit_event), format = "f", digits = 0, big.mark = ","),
                    "; Patients = ",
                    formatC(nrow(SISTRAT23_c1_2010_2024_df_prev1z), format = "f", digits = 0, big.mark = ","))
# Diagram
gr <- DiagrammeR::grViz(
    paste0(
        'digraph flowchart {
      graph [layout = dot, rankdir = TB]

      node [fontname = "Times", shape = rectangle, fontsize = 33, style = filled, fillcolor = white, ranksep=0.2, nodesep=0.2]

      # New initial box and pre-clean note
      pre_original   [label = "', lab_orig, '", fillcolor = lightgray, shape = box]
      pre_clean      [label = "', lab_orig_clean, '", shape = note, fillcolor = white]

      # Intermediate new initial box and pre-clean note
      pre2_original   [label = "', lab_post_orig, '", fillcolor = white, shape = box]
      pre2_clean      [label = "', lab_post_orig_clean, '", shape = note, fillcolor = lemonchiffon]

      # Existing boxes
      original       [label = "', lab_proc, '", fillcolor = lightgray, shape = box]
      marked         [label = "', lab_flag, '", shape = box]
      after_rule     [label = "', lab_after, '", shape = box]
      final_dataset  [label = "', lab_final, '", fillcolor = lightgray, shape = box]

      discard_first   [label = "', lab_discard_first, '", shape = note, fillcolor = mistyrose]
      discard_single  [label = "', lab_discard_single, '", shape = note, fillcolor = mistyrose]

      # Invisible points for alignment
      v00 [shape = point, width = 0, style = invis]  # between orig and post orig
      v0 [shape = point, width = 0, style = invis]  # between pre_original and original      
      v1 [shape = point, width = 0, style = invis]
      v2 [shape = point, width = 0, style = invis]
      v3 [shape = point, width = 0, style = invis]

      # Main flow (add pre_original -> v0 -> original in front)
      pre_original -> v00 [arrowhead = none]
      v00 -> pre2_original
      
      pre2_original -> v0 [arrowhead = none]
      v0 -> original      

      original -> v1 [arrowhead = none]
      v1 -> marked
      marked -> v2 [arrowhead = none]
      v2 -> after_rule
      after_rule -> v3 [arrowhead = none]
      v3 -> final_dataset

      # Discard / note connections with solid black arrows
      v00 -> pre_clean     [color = black]     # new pre-processing note
      v0 -> pre2_clean     [color = black]     # new pre-processing note
      v2 -> discard_first [color = black]
      v3 -> discard_single[color = black]

      # Alignment
      { rank = same; pre_clean; v00 }
      { rank = same; pre2_clean; v0 }
      { rank = same; discard_first; v2 }
      { rank = same; discard_single; v3 }
    }'
    ),
  width = 600, height = 900
)

gr

Figure 3. Flowchart

Exported the graph into png fortmat.

Code

unlink(paste0(gsub("/cons","",getwd()),"/cons/_figs/_flowchart_pred_files"), recursive = TRUE)
htmlwidgets::saveWidget(gr, paste0(gsub("/cons","",getwd()),"/cons/_figs/_flowchart_pred.html"))
webshot::webshot(paste0(gsub("/cons","",getwd()),"/cons/_figs/_flowchart_pred.html"), 
                 paste0(gsub("/cons","",getwd()),"/cons/_figs/_flowchart_pred.png"),
                 vwidth = 300, vheight = 300*1.5,  zoom=10, expand=100)  # Prueba con diferentes coordenadas top, left, width, and height

Registered S3 methods overwritten by ‘callr’: method from format.callr_status_error
print.callr_status_error

1.2. Death decomposition

We decomposed the cumulative incidence of death following an initial treatment episode by its temporal relationship to the first SUD treatment readmission.

Cumulative incidence functions (CIFs) were estimated using the Aalen–Johansen method, and total mortality was also calculated via Kaplan–Meier to validate consistency. The resulting decomposition shows how overall mortality is partitioned over time relative to readmission, visualized as stacked cumulative probabilities.

Code

# ──────────────────────────────────────────────────────────────────────────────
# 0) Base y nombres
df2 <- SISTRAT23_c1_2010_2024_df_prev1z |>
  dplyr::rename(
    t_r = readmit_time_from_disch_m,
    e_r = readmit_event,
    t_d = death_time_from_disch_m,
    e_d = death_event
  ) |>
  # Sanitiza tipos
  dplyr::mutate(
    e_r = as.integer(e_r),
    e_d = as.integer(e_d)
  )

# Chequeo rápido (opcional)
stopifnot(nrow(df2) > 0)
# t_d debe existir para todos (muerte o fin de seguimiento)
stopifnot(all(is.finite(df2$t_d)))

# ──────────────────────────────────────────────────────────────────────────────
# 1) Definir causas de MUERTE (mutuamente excluyentes) y tiempo de análisis
#    1 = muere antes/igual que la 1ª readmisión
#    2 = muere después de la 1ª readmisión
#    0 = censurado (no muere)
df3 <- df2 |>
  dplyr::mutate(
    cause = dplyr::case_when(
      e_d == 1L & (e_r == 0L | t_d <= t_r) ~ 1L,   # muerte antes/igual que readmisión
      e_d == 1L &  e_r == 1L & t_r <  t_d  ~ 2L,   # readmisión primero, luego muerte
      TRUE ~ 0L                                    # no muere (censura)
    ),
    # Tiempo de análisis:
    #   - muertes: t_d (tiempo a muerte)
    #   - censura: t_d (tiempo a última observación)
    ftime2 = t_d
  )

# Consistencia con tu 2x2
# with(df3, table(cause, e_d))
# Esperado: cause=1 -> 3203 muertes; cause=2 -> 744 muertes

# ──────────────────────────────────────────────────────────────────────────────
# 2) CIF de muerte por causa (competing risks)
ci2 <- cmprsk::cuminc(ftime = df3$ftime2, fstatus = df3$cause, cencode = 0)

# Helper robusto: toma el ÚLTIMO número del nombre (la causa), ignora "Tests"
get_cif_by_code <- function(ci, code = 1L){
  keep <- vapply(ci, function(x) !is.null(x$est), logical(1))
  nm   <- names(ci)[keep]
  ci   <- ci[keep]
  codes <- suppressWarnings(as.integer(sub(".*[^0-9]([0-9]+)$", "\\1", nm)))
  idx   <- which(!is.na(codes) & codes == code)[1]
  if (!length(idx) || is.na(idx)) return(data.frame(time = numeric(0), est = numeric(0)))
  data.frame(time = ci[[idx]]$time, est = ci[[idx]]$est)
}

cif_d_before <- get_cif_by_code(ci2, 1L)  # muerte antes/igual que readmisión
cif_d_after  <- get_cif_by_code(ci2, 2L)  # muerte después de readmisión

# ──────────────────────────────────────────────────────────────────────────────
# 3) KM de muerte total (chequeo externo)
km <- survival::survfit(survival::Surv(time = df3$ftime2, event = df3$e_d) ~ 1)
km_df <- data.frame(time = km$time, P_dead_total = 1 - km$surv)

# ──────────────────────────────────────────────────────────────────────────────
# 4) Comparación a horizontes fijos (12, 60, 120 y tiempo final observado)
t_end <- max(df3$ftime2, na.rm = TRUE)
taus  <- c(12, 60, 120, t_end)

risk_at <- function(fit, tau) 1 - summary(fit, times = tau, extend = TRUE)$surv
AJ_at   <- function(cif_df, tau){
  if (!nrow(cif_df)) return(rep(0, length(tau)))
  # Aproximación por función escalón (respeta naturaleza de AJ)
  sf <- stepfun(cif_df$time, c(0, cif_df$est))
  sf(tau)
}

horiz <- data.frame(
  tau_m       = taus,
  KM_death    = sapply(taus, risk_at, fit = km),
  CIF_before  = AJ_at(cif_d_before, taus),
  CIF_after   = AJ_at(cif_d_after,  taus)
) |>
  dplyr::mutate(total_CIF = CIF_before + CIF_after)

# print(horiz)
# Esperado: KM_death ≈ total_CIF en cada tau (pequeñas diferencias numéricas)

# ──────────────────────────────────────────────────────────────────────────────
# 5) Serie completa para gráfico (con stepfun; evita errores de interpolación)
sf_km <- stepfun(km_df$time,             c(0, km_df$P_dead_total))
sf_c1 <- stepfun(cif_d_before$time,      c(0, cif_d_before$est))
sf_c2 <- stepfun(cif_d_after$time,       c(0, cif_d_after$est))

t_grid <- sort(unique(c(km_df$time, cif_d_before$time, cif_d_after$time)))
out <- data.frame(
  time             = t_grid,
  CIF_death_before = sf_c1(t_grid),
  CIF_death_after  = sf_c2(t_grid),
  P_dead_total     = sf_km(t_grid)
)
out$sum_cifs <- out$CIF_death_before + out$CIF_death_after

# Chequeo final (debe acercarse al KM final ~ 9%)
# tail(out[, c("time","CIF_death_before","CIF_death_after","sum_cifs","P_dead_total")], 3)

# ──────────────────────────────────────────────────────────────────────────────
# 6) Gráfico: descomposición correcta de la mortalidad total
ggplot(out, aes(x = time)) +
  geom_ribbon(aes(ymin = 0, ymax = CIF_death_before,
                  fill = "Muerte antes de readmisión"), alpha = 0.55) +
  geom_ribbon(aes(ymin = CIF_death_before, ymax = sum_cifs,
                  fill = "Muerte después de readmisión"), alpha = 0.55) +
  geom_step(aes(y = P_dead_total, color = "Mortalidad total (KM)"),
            linewidth = 0.9) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  scale_fill_manual(values = c("Muerte antes de readmisión"   = "#A17C6C",
                               "Muerte después de readmisión" = "#5FA9B3")) +
  scale_color_manual(values = c("Mortalidad total (KM)" = "#660600")) +
  labs(x = "Meses desde el alta", y = "Probabilidad acumulada",
       fill = NULL, color = NULL,
       title = "Mortalidad acumulada: descomposición por relación con readmisión",
       subtitle = "CIF de muerte antes vs. después de la primera readmisión; línea = KM total") +
  theme_minimal(base_size = 16) +
  theme(legend.position = "bottom")

Code

# ──────────────────────────────────────────────────────────────────────────────

# CIF de "primer evento" (1=readmisión, 2=muerte), con tiempos al primer evento
ftime_first  <- pmin(df2$t_r, df2$t_d, na.rm = TRUE)
fstatus_first <- dplyr::case_when(
    df2$e_r == 1L & (df2$e_d == 0L | df2$t_r <  df2$t_d) ~ 1L, # readmisión primero
    df2$e_d == 1L & (df2$e_r == 0L | df2$t_d <= df2$t_r) ~ 2L, # muerte primero
    TRUE ~ 0L
)
ci_first <- cmprsk::cuminc(ftime = ftime_first, fstatus = fstatus_first, cencode = 0)
# readmisión como primer evento a 12m:
tp_first <- cmprsk::timepoints(ci_first, times = 12)
tp_first$est[,"12"]

# ──────────────────────────────────────────────────────────────────────────────
# Graficar
p <- out|>
    ggplot2::ggplot(ggplot2::aes(x = time)) +
    ggplot2::geom_ribbon(ggplot2::aes(ymin = 0, ymax = CIF_death_before,
                                      fill = "Death before readmission"), alpha = 0.5) +
    ggplot2::geom_ribbon(ggplot2::aes(ymin = CIF_death_before, ymax = P_dead_total,
                                      fill = "Death after readmission"), alpha = 0.5) +
    ggplot2::geom_step(ggplot2::aes(y = P_dead_total, color = "Total death (KM)"),
                       linewidth = 0.7) +
    ggplot2::geom_step(ggplot2::aes(y = CIF_death_before, color = "CIF of death before readmission"),
                       linewidth = 0.7) +
    ggplot2::scale_y_continuous(labels = scales::percent_format(accuracy = 1),
                                limits = c(0, max(out[,"sum_cifs"]))) +
    ggplot2::scale_fill_manual(values = c("Death before readmission"= "#a6cee3",
                                          "Death after readmission"= "#b2df8a")) +
    ggplot2::scale_color_manual(values = c("Total death (KM)"= "#1f78b4", "Death before readmission"= "#666666")) +
    ggplot2::labs(x = "Months since discharge", y = "Cumulative probability",
                  fill = NULL, color = NULL) +
    ggplot2::theme_minimal(base_size = 20) +
    ggplot2::theme(legend.position = "bottom")

print(p)

Code

ggsave(
  filename = paste0(gsub("/cons","",getwd()), "/cons/_figs/_death_decomposition.png"),
   plot = p +
    labs(x = "Meses desde el egreso", y = "Probabilidad acumulada",
         fill = NULL, color = NULL)+
      scale_fill_manual(
        values = c(
          "Death before readmission" = "#a6cee3",
          "Death after readmission" = "#B48448"
        ),
        labels = c(
          "Death before readmission" = "Muerte antes de la readmisión",
          "Death after readmission" = "Muerte después de la readmisión"
        )
      ) +
      scale_color_manual(
        values = c(
          "Total death (KM)" = "#5A0D13",
          "CIF of death before readmission" = "#666666"
        ),
        labels = c(
          "Total death (KM)" = "Muerte total (KM)",
          "CIF of death before readmission" = "CIF de muerte antes de la readmisión"
        )
      )+ ggplot2::theme(
    legend.position = "bottom",
    legend.box = "horizontal",
    legend.text = ggplot2::element_text(size=16, margin=margin(l=0, r=-3, unit="pt")),
    legend.title = ggplot2::element_blank(),
    legend.spacing.x = grid::unit(0.2, "cm"),
    legend.spacing.y = grid::unit(0.2, "cm"),
    legend.key.width = grid::unit(1.0, "cm")
  ) +
  ggplot2::guides(
    fill = ggplot2::guide_legend(nrow = 2, byrow = TRUE, keywidth = unit(0.5, "cm")),
    color = ggplot2::guide_legend(nrow = 2, byrow = TRUE, keywidth = unit(0.5, "cm"))
  )+ ggplot2::scale_x_continuous(
  breaks = scales::breaks_width(12),
  labels = function(m) paste0(m, "m")    # o en años: paste0(m/12, " años")
),
    dpi = 600
)

Saving 7 x 5 in image Scale for fill is already present. Adding another scale for fill, which will replace the existing scale. Scale for colour is already present. Adding another scale for colour, which will replace the existing scale.

A total of 1,126 patients (1.3%) had valid treatments between 18-64 in their second or third treatment.

1.3. Rule-Based Aggregation into ICD-10–Informed groups

To reduce sparsity and improve interpretability, we grouped specific diagnostic labels into four clinically coherent supergroups aligned with ICD-10 logic: (1) Infectious diseases (STIs, viral hepatitis, and infections related to substance use); (2) Organ-system medical diseases (cardiovascular, hepatic, oral, and hematologic conditions); (3) Injuries and sequelae; and (4) Other specified medical conditions (broad/vague descriptors such as “enfermedades somáticas”, “otras enfermedades…”, and pregnancy-related items). We preserved the status values “en estudio” and “sin trastorno” as-is, and left missing values as NA. For multi-response entries (diagnostico_trs_fisico_series), we parsed “||”-separated items, applied the same mapping to each token, deduplicated within-row, and produced both a collapsed supergroup string and one-hot indicators for the supergroups to avoid inflating counts while preventing sparse categories.

Code

icd10_broad <- function(x) {
x <- tolower(x)
tidytable::case_when(
stringr::str_detect(x, "organic|organico|demenc|alzheimer|parkinson|delirium|cerebral") ~ "F0 Organic",
stringr::str_detect(x, "psicoactiva|alcohol|marihuana|canabis|cannabis|cocain|opio|opiace|benzodiazep|sustancias") ~ "F1 Substance use",
stringr::str_detect(x, "esquizofren|psicotip|delirant|psicosis") ~ "F2 Psychotic",
stringr::str_detect(x, "estado de animo|afectiv|depres|bipolar|maniaco|distimia|hipoman") ~ "F3 Mood",
stringr::str_detect(x, "neurotic|ansiedad|fobi|panico|obsesivo|compulsiv|estres|adaptaci|somatoform|somatiz") ~ "F4 Anxiety/Stress/Somatoform",
stringr::str_detect(x, "comportamiento.*fisiolog|alimentari|anorex|bulim|sueñ|insomni|disfuncion sexual") ~ "F5 Physio/Eating/Sleep/Sexual",
stringr::str_detect(x, stringr::regex("personalidad|comportamiento del adulto|antisocial|limite|evitativ|habit|habitos|impuls|control de los impulsos|control\\s+de\\s+impulsos", ignore_case = TRUE)) ~ "F6 Personality/Adult behaviour",
stringr::str_detect(x, "retraso mental|discapacidad intelectual|intelectual") ~ "F7 Intellectual disability",
stringr::str_detect(x, "desarrollo psicolog|autism|asperger|lenguaje|aprendizaje|espectro autista|tdah|t\\s*d\\s*a\\s*h") ~ "F8/9 Neurodevelopment/Child",
TRUE ~ "Other/unspecified"
)
}

cat("#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:\n")
cat("Create dummied for psychiatric comorbidity\n")
cat("#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:\n")
cat("Living arrangement/Cohabitation status\n")
cat("#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:\n")
cat("Domestic violence & Sex abuse\n")
cat("#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:\n")
cat("Primary subs\n")
cat("#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:\n")
cat("Tr outcome\n")
cat("#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:\n")

# Create multi-label dummies using tidytable
SISTRAT23_c1_2010_2024_df_prev1z <- 
  SISTRAT23_c1_2010_2024_df_prev1z|>
  tidytable::mutate(dx_norm = normalize_txt(mod_psiq_cie_10))|>
  tidytable::mutate(dit_m = readmit_time_from_adm_m-readmit_time_from_disch_m)|>
  # avoid NAs propagating into str_detect
  tidytable::mutate(dx_norm_nna = ifelse(is.na(dx_norm), "", dx_norm))|>
  tidytable::mutate(
    f0_organic                         = stringr::str_detect(dx_norm_nna, "mentales organicos"),
    f1_substance_use                   = stringr::str_detect(dx_norm_nna, "sustancias psicoactivas|consumo de sustancias|psicoactiva"),
    f2_psychotic                       = stringr::str_detect(dx_norm_nna, "esquizofren|esquizotip|ideas? delirant|psicosis"),
    f3_mood                            = stringr::str_detect(dx_norm_nna, "humor \\(afectivos\\)"),
    f4_anxiety_stress_somatoform       = stringr::str_detect(dx_norm_nna, "neuroticos|estresantes|somatomorfos"),
    f5_physio_eating_sleep_sexual      = stringr::str_detect(dx_norm_nna, "conducta alimentaria|disfunciones fisiologicas|factores somaticos"),
    f6_personality_adult_behaviour     = stringr::str_detect(dx_norm_nna, "personalidad y del comportamiento del adulto|habitos y del control de los impulsos|transformacion persistente de la personalidad"),
    f7_intellectual_disability         = stringr::str_detect(dx_norm_nna, "retraso mental|discapacidad intelectual"),
    f8_9_neurodevelopment_child        = stringr::str_detect(dx_norm_nna, "desarrollo psicolog|comienzo habitual en la infancia y adolescencia")
  )|>
  tidytable::mutate(marital_status_rec= tidytable::case_when(marital_status== "separated/divorced/annulled"~ "separated/divorced/annulled/widowed", marital_status=="widowed"~"separated/divorced/annulled/widowed", T~ marital_status))|>
  tidytable::mutate(
  cqv_norm=con_quien_vive|>stringi::stri_trans_general("Latin-ASCII")|>
    stringr::str_to_lower()|>stringr::str_squish())|>
  tidytable::mutate(
    # 5-level (more granular)
    con_quien_vive_joel5=dplyr::case_when(
      # Alone
      stringr::str_detect(cqv_norm,"^solo$")~"Alone",
      # Couple/children only (no family-of-origin)
      stringr::str_detect(cqv_norm,"^unicamente con (la )?pareja( e hijos)?$")|
        stringr::str_detect(cqv_norm,"^unicamente con pareja$")|
        stringr::str_detect(cqv_norm,"^unicamente con hijos$")~
        "Couple/children only",
      # Couple/children + family-of-origin (extended household)
      stringr::str_detect(cqv_norm,"(pareja|hijos).*(padres|familia de origen)")|
        stringr::str_detect(cqv_norm,"^con la pareja,? hijos y padres")~
        "Couple/children + origin",
      # Family-of-origin only
      stringr::str_detect(cqv_norm,"^unicamente con padres|^con padres|familia de origen$")|
        stringr::str_detect(cqv_norm,"^con la madre \\(sola\\)$")|
        stringr::str_detect(cqv_norm,"^con abuelos$")|
        stringr::str_detect(cqv_norm,"^con hermanos$")~
        "Family of origin",
      # Others (friends / other non-relatives / other relative)
      stringr::str_detect(cqv_norm,"con amigos|otro no pariente|otro pariente|^otros$")~
        "Others",
      TRUE~"Others"))|>
  tidytable::mutate(
    # 4-level (collapsed)
    cohabitation=dplyr::case_when(
      con_quien_vive_joel5=="Alone"~"alone",
      con_quien_vive_joel5 %in% c("Couple/children only","Couple/children + origin")~"with couple/children",
      con_quien_vive_joel5=="Family of origin"~"family of origin",
      TRUE~"Others"
    ))|>
  tidytable::mutate(
    tenure_status_household_rec= tidytable::case_when(grepl("illegal|others",tenure_status_household)~"illegal settlement & others", T~tenure_status_household)
  )|>
  tidytable::mutate(
    cohabitation=forcats::fct_relevel(
      factor(cohabitation),
      c("alone","with couple/children","family of origin","others")
    ))|>
  tidytable::select(-tidytable::any_of(c("cqv_norm", "con_quien_vive_joel5", "dx_norm_nna")))|>
  tidytable::mutate(dom_violence=factor(tidytable::case_when(
      grepl("Violencia Intrafamiliar$",otros_probl_at_sm_or, ignore.case=T)~1,
      is.na(otros_probl_at_sm_or)~NA_real_,
      T~0),levels=c(0,1),labels=c("No domestic violence","Domestic violence")))|>
    tidytable::mutate(sex_abuse= factor(tidytable::case_when(
      grepl("Abuso Sexual",otros_probl_at_sm_or, ignore.case=T)~1,
      is.na(otros_probl_at_sm_or)~NA_real_,
      T~0),levels=c(0,1),labels=c("No sexual abuse","Sexual abuse")))|>
    tidytable::mutate(
        primary_sub_std = stringr::str_to_lower(stringr::str_squish(primary_sub)),
        primary_sub_mod = tidytable::case_when(
            stringr::str_detect(primary_sub_std, "cocaine\\s*(base\\s*)?paste|pasta\\s*base|\\bpaco\\b") ~ "cocaine paste",
            stringr::str_detect(primary_sub_std, "cocaine\\s*(powder|hydrochloride|hcl)") |
                stringr::str_detect(primary_sub_std, "clorhidrato\\s*de\\s*coca|clorhidrato|hidrocloruro") ~ "cocaine powder",
            stringr::str_detect(primary_sub_std, "\\balcohol\\b") ~ "alcohol",
            stringr::str_detect(primary_sub_std, "marij|cannab|marihu") ~ "marijuana",
            TRUE ~ "others"
        ))|>
    tidytable::mutate(primary_sub_mod = factor(primary_sub_mod,
                                    levels = c("cocaine paste","cocaine powder","alcohol","marijuana","others")))|>
  tidytable::select(-tidytable::any_of(c("primary_sub_std", "dx_norm")))|>
  tidytable::mutate(urbanicity_cat= tidytable::case_when(grepl("Mix",clasificacion)~ "2.Mixed", grepl("Urb",clasificacion)~ "3.Urban", grepl("Rur",clasificacion)~ "1.Rural", T~NA_character_))|>
  tidytable::mutate(ethnicity= ifelse(!is.na(ethnicity_c1_c6_historic),1,0))|>
    tidytable::mutate(
        tr_outcome = tidytable::case_when(
            tr_compliance_rec7 == "currently in" & is.na(adm_disch_reason) ~ NA_character_,
            stringr::str_detect(coalesce(tr_compliance_rec7, ""), "dropout") & is.na(adm_disch_reason) ~ "dropout",
            # explicit non-discharge statuses with NA reason
            tr_compliance_rec7 == "referral"    & is.na(adm_disch_reason) ~ "referral",
            tr_compliance_rec7 == "completion"  & is.na(adm_disch_reason) ~ "completion",
            # discharge branches (note the parentheses)
            stringr::str_detect(coalesce(tr_compliance_rec7, ""), "discharge") &
                adm_disch_reason %in% c("agreement_end", "death", "no_local_service") ~ "adm discharge - adm reasons",
            stringr::str_detect(coalesce(tr_compliance_rec7, ""), "discharge") &
                (adm_disch_reason == "rule_violation" | is.na(adm_disch_reason)) ~ "adm discharge - rule violation/undet",
            TRUE ~ "other"
        ))

Warning: 1 unknown level in f: others

Code

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
cat("#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:\n")
cat("Recode Type of housing, Admission age, Treatment days, Tr. compliance; Nationallity (Chile); \n")
cat("#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:\n")

SISTRAT23_c1_2010_2024_df_prev1z <- within(SISTRAT23_c1_2010_2024_df_prev1z, {
  tipo_de_vivienda_rec <- factor(tidytable::case_when(
    tipo_de_vivienda %in% c("casa", "departamento") ~ "formal housing",
    tipo_de_vivienda == "pieza dentro de la vivienda" ~ "shared/secondary unit",
    tipo_de_vivienda%in% c("caleta o punto de calle", "mediagua movil (carpa, casa rodante o similar)", "hospederia", "residencial, pension, hostal", "choza, rancho, ruca", "informal/rural housing") ~ "homeless/unsheltered/informal/temporary housing/institutional/collective",
    is.na(tipo_de_vivienda)~ NA_character_,
    TRUE ~ "other/unknown"  # catches "otro" and NA
    ),
  levels = c(
    "formal housing",
    "shared/secondary unit",
    "homeless/unsheltered/informal/temporary housing/institutional/collective",
    "other/unknown"
    ))
})

SISTRAT23_c1_2010_2024_df_prev1z <- within(SISTRAT23_c1_2010_2024_df_prev1z, {
  prim_sub_freq_rec <- factor(tidytable::case_when(
    prim_sub_freq %in% c("1. Less than 1 day a week", "2. 1 day a week")~ "1.≤1 day/wk",
    prim_sub_freq %in% c("3. 2 to 3 days a week", "4. 4 to 6 days a week")~ "2.2–6 days/wk",
    is.na(prim_sub_freq)~ NA_character_,
    TRUE ~ "3.Daily"  # catches "otro" and NA
    ),
  levels = c(
    "1.≤1 day/wk",
    "2.2–6 days/wk",
    "3.Daily"
    ))
})

SISTRAT23_c1_2010_2024_df_prev1z <- within(SISTRAT23_c1_2010_2024_df_prev1z, {
  tipo_de_vivienda_rec2 <- factor(tidytable::case_when(
    tipo_de_vivienda %in% c("casa", "departamento") ~ "formal housing",
    # tipo_de_vivienda == "pieza dentro de la vivienda" ~ "shared/secondary unit",
    # tipo_de_vivienda%in% c("caleta o punto de calle", "mediagua movil (carpa, casa rodante o similar)", "hospederia", "residencial, pension, hostal", "choza, rancho, ruca", "informal/rural housing") ~ "homeless/unsheltered/informal/temporary housing/institutional/collective",
    is.na(tipo_de_vivienda)~ NA_character_,
    TRUE ~ "other/unknown"  # catches "otro" and NA
    ),
  levels = c(
    "formal housing",
    # "shared/secondary unit",
    # "homeless/unsheltered/informal/temporary housing/institutional/collective",
    "other/unknown"
    ))
})

#Admission age
SISTRAT23_c1_2010_2024_df_prev1z$adm_age_log <- with(SISTRAT23_c1_2010_2024_df_prev1z, 
  ifelse(is.na(adm_age_rec3), NA_real_, log1p(adm_age_rec3)))

SISTRAT23_c1_2010_2024_df_prev1z$adm_age_c <- with(SISTRAT23_c1_2010_2024_df_prev1z, 
  ifelse(is.na(adm_age_rec3), NA_real_, scale(adm_age_rec3, center = TRUE, scale = FALSE)))

#Warning in log1p(adm_age_rec3) : Se han producido NaNs
SISTRAT23_c1_2010_2024_df_prev1z$adm_age_pow2 <- with(SISTRAT23_c1_2010_2024_df_prev1z, 
  ifelse(is.na(adm_age_rec3), NA_real_, adm_age_rec3^2))
SISTRAT23_c1_2010_2024_df_prev1z$adm_age_pow3 <- with(SISTRAT23_c1_2010_2024_df_prev1z, 
  ifelse(is.na(adm_age_rec3), NA_real_, adm_age_rec3^3))

# Admission Age categories
SISTRAT23_c1_2010_2024_df_prev1z$adm_age_cat <- with(
    SISTRAT23_c1_2010_2024_df_prev1z,
    {
        res <- rep(NA_character_, length(adm_age_rec3))
        res[adm_age_rec3 >= 18 & adm_age_rec3 < 30] <- "18-29"
        res[adm_age_rec3 >= 30 & adm_age_rec3 < 45] <- "30-44"
        res[adm_age_rec3 >= 45 & adm_age_rec3 < 65] <- "45-64"
        res
    }
)
SISTRAT23_c1_2010_2024_df_prev1z$adm_age_cat <- factor(
  SISTRAT23_c1_2010_2024_df_prev1z$adm_age_cat,
  levels = c("18-29", "30-44", "45-64")
)
SISTRAT23_c1_2010_2024_df_prev1z$adm_age_c <- scale(SISTRAT23_c1_2010_2024_df_prev1z$adm_age_rec3, center = TRUE, scale = FALSE)


invisible("Created the 90 days translated in months by subsetting and taking the average value")
dit90<- 
mean(SISTRAT23_c1_2010_2024_df_prev1z$dit_m[which(SISTRAT23_c1_2010_2024_df_prev1z$dit_rec6==90)])

# Treatment duration transformations
SISTRAT23_c1_2010_2024_df_prev1z$treat_log <- with(SISTRAT23_c1_2010_2024_df_prev1z, 
  ifelse(is.na(dit_m), NA_real_, log1p(dit_m)))
#Warning in log1p(dit_m) : Se han producido NaNs
SISTRAT23_c1_2010_2024_df_prev1z$treat_days_pow2 <- with(SISTRAT23_c1_2010_2024_df_prev1z, 
  ifelse(is.na(dit_m), NA_real_, dit_m^2))
SISTRAT23_c1_2010_2024_df_prev1z$treat_days_pow3 <- with(SISTRAT23_c1_2010_2024_df_prev1z, 
  ifelse(is.na(dit_m), NA_real_, dit_m^3))
SISTRAT23_c1_2010_2024_df_prev1z$treat_lt_90 <- with(SISTRAT23_c1_2010_2024_df_prev1z, 
  ifelse(is.na(dit_m), NA_integer_, as.integer(dit_m < dit90)))

SISTRAT23_c1_2010_2024_df_prev1z <- within(SISTRAT23_c1_2010_2024_df_prev1z, {
  sex_rec <- dplyr::case_when(
    sex_rec == "mujer" ~ "woman",
    sex_rec == "hombre" ~ "man",
    T~ NA_character_
  )
  sex_rec <- factor(sex_rec, levels = c(
    "man",
    "woman"
  ))
})
SISTRAT23_c1_2010_2024_df_prev1z <- within(SISTRAT23_c1_2010_2024_df_prev1z, {
  nationality_chile <- dplyr::case_when(
    nationality_cons== "chile" ~ "chile",
    !nationality_cons== "chile" ~ "other",
    T~ NA_character_
  )
  nationality_chile <- factor(nationality_chile, levels = c(
    "chile",
    "other"
  ))
})

SISTRAT23_c1_2010_2024_df_prev1z <- within(SISTRAT23_c1_2010_2024_df_prev1z, {
  any_violence <- dplyr::case_when(
    sex_abuse== "Sexual abuse"| dom_violence=="Domestic violence"~ "1.Domestic violence/sex abuse",
    grepl("No", sex_abuse) & grepl("No", dom_violence)~ "0.No domestic violence/sex abuse",
    grepl("No", sex_abuse) & is.na(dom_violence)~ "0.No domestic violence/sex abuse",
    is.na(sex_abuse) & grepl("No", dom_violence)~ "0.No domestic violence/sex abuse",
    T~ NA_character_
  )
  any_violence <- factor(any_violence, levels = c(
    "1.Domestic violence/sex abuse",
    "0.No domestic violence/sex abuse"
  ))
})
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

# 1) Define supergroup sets and a vectorized classifier that keeps NAs as NA
infectious_set <- c(
  "ets",
  "hepatitis b, c, d",
  "infecciosas relacionadas con uso de sustancias"
)
organ_set <- c(
  "cardiopatias: miocardiopatia dilatada por oh, arritmias, hta",
  "hepatitis alcoholica subaguda",
  "hepatitis cronica",
  "patologia bucal",
  "anemia: megaloblastica y ferropenica"
)
injury_set <- c("traumatismos y secuelas secundarios")
other_spec_set <- c(
  "otras enfermedades o condiciones fisicas limitantes",
  "otras enfermedades o condiciones de riesgo vital",
  "enfermedades somaticas",
  "patologia de la gestion y del nino intrauterino"
)
supergroups_levels <- c(
  "en estudio",
  "sin trastorno",
  "Infectious diseases (STIs, viral hepatitis, substance-use–related)",
  "Organ-system medical diseases (cardio, hepatic, oral, hematologic)",
  "Injuries and sequelae",
  "Other specified medical conditions"
)
classify_dx_vec <- function(x) {
  x <- stringr::str_squish(x)
  out <- rep(NA_character_, length(x))
  # statuses (kept as-is)
  out[x == "en estudio"] <- "en estudio"
  out[x == "sin trastorno"] <- "sin trastorno"
  # infectious
  out[x %in% infectious_set] <- "Infectious diseases (STIs, viral hepatitis, substance-use–related)"
  # organ/system
  out[x %in% organ_set] <- "Organ-system medical diseases (cardio, hepatic, oral, hematologic)"
  # injuries
  out[x %in% injury_set] <- "Injuries and sequelae"
  # other specified (broad/vague)
  out[x %in% other_spec_set] <- "Other specified medical conditions"
  # funnel any unforeseen non-NA labels to "Other specified"
  out[is.na(out) & !is.na(x)] <- "Other specified medical conditions"
  out
}
# 2) Apply to the single-diagnosis field; keep is.na(.) as NA
SISTRAT23_c1_2010_2024_df_prev1z <- SISTRAT23_c1_2010_2024_df_prev1z|>
  dplyr::mutate(
    phys_dx_supergroup = classify_dx_vec(diagnostico_trs_fisico),
    phys_dx_supergroup = forcats::fct_relevel(
      factor(phys_dx_supergroup),
      supergroups_levels))
# 3) Apply to the multi-response series (split on "||", classify, dedup per row)
#    - Produce: a collapsed string per row and wide one-hot indicators
multi_tokens <- SISTRAT23_c1_2010_2024_df_prev1z|>
  dplyr::mutate(.rowid = dplyr::row_number(),
                dx_series_raw = diagnostico_trs_fisico_series)|>
  tidyr::separate_rows(dx_series_raw, sep = "\\|\\|")|>
  dplyr::mutate(
    dx_token = stringr::str_squish(dx_series_raw),
    dx_token = dplyr::na_if(dx_token, ""),
    dx_token = dplyr::na_if(dx_token, "NA"),
    supergroup_token = classify_dx_vec(dx_token)
  )|>
  dplyr::distinct(.rowid, supergroup_token, .keep_all = TRUE)
# Collapsed string per row (NA if nothing classifiable present)
series_collapsed <- multi_tokens|>
  dplyr::filter(!is.na(supergroup_token))|>
  dplyr::group_by(.rowid)|>
  dplyr::summarise(
    dx_supergroup_series = paste(sort(unique(supergroup_token)), collapse = "||"),
    .groups = "drop"
  )
# One-hot indicators per supergroup (sanitized column names)
series_dummies <- multi_tokens|>
  dplyr::filter(!is.na(supergroup_token))|>
  dplyr::mutate(value = 1L)|>
  dplyr::distinct(.rowid, supergroup_token, .keep_all = TRUE)|>
  tidyr::pivot_wider(
    names_from = supergroup_token,
    values_from = value,
    values_fill = list(value = 0)
  )|>
  dplyr::rename_with(.fn = base::make.names, .cols = - .rowid)|> 
  dplyr::select(dplyr::any_of(c(".rowid", "hash_key", "en.estudio", "Other.specified.medical.conditions", "Organ.system.medical.diseases..cardio..hepatic..oral..hematologic.", "sin.trastorno", "Injuries.and.sequelae", "Infectious.diseases..STIs..viral.hepatitis..substance.use.related.")))|> 
  janitor::clean_names()|>
  dplyr::rename("phys_dg_org_med_card_hep_oral_hem"="organ_system_medical_diseases_cardio_hepatic_oral_hematologic", "phys_dg_infect_viral_hep_sud_rel"="infectious_diseases_st_is_viral_hepatitis_substance_use_related", "phys_dg_instudy"="en_estudio", "phys_dg_other_med_cond"="other_specified_medical_conditions", "phys_dg_inj_sequelae"="injuries_and_sequelae")|>
  dplyr::group_by(hash_key)|>
  dplyr::summarise(
    phys_dx_instudy = sum(phys_dg_instudy, na.rm = TRUE),
    phys_dx_other_spec_medical_cond = sum(phys_dg_other_med_cond, na.rm = TRUE),
    phys_dx_organ_system_med_dis = sum(phys_dg_org_med_card_hep_oral_hem, na.rm = TRUE),
    phys_dx_injuries_and_sequelae = sum(phys_dg_inj_sequelae, na.rm = TRUE),
    phys_dx_infectious_diseases = sum(phys_dg_infect_viral_hep_sud_rel, na.rm = TRUE),
    .groups = "drop"
  )|>
  # Optional: Reorder columns for clarity (hash_key first, then dummies)
  dplyr::select(hash_key, dplyr::everything())
# 4) Join back to your main data (keeps rows with NA series as NA)
SISTRAT23_c1_2010_2024_df_prev1z <- SISTRAT23_c1_2010_2024_df_prev1z|>
  dplyr::mutate(.rowid = dplyr::row_number())|>
  dplyr::left_join(series_collapsed, by = ".rowid", multiple="first")|>
  dplyr::left_join(series_dummies,  by = "hash_key", multiple="first")|>
  dplyr::select(-.rowid)|> 
  dplyr::mutate(
    # assuming logical columns like dx_F0, dx_F2, ..., dx_F8_9
    dx_f2_smi_psychotic   = as.integer(f2_psychotic),
    dx_f3_mood            = as.integer(f3_mood),
    dx_f45_anx_stress_phys = as.integer(f4_anxiety_stress_somatoform| f5_physio_eating_sleep_sexual),
    dx_f6_personality     = as.integer(f6_personality_adult_behaviour),
    dx_f0789_neurocog_dev    = as.integer(f0_organic | f7_intellectual_disability | f8_9_neurodevelopment_child)
    # optional: single collapsed factor, with priority
  )

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
hist(table(round(SISTRAT23_c1_2010_2024_df_prev1z$dit_m,1), exclude=NULL), breaks=120, main= "Histogram: Months in treatment", xlab="Months")

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
Create dummied for psychiatric comorbidity
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
Living arrangement/Cohabitation status
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
Domestic violence & Sex abuse
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
Primary subs
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
Tr outcome
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
Recode Type of housing, Admission age, Treatment days, Tr. compliance; Nationallity (Chile); 
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

1.4. Missingness

We assessed missing data across predictors, calculating the overall proportion of complete cases and percentage of missing values for each variable.

Code

# Character vector of the variables
tot_variables <- c(
  "readmit_time_from_adm_m", "readmit_time_from_disch_m", "readmit_event",
  "death_time_from_adm_m", "death_time_from_disch_m", "death_event",
  "sex_rec", "adm_age_log", "adm_age_pow2", "adm_age_pow3", "adm_age_c", "adm_age_cat", "tenure_status_household", "occupation_condition_corr24",
  "marital_status_rec", "porc_pobr", "porc_pobr_log", "porc_pobr_c",
  "ethnicity", "nationality_chile", "urbanicity_cat", "ed_attainment_corr", "cohabitation",
  "dg_psiq_cie_10_instudy", "dg_psiq_cie_10_dg",  "f0_organic", "f1_substance_use", "f2_psychotic",
  "f3_mood", "f4_anxiety_stress_somatoform", "f5_physio_eating_sleep_sexual",
  "f6_personality_adult_behaviour", "f7_intellectual_disability", "f8_9_neurodevelopment_child", 
  "dx_f2_smi_psychotic", "dx_f3_mood", "dx_f45_anx_stress_phys", "dx_f6_personality", "dx_f0789_neurocog_dev",
  "sub_dep_icd10_status", "dom_violence", "sex_abuse", "any_violence",
  "phys_dx_supergroup","phys_dx_instudy", "phys_dx_other_spec_medical_cond", "phys_dx_organ_system_med_dis", 
  "phys_dx_injuries_and_sequelae", "phys_dx_infectious_diseases",
  "prim_sub_freq", "prim_sub_freq_rec", "polysubstance_strict", "dit_m", "treat_log",
  "treat_days_pow2", "treat_days_pow3", "treat_lt_90", "tr_outcome",
  "adm_motive", "primary_sub", "primary_sub_mod", "usuario_tribunal_trat_droga",
  "tipo_de_vivienda_rec", "tipo_de_vivienda_rec2", "plan_type_corr", "evaluacindelprocesoteraputico", 
  
  "eva_consumo", "eva_fam", "eva_relinterp", "eva_ocupacion", "eva_sm", "eva_fisica", "eva_transgnorma"
)
# Character vector of the variables restricted to factor or character columns
char_variables <- c(
  "sex_rec", "tenure_status_household", "occupation_condition_corr24",
  "marital_status_rec", "ethnicity", "urbanicity_cat", "ed_attainment_corr", "cohabitation",
  "dg_psiq_cie_10_instudy", "dg_psiq_cie_10_dg", "f0_organic", "f1_substance_use", "f2_psychotic",
  "f3_mood", "f4_anxiety_stress_somatoform", "f5_physio_eating_sleep_sexual",
  "f6_personality_adult_behaviour", "f7_intellectual_disability", "f8_9_neurodevelopment_child", 
  "dx_f2_smi_psychotic", "dx_f3_mood", "dx_f45_anx_stress_phys", "dx_f6_personality", "dx_f0789_neurocog_dev",
  "sub_dep_icd10_status", "dom_violence", "sex_abuse", "any_violence",
  "phys_dx_supergroup", "phys_dx_instudy", "phys_dx_other_spec_medical_cond", "phys_dx_organ_system_med_dis", 
  "phys_dx_injuries_and_sequelae", "phys_dx_infectious_diseases",
  "prim_sub_freq", "prim_sub_freq_rec", "polysubstance_strict", "treat_lt_90", "tr_outcome",
  "adm_motive", "primary_sub", "primary_sub_mod", "usuario_tribunal_trat_droga",
  "tipo_de_vivienda_rec", "tipo_de_vivienda_rec2", "plan_type_corr", "evaluacindelprocesoteraputico",
  "eva_consumo", "eva_fam", "eva_relinterp", "eva_ocupacion", "eva_sm", "eva_fisica", "eva_transgnorma"
)

paste0( "Percentage of the total that has missing values: ",
  scales::percent(1- (
      SISTRAT23_c1_2010_2024_df_prev1z[
        which(complete.cases(subset(SISTRAT23_c1_2010_2024_df_prev1z, select = tot_variables
            ))),] %>% nrow() / nrow(SISTRAT23_c1_2010_2024_df_prev1z) 
    ), accuracy = 0.1),
  "; total = ", format(nrow(SISTRAT23_c1_2010_2024_df_prev1z), big.mark = ",")
)
paste0( "Percentage of the total that has missing values (w/o other variables): ",
  scales::percent(1- (
      SISTRAT23_c1_2010_2024_df_prev1z[
        which(complete.cases(subset(SISTRAT23_c1_2010_2024_df_prev1z, select = setdiff(tot_variables,   c("usuario_tribunal_trat_droga", "dom_violence", "sex_abuse", "any_violence", "ethnicity", "tipo_de_vivienda_rec", "tipo_de_vivienda_rec2", "phys_dx_supergroup", "phys_dx_instudy", "phys_dx_other_spec_medical_cond", "phys_dx_organ_system_med_dis", "phys_dx_injuries_and_sequelae", "evaluacindelprocesoteraputico"))
            ))),] %>% nrow() / nrow(SISTRAT23_c1_2010_2024_df_prev1z) 
    ), accuracy = 0.1),
  "; total = ", format(nrow(SISTRAT23_c1_2010_2024_df_prev1z), big.mark = ",")
)

# Variables to check
vars_check <- c(tot_variables)
# Compute % missing per variable
missing_pct <- colMeans(
  is.na(subset(SISTRAT23_c1_2010_2024_df_prev1z, select = vars_check))
) * 100
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# Named vector of labels (names = variable names)
labels_map <- c(
  # Times & events
  readmit_time_from_adm_m         = "Readmission time from admission (months)",
  readmit_time_from_disch_m       = "Readmission time from discharge (months)",
  death_time_from_adm_m           = "Death time from admission (months)",
  death_time_from_disch_m         = "Death time from discharge (months)",
  readmit_event                   = "Readmission event",
  death_event                     = "Death event",

  # Demographics & status
  sex_rec                         = "Sex (biological categorization)",
  tenure_status_household         = "Housing situation",
  tenure_status_household_rec     = "Housing situation (- informal settlement)",
  occupation_condition_corr24     = "Employment status",
  marital_status_rec              = "Marital status",
  ed_attainment_corr              = "Educational attainment",
  ethnicity                       = "Ethnicity (=1)",
  nationality_chile               = "Nationality (Chile)",
  adm_age_log                     = "Admission age (log)",
  adm_age_pow2                    = "Admission age ^2",
  adm_age_pow3                    = "Admission age ^3",
  adm_age_c                       = "Admission age (centered)",
  adm_age_cat                     = "Admission age (three groups)",

  # Contextual
  porc_pobr                       = "Poverty index of commune of residence",
  porc_pobr_log                   = "Poverty index (log1p)",
  porc_pobr_c                     = "Poverty index (centered)",
  porc_pobr_c_cat6                = "Poverty index (6 quantile groups)",
  urbanicity_cat                  = "Urbanization level of commune of residence",

  # Living arrangement
  cohabitation                    = "Cohabitation status (harmonized)",

  # Clinical/behavioral
  dg_psiq_cie_10_instudy          = "Psychiatric comorbidity (ICD-10, in-study)",
  dg_psiq_cie_10_dg               = "Psychiatric comorbidity (ICD-10, diagnosis record)",
  sub_dep_icd10_status            = "SUD severity (ICD-10)",
  prim_sub_freq                   = "Primary-substance use frequency at admission",
  prim_sub_freq_rec               = "Primary-substance use frequency at admission (simplified)",
  polysubstance_strict            = "Polysubstance use (strict)",

  # Treatment exposure
  dit_m                           = "Length of stay in treatment (months)",
  treat_log                       = "Length of stay (log1p months)",
  treat_days_pow2                 = "Length of stay (months)^2",
  treat_days_pow3                 = "Length of stay (months)^3",
  treat_lt_90                     = "Treatment length < 90 days (indicator)",
  tr_outcome                      = "Treatment outcome",
  adm_motive                      = "Admission motive",
  primary_sub                     = "Primary substance of use (detailed substances)",
  primary_sub_mod                 = "Primary substance of use (recoded)",

  # Research-context extras
  usuario_tribunal_trat_droga     = "Court-referred to drug treatment (1 = yes)",
  tipo_de_vivienda_rec            = "Housing type (recoded)",
  tipo_de_vivienda_rec2           = "Housing type (recoded & simplified)",

  # Stratification
  plan_type_corr                  = "Treatment modality (plan type)",
  prim_sub_licit                  = "Licit primary substance at admission",
  f0_organic                      = "F0. Organic, including symptomatic, mental disorders",
  f1_substance_use                = "F1. Mental and behavioural disorders due to psychoactive substance use",
  f2_psychotic                    = "F2. Schizophrenia, schizotypal and delusional disorders",
  f3_mood                         = "F3. Mood [affective] disorders",
  f4_anxiety_stress_somatoform    = "F4. Neurotic, stress-related and somatoform disorders",
  f5_physio_eating_sleep_sexual   = "F5. Behavioural syndromes associated with physiological disturbances and physical factors",
  f6_personality_adult_behaviour  = "F6. Disorders of adult personality and behaviour",
  f7_intellectual_disability      = "F7. Intellectual disability",
  f8_9_neurodevelopment_child     = "F8-9. Disorders of psychological development and behavioural/emotional disorders with onset in childhood and adolescence",
  dx_f2_smi_psychotic             = "Psychotic disorders (F2)",
  dx_f3_mood                      = "Mood disorders (F3)",
  dx_f45_anx_stress_phys          = "Anxiety & stress-related (F4–F5)",
  dx_f6_personality               = "Personality disorders (F6)",
  dx_f0789_neurocog_dev           = "Neurocognitive & neurodevelopmental (F0, F7–F9)",

  # Violence indicators
  any_violence                    = "Domestic violence/Sexual abuse",
  dom_violence                    = "Domestic violence",
  sex_abuse                       = "Sexual abuse",

  # Physical diagnosis groups
  phys_dx_supergroup              = "Physical diagnosis supergroup (single)",
  phys_dx_instudy                 = "Physical diagnosis: under study",
  phys_dx_other_spec_medical_cond = "Physical diagnosis: other medical conditions",
  phys_dx_organ_system_med_dis    = "Physical diagnosis: organ-system diseases",
  phys_dx_injuries_and_sequelae   = "Physical diagnosis: injuries & sequelae",
  phys_dx_infectious_diseases     = "Physical diagnosis: infectious diseases",

  # Evaluation at discharge
  evaluacindelprocesoteraputico   = "Assessment of the therapeutic process",
  eva_consumo                     = "Evaluation of consumption pattern at discharge",
  eva_fam                         = "Family situation evaluation at discharge",
  eva_relinterp                   = "Interpersonal relations evaluation at discharge",
  eva_ocupacion                   = "Occupational situation evaluation at discharge",
  eva_sm                          = "Mental health evaluation at discharge",
  eva_fisica                      = "Physical health evaluation at discharge",
  eva_transgnorma                 = "Social norm transgression evaluation at discharge"
)

# Apply labels safely (only to variables that exist)
for(nm in intersect(names(SISTRAT23_c1_2010_2024_df_prev1z), names(labels_map))){
  attr(SISTRAT23_c1_2010_2024_df_prev1z[[nm]],"label")<-labels_map[[nm]]
}
# Extract labels for each variable, defaulting to NA if not present
labels_vec <- sapply(names(missing_pct), function(nm) {
  lbl <- attr(SISTRAT23_c1_2010_2024_df_prev1z[[nm]], "label")
  if (is.null(lbl)) NA_character_ else lbl
})

# Round and format into data.frame with labels
data.frame(
  Variable = names(missing_pct),
  Label = labels_vec,
  Missing_Percent = round(missing_pct, 2))|>
  # Display as markdown table
  knitr::kable(format = "markdown",
    align = c("l", "l", "r"),
    col.names = c("Variable", "Label", "% Missing"),
    row.names = FALSE)

[1] "Percentage of the total that has missing values: 33.3%; total = 88,504"
[1] "Percentage of the total that has missing values (w/o other variables): 6.3%; total = 88,504"

Variable	Label	% Missing
readmit_time_from_adm_m	Readmission time from admission (months)	0.00
readmit_time_from_disch_m	Readmission time from discharge (months)	0.00
readmit_event	Readmission event	0.00
death_time_from_adm_m	Death time from admission (months)	0.00
death_time_from_disch_m	Death time from discharge (months)	0.00
death_event	Death event	0.00
sex_rec	Sex (biological categorization)	0.00
adm_age_log	Admission age (log)	0.00
adm_age_pow2	Admission age ^2	0.00
adm_age_pow3	Admission age ^3	0.00
adm_age_c	Admission age (centered)	0.00
adm_age_cat	Admission age (three groups)	0.00
tenure_status_household	Housing situation	5.06
occupation_condition_corr24	Employment status	0.00
marital_status_rec	Marital status	0.18
porc_pobr	Poverty index of commune of residence	0.00
porc_pobr_log	Poverty index (log1p)	0.00
porc_pobr_c	Poverty index (centered)	0.00
ethnicity	Ethnicity (=1)	0.00
nationality_chile	Nationality (Chile)	0.00
urbanicity_cat	Urbanization level of commune of residence	0.00
ed_attainment_corr	Educational attainment	0.43
cohabitation	Cohabitation status (harmonized)	0.00
dg_psiq_cie_10_instudy	Psychiatric comorbidity (ICD-10, in-study)	0.00
dg_psiq_cie_10_dg	Psychiatric comorbidity (ICD-10, diagnosis record)	0.00
f0_organic	F0. Organic, including symptomatic, mental disorders	0.00
f1_substance_use	F1. Mental and behavioural disorders due to psychoactive substance use	0.00
f2_psychotic	F2. Schizophrenia, schizotypal and delusional disorders	0.00
f3_mood	F3. Mood [affective] disorders	0.00
f4_anxiety_stress_somatoform	F4. Neurotic, stress-related and somatoform disorders	0.00
f5_physio_eating_sleep_sexual	F5. Behavioural syndromes associated with physiological disturbances and physical factors	0.00
f6_personality_adult_behaviour	F6. Disorders of adult personality and behaviour	0.00
f7_intellectual_disability	F7. Intellectual disability	0.00
f8_9_neurodevelopment_child	F8-9. Disorders of psychological development and behavioural/emotional disorders with onset in childhood and adolescence	0.00
dx_f2_smi_psychotic	Psychotic disorders (F2)	0.00
dx_f3_mood	Mood disorders (F3)	0.00
dx_f45_anx_stress_phys	Anxiety & stress-related (F4–F5)	0.00
dx_f6_personality	Personality disorders (F6)	0.00
dx_f0789_neurocog_dev	Neurocognitive & neurodevelopmental (F0, F7–F9)	0.00
sub_dep_icd10_status	SUD severity (ICD-10)	0.00
dom_violence	Domestic violence	20.09
sex_abuse	Sexual abuse	20.09
any_violence	Domestic violence/Sexual abuse	20.09
phys_dx_supergroup	Physical diagnosis supergroup (single)	0.17
phys_dx_instudy	Physical diagnosis: under study	0.17
phys_dx_other_spec_medical_cond	Physical diagnosis: other medical conditions	0.17
phys_dx_organ_system_med_dis	Physical diagnosis: organ-system diseases	0.17
phys_dx_injuries_and_sequelae	Physical diagnosis: injuries & sequelae	0.17
phys_dx_infectious_diseases	Physical diagnosis: infectious diseases	0.17
prim_sub_freq	Primary-substance use frequency at admission	0.46
prim_sub_freq_rec	Primary-substance use frequency at admission (simplified)	0.46
polysubstance_strict	Polysubstance use (strict)	0.00
dit_m	Length of stay in treatment (months)	0.00
treat_log	Length of stay (log1p months)	0.00
treat_days_pow2	Length of stay (months)^2	0.00
treat_days_pow3	Length of stay (months)^3	0.00
treat_lt_90	Treatment length < 90 days (indicator)	0.00
tr_outcome	Treatment outcome	0.00
adm_motive	Admission motive	0.00
primary_sub	Primary substance of use (detailed substances)	0.00
primary_sub_mod	Primary substance of use (recoded)	0.00
usuario_tribunal_trat_droga	Court-referred to drug treatment (1 = yes)	4.40
tipo_de_vivienda_rec	Housing type (recoded)	7.26
tipo_de_vivienda_rec2	Housing type (recoded & simplified)	7.26
plan_type_corr	Treatment modality (plan type)	0.00
evaluacindelprocesoteraputico	Assessment of the therapeutic process	0.25
eva_consumo	Evaluation of consumption pattern at discharge	0.27
eva_fam	Family situation evaluation at discharge	0.27
eva_relinterp	Interpersonal relations evaluation at discharge	0.27
eva_ocupacion	Occupational situation evaluation at discharge	0.27
eva_sm	Mental health evaluation at discharge	0.27
eva_fisica	Physical health evaluation at discharge	0.27
eva_transgnorma	Social norm transgression evaluation at discharge	0.27

1.4.* Currently in

We identified 215 cases with missing treatment outcomes. For these cases, treatment outcome, duration of treatment, and other treatment-related variables (e.g., evaluation at discharge) are unknown.

Code

SISTRAT23_c1_2010_2024_df_prev1z|> 
  dplyr::filter(tr_outcome=="other")|> 
  dplyr::select(hash_key, tr_outcome, adm_date_rec2, disch_date_rec6, dit_rec6, adm_year)|>
    #janitor::tabyl(adm_year)
  dplyr::mutate(
     hash_key = gsub(
         "(?<=^.{8}).{2}",   # lookbehind for first 8 chars, match the next 2
         "**",               # replace with two asterisks
         hash_key,
         perl = TRUE
     )) |> 
  head()

# A tidytable: 6 × 6
  hash_key            tr_outcome adm_date_rec2 disch_date_rec6 dit_rec6 adm_year
  <chr>               <chr>      <date>        <date>             <dbl>    <int>
1 0065a4d6**af0c519d… other      2016-08-08    2019-08-09          1096     2016
2 00be34d5**32b9d1ee… other      2016-04-13    2019-04-14          1096     2016
3 01fea888**92b29fcc… other      2019-05-01    2022-05-01          1096     2019
4 0426e03d**26cb489d… other      2011-04-27    2014-04-27          1096     2011
5 048e829d**f1109af2… other      2011-07-25    2014-07-25          1096     2011
6 04b987f5**a824ea89… other      2011-02-07    2014-02-07          1096     2011

1.5. Descriptives

Code

## 0) Guard: use only columns that exist
tot_variables_present<-intersect(tot_variables,names(SISTRAT23_c1_2010_2024_df_prev1z))
char_variables_present<-intersect(c(char_variables, "readmit_event", "death_event"),names(SISTRAT23_c1_2010_2024_df_prev1z))
extra_cat<-intersect(c("disch_age_cat"),names(SISTRAT23_c1_2010_2024_df_prev1z))
extra_num<-intersect(c("disch_age_rec"),names(SISTRAT23_c1_2010_2024_df_prev1z))

categorical_vars<-union(char_variables_present,extra_cat)
numerical_vars<-setdiff(union(tot_variables_present,union(extra_cat,extra_num)),categorical_vars)

## 1) Create TableOne (coerce character categoricals to factor inline)
table_one_clean<-tableone::CreateTableOne(
  vars= setdiff(c(categorical_vars,numerical_vars), "plan_type_corr"),
  data= tidytable::mutate(SISTRAT23_c1_2010_2024_df_prev1z,
    tidytable::across(tidyselect::all_of(categorical_vars),\(x){if(is.character(x)) factor(x) else x}),
porc_pobr= porc_pobr*100),
  factorVars= setdiff(char_variables_present,"plan_type_corr"),
  strata= "plan_type_corr",
  addOverall= TRUE,
  test= TRUE,
  smd= TRUE#,
  #labelTranslations=label_map
)

## 4) Print (treat numeric as non-normal; keep your formatting)
escape_regex<-function(x) gsub("([][{}()+*^$.|?\\\\])","\\\\\\1",x)

rn<-rownames(table_one_clean_print)
keys<-names(labels_map)
keys<-keys[order(nchar(keys),decreasing=TRUE)]  # avoid partial overlaps

for(nm in keys){
  pat<-paste0("^",escape_regex(nm),"(?=\\s|\\(|:|=|$)")  # start, then space/(...)/: /= or end
  rn<-sub(pat,labels_map[[nm]],rn,perl=TRUE)            # replace only the prefix; keep the rest
}
rownames(table_one_clean_print)<-rn

wdpath<- paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))

## 5) HTML table
table_one_clean_print|>
  knitr::kable("html", caption= "Table 1. Descriptives by baseline treatment setting")|>
  kableExtra::kable_classic()|>
  kableExtra::row_spec(0:1, bold = TRUE)|>
  kableExtra::scroll_box(height = "400px")

write.table(table_one_clean_print, 
            file= paste0(wdpath,"cons/_out/table_one_prediction_clean.txt"), 
            sep = "\t", row.names = T)

Table 1. Descriptives by baseline treatment setting
	level	Overall	m-pai	m-pr	pg-pab	pg-pai	pg-pr	p	test	SMD	Missing
n		88,504	5,263	3,545	32,424	37,555	9,717
Sex (biological categorization) (%)	man	65806 (74.4)	0 (0.0)	0 (0.0)	26249 (81.0)	30414 (81.0)	9143 (94.1)	<0.001		2.377	0.0
	woman	22696 (25.6)	5263 (100.0)	3545 (100.0)	6175 (19.0)	7139 (19.0)	574 (5.9)
Housing situation (%)	illegal settlement	947 (1.1)	65 (1.3)	86 (2.6)	227 (0.7)	340 (0.9)	229 (2.7)	<0.001		0.200	5.1
	others	2380 (2.8)	139 (2.7)	89 (2.7)	974 (3.1)	875 (2.4)	303 (3.5)
	owner/transferred dwellings/pays dividends	30670 (36.5)	1716 (33.4)	940 (28.5)	12047 (38.8)	12451 (34.7)	3516 (40.7)
	renting	15476 (18.4)	1007 (19.6)	471 (14.3)	5936 (19.1)	6818 (19.0)	1244 (14.4)
	stays temporarily with a relative	34553 (41.1)	2214 (43.1)	1708 (51.9)	11877 (38.2)	15410 (42.9)	3344 (38.7)
Employment status (%)	employed	43336 (49.0)	1603 (30.5)	361 (10.2)	21086 (65.0)	18574 (49.5)	1712 (17.6)	<0.001		0.792	0.0
	inactive	14307 (16.2)	1972 (37.5)	1665 (47.0)	3570 (11.0)	5386 (14.3)	1714 (17.6)
	unemployed	30861 (34.9)	1688 (32.1)	1519 (42.8)	7768 (24.0)	13595 (36.2)	6291 (64.7)
Marital status (%)	married/cohabiting	29768 (33.7)	1662 (31.6)	822 (23.2)	13127 (40.6)	11976 (31.9)	2181 (22.5)	<0.001		0.211	0.2
	separated/divorced/annulled/widowed	10269 (11.6)	644 (12.2)	441 (12.5)	3881 (12.0)	4234 (11.3)	1069 (11.0)
	single	48304 (54.7)	2953 (56.2)	2277 (64.3)	15339 (47.4)	21279 (56.8)	6456 (66.5)
Ethnicity (=1) (%)	0	82870 (93.6)	4868 (92.5)	3312 (93.4)	30386 (93.7)	35158 (93.6)	9146 (94.1)	0.003		0.028	0.0
	1	5634 (6.4)	395 (7.5)	233 (6.6)	2038 (6.3)	2397 (6.4)	571 (5.9)
Urbanization level of commune of residence (%)	1.Rural	7669 (8.7)	210 (4.0)	187 (5.3)	3030 (9.3)	3803 (10.1)	439 (4.5)	<0.001		0.176	0.0
	2.Mixed	8659 (9.8)	383 (7.3)	261 (7.4)	4116 (12.7)	3203 (8.5)	696 (7.2)
	3.Urban	72176 (81.6)	4670 (88.7)	3097 (87.4)	25278 (78.0)	30549 (81.3)	8582 (88.3)
Educational attainment (%)	1-More than high school	16157 (18.3)	899 (17.1)	641 (18.1)	5356 (16.6)	7672 (20.5)	1589 (16.4)	<0.001		0.085	0.4
	2-Completed high school or less	48791 (55.4)	2785 (53.0)	1855 (52.4)	17756 (55.1)	20891 (55.8)	5504 (56.7)
	3-Completed primary school or less	23176 (26.3)	1567 (29.8)	1042 (29.5)	9108 (28.3)	8847 (23.6)	2612 (26.9)
Cohabitation status (harmonized) (%)	alone	8415 (9.5)	278 (5.3)	337 (9.5)	2815 (8.7)	3404 (9.1)	1581 (16.3)	<0.001		0.468	0.0
	with couple/children	39730 (44.9)	3507 (66.6)	1828 (51.6)	16525 (51.0)	15679 (41.7)	2191 (22.5)
	family of origin	32725 (37.0)	1010 (19.2)	954 (26.9)	10683 (32.9)	15142 (40.3)	4936 (50.8)
	Others	7634 (8.6)	468 (8.9)	426 (12.0)	2401 (7.4)	3330 (8.9)	1009 (10.4)
Psychiatric comorbidity (ICD-10, in-study) (%)	FALSE	73020 (82.5)	4482 (85.2)	2678 (75.5)	27173 (83.8)	30921 (82.3)	7766 (79.9)	<0.001		0.118	0.0
	TRUE	15484 (17.5)	781 (14.8)	867 (24.5)	5251 (16.2)	6634 (17.7)	1951 (20.1)
Psychiatric comorbidity (ICD-10, diagnosis record) (%)	FALSE	49013 (55.4)	2459 (46.7)	1260 (35.5)	20654 (63.7)	19896 (53.0)	4744 (48.8)	<0.001		0.256	0.0
	TRUE	39491 (44.6)	2804 (53.3)	2285 (64.5)	11770 (36.3)	17659 (47.0)	4973 (51.2)
F0. Organic, including symptomatic, mental disorders (%)	FALSE	87399 (98.8)	5233 (99.4)	3492 (98.5)	32163 (99.2)	36958 (98.4)	9553 (98.3)	<0.001		0.057	0.0
	TRUE	1105 (1.2)	30 (0.6)	53 (1.5)	261 (0.8)	597 (1.6)	164 (1.7)
F1. Mental and behavioural disorders due to psychoactive substance use (%)	FALSE	88504 (100.0)	5263 (100.0)	3545 (100.0)	32424 (100.0)	37555 (100.0)	9717 (100.0)	NA		<0.001	0.0
F2. Schizophrenia, schizotypal and delusional disorders (%)	FALSE	87145 (98.5)	5240 (99.6)	3500 (98.7)	32248 (99.5)	36691 (97.7)	9466 (97.4)	<0.001		0.103	0.0
	TRUE	1359 (1.5)	23 (0.4)	45 (1.3)	176 (0.5)	864 (2.3)	251 (2.6)
F3. Mood [affective] disorders (%)	FALSE	79431 (89.7)	4575 (86.9)	3088 (87.1)	29383 (90.6)	33645 (89.6)	8740 (89.9)	<0.001		0.065	0.0
	TRUE	9073 (10.3)	688 (13.1)	457 (12.9)	3041 (9.4)	3910 (10.4)	977 (10.1)
F4. Neurotic, stress-related and somatoform disorders (%)	FALSE	86864 (98.1)	5158 (98.0)	3513 (99.1)	31682 (97.7)	36894 (98.2)	9617 (99.0)	<0.001		0.061	0.0
	TRUE	1640 (1.9)	105 (2.0)	32 (0.9)	742 (2.3)	661 (1.8)	100 (1.0)
F5. Behavioural syndromes associated with physiological disturbances and physical factors (%)	FALSE	88051 (99.5)	5217 (99.1)	3514 (99.1)	32287 (99.6)	37359 (99.5)	9674 (99.6)	<0.001		0.033	0.0
	TRUE	453 (0.5)	46 (0.9)	31 (0.9)	137 (0.4)	196 (0.5)	43 (0.4)
F6. Disorders of adult personality and behaviour (%)	FALSE	63374 (71.6)	3400 (64.6)	1887 (53.2)	25216 (77.8)	26369 (70.2)	6502 (66.9)	<0.001		0.236	0.0
	TRUE	25130 (28.4)	1863 (35.4)	1658 (46.8)	7208 (22.2)	11186 (29.8)	3215 (33.1)
F7. Intellectual disability (%)	FALSE	87866 (99.3)	5233 (99.4)	3520 (99.3)	32208 (99.3)	37278 (99.3)	9627 (99.1)	0.063		0.018	0.0
	TRUE	638 (0.7)	30 (0.6)	25 (0.7)	216 (0.7)	277 (0.7)	90 (0.9)
F8-9. Disorders of psychological development and behavioural/emotional disorders with onset in childhood and adolescence (%)	FALSE	87237 (98.6)	5177 (98.4)	3494 (98.6)	32100 (99.0)	37033 (98.6)	9433 (97.1)	<0.001		0.060	0.0
	TRUE	1267 (1.4)	86 (1.6)	51 (1.4)	324 (1.0)	522 (1.4)	284 (2.9)
Psychotic disorders (F2) (%)	0	87145 (98.5)	5240 (99.6)	3500 (98.7)	32248 (99.5)	36691 (97.7)	9466 (97.4)	<0.001		0.103	0.0
	1	1359 (1.5)	23 (0.4)	45 (1.3)	176 (0.5)	864 (2.3)	251 (2.6)
Mood disorders (F3) (%)	0	79431 (89.7)	4575 (86.9)	3088 (87.1)	29383 (90.6)	33645 (89.6)	8740 (89.9)	<0.001		0.065	0.0
	1	9073 (10.3)	688 (13.1)	457 (12.9)	3041 (9.4)	3910 (10.4)	977 (10.1)
Anxiety & stress-related (F4–F5) (%)	0	86422 (97.6)	5114 (97.2)	3484 (98.3)	31549 (97.3)	36701 (97.7)	9574 (98.5)	<0.001		0.051	0.0
	1	2082 (2.4)	149 (2.8)	61 (1.7)	875 (2.7)	854 (2.3)	143 (1.5)
Personality disorders (F6) (%)	0	63374 (71.6)	3400 (64.6)	1887 (53.2)	25216 (77.8)	26369 (70.2)	6502 (66.9)	<0.001		0.236	0.0
	1	25130 (28.4)	1863 (35.4)	1658 (46.8)	7208 (22.2)	11186 (29.8)	3215 (33.1)
Neurocognitive & neurodevelopmental (F0, F7–F9) (%)	0	85517 (96.6)	5117 (97.2)	3416 (96.4)	31626 (97.5)	36169 (96.3)	9189 (94.6)	<0.001		0.072	0.0
	1	2987 (3.4)	146 (2.8)	129 (3.6)	798 (2.5)	1386 (3.7)	528 (5.4)
SUD severity (ICD-10) (%)	drug dependence	64429 (72.8)	4105 (78.0)	3343 (94.3)	17824 (55.0)	30278 (80.6)	8879 (91.4)	<0.001		0.477	0.0
	hazardous consumption	24075 (27.2)	1158 (22.0)	202 (5.7)	14600 (45.0)	7277 (19.4)	838 (8.6)
Domestic violence (%)	No domestic violence	52301 (74.0)	2670 (60.3)	1851 (55.4)	18960 (78.4)	22784 (75.9)	6036 (69.3)	<0.001		0.268	20.1
	Domestic violence	18419 (26.0)	1761 (39.7)	1491 (44.6)	5236 (21.6)	7251 (24.1)	2680 (30.7)
Sexual abuse (%)	No sexual abuse	69345 (98.1)	4174 (94.2)	3070 (91.9)	23977 (99.1)	29570 (98.5)	8554 (98.1)	<0.001		0.192	20.1
	Sexual abuse	1375 (1.9)	257 (5.8)	272 (8.1)	219 (0.9)	465 (1.5)	162 (1.9)
Domestic violence/Sexual abuse (%)	1.Domestic violence/sex abuse	19794 (28.0)	2018 (45.5)	1763 (52.8)	5455 (22.5)	7716 (25.7)	2842 (32.6)	<0.001		0.344	20.1
	0.No domestic violence/sex abuse	50926 (72.0)	2413 (54.5)	1579 (47.2)	18741 (77.5)	22319 (74.3)	5874 (67.4)
Physical diagnosis supergroup (single) (%)	en estudio	48717 (55.1)	2814 (53.5)	1990 (56.2)	17779 (55.0)	20990 (55.9)	5144 (53.1)	<0.001		0.240	0.2
	sin trastorno	31242 (35.4)	1909 (36.3)	883 (24.9)	12409 (38.4)	12952 (34.5)	3089 (31.9)
	Infectious diseases (STIs, viral hepatitis, substance-use–related)	700 (0.8)	75 (1.4)	85 (2.4)	131 (0.4)	300 (0.8)	109 (1.1)
	Organ-system medical diseases (cardio, hepatic, oral, hematologic)	2450 (2.8)	149 (2.8)	185 (5.2)	649 (2.0)	981 (2.6)	486 (5.0)
	Injuries and sequelae	1327 (1.5)	47 (0.9)	131 (3.7)	293 (0.9)	481 (1.3)	375 (3.9)
	Other specified medical conditions	3917 (4.4)	266 (5.1)	270 (7.6)	1076 (3.3)	1814 (4.8)	491 (5.1)
Physical diagnosis: under study (%)	0	39061 (44.2)	2420 (46.0)	1404 (39.6)	14514 (44.9)	16398 (43.7)	4325 (44.6)	<0.001		0.056	0.2
	1	49292 (55.8)	2840 (54.0)	2140 (60.4)	17823 (55.1)	21120 (56.3)	5369 (55.4)
Physical diagnosis: other medical conditions (%)	0	84405 (95.5)	4992 (94.9)	3263 (92.1)	31258 (96.7)	35695 (95.1)	9197 (94.9)	<0.001		0.083	0.2
	1	3948 (4.5)	268 (5.1)	281 (7.9)	1079 (3.3)	1823 (4.9)	497 (5.1)
Physical diagnosis: organ-system diseases (%)	0	85879 (97.2)	5109 (97.1)	3355 (94.7)	31686 (98.0)	36531 (97.4)	9198 (94.9)	<0.001		0.097	0.2
	1	2474 (2.8)	151 (2.9)	189 (5.3)	651 (2.0)	987 (2.6)	496 (5.1)
Physical diagnosis: injuries & sequelae (%)	0	87011 (98.5)	5213 (99.1)	3406 (96.1)	32043 (99.1)	37036 (98.7)	9313 (96.1)	<0.001		0.120	0.2
	1	1342 (1.5)	47 (0.9)	138 (3.9)	294 (0.9)	482 (1.3)	381 (3.9)
Physical diagnosis: infectious diseases (%)	0	87644 (99.2)	5185 (98.6)	3454 (97.5)	32205 (99.6)	37216 (99.2)	9584 (98.9)	<0.001		0.086	0.2
	1	709 (0.8)	75 (1.4)	90 (2.5)	132 (0.4)	302 (0.8)	110 (1.1)
Primary-substance use frequency at admission (%)	1. Less than 1 day a week	4399 (5.0)	243 (4.6)	25 (0.7)	2611 (8.1)	1324 (3.5)	196 (2.0)	<0.001		0.586	0.5
	2. 1 day a week	5831 (6.6)	295 (5.6)	34 (1.0)	3375 (10.5)	1897 (5.1)	230 (2.4)
	3. 2 to 3 days a week	24533 (27.8)	1452 (27.7)	354 (10.0)	11825 (36.7)	9905 (26.5)	997 (10.3)
	4. 4 to 6 days a week	14483 (16.4)	808 (15.4)	440 (12.4)	5165 (16.0)	6808 (18.2)	1262 (13.0)
	5. Daily	38847 (44.1)	2444 (46.6)	2686 (75.9)	9259 (28.7)	17436 (46.7)	7022 (72.3)
Primary-substance use frequency at admission (simplified) (%)	1.≤1 day/wk	10230 (11.6)	538 (10.3)	59 (1.7)	5986 (18.6)	3221 (8.6)	426 (4.4)	<0.001		0.554	0.5
	2.2–6 days/wk	39016 (44.3)	2260 (43.1)	794 (22.4)	16990 (52.7)	16713 (44.7)	2259 (23.3)
	3.Daily	38847 (44.1)	2444 (46.6)	2686 (75.9)	9259 (28.7)	17436 (46.7)	7022 (72.3)
Polysubstance use (strict) (%)	0	23961 (27.1)	1395 (26.5)	653 (18.4)	11608 (35.8)	8624 (23.0)	1681 (17.3)	<0.001		0.210	0.0
	1	64543 (72.9)	3868 (73.5)	2892 (81.6)	20816 (64.2)	28931 (77.0)	8036 (82.7)
Treatment length < 90 days (indicator) (%)	0	67525 (76.3)	4202 (79.8)	2213 (62.4)	26054 (80.4)	28309 (75.4)	6747 (69.4)	<0.001		0.210	0.0
	1	20979 (23.7)	1061 (20.2)	1332 (37.6)	6370 (19.6)	9246 (24.6)	2970 (30.6)
Treatment outcome (%)	adm discharge - adm reasons	1319 (1.5)	78 (1.5)	8 (0.2)	598 (1.8)	611 (1.6)	24 (0.2)	<0.001		0.279	0.0
	adm discharge - rule violation/undet	6463 (7.3)	322 (6.1)	397 (11.2)	1946 (6.0)	2532 (6.7)	1266 (13.0)
	completion	23408 (26.4)	1592 (30.2)	932 (26.3)	8093 (25.0)	9700 (25.8)	3091 (31.8)
	dropout	47055 (53.2)	2617 (49.7)	1544 (43.6)	18904 (58.3)	20201 (53.8)	3789 (39.0)
	other	215 (0.2)	7 (0.1)	8 (0.2)	98 (0.3)	67 (0.2)	35 (0.4)
	referral	10044 (11.3)	647 (12.3)	656 (18.5)	2785 (8.6)	4444 (11.8)	1512 (15.6)
Admission motive (%)	another SUD facility/FONODROGAS/SENDA Previene	8287 (9.4)	304 (5.8)	793 (22.4)	1592 (4.9)	3202 (8.5)	2396 (24.7)	<0.001		0.458	0.0
	justice sector	8470 (9.6)	941 (17.9)	241 (6.8)	3297 (10.2)	3395 (9.0)	596 (6.1)
	other	4574 (5.2)	387 (7.4)	211 (6.0)	1272 (3.9)	2161 (5.8)	543 (5.6)
	sanitary sector	27482 (31.1)	1758 (33.4)	1300 (36.7)	8942 (27.6)	12707 (33.8)	2775 (28.6)
	spontaneous consultation	39691 (44.8)	1873 (35.6)	1000 (28.2)	17321 (53.4)	16090 (42.8)	3407 (35.1)
Primary substance of use (detailed substances) (%)	alcohol	30214 (34.1)	1497 (28.4)	621 (17.5)	14317 (44.2)	11834 (31.5)	1945 (20.0)	<0.001		0.477	0.0
	amphetamine-type stimulants	123 (0.1)	7 (0.1)	5 (0.1)	46 (0.1)	55 (0.1)	10 (0.1)
	cocaine paste	33442 (37.8)	2006 (38.1)	2253 (63.6)	8554 (26.4)	14585 (38.8)	6044 (62.2)
	cocaine powder	17221 (19.5)	1109 (21.1)	506 (14.3)	6522 (20.1)	7775 (20.7)	1309 (13.5)
	dissociatives	3 (0.0)	0 (0.0)	0 (0.0)	1 (0.0)	2 (0.0)	0 (0.0)
	hallucinogens	13 (0.0)	1 (0.0)	0 (0.0)	4 (0.0)	8 (0.0)	0 (0.0)
	inhalants	67 (0.1)	0 (0.0)	5 (0.1)	11 (0.0)	35 (0.1)	16 (0.2)
	marijuana	6029 (6.8)	462 (8.8)	77 (2.2)	2553 (7.9)	2641 (7.0)	296 (3.0)
	opioids	326 (0.4)	34 (0.6)	23 (0.6)	51 (0.2)	186 (0.5)	32 (0.3)
	others	116 (0.1)	22 (0.4)	1 (0.0)	41 (0.1)	46 (0.1)	6 (0.1)
	tranquilizers/hypnotics	950 (1.1)	125 (2.4)	54 (1.5)	324 (1.0)	388 (1.0)	59 (0.6)
Primary substance of use (recoded) (%)	cocaine paste	33442 (37.8)	2006 (38.1)	2253 (63.6)	8554 (26.4)	14585 (38.8)	6044 (62.2)	<0.001		0.469	0.0
	cocaine powder	17221 (19.5)	1109 (21.1)	506 (14.3)	6522 (20.1)	7775 (20.7)	1309 (13.5)
	alcohol	30214 (34.1)	1497 (28.4)	621 (17.5)	14317 (44.2)	11834 (31.5)	1945 (20.0)
	marijuana	6029 (6.8)	462 (8.8)	77 (2.2)	2553 (7.9)	2641 (7.0)	296 (3.0)
	others	1598 (1.8)	189 (3.6)	88 (2.5)	478 (1.5)	720 (1.9)	123 (1.3)
Court-referred to drug treatment (1 = yes) (%)	no	82939 (98.0)	4696 (98.2)	3277 (97.6)	31093 (98.8)	34832 (97.8)	9041 (96.6)	<0.001		0.068	4.4
	si	1668 (2.0)	86 (1.8)	80 (2.4)	389 (1.2)	790 (2.2)	323 (3.4)
Housing type (recoded) (%)	formal housing	73763 (89.9)	4441 (90.2)	2139 (81.6)	28843 (92.6)	32164 (90.2)	6176 (79.8)	<0.001		0.297	7.3
	shared/secondary unit	2787 (3.4)	215 (4.4)	88 (3.4)	995 (3.2)	1244 (3.5)	245 (3.2)
	homeless/unsheltered/informal/temporary housing/institutional/collective	2864 (3.5)	64 (1.3)	256 (9.8)	402 (1.3)	1074 (3.0)	1068 (13.8)
	other/unknown	2667 (3.2)	203 (4.1)	137 (5.2)	912 (2.9)	1168 (3.3)	247 (3.2)
Housing type (recoded & simplified) (%)	formal housing	73763 (89.9)	4441 (90.2)	2139 (81.6)	28843 (92.6)	32164 (90.2)	6176 (79.8)	<0.001		0.201	7.3
	other/unknown	8318 (10.1)	482 (9.8)	481 (18.4)	2309 (7.4)	3486 (9.8)	1560 (20.2)
Assessment of the therapeutic process (%)	logro alto	21177 (24.0)	1320 (25.1)	874 (24.7)	7482 (23.1)	8574 (22.9)	2927 (30.2)	<0.001		0.097	0.2
	logro intermedio	28665 (32.5)	1728 (32.9)	1140 (32.2)	10361 (32.1)	12104 (32.3)	3332 (34.4)
	logro minimo	38444 (43.5)	2208 (42.0)	1523 (43.1)	14483 (44.8)	16808 (44.8)	3422 (35.3)
Evaluation of consumption pattern at discharge (%)	logro alto	25866 (29.3)	1654 (31.5)	1140 (32.2)	9054 (28.0)	10295 (27.5)	3723 (38.5)	<0.001		0.134	0.3
	logro intermedio	25279 (28.6)	1468 (27.9)	941 (26.6)	9269 (28.7)	10726 (28.6)	2875 (29.7)
	logro minimo	37124 (42.1)	2134 (40.6)	1456 (41.2)	13997 (43.3)	16453 (43.9)	3084 (31.9)
Family situation evaluation at discharge (%)	logro alto	20889 (23.7)	1259 (24.0)	843 (23.8)	7617 (23.6)	8321 (22.2)	2849 (29.4)	<0.001		0.090	0.3
	logro intermedio	28144 (31.9)	1702 (32.4)	1109 (31.4)	10226 (31.6)	11824 (31.6)	3283 (33.9)
	logro minimo	39236 (44.5)	2295 (43.7)	1585 (44.8)	14477 (44.8)	17329 (46.2)	3550 (36.7)
Interpersonal relations evaluation at discharge (%)	logro alto	21507 (24.4)	1371 (26.1)	941 (26.6)	7633 (23.6)	8661 (23.1)	2901 (30.0)	<0.001		0.103	0.3
	logro intermedio	28053 (31.8)	1668 (31.7)	1068 (30.2)	10180 (31.5)	11820 (31.5)	3317 (34.3)
	logro minimo	38709 (43.9)	2217 (42.2)	1528 (43.2)	14507 (44.9)	16993 (45.3)	3464 (35.8)
Occupational situation evaluation at discharge (%)	logro alto	27771 (31.5)	1474 (28.0)	832 (23.5)	10739 (33.2)	11733 (31.3)	2993 (30.9)	<0.001		0.135	0.3
	logro intermedio	23483 (26.6)	1365 (26.0)	803 (22.7)	8754 (27.1)	10054 (26.8)	2507 (25.9)
	logro minimo	37015 (41.9)	2417 (46.0)	1902 (53.8)	12827 (39.7)	15687 (41.9)	4182 (43.2)
Mental health evaluation at discharge (%)	logro alto	21471 (24.3)	1312 (25.0)	906 (25.6)	7584 (23.5)	8689 (23.2)	2980 (30.8)	<0.001		0.103	0.3
	logro intermedio	29000 (32.9)	1704 (32.4)	1119 (31.6)	10632 (32.9)	12135 (32.4)	3410 (35.2)
	logro minimo	37798 (42.8)	2240 (42.6)	1512 (42.7)	14104 (43.6)	16650 (44.4)	3292 (34.0)
Physical health evaluation at discharge (%)	logro alto	26845 (30.4)	1605 (30.5)	1340 (37.9)	8990 (27.8)	10536 (28.1)	4374 (45.2)	<0.001		0.209	0.3
	logro intermedio	27330 (31.0)	1594 (30.3)	1052 (29.7)	10073 (31.2)	11710 (31.2)	2901 (30.0)
	logro minimo	34094 (38.6)	2057 (39.1)	1145 (32.4)	13257 (41.0)	15228 (40.6)	2407 (24.9)
Social norm transgression evaluation at discharge (%)	logro alto	31583 (35.8)	2015 (38.3)	1001 (28.3)	11845 (36.6)	13416 (35.8)	3306 (34.1)	<0.001		0.117	0.3
	logro intermedio	21831 (24.7)	1217 (23.2)	875 (24.7)	7852 (24.3)	9165 (24.5)	2722 (28.1)
	logro minimo	34855 (39.5)	2024 (38.5)	1661 (47.0)	12623 (39.1)	14893 (39.7)	3654 (37.7)
Readmission event (%)	0	69434 (78.5)	4045 (76.9)	2065 (58.3)	26397 (81.4)	30155 (80.3)	6772 (69.7)	<0.001		0.257	0.0
	1	19070 (21.5)	1218 (23.1)	1480 (41.7)	6027 (18.6)	7400 (19.7)	2945 (30.3)
Death event (%)	0	84557 (95.5)	5117 (97.2)	3366 (95.0)	30960 (95.5)	35979 (95.8)	9135 (94.0)	<0.001		0.072	0.0
	1	3947 (4.5)	146 (2.8)	179 (5.0)	1464 (4.5)	1576 (4.2)	582 (6.0)
Readmission time from admission (months) (median [IQR])		47.9 [22.6, 80.7]	40.1 [18.9, 69.5]	33.6 [13.3, 72.5]	53.1 [26.5, 84.4]	45.8 [22.3, 77.3]	50.3 [19.5, 89.0]	<0.001	nonnorm	0.175	0.0
Readmission time from discharge (months) (median [IQR])		41.0 [15.0, 73.0]	32.0 [10.5, 60.3]	27.0 [4.9, 66.3]	45.0 [18.9, 76.1]	38.5 [14.7, 69.1]	43.6 [11.8, 80.7]	<0.001	nonnorm	0.170	0.0
Death time from admission (months) (median [IQR])		62.5 [34.0, 91.1]	54.0 [29.2, 84.9]	68.5 [37.9, 99.2]	64.7 [36.8, 91.6]	58.0 [31.2, 87.2]	73.9 [41.9, 101.7]	<0.001	nonnorm	0.196	0.0
Death time from discharge (months) (median [IQR])		54.3 [27.0, 83.9]	46.4 [21.1, 76.1]	61.4 [31.7, 92.6]	56.1 [29.0, 84.4]	50.5 [24.0, 80.0]	67.2 [35.0, 95.9]	<0.001	nonnorm	0.217	0.0
Admission age (log) (median [IQR])		3.6 [3.3, 3.8]	3.5 [3.3, 3.7]	3.5 [3.3, 3.7]	3.6 [3.4, 3.8]	3.6 [3.3, 3.8]	3.5 [3.3, 3.7]	<0.001	nonnorm	0.142	0.0
Admission age ^2 (median [IQR])		1,169.6 [751.9, 1,849.2]	1,052.4 [691.2, 1,610.8]	1,041.4 [677.6, 1,637.8]	1,261.3 [794.1, 2,009.7]	1,151.9 [746.4, 1,795.2]	1,091.6 [714.5, 1,717.3]	<0.001	nonnorm	0.140	0.0
Admission age ^3 (median [IQR])		40,001.7 [20,615.9, 79,520.9]	34,138.4 [18,170.7, 64,650.2]	33,604.5 [17,636.9, 66,282.6]	44,795.6 [22,378.1, 90,096.1]	39,096.3 [20,391.2, 76,063.3]	36,067.8 [19,098.4, 71,163.8]	<0.001	nonnorm	0.136	0.0
Admission age (centered) (median [IQR])		-1.6 [-8.3, 7.2]	-3.3 [-9.5, 4.4]	-3.5 [-9.7, 4.7]	-0.2 [-7.6, 9.1]	-1.8 [-8.4, 6.6]	-2.7 [-9.0, 5.7]	<0.001	nonnorm	0.142	0.0
Admission age (three groups) (%)	18-29	30634 (34.6)	2125 (40.4)	1449 (40.9)	10165 (31.4)	13181 (35.1)	3714 (38.2)	<0.001		0.127	0.0
	30-44	39637 (44.8)	2299 (43.7)	1526 (43.0)	14282 (44.0)	17190 (45.8)	4340 (44.7)
	45-64	18233 (20.6)	839 (15.9)	570 (16.1)	7977 (24.6)	7184 (19.1)	1663 (17.1)
Poverty index of commune of residence (median [IQR])		12.4 [9.1, 17.0]	11.2 [8.9, 15.3]	12.5 [8.9, 16.9]	13.1 [9.3, 17.9]	11.9 [9.0, 15.7]	12.4 [9.0, 17.6]	<0.001	nonnorm	0.140	0.0
Poverty index (log1p) (median [IQR])		0.1 [0.1, 0.2]	0.1 [0.1, 0.1]	0.1 [0.1, 0.2]	0.1 [0.1, 0.2]	0.1 [0.1, 0.1]	0.1 [0.1, 0.2]	<0.001	nonnorm	0.139	0.0
Poverty index (centered) (median [IQR])		0.0 [-0.1, 0.0]	0.0 [-0.1, 0.0]	0.0 [-0.1, 0.0]	0.0 [0.0, 0.0]	0.0 [-0.1, 0.0]	0.0 [-0.1, 0.0]	<0.001	nonnorm	0.140	0.0
Nationality (Chile) (%)	chile	87935 (99.4)	5212 (99.0)	3524 (99.4)	32238 (99.4)	37307 (99.3)	9654 (99.4)	0.021		0.020	0.0
	other	569 (0.6)	51 (1.0)	21 (0.6)	186 (0.6)	248 (0.7)	63 (0.6)
Length of stay in treatment (months) (median [IQR])		5.6 [3.0, 10.0]	6.5 [3.4, 11.5]	4.4 [1.7, 9.1]	5.8 [3.4, 10.0]	5.5 [2.9, 10.0]	5.3 [2.3, 9.6]	<0.001	nonnorm	0.144	0.0
Length of stay (log1p months) (median [IQR])		1.9 [1.4, 2.4]	2.0 [1.5, 2.5]	1.7 [1.0, 2.3]	1.9 [1.5, 2.4]	1.9 [1.4, 2.4]	1.8 [1.2, 2.4]	<0.001	nonnorm	0.228	0.0
Length of stay (months)^2 (median [IQR])		31.1 [9.0, 100.0]	42.0 [11.4, 132.6]	19.0 [3.0, 82.2]	33.7 [11.3, 99.4]	29.7 [8.6, 99.4]	27.7 [5.4, 93.0]	<0.001	nonnorm	0.084	0.0
Length of stay (months)^3 (median [IQR])		173.8 [27.0, 1,000.0]	272.6 [38.3, 1,527.3]	82.6 [5.3, 744.8]	195.8 [37.8, 990.4]	162.0 [25.3, 990.4]	146.1 [12.5, 897.3]	<0.001	nonnorm	0.057	0.0

General-population settings were mostly male (81–94%). Heavy sex imbalances by setting (1/4= women).
GP settings mostly lived with couples/kids or family of origin (~51%)
Dropout rates exceed 50% in all groups; highest in Basic ambulatory (58%); Completion rates highest in General-poulation residential (32%) and Intensive ambulatory (30%)
Patients in residential treatments were dominated by cocaine base paste as the primary substance (64-62%), daily use (72-76%) and drug dependence (>90%); Basic ambualtory, dominated by alcohol (44%).
Residential treatments were more single (23% vs. 34% overall)
Domestic violence and sexual abuse were higher in women-only settings (40–45% and 6–8% vs. 26% and 2% overall, respectively)
Ambulatory basic received more patients aged 40-65 (25%); women-only programs, less (16%)
Also women-only settings showed SUD severity and Depression slightly higher.
Women-only residential had diagnoses F6 (Disorders of adult personality and behavior) (47% vs. 28% overall) and more diagnoses in study (24% vs. 18% overall)
Women-only residential were younger (34 [13–72]), less went to treatment spontaneously (28% vs. 45% overall)
Residential settings had approx. two times more organ-system diseases (5.3%) and injuries & sequelae (3.9%)
Formal housing: >90% in intensive ambulatory settings vs. 80% in general-population residential; Homelessness/informal housing: 14% in General-population residential vs. 1–3% elsewhere.
Employment was lowest in Women-only residential (10%) and General-population residential (18%), with many unemployed (43–65%), while General-population basic ambulatory had the highest employed rate (65%). Education was similar across groups, with most having high school or less (53–57%), but primary school or less was more common in women-only settings (30%).
Urban residence highest: 88–89% in women-only settings.
**We did not find patients with sex that was inconsistent with their clinical presentation regarding the Pathology of Gestation and of the Intrauterine Fetus without Disorder. Although this study focuses on pregnancy and fetal outcomes, the male parent’s (father’s) medical history and genetic contribution were documented as part of the patient data. G:/My Drive/Alvacast/SISTRAT 2023//cons/_out/table_one_prediction_clean.txt **.
Considering these differences, we think that certain predictors may behave differently by settings

Code

SISTRAT23_c1_2010_2024_df_prev1z|>
    janitor::tabyl(readmit_event, death_event)|>
    janitor::adorn_percentages("all")|>
    janitor::adorn_pct_formatting(digits = 1)|>
    janitor::adorn_ns(position = "front")|> 
    knitr::kable("markdown", caption= "Intersection of Readmission (rows) w/ Mortality (cols)")

Intersection of Readmission (rows) w/ Mortality (cols)
readmit_event	0	1
0	66,231 (74.8%)	3,203 (3.6%)
1	18,326 (20.7%)	744 (0.8%)

1.6. Near-zero variance

Code

# freqRatio The ratio of the frequency of the most common value to the frequency of the second most common value. A large number means the variable is dominated by one value (i.e., nearly constant). #high (e.g. > 20) → highly unbalanced (rare category vs majority).
# percentUnique The percentage of unique values relative to the sample size. Low values mean the variable has very few distinct values — often categorical or binary, but potentially too invariant. #High → many distinct values (continuous-type).
# zeroVar   TRUE means the variable has zero variance (same value for all observations). These should always be removed.
# nzv   TRUE means the variable has near-zero variance (too little variation to be useful). These are usually removed before modeling.

categorical_vars2<- 
  setdiff(categorical_vars, c("nationality_cons", "phys_dx_supergroup", "dx_supergroup_series", "f1_substance_use"))
numerical_vars2 <- 
  setdiff(numerical_vars, c("nationality_cons", "phys_dx_supergroup", "dx_supergroup_series"))

nzv <- nearZeroVar(subset(SISTRAT23_c1_2010_2024_df_prev1z, select= setdiff(c(categorical_vars,numerical_vars), "plan_type_corr")),
                   names = TRUE, #column names are returned.
                   saveMetrics= TRUE, # a data frame with predictor information is returned.
                   allowParallel= TRUE,
                   foreach= TRUE
                   )
dplyr::mutate_if(nzv, is.numeric, ~round(.,2)) |> 
  knitr::kable(caption="Near zero variance", col.names= c("Variables", "Freq most common value / second", "% of unique values", "Zero variance","Near-zero variance"))

Near zero variance
Variables	Freq most common value / second	% of unique values	Zero variance	Near-zero variance
sex_rec	2.90	0.00	FALSE	FALSE
tenure_status_household	1.13	0.01	FALSE	FALSE
occupation_condition_corr24	1.40	0.00	FALSE	FALSE
marital_status_rec	1.62	0.00	FALSE	FALSE
ethnicity	14.71	0.00	FALSE	FALSE
urbanicity_cat	8.34	0.00	FALSE	FALSE
ed_attainment_corr	2.11	0.00	FALSE	FALSE
cohabitation	1.21	0.00	FALSE	FALSE
dg_psiq_cie_10_instudy	4.72	0.00	FALSE	FALSE
dg_psiq_cie_10_dg	1.24	0.00	FALSE	FALSE
f0_organic	79.09	0.00	FALSE	TRUE
f1_substance_use	0.00	0.00	TRUE	TRUE
f2_psychotic	64.12	0.00	FALSE	TRUE
f3_mood	8.75	0.00	FALSE	FALSE
f4_anxiety_stress_somatoform	52.97	0.00	FALSE	TRUE
f5_physio_eating_sleep_sexual	194.37	0.00	FALSE	TRUE
f6_personality_adult_behaviour	2.52	0.00	FALSE	FALSE
f7_intellectual_disability	137.72	0.00	FALSE	TRUE
f8_9_neurodevelopment_child	68.85	0.00	FALSE	TRUE
dx_f2_smi_psychotic	64.12	0.00	FALSE	TRUE
dx_f3_mood	8.75	0.00	FALSE	FALSE
dx_f45_anx_stress_phys	41.51	0.00	FALSE	TRUE
dx_f6_personality	2.52	0.00	FALSE	FALSE
dx_f0789_neurocog_dev	28.63	0.00	FALSE	TRUE
sub_dep_icd10_status	2.68	0.00	FALSE	FALSE
dom_violence	2.84	0.00	FALSE	FALSE
sex_abuse	50.43	0.00	FALSE	TRUE
any_violence	2.57	0.00	FALSE	FALSE
phys_dx_supergroup	1.56	0.01	FALSE	FALSE
phys_dx_instudy	1.26	0.00	FALSE	FALSE
phys_dx_other_spec_medical_cond	21.38	0.00	FALSE	TRUE
phys_dx_organ_system_med_dis	34.71	0.00	FALSE	TRUE
phys_dx_injuries_and_sequelae	64.84	0.00	FALSE	TRUE
phys_dx_infectious_diseases	123.62	0.00	FALSE	TRUE
prim_sub_freq	1.58	0.01	FALSE	FALSE
prim_sub_freq_rec	1.00	0.00	FALSE	FALSE
polysubstance_strict	2.69	0.00	FALSE	FALSE
treat_lt_90	3.22	0.00	FALSE	FALSE
tr_outcome	2.01	0.01	FALSE	FALSE
adm_motive	1.44	0.01	FALSE	FALSE
primary_sub	1.11	0.01	FALSE	FALSE
primary_sub_mod	1.11	0.01	FALSE	FALSE
usuario_tribunal_trat_droga	49.72	0.00	FALSE	TRUE
tipo_de_vivienda_rec	25.76	0.00	FALSE	TRUE
tipo_de_vivienda_rec2	8.87	0.00	FALSE	FALSE
evaluacindelprocesoteraputico	1.34	0.00	FALSE	FALSE
eva_consumo	1.44	0.00	FALSE	FALSE
eva_fam	1.39	0.00	FALSE	FALSE
eva_relinterp	1.38	0.00	FALSE	FALSE
eva_ocupacion	1.33	0.00	FALSE	FALSE
eva_sm	1.30	0.00	FALSE	FALSE
eva_fisica	1.25	0.00	FALSE	FALSE
eva_transgnorma	1.10	0.00	FALSE	FALSE
readmit_event	3.64	0.00	FALSE	FALSE
death_event	21.42	0.00	FALSE	TRUE
readmit_time_from_adm_m	1.10	7.86	FALSE	FALSE
readmit_time_from_disch_m	17.66	7.53	FALSE	FALSE
death_time_from_adm_m	1.06	5.54	FALSE	FALSE
death_time_from_disch_m	19.39	5.61	FALSE	TRUE
adm_age_log	1.19	5.17	FALSE	FALSE
adm_age_pow2	1.19	5.17	FALSE	FALSE
adm_age_pow3	1.19	5.17	FALSE	FALSE
adm_age_c	1.19	5.17	FALSE	FALSE
adm_age_cat	1.29	0.00	FALSE	FALSE
porc_pobr	1.06	1.00	FALSE	FALSE
porc_pobr_log	1.06	1.00	FALSE	FALSE
porc_pobr_c	1.06	1.00	FALSE	FALSE
nationality_chile	154.54	0.00	FALSE	TRUE
dit_m	1.02	14.27	FALSE	FALSE
treat_log	1.00	12.26	FALSE	FALSE
treat_days_pow2	1.06	14.27	FALSE	FALSE
treat_days_pow3	1.01	14.27	FALSE	FALSE

Given that near-zero variance can lead to large standard errors, convergence warnings, or even quasi-separation if the infrequent category aligns perfectly with the outcome, we deleted the following predictors due to potential collinearity: nationality_cons, phys_dx_supergroup, and dx_supergroup_series.

Also, we deleted 1 due to zero variance.

Variables with near zero variance: f5_physio_eating_sleep_sexual, nationality_chile, f7_intellectual_disability, phys_dx_infectious_diseases, f0_organic, f8_9_neurodevelopment_child, phys_dx_injuries_and_sequelae, f2_psychotic, dx_f2_smi_psychotic, f4_anxiety_stress_somatoform, sex_abuse, usuario_tribunal_trat_droga, dx_f45_anx_stress_phys, phys_dx_organ_system_med_dis, dx_f0789_neurocog_dev, tipo_de_vivienda_rec, death_event, phys_dx_other_spec_medical_cond, death_time_from_disch_m, f1_substance_use.

For tipo_de_vivienda_rec, we used tipo_de_vivienda_rec2 instead.
For psychiatric variables, we grouped them into severe mental illness and neurocognitive disorders (dx_f_any_severe_mental= F0|F2|F7|F(8-9)|), any internalizing/stress-related diagnosis or of the somatoform spectrum (dx_f_any_internalizing F(4-5)).
For any physical comorbidity, we applied grouping for any diagnostic (any_phys_dx): Infectuous diseases, Organ-system, Injuries and sequelae and Other specified medical conditions.
For nationality_chile, we changed the dominant category and changed it for foreigners instead (national_foreign).
Court-mandated or drug court treatment user (usuario_tribunal_trat_droga) are less than 2% of the SUD treatment population. We did not use it as a predictor.
Reported sexual abuse also represent less than 2% of the population. We have a variable called any_violence that groups domestic violence with sexual abuse

For the rest of variables, we marked them and review them conceptually later, collapse rare categories if possible, and avoid including obvious.

2. Prepare for survival setting w/ ML

2.1. Defined variables and clean database

We numeric and categorical predictors, excluded target and time-leak columns, and added a unique row IDs. Then we converted categorical variables to factors.

Code

num_vars = c(
    # floats
    "readmit_time_from_adm_m",
    "death_time_from_adm_m",
    "adm_age_rec3",
    # "adm_age_log",
    # "adm_age_pow2",
    # "adm_age_pow3",
    # "adm_age_c",
    "porc_pobr",
    # "porc_pobr_log",
    # "porc_pobr_c",
    "dit_m",
    # "treat_log",
    # "treat_days_pow2",
    # "treat_days_pow3"
    "num_trat_ant",# only 5993 missings
    "age_subs_onset_rec2" # only 4300 missings
)
cat_vars = c(
    # category
    "sex_rec",
    "tenure_status_household",
    "cohabitation",
    "sub_dep_icd10_status",
    #"dom_violence", # check collinearity
    #"sex_abuse", # collinear #freqRatio>20
    "any_violence",
    "prim_sub_freq_rec",
    "tr_outcome",
    "adm_motive",
    "first_sub_used",
    "primary_sub_mod",
    #"tipo_de_vivienda_rec", #freqRatio>20
    "tipo_de_vivienda_rec2",
    #"adm_age_cat", # collinear
    #"nationality_chile",
    "national_foreign",
    "plan_type_corr",
    # object
    "occupation_condition_corr24",
    "marital_status_rec",
    "urbanicity_cat",
    "ed_attainment_corr",
    #"prim_sub_freq",
    #"primary_sub",
    #"usuario_tribunal_trat_droga", #freqRatio>20
    "evaluacindelprocesoteraputico",
    "eva_consumo",
    "eva_fam",
    "eva_relinterp",
    "eva_ocupacion",
    "eva_sm",
    "eva_fisica",
    "eva_transgnorma",
    # booleans
    "ethnicity",    
    "dg_psiq_cie_10_instudy",
    "dg_psiq_cie_10_dg",
    #"f0_organic", #freqRatio>20
    #"f2_psychotic", #freqRatio>20
    #"f3_mood", #collinear
    #"f4_anxiety_stress_somatoform", #freqRatio>20
    #"f5_physio_eating_sleep_sexual", #freqRatio>20
    #"f6_personality_adult_behaviour", #freqRatio>20
    #"f7_intellectual_disability", #freqRatio>20
    #"f8_9_neurodevelopment_child", #freqRatio>20
    # ints
    #"dx_f2_smi_psychotic",
    "dx_f3_mood",
    #"dx_f45_anx_stress_phys",
    "dx_f6_personality",
    #"dx_f0789_neurocog_dev",
    "dx_f_any_severe_mental",
    "dx_f_any_internalizing",
    "phys_dx_instudy",
    "any_phys_dx",
    #"phys_dx_other_spec_medical_cond",
    #"phys_dx_organ_system_med_dis",
    #"phys_dx_injuries_and_sequelae",
    #"phys_dx_infectious_diseases",
    "polysubstance_strict"
    #"treat_lt_90" #collinear
)


# 0. Define columns to exclude (endpoints + time leaks)
target_cols <- c(
  "readmit_time_from_disch_m", "readmit_event",
  "death_time_from_disch_m", "death_event"
)
leak_time_cols <- c("readmit_time_from_adm_m", "death_time_from_adm_m")
cols_to_exclude <- c(target_cols, leak_time_cols)

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
SISTRAT23_c1_2010_2024_df_model<-  
SISTRAT23_c1_2010_2024_df_prev1z%>%
  dplyr::mutate(original_row_id = dplyr::row_number())%>%  # Create a unique ID
  dplyr::mutate(
      national_foreign = dplyr::if_else(nationality_chile == "other", 1L, 0L)
    )%>%
  dplyr::mutate(
    dx_f_any_severe_mental =
      (f2_psychotic == 1L |
       dx_f2_smi_psychotic == 1L |
       f0_organic == 1L |
       f7_intellectual_disability == 1L |
       f8_9_neurodevelopment_child == 1L |
       dx_f0789_neurocog_dev == 1L)
  )%>%
  dplyr::mutate(
    any_internalizing =
      (f4_anxiety_stress_somatoform == 1L |
       dx_f45_anx_stress_phys == 1L |
       f5_physio_eating_sleep_sexual == 1L)
  )%>%
  dplyr::mutate(
    any_phys_dx = (phys_dx_infectious_diseases == 1L |
                   phys_dx_injuries_and_sequelae == 1L |
                   phys_dx_organ_system_med_dis == 1L |
                   phys_dx_other_spec_medical_cond == 1L)
  )


# 1. Create predictor dataset (remove excluded columns)
df_pred <- SISTRAT23_c1_2010_2024_df_model %>%
  dplyr::select(any_of(c(num_vars, cat_vars, cols_to_exclude))) 
factor_vars<- 
c("sex_rec", "tenure_status_household", "cohabitation", "sub_dep_icd10_status", "any_violence", "tr_outcome", "adm_motive", "primary_sub_mod",  "tipo_de_vivienda_rec2", "plan_type_corr", "occupation_condition_corr24", "marital_status_rec", "urbanicity_cat", "ed_attainment_corr", "prim_sub_freq_rec", "first_sub_used", "evaluacindelprocesoteraputico", "eva_consumo", "eva_fam", "eva_relinterp", "eva_ocupacion", "eva_sm", "eva_fisica", "eva_transgnorma")
invisible(
df_pred[, (factor_vars) := lapply(.SD, as.factor), .SDcols = factor_vars]
)

2.2. Imputation

Then we run MICE-like with PMM (15 donors, 200 trees, 5 datasets, 10 iterations) using a correlation-based predictor matrix, and finally extracted the 5 completed/imputed datasets, restoring factor levels.

Code

# Stable imputation
# set.seed(2125)
# imp <- mice(
#     clean_dummified_df,
#     m = 5,
#     maxit = 10,
#     ridge = 1e-04,  # Regularización agresiva pero necesaria
#     donors = 15,
#     allow.na = FALSE
# )
#  it im dep      meth                                  out
# 1  0  0     collinear                          porc_pobr_c
# 2  0  0     collinear                  dx_f2_smi_psychotic
# 3  0  0     collinear                           dx_f3_mood
# 4  0  0     collinear                    dx_f6_personality
# 5  0  0     collinear sub_dep_icd10_status_drug_dependence
# 6  0  0     collinear              nationality_chile_other

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

  
cat("Define number of complete datasets to create\n")
replicates_n <- 5

  # Check if the session is interactive (i.e., you are using RStudio manually)
  if (interactive()) {
    if(
    !file.exists(file.path(wdpath, 
                           "data/20241015_out", 
                           "filled_datasets.Rds"))
    ) {
      # If interactive, offload to a background job to keep console free
      job::job({
        source(file.path(wdpath, "cons", "_hist_scripts", "missranger_mice_like_pred1.R"))
      }, title = "Imputation Job") # Optional: adds a nice title to the jobs pane
    } else {
      filled_datasets <- readr::read_rds(file.path(wdpath, "data/20241015_out", "filled_datasets.Rds"))
    }  
  } else {
    # If NOT interactive (e.g., batch mode), just run it directly
      
    library(dplyr)
    library(purrr)
    library(tibble)
    
    # Run MICE imputation (corrected syntax)
    time_event_vars <- c(
      "readmit_time_from_disch_m", "readmit_event",
      "death_time_from_disch_m", "death_event"
    )
    predictor_formula <- as.formula(
      #paste(". ~ . -", paste(time_event_vars, collapse = " - "))
      paste(". ~ . ", sep="")
    )
    
    cat("MICE-like but with missranger")
    #https://cran.r-project.org/web/packages/missRanger/vignettes/working_with_censoring.html
    set.seed(2125)
    filled_datasets <- replicate(
      replicates_n,
      missRanger::missRanger(
        df_pred,
        formula = predictor_formula,  # Excluye explícitamente outcomes
        verbose = 2,
        pmm.k = 15,#10, # predictive mean matching with 15 nearest neighbors for num variables
        num.trees = 200, # trees sufficient for stability
        #seed      = 2125, #if you want 5 different but reproducible imputations. dont use insede missRanger
        maxiter = 10, #50 is overkill unless you have very heavy missingness and time isn’t an issue
        returnOOB = TRUE,
        respect.unordered.factors = "order" # controls how categorical variables are handled inside the trees, giving a better treatment of unordered factors.
      ),
      simplify = FALSE
    )
    #_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
    #_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
    cat("Check remaining missing values\n")
    imputed_data <- map_dfr(1:5, function(i) {
      df <- filled_datasets[[i]]
      # add control variables
      dplyr::mutate(df,
             .imp = i,  # imputation number
             .id = SISTRAT23_c1_2010_2024_df_model$original_row_id  # Use original ID
      ) %>%
        # Keep variables necessary for analysis
        dplyr::select(
          .imp, .id,
          everything()
        )
    }, .id = "imputation_id") %>% 
      dplyr::select(-imputation_id)  # cleaning
    #_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
    #_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
    cat("Check remaining missing values\n")
    
    get_all_oobs <- function(imputed_list) {
      # 1. Extract OOBs from all 5 datasets into one big table
      all_oobs <- map_dfr(seq_along(imputed_list), function(i) {
        # Get the attribute
        oob_vec <- attr(imputed_list[[i]], "oob")
        # Create a tidy table
        tibble::tibble(
          Variable = names(oob_vec),
          Error_Value = as.numeric(oob_vec),
          Imputation_Run = as.factor(i)
        )
      })
      # 2. Create a Summary Table (Mean across the 5 runs)
      oob_summary <- all_oobs %>%
        dplyr::group_by(Variable) %>%
        dplyr::summarise(
          Mean_OOB = mean(Error_Value),
          SD_OOB = sd(Error_Value),
          Best_Run = min(Error_Value),
          Worst_Run = max(Error_Value)
        ) %>%
        dplyr::arrange(Mean_OOB) # Sort: Best variables (lowest error) first
      # 3. Print the summary nicely
      cat("\n=== COMPLETE OOB ERROR REPORT ===\n")
      cat("Scale: 0.00 = Perfect Prediction | 1.00 = Useless/Random\n")
      print(as.data.frame(oob_summary), digits = 3)
      return(oob_summary)
    }

#Takes 1:41:49
#Takes 1:38:06

  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  imputed_data %>%
    dplyr::select(where(~ any(is.na(.x)))) %>%
    { 
      if (ncol(.) == 0) {
        message("✅ No missing values remain after imputation")
        tibble::tibble()
      } else {
        dplyr::summarise(., across(everything(), ~ scales::percent(mean(is.na(.x)), accuracy = 0.1))) %>%
          tidyr::pivot_longer(everything(), names_to = "variable", values_to = "missing_pct") %>%
          dplyr::arrange(desc(as.numeric(gsub("%", "", missing_pct))))
      }
    } %>%
    print()
  }

Adjuntando el paquete: ‘dplyr’

The following objects are masked from ‘package:Hmisc’:

src, summarize

The following object is masked from ‘package:MASS’:

select

The following object is masked from ‘package:survex’:

explain

The following objects are masked from ‘package:tidytable’:

across, add_count, add_tally, anti_join, arrange, between,
bind_cols, bind_rows, c_across, case_match, case_when, coalesce,
consecutive_id, count, cross_join, cume_dist, cur_column, cur_data,
cur_group_id, cur_group_rows, dense_rank, desc, distinct, filter,
first, full_join, group_by, group_cols, group_split, group_vars,
if_all, if_any, if_else, inner_join, is_grouped_df, lag, last,
lead, left_join, min_rank, mutate, n, n_distinct, na_if, nest_by,
nest_join, nth, percent_rank, pick, pull, recode, reframe,
relocate, rename, rename_with, right_join, row_number, rowwise,
select, semi_join, slice, slice_head, slice_max, slice_min,
slice_sample, slice_tail, summarise, summarize, tally, top_n,
transmute, tribble, ungroup

The following objects are masked from ‘package:stats’:

filter, lag

The following objects are masked from ‘package:base’:

intersect, setdiff, setequal, union

Adjuntando el paquete: ‘purrr’

The following object is masked from ‘package:caret’:

lift

The following objects are masked from ‘package:tidytable’:

map, map_chr, map_dbl, map_df, map_dfc, map_dfr, map_int, map_lgl,
map_vec, map2, map2_chr, map2_dbl, map2_df, map2_dfc, map2_dfr,
map2_int, map2_lgl, map2_vec, pmap, pmap_chr, pmap_dbl, pmap_df,
pmap_dfc, pmap_dfr, pmap_int, pmap_lgl, pmap_vec, walk

Adjuntando el paquete: ‘tibble’

The following objects are masked from ‘package:tidytable’:

enframe, tribble

Missing value imputation by random forests Missing value imputation by random forests Missing value imputation by random forests Missing value imputation by random forests Missing value imputation by random forests

✅ No missing values remain after imputation

Code

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

if (interactive()) {
  if(
  !file.exists(file.path(wdpath, 
                         "data/20241015_out", 
                         "filled_datasets.Rds"))
  ) {
    
    final_oob_stats <- get_all_oobs(filled_datasets)
    # cat("\n=== COMPLETE OOB ERROR REPORT ===\n")
    #   cat("Scale: 0.00 = Perfect Prediction | 1.00 = Useless/Random\n")
    final_oob_stats|>  
      dplyr::mutate_if(is.numeric, ~round(.,4))|> 
      knitr::kable(caption="Oob report")
  }
}
# tipo_de_vivienda_rec2 (0.08): Extremely reliable. The model almost certainly 
# knows what kind of house someone lives in based on their other traits.
# any_phys_dx (0.09): Very strong prediction.
# sex_rec (0.12): Standard; usually easy to predict from other social variables.
# eva_ variables (~0.14 - 0.21): Your "Evaluation" scales (consumo, fam, ocupacion, etc.) 
# are performing very well. This suggests clear patterns in your psychological/evaluation metrics.

# marital_status_rec (0.30): Decent.
# first_sub_used (0.34): The model has a fair idea of what substance they used 
# first, but it's not certain.
# prim_sub_freq_rec (0.41) & ed_attainment_corr (0.43): These are getting "fuzzy." 
# The imputed values will be plausible, but they might not perfectly 
# capture the individual nuance for every person.

# age_subs_onset_rec2 (0.72): High error. The age at which someone started using 
# substances is hard to predict from their current status.
# phys_dx_instudy (0.78): Very high. The specific physical diagnosis is likely too 
# complex or rare to be predicted by general social/eval variables.
# num_trat_ant (0.86): Critical Flag. This is nearly 0.90.


#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#previous imputations

#saveRDS(imp, file= file.path(wdpath, "data/20241015_out", "imp.Rds"))

#saveRDS(filled_datasets, file= file.path(wdpath, "data/20241015_out", "filled_datasets.Rds"))

Define number of complete datasets to create
MICE-like but with missranger
Variables to impute:        sex_rec, phys_dx_instudy, any_phys_dx, marital_status_rec, evaluacindelprocesoteraputico, eva_consumo, eva_fam, eva_relinterp, eva_ocupacion, eva_sm, eva_fisica, eva_transgnorma, ed_attainment_corr, prim_sub_freq_rec, age_subs_onset_rec2, tenure_status_household, num_trat_ant, tipo_de_vivienda_rec2, first_sub_used, any_violence
Variables used to impute:   readmit_time_from_adm_m, death_time_from_adm_m, adm_age_rec3, porc_pobr, dit_m, num_trat_ant, age_subs_onset_rec2, sex_rec, tenure_status_household, cohabitation, sub_dep_icd10_status, any_violence, prim_sub_freq_rec, tr_outcome, adm_motive, first_sub_used, primary_sub_mod, tipo_de_vivienda_rec2, national_foreign, plan_type_corr, occupation_condition_corr24, marital_status_rec, urbanicity_cat, ed_attainment_corr, evaluacindelprocesoteraputico, eva_consumo, eva_fam, eva_relinterp, eva_ocupacion, eva_sm, eva_fisica, eva_transgnorma, ethnicity, dg_psiq_cie_10_instudy, dg_psiq_cie_10_dg, dx_f3_mood, dx_f6_personality, dx_f_any_severe_mental, phys_dx_instudy, any_phys_dx, polysubstance_strict, readmit_time_from_disch_m, readmit_event, death_time_from_disch_m, death_event

    sex_rc  phys__  any_p_  mrtl__  evlcnd  ev_cns  eva_fm  ev_rln  ev_cpc  eva_sm  ev_fsc  ev_trn  ed_tt_  prm___  ag___2  tnr_s_  nm_tr_  tp___2  frst__  any_vl
iter 1: 0.1291  0.8622  0.0901  0.3067  0.3256  0.1515  0.2041  0.1787  0.2639  0.1514  0.2049  0.2117  0.4397  0.4154  0.8086  0.4572  0.8687  0.0820  0.3422  0.2541  
iter 2: 0.1231  0.7830  0.0910  0.3039  0.0980  0.1480  0.1874  0.1632  0.2421  0.1409  0.2007  0.2107  0.4330  0.4144  0.7246  0.4495  0.8687  0.0812  0.3423  0.2538  
iter 3: 0.1243  0.7840  0.0908  0.3039  0.0985  0.1490  0.1872  0.1628  0.2431  0.1411  0.1999  0.2106  0.4338  0.4127  0.7232  0.4474  0.8681  0.0805  0.3429  0.2534  
iter 4: 0.1236  0.7834  0.0906  0.3038  0.0985  0.1484  0.1875  0.1627  0.2446  0.1408  0.1999  0.2113  0.4332  0.4129  0.7246  0.4498  0.8674  0.0812  0.3415  0.2527  

Variables to impute:        sex_rec, phys_dx_instudy, any_phys_dx, marital_status_rec, evaluacindelprocesoteraputico, eva_consumo, eva_fam, eva_relinterp, eva_ocupacion, eva_sm, eva_fisica, eva_transgnorma, ed_attainment_corr, prim_sub_freq_rec, age_subs_onset_rec2, tenure_status_household, num_trat_ant, tipo_de_vivienda_rec2, first_sub_used, any_violence
Variables used to impute:   readmit_time_from_adm_m, death_time_from_adm_m, adm_age_rec3, porc_pobr, dit_m, num_trat_ant, age_subs_onset_rec2, sex_rec, tenure_status_household, cohabitation, sub_dep_icd10_status, any_violence, prim_sub_freq_rec, tr_outcome, adm_motive, first_sub_used, primary_sub_mod, tipo_de_vivienda_rec2, national_foreign, plan_type_corr, occupation_condition_corr24, marital_status_rec, urbanicity_cat, ed_attainment_corr, evaluacindelprocesoteraputico, eva_consumo, eva_fam, eva_relinterp, eva_ocupacion, eva_sm, eva_fisica, eva_transgnorma, ethnicity, dg_psiq_cie_10_instudy, dg_psiq_cie_10_dg, dx_f3_mood, dx_f6_personality, dx_f_any_severe_mental, phys_dx_instudy, any_phys_dx, polysubstance_strict, readmit_time_from_disch_m, readmit_event, death_time_from_disch_m, death_event

    sex_rc  phys__  any_p_  mrtl__  evlcnd  ev_cns  eva_fm  ev_rln  ev_cpc  eva_sm  ev_fsc  ev_trn  ed_tt_  prm___  ag___2  tnr_s_  nm_tr_  tp___2  frst__  any_vl
iter 1: 0.1294  0.8623  0.0908  0.3066  0.3259  0.1508  0.2041  0.1795  0.2623  0.1516  0.2042  0.2112  0.4383  0.4156  0.8099  0.4559  0.8686  0.0820  0.3422  0.2538  
iter 2: 0.1238  0.7842  0.0913  0.3028  0.0975  0.1483  0.1865  0.1642  0.2423  0.1404  0.2003  0.2107  0.4338  0.4139  0.7257  0.4465  0.8685  0.0814  0.3425  0.2537  
iter 3: 0.1232  0.7832  0.0907  0.3043  0.0984  0.1487  0.1865  0.1619  0.2432  0.1412  0.2003  0.2101  0.4325  0.4142  0.7230  0.4475  0.8682  0.0814  0.3432  0.2531  
iter 4: 0.1231  0.7844  0.0907  0.3027  0.0982  0.1484  0.1868  0.1622  0.2433  0.1415  0.2001  0.2111  0.4342  0.4133  0.7243  0.4490  0.8677  0.0815  0.3430  0.2528  

Variables to impute:        sex_rec, phys_dx_instudy, any_phys_dx, marital_status_rec, evaluacindelprocesoteraputico, eva_consumo, eva_fam, eva_relinterp, eva_ocupacion, eva_sm, eva_fisica, eva_transgnorma, ed_attainment_corr, prim_sub_freq_rec, age_subs_onset_rec2, tenure_status_household, num_trat_ant, tipo_de_vivienda_rec2, first_sub_used, any_violence
Variables used to impute:   readmit_time_from_adm_m, death_time_from_adm_m, adm_age_rec3, porc_pobr, dit_m, num_trat_ant, age_subs_onset_rec2, sex_rec, tenure_status_household, cohabitation, sub_dep_icd10_status, any_violence, prim_sub_freq_rec, tr_outcome, adm_motive, first_sub_used, primary_sub_mod, tipo_de_vivienda_rec2, national_foreign, plan_type_corr, occupation_condition_corr24, marital_status_rec, urbanicity_cat, ed_attainment_corr, evaluacindelprocesoteraputico, eva_consumo, eva_fam, eva_relinterp, eva_ocupacion, eva_sm, eva_fisica, eva_transgnorma, ethnicity, dg_psiq_cie_10_instudy, dg_psiq_cie_10_dg, dx_f3_mood, dx_f6_personality, dx_f_any_severe_mental, phys_dx_instudy, any_phys_dx, polysubstance_strict, readmit_time_from_disch_m, readmit_event, death_time_from_disch_m, death_event

    sex_rc  phys__  any_p_  mrtl__  evlcnd  ev_cns  eva_fm  ev_rln  ev_cpc  eva_sm  ev_fsc  ev_trn  ed_tt_  prm___  ag___2  tnr_s_  nm_tr_  tp___2  frst__  any_vl
iter 1: 0.1293  0.8615  0.0905  0.3064  0.3256  0.1509  0.2040  0.1794  0.2624  0.1516  0.2044  0.2113  0.4376  0.4152  0.8121  0.4550  0.8695  0.0818  0.3427  0.2546  
iter 2: 0.1236  0.7865  0.0910  0.3039  0.0978  0.1488  0.1864  0.1630  0.2433  0.1408  0.2004  0.2116  0.4330  0.4139  0.7219  0.4485  0.8691  0.0805  0.3430  0.2522  
iter 3: 0.1231  0.7829  0.0908  0.3023  0.0977  0.1483  0.1868  0.1630  0.2433  0.1404  0.1998  0.2098  0.4346  0.4150  0.7271  0.4475  0.8681  0.0800  0.3419  0.2528  
iter 4: 0.1237  0.7842  0.0907  0.3041  0.0986  0.1485  0.1868  0.1629  0.2430  0.1411  0.2004  0.2107  0.4334  0.4152  0.7230  0.4482  0.8679  0.0803  0.3440  0.2534  

Variables to impute:        sex_rec, phys_dx_instudy, any_phys_dx, marital_status_rec, evaluacindelprocesoteraputico, eva_consumo, eva_fam, eva_relinterp, eva_ocupacion, eva_sm, eva_fisica, eva_transgnorma, ed_attainment_corr, prim_sub_freq_rec, age_subs_onset_rec2, tenure_status_household, num_trat_ant, tipo_de_vivienda_rec2, first_sub_used, any_violence
Variables used to impute:   readmit_time_from_adm_m, death_time_from_adm_m, adm_age_rec3, porc_pobr, dit_m, num_trat_ant, age_subs_onset_rec2, sex_rec, tenure_status_household, cohabitation, sub_dep_icd10_status, any_violence, prim_sub_freq_rec, tr_outcome, adm_motive, first_sub_used, primary_sub_mod, tipo_de_vivienda_rec2, national_foreign, plan_type_corr, occupation_condition_corr24, marital_status_rec, urbanicity_cat, ed_attainment_corr, evaluacindelprocesoteraputico, eva_consumo, eva_fam, eva_relinterp, eva_ocupacion, eva_sm, eva_fisica, eva_transgnorma, ethnicity, dg_psiq_cie_10_instudy, dg_psiq_cie_10_dg, dx_f3_mood, dx_f6_personality, dx_f_any_severe_mental, phys_dx_instudy, any_phys_dx, polysubstance_strict, readmit_time_from_disch_m, readmit_event, death_time_from_disch_m, death_event

    sex_rc  phys__  any_p_  mrtl__  evlcnd  ev_cns  eva_fm  ev_rln  ev_cpc  eva_sm  ev_fsc  ev_trn  ed_tt_  prm___  ag___2  tnr_s_  nm_tr_  tp___2  frst__  any_vl
iter 1: 0.1290  0.8622  0.0909  0.3060  0.3260  0.1508  0.2036  0.1787  0.2631  0.1516  0.2040  0.2121  0.4399  0.4144  0.8112  0.4553  0.8681  0.0821  0.3427  0.2523  
iter 2: 0.1242  0.7833  0.0913  0.3034  0.0981  0.1484  0.1867  0.1631  0.2438  0.1408  0.1999  0.2108  0.4335  0.4143  0.7241  0.4477  0.8672  0.0803  0.3422  0.2526  
iter 3: 0.1234  0.7830  0.0906  0.3039  0.0979  0.1492  0.1874  0.1628  0.2433  0.1412  0.2003  0.2113  0.4335  0.4142  0.7238  0.4467  0.8684  0.0806  0.3418  0.2549  

Variables to impute:        sex_rec, phys_dx_instudy, any_phys_dx, marital_status_rec, evaluacindelprocesoteraputico, eva_consumo, eva_fam, eva_relinterp, eva_ocupacion, eva_sm, eva_fisica, eva_transgnorma, ed_attainment_corr, prim_sub_freq_rec, age_subs_onset_rec2, tenure_status_household, num_trat_ant, tipo_de_vivienda_rec2, first_sub_used, any_violence
Variables used to impute:   readmit_time_from_adm_m, death_time_from_adm_m, adm_age_rec3, porc_pobr, dit_m, num_trat_ant, age_subs_onset_rec2, sex_rec, tenure_status_household, cohabitation, sub_dep_icd10_status, any_violence, prim_sub_freq_rec, tr_outcome, adm_motive, first_sub_used, primary_sub_mod, tipo_de_vivienda_rec2, national_foreign, plan_type_corr, occupation_condition_corr24, marital_status_rec, urbanicity_cat, ed_attainment_corr, evaluacindelprocesoteraputico, eva_consumo, eva_fam, eva_relinterp, eva_ocupacion, eva_sm, eva_fisica, eva_transgnorma, ethnicity, dg_psiq_cie_10_instudy, dg_psiq_cie_10_dg, dx_f3_mood, dx_f6_personality, dx_f_any_severe_mental, phys_dx_instudy, any_phys_dx, polysubstance_strict, readmit_time_from_disch_m, readmit_event, death_time_from_disch_m, death_event

    sex_rc  phys__  any_p_  mrtl__  evlcnd  ev_cns  eva_fm  ev_rln  ev_cpc  eva_sm  ev_fsc  ev_trn  ed_tt_  prm___  ag___2  tnr_s_  nm_tr_  tp___2  frst__  any_vl
iter 1: 0.1292  0.8634  0.0907  0.3059  0.3247  0.1513  0.2041  0.1794  0.2627  0.1515  0.2044  0.2109  0.4383  0.4172  0.8079  0.4569  0.8703  0.0831  0.3433  0.2532  
iter 2: 0.1230  0.7853  0.0908  0.3037  0.0980  0.1481  0.1869  0.1634  0.2425  0.1410  0.2004  0.2122  0.4335  0.4135  0.7237  0.4473  0.8676  0.0799  0.3417  0.2538  
iter 3: 0.1234  0.7838  0.0913  0.3037  0.0979  0.1485  0.1867  0.1620  0.2437  0.1409  0.2001  0.2109  0.4330  0.4147  0.7222  0.4475  0.8692  0.0811  0.3424  0.2534  
Check remaining missing values
Check remaining missing values
# A tibble: 0 × 0

2.3. One-hot encoding

We applied one‑hot encoding with dummyVars, and combined them with numeric predictors. Finally, we cleaned column names to produce a tidy modeling dataset, and generated a vector with dummified_vars. But also we decided to discard predictor variables with high uncertainty in the imputations, such as substance use onset age (age_subs_onset_rec2) physical diagnosis, in study (phys_dx_instudy) and the reported number of previous treatments (num_trat_ant).

The resulting database is called processed_datasets, which is a list of the imputed databases.

We also exported corrected_datasets to avoid immortal time bias in patients who were not readmitted because they died. This step is essential for accurately estimating readmission.

Code

cat("Define vector to discard dummy levels\n")

dummified_lvls_vars<- c("sex_rec_man", "tenure_status_household_others", "cohabitation_others", "sub_dep_icd10_status_hazardous_consumption", "any_violence_0_no_domestic_violence_sex_abuse", "tr_outcome_other", "adm_motive_other", "primary_sub_mod_others", "tipo_de_vivienda_rec2_formal_housing", "plan_type_corr_pg_pab", "occupation_condition_corr24_employed", "marital_status_rec_married_cohabiting", "urbanicity_cat_3_urban", "ed_attainment_corr_1_more_than_high_school", "prim_sub_freq_rec_1_1_day_wk", "first_sub_used_hallucinogens", "first_sub_used_inhalants", "first_sub_used_amphetamine_type_stimulants", "first_sub_used_others", "evaluacindelprocesoteraputico_logro_alto", "eva_consumo_logro_alto", "eva_fam_logro_alto", "eva_relinterp_logro_alto", "eva_ocupacion_logro_alto", "eva_sm_logro_alto", "eva_fisica_logro_alto", "eva_transgnorma_logro_alto")

high_uncertainty_imps<- c("age_subs_onset_rec2", "phys_dx_instudy", "num_trat_ant")

cat("Map dummification\n")
# 0. Define your factor variables dynamically (or use your manual list)
# It is safer to detect them from the first dataset to avoid typos
factor_vars <- names(filled_datasets[[1]])[sapply(filled_datasets[[1]], is.factor)]
dummy_formula <- as.formula(paste("~", paste(factor_vars, collapse = " + ")))

# Apply the transformation to EACH dataset in the list
processed_datasets <- map(filled_datasets, function(df_current) {
  # 1. Create the dummy model based on THIS specific dataset
  # (Though technically the levels should be the same, it's safer to re-run)
  dummies_model <- dummyVars(dummy_formula, data = df_current, fullRank = FALSE)
  # 2. Predict (generate) the dummies
  df_dummies_encoded <- predict(dummies_model, newdata = df_current)
  df_dummies_encoded <- as.data.frame(df_dummies_encoded)
  # 3. Combine non-factor columns with new dummy columns
  # We use dplyr::select to be safe regardless of whether it's data.table or data.frame
  numeric_part <- df_current %>% dplyr::select(-all_of(factor_vars))
  clean_dummified_df <- cbind(numeric_part, df_dummies_encoded)
  # 4. Clean names
  clean_dummified_df <- janitor::clean_names(clean_dummified_df)
  # 5. Select variables
  clean_dummified_df <- clean_dummified_df|> 
    dplyr::select(-any_of(dummified_lvls_vars))|> 
    dplyr::select(-any_of(high_uncertainty_imps))
  return(clean_dummified_df)
})

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("Export databases\n")
#To google colab to explore predictors with Coxnet

# 1. Apply Correction to Each Dataset in the List
corrected_datasets <- map(processed_datasets, function(df) {
  # Ensure columns are numeric for comparison
  df <- df|> 
    mutate(
      readmit_time = as.numeric(readmit_time_from_disch_m),
      death_time   = as.numeric(death_time_from_disch_m),
      # Ensure events are 0/1 integers
      readmit_event = as.integer(as.character(readmit_event)), 
      death_event   = as.integer(as.character(death_event))
    )
  # --- THE CORRECTION LOGIC ---
  df <- df|>
    dplyr::mutate(
      # condition: Patient died (death_event==1) AND death happened before (or at) the readmission endpoint
      is_competing_risk = (death_event == 1) & (death_time <= readmit_time),
      # 1. Update Time: If competing risk, cut follow-up at death time. Otherwise keep original.
      readmit_time_from_disch_m = dplyr::if_else(is_competing_risk, death_time, readmit_time),
      # 2. Update Event: If competing risk, censor readmission (set to 0). Otherwise keep original.
      readmit_event = dplyr::if_else(is_competing_risk, 0, readmit_event)
    ) %>%
    # Remove temporary helper columns if you want to keep the dataset clean
    dplyr::select(-readmit_time, -death_time, -is_competing_risk)
  return(df)
})

# 1. Define the destination folder
output_dir <- file.path(wdpath, "data/20241015_out/pred1")

# 2. Iterate and Export
# iwalk automatically gives you the object (.x) and its index (.y)
purrr::iwalk(corrected_datasets, function(df, i) {
  # Construct the full file path
  # e.g. .../data/20241015_out/imputation_1.parquet
  file_name <- file.path(output_dir, paste0("imputation_", i, ".parquet"))
  # Export using rio
  rio::export(df, file_name)
  message(paste("Saved:", file_name))
})

Saved: G:/My Drive/Alvacast/SISTRAT 2023///data/20241015_out/pred1/imputation_1.parquet

Saved: G:/My Drive/Alvacast/SISTRAT 2023///data/20241015_out/pred1/imputation_2.parquet

Saved: G:/My Drive/Alvacast/SISTRAT 2023///data/20241015_out/pred1/imputation_3.parquet

Saved: G:/My Drive/Alvacast/SISTRAT 2023///data/20241015_out/pred1/imputation_4.parquet

Saved: G:/My Drive/Alvacast/SISTRAT 2023///data/20241015_out/pred1/imputation_5.parquet

Code

#Export list
rio::export(processed_datasets, paste0(file.path(wdpath, "data/20241015_out/pred1"),"/processed_datasets.Rds"))

rio::export(corrected_datasets, paste0(file.path(wdpath, "data/20241015_out/pred1"),"/corrected_datasets.Rds"))

Define vector to discard dummy levels
Map dummification
Export databases

Session info

Code

#|echo: true
#|error: true
#|message: true
#|paged.print: true
message(paste0("R library: ", Sys.getenv("R_LIBS_USER")))

R library: G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32

Code

message(paste0("Date: ",withr::with_locale(new = c('LC_TIME' = 'C'), code =Sys.time())))

Date: 2026-01-05 20:48:54.721862

Code

message(paste0("Editor context: ", path))

Editor context: G:/My Drive/Alvacast/SISTRAT 2023/data/20241015_out/251001_c1_1025_df_prev1y.rds

Code

cat("quarto version: "); quarto::quarto_version()

quarto version:

[1] '1.7.29'

Code

sesion_info <- devtools::session_info()

Warning in system2(“quarto”, “-V”, stdout = TRUE, env = paste0(“TMPDIR=”, : el comando ejecutado ‘“quarto” TMPDIR=C:/Users/andre/AppData/Local/Temp/RtmpwlZmSq/file50ac14525543 -V’ tiene el estatus 1

Code

dplyr::select(
  tibble::as_tibble(sesion_info$packages),
  c(package, loadedversion, source)
)|> 
  DT::datatable(filter = 'top', colnames = c('Row number' =1,'Package' = 2, 'Version'= 3),
              caption = htmltools::tags$caption(
        style = 'caption-side: top; text-align: left;',
        '', htmltools::em('R packages')),
      options=list(
initComplete = htmlwidgets::JS(
        "function(settings, json) {",
        "$(this.api().tables().body()).css({
            'font-family': 'Helvetica Neue',
            'font-size': '70%', 
            'code-inline-font-size': '15%', 
            'white-space': 'nowrap',
            'line-height': '0.75em',
            'min-height': '0.5em'
            });",
        "}")))

Code

#|echo: true
#|error: true
#|message: true
#|paged.print: true
#|class-output: center-table

reticulate::py_list_packages()|> 
  DT::datatable(filter = 'top', colnames = c('Row number' =1,'Package' = 2, 'Version'= 3),
              caption = htmltools::tags$caption(
        style = 'caption-side: top; text-align: left;',
        '', htmltools::em('Python packages')),
      options=list(
initComplete = htmlwidgets::JS(
        "function(settings, json) {",
        "$(this.api().tables().body()).css({
            'font-family': 'Helvetica Neue',
            'font-size': '70%', 
            'code-inline-font-size': '15%', 
            'white-space': 'nowrap',
            'line-height': '0.75em',
            'min-height': '0.5em'
            });",
        "}")))

Warning in system2(python, args, stdout = TRUE): el comando ejecutado ‘“G:/My Drive/Alvacast/SISTRAT 2023/.mamba_root/envs/py311/python.exe” -m pip freeze’ tiene el estatus 1

Save

Code

wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
envpath<- if(regmatches(wdpath, regexpr("[A-Za-z]+", wdpath))=="G"){"G:/Mi unidad/Alvacast/SISTRAT 2023/"}else{"E:/Mi unidad/Alvacast/SISTRAT 2023/"}

paste0(getwd(),"/cons")
file.path(paste0(wdpath,"data/20241015_out"))
file.path(paste0(envpath,"data/20241015_out"))

# Save
rdata_path <- file.path(wdpath, "data/20241015_out", paste0("pred1_ndp_", format(Sys.time(), "%Y_%m_%d"), ".Rdata"))

save.image(rdata_path)
cat("Saved in:",
    rdata_path)

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
if (Sys.getenv("RSTUDIO_SESSION_TYPE") == "server" || file.exists("/.dockerenv")) {
  password <- Sys.getenv("PASSWORD_SECRET")
} else {
  if (interactive()) {
    utils::savehistory(tempfile())
    Sys.setenv(PASSWORD_SECRET = readLines(paste0(wdpath, "secret.txt"), warn = FALSE))
    utils::loadhistory()
  }
  Sys.setenv(PASSWORD_SECRET = readLines(paste0(wdpath, "secret.txt"), warn = FALSE))
}

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
save.image(paste0(rdata_path,".enc"))

# Encriptar el archivo en el mismo lugar
httr2::secret_encrypt_file(path = paste0(rdata_path,".enc"), key = "PASSWORD_SECRET")

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("Copy renv lock into cons folder\n")

if (Sys.getenv("RSTUDIO_SESSION_TYPE") == "server" || file.exists("/.dockerenv")) {
  message("Running on RStudio Server or inside Docker. Folder copy skipped.")

} else {
    
  source_folder <- 
  destination_folder <- paste0(wdpath,"cons/renv")
  
  # Copy the folder recursively
    file.copy(paste0(wdpath,"renv.lock"), paste0(wdpath,"cons/renv.lock"), overwrite = TRUE)
  
  message("Renv lock copy performed.")
}

Renv lock copy performed.

Code

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
time_after_dedup2<-Sys.time()

paste0("Time in markdown: ");time_after_dedup2-time_before_dedup2

[1] "G:/My Drive/Alvacast/SISTRAT 2023/cons/cons"
[1] "G:/My Drive/Alvacast/SISTRAT 2023//data/20241015_out"
[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/data/20241015_out"
Saved in: G:/My Drive/Alvacast/SISTRAT 2023///data/20241015_out/pred1_ndp_2026_01_05.RdataCopy renv lock into cons folder
[1] "Time in markdown: "
Time difference of 1.467835 hours