SENDAs Agreement 1 Update 2010-2022 (Prediction, step 1)

The dataset is enriched with derived variables, clinically meaningful recodings, and grouped diagnostic categories, while excluding patients still in treatment or with truncated admission dates. This results in a person-level dataset suitable for semi-competing risks modeling, where each row represents a unique index treatment episode with associated covariates and event times. This step involves mainly Data Pre-processing and Feature Engineering.

Author

Andrés González Santa Cruz

Published

January 5, 2026


Data Loading and Exploration

Loading Packages and uniting databases

Code
# invisible("Only run from Ubuntu")
# if (!(Sys.getenv("RSTUDIO_SESSION_TYPE") == "server" || file.exists("/.dockerenv"))) {
#   if(Sys.info()["sysname"]!="Windows"){
#     Sys.setenv(RETICULATE_PYTHON = "/home/fondecytacc/.pyenv/versions/3.11.5/bin/python")
#   }
# }

#clean enviroment
rm(list = ls()); gc()

time_before_dedup2<-Sys.time()

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

# Busca .mamba_root/envs/py311/python.exe desde getwd() hacia padres
find_python_rel <- function(start = getwd(),
                            rel = file.path(".mamba_root","envs","py311","python.exe")) {
  cur <- normalizePath(start, winslash = "/", mustWork = FALSE)
  repeat {
    cand <- normalizePath(file.path(cur, rel), winslash = "/", mustWork = FALSE)
    if (file.exists(cand)) return(cand)
    parent <- dirname(cur)
    if (identical(parent, cur)) return(NA_character_)  # llegó a la raíz
    cur <- parent
  }
}
# --- Bootstrap reticulate con ruta relativa a getwd() ---
if(Sys.info()["sysname"]!="Windows"){
  Sys.setenv(RETICULATE_PYTHON = "usr/bin/python3")
  reticulate::py_config()
} else {
  py <- find_python_rel()
  if (is.na(py)) {
    stop("No se encontró Python relativo a getwd() (buscando '.mamba_root/envs/py311/python.exe').\n",
         "Directorio actual: ", getwd())
  }
  # Forzar ese intérprete
  Sys.unsetenv(c("RETICULATE_CONDAENV","RETICULATE_PYTHON_FALLBACK"))
  Sys.setenv(RETICULATE_PYTHON = py)
  reticulate::use_python(py, required=T)
  reticulate::py_config()  # verificación
}
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

# --- Load 250927_c1_1025_df_prev1y.rds (simple, project-root aware, no functions) ---

# 1) Locate the file anywhere under the project root (current WD with a trailing "/cons" trimmed)
project_root <- sub("(/)?cons/?$", "", normalizePath(getwd(), winslash = "/", mustWork = FALSE))
fname        <- "251001_c1_1025_df_prev1y.rds"

hits <- list.files(project_root, pattern = paste0("^", fname, "$"),
                   recursive = TRUE, full.names = TRUE)

if (!length(hits)) stop("File not found: ", fname, "\nSearched under: ", project_root)
path <- hits[1]  # take first match

message("Loading: ", path)

# 2) Read into a dedicated environment (keeps .GlobalEnv clean)
env_prev1y <- new.env(parent = emptyenv())
obj        <- readRDS(path)
assign("SISTRAT23_c1_2010_2024_df_prev1y", obj, envir = env_prev1y)
rm(obj)

# 3) Quick sanity check
with(env_prev1y$SISTRAT23_c1_2010_2024_df_prev1y, { message("Rows: ", nrow(env_prev1y$SISTRAT23_c1_2010_2024_df_prev1y), " | Cols: ", ncol(env_prev1y$SISTRAT23_c1_2010_2024_df_prev1y));  print(str(env_prev1y$SISTRAT23_c1_2010_2024_df_prev1y))})
SISTRAT23_c1_2010_2024_df_prev1y<- 
env_prev1y$SISTRAT23_c1_2010_2024_df_prev1y

rm(env_prev1y)
# (Optional) If you want it in the global env, uncomment:
# list2env(env_prev1y, envir = .GlobalEnv)
          used (Mb) gc trigger (Mb) max used (Mb)
Ncells  604456 32.3    1317361 70.4   961116 51.4
Vcells 1160482  8.9    8388608 64.0  1876213 14.4
python:         G:/My Drive/Alvacast/SISTRAT 2023/.mamba_root/envs/py311/python.exe
libpython:      G:/My Drive/Alvacast/SISTRAT 2023/.mamba_root/envs/py311/python311.dll
pythonhome:     G:/My Drive/Alvacast/SISTRAT 2023/.mamba_root/envs/py311
version:        3.11.5 | packaged by conda-forge | (main, Aug 27 2023, 03:23:48) [MSC v.1936 64 bit (AMD64)]
Architecture:   64bit
numpy:           [NOT FOUND]

NOTE: Python version was forced by RETICULATE_PYTHON
Classes 'tidytable', 'tbl', 'data.table' and 'data.frame':  162897 obs. of  125 variables:
 $ TABLE                        : chr  "2023" "2015" "2023" "2013" ...
 $ TABLE_rec_series             : chr  "||20232||" "||20151||" "||20231||" "||2013||" ...
 $ rn                           : int  240474 66234 225781 44305 69194 111875 235652 213374 14494 27258 ...
 $ rn_series                    : chr  NA NA NA NA ...
 $ num_trat_ant                 : num  0 1 0 1 2 2 0 0 0 1 ...
 $ fecha_ultimo_tratamiento     : chr  NA "3 a 4 anos" NA "1 a 2 anos" ...
 $ hash_key                     : chr  "00002d6e6b95298ec21341639f62bbf9a98cd51e1b8eb858c7df1e9680e5b22b" "000068521f2408c3c6d49b287428d89bad9c198726cea177017d649d8a66c001" "0000773f8535cc999b521b1d6c9784df95dd7bf19b68e1ae94da41114d45bc59" "00015ab15b7747293d16142f7cd40abc73495f0a7390a71c14fe37265f5efde2" ...
 $ min_adm_age_rec3             : num  57.9 31.5 38.9 20.6 20.6 ...
 $ adm_age_rec3                 : num  57.9 31.5 38.9 20.6 21.7 ...
 $ birth_date_rec               : Date, format: "1965-12-18" "1982-05-23" ...
 $ adm_date_num_rec2            : num  19675 16041 19187 15974 16364 ...
 $ adm_date_rec2                : Date, format: "2023-11-14" "2013-12-02" ...
 $ dit_rec6                     : num  20 485 365 176 199 36 211 302 15 217 ...
 $ disch_date_rec6              : Date, format: "2023-12-04" "2015-04-01" ...
 $ disch_date_num_rec6_trans    : num  19695 16526 19552 16150 16563 ...
 $ def_date                     : Date, format: NA NA ...
 $ adm_motive                   : chr  "spontaneous consultation" "sanitary sector" "other" "spontaneous consultation" ...
 $ tr_compliance_rec7           : chr  "early dropout" "referral" "completion" "late dropout" ...
 $ adm_disch_reason             : chr  NA NA NA NA ...
 $ referral_type                : chr  NA "other facility" NA NA ...
 $ plan_type                    : chr  "pg-pab" "pg-pab" "pg-pai" "pg-pab" ...
 $ id_centro                    : num  255 330 501 489 489 489 409 841 341 212 ...
 $ senda                        : chr  "si" "si" "si" "si" ...
 $ pub_center                   : Factor w/ 2 levels "FALSE","TRUE": 1 2 2 2 2 2 2 2 1 2 ...
 $ primary_sub                  : chr  "alcohol" "alcohol" "marijuana" "cocaine paste" ...
 $ second_sub1                  : chr  NA NA NA "marijuana" ...
 $ second_sub2                  : chr  NA NA NA "alcohol" ...
 $ second_sub3                  : chr  NA NA NA "tranquilizers/hypnotics" ...
 $ prim_sub_freq                : chr  "3. 2 to 3 days a week" "3. 2 to 3 days a week" "3. 2 to 3 days a week" "5. Daily" ...
 $ prim_sub_route               : chr  "Oral (drunk or eaten)" "Oral (drunk or eaten)" "Smoked or pulmonary aspiration" "Smoked or pulmonary aspiration" ...
 $ LB_age_primary_onset_rec2    : int  NA NA NA 10 10 10 NA NA 9 9 ...
 $ UB_age_primary_onset_rec2    : int  NA NA NA 19 19 19 NA NA 41 41 ...
 $ age_primary_onset_rec2       : num  14 13 13 17.7 17.7 ...
 $ first_sub_used               : chr  "alcohol" "alcohol" "marijuana" "alcohol" ...
 $ sus_ini_mod_mvv              : Factor w/ 5 levels "cocaine paste",..: 3 3 4 3 3 3 4 3 3 3 ...
 $ sus_ini_1                    : chr  NA NA NA NA ...
 $ sus_ini_2                    : chr  NA NA NA NA ...
 $ sus_ini_3                    : chr  NA NA NA NA ...
 $ LB_age_subs_onset_rec2       : int  NA NA NA 5 5 5 NA NA 5 5 ...
 $ UB_age_subs_onset_rec2       : int  NA NA NA 19 19 19 NA NA 41 41 ...
 $ age_subs_onset_rec2          : int  14 13 13 10 10 10 36 13 9 9 ...
 $ biopsych_comp                : chr  "2-moderate" "2-moderate" "1-mild" "2-moderate" ...
 $ mod_psiq_cie_10              : chr  NA "retraso mental::NA" "trastornos neuroticos, secundarios a situaciones estresantes y somatomorfos::NA" NA ...
 $ mod_psiq_dsm_iv              : chr  NA NA NA NA ...
 $ diagnostico_trs_fisico       : chr  "en estudio" "en estudio" "en estudio" "en estudio" ...
 $ otros_probl_at_sm_or         : chr  NA "otros" "abuso sexual" "sin otros problemas de salud mental" ...
 $ sub_dep_icd10_status         : chr  "hazardous consumption" "drug dependence" "hazardous consumption" "hazardous consumption" ...
 $ evaluacindelprocesoteraputico: chr  "logro minimo" "logro alto" "logro alto" "logro minimo" ...
 $ eva_consumo                  : chr  "logro minimo" "logro alto" "logro alto" "logro minimo" ...
 $ eva_fam                      : chr  "logro minimo" "logro intermedio" "logro intermedio" "logro minimo" ...
 $ eva_relinterp                : chr  "logro minimo" "logro alto" "logro intermedio" "logro minimo" ...
 $ eva_ocupacion                : chr  "logro minimo" "logro alto" "logro alto" "logro minimo" ...
 $ eva_sm                       : chr  "logro minimo" "logro intermedio" "logro alto" "logro minimo" ...
 $ eva_fisica                   : chr  "logro minimo" "logro alto" "logro alto" "logro minimo" ...
 $ eva_transgnorma              : chr  "logro minimo" "logro alto" "logro alto" "logro minimo" ...
 $ dg_global_nec_int_soc_or     : chr  "altas" NA "bajas" NA ...
 $ dg_nec_int_soc_cap_hum_or    : chr  "altas" NA "bajas" NA ...
 $ dg_nec_int_soc_cap_fis_or    : chr  "altas" NA "bajas" NA ...
 $ dg_nec_int_soc_cap_soc_or    : chr  "altas" NA "bajas" NA ...
 $ dg_global_nec_int_soc_egr_or : chr  "altas" "medias" NA NA ...
 $ dg_nec_int_soc_cap_hum_egr_or: chr  "altas" "medias" "bajas" NA ...
 $ dg_nec_int_soc_cap_fis_egr_or: chr  "altas" "medias" "bajas" NA ...
 $ dg_nec_int_soc_cap_soc_egr_or: chr  "altas" "bajas" "medias" NA ...
 $ usuario_tribunal_trat_droga  : chr  "no" "no" "no" "no" ...
 $ nationality_cons             : chr  "chile" "chile" "chile" "chile" ...
 $ ethnicity_c1_c6_historic     : chr  NA NA NA NA ...
 $ discapacidad                 : chr  "no" NA "no" NA ...
 $ opcion_discapacidad          : chr  NA NA NA NA ...
 $ sex_rec                      : chr  "hombre" "hombre" "mujer" "hombre" ...
 $ identidad_de_genero          : chr  "masculino" NA "femenino" NA ...
 $ orientacion_sexual           : chr  "heterosexual" NA "heterosexual" NA ...
 $ pregnant                     : chr  "no" NA "no" NA ...
 $ pregnant_disch               : chr  "no" NA "no" NA ...
 $ marital_status               : chr  "married/cohabiting" "single" "married/cohabiting" "single" ...
 $ tiene_menores_de_edad_a_cargo: chr  "no" NA "si" NA ...
 $ num_hijos_trat_res           : num  0 0 0 0 0 0 0 0 1 0 ...
 $ numero_de_hijos              : num  2 0 2 0 0 0 4 2 1 0 ...
 $ con_quien_vive               : chr  "con la pareja, hijos y padres o familia de origen" "solo" "unicamente con la pareja e hijos" "unicamente con padres o familia de origen" ...
 $ tipo_de_vivienda             : chr  "casa" "mediagua" "casa" "casa" ...
 $ precariedad_vivienda         : chr  "reside en una vivienda en buen estado" NA "reside en una vivienda en buen estado" NA ...
 $ tenure_status_household      : chr  "owner/transferred dwellings/pays dividends" "stays temporarily with a relative" "illegal settlement" "owner/transferred dwellings/pays dividends" ...
 $ servicios_basicos_95         : chr  "reside en una vivienda con servicios sanitarios basicos" NA "reside en una vivienda con servicios sanitarios basicos" NA ...
 $ perso_dormitorio_vivienda    : chr  "menor a 2,5 personas" NA "menor a 2,5 personas" NA ...
 $ ed_attainment                : chr  "3-Completed primary school or less" "2-Completed high school or less" "2-Completed high school or less" "3-Completed primary school or less" ...
 $ occupation_condition         : chr  "employed" "unemployed" "employed" "employed" ...
 $ occupation_status            : chr  NA NA NA "salaried" ...
 $ rubro_trabaja                : chr  NA NA NA "trabajadores no calificados (ocupaciones elementales)" ...
 $ laboral_ingresos             : chr  "300.001 a 400.000" NA "400.001 a 500.000" NA ...
 $ yr_block                     : num  1965 1980 1980 1990 1990 ...
 $ OBS_series                   : chr  NA NA NA NA ...
 $ senda_series                 : chr  NA NA NA NA ...
 $ pub_center_series            : chr  NA NA NA NA ...
 $ id_centro_series             : chr  NA NA NA NA ...
 $ disch_date_num_rec6          : num  19695 16526 19552 16150 16563 ...
 $ dg_psiq_cie_10_instudy       : logi  FALSE FALSE FALSE FALSE FALSE TRUE ...
 $ dg_psiq_cie_10_dg            : logi  FALSE TRUE TRUE FALSE FALSE FALSE ...
 $ dg_psiq_dsm_iv_instudy       : logi  FALSE FALSE FALSE TRUE TRUE TRUE ...
 $ dg_psiq_dsm_iv_dg            : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ plan_type_series             : chr  "||pg-pab||" "||pg-pab||" "||pg-pai||" "||pg-pab||" ...
  [list output truncated]
 - attr(*, ".internal.selfref")=<externalptr> 
NULL
Code
#https://github.com/rstudio/renv/issues/544
#renv falls back to copying rather than symlinking, which is evidently very slow in this configuration.
renv::settings$use.cache(FALSE)

#only use explicit dependencies (in DESCRIPTION)
renv::settings$snapshot.type("implicit")

#check if rstools is installed
if(Sys.info()["sysname"]=="Windows"){
try(installr::install.Rtools(check_r_update=F))
}

Installing package into ‘G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32’ (as ‘lib’ is unspecified)

Code
check_quarto_version <- function(required = "1.7.29", comparator = c("ge","gt","le","lt","eq")) {
  comparator <- match.arg(comparator)
  current <- package_version(paste(unlist(quarto::quarto_version()), collapse = "."))
  req     <- package_version(required)

  ok <- switch(comparator,
               ge = current >= req,
               gt = current >  req,
               le = current <= req,
               lt = current <  req,
               eq = current == req)

  if (!ok) {
    stop(sprintf("Quarto version check failed: need %s %s (installed: %s).",
                 comparator, required, current), call. = FALSE)
  }
  invisible(TRUE)
}

check_quarto_version("1.7.29", "ge") 

#change repository to CL
local({
  r <- getOption("repos")
  r["CRAN"] <- "https://cran.dcc.uchile.cl/"
  options(repos=r)
})

if(!require(pacman)){install.packages("pacman");require(pacman)}

Cargando paquete requerido: pacman

Code
if(!require(pak)){install.packages("pak");require(pak)}

Cargando paquete requerido: pak

Code
pacman::p_unlock(lib.loc = .libPaths()) #para no tener problemas reinstalando paquetes

No 00LOCK detected in: G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32 No 00LOCK detected in: C:/Program Files/R/R-4.4.1/library

Code
if(Sys.info()["sysname"]=="Windows"){
if (getRversion() != "4.4.1") { stop("Requires R version 4.4.1; Actual: ", getRversion()) }
}

#check docker
check_docker_running <- function() {
  # Try running 'docker info' to check if Docker is running
  system("docker info", intern = TRUE, ignore.stderr = TRUE)
}
if(Sys.info()["sysname"]=="Windows"){
  install_docker <- function() {
    # Open the Docker Desktop download page in the browser for installation
    browseURL("https://www.docker.com/products/docker-desktop")
  }
  # Main logic
  if (inherits(try(check_docker_running(), silent = TRUE), "try-error")) {
    liftr::install_docker()
  } else {
    message("Docker is running.")
  }
}

Warning in system(“docker info”, intern = TRUE, ignore.stderr = TRUE): el comando ejecutado ‘docker info’ tiene el estatus 1

Docker is running.

Code
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#PACKAGES#######################################################################
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

unlink("*_cache", recursive=T)

# ----------------------------------------------------------------------
# 2. Use a single pak::pkg_install() call for most CRAN packages
# ----------------------------------------------------------------------

paks <-
  c(#"git", 
    # To connect to github
    "gh", #interface for  GitHub API from R
    #
    "gitcreds", # manages Git credentials (usernames, passwords, tokens)
    #
    "usethis", # simplifies common project setup tasks for R developers
    # Package to bring packages in development
    "devtools",
    # Package administration
    "renv",
    # To manipulate data
    "knitr", "pander", "DT",
    # Join
    "fuzzyjoin", "RecordLinkage",
    # For tables
    "tidyverse", "janitor",
    # For contingency tables
    "kableExtra",
    # For connections with python
    "reticulate",
    # To manipulate big data
    "polars", "sqldf",
    # To bring big databases
    "nanoparquet",
    # Interface for R and RStudio in R
    "installr", "rmarkdown", "quarto", "yaml", #"rstudioapi",
    # Time handling
    "clock",
    # Combine plots
    "ggpubr",
    # Parallelized iterative processing
    "furrr",
    # Work like a tibble with a data.table database
    "tidytable",
    # Split database into training and testing
    "caret",
    # Impute missing data
    "missRanger", "mice",
    # To modularize tasks
    "job",
    # For PhantomJS install checks
    "webshot"
  )

# dplyr
# janitor
# reshape2
# tidytable
# arrow
# boot
# broom
# car
# caret
# data.table
# DiagrammeR
# DiagrammeRsvg
# dplyr
# epiR
# epitools
# ggplot2
# glue
# htmlwidgets
# knitr
# lubridate
# naniar
# parallel
# polycor
# pROC
# psych
# readr
# rio
# rsvg
# scales
# stringr
# tableone
# rmarkdown
# biostat3
# codebook
# finalfit
# Hmisc
# kableExtra
# knitr
# devtools
# tidyr
# stringi
# stringr
# muhaz
# sqldf
# compareGroups
# survminer
# lubridate
# ggfortify
# car
# fuzzyjoin
# compareGroups
# caret
# job
# htmltools
# nanoparquet
# ggpubr
# polars
# installr
# clock
# pander
# reshape
# mice
# missRanger
# VIM
# withr
# biostat3
# broom
# glue
# finalfit
# purrr
# sf


# pak::pkg_install(paks)

pak::pak_sitrep()
# pak::sysreqs_check_installed(unique(unlist(paks)))
#pak::lockfile_create(unique(unlist(paks)),  "dependencies_duplicates24.lock", dependencies=T)
#pak::lockfile_install("dependencies_duplicates24.lock")
#https://rdrr.io/cran/pak/man/faq.html
#pak::cache_delete()

library(tidytable)

Adjuntando el paquete: ‘tidytable’

The following objects are masked from ‘package:stats’:

dt, filter, lag

The following object is masked from ‘package:base’:

%in%
Code
library(ggplot2)
library(readr)
library(glmulti)

Cargando paquete requerido: rJava

Cargando paquete requerido: leaps

Code
library(tableone)
library(survivalmodels)
#renv::install("patchwork@1.2.0")
library(survex)
library(SemiMarkov)

Cargando paquete requerido: numDeriv

Cargando paquete requerido: MASS

Adjuntando el paquete: ‘MASS’

The following object is masked from ‘package:tidytable’:

select

Cargando paquete requerido: Rsolnp

Code
library(flexsurv)

Cargando paquete requerido: survival

Code
library(rms)

Cargando paquete requerido: Hmisc

Adjuntando el paquete: ‘Hmisc’

The following object is masked from ‘package:tidytable’:

summarize

The following objects are masked from ‘package:base’:

format.pval, units
Code
library(survidm)
library(caret)

Cargando paquete requerido: lattice

Adjuntando el paquete: ‘caret’

The following object is masked from ‘package:survival’:

cluster
Code
# library(shapr)#https://norskregnesentral.github.io/shapr/
# library(survidm)#https://sci-hub.ee/10.1002/bimj.201700200
# library(SemiMarkov)#https://www.degruyterbrill.com/document/doi/10.1515/ijb-2020-0083/html?lang=en
# #https://www.jclinepi.com/article/S0895-4356(24)00142-2/fulltext

# ----------------------------------------------------------------------
# 3. Activate polars code completion (safe to try even if it fails)
# ----------------------------------------------------------------------
#try(polars_code_completion_activate())

# ----------------------------------------------------------------------
# 4. BPMN from GitHub (not on CRAN, so install via devtools if missing)
# ----------------------------------------------------------------------
if (!requireNamespace("bpmn", quietly = TRUE)) {
  devtools::install_github("bergant/bpmn")
}

# ----------------------------------------------------------------------
# 5. PhantomJS Check (use webshot if PhantomJS is missing)
# ----------------------------------------------------------------------
# if (!webshot::is_phantomjs_installed()) {
#   webshot::install_phantomjs()
# }

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#FUNCTIONS######################################################################
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#
# NO MORE DEBUGS
options(error = NULL)        # si antes tenías options(error = recover) o browser)
options(browserNLdisabled = FALSE)


#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#
#NAs are replaced with "" in knitr kable
options(knitr.kable.NA = '')

pander::panderOptions('big.mark', ',')
pander::panderOptions('decimal.mark', '.')

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#

#to format rows in bold
format_cells <- function(df, rows ,cols, value = c("italics", "bold", "strikethrough")){

  # select the correct markup
  # one * for italics, two ** for bold
  map <- setNames(c("*", "**", "~~"), c("italics", "bold", "strikethrough"))
  markup <- map[value]  

  for (r in rows){
    for(c in cols){

      # Make sure values are not factors
      df[[c]] <- as.character( df[[c]])

      # Update formatting
      df[r, c] <- ifelse(nchar(df[r, c])==0,"",paste0(markup, gsub(" ", "", df[r, c]), markup))
    }
  }

  return(df)
}
#To produce line breaks in messages and warnings
knitr::knit_hooks$set(
   error = function(x, options) {
     paste('\n\n<div class="alert alert-danger" style="font-size: small !important;">',
           gsub('##', '\n', gsub('^##\ Error', '**Error**', x)),
           '</div>', sep = '\n')
   },
   warning = function(x, options) {
     paste('\n\n<div class="alert alert-warning" style="font-size: small !important;">',
           gsub('##', '\n', gsub('^##\ Warning:', '**Warning**', x)),
           '</div>', sep = '\n')
   },
   message = function(x, options) {
     paste('<div class="message" style="font-size: small !important;">',
           gsub('##', '\n', x),
           '</div>', sep = '\n')
   }
)

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#

sum_dates <- function(x){
  cbind.data.frame(
    min= as.Date(min(unclass(as.Date(x)), na.rm=T), origin = "1970-01-01"),
    p001= as.Date(quantile(unclass(as.Date(x)), .001, na.rm=T), origin = "1970-01-01"),
    p005= as.Date(quantile(unclass(as.Date(x)), .005, na.rm=T), origin = "1970-01-01"),
    p025= as.Date(quantile(unclass(as.Date(x)), .025, na.rm=T), origin = "1970-01-01"),
    p25= as.Date(quantile(unclass(as.Date(x)), .25, na.rm=T), origin = "1970-01-01"),
    p50= as.Date(quantile(unclass(as.Date(x)), .5, na.rm=T), origin = "1970-01-01"),
    p75= as.Date(quantile(unclass(as.Date(x)), .75, na.rm=T), origin = "1970-01-01"),
    p975= as.Date(quantile(unclass(as.Date(x)), .975, na.rm=T), origin = "1970-01-01"),
    p995= as.Date(quantile(unclass(as.Date(x)), .995, na.rm=T), origin = "1970-01-01"),
    p999= as.Date(quantile(unclass(as.Date(x)), .999, na.rm=T), origin = "1970-01-01"),
    max= as.Date(max(unclass(as.Date(x)), na.rm=T), origin = "1970-01-01")
  )
}

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

# Define the function adapted for Polars
sum_dates_polars <- function(df, date_col) {
  # Create the list of quantiles
  quantiles <- c(0.001, 0.005, 0.025, 0.25, 0.5, 0.75, 0.975, 0.995, 0.999)
  # Create expressions to calculate min and max
  expr_list <- list(
    pl$col(date_col)$min()$alias("min"),
    pl$col(date_col)$max()$alias("max")
  )
  # Add expressions for quantiles
  for (q in quantiles) {
    expr_list <- append(expr_list, pl$col(date_col)$quantile(q)$alias(paste0("p", sub("\\.", "", as.character(q)))))
  }
  # Apply the expressions and return a DataFrame with the results
  df$select(expr_list)
}

# Custom function for sampling with a seed
sample_n_with_seed <- function(data, size, seed) {
  set.seed(seed)
  dplyr::sample_n(data, size)
}

# Function to get the most frequent value 
most_frequent <- function(x) { 
  uniq_vals <- unique(x)
  freq_vals <- sapply(uniq_vals, function(val) sum(x == val))
  most_freq <- uniq_vals[which(freq_vals == max(freq_vals))]
  
  if (length(most_freq) == 1) {
    return(most_freq)
  } else {
    return(NA)
  }
}
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#CONFIG #######################################################################
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

options(scipen=2) #display numbers rather scientific number

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

# Define the function first
#joins these values with semicolons and optionally truncates the result if it exceeds a specified width.
toString2 <- function(x, width = NULL, ...) {
    string <- paste(x, collapse = "; ")
    if (missing(width) || is.null(width) || width == 0) 
        return(string)
    if (width < 0) 
        stop("'width' must be positive")
    if (nchar(string, type = "w") > width) {
        width <- max(6, width)
        string <- paste0(substr(string, 1, width - 3), "...")
    }
    string
}
normalize_txt <- function(x) {
  x|>
    stringi::stri_trans_general("Latin-ASCII")|>
    tolower()|>
    trimws()
}

pair_str <- function(main, sub) {
  main <- as.character(main)
  sub  <- as.character(sub)
  both_na <- is.na(main) & is.na(sub)

  out <- paste0(
    tidytable::coalesce(main, "NA"),
    "::",
    tidytable::coalesce(sub,  "NA")
  )

  out[both_na] <- NA_character_
  out
}

# ── Helpers ────────────────────────────────────────────────────────────
mode_pick_int <- function(x){
  x <- x[!is.na(x)]
  if(length(x)==0) return(NA_integer_)
  tx <- sort(table(x), decreasing = TRUE)
  as.integer(names(tx)[1L])
}

subkey_to_label <- function(x){
  y <- gsub("_"," ", tolower(x))
  y <- gsub("amphetamine type stimulants","amphetamine-type stimulants", y)
  y <- gsub("tranquilizers hypnotics","tranquilizers/hypnotics", y)
  y
}

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# ── Encryption ────────────────────────────────────────────────────────

# Pack: [24-byte nonce][ciphertext] -> hex
encrypt_chr <- function(x, key) {
    vapply(x, function(v) {
        if (is.na(v)) return(NA_character_)
        ct <- sodium::data_encrypt(charToRaw(as.character(v)), key)
        nonce <- attr(ct, "nonce")                 # 24 bytes
        sodium::bin2hex(c(nonce, ct))              # pack nonce + ct
    }, FUN.VALUE = character(1))
}

# Unpack: hex -> [nonce|ciphertext], restore attr(nonce), then decrypt
decrypt_chr <- function(x, key) {
    vapply(x, function(v) {
        if (is.na(v)) return(NA_character_)
        buf <- sodium::hex2bin(v)
        if (length(buf) < 25L) stop("Ciphertext too short or corrupted.")
        nonce <- buf[1:24]
        ct    <- buf[-(1:24)]
        attr(ct, "nonce") <- nonce
        rawToChar(sodium::data_decrypt(ct, key))
    }, FUN.VALUE = character(1))
}

# Function to encrypt a character vector
encrypt_column <- function(x, key) {
  sapply(x, function(item) {
    if (is.na(item) || item == "") {
      return(NA_character_)
    }
    encrypted_raw <- sodium::data_encrypt(charToRaw(item), key)
    base64enc::base64encode(encrypted_raw)  # Convert to base64 for storage
  }, USE.NAMES = FALSE)
}

decrypt_column <- function(x, key) {
  sapply(x, function(item) {
    if (is.na(item)) return(NA_character_)
    encrypted_raw <- base64enc::base64decode(item)
    rawToChar(sodium::data_decrypt(encrypted_raw, key))
  }, USE.NAMES = FALSE)
}


is_stata_ok <- function(x) {
  nchar(x) <= 32 & grepl("^[A-Za-z][A-Za-z0-9_]*$", x)
}


#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# ── Tidy RMS ────────────────────────────────────────────────────────

tidy_cph <- function(model) {
  if (!inherits(model, "cph")) stop("Model must be an rms::cph object")
  
  # run summary
  s <- summary(model)
  df_s <- as.data.frame(unclass(s))
  df_s$rown <- rownames(s)
  
  # find rows that are "Hazard Ratio"
  hr_rows <- trimws(df_s$rown) == "Hazard Ratio"
  
  # build tidy tibble
  tibble::tibble(
    term     = trimws(rownames(s)[which(hr_rows) - 1L]), # previous row = variable name
    estimate = df_s$Effect[hr_rows],                     # Hazard Ratio
    conf.low = df_s$`Lower 0.95`[hr_rows],
    conf.high= df_s$`Upper 0.95`[hr_rows],
    p.value  = df_s$P[hr_rows]
  )
}

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# --- Step 1: Define File Paths ---

# SET THIS to the correct path of your secret text file
secret_file_path <- "C:/Users/andre/Documents/secret.txt"
# This locates the user-level .Renviron file automatically
renviron_path <- file.path(Sys.getenv("HOME"), ".Renviron")
# --- Step 2: Read the Secret Key from the .txt file ---
# Read the first line of the file and trim any whitespace
api_key <- trimws(readLines(secret_file_path, n = 1, warn = FALSE))
# --- Step 3: Check if the key is already in .Renviron ---
# Read all existing lines from .Renviron (if it exists)
if (file.exists(renviron_path)) {
  existing_lines <- readLines(renviron_path, warn = FALSE)
} else {
  existing_lines <- c()
}
# Check if a line for OPENAI_API_KEY already exists
key_exists <- any(grepl("^OPENAI_API_KEY=", existing_lines))
# --- Step 4: Write to .Renviron if the key isn't already there ---
if (!key_exists) {
  # Format the line to be added
  line_to_add <- sprintf('OPENAI_API_KEY="%s"', api_key)
  # Append the line to the .Renviron file, adding a newline before it if the file isn't empty
  cat(line_to_add, "\n", file = renviron_path, append = TRUE)
  message("✅ Success! OPENAI_API_KEY has been added to your .Renviron file.")
  message("Please restart your R session for the change to take effect.")
} else {
  message("ℹ️ The OPENAI_API_KEY already exists in your .Renviron file. No changes were made.")
}

ℹ️ The OPENAI_API_KEY already exists in your .Renviron file. No changes were made.

Code
options(.gander_chat = ellmer::chat_openai(model = "o1-mini"))
Error in contrib.url(repos, "source") : 
  trying to use CRAN without setting a mirror
* pak version:
- 0.8.0.1
* Version information:
- pak platform: x86_64-w64-mingw32 (current: x86_64-w64-mingw32, compatible)
- pak repository: - (local install?)
* Optional packages installed:
- pillar
* Library path:
- G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32
- C:/Program Files/R/R-4.4.1/library
* pak is installed at G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32/pak.
* Dependency versions:
- callr      3.7.6      
- cli        3.6.2      
- curl       5.2.1      
- desc       1.4.3      
- filelock   1.0.3      
- jsonlite   1.8.8      
- lpSolve    5.6.23.9000
- pkgbuild   1.4.4      
- pkgcache   2.2.2.9000 
- pkgdepends 0.7.2.9000 
- pkgsearch  3.1.3.9000 
- processx   3.8.4      
- ps         1.7.6      
- R6         2.5.1      
- zip        2.3.1      
* Dependencies can be loaded


Note

Data Pre-processing and Feature Engineering

Outliers in continuous variables will be managed using Tukey’s method, where values beyond 1.5 times the interquartile range (IQR) from the first or third quartile are flagged for review.

The analysis will incorporate 14 predictor variables, including three continuous variables. To model non-linear relationships, we will generate transformed variables, including logarithmic (log(x)), quadratic (x2), and cubic (x^3) terms. Additionally, some continuous variables may be discretized into categorical features.

If required, continuous predictors will be zero-centered and scaled by a factor to improve the model’s computational performance and numerical stability.

Aims

  • To identify key prognostic and predictive factors associated with the risk of, and time to, the first readmission to treatment among patients aged 18-64 undergoing initial treatment for Substance Use Disorder (SUD) in Chile between 2010 and 2020.

  • To identify key prognostic and predictive factors associated with the risk of, and time to, all-cause mortality among the same cohort of patients undergoing initial treatment for SUD in Chile.

  • To predict the risk and timing of first treatment readmission and all-cause mortality simultaneously, treating these outcomes within a semi-competing risks framework. This will allow for the estimation of cumulative incidence for each event in the presence of the other for patients aged 18-64 who initiated SUD treatment in Chile between 2010 and 2020.

This study has three primary objectives. First, we aim to identify key prognostic and predictive factors associated with the risk and timing of two critical outcomes: (i) first readmission to treatment and (ii) all-cause mortality. Second, using a semi-competing risks framework, we will develop a model to simultaneously predict these two outcomes, accounting for the fact that death precludes the possibility of readmission. Finally, this model will be used to estimate the probability of various patient trajectories over time following initial treatment. The study population includes all patients aged 18-64 who began treatment for Substance Use Disorder (SUD) in Chile between 2010 and 2020.


1. Structure of treatments and rules to collapse continuous entries

The variables involved in the analysis are the following:.

  • Time to first readmission from admission readmit_time_from_adm_m / from discharge readmit_time_from_disch_m in months, and event (readmit_event)
  • Time to mortality death_time_from_adm_m / from discharge death_time_from_disch_m in months, and event (death_event) Predisposing factors:
  • Admission age adm_age_rec3 [log1p(x)=adm_age_log, centered= adm_age_c,^2=adm_age_pow2, ^3=adm_age_pow3, three groups= adm_age_rec3_cat]
  • Sex (often used for biological categorization) sex_rec
  • Housing situation tenure_status_household_rec
  • Employment status occupation_condition_corr24
  • Marital status marital_status_rec
  • Poverty index of the commune of residence porc_pobr [log1p(x)=porc_pobr_log, centered= porc_pobr_c, six quantile groups= porc_pobr_c_cat6]
  • Urbanization level of the commune of residence urbanicity_cat
  • Educational attainment ed_attainment_corr
  • Living arrangement/Cohabitation status cohabitation

Need factors: * Psychiatric comorbidity dg_psiq_cie_10_instudy + dg_psiq_cie_10_dg * Severity of Substance Use Disorder (SUD) sub_dep_icd10_status * Primary substance of use primary_sub_mod * Frequency of primary substance use at admission prim_sub_freq (recoded: prim_sub_freq_rec) * Polysubstance use polysubstance_strict

Enabling factors: * Length of stay in treatment (months) dit_rec6 [log1p(x)=treat_log, ^2=treat_days_pow2, ^3=treat_days_pow3, <90days=treat_lt_90] * Treatment outcome tr_outcome * Admission motive adm_motive

Suitable for a research context:.

  • Court-referred to drug treatment usuario_tribunal_trat_droga (very unbalanced, ~1% yes)
  • Type of housing tipo_de_vivienda_rec

Stratification variables:.

  • Treatment modality plan_type_corr

1.1. Cohort Construction for Time-to-Event Analysis

First, we flagged all qualifying treatment episodes by filtering for patients aged 18–65 with an admission date between 2010 and 2020. To ensure each patient served as their own baseline, we isolated only the first qualifying treatment for each individual, removing any second or subsequent qualifying treatments from the dataset. Using this first episode as the index event, we then built time-to-event variables in months for our outcomes of interest: death and readmission. All observations were administratively censored as of December 31, 2020, to all observations. The final analytical dataset included only the first qualifying treatment per patient, carrying forward the time-to-event information and a counter of any discarded prior treatments.

  • What can I do with adm truncated (0.2%), and currently in (0.1%)??

We created a formatted database in wide format for multistate analysis (SISTRAT23_c1_2010_2024_df_prev1z).

Code
# --- Safe helpers (no warnings) ---
safe_min_where_int  <- function(x, flag) {
  idx <- which(flag %in% TRUE & !is.na(x))
  if (length(idx) == 0L) NA_integer_ else as.integer(min(x[idx]))
}
safe_min_where_date <- function(x, flag) {
  idx <- which(flag %in% TRUE & !is.na(x))
  if (length(idx) == 0L) as.Date(NA) else min(x[idx])
}
safe_min_date <- function(x) {
  x <- x[!is.na(x)]
  if (length(x) == 0L) as.Date(NA) else min(x)
}

# Windows of interest
.start_date  <- as.Date("2010-01-01")
.end_date    <- as.Date("2020-12-31")
.censor_date <- as.Date("2020-12-31")

# 0) Start & canonical order + treatment index
df0 <- SISTRAT23_c1_2010_2024_df_prev1y|>
  tidytable::ungroup()|>
  tidytable::arrange(hash_key, adm_date_rec2, disch_date_rec6)|>
  tidytable::mutate(treatment = tidytable::row_number(), .by = hash_key)

n0_cases <- nrow(df0)
n0_runs  <- nrow(tidytable::distinct(df0, hash_key))

message("0. INITIAL database, cases: ", formatC(nrow(df0), big.mark=","), sep = "")
  1. INITIAL database, cases: 162,897
Code
message("0. INITIAL database, RUNs: " , formatC(nrow(tidytable::distinct(df0, hash_key)), big.mark=","), sep = "")
  1. INITIAL database, RUNs: 121,299
Code
# 1) Qualifying flag (age 18–65 & admission in 2010–2020 inclusive)
df1 <- df0|>
  tidytable::mutate(
    meets_crit = !is.na(adm_age_rec3)  & adm_age_rec3 >= 18 & adm_age_rec3 <= 64 &
                 !is.na(adm_date_rec2) & adm_date_rec2 >= .start_date & adm_date_rec2 <= .end_date
  )

# First qualifying treatment per patient + counter of discarded priors (no warnings)
first_keep_tbl <- df1|>
  tidytable::summarise(
    first_keep_treat = safe_min_where_int(treatment, meets_crit),
    .by = hash_key
  )|>
  tidytable::mutate(
    n_discarded_before = ifelse(is.na(first_keep_treat), NA_integer_,
                                as.integer(first_keep_treat - 1L))
  )

n_patients_meet <- first_keep_tbl|> tidytable::filter(!is.na(first_keep_treat))|> nrow()
message("1. Patients with ≥1 qualifying admission (age 18–64 & 2010–2020): ",
        formatC(n_patients_meet, big.mark=","), sep = "")
  1. Patients with ≥1 qualifying admission (age 18–64 & 2010–2020): 88,632
Code
# 2) Keep ONLY qualifying episodes; rank; drop 3rd+
df_q <- df1|>
  tidytable::filter(meets_crit)|>
  tidytable::arrange(hash_key, adm_date_rec2, disch_date_rec6)|>
  tidytable::mutate(q_rank = tidytable::row_number(), .by = hash_key)|>
  tidytable::filter(q_rank <= 2L)

n1b_cases <- nrow(df_q)
n1b_patients <- nrow(tidytable::distinct(df_q, hash_key))
message("1b. AFTER filtering to qualifying episodes, cases: ",
        formatC(nrow(df_q), big.mark=","), sep = "")

1b. AFTER filtering to qualifying episodes, cases: 107,702

Code
message("1b. AFTER filtering to qualifying episodes, RUNs: ",
        formatC(nrow(tidytable::distinct(df_q, hash_key)), big.mark=","), sep = "")

1b. AFTER filtering to qualifying episodes, RUNs: 88,632

Code
message("2. AFTER dropping ≥3rd qualifying treatments per patient, cases: ",
        formatC(nrow(df_q), big.mark=","), sep = "")
  1. AFTER dropping ≥3rd qualifying treatments per patient, cases: 107,702
Code
message("2. AFTER dropping ≥3rd qualifying treatments per patient, RUNs: ",
        formatC(nrow(tidytable::distinct(df_q, hash_key)), big.mark=","), sep = "")
  1. AFTER dropping ≥3rd qualifying treatments per patient, RUNs: 88,632
Code
# 3) Anchors per patient (warning-free)
anchors <- df_q|>
  tidytable::summarise(
    first_adm   = safe_min_where_date(adm_date_rec2,   q_rank == 1L),
    first_disch = safe_min_where_date(disch_date_rec6, q_rank == 1L),
    second_adm  = safe_min_where_date(adm_date_rec2,   q_rank == 2L),  # NA if no second qualifying
    .by = hash_key
  )

# Death date per patient (from full data, not only qualifying rows)
death_tbl <- df0|>
  tidytable::summarise(death_date = safe_min_date(def_date), .by = hash_key)

# 4) Build per-person time-to-event (no min() calls here)
perperson <- df_q|>
  tidytable::distinct(hash_key)|>
  tidytable::left_join(anchors,       by = "hash_key")|>
  tidytable::left_join(death_tbl,     by = "hash_key")|>
  tidytable::left_join(first_keep_tbl, by = "hash_key")|>
  tidytable::mutate(
    # Event indicators with censoring at 2020-12-31
    readmit_event = as.integer(!is.na(second_adm) & second_adm <= .censor_date),
    death_event   = as.integer(!is.na(death_date)  & death_date  <= .censor_date),

    # End times for intervals (choose event time or censor date)
    readmit_end_from_adm   = ifelse(readmit_event == 1L, second_adm, .censor_date),
    readmit_end_from_disch = ifelse(readmit_event == 1L, second_adm, .censor_date),
    death_end_from_adm     = ifelse(death_event   == 1L, death_date, .censor_date),
    death_end_from_disch   = ifelse(death_event   == 1L, death_date, .censor_date),

    # Durations (months) from first qualifying admission/discharge
    readmit_time_from_adm_m =
      ifelse(!is.na(first_adm),
        as.numeric(lubridate::time_length(lubridate::interval(first_adm,   readmit_end_from_adm),   "months")),
        NA_real_),
    readmit_time_from_disch_m =
      ifelse(!is.na(first_disch),
        as.numeric(lubridate::time_length(lubridate::interval(first_disch, readmit_end_from_disch), "months")),
        NA_real_),
    death_time_from_adm_m =
      ifelse(!is.na(first_adm),
        as.numeric(lubridate::time_length(lubridate::interval(first_adm,   death_end_from_adm),     "months")),
        NA_real_),
    death_time_from_disch_m =
      ifelse(!is.na(first_disch),
        as.numeric(lubridate::time_length(lubridate::interval(first_disch, death_end_from_disch),   "months")),
        NA_real_)
  )|>
  # Clamp negatives (dirty dates safeguard)
  tidytable::mutate(
    readmit_time_from_adm_m   = ifelse(!is.na(readmit_time_from_adm_m),   pmax(readmit_time_from_adm_m,   0), NA_real_),
    readmit_time_from_disch_m = ifelse(!is.na(readmit_time_from_disch_m), pmax(readmit_time_from_disch_m, 0), NA_real_),
    death_time_from_adm_m     = ifelse(!is.na(death_time_from_adm_m),     pmax(death_time_from_adm_m,     0), NA_real_),
    death_time_from_disch_m   = ifelse(!is.na(death_time_from_disch_m),   pmax(death_time_from_disch_m,   0), NA_real_)
  )|>
  tidytable::mutate(discarded_before_flag = as.integer(n_discarded_before > 0L))

# 5) FINAL: keep only the first qualifying treatment row per patient, carry times/counters
SISTRAT23_c1_2010_2024_df_prev1z <-
  df_q|>
  tidytable::filter(q_rank == 1L)|>
  tidytable::left_join(perperson, by = "hash_key")|>
  tidytable::select(-meets_crit, -q_rank)|>
  (\(df){
  
    n_cases   <- nrow(df)
    n_runs    <- tidytable::distinct(df, hash_key) |> nrow()
    red_cases <- nrow(df0) - n_cases
    red_runs  <- (tidytable::distinct(df0, hash_key) |> nrow()) - n_runs
    readmit   <- sum(df$readmit_event)
  
    # store (step 3)
    n_cases   ->> m3_cases
    n_runs    ->> m3_runs
    red_cases ->> m3_cases_red
    red_runs  ->> m3_runs_red
    list(cases=n_cases, runs=n_runs, readmit=readmit, cases_red=red_cases, runs_red=red_runs) ->> m3
  
    message("3. FINAL: first qualifying treatment per patient (with time-to-event), cases: ",
            formatC(n_cases, big.mark=","), sep = "")
    message("3. FINAL: RUNs: ", formatC(n_runs, big.mark=","), sep = "")
    message("   CASES reduction vs initial(prev1y): ", formatC(red_cases, big.mark=","), sep = "")
    message("   RUNs  reduction vs initial(prev1y): ", formatC(red_runs,  big.mark=","), sep = "")
  
    if (n_cases > nrow(df0)) stop("Error: Added treatment episodes in the process")
    df
  })() |>
  tidytable::filter(!tr_compliance_rec7 == "currently in") |>
  (\(df){
  
    n_cases   <- nrow(df)
    n_runs    <- tidytable::distinct(df, hash_key) |> nrow()
    red_cases <- nrow(df0) - n_cases
    red_runs  <- (tidytable::distinct(df0, hash_key) |> nrow()) - n_runs
  
    # store (step 4)
    n_cases   ->> m4_cases
    n_runs    ->> m4_runs
    red_cases ->> m4_cases_red
    red_runs  ->> m4_runs_red
    list(cases=n_cases, runs=n_runs, cases_red=red_cases, runs_red=red_runs) ->> m4
  
    message("4. FINAL: leaving out patients currently in tr., cases: ",
            formatC(n_cases, big.mark=","), sep = "")
    message("4. FINAL: RUNs: ", formatC(n_runs, big.mark=","), sep = "")
    message("   CASES reduction vs initial(prev1y): ", formatC(red_cases, big.mark=","), sep = "")
    message("   RUNs  reduction vs initial(prev1y): ", formatC(red_runs,  big.mark=","), sep = "")
  
    df
  })()
  1. FINAL: first qualifying treatment per patient (with time-to-event), cases: 88,632
  1. FINAL: RUNs: 88,632

CASES reduction vs initial(prev1y): 74,265

RUNs reduction vs initial(prev1y): 32,667

  1. FINAL: leaving out patients currently in tr., cases: 88,504
  1. FINAL: RUNs: 88,504

CASES reduction vs initial(prev1y): 74,393

RUNs reduction vs initial(prev1y): 32,795

Code
#SISTRAT23_c1_2010_2024_df_prev1z |> filter(is.na(dit_m)) |> janitor::tabyl(tr_compliance_rec7, show_na = T)
SISTRAT23_c1_2010_2024_df_prev1z$porc_pobr_c_cat6<- cut2(SISTRAT23_c1_2010_2024_df_prev1z$porc_pobr_c,g=6)

cat("Remove intermediate databases\n")
rm(df0); rm(df1); rm(df_q); rm(anchors); rm(death_tbl); rm(first_keep_tbl)
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_


# --- choose your time origin for the multistate ---
.use_discharge_origin <- FALSE   # TRUE = origin at first_disch; FALSE = origin at first_adm

# Add origin_date
if (.use_discharge_origin) {
  perperson <- tidytable::mutate(SISTRAT23_c1_2010_2024_df_prev1z, origin_date = first_disch)
} else {
  perperson <- tidytable::mutate(SISTRAT23_c1_2010_2024_df_prev1z, origin_date = first_adm)
}

# 1) Drop patients who die before the origin (they never enter the process)
n_drop_predeath <- tidytable::filter(perperson, !is.na(death_date) & !is.na(origin_date) & death_date < origin_date) |>
  nrow()
message("A. Excluding patients who died before origin: ",
        formatC(n_drop_predeath, big.mark=","), sep = "")

A. Excluding patients who died before origin: 0

Code
stopifnot(
nrow(tidytable::filter(perperson, is.na(death_date) | is.na(origin_date) | death_date >= origin_date))== nrow(SISTRAT23_c1_2010_2024_df_prev1z)
)
# 2) Keep only those whose origin is observed and within study window (optional but recommended)
n_before <- nrow(SISTRAT23_c1_2010_2024_df_prev1z)
perperson2 <- tidytable::filter(perperson, !is.na(origin_date) & origin_date <= .censor_date)
message("B. Excluding missing/after-censor origins: ",
        formatC(n_before - nrow(perperson2), big.mark=","), sep = "")

B. Excluding missing/after-censor origins: 0

Code
# 3) Ensure monotone times: second_adm must be after origin (else reclassify as 'no readmission')
# perperson <- tidytable::mutate(perperson,
#     invalid_readmit = !is.na(second_adm) & second_adm <= origin_date,
#     second_adm      = ifelse(invalid_readmit, as.Date(NA), second_adm),
#     readmit_event   = ifelse(invalid_readmit, 0L, readmit_event)
#   )

rm(perperson); rm(perperson2)
Remove intermediate databases

1.1.* Flowchart

Code
#For SISTRAT23_c1_2010_2024_df_prev1t
try(load(paste0(dirname(path),"/SISTRAT_database_sep25_prevt.RData")))

#n_patients_meet;n1b_cases; n1b_patients; m4; m3

df23_ndp_20250824_SISTRAT23_c1_1024<- 
 readRDS(
   paste0(dirname(path),"/psu/23_ndp_20250824_SISTRAT23_c1_1024_df2.rds")
 )

# Labels
lab_orig <- paste0(
  "Original C1 Dataset \n(n = ",
  formatC(nrow(df23_ndp_20250824_SISTRAT23_c1_1024), format = "f", big.mark = ",", digits = 0),
  "; Patients = ",
  formatC(dplyr::n_distinct(df23_ndp_20250824_SISTRAT23_c1_1024$hash_key), format = "f", big.mark = ",", digits = 0),
  ")"
)

lab_orig_clean <- paste(
  "Pre-processing & Quality Checks",
  "&#8226; Remove exact duplicate records",
  "&#8226; Resolve overlapping episodes (keep longest / merge)",
  "&#8226; Fix inconsistent admission/discharge dates",
  "&#8226; Correct implausible birth dates/ages",
  "&#8226; Remove negative/implausible days in treatment",
"",
  sep = "\\l"
)
lab_post_orig  <- paste0('C1 Dataset \n(n = ', formatC(nrow(SISTRAT23_c1_2010_2024_df_prev1t), format='f', big.mark=',', digits=0), '; Patients = ',formatC(dplyr::distinct(SISTRAT23_c1_2010_2024_df_prev1t, hash_key)|> nrow(), format='f', big.mark=',', digits=0),')')
lab_post_orig_clean <- paste("&#8226; Collapse consecutive linked", "treatments (≤45d gap, referral=yes)", sep = "\\l")


lab_proc  <- paste0('C1 Cleaned & Collapsed\n(n = ', formatC(nrow(SISTRAT23_c1_2010_2024_df_prev1y), format='f', big.mark=',', digits=0), '; Patients = ',formatC(dplyr::distinct(SISTRAT23_c1_2010_2024_df_prev1y, hash_key)|> nrow(), format='f', big.mark=',', digits=0),')')
lab_flag  <- "Sorted by Admission Date\n+ First Treatment Flag"


lab_discard_first <- paste(
  "&#8226; Qualifying flag (age 18–64 & admission in 2010–2020 inclusive)",
  "&#8226; Dropping ≥3rd qualifying treatments per patient",
  paste0("n= ", formatC(nrow(SISTRAT23_c1_2010_2024_df_prev1y)-n1b_cases, format = "f", digits = 0, big.mark = ","),"; Patients= ", formatC(dplyr::distinct(SISTRAT23_c1_2010_2024_df_prev1t, hash_key)|> nrow()- n1b_patients, format = "f", digits = 0, big.mark = ",")," put aside"),
"",sep = "\\l")
##CAMBIAR

lab_after <- paste0("After First-Treatment Rule\nn= ",formatC(m3$readmit+m3$cases, format= "f", big.mark=",", digits=0),"; Patients= ", formatC(m3$cases, format= "f", big.mark=",", digits=0))  # example counts
#Frpm 3. FINAL: first qualifying treatment per patient (with time-to-event), cases: 
lab_discard_single <- paste( 
  "1st qualifying treatment per patient (with time-to-event)",
  paste0("Patients = ", formatC(m3$cases-nrow(SISTRAT23_c1_2010_2024_df_prev1z), format = "f", big.mark = ",", digits = 0)), sep="\\l")

lab_final <- paste0("Final C1 Dataset\nn = ",
                    formatC(nrow(SISTRAT23_c1_2010_2024_df_prev1z)+sum(SISTRAT23_c1_2010_2024_df_prev1z$readmit_event), format = "f", digits = 0, big.mark = ","),
                    "; Patients = ",
                    formatC(nrow(SISTRAT23_c1_2010_2024_df_prev1z), format = "f", digits = 0, big.mark = ","))
# Diagram
gr <- DiagrammeR::grViz(
    paste0(
        'digraph flowchart {
      graph [layout = dot, rankdir = TB]

      node [fontname = "Times", shape = rectangle, fontsize = 33, style = filled, fillcolor = white, ranksep=0.2, nodesep=0.2]

      # New initial box and pre-clean note
      pre_original   [label = "', lab_orig, '", fillcolor = lightgray, shape = box]
      pre_clean      [label = "', lab_orig_clean, '", shape = note, fillcolor = white]

      # Intermediate new initial box and pre-clean note
      pre2_original   [label = "', lab_post_orig, '", fillcolor = white, shape = box]
      pre2_clean      [label = "', lab_post_orig_clean, '", shape = note, fillcolor = lemonchiffon]

      # Existing boxes
      original       [label = "', lab_proc, '", fillcolor = lightgray, shape = box]
      marked         [label = "', lab_flag, '", shape = box]
      after_rule     [label = "', lab_after, '", shape = box]
      final_dataset  [label = "', lab_final, '", fillcolor = lightgray, shape = box]

      discard_first   [label = "', lab_discard_first, '", shape = note, fillcolor = mistyrose]
      discard_single  [label = "', lab_discard_single, '", shape = note, fillcolor = mistyrose]

      # Invisible points for alignment
      v00 [shape = point, width = 0, style = invis]  # between orig and post orig
      v0 [shape = point, width = 0, style = invis]  # between pre_original and original      
      v1 [shape = point, width = 0, style = invis]
      v2 [shape = point, width = 0, style = invis]
      v3 [shape = point, width = 0, style = invis]

      # Main flow (add pre_original -> v0 -> original in front)
      pre_original -> v00 [arrowhead = none]
      v00 -> pre2_original
      
      pre2_original -> v0 [arrowhead = none]
      v0 -> original      

      original -> v1 [arrowhead = none]
      v1 -> marked
      marked -> v2 [arrowhead = none]
      v2 -> after_rule
      after_rule -> v3 [arrowhead = none]
      v3 -> final_dataset

      # Discard / note connections with solid black arrows
      v00 -> pre_clean     [color = black]     # new pre-processing note
      v0 -> pre2_clean     [color = black]     # new pre-processing note
      v2 -> discard_first [color = black]
      v3 -> discard_single[color = black]

      # Alignment
      { rank = same; pre_clean; v00 }
      { rank = same; pre2_clean; v0 }
      { rank = same; discard_first; v2 }
      { rank = same; discard_single; v3 }
    }'
    ),
  width = 600, height = 900
)

gr

Figure 3. Flowchart

Exported the graph into png fortmat.

Code
unlink(paste0(gsub("/cons","",getwd()),"/cons/_figs/_flowchart_pred_files"), recursive = TRUE)
htmlwidgets::saveWidget(gr, paste0(gsub("/cons","",getwd()),"/cons/_figs/_flowchart_pred.html"))
webshot::webshot(paste0(gsub("/cons","",getwd()),"/cons/_figs/_flowchart_pred.html"), 
                 paste0(gsub("/cons","",getwd()),"/cons/_figs/_flowchart_pred.png"),
                 vwidth = 300, vheight = 300*1.5,  zoom=10, expand=100)  # Prueba con diferentes coordenadas top, left, width, and height

Registered S3 methods overwritten by ‘callr’: method from format.callr_status_error
print.callr_status_error

1.2. Death decomposition

We decomposed the cumulative incidence of death following an initial treatment episode by its temporal relationship to the first SUD treatment readmission.

Cumulative incidence functions (CIFs) were estimated using the Aalen–Johansen method, and total mortality was also calculated via Kaplan–Meier to validate consistency. The resulting decomposition shows how overall mortality is partitioned over time relative to readmission, visualized as stacked cumulative probabilities.

Code
# ──────────────────────────────────────────────────────────────────────────────
# 0) Base y nombres
df2 <- SISTRAT23_c1_2010_2024_df_prev1z |>
  dplyr::rename(
    t_r = readmit_time_from_disch_m,
    e_r = readmit_event,
    t_d = death_time_from_disch_m,
    e_d = death_event
  ) |>
  # Sanitiza tipos
  dplyr::mutate(
    e_r = as.integer(e_r),
    e_d = as.integer(e_d)
  )

# Chequeo rápido (opcional)
stopifnot(nrow(df2) > 0)
# t_d debe existir para todos (muerte o fin de seguimiento)
stopifnot(all(is.finite(df2$t_d)))

# ──────────────────────────────────────────────────────────────────────────────
# 1) Definir causas de MUERTE (mutuamente excluyentes) y tiempo de análisis
#    1 = muere antes/igual que la 1ª readmisión
#    2 = muere después de la 1ª readmisión
#    0 = censurado (no muere)
df3 <- df2 |>
  dplyr::mutate(
    cause = dplyr::case_when(
      e_d == 1L & (e_r == 0L | t_d <= t_r) ~ 1L,   # muerte antes/igual que readmisión
      e_d == 1L &  e_r == 1L & t_r <  t_d  ~ 2L,   # readmisión primero, luego muerte
      TRUE ~ 0L                                    # no muere (censura)
    ),
    # Tiempo de análisis:
    #   - muertes: t_d (tiempo a muerte)
    #   - censura: t_d (tiempo a última observación)
    ftime2 = t_d
  )

# Consistencia con tu 2x2
# with(df3, table(cause, e_d))
# Esperado: cause=1 -> 3203 muertes; cause=2 -> 744 muertes

# ──────────────────────────────────────────────────────────────────────────────
# 2) CIF de muerte por causa (competing risks)
ci2 <- cmprsk::cuminc(ftime = df3$ftime2, fstatus = df3$cause, cencode = 0)

# Helper robusto: toma el ÚLTIMO número del nombre (la causa), ignora "Tests"
get_cif_by_code <- function(ci, code = 1L){
  keep <- vapply(ci, function(x) !is.null(x$est), logical(1))
  nm   <- names(ci)[keep]
  ci   <- ci[keep]
  codes <- suppressWarnings(as.integer(sub(".*[^0-9]([0-9]+)$", "\\1", nm)))
  idx   <- which(!is.na(codes) & codes == code)[1]
  if (!length(idx) || is.na(idx)) return(data.frame(time = numeric(0), est = numeric(0)))
  data.frame(time = ci[[idx]]$time, est = ci[[idx]]$est)
}

cif_d_before <- get_cif_by_code(ci2, 1L)  # muerte antes/igual que readmisión
cif_d_after  <- get_cif_by_code(ci2, 2L)  # muerte después de readmisión

# ──────────────────────────────────────────────────────────────────────────────
# 3) KM de muerte total (chequeo externo)
km <- survival::survfit(survival::Surv(time = df3$ftime2, event = df3$e_d) ~ 1)
km_df <- data.frame(time = km$time, P_dead_total = 1 - km$surv)

# ──────────────────────────────────────────────────────────────────────────────
# 4) Comparación a horizontes fijos (12, 60, 120 y tiempo final observado)
t_end <- max(df3$ftime2, na.rm = TRUE)
taus  <- c(12, 60, 120, t_end)

risk_at <- function(fit, tau) 1 - summary(fit, times = tau, extend = TRUE)$surv
AJ_at   <- function(cif_df, tau){
  if (!nrow(cif_df)) return(rep(0, length(tau)))
  # Aproximación por función escalón (respeta naturaleza de AJ)
  sf <- stepfun(cif_df$time, c(0, cif_df$est))
  sf(tau)
}

horiz <- data.frame(
  tau_m       = taus,
  KM_death    = sapply(taus, risk_at, fit = km),
  CIF_before  = AJ_at(cif_d_before, taus),
  CIF_after   = AJ_at(cif_d_after,  taus)
) |>
  dplyr::mutate(total_CIF = CIF_before + CIF_after)

# print(horiz)
# Esperado: KM_death ≈ total_CIF en cada tau (pequeñas diferencias numéricas)

# ──────────────────────────────────────────────────────────────────────────────
# 5) Serie completa para gráfico (con stepfun; evita errores de interpolación)
sf_km <- stepfun(km_df$time,             c(0, km_df$P_dead_total))
sf_c1 <- stepfun(cif_d_before$time,      c(0, cif_d_before$est))
sf_c2 <- stepfun(cif_d_after$time,       c(0, cif_d_after$est))

t_grid <- sort(unique(c(km_df$time, cif_d_before$time, cif_d_after$time)))
out <- data.frame(
  time             = t_grid,
  CIF_death_before = sf_c1(t_grid),
  CIF_death_after  = sf_c2(t_grid),
  P_dead_total     = sf_km(t_grid)
)
out$sum_cifs <- out$CIF_death_before + out$CIF_death_after

# Chequeo final (debe acercarse al KM final ~ 9%)
# tail(out[, c("time","CIF_death_before","CIF_death_after","sum_cifs","P_dead_total")], 3)

# ──────────────────────────────────────────────────────────────────────────────
# 6) Gráfico: descomposición correcta de la mortalidad total
ggplot(out, aes(x = time)) +
  geom_ribbon(aes(ymin = 0, ymax = CIF_death_before,
                  fill = "Muerte antes de readmisión"), alpha = 0.55) +
  geom_ribbon(aes(ymin = CIF_death_before, ymax = sum_cifs,
                  fill = "Muerte después de readmisión"), alpha = 0.55) +
  geom_step(aes(y = P_dead_total, color = "Mortalidad total (KM)"),
            linewidth = 0.9) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  scale_fill_manual(values = c("Muerte antes de readmisión"   = "#A17C6C",
                               "Muerte después de readmisión" = "#5FA9B3")) +
  scale_color_manual(values = c("Mortalidad total (KM)" = "#660600")) +
  labs(x = "Meses desde el alta", y = "Probabilidad acumulada",
       fill = NULL, color = NULL,
       title = "Mortalidad acumulada: descomposición por relación con readmisión",
       subtitle = "CIF de muerte antes vs. después de la primera readmisión; línea = KM total") +
  theme_minimal(base_size = 16) +
  theme(legend.position = "bottom")
Code
# ──────────────────────────────────────────────────────────────────────────────

# CIF de "primer evento" (1=readmisión, 2=muerte), con tiempos al primer evento
ftime_first  <- pmin(df2$t_r, df2$t_d, na.rm = TRUE)
fstatus_first <- dplyr::case_when(
    df2$e_r == 1L & (df2$e_d == 0L | df2$t_r <  df2$t_d) ~ 1L, # readmisión primero
    df2$e_d == 1L & (df2$e_r == 0L | df2$t_d <= df2$t_r) ~ 2L, # muerte primero
    TRUE ~ 0L
)
ci_first <- cmprsk::cuminc(ftime = ftime_first, fstatus = fstatus_first, cencode = 0)
# readmisión como primer evento a 12m:
tp_first <- cmprsk::timepoints(ci_first, times = 12)
tp_first$est[,"12"]

# ──────────────────────────────────────────────────────────────────────────────
# Graficar
p <- out|>
    ggplot2::ggplot(ggplot2::aes(x = time)) +
    ggplot2::geom_ribbon(ggplot2::aes(ymin = 0, ymax = CIF_death_before,
                                      fill = "Death before readmission"), alpha = 0.5) +
    ggplot2::geom_ribbon(ggplot2::aes(ymin = CIF_death_before, ymax = P_dead_total,
                                      fill = "Death after readmission"), alpha = 0.5) +
    ggplot2::geom_step(ggplot2::aes(y = P_dead_total, color = "Total death (KM)"),
                       linewidth = 0.7) +
    ggplot2::geom_step(ggplot2::aes(y = CIF_death_before, color = "CIF of death before readmission"),
                       linewidth = 0.7) +
    ggplot2::scale_y_continuous(labels = scales::percent_format(accuracy = 1),
                                limits = c(0, max(out[,"sum_cifs"]))) +
    ggplot2::scale_fill_manual(values = c("Death before readmission"= "#a6cee3",
                                          "Death after readmission"= "#b2df8a")) +
    ggplot2::scale_color_manual(values = c("Total death (KM)"= "#1f78b4", "Death before readmission"= "#666666")) +
    ggplot2::labs(x = "Months since discharge", y = "Cumulative probability",
                  fill = NULL, color = NULL) +
    ggplot2::theme_minimal(base_size = 20) +
    ggplot2::theme(legend.position = "bottom")

print(p)
Code
ggsave(
  filename = paste0(gsub("/cons","",getwd()), "/cons/_figs/_death_decomposition.png"),
   plot = p +
    labs(x = "Meses desde el egreso", y = "Probabilidad acumulada",
         fill = NULL, color = NULL)+
      scale_fill_manual(
        values = c(
          "Death before readmission" = "#a6cee3",
          "Death after readmission" = "#B48448"
        ),
        labels = c(
          "Death before readmission" = "Muerte antes de la readmisión",
          "Death after readmission" = "Muerte después de la readmisión"
        )
      ) +
      scale_color_manual(
        values = c(
          "Total death (KM)" = "#5A0D13",
          "CIF of death before readmission" = "#666666"
        ),
        labels = c(
          "Total death (KM)" = "Muerte total (KM)",
          "CIF of death before readmission" = "CIF de muerte antes de la readmisión"
        )
      )+ ggplot2::theme(
    legend.position = "bottom",
    legend.box = "horizontal",
    legend.text = ggplot2::element_text(size=16, margin=margin(l=0, r=-3, unit="pt")),
    legend.title = ggplot2::element_blank(),
    legend.spacing.x = grid::unit(0.2, "cm"),
    legend.spacing.y = grid::unit(0.2, "cm"),
    legend.key.width = grid::unit(1.0, "cm")
  ) +
  ggplot2::guides(
    fill = ggplot2::guide_legend(nrow = 2, byrow = TRUE, keywidth = unit(0.5, "cm")),
    color = ggplot2::guide_legend(nrow = 2, byrow = TRUE, keywidth = unit(0.5, "cm"))
  )+ ggplot2::scale_x_continuous(
  breaks = scales::breaks_width(12),
  labels = function(m) paste0(m, "m")    # o en años: paste0(m/12, " años")
),
    dpi = 600
)

Saving 7 x 5 in image Scale for fill is already present. Adding another scale for fill, which will replace the existing scale. Scale for colour is already present. Adding another scale for colour, which will replace the existing scale.

A total of 1,126 patients (1.3%) had valid treatments between 18-64 in their second or third treatment.

1.3. Rule-Based Aggregation into ICD-10–Informed groups

To reduce sparsity and improve interpretability, we grouped specific diagnostic labels into four clinically coherent supergroups aligned with ICD-10 logic: (1) Infectious diseases (STIs, viral hepatitis, and infections related to substance use); (2) Organ-system medical diseases (cardiovascular, hepatic, oral, and hematologic conditions); (3) Injuries and sequelae; and (4) Other specified medical conditions (broad/vague descriptors such as “enfermedades somáticas”, “otras enfermedades…”, and pregnancy-related items). We preserved the status values “en estudio” and “sin trastorno” as-is, and left missing values as NA. For multi-response entries (diagnostico_trs_fisico_series), we parsed “||”-separated items, applied the same mapping to each token, deduplicated within-row, and produced both a collapsed supergroup string and one-hot indicators for the supergroups to avoid inflating counts while preventing sparse categories.

Code
icd10_broad <- function(x) {
x <- tolower(x)
tidytable::case_when(
stringr::str_detect(x, "organic|organico|demenc|alzheimer|parkinson|delirium|cerebral") ~ "F0 Organic",
stringr::str_detect(x, "psicoactiva|alcohol|marihuana|canabis|cannabis|cocain|opio|opiace|benzodiazep|sustancias") ~ "F1 Substance use",
stringr::str_detect(x, "esquizofren|psicotip|delirant|psicosis") ~ "F2 Psychotic",
stringr::str_detect(x, "estado de animo|afectiv|depres|bipolar|maniaco|distimia|hipoman") ~ "F3 Mood",
stringr::str_detect(x, "neurotic|ansiedad|fobi|panico|obsesivo|compulsiv|estres|adaptaci|somatoform|somatiz") ~ "F4 Anxiety/Stress/Somatoform",
stringr::str_detect(x, "comportamiento.*fisiolog|alimentari|anorex|bulim|sueñ|insomni|disfuncion sexual") ~ "F5 Physio/Eating/Sleep/Sexual",
stringr::str_detect(x, stringr::regex("personalidad|comportamiento del adulto|antisocial|limite|evitativ|habit|habitos|impuls|control de los impulsos|control\\s+de\\s+impulsos", ignore_case = TRUE)) ~ "F6 Personality/Adult behaviour",
stringr::str_detect(x, "retraso mental|discapacidad intelectual|intelectual") ~ "F7 Intellectual disability",
stringr::str_detect(x, "desarrollo psicolog|autism|asperger|lenguaje|aprendizaje|espectro autista|tdah|t\\s*d\\s*a\\s*h") ~ "F8/9 Neurodevelopment/Child",
TRUE ~ "Other/unspecified"
)
}

cat("#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:\n")
cat("Create dummied for psychiatric comorbidity\n")
cat("#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:\n")
cat("Living arrangement/Cohabitation status\n")
cat("#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:\n")
cat("Domestic violence & Sex abuse\n")
cat("#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:\n")
cat("Primary subs\n")
cat("#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:\n")
cat("Tr outcome\n")
cat("#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:\n")

# Create multi-label dummies using tidytable
SISTRAT23_c1_2010_2024_df_prev1z <- 
  SISTRAT23_c1_2010_2024_df_prev1z|>
  tidytable::mutate(dx_norm = normalize_txt(mod_psiq_cie_10))|>
  tidytable::mutate(dit_m = readmit_time_from_adm_m-readmit_time_from_disch_m)|>
  # avoid NAs propagating into str_detect
  tidytable::mutate(dx_norm_nna = ifelse(is.na(dx_norm), "", dx_norm))|>
  tidytable::mutate(
    f0_organic                         = stringr::str_detect(dx_norm_nna, "mentales organicos"),
    f1_substance_use                   = stringr::str_detect(dx_norm_nna, "sustancias psicoactivas|consumo de sustancias|psicoactiva"),
    f2_psychotic                       = stringr::str_detect(dx_norm_nna, "esquizofren|esquizotip|ideas? delirant|psicosis"),
    f3_mood                            = stringr::str_detect(dx_norm_nna, "humor \\(afectivos\\)"),
    f4_anxiety_stress_somatoform       = stringr::str_detect(dx_norm_nna, "neuroticos|estresantes|somatomorfos"),
    f5_physio_eating_sleep_sexual      = stringr::str_detect(dx_norm_nna, "conducta alimentaria|disfunciones fisiologicas|factores somaticos"),
    f6_personality_adult_behaviour     = stringr::str_detect(dx_norm_nna, "personalidad y del comportamiento del adulto|habitos y del control de los impulsos|transformacion persistente de la personalidad"),
    f7_intellectual_disability         = stringr::str_detect(dx_norm_nna, "retraso mental|discapacidad intelectual"),
    f8_9_neurodevelopment_child        = stringr::str_detect(dx_norm_nna, "desarrollo psicolog|comienzo habitual en la infancia y adolescencia")
  )|>
  tidytable::mutate(marital_status_rec= tidytable::case_when(marital_status== "separated/divorced/annulled"~ "separated/divorced/annulled/widowed", marital_status=="widowed"~"separated/divorced/annulled/widowed", T~ marital_status))|>
  tidytable::mutate(
  cqv_norm=con_quien_vive|>stringi::stri_trans_general("Latin-ASCII")|>
    stringr::str_to_lower()|>stringr::str_squish())|>
  tidytable::mutate(
    # 5-level (more granular)
    con_quien_vive_joel5=dplyr::case_when(
      # Alone
      stringr::str_detect(cqv_norm,"^solo$")~"Alone",
      # Couple/children only (no family-of-origin)
      stringr::str_detect(cqv_norm,"^unicamente con (la )?pareja( e hijos)?$")|
        stringr::str_detect(cqv_norm,"^unicamente con pareja$")|
        stringr::str_detect(cqv_norm,"^unicamente con hijos$")~
        "Couple/children only",
      # Couple/children + family-of-origin (extended household)
      stringr::str_detect(cqv_norm,"(pareja|hijos).*(padres|familia de origen)")|
        stringr::str_detect(cqv_norm,"^con la pareja,? hijos y padres")~
        "Couple/children + origin",
      # Family-of-origin only
      stringr::str_detect(cqv_norm,"^unicamente con padres|^con padres|familia de origen$")|
        stringr::str_detect(cqv_norm,"^con la madre \\(sola\\)$")|
        stringr::str_detect(cqv_norm,"^con abuelos$")|
        stringr::str_detect(cqv_norm,"^con hermanos$")~
        "Family of origin",
      # Others (friends / other non-relatives / other relative)
      stringr::str_detect(cqv_norm,"con amigos|otro no pariente|otro pariente|^otros$")~
        "Others",
      TRUE~"Others"))|>
  tidytable::mutate(
    # 4-level (collapsed)
    cohabitation=dplyr::case_when(
      con_quien_vive_joel5=="Alone"~"alone",
      con_quien_vive_joel5 %in% c("Couple/children only","Couple/children + origin")~"with couple/children",
      con_quien_vive_joel5=="Family of origin"~"family of origin",
      TRUE~"Others"
    ))|>
  tidytable::mutate(
    tenure_status_household_rec= tidytable::case_when(grepl("illegal|others",tenure_status_household)~"illegal settlement & others", T~tenure_status_household)
  )|>
  tidytable::mutate(
    cohabitation=forcats::fct_relevel(
      factor(cohabitation),
      c("alone","with couple/children","family of origin","others")
    ))|>
  tidytable::select(-tidytable::any_of(c("cqv_norm", "con_quien_vive_joel5", "dx_norm_nna")))|>
  tidytable::mutate(dom_violence=factor(tidytable::case_when(
      grepl("Violencia Intrafamiliar$",otros_probl_at_sm_or, ignore.case=T)~1,
      is.na(otros_probl_at_sm_or)~NA_real_,
      T~0),levels=c(0,1),labels=c("No domestic violence","Domestic violence")))|>
    tidytable::mutate(sex_abuse= factor(tidytable::case_when(
      grepl("Abuso Sexual",otros_probl_at_sm_or, ignore.case=T)~1,
      is.na(otros_probl_at_sm_or)~NA_real_,
      T~0),levels=c(0,1),labels=c("No sexual abuse","Sexual abuse")))|>
    tidytable::mutate(
        primary_sub_std = stringr::str_to_lower(stringr::str_squish(primary_sub)),
        primary_sub_mod = tidytable::case_when(
            stringr::str_detect(primary_sub_std, "cocaine\\s*(base\\s*)?paste|pasta\\s*base|\\bpaco\\b") ~ "cocaine paste",
            stringr::str_detect(primary_sub_std, "cocaine\\s*(powder|hydrochloride|hcl)") |
                stringr::str_detect(primary_sub_std, "clorhidrato\\s*de\\s*coca|clorhidrato|hidrocloruro") ~ "cocaine powder",
            stringr::str_detect(primary_sub_std, "\\balcohol\\b") ~ "alcohol",
            stringr::str_detect(primary_sub_std, "marij|cannab|marihu") ~ "marijuana",
            TRUE ~ "others"
        ))|>
    tidytable::mutate(primary_sub_mod = factor(primary_sub_mod,
                                    levels = c("cocaine paste","cocaine powder","alcohol","marijuana","others")))|>
  tidytable::select(-tidytable::any_of(c("primary_sub_std", "dx_norm")))|>
  tidytable::mutate(urbanicity_cat= tidytable::case_when(grepl("Mix",clasificacion)~ "2.Mixed", grepl("Urb",clasificacion)~ "3.Urban", grepl("Rur",clasificacion)~ "1.Rural", T~NA_character_))|>
  tidytable::mutate(ethnicity= ifelse(!is.na(ethnicity_c1_c6_historic),1,0))|>
    tidytable::mutate(
        tr_outcome = tidytable::case_when(
            tr_compliance_rec7 == "currently in" & is.na(adm_disch_reason) ~ NA_character_,
            stringr::str_detect(coalesce(tr_compliance_rec7, ""), "dropout") & is.na(adm_disch_reason) ~ "dropout",
            # explicit non-discharge statuses with NA reason
            tr_compliance_rec7 == "referral"    & is.na(adm_disch_reason) ~ "referral",
            tr_compliance_rec7 == "completion"  & is.na(adm_disch_reason) ~ "completion",
            # discharge branches (note the parentheses)
            stringr::str_detect(coalesce(tr_compliance_rec7, ""), "discharge") &
                adm_disch_reason %in% c("agreement_end", "death", "no_local_service") ~ "adm discharge - adm reasons",
            stringr::str_detect(coalesce(tr_compliance_rec7, ""), "discharge") &
                (adm_disch_reason == "rule_violation" | is.na(adm_disch_reason)) ~ "adm discharge - rule violation/undet",
            TRUE ~ "other"
        ))

Warning: 1 unknown level in f: others

Code
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
cat("#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:\n")
cat("Recode Type of housing, Admission age, Treatment days, Tr. compliance; Nationallity (Chile); \n")
cat("#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:\n")

SISTRAT23_c1_2010_2024_df_prev1z <- within(SISTRAT23_c1_2010_2024_df_prev1z, {
  tipo_de_vivienda_rec <- factor(tidytable::case_when(
    tipo_de_vivienda %in% c("casa", "departamento") ~ "formal housing",
    tipo_de_vivienda == "pieza dentro de la vivienda" ~ "shared/secondary unit",
    tipo_de_vivienda%in% c("caleta o punto de calle", "mediagua movil (carpa, casa rodante o similar)", "hospederia", "residencial, pension, hostal", "choza, rancho, ruca", "informal/rural housing") ~ "homeless/unsheltered/informal/temporary housing/institutional/collective",
    is.na(tipo_de_vivienda)~ NA_character_,
    TRUE ~ "other/unknown"  # catches "otro" and NA
    ),
  levels = c(
    "formal housing",
    "shared/secondary unit",
    "homeless/unsheltered/informal/temporary housing/institutional/collective",
    "other/unknown"
    ))
})

SISTRAT23_c1_2010_2024_df_prev1z <- within(SISTRAT23_c1_2010_2024_df_prev1z, {
  prim_sub_freq_rec <- factor(tidytable::case_when(
    prim_sub_freq %in% c("1. Less than 1 day a week", "2. 1 day a week")~ "1.≤1 day/wk",
    prim_sub_freq %in% c("3. 2 to 3 days a week", "4. 4 to 6 days a week")~ "2.2–6 days/wk",
    is.na(prim_sub_freq)~ NA_character_,
    TRUE ~ "3.Daily"  # catches "otro" and NA
    ),
  levels = c(
    "1.≤1 day/wk",
    "2.2–6 days/wk",
    "3.Daily"
    ))
})

SISTRAT23_c1_2010_2024_df_prev1z <- within(SISTRAT23_c1_2010_2024_df_prev1z, {
  tipo_de_vivienda_rec2 <- factor(tidytable::case_when(
    tipo_de_vivienda %in% c("casa", "departamento") ~ "formal housing",
    # tipo_de_vivienda == "pieza dentro de la vivienda" ~ "shared/secondary unit",
    # tipo_de_vivienda%in% c("caleta o punto de calle", "mediagua movil (carpa, casa rodante o similar)", "hospederia", "residencial, pension, hostal", "choza, rancho, ruca", "informal/rural housing") ~ "homeless/unsheltered/informal/temporary housing/institutional/collective",
    is.na(tipo_de_vivienda)~ NA_character_,
    TRUE ~ "other/unknown"  # catches "otro" and NA
    ),
  levels = c(
    "formal housing",
    # "shared/secondary unit",
    # "homeless/unsheltered/informal/temporary housing/institutional/collective",
    "other/unknown"
    ))
})

#Admission age
SISTRAT23_c1_2010_2024_df_prev1z$adm_age_log <- with(SISTRAT23_c1_2010_2024_df_prev1z, 
  ifelse(is.na(adm_age_rec3), NA_real_, log1p(adm_age_rec3)))

SISTRAT23_c1_2010_2024_df_prev1z$adm_age_c <- with(SISTRAT23_c1_2010_2024_df_prev1z, 
  ifelse(is.na(adm_age_rec3), NA_real_, scale(adm_age_rec3, center = TRUE, scale = FALSE)))

#Warning in log1p(adm_age_rec3) : Se han producido NaNs
SISTRAT23_c1_2010_2024_df_prev1z$adm_age_pow2 <- with(SISTRAT23_c1_2010_2024_df_prev1z, 
  ifelse(is.na(adm_age_rec3), NA_real_, adm_age_rec3^2))
SISTRAT23_c1_2010_2024_df_prev1z$adm_age_pow3 <- with(SISTRAT23_c1_2010_2024_df_prev1z, 
  ifelse(is.na(adm_age_rec3), NA_real_, adm_age_rec3^3))

# Admission Age categories
SISTRAT23_c1_2010_2024_df_prev1z$adm_age_cat <- with(
    SISTRAT23_c1_2010_2024_df_prev1z,
    {
        res <- rep(NA_character_, length(adm_age_rec3))
        res[adm_age_rec3 >= 18 & adm_age_rec3 < 30] <- "18-29"
        res[adm_age_rec3 >= 30 & adm_age_rec3 < 45] <- "30-44"
        res[adm_age_rec3 >= 45 & adm_age_rec3 < 65] <- "45-64"
        res
    }
)
SISTRAT23_c1_2010_2024_df_prev1z$adm_age_cat <- factor(
  SISTRAT23_c1_2010_2024_df_prev1z$adm_age_cat,
  levels = c("18-29", "30-44", "45-64")
)
SISTRAT23_c1_2010_2024_df_prev1z$adm_age_c <- scale(SISTRAT23_c1_2010_2024_df_prev1z$adm_age_rec3, center = TRUE, scale = FALSE)


invisible("Created the 90 days translated in months by subsetting and taking the average value")
dit90<- 
mean(SISTRAT23_c1_2010_2024_df_prev1z$dit_m[which(SISTRAT23_c1_2010_2024_df_prev1z$dit_rec6==90)])

# Treatment duration transformations
SISTRAT23_c1_2010_2024_df_prev1z$treat_log <- with(SISTRAT23_c1_2010_2024_df_prev1z, 
  ifelse(is.na(dit_m), NA_real_, log1p(dit_m)))
#Warning in log1p(dit_m) : Se han producido NaNs
SISTRAT23_c1_2010_2024_df_prev1z$treat_days_pow2 <- with(SISTRAT23_c1_2010_2024_df_prev1z, 
  ifelse(is.na(dit_m), NA_real_, dit_m^2))
SISTRAT23_c1_2010_2024_df_prev1z$treat_days_pow3 <- with(SISTRAT23_c1_2010_2024_df_prev1z, 
  ifelse(is.na(dit_m), NA_real_, dit_m^3))
SISTRAT23_c1_2010_2024_df_prev1z$treat_lt_90 <- with(SISTRAT23_c1_2010_2024_df_prev1z, 
  ifelse(is.na(dit_m), NA_integer_, as.integer(dit_m < dit90)))

SISTRAT23_c1_2010_2024_df_prev1z <- within(SISTRAT23_c1_2010_2024_df_prev1z, {
  sex_rec <- dplyr::case_when(
    sex_rec == "mujer" ~ "woman",
    sex_rec == "hombre" ~ "man",
    T~ NA_character_
  )
  sex_rec <- factor(sex_rec, levels = c(
    "man",
    "woman"
  ))
})
SISTRAT23_c1_2010_2024_df_prev1z <- within(SISTRAT23_c1_2010_2024_df_prev1z, {
  nationality_chile <- dplyr::case_when(
    nationality_cons== "chile" ~ "chile",
    !nationality_cons== "chile" ~ "other",
    T~ NA_character_
  )
  nationality_chile <- factor(nationality_chile, levels = c(
    "chile",
    "other"
  ))
})

SISTRAT23_c1_2010_2024_df_prev1z <- within(SISTRAT23_c1_2010_2024_df_prev1z, {
  any_violence <- dplyr::case_when(
    sex_abuse== "Sexual abuse"| dom_violence=="Domestic violence"~ "1.Domestic violence/sex abuse",
    grepl("No", sex_abuse) & grepl("No", dom_violence)~ "0.No domestic violence/sex abuse",
    grepl("No", sex_abuse) & is.na(dom_violence)~ "0.No domestic violence/sex abuse",
    is.na(sex_abuse) & grepl("No", dom_violence)~ "0.No domestic violence/sex abuse",
    T~ NA_character_
  )
  any_violence <- factor(any_violence, levels = c(
    "1.Domestic violence/sex abuse",
    "0.No domestic violence/sex abuse"
  ))
})
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

# 1) Define supergroup sets and a vectorized classifier that keeps NAs as NA
infectious_set <- c(
  "ets",
  "hepatitis b, c, d",
  "infecciosas relacionadas con uso de sustancias"
)
organ_set <- c(
  "cardiopatias: miocardiopatia dilatada por oh, arritmias, hta",
  "hepatitis alcoholica subaguda",
  "hepatitis cronica",
  "patologia bucal",
  "anemia: megaloblastica y ferropenica"
)
injury_set <- c("traumatismos y secuelas secundarios")
other_spec_set <- c(
  "otras enfermedades o condiciones fisicas limitantes",
  "otras enfermedades o condiciones de riesgo vital",
  "enfermedades somaticas",
  "patologia de la gestion y del nino intrauterino"
)
supergroups_levels <- c(
  "en estudio",
  "sin trastorno",
  "Infectious diseases (STIs, viral hepatitis, substance-use–related)",
  "Organ-system medical diseases (cardio, hepatic, oral, hematologic)",
  "Injuries and sequelae",
  "Other specified medical conditions"
)
classify_dx_vec <- function(x) {
  x <- stringr::str_squish(x)
  out <- rep(NA_character_, length(x))
  # statuses (kept as-is)
  out[x == "en estudio"] <- "en estudio"
  out[x == "sin trastorno"] <- "sin trastorno"
  # infectious
  out[x %in% infectious_set] <- "Infectious diseases (STIs, viral hepatitis, substance-use–related)"
  # organ/system
  out[x %in% organ_set] <- "Organ-system medical diseases (cardio, hepatic, oral, hematologic)"
  # injuries
  out[x %in% injury_set] <- "Injuries and sequelae"
  # other specified (broad/vague)
  out[x %in% other_spec_set] <- "Other specified medical conditions"
  # funnel any unforeseen non-NA labels to "Other specified"
  out[is.na(out) & !is.na(x)] <- "Other specified medical conditions"
  out
}
# 2) Apply to the single-diagnosis field; keep is.na(.) as NA
SISTRAT23_c1_2010_2024_df_prev1z <- SISTRAT23_c1_2010_2024_df_prev1z|>
  dplyr::mutate(
    phys_dx_supergroup = classify_dx_vec(diagnostico_trs_fisico),
    phys_dx_supergroup = forcats::fct_relevel(
      factor(phys_dx_supergroup),
      supergroups_levels))
# 3) Apply to the multi-response series (split on "||", classify, dedup per row)
#    - Produce: a collapsed string per row and wide one-hot indicators
multi_tokens <- SISTRAT23_c1_2010_2024_df_prev1z|>
  dplyr::mutate(.rowid = dplyr::row_number(),
                dx_series_raw = diagnostico_trs_fisico_series)|>
  tidyr::separate_rows(dx_series_raw, sep = "\\|\\|")|>
  dplyr::mutate(
    dx_token = stringr::str_squish(dx_series_raw),
    dx_token = dplyr::na_if(dx_token, ""),
    dx_token = dplyr::na_if(dx_token, "NA"),
    supergroup_token = classify_dx_vec(dx_token)
  )|>
  dplyr::distinct(.rowid, supergroup_token, .keep_all = TRUE)
# Collapsed string per row (NA if nothing classifiable present)
series_collapsed <- multi_tokens|>
  dplyr::filter(!is.na(supergroup_token))|>
  dplyr::group_by(.rowid)|>
  dplyr::summarise(
    dx_supergroup_series = paste(sort(unique(supergroup_token)), collapse = "||"),
    .groups = "drop"
  )
# One-hot indicators per supergroup (sanitized column names)
series_dummies <- multi_tokens|>
  dplyr::filter(!is.na(supergroup_token))|>
  dplyr::mutate(value = 1L)|>
  dplyr::distinct(.rowid, supergroup_token, .keep_all = TRUE)|>
  tidyr::pivot_wider(
    names_from = supergroup_token,
    values_from = value,
    values_fill = list(value = 0)
  )|>
  dplyr::rename_with(.fn = base::make.names, .cols = - .rowid)|> 
  dplyr::select(dplyr::any_of(c(".rowid", "hash_key", "en.estudio", "Other.specified.medical.conditions", "Organ.system.medical.diseases..cardio..hepatic..oral..hematologic.", "sin.trastorno", "Injuries.and.sequelae", "Infectious.diseases..STIs..viral.hepatitis..substance.use.related.")))|> 
  janitor::clean_names()|>
  dplyr::rename("phys_dg_org_med_card_hep_oral_hem"="organ_system_medical_diseases_cardio_hepatic_oral_hematologic", "phys_dg_infect_viral_hep_sud_rel"="infectious_diseases_st_is_viral_hepatitis_substance_use_related", "phys_dg_instudy"="en_estudio", "phys_dg_other_med_cond"="other_specified_medical_conditions", "phys_dg_inj_sequelae"="injuries_and_sequelae")|>
  dplyr::group_by(hash_key)|>
  dplyr::summarise(
    phys_dx_instudy = sum(phys_dg_instudy, na.rm = TRUE),
    phys_dx_other_spec_medical_cond = sum(phys_dg_other_med_cond, na.rm = TRUE),
    phys_dx_organ_system_med_dis = sum(phys_dg_org_med_card_hep_oral_hem, na.rm = TRUE),
    phys_dx_injuries_and_sequelae = sum(phys_dg_inj_sequelae, na.rm = TRUE),
    phys_dx_infectious_diseases = sum(phys_dg_infect_viral_hep_sud_rel, na.rm = TRUE),
    .groups = "drop"
  )|>
  # Optional: Reorder columns for clarity (hash_key first, then dummies)
  dplyr::select(hash_key, dplyr::everything())
# 4) Join back to your main data (keeps rows with NA series as NA)
SISTRAT23_c1_2010_2024_df_prev1z <- SISTRAT23_c1_2010_2024_df_prev1z|>
  dplyr::mutate(.rowid = dplyr::row_number())|>
  dplyr::left_join(series_collapsed, by = ".rowid", multiple="first")|>
  dplyr::left_join(series_dummies,  by = "hash_key", multiple="first")|>
  dplyr::select(-.rowid)|> 
  dplyr::mutate(
    # assuming logical columns like dx_F0, dx_F2, ..., dx_F8_9
    dx_f2_smi_psychotic   = as.integer(f2_psychotic),
    dx_f3_mood            = as.integer(f3_mood),
    dx_f45_anx_stress_phys = as.integer(f4_anxiety_stress_somatoform| f5_physio_eating_sleep_sexual),
    dx_f6_personality     = as.integer(f6_personality_adult_behaviour),
    dx_f0789_neurocog_dev    = as.integer(f0_organic | f7_intellectual_disability | f8_9_neurodevelopment_child)
    # optional: single collapsed factor, with priority
  )

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
hist(table(round(SISTRAT23_c1_2010_2024_df_prev1z$dit_m,1), exclude=NULL), breaks=120, main= "Histogram: Months in treatment", xlab="Months")

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
Create dummied for psychiatric comorbidity
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
Living arrangement/Cohabitation status
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
Domestic violence & Sex abuse
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
Primary subs
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
Tr outcome
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
Recode Type of housing, Admission age, Treatment days, Tr. compliance; Nationallity (Chile); 
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

1.4. Missingness

We assessed missing data across predictors, calculating the overall proportion of complete cases and percentage of missing values for each variable.

Code
# Character vector of the variables
tot_variables <- c(
  "readmit_time_from_adm_m", "readmit_time_from_disch_m", "readmit_event",
  "death_time_from_adm_m", "death_time_from_disch_m", "death_event",
  "sex_rec", "adm_age_log", "adm_age_pow2", "adm_age_pow3", "adm_age_c", "adm_age_cat", "tenure_status_household", "occupation_condition_corr24",
  "marital_status_rec", "porc_pobr", "porc_pobr_log", "porc_pobr_c",
  "ethnicity", "nationality_chile", "urbanicity_cat", "ed_attainment_corr", "cohabitation",
  "dg_psiq_cie_10_instudy", "dg_psiq_cie_10_dg",  "f0_organic", "f1_substance_use", "f2_psychotic",
  "f3_mood", "f4_anxiety_stress_somatoform", "f5_physio_eating_sleep_sexual",
  "f6_personality_adult_behaviour", "f7_intellectual_disability", "f8_9_neurodevelopment_child", 
  "dx_f2_smi_psychotic", "dx_f3_mood", "dx_f45_anx_stress_phys", "dx_f6_personality", "dx_f0789_neurocog_dev",
  "sub_dep_icd10_status", "dom_violence", "sex_abuse", "any_violence",
  "phys_dx_supergroup","phys_dx_instudy", "phys_dx_other_spec_medical_cond", "phys_dx_organ_system_med_dis", 
  "phys_dx_injuries_and_sequelae", "phys_dx_infectious_diseases",
  "prim_sub_freq", "prim_sub_freq_rec", "polysubstance_strict", "dit_m", "treat_log",
  "treat_days_pow2", "treat_days_pow3", "treat_lt_90", "tr_outcome",
  "adm_motive", "primary_sub", "primary_sub_mod", "usuario_tribunal_trat_droga",
  "tipo_de_vivienda_rec", "tipo_de_vivienda_rec2", "plan_type_corr", "evaluacindelprocesoteraputico", 
  
  "eva_consumo", "eva_fam", "eva_relinterp", "eva_ocupacion", "eva_sm", "eva_fisica", "eva_transgnorma"
)
# Character vector of the variables restricted to factor or character columns
char_variables <- c(
  "sex_rec", "tenure_status_household", "occupation_condition_corr24",
  "marital_status_rec", "ethnicity", "urbanicity_cat", "ed_attainment_corr", "cohabitation",
  "dg_psiq_cie_10_instudy", "dg_psiq_cie_10_dg", "f0_organic", "f1_substance_use", "f2_psychotic",
  "f3_mood", "f4_anxiety_stress_somatoform", "f5_physio_eating_sleep_sexual",
  "f6_personality_adult_behaviour", "f7_intellectual_disability", "f8_9_neurodevelopment_child", 
  "dx_f2_smi_psychotic", "dx_f3_mood", "dx_f45_anx_stress_phys", "dx_f6_personality", "dx_f0789_neurocog_dev",
  "sub_dep_icd10_status", "dom_violence", "sex_abuse", "any_violence",
  "phys_dx_supergroup", "phys_dx_instudy", "phys_dx_other_spec_medical_cond", "phys_dx_organ_system_med_dis", 
  "phys_dx_injuries_and_sequelae", "phys_dx_infectious_diseases",
  "prim_sub_freq", "prim_sub_freq_rec", "polysubstance_strict", "treat_lt_90", "tr_outcome",
  "adm_motive", "primary_sub", "primary_sub_mod", "usuario_tribunal_trat_droga",
  "tipo_de_vivienda_rec", "tipo_de_vivienda_rec2", "plan_type_corr", "evaluacindelprocesoteraputico",
  "eva_consumo", "eva_fam", "eva_relinterp", "eva_ocupacion", "eva_sm", "eva_fisica", "eva_transgnorma"
)

paste0( "Percentage of the total that has missing values: ",
  scales::percent(1- (
      SISTRAT23_c1_2010_2024_df_prev1z[
        which(complete.cases(subset(SISTRAT23_c1_2010_2024_df_prev1z, select = tot_variables
            ))),] %>% nrow() / nrow(SISTRAT23_c1_2010_2024_df_prev1z) 
    ), accuracy = 0.1),
  "; total = ", format(nrow(SISTRAT23_c1_2010_2024_df_prev1z), big.mark = ",")
)
paste0( "Percentage of the total that has missing values (w/o other variables): ",
  scales::percent(1- (
      SISTRAT23_c1_2010_2024_df_prev1z[
        which(complete.cases(subset(SISTRAT23_c1_2010_2024_df_prev1z, select = setdiff(tot_variables,   c("usuario_tribunal_trat_droga", "dom_violence", "sex_abuse", "any_violence", "ethnicity", "tipo_de_vivienda_rec", "tipo_de_vivienda_rec2", "phys_dx_supergroup", "phys_dx_instudy", "phys_dx_other_spec_medical_cond", "phys_dx_organ_system_med_dis", "phys_dx_injuries_and_sequelae", "evaluacindelprocesoteraputico"))
            ))),] %>% nrow() / nrow(SISTRAT23_c1_2010_2024_df_prev1z) 
    ), accuracy = 0.1),
  "; total = ", format(nrow(SISTRAT23_c1_2010_2024_df_prev1z), big.mark = ",")
)

# Variables to check
vars_check <- c(tot_variables)
# Compute % missing per variable
missing_pct <- colMeans(
  is.na(subset(SISTRAT23_c1_2010_2024_df_prev1z, select = vars_check))
) * 100
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# Named vector of labels (names = variable names)
labels_map <- c(
  # Times & events
  readmit_time_from_adm_m         = "Readmission time from admission (months)",
  readmit_time_from_disch_m       = "Readmission time from discharge (months)",
  death_time_from_adm_m           = "Death time from admission (months)",
  death_time_from_disch_m         = "Death time from discharge (months)",
  readmit_event                   = "Readmission event",
  death_event                     = "Death event",

  # Demographics & status
  sex_rec                         = "Sex (biological categorization)",
  tenure_status_household         = "Housing situation",
  tenure_status_household_rec     = "Housing situation (- informal settlement)",
  occupation_condition_corr24     = "Employment status",
  marital_status_rec              = "Marital status",
  ed_attainment_corr              = "Educational attainment",
  ethnicity                       = "Ethnicity (=1)",
  nationality_chile               = "Nationality (Chile)",
  adm_age_log                     = "Admission age (log)",
  adm_age_pow2                    = "Admission age ^2",
  adm_age_pow3                    = "Admission age ^3",
  adm_age_c                       = "Admission age (centered)",
  adm_age_cat                     = "Admission age (three groups)",

  # Contextual
  porc_pobr                       = "Poverty index of commune of residence",
  porc_pobr_log                   = "Poverty index (log1p)",
  porc_pobr_c                     = "Poverty index (centered)",
  porc_pobr_c_cat6                = "Poverty index (6 quantile groups)",
  urbanicity_cat                  = "Urbanization level of commune of residence",

  # Living arrangement
  cohabitation                    = "Cohabitation status (harmonized)",

  # Clinical/behavioral
  dg_psiq_cie_10_instudy          = "Psychiatric comorbidity (ICD-10, in-study)",
  dg_psiq_cie_10_dg               = "Psychiatric comorbidity (ICD-10, diagnosis record)",
  sub_dep_icd10_status            = "SUD severity (ICD-10)",
  prim_sub_freq                   = "Primary-substance use frequency at admission",
  prim_sub_freq_rec               = "Primary-substance use frequency at admission (simplified)",
  polysubstance_strict            = "Polysubstance use (strict)",

  # Treatment exposure
  dit_m                           = "Length of stay in treatment (months)",
  treat_log                       = "Length of stay (log1p months)",
  treat_days_pow2                 = "Length of stay (months)^2",
  treat_days_pow3                 = "Length of stay (months)^3",
  treat_lt_90                     = "Treatment length < 90 days (indicator)",
  tr_outcome                      = "Treatment outcome",
  adm_motive                      = "Admission motive",
  primary_sub                     = "Primary substance of use (detailed substances)",
  primary_sub_mod                 = "Primary substance of use (recoded)",

  # Research-context extras
  usuario_tribunal_trat_droga     = "Court-referred to drug treatment (1 = yes)",
  tipo_de_vivienda_rec            = "Housing type (recoded)",
  tipo_de_vivienda_rec2           = "Housing type (recoded & simplified)",

  # Stratification
  plan_type_corr                  = "Treatment modality (plan type)",
  prim_sub_licit                  = "Licit primary substance at admission",
  f0_organic                      = "F0. Organic, including symptomatic, mental disorders",
  f1_substance_use                = "F1. Mental and behavioural disorders due to psychoactive substance use",
  f2_psychotic                    = "F2. Schizophrenia, schizotypal and delusional disorders",
  f3_mood                         = "F3. Mood [affective] disorders",
  f4_anxiety_stress_somatoform    = "F4. Neurotic, stress-related and somatoform disorders",
  f5_physio_eating_sleep_sexual   = "F5. Behavioural syndromes associated with physiological disturbances and physical factors",
  f6_personality_adult_behaviour  = "F6. Disorders of adult personality and behaviour",
  f7_intellectual_disability      = "F7. Intellectual disability",
  f8_9_neurodevelopment_child     = "F8-9. Disorders of psychological development and behavioural/emotional disorders with onset in childhood and adolescence",
  dx_f2_smi_psychotic             = "Psychotic disorders (F2)",
  dx_f3_mood                      = "Mood disorders (F3)",
  dx_f45_anx_stress_phys          = "Anxiety & stress-related (F4–F5)",
  dx_f6_personality               = "Personality disorders (F6)",
  dx_f0789_neurocog_dev           = "Neurocognitive & neurodevelopmental (F0, F7–F9)",

  # Violence indicators
  any_violence                    = "Domestic violence/Sexual abuse",
  dom_violence                    = "Domestic violence",
  sex_abuse                       = "Sexual abuse",

  # Physical diagnosis groups
  phys_dx_supergroup              = "Physical diagnosis supergroup (single)",
  phys_dx_instudy                 = "Physical diagnosis: under study",
  phys_dx_other_spec_medical_cond = "Physical diagnosis: other medical conditions",
  phys_dx_organ_system_med_dis    = "Physical diagnosis: organ-system diseases",
  phys_dx_injuries_and_sequelae   = "Physical diagnosis: injuries & sequelae",
  phys_dx_infectious_diseases     = "Physical diagnosis: infectious diseases",

  # Evaluation at discharge
  evaluacindelprocesoteraputico   = "Assessment of the therapeutic process",
  eva_consumo                     = "Evaluation of consumption pattern at discharge",
  eva_fam                         = "Family situation evaluation at discharge",
  eva_relinterp                   = "Interpersonal relations evaluation at discharge",
  eva_ocupacion                   = "Occupational situation evaluation at discharge",
  eva_sm                          = "Mental health evaluation at discharge",
  eva_fisica                      = "Physical health evaluation at discharge",
  eva_transgnorma                 = "Social norm transgression evaluation at discharge"
)

# Apply labels safely (only to variables that exist)
for(nm in intersect(names(SISTRAT23_c1_2010_2024_df_prev1z), names(labels_map))){
  attr(SISTRAT23_c1_2010_2024_df_prev1z[[nm]],"label")<-labels_map[[nm]]
}
# Extract labels for each variable, defaulting to NA if not present
labels_vec <- sapply(names(missing_pct), function(nm) {
  lbl <- attr(SISTRAT23_c1_2010_2024_df_prev1z[[nm]], "label")
  if (is.null(lbl)) NA_character_ else lbl
})

# Round and format into data.frame with labels
data.frame(
  Variable = names(missing_pct),
  Label = labels_vec,
  Missing_Percent = round(missing_pct, 2))|>
  # Display as markdown table
  knitr::kable(format = "markdown",
    align = c("l", "l", "r"),
    col.names = c("Variable", "Label", "% Missing"),
    row.names = FALSE)
[1] "Percentage of the total that has missing values: 33.3%; total = 88,504"
[1] "Percentage of the total that has missing values (w/o other variables): 6.3%; total = 88,504"
Variable Label % Missing
readmit_time_from_adm_m Readmission time from admission (months) 0.00
readmit_time_from_disch_m Readmission time from discharge (months) 0.00
readmit_event Readmission event 0.00
death_time_from_adm_m Death time from admission (months) 0.00
death_time_from_disch_m Death time from discharge (months) 0.00
death_event Death event 0.00
sex_rec Sex (biological categorization) 0.00
adm_age_log Admission age (log) 0.00
adm_age_pow2 Admission age ^2 0.00
adm_age_pow3 Admission age ^3 0.00
adm_age_c Admission age (centered) 0.00
adm_age_cat Admission age (three groups) 0.00
tenure_status_household Housing situation 5.06
occupation_condition_corr24 Employment status 0.00
marital_status_rec Marital status 0.18
porc_pobr Poverty index of commune of residence 0.00
porc_pobr_log Poverty index (log1p) 0.00
porc_pobr_c Poverty index (centered) 0.00
ethnicity Ethnicity (=1) 0.00
nationality_chile Nationality (Chile) 0.00
urbanicity_cat Urbanization level of commune of residence 0.00
ed_attainment_corr Educational attainment 0.43
cohabitation Cohabitation status (harmonized) 0.00
dg_psiq_cie_10_instudy Psychiatric comorbidity (ICD-10, in-study) 0.00
dg_psiq_cie_10_dg Psychiatric comorbidity (ICD-10, diagnosis record) 0.00
f0_organic F0. Organic, including symptomatic, mental disorders 0.00
f1_substance_use F1. Mental and behavioural disorders due to psychoactive substance use 0.00
f2_psychotic F2. Schizophrenia, schizotypal and delusional disorders 0.00
f3_mood F3. Mood [affective] disorders 0.00
f4_anxiety_stress_somatoform F4. Neurotic, stress-related and somatoform disorders 0.00
f5_physio_eating_sleep_sexual F5. Behavioural syndromes associated with physiological disturbances and physical factors 0.00
f6_personality_adult_behaviour F6. Disorders of adult personality and behaviour 0.00
f7_intellectual_disability F7. Intellectual disability 0.00
f8_9_neurodevelopment_child F8-9. Disorders of psychological development and behavioural/emotional disorders with onset in childhood and adolescence 0.00
dx_f2_smi_psychotic Psychotic disorders (F2) 0.00
dx_f3_mood Mood disorders (F3) 0.00
dx_f45_anx_stress_phys Anxiety & stress-related (F4–F5) 0.00
dx_f6_personality Personality disorders (F6) 0.00
dx_f0789_neurocog_dev Neurocognitive & neurodevelopmental (F0, F7–F9) 0.00
sub_dep_icd10_status SUD severity (ICD-10) 0.00
dom_violence Domestic violence 20.09
sex_abuse Sexual abuse 20.09
any_violence Domestic violence/Sexual abuse 20.09
phys_dx_supergroup Physical diagnosis supergroup (single) 0.17
phys_dx_instudy Physical diagnosis: under study 0.17
phys_dx_other_spec_medical_cond Physical diagnosis: other medical conditions 0.17
phys_dx_organ_system_med_dis Physical diagnosis: organ-system diseases 0.17
phys_dx_injuries_and_sequelae Physical diagnosis: injuries & sequelae 0.17
phys_dx_infectious_diseases Physical diagnosis: infectious diseases 0.17
prim_sub_freq Primary-substance use frequency at admission 0.46
prim_sub_freq_rec Primary-substance use frequency at admission (simplified) 0.46
polysubstance_strict Polysubstance use (strict) 0.00
dit_m Length of stay in treatment (months) 0.00
treat_log Length of stay (log1p months) 0.00
treat_days_pow2 Length of stay (months)^2 0.00
treat_days_pow3 Length of stay (months)^3 0.00
treat_lt_90 Treatment length < 90 days (indicator) 0.00
tr_outcome Treatment outcome 0.00
adm_motive Admission motive 0.00
primary_sub Primary substance of use (detailed substances) 0.00
primary_sub_mod Primary substance of use (recoded) 0.00
usuario_tribunal_trat_droga Court-referred to drug treatment (1 = yes) 4.40
tipo_de_vivienda_rec Housing type (recoded) 7.26
tipo_de_vivienda_rec2 Housing type (recoded & simplified) 7.26
plan_type_corr Treatment modality (plan type) 0.00
evaluacindelprocesoteraputico Assessment of the therapeutic process 0.25
eva_consumo Evaluation of consumption pattern at discharge 0.27
eva_fam Family situation evaluation at discharge 0.27
eva_relinterp Interpersonal relations evaluation at discharge 0.27
eva_ocupacion Occupational situation evaluation at discharge 0.27
eva_sm Mental health evaluation at discharge 0.27
eva_fisica Physical health evaluation at discharge 0.27
eva_transgnorma Social norm transgression evaluation at discharge 0.27

1.4.* Currently in

We identified 215 cases with missing treatment outcomes. For these cases, treatment outcome, duration of treatment, and other treatment-related variables (e.g., evaluation at discharge) are unknown.

Code
SISTRAT23_c1_2010_2024_df_prev1z|> 
  dplyr::filter(tr_outcome=="other")|> 
  dplyr::select(hash_key, tr_outcome, adm_date_rec2, disch_date_rec6, dit_rec6, adm_year)|>
    #janitor::tabyl(adm_year)
  dplyr::mutate(
     hash_key = gsub(
         "(?<=^.{8}).{2}",   # lookbehind for first 8 chars, match the next 2
         "**",               # replace with two asterisks
         hash_key,
         perl = TRUE
     )) |> 
  head()
# A tidytable: 6 × 6
  hash_key            tr_outcome adm_date_rec2 disch_date_rec6 dit_rec6 adm_year
  <chr>               <chr>      <date>        <date>             <dbl>    <int>
1 0065a4d6**af0c519d… other      2016-08-08    2019-08-09          1096     2016
2 00be34d5**32b9d1ee… other      2016-04-13    2019-04-14          1096     2016
3 01fea888**92b29fcc… other      2019-05-01    2022-05-01          1096     2019
4 0426e03d**26cb489d… other      2011-04-27    2014-04-27          1096     2011
5 048e829d**f1109af2… other      2011-07-25    2014-07-25          1096     2011
6 04b987f5**a824ea89… other      2011-02-07    2014-02-07          1096     2011

1.5. Descriptives

Code
## 0) Guard: use only columns that exist
tot_variables_present<-intersect(tot_variables,names(SISTRAT23_c1_2010_2024_df_prev1z))
char_variables_present<-intersect(c(char_variables, "readmit_event", "death_event"),names(SISTRAT23_c1_2010_2024_df_prev1z))
extra_cat<-intersect(c("disch_age_cat"),names(SISTRAT23_c1_2010_2024_df_prev1z))
extra_num<-intersect(c("disch_age_rec"),names(SISTRAT23_c1_2010_2024_df_prev1z))

categorical_vars<-union(char_variables_present,extra_cat)
numerical_vars<-setdiff(union(tot_variables_present,union(extra_cat,extra_num)),categorical_vars)

## 1) Create TableOne (coerce character categoricals to factor inline)
table_one_clean<-tableone::CreateTableOne(
  vars= setdiff(c(categorical_vars,numerical_vars), "plan_type_corr"),
  data= tidytable::mutate(SISTRAT23_c1_2010_2024_df_prev1z,
    tidytable::across(tidyselect::all_of(categorical_vars),\(x){if(is.character(x)) factor(x) else x}),
porc_pobr= porc_pobr*100),
  factorVars= setdiff(char_variables_present,"plan_type_corr"),
  strata= "plan_type_corr",
  addOverall= TRUE,
  test= TRUE,
  smd= TRUE#,
  #labelTranslations=label_map
)

## 4) Print (treat numeric as non-normal; keep your formatting)
escape_regex<-function(x) gsub("([][{}()+*^$.|?\\\\])","\\\\\\1",x)

rn<-rownames(table_one_clean_print)
keys<-names(labels_map)
keys<-keys[order(nchar(keys),decreasing=TRUE)]  # avoid partial overlaps

for(nm in keys){
  pat<-paste0("^",escape_regex(nm),"(?=\\s|\\(|:|=|$)")  # start, then space/(...)/: /= or end
  rn<-sub(pat,labels_map[[nm]],rn,perl=TRUE)            # replace only the prefix; keep the rest
}
rownames(table_one_clean_print)<-rn

wdpath<- paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))

## 5) HTML table
table_one_clean_print|>
  knitr::kable("html", caption= "Table 1. Descriptives by baseline treatment setting")|>
  kableExtra::kable_classic()|>
  kableExtra::row_spec(0:1, bold = TRUE)|>
  kableExtra::scroll_box(height = "400px")

write.table(table_one_clean_print, 
            file= paste0(wdpath,"cons/_out/table_one_prediction_clean.txt"), 
            sep = "\t", row.names = T)
Table 1. Descriptives by baseline treatment setting
level Overall m-pai m-pr pg-pab pg-pai pg-pr p test SMD Missing
n 88,504 5,263 3,545 32,424 37,555 9,717
Sex (biological categorization) (%) man 65806 (74.4) 0 (0.0) 0 (0.0) 26249 (81.0) 30414 (81.0) 9143 (94.1) <0.001 2.377 0.0
woman 22696 (25.6) 5263 (100.0) 3545 (100.0) 6175 (19.0) 7139 (19.0) 574 (5.9)
Housing situation (%) illegal settlement 947 (1.1) 65 (1.3) 86 (2.6) 227 (0.7) 340 (0.9) 229 (2.7) <0.001 0.200 5.1
others 2380 (2.8) 139 (2.7) 89 (2.7) 974 (3.1) 875 (2.4) 303 (3.5)
owner/transferred dwellings/pays dividends 30670 (36.5) 1716 (33.4) 940 (28.5) 12047 (38.8) 12451 (34.7) 3516 (40.7)
renting 15476 (18.4) 1007 (19.6) 471 (14.3) 5936 (19.1) 6818 (19.0) 1244 (14.4)
stays temporarily with a relative 34553 (41.1) 2214 (43.1) 1708 (51.9) 11877 (38.2) 15410 (42.9) 3344 (38.7)
Employment status (%) employed 43336 (49.0) 1603 (30.5) 361 (10.2) 21086 (65.0) 18574 (49.5) 1712 (17.6) <0.001 0.792 0.0
inactive 14307 (16.2) 1972 (37.5) 1665 (47.0) 3570 (11.0) 5386 (14.3) 1714 (17.6)
unemployed 30861 (34.9) 1688 (32.1) 1519 (42.8) 7768 (24.0) 13595 (36.2) 6291 (64.7)
Marital status (%) married/cohabiting 29768 (33.7) 1662 (31.6) 822 (23.2) 13127 (40.6) 11976 (31.9) 2181 (22.5) <0.001 0.211 0.2
separated/divorced/annulled/widowed 10269 (11.6) 644 (12.2) 441 (12.5) 3881 (12.0) 4234 (11.3) 1069 (11.0)
single 48304 (54.7) 2953 (56.2) 2277 (64.3) 15339 (47.4) 21279 (56.8) 6456 (66.5)
Ethnicity (=1) (%) 0 82870 (93.6) 4868 (92.5) 3312 (93.4) 30386 (93.7) 35158 (93.6) 9146 (94.1) 0.003 0.028 0.0
1 5634 (6.4) 395 (7.5) 233 (6.6) 2038 (6.3) 2397 (6.4) 571 (5.9)
Urbanization level of commune of residence (%) 1.Rural 7669 (8.7) 210 (4.0) 187 (5.3) 3030 (9.3) 3803 (10.1) 439 (4.5) <0.001 0.176 0.0
2.Mixed 8659 (9.8) 383 (7.3) 261 (7.4) 4116 (12.7) 3203 (8.5) 696 (7.2)
3.Urban 72176 (81.6) 4670 (88.7) 3097 (87.4) 25278 (78.0) 30549 (81.3) 8582 (88.3)
Educational attainment (%) 1-More than high school 16157 (18.3) 899 (17.1) 641 (18.1) 5356 (16.6) 7672 (20.5) 1589 (16.4) <0.001 0.085 0.4
2-Completed high school or less 48791 (55.4) 2785 (53.0) 1855 (52.4) 17756 (55.1) 20891 (55.8) 5504 (56.7)
3-Completed primary school or less 23176 (26.3) 1567 (29.8) 1042 (29.5) 9108 (28.3) 8847 (23.6) 2612 (26.9)
Cohabitation status (harmonized) (%) alone 8415 (9.5) 278 (5.3) 337 (9.5) 2815 (8.7) 3404 (9.1) 1581 (16.3) <0.001 0.468 0.0
with couple/children 39730 (44.9) 3507 (66.6) 1828 (51.6) 16525 (51.0) 15679 (41.7) 2191 (22.5)
family of origin 32725 (37.0) 1010 (19.2) 954 (26.9) 10683 (32.9) 15142 (40.3) 4936 (50.8)
Others 7634 (8.6) 468 (8.9) 426 (12.0) 2401 (7.4) 3330 (8.9) 1009 (10.4)
Psychiatric comorbidity (ICD-10, in-study) (%) FALSE 73020 (82.5) 4482 (85.2) 2678 (75.5) 27173 (83.8) 30921 (82.3) 7766 (79.9) <0.001 0.118 0.0
TRUE 15484 (17.5) 781 (14.8) 867 (24.5) 5251 (16.2) 6634 (17.7) 1951 (20.1)
Psychiatric comorbidity (ICD-10, diagnosis record) (%) FALSE 49013 (55.4) 2459 (46.7) 1260 (35.5) 20654 (63.7) 19896 (53.0) 4744 (48.8) <0.001 0.256 0.0
TRUE 39491 (44.6) 2804 (53.3) 2285 (64.5) 11770 (36.3) 17659 (47.0) 4973 (51.2)
F0. Organic, including symptomatic, mental disorders (%) FALSE 87399 (98.8) 5233 (99.4) 3492 (98.5) 32163 (99.2) 36958 (98.4) 9553 (98.3) <0.001 0.057 0.0
TRUE 1105 (1.2) 30 (0.6) 53 (1.5) 261 (0.8) 597 (1.6) 164 (1.7)
F1. Mental and behavioural disorders due to psychoactive substance use (%) FALSE 88504 (100.0) 5263 (100.0) 3545 (100.0) 32424 (100.0) 37555 (100.0) 9717 (100.0) NA <0.001 0.0
F2. Schizophrenia, schizotypal and delusional disorders (%) FALSE 87145 (98.5) 5240 (99.6) 3500 (98.7) 32248 (99.5) 36691 (97.7) 9466 (97.4) <0.001 0.103 0.0
TRUE 1359 (1.5) 23 (0.4) 45 (1.3) 176 (0.5) 864 (2.3) 251 (2.6)
F3. Mood [affective] disorders (%) FALSE 79431 (89.7) 4575 (86.9) 3088 (87.1) 29383 (90.6) 33645 (89.6) 8740 (89.9) <0.001 0.065 0.0
TRUE 9073 (10.3) 688 (13.1) 457 (12.9) 3041 (9.4) 3910 (10.4) 977 (10.1)
F4. Neurotic, stress-related and somatoform disorders (%) FALSE 86864 (98.1) 5158 (98.0) 3513 (99.1) 31682 (97.7) 36894 (98.2) 9617 (99.0) <0.001 0.061 0.0
TRUE 1640 (1.9) 105 (2.0) 32 (0.9) 742 (2.3) 661 (1.8) 100 (1.0)
F5. Behavioural syndromes associated with physiological disturbances and physical factors (%) FALSE 88051 (99.5) 5217 (99.1) 3514 (99.1) 32287 (99.6) 37359 (99.5) 9674 (99.6) <0.001 0.033 0.0
TRUE 453 (0.5) 46 (0.9) 31 (0.9) 137 (0.4) 196 (0.5) 43 (0.4)
F6. Disorders of adult personality and behaviour (%) FALSE 63374 (71.6) 3400 (64.6) 1887 (53.2) 25216 (77.8) 26369 (70.2) 6502 (66.9) <0.001 0.236 0.0
TRUE 25130 (28.4) 1863 (35.4) 1658 (46.8) 7208 (22.2) 11186 (29.8) 3215 (33.1)
F7. Intellectual disability (%) FALSE 87866 (99.3) 5233 (99.4) 3520 (99.3) 32208 (99.3) 37278 (99.3) 9627 (99.1) 0.063 0.018 0.0
TRUE 638 (0.7) 30 (0.6) 25 (0.7) 216 (0.7) 277 (0.7) 90 (0.9)
F8-9. Disorders of psychological development and behavioural/emotional disorders with onset in childhood and adolescence (%) FALSE 87237 (98.6) 5177 (98.4) 3494 (98.6) 32100 (99.0) 37033 (98.6) 9433 (97.1) <0.001 0.060 0.0
TRUE 1267 (1.4) 86 (1.6) 51 (1.4) 324 (1.0) 522 (1.4) 284 (2.9)
Psychotic disorders (F2) (%) 0 87145 (98.5) 5240 (99.6) 3500 (98.7) 32248 (99.5) 36691 (97.7) 9466 (97.4) <0.001 0.103 0.0
1 1359 (1.5) 23 (0.4) 45 (1.3) 176 (0.5) 864 (2.3) 251 (2.6)
Mood disorders (F3) (%) 0 79431 (89.7) 4575 (86.9) 3088 (87.1) 29383 (90.6) 33645 (89.6) 8740 (89.9) <0.001 0.065 0.0
1 9073 (10.3) 688 (13.1) 457 (12.9) 3041 (9.4) 3910 (10.4) 977 (10.1)
Anxiety & stress-related (F4–F5) (%) 0 86422 (97.6) 5114 (97.2) 3484 (98.3) 31549 (97.3) 36701 (97.7) 9574 (98.5) <0.001 0.051 0.0
1 2082 (2.4) 149 (2.8) 61 (1.7) 875 (2.7) 854 (2.3) 143 (1.5)
Personality disorders (F6) (%) 0 63374 (71.6) 3400 (64.6) 1887 (53.2) 25216 (77.8) 26369 (70.2) 6502 (66.9) <0.001 0.236 0.0
1 25130 (28.4) 1863 (35.4) 1658 (46.8) 7208 (22.2) 11186 (29.8) 3215 (33.1)
Neurocognitive & neurodevelopmental (F0, F7–F9) (%) 0 85517 (96.6) 5117 (97.2) 3416 (96.4) 31626 (97.5) 36169 (96.3) 9189 (94.6) <0.001 0.072 0.0
1 2987 (3.4) 146 (2.8) 129 (3.6) 798 (2.5) 1386 (3.7) 528 (5.4)
SUD severity (ICD-10) (%) drug dependence 64429 (72.8) 4105 (78.0) 3343 (94.3) 17824 (55.0) 30278 (80.6) 8879 (91.4) <0.001 0.477 0.0
hazardous consumption 24075 (27.2) 1158 (22.0) 202 (5.7) 14600 (45.0) 7277 (19.4) 838 (8.6)
Domestic violence (%) No domestic violence 52301 (74.0) 2670 (60.3) 1851 (55.4) 18960 (78.4) 22784 (75.9) 6036 (69.3) <0.001 0.268 20.1
Domestic violence 18419 (26.0) 1761 (39.7) 1491 (44.6) 5236 (21.6) 7251 (24.1) 2680 (30.7)
Sexual abuse (%) No sexual abuse 69345 (98.1) 4174 (94.2) 3070 (91.9) 23977 (99.1) 29570 (98.5) 8554 (98.1) <0.001 0.192 20.1
Sexual abuse 1375 (1.9) 257 (5.8) 272 (8.1) 219 (0.9) 465 (1.5) 162 (1.9)
Domestic violence/Sexual abuse (%) 1.Domestic violence/sex abuse 19794 (28.0) 2018 (45.5) 1763 (52.8) 5455 (22.5) 7716 (25.7) 2842 (32.6) <0.001 0.344 20.1
0.No domestic violence/sex abuse 50926 (72.0) 2413 (54.5) 1579 (47.2) 18741 (77.5) 22319 (74.3) 5874 (67.4)
Physical diagnosis supergroup (single) (%) en estudio 48717 (55.1) 2814 (53.5) 1990 (56.2) 17779 (55.0) 20990 (55.9) 5144 (53.1) <0.001 0.240 0.2
sin trastorno 31242 (35.4) 1909 (36.3) 883 (24.9) 12409 (38.4) 12952 (34.5) 3089 (31.9)
Infectious diseases (STIs, viral hepatitis, substance-use–related) 700 (0.8) 75 (1.4) 85 (2.4) 131 (0.4) 300 (0.8) 109 (1.1)
Organ-system medical diseases (cardio, hepatic, oral, hematologic) 2450 (2.8) 149 (2.8) 185 (5.2) 649 (2.0) 981 (2.6) 486 (5.0)
Injuries and sequelae 1327 (1.5) 47 (0.9) 131 (3.7) 293 (0.9) 481 (1.3) 375 (3.9)
Other specified medical conditions 3917 (4.4) 266 (5.1) 270 (7.6) 1076 (3.3) 1814 (4.8) 491 (5.1)
Physical diagnosis: under study (%) 0 39061 (44.2) 2420 (46.0) 1404 (39.6) 14514 (44.9) 16398 (43.7) 4325 (44.6) <0.001 0.056 0.2
1 49292 (55.8) 2840 (54.0) 2140 (60.4) 17823 (55.1) 21120 (56.3) 5369 (55.4)
Physical diagnosis: other medical conditions (%) 0 84405 (95.5) 4992 (94.9) 3263 (92.1) 31258 (96.7) 35695 (95.1) 9197 (94.9) <0.001 0.083 0.2
1 3948 (4.5) 268 (5.1) 281 (7.9) 1079 (3.3) 1823 (4.9) 497 (5.1)
Physical diagnosis: organ-system diseases (%) 0 85879 (97.2) 5109 (97.1) 3355 (94.7) 31686 (98.0) 36531 (97.4) 9198 (94.9) <0.001 0.097 0.2
1 2474 (2.8) 151 (2.9) 189 (5.3) 651 (2.0) 987 (2.6) 496 (5.1)
Physical diagnosis: injuries & sequelae (%) 0 87011 (98.5) 5213 (99.1) 3406 (96.1) 32043 (99.1) 37036 (98.7) 9313 (96.1) <0.001 0.120 0.2
1 1342 (1.5) 47 (0.9) 138 (3.9) 294 (0.9) 482 (1.3) 381 (3.9)
Physical diagnosis: infectious diseases (%) 0 87644 (99.2) 5185 (98.6) 3454 (97.5) 32205 (99.6) 37216 (99.2) 9584 (98.9) <0.001 0.086 0.2
1 709 (0.8) 75 (1.4) 90 (2.5) 132 (0.4) 302 (0.8) 110 (1.1)
Primary-substance use frequency at admission (%) 1. Less than 1 day a week 4399 (5.0) 243 (4.6) 25 (0.7) 2611 (8.1) 1324 (3.5) 196 (2.0) <0.001 0.586 0.5
2. 1 day a week 5831 (6.6) 295 (5.6) 34 (1.0) 3375 (10.5) 1897 (5.1) 230 (2.4)
3. 2 to 3 days a week 24533 (27.8) 1452 (27.7) 354 (10.0) 11825 (36.7) 9905 (26.5) 997 (10.3)
4. 4 to 6 days a week 14483 (16.4) 808 (15.4) 440 (12.4) 5165 (16.0) 6808 (18.2) 1262 (13.0)
5. Daily 38847 (44.1) 2444 (46.6) 2686 (75.9) 9259 (28.7) 17436 (46.7) 7022 (72.3)
Primary-substance use frequency at admission (simplified) (%) 1.≤1 day/wk 10230 (11.6) 538 (10.3) 59 (1.7) 5986 (18.6) 3221 (8.6) 426 (4.4) <0.001 0.554 0.5
2.2–6 days/wk 39016 (44.3) 2260 (43.1) 794 (22.4) 16990 (52.7) 16713 (44.7) 2259 (23.3)
3.Daily 38847 (44.1) 2444 (46.6) 2686 (75.9) 9259 (28.7) 17436 (46.7) 7022 (72.3)
Polysubstance use (strict) (%) 0 23961 (27.1) 1395 (26.5) 653 (18.4) 11608 (35.8) 8624 (23.0) 1681 (17.3) <0.001 0.210 0.0
1 64543 (72.9) 3868 (73.5) 2892 (81.6) 20816 (64.2) 28931 (77.0) 8036 (82.7)
Treatment length < 90 days (indicator) (%) 0 67525 (76.3) 4202 (79.8) 2213 (62.4) 26054 (80.4) 28309 (75.4) 6747 (69.4) <0.001 0.210 0.0
1 20979 (23.7) 1061 (20.2) 1332 (37.6) 6370 (19.6) 9246 (24.6) 2970 (30.6)
Treatment outcome (%) adm discharge - adm reasons 1319 (1.5) 78 (1.5) 8 (0.2) 598 (1.8) 611 (1.6) 24 (0.2) <0.001 0.279 0.0
adm discharge - rule violation/undet 6463 (7.3) 322 (6.1) 397 (11.2) 1946 (6.0) 2532 (6.7) 1266 (13.0)
completion 23408 (26.4) 1592 (30.2) 932 (26.3) 8093 (25.0) 9700 (25.8) 3091 (31.8)
dropout 47055 (53.2) 2617 (49.7) 1544 (43.6) 18904 (58.3) 20201 (53.8) 3789 (39.0)
other 215 (0.2) 7 (0.1) 8 (0.2) 98 (0.3) 67 (0.2) 35 (0.4)
referral 10044 (11.3) 647 (12.3) 656 (18.5) 2785 (8.6) 4444 (11.8) 1512 (15.6)
Admission motive (%) another SUD facility/FONODROGAS/SENDA Previene 8287 (9.4) 304 (5.8) 793 (22.4) 1592 (4.9) 3202 (8.5) 2396 (24.7) <0.001 0.458 0.0
justice sector 8470 (9.6) 941 (17.9) 241 (6.8) 3297 (10.2) 3395 (9.0) 596 (6.1)
other 4574 (5.2) 387 (7.4) 211 (6.0) 1272 (3.9) 2161 (5.8) 543 (5.6)
sanitary sector 27482 (31.1) 1758 (33.4) 1300 (36.7) 8942 (27.6) 12707 (33.8) 2775 (28.6)
spontaneous consultation 39691 (44.8) 1873 (35.6) 1000 (28.2) 17321 (53.4) 16090 (42.8) 3407 (35.1)
Primary substance of use (detailed substances) (%) alcohol 30214 (34.1) 1497 (28.4) 621 (17.5) 14317 (44.2) 11834 (31.5) 1945 (20.0) <0.001 0.477 0.0
amphetamine-type stimulants 123 (0.1) 7 (0.1) 5 (0.1) 46 (0.1) 55 (0.1) 10 (0.1)
cocaine paste 33442 (37.8) 2006 (38.1) 2253 (63.6) 8554 (26.4) 14585 (38.8) 6044 (62.2)
cocaine powder 17221 (19.5) 1109 (21.1) 506 (14.3) 6522 (20.1) 7775 (20.7) 1309 (13.5)
dissociatives 3 (0.0) 0 (0.0) 0 (0.0) 1 (0.0) 2 (0.0) 0 (0.0)
hallucinogens 13 (0.0) 1 (0.0) 0 (0.0) 4 (0.0) 8 (0.0) 0 (0.0)
inhalants 67 (0.1) 0 (0.0) 5 (0.1) 11 (0.0) 35 (0.1) 16 (0.2)
marijuana 6029 (6.8) 462 (8.8) 77 (2.2) 2553 (7.9) 2641 (7.0) 296 (3.0)
opioids 326 (0.4) 34 (0.6) 23 (0.6) 51 (0.2) 186 (0.5) 32 (0.3)
others 116 (0.1) 22 (0.4) 1 (0.0) 41 (0.1) 46 (0.1) 6 (0.1)
tranquilizers/hypnotics 950 (1.1) 125 (2.4) 54 (1.5) 324 (1.0) 388 (1.0) 59 (0.6)
Primary substance of use (recoded) (%) cocaine paste 33442 (37.8) 2006 (38.1) 2253 (63.6) 8554 (26.4) 14585 (38.8) 6044 (62.2) <0.001 0.469 0.0
cocaine powder 17221 (19.5) 1109 (21.1) 506 (14.3) 6522 (20.1) 7775 (20.7) 1309 (13.5)
alcohol 30214 (34.1) 1497 (28.4) 621 (17.5) 14317 (44.2) 11834 (31.5) 1945 (20.0)
marijuana 6029 (6.8) 462 (8.8) 77 (2.2) 2553 (7.9) 2641 (7.0) 296 (3.0)
others 1598 (1.8) 189 (3.6) 88 (2.5) 478 (1.5) 720 (1.9) 123 (1.3)
Court-referred to drug treatment (1 = yes) (%) no 82939 (98.0) 4696 (98.2) 3277 (97.6) 31093 (98.8) 34832 (97.8) 9041 (96.6) <0.001 0.068 4.4
si 1668 (2.0) 86 (1.8) 80 (2.4) 389 (1.2) 790 (2.2) 323 (3.4)
Housing type (recoded) (%) formal housing 73763 (89.9) 4441 (90.2) 2139 (81.6) 28843 (92.6) 32164 (90.2) 6176 (79.8) <0.001 0.297 7.3
shared/secondary unit 2787 (3.4) 215 (4.4) 88 (3.4) 995 (3.2) 1244 (3.5) 245 (3.2)
homeless/unsheltered/informal/temporary housing/institutional/collective 2864 (3.5) 64 (1.3) 256 (9.8) 402 (1.3) 1074 (3.0) 1068 (13.8)
other/unknown 2667 (3.2) 203 (4.1) 137 (5.2) 912 (2.9) 1168 (3.3) 247 (3.2)
Housing type (recoded & simplified) (%) formal housing 73763 (89.9) 4441 (90.2) 2139 (81.6) 28843 (92.6) 32164 (90.2) 6176 (79.8) <0.001 0.201 7.3
other/unknown 8318 (10.1) 482 (9.8) 481 (18.4) 2309 (7.4) 3486 (9.8) 1560 (20.2)
Assessment of the therapeutic process (%) logro alto 21177 (24.0) 1320 (25.1) 874 (24.7) 7482 (23.1) 8574 (22.9) 2927 (30.2) <0.001 0.097 0.2
logro intermedio 28665 (32.5) 1728 (32.9) 1140 (32.2) 10361 (32.1) 12104 (32.3) 3332 (34.4)
logro minimo 38444 (43.5) 2208 (42.0) 1523 (43.1) 14483 (44.8) 16808 (44.8) 3422 (35.3)
Evaluation of consumption pattern at discharge (%) logro alto 25866 (29.3) 1654 (31.5) 1140 (32.2) 9054 (28.0) 10295 (27.5) 3723 (38.5) <0.001 0.134 0.3
logro intermedio 25279 (28.6) 1468 (27.9) 941 (26.6) 9269 (28.7) 10726 (28.6) 2875 (29.7)
logro minimo 37124 (42.1) 2134 (40.6) 1456 (41.2) 13997 (43.3) 16453 (43.9) 3084 (31.9)
Family situation evaluation at discharge (%) logro alto 20889 (23.7) 1259 (24.0) 843 (23.8) 7617 (23.6) 8321 (22.2) 2849 (29.4) <0.001 0.090 0.3
logro intermedio 28144 (31.9) 1702 (32.4) 1109 (31.4) 10226 (31.6) 11824 (31.6) 3283 (33.9)
logro minimo 39236 (44.5) 2295 (43.7) 1585 (44.8) 14477 (44.8) 17329 (46.2) 3550 (36.7)
Interpersonal relations evaluation at discharge (%) logro alto 21507 (24.4) 1371 (26.1) 941 (26.6) 7633 (23.6) 8661 (23.1) 2901 (30.0) <0.001 0.103 0.3
logro intermedio 28053 (31.8) 1668 (31.7) 1068 (30.2) 10180 (31.5) 11820 (31.5) 3317 (34.3)
logro minimo 38709 (43.9) 2217 (42.2) 1528 (43.2) 14507 (44.9) 16993 (45.3) 3464 (35.8)
Occupational situation evaluation at discharge (%) logro alto 27771 (31.5) 1474 (28.0) 832 (23.5) 10739 (33.2) 11733 (31.3) 2993 (30.9) <0.001 0.135 0.3
logro intermedio 23483 (26.6) 1365 (26.0) 803 (22.7) 8754 (27.1) 10054 (26.8) 2507 (25.9)
logro minimo 37015 (41.9) 2417 (46.0) 1902 (53.8) 12827 (39.7) 15687 (41.9) 4182 (43.2)
Mental health evaluation at discharge (%) logro alto 21471 (24.3) 1312 (25.0) 906 (25.6) 7584 (23.5) 8689 (23.2) 2980 (30.8) <0.001 0.103 0.3
logro intermedio 29000 (32.9) 1704 (32.4) 1119 (31.6) 10632 (32.9) 12135 (32.4) 3410 (35.2)
logro minimo 37798 (42.8) 2240 (42.6) 1512 (42.7) 14104 (43.6) 16650 (44.4) 3292 (34.0)
Physical health evaluation at discharge (%) logro alto 26845 (30.4) 1605 (30.5) 1340 (37.9) 8990 (27.8) 10536 (28.1) 4374 (45.2) <0.001 0.209 0.3
logro intermedio 27330 (31.0) 1594 (30.3) 1052 (29.7) 10073 (31.2) 11710 (31.2) 2901 (30.0)
logro minimo 34094 (38.6) 2057 (39.1) 1145 (32.4) 13257 (41.0) 15228 (40.6) 2407 (24.9)
Social norm transgression evaluation at discharge (%) logro alto 31583 (35.8) 2015 (38.3) 1001 (28.3) 11845 (36.6) 13416 (35.8) 3306 (34.1) <0.001 0.117 0.3
logro intermedio 21831 (24.7) 1217 (23.2) 875 (24.7) 7852 (24.3) 9165 (24.5) 2722 (28.1)
logro minimo 34855 (39.5) 2024 (38.5) 1661 (47.0) 12623 (39.1) 14893 (39.7) 3654 (37.7)
Readmission event (%) 0 69434 (78.5) 4045 (76.9) 2065 (58.3) 26397 (81.4) 30155 (80.3) 6772 (69.7) <0.001 0.257 0.0
1 19070 (21.5) 1218 (23.1) 1480 (41.7) 6027 (18.6) 7400 (19.7) 2945 (30.3)
Death event (%) 0 84557 (95.5) 5117 (97.2) 3366 (95.0) 30960 (95.5) 35979 (95.8) 9135 (94.0) <0.001 0.072 0.0
1 3947 (4.5) 146 (2.8) 179 (5.0) 1464 (4.5) 1576 (4.2) 582 (6.0)
Readmission time from admission (months) (median [IQR]) 47.9 [22.6, 80.7] 40.1 [18.9, 69.5] 33.6 [13.3, 72.5] 53.1 [26.5, 84.4] 45.8 [22.3, 77.3] 50.3 [19.5, 89.0] <0.001 nonnorm 0.175 0.0
Readmission time from discharge (months) (median [IQR]) 41.0 [15.0, 73.0] 32.0 [10.5, 60.3] 27.0 [4.9, 66.3] 45.0 [18.9, 76.1] 38.5 [14.7, 69.1] 43.6 [11.8, 80.7] <0.001 nonnorm 0.170 0.0
Death time from admission (months) (median [IQR]) 62.5 [34.0, 91.1] 54.0 [29.2, 84.9] 68.5 [37.9, 99.2] 64.7 [36.8, 91.6] 58.0 [31.2, 87.2] 73.9 [41.9, 101.7] <0.001 nonnorm 0.196 0.0
Death time from discharge (months) (median [IQR]) 54.3 [27.0, 83.9] 46.4 [21.1, 76.1] 61.4 [31.7, 92.6] 56.1 [29.0, 84.4] 50.5 [24.0, 80.0] 67.2 [35.0, 95.9] <0.001 nonnorm 0.217 0.0
Admission age (log) (median [IQR]) 3.6 [3.3, 3.8] 3.5 [3.3, 3.7] 3.5 [3.3, 3.7] 3.6 [3.4, 3.8] 3.6 [3.3, 3.8] 3.5 [3.3, 3.7] <0.001 nonnorm 0.142 0.0
Admission age ^2 (median [IQR]) 1,169.6 [751.9, 1,849.2] 1,052.4 [691.2, 1,610.8] 1,041.4 [677.6, 1,637.8] 1,261.3 [794.1, 2,009.7] 1,151.9 [746.4, 1,795.2] 1,091.6 [714.5, 1,717.3] <0.001 nonnorm 0.140 0.0
Admission age ^3 (median [IQR]) 40,001.7 [20,615.9, 79,520.9] 34,138.4 [18,170.7, 64,650.2] 33,604.5 [17,636.9, 66,282.6] 44,795.6 [22,378.1, 90,096.1] 39,096.3 [20,391.2, 76,063.3] 36,067.8 [19,098.4, 71,163.8] <0.001 nonnorm 0.136 0.0
Admission age (centered) (median [IQR]) -1.6 [-8.3, 7.2] -3.3 [-9.5, 4.4] -3.5 [-9.7, 4.7] -0.2 [-7.6, 9.1] -1.8 [-8.4, 6.6] -2.7 [-9.0, 5.7] <0.001 nonnorm 0.142 0.0
Admission age (three groups) (%) 18-29 30634 (34.6) 2125 (40.4) 1449 (40.9) 10165 (31.4) 13181 (35.1) 3714 (38.2) <0.001 0.127 0.0
30-44 39637 (44.8) 2299 (43.7) 1526 (43.0) 14282 (44.0) 17190 (45.8) 4340 (44.7)
45-64 18233 (20.6) 839 (15.9) 570 (16.1) 7977 (24.6) 7184 (19.1) 1663 (17.1)
Poverty index of commune of residence (median [IQR]) 12.4 [9.1, 17.0] 11.2 [8.9, 15.3] 12.5 [8.9, 16.9] 13.1 [9.3, 17.9] 11.9 [9.0, 15.7] 12.4 [9.0, 17.6] <0.001 nonnorm 0.140 0.0
Poverty index (log1p) (median [IQR]) 0.1 [0.1, 0.2] 0.1 [0.1, 0.1] 0.1 [0.1, 0.2] 0.1 [0.1, 0.2] 0.1 [0.1, 0.1] 0.1 [0.1, 0.2] <0.001 nonnorm 0.139 0.0
Poverty index (centered) (median [IQR]) 0.0 [-0.1, 0.0] 0.0 [-0.1, 0.0] 0.0 [-0.1, 0.0] 0.0 [0.0, 0.0] 0.0 [-0.1, 0.0] 0.0 [-0.1, 0.0] <0.001 nonnorm 0.140 0.0
Nationality (Chile) (%) chile 87935 (99.4) 5212 (99.0) 3524 (99.4) 32238 (99.4) 37307 (99.3) 9654 (99.4) 0.021 0.020 0.0
other 569 (0.6) 51 (1.0) 21 (0.6) 186 (0.6) 248 (0.7) 63 (0.6)
Length of stay in treatment (months) (median [IQR]) 5.6 [3.0, 10.0] 6.5 [3.4, 11.5] 4.4 [1.7, 9.1] 5.8 [3.4, 10.0] 5.5 [2.9, 10.0] 5.3 [2.3, 9.6] <0.001 nonnorm 0.144 0.0
Length of stay (log1p months) (median [IQR]) 1.9 [1.4, 2.4] 2.0 [1.5, 2.5] 1.7 [1.0, 2.3] 1.9 [1.5, 2.4] 1.9 [1.4, 2.4] 1.8 [1.2, 2.4] <0.001 nonnorm 0.228 0.0
Length of stay (months)^2 (median [IQR]) 31.1 [9.0, 100.0] 42.0 [11.4, 132.6] 19.0 [3.0, 82.2] 33.7 [11.3, 99.4] 29.7 [8.6, 99.4] 27.7 [5.4, 93.0] <0.001 nonnorm 0.084 0.0
Length of stay (months)^3 (median [IQR]) 173.8 [27.0, 1,000.0] 272.6 [38.3, 1,527.3] 82.6 [5.3, 744.8] 195.8 [37.8, 990.4] 162.0 [25.3, 990.4] 146.1 [12.5, 897.3] <0.001 nonnorm 0.057 0.0
  • General-population settings were mostly male (81–94%). Heavy sex imbalances by setting (1/4= women).

  • GP settings mostly lived with couples/kids or family of origin (~51%)

  • Dropout rates exceed 50% in all groups; highest in Basic ambulatory (58%); Completion rates highest in General-poulation residential (32%) and Intensive ambulatory (30%)

  • Patients in residential treatments were dominated by cocaine base paste as the primary substance (64-62%), daily use (72-76%) and drug dependence (>90%); Basic ambualtory, dominated by alcohol (44%).

  • Residential treatments were more single (23% vs. 34% overall)

  • Domestic violence and sexual abuse were higher in women-only settings (40–45% and 6–8% vs. 26% and 2% overall, respectively)

  • Ambulatory basic received more patients aged 40-65 (25%); women-only programs, less (16%)

  • Also women-only settings showed SUD severity and Depression slightly higher.

  • Women-only residential had diagnoses F6 (Disorders of adult personality and behavior) (47% vs. 28% overall) and more diagnoses in study (24% vs. 18% overall)

  • Women-only residential were younger (34 [13–72]), less went to treatment spontaneously (28% vs. 45% overall)

  • Residential settings had approx. two times more organ-system diseases (5.3%) and injuries & sequelae (3.9%)

  • Formal housing: >90% in intensive ambulatory settings vs. 80% in general-population residential; Homelessness/informal housing: 14% in General-population residential vs. 1–3% elsewhere.

  • Employment was lowest in Women-only residential (10%) and General-population residential (18%), with many unemployed (43–65%), while General-population basic ambulatory had the highest employed rate (65%). Education was similar across groups, with most having high school or less (53–57%), but primary school or less was more common in women-only settings (30%).

  • Urban residence highest: 88–89% in women-only settings.

  • **We did not find patients with sex that was inconsistent with their clinical presentation regarding the Pathology of Gestation and of the Intrauterine Fetus without Disorder. Although this study focuses on pregnancy and fetal outcomes, the male parent’s (father’s) medical history and genetic contribution were documented as part of the patient data. G:/My Drive/Alvacast/SISTRAT 2023//cons/_out/table_one_prediction_clean.txt **.

  • Considering these differences, we think that certain predictors may behave differently by settings

Code
SISTRAT23_c1_2010_2024_df_prev1z|>
    janitor::tabyl(readmit_event, death_event)|>
    janitor::adorn_percentages("all")|>
    janitor::adorn_pct_formatting(digits = 1)|>
    janitor::adorn_ns(position = "front")|> 
    knitr::kable("markdown", caption= "Intersection of Readmission (rows) w/ Mortality (cols)")
Intersection of Readmission (rows) w/ Mortality (cols)
readmit_event 0 1
0 66,231 (74.8%) 3,203 (3.6%)
1 18,326 (20.7%) 744 (0.8%)

1.6. Near-zero variance

Code
# freqRatio The ratio of the frequency of the most common value to the frequency of the second most common value. A large number means the variable is dominated by one value (i.e., nearly constant). #high (e.g. > 20) → highly unbalanced (rare category vs majority).
# percentUnique The percentage of unique values relative to the sample size. Low values mean the variable has very few distinct values — often categorical or binary, but potentially too invariant. #High → many distinct values (continuous-type).
# zeroVar   TRUE means the variable has zero variance (same value for all observations). These should always be removed.
# nzv   TRUE means the variable has near-zero variance (too little variation to be useful). These are usually removed before modeling.

categorical_vars2<- 
  setdiff(categorical_vars, c("nationality_cons", "phys_dx_supergroup", "dx_supergroup_series", "f1_substance_use"))
numerical_vars2 <- 
  setdiff(numerical_vars, c("nationality_cons", "phys_dx_supergroup", "dx_supergroup_series"))

nzv <- nearZeroVar(subset(SISTRAT23_c1_2010_2024_df_prev1z, select= setdiff(c(categorical_vars,numerical_vars), "plan_type_corr")),
                   names = TRUE, #column names are returned.
                   saveMetrics= TRUE, # a data frame with predictor information is returned.
                   allowParallel= TRUE,
                   foreach= TRUE
                   )
dplyr::mutate_if(nzv, is.numeric, ~round(.,2)) |> 
  knitr::kable(caption="Near zero variance", col.names= c("Variables", "Freq most common value / second", "% of unique values", "Zero variance","Near-zero variance"))
Near zero variance
Variables Freq most common value / second % of unique values Zero variance Near-zero variance
sex_rec 2.90 0.00 FALSE FALSE
tenure_status_household 1.13 0.01 FALSE FALSE
occupation_condition_corr24 1.40 0.00 FALSE FALSE
marital_status_rec 1.62 0.00 FALSE FALSE
ethnicity 14.71 0.00 FALSE FALSE
urbanicity_cat 8.34 0.00 FALSE FALSE
ed_attainment_corr 2.11 0.00 FALSE FALSE
cohabitation 1.21 0.00 FALSE FALSE
dg_psiq_cie_10_instudy 4.72 0.00 FALSE FALSE
dg_psiq_cie_10_dg 1.24 0.00 FALSE FALSE
f0_organic 79.09 0.00 FALSE TRUE
f1_substance_use 0.00 0.00 TRUE TRUE
f2_psychotic 64.12 0.00 FALSE TRUE
f3_mood 8.75 0.00 FALSE FALSE
f4_anxiety_stress_somatoform 52.97 0.00 FALSE TRUE
f5_physio_eating_sleep_sexual 194.37 0.00 FALSE TRUE
f6_personality_adult_behaviour 2.52 0.00 FALSE FALSE
f7_intellectual_disability 137.72 0.00 FALSE TRUE
f8_9_neurodevelopment_child 68.85 0.00 FALSE TRUE
dx_f2_smi_psychotic 64.12 0.00 FALSE TRUE
dx_f3_mood 8.75 0.00 FALSE FALSE
dx_f45_anx_stress_phys 41.51 0.00 FALSE TRUE
dx_f6_personality 2.52 0.00 FALSE FALSE
dx_f0789_neurocog_dev 28.63 0.00 FALSE TRUE
sub_dep_icd10_status 2.68 0.00 FALSE FALSE
dom_violence 2.84 0.00 FALSE FALSE
sex_abuse 50.43 0.00 FALSE TRUE
any_violence 2.57 0.00 FALSE FALSE
phys_dx_supergroup 1.56 0.01 FALSE FALSE
phys_dx_instudy 1.26 0.00 FALSE FALSE
phys_dx_other_spec_medical_cond 21.38 0.00 FALSE TRUE
phys_dx_organ_system_med_dis 34.71 0.00 FALSE TRUE
phys_dx_injuries_and_sequelae 64.84 0.00 FALSE TRUE
phys_dx_infectious_diseases 123.62 0.00 FALSE TRUE
prim_sub_freq 1.58 0.01 FALSE FALSE
prim_sub_freq_rec 1.00 0.00 FALSE FALSE
polysubstance_strict 2.69 0.00 FALSE FALSE
treat_lt_90 3.22 0.00 FALSE FALSE
tr_outcome 2.01 0.01 FALSE FALSE
adm_motive 1.44 0.01 FALSE FALSE
primary_sub 1.11 0.01 FALSE FALSE
primary_sub_mod 1.11 0.01 FALSE FALSE
usuario_tribunal_trat_droga 49.72 0.00 FALSE TRUE
tipo_de_vivienda_rec 25.76 0.00 FALSE TRUE
tipo_de_vivienda_rec2 8.87 0.00 FALSE FALSE
evaluacindelprocesoteraputico 1.34 0.00 FALSE FALSE
eva_consumo 1.44 0.00 FALSE FALSE
eva_fam 1.39 0.00 FALSE FALSE
eva_relinterp 1.38 0.00 FALSE FALSE
eva_ocupacion 1.33 0.00 FALSE FALSE
eva_sm 1.30 0.00 FALSE FALSE
eva_fisica 1.25 0.00 FALSE FALSE
eva_transgnorma 1.10 0.00 FALSE FALSE
readmit_event 3.64 0.00 FALSE FALSE
death_event 21.42 0.00 FALSE TRUE
readmit_time_from_adm_m 1.10 7.86 FALSE FALSE
readmit_time_from_disch_m 17.66 7.53 FALSE FALSE
death_time_from_adm_m 1.06 5.54 FALSE FALSE
death_time_from_disch_m 19.39 5.61 FALSE TRUE
adm_age_log 1.19 5.17 FALSE FALSE
adm_age_pow2 1.19 5.17 FALSE FALSE
adm_age_pow3 1.19 5.17 FALSE FALSE
adm_age_c 1.19 5.17 FALSE FALSE
adm_age_cat 1.29 0.00 FALSE FALSE
porc_pobr 1.06 1.00 FALSE FALSE
porc_pobr_log 1.06 1.00 FALSE FALSE
porc_pobr_c 1.06 1.00 FALSE FALSE
nationality_chile 154.54 0.00 FALSE TRUE
dit_m 1.02 14.27 FALSE FALSE
treat_log 1.00 12.26 FALSE FALSE
treat_days_pow2 1.06 14.27 FALSE FALSE
treat_days_pow3 1.01 14.27 FALSE FALSE

Given that near-zero variance can lead to large standard errors, convergence warnings, or even quasi-separation if the infrequent category aligns perfectly with the outcome, we deleted the following predictors due to potential collinearity: nationality_cons, phys_dx_supergroup, and dx_supergroup_series.

Also, we deleted 1 due to zero variance.

Variables with near zero variance: f5_physio_eating_sleep_sexual, nationality_chile, f7_intellectual_disability, phys_dx_infectious_diseases, f0_organic, f8_9_neurodevelopment_child, phys_dx_injuries_and_sequelae, f2_psychotic, dx_f2_smi_psychotic, f4_anxiety_stress_somatoform, sex_abuse, usuario_tribunal_trat_droga, dx_f45_anx_stress_phys, phys_dx_organ_system_med_dis, dx_f0789_neurocog_dev, tipo_de_vivienda_rec, death_event, phys_dx_other_spec_medical_cond, death_time_from_disch_m, f1_substance_use.

  • For tipo_de_vivienda_rec, we used tipo_de_vivienda_rec2 instead.

  • For psychiatric variables, we grouped them into severe mental illness and neurocognitive disorders (dx_f_any_severe_mental= F0|F2|F7|F(8-9)|), any internalizing/stress-related diagnosis or of the somatoform spectrum (dx_f_any_internalizing F(4-5)).

  • For any physical comorbidity, we applied grouping for any diagnostic (any_phys_dx): Infectuous diseases, Organ-system, Injuries and sequelae and Other specified medical conditions.

  • For nationality_chile, we changed the dominant category and changed it for foreigners instead (national_foreign).

  • Court-mandated or drug court treatment user (usuario_tribunal_trat_droga) are less than 2% of the SUD treatment population. We did not use it as a predictor.

  • Reported sexual abuse also represent less than 2% of the population. We have a variable called any_violence that groups domestic violence with sexual abuse

For the rest of variables, we marked them and review them conceptually later, collapse rare categories if possible, and avoid including obvious.

2. Prepare for survival setting w/ ML

2.1. Defined variables and clean database

We numeric and categorical predictors, excluded target and time-leak columns, and added a unique row IDs. Then we converted categorical variables to factors.

Code
num_vars = c(
    # floats
    "readmit_time_from_adm_m",
    "death_time_from_adm_m",
    "adm_age_rec3",
    # "adm_age_log",
    # "adm_age_pow2",
    # "adm_age_pow3",
    # "adm_age_c",
    "porc_pobr",
    # "porc_pobr_log",
    # "porc_pobr_c",
    "dit_m",
    # "treat_log",
    # "treat_days_pow2",
    # "treat_days_pow3"
    "num_trat_ant",# only 5993 missings
    "age_subs_onset_rec2" # only 4300 missings
)
cat_vars = c(
    # category
    "sex_rec",
    "tenure_status_household",
    "cohabitation",
    "sub_dep_icd10_status",
    #"dom_violence", # check collinearity
    #"sex_abuse", # collinear #freqRatio>20
    "any_violence",
    "prim_sub_freq_rec",
    "tr_outcome",
    "adm_motive",
    "first_sub_used",
    "primary_sub_mod",
    #"tipo_de_vivienda_rec", #freqRatio>20
    "tipo_de_vivienda_rec2",
    #"adm_age_cat", # collinear
    #"nationality_chile",
    "national_foreign",
    "plan_type_corr",
    # object
    "occupation_condition_corr24",
    "marital_status_rec",
    "urbanicity_cat",
    "ed_attainment_corr",
    #"prim_sub_freq",
    #"primary_sub",
    #"usuario_tribunal_trat_droga", #freqRatio>20
    "evaluacindelprocesoteraputico",
    "eva_consumo",
    "eva_fam",
    "eva_relinterp",
    "eva_ocupacion",
    "eva_sm",
    "eva_fisica",
    "eva_transgnorma",
    # booleans
    "ethnicity",    
    "dg_psiq_cie_10_instudy",
    "dg_psiq_cie_10_dg",
    #"f0_organic", #freqRatio>20
    #"f2_psychotic", #freqRatio>20
    #"f3_mood", #collinear
    #"f4_anxiety_stress_somatoform", #freqRatio>20
    #"f5_physio_eating_sleep_sexual", #freqRatio>20
    #"f6_personality_adult_behaviour", #freqRatio>20
    #"f7_intellectual_disability", #freqRatio>20
    #"f8_9_neurodevelopment_child", #freqRatio>20
    # ints
    #"dx_f2_smi_psychotic",
    "dx_f3_mood",
    #"dx_f45_anx_stress_phys",
    "dx_f6_personality",
    #"dx_f0789_neurocog_dev",
    "dx_f_any_severe_mental",
    "dx_f_any_internalizing",
    "phys_dx_instudy",
    "any_phys_dx",
    #"phys_dx_other_spec_medical_cond",
    #"phys_dx_organ_system_med_dis",
    #"phys_dx_injuries_and_sequelae",
    #"phys_dx_infectious_diseases",
    "polysubstance_strict"
    #"treat_lt_90" #collinear
)


# 0. Define columns to exclude (endpoints + time leaks)
target_cols <- c(
  "readmit_time_from_disch_m", "readmit_event",
  "death_time_from_disch_m", "death_event"
)
leak_time_cols <- c("readmit_time_from_adm_m", "death_time_from_adm_m")
cols_to_exclude <- c(target_cols, leak_time_cols)

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
SISTRAT23_c1_2010_2024_df_model<-  
SISTRAT23_c1_2010_2024_df_prev1z%>%
  dplyr::mutate(original_row_id = dplyr::row_number())%>%  # Create a unique ID
  dplyr::mutate(
      national_foreign = dplyr::if_else(nationality_chile == "other", 1L, 0L)
    )%>%
  dplyr::mutate(
    dx_f_any_severe_mental =
      (f2_psychotic == 1L |
       dx_f2_smi_psychotic == 1L |
       f0_organic == 1L |
       f7_intellectual_disability == 1L |
       f8_9_neurodevelopment_child == 1L |
       dx_f0789_neurocog_dev == 1L)
  )%>%
  dplyr::mutate(
    any_internalizing =
      (f4_anxiety_stress_somatoform == 1L |
       dx_f45_anx_stress_phys == 1L |
       f5_physio_eating_sleep_sexual == 1L)
  )%>%
  dplyr::mutate(
    any_phys_dx = (phys_dx_infectious_diseases == 1L |
                   phys_dx_injuries_and_sequelae == 1L |
                   phys_dx_organ_system_med_dis == 1L |
                   phys_dx_other_spec_medical_cond == 1L)
  )


# 1. Create predictor dataset (remove excluded columns)
df_pred <- SISTRAT23_c1_2010_2024_df_model %>%
  dplyr::select(any_of(c(num_vars, cat_vars, cols_to_exclude))) 
factor_vars<- 
c("sex_rec", "tenure_status_household", "cohabitation", "sub_dep_icd10_status", "any_violence", "tr_outcome", "adm_motive", "primary_sub_mod",  "tipo_de_vivienda_rec2", "plan_type_corr", "occupation_condition_corr24", "marital_status_rec", "urbanicity_cat", "ed_attainment_corr", "prim_sub_freq_rec", "first_sub_used", "evaluacindelprocesoteraputico", "eva_consumo", "eva_fam", "eva_relinterp", "eva_ocupacion", "eva_sm", "eva_fisica", "eva_transgnorma")
invisible(
df_pred[, (factor_vars) := lapply(.SD, as.factor), .SDcols = factor_vars]
)

2.2. Imputation

Then we run MICE-like with PMM (15 donors, 200 trees, 5 datasets, 10 iterations) using a correlation-based predictor matrix, and finally extracted the 5 completed/imputed datasets, restoring factor levels.

Code
# Stable imputation
# set.seed(2125)
# imp <- mice(
#     clean_dummified_df,
#     m = 5,
#     maxit = 10,
#     ridge = 1e-04,  # Regularización agresiva pero necesaria
#     donors = 15,
#     allow.na = FALSE
# )
#  it im dep      meth                                  out
# 1  0  0     collinear                          porc_pobr_c
# 2  0  0     collinear                  dx_f2_smi_psychotic
# 3  0  0     collinear                           dx_f3_mood
# 4  0  0     collinear                    dx_f6_personality
# 5  0  0     collinear sub_dep_icd10_status_drug_dependence
# 6  0  0     collinear              nationality_chile_other

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

  
cat("Define number of complete datasets to create\n")
replicates_n <- 5

  # Check if the session is interactive (i.e., you are using RStudio manually)
  if (interactive()) {
    if(
    !file.exists(file.path(wdpath, 
                           "data/20241015_out", 
                           "filled_datasets.Rds"))
    ) {
      # If interactive, offload to a background job to keep console free
      job::job({
        source(file.path(wdpath, "cons", "_hist_scripts", "missranger_mice_like_pred1.R"))
      }, title = "Imputation Job") # Optional: adds a nice title to the jobs pane
    } else {
      filled_datasets <- readr::read_rds(file.path(wdpath, "data/20241015_out", "filled_datasets.Rds"))
    }  
  } else {
    # If NOT interactive (e.g., batch mode), just run it directly
      
    library(dplyr)
    library(purrr)
    library(tibble)
    
    # Run MICE imputation (corrected syntax)
    time_event_vars <- c(
      "readmit_time_from_disch_m", "readmit_event",
      "death_time_from_disch_m", "death_event"
    )
    predictor_formula <- as.formula(
      #paste(". ~ . -", paste(time_event_vars, collapse = " - "))
      paste(". ~ . ", sep="")
    )
    
    cat("MICE-like but with missranger")
    #https://cran.r-project.org/web/packages/missRanger/vignettes/working_with_censoring.html
    set.seed(2125)
    filled_datasets <- replicate(
      replicates_n,
      missRanger::missRanger(
        df_pred,
        formula = predictor_formula,  # Excluye explícitamente outcomes
        verbose = 2,
        pmm.k = 15,#10, # predictive mean matching with 15 nearest neighbors for num variables
        num.trees = 200, # trees sufficient for stability
        #seed      = 2125, #if you want 5 different but reproducible imputations. dont use insede missRanger
        maxiter = 10, #50 is overkill unless you have very heavy missingness and time isn’t an issue
        returnOOB = TRUE,
        respect.unordered.factors = "order" # controls how categorical variables are handled inside the trees, giving a better treatment of unordered factors.
      ),
      simplify = FALSE
    )
    #_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
    #_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
    cat("Check remaining missing values\n")
    imputed_data <- map_dfr(1:5, function(i) {
      df <- filled_datasets[[i]]
      # add control variables
      dplyr::mutate(df,
             .imp = i,  # imputation number
             .id = SISTRAT23_c1_2010_2024_df_model$original_row_id  # Use original ID
      ) %>%
        # Keep variables necessary for analysis
        dplyr::select(
          .imp, .id,
          everything()
        )
    }, .id = "imputation_id") %>% 
      dplyr::select(-imputation_id)  # cleaning
    #_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
    #_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
    cat("Check remaining missing values\n")
    
    get_all_oobs <- function(imputed_list) {
      # 1. Extract OOBs from all 5 datasets into one big table
      all_oobs <- map_dfr(seq_along(imputed_list), function(i) {
        # Get the attribute
        oob_vec <- attr(imputed_list[[i]], "oob")
        # Create a tidy table
        tibble::tibble(
          Variable = names(oob_vec),
          Error_Value = as.numeric(oob_vec),
          Imputation_Run = as.factor(i)
        )
      })
      # 2. Create a Summary Table (Mean across the 5 runs)
      oob_summary <- all_oobs %>%
        dplyr::group_by(Variable) %>%
        dplyr::summarise(
          Mean_OOB = mean(Error_Value),
          SD_OOB = sd(Error_Value),
          Best_Run = min(Error_Value),
          Worst_Run = max(Error_Value)
        ) %>%
        dplyr::arrange(Mean_OOB) # Sort: Best variables (lowest error) first
      # 3. Print the summary nicely
      cat("\n=== COMPLETE OOB ERROR REPORT ===\n")
      cat("Scale: 0.00 = Perfect Prediction | 1.00 = Useless/Random\n")
      print(as.data.frame(oob_summary), digits = 3)
      return(oob_summary)
    }

#Takes 1:41:49
#Takes 1:38:06

  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  imputed_data %>%
    dplyr::select(where(~ any(is.na(.x)))) %>%
    { 
      if (ncol(.) == 0) {
        message("✅ No missing values remain after imputation")
        tibble::tibble()
      } else {
        dplyr::summarise(., across(everything(), ~ scales::percent(mean(is.na(.x)), accuracy = 0.1))) %>%
          tidyr::pivot_longer(everything(), names_to = "variable", values_to = "missing_pct") %>%
          dplyr::arrange(desc(as.numeric(gsub("%", "", missing_pct))))
      }
    } %>%
    print()
  }

Adjuntando el paquete: ‘dplyr’

The following objects are masked from ‘package:Hmisc’:

src, summarize

The following object is masked from ‘package:MASS’:

select

The following object is masked from ‘package:survex’:

explain

The following objects are masked from ‘package:tidytable’:

across, add_count, add_tally, anti_join, arrange, between,
bind_cols, bind_rows, c_across, case_match, case_when, coalesce,
consecutive_id, count, cross_join, cume_dist, cur_column, cur_data,
cur_group_id, cur_group_rows, dense_rank, desc, distinct, filter,
first, full_join, group_by, group_cols, group_split, group_vars,
if_all, if_any, if_else, inner_join, is_grouped_df, lag, last,
lead, left_join, min_rank, mutate, n, n_distinct, na_if, nest_by,
nest_join, nth, percent_rank, pick, pull, recode, reframe,
relocate, rename, rename_with, right_join, row_number, rowwise,
select, semi_join, slice, slice_head, slice_max, slice_min,
slice_sample, slice_tail, summarise, summarize, tally, top_n,
transmute, tribble, ungroup

The following objects are masked from ‘package:stats’:

filter, lag

The following objects are masked from ‘package:base’:

intersect, setdiff, setequal, union

Adjuntando el paquete: ‘purrr’

The following object is masked from ‘package:caret’:

lift

The following objects are masked from ‘package:tidytable’:

map, map_chr, map_dbl, map_df, map_dfc, map_dfr, map_int, map_lgl,
map_vec, map2, map2_chr, map2_dbl, map2_df, map2_dfc, map2_dfr,
map2_int, map2_lgl, map2_vec, pmap, pmap_chr, pmap_dbl, pmap_df,
pmap_dfc, pmap_dfr, pmap_int, pmap_lgl, pmap_vec, walk

Adjuntando el paquete: ‘tibble’

The following objects are masked from ‘package:tidytable’:

enframe, tribble

Missing value imputation by random forests Missing value imputation by random forests Missing value imputation by random forests Missing value imputation by random forests Missing value imputation by random forests

✅ No missing values remain after imputation

Code
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

if (interactive()) {
  if(
  !file.exists(file.path(wdpath, 
                         "data/20241015_out", 
                         "filled_datasets.Rds"))
  ) {
    
    final_oob_stats <- get_all_oobs(filled_datasets)
    # cat("\n=== COMPLETE OOB ERROR REPORT ===\n")
    #   cat("Scale: 0.00 = Perfect Prediction | 1.00 = Useless/Random\n")
    final_oob_stats|>  
      dplyr::mutate_if(is.numeric, ~round(.,4))|> 
      knitr::kable(caption="Oob report")
  }
}
# tipo_de_vivienda_rec2 (0.08): Extremely reliable. The model almost certainly 
# knows what kind of house someone lives in based on their other traits.
# any_phys_dx (0.09): Very strong prediction.
# sex_rec (0.12): Standard; usually easy to predict from other social variables.
# eva_ variables (~0.14 - 0.21): Your "Evaluation" scales (consumo, fam, ocupacion, etc.) 
# are performing very well. This suggests clear patterns in your psychological/evaluation metrics.

# marital_status_rec (0.30): Decent.
# first_sub_used (0.34): The model has a fair idea of what substance they used 
# first, but it's not certain.
# prim_sub_freq_rec (0.41) & ed_attainment_corr (0.43): These are getting "fuzzy." 
# The imputed values will be plausible, but they might not perfectly 
# capture the individual nuance for every person.

# age_subs_onset_rec2 (0.72): High error. The age at which someone started using 
# substances is hard to predict from their current status.
# phys_dx_instudy (0.78): Very high. The specific physical diagnosis is likely too 
# complex or rare to be predicted by general social/eval variables.
# num_trat_ant (0.86): Critical Flag. This is nearly 0.90.


#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#previous imputations

#saveRDS(imp, file= file.path(wdpath, "data/20241015_out", "imp.Rds"))

#saveRDS(filled_datasets, file= file.path(wdpath, "data/20241015_out", "filled_datasets.Rds"))
Define number of complete datasets to create
MICE-like but with missranger
Variables to impute:        sex_rec, phys_dx_instudy, any_phys_dx, marital_status_rec, evaluacindelprocesoteraputico, eva_consumo, eva_fam, eva_relinterp, eva_ocupacion, eva_sm, eva_fisica, eva_transgnorma, ed_attainment_corr, prim_sub_freq_rec, age_subs_onset_rec2, tenure_status_household, num_trat_ant, tipo_de_vivienda_rec2, first_sub_used, any_violence
Variables used to impute:   readmit_time_from_adm_m, death_time_from_adm_m, adm_age_rec3, porc_pobr, dit_m, num_trat_ant, age_subs_onset_rec2, sex_rec, tenure_status_household, cohabitation, sub_dep_icd10_status, any_violence, prim_sub_freq_rec, tr_outcome, adm_motive, first_sub_used, primary_sub_mod, tipo_de_vivienda_rec2, national_foreign, plan_type_corr, occupation_condition_corr24, marital_status_rec, urbanicity_cat, ed_attainment_corr, evaluacindelprocesoteraputico, eva_consumo, eva_fam, eva_relinterp, eva_ocupacion, eva_sm, eva_fisica, eva_transgnorma, ethnicity, dg_psiq_cie_10_instudy, dg_psiq_cie_10_dg, dx_f3_mood, dx_f6_personality, dx_f_any_severe_mental, phys_dx_instudy, any_phys_dx, polysubstance_strict, readmit_time_from_disch_m, readmit_event, death_time_from_disch_m, death_event

    sex_rc  phys__  any_p_  mrtl__  evlcnd  ev_cns  eva_fm  ev_rln  ev_cpc  eva_sm  ev_fsc  ev_trn  ed_tt_  prm___  ag___2  tnr_s_  nm_tr_  tp___2  frst__  any_vl
iter 1: 0.1291  0.8622  0.0901  0.3067  0.3256  0.1515  0.2041  0.1787  0.2639  0.1514  0.2049  0.2117  0.4397  0.4154  0.8086  0.4572  0.8687  0.0820  0.3422  0.2541  
iter 2: 0.1231  0.7830  0.0910  0.3039  0.0980  0.1480  0.1874  0.1632  0.2421  0.1409  0.2007  0.2107  0.4330  0.4144  0.7246  0.4495  0.8687  0.0812  0.3423  0.2538  
iter 3: 0.1243  0.7840  0.0908  0.3039  0.0985  0.1490  0.1872  0.1628  0.2431  0.1411  0.1999  0.2106  0.4338  0.4127  0.7232  0.4474  0.8681  0.0805  0.3429  0.2534  
iter 4: 0.1236  0.7834  0.0906  0.3038  0.0985  0.1484  0.1875  0.1627  0.2446  0.1408  0.1999  0.2113  0.4332  0.4129  0.7246  0.4498  0.8674  0.0812  0.3415  0.2527  

Variables to impute:        sex_rec, phys_dx_instudy, any_phys_dx, marital_status_rec, evaluacindelprocesoteraputico, eva_consumo, eva_fam, eva_relinterp, eva_ocupacion, eva_sm, eva_fisica, eva_transgnorma, ed_attainment_corr, prim_sub_freq_rec, age_subs_onset_rec2, tenure_status_household, num_trat_ant, tipo_de_vivienda_rec2, first_sub_used, any_violence
Variables used to impute:   readmit_time_from_adm_m, death_time_from_adm_m, adm_age_rec3, porc_pobr, dit_m, num_trat_ant, age_subs_onset_rec2, sex_rec, tenure_status_household, cohabitation, sub_dep_icd10_status, any_violence, prim_sub_freq_rec, tr_outcome, adm_motive, first_sub_used, primary_sub_mod, tipo_de_vivienda_rec2, national_foreign, plan_type_corr, occupation_condition_corr24, marital_status_rec, urbanicity_cat, ed_attainment_corr, evaluacindelprocesoteraputico, eva_consumo, eva_fam, eva_relinterp, eva_ocupacion, eva_sm, eva_fisica, eva_transgnorma, ethnicity, dg_psiq_cie_10_instudy, dg_psiq_cie_10_dg, dx_f3_mood, dx_f6_personality, dx_f_any_severe_mental, phys_dx_instudy, any_phys_dx, polysubstance_strict, readmit_time_from_disch_m, readmit_event, death_time_from_disch_m, death_event

    sex_rc  phys__  any_p_  mrtl__  evlcnd  ev_cns  eva_fm  ev_rln  ev_cpc  eva_sm  ev_fsc  ev_trn  ed_tt_  prm___  ag___2  tnr_s_  nm_tr_  tp___2  frst__  any_vl
iter 1: 0.1294  0.8623  0.0908  0.3066  0.3259  0.1508  0.2041  0.1795  0.2623  0.1516  0.2042  0.2112  0.4383  0.4156  0.8099  0.4559  0.8686  0.0820  0.3422  0.2538  
iter 2: 0.1238  0.7842  0.0913  0.3028  0.0975  0.1483  0.1865  0.1642  0.2423  0.1404  0.2003  0.2107  0.4338  0.4139  0.7257  0.4465  0.8685  0.0814  0.3425  0.2537  
iter 3: 0.1232  0.7832  0.0907  0.3043  0.0984  0.1487  0.1865  0.1619  0.2432  0.1412  0.2003  0.2101  0.4325  0.4142  0.7230  0.4475  0.8682  0.0814  0.3432  0.2531  
iter 4: 0.1231  0.7844  0.0907  0.3027  0.0982  0.1484  0.1868  0.1622  0.2433  0.1415  0.2001  0.2111  0.4342  0.4133  0.7243  0.4490  0.8677  0.0815  0.3430  0.2528  

Variables to impute:        sex_rec, phys_dx_instudy, any_phys_dx, marital_status_rec, evaluacindelprocesoteraputico, eva_consumo, eva_fam, eva_relinterp, eva_ocupacion, eva_sm, eva_fisica, eva_transgnorma, ed_attainment_corr, prim_sub_freq_rec, age_subs_onset_rec2, tenure_status_household, num_trat_ant, tipo_de_vivienda_rec2, first_sub_used, any_violence
Variables used to impute:   readmit_time_from_adm_m, death_time_from_adm_m, adm_age_rec3, porc_pobr, dit_m, num_trat_ant, age_subs_onset_rec2, sex_rec, tenure_status_household, cohabitation, sub_dep_icd10_status, any_violence, prim_sub_freq_rec, tr_outcome, adm_motive, first_sub_used, primary_sub_mod, tipo_de_vivienda_rec2, national_foreign, plan_type_corr, occupation_condition_corr24, marital_status_rec, urbanicity_cat, ed_attainment_corr, evaluacindelprocesoteraputico, eva_consumo, eva_fam, eva_relinterp, eva_ocupacion, eva_sm, eva_fisica, eva_transgnorma, ethnicity, dg_psiq_cie_10_instudy, dg_psiq_cie_10_dg, dx_f3_mood, dx_f6_personality, dx_f_any_severe_mental, phys_dx_instudy, any_phys_dx, polysubstance_strict, readmit_time_from_disch_m, readmit_event, death_time_from_disch_m, death_event

    sex_rc  phys__  any_p_  mrtl__  evlcnd  ev_cns  eva_fm  ev_rln  ev_cpc  eva_sm  ev_fsc  ev_trn  ed_tt_  prm___  ag___2  tnr_s_  nm_tr_  tp___2  frst__  any_vl
iter 1: 0.1293  0.8615  0.0905  0.3064  0.3256  0.1509  0.2040  0.1794  0.2624  0.1516  0.2044  0.2113  0.4376  0.4152  0.8121  0.4550  0.8695  0.0818  0.3427  0.2546  
iter 2: 0.1236  0.7865  0.0910  0.3039  0.0978  0.1488  0.1864  0.1630  0.2433  0.1408  0.2004  0.2116  0.4330  0.4139  0.7219  0.4485  0.8691  0.0805  0.3430  0.2522  
iter 3: 0.1231  0.7829  0.0908  0.3023  0.0977  0.1483  0.1868  0.1630  0.2433  0.1404  0.1998  0.2098  0.4346  0.4150  0.7271  0.4475  0.8681  0.0800  0.3419  0.2528  
iter 4: 0.1237  0.7842  0.0907  0.3041  0.0986  0.1485  0.1868  0.1629  0.2430  0.1411  0.2004  0.2107  0.4334  0.4152  0.7230  0.4482  0.8679  0.0803  0.3440  0.2534  

Variables to impute:        sex_rec, phys_dx_instudy, any_phys_dx, marital_status_rec, evaluacindelprocesoteraputico, eva_consumo, eva_fam, eva_relinterp, eva_ocupacion, eva_sm, eva_fisica, eva_transgnorma, ed_attainment_corr, prim_sub_freq_rec, age_subs_onset_rec2, tenure_status_household, num_trat_ant, tipo_de_vivienda_rec2, first_sub_used, any_violence
Variables used to impute:   readmit_time_from_adm_m, death_time_from_adm_m, adm_age_rec3, porc_pobr, dit_m, num_trat_ant, age_subs_onset_rec2, sex_rec, tenure_status_household, cohabitation, sub_dep_icd10_status, any_violence, prim_sub_freq_rec, tr_outcome, adm_motive, first_sub_used, primary_sub_mod, tipo_de_vivienda_rec2, national_foreign, plan_type_corr, occupation_condition_corr24, marital_status_rec, urbanicity_cat, ed_attainment_corr, evaluacindelprocesoteraputico, eva_consumo, eva_fam, eva_relinterp, eva_ocupacion, eva_sm, eva_fisica, eva_transgnorma, ethnicity, dg_psiq_cie_10_instudy, dg_psiq_cie_10_dg, dx_f3_mood, dx_f6_personality, dx_f_any_severe_mental, phys_dx_instudy, any_phys_dx, polysubstance_strict, readmit_time_from_disch_m, readmit_event, death_time_from_disch_m, death_event

    sex_rc  phys__  any_p_  mrtl__  evlcnd  ev_cns  eva_fm  ev_rln  ev_cpc  eva_sm  ev_fsc  ev_trn  ed_tt_  prm___  ag___2  tnr_s_  nm_tr_  tp___2  frst__  any_vl
iter 1: 0.1290  0.8622  0.0909  0.3060  0.3260  0.1508  0.2036  0.1787  0.2631  0.1516  0.2040  0.2121  0.4399  0.4144  0.8112  0.4553  0.8681  0.0821  0.3427  0.2523  
iter 2: 0.1242  0.7833  0.0913  0.3034  0.0981  0.1484  0.1867  0.1631  0.2438  0.1408  0.1999  0.2108  0.4335  0.4143  0.7241  0.4477  0.8672  0.0803  0.3422  0.2526  
iter 3: 0.1234  0.7830  0.0906  0.3039  0.0979  0.1492  0.1874  0.1628  0.2433  0.1412  0.2003  0.2113  0.4335  0.4142  0.7238  0.4467  0.8684  0.0806  0.3418  0.2549  

Variables to impute:        sex_rec, phys_dx_instudy, any_phys_dx, marital_status_rec, evaluacindelprocesoteraputico, eva_consumo, eva_fam, eva_relinterp, eva_ocupacion, eva_sm, eva_fisica, eva_transgnorma, ed_attainment_corr, prim_sub_freq_rec, age_subs_onset_rec2, tenure_status_household, num_trat_ant, tipo_de_vivienda_rec2, first_sub_used, any_violence
Variables used to impute:   readmit_time_from_adm_m, death_time_from_adm_m, adm_age_rec3, porc_pobr, dit_m, num_trat_ant, age_subs_onset_rec2, sex_rec, tenure_status_household, cohabitation, sub_dep_icd10_status, any_violence, prim_sub_freq_rec, tr_outcome, adm_motive, first_sub_used, primary_sub_mod, tipo_de_vivienda_rec2, national_foreign, plan_type_corr, occupation_condition_corr24, marital_status_rec, urbanicity_cat, ed_attainment_corr, evaluacindelprocesoteraputico, eva_consumo, eva_fam, eva_relinterp, eva_ocupacion, eva_sm, eva_fisica, eva_transgnorma, ethnicity, dg_psiq_cie_10_instudy, dg_psiq_cie_10_dg, dx_f3_mood, dx_f6_personality, dx_f_any_severe_mental, phys_dx_instudy, any_phys_dx, polysubstance_strict, readmit_time_from_disch_m, readmit_event, death_time_from_disch_m, death_event

    sex_rc  phys__  any_p_  mrtl__  evlcnd  ev_cns  eva_fm  ev_rln  ev_cpc  eva_sm  ev_fsc  ev_trn  ed_tt_  prm___  ag___2  tnr_s_  nm_tr_  tp___2  frst__  any_vl
iter 1: 0.1292  0.8634  0.0907  0.3059  0.3247  0.1513  0.2041  0.1794  0.2627  0.1515  0.2044  0.2109  0.4383  0.4172  0.8079  0.4569  0.8703  0.0831  0.3433  0.2532  
iter 2: 0.1230  0.7853  0.0908  0.3037  0.0980  0.1481  0.1869  0.1634  0.2425  0.1410  0.2004  0.2122  0.4335  0.4135  0.7237  0.4473  0.8676  0.0799  0.3417  0.2538  
iter 3: 0.1234  0.7838  0.0913  0.3037  0.0979  0.1485  0.1867  0.1620  0.2437  0.1409  0.2001  0.2109  0.4330  0.4147  0.7222  0.4475  0.8692  0.0811  0.3424  0.2534  
Check remaining missing values
Check remaining missing values
# A tibble: 0 × 0

2.3. One-hot encoding

We applied one‑hot encoding with dummyVars, and combined them with numeric predictors. Finally, we cleaned column names to produce a tidy modeling dataset, and generated a vector with dummified_vars. But also we decided to discard predictor variables with high uncertainty in the imputations, such as substance use onset age (age_subs_onset_rec2) physical diagnosis, in study (phys_dx_instudy) and the reported number of previous treatments (num_trat_ant).

The resulting database is called processed_datasets, which is a list of the imputed databases.

We also exported corrected_datasets to avoid immortal time bias in patients who were not readmitted because they died. This step is essential for accurately estimating readmission.

Code
cat("Define vector to discard dummy levels\n")

dummified_lvls_vars<- c("sex_rec_man", "tenure_status_household_others", "cohabitation_others", "sub_dep_icd10_status_hazardous_consumption", "any_violence_0_no_domestic_violence_sex_abuse", "tr_outcome_other", "adm_motive_other", "primary_sub_mod_others", "tipo_de_vivienda_rec2_formal_housing", "plan_type_corr_pg_pab", "occupation_condition_corr24_employed", "marital_status_rec_married_cohabiting", "urbanicity_cat_3_urban", "ed_attainment_corr_1_more_than_high_school", "prim_sub_freq_rec_1_1_day_wk", "first_sub_used_hallucinogens", "first_sub_used_inhalants", "first_sub_used_amphetamine_type_stimulants", "first_sub_used_others", "evaluacindelprocesoteraputico_logro_alto", "eva_consumo_logro_alto", "eva_fam_logro_alto", "eva_relinterp_logro_alto", "eva_ocupacion_logro_alto", "eva_sm_logro_alto", "eva_fisica_logro_alto", "eva_transgnorma_logro_alto")

high_uncertainty_imps<- c("age_subs_onset_rec2", "phys_dx_instudy", "num_trat_ant")

cat("Map dummification\n")
# 0. Define your factor variables dynamically (or use your manual list)
# It is safer to detect them from the first dataset to avoid typos
factor_vars <- names(filled_datasets[[1]])[sapply(filled_datasets[[1]], is.factor)]
dummy_formula <- as.formula(paste("~", paste(factor_vars, collapse = " + ")))

# Apply the transformation to EACH dataset in the list
processed_datasets <- map(filled_datasets, function(df_current) {
  # 1. Create the dummy model based on THIS specific dataset
  # (Though technically the levels should be the same, it's safer to re-run)
  dummies_model <- dummyVars(dummy_formula, data = df_current, fullRank = FALSE)
  # 2. Predict (generate) the dummies
  df_dummies_encoded <- predict(dummies_model, newdata = df_current)
  df_dummies_encoded <- as.data.frame(df_dummies_encoded)
  # 3. Combine non-factor columns with new dummy columns
  # We use dplyr::select to be safe regardless of whether it's data.table or data.frame
  numeric_part <- df_current %>% dplyr::select(-all_of(factor_vars))
  clean_dummified_df <- cbind(numeric_part, df_dummies_encoded)
  # 4. Clean names
  clean_dummified_df <- janitor::clean_names(clean_dummified_df)
  # 5. Select variables
  clean_dummified_df <- clean_dummified_df|> 
    dplyr::select(-any_of(dummified_lvls_vars))|> 
    dplyr::select(-any_of(high_uncertainty_imps))
  return(clean_dummified_df)
})

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("Export databases\n")
#To google colab to explore predictors with Coxnet

# 1. Apply Correction to Each Dataset in the List
corrected_datasets <- map(processed_datasets, function(df) {
  # Ensure columns are numeric for comparison
  df <- df|> 
    mutate(
      readmit_time = as.numeric(readmit_time_from_disch_m),
      death_time   = as.numeric(death_time_from_disch_m),
      # Ensure events are 0/1 integers
      readmit_event = as.integer(as.character(readmit_event)), 
      death_event   = as.integer(as.character(death_event))
    )
  # --- THE CORRECTION LOGIC ---
  df <- df|>
    dplyr::mutate(
      # condition: Patient died (death_event==1) AND death happened before (or at) the readmission endpoint
      is_competing_risk = (death_event == 1) & (death_time <= readmit_time),
      # 1. Update Time: If competing risk, cut follow-up at death time. Otherwise keep original.
      readmit_time_from_disch_m = dplyr::if_else(is_competing_risk, death_time, readmit_time),
      # 2. Update Event: If competing risk, censor readmission (set to 0). Otherwise keep original.
      readmit_event = dplyr::if_else(is_competing_risk, 0, readmit_event)
    ) %>%
    # Remove temporary helper columns if you want to keep the dataset clean
    dplyr::select(-readmit_time, -death_time, -is_competing_risk)
  return(df)
})

# 1. Define the destination folder
output_dir <- file.path(wdpath, "data/20241015_out/pred1")

# 2. Iterate and Export
# iwalk automatically gives you the object (.x) and its index (.y)
purrr::iwalk(corrected_datasets, function(df, i) {
  # Construct the full file path
  # e.g. .../data/20241015_out/imputation_1.parquet
  file_name <- file.path(output_dir, paste0("imputation_", i, ".parquet"))
  # Export using rio
  rio::export(df, file_name)
  message(paste("Saved:", file_name))
})

Saved: G:/My Drive/Alvacast/SISTRAT 2023///data/20241015_out/pred1/imputation_1.parquet

Saved: G:/My Drive/Alvacast/SISTRAT 2023///data/20241015_out/pred1/imputation_2.parquet

Saved: G:/My Drive/Alvacast/SISTRAT 2023///data/20241015_out/pred1/imputation_3.parquet

Saved: G:/My Drive/Alvacast/SISTRAT 2023///data/20241015_out/pred1/imputation_4.parquet

Saved: G:/My Drive/Alvacast/SISTRAT 2023///data/20241015_out/pred1/imputation_5.parquet

Code
#Export list
rio::export(processed_datasets, paste0(file.path(wdpath, "data/20241015_out/pred1"),"/processed_datasets.Rds"))

rio::export(corrected_datasets, paste0(file.path(wdpath, "data/20241015_out/pred1"),"/corrected_datasets.Rds"))
Define vector to discard dummy levels
Map dummification
Export databases

Session info

Code
#|echo: true
#|error: true
#|message: true
#|paged.print: true
message(paste0("R library: ", Sys.getenv("R_LIBS_USER")))

R library: G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32

Code
message(paste0("Date: ",withr::with_locale(new = c('LC_TIME' = 'C'), code =Sys.time())))

Date: 2026-01-05 20:48:54.721862

Code
message(paste0("Editor context: ", path))

Editor context: G:/My Drive/Alvacast/SISTRAT 2023/data/20241015_out/251001_c1_1025_df_prev1y.rds

Code
cat("quarto version: "); quarto::quarto_version()
quarto version: 
[1] '1.7.29'
Code
sesion_info <- devtools::session_info()

Warning in system2(“quarto”, “-V”, stdout = TRUE, env = paste0(“TMPDIR=”, : el comando ejecutado ‘“quarto” TMPDIR=C:/Users/andre/AppData/Local/Temp/RtmpwlZmSq/file50ac14525543 -V’ tiene el estatus 1

Code
dplyr::select(
  tibble::as_tibble(sesion_info$packages),
  c(package, loadedversion, source)
)|> 
  DT::datatable(filter = 'top', colnames = c('Row number' =1,'Package' = 2, 'Version'= 3),
              caption = htmltools::tags$caption(
        style = 'caption-side: top; text-align: left;',
        '', htmltools::em('R packages')),
      options=list(
initComplete = htmlwidgets::JS(
        "function(settings, json) {",
        "$(this.api().tables().body()).css({
            'font-family': 'Helvetica Neue',
            'font-size': '70%', 
            'code-inline-font-size': '15%', 
            'white-space': 'nowrap',
            'line-height': '0.75em',
            'min-height': '0.5em'
            });",
        "}")))
Code
#|echo: true
#|error: true
#|message: true
#|paged.print: true
#|class-output: center-table

reticulate::py_list_packages()|> 
  DT::datatable(filter = 'top', colnames = c('Row number' =1,'Package' = 2, 'Version'= 3),
              caption = htmltools::tags$caption(
        style = 'caption-side: top; text-align: left;',
        '', htmltools::em('Python packages')),
      options=list(
initComplete = htmlwidgets::JS(
        "function(settings, json) {",
        "$(this.api().tables().body()).css({
            'font-family': 'Helvetica Neue',
            'font-size': '70%', 
            'code-inline-font-size': '15%', 
            'white-space': 'nowrap',
            'line-height': '0.75em',
            'min-height': '0.5em'
            });",
        "}"))) 

Warning in system2(python, args, stdout = TRUE): el comando ejecutado ‘“G:/My Drive/Alvacast/SISTRAT 2023/.mamba_root/envs/py311/python.exe” -m pip freeze’ tiene el estatus 1

Save

Code
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
envpath<- if(regmatches(wdpath, regexpr("[A-Za-z]+", wdpath))=="G"){"G:/Mi unidad/Alvacast/SISTRAT 2023/"}else{"E:/Mi unidad/Alvacast/SISTRAT 2023/"}

paste0(getwd(),"/cons")
file.path(paste0(wdpath,"data/20241015_out"))
file.path(paste0(envpath,"data/20241015_out"))

# Save
rdata_path <- file.path(wdpath, "data/20241015_out", paste0("pred1_ndp_", format(Sys.time(), "%Y_%m_%d"), ".Rdata"))

save.image(rdata_path)
cat("Saved in:",
    rdata_path)

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
if (Sys.getenv("RSTUDIO_SESSION_TYPE") == "server" || file.exists("/.dockerenv")) {
  password <- Sys.getenv("PASSWORD_SECRET")
} else {
  if (interactive()) {
    utils::savehistory(tempfile())
    Sys.setenv(PASSWORD_SECRET = readLines(paste0(wdpath, "secret.txt"), warn = FALSE))
    utils::loadhistory()
  }
  Sys.setenv(PASSWORD_SECRET = readLines(paste0(wdpath, "secret.txt"), warn = FALSE))
}

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
save.image(paste0(rdata_path,".enc"))

# Encriptar el archivo en el mismo lugar
httr2::secret_encrypt_file(path = paste0(rdata_path,".enc"), key = "PASSWORD_SECRET")

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("Copy renv lock into cons folder\n")

if (Sys.getenv("RSTUDIO_SESSION_TYPE") == "server" || file.exists("/.dockerenv")) {
  message("Running on RStudio Server or inside Docker. Folder copy skipped.")

} else {
    
  source_folder <- 
  destination_folder <- paste0(wdpath,"cons/renv")
  
  # Copy the folder recursively
    file.copy(paste0(wdpath,"renv.lock"), paste0(wdpath,"cons/renv.lock"), overwrite = TRUE)
  
  message("Renv lock copy performed.")
}

Renv lock copy performed.

Code
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
time_after_dedup2<-Sys.time()

paste0("Time in markdown: ");time_after_dedup2-time_before_dedup2
[1] "G:/My Drive/Alvacast/SISTRAT 2023/cons/cons"
[1] "G:/My Drive/Alvacast/SISTRAT 2023//data/20241015_out"
[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/data/20241015_out"
Saved in: G:/My Drive/Alvacast/SISTRAT 2023///data/20241015_out/pred1_ndp_2026_01_05.RdataCopy renv lock into cons folder
[1] "Time in markdown: "
Time difference of 1.467835 hours
Back to top