SENDAs Agreement 1 Update 2010-2022 (Prediction, step 1)
The dataset is enriched with derived variables, clinically meaningful recodings, and grouped diagnostic categories, while excluding patients still in treatment or with truncated admission dates. This results in a person-level dataset suitable for semi-competing risks modeling, where each row represents a unique index treatment episode with associated covariates and event times. This step involves mainly Data Pre-processing and Feature Engineering.
Data Loading and Exploration
Loading Packages and uniting databases
Code
# invisible("Only run from Ubuntu")
# if (!(Sys.getenv("RSTUDIO_SESSION_TYPE") == "server" || file.exists("/.dockerenv"))) {
# if(Sys.info()["sysname"]!="Windows"){
# Sys.setenv(RETICULATE_PYTHON = "/home/fondecytacc/.pyenv/versions/3.11.5/bin/python")
# }
# }
#clean enviroment
rm(list = ls()); gc()
time_before_dedup2<-Sys.time()
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# Busca .mamba_root/envs/py311/python.exe desde getwd() hacia padres
find_python_rel <- function(start = getwd(),
rel = file.path(".mamba_root","envs","py311","python.exe")) {
cur <- normalizePath(start, winslash = "/", mustWork = FALSE)
repeat {
cand <- normalizePath(file.path(cur, rel), winslash = "/", mustWork = FALSE)
if (file.exists(cand)) return(cand)
parent <- dirname(cur)
if (identical(parent, cur)) return(NA_character_) # llegó a la raíz
cur <- parent
}
}
# --- Bootstrap reticulate con ruta relativa a getwd() ---
if(Sys.info()["sysname"]!="Windows"){
Sys.setenv(RETICULATE_PYTHON = "usr/bin/python3")
reticulate::py_config()
} else {
py <- find_python_rel()
if (is.na(py)) {
stop("No se encontró Python relativo a getwd() (buscando '.mamba_root/envs/py311/python.exe').\n",
"Directorio actual: ", getwd())
}
# Forzar ese intérprete
Sys.unsetenv(c("RETICULATE_CONDAENV","RETICULATE_PYTHON_FALLBACK"))
Sys.setenv(RETICULATE_PYTHON = py)
reticulate::use_python(py, required=T)
reticulate::py_config() # verificación
}
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
# --- Load 250927_c1_1025_df_prev1y.rds (simple, project-root aware, no functions) ---
# 1) Locate the file anywhere under the project root (current WD with a trailing "/cons" trimmed)
project_root <- sub("(/)?cons/?$", "", normalizePath(getwd(), winslash = "/", mustWork = FALSE))
fname <- "251001_c1_1025_df_prev1y.rds"
hits <- list.files(project_root, pattern = paste0("^", fname, "$"),
recursive = TRUE, full.names = TRUE)
if (!length(hits)) stop("File not found: ", fname, "\nSearched under: ", project_root)
path <- hits[1] # take first match
message("Loading: ", path)
# 2) Read into a dedicated environment (keeps .GlobalEnv clean)
env_prev1y <- new.env(parent = emptyenv())
obj <- readRDS(path)
assign("SISTRAT23_c1_2010_2024_df_prev1y", obj, envir = env_prev1y)
rm(obj)
# 3) Quick sanity check
with(env_prev1y$SISTRAT23_c1_2010_2024_df_prev1y, { message("Rows: ", nrow(env_prev1y$SISTRAT23_c1_2010_2024_df_prev1y), " | Cols: ", ncol(env_prev1y$SISTRAT23_c1_2010_2024_df_prev1y)); print(str(env_prev1y$SISTRAT23_c1_2010_2024_df_prev1y))})
SISTRAT23_c1_2010_2024_df_prev1y<-
env_prev1y$SISTRAT23_c1_2010_2024_df_prev1y
rm(env_prev1y)
# (Optional) If you want it in the global env, uncomment:
# list2env(env_prev1y, envir = .GlobalEnv) used (Mb) gc trigger (Mb) max used (Mb)
Ncells 604456 32.3 1317361 70.4 961116 51.4
Vcells 1160482 8.9 8388608 64.0 1876213 14.4
python: G:/My Drive/Alvacast/SISTRAT 2023/.mamba_root/envs/py311/python.exe
libpython: G:/My Drive/Alvacast/SISTRAT 2023/.mamba_root/envs/py311/python311.dll
pythonhome: G:/My Drive/Alvacast/SISTRAT 2023/.mamba_root/envs/py311
version: 3.11.5 | packaged by conda-forge | (main, Aug 27 2023, 03:23:48) [MSC v.1936 64 bit (AMD64)]
Architecture: 64bit
numpy: [NOT FOUND]
NOTE: Python version was forced by RETICULATE_PYTHON
Classes 'tidytable', 'tbl', 'data.table' and 'data.frame': 162897 obs. of 125 variables:
$ TABLE : chr "2023" "2015" "2023" "2013" ...
$ TABLE_rec_series : chr "||20232||" "||20151||" "||20231||" "||2013||" ...
$ rn : int 240474 66234 225781 44305 69194 111875 235652 213374 14494 27258 ...
$ rn_series : chr NA NA NA NA ...
$ num_trat_ant : num 0 1 0 1 2 2 0 0 0 1 ...
$ fecha_ultimo_tratamiento : chr NA "3 a 4 anos" NA "1 a 2 anos" ...
$ hash_key : chr "00002d6e6b95298ec21341639f62bbf9a98cd51e1b8eb858c7df1e9680e5b22b" "000068521f2408c3c6d49b287428d89bad9c198726cea177017d649d8a66c001" "0000773f8535cc999b521b1d6c9784df95dd7bf19b68e1ae94da41114d45bc59" "00015ab15b7747293d16142f7cd40abc73495f0a7390a71c14fe37265f5efde2" ...
$ min_adm_age_rec3 : num 57.9 31.5 38.9 20.6 20.6 ...
$ adm_age_rec3 : num 57.9 31.5 38.9 20.6 21.7 ...
$ birth_date_rec : Date, format: "1965-12-18" "1982-05-23" ...
$ adm_date_num_rec2 : num 19675 16041 19187 15974 16364 ...
$ adm_date_rec2 : Date, format: "2023-11-14" "2013-12-02" ...
$ dit_rec6 : num 20 485 365 176 199 36 211 302 15 217 ...
$ disch_date_rec6 : Date, format: "2023-12-04" "2015-04-01" ...
$ disch_date_num_rec6_trans : num 19695 16526 19552 16150 16563 ...
$ def_date : Date, format: NA NA ...
$ adm_motive : chr "spontaneous consultation" "sanitary sector" "other" "spontaneous consultation" ...
$ tr_compliance_rec7 : chr "early dropout" "referral" "completion" "late dropout" ...
$ adm_disch_reason : chr NA NA NA NA ...
$ referral_type : chr NA "other facility" NA NA ...
$ plan_type : chr "pg-pab" "pg-pab" "pg-pai" "pg-pab" ...
$ id_centro : num 255 330 501 489 489 489 409 841 341 212 ...
$ senda : chr "si" "si" "si" "si" ...
$ pub_center : Factor w/ 2 levels "FALSE","TRUE": 1 2 2 2 2 2 2 2 1 2 ...
$ primary_sub : chr "alcohol" "alcohol" "marijuana" "cocaine paste" ...
$ second_sub1 : chr NA NA NA "marijuana" ...
$ second_sub2 : chr NA NA NA "alcohol" ...
$ second_sub3 : chr NA NA NA "tranquilizers/hypnotics" ...
$ prim_sub_freq : chr "3. 2 to 3 days a week" "3. 2 to 3 days a week" "3. 2 to 3 days a week" "5. Daily" ...
$ prim_sub_route : chr "Oral (drunk or eaten)" "Oral (drunk or eaten)" "Smoked or pulmonary aspiration" "Smoked or pulmonary aspiration" ...
$ LB_age_primary_onset_rec2 : int NA NA NA 10 10 10 NA NA 9 9 ...
$ UB_age_primary_onset_rec2 : int NA NA NA 19 19 19 NA NA 41 41 ...
$ age_primary_onset_rec2 : num 14 13 13 17.7 17.7 ...
$ first_sub_used : chr "alcohol" "alcohol" "marijuana" "alcohol" ...
$ sus_ini_mod_mvv : Factor w/ 5 levels "cocaine paste",..: 3 3 4 3 3 3 4 3 3 3 ...
$ sus_ini_1 : chr NA NA NA NA ...
$ sus_ini_2 : chr NA NA NA NA ...
$ sus_ini_3 : chr NA NA NA NA ...
$ LB_age_subs_onset_rec2 : int NA NA NA 5 5 5 NA NA 5 5 ...
$ UB_age_subs_onset_rec2 : int NA NA NA 19 19 19 NA NA 41 41 ...
$ age_subs_onset_rec2 : int 14 13 13 10 10 10 36 13 9 9 ...
$ biopsych_comp : chr "2-moderate" "2-moderate" "1-mild" "2-moderate" ...
$ mod_psiq_cie_10 : chr NA "retraso mental::NA" "trastornos neuroticos, secundarios a situaciones estresantes y somatomorfos::NA" NA ...
$ mod_psiq_dsm_iv : chr NA NA NA NA ...
$ diagnostico_trs_fisico : chr "en estudio" "en estudio" "en estudio" "en estudio" ...
$ otros_probl_at_sm_or : chr NA "otros" "abuso sexual" "sin otros problemas de salud mental" ...
$ sub_dep_icd10_status : chr "hazardous consumption" "drug dependence" "hazardous consumption" "hazardous consumption" ...
$ evaluacindelprocesoteraputico: chr "logro minimo" "logro alto" "logro alto" "logro minimo" ...
$ eva_consumo : chr "logro minimo" "logro alto" "logro alto" "logro minimo" ...
$ eva_fam : chr "logro minimo" "logro intermedio" "logro intermedio" "logro minimo" ...
$ eva_relinterp : chr "logro minimo" "logro alto" "logro intermedio" "logro minimo" ...
$ eva_ocupacion : chr "logro minimo" "logro alto" "logro alto" "logro minimo" ...
$ eva_sm : chr "logro minimo" "logro intermedio" "logro alto" "logro minimo" ...
$ eva_fisica : chr "logro minimo" "logro alto" "logro alto" "logro minimo" ...
$ eva_transgnorma : chr "logro minimo" "logro alto" "logro alto" "logro minimo" ...
$ dg_global_nec_int_soc_or : chr "altas" NA "bajas" NA ...
$ dg_nec_int_soc_cap_hum_or : chr "altas" NA "bajas" NA ...
$ dg_nec_int_soc_cap_fis_or : chr "altas" NA "bajas" NA ...
$ dg_nec_int_soc_cap_soc_or : chr "altas" NA "bajas" NA ...
$ dg_global_nec_int_soc_egr_or : chr "altas" "medias" NA NA ...
$ dg_nec_int_soc_cap_hum_egr_or: chr "altas" "medias" "bajas" NA ...
$ dg_nec_int_soc_cap_fis_egr_or: chr "altas" "medias" "bajas" NA ...
$ dg_nec_int_soc_cap_soc_egr_or: chr "altas" "bajas" "medias" NA ...
$ usuario_tribunal_trat_droga : chr "no" "no" "no" "no" ...
$ nationality_cons : chr "chile" "chile" "chile" "chile" ...
$ ethnicity_c1_c6_historic : chr NA NA NA NA ...
$ discapacidad : chr "no" NA "no" NA ...
$ opcion_discapacidad : chr NA NA NA NA ...
$ sex_rec : chr "hombre" "hombre" "mujer" "hombre" ...
$ identidad_de_genero : chr "masculino" NA "femenino" NA ...
$ orientacion_sexual : chr "heterosexual" NA "heterosexual" NA ...
$ pregnant : chr "no" NA "no" NA ...
$ pregnant_disch : chr "no" NA "no" NA ...
$ marital_status : chr "married/cohabiting" "single" "married/cohabiting" "single" ...
$ tiene_menores_de_edad_a_cargo: chr "no" NA "si" NA ...
$ num_hijos_trat_res : num 0 0 0 0 0 0 0 0 1 0 ...
$ numero_de_hijos : num 2 0 2 0 0 0 4 2 1 0 ...
$ con_quien_vive : chr "con la pareja, hijos y padres o familia de origen" "solo" "unicamente con la pareja e hijos" "unicamente con padres o familia de origen" ...
$ tipo_de_vivienda : chr "casa" "mediagua" "casa" "casa" ...
$ precariedad_vivienda : chr "reside en una vivienda en buen estado" NA "reside en una vivienda en buen estado" NA ...
$ tenure_status_household : chr "owner/transferred dwellings/pays dividends" "stays temporarily with a relative" "illegal settlement" "owner/transferred dwellings/pays dividends" ...
$ servicios_basicos_95 : chr "reside en una vivienda con servicios sanitarios basicos" NA "reside en una vivienda con servicios sanitarios basicos" NA ...
$ perso_dormitorio_vivienda : chr "menor a 2,5 personas" NA "menor a 2,5 personas" NA ...
$ ed_attainment : chr "3-Completed primary school or less" "2-Completed high school or less" "2-Completed high school or less" "3-Completed primary school or less" ...
$ occupation_condition : chr "employed" "unemployed" "employed" "employed" ...
$ occupation_status : chr NA NA NA "salaried" ...
$ rubro_trabaja : chr NA NA NA "trabajadores no calificados (ocupaciones elementales)" ...
$ laboral_ingresos : chr "300.001 a 400.000" NA "400.001 a 500.000" NA ...
$ yr_block : num 1965 1980 1980 1990 1990 ...
$ OBS_series : chr NA NA NA NA ...
$ senda_series : chr NA NA NA NA ...
$ pub_center_series : chr NA NA NA NA ...
$ id_centro_series : chr NA NA NA NA ...
$ disch_date_num_rec6 : num 19695 16526 19552 16150 16563 ...
$ dg_psiq_cie_10_instudy : logi FALSE FALSE FALSE FALSE FALSE TRUE ...
$ dg_psiq_cie_10_dg : logi FALSE TRUE TRUE FALSE FALSE FALSE ...
$ dg_psiq_dsm_iv_instudy : logi FALSE FALSE FALSE TRUE TRUE TRUE ...
$ dg_psiq_dsm_iv_dg : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ plan_type_series : chr "||pg-pab||" "||pg-pab||" "||pg-pai||" "||pg-pab||" ...
[list output truncated]
- attr(*, ".internal.selfref")=<externalptr>
NULL
Code
#https://github.com/rstudio/renv/issues/544
#renv falls back to copying rather than symlinking, which is evidently very slow in this configuration.
renv::settings$use.cache(FALSE)
#only use explicit dependencies (in DESCRIPTION)
renv::settings$snapshot.type("implicit")
#check if rstools is installed
if(Sys.info()["sysname"]=="Windows"){
try(installr::install.Rtools(check_r_update=F))
}Code
check_quarto_version <- function(required = "1.7.29", comparator = c("ge","gt","le","lt","eq")) {
comparator <- match.arg(comparator)
current <- package_version(paste(unlist(quarto::quarto_version()), collapse = "."))
req <- package_version(required)
ok <- switch(comparator,
ge = current >= req,
gt = current > req,
le = current <= req,
lt = current < req,
eq = current == req)
if (!ok) {
stop(sprintf("Quarto version check failed: need %s %s (installed: %s).",
comparator, required, current), call. = FALSE)
}
invisible(TRUE)
}
check_quarto_version("1.7.29", "ge")
#change repository to CL
local({
r <- getOption("repos")
r["CRAN"] <- "https://cran.dcc.uchile.cl/"
options(repos=r)
})
if(!require(pacman)){install.packages("pacman");require(pacman)}Code
if(!require(pak)){install.packages("pak");require(pak)}Code
pacman::p_unlock(lib.loc = .libPaths()) #para no tener problemas reinstalando paquetesCode
if(Sys.info()["sysname"]=="Windows"){
if (getRversion() != "4.4.1") { stop("Requires R version 4.4.1; Actual: ", getRversion()) }
}
#check docker
check_docker_running <- function() {
# Try running 'docker info' to check if Docker is running
system("docker info", intern = TRUE, ignore.stderr = TRUE)
}
if(Sys.info()["sysname"]=="Windows"){
install_docker <- function() {
# Open the Docker Desktop download page in the browser for installation
browseURL("https://www.docker.com/products/docker-desktop")
}
# Main logic
if (inherits(try(check_docker_running(), silent = TRUE), "try-error")) {
liftr::install_docker()
} else {
message("Docker is running.")
}
}Warning in system(“docker info”, intern = TRUE, ignore.stderr = TRUE): el comando ejecutado ‘docker info’ tiene el estatus 1
Code
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#PACKAGES#######################################################################
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
unlink("*_cache", recursive=T)
# ----------------------------------------------------------------------
# 2. Use a single pak::pkg_install() call for most CRAN packages
# ----------------------------------------------------------------------
paks <-
c(#"git",
# To connect to github
"gh", #interface for GitHub API from R
#
"gitcreds", # manages Git credentials (usernames, passwords, tokens)
#
"usethis", # simplifies common project setup tasks for R developers
# Package to bring packages in development
"devtools",
# Package administration
"renv",
# To manipulate data
"knitr", "pander", "DT",
# Join
"fuzzyjoin", "RecordLinkage",
# For tables
"tidyverse", "janitor",
# For contingency tables
"kableExtra",
# For connections with python
"reticulate",
# To manipulate big data
"polars", "sqldf",
# To bring big databases
"nanoparquet",
# Interface for R and RStudio in R
"installr", "rmarkdown", "quarto", "yaml", #"rstudioapi",
# Time handling
"clock",
# Combine plots
"ggpubr",
# Parallelized iterative processing
"furrr",
# Work like a tibble with a data.table database
"tidytable",
# Split database into training and testing
"caret",
# Impute missing data
"missRanger", "mice",
# To modularize tasks
"job",
# For PhantomJS install checks
"webshot"
)
# dplyr
# janitor
# reshape2
# tidytable
# arrow
# boot
# broom
# car
# caret
# data.table
# DiagrammeR
# DiagrammeRsvg
# dplyr
# epiR
# epitools
# ggplot2
# glue
# htmlwidgets
# knitr
# lubridate
# naniar
# parallel
# polycor
# pROC
# psych
# readr
# rio
# rsvg
# scales
# stringr
# tableone
# rmarkdown
# biostat3
# codebook
# finalfit
# Hmisc
# kableExtra
# knitr
# devtools
# tidyr
# stringi
# stringr
# muhaz
# sqldf
# compareGroups
# survminer
# lubridate
# ggfortify
# car
# fuzzyjoin
# compareGroups
# caret
# job
# htmltools
# nanoparquet
# ggpubr
# polars
# installr
# clock
# pander
# reshape
# mice
# missRanger
# VIM
# withr
# biostat3
# broom
# glue
# finalfit
# purrr
# sf
# pak::pkg_install(paks)
pak::pak_sitrep()
# pak::sysreqs_check_installed(unique(unlist(paks)))
#pak::lockfile_create(unique(unlist(paks)), "dependencies_duplicates24.lock", dependencies=T)
#pak::lockfile_install("dependencies_duplicates24.lock")
#https://rdrr.io/cran/pak/man/faq.html
#pak::cache_delete()
library(tidytable)Code
library(ggplot2)
library(readr)
library(glmulti)Code
library(tableone)
library(survivalmodels)
#renv::install("patchwork@1.2.0")
library(survex)
library(SemiMarkov)Code
library(flexsurv)Code
library(rms)Code
library(survidm)
library(caret)Code
# library(shapr)#https://norskregnesentral.github.io/shapr/
# library(survidm)#https://sci-hub.ee/10.1002/bimj.201700200
# library(SemiMarkov)#https://www.degruyterbrill.com/document/doi/10.1515/ijb-2020-0083/html?lang=en
# #https://www.jclinepi.com/article/S0895-4356(24)00142-2/fulltext
# ----------------------------------------------------------------------
# 3. Activate polars code completion (safe to try even if it fails)
# ----------------------------------------------------------------------
#try(polars_code_completion_activate())
# ----------------------------------------------------------------------
# 4. BPMN from GitHub (not on CRAN, so install via devtools if missing)
# ----------------------------------------------------------------------
if (!requireNamespace("bpmn", quietly = TRUE)) {
devtools::install_github("bergant/bpmn")
}
# ----------------------------------------------------------------------
# 5. PhantomJS Check (use webshot if PhantomJS is missing)
# ----------------------------------------------------------------------
# if (!webshot::is_phantomjs_installed()) {
# webshot::install_phantomjs()
# }
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#FUNCTIONS######################################################################
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#
# NO MORE DEBUGS
options(error = NULL) # si antes tenías options(error = recover) o browser)
options(browserNLdisabled = FALSE)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#
#NAs are replaced with "" in knitr kable
options(knitr.kable.NA = '')
pander::panderOptions('big.mark', ',')
pander::panderOptions('decimal.mark', '.')
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#
#to format rows in bold
format_cells <- function(df, rows ,cols, value = c("italics", "bold", "strikethrough")){
# select the correct markup
# one * for italics, two ** for bold
map <- setNames(c("*", "**", "~~"), c("italics", "bold", "strikethrough"))
markup <- map[value]
for (r in rows){
for(c in cols){
# Make sure values are not factors
df[[c]] <- as.character( df[[c]])
# Update formatting
df[r, c] <- ifelse(nchar(df[r, c])==0,"",paste0(markup, gsub(" ", "", df[r, c]), markup))
}
}
return(df)
}
#To produce line breaks in messages and warnings
knitr::knit_hooks$set(
error = function(x, options) {
paste('\n\n<div class="alert alert-danger" style="font-size: small !important;">',
gsub('##', '\n', gsub('^##\ Error', '**Error**', x)),
'</div>', sep = '\n')
},
warning = function(x, options) {
paste('\n\n<div class="alert alert-warning" style="font-size: small !important;">',
gsub('##', '\n', gsub('^##\ Warning:', '**Warning**', x)),
'</div>', sep = '\n')
},
message = function(x, options) {
paste('<div class="message" style="font-size: small !important;">',
gsub('##', '\n', x),
'</div>', sep = '\n')
}
)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#
sum_dates <- function(x){
cbind.data.frame(
min= as.Date(min(unclass(as.Date(x)), na.rm=T), origin = "1970-01-01"),
p001= as.Date(quantile(unclass(as.Date(x)), .001, na.rm=T), origin = "1970-01-01"),
p005= as.Date(quantile(unclass(as.Date(x)), .005, na.rm=T), origin = "1970-01-01"),
p025= as.Date(quantile(unclass(as.Date(x)), .025, na.rm=T), origin = "1970-01-01"),
p25= as.Date(quantile(unclass(as.Date(x)), .25, na.rm=T), origin = "1970-01-01"),
p50= as.Date(quantile(unclass(as.Date(x)), .5, na.rm=T), origin = "1970-01-01"),
p75= as.Date(quantile(unclass(as.Date(x)), .75, na.rm=T), origin = "1970-01-01"),
p975= as.Date(quantile(unclass(as.Date(x)), .975, na.rm=T), origin = "1970-01-01"),
p995= as.Date(quantile(unclass(as.Date(x)), .995, na.rm=T), origin = "1970-01-01"),
p999= as.Date(quantile(unclass(as.Date(x)), .999, na.rm=T), origin = "1970-01-01"),
max= as.Date(max(unclass(as.Date(x)), na.rm=T), origin = "1970-01-01")
)
}
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# Define the function adapted for Polars
sum_dates_polars <- function(df, date_col) {
# Create the list of quantiles
quantiles <- c(0.001, 0.005, 0.025, 0.25, 0.5, 0.75, 0.975, 0.995, 0.999)
# Create expressions to calculate min and max
expr_list <- list(
pl$col(date_col)$min()$alias("min"),
pl$col(date_col)$max()$alias("max")
)
# Add expressions for quantiles
for (q in quantiles) {
expr_list <- append(expr_list, pl$col(date_col)$quantile(q)$alias(paste0("p", sub("\\.", "", as.character(q)))))
}
# Apply the expressions and return a DataFrame with the results
df$select(expr_list)
}
# Custom function for sampling with a seed
sample_n_with_seed <- function(data, size, seed) {
set.seed(seed)
dplyr::sample_n(data, size)
}
# Function to get the most frequent value
most_frequent <- function(x) {
uniq_vals <- unique(x)
freq_vals <- sapply(uniq_vals, function(val) sum(x == val))
most_freq <- uniq_vals[which(freq_vals == max(freq_vals))]
if (length(most_freq) == 1) {
return(most_freq)
} else {
return(NA)
}
}
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#CONFIG #######################################################################
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
options(scipen=2) #display numbers rather scientific number
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
# Define the function first
#joins these values with semicolons and optionally truncates the result if it exceeds a specified width.
toString2 <- function(x, width = NULL, ...) {
string <- paste(x, collapse = "; ")
if (missing(width) || is.null(width) || width == 0)
return(string)
if (width < 0)
stop("'width' must be positive")
if (nchar(string, type = "w") > width) {
width <- max(6, width)
string <- paste0(substr(string, 1, width - 3), "...")
}
string
}
normalize_txt <- function(x) {
x|>
stringi::stri_trans_general("Latin-ASCII")|>
tolower()|>
trimws()
}
pair_str <- function(main, sub) {
main <- as.character(main)
sub <- as.character(sub)
both_na <- is.na(main) & is.na(sub)
out <- paste0(
tidytable::coalesce(main, "NA"),
"::",
tidytable::coalesce(sub, "NA")
)
out[both_na] <- NA_character_
out
}
# ── Helpers ────────────────────────────────────────────────────────────
mode_pick_int <- function(x){
x <- x[!is.na(x)]
if(length(x)==0) return(NA_integer_)
tx <- sort(table(x), decreasing = TRUE)
as.integer(names(tx)[1L])
}
subkey_to_label <- function(x){
y <- gsub("_"," ", tolower(x))
y <- gsub("amphetamine type stimulants","amphetamine-type stimulants", y)
y <- gsub("tranquilizers hypnotics","tranquilizers/hypnotics", y)
y
}
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# ── Encryption ────────────────────────────────────────────────────────
# Pack: [24-byte nonce][ciphertext] -> hex
encrypt_chr <- function(x, key) {
vapply(x, function(v) {
if (is.na(v)) return(NA_character_)
ct <- sodium::data_encrypt(charToRaw(as.character(v)), key)
nonce <- attr(ct, "nonce") # 24 bytes
sodium::bin2hex(c(nonce, ct)) # pack nonce + ct
}, FUN.VALUE = character(1))
}
# Unpack: hex -> [nonce|ciphertext], restore attr(nonce), then decrypt
decrypt_chr <- function(x, key) {
vapply(x, function(v) {
if (is.na(v)) return(NA_character_)
buf <- sodium::hex2bin(v)
if (length(buf) < 25L) stop("Ciphertext too short or corrupted.")
nonce <- buf[1:24]
ct <- buf[-(1:24)]
attr(ct, "nonce") <- nonce
rawToChar(sodium::data_decrypt(ct, key))
}, FUN.VALUE = character(1))
}
# Function to encrypt a character vector
encrypt_column <- function(x, key) {
sapply(x, function(item) {
if (is.na(item) || item == "") {
return(NA_character_)
}
encrypted_raw <- sodium::data_encrypt(charToRaw(item), key)
base64enc::base64encode(encrypted_raw) # Convert to base64 for storage
}, USE.NAMES = FALSE)
}
decrypt_column <- function(x, key) {
sapply(x, function(item) {
if (is.na(item)) return(NA_character_)
encrypted_raw <- base64enc::base64decode(item)
rawToChar(sodium::data_decrypt(encrypted_raw, key))
}, USE.NAMES = FALSE)
}
is_stata_ok <- function(x) {
nchar(x) <= 32 & grepl("^[A-Za-z][A-Za-z0-9_]*$", x)
}
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# ── Tidy RMS ────────────────────────────────────────────────────────
tidy_cph <- function(model) {
if (!inherits(model, "cph")) stop("Model must be an rms::cph object")
# run summary
s <- summary(model)
df_s <- as.data.frame(unclass(s))
df_s$rown <- rownames(s)
# find rows that are "Hazard Ratio"
hr_rows <- trimws(df_s$rown) == "Hazard Ratio"
# build tidy tibble
tibble::tibble(
term = trimws(rownames(s)[which(hr_rows) - 1L]), # previous row = variable name
estimate = df_s$Effect[hr_rows], # Hazard Ratio
conf.low = df_s$`Lower 0.95`[hr_rows],
conf.high= df_s$`Upper 0.95`[hr_rows],
p.value = df_s$P[hr_rows]
)
}
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# --- Step 1: Define File Paths ---
# SET THIS to the correct path of your secret text file
secret_file_path <- "C:/Users/andre/Documents/secret.txt"
# This locates the user-level .Renviron file automatically
renviron_path <- file.path(Sys.getenv("HOME"), ".Renviron")
# --- Step 2: Read the Secret Key from the .txt file ---
# Read the first line of the file and trim any whitespace
api_key <- trimws(readLines(secret_file_path, n = 1, warn = FALSE))
# --- Step 3: Check if the key is already in .Renviron ---
# Read all existing lines from .Renviron (if it exists)
if (file.exists(renviron_path)) {
existing_lines <- readLines(renviron_path, warn = FALSE)
} else {
existing_lines <- c()
}
# Check if a line for OPENAI_API_KEY already exists
key_exists <- any(grepl("^OPENAI_API_KEY=", existing_lines))
# --- Step 4: Write to .Renviron if the key isn't already there ---
if (!key_exists) {
# Format the line to be added
line_to_add <- sprintf('OPENAI_API_KEY="%s"', api_key)
# Append the line to the .Renviron file, adding a newline before it if the file isn't empty
cat(line_to_add, "\n", file = renviron_path, append = TRUE)
message("✅ Success! OPENAI_API_KEY has been added to your .Renviron file.")
message("Please restart your R session for the change to take effect.")
} else {
message("ℹ️ The OPENAI_API_KEY already exists in your .Renviron file. No changes were made.")
}Code
options(.gander_chat = ellmer::chat_openai(model = "o1-mini"))Error in contrib.url(repos, "source") :
trying to use CRAN without setting a mirror
* pak version:
- 0.8.0.1
* Version information:
- pak platform: x86_64-w64-mingw32 (current: x86_64-w64-mingw32, compatible)
- pak repository: - (local install?)
* Optional packages installed:
- pillar
* Library path:
- G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32
- C:/Program Files/R/R-4.4.1/library
* pak is installed at G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32/pak.
* Dependency versions:
- callr 3.7.6
- cli 3.6.2
- curl 5.2.1
- desc 1.4.3
- filelock 1.0.3
- jsonlite 1.8.8
- lpSolve 5.6.23.9000
- pkgbuild 1.4.4
- pkgcache 2.2.2.9000
- pkgdepends 0.7.2.9000
- pkgsearch 3.1.3.9000
- processx 3.8.4
- ps 1.7.6
- R6 2.5.1
- zip 2.3.1
* Dependencies can be loaded
Data Pre-processing and Feature Engineering
Outliers in continuous variables will be managed using Tukey’s method, where values beyond 1.5 times the interquartile range (IQR) from the first or third quartile are flagged for review.
The analysis will incorporate 14 predictor variables, including three continuous variables. To model non-linear relationships, we will generate transformed variables, including logarithmic (log(x)), quadratic (x2), and cubic (x^3) terms. Additionally, some continuous variables may be discretized into categorical features.
If required, continuous predictors will be zero-centered and scaled by a factor to improve the model’s computational performance and numerical stability.
Aims
To identify key prognostic and predictive factors associated with the risk of, and time to, the first readmission to treatment among patients aged 18-64 undergoing initial treatment for Substance Use Disorder (SUD) in Chile between 2010 and 2020.
To identify key prognostic and predictive factors associated with the risk of, and time to, all-cause mortality among the same cohort of patients undergoing initial treatment for SUD in Chile.
To predict the risk and timing of first treatment readmission and all-cause mortality simultaneously, treating these outcomes within a semi-competing risks framework. This will allow for the estimation of cumulative incidence for each event in the presence of the other for patients aged 18-64 who initiated SUD treatment in Chile between 2010 and 2020.
This study has three primary objectives. First, we aim to identify key prognostic and predictive factors associated with the risk and timing of two critical outcomes: (i) first readmission to treatment and (ii) all-cause mortality. Second, using a semi-competing risks framework, we will develop a model to simultaneously predict these two outcomes, accounting for the fact that death precludes the possibility of readmission. Finally, this model will be used to estimate the probability of various patient trajectories over time following initial treatment. The study population includes all patients aged 18-64 who began treatment for Substance Use Disorder (SUD) in Chile between 2010 and 2020.
1. Structure of treatments and rules to collapse continuous entries
The variables involved in the analysis are the following:.
- Time to first readmission from admission
readmit_time_from_adm_m/ from dischargereadmit_time_from_disch_min months, and event (readmit_event) - Time to mortality
death_time_from_adm_m/ from dischargedeath_time_from_disch_min months, and event (death_event) Predisposing factors: - Admission age
adm_age_rec3[log1p(x)=adm_age_log, centered=adm_age_c,^2=adm_age_pow2, ^3=adm_age_pow3, three groups=adm_age_rec3_cat] - Sex (often used for biological categorization)
sex_rec - Housing situation
tenure_status_household_rec - Employment status
occupation_condition_corr24 - Marital status
marital_status_rec - Poverty index of the commune of residence
porc_pobr[log1p(x)=porc_pobr_log, centered=porc_pobr_c, six quantile groups=porc_pobr_c_cat6] - Urbanization level of the commune of residence
urbanicity_cat - Educational attainment
ed_attainment_corr - Living arrangement/Cohabitation status
cohabitation
Need factors: * Psychiatric comorbidity dg_psiq_cie_10_instudy + dg_psiq_cie_10_dg * Severity of Substance Use Disorder (SUD) sub_dep_icd10_status * Primary substance of use primary_sub_mod * Frequency of primary substance use at admission prim_sub_freq (recoded: prim_sub_freq_rec) * Polysubstance use polysubstance_strict
Enabling factors: * Length of stay in treatment (months) dit_rec6 [log1p(x)=treat_log, ^2=treat_days_pow2, ^3=treat_days_pow3, <90days=treat_lt_90] * Treatment outcome tr_outcome * Admission motive adm_motive
Suitable for a research context:.
- Court-referred to drug treatment
usuario_tribunal_trat_droga(very unbalanced, ~1% yes) - Type of housing
tipo_de_vivienda_rec
Stratification variables:.
- Treatment modality
plan_type_corr
1.1. Cohort Construction for Time-to-Event Analysis
First, we flagged all qualifying treatment episodes by filtering for patients aged 18–65 with an admission date between 2010 and 2020. To ensure each patient served as their own baseline, we isolated only the first qualifying treatment for each individual, removing any second or subsequent qualifying treatments from the dataset. Using this first episode as the index event, we then built time-to-event variables in months for our outcomes of interest: death and readmission. All observations were administratively censored as of December 31, 2020, to all observations. The final analytical dataset included only the first qualifying treatment per patient, carrying forward the time-to-event information and a counter of any discarded prior treatments.
- What can I do with adm truncated (0.2%), and currently in (0.1%)??
We created a formatted database in wide format for multistate analysis (SISTRAT23_c1_2010_2024_df_prev1z).
Code
# --- Safe helpers (no warnings) ---
safe_min_where_int <- function(x, flag) {
idx <- which(flag %in% TRUE & !is.na(x))
if (length(idx) == 0L) NA_integer_ else as.integer(min(x[idx]))
}
safe_min_where_date <- function(x, flag) {
idx <- which(flag %in% TRUE & !is.na(x))
if (length(idx) == 0L) as.Date(NA) else min(x[idx])
}
safe_min_date <- function(x) {
x <- x[!is.na(x)]
if (length(x) == 0L) as.Date(NA) else min(x)
}
# Windows of interest
.start_date <- as.Date("2010-01-01")
.end_date <- as.Date("2020-12-31")
.censor_date <- as.Date("2020-12-31")
# 0) Start & canonical order + treatment index
df0 <- SISTRAT23_c1_2010_2024_df_prev1y|>
tidytable::ungroup()|>
tidytable::arrange(hash_key, adm_date_rec2, disch_date_rec6)|>
tidytable::mutate(treatment = tidytable::row_number(), .by = hash_key)
n0_cases <- nrow(df0)
n0_runs <- nrow(tidytable::distinct(df0, hash_key))
message("0. INITIAL database, cases: ", formatC(nrow(df0), big.mark=","), sep = "")Code
message("0. INITIAL database, RUNs: " , formatC(nrow(tidytable::distinct(df0, hash_key)), big.mark=","), sep = "")Code
# 1) Qualifying flag (age 18–65 & admission in 2010–2020 inclusive)
df1 <- df0|>
tidytable::mutate(
meets_crit = !is.na(adm_age_rec3) & adm_age_rec3 >= 18 & adm_age_rec3 <= 64 &
!is.na(adm_date_rec2) & adm_date_rec2 >= .start_date & adm_date_rec2 <= .end_date
)
# First qualifying treatment per patient + counter of discarded priors (no warnings)
first_keep_tbl <- df1|>
tidytable::summarise(
first_keep_treat = safe_min_where_int(treatment, meets_crit),
.by = hash_key
)|>
tidytable::mutate(
n_discarded_before = ifelse(is.na(first_keep_treat), NA_integer_,
as.integer(first_keep_treat - 1L))
)
n_patients_meet <- first_keep_tbl|> tidytable::filter(!is.na(first_keep_treat))|> nrow()
message("1. Patients with ≥1 qualifying admission (age 18–64 & 2010–2020): ",
formatC(n_patients_meet, big.mark=","), sep = "")Code
# 2) Keep ONLY qualifying episodes; rank; drop 3rd+
df_q <- df1|>
tidytable::filter(meets_crit)|>
tidytable::arrange(hash_key, adm_date_rec2, disch_date_rec6)|>
tidytable::mutate(q_rank = tidytable::row_number(), .by = hash_key)|>
tidytable::filter(q_rank <= 2L)
n1b_cases <- nrow(df_q)
n1b_patients <- nrow(tidytable::distinct(df_q, hash_key))
message("1b. AFTER filtering to qualifying episodes, cases: ",
formatC(nrow(df_q), big.mark=","), sep = "")Code
message("1b. AFTER filtering to qualifying episodes, RUNs: ",
formatC(nrow(tidytable::distinct(df_q, hash_key)), big.mark=","), sep = "")Code
message("2. AFTER dropping ≥3rd qualifying treatments per patient, cases: ",
formatC(nrow(df_q), big.mark=","), sep = "")Code
message("2. AFTER dropping ≥3rd qualifying treatments per patient, RUNs: ",
formatC(nrow(tidytable::distinct(df_q, hash_key)), big.mark=","), sep = "")Code
# 3) Anchors per patient (warning-free)
anchors <- df_q|>
tidytable::summarise(
first_adm = safe_min_where_date(adm_date_rec2, q_rank == 1L),
first_disch = safe_min_where_date(disch_date_rec6, q_rank == 1L),
second_adm = safe_min_where_date(adm_date_rec2, q_rank == 2L), # NA if no second qualifying
.by = hash_key
)
# Death date per patient (from full data, not only qualifying rows)
death_tbl <- df0|>
tidytable::summarise(death_date = safe_min_date(def_date), .by = hash_key)
# 4) Build per-person time-to-event (no min() calls here)
perperson <- df_q|>
tidytable::distinct(hash_key)|>
tidytable::left_join(anchors, by = "hash_key")|>
tidytable::left_join(death_tbl, by = "hash_key")|>
tidytable::left_join(first_keep_tbl, by = "hash_key")|>
tidytable::mutate(
# Event indicators with censoring at 2020-12-31
readmit_event = as.integer(!is.na(second_adm) & second_adm <= .censor_date),
death_event = as.integer(!is.na(death_date) & death_date <= .censor_date),
# End times for intervals (choose event time or censor date)
readmit_end_from_adm = ifelse(readmit_event == 1L, second_adm, .censor_date),
readmit_end_from_disch = ifelse(readmit_event == 1L, second_adm, .censor_date),
death_end_from_adm = ifelse(death_event == 1L, death_date, .censor_date),
death_end_from_disch = ifelse(death_event == 1L, death_date, .censor_date),
# Durations (months) from first qualifying admission/discharge
readmit_time_from_adm_m =
ifelse(!is.na(first_adm),
as.numeric(lubridate::time_length(lubridate::interval(first_adm, readmit_end_from_adm), "months")),
NA_real_),
readmit_time_from_disch_m =
ifelse(!is.na(first_disch),
as.numeric(lubridate::time_length(lubridate::interval(first_disch, readmit_end_from_disch), "months")),
NA_real_),
death_time_from_adm_m =
ifelse(!is.na(first_adm),
as.numeric(lubridate::time_length(lubridate::interval(first_adm, death_end_from_adm), "months")),
NA_real_),
death_time_from_disch_m =
ifelse(!is.na(first_disch),
as.numeric(lubridate::time_length(lubridate::interval(first_disch, death_end_from_disch), "months")),
NA_real_)
)|>
# Clamp negatives (dirty dates safeguard)
tidytable::mutate(
readmit_time_from_adm_m = ifelse(!is.na(readmit_time_from_adm_m), pmax(readmit_time_from_adm_m, 0), NA_real_),
readmit_time_from_disch_m = ifelse(!is.na(readmit_time_from_disch_m), pmax(readmit_time_from_disch_m, 0), NA_real_),
death_time_from_adm_m = ifelse(!is.na(death_time_from_adm_m), pmax(death_time_from_adm_m, 0), NA_real_),
death_time_from_disch_m = ifelse(!is.na(death_time_from_disch_m), pmax(death_time_from_disch_m, 0), NA_real_)
)|>
tidytable::mutate(discarded_before_flag = as.integer(n_discarded_before > 0L))
# 5) FINAL: keep only the first qualifying treatment row per patient, carry times/counters
SISTRAT23_c1_2010_2024_df_prev1z <-
df_q|>
tidytable::filter(q_rank == 1L)|>
tidytable::left_join(perperson, by = "hash_key")|>
tidytable::select(-meets_crit, -q_rank)|>
(\(df){
n_cases <- nrow(df)
n_runs <- tidytable::distinct(df, hash_key) |> nrow()
red_cases <- nrow(df0) - n_cases
red_runs <- (tidytable::distinct(df0, hash_key) |> nrow()) - n_runs
readmit <- sum(df$readmit_event)
# store (step 3)
n_cases ->> m3_cases
n_runs ->> m3_runs
red_cases ->> m3_cases_red
red_runs ->> m3_runs_red
list(cases=n_cases, runs=n_runs, readmit=readmit, cases_red=red_cases, runs_red=red_runs) ->> m3
message("3. FINAL: first qualifying treatment per patient (with time-to-event), cases: ",
formatC(n_cases, big.mark=","), sep = "")
message("3. FINAL: RUNs: ", formatC(n_runs, big.mark=","), sep = "")
message(" CASES reduction vs initial(prev1y): ", formatC(red_cases, big.mark=","), sep = "")
message(" RUNs reduction vs initial(prev1y): ", formatC(red_runs, big.mark=","), sep = "")
if (n_cases > nrow(df0)) stop("Error: Added treatment episodes in the process")
df
})() |>
tidytable::filter(!tr_compliance_rec7 == "currently in") |>
(\(df){
n_cases <- nrow(df)
n_runs <- tidytable::distinct(df, hash_key) |> nrow()
red_cases <- nrow(df0) - n_cases
red_runs <- (tidytable::distinct(df0, hash_key) |> nrow()) - n_runs
# store (step 4)
n_cases ->> m4_cases
n_runs ->> m4_runs
red_cases ->> m4_cases_red
red_runs ->> m4_runs_red
list(cases=n_cases, runs=n_runs, cases_red=red_cases, runs_red=red_runs) ->> m4
message("4. FINAL: leaving out patients currently in tr., cases: ",
formatC(n_cases, big.mark=","), sep = "")
message("4. FINAL: RUNs: ", formatC(n_runs, big.mark=","), sep = "")
message(" CASES reduction vs initial(prev1y): ", formatC(red_cases, big.mark=","), sep = "")
message(" RUNs reduction vs initial(prev1y): ", formatC(red_runs, big.mark=","), sep = "")
df
})()Code
#SISTRAT23_c1_2010_2024_df_prev1z |> filter(is.na(dit_m)) |> janitor::tabyl(tr_compliance_rec7, show_na = T)
SISTRAT23_c1_2010_2024_df_prev1z$porc_pobr_c_cat6<- cut2(SISTRAT23_c1_2010_2024_df_prev1z$porc_pobr_c,g=6)
cat("Remove intermediate databases\n")
rm(df0); rm(df1); rm(df_q); rm(anchors); rm(death_tbl); rm(first_keep_tbl)
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
# --- choose your time origin for the multistate ---
.use_discharge_origin <- FALSE # TRUE = origin at first_disch; FALSE = origin at first_adm
# Add origin_date
if (.use_discharge_origin) {
perperson <- tidytable::mutate(SISTRAT23_c1_2010_2024_df_prev1z, origin_date = first_disch)
} else {
perperson <- tidytable::mutate(SISTRAT23_c1_2010_2024_df_prev1z, origin_date = first_adm)
}
# 1) Drop patients who die before the origin (they never enter the process)
n_drop_predeath <- tidytable::filter(perperson, !is.na(death_date) & !is.na(origin_date) & death_date < origin_date) |>
nrow()
message("A. Excluding patients who died before origin: ",
formatC(n_drop_predeath, big.mark=","), sep = "")Code
stopifnot(
nrow(tidytable::filter(perperson, is.na(death_date) | is.na(origin_date) | death_date >= origin_date))== nrow(SISTRAT23_c1_2010_2024_df_prev1z)
)
# 2) Keep only those whose origin is observed and within study window (optional but recommended)
n_before <- nrow(SISTRAT23_c1_2010_2024_df_prev1z)
perperson2 <- tidytable::filter(perperson, !is.na(origin_date) & origin_date <= .censor_date)
message("B. Excluding missing/after-censor origins: ",
formatC(n_before - nrow(perperson2), big.mark=","), sep = "")Code
# 3) Ensure monotone times: second_adm must be after origin (else reclassify as 'no readmission')
# perperson <- tidytable::mutate(perperson,
# invalid_readmit = !is.na(second_adm) & second_adm <= origin_date,
# second_adm = ifelse(invalid_readmit, as.Date(NA), second_adm),
# readmit_event = ifelse(invalid_readmit, 0L, readmit_event)
# )
rm(perperson); rm(perperson2)Remove intermediate databases
1.1.* Flowchart
Code
#For SISTRAT23_c1_2010_2024_df_prev1t
try(load(paste0(dirname(path),"/SISTRAT_database_sep25_prevt.RData")))
#n_patients_meet;n1b_cases; n1b_patients; m4; m3
df23_ndp_20250824_SISTRAT23_c1_1024<-
readRDS(
paste0(dirname(path),"/psu/23_ndp_20250824_SISTRAT23_c1_1024_df2.rds")
)
# Labels
lab_orig <- paste0(
"Original C1 Dataset \n(n = ",
formatC(nrow(df23_ndp_20250824_SISTRAT23_c1_1024), format = "f", big.mark = ",", digits = 0),
"; Patients = ",
formatC(dplyr::n_distinct(df23_ndp_20250824_SISTRAT23_c1_1024$hash_key), format = "f", big.mark = ",", digits = 0),
")"
)
lab_orig_clean <- paste(
"Pre-processing & Quality Checks",
"• Remove exact duplicate records",
"• Resolve overlapping episodes (keep longest / merge)",
"• Fix inconsistent admission/discharge dates",
"• Correct implausible birth dates/ages",
"• Remove negative/implausible days in treatment",
"",
sep = "\\l"
)
lab_post_orig <- paste0('C1 Dataset \n(n = ', formatC(nrow(SISTRAT23_c1_2010_2024_df_prev1t), format='f', big.mark=',', digits=0), '; Patients = ',formatC(dplyr::distinct(SISTRAT23_c1_2010_2024_df_prev1t, hash_key)|> nrow(), format='f', big.mark=',', digits=0),')')
lab_post_orig_clean <- paste("• Collapse consecutive linked", "treatments (≤45d gap, referral=yes)", sep = "\\l")
lab_proc <- paste0('C1 Cleaned & Collapsed\n(n = ', formatC(nrow(SISTRAT23_c1_2010_2024_df_prev1y), format='f', big.mark=',', digits=0), '; Patients = ',formatC(dplyr::distinct(SISTRAT23_c1_2010_2024_df_prev1y, hash_key)|> nrow(), format='f', big.mark=',', digits=0),')')
lab_flag <- "Sorted by Admission Date\n+ First Treatment Flag"
lab_discard_first <- paste(
"• Qualifying flag (age 18–64 & admission in 2010–2020 inclusive)",
"• Dropping ≥3rd qualifying treatments per patient",
paste0("n= ", formatC(nrow(SISTRAT23_c1_2010_2024_df_prev1y)-n1b_cases, format = "f", digits = 0, big.mark = ","),"; Patients= ", formatC(dplyr::distinct(SISTRAT23_c1_2010_2024_df_prev1t, hash_key)|> nrow()- n1b_patients, format = "f", digits = 0, big.mark = ",")," put aside"),
"",sep = "\\l")
##CAMBIAR
lab_after <- paste0("After First-Treatment Rule\nn= ",formatC(m3$readmit+m3$cases, format= "f", big.mark=",", digits=0),"; Patients= ", formatC(m3$cases, format= "f", big.mark=",", digits=0)) # example counts
#Frpm 3. FINAL: first qualifying treatment per patient (with time-to-event), cases:
lab_discard_single <- paste(
"1st qualifying treatment per patient (with time-to-event)",
paste0("Patients = ", formatC(m3$cases-nrow(SISTRAT23_c1_2010_2024_df_prev1z), format = "f", big.mark = ",", digits = 0)), sep="\\l")
lab_final <- paste0("Final C1 Dataset\nn = ",
formatC(nrow(SISTRAT23_c1_2010_2024_df_prev1z)+sum(SISTRAT23_c1_2010_2024_df_prev1z$readmit_event), format = "f", digits = 0, big.mark = ","),
"; Patients = ",
formatC(nrow(SISTRAT23_c1_2010_2024_df_prev1z), format = "f", digits = 0, big.mark = ","))
# Diagram
gr <- DiagrammeR::grViz(
paste0(
'digraph flowchart {
graph [layout = dot, rankdir = TB]
node [fontname = "Times", shape = rectangle, fontsize = 33, style = filled, fillcolor = white, ranksep=0.2, nodesep=0.2]
# New initial box and pre-clean note
pre_original [label = "', lab_orig, '", fillcolor = lightgray, shape = box]
pre_clean [label = "', lab_orig_clean, '", shape = note, fillcolor = white]
# Intermediate new initial box and pre-clean note
pre2_original [label = "', lab_post_orig, '", fillcolor = white, shape = box]
pre2_clean [label = "', lab_post_orig_clean, '", shape = note, fillcolor = lemonchiffon]
# Existing boxes
original [label = "', lab_proc, '", fillcolor = lightgray, shape = box]
marked [label = "', lab_flag, '", shape = box]
after_rule [label = "', lab_after, '", shape = box]
final_dataset [label = "', lab_final, '", fillcolor = lightgray, shape = box]
discard_first [label = "', lab_discard_first, '", shape = note, fillcolor = mistyrose]
discard_single [label = "', lab_discard_single, '", shape = note, fillcolor = mistyrose]
# Invisible points for alignment
v00 [shape = point, width = 0, style = invis] # between orig and post orig
v0 [shape = point, width = 0, style = invis] # between pre_original and original
v1 [shape = point, width = 0, style = invis]
v2 [shape = point, width = 0, style = invis]
v3 [shape = point, width = 0, style = invis]
# Main flow (add pre_original -> v0 -> original in front)
pre_original -> v00 [arrowhead = none]
v00 -> pre2_original
pre2_original -> v0 [arrowhead = none]
v0 -> original
original -> v1 [arrowhead = none]
v1 -> marked
marked -> v2 [arrowhead = none]
v2 -> after_rule
after_rule -> v3 [arrowhead = none]
v3 -> final_dataset
# Discard / note connections with solid black arrows
v00 -> pre_clean [color = black] # new pre-processing note
v0 -> pre2_clean [color = black] # new pre-processing note
v2 -> discard_first [color = black]
v3 -> discard_single[color = black]
# Alignment
{ rank = same; pre_clean; v00 }
{ rank = same; pre2_clean; v0 }
{ rank = same; discard_first; v2 }
{ rank = same; discard_single; v3 }
}'
),
width = 600, height = 900
)
grFigure 3. Flowchart
Exported the graph into png fortmat.
Code
unlink(paste0(gsub("/cons","",getwd()),"/cons/_figs/_flowchart_pred_files"), recursive = TRUE)
htmlwidgets::saveWidget(gr, paste0(gsub("/cons","",getwd()),"/cons/_figs/_flowchart_pred.html"))
webshot::webshot(paste0(gsub("/cons","",getwd()),"/cons/_figs/_flowchart_pred.html"),
paste0(gsub("/cons","",getwd()),"/cons/_figs/_flowchart_pred.png"),
vwidth = 300, vheight = 300*1.5, zoom=10, expand=100) # Prueba con diferentes coordenadas top, left, width, and height1.2. Death decomposition
We decomposed the cumulative incidence of death following an initial treatment episode by its temporal relationship to the first SUD treatment readmission.
Cumulative incidence functions (CIFs) were estimated using the Aalen–Johansen method, and total mortality was also calculated via Kaplan–Meier to validate consistency. The resulting decomposition shows how overall mortality is partitioned over time relative to readmission, visualized as stacked cumulative probabilities.
Code
# ──────────────────────────────────────────────────────────────────────────────
# 0) Base y nombres
df2 <- SISTRAT23_c1_2010_2024_df_prev1z |>
dplyr::rename(
t_r = readmit_time_from_disch_m,
e_r = readmit_event,
t_d = death_time_from_disch_m,
e_d = death_event
) |>
# Sanitiza tipos
dplyr::mutate(
e_r = as.integer(e_r),
e_d = as.integer(e_d)
)
# Chequeo rápido (opcional)
stopifnot(nrow(df2) > 0)
# t_d debe existir para todos (muerte o fin de seguimiento)
stopifnot(all(is.finite(df2$t_d)))
# ──────────────────────────────────────────────────────────────────────────────
# 1) Definir causas de MUERTE (mutuamente excluyentes) y tiempo de análisis
# 1 = muere antes/igual que la 1ª readmisión
# 2 = muere después de la 1ª readmisión
# 0 = censurado (no muere)
df3 <- df2 |>
dplyr::mutate(
cause = dplyr::case_when(
e_d == 1L & (e_r == 0L | t_d <= t_r) ~ 1L, # muerte antes/igual que readmisión
e_d == 1L & e_r == 1L & t_r < t_d ~ 2L, # readmisión primero, luego muerte
TRUE ~ 0L # no muere (censura)
),
# Tiempo de análisis:
# - muertes: t_d (tiempo a muerte)
# - censura: t_d (tiempo a última observación)
ftime2 = t_d
)
# Consistencia con tu 2x2
# with(df3, table(cause, e_d))
# Esperado: cause=1 -> 3203 muertes; cause=2 -> 744 muertes
# ──────────────────────────────────────────────────────────────────────────────
# 2) CIF de muerte por causa (competing risks)
ci2 <- cmprsk::cuminc(ftime = df3$ftime2, fstatus = df3$cause, cencode = 0)
# Helper robusto: toma el ÚLTIMO número del nombre (la causa), ignora "Tests"
get_cif_by_code <- function(ci, code = 1L){
keep <- vapply(ci, function(x) !is.null(x$est), logical(1))
nm <- names(ci)[keep]
ci <- ci[keep]
codes <- suppressWarnings(as.integer(sub(".*[^0-9]([0-9]+)$", "\\1", nm)))
idx <- which(!is.na(codes) & codes == code)[1]
if (!length(idx) || is.na(idx)) return(data.frame(time = numeric(0), est = numeric(0)))
data.frame(time = ci[[idx]]$time, est = ci[[idx]]$est)
}
cif_d_before <- get_cif_by_code(ci2, 1L) # muerte antes/igual que readmisión
cif_d_after <- get_cif_by_code(ci2, 2L) # muerte después de readmisión
# ──────────────────────────────────────────────────────────────────────────────
# 3) KM de muerte total (chequeo externo)
km <- survival::survfit(survival::Surv(time = df3$ftime2, event = df3$e_d) ~ 1)
km_df <- data.frame(time = km$time, P_dead_total = 1 - km$surv)
# ──────────────────────────────────────────────────────────────────────────────
# 4) Comparación a horizontes fijos (12, 60, 120 y tiempo final observado)
t_end <- max(df3$ftime2, na.rm = TRUE)
taus <- c(12, 60, 120, t_end)
risk_at <- function(fit, tau) 1 - summary(fit, times = tau, extend = TRUE)$surv
AJ_at <- function(cif_df, tau){
if (!nrow(cif_df)) return(rep(0, length(tau)))
# Aproximación por función escalón (respeta naturaleza de AJ)
sf <- stepfun(cif_df$time, c(0, cif_df$est))
sf(tau)
}
horiz <- data.frame(
tau_m = taus,
KM_death = sapply(taus, risk_at, fit = km),
CIF_before = AJ_at(cif_d_before, taus),
CIF_after = AJ_at(cif_d_after, taus)
) |>
dplyr::mutate(total_CIF = CIF_before + CIF_after)
# print(horiz)
# Esperado: KM_death ≈ total_CIF en cada tau (pequeñas diferencias numéricas)
# ──────────────────────────────────────────────────────────────────────────────
# 5) Serie completa para gráfico (con stepfun; evita errores de interpolación)
sf_km <- stepfun(km_df$time, c(0, km_df$P_dead_total))
sf_c1 <- stepfun(cif_d_before$time, c(0, cif_d_before$est))
sf_c2 <- stepfun(cif_d_after$time, c(0, cif_d_after$est))
t_grid <- sort(unique(c(km_df$time, cif_d_before$time, cif_d_after$time)))
out <- data.frame(
time = t_grid,
CIF_death_before = sf_c1(t_grid),
CIF_death_after = sf_c2(t_grid),
P_dead_total = sf_km(t_grid)
)
out$sum_cifs <- out$CIF_death_before + out$CIF_death_after
# Chequeo final (debe acercarse al KM final ~ 9%)
# tail(out[, c("time","CIF_death_before","CIF_death_after","sum_cifs","P_dead_total")], 3)
# ──────────────────────────────────────────────────────────────────────────────
# 6) Gráfico: descomposición correcta de la mortalidad total
ggplot(out, aes(x = time)) +
geom_ribbon(aes(ymin = 0, ymax = CIF_death_before,
fill = "Muerte antes de readmisión"), alpha = 0.55) +
geom_ribbon(aes(ymin = CIF_death_before, ymax = sum_cifs,
fill = "Muerte después de readmisión"), alpha = 0.55) +
geom_step(aes(y = P_dead_total, color = "Mortalidad total (KM)"),
linewidth = 0.9) +
scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
scale_fill_manual(values = c("Muerte antes de readmisión" = "#A17C6C",
"Muerte después de readmisión" = "#5FA9B3")) +
scale_color_manual(values = c("Mortalidad total (KM)" = "#660600")) +
labs(x = "Meses desde el alta", y = "Probabilidad acumulada",
fill = NULL, color = NULL,
title = "Mortalidad acumulada: descomposición por relación con readmisión",
subtitle = "CIF de muerte antes vs. después de la primera readmisión; línea = KM total") +
theme_minimal(base_size = 16) +
theme(legend.position = "bottom")Code
# ──────────────────────────────────────────────────────────────────────────────
# CIF de "primer evento" (1=readmisión, 2=muerte), con tiempos al primer evento
ftime_first <- pmin(df2$t_r, df2$t_d, na.rm = TRUE)
fstatus_first <- dplyr::case_when(
df2$e_r == 1L & (df2$e_d == 0L | df2$t_r < df2$t_d) ~ 1L, # readmisión primero
df2$e_d == 1L & (df2$e_r == 0L | df2$t_d <= df2$t_r) ~ 2L, # muerte primero
TRUE ~ 0L
)
ci_first <- cmprsk::cuminc(ftime = ftime_first, fstatus = fstatus_first, cencode = 0)
# readmisión como primer evento a 12m:
tp_first <- cmprsk::timepoints(ci_first, times = 12)
tp_first$est[,"12"]
# ──────────────────────────────────────────────────────────────────────────────
# Graficar
p <- out|>
ggplot2::ggplot(ggplot2::aes(x = time)) +
ggplot2::geom_ribbon(ggplot2::aes(ymin = 0, ymax = CIF_death_before,
fill = "Death before readmission"), alpha = 0.5) +
ggplot2::geom_ribbon(ggplot2::aes(ymin = CIF_death_before, ymax = P_dead_total,
fill = "Death after readmission"), alpha = 0.5) +
ggplot2::geom_step(ggplot2::aes(y = P_dead_total, color = "Total death (KM)"),
linewidth = 0.7) +
ggplot2::geom_step(ggplot2::aes(y = CIF_death_before, color = "CIF of death before readmission"),
linewidth = 0.7) +
ggplot2::scale_y_continuous(labels = scales::percent_format(accuracy = 1),
limits = c(0, max(out[,"sum_cifs"]))) +
ggplot2::scale_fill_manual(values = c("Death before readmission"= "#a6cee3",
"Death after readmission"= "#b2df8a")) +
ggplot2::scale_color_manual(values = c("Total death (KM)"= "#1f78b4", "Death before readmission"= "#666666")) +
ggplot2::labs(x = "Months since discharge", y = "Cumulative probability",
fill = NULL, color = NULL) +
ggplot2::theme_minimal(base_size = 20) +
ggplot2::theme(legend.position = "bottom")
print(p)Code
ggsave(
filename = paste0(gsub("/cons","",getwd()), "/cons/_figs/_death_decomposition.png"),
plot = p +
labs(x = "Meses desde el egreso", y = "Probabilidad acumulada",
fill = NULL, color = NULL)+
scale_fill_manual(
values = c(
"Death before readmission" = "#a6cee3",
"Death after readmission" = "#B48448"
),
labels = c(
"Death before readmission" = "Muerte antes de la readmisión",
"Death after readmission" = "Muerte después de la readmisión"
)
) +
scale_color_manual(
values = c(
"Total death (KM)" = "#5A0D13",
"CIF of death before readmission" = "#666666"
),
labels = c(
"Total death (KM)" = "Muerte total (KM)",
"CIF of death before readmission" = "CIF de muerte antes de la readmisión"
)
)+ ggplot2::theme(
legend.position = "bottom",
legend.box = "horizontal",
legend.text = ggplot2::element_text(size=16, margin=margin(l=0, r=-3, unit="pt")),
legend.title = ggplot2::element_blank(),
legend.spacing.x = grid::unit(0.2, "cm"),
legend.spacing.y = grid::unit(0.2, "cm"),
legend.key.width = grid::unit(1.0, "cm")
) +
ggplot2::guides(
fill = ggplot2::guide_legend(nrow = 2, byrow = TRUE, keywidth = unit(0.5, "cm")),
color = ggplot2::guide_legend(nrow = 2, byrow = TRUE, keywidth = unit(0.5, "cm"))
)+ ggplot2::scale_x_continuous(
breaks = scales::breaks_width(12),
labels = function(m) paste0(m, "m") # o en años: paste0(m/12, " años")
),
dpi = 600
)A total of 1,126 patients (1.3%) had valid treatments between 18-64 in their second or third treatment.
1.3. Rule-Based Aggregation into ICD-10–Informed groups
To reduce sparsity and improve interpretability, we grouped specific diagnostic labels into four clinically coherent supergroups aligned with ICD-10 logic: (1) Infectious diseases (STIs, viral hepatitis, and infections related to substance use); (2) Organ-system medical diseases (cardiovascular, hepatic, oral, and hematologic conditions); (3) Injuries and sequelae; and (4) Other specified medical conditions (broad/vague descriptors such as “enfermedades somáticas”, “otras enfermedades…”, and pregnancy-related items). We preserved the status values “en estudio” and “sin trastorno” as-is, and left missing values as NA. For multi-response entries (diagnostico_trs_fisico_series), we parsed “||”-separated items, applied the same mapping to each token, deduplicated within-row, and produced both a collapsed supergroup string and one-hot indicators for the supergroups to avoid inflating counts while preventing sparse categories.
Code
icd10_broad <- function(x) {
x <- tolower(x)
tidytable::case_when(
stringr::str_detect(x, "organic|organico|demenc|alzheimer|parkinson|delirium|cerebral") ~ "F0 Organic",
stringr::str_detect(x, "psicoactiva|alcohol|marihuana|canabis|cannabis|cocain|opio|opiace|benzodiazep|sustancias") ~ "F1 Substance use",
stringr::str_detect(x, "esquizofren|psicotip|delirant|psicosis") ~ "F2 Psychotic",
stringr::str_detect(x, "estado de animo|afectiv|depres|bipolar|maniaco|distimia|hipoman") ~ "F3 Mood",
stringr::str_detect(x, "neurotic|ansiedad|fobi|panico|obsesivo|compulsiv|estres|adaptaci|somatoform|somatiz") ~ "F4 Anxiety/Stress/Somatoform",
stringr::str_detect(x, "comportamiento.*fisiolog|alimentari|anorex|bulim|sueñ|insomni|disfuncion sexual") ~ "F5 Physio/Eating/Sleep/Sexual",
stringr::str_detect(x, stringr::regex("personalidad|comportamiento del adulto|antisocial|limite|evitativ|habit|habitos|impuls|control de los impulsos|control\\s+de\\s+impulsos", ignore_case = TRUE)) ~ "F6 Personality/Adult behaviour",
stringr::str_detect(x, "retraso mental|discapacidad intelectual|intelectual") ~ "F7 Intellectual disability",
stringr::str_detect(x, "desarrollo psicolog|autism|asperger|lenguaje|aprendizaje|espectro autista|tdah|t\\s*d\\s*a\\s*h") ~ "F8/9 Neurodevelopment/Child",
TRUE ~ "Other/unspecified"
)
}
cat("#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:\n")
cat("Create dummied for psychiatric comorbidity\n")
cat("#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:\n")
cat("Living arrangement/Cohabitation status\n")
cat("#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:\n")
cat("Domestic violence & Sex abuse\n")
cat("#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:\n")
cat("Primary subs\n")
cat("#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:\n")
cat("Tr outcome\n")
cat("#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:\n")
# Create multi-label dummies using tidytable
SISTRAT23_c1_2010_2024_df_prev1z <-
SISTRAT23_c1_2010_2024_df_prev1z|>
tidytable::mutate(dx_norm = normalize_txt(mod_psiq_cie_10))|>
tidytable::mutate(dit_m = readmit_time_from_adm_m-readmit_time_from_disch_m)|>
# avoid NAs propagating into str_detect
tidytable::mutate(dx_norm_nna = ifelse(is.na(dx_norm), "", dx_norm))|>
tidytable::mutate(
f0_organic = stringr::str_detect(dx_norm_nna, "mentales organicos"),
f1_substance_use = stringr::str_detect(dx_norm_nna, "sustancias psicoactivas|consumo de sustancias|psicoactiva"),
f2_psychotic = stringr::str_detect(dx_norm_nna, "esquizofren|esquizotip|ideas? delirant|psicosis"),
f3_mood = stringr::str_detect(dx_norm_nna, "humor \\(afectivos\\)"),
f4_anxiety_stress_somatoform = stringr::str_detect(dx_norm_nna, "neuroticos|estresantes|somatomorfos"),
f5_physio_eating_sleep_sexual = stringr::str_detect(dx_norm_nna, "conducta alimentaria|disfunciones fisiologicas|factores somaticos"),
f6_personality_adult_behaviour = stringr::str_detect(dx_norm_nna, "personalidad y del comportamiento del adulto|habitos y del control de los impulsos|transformacion persistente de la personalidad"),
f7_intellectual_disability = stringr::str_detect(dx_norm_nna, "retraso mental|discapacidad intelectual"),
f8_9_neurodevelopment_child = stringr::str_detect(dx_norm_nna, "desarrollo psicolog|comienzo habitual en la infancia y adolescencia")
)|>
tidytable::mutate(marital_status_rec= tidytable::case_when(marital_status== "separated/divorced/annulled"~ "separated/divorced/annulled/widowed", marital_status=="widowed"~"separated/divorced/annulled/widowed", T~ marital_status))|>
tidytable::mutate(
cqv_norm=con_quien_vive|>stringi::stri_trans_general("Latin-ASCII")|>
stringr::str_to_lower()|>stringr::str_squish())|>
tidytable::mutate(
# 5-level (more granular)
con_quien_vive_joel5=dplyr::case_when(
# Alone
stringr::str_detect(cqv_norm,"^solo$")~"Alone",
# Couple/children only (no family-of-origin)
stringr::str_detect(cqv_norm,"^unicamente con (la )?pareja( e hijos)?$")|
stringr::str_detect(cqv_norm,"^unicamente con pareja$")|
stringr::str_detect(cqv_norm,"^unicamente con hijos$")~
"Couple/children only",
# Couple/children + family-of-origin (extended household)
stringr::str_detect(cqv_norm,"(pareja|hijos).*(padres|familia de origen)")|
stringr::str_detect(cqv_norm,"^con la pareja,? hijos y padres")~
"Couple/children + origin",
# Family-of-origin only
stringr::str_detect(cqv_norm,"^unicamente con padres|^con padres|familia de origen$")|
stringr::str_detect(cqv_norm,"^con la madre \\(sola\\)$")|
stringr::str_detect(cqv_norm,"^con abuelos$")|
stringr::str_detect(cqv_norm,"^con hermanos$")~
"Family of origin",
# Others (friends / other non-relatives / other relative)
stringr::str_detect(cqv_norm,"con amigos|otro no pariente|otro pariente|^otros$")~
"Others",
TRUE~"Others"))|>
tidytable::mutate(
# 4-level (collapsed)
cohabitation=dplyr::case_when(
con_quien_vive_joel5=="Alone"~"alone",
con_quien_vive_joel5 %in% c("Couple/children only","Couple/children + origin")~"with couple/children",
con_quien_vive_joel5=="Family of origin"~"family of origin",
TRUE~"Others"
))|>
tidytable::mutate(
tenure_status_household_rec= tidytable::case_when(grepl("illegal|others",tenure_status_household)~"illegal settlement & others", T~tenure_status_household)
)|>
tidytable::mutate(
cohabitation=forcats::fct_relevel(
factor(cohabitation),
c("alone","with couple/children","family of origin","others")
))|>
tidytable::select(-tidytable::any_of(c("cqv_norm", "con_quien_vive_joel5", "dx_norm_nna")))|>
tidytable::mutate(dom_violence=factor(tidytable::case_when(
grepl("Violencia Intrafamiliar$",otros_probl_at_sm_or, ignore.case=T)~1,
is.na(otros_probl_at_sm_or)~NA_real_,
T~0),levels=c(0,1),labels=c("No domestic violence","Domestic violence")))|>
tidytable::mutate(sex_abuse= factor(tidytable::case_when(
grepl("Abuso Sexual",otros_probl_at_sm_or, ignore.case=T)~1,
is.na(otros_probl_at_sm_or)~NA_real_,
T~0),levels=c(0,1),labels=c("No sexual abuse","Sexual abuse")))|>
tidytable::mutate(
primary_sub_std = stringr::str_to_lower(stringr::str_squish(primary_sub)),
primary_sub_mod = tidytable::case_when(
stringr::str_detect(primary_sub_std, "cocaine\\s*(base\\s*)?paste|pasta\\s*base|\\bpaco\\b") ~ "cocaine paste",
stringr::str_detect(primary_sub_std, "cocaine\\s*(powder|hydrochloride|hcl)") |
stringr::str_detect(primary_sub_std, "clorhidrato\\s*de\\s*coca|clorhidrato|hidrocloruro") ~ "cocaine powder",
stringr::str_detect(primary_sub_std, "\\balcohol\\b") ~ "alcohol",
stringr::str_detect(primary_sub_std, "marij|cannab|marihu") ~ "marijuana",
TRUE ~ "others"
))|>
tidytable::mutate(primary_sub_mod = factor(primary_sub_mod,
levels = c("cocaine paste","cocaine powder","alcohol","marijuana","others")))|>
tidytable::select(-tidytable::any_of(c("primary_sub_std", "dx_norm")))|>
tidytable::mutate(urbanicity_cat= tidytable::case_when(grepl("Mix",clasificacion)~ "2.Mixed", grepl("Urb",clasificacion)~ "3.Urban", grepl("Rur",clasificacion)~ "1.Rural", T~NA_character_))|>
tidytable::mutate(ethnicity= ifelse(!is.na(ethnicity_c1_c6_historic),1,0))|>
tidytable::mutate(
tr_outcome = tidytable::case_when(
tr_compliance_rec7 == "currently in" & is.na(adm_disch_reason) ~ NA_character_,
stringr::str_detect(coalesce(tr_compliance_rec7, ""), "dropout") & is.na(adm_disch_reason) ~ "dropout",
# explicit non-discharge statuses with NA reason
tr_compliance_rec7 == "referral" & is.na(adm_disch_reason) ~ "referral",
tr_compliance_rec7 == "completion" & is.na(adm_disch_reason) ~ "completion",
# discharge branches (note the parentheses)
stringr::str_detect(coalesce(tr_compliance_rec7, ""), "discharge") &
adm_disch_reason %in% c("agreement_end", "death", "no_local_service") ~ "adm discharge - adm reasons",
stringr::str_detect(coalesce(tr_compliance_rec7, ""), "discharge") &
(adm_disch_reason == "rule_violation" | is.na(adm_disch_reason)) ~ "adm discharge - rule violation/undet",
TRUE ~ "other"
))Warning: 1 unknown level in f: others
Code
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
cat("#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:\n")
cat("Recode Type of housing, Admission age, Treatment days, Tr. compliance; Nationallity (Chile); \n")
cat("#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:\n")
SISTRAT23_c1_2010_2024_df_prev1z <- within(SISTRAT23_c1_2010_2024_df_prev1z, {
tipo_de_vivienda_rec <- factor(tidytable::case_when(
tipo_de_vivienda %in% c("casa", "departamento") ~ "formal housing",
tipo_de_vivienda == "pieza dentro de la vivienda" ~ "shared/secondary unit",
tipo_de_vivienda%in% c("caleta o punto de calle", "mediagua movil (carpa, casa rodante o similar)", "hospederia", "residencial, pension, hostal", "choza, rancho, ruca", "informal/rural housing") ~ "homeless/unsheltered/informal/temporary housing/institutional/collective",
is.na(tipo_de_vivienda)~ NA_character_,
TRUE ~ "other/unknown" # catches "otro" and NA
),
levels = c(
"formal housing",
"shared/secondary unit",
"homeless/unsheltered/informal/temporary housing/institutional/collective",
"other/unknown"
))
})
SISTRAT23_c1_2010_2024_df_prev1z <- within(SISTRAT23_c1_2010_2024_df_prev1z, {
prim_sub_freq_rec <- factor(tidytable::case_when(
prim_sub_freq %in% c("1. Less than 1 day a week", "2. 1 day a week")~ "1.≤1 day/wk",
prim_sub_freq %in% c("3. 2 to 3 days a week", "4. 4 to 6 days a week")~ "2.2–6 days/wk",
is.na(prim_sub_freq)~ NA_character_,
TRUE ~ "3.Daily" # catches "otro" and NA
),
levels = c(
"1.≤1 day/wk",
"2.2–6 days/wk",
"3.Daily"
))
})
SISTRAT23_c1_2010_2024_df_prev1z <- within(SISTRAT23_c1_2010_2024_df_prev1z, {
tipo_de_vivienda_rec2 <- factor(tidytable::case_when(
tipo_de_vivienda %in% c("casa", "departamento") ~ "formal housing",
# tipo_de_vivienda == "pieza dentro de la vivienda" ~ "shared/secondary unit",
# tipo_de_vivienda%in% c("caleta o punto de calle", "mediagua movil (carpa, casa rodante o similar)", "hospederia", "residencial, pension, hostal", "choza, rancho, ruca", "informal/rural housing") ~ "homeless/unsheltered/informal/temporary housing/institutional/collective",
is.na(tipo_de_vivienda)~ NA_character_,
TRUE ~ "other/unknown" # catches "otro" and NA
),
levels = c(
"formal housing",
# "shared/secondary unit",
# "homeless/unsheltered/informal/temporary housing/institutional/collective",
"other/unknown"
))
})
#Admission age
SISTRAT23_c1_2010_2024_df_prev1z$adm_age_log <- with(SISTRAT23_c1_2010_2024_df_prev1z,
ifelse(is.na(adm_age_rec3), NA_real_, log1p(adm_age_rec3)))
SISTRAT23_c1_2010_2024_df_prev1z$adm_age_c <- with(SISTRAT23_c1_2010_2024_df_prev1z,
ifelse(is.na(adm_age_rec3), NA_real_, scale(adm_age_rec3, center = TRUE, scale = FALSE)))
#Warning in log1p(adm_age_rec3) : Se han producido NaNs
SISTRAT23_c1_2010_2024_df_prev1z$adm_age_pow2 <- with(SISTRAT23_c1_2010_2024_df_prev1z,
ifelse(is.na(adm_age_rec3), NA_real_, adm_age_rec3^2))
SISTRAT23_c1_2010_2024_df_prev1z$adm_age_pow3 <- with(SISTRAT23_c1_2010_2024_df_prev1z,
ifelse(is.na(adm_age_rec3), NA_real_, adm_age_rec3^3))
# Admission Age categories
SISTRAT23_c1_2010_2024_df_prev1z$adm_age_cat <- with(
SISTRAT23_c1_2010_2024_df_prev1z,
{
res <- rep(NA_character_, length(adm_age_rec3))
res[adm_age_rec3 >= 18 & adm_age_rec3 < 30] <- "18-29"
res[adm_age_rec3 >= 30 & adm_age_rec3 < 45] <- "30-44"
res[adm_age_rec3 >= 45 & adm_age_rec3 < 65] <- "45-64"
res
}
)
SISTRAT23_c1_2010_2024_df_prev1z$adm_age_cat <- factor(
SISTRAT23_c1_2010_2024_df_prev1z$adm_age_cat,
levels = c("18-29", "30-44", "45-64")
)
SISTRAT23_c1_2010_2024_df_prev1z$adm_age_c <- scale(SISTRAT23_c1_2010_2024_df_prev1z$adm_age_rec3, center = TRUE, scale = FALSE)
invisible("Created the 90 days translated in months by subsetting and taking the average value")
dit90<-
mean(SISTRAT23_c1_2010_2024_df_prev1z$dit_m[which(SISTRAT23_c1_2010_2024_df_prev1z$dit_rec6==90)])
# Treatment duration transformations
SISTRAT23_c1_2010_2024_df_prev1z$treat_log <- with(SISTRAT23_c1_2010_2024_df_prev1z,
ifelse(is.na(dit_m), NA_real_, log1p(dit_m)))
#Warning in log1p(dit_m) : Se han producido NaNs
SISTRAT23_c1_2010_2024_df_prev1z$treat_days_pow2 <- with(SISTRAT23_c1_2010_2024_df_prev1z,
ifelse(is.na(dit_m), NA_real_, dit_m^2))
SISTRAT23_c1_2010_2024_df_prev1z$treat_days_pow3 <- with(SISTRAT23_c1_2010_2024_df_prev1z,
ifelse(is.na(dit_m), NA_real_, dit_m^3))
SISTRAT23_c1_2010_2024_df_prev1z$treat_lt_90 <- with(SISTRAT23_c1_2010_2024_df_prev1z,
ifelse(is.na(dit_m), NA_integer_, as.integer(dit_m < dit90)))
SISTRAT23_c1_2010_2024_df_prev1z <- within(SISTRAT23_c1_2010_2024_df_prev1z, {
sex_rec <- dplyr::case_when(
sex_rec == "mujer" ~ "woman",
sex_rec == "hombre" ~ "man",
T~ NA_character_
)
sex_rec <- factor(sex_rec, levels = c(
"man",
"woman"
))
})
SISTRAT23_c1_2010_2024_df_prev1z <- within(SISTRAT23_c1_2010_2024_df_prev1z, {
nationality_chile <- dplyr::case_when(
nationality_cons== "chile" ~ "chile",
!nationality_cons== "chile" ~ "other",
T~ NA_character_
)
nationality_chile <- factor(nationality_chile, levels = c(
"chile",
"other"
))
})
SISTRAT23_c1_2010_2024_df_prev1z <- within(SISTRAT23_c1_2010_2024_df_prev1z, {
any_violence <- dplyr::case_when(
sex_abuse== "Sexual abuse"| dom_violence=="Domestic violence"~ "1.Domestic violence/sex abuse",
grepl("No", sex_abuse) & grepl("No", dom_violence)~ "0.No domestic violence/sex abuse",
grepl("No", sex_abuse) & is.na(dom_violence)~ "0.No domestic violence/sex abuse",
is.na(sex_abuse) & grepl("No", dom_violence)~ "0.No domestic violence/sex abuse",
T~ NA_character_
)
any_violence <- factor(any_violence, levels = c(
"1.Domestic violence/sex abuse",
"0.No domestic violence/sex abuse"
))
})
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# 1) Define supergroup sets and a vectorized classifier that keeps NAs as NA
infectious_set <- c(
"ets",
"hepatitis b, c, d",
"infecciosas relacionadas con uso de sustancias"
)
organ_set <- c(
"cardiopatias: miocardiopatia dilatada por oh, arritmias, hta",
"hepatitis alcoholica subaguda",
"hepatitis cronica",
"patologia bucal",
"anemia: megaloblastica y ferropenica"
)
injury_set <- c("traumatismos y secuelas secundarios")
other_spec_set <- c(
"otras enfermedades o condiciones fisicas limitantes",
"otras enfermedades o condiciones de riesgo vital",
"enfermedades somaticas",
"patologia de la gestion y del nino intrauterino"
)
supergroups_levels <- c(
"en estudio",
"sin trastorno",
"Infectious diseases (STIs, viral hepatitis, substance-use–related)",
"Organ-system medical diseases (cardio, hepatic, oral, hematologic)",
"Injuries and sequelae",
"Other specified medical conditions"
)
classify_dx_vec <- function(x) {
x <- stringr::str_squish(x)
out <- rep(NA_character_, length(x))
# statuses (kept as-is)
out[x == "en estudio"] <- "en estudio"
out[x == "sin trastorno"] <- "sin trastorno"
# infectious
out[x %in% infectious_set] <- "Infectious diseases (STIs, viral hepatitis, substance-use–related)"
# organ/system
out[x %in% organ_set] <- "Organ-system medical diseases (cardio, hepatic, oral, hematologic)"
# injuries
out[x %in% injury_set] <- "Injuries and sequelae"
# other specified (broad/vague)
out[x %in% other_spec_set] <- "Other specified medical conditions"
# funnel any unforeseen non-NA labels to "Other specified"
out[is.na(out) & !is.na(x)] <- "Other specified medical conditions"
out
}
# 2) Apply to the single-diagnosis field; keep is.na(.) as NA
SISTRAT23_c1_2010_2024_df_prev1z <- SISTRAT23_c1_2010_2024_df_prev1z|>
dplyr::mutate(
phys_dx_supergroup = classify_dx_vec(diagnostico_trs_fisico),
phys_dx_supergroup = forcats::fct_relevel(
factor(phys_dx_supergroup),
supergroups_levels))
# 3) Apply to the multi-response series (split on "||", classify, dedup per row)
# - Produce: a collapsed string per row and wide one-hot indicators
multi_tokens <- SISTRAT23_c1_2010_2024_df_prev1z|>
dplyr::mutate(.rowid = dplyr::row_number(),
dx_series_raw = diagnostico_trs_fisico_series)|>
tidyr::separate_rows(dx_series_raw, sep = "\\|\\|")|>
dplyr::mutate(
dx_token = stringr::str_squish(dx_series_raw),
dx_token = dplyr::na_if(dx_token, ""),
dx_token = dplyr::na_if(dx_token, "NA"),
supergroup_token = classify_dx_vec(dx_token)
)|>
dplyr::distinct(.rowid, supergroup_token, .keep_all = TRUE)
# Collapsed string per row (NA if nothing classifiable present)
series_collapsed <- multi_tokens|>
dplyr::filter(!is.na(supergroup_token))|>
dplyr::group_by(.rowid)|>
dplyr::summarise(
dx_supergroup_series = paste(sort(unique(supergroup_token)), collapse = "||"),
.groups = "drop"
)
# One-hot indicators per supergroup (sanitized column names)
series_dummies <- multi_tokens|>
dplyr::filter(!is.na(supergroup_token))|>
dplyr::mutate(value = 1L)|>
dplyr::distinct(.rowid, supergroup_token, .keep_all = TRUE)|>
tidyr::pivot_wider(
names_from = supergroup_token,
values_from = value,
values_fill = list(value = 0)
)|>
dplyr::rename_with(.fn = base::make.names, .cols = - .rowid)|>
dplyr::select(dplyr::any_of(c(".rowid", "hash_key", "en.estudio", "Other.specified.medical.conditions", "Organ.system.medical.diseases..cardio..hepatic..oral..hematologic.", "sin.trastorno", "Injuries.and.sequelae", "Infectious.diseases..STIs..viral.hepatitis..substance.use.related.")))|>
janitor::clean_names()|>
dplyr::rename("phys_dg_org_med_card_hep_oral_hem"="organ_system_medical_diseases_cardio_hepatic_oral_hematologic", "phys_dg_infect_viral_hep_sud_rel"="infectious_diseases_st_is_viral_hepatitis_substance_use_related", "phys_dg_instudy"="en_estudio", "phys_dg_other_med_cond"="other_specified_medical_conditions", "phys_dg_inj_sequelae"="injuries_and_sequelae")|>
dplyr::group_by(hash_key)|>
dplyr::summarise(
phys_dx_instudy = sum(phys_dg_instudy, na.rm = TRUE),
phys_dx_other_spec_medical_cond = sum(phys_dg_other_med_cond, na.rm = TRUE),
phys_dx_organ_system_med_dis = sum(phys_dg_org_med_card_hep_oral_hem, na.rm = TRUE),
phys_dx_injuries_and_sequelae = sum(phys_dg_inj_sequelae, na.rm = TRUE),
phys_dx_infectious_diseases = sum(phys_dg_infect_viral_hep_sud_rel, na.rm = TRUE),
.groups = "drop"
)|>
# Optional: Reorder columns for clarity (hash_key first, then dummies)
dplyr::select(hash_key, dplyr::everything())
# 4) Join back to your main data (keeps rows with NA series as NA)
SISTRAT23_c1_2010_2024_df_prev1z <- SISTRAT23_c1_2010_2024_df_prev1z|>
dplyr::mutate(.rowid = dplyr::row_number())|>
dplyr::left_join(series_collapsed, by = ".rowid", multiple="first")|>
dplyr::left_join(series_dummies, by = "hash_key", multiple="first")|>
dplyr::select(-.rowid)|>
dplyr::mutate(
# assuming logical columns like dx_F0, dx_F2, ..., dx_F8_9
dx_f2_smi_psychotic = as.integer(f2_psychotic),
dx_f3_mood = as.integer(f3_mood),
dx_f45_anx_stress_phys = as.integer(f4_anxiety_stress_somatoform| f5_physio_eating_sleep_sexual),
dx_f6_personality = as.integer(f6_personality_adult_behaviour),
dx_f0789_neurocog_dev = as.integer(f0_organic | f7_intellectual_disability | f8_9_neurodevelopment_child)
# optional: single collapsed factor, with priority
)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
hist(table(round(SISTRAT23_c1_2010_2024_df_prev1z$dit_m,1), exclude=NULL), breaks=120, main= "Histogram: Months in treatment", xlab="Months")#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
Create dummied for psychiatric comorbidity
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
Living arrangement/Cohabitation status
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
Domestic violence & Sex abuse
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
Primary subs
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
Tr outcome
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
Recode Type of housing, Admission age, Treatment days, Tr. compliance; Nationallity (Chile);
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
1.4. Missingness
We assessed missing data across predictors, calculating the overall proportion of complete cases and percentage of missing values for each variable.
Code
# Character vector of the variables
tot_variables <- c(
"readmit_time_from_adm_m", "readmit_time_from_disch_m", "readmit_event",
"death_time_from_adm_m", "death_time_from_disch_m", "death_event",
"sex_rec", "adm_age_log", "adm_age_pow2", "adm_age_pow3", "adm_age_c", "adm_age_cat", "tenure_status_household", "occupation_condition_corr24",
"marital_status_rec", "porc_pobr", "porc_pobr_log", "porc_pobr_c",
"ethnicity", "nationality_chile", "urbanicity_cat", "ed_attainment_corr", "cohabitation",
"dg_psiq_cie_10_instudy", "dg_psiq_cie_10_dg", "f0_organic", "f1_substance_use", "f2_psychotic",
"f3_mood", "f4_anxiety_stress_somatoform", "f5_physio_eating_sleep_sexual",
"f6_personality_adult_behaviour", "f7_intellectual_disability", "f8_9_neurodevelopment_child",
"dx_f2_smi_psychotic", "dx_f3_mood", "dx_f45_anx_stress_phys", "dx_f6_personality", "dx_f0789_neurocog_dev",
"sub_dep_icd10_status", "dom_violence", "sex_abuse", "any_violence",
"phys_dx_supergroup","phys_dx_instudy", "phys_dx_other_spec_medical_cond", "phys_dx_organ_system_med_dis",
"phys_dx_injuries_and_sequelae", "phys_dx_infectious_diseases",
"prim_sub_freq", "prim_sub_freq_rec", "polysubstance_strict", "dit_m", "treat_log",
"treat_days_pow2", "treat_days_pow3", "treat_lt_90", "tr_outcome",
"adm_motive", "primary_sub", "primary_sub_mod", "usuario_tribunal_trat_droga",
"tipo_de_vivienda_rec", "tipo_de_vivienda_rec2", "plan_type_corr", "evaluacindelprocesoteraputico",
"eva_consumo", "eva_fam", "eva_relinterp", "eva_ocupacion", "eva_sm", "eva_fisica", "eva_transgnorma"
)
# Character vector of the variables restricted to factor or character columns
char_variables <- c(
"sex_rec", "tenure_status_household", "occupation_condition_corr24",
"marital_status_rec", "ethnicity", "urbanicity_cat", "ed_attainment_corr", "cohabitation",
"dg_psiq_cie_10_instudy", "dg_psiq_cie_10_dg", "f0_organic", "f1_substance_use", "f2_psychotic",
"f3_mood", "f4_anxiety_stress_somatoform", "f5_physio_eating_sleep_sexual",
"f6_personality_adult_behaviour", "f7_intellectual_disability", "f8_9_neurodevelopment_child",
"dx_f2_smi_psychotic", "dx_f3_mood", "dx_f45_anx_stress_phys", "dx_f6_personality", "dx_f0789_neurocog_dev",
"sub_dep_icd10_status", "dom_violence", "sex_abuse", "any_violence",
"phys_dx_supergroup", "phys_dx_instudy", "phys_dx_other_spec_medical_cond", "phys_dx_organ_system_med_dis",
"phys_dx_injuries_and_sequelae", "phys_dx_infectious_diseases",
"prim_sub_freq", "prim_sub_freq_rec", "polysubstance_strict", "treat_lt_90", "tr_outcome",
"adm_motive", "primary_sub", "primary_sub_mod", "usuario_tribunal_trat_droga",
"tipo_de_vivienda_rec", "tipo_de_vivienda_rec2", "plan_type_corr", "evaluacindelprocesoteraputico",
"eva_consumo", "eva_fam", "eva_relinterp", "eva_ocupacion", "eva_sm", "eva_fisica", "eva_transgnorma"
)
paste0( "Percentage of the total that has missing values: ",
scales::percent(1- (
SISTRAT23_c1_2010_2024_df_prev1z[
which(complete.cases(subset(SISTRAT23_c1_2010_2024_df_prev1z, select = tot_variables
))),] %>% nrow() / nrow(SISTRAT23_c1_2010_2024_df_prev1z)
), accuracy = 0.1),
"; total = ", format(nrow(SISTRAT23_c1_2010_2024_df_prev1z), big.mark = ",")
)
paste0( "Percentage of the total that has missing values (w/o other variables): ",
scales::percent(1- (
SISTRAT23_c1_2010_2024_df_prev1z[
which(complete.cases(subset(SISTRAT23_c1_2010_2024_df_prev1z, select = setdiff(tot_variables, c("usuario_tribunal_trat_droga", "dom_violence", "sex_abuse", "any_violence", "ethnicity", "tipo_de_vivienda_rec", "tipo_de_vivienda_rec2", "phys_dx_supergroup", "phys_dx_instudy", "phys_dx_other_spec_medical_cond", "phys_dx_organ_system_med_dis", "phys_dx_injuries_and_sequelae", "evaluacindelprocesoteraputico"))
))),] %>% nrow() / nrow(SISTRAT23_c1_2010_2024_df_prev1z)
), accuracy = 0.1),
"; total = ", format(nrow(SISTRAT23_c1_2010_2024_df_prev1z), big.mark = ",")
)
# Variables to check
vars_check <- c(tot_variables)
# Compute % missing per variable
missing_pct <- colMeans(
is.na(subset(SISTRAT23_c1_2010_2024_df_prev1z, select = vars_check))
) * 100
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# Named vector of labels (names = variable names)
labels_map <- c(
# Times & events
readmit_time_from_adm_m = "Readmission time from admission (months)",
readmit_time_from_disch_m = "Readmission time from discharge (months)",
death_time_from_adm_m = "Death time from admission (months)",
death_time_from_disch_m = "Death time from discharge (months)",
readmit_event = "Readmission event",
death_event = "Death event",
# Demographics & status
sex_rec = "Sex (biological categorization)",
tenure_status_household = "Housing situation",
tenure_status_household_rec = "Housing situation (- informal settlement)",
occupation_condition_corr24 = "Employment status",
marital_status_rec = "Marital status",
ed_attainment_corr = "Educational attainment",
ethnicity = "Ethnicity (=1)",
nationality_chile = "Nationality (Chile)",
adm_age_log = "Admission age (log)",
adm_age_pow2 = "Admission age ^2",
adm_age_pow3 = "Admission age ^3",
adm_age_c = "Admission age (centered)",
adm_age_cat = "Admission age (three groups)",
# Contextual
porc_pobr = "Poverty index of commune of residence",
porc_pobr_log = "Poverty index (log1p)",
porc_pobr_c = "Poverty index (centered)",
porc_pobr_c_cat6 = "Poverty index (6 quantile groups)",
urbanicity_cat = "Urbanization level of commune of residence",
# Living arrangement
cohabitation = "Cohabitation status (harmonized)",
# Clinical/behavioral
dg_psiq_cie_10_instudy = "Psychiatric comorbidity (ICD-10, in-study)",
dg_psiq_cie_10_dg = "Psychiatric comorbidity (ICD-10, diagnosis record)",
sub_dep_icd10_status = "SUD severity (ICD-10)",
prim_sub_freq = "Primary-substance use frequency at admission",
prim_sub_freq_rec = "Primary-substance use frequency at admission (simplified)",
polysubstance_strict = "Polysubstance use (strict)",
# Treatment exposure
dit_m = "Length of stay in treatment (months)",
treat_log = "Length of stay (log1p months)",
treat_days_pow2 = "Length of stay (months)^2",
treat_days_pow3 = "Length of stay (months)^3",
treat_lt_90 = "Treatment length < 90 days (indicator)",
tr_outcome = "Treatment outcome",
adm_motive = "Admission motive",
primary_sub = "Primary substance of use (detailed substances)",
primary_sub_mod = "Primary substance of use (recoded)",
# Research-context extras
usuario_tribunal_trat_droga = "Court-referred to drug treatment (1 = yes)",
tipo_de_vivienda_rec = "Housing type (recoded)",
tipo_de_vivienda_rec2 = "Housing type (recoded & simplified)",
# Stratification
plan_type_corr = "Treatment modality (plan type)",
prim_sub_licit = "Licit primary substance at admission",
f0_organic = "F0. Organic, including symptomatic, mental disorders",
f1_substance_use = "F1. Mental and behavioural disorders due to psychoactive substance use",
f2_psychotic = "F2. Schizophrenia, schizotypal and delusional disorders",
f3_mood = "F3. Mood [affective] disorders",
f4_anxiety_stress_somatoform = "F4. Neurotic, stress-related and somatoform disorders",
f5_physio_eating_sleep_sexual = "F5. Behavioural syndromes associated with physiological disturbances and physical factors",
f6_personality_adult_behaviour = "F6. Disorders of adult personality and behaviour",
f7_intellectual_disability = "F7. Intellectual disability",
f8_9_neurodevelopment_child = "F8-9. Disorders of psychological development and behavioural/emotional disorders with onset in childhood and adolescence",
dx_f2_smi_psychotic = "Psychotic disorders (F2)",
dx_f3_mood = "Mood disorders (F3)",
dx_f45_anx_stress_phys = "Anxiety & stress-related (F4–F5)",
dx_f6_personality = "Personality disorders (F6)",
dx_f0789_neurocog_dev = "Neurocognitive & neurodevelopmental (F0, F7–F9)",
# Violence indicators
any_violence = "Domestic violence/Sexual abuse",
dom_violence = "Domestic violence",
sex_abuse = "Sexual abuse",
# Physical diagnosis groups
phys_dx_supergroup = "Physical diagnosis supergroup (single)",
phys_dx_instudy = "Physical diagnosis: under study",
phys_dx_other_spec_medical_cond = "Physical diagnosis: other medical conditions",
phys_dx_organ_system_med_dis = "Physical diagnosis: organ-system diseases",
phys_dx_injuries_and_sequelae = "Physical diagnosis: injuries & sequelae",
phys_dx_infectious_diseases = "Physical diagnosis: infectious diseases",
# Evaluation at discharge
evaluacindelprocesoteraputico = "Assessment of the therapeutic process",
eva_consumo = "Evaluation of consumption pattern at discharge",
eva_fam = "Family situation evaluation at discharge",
eva_relinterp = "Interpersonal relations evaluation at discharge",
eva_ocupacion = "Occupational situation evaluation at discharge",
eva_sm = "Mental health evaluation at discharge",
eva_fisica = "Physical health evaluation at discharge",
eva_transgnorma = "Social norm transgression evaluation at discharge"
)
# Apply labels safely (only to variables that exist)
for(nm in intersect(names(SISTRAT23_c1_2010_2024_df_prev1z), names(labels_map))){
attr(SISTRAT23_c1_2010_2024_df_prev1z[[nm]],"label")<-labels_map[[nm]]
}
# Extract labels for each variable, defaulting to NA if not present
labels_vec <- sapply(names(missing_pct), function(nm) {
lbl <- attr(SISTRAT23_c1_2010_2024_df_prev1z[[nm]], "label")
if (is.null(lbl)) NA_character_ else lbl
})
# Round and format into data.frame with labels
data.frame(
Variable = names(missing_pct),
Label = labels_vec,
Missing_Percent = round(missing_pct, 2))|>
# Display as markdown table
knitr::kable(format = "markdown",
align = c("l", "l", "r"),
col.names = c("Variable", "Label", "% Missing"),
row.names = FALSE)[1] "Percentage of the total that has missing values: 33.3%; total = 88,504"
[1] "Percentage of the total that has missing values (w/o other variables): 6.3%; total = 88,504"
| Variable | Label | % Missing |
|---|---|---|
| readmit_time_from_adm_m | Readmission time from admission (months) | 0.00 |
| readmit_time_from_disch_m | Readmission time from discharge (months) | 0.00 |
| readmit_event | Readmission event | 0.00 |
| death_time_from_adm_m | Death time from admission (months) | 0.00 |
| death_time_from_disch_m | Death time from discharge (months) | 0.00 |
| death_event | Death event | 0.00 |
| sex_rec | Sex (biological categorization) | 0.00 |
| adm_age_log | Admission age (log) | 0.00 |
| adm_age_pow2 | Admission age ^2 | 0.00 |
| adm_age_pow3 | Admission age ^3 | 0.00 |
| adm_age_c | Admission age (centered) | 0.00 |
| adm_age_cat | Admission age (three groups) | 0.00 |
| tenure_status_household | Housing situation | 5.06 |
| occupation_condition_corr24 | Employment status | 0.00 |
| marital_status_rec | Marital status | 0.18 |
| porc_pobr | Poverty index of commune of residence | 0.00 |
| porc_pobr_log | Poverty index (log1p) | 0.00 |
| porc_pobr_c | Poverty index (centered) | 0.00 |
| ethnicity | Ethnicity (=1) | 0.00 |
| nationality_chile | Nationality (Chile) | 0.00 |
| urbanicity_cat | Urbanization level of commune of residence | 0.00 |
| ed_attainment_corr | Educational attainment | 0.43 |
| cohabitation | Cohabitation status (harmonized) | 0.00 |
| dg_psiq_cie_10_instudy | Psychiatric comorbidity (ICD-10, in-study) | 0.00 |
| dg_psiq_cie_10_dg | Psychiatric comorbidity (ICD-10, diagnosis record) | 0.00 |
| f0_organic | F0. Organic, including symptomatic, mental disorders | 0.00 |
| f1_substance_use | F1. Mental and behavioural disorders due to psychoactive substance use | 0.00 |
| f2_psychotic | F2. Schizophrenia, schizotypal and delusional disorders | 0.00 |
| f3_mood | F3. Mood [affective] disorders | 0.00 |
| f4_anxiety_stress_somatoform | F4. Neurotic, stress-related and somatoform disorders | 0.00 |
| f5_physio_eating_sleep_sexual | F5. Behavioural syndromes associated with physiological disturbances and physical factors | 0.00 |
| f6_personality_adult_behaviour | F6. Disorders of adult personality and behaviour | 0.00 |
| f7_intellectual_disability | F7. Intellectual disability | 0.00 |
| f8_9_neurodevelopment_child | F8-9. Disorders of psychological development and behavioural/emotional disorders with onset in childhood and adolescence | 0.00 |
| dx_f2_smi_psychotic | Psychotic disorders (F2) | 0.00 |
| dx_f3_mood | Mood disorders (F3) | 0.00 |
| dx_f45_anx_stress_phys | Anxiety & stress-related (F4–F5) | 0.00 |
| dx_f6_personality | Personality disorders (F6) | 0.00 |
| dx_f0789_neurocog_dev | Neurocognitive & neurodevelopmental (F0, F7–F9) | 0.00 |
| sub_dep_icd10_status | SUD severity (ICD-10) | 0.00 |
| dom_violence | Domestic violence | 20.09 |
| sex_abuse | Sexual abuse | 20.09 |
| any_violence | Domestic violence/Sexual abuse | 20.09 |
| phys_dx_supergroup | Physical diagnosis supergroup (single) | 0.17 |
| phys_dx_instudy | Physical diagnosis: under study | 0.17 |
| phys_dx_other_spec_medical_cond | Physical diagnosis: other medical conditions | 0.17 |
| phys_dx_organ_system_med_dis | Physical diagnosis: organ-system diseases | 0.17 |
| phys_dx_injuries_and_sequelae | Physical diagnosis: injuries & sequelae | 0.17 |
| phys_dx_infectious_diseases | Physical diagnosis: infectious diseases | 0.17 |
| prim_sub_freq | Primary-substance use frequency at admission | 0.46 |
| prim_sub_freq_rec | Primary-substance use frequency at admission (simplified) | 0.46 |
| polysubstance_strict | Polysubstance use (strict) | 0.00 |
| dit_m | Length of stay in treatment (months) | 0.00 |
| treat_log | Length of stay (log1p months) | 0.00 |
| treat_days_pow2 | Length of stay (months)^2 | 0.00 |
| treat_days_pow3 | Length of stay (months)^3 | 0.00 |
| treat_lt_90 | Treatment length < 90 days (indicator) | 0.00 |
| tr_outcome | Treatment outcome | 0.00 |
| adm_motive | Admission motive | 0.00 |
| primary_sub | Primary substance of use (detailed substances) | 0.00 |
| primary_sub_mod | Primary substance of use (recoded) | 0.00 |
| usuario_tribunal_trat_droga | Court-referred to drug treatment (1 = yes) | 4.40 |
| tipo_de_vivienda_rec | Housing type (recoded) | 7.26 |
| tipo_de_vivienda_rec2 | Housing type (recoded & simplified) | 7.26 |
| plan_type_corr | Treatment modality (plan type) | 0.00 |
| evaluacindelprocesoteraputico | Assessment of the therapeutic process | 0.25 |
| eva_consumo | Evaluation of consumption pattern at discharge | 0.27 |
| eva_fam | Family situation evaluation at discharge | 0.27 |
| eva_relinterp | Interpersonal relations evaluation at discharge | 0.27 |
| eva_ocupacion | Occupational situation evaluation at discharge | 0.27 |
| eva_sm | Mental health evaluation at discharge | 0.27 |
| eva_fisica | Physical health evaluation at discharge | 0.27 |
| eva_transgnorma | Social norm transgression evaluation at discharge | 0.27 |
1.4.* Currently in
We identified 215 cases with missing treatment outcomes. For these cases, treatment outcome, duration of treatment, and other treatment-related variables (e.g., evaluation at discharge) are unknown.
Code
SISTRAT23_c1_2010_2024_df_prev1z|>
dplyr::filter(tr_outcome=="other")|>
dplyr::select(hash_key, tr_outcome, adm_date_rec2, disch_date_rec6, dit_rec6, adm_year)|>
#janitor::tabyl(adm_year)
dplyr::mutate(
hash_key = gsub(
"(?<=^.{8}).{2}", # lookbehind for first 8 chars, match the next 2
"**", # replace with two asterisks
hash_key,
perl = TRUE
)) |>
head()# A tidytable: 6 × 6
hash_key tr_outcome adm_date_rec2 disch_date_rec6 dit_rec6 adm_year
<chr> <chr> <date> <date> <dbl> <int>
1 0065a4d6**af0c519d… other 2016-08-08 2019-08-09 1096 2016
2 00be34d5**32b9d1ee… other 2016-04-13 2019-04-14 1096 2016
3 01fea888**92b29fcc… other 2019-05-01 2022-05-01 1096 2019
4 0426e03d**26cb489d… other 2011-04-27 2014-04-27 1096 2011
5 048e829d**f1109af2… other 2011-07-25 2014-07-25 1096 2011
6 04b987f5**a824ea89… other 2011-02-07 2014-02-07 1096 2011
1.5. Descriptives
Code
## 0) Guard: use only columns that exist
tot_variables_present<-intersect(tot_variables,names(SISTRAT23_c1_2010_2024_df_prev1z))
char_variables_present<-intersect(c(char_variables, "readmit_event", "death_event"),names(SISTRAT23_c1_2010_2024_df_prev1z))
extra_cat<-intersect(c("disch_age_cat"),names(SISTRAT23_c1_2010_2024_df_prev1z))
extra_num<-intersect(c("disch_age_rec"),names(SISTRAT23_c1_2010_2024_df_prev1z))
categorical_vars<-union(char_variables_present,extra_cat)
numerical_vars<-setdiff(union(tot_variables_present,union(extra_cat,extra_num)),categorical_vars)
## 1) Create TableOne (coerce character categoricals to factor inline)
table_one_clean<-tableone::CreateTableOne(
vars= setdiff(c(categorical_vars,numerical_vars), "plan_type_corr"),
data= tidytable::mutate(SISTRAT23_c1_2010_2024_df_prev1z,
tidytable::across(tidyselect::all_of(categorical_vars),\(x){if(is.character(x)) factor(x) else x}),
porc_pobr= porc_pobr*100),
factorVars= setdiff(char_variables_present,"plan_type_corr"),
strata= "plan_type_corr",
addOverall= TRUE,
test= TRUE,
smd= TRUE#,
#labelTranslations=label_map
)
## 4) Print (treat numeric as non-normal; keep your formatting)
escape_regex<-function(x) gsub("([][{}()+*^$.|?\\\\])","\\\\\\1",x)
rn<-rownames(table_one_clean_print)
keys<-names(labels_map)
keys<-keys[order(nchar(keys),decreasing=TRUE)] # avoid partial overlaps
for(nm in keys){
pat<-paste0("^",escape_regex(nm),"(?=\\s|\\(|:|=|$)") # start, then space/(...)/: /= or end
rn<-sub(pat,labels_map[[nm]],rn,perl=TRUE) # replace only the prefix; keep the rest
}
rownames(table_one_clean_print)<-rn
wdpath<- paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
## 5) HTML table
table_one_clean_print|>
knitr::kable("html", caption= "Table 1. Descriptives by baseline treatment setting")|>
kableExtra::kable_classic()|>
kableExtra::row_spec(0:1, bold = TRUE)|>
kableExtra::scroll_box(height = "400px")
write.table(table_one_clean_print,
file= paste0(wdpath,"cons/_out/table_one_prediction_clean.txt"),
sep = "\t", row.names = T)| level | Overall | m-pai | m-pr | pg-pab | pg-pai | pg-pr | p | test | SMD | Missing | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| n | 88,504 | 5,263 | 3,545 | 32,424 | 37,555 | 9,717 | |||||
| Sex (biological categorization) (%) | man | 65806 (74.4) | 0 (0.0) | 0 (0.0) | 26249 (81.0) | 30414 (81.0) | 9143 (94.1) | <0.001 | 2.377 | 0.0 | |
| woman | 22696 (25.6) | 5263 (100.0) | 3545 (100.0) | 6175 (19.0) | 7139 (19.0) | 574 (5.9) | |||||
| Housing situation (%) | illegal settlement | 947 (1.1) | 65 (1.3) | 86 (2.6) | 227 (0.7) | 340 (0.9) | 229 (2.7) | <0.001 | 0.200 | 5.1 | |
| others | 2380 (2.8) | 139 (2.7) | 89 (2.7) | 974 (3.1) | 875 (2.4) | 303 (3.5) | |||||
| owner/transferred dwellings/pays dividends | 30670 (36.5) | 1716 (33.4) | 940 (28.5) | 12047 (38.8) | 12451 (34.7) | 3516 (40.7) | |||||
| renting | 15476 (18.4) | 1007 (19.6) | 471 (14.3) | 5936 (19.1) | 6818 (19.0) | 1244 (14.4) | |||||
| stays temporarily with a relative | 34553 (41.1) | 2214 (43.1) | 1708 (51.9) | 11877 (38.2) | 15410 (42.9) | 3344 (38.7) | |||||
| Employment status (%) | employed | 43336 (49.0) | 1603 (30.5) | 361 (10.2) | 21086 (65.0) | 18574 (49.5) | 1712 (17.6) | <0.001 | 0.792 | 0.0 | |
| inactive | 14307 (16.2) | 1972 (37.5) | 1665 (47.0) | 3570 (11.0) | 5386 (14.3) | 1714 (17.6) | |||||
| unemployed | 30861 (34.9) | 1688 (32.1) | 1519 (42.8) | 7768 (24.0) | 13595 (36.2) | 6291 (64.7) | |||||
| Marital status (%) | married/cohabiting | 29768 (33.7) | 1662 (31.6) | 822 (23.2) | 13127 (40.6) | 11976 (31.9) | 2181 (22.5) | <0.001 | 0.211 | 0.2 | |
| separated/divorced/annulled/widowed | 10269 (11.6) | 644 (12.2) | 441 (12.5) | 3881 (12.0) | 4234 (11.3) | 1069 (11.0) | |||||
| single | 48304 (54.7) | 2953 (56.2) | 2277 (64.3) | 15339 (47.4) | 21279 (56.8) | 6456 (66.5) | |||||
| Ethnicity (=1) (%) | 0 | 82870 (93.6) | 4868 (92.5) | 3312 (93.4) | 30386 (93.7) | 35158 (93.6) | 9146 (94.1) | 0.003 | 0.028 | 0.0 | |
| 1 | 5634 (6.4) | 395 (7.5) | 233 (6.6) | 2038 (6.3) | 2397 (6.4) | 571 (5.9) | |||||
| Urbanization level of commune of residence (%) | 1.Rural | 7669 (8.7) | 210 (4.0) | 187 (5.3) | 3030 (9.3) | 3803 (10.1) | 439 (4.5) | <0.001 | 0.176 | 0.0 | |
| 2.Mixed | 8659 (9.8) | 383 (7.3) | 261 (7.4) | 4116 (12.7) | 3203 (8.5) | 696 (7.2) | |||||
| 3.Urban | 72176 (81.6) | 4670 (88.7) | 3097 (87.4) | 25278 (78.0) | 30549 (81.3) | 8582 (88.3) | |||||
| Educational attainment (%) | 1-More than high school | 16157 (18.3) | 899 (17.1) | 641 (18.1) | 5356 (16.6) | 7672 (20.5) | 1589 (16.4) | <0.001 | 0.085 | 0.4 | |
| 2-Completed high school or less | 48791 (55.4) | 2785 (53.0) | 1855 (52.4) | 17756 (55.1) | 20891 (55.8) | 5504 (56.7) | |||||
| 3-Completed primary school or less | 23176 (26.3) | 1567 (29.8) | 1042 (29.5) | 9108 (28.3) | 8847 (23.6) | 2612 (26.9) | |||||
| Cohabitation status (harmonized) (%) | alone | 8415 (9.5) | 278 (5.3) | 337 (9.5) | 2815 (8.7) | 3404 (9.1) | 1581 (16.3) | <0.001 | 0.468 | 0.0 | |
| with couple/children | 39730 (44.9) | 3507 (66.6) | 1828 (51.6) | 16525 (51.0) | 15679 (41.7) | 2191 (22.5) | |||||
| family of origin | 32725 (37.0) | 1010 (19.2) | 954 (26.9) | 10683 (32.9) | 15142 (40.3) | 4936 (50.8) | |||||
| Others | 7634 (8.6) | 468 (8.9) | 426 (12.0) | 2401 (7.4) | 3330 (8.9) | 1009 (10.4) | |||||
| Psychiatric comorbidity (ICD-10, in-study) (%) | FALSE | 73020 (82.5) | 4482 (85.2) | 2678 (75.5) | 27173 (83.8) | 30921 (82.3) | 7766 (79.9) | <0.001 | 0.118 | 0.0 | |
| TRUE | 15484 (17.5) | 781 (14.8) | 867 (24.5) | 5251 (16.2) | 6634 (17.7) | 1951 (20.1) | |||||
| Psychiatric comorbidity (ICD-10, diagnosis record) (%) | FALSE | 49013 (55.4) | 2459 (46.7) | 1260 (35.5) | 20654 (63.7) | 19896 (53.0) | 4744 (48.8) | <0.001 | 0.256 | 0.0 | |
| TRUE | 39491 (44.6) | 2804 (53.3) | 2285 (64.5) | 11770 (36.3) | 17659 (47.0) | 4973 (51.2) | |||||
| F0. Organic, including symptomatic, mental disorders (%) | FALSE | 87399 (98.8) | 5233 (99.4) | 3492 (98.5) | 32163 (99.2) | 36958 (98.4) | 9553 (98.3) | <0.001 | 0.057 | 0.0 | |
| TRUE | 1105 (1.2) | 30 (0.6) | 53 (1.5) | 261 (0.8) | 597 (1.6) | 164 (1.7) | |||||
| F1. Mental and behavioural disorders due to psychoactive substance use (%) | FALSE | 88504 (100.0) | 5263 (100.0) | 3545 (100.0) | 32424 (100.0) | 37555 (100.0) | 9717 (100.0) | NA | <0.001 | 0.0 | |
| F2. Schizophrenia, schizotypal and delusional disorders (%) | FALSE | 87145 (98.5) | 5240 (99.6) | 3500 (98.7) | 32248 (99.5) | 36691 (97.7) | 9466 (97.4) | <0.001 | 0.103 | 0.0 | |
| TRUE | 1359 (1.5) | 23 (0.4) | 45 (1.3) | 176 (0.5) | 864 (2.3) | 251 (2.6) | |||||
| F3. Mood [affective] disorders (%) | FALSE | 79431 (89.7) | 4575 (86.9) | 3088 (87.1) | 29383 (90.6) | 33645 (89.6) | 8740 (89.9) | <0.001 | 0.065 | 0.0 | |
| TRUE | 9073 (10.3) | 688 (13.1) | 457 (12.9) | 3041 (9.4) | 3910 (10.4) | 977 (10.1) | |||||
| F4. Neurotic, stress-related and somatoform disorders (%) | FALSE | 86864 (98.1) | 5158 (98.0) | 3513 (99.1) | 31682 (97.7) | 36894 (98.2) | 9617 (99.0) | <0.001 | 0.061 | 0.0 | |
| TRUE | 1640 (1.9) | 105 (2.0) | 32 (0.9) | 742 (2.3) | 661 (1.8) | 100 (1.0) | |||||
| F5. Behavioural syndromes associated with physiological disturbances and physical factors (%) | FALSE | 88051 (99.5) | 5217 (99.1) | 3514 (99.1) | 32287 (99.6) | 37359 (99.5) | 9674 (99.6) | <0.001 | 0.033 | 0.0 | |
| TRUE | 453 (0.5) | 46 (0.9) | 31 (0.9) | 137 (0.4) | 196 (0.5) | 43 (0.4) | |||||
| F6. Disorders of adult personality and behaviour (%) | FALSE | 63374 (71.6) | 3400 (64.6) | 1887 (53.2) | 25216 (77.8) | 26369 (70.2) | 6502 (66.9) | <0.001 | 0.236 | 0.0 | |
| TRUE | 25130 (28.4) | 1863 (35.4) | 1658 (46.8) | 7208 (22.2) | 11186 (29.8) | 3215 (33.1) | |||||
| F7. Intellectual disability (%) | FALSE | 87866 (99.3) | 5233 (99.4) | 3520 (99.3) | 32208 (99.3) | 37278 (99.3) | 9627 (99.1) | 0.063 | 0.018 | 0.0 | |
| TRUE | 638 (0.7) | 30 (0.6) | 25 (0.7) | 216 (0.7) | 277 (0.7) | 90 (0.9) | |||||
| F8-9. Disorders of psychological development and behavioural/emotional disorders with onset in childhood and adolescence (%) | FALSE | 87237 (98.6) | 5177 (98.4) | 3494 (98.6) | 32100 (99.0) | 37033 (98.6) | 9433 (97.1) | <0.001 | 0.060 | 0.0 | |
| TRUE | 1267 (1.4) | 86 (1.6) | 51 (1.4) | 324 (1.0) | 522 (1.4) | 284 (2.9) | |||||
| Psychotic disorders (F2) (%) | 0 | 87145 (98.5) | 5240 (99.6) | 3500 (98.7) | 32248 (99.5) | 36691 (97.7) | 9466 (97.4) | <0.001 | 0.103 | 0.0 | |
| 1 | 1359 (1.5) | 23 (0.4) | 45 (1.3) | 176 (0.5) | 864 (2.3) | 251 (2.6) | |||||
| Mood disorders (F3) (%) | 0 | 79431 (89.7) | 4575 (86.9) | 3088 (87.1) | 29383 (90.6) | 33645 (89.6) | 8740 (89.9) | <0.001 | 0.065 | 0.0 | |
| 1 | 9073 (10.3) | 688 (13.1) | 457 (12.9) | 3041 (9.4) | 3910 (10.4) | 977 (10.1) | |||||
| Anxiety & stress-related (F4–F5) (%) | 0 | 86422 (97.6) | 5114 (97.2) | 3484 (98.3) | 31549 (97.3) | 36701 (97.7) | 9574 (98.5) | <0.001 | 0.051 | 0.0 | |
| 1 | 2082 (2.4) | 149 (2.8) | 61 (1.7) | 875 (2.7) | 854 (2.3) | 143 (1.5) | |||||
| Personality disorders (F6) (%) | 0 | 63374 (71.6) | 3400 (64.6) | 1887 (53.2) | 25216 (77.8) | 26369 (70.2) | 6502 (66.9) | <0.001 | 0.236 | 0.0 | |
| 1 | 25130 (28.4) | 1863 (35.4) | 1658 (46.8) | 7208 (22.2) | 11186 (29.8) | 3215 (33.1) | |||||
| Neurocognitive & neurodevelopmental (F0, F7–F9) (%) | 0 | 85517 (96.6) | 5117 (97.2) | 3416 (96.4) | 31626 (97.5) | 36169 (96.3) | 9189 (94.6) | <0.001 | 0.072 | 0.0 | |
| 1 | 2987 (3.4) | 146 (2.8) | 129 (3.6) | 798 (2.5) | 1386 (3.7) | 528 (5.4) | |||||
| SUD severity (ICD-10) (%) | drug dependence | 64429 (72.8) | 4105 (78.0) | 3343 (94.3) | 17824 (55.0) | 30278 (80.6) | 8879 (91.4) | <0.001 | 0.477 | 0.0 | |
| hazardous consumption | 24075 (27.2) | 1158 (22.0) | 202 (5.7) | 14600 (45.0) | 7277 (19.4) | 838 (8.6) | |||||
| Domestic violence (%) | No domestic violence | 52301 (74.0) | 2670 (60.3) | 1851 (55.4) | 18960 (78.4) | 22784 (75.9) | 6036 (69.3) | <0.001 | 0.268 | 20.1 | |
| Domestic violence | 18419 (26.0) | 1761 (39.7) | 1491 (44.6) | 5236 (21.6) | 7251 (24.1) | 2680 (30.7) | |||||
| Sexual abuse (%) | No sexual abuse | 69345 (98.1) | 4174 (94.2) | 3070 (91.9) | 23977 (99.1) | 29570 (98.5) | 8554 (98.1) | <0.001 | 0.192 | 20.1 | |
| Sexual abuse | 1375 (1.9) | 257 (5.8) | 272 (8.1) | 219 (0.9) | 465 (1.5) | 162 (1.9) | |||||
| Domestic violence/Sexual abuse (%) | 1.Domestic violence/sex abuse | 19794 (28.0) | 2018 (45.5) | 1763 (52.8) | 5455 (22.5) | 7716 (25.7) | 2842 (32.6) | <0.001 | 0.344 | 20.1 | |
| 0.No domestic violence/sex abuse | 50926 (72.0) | 2413 (54.5) | 1579 (47.2) | 18741 (77.5) | 22319 (74.3) | 5874 (67.4) | |||||
| Physical diagnosis supergroup (single) (%) | en estudio | 48717 (55.1) | 2814 (53.5) | 1990 (56.2) | 17779 (55.0) | 20990 (55.9) | 5144 (53.1) | <0.001 | 0.240 | 0.2 | |
| sin trastorno | 31242 (35.4) | 1909 (36.3) | 883 (24.9) | 12409 (38.4) | 12952 (34.5) | 3089 (31.9) | |||||
| Infectious diseases (STIs, viral hepatitis, substance-use–related) | 700 (0.8) | 75 (1.4) | 85 (2.4) | 131 (0.4) | 300 (0.8) | 109 (1.1) | |||||
| Organ-system medical diseases (cardio, hepatic, oral, hematologic) | 2450 (2.8) | 149 (2.8) | 185 (5.2) | 649 (2.0) | 981 (2.6) | 486 (5.0) | |||||
| Injuries and sequelae | 1327 (1.5) | 47 (0.9) | 131 (3.7) | 293 (0.9) | 481 (1.3) | 375 (3.9) | |||||
| Other specified medical conditions | 3917 (4.4) | 266 (5.1) | 270 (7.6) | 1076 (3.3) | 1814 (4.8) | 491 (5.1) | |||||
| Physical diagnosis: under study (%) | 0 | 39061 (44.2) | 2420 (46.0) | 1404 (39.6) | 14514 (44.9) | 16398 (43.7) | 4325 (44.6) | <0.001 | 0.056 | 0.2 | |
| 1 | 49292 (55.8) | 2840 (54.0) | 2140 (60.4) | 17823 (55.1) | 21120 (56.3) | 5369 (55.4) | |||||
| Physical diagnosis: other medical conditions (%) | 0 | 84405 (95.5) | 4992 (94.9) | 3263 (92.1) | 31258 (96.7) | 35695 (95.1) | 9197 (94.9) | <0.001 | 0.083 | 0.2 | |
| 1 | 3948 (4.5) | 268 (5.1) | 281 (7.9) | 1079 (3.3) | 1823 (4.9) | 497 (5.1) | |||||
| Physical diagnosis: organ-system diseases (%) | 0 | 85879 (97.2) | 5109 (97.1) | 3355 (94.7) | 31686 (98.0) | 36531 (97.4) | 9198 (94.9) | <0.001 | 0.097 | 0.2 | |
| 1 | 2474 (2.8) | 151 (2.9) | 189 (5.3) | 651 (2.0) | 987 (2.6) | 496 (5.1) | |||||
| Physical diagnosis: injuries & sequelae (%) | 0 | 87011 (98.5) | 5213 (99.1) | 3406 (96.1) | 32043 (99.1) | 37036 (98.7) | 9313 (96.1) | <0.001 | 0.120 | 0.2 | |
| 1 | 1342 (1.5) | 47 (0.9) | 138 (3.9) | 294 (0.9) | 482 (1.3) | 381 (3.9) | |||||
| Physical diagnosis: infectious diseases (%) | 0 | 87644 (99.2) | 5185 (98.6) | 3454 (97.5) | 32205 (99.6) | 37216 (99.2) | 9584 (98.9) | <0.001 | 0.086 | 0.2 | |
| 1 | 709 (0.8) | 75 (1.4) | 90 (2.5) | 132 (0.4) | 302 (0.8) | 110 (1.1) | |||||
| Primary-substance use frequency at admission (%) | 1. Less than 1 day a week | 4399 (5.0) | 243 (4.6) | 25 (0.7) | 2611 (8.1) | 1324 (3.5) | 196 (2.0) | <0.001 | 0.586 | 0.5 | |
| 2. 1 day a week | 5831 (6.6) | 295 (5.6) | 34 (1.0) | 3375 (10.5) | 1897 (5.1) | 230 (2.4) | |||||
| 3. 2 to 3 days a week | 24533 (27.8) | 1452 (27.7) | 354 (10.0) | 11825 (36.7) | 9905 (26.5) | 997 (10.3) | |||||
| 4. 4 to 6 days a week | 14483 (16.4) | 808 (15.4) | 440 (12.4) | 5165 (16.0) | 6808 (18.2) | 1262 (13.0) | |||||
| 5. Daily | 38847 (44.1) | 2444 (46.6) | 2686 (75.9) | 9259 (28.7) | 17436 (46.7) | 7022 (72.3) | |||||
| Primary-substance use frequency at admission (simplified) (%) | 1.≤1 day/wk | 10230 (11.6) | 538 (10.3) | 59 (1.7) | 5986 (18.6) | 3221 (8.6) | 426 (4.4) | <0.001 | 0.554 | 0.5 | |
| 2.2–6 days/wk | 39016 (44.3) | 2260 (43.1) | 794 (22.4) | 16990 (52.7) | 16713 (44.7) | 2259 (23.3) | |||||
| 3.Daily | 38847 (44.1) | 2444 (46.6) | 2686 (75.9) | 9259 (28.7) | 17436 (46.7) | 7022 (72.3) | |||||
| Polysubstance use (strict) (%) | 0 | 23961 (27.1) | 1395 (26.5) | 653 (18.4) | 11608 (35.8) | 8624 (23.0) | 1681 (17.3) | <0.001 | 0.210 | 0.0 | |
| 1 | 64543 (72.9) | 3868 (73.5) | 2892 (81.6) | 20816 (64.2) | 28931 (77.0) | 8036 (82.7) | |||||
| Treatment length < 90 days (indicator) (%) | 0 | 67525 (76.3) | 4202 (79.8) | 2213 (62.4) | 26054 (80.4) | 28309 (75.4) | 6747 (69.4) | <0.001 | 0.210 | 0.0 | |
| 1 | 20979 (23.7) | 1061 (20.2) | 1332 (37.6) | 6370 (19.6) | 9246 (24.6) | 2970 (30.6) | |||||
| Treatment outcome (%) | adm discharge - adm reasons | 1319 (1.5) | 78 (1.5) | 8 (0.2) | 598 (1.8) | 611 (1.6) | 24 (0.2) | <0.001 | 0.279 | 0.0 | |
| adm discharge - rule violation/undet | 6463 (7.3) | 322 (6.1) | 397 (11.2) | 1946 (6.0) | 2532 (6.7) | 1266 (13.0) | |||||
| completion | 23408 (26.4) | 1592 (30.2) | 932 (26.3) | 8093 (25.0) | 9700 (25.8) | 3091 (31.8) | |||||
| dropout | 47055 (53.2) | 2617 (49.7) | 1544 (43.6) | 18904 (58.3) | 20201 (53.8) | 3789 (39.0) | |||||
| other | 215 (0.2) | 7 (0.1) | 8 (0.2) | 98 (0.3) | 67 (0.2) | 35 (0.4) | |||||
| referral | 10044 (11.3) | 647 (12.3) | 656 (18.5) | 2785 (8.6) | 4444 (11.8) | 1512 (15.6) | |||||
| Admission motive (%) | another SUD facility/FONODROGAS/SENDA Previene | 8287 (9.4) | 304 (5.8) | 793 (22.4) | 1592 (4.9) | 3202 (8.5) | 2396 (24.7) | <0.001 | 0.458 | 0.0 | |
| justice sector | 8470 (9.6) | 941 (17.9) | 241 (6.8) | 3297 (10.2) | 3395 (9.0) | 596 (6.1) | |||||
| other | 4574 (5.2) | 387 (7.4) | 211 (6.0) | 1272 (3.9) | 2161 (5.8) | 543 (5.6) | |||||
| sanitary sector | 27482 (31.1) | 1758 (33.4) | 1300 (36.7) | 8942 (27.6) | 12707 (33.8) | 2775 (28.6) | |||||
| spontaneous consultation | 39691 (44.8) | 1873 (35.6) | 1000 (28.2) | 17321 (53.4) | 16090 (42.8) | 3407 (35.1) | |||||
| Primary substance of use (detailed substances) (%) | alcohol | 30214 (34.1) | 1497 (28.4) | 621 (17.5) | 14317 (44.2) | 11834 (31.5) | 1945 (20.0) | <0.001 | 0.477 | 0.0 | |
| amphetamine-type stimulants | 123 (0.1) | 7 (0.1) | 5 (0.1) | 46 (0.1) | 55 (0.1) | 10 (0.1) | |||||
| cocaine paste | 33442 (37.8) | 2006 (38.1) | 2253 (63.6) | 8554 (26.4) | 14585 (38.8) | 6044 (62.2) | |||||
| cocaine powder | 17221 (19.5) | 1109 (21.1) | 506 (14.3) | 6522 (20.1) | 7775 (20.7) | 1309 (13.5) | |||||
| dissociatives | 3 (0.0) | 0 (0.0) | 0 (0.0) | 1 (0.0) | 2 (0.0) | 0 (0.0) | |||||
| hallucinogens | 13 (0.0) | 1 (0.0) | 0 (0.0) | 4 (0.0) | 8 (0.0) | 0 (0.0) | |||||
| inhalants | 67 (0.1) | 0 (0.0) | 5 (0.1) | 11 (0.0) | 35 (0.1) | 16 (0.2) | |||||
| marijuana | 6029 (6.8) | 462 (8.8) | 77 (2.2) | 2553 (7.9) | 2641 (7.0) | 296 (3.0) | |||||
| opioids | 326 (0.4) | 34 (0.6) | 23 (0.6) | 51 (0.2) | 186 (0.5) | 32 (0.3) | |||||
| others | 116 (0.1) | 22 (0.4) | 1 (0.0) | 41 (0.1) | 46 (0.1) | 6 (0.1) | |||||
| tranquilizers/hypnotics | 950 (1.1) | 125 (2.4) | 54 (1.5) | 324 (1.0) | 388 (1.0) | 59 (0.6) | |||||
| Primary substance of use (recoded) (%) | cocaine paste | 33442 (37.8) | 2006 (38.1) | 2253 (63.6) | 8554 (26.4) | 14585 (38.8) | 6044 (62.2) | <0.001 | 0.469 | 0.0 | |
| cocaine powder | 17221 (19.5) | 1109 (21.1) | 506 (14.3) | 6522 (20.1) | 7775 (20.7) | 1309 (13.5) | |||||
| alcohol | 30214 (34.1) | 1497 (28.4) | 621 (17.5) | 14317 (44.2) | 11834 (31.5) | 1945 (20.0) | |||||
| marijuana | 6029 (6.8) | 462 (8.8) | 77 (2.2) | 2553 (7.9) | 2641 (7.0) | 296 (3.0) | |||||
| others | 1598 (1.8) | 189 (3.6) | 88 (2.5) | 478 (1.5) | 720 (1.9) | 123 (1.3) | |||||
| Court-referred to drug treatment (1 = yes) (%) | no | 82939 (98.0) | 4696 (98.2) | 3277 (97.6) | 31093 (98.8) | 34832 (97.8) | 9041 (96.6) | <0.001 | 0.068 | 4.4 | |
| si | 1668 (2.0) | 86 (1.8) | 80 (2.4) | 389 (1.2) | 790 (2.2) | 323 (3.4) | |||||
| Housing type (recoded) (%) | formal housing | 73763 (89.9) | 4441 (90.2) | 2139 (81.6) | 28843 (92.6) | 32164 (90.2) | 6176 (79.8) | <0.001 | 0.297 | 7.3 | |
| shared/secondary unit | 2787 (3.4) | 215 (4.4) | 88 (3.4) | 995 (3.2) | 1244 (3.5) | 245 (3.2) | |||||
| homeless/unsheltered/informal/temporary housing/institutional/collective | 2864 (3.5) | 64 (1.3) | 256 (9.8) | 402 (1.3) | 1074 (3.0) | 1068 (13.8) | |||||
| other/unknown | 2667 (3.2) | 203 (4.1) | 137 (5.2) | 912 (2.9) | 1168 (3.3) | 247 (3.2) | |||||
| Housing type (recoded & simplified) (%) | formal housing | 73763 (89.9) | 4441 (90.2) | 2139 (81.6) | 28843 (92.6) | 32164 (90.2) | 6176 (79.8) | <0.001 | 0.201 | 7.3 | |
| other/unknown | 8318 (10.1) | 482 (9.8) | 481 (18.4) | 2309 (7.4) | 3486 (9.8) | 1560 (20.2) | |||||
| Assessment of the therapeutic process (%) | logro alto | 21177 (24.0) | 1320 (25.1) | 874 (24.7) | 7482 (23.1) | 8574 (22.9) | 2927 (30.2) | <0.001 | 0.097 | 0.2 | |
| logro intermedio | 28665 (32.5) | 1728 (32.9) | 1140 (32.2) | 10361 (32.1) | 12104 (32.3) | 3332 (34.4) | |||||
| logro minimo | 38444 (43.5) | 2208 (42.0) | 1523 (43.1) | 14483 (44.8) | 16808 (44.8) | 3422 (35.3) | |||||
| Evaluation of consumption pattern at discharge (%) | logro alto | 25866 (29.3) | 1654 (31.5) | 1140 (32.2) | 9054 (28.0) | 10295 (27.5) | 3723 (38.5) | <0.001 | 0.134 | 0.3 | |
| logro intermedio | 25279 (28.6) | 1468 (27.9) | 941 (26.6) | 9269 (28.7) | 10726 (28.6) | 2875 (29.7) | |||||
| logro minimo | 37124 (42.1) | 2134 (40.6) | 1456 (41.2) | 13997 (43.3) | 16453 (43.9) | 3084 (31.9) | |||||
| Family situation evaluation at discharge (%) | logro alto | 20889 (23.7) | 1259 (24.0) | 843 (23.8) | 7617 (23.6) | 8321 (22.2) | 2849 (29.4) | <0.001 | 0.090 | 0.3 | |
| logro intermedio | 28144 (31.9) | 1702 (32.4) | 1109 (31.4) | 10226 (31.6) | 11824 (31.6) | 3283 (33.9) | |||||
| logro minimo | 39236 (44.5) | 2295 (43.7) | 1585 (44.8) | 14477 (44.8) | 17329 (46.2) | 3550 (36.7) | |||||
| Interpersonal relations evaluation at discharge (%) | logro alto | 21507 (24.4) | 1371 (26.1) | 941 (26.6) | 7633 (23.6) | 8661 (23.1) | 2901 (30.0) | <0.001 | 0.103 | 0.3 | |
| logro intermedio | 28053 (31.8) | 1668 (31.7) | 1068 (30.2) | 10180 (31.5) | 11820 (31.5) | 3317 (34.3) | |||||
| logro minimo | 38709 (43.9) | 2217 (42.2) | 1528 (43.2) | 14507 (44.9) | 16993 (45.3) | 3464 (35.8) | |||||
| Occupational situation evaluation at discharge (%) | logro alto | 27771 (31.5) | 1474 (28.0) | 832 (23.5) | 10739 (33.2) | 11733 (31.3) | 2993 (30.9) | <0.001 | 0.135 | 0.3 | |
| logro intermedio | 23483 (26.6) | 1365 (26.0) | 803 (22.7) | 8754 (27.1) | 10054 (26.8) | 2507 (25.9) | |||||
| logro minimo | 37015 (41.9) | 2417 (46.0) | 1902 (53.8) | 12827 (39.7) | 15687 (41.9) | 4182 (43.2) | |||||
| Mental health evaluation at discharge (%) | logro alto | 21471 (24.3) | 1312 (25.0) | 906 (25.6) | 7584 (23.5) | 8689 (23.2) | 2980 (30.8) | <0.001 | 0.103 | 0.3 | |
| logro intermedio | 29000 (32.9) | 1704 (32.4) | 1119 (31.6) | 10632 (32.9) | 12135 (32.4) | 3410 (35.2) | |||||
| logro minimo | 37798 (42.8) | 2240 (42.6) | 1512 (42.7) | 14104 (43.6) | 16650 (44.4) | 3292 (34.0) | |||||
| Physical health evaluation at discharge (%) | logro alto | 26845 (30.4) | 1605 (30.5) | 1340 (37.9) | 8990 (27.8) | 10536 (28.1) | 4374 (45.2) | <0.001 | 0.209 | 0.3 | |
| logro intermedio | 27330 (31.0) | 1594 (30.3) | 1052 (29.7) | 10073 (31.2) | 11710 (31.2) | 2901 (30.0) | |||||
| logro minimo | 34094 (38.6) | 2057 (39.1) | 1145 (32.4) | 13257 (41.0) | 15228 (40.6) | 2407 (24.9) | |||||
| Social norm transgression evaluation at discharge (%) | logro alto | 31583 (35.8) | 2015 (38.3) | 1001 (28.3) | 11845 (36.6) | 13416 (35.8) | 3306 (34.1) | <0.001 | 0.117 | 0.3 | |
| logro intermedio | 21831 (24.7) | 1217 (23.2) | 875 (24.7) | 7852 (24.3) | 9165 (24.5) | 2722 (28.1) | |||||
| logro minimo | 34855 (39.5) | 2024 (38.5) | 1661 (47.0) | 12623 (39.1) | 14893 (39.7) | 3654 (37.7) | |||||
| Readmission event (%) | 0 | 69434 (78.5) | 4045 (76.9) | 2065 (58.3) | 26397 (81.4) | 30155 (80.3) | 6772 (69.7) | <0.001 | 0.257 | 0.0 | |
| 1 | 19070 (21.5) | 1218 (23.1) | 1480 (41.7) | 6027 (18.6) | 7400 (19.7) | 2945 (30.3) | |||||
| Death event (%) | 0 | 84557 (95.5) | 5117 (97.2) | 3366 (95.0) | 30960 (95.5) | 35979 (95.8) | 9135 (94.0) | <0.001 | 0.072 | 0.0 | |
| 1 | 3947 (4.5) | 146 (2.8) | 179 (5.0) | 1464 (4.5) | 1576 (4.2) | 582 (6.0) | |||||
| Readmission time from admission (months) (median [IQR]) | 47.9 [22.6, 80.7] | 40.1 [18.9, 69.5] | 33.6 [13.3, 72.5] | 53.1 [26.5, 84.4] | 45.8 [22.3, 77.3] | 50.3 [19.5, 89.0] | <0.001 | nonnorm | 0.175 | 0.0 | |
| Readmission time from discharge (months) (median [IQR]) | 41.0 [15.0, 73.0] | 32.0 [10.5, 60.3] | 27.0 [4.9, 66.3] | 45.0 [18.9, 76.1] | 38.5 [14.7, 69.1] | 43.6 [11.8, 80.7] | <0.001 | nonnorm | 0.170 | 0.0 | |
| Death time from admission (months) (median [IQR]) | 62.5 [34.0, 91.1] | 54.0 [29.2, 84.9] | 68.5 [37.9, 99.2] | 64.7 [36.8, 91.6] | 58.0 [31.2, 87.2] | 73.9 [41.9, 101.7] | <0.001 | nonnorm | 0.196 | 0.0 | |
| Death time from discharge (months) (median [IQR]) | 54.3 [27.0, 83.9] | 46.4 [21.1, 76.1] | 61.4 [31.7, 92.6] | 56.1 [29.0, 84.4] | 50.5 [24.0, 80.0] | 67.2 [35.0, 95.9] | <0.001 | nonnorm | 0.217 | 0.0 | |
| Admission age (log) (median [IQR]) | 3.6 [3.3, 3.8] | 3.5 [3.3, 3.7] | 3.5 [3.3, 3.7] | 3.6 [3.4, 3.8] | 3.6 [3.3, 3.8] | 3.5 [3.3, 3.7] | <0.001 | nonnorm | 0.142 | 0.0 | |
| Admission age ^2 (median [IQR]) | 1,169.6 [751.9, 1,849.2] | 1,052.4 [691.2, 1,610.8] | 1,041.4 [677.6, 1,637.8] | 1,261.3 [794.1, 2,009.7] | 1,151.9 [746.4, 1,795.2] | 1,091.6 [714.5, 1,717.3] | <0.001 | nonnorm | 0.140 | 0.0 | |
| Admission age ^3 (median [IQR]) | 40,001.7 [20,615.9, 79,520.9] | 34,138.4 [18,170.7, 64,650.2] | 33,604.5 [17,636.9, 66,282.6] | 44,795.6 [22,378.1, 90,096.1] | 39,096.3 [20,391.2, 76,063.3] | 36,067.8 [19,098.4, 71,163.8] | <0.001 | nonnorm | 0.136 | 0.0 | |
| Admission age (centered) (median [IQR]) | -1.6 [-8.3, 7.2] | -3.3 [-9.5, 4.4] | -3.5 [-9.7, 4.7] | -0.2 [-7.6, 9.1] | -1.8 [-8.4, 6.6] | -2.7 [-9.0, 5.7] | <0.001 | nonnorm | 0.142 | 0.0 | |
| Admission age (three groups) (%) | 18-29 | 30634 (34.6) | 2125 (40.4) | 1449 (40.9) | 10165 (31.4) | 13181 (35.1) | 3714 (38.2) | <0.001 | 0.127 | 0.0 | |
| 30-44 | 39637 (44.8) | 2299 (43.7) | 1526 (43.0) | 14282 (44.0) | 17190 (45.8) | 4340 (44.7) | |||||
| 45-64 | 18233 (20.6) | 839 (15.9) | 570 (16.1) | 7977 (24.6) | 7184 (19.1) | 1663 (17.1) | |||||
| Poverty index of commune of residence (median [IQR]) | 12.4 [9.1, 17.0] | 11.2 [8.9, 15.3] | 12.5 [8.9, 16.9] | 13.1 [9.3, 17.9] | 11.9 [9.0, 15.7] | 12.4 [9.0, 17.6] | <0.001 | nonnorm | 0.140 | 0.0 | |
| Poverty index (log1p) (median [IQR]) | 0.1 [0.1, 0.2] | 0.1 [0.1, 0.1] | 0.1 [0.1, 0.2] | 0.1 [0.1, 0.2] | 0.1 [0.1, 0.1] | 0.1 [0.1, 0.2] | <0.001 | nonnorm | 0.139 | 0.0 | |
| Poverty index (centered) (median [IQR]) | 0.0 [-0.1, 0.0] | 0.0 [-0.1, 0.0] | 0.0 [-0.1, 0.0] | 0.0 [0.0, 0.0] | 0.0 [-0.1, 0.0] | 0.0 [-0.1, 0.0] | <0.001 | nonnorm | 0.140 | 0.0 | |
| Nationality (Chile) (%) | chile | 87935 (99.4) | 5212 (99.0) | 3524 (99.4) | 32238 (99.4) | 37307 (99.3) | 9654 (99.4) | 0.021 | 0.020 | 0.0 | |
| other | 569 (0.6) | 51 (1.0) | 21 (0.6) | 186 (0.6) | 248 (0.7) | 63 (0.6) | |||||
| Length of stay in treatment (months) (median [IQR]) | 5.6 [3.0, 10.0] | 6.5 [3.4, 11.5] | 4.4 [1.7, 9.1] | 5.8 [3.4, 10.0] | 5.5 [2.9, 10.0] | 5.3 [2.3, 9.6] | <0.001 | nonnorm | 0.144 | 0.0 | |
| Length of stay (log1p months) (median [IQR]) | 1.9 [1.4, 2.4] | 2.0 [1.5, 2.5] | 1.7 [1.0, 2.3] | 1.9 [1.5, 2.4] | 1.9 [1.4, 2.4] | 1.8 [1.2, 2.4] | <0.001 | nonnorm | 0.228 | 0.0 | |
| Length of stay (months)^2 (median [IQR]) | 31.1 [9.0, 100.0] | 42.0 [11.4, 132.6] | 19.0 [3.0, 82.2] | 33.7 [11.3, 99.4] | 29.7 [8.6, 99.4] | 27.7 [5.4, 93.0] | <0.001 | nonnorm | 0.084 | 0.0 | |
| Length of stay (months)^3 (median [IQR]) | 173.8 [27.0, 1,000.0] | 272.6 [38.3, 1,527.3] | 82.6 [5.3, 744.8] | 195.8 [37.8, 990.4] | 162.0 [25.3, 990.4] | 146.1 [12.5, 897.3] | <0.001 | nonnorm | 0.057 | 0.0 |
General-population settings were mostly male (81–94%). Heavy sex imbalances by setting (1/4= women).
GP settings mostly lived with couples/kids or family of origin (~51%)
Dropout rates exceed 50% in all groups; highest in Basic ambulatory (58%); Completion rates highest in General-poulation residential (32%) and Intensive ambulatory (30%)
Patients in residential treatments were dominated by cocaine base paste as the primary substance (64-62%), daily use (72-76%) and drug dependence (>90%); Basic ambualtory, dominated by alcohol (44%).
Residential treatments were more single (23% vs. 34% overall)
Domestic violence and sexual abuse were higher in women-only settings (40–45% and 6–8% vs. 26% and 2% overall, respectively)
Ambulatory basic received more patients aged 40-65 (25%); women-only programs, less (16%)
Also women-only settings showed SUD severity and Depression slightly higher.
Women-only residential had diagnoses F6 (Disorders of adult personality and behavior) (47% vs. 28% overall) and more diagnoses in study (24% vs. 18% overall)
Women-only residential were younger (34 [13–72]), less went to treatment spontaneously (28% vs. 45% overall)
Residential settings had approx. two times more organ-system diseases (5.3%) and injuries & sequelae (3.9%)
Formal housing: >90% in intensive ambulatory settings vs. 80% in general-population residential; Homelessness/informal housing: 14% in General-population residential vs. 1–3% elsewhere.
Employment was lowest in Women-only residential (10%) and General-population residential (18%), with many unemployed (43–65%), while General-population basic ambulatory had the highest employed rate (65%). Education was similar across groups, with most having high school or less (53–57%), but primary school or less was more common in women-only settings (30%).
Urban residence highest: 88–89% in women-only settings.
**We did not find patients with sex that was inconsistent with their clinical presentation regarding the Pathology of Gestation and of the Intrauterine Fetus without Disorder. Although this study focuses on pregnancy and fetal outcomes, the male parent’s (father’s) medical history and genetic contribution were documented as part of the patient data. G:/My Drive/Alvacast/SISTRAT 2023//cons/_out/table_one_prediction_clean.txt **.
Considering these differences, we think that certain predictors may behave differently by settings
Code
SISTRAT23_c1_2010_2024_df_prev1z|>
janitor::tabyl(readmit_event, death_event)|>
janitor::adorn_percentages("all")|>
janitor::adorn_pct_formatting(digits = 1)|>
janitor::adorn_ns(position = "front")|>
knitr::kable("markdown", caption= "Intersection of Readmission (rows) w/ Mortality (cols)")| readmit_event | 0 | 1 |
|---|---|---|
| 0 | 66,231 (74.8%) | 3,203 (3.6%) |
| 1 | 18,326 (20.7%) | 744 (0.8%) |
1.6. Near-zero variance
Code
# freqRatio The ratio of the frequency of the most common value to the frequency of the second most common value. A large number means the variable is dominated by one value (i.e., nearly constant). #high (e.g. > 20) → highly unbalanced (rare category vs majority).
# percentUnique The percentage of unique values relative to the sample size. Low values mean the variable has very few distinct values — often categorical or binary, but potentially too invariant. #High → many distinct values (continuous-type).
# zeroVar TRUE means the variable has zero variance (same value for all observations). These should always be removed.
# nzv TRUE means the variable has near-zero variance (too little variation to be useful). These are usually removed before modeling.
categorical_vars2<-
setdiff(categorical_vars, c("nationality_cons", "phys_dx_supergroup", "dx_supergroup_series", "f1_substance_use"))
numerical_vars2 <-
setdiff(numerical_vars, c("nationality_cons", "phys_dx_supergroup", "dx_supergroup_series"))
nzv <- nearZeroVar(subset(SISTRAT23_c1_2010_2024_df_prev1z, select= setdiff(c(categorical_vars,numerical_vars), "plan_type_corr")),
names = TRUE, #column names are returned.
saveMetrics= TRUE, # a data frame with predictor information is returned.
allowParallel= TRUE,
foreach= TRUE
)
dplyr::mutate_if(nzv, is.numeric, ~round(.,2)) |>
knitr::kable(caption="Near zero variance", col.names= c("Variables", "Freq most common value / second", "% of unique values", "Zero variance","Near-zero variance"))| Variables | Freq most common value / second | % of unique values | Zero variance | Near-zero variance |
|---|---|---|---|---|
| sex_rec | 2.90 | 0.00 | FALSE | FALSE |
| tenure_status_household | 1.13 | 0.01 | FALSE | FALSE |
| occupation_condition_corr24 | 1.40 | 0.00 | FALSE | FALSE |
| marital_status_rec | 1.62 | 0.00 | FALSE | FALSE |
| ethnicity | 14.71 | 0.00 | FALSE | FALSE |
| urbanicity_cat | 8.34 | 0.00 | FALSE | FALSE |
| ed_attainment_corr | 2.11 | 0.00 | FALSE | FALSE |
| cohabitation | 1.21 | 0.00 | FALSE | FALSE |
| dg_psiq_cie_10_instudy | 4.72 | 0.00 | FALSE | FALSE |
| dg_psiq_cie_10_dg | 1.24 | 0.00 | FALSE | FALSE |
| f0_organic | 79.09 | 0.00 | FALSE | TRUE |
| f1_substance_use | 0.00 | 0.00 | TRUE | TRUE |
| f2_psychotic | 64.12 | 0.00 | FALSE | TRUE |
| f3_mood | 8.75 | 0.00 | FALSE | FALSE |
| f4_anxiety_stress_somatoform | 52.97 | 0.00 | FALSE | TRUE |
| f5_physio_eating_sleep_sexual | 194.37 | 0.00 | FALSE | TRUE |
| f6_personality_adult_behaviour | 2.52 | 0.00 | FALSE | FALSE |
| f7_intellectual_disability | 137.72 | 0.00 | FALSE | TRUE |
| f8_9_neurodevelopment_child | 68.85 | 0.00 | FALSE | TRUE |
| dx_f2_smi_psychotic | 64.12 | 0.00 | FALSE | TRUE |
| dx_f3_mood | 8.75 | 0.00 | FALSE | FALSE |
| dx_f45_anx_stress_phys | 41.51 | 0.00 | FALSE | TRUE |
| dx_f6_personality | 2.52 | 0.00 | FALSE | FALSE |
| dx_f0789_neurocog_dev | 28.63 | 0.00 | FALSE | TRUE |
| sub_dep_icd10_status | 2.68 | 0.00 | FALSE | FALSE |
| dom_violence | 2.84 | 0.00 | FALSE | FALSE |
| sex_abuse | 50.43 | 0.00 | FALSE | TRUE |
| any_violence | 2.57 | 0.00 | FALSE | FALSE |
| phys_dx_supergroup | 1.56 | 0.01 | FALSE | FALSE |
| phys_dx_instudy | 1.26 | 0.00 | FALSE | FALSE |
| phys_dx_other_spec_medical_cond | 21.38 | 0.00 | FALSE | TRUE |
| phys_dx_organ_system_med_dis | 34.71 | 0.00 | FALSE | TRUE |
| phys_dx_injuries_and_sequelae | 64.84 | 0.00 | FALSE | TRUE |
| phys_dx_infectious_diseases | 123.62 | 0.00 | FALSE | TRUE |
| prim_sub_freq | 1.58 | 0.01 | FALSE | FALSE |
| prim_sub_freq_rec | 1.00 | 0.00 | FALSE | FALSE |
| polysubstance_strict | 2.69 | 0.00 | FALSE | FALSE |
| treat_lt_90 | 3.22 | 0.00 | FALSE | FALSE |
| tr_outcome | 2.01 | 0.01 | FALSE | FALSE |
| adm_motive | 1.44 | 0.01 | FALSE | FALSE |
| primary_sub | 1.11 | 0.01 | FALSE | FALSE |
| primary_sub_mod | 1.11 | 0.01 | FALSE | FALSE |
| usuario_tribunal_trat_droga | 49.72 | 0.00 | FALSE | TRUE |
| tipo_de_vivienda_rec | 25.76 | 0.00 | FALSE | TRUE |
| tipo_de_vivienda_rec2 | 8.87 | 0.00 | FALSE | FALSE |
| evaluacindelprocesoteraputico | 1.34 | 0.00 | FALSE | FALSE |
| eva_consumo | 1.44 | 0.00 | FALSE | FALSE |
| eva_fam | 1.39 | 0.00 | FALSE | FALSE |
| eva_relinterp | 1.38 | 0.00 | FALSE | FALSE |
| eva_ocupacion | 1.33 | 0.00 | FALSE | FALSE |
| eva_sm | 1.30 | 0.00 | FALSE | FALSE |
| eva_fisica | 1.25 | 0.00 | FALSE | FALSE |
| eva_transgnorma | 1.10 | 0.00 | FALSE | FALSE |
| readmit_event | 3.64 | 0.00 | FALSE | FALSE |
| death_event | 21.42 | 0.00 | FALSE | TRUE |
| readmit_time_from_adm_m | 1.10 | 7.86 | FALSE | FALSE |
| readmit_time_from_disch_m | 17.66 | 7.53 | FALSE | FALSE |
| death_time_from_adm_m | 1.06 | 5.54 | FALSE | FALSE |
| death_time_from_disch_m | 19.39 | 5.61 | FALSE | TRUE |
| adm_age_log | 1.19 | 5.17 | FALSE | FALSE |
| adm_age_pow2 | 1.19 | 5.17 | FALSE | FALSE |
| adm_age_pow3 | 1.19 | 5.17 | FALSE | FALSE |
| adm_age_c | 1.19 | 5.17 | FALSE | FALSE |
| adm_age_cat | 1.29 | 0.00 | FALSE | FALSE |
| porc_pobr | 1.06 | 1.00 | FALSE | FALSE |
| porc_pobr_log | 1.06 | 1.00 | FALSE | FALSE |
| porc_pobr_c | 1.06 | 1.00 | FALSE | FALSE |
| nationality_chile | 154.54 | 0.00 | FALSE | TRUE |
| dit_m | 1.02 | 14.27 | FALSE | FALSE |
| treat_log | 1.00 | 12.26 | FALSE | FALSE |
| treat_days_pow2 | 1.06 | 14.27 | FALSE | FALSE |
| treat_days_pow3 | 1.01 | 14.27 | FALSE | FALSE |
Given that near-zero variance can lead to large standard errors, convergence warnings, or even quasi-separation if the infrequent category aligns perfectly with the outcome, we deleted the following predictors due to potential collinearity: nationality_cons, phys_dx_supergroup, and dx_supergroup_series.
Also, we deleted 1 due to zero variance.
Variables with near zero variance: f5_physio_eating_sleep_sexual, nationality_chile, f7_intellectual_disability, phys_dx_infectious_diseases, f0_organic, f8_9_neurodevelopment_child, phys_dx_injuries_and_sequelae, f2_psychotic, dx_f2_smi_psychotic, f4_anxiety_stress_somatoform, sex_abuse, usuario_tribunal_trat_droga, dx_f45_anx_stress_phys, phys_dx_organ_system_med_dis, dx_f0789_neurocog_dev, tipo_de_vivienda_rec, death_event, phys_dx_other_spec_medical_cond, death_time_from_disch_m, f1_substance_use.
For
tipo_de_vivienda_rec, we usedtipo_de_vivienda_rec2instead.For psychiatric variables, we grouped them into severe mental illness and neurocognitive disorders (
dx_f_any_severe_mental= F0|F2|F7|F(8-9)|), any internalizing/stress-related diagnosis or of the somatoform spectrum (dx_f_any_internalizingF(4-5)).For any physical comorbidity, we applied grouping for any diagnostic (
any_phys_dx): Infectuous diseases, Organ-system, Injuries and sequelae and Other specified medical conditions.For
nationality_chile, we changed the dominant category and changed it for foreigners instead (national_foreign).Court-mandated or drug court treatment user (
usuario_tribunal_trat_droga) are less than 2% of the SUD treatment population. We did not use it as a predictor.Reported sexual abuse also represent less than 2% of the population. We have a variable called
any_violencethat groups domestic violence with sexual abuse
For the rest of variables, we marked them and review them conceptually later, collapse rare categories if possible, and avoid including obvious.
2. Prepare for survival setting w/ ML
2.1. Defined variables and clean database
We numeric and categorical predictors, excluded target and time-leak columns, and added a unique row IDs. Then we converted categorical variables to factors.
Code
num_vars = c(
# floats
"readmit_time_from_adm_m",
"death_time_from_adm_m",
"adm_age_rec3",
# "adm_age_log",
# "adm_age_pow2",
# "adm_age_pow3",
# "adm_age_c",
"porc_pobr",
# "porc_pobr_log",
# "porc_pobr_c",
"dit_m",
# "treat_log",
# "treat_days_pow2",
# "treat_days_pow3"
"num_trat_ant",# only 5993 missings
"age_subs_onset_rec2" # only 4300 missings
)
cat_vars = c(
# category
"sex_rec",
"tenure_status_household",
"cohabitation",
"sub_dep_icd10_status",
#"dom_violence", # check collinearity
#"sex_abuse", # collinear #freqRatio>20
"any_violence",
"prim_sub_freq_rec",
"tr_outcome",
"adm_motive",
"first_sub_used",
"primary_sub_mod",
#"tipo_de_vivienda_rec", #freqRatio>20
"tipo_de_vivienda_rec2",
#"adm_age_cat", # collinear
#"nationality_chile",
"national_foreign",
"plan_type_corr",
# object
"occupation_condition_corr24",
"marital_status_rec",
"urbanicity_cat",
"ed_attainment_corr",
#"prim_sub_freq",
#"primary_sub",
#"usuario_tribunal_trat_droga", #freqRatio>20
"evaluacindelprocesoteraputico",
"eva_consumo",
"eva_fam",
"eva_relinterp",
"eva_ocupacion",
"eva_sm",
"eva_fisica",
"eva_transgnorma",
# booleans
"ethnicity",
"dg_psiq_cie_10_instudy",
"dg_psiq_cie_10_dg",
#"f0_organic", #freqRatio>20
#"f2_psychotic", #freqRatio>20
#"f3_mood", #collinear
#"f4_anxiety_stress_somatoform", #freqRatio>20
#"f5_physio_eating_sleep_sexual", #freqRatio>20
#"f6_personality_adult_behaviour", #freqRatio>20
#"f7_intellectual_disability", #freqRatio>20
#"f8_9_neurodevelopment_child", #freqRatio>20
# ints
#"dx_f2_smi_psychotic",
"dx_f3_mood",
#"dx_f45_anx_stress_phys",
"dx_f6_personality",
#"dx_f0789_neurocog_dev",
"dx_f_any_severe_mental",
"dx_f_any_internalizing",
"phys_dx_instudy",
"any_phys_dx",
#"phys_dx_other_spec_medical_cond",
#"phys_dx_organ_system_med_dis",
#"phys_dx_injuries_and_sequelae",
#"phys_dx_infectious_diseases",
"polysubstance_strict"
#"treat_lt_90" #collinear
)
# 0. Define columns to exclude (endpoints + time leaks)
target_cols <- c(
"readmit_time_from_disch_m", "readmit_event",
"death_time_from_disch_m", "death_event"
)
leak_time_cols <- c("readmit_time_from_adm_m", "death_time_from_adm_m")
cols_to_exclude <- c(target_cols, leak_time_cols)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
SISTRAT23_c1_2010_2024_df_model<-
SISTRAT23_c1_2010_2024_df_prev1z%>%
dplyr::mutate(original_row_id = dplyr::row_number())%>% # Create a unique ID
dplyr::mutate(
national_foreign = dplyr::if_else(nationality_chile == "other", 1L, 0L)
)%>%
dplyr::mutate(
dx_f_any_severe_mental =
(f2_psychotic == 1L |
dx_f2_smi_psychotic == 1L |
f0_organic == 1L |
f7_intellectual_disability == 1L |
f8_9_neurodevelopment_child == 1L |
dx_f0789_neurocog_dev == 1L)
)%>%
dplyr::mutate(
any_internalizing =
(f4_anxiety_stress_somatoform == 1L |
dx_f45_anx_stress_phys == 1L |
f5_physio_eating_sleep_sexual == 1L)
)%>%
dplyr::mutate(
any_phys_dx = (phys_dx_infectious_diseases == 1L |
phys_dx_injuries_and_sequelae == 1L |
phys_dx_organ_system_med_dis == 1L |
phys_dx_other_spec_medical_cond == 1L)
)
# 1. Create predictor dataset (remove excluded columns)
df_pred <- SISTRAT23_c1_2010_2024_df_model %>%
dplyr::select(any_of(c(num_vars, cat_vars, cols_to_exclude)))
factor_vars<-
c("sex_rec", "tenure_status_household", "cohabitation", "sub_dep_icd10_status", "any_violence", "tr_outcome", "adm_motive", "primary_sub_mod", "tipo_de_vivienda_rec2", "plan_type_corr", "occupation_condition_corr24", "marital_status_rec", "urbanicity_cat", "ed_attainment_corr", "prim_sub_freq_rec", "first_sub_used", "evaluacindelprocesoteraputico", "eva_consumo", "eva_fam", "eva_relinterp", "eva_ocupacion", "eva_sm", "eva_fisica", "eva_transgnorma")
invisible(
df_pred[, (factor_vars) := lapply(.SD, as.factor), .SDcols = factor_vars]
)2.2. Imputation
Then we run MICE-like with PMM (15 donors, 200 trees, 5 datasets, 10 iterations) using a correlation-based predictor matrix, and finally extracted the 5 completed/imputed datasets, restoring factor levels.
Code
# Stable imputation
# set.seed(2125)
# imp <- mice(
# clean_dummified_df,
# m = 5,
# maxit = 10,
# ridge = 1e-04, # Regularización agresiva pero necesaria
# donors = 15,
# allow.na = FALSE
# )
# it im dep meth out
# 1 0 0 collinear porc_pobr_c
# 2 0 0 collinear dx_f2_smi_psychotic
# 3 0 0 collinear dx_f3_mood
# 4 0 0 collinear dx_f6_personality
# 5 0 0 collinear sub_dep_icd10_status_drug_dependence
# 6 0 0 collinear nationality_chile_other
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("Define number of complete datasets to create\n")
replicates_n <- 5
# Check if the session is interactive (i.e., you are using RStudio manually)
if (interactive()) {
if(
!file.exists(file.path(wdpath,
"data/20241015_out",
"filled_datasets.Rds"))
) {
# If interactive, offload to a background job to keep console free
job::job({
source(file.path(wdpath, "cons", "_hist_scripts", "missranger_mice_like_pred1.R"))
}, title = "Imputation Job") # Optional: adds a nice title to the jobs pane
} else {
filled_datasets <- readr::read_rds(file.path(wdpath, "data/20241015_out", "filled_datasets.Rds"))
}
} else {
# If NOT interactive (e.g., batch mode), just run it directly
library(dplyr)
library(purrr)
library(tibble)
# Run MICE imputation (corrected syntax)
time_event_vars <- c(
"readmit_time_from_disch_m", "readmit_event",
"death_time_from_disch_m", "death_event"
)
predictor_formula <- as.formula(
#paste(". ~ . -", paste(time_event_vars, collapse = " - "))
paste(". ~ . ", sep="")
)
cat("MICE-like but with missranger")
#https://cran.r-project.org/web/packages/missRanger/vignettes/working_with_censoring.html
set.seed(2125)
filled_datasets <- replicate(
replicates_n,
missRanger::missRanger(
df_pred,
formula = predictor_formula, # Excluye explícitamente outcomes
verbose = 2,
pmm.k = 15,#10, # predictive mean matching with 15 nearest neighbors for num variables
num.trees = 200, # trees sufficient for stability
#seed = 2125, #if you want 5 different but reproducible imputations. dont use insede missRanger
maxiter = 10, #50 is overkill unless you have very heavy missingness and time isn’t an issue
returnOOB = TRUE,
respect.unordered.factors = "order" # controls how categorical variables are handled inside the trees, giving a better treatment of unordered factors.
),
simplify = FALSE
)
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
cat("Check remaining missing values\n")
imputed_data <- map_dfr(1:5, function(i) {
df <- filled_datasets[[i]]
# add control variables
dplyr::mutate(df,
.imp = i, # imputation number
.id = SISTRAT23_c1_2010_2024_df_model$original_row_id # Use original ID
) %>%
# Keep variables necessary for analysis
dplyr::select(
.imp, .id,
everything()
)
}, .id = "imputation_id") %>%
dplyr::select(-imputation_id) # cleaning
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
cat("Check remaining missing values\n")
get_all_oobs <- function(imputed_list) {
# 1. Extract OOBs from all 5 datasets into one big table
all_oobs <- map_dfr(seq_along(imputed_list), function(i) {
# Get the attribute
oob_vec <- attr(imputed_list[[i]], "oob")
# Create a tidy table
tibble::tibble(
Variable = names(oob_vec),
Error_Value = as.numeric(oob_vec),
Imputation_Run = as.factor(i)
)
})
# 2. Create a Summary Table (Mean across the 5 runs)
oob_summary <- all_oobs %>%
dplyr::group_by(Variable) %>%
dplyr::summarise(
Mean_OOB = mean(Error_Value),
SD_OOB = sd(Error_Value),
Best_Run = min(Error_Value),
Worst_Run = max(Error_Value)
) %>%
dplyr::arrange(Mean_OOB) # Sort: Best variables (lowest error) first
# 3. Print the summary nicely
cat("\n=== COMPLETE OOB ERROR REPORT ===\n")
cat("Scale: 0.00 = Perfect Prediction | 1.00 = Useless/Random\n")
print(as.data.frame(oob_summary), digits = 3)
return(oob_summary)
}
#Takes 1:41:49
#Takes 1:38:06
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
imputed_data %>%
dplyr::select(where(~ any(is.na(.x)))) %>%
{
if (ncol(.) == 0) {
message("✅ No missing values remain after imputation")
tibble::tibble()
} else {
dplyr::summarise(., across(everything(), ~ scales::percent(mean(is.na(.x)), accuracy = 0.1))) %>%
tidyr::pivot_longer(everything(), names_to = "variable", values_to = "missing_pct") %>%
dplyr::arrange(desc(as.numeric(gsub("%", "", missing_pct))))
}
} %>%
print()
}Code
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
if (interactive()) {
if(
!file.exists(file.path(wdpath,
"data/20241015_out",
"filled_datasets.Rds"))
) {
final_oob_stats <- get_all_oobs(filled_datasets)
# cat("\n=== COMPLETE OOB ERROR REPORT ===\n")
# cat("Scale: 0.00 = Perfect Prediction | 1.00 = Useless/Random\n")
final_oob_stats|>
dplyr::mutate_if(is.numeric, ~round(.,4))|>
knitr::kable(caption="Oob report")
}
}
# tipo_de_vivienda_rec2 (0.08): Extremely reliable. The model almost certainly
# knows what kind of house someone lives in based on their other traits.
# any_phys_dx (0.09): Very strong prediction.
# sex_rec (0.12): Standard; usually easy to predict from other social variables.
# eva_ variables (~0.14 - 0.21): Your "Evaluation" scales (consumo, fam, ocupacion, etc.)
# are performing very well. This suggests clear patterns in your psychological/evaluation metrics.
# marital_status_rec (0.30): Decent.
# first_sub_used (0.34): The model has a fair idea of what substance they used
# first, but it's not certain.
# prim_sub_freq_rec (0.41) & ed_attainment_corr (0.43): These are getting "fuzzy."
# The imputed values will be plausible, but they might not perfectly
# capture the individual nuance for every person.
# age_subs_onset_rec2 (0.72): High error. The age at which someone started using
# substances is hard to predict from their current status.
# phys_dx_instudy (0.78): Very high. The specific physical diagnosis is likely too
# complex or rare to be predicted by general social/eval variables.
# num_trat_ant (0.86): Critical Flag. This is nearly 0.90.
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#previous imputations
#saveRDS(imp, file= file.path(wdpath, "data/20241015_out", "imp.Rds"))
#saveRDS(filled_datasets, file= file.path(wdpath, "data/20241015_out", "filled_datasets.Rds"))Define number of complete datasets to create
MICE-like but with missranger
Variables to impute: sex_rec, phys_dx_instudy, any_phys_dx, marital_status_rec, evaluacindelprocesoteraputico, eva_consumo, eva_fam, eva_relinterp, eva_ocupacion, eva_sm, eva_fisica, eva_transgnorma, ed_attainment_corr, prim_sub_freq_rec, age_subs_onset_rec2, tenure_status_household, num_trat_ant, tipo_de_vivienda_rec2, first_sub_used, any_violence
Variables used to impute: readmit_time_from_adm_m, death_time_from_adm_m, adm_age_rec3, porc_pobr, dit_m, num_trat_ant, age_subs_onset_rec2, sex_rec, tenure_status_household, cohabitation, sub_dep_icd10_status, any_violence, prim_sub_freq_rec, tr_outcome, adm_motive, first_sub_used, primary_sub_mod, tipo_de_vivienda_rec2, national_foreign, plan_type_corr, occupation_condition_corr24, marital_status_rec, urbanicity_cat, ed_attainment_corr, evaluacindelprocesoteraputico, eva_consumo, eva_fam, eva_relinterp, eva_ocupacion, eva_sm, eva_fisica, eva_transgnorma, ethnicity, dg_psiq_cie_10_instudy, dg_psiq_cie_10_dg, dx_f3_mood, dx_f6_personality, dx_f_any_severe_mental, phys_dx_instudy, any_phys_dx, polysubstance_strict, readmit_time_from_disch_m, readmit_event, death_time_from_disch_m, death_event
sex_rc phys__ any_p_ mrtl__ evlcnd ev_cns eva_fm ev_rln ev_cpc eva_sm ev_fsc ev_trn ed_tt_ prm___ ag___2 tnr_s_ nm_tr_ tp___2 frst__ any_vl
iter 1: 0.1291 0.8622 0.0901 0.3067 0.3256 0.1515 0.2041 0.1787 0.2639 0.1514 0.2049 0.2117 0.4397 0.4154 0.8086 0.4572 0.8687 0.0820 0.3422 0.2541
iter 2: 0.1231 0.7830 0.0910 0.3039 0.0980 0.1480 0.1874 0.1632 0.2421 0.1409 0.2007 0.2107 0.4330 0.4144 0.7246 0.4495 0.8687 0.0812 0.3423 0.2538
iter 3: 0.1243 0.7840 0.0908 0.3039 0.0985 0.1490 0.1872 0.1628 0.2431 0.1411 0.1999 0.2106 0.4338 0.4127 0.7232 0.4474 0.8681 0.0805 0.3429 0.2534
iter 4: 0.1236 0.7834 0.0906 0.3038 0.0985 0.1484 0.1875 0.1627 0.2446 0.1408 0.1999 0.2113 0.4332 0.4129 0.7246 0.4498 0.8674 0.0812 0.3415 0.2527
Variables to impute: sex_rec, phys_dx_instudy, any_phys_dx, marital_status_rec, evaluacindelprocesoteraputico, eva_consumo, eva_fam, eva_relinterp, eva_ocupacion, eva_sm, eva_fisica, eva_transgnorma, ed_attainment_corr, prim_sub_freq_rec, age_subs_onset_rec2, tenure_status_household, num_trat_ant, tipo_de_vivienda_rec2, first_sub_used, any_violence
Variables used to impute: readmit_time_from_adm_m, death_time_from_adm_m, adm_age_rec3, porc_pobr, dit_m, num_trat_ant, age_subs_onset_rec2, sex_rec, tenure_status_household, cohabitation, sub_dep_icd10_status, any_violence, prim_sub_freq_rec, tr_outcome, adm_motive, first_sub_used, primary_sub_mod, tipo_de_vivienda_rec2, national_foreign, plan_type_corr, occupation_condition_corr24, marital_status_rec, urbanicity_cat, ed_attainment_corr, evaluacindelprocesoteraputico, eva_consumo, eva_fam, eva_relinterp, eva_ocupacion, eva_sm, eva_fisica, eva_transgnorma, ethnicity, dg_psiq_cie_10_instudy, dg_psiq_cie_10_dg, dx_f3_mood, dx_f6_personality, dx_f_any_severe_mental, phys_dx_instudy, any_phys_dx, polysubstance_strict, readmit_time_from_disch_m, readmit_event, death_time_from_disch_m, death_event
sex_rc phys__ any_p_ mrtl__ evlcnd ev_cns eva_fm ev_rln ev_cpc eva_sm ev_fsc ev_trn ed_tt_ prm___ ag___2 tnr_s_ nm_tr_ tp___2 frst__ any_vl
iter 1: 0.1294 0.8623 0.0908 0.3066 0.3259 0.1508 0.2041 0.1795 0.2623 0.1516 0.2042 0.2112 0.4383 0.4156 0.8099 0.4559 0.8686 0.0820 0.3422 0.2538
iter 2: 0.1238 0.7842 0.0913 0.3028 0.0975 0.1483 0.1865 0.1642 0.2423 0.1404 0.2003 0.2107 0.4338 0.4139 0.7257 0.4465 0.8685 0.0814 0.3425 0.2537
iter 3: 0.1232 0.7832 0.0907 0.3043 0.0984 0.1487 0.1865 0.1619 0.2432 0.1412 0.2003 0.2101 0.4325 0.4142 0.7230 0.4475 0.8682 0.0814 0.3432 0.2531
iter 4: 0.1231 0.7844 0.0907 0.3027 0.0982 0.1484 0.1868 0.1622 0.2433 0.1415 0.2001 0.2111 0.4342 0.4133 0.7243 0.4490 0.8677 0.0815 0.3430 0.2528
Variables to impute: sex_rec, phys_dx_instudy, any_phys_dx, marital_status_rec, evaluacindelprocesoteraputico, eva_consumo, eva_fam, eva_relinterp, eva_ocupacion, eva_sm, eva_fisica, eva_transgnorma, ed_attainment_corr, prim_sub_freq_rec, age_subs_onset_rec2, tenure_status_household, num_trat_ant, tipo_de_vivienda_rec2, first_sub_used, any_violence
Variables used to impute: readmit_time_from_adm_m, death_time_from_adm_m, adm_age_rec3, porc_pobr, dit_m, num_trat_ant, age_subs_onset_rec2, sex_rec, tenure_status_household, cohabitation, sub_dep_icd10_status, any_violence, prim_sub_freq_rec, tr_outcome, adm_motive, first_sub_used, primary_sub_mod, tipo_de_vivienda_rec2, national_foreign, plan_type_corr, occupation_condition_corr24, marital_status_rec, urbanicity_cat, ed_attainment_corr, evaluacindelprocesoteraputico, eva_consumo, eva_fam, eva_relinterp, eva_ocupacion, eva_sm, eva_fisica, eva_transgnorma, ethnicity, dg_psiq_cie_10_instudy, dg_psiq_cie_10_dg, dx_f3_mood, dx_f6_personality, dx_f_any_severe_mental, phys_dx_instudy, any_phys_dx, polysubstance_strict, readmit_time_from_disch_m, readmit_event, death_time_from_disch_m, death_event
sex_rc phys__ any_p_ mrtl__ evlcnd ev_cns eva_fm ev_rln ev_cpc eva_sm ev_fsc ev_trn ed_tt_ prm___ ag___2 tnr_s_ nm_tr_ tp___2 frst__ any_vl
iter 1: 0.1293 0.8615 0.0905 0.3064 0.3256 0.1509 0.2040 0.1794 0.2624 0.1516 0.2044 0.2113 0.4376 0.4152 0.8121 0.4550 0.8695 0.0818 0.3427 0.2546
iter 2: 0.1236 0.7865 0.0910 0.3039 0.0978 0.1488 0.1864 0.1630 0.2433 0.1408 0.2004 0.2116 0.4330 0.4139 0.7219 0.4485 0.8691 0.0805 0.3430 0.2522
iter 3: 0.1231 0.7829 0.0908 0.3023 0.0977 0.1483 0.1868 0.1630 0.2433 0.1404 0.1998 0.2098 0.4346 0.4150 0.7271 0.4475 0.8681 0.0800 0.3419 0.2528
iter 4: 0.1237 0.7842 0.0907 0.3041 0.0986 0.1485 0.1868 0.1629 0.2430 0.1411 0.2004 0.2107 0.4334 0.4152 0.7230 0.4482 0.8679 0.0803 0.3440 0.2534
Variables to impute: sex_rec, phys_dx_instudy, any_phys_dx, marital_status_rec, evaluacindelprocesoteraputico, eva_consumo, eva_fam, eva_relinterp, eva_ocupacion, eva_sm, eva_fisica, eva_transgnorma, ed_attainment_corr, prim_sub_freq_rec, age_subs_onset_rec2, tenure_status_household, num_trat_ant, tipo_de_vivienda_rec2, first_sub_used, any_violence
Variables used to impute: readmit_time_from_adm_m, death_time_from_adm_m, adm_age_rec3, porc_pobr, dit_m, num_trat_ant, age_subs_onset_rec2, sex_rec, tenure_status_household, cohabitation, sub_dep_icd10_status, any_violence, prim_sub_freq_rec, tr_outcome, adm_motive, first_sub_used, primary_sub_mod, tipo_de_vivienda_rec2, national_foreign, plan_type_corr, occupation_condition_corr24, marital_status_rec, urbanicity_cat, ed_attainment_corr, evaluacindelprocesoteraputico, eva_consumo, eva_fam, eva_relinterp, eva_ocupacion, eva_sm, eva_fisica, eva_transgnorma, ethnicity, dg_psiq_cie_10_instudy, dg_psiq_cie_10_dg, dx_f3_mood, dx_f6_personality, dx_f_any_severe_mental, phys_dx_instudy, any_phys_dx, polysubstance_strict, readmit_time_from_disch_m, readmit_event, death_time_from_disch_m, death_event
sex_rc phys__ any_p_ mrtl__ evlcnd ev_cns eva_fm ev_rln ev_cpc eva_sm ev_fsc ev_trn ed_tt_ prm___ ag___2 tnr_s_ nm_tr_ tp___2 frst__ any_vl
iter 1: 0.1290 0.8622 0.0909 0.3060 0.3260 0.1508 0.2036 0.1787 0.2631 0.1516 0.2040 0.2121 0.4399 0.4144 0.8112 0.4553 0.8681 0.0821 0.3427 0.2523
iter 2: 0.1242 0.7833 0.0913 0.3034 0.0981 0.1484 0.1867 0.1631 0.2438 0.1408 0.1999 0.2108 0.4335 0.4143 0.7241 0.4477 0.8672 0.0803 0.3422 0.2526
iter 3: 0.1234 0.7830 0.0906 0.3039 0.0979 0.1492 0.1874 0.1628 0.2433 0.1412 0.2003 0.2113 0.4335 0.4142 0.7238 0.4467 0.8684 0.0806 0.3418 0.2549
Variables to impute: sex_rec, phys_dx_instudy, any_phys_dx, marital_status_rec, evaluacindelprocesoteraputico, eva_consumo, eva_fam, eva_relinterp, eva_ocupacion, eva_sm, eva_fisica, eva_transgnorma, ed_attainment_corr, prim_sub_freq_rec, age_subs_onset_rec2, tenure_status_household, num_trat_ant, tipo_de_vivienda_rec2, first_sub_used, any_violence
Variables used to impute: readmit_time_from_adm_m, death_time_from_adm_m, adm_age_rec3, porc_pobr, dit_m, num_trat_ant, age_subs_onset_rec2, sex_rec, tenure_status_household, cohabitation, sub_dep_icd10_status, any_violence, prim_sub_freq_rec, tr_outcome, adm_motive, first_sub_used, primary_sub_mod, tipo_de_vivienda_rec2, national_foreign, plan_type_corr, occupation_condition_corr24, marital_status_rec, urbanicity_cat, ed_attainment_corr, evaluacindelprocesoteraputico, eva_consumo, eva_fam, eva_relinterp, eva_ocupacion, eva_sm, eva_fisica, eva_transgnorma, ethnicity, dg_psiq_cie_10_instudy, dg_psiq_cie_10_dg, dx_f3_mood, dx_f6_personality, dx_f_any_severe_mental, phys_dx_instudy, any_phys_dx, polysubstance_strict, readmit_time_from_disch_m, readmit_event, death_time_from_disch_m, death_event
sex_rc phys__ any_p_ mrtl__ evlcnd ev_cns eva_fm ev_rln ev_cpc eva_sm ev_fsc ev_trn ed_tt_ prm___ ag___2 tnr_s_ nm_tr_ tp___2 frst__ any_vl
iter 1: 0.1292 0.8634 0.0907 0.3059 0.3247 0.1513 0.2041 0.1794 0.2627 0.1515 0.2044 0.2109 0.4383 0.4172 0.8079 0.4569 0.8703 0.0831 0.3433 0.2532
iter 2: 0.1230 0.7853 0.0908 0.3037 0.0980 0.1481 0.1869 0.1634 0.2425 0.1410 0.2004 0.2122 0.4335 0.4135 0.7237 0.4473 0.8676 0.0799 0.3417 0.2538
iter 3: 0.1234 0.7838 0.0913 0.3037 0.0979 0.1485 0.1867 0.1620 0.2437 0.1409 0.2001 0.2109 0.4330 0.4147 0.7222 0.4475 0.8692 0.0811 0.3424 0.2534
Check remaining missing values
Check remaining missing values
# A tibble: 0 × 0
2.3. One-hot encoding
We applied one‑hot encoding with dummyVars, and combined them with numeric predictors. Finally, we cleaned column names to produce a tidy modeling dataset, and generated a vector with dummified_vars. But also we decided to discard predictor variables with high uncertainty in the imputations, such as substance use onset age (age_subs_onset_rec2) physical diagnosis, in study (phys_dx_instudy) and the reported number of previous treatments (num_trat_ant).
The resulting database is called processed_datasets, which is a list of the imputed databases.
We also exported corrected_datasets to avoid immortal time bias in patients who were not readmitted because they died. This step is essential for accurately estimating readmission.
Code
cat("Define vector to discard dummy levels\n")
dummified_lvls_vars<- c("sex_rec_man", "tenure_status_household_others", "cohabitation_others", "sub_dep_icd10_status_hazardous_consumption", "any_violence_0_no_domestic_violence_sex_abuse", "tr_outcome_other", "adm_motive_other", "primary_sub_mod_others", "tipo_de_vivienda_rec2_formal_housing", "plan_type_corr_pg_pab", "occupation_condition_corr24_employed", "marital_status_rec_married_cohabiting", "urbanicity_cat_3_urban", "ed_attainment_corr_1_more_than_high_school", "prim_sub_freq_rec_1_1_day_wk", "first_sub_used_hallucinogens", "first_sub_used_inhalants", "first_sub_used_amphetamine_type_stimulants", "first_sub_used_others", "evaluacindelprocesoteraputico_logro_alto", "eva_consumo_logro_alto", "eva_fam_logro_alto", "eva_relinterp_logro_alto", "eva_ocupacion_logro_alto", "eva_sm_logro_alto", "eva_fisica_logro_alto", "eva_transgnorma_logro_alto")
high_uncertainty_imps<- c("age_subs_onset_rec2", "phys_dx_instudy", "num_trat_ant")
cat("Map dummification\n")
# 0. Define your factor variables dynamically (or use your manual list)
# It is safer to detect them from the first dataset to avoid typos
factor_vars <- names(filled_datasets[[1]])[sapply(filled_datasets[[1]], is.factor)]
dummy_formula <- as.formula(paste("~", paste(factor_vars, collapse = " + ")))
# Apply the transformation to EACH dataset in the list
processed_datasets <- map(filled_datasets, function(df_current) {
# 1. Create the dummy model based on THIS specific dataset
# (Though technically the levels should be the same, it's safer to re-run)
dummies_model <- dummyVars(dummy_formula, data = df_current, fullRank = FALSE)
# 2. Predict (generate) the dummies
df_dummies_encoded <- predict(dummies_model, newdata = df_current)
df_dummies_encoded <- as.data.frame(df_dummies_encoded)
# 3. Combine non-factor columns with new dummy columns
# We use dplyr::select to be safe regardless of whether it's data.table or data.frame
numeric_part <- df_current %>% dplyr::select(-all_of(factor_vars))
clean_dummified_df <- cbind(numeric_part, df_dummies_encoded)
# 4. Clean names
clean_dummified_df <- janitor::clean_names(clean_dummified_df)
# 5. Select variables
clean_dummified_df <- clean_dummified_df|>
dplyr::select(-any_of(dummified_lvls_vars))|>
dplyr::select(-any_of(high_uncertainty_imps))
return(clean_dummified_df)
})
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("Export databases\n")
#To google colab to explore predictors with Coxnet
# 1. Apply Correction to Each Dataset in the List
corrected_datasets <- map(processed_datasets, function(df) {
# Ensure columns are numeric for comparison
df <- df|>
mutate(
readmit_time = as.numeric(readmit_time_from_disch_m),
death_time = as.numeric(death_time_from_disch_m),
# Ensure events are 0/1 integers
readmit_event = as.integer(as.character(readmit_event)),
death_event = as.integer(as.character(death_event))
)
# --- THE CORRECTION LOGIC ---
df <- df|>
dplyr::mutate(
# condition: Patient died (death_event==1) AND death happened before (or at) the readmission endpoint
is_competing_risk = (death_event == 1) & (death_time <= readmit_time),
# 1. Update Time: If competing risk, cut follow-up at death time. Otherwise keep original.
readmit_time_from_disch_m = dplyr::if_else(is_competing_risk, death_time, readmit_time),
# 2. Update Event: If competing risk, censor readmission (set to 0). Otherwise keep original.
readmit_event = dplyr::if_else(is_competing_risk, 0, readmit_event)
) %>%
# Remove temporary helper columns if you want to keep the dataset clean
dplyr::select(-readmit_time, -death_time, -is_competing_risk)
return(df)
})
# 1. Define the destination folder
output_dir <- file.path(wdpath, "data/20241015_out/pred1")
# 2. Iterate and Export
# iwalk automatically gives you the object (.x) and its index (.y)
purrr::iwalk(corrected_datasets, function(df, i) {
# Construct the full file path
# e.g. .../data/20241015_out/imputation_1.parquet
file_name <- file.path(output_dir, paste0("imputation_", i, ".parquet"))
# Export using rio
rio::export(df, file_name)
message(paste("Saved:", file_name))
})Code
#Export list
rio::export(processed_datasets, paste0(file.path(wdpath, "data/20241015_out/pred1"),"/processed_datasets.Rds"))
rio::export(corrected_datasets, paste0(file.path(wdpath, "data/20241015_out/pred1"),"/corrected_datasets.Rds"))Define vector to discard dummy levels
Map dummification
Export databases
Session info
Code
#|echo: true
#|error: true
#|message: true
#|paged.print: true
message(paste0("R library: ", Sys.getenv("R_LIBS_USER")))Code
message(paste0("Date: ",withr::with_locale(new = c('LC_TIME' = 'C'), code =Sys.time())))Code
message(paste0("Editor context: ", path))Code
cat("quarto version: "); quarto::quarto_version()quarto version:
[1] '1.7.29'
Code
sesion_info <- devtools::session_info()Warning in system2(“quarto”, “-V”, stdout = TRUE, env = paste0(“TMPDIR=”, : el comando ejecutado ‘“quarto” TMPDIR=C:/Users/andre/AppData/Local/Temp/RtmpwlZmSq/file50ac14525543 -V’ tiene el estatus 1
Code
dplyr::select(
tibble::as_tibble(sesion_info$packages),
c(package, loadedversion, source)
)|>
DT::datatable(filter = 'top', colnames = c('Row number' =1,'Package' = 2, 'Version'= 3),
caption = htmltools::tags$caption(
style = 'caption-side: top; text-align: left;',
'', htmltools::em('R packages')),
options=list(
initComplete = htmlwidgets::JS(
"function(settings, json) {",
"$(this.api().tables().body()).css({
'font-family': 'Helvetica Neue',
'font-size': '70%',
'code-inline-font-size': '15%',
'white-space': 'nowrap',
'line-height': '0.75em',
'min-height': '0.5em'
});",
"}")))Code
#|echo: true
#|error: true
#|message: true
#|paged.print: true
#|class-output: center-table
reticulate::py_list_packages()|>
DT::datatable(filter = 'top', colnames = c('Row number' =1,'Package' = 2, 'Version'= 3),
caption = htmltools::tags$caption(
style = 'caption-side: top; text-align: left;',
'', htmltools::em('Python packages')),
options=list(
initComplete = htmlwidgets::JS(
"function(settings, json) {",
"$(this.api().tables().body()).css({
'font-family': 'Helvetica Neue',
'font-size': '70%',
'code-inline-font-size': '15%',
'white-space': 'nowrap',
'line-height': '0.75em',
'min-height': '0.5em'
});",
"}"))) Warning in system2(python, args, stdout = TRUE): el comando ejecutado ‘“G:/My Drive/Alvacast/SISTRAT 2023/.mamba_root/envs/py311/python.exe” -m pip freeze’ tiene el estatus 1
Save
Code
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
envpath<- if(regmatches(wdpath, regexpr("[A-Za-z]+", wdpath))=="G"){"G:/Mi unidad/Alvacast/SISTRAT 2023/"}else{"E:/Mi unidad/Alvacast/SISTRAT 2023/"}
paste0(getwd(),"/cons")
file.path(paste0(wdpath,"data/20241015_out"))
file.path(paste0(envpath,"data/20241015_out"))
# Save
rdata_path <- file.path(wdpath, "data/20241015_out", paste0("pred1_ndp_", format(Sys.time(), "%Y_%m_%d"), ".Rdata"))
save.image(rdata_path)
cat("Saved in:",
rdata_path)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
if (Sys.getenv("RSTUDIO_SESSION_TYPE") == "server" || file.exists("/.dockerenv")) {
password <- Sys.getenv("PASSWORD_SECRET")
} else {
if (interactive()) {
utils::savehistory(tempfile())
Sys.setenv(PASSWORD_SECRET = readLines(paste0(wdpath, "secret.txt"), warn = FALSE))
utils::loadhistory()
}
Sys.setenv(PASSWORD_SECRET = readLines(paste0(wdpath, "secret.txt"), warn = FALSE))
}
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
save.image(paste0(rdata_path,".enc"))
# Encriptar el archivo en el mismo lugar
httr2::secret_encrypt_file(path = paste0(rdata_path,".enc"), key = "PASSWORD_SECRET")
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("Copy renv lock into cons folder\n")
if (Sys.getenv("RSTUDIO_SESSION_TYPE") == "server" || file.exists("/.dockerenv")) {
message("Running on RStudio Server or inside Docker. Folder copy skipped.")
} else {
source_folder <-
destination_folder <- paste0(wdpath,"cons/renv")
# Copy the folder recursively
file.copy(paste0(wdpath,"renv.lock"), paste0(wdpath,"cons/renv.lock"), overwrite = TRUE)
message("Renv lock copy performed.")
}Code
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
time_after_dedup2<-Sys.time()
paste0("Time in markdown: ");time_after_dedup2-time_before_dedup2[1] "G:/My Drive/Alvacast/SISTRAT 2023/cons/cons"
[1] "G:/My Drive/Alvacast/SISTRAT 2023//data/20241015_out"
[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/data/20241015_out"
Saved in: G:/My Drive/Alvacast/SISTRAT 2023///data/20241015_out/pred1_ndp_2026_01_05.RdataCopy renv lock into cons folder
[1] "Time in markdown: "
Time difference of 1.467835 hours