SENDAs Agreement 1 Update 2010-2022
Load administrative data on SENDA patients, compare it with the previous databases, and explore the new data. The focus is on the other agreements.
Data Loading and Exploration
Loading Packages and uniting databases
Proceed to load the necessary packages.
Code
unlink("*_files", recursive=T)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# --- Bootstrap reticulate con ruta relativa a getwd() ---
suppressPackageStartupMessages(library(reticulate))
# Walk upwards from `start` until `rel` exists below one of the ancestors.
#
# start: directory where the search begins (defaults to getwd()).
# rel:   path, relative to each candidate directory, of the file to locate.
# Returns the normalised path of the first match, or NA_character_ when the
# filesystem root is reached without finding it.
find_python_rel <- function(start = getwd(),
                            rel = file.path(".mamba_root","envs","py311","python.exe")) {
  here <- normalizePath(start, winslash = "/", mustWork = FALSE)
  while (TRUE) {
    candidate <- normalizePath(file.path(here, rel), winslash = "/", mustWork = FALSE)
    if (file.exists(candidate)) {
      return(candidate)
    }
    up <- dirname(here)
    # dirname() of the root is the root itself: nothing left to inspect
    if (identical(up, here)) {
      break
    }
    here <- up
  }
  NA_character_
}
py <- find_python_rel()
if (is.na(py)) {
stop("No se encontró Python relativo a getwd() (buscando '.mamba_root/envs/py311/python.exe').\n",
"Directorio actual: ", getwd())
}
# Forzar ese intérprete
Sys.unsetenv(c("RETICULATE_CONDAENV","RETICULATE_PYTHON_FALLBACK"))
Sys.setenv(RETICULATE_PYTHON = py)
use_python(py, required = TRUE)
py_config() # verificaciónpython: G:/My Drive/Alvacast/SISTRAT 2023/.mamba_root/envs/py311/python.exe
libpython: G:/My Drive/Alvacast/SISTRAT 2023/.mamba_root/envs/py311/python311.dll
pythonhome: G:/My Drive/Alvacast/SISTRAT 2023/.mamba_root/envs/py311
version: 3.11.5 | packaged by conda-forge | (main, Aug 27 2023, 03:23:48) [MSC v.1936 64 bit (AMD64)]
Architecture: 64bit
numpy: [NOT FOUND]
NOTE: Python version was forced by RETICULATE_PYTHON
Code
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#https://github.com/rstudio/renv/issues/544
#renv falls back to copying rather than symlinking, which is evidently very slow in this configuration.
renv::settings$use.cache(FALSE)
#check if rstools is installed
try(installr::install.Rtools(check_r_update=F))Error in contrib.url(repos, "source") :
trying to use CRAN without setting a mirror
Code
#change repository to CL
local({
r <- getOption("repos")
r["CRAN"] <- "https://cran.dcc.uchile.cl/"
options(repos=r)
})
if(!require(pacman)){install.packages("pacman");require(pacman)}Code
pacman::p_unlock(lib.loc = .libPaths()) #para no tener problemas reinstalando paquetesCode
if(Sys.info()["sysname"]=="Windows"){
if (getRversion() != "4.4.1") { stop("Requires R version 4.4.1; Actual: ", getRversion()) }
}
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#PACKAGES#######################################################################
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
# Install `pkg` when it is missing and then attach it.
# The previous `if(!require(pkg)){install.packages(pkg)}` pattern installed a
# missing package but never attached it, so the first run on a clean library
# failed further down. Attach order is kept exactly as before because later
# unqualified verbs depend on which package masks which.
#
# pkg:   package name (single string).
# repos: repositories handed to install.packages().
attach_or_install <- function(pkg, repos = getOption("repos")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, repos = repos)
  }
  library(pkg, character.only = TRUE)
  invisible(pkg)
}
#Package to bring packages in development
attach_or_install("devtools")
#Package administration
attach_or_install("renv")
#To manipulate data
attach_or_install("tidyverse")
attach_or_install("janitor")
attach_or_install("plyr")
#For contingency tables
attach_or_install("kableExtra")
#For connections with python
attach_or_install("reticulate")
#To manipulate big data
# (last render reported: Warning: package 'polars' was built under R version 4.4.3)
attach_or_install("polars", repos = "https://community.r-multiverse.org")
#To bring big databases
attach_or_install("nanoparquet")
#interface for rstudio in R
attach_or_install("rstudioapi")
#time handling
attach_or_install("clock")
#combine plots
attach_or_install("ggpubr")
#parallelized iterative processing
attach_or_install("furrr")
#work like a tibble with a data.table database
attach_or_install("tidytable")
# pacman::p_load(
# altair, arrow, biostat3, car, caret, chilemapas, choroplethr, choroplethrAdmin1,
# choroplethrMaps, codebook, compareGroups, DiagrammeR, DiagrammeRsvg, DT, epiR, epitools,
# factoextra, FactoMineR, finalfit, flexsurv, fmsb, ggfortify, ggiraph, ggiraphExtra,
# ggpubr, ggrepel, glca, gridExtra, here, Hmisc, htmlwidgets, installr, janitor, kableExtra,
# lsmeans, magick, matrixStats, Metrics, muhaz, naniar, neuralnet, NeuralNetTools, pagedown,
# panelr, patchwork, pdp, plotly, plyr, plotly, posterdown, polycor, pROC, psych, radiant,
# rateratio.test, reshape, reshape2, reticulate, rio, ROCit, rnaturalearth, rsvg, sf, sjPlot,
# sqldf, Statamarkdown, survminer, survMisc, tableone, tidylog, tidyverse, treemapify, VIM,
# webshot, xaringanthemer, zoo, install=T
# )
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#FUNCTIONS######################################################################
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
# Rename columns of `df` towards the main schema.
#
# df:         data frame (or data.table / tibble).
# rename_map: named character vector, c(new_name = "current_name").
# main_names: names of the main schema; pairs whose new name is not in it
#             are ignored.
# Only pairs whose current name exists in `df` are applied. Returns `df`
# with the selected columns renamed; everything else is untouched.
#
# Implemented with base R on purpose: the unqualified `rename()` used before
# resolved to whichever of dplyr / plyr / tidytable was attached last.
rename_if_present_to_main <- function(df, rename_map, main_names) {
  # keep only pairs where source exists and target is in the main schema
  src_in_df <- rename_map %in% names(df)
  tgt_in_main <- names(rename_map) %in% main_names
  present <- rename_map[src_in_df & tgt_in_main]
  if (length(present) == 0L) {
    return(df)
  }
  new_names <- names(df)
  new_names[match(unname(present), new_names)] <- names(present)
  # fail loudly instead of silently producing duplicated column names
  clash <- intersect(names(present), new_names[duplicated(new_names)])
  if (length(clash) > 0L) {
    stop("Renaming would duplicate column name(s): ",
         paste(clash, collapse = ", "), call. = FALSE)
  }
  names(df) <- new_names
  df
}
# Rename column `x` of `df` to `y` when `x` is present; otherwise hand `df`
# back unchanged.
replace_columns_if_any <- function(df, x, y) {
  if (!(x %in% names(df))) {
    return(df)
  }
  names(df)[names(df) == x] <- y
  df
}
#WINDOWS do not restrict memory size
if(.Platform$OS.type == "windows") withAutoprint({
memory.size()
memory.size(TRUE)
memory.limit()
})> memory.size()
Warning: ‘memory.size()’ is no longer supported
[1] Inf
> memory.size(TRUE)
Warning: ‘memory.size()’ is no longer supported
[1] Inf
> memory.limit()
Warning: ‘memory.limit()’ is no longer supported
[1] Inf
Code
memory.limit(size=56000)Warning: ‘memory.limit()’ is no longer supported
[1] Inf
Code
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# NO MORE DEBUGS
options(error = NULL) # si antes tenías options(error = recover) o browser)
options(browserNLdisabled = FALSE)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#NAs are replaced with "" in knitr kable
options(knitr.kable.NA = '')
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# Wrap selected cells of a data frame in Markdown emphasis markers.
#
# df:    data frame.
# rows:  row indices to format.
# cols:  columns (names or positions) to format.
# value: one of "italics" (default), "bold" or "strikethrough"; validated
#        with match.arg(), so an unknown style is an error instead of
#        producing "NA" markup.
# Returns `df` with the touched columns coerced to character. Empty cells
# stay empty; all spaces inside a formatted cell are removed (as before).
format_cells <- function(df, rows ,cols, value = c("italics", "bold", "strikethrough")){
  value <- match.arg(value)
  # one * for italics, two ** for bold
  markup <- switch(value, italics = "*", bold = "**", strikethrough = "~~")
  if (length(rows) == 0L) {
    return(df)
  }
  for (col in cols) {
    # Make sure values are not factors (once per column is enough)
    df[[col]] <- as.character(df[[col]])
    for (row in rows) {
      cell <- df[row, col]
      # ifelse() keeps NA cells as NA
      df[row, col] <- ifelse(nchar(cell) == 0, "", paste0(markup, gsub(" ", "", cell), markup))
    }
  }
  df
}
# knitr output hooks that wrap errors, warnings and messages in small-font
# <div> blocks and turn the "##" output prefix into line breaks.
# The leading "## Error" / "## Warning:" label is rewritten in bold first.
# The regexes use a plain space: the former '\ ' is not a valid escape in an
# R string literal and stops the file from parsing.
knitr::knit_hooks$set(
  error = function(x, options) {
    paste('\n\n<div class="alert alert-danger" style="font-size: 0.7rem !important;">',
          gsub('##', '\n', gsub('^## Error', '**Error**', x)),
          '</div>', sep = '\n')
  },
  warning = function(x, options) {
    paste('\n\n<div class="alert alert-warning" style="font-size: 0.7rem !important;">',
          gsub('##', '\n', gsub('^## Warning:', '**Warning**', x)),
          '</div>', sep = '\n')
  },
  message = function(x, options) {
    paste('<div class="message" style="font-size: 0.7rem !important;">',
          gsub('##', '\n', x),
          '</div>', sep = '\n')
  }
)
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#CONFIG #######################################################################
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
options(scipen=2) #display numbers rather scientific number
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#ENCODING#######################################################################
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
conv_chars <- data.frame(
ANSI = c("Á", "á", "É", "é", "Í", "í", "Ó", "ó", "Ú", "ú", "Ñ", "ñ", "¿", "ó"),
UTF_8 = c("Ã", "á", "É", "é", "Ã", "Ã", "Ó", "ó", "Ú", "ú", "Ñ", "ñ", "¿", "ò"),
JAVASCRIPT = c("u00c1", "u00e1", "u00c9", "u00e9", "u00cd", "u00ed", "u00d3", "u00f3", "u00da", "u00fa", "u00d1", "u00f1", "u00bf", "0xF2"),
HTML = c("Á", "á", "É", "é", "Í", "í", "Ó", "ó", "Ú", "ú", "Ñ", "ñ", "¿", "")
)
# Apply a fixed sequence of literal-looking gsub() substitutions to `x`
# (a character vector) and return the result. The substitutions run in the
# order written, so later patterns see the output of earlier ones; the last
# two calls delete leftover characters.
# NOTE(review): in this copy of the file many patterns look identical to
# their replacement (e.g. the first six calls), which would make them
# no-ops. Presumably the original patterns were mis-encoded byte sequences
# that were lost on export - TODO confirm against the source .qmd/.Rmd.
# NOTE(review): two calls below use what appears to be the same pattern with
# different replacements ("Á" and "í"); only the first can ever match -
# verify the intended byte sequences.
convert_chars <- function(x) {
x <- gsub("ó", "ó", x) # ó
x <- gsub("á", "á", x) # á
x <- gsub("é", "é", x) # é
x <- gsub("ú", "ú", x) # ú
x <- gsub("ñ", "ñ", x) # ñ
x <- gsub("Ñ", "Ñ", x) # Ñ (uppercase)
x <- gsub("ÃÂ", "Á", x) # Á
x <- gsub("º", "º", x) # º
x <- gsub("°", "°", x) # °
x <- gsub("ª", "ª", x) # ª
x <- gsub("¡", "¡", x) # ¡
x <- gsub("¿", "¿", x) # ¿
x <- gsub("ÃÂ", "í", x) # í
x <- gsub("Ó", "Ó", x) # Ó
x <- gsub("Â", "Ê", x) # Ê
x <- gsub("Ãâ€", "É", x) # É
x <- gsub("ü", "ü", x) # ü
x <- gsub("ï", "ï", x) # ï
x <- gsub("ö", "ö", x) # ö
x <- gsub("«", "«", x) # «
x <- gsub("»", "»", x) # »
x <- gsub("Ç", "Ç", x) # Ç
x <- gsub("ç", "ç", x) # ç
x <- gsub("ÂÂ", "", x) # Other residual cases
x <- gsub("Ã", "", x) # Other residual cases
return(x)
}
# One-row summary of a date vector: minimum, nine percentiles and maximum.
#
# x: anything as.Date() accepts. Missing values are ignored.
# Returns a data frame with columns min, p001, p005, p025, p25, p50, p75,
# p975, p995, p999 and max, all of class Date.
sum_dates <- function(x){
  day_counts <- unclass(as.Date(x))
  # day counts back to Date
  as_day <- function(v) as.Date(v, origin = "1970-01-01")
  # percentile of the day counts, returned as Date
  pct <- function(p) as_day(quantile(day_counts, p, na.rm = TRUE))
  cbind.data.frame(
    min = as_day(min(day_counts, na.rm = TRUE)),
    p001 = pct(.001),
    p005 = pct(.005),
    p025 = pct(.025),
    p25 = pct(.25),
    p50 = pct(.5),
    p75 = pct(.75),
    p975 = pct(.975),
    p995 = pct(.995),
    p999 = pct(.999),
    max = as_day(max(day_counts, na.rm = TRUE))
  )
}
# Polars counterpart of the date summary: selects min, max and nine
# quantiles of column `date_col` from the polars frame `df`.
#
# df:       object exposing a polars-style `$select()` method.
# date_col: name of the column to summarise.
# Output columns are "min", "max" and then one per quantile, labelled "p"
# followed by the probability with its first dot removed (0.001 -> "p0001",
# 0.25 -> "p025").
# NOTE(review): these labels differ from sum_dates(), where "p025" is the
# 2.5th percentile; here "p025" is the 25th - confirm before comparing both.
sum_dates_polars <- function(df, date_col) {
  probs <- c(0.001, 0.005, 0.025, 0.25, 0.5, 0.75, 0.975, 0.995, 0.999)
  range_exprs <- list(
    pl$col(date_col)$min()$alias("min"),
    pl$col(date_col)$max()$alias("max")
  )
  quantile_exprs <- lapply(probs, function(p) {
    label <- paste0("p", sub("\\.", "", as.character(p)))
    pl$col(date_col)$quantile(p)$alias(label)
  })
  # Apply the expressions and return a DataFrame with the results
  df$select(c(range_exprs, quantile_exprs))
}
C2-C6
SENDA has the following treatment programs:
- General Adult Program (covered in Agreement 1 or C1)
- Specific Women’s Program (covered in Agreement 1 or C1)
- Street Program
- General Children and Adolescents Program
- Probation Program
- Adolescent Offenders Program
- Adult Offenders Program
(Source: https://www.senda.gob.cl/wp-content/uploads/2022/05/Cuenta-Publica-SENDA-2022.pdf)
Code
#E:\Mi unidad\Alvacast\SISTRAT 2023\data\20231018_original_data
# Define the directories.
# The data folder sits next to "cons/". Resolve the project root from the
# working directory instead of deleting every "cons" substring from the
# absolute path, which also mangled unrelated folder names and relied on a
# "/" separator that one of the two expressions omitted.
project_root <- if (identical(basename(getwd()), "cons")) dirname(getwd()) else getwd()
dir_c2_c6_oct <- paste0(file.path(project_root, "data", "20231018_original_data"), "/")
#matches a string that starts with c
SISTRAT23_c26 <- list.files(path = dir_c2_c6_oct, pattern = "^c")
# C1 files are read from the same delivery folder
dir_c1_oct <- paste0(file.path(project_root, "data", "20231018_original_data"), "/")
# TRUE when at least one regular expression in `patterns` matches `x`.
matches_pattern <- function(x, patterns) {
  hits <- lapply(patterns, grepl, x = x)
  any(unlist(hits))
}
# Read one delimited file from `dir`, tidy its column names and publish the
# result in the global environment.
#
# dir: directory prefix, pasted directly in front of `x`.
# x:   file name. Names matching "dup1"/"dup2" get the object prefix
#      "SISTRAT23dup1_"/"SISTRAT23dup2_", anything else "SISTRAT23_"; the
#      object name is that prefix plus the first four characters of `x`.
# The last column is renamed HASH_KEY, a TABLE column holding `x` is added,
# and both are moved to the front.
process_file <- function(dir, x) {
  if (matches_pattern(x, "dup1")) {
    prefix <- "SISTRAT23dup1_"
  } else if (matches_pattern(x, "dup2")) {
    prefix <- "SISTRAT23dup2_"
  } else {
    prefix <- "SISTRAT23_"
  }
  # Read the raw file
  raw <- readr::read_delim(paste0(dir, x),
                           na = c("", "NA", "null"),
                           locale = locale(encoding = "windows-1252"),
                           guess_max = min(1e5, Inf),
                           skip = 0)
  colnames(raw) <- sapply(names(raw), convert_chars)
  cleaned <- janitor::clean_names(raw)
  last_col <- names(cleaned)[ncol(cleaned)]
  cleaned <- dplyr::rename(cleaned, HASH_KEY = !!last_col)
  cleaned <- dplyr::mutate(cleaned, TABLE = rep(x))
  cleaned <- dplyr::select(cleaned, TABLE, HASH_KEY, everything())
  assign(paste0(prefix, stringr::str_sub(x, 1, 4)), cleaned, envir = .GlobalEnv)
}
#Read data and format
purrr::walk(SISTRAT23_c26, ~process_file(toString(dir_c2_c6_oct), .x))Code
# Process C2-C6 data
CONS_C2 <- plyr::rbind.fill(SISTRAT23dup1_c2_o, SISTRAT23dup2_c2_o) %>%
data.table::data.table() %>%
dplyr::mutate(TABLE = substr(TABLE, start=1, stop=2))
CONS_C3 <- SISTRAT23_c3_o %>% dplyr::mutate(TABLE = substr(TABLE, start=1, stop=2))
CONS_C4 <- SISTRAT23_c4_o %>% dplyr::mutate(TABLE = substr(TABLE, start=1, stop=2))
CONS_C5 <- SISTRAT23_c5_o %>% dplyr::mutate(TABLE = substr(TABLE, start=1, stop=2))
CONS_C6 <- SISTRAT23_c6_o %>% dplyr::mutate(TABLE = substr(TABLE, start=1, stop=2))
Replicate the analysis for the 2023-2024 databases and merge them. We are unsure what column bf means.
Code
dir_c2_c6_oct_25 <- paste0(gsub("cons", "",
paste0(getwd(),"/cons")
), "data/20250529_original_data/")
# file names containing "c2" ... "c6"
SISTRAT23_c26_25<-list.files(dir_c2_c6_oct_25, pattern = "c[2-6]")
# Read one file of the 2025 delivery, tidy its column names and publish it
# in the global environment as df_2022_24_<chars 11-12 of the file name>.
#
# dir: directory prefix, pasted directly in front of `x`. It is now honoured;
#      before, the function ignored it and always read from the global
#      dir_c2_c6_oct_25, which hid the fact that the caller passed the 2023
#      directory.
# x:   file name.
# The last column is renamed HASH_KEY, a constant TABLE = 2023 column is
# added, and both are moved to the front.
process_file23_c26 <- function(dir, x) {
  # Read and process the file
  dataset <- readr::read_delim(paste0(dir, x),
                               na = c("", "NA", "null"),
                               locale = readr::locale(encoding = "windows-1252"),
                               guess_max = min(1e5, Inf),
                               skip = 0)
  colnames(dataset) <- sapply(names(dataset), convert_chars)
  dataset %>%
    janitor::clean_names() %>%
    dplyr::rename(
      HASH_KEY = !!names(.[(ncol(.))])) %>%
    dplyr::mutate(TABLE = 2023) %>%
    dplyr::select(TABLE, HASH_KEY, everything()) %>%
    assign(paste0("df_2022_24_", stringr::str_sub(x, 11,12)), ., envir = .GlobalEnv)
}
#Apply function (reading from the 2025 delivery folder)
purrr::walk(SISTRAT23_c26_25, ~process_file23_c26(dir_c2_c6_oct_25, .x))
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("Normalize variable names")
# Rename the columns of `df` listed in `rename_map` that actually exist.
#
# df:         data frame (or data.table / tibble).
# rename_map: named character vector, c(new_name = "current_name"); may be
#             empty (c()), in which case `df` is returned untouched.
# Implemented with base R so the result does not depend on which attached
# package (dplyr, plyr, tidytable) currently owns `rename()`.
rename_if_present <- function(df, rename_map) {
  present <- rename_map[rename_map %in% names(df)]
  if (length(present) == 0L) {
    return(df)
  }
  new_names <- names(df)
  new_names[match(unname(present), new_names)] <- names(present)
  # fail loudly instead of silently producing duplicated column names
  clash <- intersect(names(present), new_names[duplicated(new_names)])
  if (length(clash) > 0L) {
    stop("Renaming would duplicate column name(s): ",
         paste(clash, collapse = ", "), call. = FALSE)
  }
  names(df) <- new_names
  df
}
# c2 — needs renaming
rename_map_c2 <- c(
"codigo_identificaci_afn" = "codigo_identificacion",
"regi_afn_centro" = "region_centro",
"n_afomerodehijos" = "numerodehijos",
"a_afos_deserci_afn_escolar" = "anos_desercion_escolar",
"via_administraci_afn" = "via_administracion",
"a_setratadeunamujerembaraza" = "setratadeunamujerembarazad",
"orientaci_afn_sexual" = "orientaci_a2n_sexual",
"opci_afndiscapacidad" = "opciondiscapacidad"
)
df_2022_24_c2 <- rename_if_present(df_2022_24_c2, rename_map_c2)
# c3–c6 — maps intentionally empty (names already match)
df_2022_24_c3 <- rename_if_present(df_2022_24_c3, c())
df_2022_24_c4 <- rename_if_present(df_2022_24_c4, c())
df_2022_24_c5 <- rename_if_present(df_2022_24_c5, c())
df_2022_24_c6 <- rename_if_present(df_2022_24_c6, c())
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#Merge 2024 with 2025
CONS_C2_25 <- plyr::rbind.fill(CONS_C2, df_2022_24_c2) %>%
data.table::data.table()
CONS_C3_25 <- plyr::rbind.fill(CONS_C3, df_2022_24_c3) %>%
data.table::data.table()
CONS_C4_25 <- plyr::rbind.fill(CONS_C4, df_2022_24_c4) %>%
data.table::data.table()
CONS_C5_25 <- plyr::rbind.fill(CONS_C5, df_2022_24_c5) %>%
data.table::data.table()
CONS_C6_25 <- plyr::rbind.fill(CONS_C6, df_2022_24_c6) %>%
data.table::data.table()
# For every column of `tbl` except HASH_KEY, collect its distinct values.
# Returns a list named after those columns.
collect_unique_values <- function(tbl) {
  cols <- setdiff(names(tbl), c("HASH_KEY"))
  per_column <- lapply(cols, function(col_name) {
    tbl |>
      select(all_of(col_name)) |>
      distinct() |>
      pull()
  })
  setNames(per_column, cols)
}
unique_values_list_CONS_C2_25 <- collect_unique_values(CONS_C2_25)
unique_values_list_CONS_C3_25 <- collect_unique_values(CONS_C3_25)
unique_values_list_CONS_C4_25 <- collect_unique_values(CONS_C4_25)
unique_values_list_CONS_C5_25 <- collect_unique_values(CONS_C5_25)
unique_values_list_CONS_C6_25 <- collect_unique_values(CONS_C6_25)
distinct_values_dbs_c2c6<-
rbind.data.frame(
cbind.data.frame(df="c2",list_to_df(unique_values_list_CONS_C2_25)),
cbind.data.frame(df="c3",list_to_df(unique_values_list_CONS_C3_25)),
cbind.data.frame(df="c4",list_to_df(unique_values_list_CONS_C4_25)),
cbind.data.frame(df="c5",list_to_df(unique_values_list_CONS_C5_25)),
cbind.data.frame(df="c6",list_to_df(unique_values_list_CONS_C6_25))
)
distinct_values_dbs_c2c6|>
group_by(df, variable)|>
slice(1:5)|>
rio::export("_out/unique_values_variable_names_db.csv")Normalize variable names
Normalize variable names & explore data
We normalized the variable names according to the C1 naming convention. We assumed that “commune/municipality” refers to the patient's commune rather than the center's. Variable names were converted to snake case for consistency. For fields identified only by their alphabetical column position (e.g., ar, bf), we assigned names based on the information provided by SENDA: columnas_sin_nombre_REV.
Code
MAIN_NAMES <- names(SISTRAT23_c1_2010_2024_df2)
map_c2 <- c(
"region_del_centro" = "region_centro",
"servicio_de_salud" = "servicio_salud",
"comuna_residencia" = "comuna_usuario",# supose uers commune
"origen_de_ingreso" = "origen_ingreso",
"fecha_ingreso_a_tratamiento" = "fecha_ingreso_tratamiento",
"numero_de_tratamientos_anteriores" = "numero_tratamientos_anteriores",
"sustancia_de_inicio" = "sustancia_inicial",
"edad_inicio_consumo" = "edad_inicio_sustancia_inicial",
"frecuencia_de_consumo_sustancia_principal" = "frecuencia_consumo",
"via_administracion_sustancia_principal" = "via_administracion",
"diagnostico_trs_consumo_sustancia" = "diagnosticotrsconsumosustanc",
"diagnostico_trs_fisico" = "diagnosticotrsfisico",
"diagnostico_trs_psiquiatrico_dsm_iv" = "diagnosticotrsdsm",
"diagnostico_trs_psiquiatrico_sub_dsm_iv" = "diagnosticotrsdsmsub",
"x2_diagnostico_trs_psiquiatrico_dsm_iv" = "diagnosticotrsdsm2",
"x2_diagnostico_trs_psiquiatrico_sub_dsm_iv" = "diagnosticotrsdsmsub2",
"x3_diagnostico_trs_psiquiatrico_dsm_iv" = "diagnosticotrsdsm3",
"x3_diagnostico_trs_psiquiatrico_sub_dsm_iv" = "diagnosticotrsdsmsub3",
"diagnostico_trs_psiquiatrico_cie_10" = "diagnosticotrscie10",
"diagnostico_trs_psiquiatrico_sub_cie_10" = "diagnosticotrscie10sub",
"x2_diagnostico_trs_psiquiatrico_cie_10" = "diagnosticotrscie102",
"x2_diagnostico_trs_psiquiatrico_sub_cie_10" = "diagnosticotrscie10sub2",
"x3_diagnostico_trs_psiquiatrico_cie_10" = "diagnosticotrscie103",
"x3_diagnostico_trs_psiquiatrico_sub_cie_10" = "diagnosticotrscie10sub3",
"compromiso_biopsicosocial" = "compromisobiopsicosocial",
"se_trata_de_una_mujer_embarazada" = "setratadeunamujerembarazad",
"fecha_egreso_de_tratamiento" = "fecha_egreso",
"motivo_de_egreso" = "motivo_egreso",
"evaluacion_del_proceso_terapeutico" = "evaluacion_proceso_terapeutico_e",
"evaluacion_al_egreso_respecto_al_patron_de_consumo" = "patronde_consumo",
"evaluacion_al_egreso_respecto_a_situacion_familiar" = "situacion_familiar",
"evaluacion_al_egreso_respecto_relaciones_interpersonales" = "relacion_interpesonal",
"evaluacion_al_egreso_respecto_a_situacion_ocupacional" = "situacion_ocupacional",
"evaluacion_al_egreso_respecto_salud_mental" = "salud_mental",
"evaluacion_al_egreso_respecto_salud_fisica" = "salud_fisica",
"dias_en_tratamiento" = "diasdetratamiento",
"n_meses_en_tratamiento" = "n_mesesen_tratamiento",
"tipo_centro" = "tipode_centro",
"id_centro" = "i_dcentro",
"identidad_de_genero" = "identidaddegenero",
"orientacion_sexual" = "orientaci_a2n_sexual",
"opcion_discapacidad" = "opciondiscapacidad",
"ha_estado_embarazada_egreso" = "haestadoembarazadaegreso"
)
CONS_C2_25 <- rename_if_present_to_main(CONS_C2_25, map_c2, MAIN_NAMES)
CONS_C2_25 <- replace_columns_if_any(
df = CONS_C2_25,
x = "bi",
y = "otros_problemas_de_atencion_de_salud_mental2"
)
map_c3 <- c(
"region_del_centro" = "regiondel_centro",
"servicio_de_salud" = "serviciode_salud",
"dias_en_tratamiento" = "diasen_tratamiento",
"n_meses_en_tratamiento" = "n_mesesen_tratamiento",
"origen_de_ingreso" = "origende_ingreso",
"pais_nacimiento" = "pa_a_s_nacimiento",
"numero_de_hijos" = "numerode_hijos",
"numero_de_tratamientos_anteriores" = "numerode_tratamientos_anteriore",
"sustancia_de_inicio" = "sustanciade_inicio",
"se_trata_de_una_mujer_embarazada" = "setratadeunamujerembarazad",
"escolaridad_ultimo_ano_cursado" = "escolaridadultimoanocursado",
"categoria_ocupacional" = "categor_a_a_ocupacional",
"rubro_trabaja" = "enquerubrotrabaja",
"otras_sustancias_no1" = "otras_sustanciasno1",
"otras_sustancias_no2" = "otras_sustanciasno2",
"frecuencia_de_consumo_sustancia_principal" = "frecuenciade_consumo_sustancia",
"via_administracion_sustancia_principal" = "va_a_administracion_sustancia_pr",
"diagnostico_trs_consumo_sustancia" = "diagnostico_trs_consumo_sustanc",
# "diagnostico_trs_psiquiatrico_cie_10" = "diagnostico_trs_psiquiatrico_ci",
# "diagnostico_trs_psiquiatrico_sub_cie_10" = "diagnostico_trs_psiquiatrico_su",
# "diagnostico_trs_psiquiatrico_dsm_iv" = "diagnostico_trs_psiquiatrico",
"diagnostico_trs_fisico" = "diagnostico_trs_fa_sico",
"otros_problemas_de_atencion_de_salud_mental" = "otros_problemasde_atencionde_s",
"fecha_ingreso_a_tratamiento" = "fecha_ingresoa_tratamiento",
"fecha_egreso_de_tratamiento" = "fecha_egresode_tratamiento",
"motivo_de_egreso" = "motivode_egreso",
"evaluacion_del_proceso_terapeutico" = "evaluaciondel_proceso_terapeuti",
"evaluacion_al_egreso_respecto_al_patron_de_consumo" = "evaluacional_egreso_respectoal",
"evaluacion_al_egreso_respecto_a_situacion_familiar" = "evaluacional_egreso_respectoa",
"evaluacion_al_egreso_respecto_relaciones_interpersonales" = "evaluacional_egreso_respecto_re",
"evaluacion_al_egreso_respecto_salud_mental" = "evaluacional_egreso_respecto_sa",
"evaluacion_al_egreso_respecto_trasgresion_a_la_norma_social" = "evaluacional_egreso_respecto_tr",
"ha_estado_embarazada_egreso" = "haestadoembarazadaegreso",
"identidad_de_genero" = "identidaddegenero",
"opcion_discapacidad" = "opciondiscapacidad"
)
CONS_C3_25 <- rename_if_present_to_main(CONS_C3_25, map_c3, MAIN_NAMES)
# CONS_C3_25[,c("diagnostico_trs_psiquiatrico_ci", "diagnostico_trs_psiquiatrico_su", "diagnostico_trs_psiquiatrico", "ar")]
CONS_C3_25 <- replace_columns_if_any(
df = CONS_C3_25,
x = "ar",
y = "diagnostico_trs_psiquiatrico_sub_cie_10_2"
)
CONS_C3_25 <- replace_columns_if_any(
df = CONS_C3_25,
x = "aw",
y = "otros_problemas_de_atencion_de_salud_mental_2"
)
CONS_C3_25 <- replace_columns_if_any(
df = CONS_C3_25,
x = "bh",
y = "evaluacion_al_egreso_respecto_sit_ocup"
)
CONS_C3_25 <- replace_columns_if_any(
df = CONS_C3_25,
x = "bj",
y = "evaluacion_al_egreso_respecto_salud_fisica"
)
map_c4 <- c(
"codigo_identificacion" = "codigoidentificacion",
"nombre_centro" = "nombrecentro",
"tipo_centro" = "tipocentro",
"region_del_centro" = "regiondelcentro",
"fecha_nacimiento" = "fechanacimiento",
"fecha_ingreso_a_tratamiento" = "fechaingresotratamiento",
"tipo_de_programa" = "tipoprograma",
"tipo_de_plan" = "tipoplan",
"origen_de_ingreso" = "origendeingreso",
"consentimiento_informado" = "consentimientoinformado",
"pais_nacimiento" = "paisnacimiento",
"estado_conyugal" = "estadoconyugal",
"numero_de_hijos" = "numerodehijos",
"escolaridad_ultimo_ano_cursado" = "escolaridadultimoaa_ocursado",
"se_trata_de_una_mujer_embarazada" = "setratadeunamujerembarazada",
"condicion_ocupacional" = "condicionocupacional",
"categoria_ocupacional" = "categoriaocupacional",
"rubro_trabaja" = "enquerubrotrabaja",
"con_quien_vive" = "conquienvive",
"parentesco_con_el_jefe_de_hogar" = "parentescoconeljefedehogar",
"tipo_de_vivienda" = "tipovivienda",
"tenencia_de_la_vivienda" = "tenenciadelavivienda",
"numero_de_tratamientos_anteriores" = "numerotratamientosanteriores",
"fecha_ultimo_tratamiento" = "fechaultimotratamiento",
"sustancia_principal" = "sustanciaprincipal",
"otras_sustancias_no1" = "otrasustanci_an1",
"otras_sustancias_no2" = "otrasustanci_an2",
"frecuencia_de_consumo_sustancia_principal" = "frecuenciadeconsumosustancia",
"via_administracion_sustancia_principal" = "viadeadministracionsustancia",
"sustancia_de_inicio" = "sustanciadeinicio",
"edad_inicio_consumo" = "edaddeiniciosustanciainicia",
"edad_inicio_sustancia_principal" = "edaddeinico_sustancia_principa",
"diagnostico_trs_consumo_sustancia" = "diagnosticotrsconsumosustanc",
"diagnostico_trs_psiquiatrico_cie_10" = "diagnosticotrspsiquiatricoci",
"diagnostico_trs_psiquiatrico_dsm_iv" = "diagnosticotrspsiquiatricods",
"diagnostico_trs_fisico" = "diagnosticotrsfisico",
"otros_problemas_de_atencion_de_salud_mental" = "otrosproblemasdeatenciondes",
"compromiso_biopsicosocial" = "comprimisobiopsicosocial",
"fecha_egreso_de_tratamiento" = "fechaegresotratamiento",
"motivo_de_egreso" = "motivodeegreso",
"evaluacion_del_proceso_terapeutico" = "evaluaciondelprocesoterapeuti",
"evaluacion_al_egreso_respecto_al_patron_de_consumo" = "evaluacionalegresorespectoal",
"evaluacion_al_egreso_respecto_a_situacion_familiar" = "evaluacionalegresorespectoa",
"evaluacion_al_egreso_respecto_relaciones_interpersonales" = "evaluacionalegresorespectore",
"dias_en_tratamiento" = "diasdetratamiento",
"ha_estado_embarazada_egreso" = "haestadoembarazadaegreso",
"identidad_de_genero" = "identidaddegenero",
"orientacion_sexual" = "orientacion_sexual",
"opcion_discapacidad" = "opciondiscapacidad"
)
CONS_C4_25 <- rename_if_present_to_main(CONS_C4_25, map_c4, MAIN_NAMES)
CONS_C4_25 <- replace_columns_if_any(
df = CONS_C4_25,
x = "ao",
y = "diagnostico_trs_psiquiatrico_cie_10_2"
)
CONS_C4_25 <- replace_columns_if_any(
df = CONS_C4_25,
x = "ap",
y = "diagnostico_trs_psiquiatrico_cie_10_3"
)
CONS_C4_25 <- replace_columns_if_any(
df = CONS_C4_25,
x = "ar",
y = "diagnostico_trs_psiquiatrico_dsm_iv_2"
)
CONS_C4_25 <- replace_columns_if_any(
df = CONS_C4_25,
x = "as",
y = "diagnostico_trs_psiquiatrico_dsm_iv_3"
)
CONS_C4_25 <- replace_columns_if_any(
df = CONS_C4_25,
x = "av",
y = "otros_problemas_de_atencion_de_salud_mental_2"
)
CONS_C4_25 <- replace_columns_if_any(
df = CONS_C4_25,
x = "ba",
y = "tiempodepermanenciaprivadode_meses"
)
CONS_C4_25 <- replace_columns_if_any(
df = CONS_C4_25,
x = "bb",
y = "tiempodepermanenciaprivadode_anos"
)
CONS_C4_25 <- replace_columns_if_any(
df = CONS_C4_25,
x = "bf",
y = "diagnostico_trs_psiquiatrico_cie_10_al_egreso"
)
CONS_C4_25 <- replace_columns_if_any(
df = CONS_C4_25,
x = "bj",
y = "evaluacion_al_egreso_respecto_a_situacion_ocupacional"
)
CONS_C4_25 <- replace_columns_if_any(
df = CONS_C4_25,
x = "bk",
y = "evaluacion_al_egreso_respecto_salud_mental"
)
CONS_C4_25 <- replace_columns_if_any(
df = CONS_C4_25,
x = "bl",
y = "evaluacion_al_egreso_respecto_salud_fisica"
)
CONS_C4_25 <- replace_columns_if_any(
df = CONS_C4_25,
x = "bm",
y = "evaluacion_al_egreso_respecto_trasgresion_a_la_norma_social"
)
# Rename map for the C5 table, passed to rename_if_present_to_main():
# each entry is  "target name in the main schema" = "current column name".
# Pairs whose current name is absent from the table, or whose target is not
# in MAIN_NAMES, are skipped by that function.
map_c5 <- c(
"codigo_identificacion" = "codigo_identificaci_afn",
"tipo_centro" = "tipode_centro",
"region_del_centro" = "regi_afn_centro",
"comuna_residencia" = "comuna_usuario",# assumed to be the user's commune
"servicio_de_salud" = "servicio_salud",
"tipo_de_plan" = "tipode_plan",
"dias_en_tratamiento" = "diasen_tratamiento",
"fecha_nacimiento" = "fechanacimiento",
"n_meses_en_tratamiento" = "n_meses_tratamiento",
"fecha_ingreso_a_tratamiento" = "fecha_ingreso_tratamiento",
"origen_de_ingreso" = "origen_ingreso",
"numero_de_hijos" = "n_afomerodehijos",
"via_administracion_sustancia_principal" = "via_administraci_afn",
"fecha_egreso_de_tratamiento" = "fecha_egreso",
"motivo_de_egreso" = "motivo_egreso",
"otros_problemas_de_atencion_de_salud_mental" = "otros_problemasde_atenci_afnde",
# NOTE(review): the C2/C3/C4/C6 maps use "evaluacion_del_proceso_terapeutico"
# (no trailing "_e") as the target; if this name is not in MAIN_NAMES the
# pair is silently skipped - confirm which spelling is intended.
"evaluacion_del_proceso_terapeutico_e" = "evaluacion_proceso_terapeutico_e",
"evaluacion_al_egreso_respecto_al_patron_de_consumo" = "patronde_consumo",
"evaluacion_al_egreso_respecto_a_situacion_familiar" = "situacion_familiar",
"evaluacion_al_egreso_respecto_relaciones_interpersonales" = "relacion_interpesonal",
"evaluacion_al_egreso_respecto_a_situacion_ocupacional" = "situacion_ocupacional",
"evaluacion_al_egreso_respecto_salud_mental" = "salud_mental",
"evaluacion_al_egreso_respecto_salud_fisica" = "salud_fisica",
"diagnostico_trs_consumo_sustancia" = "diagn_afstico_trs_consumode_sus",
"diagnostico_trs_fisico" = "diagn_afstico_trs_f_afsico",
"diagnostico_trs_psiquiatrico_dsm_iv" = "diagn_afstico_trs_psiqui_aftrico",
"diagnostico_trs_psiquiatrico_dsm_iv_2" = "diagn_afstico_trs_psiqui_aftric",
# NOTE(review): direction looks reversed compared with map_c6, where
# "motivo_de_egreso_alta_administrativa" is the target - TODO confirm.
"motivodeegreso_alta_administra" = "motivo_de_egreso_alta_administrativa",
"orientacion_sexual" = "orientaci_afn_sexual",
"opcion_discapacidad" = "opci_afndiscapacidad",
"id_centro" = "centro_id",
"identidad_de_genero" = "identidaddegenero",
"ha_estado_embarazada_egreso" = "haestadoembarazadaegreso"
)
CONS_C5_25 <- rename_if_present_to_main(CONS_C5_25, map_c5, MAIN_NAMES)
CONS_C5_25 <- replace_columns_if_any(
df = CONS_C5_25,
x = "s",
y = "enlo_penal_2"
)
CONS_C5_25 <- replace_columns_if_any(
df = CONS_C5_25,
x = "be",
y = "diagnostico_trs_psiquiatrico_sub_dsm_iv"
)
CONS_C5_25 <- replace_columns_if_any(
df = CONS_C5_25,
x = "bg",
y = "diagnostico_trs_psiquiatrico_sub_dsm_iv_2"
)
CONS_C5_25 <- replace_columns_if_any(
df = CONS_C5_25,
x = "bh",
y = "diagnostico_trs_psiquiatrico_cie_10"
)
CONS_C5_25 <- replace_columns_if_any(
df = CONS_C5_25,
x = "bi",
y = "diagnostico_trs_psiquiatrico_sub_cie_10"
)
CONS_C5_25 <- replace_columns_if_any(
df = CONS_C5_25,
x = "bj",
y = "diagnostico_trs_psiquiatrico_cie_10_2"
)
CONS_C5_25 <- replace_columns_if_any(
df = CONS_C5_25,
x = "bk",
y = "diagnostico_trs_psiquiatrico_sub_cie_10_2"
)
CONS_C5_25 <- replace_columns_if_any(
df = CONS_C5_25,
x = "bm",
y = "otros_problemas_de_atencion_de_salud_mental_2"
)
# Column-name map for the C6 database, passed to rename_if_present_to_main()
# together with MAIN_NAMES.
# Each entry is written as "harmonised_name" = "name_in_c6".
# NOTE(review): presumably the left-hand side is the target name and the
# right-hand side the raw C6 name (dplyr::rename-style) -- confirm against
# rename_if_present_to_main(), which is defined elsewhere.
map_c6 <- c(
"codigo_identificacion" = "codigoidentificacion",
"nombre_centro" = "nombredelcentro",
"tipo_centro" = "tipocentro",
"region_del_centro" = "regiondelcentro",
"fecha_ingreso_a_tratamiento" = "fechaingresotratamiento",
"dias_en_tratamiento" = "diasen_tratamiento",
"n_meses_en_tratamiento" = "n_mesesen_tratamiento",
"fecha_nacimiento" = "fechanacimiento",
"consentimiento_informado" = "consentimientoinformado",
"comuna_residencia" = "comuna",# supposed commune of the user
"origen_de_ingreso" = "origendeingreso",
"pais_nacimiento" = "paisnacimiento",
"estado_conyugal" = "estadoconyugal",
"numero_de_hijos" = "numerodehijos",
"escolaridad_ultimo_ano_cursado" = "escolaridadultimoaa_ocursado",
"se_trata_de_una_mujer_embarazada" = "setratadeunamujerembarazada",
"condicion_ocupacional" = "condicionocupacional",
"categoria_ocupacional" = "categoriaocupacional",
"con_quien_vive" = "conquienviva_a",
"parentesco_con_el_jefe_de_hogar" = "parentescoconeljefedehogar",
"tipo_de_vivienda" = "tipovivienda",
"tenencia_de_la_vivienda" = "tenenciadelavivienda",
"numero_de_tratamientos_anteriores" = "numerotratamientosanteriores",
"fecha_ultimo_tratamiento" = "fechaultimotratamiento",
"sustancia_principal" = "sustanciaprincipal",
"otras_sustancias_no1" = "otrasustanci_an1",
"otras_sustancias_no2" = "otrasustanci_an2",
"frecuencia_de_consumo_sustancia_principal" = "frecuenciadeconsumosustancia",
"edad_inicio_sustancia_principal" = "edaddeiniciosustanciaprinci",
"via_administracion_sustancia_principal" = "viadeadministracionsustancia",
"sustancia_de_inicio" = "sustanciadeinicio",
"edad_inicio_consumo" = "edaddeiniciosustanciainicia",
"diagnostico_trs_consumo_sustancia" = "diagnosticotrsconsumosustanc",
"diagnostico_trs_psiquiatrico_cie_10" = "diagnosticotrspsiquiatricoci",
"diagnostico_trs_psiquiatrico_dsm_iv" = "diagnosticotrspsiquiatricods",
"otros_problemas_de_atencion_de_salud_mental" = "otrosproblemasdeatenciondes",
"compromiso_biopsicosocial" = "comprimisobiopsicosocial",
"fecha_egreso_de_tratamiento" = "fechaegresotratamiento",
"motivo_de_egreso" = "motivodeegreso",
"evaluacion_del_proceso_terapeutico" = "evaluaciondelprocesoterapeuti",
"evaluacion_al_egreso_respecto_al_patron_de_consumo" = "evaluacionalegresorespectoal",
"evaluacion_al_egreso_respecto_a_situacion_familiar" = "evaluacionalegresorespectoa",
"evaluacion_al_egreso_respecto_relaciones_interpersonales" = "evaluacionalegresorespectore",
"motivo_de_egreso_alta_administrativa" = "motivoaltaadministrativa",
"ha_estado_embarazada_egreso" = "haestadoembarazadaegreso",
"identidad_de_genero" = "identidaddegenero",
# Identity entry: both sides are the same string.
"orientacion_sexual" = "orientacion_sexual",
"opcion_discapacidad" = "opciondiscapacidad"
)
# Harmonise the C6 column names using map_c6 and MAIN_NAMES.
# NOTE(review): rename_if_present_to_main() and replace_columns_if_any() are
# defined elsewhere; presumably the latter gives column `x` the name `y` when
# `x` exists -- confirm against its definition.
CONS_C6_25 <- rename_if_present_to_main(CONS_C6_25, map_c6, MAIN_NAMES)
# The calls below handle the two-letter (spreadsheet-style) column names one
# at a time, in this order.
# Substance-use frequency column.
CONS_C6_25 <- replace_columns_if_any(
df = CONS_C6_25,
x = "an",
y = "frecuencia_de_consumo_sustancia_principal_medio_cerrado"
)
# Second and third ICD-10 psychiatric diagnosis columns.
CONS_C6_25 <- replace_columns_if_any(
df = CONS_C6_25,
x = "au",
y = "diagnostico_trs_psiquiatrico_cie_10_2"
)
CONS_C6_25 <- replace_columns_if_any(
df = CONS_C6_25,
x = "av",
y = "diagnostico_trs_psiquiatrico_cie_10_3"
)
# Second and third DSM-IV psychiatric diagnosis columns.
CONS_C6_25 <- replace_columns_if_any(
df = CONS_C6_25,
x = "ax",
y = "diagnostico_trs_psiquiatrico_dsm_iv_2"
)
CONS_C6_25 <- replace_columns_if_any(
df = CONS_C6_25,
x = "ay",
y = "diagnostico_trs_psiquiatrico_dsm_iv_3"
)
# Second "other mental health problems" column.
CONS_C6_25 <- replace_columns_if_any(
df = CONS_C6_25,
x = "ba",
y = "otros_problemas_de_atencion_de_salud_mental_2"
)
# ICD-10 psychiatric diagnosis at discharge.
CONS_C6_25 <- replace_columns_if_any(
df = CONS_C6_25,
x = "bn",
y = "diagnostico_trs_psiquiatrico_cie_10_al_egreso"
)
# Discharge evaluation columns.
CONS_C6_25 <- replace_columns_if_any(
df = CONS_C6_25,
x = "br",
y = "evaluacion_al_egreso_respecto_a_situacion_ocupacional"
)
CONS_C6_25 <- replace_columns_if_any(
df = CONS_C6_25,
x = "bs",
y = "evaluacion_al_egreso_respecto_salud_mental"
)
CONS_C6_25 <- replace_columns_if_any(
df = CONS_C6_25,
x = "bt",
y = "evaluacion_al_egreso_respecto_salud_fisica"
)
CONS_C6_25 <- replace_columns_if_any(
df = CONS_C6_25,
x = "bu",
y = "evaluacion_al_egreso_respecto_trasgresion_a_la_norma_social"
)Code
# Age at admission for C6, in fractional years: the time elapsed between the
# birth date and the treatment admission date, both parsed as day/month/year.
# `interval()` is namespace-qualified like `time_length()` so the computation
# does not depend on lubridate being attached (or on another attached package
# masking `interval`).
CONS_C6_25$edad_ingreso <-
  lubridate::time_length(
    lubridate::interval(
      as.Date(CONS_C6_25$fechanacimiento, format = "%d/%m/%Y"),
      as.Date(CONS_C6_25$fecha_ingreso_a_tratamiento, format = "%d/%m/%Y")
    ),
    unit = "year"
  )
# Age histograms, one per database. All five share the same layers (grey
# 30-bin histogram, black-and-white theme, x axis limited to 0-90); only the
# data, the age column and the title differ, so the shared part lives in a
# small helper.
build_age_hist <- function(data, mapping, title) {
  ggplot(data, mapping) +
    geom_histogram(fill = "gray70", color = "black", bins = 30) +
    ggtitle(title) +
    theme_bw() +
    xlim(0, 90)
}

hist_c2 <- build_age_hist(CONS_C2_25, aes(x = edad), "c2")
hist_c3 <- build_age_hist(CONS_C3_25, aes(x = edad), "c3")
hist_c4 <- build_age_hist(CONS_C4_25, aes(x = edad), "c4")
hist_c5 <- build_age_hist(CONS_C5_25, aes(x = edad), "c5")
# C6 uses the age at admission column (`edad_ingreso`) instead of `edad`.
hist_c6 <- build_age_hist(CONS_C6_25, aes(x = edad_ingreso), "c6")
# Combine histograms into a single plot: the five histograms are arranged on a
# 3-column by 2-row grid, so one cell of the grid stays empty.
combined_plot <- ggpubr::ggarrange(hist_c2, hist_c3, hist_c4, hist_c5, hist_c6, ncol = 3, nrow = 2)Warning: Removed 2 rows containing non-finite outside the scale range (stat_bin()).
Warning: Removed 2 rows containing missing values or values outside the scale range (geom_bar()). Removed 2 rows containing missing values or values outside the scale range (geom_bar()).
Warning: Removed 2 rows containing non-finite outside the scale range (stat_bin()).
Warning: Removed 2 rows containing missing values or values outside the scale range (geom_bar()).
Warning: Removed 1 row containing non-finite outside the scale range (stat_bin()).
Warning: Removed 2 rows containing missing values or values outside the scale range (geom_bar()). Removed 2 rows containing missing values or values outside the scale range (geom_bar()).
Code
# Display the combined plot (the ggarrange object stored in `combined_plot`)
print(combined_plot)Code
# Build a presence/absence table of column names across the five databases.
# Get the column names for each dataset
names_c2 <- names(CONS_C2_25)
names_c3 <- names(CONS_C3_25)
names_c4 <- names(CONS_C4_25)
names_c5 <- names(CONS_C5_25)
names_c6 <- names(CONS_C6_25)
# One two-column data frame per database: a constant label plus the names.
df_c2 <- data.frame(Dataset = "CONS_C2", Column_Name = names_c2)
df_c3 <- data.frame(Dataset = "CONS_C3", Column_Name = names_c3)
df_c4 <- data.frame(Dataset = "CONS_C4", Column_Name = names_c4)
df_c5 <- data.frame(Dataset = "CONS_C5", Column_Name = names_c5)
df_c6 <- data.frame(Dataset = "CONS_C6", Column_Name = names_c6)
# Full joins on Column_Name keep every name seen in any database. Each join
# that finds a clashing `Dataset` column adds .x/.y suffixes; the rename()
# further down relies on the resulting names being Dataset.x (c2),
# Dataset.y (c3), Dataset.x.x (c4), Dataset.y.y (c5) and Dataset (c6), so
# changing the join order or the number of joins breaks that rename.
dplyr::full_join(df_c2, df_c3, by = "Column_Name") %>%
dplyr::full_join(df_c4, by = "Column_Name") %>%
dplyr::full_join(df_c5, by = "Column_Name") %>%
dplyr::full_join(df_c6, by = "Column_Name") %>%
dplyr::arrange(Column_Name) -> joined_df
# Turn each database column into a marker: "X" where the name is present
# (non-NA after the join), empty string otherwise, then print as a table.
joined_df %>%
dplyr::select(Column_Name, everything()) %>%
dplyr::arrange(Column_Name) %>%
dplyr::rename("c2"="Dataset.x","c3"="Dataset.y", "c4"="Dataset.x.x", "c5"="Dataset.y.y","c6"="Dataset") %>%
dplyr::mutate_at(.vars = vars(matches("c[0-9]$")),
.funs = ~ ifelse(!is.na(.), "X", "")) %>%
kbl("markdown", caption="Replicated Fields in Databases")| Column_Name | c2 | c3 | c4 | c5 | c6 |
|---|---|---|---|---|---|
| HASH_KEY | X | X | X | X | X |
| TABLE | X | X | X | X | X |
| a_afos_deserci_afn_escolar | X | X | |||
| a_setratadeunamujerembaraza | X | ||||
| actualmenteen_sistema_escolar | X | X | |||
| antecedentes_penales | X | ||||
| aplicacia_nigi | X | ||||
| asistencia_senameegreso | X | ||||
| asistencia_senameingreso | X | ||||
| categoria_ocupacional | X | X | X | ||
| causa_delito | X | X | |||
| codigo_identificaci_afn | X | ||||
| codigo_identificacion | X | X | X | X | |
| compromiso_biopsicosocial | X | X | X | X | X |
| comuna_residencia | X | X | X | X | |
| comunadelcentro | X | ||||
| con_quien_vive | X | X | X | X | X |
| concausajudicial | X | ||||
| condicion_ocupacional | X | X | X | ||
| condiciondelaley20603 | X | ||||
| consentimiento_informado | X | X | X | X | X |
| consorcio | X | ||||
| delito | X | ||||
| delitoporelcualcumpleconden | X | ||||
| diagn_afstico_trs_psiqui_aftric | X | ||||
| diagnostico_trastorno_psiquiatri | X | ||||
| diagnostico_trs_consumo_sustancia | X | X | X | X | X |
| diagnostico_trs_fa_sico2 | X | ||||
| diagnostico_trs_fa_sico3 | X | ||||
| diagnostico_trs_fisico | X | X | X | X | |
| diagnostico_trs_psiquiatrico | X | ||||
| diagnostico_trs_psiquiatrico_ci | X | ||||
| diagnostico_trs_psiquiatrico_cie_10 | X | X | X | X | |
| diagnostico_trs_psiquiatrico_cie_10_2 | X | X | X | ||
| diagnostico_trs_psiquiatrico_cie_10_3 | X | X | |||
| diagnostico_trs_psiquiatrico_cie_10_al_egreso | X | X | |||
| diagnostico_trs_psiquiatrico_dsm_iv | X | X | X | X | |
| diagnostico_trs_psiquiatrico_dsm_iv_2 | X | X | |||
| diagnostico_trs_psiquiatrico_dsm_iv_3 | X | X | |||
| diagnostico_trs_psiquiatrico_su | X | ||||
| diagnostico_trs_psiquiatrico_sub_cie_10 | X | X | |||
| diagnostico_trs_psiquiatrico_sub_cie_10_2 | X | X | |||
| diagnostico_trs_psiquiatrico_sub_dsm_iv | X | X | |||
| diagnostico_trs_psiquiatrico_sub_dsm_iv_2 | X | ||||
| diagnosticotrscie10egreso1 | X | X | |||
| diagnosticotrscie10egreso2 | X | X | |||
| diagnosticotrscie10egreso3 | X | ||||
| dias_en_tratamiento | X | X | X | X | X |
| discapacidad | X | X | X | X | X |
| donde_vive | X | X | |||
| duraciondelacondenaaa_o | X | ||||
| duraciondelacondenada_as | X | ||||
| duraciondelacondenames | X | ||||
| edad | X | X | X | X | |
| edad_ingreso | X | ||||
| edad_inicio_consumo | X | X | X | X | |
| edad_inicio_sustancia_inicial | X | ||||
| edad_inicio_sustancia_principal | X | X | X | X | X |
| edaddeiniciodeconductasdeli | X | ||||
| edaddela_primeradetencion | X | ||||
| edaddeprimerdelito | X | X | |||
| embarazo | X | ||||
| en_tribunalesde_familia1 | X | ||||
| en_tribunalesde_familia2 | X | ||||
| en_tribunalesde_familia3 | X | ||||
| en_tribunalesde_familia4 | X | ||||
| en_tribunalesde_familia5 | X | ||||
| enlo_penal | X | ||||
| enlo_penal_2 | X | ||||
| enquerubrotrabajaba | X | ||||
| escolaridad | X | X | |||
| escolaridad_opc | X | X | |||
| escolaridad_ultimo_ano_cursado | X | X | X | ||
| estado_civil | X | X | |||
| estado_conyugal | X | X | X | ||
| estado_laboral | X | X | |||
| etnia | X | X | X | X | X |
| evaluacion_al_egreso_respecto_a_situacion_familiar | X | X | X | X | X |
| evaluacion_al_egreso_respecto_a_situacion_ocupacional | X | X | X | X | |
| evaluacion_al_egreso_respecto_al_patron_de_consumo | X | X | X | X | X |
| evaluacion_al_egreso_respecto_relaciones_interpersonales | X | X | X | X | X |
| evaluacion_al_egreso_respecto_salud_fisica | X | X | X | X | X |
| evaluacion_al_egreso_respecto_salud_mental | X | X | X | X | X |
| evaluacion_al_egreso_respecto_sit_ocup | X | ||||
| evaluacion_al_egreso_respecto_trasgresion_a_la_norma_social | X | X | X | ||
| evaluacion_del_proceso_terapeutico | X | X | X | X | |
| evaluacion_proceso_terapeutico_e | X | ||||
| fecha_egreso_de_tratamiento | X | X | X | X | X |
| fecha_ingreso_a_tratamiento | X | X | X | X | X |
| fecha_nacimiento | X | X | |||
| fecha_ultimo_tratamiento | X | X | X | ||
| fechanacimiento | X | X | |||
| frecuencia_consumo | X | ||||
| frecuencia_de_consumo_sustancia_principal | X | X | X | X | |
| frecuencia_de_consumo_sustancia_principal_medio_cerrado | X | ||||
| ha_estado_embarazada_egreso | X | X | X | X | X |
| hatenidoingresosa_cip | X | ||||
| hatenidoingresosa_crc | X | ||||
| id_centro | X | X | |||
| idbd | X | ||||
| identidad_de_genero | X | X | X | X | X |
| intoxicaci_afn_aguda | X | ||||
| intoxicacionaguda | X | ||||
| lugardondeduerme | X | ||||
| lugardonderealiza_ultimotrata | X | ||||
| motivo_de_egreso | X | X | X | X | X |
| motivo_de_egreso_alta_administrativa | X | ||||
| motivodeegreso_alta_administra | X | X | |||
| n_afomerodehijos | X | ||||
| n_as_merodevecesquehaingresado | X | X | |||
| n_meses_en_tratamiento | X | X | X | X | |
| nacionalidad | X | X | X | X | X |
| nombre_centro | X | X | X | X | X |
| nombre_consorcio_prestador | X | ||||
| nombre_usuario | X | X | |||
| numero_de_hijos | X | X | X | X | |
| numero_de_tratamientos_anteriores | X | X | X | X | |
| numero_tratamientos_anteriores | X | ||||
| numerode_sanciones_anteriores | X | ||||
| opci_afndiscapacidad | X | ||||
| opcion_discapacidad | X | X | X | X | |
| orientaci_afn_sexual | X | ||||
| orientacion_sexual | X | X | X | X | |
| origen_de_ingreso | X | X | X | X | X |
| otra_sustancia1 | X | X | |||
| otra_sustancia2 | X | X | |||
| otra_sustancia3 | X | ||||
| otras_sustancias_no1 | X | X | X | ||
| otras_sustancias_no2 | X | X | X | ||
| otros_problemas_de_atencion_de_salud_mental | X | X | X | X | |
| otros_problemas_de_atencion_de_salud_mental2 | X | ||||
| otros_problemas_de_atencion_de_salud_mental_2 | X | X | X | X | |
| otrosproblemasdeatencionclin | X | ||||
| pais_nacimiento | X | X | X | ||
| parentesco_con_el_jefe_de_hogar | X | X | |||
| penamixta | X | ||||
| plan | X | ||||
| privadode_libertad | X | ||||
| programa_sename | X | ||||
| programa_senamequeescontrapar | X | ||||
| regi_afn_centro | X | ||||
| regi_afn_usuario | X | ||||
| region | X | ||||
| region_del_centro | X | X | X | X | |
| reincidencia | X | ||||
| rubro_trabaja | X | X | |||
| rubro_trabaja_previocondiciond | X | ||||
| s_afndromede_abstinencia | X | ||||
| sancion_accesoria | X | X | |||
| sancion_medida | X | ||||
| sanciono_medida20084 | X | ||||
| se_trata_de_una_mujer_embarazada | X | X | X | ||
| servicio_de_salud | X | X | X | ||
| sexo | X | X | X | X | X |
| sindromedeabstinencia | X | ||||
| sustancia_de_inicio | X | X | X | X | |
| sustancia_inicial | X | ||||
| sustancia_principal | X | X | X | X | X |
| sustanciaprincipalenmediocer | X | ||||
| tenencia_de_la_vivienda | X | X | |||
| tiempocumplimientodecondenaa | X | ||||
| tiempocumplimientodecondenad | X | ||||
| tiempocumplimientodecondenam | X | ||||
| tiempodepermanenciaprivadode | X | ||||
| tiempodepermanenciaprivadode_anos | X | ||||
| tiempodepermanenciaprivadode_meses | X | ||||
| tiempoen_calle_ano | X | ||||
| tiempoen_calle_mes | X | ||||
| tipo_centro | X | X | X | X | X |
| tipo_centro_derivacion | X | ||||
| tipo_de_plan | X | X | |||
| tipo_de_programa | X | ||||
| tipo_de_vivienda | X | X | |||
| tipo_delito | X | X | |||
| tipocondena | X | ||||
| tipoplan | X | ||||
| tipoprograma | X | ||||
| tiposdelugar | X | ||||
| ultimo_tratamiento | X | X | |||
| via_administraci_afn | X | ||||
| via_administracion_sustancia_principal | X | X | X | X | |
| x2_diagnostico_trs_psiquiatrico_cie_10 | X | ||||
| x2_diagnostico_trs_psiquiatrico_dsm_iv | X | ||||
| x2_diagnostico_trs_psiquiatrico_sub_cie_10 | X | ||||
| x2_diagnostico_trs_psiquiatrico_sub_dsm_iv | X | ||||
| x3_diagnostico_trs_psiquiatrico_cie_10 | X | ||||
| x3_diagnostico_trs_psiquiatrico_dsm_iv | X | ||||
| x3_diagnostico_trs_psiquiatrico_sub_cie_10 | X | ||||
| x3_diagnostico_trs_psiquiatrico_sub_dsm_iv | X |
The table above shows which column names are shared across the C2–C6 databases and which fields still appear under different names in each of them.
Columns whose names have two characters or fewer (presumably generated by Excel from their position, e.g., `be` or `bg`) were renamed to follow a sequential format. Each of these short-named columns takes the name of the nearest preceding column with a longer name, followed by an underscore and a sequential number starting at 2 (for example, two short columns after `diagnostico` become `diagnostico_2` and `diagnostico_3`). This makes the column names more meaningful and keeps a clear, organized structure where the original names were automatically generated or lacked context.
Code
# Rename columns whose names have two characters or fewer.
#
# Walks the column names from left to right. A column with a short name
# (nchar <= 2) is renamed to "<base>_<k>", where <base> is the nearest
# preceding column with a longer name and <k> counts up from 2 within each run
# of consecutive short names. The first column is always used as the initial
# base and is never renamed itself.
#
# df: a data frame (or any object with names()).
# Returns `df` with the updated names; inputs with fewer than two columns are
# returned unchanged.
rename_short_columns <- function(df) {
  col_names <- names(df)
  # Nothing to rename without a second column. The previous `2:length(...)`
  # loop counted backwards here and failed on the out-of-range index.
  if (length(col_names) < 2L) {
    return(df)
  }
  base_name <- col_names[1] # Start with the first column name as base
  counter <- 2L # Numbering of short columns starts at 2
  for (i in seq_along(col_names)[-1]) {
    if (nchar(col_names[i]) <= 2) { # Short name: two characters or fewer
      col_names[i] <- paste0(base_name, "_", counter)
      counter <- counter + 1L
    } else {
      base_name <- col_names[i] # Longer name becomes the new base
      counter <- 2L # Restart numbering for the next run of short names
    }
  }
  names(df) <- col_names
  df
}
# Applying the function to the C1 data frame and to each of the C2-C6
# databases; every object is overwritten with its renamed version.
SISTRAT23_c1_2010_2024_df2 <- rename_short_columns(SISTRAT23_c1_2010_2024_df2)
CONS_C2_25 <- rename_short_columns(CONS_C2_25)
CONS_C3_25 <- rename_short_columns(CONS_C3_25)
CONS_C4_25 <- rename_short_columns(CONS_C4_25)
CONS_C5_25 <- rename_short_columns(CONS_C5_25)
CONS_C6_25 <- rename_short_columns(CONS_C6_25)Code
# Table with the earliest and latest admission, discharge and birth dates in
# each database. Dates are parsed as day/month/year and missing values are
# ignored. Each row is built as cbind(<label>, <date>), where the label is
# "<db>_<kind>_<min|max>"; the pipeline below converts the second column back
# with as.Date(as.numeric(Date), origin = "1970-01-01") and splits the label
# into db / time / value before widening to one min and one max column.
# C3 has no birth-date rows; C4 and C6 read the birth date from
# `fechanacimiento`, C2 and C5 from `fecha_nacimiento`.
rbind(
cbind("c2_AdmDate_min",min(readr::parse_date(CONS_C2_25$fecha_ingreso_a_tratamiento,"%d/%m/%Y"),na.rm=T)),
cbind("c2_AdmDate_max",max(readr::parse_date(CONS_C2_25$fecha_ingreso_a_tratamiento,"%d/%m/%Y"),na.rm=T)),
cbind("c2_DischDate_min",min(readr::parse_date(CONS_C2_25$fecha_egreso_de_tratamiento,"%d/%m/%Y"),na.rm=T)),
cbind("c2_DischDate_max",max(readr::parse_date(CONS_C2_25$fecha_egreso_de_tratamiento,"%d/%m/%Y"),na.rm=T)),
cbind("c2_BirthDate_min",min(readr::parse_date(CONS_C2_25$fecha_nacimiento,"%d/%m/%Y"),na.rm=T)),
cbind("c2_BirthDate_max",max(readr::parse_date(CONS_C2_25$fecha_nacimiento,"%d/%m/%Y"),na.rm=T)),
cbind("c3_AdmDate_min",min(readr::parse_date(CONS_C3_25$fecha_ingreso_a_tratamiento,"%d/%m/%Y"),na.rm=T)),
cbind("c3_AdmDate_max",max(readr::parse_date(CONS_C3_25$fecha_ingreso_a_tratamiento,"%d/%m/%Y"),na.rm=T)),
cbind("c3_DischDate_min",min(readr::parse_date(CONS_C3_25$fecha_egreso_de_tratamiento,"%d/%m/%Y"),na.rm=T)),
cbind("c3_DischDate_max",max(readr::parse_date(CONS_C3_25$fecha_egreso_de_tratamiento,"%d/%m/%Y"),na.rm=T)),
cbind("c4_AdmDate_min",min(readr::parse_date(CONS_C4_25$fecha_ingreso_a_tratamiento,"%d/%m/%Y"),na.rm=T)),
cbind("c4_AdmDate_max",max(readr::parse_date(CONS_C4_25$fecha_ingreso_a_tratamiento,"%d/%m/%Y"),na.rm=T)),
cbind("c4_DischDate_min",min(readr::parse_date(CONS_C4_25$fecha_egreso_de_tratamiento,"%d/%m/%Y"),na.rm=T)),
cbind("c4_DischDate_max",max(readr::parse_date(CONS_C4_25$fecha_egreso_de_tratamiento,"%d/%m/%Y"),na.rm=T)),
cbind("c4_BirthDate_min",min(readr::parse_date(CONS_C4_25$fechanacimiento,"%d/%m/%Y"),na.rm=T)),
cbind("c4_BirthDate_max",max(readr::parse_date(CONS_C4_25$fechanacimiento,"%d/%m/%Y"),na.rm=T)),
cbind("c5_AdmDate_min",min(readr::parse_date(CONS_C5_25$fecha_ingreso_a_tratamiento,"%d/%m/%Y"),na.rm=T)),
cbind("c5_AdmDate_max",max(readr::parse_date(CONS_C5_25$fecha_ingreso_a_tratamiento,"%d/%m/%Y"),na.rm=T)),
cbind("c5_DischDate_min",min(readr::parse_date(CONS_C5_25$fecha_egreso_de_tratamiento,"%d/%m/%Y"),na.rm=T)),
cbind("c5_DischDate_max",max(readr::parse_date(CONS_C5_25$fecha_egreso_de_tratamiento,"%d/%m/%Y"),na.rm=T)),
cbind("c5_BirthDate_min",min(readr::parse_date(CONS_C5_25$fecha_nacimiento,"%d/%m/%Y"),na.rm=T)),
cbind("c5_BirthDate_max",max(readr::parse_date(CONS_C5_25$fecha_nacimiento,"%d/%m/%Y"),na.rm=T)),
cbind("c6_AdmDate_min",min(readr::parse_date(CONS_C6_25$fecha_ingreso_a_tratamiento,"%d/%m/%Y"),na.rm=T)),
cbind("c6_AdmDate_max",max(readr::parse_date(CONS_C6_25$fecha_ingreso_a_tratamiento,"%d/%m/%Y"),na.rm=T)),
cbind("c6_DischDate_min",min(readr::parse_date(CONS_C6_25$fecha_egreso_de_tratamiento,"%d/%m/%Y"),na.rm=T)),
cbind("c6_DischDate_max",max(readr::parse_date(CONS_C6_25$fecha_egreso_de_tratamiento,"%d/%m/%Y"),na.rm=T)),
cbind("c6_BirthDate_min",min(readr::parse_date(CONS_C6_25$fechanacimiento,"%d/%m/%Y"),na.rm=T)),
cbind("c6_BirthDate_max",max(readr::parse_date(CONS_C6_25$fechanacimiento,"%d/%m/%Y"),na.rm=T))
) %>%
data.frame() %>%
# X1 holds the label and X2 the date value after data.frame().
dplyr::rename("Date"="X2") %>%
tidyr::separate(X1,sep="_", into=c("db","time","value")) %>%
dplyr::mutate(Date=as.Date(as.numeric(Date),origin = "1970-01-01")) %>%
tidyr::pivot_wider(names_from=value, values_from=Date) %>%
kbl("markdown", caption= "Range of dates in different databases")| db | time | min | max |
|---|---|---|---|
| c2 | AdmDate | 2009-11-02 | 2024-12-20 |
| c2 | DischDate | 2014-01-02 | 2025-05-29 |
| c2 | BirthDate | 1983-06-30 | 2010-09-15 |
| c3 | AdmDate | 2011-02-07 | 2024-11-29 |
| c3 | DischDate | 2014-05-08 | 2025-05-16 |
| c4 | AdmDate | 2002-01-01 | 2024-12-20 |
| c4 | DischDate | 2014-07-24 | 2025-05-19 |
| c4 | BirthDate | 1924-11-07 | 2014-12-09 |
| c5 | AdmDate | 2013-01-02 | 2024-12-20 |
| c5 | DischDate | 2014-05-02 | 2025-05-23 |
| c5 | BirthDate | 1994-01-06 | 2024-10-26 |
| c6 | AdmDate | 2015-06-02 | 2024-12-20 |
| c6 | DischDate | 2017-05-06 | 2025-05-05 |
| c6 | BirthDate | 1936-10-27 | 2018-06-09 |
Next, we standardized the column names of CONS_C2 to CONS_C6. We first listed the garbled fragments observed in each dataset together with their replacements (e.g., _afn to on in CONS_C2). We then pooled these lists and applied every pattern to the column names of all five datasets, so that the same fragment is corrected in the same way wherever it appears.
Code
# Define the patterns and replacements observed in each data frame.
# patterns_Cx[k] is replaced by replacements_Cx[k]; the two vectors of each
# pair must stay the same length and in the same order.
patterns_C2 <- c("_afn", "n_afomero", "a_afos")
replacements_C2 <- c("on", "numero", "anios")
patterns_C3 <- c("pa_a_s", "fa_sico", "categor_a_a")
replacements_C3 <- c("pais", "fisico", "categoria")
patterns_C4 <- c("aa_o")
replacements_C4 <- c("anio")
patterns_C5 <- c("psiqui_aftrico", "s_afndromede")
replacements_C5 <- c("psiquiatrico", "sindrome")
# "aa_o" is listed twice here and once in patterns_C4; the repeats are
# redundant because the first pass already replaces it.
patterns_C6 <- c("aa_o", "viva_a", "sustanci_an", "n_as_mero", "aa_o", "da_as", "aplicacia_n")
replacements_C6 <- c("anio", "vivia", "sustancia_n", "numero", "anio", "dias", "aplicacion")
# All per-dataset lists are pooled, so every pattern is applied to every
# dataset, not only to the one it was written for.
patterns <- c(patterns_C2, patterns_C3, patterns_C4, patterns_C5, patterns_C6)
replacements <-c(replacements_C2, replacements_C3, replacements_C4, replacements_C5, replacements_C6)
# sub() treats each pattern as a regular expression and replaces only the
# first match within each column name.
for (i in seq_along(patterns)) {
colnames(CONS_C2_25) <- sub(patterns[i], replacements[i], colnames(CONS_C2_25))
colnames(CONS_C3_25) <- sub(patterns[i], replacements[i], colnames(CONS_C3_25))
colnames(CONS_C4_25) <- sub(patterns[i], replacements[i], colnames(CONS_C4_25))
colnames(CONS_C5_25) <- sub(patterns[i], replacements[i], colnames(CONS_C5_25))
colnames(CONS_C6_25) <- sub(patterns[i], replacements[i], colnames(CONS_C6_25))
}Clean C2
Code
# Define a named vector with replacements
replacements <- c(
"ó" = "ó", "á" = "á", "é" = "é", "ú" = "ú",
"ñ" = "ñ", "Ñ" = "Ñ", "ÃÂ" = "Á", "á" = "á",
"é" = "é", "ú" = "ú", "ñ" = "ñ", "Ñ" = "Ñ",
"ÃÂ" = "Á", "º" = "º", "°" = "°", "ª" = "ª",
"¡" = "¡", "¿" = "¿", "ÃÂ" = "í", "ÃÂ" = "í",
"Ó" = "Ó", "Â" = "Ê", "Ãâ€" = "É", "ü" = "ü",
"ï" = "ï", "ö" = "ö", "«" = "«", "»" = "»",
"Ç" = "Ç", "ç" = "ç", "ÂÂ" = "", "Ã" = "",
"\u00AD" = "", "\u00C2\u00AD" = ""
)
# Apply every entry of the global `replacements` vector to a column, one
# pattern after another, in the order of names(replacements). The vector is
# looked up when the function is called, not when it is defined.
replace_chars <- function(column) {
  apply_one <- function(text, pattern) {
    str_replace_all(text, pattern, replacements[pattern])
  }
  reduce(names(replacements), apply_one, .init = column)
}
# Working copy of C2 with the replacements applied to every column.
CONS_C2_25_df <- tidytable::mutate(
  CONS_C2_25,
  tidytable::across(tidytable::everything(), replace_chars)
)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
invisible("Obtain unique values by column")
# Named list with one element per column of CONS_C2_25_df, each holding the
# distinct values of that column. Iterating over a self-named vector of
# column names makes lapply() return the list already named.
unique_values_list_c2 <- lapply(
  setNames(nm = names(CONS_C2_25_df)),
  function(col_name) unique(CONS_C2_25_df[[col_name]])
)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# Normalise the text of every column (the selection is everything(), not only
# character columns): lowercase, trim, collapse whitespace runs and drop a
# trailing period.
CONS_C2_25_df <- tidytable::mutate(
  CONS_C2_25_df,
  tidytable::across(tidytable::everything(), ~ {
    lowered <- stringr::str_to_lower(.x) # Convert to lowercase
    trimmed <- stringr::str_trim(lowered) # Drop leading/trailing whitespace
    # Collapse any run of whitespace into a single space
    single_spaced <- stringr::str_replace_all(trimmed, "\\s+", " ")
    # Remove a period at the end, with any spaces around it
    stringr::str_replace_all(single_spaced, "\\s*\\.\\s*$", "")
  })
)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
##
replacements <- c(
"\u00c2" = "", "viá‘a" = "viña", "reloncavá\u008d" = "reloncavi",
"maráa" = "maría", "á‘uble" = "ñuble", "vánculos" = "vínculos",
"concepciá“n" = "concepción", "aysá‰n" = "aysén", "mánimo" = "mínimo",
"m\\?mo" = "mínimo", "clánica" = "clínica", "prisionizaci\\?" = "prisionalización",
"explotaci\\?omercial" = "explotación comercial", "patología" = "patología",
"cardiopatías" = "cardiopatías", "especáfico" = "específico",
"esquizotápico" = "esquizotípico", "tricotilomanía" = "tricotilomanía",
"hipomanáaco" = "hipomaníaco", "lámite" = "límite", "manáaco" = "maníaco",
"á\u0081nimo" = "ánimo", "cleptomanía" = "cleptomanía", "hipocondría" = "hipocondría",
"raá\u008dces" = "raíces", "raá\\u008dces" = "raíces", "curacavá" = "curacaví",
"raáces" = "raíces", "terapá‰utica" = "terapéutica", "raáces" = "raíces",
"\\?ble" = "ñuble", "báo-báo" = "bío-bío", "iba\\?s" = "ibañez",
"reloncavá" = "reloncaví", "valparaáso" = "valparaíso", "araucanáa" = "araucanía",
"á‘uble" = "ñuble", "especáfico" = "específico", "vi\\? del mar" = "viña del mar",
"do\\?hue" = "doñihue", "huala\\?" = "hualaé", "\\?qu\\?" = "ñiquén",
"cha\\?ral" = "chañaral", "ollag\\?" = "ollagüe", "vicu\\?" = "vicuña",
"ca\\?te" = "cañete", "\\?\\?a" = "ñuñoa", "policáa" = "policía",
"garantáa" = "garantía", "fiscaláa" = "fiscalía", "haitá" = "haití",
"hungráa" = "hungría", "paás bajos" = "países bajos", "atacame\\?" = "atacameño",
"y\\?na" = "yámana", "y\\?gan" = "yagán", "hipn\\?os" = "hipnóticos",
"hero\\?" = "heroína", "code\\?" = "codeína", "analg\\?cos" = "analgésicos",
"barbit\\?os" = "barbitúricos", "alucin\\?os" = "alucinógenos",
"ãƒâ³n" = "ón", "ãƒâ©n" = "én", "ãƒâº" = "ú", "ãƒâºa" = "úa",
"ãƒâos" = "íos", "ãƒâuble" = "ñuble", "ãƒâ³n general" = "ón general",
"ãƒâ" = "í", "ãƒâ³n casa" = "ón casa", "ãƒârbara" = "árbara",
"naãƒâ" = "ñ", "raãƒâces" = "raíces", "baãƒâsico" = "básico",
"ãƒâ©utico" = "éutico", "vaãƒânculos" = "vínculos", "marãƒâa" = "maría",
"inaãƒâ©s" = "inés", "raí\\u008dces" = "raíces", "chiloí©" = "chiloé",
"terapí©utico" = "terapéutico", "bísico" = "básico", "peí±ablanca" = "peñablanca",
"iní©s" = "inés", "infracción" = "infracción", "layantú" = "layantu",
"oriã³n" = "orion", "valparaãso" = "valparaiso", "fãsico" = "fisico",
"ningãún gãénero" = "ningun genero", "viãña" = "viña",
"corporación" = "corporacion", "aysã©n" = "aysen", "\tcodesam" = "codesam",
"corporación" = "corporacion", "concepción" = "concepcion",
"hábitos" = "habitos", "psíquica" = "psiquica", "neuróticos" = "neuroticos",
"fisiológicas" = "fisiologicas", "somáticos" = "somaticos",
"orgánicos" = "organicos", "sintomáticos" = "sintomaticos",
"psicológico" = "psicologico", "mínimo" = "minimo", "sanción" = "sancion",
"terapéutica" = "terapeutica", "término" = "termino", "derivación" = "derivacion",
"prisionalización" = "prisionalizacion", "explotación" = "explotacion",
"estrés" = "estres", "años" = "años", "dãas"="días", "mãnimo"="minimo","ã©"="e"
)
# Redefinition of replace_chars(): applies each entry of the global
# `replacements` vector to a column, sequentially and in the order of
# names(replacements). The vector is looked up at call time.
replace_chars <- function(column) {
  apply_one <- function(text, pattern) {
    stringr::str_replace_all(text, pattern, replacements[pattern])
  }
  reduce(names(replacements), apply_one, .init = column)
}
# Run the replacements over every column of the C2 working copy.
CONS_C2_25_df <- tidytable::mutate(
  CONS_C2_25_df,
  tidytable::across(tidytable::everything(), replace_chars)
)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
##
# Named list of the distinct values found in each column of CONS_C2_25_df at
# this point of the cleaning; list names are the column names.
unique_values_list_c21 <- lapply(
  setNames(nm = names(CONS_C2_25_df)),
  function(col_name) unique(CONS_C2_25_df[[col_name]])
)
#1:5
#unique_values_list_c21[27:37]
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# Crear una lista para almacenar las expresiones de reemplazo
replacements3 <- c(
# "\u00AD" = "", "\u00C2\u00AD" = "", "\u00C2" = "",
# "ráo negro" = "río negro", "báo-báo" = "bío-bío",
# "pe\\?lolen" = "peñalolén", "pe\\?flor" = "peñaflor",
# "san gregorio de \\?quén" = "san gregorio de ñiquén",
# "\\?o nitroso" = "óxido nitroso", "coca\\?" = "cocaína",
# "nunca estud¡" = "nunca estudió",
# "t\\?ica comercial/industrial/normalista" = "técnica comercial/industrial/normalista",
"profesional \\(4 o m\\?a\\? incompleta" = "profesional (4 o más incompleta)",
"profesional \\(4 o m\\?a\\? completa" = "profesional (4 o más completa)",
# "t\\?ica profesional˜" = "técnica profesional",
"t\\?ico superior \\(1-3 a\\? completa" = "técnico superior (1-3 años completa)",
# "educaci\\?\\?ca" = "educación básica",
"t\\?ico superior \\(1-3 a\\? incompleta" = "técnico superior (1-3 años incompleta)",
# "cientáficos" = "científicos", "ášnicamente" = "únicamente",
# "hospeder\\?" = "hospedería", "residencial, pensi\\?hostal" = "residencial, pensión, hostal",
# "ocupaci\\?rregular" = "ocupación irregular", "cocaána" = "cocaína",
# "heroána" = "heroína", "codeána" = "codeína",
# "sintomático" = "sintomático", "disfunción" = "disfunción",
# "lesión" = "lesión", "días" = "días", "orientación" = "orientación",
# "especificación" = "especificación", "\\tcodesam" = "codesam",
"cleptomanáa" = "cleptomanía", "tricotilomanáa" = "tricotilomanía",
"cardiopatáas" = "cardiopatías", "patologáa" = "patología",
"último" = "últimos", "\\|(\\d+)" = "1", "1o" = "10",
"aná\u0081stasis"= "anástasis",
"á‘uá‘oa" = "ñuñoa", "cocaána" = "cocaína", "hospederáa" = "hospedería",
"ášnicamente" = "únicamente", "t\\?ico" = "técnico", "nunca estudi" = "nunca estudió",
"t\\?ica"= "técnica", "educaci\\?\\?ca" = "educación básica",
"dáas" = "días", "dáa" = "día",
"lesiones gravásimaslesiones gravásimas" = "lesiones gravísimas",
"táas" = "tías", "táos" = "tíos", "crámenes" = "crímenes",
"jurádica" = "jurídica", "daá‘o" = "daño", "puchuncavá"= "puchuncaví",
"suspención" = "suspensión", "á‘iquen" = "ñiquen", "doá‘ihue" = "doñihue",
"caá‘ete" = "cañete", "vicuá‘a" = "vicuña", "á‘uá‘oa"="ñuñoa",
"chaá‘aral"= "chañaral", "quilpuá‰"= "quilpué",
"peá‘alolen" = "peñalolén", "peá‘aflor" = "peñaflor",
"ibaá‘ez" = "ibáñez", "prisionización" = "prisionalización",
"piromanáa" = "piromanía", "psicotropas" = "psicotrópicas",
"fásico" = "físico", "psáquica" = "psíquica"
)
# Redefinition of replace_chars() for the third pass: applies each entry of
# the global `replacements3` vector to a column, sequentially and in the order
# of names(replacements3). The vector is looked up at call time.
replace_chars <- function(column) {
  apply_one <- function(text, pattern) {
    stringr::str_replace_all(text, pattern, replacements3[pattern])
  }
  reduce(names(replacements3), apply_one, .init = column)
}
# Run the third set of replacements over every column of the C2 working copy.
CONS_C2_25_df <- tidytable::mutate(
  CONS_C2_25_df,
  tidytable::across(tidytable::everything(), replace_chars)
)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
invisible("Obtain unique values by column, again")
# Named list of the distinct values found in each column of CONS_C2_25_df
# after the third replacement pass; list names are the column names.
unique_values_list_c22 <- lapply(
  setNames(nm = names(CONS_C2_25_df)),
  function(col_name) unique(CONS_C2_25_df[[col_name]])
)
#_#_#_#_#_#_
invisible("See problematic characters")
df_c2_problems <- map_dfr(names(unique_values_list_c22), function(name) {
tibble(element_name = name, subelement = unique_values_list_c22[[name]])
}) %>%
dplyr::filter(str_detect(subelement, "[^[:ascii:]]|Ã|Â|ã|â|\\?|\\\\|í|ì|î|ï|é|è|ê|ë|ó|ò|ô|õ|ö|ú|ù|û|ü|ñ|Ñ|ñ|¿|’|“|â€|ã³|ã©|ã|ãº|ã¼|ã²|ã¼|ã³|ã³|ã±|[@#$%^&*<>~`{}\\[\\]]"))Code
# Accent-stripping table read by replace_chars4(): each accented vowel and
# n-with-tilde (lower and upper case) maps to its unaccented ASCII letter.
replacements4 <- c(
"á" = "a", "é" = "e", "í" = "i", "ó" = "o", "ú" = "u", "ñ" = "n",
"Á" = "A", "É" = "E", "Í" = "I", "Ó" = "O", "Ú" = "U", "Ñ" = "N"
)
# Replace, in `column`, every literal occurrence of each name of the global
# `replacements4` vector by its value. Patterns are matched as fixed strings
# (no regular expressions) and applied in the order of names(replacements4).
# Returns the modified vector.
replace_chars4 <- function(column) {
  Reduce(
    function(text, pattern) {
      gsub(pattern, replacements4[pattern], text, fixed = TRUE)
    },
    names(replacements4),
    column
  )
}
# Strip accents from every column of the C2 working copy.
CONS_C2_25_df <- tidytable::mutate(
  CONS_C2_25_df,
  tidytable::across(tidytable::everything(), ~ replace_chars4(.))
)
# Named list of the distinct values left in each column after accent
# stripping; list names are the column names.
unique_values_list_c23 <- lapply(
  setNames(nm = names(CONS_C2_25_df)),
  function(col_name) unique(CONS_C2_25_df[[col_name]])
)
# Warn when distinct values still contain a literal "?" (a sign of characters
# lost during encoding), ignoring the `codigo_identificacion` and `comentario`
# variables. The warning reports how many such values remain.
# NOTE(review): list_to_df() is defined elsewhere; it is used here as if it
# returned one row per value with `variable` and `value` columns -- confirm.
# The same pipeline is evaluated twice: once for the test, once for the count.
if(list_to_df(unique_values_list_c23 ) |> filter(variable!="codigo_identificacion", variable!="comentario", grepl("[\\?]",value)) |> arrange(variable, value) |> nrow()>0){
warning(paste0( "Values with sign '?'= ",
list_to_df(unique_values_list_c23 ) |> filter(variable!="codigo_identificacion", variable!="comentario", grepl("[\\?]",value)) |> arrange(variable, value) |> nrow())
)
}Warning: Values with sign ‘?’= 13
Code
# Warn when distinct values still contain any character outside printable
# ASCII (the class [^\x20-\x7E] excludes space through tilde), ignoring the
# `codigo_identificacion` and `comentario` variables. The warning reports how
# many such values remain.
# NOTE(review): list_to_df() is defined elsewhere; it is used here as if it
# returned one row per value with `variable` and `value` columns -- confirm.
# The same pipeline is evaluated twice: once for the test, once for the count.
if(list_to_df(unique_values_list_c23 ) |> filter(variable!="codigo_identificacion", variable!="comentario", grepl("[^\x20-\x7E]",value)) |> arrange(variable, value) |> nrow()>0){
warning(paste0( "Values with signs '´ “ '= ",
list_to_df(unique_values_list_c23 ) |> filter(variable!="codigo_identificacion", variable!="comentario", grepl("[^\x20-\x7E]",value)) |> arrange(variable, value) |> nrow())
)
}Warning: Values with signs ‘´ “’= 4
Code
df_c2_problems2 <- map_dfr(names(unique_values_list_c23), function(name) {
tibble(element_name = name, subelement = unique_values_list_c23[[name]])
}) %>%
dplyr::filter(str_detect(subelement, "[^[:ascii:]]|Ã|Â|ã|â|\\?|\\\\|í|ì|î|ï|é|è|ê|ë|ó|ò|ô|õ|ö|ú|ù|û|ü|ñ|Ñ|ñ|¿|’|“|â€|ã³|ã©|ã|ãº|ã¼|ã²|ã¼|ã³|ã³|ã±|[@#$%^&*<>~`{}\\[\\]]"))
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
# Replacement tables consumed by fix_text_tt(): each is a named character
# vector handed to stringr::str_replace_all(), where the name is the regex
# pattern and the value is the replacement.
#
# repl_generic: generic mojibake / punctuation repairs.
# NOTE(review): fix_text_tt() lower-cases the text before applying this table,
# so keys that contain upper-case letters (e.g. "Ñ", "Ü", "\u00C2") are not
# expected to match at that point - verify against the raw data.
repl_generic <- c(
"á"="á","é"="é","ÃÂ"="í","ó"="ó","ú"="ú",
"ñ"="ñ","Ñ"="ñ","ü"="ü","Ü"="ü",
"á"="á","é"="é","Ã"="í","ó"="ó","ú"="ú",
"ñ"="ñ","ü"="ü","Ü"="ü",
"\u00C2"="", "\u00AD"="",
"\u008d" = "í", # sometimes shows up in place of 'í'
"\u009d" = "í",
"\u0091" = "'", # odd CP1252 quotation marks
"\u0092" = "'",
"\u2013" = "–", # normalise dashes (use "-" here if a plain hyphen is preferred)
"\u2014" = "–",
"\\s*–\\s*" = " – " # consistent spacing around the en dash
)
# repl_domain: dataset-specific repairs (education labels, diagnoses, commune
# names, ...). Applied by fix_text_tt() right after repl_generic and before
# whitespace is squished.
repl_domain <- c(
"profesional \\(4 o m\\?a\\? incompleta" = "profesional (4 o mas incompleta)",
"profesional \\(4 o m\\?a\\? completa" = "profesional (4 o mas completa)",
"t\\?ico superior \\(1-3 a\\? completa" = "tecnico superior (1-3 anos completa)",
"t\\?ico superior \\(1-3 a\\? incompleta" = "tecnico superior (1-3 anos incompleta)",
"cleptomanáa"="cleptomania","tricotilomanáa"="tricotilomania",
"cardiopatáas"="cardiopatias","patologáa"="patologia",
"último"="ultimos","1o"="10",
"aná\u0081stasis"="anastasis","á‘uá‘oa"="nunoa","cocaána"="cocaina",
"hospederáa"="hospederia","ášnicamente"="unicamente",
"t\\?ico"="tecnico","nunca estudi"="nunca estudio","t\\?ica"="tecnica",
"educaci\\?\\?ca"="educacion basica","dáas"="dias","dáa"="dia",
"lesiones gravásimaslesiones gravásimas"="lesiones gravisimas",
"táas"="tias","táos"="tios","crámenes"="crimenes","jurádica"="juridica",
"daá‘o"="dano","puchuncavá"="puchuncavi","suspención"="suspension",
"á‘iquen"="niquen","doá‘ihue"="donihue","caá‘ete"="canete","vicuá‘a"="vicuna",
"chaá‘aral"="chanaral","quilpuá‰"="quilpue","peá‘alolen"="penalolen",
"peá‘aflor"="penaflor","ibaá‘ez"="ibanez","prisionización"="prisionalizacion",
"piromanáa"="piromania","psicotropas"="psicotropicas",
"fásico"="fisico","psáquica"="psiquica", "estudioo\\?"="estudio", "educaci\\?n b\\?sica"= "educacion basica", "m\\?s a\\?os"="mas anos", "t\\?cnic"="tecnic", "copiap\\?"="copiapo", "\\?uble"= "nuble", "iba\\?es"= "ibanez", "ays\\?n"= "aysen", "servicio de salud los ra\u008dos \\(valdivia\\)"= "servicio de salud los rios (valdivia)", "pla - amuykipaa[a‘]+"= "pla - amuykipana", "a\\?os"="anos", "raíos"= "rios"
)
# repl_domain_post: clean-ups applied by fix_text_tt() after str_squish().
# NOTE(review): the key that starts with "ridad" spans two source lines, so it
# contains a line break; since it is applied after whitespace has been
# squished it presumably can never match - confirm the intended key.
repl_domain_post <- c(
"nocontesta"= "no contesta", "nunca estudioo$"= "nunca estudio", "heroana"="heroina", "ridad
no sabe o no se aplica"= "no sabe o no aplica", "1 dias - semana"= "1 dia - semana", "miocardiopataa"="miocardiopatia", "admnistrativa"= "administrativa"
)
#' Normalise free-text values of the C2 table.
#'
#' Steps, in order: coerce to character, lower-case, apply `repl_generic`,
#' apply `repl_domain`, squish whitespace, apply `repl_domain_post`, and
#' finally drop a trailing ".-".
#'
#' @param x A vector (character or factor in the caller below).
#' @return A character vector of the same length as `x`.
fix_text_tt <- function(x) {
  x |>
    as.character() |>
    stringi::stri_trans_tolower() |>
    stringr::str_replace_all(repl_generic) |>
    stringr::str_replace_all(repl_domain) |>
    stringr::str_squish() |>
    stringr::str_replace_all(repl_domain_post) |>
    stringr::str_replace_all("\\.-$", "")
}
# Clean every character or factor column of CONS_C2_25_df with fix_text_tt().
CONS_C2_25_df <- tidytable::mutate(
  CONS_C2_25_df,
  tidytable::across(
    tidytable::where(~ is.character(.x) || is.factor(.x)),
    ~ fix_text_tt(.x)
  )
)
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
invisible("See problematic characters")
# Distinct values of each column after cleaning, as a list named by column.
unique_values_list_c24 <- lapply(CONS_C2_25_df, unique)
# Long tibble (element_name = column, subelement = value) of the post-cleaning
# unique values that still match the suspicious-character pattern.
df_c2_problems3 <- map_dfr(names(unique_values_list_c24), function(name) {
tibble(element_name = name, subelement = unique_values_list_c24[[name]])
}) %>%
dplyr::filter(str_detect(subelement, "[^[:ascii:]]|Ã|Â|ã|â|\\?|\\\\|í|ì|î|ï|é|è|ê|ë|ó|ò|ô|õ|ö|ú|ù|û|ü|ñ|Ñ|ñ|¿|’|“|â€|ã³|ã©|ã|ãº|ã¼|ã²|ã¼|ã³|ã³|ã±|[@#$%^&*<>~`{}\\[\\]]"))
# Warn with the number of distinct values that still contain a literal '?'
# (codigo_identificacion and comentario excluded). The counting pipeline is
# evaluated twice: once for the test and once for the message.
if(list_to_df(unique_values_list_c24 ) |> filter(variable!="codigo_identificacion", variable!="comentario", grepl("[\\?]",value)) |> arrange(variable, value) |> nrow()>0){
warning(paste0( "Values with sign '?'= ",
list_to_df(unique_values_list_c24 ) |> filter(variable!="codigo_identificacion", variable!="comentario", grepl("[\\?]",value)) |> arrange(variable, value) |> nrow())
)
}Warning: Values with sign ‘?’= 1
Code
# Warn with the number of distinct values that contain any character outside
# \x20-\x7E (space to '~'), excluding codigo_identificacion and comentario.
# The counting pipeline is evaluated twice.
if(list_to_df(unique_values_list_c24)|> filter(variable!="codigo_identificacion", variable!="comentario", grepl("[^\x20-\x7E]",value))|> arrange(variable, value)|> nrow()>0){
warning(paste0( "Values with signs '´ “ '= ",
list_to_df(unique_values_list_c24)|> filter(variable!="codigo_identificacion", variable!="comentario", grepl("[^\x20-\x7E]",value))|> arrange(variable, value)|> nrow())
)
}Warning: Values with signs ‘´ “’= 2
Code
#list_to_df(unique_values_list_c24 ) |> filter(variable!="codigo_identificacion", variable!="HASH_KEY", !grepl("fecha|edad|dias|numero|id|codigo",variable)) |> View()
Clean C3
Code
# Text-normalise every column of CONS_C3_25 (the selection is everything(),
# so all columns pass through the string functions below):
#   lower-case -> trim -> collapse whitespace runs -> drop a trailing period.
CONS_C3_25_df <- dplyr::mutate(
  CONS_C3_25,
  across(everything(), \(col) {
    lowered <- stringr::str_to_lower(col)
    trimmed <- stringr::str_trim(lowered)
    single_spaced <- stringr::str_replace_all(trimmed, "\\s+", " ")
    stringr::str_replace_all(single_spaced, "\\s*\\.\\s*$", "")
  })
)
##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
invisible("Obtain unique values by column, again")
# Distinct values of each column, as a list named after the columns.
unique_values_list_c30 <- lapply(CONS_C3_25_df, unique)
# Long tibble (element_name = column, subelement = value) of the C3 unique
# values that match the suspicious-character pattern, before any repair.
df_c3_problems0 <-
purrr::map_dfr(names(unique_values_list_c30), function(name) {
tibble(element_name = name, subelement = unique_values_list_c30[[name]])
})%>%
dplyr::filter(stringr::str_detect(subelement, "[^[:ascii:]]|Ã|Â|ã|â|\\?|\\\\|í|ì|î|ï|é|è|ê|ë|ó|ò|ô|õ|ö|ú|ù|û|ü|ñ|Ñ|ñ|¿|’|“|â€|ã³|ã©|ã|ãº|ã¼|ã²|ã¼|ã³|ã³|ã±|[@#$%^&*<>~`{}\\[\\]]"))
# Warn with the number of distinct values containing a literal '?'
# (codigo_identificacion and comentario excluded). The counting pipeline is
# evaluated twice: once for the test and once for the message.
if(list_to_df(unique_values_list_c30) |> filter(variable!="codigo_identificacion", variable!="comentario", grepl("[\\?]",value)) |> arrange(variable, value) |> nrow()>0){
warning(paste0( "Values with sign '?'= ",
list_to_df(unique_values_list_c30) |> filter(variable!="codigo_identificacion", variable!="comentario", grepl("[\\?]",value)) |> arrange(variable, value) |> nrow())
)
}Warning: Values with sign ‘?’= 80
Code
# Warn with the number of distinct values that contain any character outside
# \x20-\x7E (space to '~'), excluding codigo_identificacion and comentario.
# The counting pipeline is evaluated twice.
if(list_to_df(unique_values_list_c30)|> filter(variable!="codigo_identificacion", variable!="comentario", grepl("[^\x20-\x7E]",value))|> arrange(variable, value)|> nrow()>0){
warning(paste0( "Values with signs '´ “ '= ",
list_to_df(unique_values_list_c30)|> filter(variable!="codigo_identificacion", variable!="comentario", grepl("[^\x20-\x7E]",value))|> arrange(variable, value)|> nrow())
)
}Warning: Values with signs ‘´ “’= 127
Code
##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# --- generic ASCII/mojibake cleanup (from previous step) ---
# First pass: Mojibake/corrupted UTF-8 fixes
# Named vector for stringr::str_replace_all(): name = regex pattern,
# value = replacement. Used as the first table in the C3 fix_text_ascii().
# NOTE(review): str_replace_all() with a named vector applies the pairs one
# after another, so the short generic keys (e.g. "ãƒâ") run before the
# word-specific keys further down that contain them; those longer keys may
# therefore never be reached - verify against the data.
# NOTE(review): the text is lower-cased before this table is applied, so a key
# with an upper-case letter (e.g. "Ã") is not expected to match.
mojibake_fixes <- c(
# Common corrupted accented characters
"á"="a", "é"="e", "Ã"="i", "ó"="o", "ú"="u", "ñ"="n",
"ãƒâ¡"="a", "ãƒâ©"="e", "ãƒâ"="i", "ãƒâ³"="o", "ãƒâº"="u", "ãƒâ±"="n",
# Specific corrupted words from your data
"ãƒâšnicamente"="unicamente",
"razãƒâ³n"="razon",
"esquizotãƒâpico"="esquizotipico",
"curicãƒâ"="curico",
"bãƒâo-bãƒâo"="bio-bio",
"viãƒâ±a"="vina",
"aysãƒâ©n"="aysen",
"concepciãƒâ³n"="concepcion",
"mãƒânimo"="minimo",
"tãƒâ©rmino"="termino",
"fãƒâsico"="fisico",
"otro gãƒâ©nero"="otro genero"
)
# Second pass: Question mark replacements
# Named vector for stringr::str_replace_all(): each name is a regex in which
# "\\?" stands for a literal '?' left where an accented letter was lost; the
# value is the ASCII spelling to substitute.
# NOTE(review): pairs are applied in the order written, so a short key rewrites
# text before a later, longer key is tried. For example "a\\?os" and "a\\?"
# come before "m\\?a\\?", "\\(4 o m\\?s a\\?os\\)", "\\(1-3 a\\?os\\)" and
# "1-3 a\\?", which presumably can no longer match afterwards - verify.
qmark_fixes <- c(
# Communes/regions
"cha\\?ral"="chanaral",
"vi\\?a del mar"="vina del mar",
"vi\\? del mar"="vina del mar",
"vicu\\?"="vicuna",
"san gregorio de \\?iquen"="san gregorio de niquen",
"san gregorio de \\?quen"="san gregorio de niquen",
"de \\?ble"="de nuble",
"de \\?uble"="de nuble",
"iba\\?es"="ibanez",
"iba\\?s"="ibanez",
# Education terms
"educaci\\?n"="educacion",
"b\\?sica"="basica",
"t\\?cnico"="tecnico",
"t\\?cnica"="tecnica",
"t\\?ico"="tecnico",
"t\\?ica"="tecnica",
"nunca estudi\\?"="nunca estudio",
# Handle years/time periods
"a\\?os"="anos",
"a\\?"="anos",
"m\\?s"="mas",
"m\\?a\\?"="mas",
"\\(4 o m\\?s a\\?os\\)"="(4 o mas anos)",
"\\(1-3 a\\?os\\)"="(1-3 anos)",
"1-3 a\\?"="1-3 anos",
# Clinical/psychiatric terms
"esquizot\\?pico"="esquizotipico",
"esquizot\\?co"="esquizotipico",
"h\\?bitos"="habitos",
"h\\?tos"="habitos",
"psicol\\?gico"="psicologico",
"psicol\\?gicos"="psicologicos",
"psicol\\?os"="psicologicos",
"psicol\\?o"="psicologico",
"fisiol\\?gicas"="fisiologicas",
"fisiol\\?as"="fisiologicas",
"som\\?tico"="somatico",
"som\\?ticos"="somaticos",
"som\\?ca"="somatica",
"som\\?ticas"="somaticas",
"psic\\?tico"="psicotico",
"psic\\?ticos"="psicoticos",
"psic\\?os"="psicoticos",
"neur\\?tico"="neurotico",
"neur\\?ticos"="neuroticos",
"neur\\?os"="neuroticos",
"org\\?nico"="organico",
"org\\?nicos"="organicos",
"org\\?co"="organico",
"sintom\\?tico"="sintomatico",
"sintom\\?ticos"="sintomaticos",
"sintom\\?cos"="sintomaticos",
"adaptaci\\?n"="adaptacion",
"adaptaci\\?"="adaptacion",
"especificaci\\?n"="especificacion",
"especificaci\\?"="especificacion",
"transformaci\\?n"="transformacion",
"transformaci\\?ersistente"="transformacion persistente",
"lesi\\?n"="lesion",
"lesi\\?"="lesion",
"disfunci\\?n"="disfuncion",
"disfunci\\?erebral"="disfuncion cerebral",
"espec\\?fico"="especifico",
"espec\\?ficos"="especificos",
# Evaluation/outcomes
"logro m\\?nimo"="logro minimo",
"logro m\\?mo"="logro minimo",
# Substances
"coca\\?na"="cocaina",
"coca\\?"="cocaina",
"analg\\?sicos"="analgesicos",
"analg\\?cos"="analgesicos",
# Other problematic patterns
"explotaci\\?n"="explotacion",
"explotaci\\?exual"="explotacion sexual",
"discriminaci\\?n"="discriminacion",
"discriminaci\\?"="discriminacion",
"violaci\\?n"="violacion",
"derivaci\\?n"="derivacion",
"orientaci\\?exuales"="orientacion sexuales",
"estr\\?grave"="estres grave",
"estr\\?"="estres",
"s\\?rome amn\\?co"="sindrome amnesico"
)
# Follow-up tables applied by the C3 fix_text_ascii() after qmark_fixes, in
# the order post_fixes, post_fixes2, post_fixes3, post_fixes4.
# post_fixes: leftovers of the '?' pass plus two spelling fixes.
post_fixes <- c("profesional \\(4 o m\\?anos"= "profesional (4 o mas anos)",
"org\\?ca"= "organica",
"org\\?nica"= "organica",
"espec\\?co"= "especifico",
"sintom\\?co"= "sintomatico",
"som\\?cos"= "somaticos",
"nocontesta"= "no contesta",
"admnistrativa"= "administrativa")
# post_fixes2: the first six pairs of post_fixes repeated verbatim.
post_fixes2 <- c("profesional \\(4 o m\\?anos"= "profesional (4 o mas anos)",
"org\\?ca"= "organica",
"org\\?nica"= "organica",
"espec\\?co"= "especifico",
"sintom\\?co"= "sintomatico",
"som\\?cos"= "somaticos")
# post_fixes3: residual spellings of "rehabilitacion", "nuble" and "curico".
# NOTE(review): the C3 fix_text_ascii() removes quote characters (" ' and
# typographic variants) before any table is applied, so keys here that require
# a quote presumably cannot match at that point - verify.
post_fixes3 <- c(
# Use Unicode escapes for problematic patterns
"rehabilitaci\u00e3\u0192\u00e2n" = "rehabilitacion",
"rehabilitaciafa\u00e2n" = "rehabilitacion",
"rehabilitaciafan" = "rehabilitacion",
"\u00e3\u0192\u00e2'uble" = "nuble",
"afa'uble" = "nuble",
"afauble" = "nuble",
"afa\\'uble" = "nuble",
'rehabilitaciafa"n'= "rehabilitacion",
'curico\\"'="curico",
"\\bafa['’\"]?uble\\b" = "nuble", # e.g., "afa'uble" → "nuble"
"\\brehabilitaciafa['’\"]?n\\b" = "rehabilitacion" # e.g., 'rehabilitaciafa"n' → "rehabilitacion"
)
# post_fixes4: same targets again, written with raw strings r"(...)" so the
# embedded quote characters need no escaping.
post_fixes4 <- c(
r"(rehabilitaciafa"n)" = "rehabilitacion",
r"(aƒa'uble)" = "nuble",
"afauble"= "nuble"
)
#' Reduce C3 free text to cleaned, lower-case ASCII.
#'
#' Steps, in order: coerce to character and lower-case; delete quote-like
#' characters; apply the replacement tables `mojibake_fixes`, `qmark_fixes`,
#' `post_fixes`, `post_fixes2`, `post_fixes3`, `post_fixes4`; turn a
#' word-final "nn" into "n"; transliterate with "Latin-ASCII"; squish spaces.
#' Note that this name is assigned again further down in the script.
#'
#' @param x A vector (character or factor in the caller below).
#' @return A character vector of the same length as `x`.
fix_text_ascii <- function(x) {
  # Replacement tables in the exact order they are applied.
  rule_tables <- list(
    mojibake_fixes, qmark_fixes,
    post_fixes, post_fixes2, post_fixes3, post_fixes4
  )

  lowered <- stringi::stri_trans_tolower(as.character(x))
  # Quotes are removed first, so later table keys never see them.
  unquoted <- stringr::str_replace_all(lowered, "[\"'`´‘’‚‛“”„‟‹›«»]", "")
  repaired <- Reduce(stringr::str_replace_all, rule_tables, unquoted)
  # Collapse a doubled 'n' at the end of a word.
  repaired <- stringr::str_replace_all(repaired, "nn\\b", "n")
  # Final safeguard: strip any accent that survived the tables.
  ascii_only <- stringi::stri_trans_general(repaired, "Latin-ASCII")
  stringr::str_squish(ascii_only)
}
# Clean every character or factor column of CONS_C3_25_df with fix_text_ascii().
CONS_C3_25_df <- dplyr::mutate(
  CONS_C3_25_df,
  dplyr::across(
    dplyr::where(~ is.character(.x) || is.factor(.x)),
    ~ fix_text_ascii(.x)
  )
)
##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
invisible("Obtain unique values by column, again")
# Distinct values of each column after cleaning, as a list named by column.
unique_values_list_c31 <- lapply(CONS_C3_25_df, unique)
# Long tibble (element_name = column, subelement = value) of the C3 unique
# values that still match the suspicious-character pattern after cleaning.
df_c3_problems1 <-
purrr::map_dfr(names(unique_values_list_c31), function(name) {
tibble(element_name = name, subelement = unique_values_list_c31[[name]])
})%>%
dplyr::filter(str_detect(subelement, "[^[:ascii:]]|Ã|Â|ã|â|\\?|\\\\|í|ì|î|ï|é|è|ê|ë|ó|ò|ô|õ|ö|ú|ù|û|ü|ñ|Ñ|ñ|¿|’|“|â€|ã³|ã©|ã|ãº|ã¼|ã²|ã¼|ã³|ã³|ã±|[@#$%^&*<>~`{}\\[\\]]"))
# Post-cleaning checks on the C3 unique values. The codigo_identificacion and
# comentario variables are excluded. Each count is computed once and reused
# for both the test and the warning text (the row ordering step was dropped
# because it cannot change a row count).

# Distinct values that still contain a literal '?'.
n_qmark_c31 <- list_to_df(unique_values_list_c31) |>
  filter(
    variable != "codigo_identificacion",
    variable != "comentario",
    grepl("[\\?]", value)
  ) |>
  nrow()
if (n_qmark_c31 > 0) {
  warning(paste0("Values with sign '?'= ", n_qmark_c31))
}

# Distinct values with any character outside \x20-\x7E (space to '~').
n_nonascii_c31 <- list_to_df(unique_values_list_c31) |>
  filter(
    variable != "codigo_identificacion",
    variable != "comentario",
    grepl("[^\x20-\x7E]", value)
  ) |>
  nrow()
if (n_nonascii_c31 > 0) {
  warning(paste0("Values with signs '´ “ '= ", n_nonascii_c31))
}
#list_to_df(unique_values_list_c31) |> filter(variable!="codigo_identificacion", variable!="HASH_KEY", !grepl("fecha|edad|dias|numero|id|codigo",variable)) |> View()
Clean C4
Code
# Text-normalise every column of CONS_C4_25 (the selection is everything(),
# so all columns pass through the string functions below):
#   lower-case -> trim -> collapse whitespace runs -> drop a trailing period.
CONS_C4_25_df <- dplyr::mutate(
  CONS_C4_25,
  across(everything(), \(col) {
    lowered <- stringr::str_to_lower(col)
    trimmed <- stringr::str_trim(lowered)
    single_spaced <- stringr::str_replace_all(trimmed, "\\s+", " ")
    stringr::str_replace_all(single_spaced, "\\s*\\.\\s*$", "")
  })
)
##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
invisible("Obtain unique values by column, again")
# Distinct values of each column, as a list named after the columns.
unique_values_list_c40 <- lapply(CONS_C4_25_df, unique)
# Long tibble (element_name = column, subelement = value) of the C4 unique
# values that match the suspicious-character pattern, before any repair.
df_c4_problems0 <-
purrr::map_dfr(names(unique_values_list_c40), function(name) {
tibble(element_name = name, subelement = unique_values_list_c40[[name]])
})%>%
dplyr::filter(stringr::str_detect(subelement, "[^[:ascii:]]|Ã|Â|ã|â|\\?|\\\\|í|ì|î|ï|é|è|ê|ë|ó|ò|ô|õ|ö|ú|ù|û|ü|ñ|Ñ|ñ|¿|’|“|â€|ã³|ã©|ã|ãº|ã¼|ã²|ã¼|ã³|ã³|ã±|[@#$%^&*<>~`{}\\[\\]]"))
# Pre-repair checks on the C4 unique values. The codigo_identificacion and
# comentario variables are excluded. Each count is computed once and reused
# for both the test and the warning text (the row ordering step was dropped
# because it cannot change a row count).

# Distinct values that contain a literal '?'.
n_qmark_c40 <- list_to_df(unique_values_list_c40) |>
  filter(
    variable != "codigo_identificacion",
    variable != "comentario",
    grepl("[\\?]", value)
  ) |>
  nrow()
if (n_qmark_c40 > 0) {
  warning(paste0("Values with sign '?'= ", n_qmark_c40))
}

# Distinct values with any character outside \x20-\x7E (space to '~').
n_nonascii_c40 <- list_to_df(unique_values_list_c40) |>
  filter(
    variable != "codigo_identificacion",
    variable != "comentario",
    grepl("[^\x20-\x7E]", value)
  ) |>
  nrow()
if (n_nonascii_c40 > 0) {
  warning(paste0("Values with signs '´ “ '= ", n_nonascii_c40))
}
##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# 1) Generic ASCII/mojibake cleanup (keeps only plain ASCII, no accents/ñ)
# Named vector for stringr::str_replace_all(): name = pattern, value = ASCII
# replacement. Covers BOM / soft hyphen removal, quote and dash normalisation,
# and accented vowels / ñ (plain and mis-encoded forms).
# NOTE(review): both C4 versions of fix_text_ascii() lower-case the text
# before applying this table, so keys containing upper-case letters
# (e.g. "Ó", "Ü", "Ñ", "\u00C2") are not expected to match there.
# Several names occur more than once in this vector.
repl_ascii_generic <- c(
"\uFEFF" = "", "\u00C2" = "", "\u00AD" = "", "ÂÂ" = "", "ã‚â" = "",
"‘"="'","’"="'","“"="\"","�"="\"","´"="'","′"="'",
"–" = "-", "—" = "-", "\u2013" = "-", "\u2014" = "-",
# vowels/ü → plain ASCII
"á"="a","á"="a","ãƒâ¡"="a","á"="a",
"é"="e","é"="e","ãƒâ©"="e","é"="e",
"ÃÂ"="i","Ã"="i","ãƒâ"="i","í"="i",
"ó"="o","ó"="o","ãƒâ³"="o","ó"="o","Ó"="o",
"ú"="u","ú"="u","ãƒâº"="u","ú"="u",
"ü"="u","ü"="u","Ü"="u","Ü"="u","ãƒâ¼"="u","ü"="u",
# ñ/Ñ → n (ASCII-only requirement)
"ñ"="n","ñ"="n","ãƒâ±"="n","Ñ"="n","Ñ"="n","ñ"="n"
)
# 2) Targeted rules for the exact glitches you found (regex on the LEFT)
# Named vector for stringr::str_replace_all(); "\\?" is a literal '?' standing
# where an accented letter was lost. Used by the first C4 fix_text_ascii().
# NOTE(review): "\\b" placed right after "\\?" only matches when a word
# character follows the '?', so keys such as "\\bviolaci\\?\\b",
# "\\bcoca\\?\\b" and "\\bper\\?\\b" presumably miss a '?' followed by a space
# or the end of the string (repl_more later adds un-anchored variants).
# NOTE(review): pairs are applied in the order written; "t\\?ico" precedes the
# longer "t\\?ico superior ..." keys, which may therefore never match.
# Keys with a character class (e.g. "psicol\\?gic[oa]", "org\\?nic[oa]s?") map
# every matched ending to the single replacement given.
repl_c4 <- c(
# communes / regions
"\\bvi\\?a del mar\\b" = "vina del mar",
"\\bsan gregorio de \\?iquen\\b" = "san gregorio de niquen",
"\\bde \\?uble\\b" = "de nuble",
"iba\\?es" = "ibanez",
# occupations / conditions
"ocupaci\\?n" = "ocupacion",
"ocupaci\\?o" = "ocupacion",
"ocupaci\\?rregular" = "ocupacion irregular",
# law condition phrases
"condici\\?e tratamiento" = "condicion de tratamiento",
"condici\\?n de tratamiento" = "condicion de tratamiento",
# crimes/offences
"corrupci\\?e menores" = "corrupcion de menores",
"lesiones graves y grav\\?simas" = "lesiones graves y gravisimas",
"lesiones graves y grav\\?mas" = "lesiones graves y gravisimas",
"producci\\?e material pornogr\\?co"= "produccion de material pornografico",
"producci\\?n de material pornogr\\?fico" = "produccion de material pornografico",
"receptaci\\?" = "receptacion",
"robo con intimidaci\\?" = "robo con intimidacion",
"\\bviolaci\\?\\b" = "violacion",
# clinical (physical)
"megalobl\\?ica" = "megaloblastica",
"ferrop\\?ca" = "ferropenica",
"cardiopat\\?as" = "cardiopatias",
"cardiopat\\?:" = "cardiopatia:",
"miocardiopat\\? ?dilatada" = "miocardiopatia dilatada",
"som\\?cas" = "somaticas",
"som\\?ticas" = "somaticas",
"alcoh\\?lica" = "alcoholica",
"alcoh\\?a" = "alcoholica",
"hepatitis cr\\?a" = "hepatitis cronica",
"patolog\\?a bucal" = "patologia bucal",
"patolog\\?bucal" = "patologia bucal",
"patolog\\? de la gesti\\?n" = "patologia de la gestion",
"patolog\\?de la gesti\\? del ni\\?ntrauterino" = "patologia de la gestion del nin intrauterino",
# clinical (psychiatry)
"esquizot\\?pico" = "esquizotipico",
"esquizot\\?co" = "esquizotipico",
"h\\?bitos" = "habitos",
"h\\?tos" = "habitos",
"psicol\\?gic[oa]" = "psicologico",
"fisiol\\?gic[oa]s?" = "fisiologicas",
"som\\?tic[oa]s?" = "somaticos",
"psic\\?tic[oa]s?" = "psicoticos",
"neur\\?tic[oa]s?" = "neuroticos",
"org\\?nic[oa]s?" = "organicos",
"sintom\\?tic[oa]s?" = "sintomaticos",
"psic\\?os\\b" = "psicoticos",
"sue\\?" = "sueno",
"adaptaci\\?n" = "adaptacion",
# schooling
"educaci\\?n b\\?sica" = "educacion basica",
"\\(4 o m\\?a\\? completa" = "(4 o mas completa",
"\\(4 o m\\?a\\? incompleta" = "(4 o mas incompleta",
"t\\?cnica" = "tecnica",
"t\\?ico" = "tecnico",
"t\\?ico superior \\(1-3 a\\? completa" = "tecnico superior (1-3 anos) completa",
"t\\?ico superior \\(1-3 a\\? incompleta" = "tecnico superior (1-3 anos) incompleta",
"t\\?cnico superior \\(1-3 a\\?os\\)" = "tecnico superior (1-3 anos)",
"m\\?s a\\?os" = "mas anos",
# evaluation / outcomes
"logro m\\?mo" = "logro minimo",
"logro m\\?nimo" = "logro minimo",
# frequency/time
"\\b1 d\\? ?- semana\\b" = "1 dia - semana",
"\\b1 d\\?as - semana\\b" = "1 dias - semana",
"d\\?as" = "dias",
"d\\?a" = "dia",
"\\?ltimo" = "ultimo",
"^\\?mo\\b" = "ultimo",
# substances
"coca\\?na" = "cocaina",
"\\bcoca\\?\\b" = "cocaina",
"analg\\?sicos" = "analgesicos",
"analg\\?cos" = "analgesicos",
"hipn\\?ticos" = "hipnoticos",
"\\bhipn\\?os\\b" = "hipnoticos",
"alucin\\?genos" = "alucinogenos",
"alucin\\?os" = "alucinogenos",
# “otros problemas …”
"explotaci\\?n comercial sexual" = "explotacion comercial sexual",
"explotaci\\?omercial sexual" = "explotacion comercial sexual",
# nationality / country
"\\bper\\?\\b" = "peru",
"rep\\?ca dominicana" = "republica dominicana",
"\\bpais\\b" = "pais", # guard rail (replaces the word with itself)
# kinship
"cu\\?/a" = "cunado/a",
"c\\?nyuge" = "conyuge",
"s\\?del" = "solo del",
"s\\?lo" = "solo",
# housing / place
"hospeder\\?" = "hospederia",
"pensi\\?hostal" = "pension, hostal",
# routes of administration
"aspiraci\\?n de" = "aspiracion de",
"aspiraci\\?e" = "aspiracion de"
)
#' Reduce C4 free text to cleaned, lower-case ASCII (first pass).
#'
#' Replaces the earlier definition of the same name. Steps, in order: coerce
#' to character, lower-case, apply `repl_ascii_generic`, apply `repl_c4`, run
#' three heuristics on any remaining '?', transliterate with "Latin-ASCII",
#' squish whitespace.
#'
#' @param x A vector (character or factor in the caller below).
#' @return A character vector of the same length as `x`.
fix_text_ascii <- function(x) {
  x |>
    as.character() |>
    stringi::stri_trans_tolower() |>
    # generic mojibake / punctuation clean-up
    stringr::str_replace_all(repl_ascii_generic) |>
    # dataset-specific repairs
    stringr::str_replace_all(repl_c4) |>
    # heuristic: a '?' directly before 'n' or 't' becomes 'a'
    stringr::str_replace_all("\\?(?=n|t)", "a") |>
    # heuristic: "l?g" becomes "log"
    stringr::str_replace_all("l\\?g", "log") |>
    # heuristic: a '?' between two lower-case ASCII letters becomes 'o'
    stringr::str_replace_all("(?<=[a-z])\\?(?=[a-z])", "o") |>
    # final guard against stray accents
    stringi::stri_trans_general("Latin-ASCII") |>
    stringr::str_squish()
}
# Clean every character or factor column of CONS_C4_25_df with fix_text_ascii().
CONS_C4_25_df <- dplyr::mutate(
  CONS_C4_25_df,
  dplyr::across(
    dplyr::where(~ is.character(.x) || is.factor(.x)),
    ~ fix_text_ascii(.x)
  )
)
##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
invisible("Obtain unique values by column, again")
# Distinct values of each column after this pass, as a list named by column.
unique_values_list_c41 <- lapply(CONS_C4_25_df, unique)
# Long tibble (element_name = column, subelement = value) of the C4 unique
# values that still match the suspicious-character pattern after the first
# cleaning pass.
df_c4_problems1 <-
purrr::map_dfr(names(unique_values_list_c41), function(name) {
tibble(element_name = name, subelement = unique_values_list_c41[[name]])
})%>%
dplyr::filter(str_detect(subelement, "[^[:ascii:]]|Ã|Â|ã|â|\\?|\\\\|í|ì|î|ï|é|è|ê|ë|ó|ò|ô|õ|ö|ú|ù|û|ü|ñ|Ñ|ñ|¿|’|“|â€|ã³|ã©|ã|ãº|ã¼|ã²|ã¼|ã³|ã³|ã±|[@#$%^&*<>~`{}\\[\\]]"))
# Checks on the C4 unique values after the first cleaning pass. The
# codigo_identificacion and comentario variables are excluded. Each count is
# computed once and reused for both the test and the warning text (the row
# ordering step was dropped because it cannot change a row count).

# Distinct values that still contain a literal '?'.
n_qmark_c41 <- list_to_df(unique_values_list_c41) |>
  filter(
    variable != "codigo_identificacion",
    variable != "comentario",
    grepl("[\\?]", value)
  ) |>
  nrow()
if (n_qmark_c41 > 0) {
  warning(paste0("Values with sign '?'= ", n_qmark_c41))
}

# Distinct values with any character outside \x20-\x7E (space to '~').
n_nonascii_c41 <- list_to_df(unique_values_list_c41) |>
  filter(
    variable != "codigo_identificacion",
    variable != "comentario",
    grepl("[^\x20-\x7E]", value)
  ) |>
  nrow()
if (n_nonascii_c41 > 0) {
  warning(paste0("Values with signs '´ “ '= ", n_nonascii_c41))
}
##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# Extra rules for the exact strings you still have
# Named vector for stringr::str_replace_all() (name = regex, value =
# replacement), applied by the second C4 fix_text_ascii() right after
# repl_ascii_generic.
# Several targets appear twice, once anchored with "\\b" and once un-anchored
# (violaci, coca, per); the key "todos los d\\?" is also listed twice.
# NOTE(review): pairs are applied in the order written, so an earlier key can
# consume text that a later key was meant to catch - verify the ordering of
# the "a\\?" / "d\\?" rules against the data.
repl_more <- c(
# communes/regions
"\\b\\?\\?a\\b" = "na", # literal "??a" token → best ASCII guess "na"
"\\bvi\\? del mar\\b" = "vina del mar", # "vi? del mar"
"\\bvi\\?a del mar\\b" = "vina del mar", # "vi?a del mar" (keep too, just in case)
# occupation / condition
"raz\\?" = "razon", # "otra raz?" → "otra razon"
# crimes/offences
"\\bviolaci\\?\\b" = "violacion", # "violaci?"
"violaci\\?" = "violacion",
"todos los d\\?" = "todos los dias",
"derivaci\\?" = "derivacion",
# schooling/time phrases
"1-3 a\\?\\s+completa" = "1-3 anos completa",
"1-3 a\\?\\s+incompleta" = "1-3 anos incompleta",
# " 1 a 2 a?" / " 3 a 4 a?" → " 1 a 2 anos"
"(?<=\\d)\\s*a\\?\\b" = " anos",
"moa\\?" = "mas", # "5 o moa?" → "5 o mas"
# frequency/day tokens
"1 dias - semana" = "1 dia - semana",
"2-3 d\\? - semana" = "2-3 dias - semana",
"4-6 d\\? - semana" = "4-6 dias - semana",
"5 o mas$" = "5 o mas anos",
"3 a 4 a\\?" = "3 a 4 anos",
"1 a 2 aoos" = "1 a 2 anos",
"1 a 2 a\\?" = "1 a 2 anos",
"\\bd\\?\\b" = "dia", # "2-3 d? - semana"
"d\\?as" = "dias",
"d\\?a" = "dia",
"todos los d\\?" = "todos los dias",
# substances
"\\bcoca\\?\\b" = "cocaina",
"coca\\?" = "cocaina",
"no consumi\\?" = "no consumio",
# nationality / country
"\\bper\\?\\b" = "peru",
"per\\?" = "peru"
)
# repl_specific: word-level repairs for spellings produced by the earlier
# heuristic passes (regex on the left, ASCII replacement on the right).
# NOTE(review): within the code visible here, the fix_text_ascii() defined
# right after these tables applies repl_specific2 and pat_map_extra but not
# repl_specific - confirm whether this table is still needed.
repl_specific <- c(
"fafasico" = "fisico",
"orgocos" = "organicos",
"orgnaicos" = "organicos",
"terapoutica" = "terapeutica",
"terapoica" = "terapeutica",
"derivacian" = "derivacion",
"miocardiopatoa" = "miocardiopatia",
"gestian" = "gestion",
"\\bnioo\\b" = "nino",
"\\bnin\\b" = "nino",
"\\bsuenoo\\b" = "sueno",
"estado de nimo" = "estado de animo",
"neuroos" = "neurologicos",
"enfermedad moca" = "enfermedad medica",
"nioez" = "ninez",
"psicoloo" = "psicologico",
"aoos" = "anos",
# education phrases
"profesional \\(4 o mas completa" = "profesional (4 o mas anos) completa",
"profesional \\(4 o mas incompleta" = "profesional (4 o mas anos) incompleta",
"tecnico superior \\(1-3 anos completa" = "tecnico superior (1-3 anos) completa",
"tecnico superior \\(1-3 anos incompleta" = "tecnico superior (1-3 anos) incompleta",
"\\btoica\\b" = "tecnica",
"\\bnocontesta\\b" = "no contesta"
)
# repl_specific2: the table actually applied by the fix_text_ascii() below.
# It overlaps with repl_specific but differs on some targets: here
# "nioo" -> "ninez" and "moca" -> "morfologica", whereas repl_specific has
# "\\bnioo\\b" -> "nino" and "enfermedad moca" -> "enfermedad medica".
# The key "fisioloas" is listed twice.
repl_specific2 <-
c("nocontesta"="no contesta", "razan"="razon", "orgocos"= "organicos", "fisioloas"= "fisiologicas", "nioo"= "ninez", "moca"= "morfologica", "nioez"= "ninez", "suenoo"= "sueno", "admnistrativa"= "administrativa", "terapoutica"= "terapeutica", "derivacian"= "derivacion", "neuroos"= "neurologicos", "psicoloo"= "psicologico", "fisioloas"= "fisiologicas", "somocos"= "somaticos", "\\bafa['’\"]?uble\\b" = "nuble", # e.g., "afa'uble" → "nuble"
"\\brehabilitaciafa['’\"]?n\\b" = "rehabilitacion" # e.g., 'rehabilitaciafa"n' → "rehabilitacion"
)
# pat_map_extra: whole-phrase and residual typo repairs, applied after
# repl_specific2 by the fix_text_ascii() below.
pat_map_extra <- c(
# Mis-spellings of disorders
"trastornos mentales organicos, incluidos los sintomocos" =
"trastornos mentales organicos, incluidos los sintomaticos",
"trastornos del estado de nimo" =
"trastornos del estado de animo",
# Pathology of ...
"patologia de la gestion del nin intrauterino" =
"patologia de la gestion y de la ninez intrauterino",
"patologoa de la gestian y del ninez intrauterino" =
"patologia de la gestion y de la ninez intrauterino",
# Common typos / lexicals
"hospederiaa" = "hospederia",
"pensian" = "pension",
"cuoado/a" = "cunado/a",
"tocnicos" = "tecnicos",
"cientoficos" = "cientificos",
"cientocos" = "cientificos"
)
#' Reduce C4 free text to cleaned, lower-case ASCII (second pass).
#'
#' Replaces the earlier definitions of the same name. Steps, in order:
#' coerce to character and lower-case; apply `repl_ascii_generic`; apply
#' `repl_more`; run three heuristics on any remaining '?'; transliterate with
#' "Latin-ASCII" and squish whitespace; then apply `repl_ascii_generic` once
#' more followed by `repl_specific2` and `pat_map_extra`.
#' `repl_c4` is deliberately not applied in this version.
#'
#' The cleaned vector is returned explicitly; previously the result was only
#' the (invisible) value of the final assignment because the closing `x` was
#' commented out.
#'
#' @param x A vector (character or factor in the caller below).
#' @return A character vector of the same length as `x`.
fix_text_ascii <- function(x) {
  x <- as.character(x)
  x <- stringi::stri_trans_tolower(x)
  # 1) generic mojibake -> ASCII
  x <- stringr::str_replace_all(x, repl_ascii_generic)
  # 2) extra targeted rules
  x <- stringr::str_replace_all(x, repl_more)
  # 3) light heuristics for any stray '?'
  x <- stringr::str_replace_all(x, "\\?(?=n|t)", "a") # '?' before n/t -> 'a'
  x <- stringr::str_replace_all(x, "l\\?g", "log") # "l?g" -> "log"
  # any '?' between two lower-case ASCII letters -> 'o'
  x <- stringr::str_replace_all(x, "(?<=[a-z])\\?(?=[a-z])", "o")
  # 4) final ASCII guard + spacing
  x <- stringi::stri_trans_general(x, "Latin-ASCII")
  x <- stringr::str_squish(x)
  # 5) specific corrections (regex on the left, ASCII on the right)
  x <- stringr::str_replace_all(x, repl_ascii_generic)
  x <- stringr::str_replace_all(x, repl_specific2)
  x <- stringr::str_replace_all(x, pat_map_extra)
  x
}
# Second cleaning pass: run the redefined fix_text_ascii() over every
# character or factor column of CONS_C4_25_df.
CONS_C4_25_df <- dplyr::mutate(
  CONS_C4_25_df,
  dplyr::across(
    where(~ is.character(.x) || is.factor(.x)),
    ~ fix_text_ascii(.x)
  )
)
##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
invisible("Obtain unique values by column, again")
# Distinct values of each column after this pass, as a list named by column.
unique_values_list_c42 <- lapply(CONS_C4_25_df, unique)
# Long tibble (element_name = column, subelement = value) of the C4 unique
# values that still match the suspicious-character pattern after the second
# cleaning pass.
df_c4_problems2 <-
purrr::map_dfr(names(unique_values_list_c42), function(name) {
tibble(element_name = name, subelement = unique_values_list_c42[[name]])
})%>%
dplyr::filter(str_detect(subelement, "[^[:ascii:]]|Ã|Â|ã|â|\\?|\\\\|í|ì|î|ï|é|è|ê|ë|ó|ò|ô|õ|ö|ú|ù|û|ü|ñ|Ñ|ñ|¿|’|“|â€|ã³|ã©|ã|ãº|ã¼|ã²|ã¼|ã³|ã³|ã±|[@#$%^&*<>~`{}\\[\\]]"))
# Checks on the C4 unique values after the second cleaning pass. The
# codigo_identificacion and comentario variables are excluded. Each count is
# computed once and reused for both the test and the warning text (the row
# ordering step was dropped because it cannot change a row count).

# Distinct values that still contain a literal '?'.
n_qmark_c42 <- list_to_df(unique_values_list_c42) |>
  filter(
    variable != "codigo_identificacion",
    variable != "comentario",
    grepl("[\\?]", value)
  ) |>
  nrow()
if (n_qmark_c42 > 0) {
  warning(paste0("Values with sign '?'= ", n_qmark_c42))
}

# Distinct values with any character outside \x20-\x7E (space to '~').
n_nonascii_c42 <- list_to_df(unique_values_list_c42) |>
  filter(
    variable != "codigo_identificacion",
    variable != "comentario",
    grepl("[^\x20-\x7E]", value)
  ) |>
  nrow()
if (n_nonascii_c42 > 0) {
  warning(paste0("Values with signs '´ “ '= ", n_nonascii_c42))
}
#list_to_df(unique_values_list_c42) |> filter(variable!="codigo_identificacion", variable!="HASH_KEY", !grepl("fecha|edad|dias|numero|id|codigo",variable)) |> View()
Clean C5
Code
# Comprehensive encoding fix function (C5 tables).
#
# Lower-cases a character/factor vector and rewrites it to plain ASCII by
# applying, in this order: accent/mojibake substitutions, deletion of leftover
# mojibake pairs, repair of words that lost a letter, a dictionary for values
# where an accented letter became "?", typo fixes, generic "?" heuristics,
# accent stripping, transliteration and two final value dictionaries.
#
# @param x Character vector or factor (coerced with as.character()).
# @return Character vector of the same length as `x`; NA stays NA.
fix_encoding_complete <- function(x) {
  x <- as.character(x)
  x <- tolower(x)

  # Applied sequentially by str_replace_all(), so order matters.
  repl_ascii_generic <- c(
    # invisible residue: BOM, stray "Â", soft hyphen
    "\uFEFF" = "", "\u00C2" = "", "\u00AD" = "", "ÂÂ" = "", "ã‚â" = "",
    # typographic quotes / primes / dashes
    "‘" = "'", "’" = "'", "“" = "\"", "\u201D" = "\"", "\uFFFD" = "\"",
    "´" = "'", "′" = "'", "–" = "-", "—" = "-",
    # accented letters and their lower-cased mojibake -> plain ASCII
    "ãƒâ¡" = "a", "á" = "a",
    "ãƒâ©" = "e", "é" = "e",
    "ãƒâ³" = "o", "ó" = "o", "Ó" = "o",
    "ãƒâº" = "u", "ú" = "u",
    "ãƒâ¼" = "u", "ü" = "u", "Ü" = "u",
    "ãƒâ±" = "n", "ñ" = "n", "Ñ" = "n",
    # i-acute goes LAST: its mojibake is the bare prefix "ãƒâ" (the soft hyphen
    # that followed it was stripped above), so placing it earlier would eat
    # the prefix of the o/u/n sequences and leave "i³", "iº", "i±" behind.
    "ãƒâ" = "i", "ÃÂ" = "i", "Ã" = "i", "í" = "i"
  )
  x <- stringr::str_replace_all(x, repl_ascii_generic)

  # Step 1: Remove BOM, soft hyphen, zero-width and bidi control characters
  x <- gsub("[\uFEFF\u00AD\u200B-\u200F\u202A-\u202E]", "", x, perl = TRUE)

  # Step 2: Delete remaining mojibake pairs (a U+00C0-U+00FF character followed
  # by 1-3 characters in U+0080-U+00BF). This drops the letter entirely; the
  # corrections below put it back for known words.
  x <- gsub("[\u00C0-\u00FF][\u0080-\u00BF]{1,3}", "", x, perl = TRUE)

  # Words that lost a letter in step 2 (or still carry a junk symbol there).
  # The gap [^a-z0-9\s]* accepts only non-alphanumeric residue and the word
  # must start at a word boundary. The previous lazy ".*?" gap, anchored only
  # at "$", also rewrote healthy values ("violencia" -> "vina").
  word_corrections <- c(
    "^nicamente$" = "unicamente",
    "^uble$" = "nuble",
    "\\brehabilitaci[^a-z0-9\\s]*n$" = "rehabilitacion",
    "\\braz[^a-z0-9\\s]*n$" = "razon",
    "\\besquizot[^a-z0-9\\s]*pico$" = "esquizotipico",
    "\\bcuric[^a-z0-9\\s]*$" = "curico",
    "\\bb[^a-z0-9\\s]*o-b[^a-z0-9\\s]*o$" = "bio-bio",
    "\\bvi[^a-z0-9\\s]*a$" = "vina",
    "\\bays[^a-z0-9\\s]*n$" = "aysen",
    "\\bconcepci[^a-z0-9\\s]*n$" = "concepcion",
    "\\bm[^a-z0-9\\s]*nimo$" = "minimo",
    "\\bt[^a-z0-9\\s]*rmino$" = "termino",
    "\\bf[^a-z0-9\\s]*sico$" = "fisico",
    "\\botro g[^a-z0-9\\s]*nero$" = "otro genero"
  )
  for (pattern in names(word_corrections)) {
    x <- gsub(pattern, word_corrections[[pattern]], x, perl = TRUE)
  }

  # Step 3: Known values where an accented letter (sometimes with its
  # neighbours) became "?". Pairs are c(regex, replacement), applied in order.
  qmark_replacements <- list(
    # Communes and regions
    c("cha\\?ral", "chanaral"),
    c("vi\\?a del mar", "vina del mar"),
    c("vi\\? del mar", "vina del mar"),
    c("vicu\\?", "vicuna"),
    c("san gregorio de \\?iquen", "san gregorio de niquen"),
    c("san gregorio de \\?quen", "san gregorio de niquen"),
    c("de \\?ble", "de nuble"),
    c("de \\?uble", "de nuble"),
    c("iba\\?es", "ibanez"),
    c("iba\\?s", "ibanez"),
    # Education
    c("educaci\\?n", "educacion"),
    c("b\\?sica", "basica"),
    c("t\\?cnico", "tecnico"),
    c("t\\?cnica", "tecnica"),
    c("t\\?ico", "tecnico"),
    c("t\\?ica", "tecnica"),
    c("nunca estudi\\?", "nunca estudio"),
    c("a\\?os", "anos"),
    c("m\\?s", "mas"),
    c("\\(4 o m\\?s a\\?os\\)", "(4 o mas anos)"),
    c("\\(1-3 a\\?os\\)", "(1-3 anos)"),
    c("1-3 a\\?", "1-3 anos"),
    # Clinical terms
    c("esquizot\\?pico", "esquizotipico"),
    c("esquizot\\?co", "esquizotipico"),
    c("h\\?bitos", "habitos"),
    c("h\\?tos", "habitos"),
    c("psicol\\?gico", "psicologico"),
    c("psicol\\?gicos", "psicologicos"),
    c("psicol\\?os", "psicologicos"),
    c("fisiol\\?gicas", "fisiologicas"),
    c("fisiol\\?as", "fisiologicas"),
    c("som\\?tico", "somatico"),
    c("som\\?ticos", "somaticos"),
    c("som\\?ca", "somatica"),
    c("psic\\?tico", "psicotico"),
    c("psic\\?ticos", "psicoticos"),
    c("psic\\?os", "psicoticos"),
    c("neur\\?tico", "neurotico"),
    c("neur\\?ticos", "neuroticos"),
    c("neur\\?os", "neuroticos"),
    c("org\\?nico", "organico"),
    c("org\\?nicos", "organicos"),
    c("org\\?co", "organico"),
    c("sintom\\?tico", "sintomatico"),
    c("sintom\\?ticos", "sintomaticos"),
    c("sintom\\?cos", "sintomaticos"),
    c("adaptaci\\?n", "adaptacion"),
    c("adaptaci\\?", "adaptacion"),
    c("especificaci\\?n", "especificacion"),
    c("especificaci\\?", "especificacion"),
    c("transformaci\\?n", "transformacion"),
    c("transformaci\\?ersistente", "transformacion persistente"),
    c("lesi\\?n", "lesion"),
    c("lesi\\?", "lesion"),
    c("disfunci\\?n", "disfuncion"),
    c("disfunci\\?erebral", "disfuncion cerebral"),
    c("espec\\?fico", "especifico"),
    c("espec\\?ficos", "especificos"),
    # Medical/physical
    c("megalobl\\?ica", "megaloblastica"),
    c("ferrop\\?ca", "ferropenica"),
    c("cardiopat\\?as", "cardiopatias"),
    c("cardiopat\\?", "cardiopatia"),
    c("miocardiopat\\?", "miocardiopatia"),
    c("alcoh\\?lica", "alcoholica"),
    c("hepatitis cr\\?a", "hepatitis cronica"),
    c("hepatitis cr\\?nica", "hepatitis cronica"),
    c("patolog\\?a", "patologia"),
    c("patolog\\?", "patologia"),
    c("gesti\\?n", "gestion"),
    c("ni\\?o", "nino"),
    c("ni\\?", "nino"),
    # Evaluation
    c("logro m\\?nimo", "logro minimo"),
    c("logro m\\?mo", "logro minimo"),
    # Substances
    c("coca\\?na", "cocaina"),
    c("coca\\?", "cocaina"),
    c("analg\\?sicos", "analgesicos"),
    c("analg\\?cos", "analgesicos"),
    c("hipn\\?ticos", "hipnoticos"),
    c("hipn\\?os", "hipnoticos"),
    c("alucin\\?genos", "alucinogenos"),
    c("alucin\\?os", "alucinogenos"),
    # Time/frequency
    c("d\\?as", "dias"),
    c("d\\?a", "dia"),
    c("\\?ltimo", "ultimo"),
    c("\\?ltimos", "ultimos"),
    c("no consumi\\?", "no consumio"),
    # Other terms
    c("explotaci\\?n", "explotacion"),
    c("explotaci\\?exual", "explotacion sexual"),
    c("discriminaci\\?n", "discriminacion"),
    c("discriminaci\\?", "discriminacion"),
    c("violaci\\?n", "violacion"),
    c("violaci\\?", "violacion"),
    c("derivaci\\?n", "derivacion"),
    c("derivaci\\?", "derivacion"),
    c("orientaci\\?exuales", "orientacion sexuales"),
    c("estr\\?grave", "estres grave"),
    c("estr\\?", "estres"),
    c("s\\?rome amn\\?co", "sindrome amnesico"),
    c("per\\?", "peru"),
    c("rep\\?ca dominicana", "republica dominicana"),
    c("c\\?nyuge", "conyuge"),
    c("cu\\?ado", "cunado"),
    c("hospeder\\?", "hospederia"),
    c("pensi\\?", "pension"),
    c("aspiraci\\?n", "aspiracion"),
    c("aspiraci\\?", "aspiracion"),
    c("ocupaci\\?n", "ocupacion"),
    c("ocupaci\\?", "ocupacion"),
    c("condici\\?n", "condicion"),
    c("condici\\?", "condicion"),
    c("corrupci\\?", "corrupcion"),
    c("producci\\?n", "produccion"),
    c("producci\\?", "produccion"),
    c("pornogr\\?fico", "pornografico"),
    c("pornogr\\?co", "pornografico"),
    c("receptaci\\?", "receptacion"),
    c("intimidaci\\?", "intimidacion"),
    c("sue\\?", "sueno"),
    c("raz\\?", "razon")
  )
  for (repl in qmark_replacements) {
    x <- gsub(repl[1], repl[2], x, perl = TRUE)
  }

  # Step 4: Fix common typos and specific issues (regex, applied in order)
  typo_fixes <- c(
    "orgocos" = "organicos",
    "orgnaicos" = "organicos",
    "fafasico" = "fisico",
    "terapoutica" = "terapeutica",
    "terapoica" = "terapeutica",
    "derivacian" = "derivacion",
    "miocardiopatoa" = "miocardiopatia",
    "gestian" = "gestion",
    "\\bnioo\\b" = "nino",
    "\\bnin\\b" = "nino",
    "suenoo" = "sueno",
    "neuroos" = "neurologicos",
    "psicoloo" = "psicologico",
    "aoos" = "anos",
    "nioez" = "ninez",
    "toica" = "tecnica",
    "nocontesta" = "no contesta",
    "razan" = "razon",
    "fisioloas" = "fisiologicas",
    "somocos" = "somaticos",
    "moca" = "morfologica",
    "admnistrativa" = "administrativa",
    "hospederiaa" = "hospederia",
    "pensian" = "pension",
    "cuoado/a" = "cunado/a",
    "tocnicos" = "tecnicos",
    "cientoficos" = "cientificos",
    "cientocos" = "cientificos"
  )
  x <- stringr::str_replace_all(x, typo_fixes)

  # Step 5: Residue that contains a quote character (awkward as a dictionary key)
  x <- gsub("afa['\"]uble", "nuble", x, perl = TRUE)
  x <- gsub("rehabilitaciafa['\"]n", "rehabilitacion", x, perl = TRUE)

  # Step 6: Generic heuristics for remaining question marks
  # '?' directly before 'n' or 't' is taken to be 'a'
  x <- gsub("\\?(?=n|t)", "a", x, perl = TRUE)
  # 'l?g' -> 'log' (fisiol?gicas -> fisiologicas)
  x <- gsub("l\\?g", "log", x, perl = TRUE)
  # Any other '?' between two letters defaults to 'o'
  x <- gsub("(?<=[a-z])\\?(?=[a-z])", "o", x, perl = TRUE)

  # Step 7: Strip any accent that is still present
  accent_map <- c(
    "á"="a", "à"="a", "ä"="a", "â"="a", "ã"="a", "å"="a",
    "é"="e", "è"="e", "ë"="e", "ê"="e",
    "í"="i", "ì"="i", "ï"="i", "î"="i",
    "ó"="o", "ò"="o", "ö"="o", "ô"="o", "õ"="o",
    "ú"="u", "ù"="u", "ü"="u", "û"="u",
    "ñ"="n", "ç"="c",
    "Á"="A", "À"="A", "Ä"="A", "Â"="A", "Ã"="A", "Å"="A",
    "É"="E", "È"="E", "Ë"="E", "Ê"="E",
    "Í"="I", "Ì"="I", "Ï"="I", "Î"="I",
    "Ó"="O", "Ò"="O", "Ö"="O", "Ô"="O", "Õ"="O",
    "Ú"="U", "Ù"="U", "Ü"="U", "Û"="U",
    "Ñ"="N", "Ç"="C"
  )
  x <- stringr::str_replace_all(x, accent_map)

  # Step 8: Collapse a doubled 'n' at the end of a word ("razonn" -> "razon")
  x <- gsub("nn\\b", "n", x, perl = TRUE)

  # Step 9: Final cleanup - transliterate / drop anything still non-ASCII
  x <- iconv(x, from = "UTF-8", to = "ASCII//TRANSLIT", sub = "")

  # Step 10: Normalize whitespace and drop a trailing period
  x <- stringr::str_squish(x)
  x <- stringr::str_replace_all(x, "\\s*\\.\\s*$", "")

  # Value dictionary; keys are REGEX, so literal '?' and '(' must be escaped.
  replacements_c5 <- c(
    "vafanculos" = "vinculos",
    "araucanafaa" = "araucania",
    "bafao-bafao" = "bio-bio",
    "reloncavafa" = "reloncavi",
    "valparaafaso" = "valparaiso",
    "afa'uble" = "nuble",
    "espontoa" = "espontanea",
    "intervenciantegral" = "intervencion integral",
    "agresioexual" = "agresion sexual",
    "proteccioara" = "proteccion para",
    "intervencioreve" = "intervencion breve",
    "convina" = "convivencia",
    "esquizotafapico" = "esquizotipico",
    "especafafico" = "especifico",
    "tricotilomanafaa" = "tricotilomania",
    "especafaficos" = "especificos",
    "nervina" = "nerviosa",
    "psicotropas" = "psicotropicas",
    "sueno-vina" = "sueno-vigilia",
    "generalizada" = "generalizado",
    "hipomanafaaco" = "hipomaniaco",
    "cleptomanafaa" = "cleptomania",
    "especafafica" = "especifica",
    "lafamite" = "limite",
    "manafaaco" = "maniaco",
    "afnimo" = "animo",
    "patologafaa" = "patologia",
    "cocaafana" = "cocaina",
    "1 dafaas - semana" = "1 dia - semana",
    "menos de 1 dafaa - semana" = "menos de 1 dia - semana",
    "2-3 dafaas - semana" = "2-3 dias - semana",
    "4-6 dafaas - semana" = "4-6 dias - semana",
    "todos los dafaas" = "todos los dias",
    "afasnicamente" = "unicamente",
    "lesiones gravafasimaslesiones gravafasimas" = "lesiones gravisimas",
    "crafamenes" = "crimenes",
    "vafafa,actima" = "victima",
    "fiscalafafa,aa" = "fiscalia",
    # "(pe)" escaped: unescaped it is a capture group and never matches the
    # literal parentheses. "situacian" is fixed by a later entry.
    "programa especializados en temoca de nino ninoy/o adolescentes en situacioe calle \\(pe\\)" =
      "programa especializados en tematica de ninos, ninas y/o adolescentes en situacian de calle (pe)",
    "fiscaloa" = "fiscalia",
    "proteccian" = "proteccion",
    "intervencian" = "intervencion",
    "intervencioamiliar" = "intervencion familiar",
    "garant\\?" = "garantia",
    "fiscal\\?" = "fiscalia",
    "corporacioudicial" = "corporacion judicial",
    "prevencioomunitaria" = "prevencion comunitaria",
    "voimas" = "victimas",
    "corporaciafafa,n" = "corporacion",
    # was mapped to "corporacion juridica" (copy-paste from the entry above)
    "representaciafafa,n jurafafa,adica" = "representacion juridica",
    "reparaciafafa,n" = "reparacion",
    "daafafa,o" = "dano",
    "voctimas" = "victimas",
    "garantoa" = "garantia",
    "representacian" = "representacion",
    "situacian" = "situacion",
    "educaci\\?\\?ca" = "educacion basica",
    "cardiopatafaas" = "cardiopatias",
    "miocardiopatafaa" = "miocardiopatia",
    "ludopatafaa" = "ludopatia",
    "prisionizacion" = "prisionalizacion"
  )
  x <- stringr::str_replace_all(x, replacements_c5)

  # Literal (fixed) replacements for values with an unmatched '(' and a '?'
  replacements_c5_problematic <- c(
    "profesional (4 o moa? incompleta" = "profesional (4 o mas) incompleta",
    "profesional (4 o moa? completa" = "profesional (4 o mas) completa"
  )
  x <- stringi::stri_replace_all_fixed(
    x,
    names(replacements_c5_problematic),
    unname(replacements_c5_problematic),
    vectorize_all = FALSE
  )
  x
}
# Build CONS_C5_25_df from CONS_C5_25 by running fix_encoding_complete() over
# each character or factor column.
CONS_C5_25_df <- mutate(
  CONS_C5_25,
  across(
    where(function(col) is.character(col) | is.factor(col)),
    fix_encoding_complete
  )
)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# Named list: one element per column of CONS_C5_25_df, holding that column's
# distinct values.
unique_values_list_c53 <- lapply(
  setNames(nm = names(CONS_C5_25_df)),
  function(col_name) unique(CONS_C5_25_df[[col_name]])
)
# Post-cleaning sanity check for C5: warn when distinct values (outside the
# identifier and free-text comment columns) still contain a '?' or any
# character outside printable ASCII (0x20-0x7E).
# The filtered table is built once and each count is computed once; local()
# keeps the helper variables out of the global environment.
local({
  checked <- list_to_df(unique_values_list_c53) |>
    dplyr::filter(variable != "codigo_identificacion", variable != "comentario")
  n_qmark <- sum(grepl("[\\?]", checked[["value"]]))
  n_non_printable <- sum(grepl("[^\x20-\x7E]", checked[["value"]]))
  if (n_qmark > 0) {
    warning("Values with sign '?'= ", n_qmark, call. = FALSE)
  }
  if (n_non_printable > 0) {
    warning("Values with signs '´ “ '= ", n_non_printable, call. = FALSE)
  }
})
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# Dictionary of residual broken values -> corrected ASCII values.
# Keys are matched LITERALLY (it is passed to stringi::stri_replace_all_fixed
# via fixed_replace_all()), so '?', '(' and ')' need no escaping here.
# Entries are applied in order; later entries can finish earlier ones
# (e.g. "situacian" -> "situacion").
dict_fixed <- c(
  "vafanculos" = "vinculos",
  "araucanafaa" = "araucania",
  "bafao-bafao" = "bio-bio",
  "reloncavafa" = "reloncavi",
  "valparaafaso" = "valparaiso",
  "afa'uble" = "nuble",
  "espontoa" = "espontanea",
  "intervenciantegral" = "intervencion integral",
  "agresioexual" = "agresion sexual",
  "proteccioara" = "proteccion para",
  "intervencioreve" = "intervencion breve",
  "convina" = "convivencia",
  "esquizotafapico" = "esquizotipico",
  "especafafico" = "especifico",
  "tricotilomanafaa" = "tricotilomania",
  "especafaficos" = "especificos",
  "nervina" = "nerviosa",
  "psicotropas" = "psicotropicas",
  "sueno-vina" = "sueno-vigilia",
  "generalizada" = "generalizado",
  "hipomanafaaco" = "hipomaniaco",
  "cleptomanafaa" = "cleptomania",
  "especafafica" = "especifica",
  "lafamite" = "limite",
  "manafaaco" = "maniaco",
  "afnimo" = "animo",
  "patologafaa" = "patologia",
  "cocaafana" = "cocaina",
  "1 dafaas - semana" = "1 dia - semana",
  "menos de 1 dafaa - semana" = "menos de 1 dia - semana",
  "2-3 dafaas - semana" = "2-3 dias - semana",
  "4-6 dafaas - semana" = "4-6 dias - semana",
  "todos los dafaas" = "todos los dias",
  "afasnicamente" = "unicamente",
  "lesiones gravafasimaslesiones gravafasimas" = "lesiones gravisimas",
  "crafamenes" = "crimenes",
  "vafafa,actima" = "victima",
  "fiscalafafa,aa" = "fiscalia",
  # keep these exactly as written (with parentheses and ?), they are **literal** here:
  "programa especializados en temoca de nino ninoy/o adolescentes en situacioe calle (pe)" =
    "programa especializados en tematica de ninos, ninas y/o adolescentes en situacian de calle (pe)",
  "fiscaloa" = "fiscalia",
  "proteccian" = "proteccion",
  "intervencian" = "intervencion",
  "intervencioamiliar" = "intervencion familiar",
  "garant?" = "garantia",
  "fiscal?" = "fiscalia",
  "corporacioudicial" = "corporacion judicial",
  "prevencioomunitaria" = "prevencion comunitaria",
  "voimas" = "victimas",
  "corporaciafafa,n" = "corporacion",
  # was mapped to "corporacion juridica" (copy-paste from the entry above)
  "representaciafafa,n jurafafa,adica" = "representacion juridica",
  "reparaciafafa,n" = "reparacion",
  "daafafa,o" = "dano",
  "voctimas" = "victimas",
  "garantoa" = "garantia",
  "representacian" = "representacion",
  "situacian" = "situacion",
  "educaci??ca" = "educacion basica",
  # the problematic ones (with unmatched '('): fixed mode handles them safely
  "profesional (4 o moa? incompleta" = "profesional (4 o mas) incompleta",
  "profesional (4 o moa? completa" = "profesional (4 o mas) completa",
  "cardiopatafaas" = "cardiopatias",
  "miocardiopatafaa" = "miocardiopatia",
  "ludopatafaa" = "ludopatia",
  "prisionizacion" = "prisionalizacion"
)
# 2) Literal (non-regex) multi-replace over a whole vector.
#
# @param x    Vector to clean; coerced with as.character().
# @param dict Named character vector: names are the literal text to find,
#             values the text to put in its place. Entries are applied one
#             after another (vectorize_all = FALSE) to every element of `x`.
# @return Character vector of the same length as `x`.
fixed_replace_all <- function(x, dict) {
  stringi::stri_replace_all_fixed(
    str = as.character(x),
    pattern = names(dict),
    replacement = unname(dict),
    vectorize_all = FALSE
  )
}
# 3) Run the literal dictionary over every character or factor column of
#    CONS_C5_25_df (fixed matching, so no regex-syntax errors are possible).
CONS_C5_25_df <- mutate(
  CONS_C5_25_df,
  across(
    where(function(col) is.character(col) || is.factor(col)),
    function(col) fixed_replace_all(col, dict_fixed)
  )
)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# Named list: one element per column of CONS_C5_25_df, holding that column's
# distinct values (recomputed after the literal dictionary pass).
unique_values_list_c54 <- lapply(
  setNames(nm = names(CONS_C5_25_df)),
  function(col_name) unique(CONS_C5_25_df[[col_name]])
)
# Second post-cleaning sanity check for C5: warn when distinct values (outside
# the identifier and free-text comment columns) still contain a '?' or any
# character outside printable ASCII (0x20-0x7E).
# The filtered table is built once and each count is computed once; local()
# keeps the helper variables out of the global environment.
local({
  checked <- list_to_df(unique_values_list_c54) |>
    dplyr::filter(variable != "codigo_identificacion", variable != "comentario")
  n_qmark <- sum(grepl("[\\?]", checked[["value"]]))
  n_non_printable <- sum(grepl("[^\x20-\x7E]", checked[["value"]]))
  if (n_qmark > 0) {
    warning("Values with sign '?'= ", n_qmark, call. = FALSE)
  }
  if (n_non_printable > 0) {
    warning("Values with signs '´ “ '= ", n_non_printable, call. = FALSE)
  }
})
#list_to_df(unique_values_list_c54) |> filter(variable!="codigo_identificacion", variable!="HASH_KEY", !grepl("fecha|edad|dias|numero|id|codigo|n_",variable)) |> View()
Clean C6
Code
# Build CONS_C6_25_df from CONS_C6_25. The helper below is applied to EVERY
# column (across(everything())), not only to character ones.
CONS_C6_25_df <- dplyr::mutate(
  CONS_C6_25,
  across(everything(), function(col) {
    lowered <- stringr::str_to_lower(col)          # lower-case
    trimmed <- stringr::str_trim(lowered)          # drop leading/trailing blanks
    single_spaced <- stringr::str_replace_all(trimmed, "\\s+", " ")  # collapse runs
    # drop a trailing period together with the blanks around it
    stringr::str_replace_all(single_spaced, "\\s*\\.\\s*$", "")
  })
)
##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
invisible("Obtain unique values by column, again")
# Named list: one element per column of CONS_C6_25_df, holding that column's
# distinct values.
unique_values_list_c60 <- lapply(
  setNames(nm = names(CONS_C6_25_df)),
  function(col_name) unique(CONS_C6_25_df[[col_name]])
)
# Long table (element_name, subelement) of the distinct values that match the
# "suspicious character" regex: non-ASCII, '?', backslash or symbols.
df_c6_problems0 <- dplyr::filter(
  purrr::map_dfr(
    names(unique_values_list_c60),
    function(name) {
      tibble(element_name = name, subelement = unique_values_list_c60[[name]])
    }
  ),
  stringr::str_detect(subelement, "[^[:ascii:]]|Ã|Â|ã|â|\\?|\\\\|í|ì|î|ï|é|è|ê|ë|ó|ò|ô|õ|ö|ú|ù|û|ü|ñ|Ñ|ñ|¿|’|“|â€|ã³|ã©|ã|ãº|ã¼|ã²|ã¼|ã³|ã³|ã±|[@#$%^&*<>~`{}\\[\\]]")
)
##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# First C6 pass: turn lower-cased double-encoded UTF-8 sequences back into the
# accented letter. Keys are regex and are applied in order.
# The i-acute entry is the bare prefix "ãƒâ" (optionally followed by a soft
# hyphen, U+00AD), so it must come LAST; placed earlier it would eat the
# prefix of the other sequences and leave "í©", "í¡", "íº" behind.
replacements_c6_1 <- c(
  "ãƒâ±" = "ñ", "ãƒâ³" = "ó", "ãƒâ©" = "é",
  "ãƒâ¡" = "á", "ãƒâº" = "ú",
  "ãƒâ\u00AD?" = "í"
)
# Apply every entry of replacements_c6_1, in order, to one column.
replace_chars <- function(column) {
  stringr::str_replace_all(column, replacements_c6_1)
}
CONS_C6_25_df <- CONS_C6_25_df %>%
  dplyr::mutate(across(everything(), replace_chars))
##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
invisible("Obtain unique values by column, again")
# Named list: one element per column of CONS_C6_25_df, holding that column's
# distinct values.
unique_values_list_c61 <- lapply(
  setNames(nm = names(CONS_C6_25_df)),
  function(col_name) unique(CONS_C6_25_df[[col_name]])
)
# Long table (element_name, subelement) of the distinct values that still
# match the "suspicious character" regex.
df_c6_problems1 <- dplyr::filter(
  purrr::map_dfr(
    names(unique_values_list_c61),
    function(name) {
      tibble(element_name = name, subelement = unique_values_list_c61[[name]])
    }
  ),
  str_detect(subelement, "[^[:ascii:]]|Ã|Â|ã|â|\\?|\\\\|í|ì|î|ï|é|è|ê|ë|ó|ò|ô|õ|ö|ú|ù|û|ü|ñ|Ñ|ñ|¿|’|“|â€|ã³|ã©|ã|ãº|ã¼|ã²|ã¼|ã³|ã³|ã±|[@#$%^&*<>~`{}\\[\\]]")
)
##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# Second C6 pass: value-specific repairs (regex keys, applied in order).
# Changes with respect to the earlier version of this dictionary:
#  * truncation patterns accept the letters that sometimes survive after the
#    "?" (e.g. "intimidaci?n", "coca?na"), so they no longer produce
#    "intimidaciónn" / "cocaínana";
#  * the "(1-3 años ..." entries skip values that already have the closing
#    parenthesis, so re-running the step does not double it;
#  * invisible characters that may follow a mojibake prefix (U+0081, U+00AD)
#    are optional.
replacements_c6_2 <- c(
  "iba\\?s" = "ibáñez",
  "ibaãƒâ‘es" = "ibáñez",
  "ãƒâ‘uble" = "ñuble",
  "peãƒâ‘alolen" = "peñalolén",
  "viãƒâ‘a" = "viña",
  "peãƒâ‘aflor" = "peñaflor",
  "ãƒâ‘uãƒâ‘oa" = "ñuñoa",
  "vicuãƒâ‘a" = "vicuña",
  "chaãƒâ‘aral" = "chañaral",
  "doãƒâ‘ihue" = "doñihue",
  "hualaãƒâ‘e" = "hualañé",
  "ãƒâ‘iquen" = "ñiquén",
  "gendarmeríã‚â\u00AD?a" = "gendarmería",
  "intimidaci\\?n?" = "intimidación",
  "grav\\?mas" = "gravísimas",
  "receptaci\\?n?" = "receptación",
  "tr\\?co de estupefacientes" = "tráfico de estupefacientes",
  "t\\?ico" = "técnico",
  "\\(1-3 años completa(?!\\))" = "(1-3 años completa)",
  "\\(1-3 años incompleta(?!\\))" = "(1-3 años incompleta)",
  "otra raz\\?n?" = "otra razón",
  "ãƒâšnicamente" = "únicamente",
  "pensi\\?hostal" = "pensión, hostal",
  "hospeder\\?a?" = "hospedería",
  "ocupaci\\?rregular" = "ocupación irregular",
  "1 a 2 a\\?" = "1 a 2 años",
  "3 a 4 a\\?" = "3 a 4 años",
  "5 o m\\?a" = "5 o más años",
  "\\?mo 12 meses" = "últimos 12 meses",
  "\\?mo 6 meses" = "últimos 6 meses",
  "coca\\?(?:na)?" = "cocaína",
  "sintom\\?cos" = "sintomáticos",
  "psicol\\?o" = "psicológico",
  "ãƒâ\u0081?nimo" = "ánimo",
  "explotaci\\?omercial" = "explotación comercial",
  "logro m\\?mo" = "logro mínimo",
  "org\\?cos" = "orgánicos",
  "h\\?tos" = "hábitos",
  "fisiol\\?as" = "fisiológicas",
  "som\\?cos" = "somáticos",
  "esquizot\\?co" = "esquizotípico",
  "neur\\?os" = "neuróticos"
)
# Apply every entry of replacements_c6_2, in order, to one column.
replace_chars <- function(column) {
  stringr::str_replace_all(column, replacements_c6_2)
}
CONS_C6_25_df <- CONS_C6_25_df %>%
  dplyr::mutate(across(everything(), replace_chars))
##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
invisible("Obtain unique values by column, again")
# Named list: one element per column of CONS_C6_25_df, holding that column's
# distinct values.
unique_values_list_c62 <- lapply(
  setNames(nm = names(CONS_C6_25_df)),
  function(col_name) unique(CONS_C6_25_df[[col_name]])
)
# Long table (element_name, subelement) of the distinct values that still
# match the "suspicious character" regex.
df_c6_problems2 <- dplyr::filter(
  purrr::map_dfr(
    names(unique_values_list_c62),
    function(name) {
      tibble(element_name = name, subelement = unique_values_list_c62[[name]])
    }
  ),
  str_detect(subelement, "[^[:ascii:]]|Ã|Â|ã|â|\\?|\\\\|í|ì|î|ï|é|è|ê|ë|ó|ò|ô|õ|ö|ú|ù|û|ü|ñ|Ñ|ñ|¿|’|“|â€|ã³|ã©|ã|ãº|ã¼|ã²|ã¼|ã³|ã³|ã±|[@#$%^&*<>~`{}\\[\\]]")
)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# Accented vowel / n-tilde -> plain ASCII letter (literal, case-preserving).
replacements4 <- c(
  "á" = "a", "é" = "e", "í" = "i", "ó" = "o", "ú" = "u", "ñ" = "n",
  "Á" = "A", "É" = "E", "Í" = "I", "Ó" = "O", "Ú" = "U", "Ñ" = "N"
)
# Strip the accents listed in replacements4 from one column.
# Each key is substituted literally (fixed = TRUE), one after another.
replace_chars4 <- function(column) {
  Reduce(
    function(text, accented) {
      gsub(accented, replacements4[[accented]], text, fixed = TRUE)
    },
    names(replacements4),
    column
  )
}
# Strip accents from every column of CONS_C6_25_df.
CONS_C6_25_df <- dplyr::mutate(
  CONS_C6_25_df,
  across(everything(), function(col) replace_chars4(col))
)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# Comprehensive encoding fix function (C6 tables).
# NOTE: this re-defines fix_encoding_complete(); from here on this version
# replaces the one defined for C5.
#
# Lower-cases a character/factor vector and rewrites it to plain ASCII by
# applying, in this order: removal of invisible characters and leftover
# mojibake pairs, repair of words that lost a letter, a dictionary for values
# where an accented letter became "?", typo fixes, generic "?" heuristics,
# accent stripping, transliteration and a few final value patches.
#
# @param x Character vector or factor (coerced with as.character()).
# @return Character vector of the same length as `x`; NA stays NA.
fix_encoding_complete <- function(x) {
  x <- as.character(x)
  x <- tolower(x)

  # Step 1: Remove BOM, soft hyphen, zero-width and bidi control characters
  x <- gsub("[\uFEFF\u00AD\u200B-\u200F\u202A-\u202E]", "", x, perl = TRUE)

  # Step 2: Delete remaining mojibake pairs (a U+00C0-U+00FF character followed
  # by 1-3 characters in U+0080-U+00BF). This drops the letter entirely; the
  # corrections below put it back for known words.
  x <- gsub("[\u00C0-\u00FF][\u0080-\u00BF]{1,3}", "", x, perl = TRUE)

  # Words that lost a letter in step 2 (or still carry a junk symbol there).
  # The gap [^a-z0-9\s]* accepts only non-alphanumeric residue and the word
  # must start at a word boundary. The previous lazy ".*?" gap, anchored only
  # at "$", also rewrote healthy values ("bolivia" -> "bolivina",
  # "robo con violencia" -> "robo con vina").
  word_corrections <- c(
    "^nicamente$" = "unicamente",
    "^uble$" = "nuble",
    "\\brehabilitaci[^a-z0-9\\s]*n$" = "rehabilitacion",
    "\\braz[^a-z0-9\\s]*n$" = "razon",
    "\\besquizot[^a-z0-9\\s]*pico$" = "esquizotipico",
    "\\bcuric[^a-z0-9\\s]*$" = "curico",
    "\\bb[^a-z0-9\\s]*o-b[^a-z0-9\\s]*o$" = "bio-bio",
    "\\bvi[^a-z0-9\\s]*a$" = "vina",
    "\\bays[^a-z0-9\\s]*n$" = "aysen",
    "\\bconcepci[^a-z0-9\\s]*n$" = "concepcion",
    "\\bm[^a-z0-9\\s]*nimo$" = "minimo",
    "\\bt[^a-z0-9\\s]*rmino$" = "termino",
    "\\bf[^a-z0-9\\s]*sico$" = "fisico",
    "\\botro g[^a-z0-9\\s]*nero$" = "otro genero"
  )
  for (pattern in names(word_corrections)) {
    x <- gsub(pattern, word_corrections[[pattern]], x, perl = TRUE)
  }

  # Step 3: Known values where an accented letter (sometimes with its
  # neighbours) became "?". Pairs are c(regex, replacement), applied in order.
  qmark_replacements <- list(
    # Communes and regions (including specific cases from data)
    c("cha\\?ral", "chanaral"),
    c("vi\\?a del mar", "vina del mar"),
    c("vi\\? del mar", "vina del mar"),
    c("vicu\\?", "vicuna"),
    c("san gregorio de \\?iquen", "san gregorio de niquen"),
    c("san gregorio de \\?quen", "san gregorio de niquen"),
    c("de \\?ble", "de nuble"),
    c("de \\?uble", "de nuble"),
    c("iba\\?es", "ibanez"),
    c("iba\\?s", "ibanez"),
    # Education
    c("educaci\\?n", "educacion"),
    c("b\\?sica", "basica"),
    c("t\\?cnico", "tecnico"),
    c("t\\?cnica", "tecnica"),
    c("t\\?ico", "tecnico"),
    c("t\\?ica", "tecnica"),
    c("nunca estudi\\?", "nunca estudio"),
    c("a\\?os", "anos"),
    c("m\\?s", "mas"),
    c("\\(4 o m\\?s a\\?os\\)", "(4 o mas anos)"),
    c("\\(1-3 a\\?os\\)", "(1-3 anos)"),
    c("1-3 a\\?", "1-3 anos"),
    # Clinical terms
    c("esquizot\\?pico", "esquizotipico"),
    c("esquizot\\?co", "esquizotipico"),
    c("h\\?bitos", "habitos"),
    c("h\\?tos", "habitos"),
    c("psicol\\?gico", "psicologico"),
    c("psicol\\?gicos", "psicologicos"),
    c("psicol\\?os", "psicologicos"),
    c("fisiol\\?gicas", "fisiologicas"),
    c("fisiol\\?as", "fisiologicas"),
    c("som\\?tico", "somatico"),
    c("som\\?ticos", "somaticos"),
    c("som\\?ca", "somatica"),
    c("psic\\?tico", "psicotico"),
    c("psic\\?ticos", "psicoticos"),
    c("psic\\?os", "psicoticos"),
    c("neur\\?tico", "neurotico"),
    c("neur\\?ticos", "neuroticos"),
    c("neur\\?os", "neuroticos"),
    c("org\\?nico", "organico"),
    c("org\\?nicos", "organicos"),
    c("org\\?co", "organico"),
    c("sintom\\?tico", "sintomatico"),
    c("sintom\\?ticos", "sintomaticos"),
    c("sintom\\?cos", "sintomaticos"),
    c("adaptaci\\?n", "adaptacion"),
    c("adaptaci\\?", "adaptacion"),
    c("especificaci\\?n", "especificacion"),
    c("especificaci\\?", "especificacion"),
    c("transformaci\\?n", "transformacion"),
    c("transformaci\\?ersistente", "transformacion persistente"),
    c("lesi\\?n", "lesion"),
    c("lesi\\?", "lesion"),
    c("disfunci\\?n", "disfuncion"),
    c("disfunci\\?erebral", "disfuncion cerebral"),
    c("espec\\?fico", "especifico"),
    c("espec\\?ficos", "especificos"),
    # Medical/physical
    c("megalobl\\?ica", "megaloblastica"),
    c("ferrop\\?ca", "ferropenica"),
    c("cardiopat\\?as", "cardiopatias"),
    c("cardiopat\\?", "cardiopatia"),
    c("miocardiopat\\?", "miocardiopatia"),
    c("alcoh\\?lica", "alcoholica"),
    c("hepatitis cr\\?a", "hepatitis cronica"),
    c("hepatitis cr\\?nica", "hepatitis cronica"),
    c("patolog\\?a", "patologia"),
    c("patolog\\?", "patologia"),
    c("gesti\\?n", "gestion"),
    c("ni\\?o", "nino"),
    c("ni\\?", "nino"),
    # Evaluation
    c("logro m\\?nimo", "logro minimo"),
    c("logro m\\?mo", "logro minimo"),
    # Substances
    c("coca\\?na", "cocaina"),
    c("coca\\?", "cocaina"),
    c("analg\\?sicos", "analgesicos"),
    c("analg\\?cos", "analgesicos"),
    c("hipn\\?ticos", "hipnoticos"),
    c("hipn\\?os", "hipnoticos"),
    c("alucin\\?genos", "alucinogenos"),
    c("alucin\\?os", "alucinogenos"),
    # Time/frequency
    c("d\\?as", "dias"),
    c("d\\?a", "dia"),
    c("\\?ltimo", "ultimo"),
    c("\\?ltimos", "ultimos"),
    c("no consumi\\?", "no consumio"),
    # Other terms
    c("explotaci\\?n", "explotacion"),
    c("explotaci\\?exual", "explotacion sexual"),
    c("discriminaci\\?n", "discriminacion"),
    c("discriminaci\\?", "discriminacion"),
    c("violaci\\?n", "violacion"),
    c("violaci\\?", "violacion"),
    c("derivaci\\?n", "derivacion"),
    c("derivaci\\?", "derivacion"),
    c("orientaci\\?exuales", "orientacion sexuales"),
    c("estr\\?grave", "estres grave"),
    c("estr\\?", "estres"),
    c("s\\?rome amn\\?co", "sindrome amnesico"),
    c("per\\?", "peru"),
    c("rep\\?ca dominicana", "republica dominicana"),
    c("c\\?nyuge", "conyuge"),
    c("cu\\?ado", "cunado"),
    c("hospeder\\?", "hospederia"),
    c("pensi\\?", "pension"),
    c("aspiraci\\?n", "aspiracion"),
    c("aspiraci\\?", "aspiracion"),
    c("ocupaci\\?n", "ocupacion"),
    c("ocupaci\\?", "ocupacion"),
    c("condici\\?n", "condicion"),
    c("condici\\?", "condicion"),
    c("corrupci\\?", "corrupcion"),
    c("producci\\?n", "produccion"),
    c("producci\\?", "produccion"),
    c("pornogr\\?fico", "pornografico"),
    c("pornogr\\?co", "pornografico"),
    c("receptaci\\?", "receptacion"),
    c("intimidaci\\?", "intimidacion"),
    c("sue\\?", "sueno"),
    c("raz\\?", "razon"),
    # Additional patterns from data
    c("tr\\?fico", "trafico"),
    c("grav\\?simas", "gravisimas"),
    c("lesiones graves y grav\\?simas", "lesiones graves y gravisimas"),
    c("pensi\\?n", "pension"),
    c("residencial, pensi\\?n, hostal", "residencial, pension, hostal"),
    c("ocupaci\\?n irregular", "ocupacion irregular"),
    c("carlos iba\\?es del campo", "carlos ibanez del campo"),
    c("logro m\\?nimo", "logro minimo"),
    c("explotaci\\?n comercial sexual", "explotacion comercial sexual")
  )
  for (repl in qmark_replacements) {
    x <- gsub(repl[1], repl[2], x, perl = TRUE)
  }

  # Step 4: Fix common typos and specific issues (regex, applied in order).
  # Identity entries ("valparaiso" = "valparaiso", ...) were no-ops and are
  # omitted.
  typo_fixes <- c(
    "orgocos" = "organicos",
    "orgnaicos" = "organicos",
    "fafasico" = "fisico",
    "terapoutica" = "terapeutica",
    "terapoica" = "terapeutica",
    "derivacian" = "derivacion",
    "miocardiopatoa" = "miocardiopatia",
    "gestian" = "gestion",
    "\\bnioo\\b" = "nino",
    "\\bnin\\b" = "nino",
    "suenoo" = "sueno",
    "neuroos" = "neurologicos",
    "psicoloo" = "psicologico",
    "aoos" = "anos",
    "nioez" = "ninez",
    "toica" = "tecnica",
    "nocontesta" = "no contesta",
    "razan" = "razon",
    "fisioloas" = "fisiologicas",
    "somocos" = "somaticos",
    "moca" = "morfologica",
    "admnistrativa" = "administrativa",
    "hospederiaa" = "hospederia",
    "pensian" = "pension",
    "cuoado/a" = "cunado/a",
    "tocnicos" = "tecnicos",
    "cientoficos" = "cientificos",
    "cientocos" = "cientificos",
    # Doubled letters left behind by earlier replacement passes
    "intimidacionn" = "intimidacion",
    "receptacionn" = "receptacion",
    "otra razonn" = "otra razon",
    "cocainana" = "cocaina",
    "terapeuticaa" = "terapeutica",
    "alta admnistrativaa" = "alta administrativa",
    "\\bintimidacionn\\b" = "intimidacion",
    "\\breceptacionn\\b" = "receptacion",
    "\\brazonn\\b" = "razon",
    "\\bcondicionn\\b" = "condicion"
  )
  x <- stringr::str_replace_all(x, typo_fixes)

  # Step 5: Residue that contains a quote character (awkward as a dictionary key)
  x <- gsub("afa['\"]uble", "nuble", x, perl = TRUE)
  x <- gsub("rehabilitaciafa['\"]n", "rehabilitacion", x, perl = TRUE)

  # Step 5b: Collapse a doubled final letter after a vowel
  # ("intimidacionn" -> "intimidacion")
  x <- gsub("([aeiou])nn\\b", "\\1n", x, perl = TRUE)
  x <- gsub("([aeiou])aa\\b", "\\1a", x, perl = TRUE)
  x <- gsub("([aeiou])ss\\b", "\\1s", x, perl = TRUE)

  # Step 6: Generic heuristics for remaining question marks
  # '?' directly before 'n' or 't' is taken to be 'a'
  x <- gsub("\\?(?=n|t)", "a", x, perl = TRUE)
  # 'l?g' -> 'log' (fisiol?gicas -> fisiologicas)
  x <- gsub("l\\?g", "log", x, perl = TRUE)
  # Any other '?' between two letters defaults to 'o'
  x <- gsub("(?<=[a-z])\\?(?=[a-z])", "o", x, perl = TRUE)

  # Step 7: Strip any accent that is still present
  accent_map <- c(
    "á"="a", "à"="a", "ä"="a", "â"="a", "ã"="a", "å"="a",
    "é"="e", "è"="e", "ë"="e", "ê"="e",
    "í"="i", "ì"="i", "ï"="i", "î"="i",
    "ó"="o", "ò"="o", "ö"="o", "ô"="o", "õ"="o",
    "ú"="u", "ù"="u", "ü"="u", "û"="u",
    "ñ"="n", "ç"="c",
    "Á"="A", "À"="A", "Ä"="A", "Â"="A", "Ã"="A", "Å"="A",
    "É"="E", "È"="E", "Ë"="E", "Ê"="E",
    "Í"="I", "Ì"="I", "Ï"="I", "Î"="I",
    "Ó"="O", "Ò"="O", "Ö"="O", "Ô"="O", "Õ"="O",
    "Ú"="U", "Ù"="U", "Ü"="U", "Û"="U",
    "Ñ"="N", "Ç"="C"
  )
  x <- stringr::str_replace_all(x, accent_map)

  # Step 8: Collapse doubled n/a/s at the end of any word
  x <- gsub("nn\\b", "n", x, perl = TRUE)
  x <- gsub("aa\\b", "a", x, perl = TRUE)
  x <- gsub("ss\\b", "s", x, perl = TRUE)

  # Step 9: Final cleanup - transliterate, then drop anything still non-ASCII
  x <- iconv(x, from = "UTF-8", to = "ASCII//TRANSLIT", sub = "")
  x <- gsub("[^[:ascii:]]", "", x, perl = TRUE)

  # Step 10: Normalize whitespace and drop a trailing period
  x <- stringr::str_squish(x)
  x <- stringr::str_replace_all(x, "\\s*\\.\\s*$", "")

  # Value patches kept for data that already carries these damaged forms
  x <- stringr::str_replace_all(
    x,
    c(
      "bolivina" = "boliviana",
      "robo con vina" = "robo con violencia",
      "5 o mas anos\\?" = "5 o mas anos"
    )
  )
  x
}
# Run fix_encoding_complete() over each character or factor column of
# CONS_C6_25_df.
CONS_C6_25_df <- mutate(
  CONS_C6_25_df,
  across(
    where(function(col) is.character(col) | is.factor(col)),
    fix_encoding_complete
  )
)
##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
invisible("Obtain unique values by column, again")
# Named list: one element per column of CONS_C6_25_df, holding that column's
# distinct values.
unique_values_list_c63 <- lapply(
  setNames(nm = names(CONS_C6_25_df)),
  function(col_name) unique(CONS_C6_25_df[[col_name]])
)
# Post-cleaning sanity check for C6: warn when distinct values (outside the
# identifier and free-text comment columns) still contain a '?' or any
# character outside printable ASCII (0x20-0x7E).
# The filtered table is built once and each count is computed once; local()
# keeps the helper variables out of the global environment.
local({
  checked <- list_to_df(unique_values_list_c63) |>
    dplyr::filter(variable != "codigo_identificacion", variable != "comentario")
  n_qmark <- sum(grepl("[\\?]", checked[["value"]]))
  n_non_printable <- sum(grepl("[^\x20-\x7E]", checked[["value"]]))
  if (n_qmark > 0) {
    warning("Values with sign '?'= ", n_qmark, call. = FALSE)
  }
  if (n_non_printable > 0) {
    warning("Values with signs '´ “ '= ", n_non_printable, call. = FALSE)
  }
})
#list_to_df(unique_values_list_c63) |> filter(variable!="codigo_identificacion", variable!="HASH_KEY", !grepl("fecha|edad|dias|numero|id|codigo|n_meses|tiempo",variable)) |> View()
To close the project, we remove the polars objects.
Code
# Remove every workspace object whose name ends in "_pl" ...
rm(list = grep("_pl$", ls(), value = TRUE))
# ... then every remaining object with "_pl_" anywhere in its name.
rm(list = grep("_pl_", ls(), value = TRUE))
Session info
Code
#|echo: true
#|error: true
#|message: true
#|paged.print: true
# Report the value of the R_LIBS_USER environment variable.
message(paste0("R library: ", Sys.getenv("R_LIBS_USER")))Code
# Report the current time. Sys.time() is evaluated with LC_TIME set to 'C'.
# NOTE(review): the conversion to text happens in paste0(), after
# with_locale() has returned, so the locale switch may have no effect on the
# printed date - confirm.
message(paste0("Date: ",withr::with_locale(new = c('LC_TIME' = 'C'), code =Sys.time())))Code
# NOTE(review): `path` is not defined in this chunk; presumably it is set
# earlier in the document - confirm.
message(paste0("Editor context: ", path))Code
# Print the Quarto version reported by the quarto package.
cat("quarto version: "); quarto::quarto_version()quarto version:
[1] '1.7.29'
Code
# Capture the session details; sesion_info$packages is tabulated further below.
sesion_info <- devtools::session_info()Warning in system2(“quarto”, “-V”, stdout = TRUE, env = paste0(“TMPDIR=”, : el comando ejecutado ‘“quarto” TMPDIR=C:/Users/andre/AppData/Local/Temp/RtmpGgoTVt/file3034b8139ce -V’ tiene el estatus 1
Code
# Interactive table of the R packages recorded in sesion_info:
# keeps the package, loadedversion and source columns of sesion_info$packages
# and hands them to DT::datatable().
dplyr::select(
tibble::as_tibble(sesion_info$packages),
c(package, loadedversion, source)
) %>%
# filter = 'top' requests column filters above the table; colnames relabels
# columns 1-3 by position.
# NOTE(review): position 1 is presumably the row-name column that DT shows by
# default, which is why it is labelled 'Row number' - confirm.
DT::datatable(filter = 'top', colnames = c('Row number' =1,'Package' = 2, 'Version'= 3),
caption = htmltools::tags$caption(
style = 'caption-side: top; text-align: left;',
'', htmltools::em('R packages')),
options=list(
# The JS callback below applies compact CSS (font, size, no wrapping, tight
# line height) to the table body.
# NOTE(review): 'code-inline-font-size' does not look like a standard CSS
# property and is probably ignored by the browser - confirm.
initComplete = htmlwidgets::JS(
"function(settings, json) {",
"$(this.api().tables().body()).css({
'font-family': 'Helvetica Neue',
'font-size': '70%',
'code-inline-font-size': '15%',
'white-space': 'nowrap',
'line-height': '0.75em',
'min-height': '0.5em'
});",
"}")))Code
#|echo: true
#|error: true
#|message: true
#|paged.print: true
#|class-output: center-table
# Interactive table of the Python packages reported by
# reticulate::py_list_packages(), rendered with the same DT options as the
# R packages table.
# NOTE(review): colnames relabels only columns 1-3 by position; any further
# columns in the result keep their original names - confirm this is intended.
reticulate::py_list_packages() %>%
DT::datatable(filter = 'top', colnames = c('Row number' =1,'Package' = 2, 'Version'= 3),
caption = htmltools::tags$caption(
style = 'caption-side: top; text-align: left;',
'', htmltools::em('Python packages')),
options=list(
# The JS callback below applies compact CSS (font, size, no wrapping, tight
# line height) to the table body.
initComplete = htmlwidgets::JS(
"function(settings, json) {",
"$(this.api().tables().body()).css({
'font-family': 'Helvetica Neue',
'font-size': '70%',
'code-inline-font-size': '15%',
'white-space': 'nowrap',
'line-height': '0.75em',
'min-height': '0.5em'
});",
"}"))) Warning in system2(python, args, stdout = TRUE): el comando ejecutado ‘“G:/My Drive/Alvacast/SISTRAT 2023/.mamba_root/envs/py311/python.exe” -m pip freeze’ tiene el estatus 1
Save
Code
# Project root, always ending in a single "/".
# Only a trailing "/cons" component of the working directory is removed, so
# the result is the same whether the document runs from the project root or
# from its cons/ subfolder, and other occurrences of "cons" inside the path
# (for example a parent folder whose name contains it) are left untouched.
wdpath <- paste0(sub("/cons$", "", getwd()), "/")
# Base path on the "Mi unidad" drive: G: when the first run of letters in
# wdpath is exactly "G", otherwise E:. A single grepl() is used so that a
# path without any letters gives FALSE instead of a zero-length condition.
envpath <- if (grepl("^[^A-Za-z]*G([^A-Za-z]|$)", wdpath)) {
  "G:/Mi unidad/Alvacast/SISTRAT 2023/"
} else {
  "E:/Mi unidad/Alvacast/SISTRAT 2023/"
}
paste0(getwd(),"/cons")[1] "G:/My Drive/Alvacast/SISTRAT 2023/cons/cons"
Code
file.path(paste0(wdpath,"data/20241015_out"))[1] "G:/My Drive/Alvacast/SISTRAT 2023//data/20241015_out"
Code
file.path(paste0(envpath,"data/20241015_out"))[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/data/20241015_out"
Code
# Save
# Build a date-stamped file name under data/20241015_out, write the whole
# workspace there, and print where it went.
rdata_path <- file.path(
  wdpath,
  "data/20241015_out",
  sprintf("22_ndp_%s.Rdata", format(Sys.time(), "%Y_%m_%d"))
)
save.image(file = rdata_path)
cat("Saved in:", rdata_path)
Saved in: G:/My Drive/Alvacast/SISTRAT 2023///data/20241015_out/22_ndp_2025_09_27.Rdata
Code
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# Make the encryption key available in the PASSWORD_SECRET environment
# variable. On RStudio Server or inside Docker the variable is used as it is;
# otherwise it is read once from secret.txt in the project root. The secret is
# never copied into an R object, so it cannot end up inside the saved image.
if (!(Sys.getenv("RSTUDIO_SESSION_TYPE") == "server" || file.exists("/.dockerenv"))) {
  if (interactive()) {
    utils::savehistory(tempfile())
  }
  Sys.setenv(PASSWORD_SECRET = readLines(paste0(wdpath, "secret.txt"), warn = FALSE))
  if (interactive()) {
    utils::loadhistory()
  }
}
# Stop before anything is written when no key is available.
if (!nzchar(Sys.getenv("PASSWORD_SECRET"))) {
  stop("PASSWORD_SECRET is not set; the workspace image cannot be encrypted.",
       call. = FALSE)
}
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
rdata_enc_path <- paste0(rdata_path, ".enc")
save.image(rdata_enc_path)
# Encrypt the file in place. If encryption raises an error, delete the file so
# that no unencrypted image is left behind under an ".enc" name, then re-raise.
invisible(tryCatch(
  httr2::secret_encrypt_file(path = rdata_enc_path, key = "PASSWORD_SECRET"),
  error = function(e) {
    unlink(rdata_enc_path)
    stop(e)
  }
))
Warning in writeBin(enc, path): problema al escribir en la conexión
Code
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("Copy renv lock into cons folder\n")Copy renv lock into cons folder
Code
# Copy renv.lock from the project root into the cons/ folder, except on
# RStudio Server or inside Docker, where the copy is skipped.
if (Sys.getenv("RSTUDIO_SESSION_TYPE") == "server" || file.exists("/.dockerenv")) {
  message("Running on RStudio Server or inside Docker. Folder copy skipped.")
} else {
  # file.copy() signals failure through its return value rather than an
  # error, so success is only reported when it actually returned TRUE.
  if (isTRUE(file.copy(paste0(wdpath, "renv.lock"), paste0(wdpath, "cons/renv.lock"), overwrite = TRUE))) {
    message("Renv lock copy performed.")
  } else {
    warning("Renv lock copy failed: could not copy ", paste0(wdpath, "renv.lock"),
            " to ", paste0(wdpath, "cons/renv.lock"), call. = FALSE)
  }
}
Code
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# Record the end time and show the elapsed time since time_before_dedup_pre2.
# NOTE(review): time_before_dedup_pre2 is not defined in this chunk;
# presumably it is set near the start of the document - confirm.
time_after_dedup_pre2<-Sys.time()
# The label and the time difference are two separate top-level expressions,
# so each is printed on its own.
paste0("Time in markdown: ");time_after_dedup_pre2-time_before_dedup_pre2[1] "Time in markdown: "
Time difference of 6.472572 mins