Data Preparation and Standardization

This page will be used to comment and understand the steps, advances and doubts about the process of data preparation and standardization of the SISTRAT C1 (Convenio 1) which can be translated to Agreement on Collaboration, Technical Assistance and Transfer of Resources to General Population (“Convenio de Colaboración Técnica y de Transferencia de Recursos para Programa de Población General”). Must note that it does not include Adolescent criminal offenders (C2).

To see the TOP or Profile of Treatment Results (“Perfil de Resultados de Tratamiento”) data preparation, go to this webpage.

Building dataset

Define working directories that contain different text files that will be merged, correspondent to the years of the C1 Sistrat.

# Folder paths for the C1 yearly files and the TOP files: `path` with its
# "/SUD_CL" component removed, followed by the data sub-folder.
dir_c1 <- paste0(
  gsub(pattern = "/SUD_CL", replacement = "", x = path),
  "/Encriptado c1/Personas tratadas c1/"
)
dir_top <- paste0(
  gsub(pattern = "/SUD_CL", replacement = "", x = path),
  "/encriptados TOP/"
)
# In my case,
# dir_c1 <-toString(paste0("G:/Mi unidad/Alvacast/SISTRAT 2019 (github)","/Encriptado c1/Personas tratadas c1/"))
# dir_top <-toString(paste0("G:/Mi unidad/Alvacast/SISTRAT 2019 (github)","/encriptados TOP/"))

Define the files in the directory folder: only text files whose names start with “2” (the year) and end in “txt” are kept.

# List the yearly C1 text files: names that start with "2" (the year) and end
# in ".txt". The previous pattern "^[2].*\\s*txt$" carried a no-op `\\s*` and
# did not require the dot before the extension. TRUE replaces the reassignable
# shorthand T.
SISTRAT_c1 <- list.files(
  path = toString(dir_c1),
  all.files = TRUE,
  pattern = "^2.*\\.txt$"
)

SISTRAT_c1 is composed by the following files: * 2010tab-Resultado-20191113 (1).txt * 2011tab -Resultado-20191113.txt * 2012tab-Resultado-20191113.txt * 2013tab-Resultado-20191113.txt * 2014tab-Resultado-20191113.txt * 2015tab-Resultado-20191113.txt * 2016tab-Resultado-20191113.txt * 2017tab-Resultado-20191113.txt * 2018tab-Resultado-20191113.txt * 2019_EneOcttab-Resultado-20191113.txt

Function to read tab delimited text files, with UTF-8 encoding and, based on the first 7 letters of its name, assign them an object name.

#' Read one tab-delimited text file into a global data frame
#'
#' The file is read with `read.delim()` (header row, UTF-8 encoding, the
#' literal string "null" treated as NA) and the result is assigned in the
#' global environment under the first 7 characters of `filename`.
#'
#' @param dir Directory of the file. It is pasted directly in front of
#'   `filename`, so it must end with a path separator.
#' @param filename Name of the file inside `dir`.
#' @return The data frame, invisibly (the value returned by `assign()`).
read_excel_mult <- function(dir, filename) {
  # Files sharing the same first 7 characters overwrite each other's object.
  object_name <- substr(filename, 1, 7)
  contents <- read.delim(
    paste0(dir, filename),
    na.strings = "null",
    header = TRUE,
    fileEncoding = "UTF-8"
  )
  assign(object_name, contents, envir = .GlobalEnv)
}

Import datasets

To import the datasets, we apply the previous function to every dataset

# Read every listed file; each call stores one data frame in the global
# environment, named after the first 7 characters of the file name.
for (x in SISTRAT_c1) read_excel_mult(dir = as.character(dir_c1), filename = x)

Normalize datasets

Normalize datasets by defining a common name for hash key’s (“HASH_KEY”) and assign them a standardized name.

#' Standardize the hash-key column name of a yearly table
#'
#' Fetches the object named `x`, renames one column (selected by position) to
#' "HASH_KEY", coerces the result to a data frame and assigns it in the global
#' environment under the name `paste0(y, x)`.
#'
#' @param x Name (string) of the object to read.
#' @param y Prefix for the name of the object that is created.
#' @param hash_col Position of the hash-key column. Defaults to 91, the
#'   position that used to be hard-coded in this function.
#' @return The renamed data frame, invisibly (the value of `assign()`).
tab_10_18_mod <- function(x, y, hash_col = 91) {
  tab <- as.data.frame(get(x))
  if (length(hash_col) != 1L || is.na(hash_col) ||
      hash_col < 1 || hash_col > ncol(tab)) {
    stop(
      "`hash_col` must be a single column position between 1 and ",
      ncol(tab), " for table '", x, "'.",
      call. = FALSE
    )
  }
  # Refuse to create a second column called HASH_KEY.
  if ("HASH_KEY" %in% names(tab)[-hash_col]) {
    stop(
      "Table '", x, "' already has a different column named HASH_KEY.",
      call. = FALSE
    )
  }
  names(tab)[hash_col] <- "HASH_KEY"
  assign(paste0(y, as.character(x)), tab, envir = .GlobalEnv)
}
# Process the 2010-2018 tables; the results are stored with the prefix "c1_"
# (c1_2010tab, ..., c1_2018tab).
for (i in paste0(2010:2018, "tab")) {
  tab_10_18_mod(i, y = "c1_")
}

# Same normalization for the 2019 table, where the column renamed to HASH_KEY
# sits at position 92; the result is stored globally as c1_2019tab.
assign(
  paste0("c1_", "2019tab"),
  as.data.frame(
    dplyr::rename(`2019_En`, "HASH_KEY" = !!names(`2019_En`[92]))
  ),
  envir = .GlobalEnv
)

Append the datasets

Once column names are normalized, bind every year’s datasets by their rows.

# Stack the ten yearly tables (c1_2010tab ... c1_2019tab). The TABLE column
# identifies the source table of each row; fill = TRUE lets tables with
# different column sets be combined.
CONS_C1 <- rbindlist(
  mget(paste0("c1_", 2010:2019, "tab")),
  idcol = "TABLE",
  fill = TRUE
)
# Sequential row identifier, placed first. seq_len() replaces 1:nrow(), which
# would produce c(1, 0) on an empty table.
CONS_C1 <- CONS_C1 %>%
  dplyr::mutate(row = seq_len(nrow(CONS_C1))) %>%
  dplyr::select(row, everything())

Define and format variable names

Redefine variable names, create a column named ano_bd with the year. Also, reorder the variables to get the identifiers in the first columns.

Rename Variables

# Shorten the names of the key columns, derive the database year (ano_bd)
# from characters 4-7 of the source table name, move the identifiers to the
# front, sort by id and store the result globally as CONS_C1_df.
CONS_C1 %>%
  dplyr::rename(
    id = `Codigo.Identificación`,
    fech_ing = `Fecha.Ingreso.a.Tratamiento`,
    fech_egres = `Fecha.Egreso.de.Tratamiento`,
    dias_trat = `Dias.en.Tratamiento`,
    eva_consumo = `Evaluación.al.Egreso.Respecto.al.Patrón.de.consumo`,
    eva_fam = `Evaluación.al.Egreso.Respecto.a.Situación.Familiar`,
    eva_sm = `Evaluación.al.Egreso.Respecto.Salud.Mental`,
    eva_fisica = `Evaluación.al.Egreso.Respecto.Salud.Física`,
    eva_transgnorma = `Evaluación.al.Egreso.Respecto.Trasgresión.a.la.Norma.Social`,
    eva_relinterp = `Evaluación.al.Egreso.Respecto.Relaciones.Interpersonales`,
    eva_ocupacion = `Evaluación.al.Egreso.Respecto.a.Situación.Ocupacional`,
    evaluacindelprocesoteraputico = `Evaluación.del.Proceso.Terapéutico`,
    nmesesentratamiento = `N.Meses.en.Tratamiento`,
    motivodeegreso = `Motivo.de.Egreso`,
    tipo_centro = `Tipo.Centro`
  ) %>%
  dplyr::mutate(ano_bd = as.numeric(substr(TABLE, 4, 7))) %>%
  select(row, TABLE, HASH_KEY, ano_bd, everything()) %>%
  dplyr::arrange(id) %>%
  assign("CONS_C1_df", ., envir = .GlobalEnv)

Relevant variables are converted into factors.

# Existing columns that are converted to factors in place. The previous
# version converted many of these twice through separate mutate() calls;
# as.factor() leaves a factor unchanged, so each column is listed once.
cols_to_factor <- c(
  "motivodeegreso",
  "evaluacindelprocesoteraputico",
  "eva_consumo",
  "eva_fam",
  "eva_relinterp",
  "eva_ocupacion",
  "eva_sm",
  "eva_fisica",
  "eva_transgnorma",
  # UPDATE: JANUARY 2020
  "tipo_centro",
  "Servicio.de.Salud",
  "SENDA",
  "Origen.de.Ingreso",
  "País.Nacimiento",
  "Nacionalidad",
  "Etnia",
  "Estado.Conyugal",
  "Sustancia.de.Inicio",
  "X.Se.trata.de.una.mujer.embarazada.",
  "Escolaridad..último.año.cursado.",
  "Condicion.Ocupacional",
  "Categoría.Ocupacional",
  "Rubro.Trabaja",
  "Con.Quién.Vive",
  "Tipo.de.vivienda",
  "Tenencia.de.la.vivienda",
  "Sustancia.Principal",
  "Otras.Sustancias.nº1",
  "Otras.Sustancias.nº2",
  "Otras.Sustancias.nº3",
  "Frecuencia.de.Consumo..Sustancia.Principal.",
  "Vía.Administración..Sustancia.Principal.",
  "Diagnóstico.Trs..Consumo.Sustancia",
  "Diagnóstico.Trs..Psiquiátrico.DSM.IV",
  "Diagnóstico.Trs..Psiquiátrico.SUB.DSM.IV",
  "X2.Diagnóstico.Trs..Psiquiátrico.DSM.IV",
  "X2.Diagnóstico.Trs..Psiquiátrico.SUB.DSM.IV",
  "X3.Diagnóstico.Trs..Psiquiátrico.DSM.IV",
  "X3.Diagnóstico.Trs..Psiquiátrico.SUB.DSM.IV",
  "Diagnóstico.Trs..Psiquiátrico.CIE.10",
  "Diagnóstico.Trs..Psiquiátrico.SUB.CIE.10",
  "X2.Diagnóstico.Trs..Psiquiátrico.CIE.10",
  "X2.Diagnóstico.Trs..Psiquiátrico.SUB.CIE.10",
  "X3.Diagnóstico.Trs..Psiquiátrico.CIE.10",
  "X3.Diagnóstico.Trs..Psiquiátrico.SUB.CIE.10",
  "Diagnóstico.Trs..Físico",
  "Otros.Problemas.de.Atención.de.Salud.Mental",
  "Compromiso.Biopsicosocial",
  "DIAGNOSTICO.GLOBAL.DE.NECESIDADES.DE.INTEGRACION.SOCIAL",
  "DIAGNOSTICO.DE.NECESIDADES.DE.INTEGRACIóN.SOCIAL.EN.CAPITAL.HUMANO",
  "DIAGNOSTICO.DE.NECESIDADES.DE.INTEGRACIóN.SOCIAL.EN.CAPITAL.FISICO",
  "DIAGNOSTICO.DE.NECESIDADES.DE.INTEGRACIóN.SOCIAL.EN.CAPITAL.SOCIAL",
  "Usuario.de.Tribunales..Tratamiento.Drogas",
  "Consentimiento.Informado",
  "Tipo.Centro.Derivación",
  "Diagnóstico.Trastorno.Psiquiátrico.CIE.10.al.Egreso",
  "DIAGNOSTICO.GLOBAL.DE.NECESIDADES.DE.INTEGRACION.SOCIAL.1",
  "DIAGNOSTICO.DE.NECESIDADES.DE.INTEGRACIóN.SOCIAL.EN.CAPITAL.HUMANO.1",
  "DIAGNOSTICO.DE.NECESIDADES.DE.INTEGRACIóN.SOCIAL.EN.CAPITAL.FISICO.1",
  "DIAGNOSTICO.DE.NECESIDADES.DE.INTEGRACIóN.SOCIAL.EN.CAPITAL.SOCIAL.1",
  "Motivo.de.egreso.Alta.Administrativa",
  "Ha.estado.embarazada.egreso.",
  "identidad.de.genero",
  "discapacidad",
  "Opción.discapacidad"
)

CONS_C1_df %>%
  # New factor columns derived from existing ones; the source columns Sexo,
  # Tipo.de.Plan and Tipo.de.Programa are kept as they are.
  dplyr::mutate(
    sexo = as.factor(Sexo),
    embarazo = as.factor(`X.Se.trata.de.una.mujer.embarazada.`),
    tipo_de_plan = as.factor(`Tipo.de.Plan`),
    tipo_de_programa = as.factor(`Tipo.de.Programa`)
  ) %>%
  # In-place conversion of the listed columns; all_of() raises an error if
  # any of them is missing.
  dplyr::mutate(dplyr::across(dplyr::all_of(cols_to_factor), as.factor)) %>%
  assign("CONS_C1_df", ., envir = .GlobalEnv)

Standardize Dates

Dates of admission and discharge

Most of the dates were not formatted equally. When standardized, some cases failed to transform dates, specifically in dates of discharge, which contained 87 cases, while only one case was problematic in admission dates.

# Manual corrections of admission dates, keyed by the `row` identifier
# (written as text). These replace the former nested ifelse() chain.
fech_ing_fixes <- c(
  "14504" = "10/01/2011",
  "88379" = "09/07/2015", # UPDATE 2020-01-17: change of date of admission
  "79428" = "09/07/2015"  # UPDATE 2020-01-17: change of date of admission
)

# Manual corrections of discharge dates, keyed the same way.
fech_egres_fixes <- c(
  "36308" = "02/04/2013",
  "14083" = "03/05/2011",
  "6349"  = "01/07/2010",
  "42741" = "02/08/2013",
  "8608"  = "01/02/2011",
  "11709" = "01/02/2011",
  "40486" = "03/07/2013",
  "42521" = "09/07/2013",
  "5757"  = "04/10/2010", # changed
  "39507" = "02/07/2013",
  "3195"  = "04/10/2010",
  "37845" = "01/07/2013",
  "40180" = "07/08/2013",
  "28008" = "27/11/2012",
  "35971" = "01/03/2013",
  "5172"  = "03/05/2011",
  "10415" = "03/05/2011",
  "16385" = "30/08/2011",
  "39932" = "02/08/2013",
  "16983" = "06/09/2011",
  "88379" = "11/04/2016", # UPDATE 2020-01-17: change of date of discharge
  "79428" = "11/04/2016", # UPDATE 2020-01-17: change of date of discharge
  "37004" = "02/08/2013"
)

CONS_C1_df %>%
  # Rows listed in the lookup take the corrected text; all others keep
  # their original value.
  dplyr::mutate(
    fech_ing = ifelse(as.character(row) %in% names(fech_ing_fixes),
                      fech_ing_fixes[as.character(row)],
                      fech_ing)
  ) %>%
  # One case fails to parse.
  dplyr::mutate(
    fech_ing = lubridate::parse_date_time(fech_ing, c("%d/%m/%Y"),
                                          exact = TRUE)
  ) %>%
  # Keep the raw discharge value, just in case.
  dplyr::mutate(fech_egres_sin_fmt = fech_egres) %>%
  dplyr::mutate(
    fech_egres = ifelse(as.character(row) %in% names(fech_egres_fixes),
                        fech_egres_fixes[as.character(row)],
                        fech_egres)
  ) %>%
  # Surrounding whitespace is trimmed before parsing; the formats are given
  # in the same order as before.
  dplyr::mutate(
    fech_egres = lubridate::parse_date_time(
      stringr::str_trim(fech_egres),
      orders = c("%d/%m/%Y", "%d/%m/%y", "%d%m%Y"),
      exact = TRUE
    )
  ) %>%
  dplyr::arrange(desc(ano_bd)) %>%
  # 87 failed to parse; this dropped to 37, and to 30 after trimming spaces.
  assign("CONS_C1_df", ., envir = .GlobalEnv)

For discharge dates, many cases had different formats. Each value (e.g. 30-12-2019) is first interpreted in the “%d/%m/%Y” format; if it cannot be parsed that way, it is interpreted in the “DD/MM/YY” format (or “%d/%m/%y”). The values that still could not be parsed had to be transformed manually. Table 1 shows how each non-formatted date of discharge (fech_egres_sin_fmt) was translated into a plausible discharge date (fech_egres). One criterion is that, in many cases, the second and/or third 0 would possibly belong to the month. Otherwise, “7082013” or “9072013” would be read as “2013-8-70” and “2013-7-90”, respectively.

Table 1. Problematic cases of dates which required a manual change
Case	HASH	ID	Date of Admission	Date of Discharge	Date of Discharge (No Format)	Days in Treatment
36,308	b6373dccc2b0b5ffba42db746b17c692	CAPE1**051992	2013-01-29	2013-04-02	2013/04/02	NA
42,741	090f64b945afce17900e82eccb8f8da6	COMO2**091988	2013-07-02	2013-08-02	2082013	NA
40,486	4fc0a1cff37ad152767eb323ed3b43c0	ELPO2**111980	2013-05-02	2013-07-03	307203	NA
42,521	c668aa3153ea4a1a4435c45557feb80b	HUDO1**091969	2013-06-28	2013-07-09	9072013	NA
39,507	2f477adf8b5c282660fa16180eb5096d	LIDO2**031992	2013-04-29	2013-07-02	2072013	NA
37,845	fd919f36ada0b9423cf3acdead577150	MAZA1**111968	2013-03-18	2013-07-01	1072013	NA
40,180	c8a0c6ccce12e9d3fbc74c06d6a06664	MIMA2**111973	2013-05-29	2013-08-07	7082013	NA
35,971	132aa32661fbedca4ecdd1e2018c58e3	PARA1**021977	2013-01-02	2013-03-01	01-03-2013	NA
39,932	c87466931290f44c2c6a89bd127d31e5	RELA2**011969	2013-05-22	2013-08-02	2082013	NA
37,004	ad7eb82186ec39a61ad29aed61522c2a	SO-A2**061980	2013-02-08	2013-08-02	2082013	NA
28,008	3d50c479484a18e510d6be8f8a35a1ed	OMHU1**041990	2012-06-25	2012-11-27	27-11-2012	NA
14,083	68930653a30b13a6f6f7853d6683dff3	CLFI1**031973	2011-02-10	2011-05-03	30520011	NA
14,504	a7a1fcb4a638a7b194063d95e34320a9	CLVE2**111966	2011-01-10	2011-08-30	30/08/2011	232
11,709	52eeb63fbce34e95c7687dd6007efce8	DASA2**031986	2008-08-08	2011-02-01	01-02-2011	NA
10,415	4aa3462c09ad1a7cc1e6d249f620d715	PATR1**061986	2010-04-26	2011-05-03	3052011	NA
16,385	1454ce9c37a56516ce054a3513361a66	RAPA1**051983	2011-04-18	2011-08-30	30/08/20011	NA
16,983	5ebc5439f9db863ff8744778280360db	SERI1**011960	2011-05-13	2011-09-06	06-09-2011	NA
6,349	9a8fef6d37e8010c477c25db3e5e8455	CLGO1**101984	2010-06-17	2010-07-01	01707/2010	NA
8,608	52eeb63fbce34e95c7687dd6007efce8	DASA2**031986	2008-08-08	2011-02-01	01-02-2011	NA
5,757	4485da22c54dbef2c45c61a92b5f7cac	JAGO1**121972	2010-05-03	2010-10-04	4102010	NA
3,195	07a0a0c984ae3d9dfcd30b55fcb6beb2	LUME1**091957	2009-10-07	2010-10-04	4102010	NA
5,172	4aa3462c09ad1a7cc1e6d249f620d715	PATR1**061986	2010-04-26	2011-05-03	3052011	NA

As we can see in Table 2, some non-formatted dates of discharge (fech_egres_sin_fmt) had insufficient information to be converted into valid dates.

Table 2. Non-interpretable discharge dates
Year	case	DischargeDate	DischargeDate-NoFmt
2,011	12,006	NA	5
2,011	12,005	NA	5
2,011	12,227	NA	25
2,010	8,239	NA	15
2,010	9,043	NA	5
2,010	2,603	NA	20/0/2010
2,010	9,042	NA	5
2,010	194	NA	15
2,010	9,353	NA	25

UPDATE: 2019-12-11, had to format this date because it was badly labelled. CONS_C1_df[CONS_C1_df$row=="5757","fech_egres"] <- parse_date_time(str_trim("04/10/2010"), orders = c("%d/%m/%Y"),exact=T). UPDATE: 2020-01-17. Thanks to SENDA’s experts, they provided us with information about a case with missing values in dates of treatment. This date had to be formatted because it was badly labeled. CONS_C1_df[CONS_C1_df$row=="88379","fech_ing"] <- parse_date_time(str_trim("09/07/2015"), orders = c("%d/%m/%Y"),exact=T). UPDATE: 2020-01-17, this date had to be formatted because it was badly labeled. CONS_C1_df[CONS_C1_df$row=="79428","fech_ing"] <- parse_date_time(str_trim("09/07/2015"), orders = c("%d/%m/%Y"),exact=T). UPDATE: 2020-01-17, this date had to be formatted because it was badly labeled. CONS_C1_df[CONS_C1_df$row=="88379","fech_egres"] <- parse_date_time(str_trim("11/04/2016"), orders = c("%d/%m/%Y"),exact=T). UPDATE: 2020-01-17, this date had to be formatted because it was badly labeled. CONS_C1_df[CONS_C1_df$row=="79428","fech_egres"] <- parse_date_time(str_trim("11/04/2016"), orders = c("%d/%m/%Y"),exact=T). Still, there are 9752 cases with NULL dates of discharge.

Data Cleaning Steps provided by SENDA’s professional

These rules were provided by SENDA’s professionals, in order to select adequate data for the analysis.

1. Exclude non-SENDA treatments

In the first place, we should discard treatments not provided by SENDA. NULL values correspond to a single HASH key (“72c54d822de128e52f10511f3eb2d19f”, in datasets from 2010 and 2011) that did not have a plan and program.

Table 3. Treatment Admissions
SENDA	Frequency	%
No	7,762	4.8%
Si	155,382	95.2%
NA	2	0%
No= Admission under another agreement
Si= Admission under SENDA agreement

As we can see in Table 3, around 4.8% of cases had a treatment that could not be part of SENDA’s agreements. The case that did not take part under SENDA categories was admitted to a SENDA agreement in 2009/12/10 and in 2009/12/14. According to what SENDA’s professionals informed us in January 2020, many programs that were flagged as not part of SENDA may actually be part of SENDA. That is why we keep these programs anyway, despite what was originally stated.

2. Explore the type of plan

It is necessary to explore if there is any case that may have a treatment under a probation/parole (“libertad vigilada”) plan. It is necessary to consider the 3 null values in type of plans, equivalent to 2 HASHs, dropping the case with a plan in Probation/Parole.

## `summarise()` regrouping output by 'SENDA' (override with `.groups` argument)

Table 4. Treatment Plans
Type of Plan	No SENDA(n)	No SENDA(%)	Yes SENDA(n)	Yes SENDA(%)	NA (n)	NA (%)
CALLE	5	0%	10	0%	NA	NA
M-PAB	6	0%	109	0%	NA	NA
M-PAI	137	2%	10533	7%	NA	NA
M-PAI2	NA	NA	16	0%	NA	NA
M-PR	251	3%	7588	5%	NA	NA
M-PR2	NA	NA	10	0%	NA	NA
Otro	1778	23%	90	0%	NA	NA
PAI LV	NA	NA	1	0%	NA	NA
PG-PAB	2236	29%	54075	35%	NA	NA
PG-PAI	2233	29%	66300	43%	NA	NA
PG-PR	1116	14%	16599	11%	NA	NA
PG PAI 2	NA	NA	50	0%	NA	NA
NA	NA	NA	1	0%	2	100%
PR= Residential Program
PAB= Basic Outpatient Program
PAI= Intensive Outpatient Treatment
PAI LV= Probation/Parole
PAI= Intensive Outpatient Treatment

Table 4 shows that there is only 1 case in the probation/parole plan.

3. Changes in programs

Change fixed residential, flexible residential and residential programs into a general population residential program (PG PR). Additionally, flexible residential program cannot be under a probation/parole program.

Note that there are some plans that have a number 2, but they do not account for more than 0% of the total of the datasets. There are also some plans related to the Treatment of People Living on the Streets Program (Programa de Tratamiento de Personas en Situación de Calle). We should change plans ‘Otro’ and ‘Calle’ into ‘General Population’. Plans with number 2 should be grouped with their respective plan, leaving only M-PAI, M-PR, PG-PAB, PG-PAI, and PG-PR. For the moment, we are not sure what the fixed and flexible residential programs are.

Table 5. Contingency Tables of Programs by Plans
Type of Program	CALLE	M-PAB	M-PAI	M-PAI2	M-PR	M-PR2	Otro	PAI LV	PG-PAB	PG-PAI	PG-PR	PG PAI 2	NA
Libertad Vigilada	0	0	0	0	0	0	0	0	0	2	0	0	0
Otro	0	0	3	0	3	0	890	0	32	35	12	2	0
Programa Alcohol	0	0	0	0	0	0	0	0	0	80	0	0	0
Programa Calles	15	0	0	0	0	0	0	0	0	0	0	0	0
Programa Específico Mujeres	0	115	9,741	16	7,590	10	7	0	19	48	3	16	0
Programa Población General	0	0	926	0	246	0	971	1	56,260	68,368	17,700	32	0
NA	0	0	0	0	0	0	0	0	0	0	0	0	3

4. Plans within Alcohol Program

If there is a plan within the alcohol program, the plan must be modified into the general population plan.

Every of the 10, 80 cases in the alcohol program should be in the general population plan.

5. Another Plan

If there is any plan correspondent to Another (“Otro”), it changes into a general population program.

There are 1868 cases with treatments categorized as Another (“Otro”) plan.

6. Plan & Programs by Sex

Women should be in the M-PAI in which they actually are categorized in the general population program.

As can be seen in Table 6, around 58% of women are in the general population programs. Should they be removed or changed?

Table 6. Plan by Sex
Plan	Men (n)	Men (%)	Women (n)	Women (%)
CALLE	12	0%	3	0%
M-PAB	NA	NA	115	0%
M-PAI	86	0%	10,584	23%
M-PAI2	NA	NA	16	0%
M-PR	68	0%	7,771	17%
M-PR2	NA	NA	10	0%
Otro	1,305	1%	563	1%
PAI LV	1	0%	NA	NA
PG-PAB	44,527	38%	11,784	26%
PG-PAI	54,142	46%	14,391	31%
PG-PR	17,057	15%	658	1%
PG PAI 2	27	0%	23	0%
NA	3	0%	NA	NA
PR= Residential Program
PAB= Basic Outpatient Program
PAI= Intensive Outpatient Treatment
PAI LV= Probation/Parole
PAI= Intensive Outpatient Treatment

There are still a few doubts about these criteria:

Why are there men and women with female gender identity that are in the general population?, (Basic outpatient, intensive outpatient, Residential and Outpatient 2)
Should the 2 men and 12 women in M-PAI but with masculine gender identity be in the general population (PG PAI)?
If there is a woman with masculine gender identity in M-PR, should they not be in the general population (PG_PR)?

7. Plan & Programs by Sex

Only plans “M-PAI”, “M-PR”, “PG-PAB”, “PG-PAI” and “PG-PR” must be preserved. Considering Table 4, what happens with M-PAB (basic outpatient for women), “Calle”, “PG PAI 2”, “M-PAI2”, “M-PR2”, “PAI LV” or another (“Otro”)? (See Table 4)

8. Age of consumption onset

Treat the age of consumption onset as missing data if it is less than 5 years.

Figure 1. Histogram of Age of Consumption Begin

  # Two panels in one row: the full range of onset ages (0-80) and a zoom on
  # ages 0-10 with the frequency axis capped at 20000.
  par(mfrow = c(1, 2))
  hist(
    CONS_C1_df$Edad.Inicio.Consumo,
    breaks = 30,
    xlim = c(0, 80),
    main = "Distribution of Age\nof Consumption Begin",
    xlab = "Age",
    ylab = "Freq"
  )
  hist(
    CONS_C1_df$Edad.Inicio.Consumo,
    breaks = 30,
    xlim = c(0, 10),
    ylim = c(0, 20000),
    main = "Distribution of Age\nof Consumption Begin (zoom)",
    xlab = "Age",
    ylab = "Freq"
  )

There are around 1117 cases that showed a consumption onset of less than 5 years old. These should be declared as missing data.

9. Delete duplicated cases (same date of admission and ID)

In this case, we included the HASH-KEY as another and more robust criteria to filter duplicate cases.

Table 7. Repeated cases /Multiple Entrances
	Frequencies	Percentage
Repeated	45,526	27.9%
Unique	117,620	72.1%

As seen in Table 7, more than a quarter of the cases may be replicated across the different datasets. Considering the difficulties involved in determining repeated cases without dropping relevant information due to specific changes, this will be done later, once the dataset is a bit more normalized.

10. Current Age

Consider age as missing if the case was less than 18 years or greater than 90. UPDATE: This had been reconsidered as a result of a join with SENDA’s professionals, that stated that in specific cases people would have entered treatment when they were not yet adults (<18 years old).

Figure 2. Histogram of Actual Age

    # Three panels in one row: the full age distribution, a zoom on ages 0-20
    # and a zoom on ages 100-120. The titles previously read "Distrbution".
    par(mfrow = c(1, 3))
    hist(CONS_C1_df$Edad, main = "Distribution of Age",
         xlab = "Age", ylab = "Freq", breaks = 60)
    hist(CONS_C1_df$Edad, main = "Distribution of Age (zoom -)",
         xlab = "Age", ylab = "Freq", breaks = 60,
         xlim = c(0, 20), ylim = c(0, 2000))
    hist(CONS_C1_df$Edad, main = "Distribution of Age (zoom +)",
         xlab = "Age", ylab = "Freq", breaks = 60,
         xlim = c(100, 120), ylim = c(0, 100))

    # Number of records whose age (Edad) is below 18.
    dplyr::summarise(dplyr::filter(CONS_C1_df, Edad < 18), n())

    # Number of records whose age (Edad) is above 90.
    dplyr::summarise(dplyr::filter(CONS_C1_df, Edad > 90), n())

There were 673 cases with less than 18 years, and 24 cases with more than 90 years. Considering the complexities involved in the age and what should be done to retrieve it adequately, this will be done later, once other variables are normalized.

11. Days in Treatment

Consider days in treatment as missing if the case had more than 1095 days. Do the same for months in treatment.

Figure 3. Histogram of Days In Treatment

    # Two panels in one row: days in treatment over -100 to 4100, and a zoom
    # on 1000 to 4100 days with the frequency axis capped at 2000.
    par(mfrow = c(1, 2))
    hist(
      CONS_C1_df$dias_trat,
      breaks = 60,
      xlim = c(-100, 4100),
      main = "Distribution of\nDays in Treatment",
      xlab = "Days",
      ylab = "Freq"
    )
    hist(
      CONS_C1_df$dias_trat,
      breaks = 60,
      xlim = c(1000, 4100),
      ylim = c(0, 2000),
      main = "Distribution of\nDays in Treatment (zoom +)",
      xlab = "Days",
      ylab = "Freq"
    )

        #values out of boundaries

As seen in Figure 3, there are 2320 cases with more than 1095 days in treatment. On the other hand, there are 280 cases with negative or no days of treatment. Due to the complexities involved in the number of treatment days, deletion will be done later.

12. Pregnancy and Sex

Pregnant men should be considered as missing values. As seen in Table 8, 4 cases of men also reported pregnancy. These cases should be declared as missing.

Table 8. Pregnancy by Sex
Pregnancy	Man (n)	Man (%)	Women (n)	Women (%)
No	9515	8%	43466	95%
Si	4	0%	2430	5%
NA	107709	92%	22	0%
Si= Pregnant
No= Not Pregnant

13. Type of plan and Sex

See plans M-PR and M-PAI. Check if there are men. Keep the entries only if they identify as women.

As seen in Table 9, it must be noted that many cases do not answer this question, or perhaps it’s more plausible, this question was not present in previous datasets. This is why we may exclude NA’s (missing values) for the following analysis.

Table 9. Sex by Gender Identity
Gender Identity	Man (n)	Man (%)	Women (n)	Women (%)
Femenino	65	0%	3249	7%
Masculino	8012	7%	60	0%
NA	109151	93%	42609	93%
Femenino= Feminine
Masculino= Masculine

In Table 10, it can be seen that there are 2 men with a masculine gender identity that are subscribed to the M-PAI plan. However, what happens with the women with masculine gender identity in the M-PAI plan?

Table 10. Type of Plan by Gender Identity and Sex
Plan	Fem.-Men(n)	Fem.-Men(%)	Masc.-Men(n)	Masc.-Men(%)	Fem.-Women(n)	Fem.-Women(%)	Masc.-Women(n)	Masc.-Women(%)
M-PAI	2	3%	2	0%	877	27%	12	20%
M-PAI2	NA	NA	NA	NA	16	0%	NA	NA
M-PR	2	3%	NA	NA	505	16%	1	2%
M-PR2	NA	NA	NA	NA	10	0%	NA	NA
Otro	1	2%	25	0%	15	0%	NA	NA
PG-PAB	22	34%	2688	34%	690	21%	19	32%
PG-PAI	33	51%	4060	51%	1098	34%	28	47%
PG-PR	5	8%	1212	15%	26	1%	NA	NA
PG PAI 2	NA	NA	25	0%	12	0%	NA	NA
PR= Residential Program
PAB= Basic Outpatient Program
PAI= Intensive Outpatient Treatment
PAI LV= Probation/Parole
PAI= Intensive Outpatient Treatment

This can be done by replacing plans of men with masculine identities that are in M- plans, into the General Population. This should be resolved in the following sections.

14. Days of treatment

Consider days of treatment as missing if there are values of 0 and negative values.

As shown in Figure 3, there are 280 values of 0 or negative values. This must be resolved once treatment days are standardized and overlappings in treatments are corrected.

    # Preview (first rows) of the records with fewer than 1 day in treatment.
    # Two named columns and a set of columns picked by position are dropped,
    # the identifying columns are moved to the front, and the result is
    # rendered as a scrollable HTML table.
    CONS_C1_df %>%
      dplyr::filter(dias_trat < 1) %>%
      dplyr::select(
        -c(id, TABLE),
        -c(14, 16, 17, 26, 27, 28, 29, 35, 36, 37, 88, 93, 94, 96, 101)
      ) %>%
      dplyr::select(
        row, ano_bd, HASH_KEY, hash_rut_completo, id_mod, dias_trat,
        fech_ing, fech_egres, tipo_de_plan, tipo_de_programa,
        ID.centro, motivodeegreso, everything()
      ) %>%
      head() %>%
      knitr::kable(
        format = "html",
        format.args = list(decimal.mark = ".", big.mark = ","),
        caption = "Table 11. Cases with lower dates of treatment",
        align = rep("c", 101)
      ) %>%
      kable_styling(bootstrap_options = c("striped", "hover"), font_size = 8) %>%
      scroll_box(width = "100%", height = "350px")

Table 11. Cases with lower dates of treatment
row	ano_bd	HASH_KEY	id_mod	fech_ing	fech_egres	tipo_de_plan	tipo_de_programa	ID.centro	motivodeegreso	Nombre.Centro	tipo_centro	Región.del.Centro	Servicio.de.Salud	Tipo.de.Programa	Tipo.de.Plan	SENDA	Dias.en.SENDA	Edad	Nombre.Usuario	Comuna.Residencia	Origen.de.Ingreso	País.Nacimiento	Nacionalidad	Etnia	Estado.Conyugal	Fecha.Ultimo.Tratamiento	Sustancia.de.Inicio	Edad.Inicio.Consumo	X.Se.trata.de.una.mujer.embarazada.	Escolaridad..último.año.cursado.	Con.Quién.Vive	Tipo.de.vivienda	Tenencia.de.la.vivienda	Sustancia.Principal	Otras.Sustancias.nº1	Otras.Sustancias.nº2	Otras.Sustancias.nº3	Frecuencia.de.Consumo..Sustancia.Principal.	Edad.Inicio..Sustancia.Principal.	Vía.Administración..Sustancia.Principal.	Diagnóstico.Trs..Consumo.Sustancia	Diagnóstico.Trs..Psiquiátrico.DSM.IV	Diagnóstico.Trs..Psiquiátrico.SUB.DSM.IV	X2.Diagnóstico.Trs..Psiquiátrico.DSM.IV	X2.Diagnóstico.Trs..Psiquiátrico.SUB.DSM.IV	X3.Diagnóstico.Trs..Psiquiátrico.DSM.IV	X3.Diagnóstico.Trs..Psiquiátrico.SUB.DSM.IV	Diagnóstico.Trs..Psiquiátrico.CIE.10	Diagnóstico.Trs..Psiquiátrico.SUB.CIE.10	X2.Diagnóstico.Trs..Psiquiátrico.CIE.10	X2.Diagnóstico.Trs..Psiquiátrico.SUB.CIE.10	X3.Diagnóstico.Trs..Psiquiátrico.CIE.10	X3.Diagnóstico.Trs..Psiquiátrico.SUB.CIE.10	Diagnóstico.Trs..Físico	Otros.Problemas.de.Atención.de.Salud.Mental	Compromiso.Biopsicosocial	DIAGNOSTICO.GLOBAL.DE.NECESIDADES.DE.INTEGRACION.SOCIAL	DIAGNOSTICO.DE.NECESIDADES.DE.INTEGRACIóN.SOCIAL.EN.CAPITAL.HUMANO	DIAGNOSTICO.DE.NECESIDADES.DE.INTEGRACIóN.SOCIAL.EN.CAPITAL.FISICO	DIAGNOSTICO.DE.NECESIDADES.DE.INTEGRACIóN.SOCIAL.EN.CAPITAL.SOCIAL	Fecha.Ingreso.a.Convenio.SENDA	Usuario.de.Tribunales..Tratamiento.Drogas	Consentimiento.Informado	Tipo.Centro.Derivación	evaluacindelprocesoteraputico	eva_consumo	eva_fam	eva_relinterp	eva_ocupacion	eva_sm	eva_fisica	eva_transgnorma	Diagnóstico.Trastorno.Psiquiátrico.CIE.10.al.Egreso	DIAGNOSTICO.GLOBAL.DE.NECESIDADES.DE.INTEGRACION.SOCIAL.1	
DIAGNOSTICO.DE.NECESIDADES.DE.INTEGRACIóN.SOCIAL.EN.CAPITAL.HUMANO.1	DIAGNOSTICO.DE.NECESIDADES.DE.INTEGRACIóN.SOCIAL.EN.CAPITAL.FISICO.1	DIAGNOSTICO.DE.NECESIDADES.DE.INTEGRACIóN.SOCIAL.EN.CAPITAL.SOCIAL.1	Motivo.de.egreso.Alta.Administrativa	Consorcio	Ha.estado.embarazada.egreso.	sexo	embarazo	ano_nac
53,692	2,014	44e3de7dae9efc5f37695d7bef2a19ee	CAGA2**121982	2014-01-28	2014-01-28	M-PR	Programa Específico Mujeres	159	Abandono	Comunidad Terapeutica Crem (Hogares Crem)	privado	METROPOLITANA	Metropolitano Oriente	Programa Específico Mujeres	M-PR	Si	0	36	OCULTO	MACUL	Otro Centro Tratamiento Drogas	Chile	Chile	No pertenece	Soltero	último 6 meses	Alcohol	13	No	UNIVERSITARIA INCOMPLETA	Únicamente con hijos y padres o familia de origen	Casa	Allegado	Alcohol	Cocaína	Marihuana	Hongos	4-6 días - semana	13	Oral (bebida o comida)	Dependencia	En estudio	NA	NA	NA	NA	NA	En estudio	NA	NA	NA	NA	NA	Sin trastorno	Abuso Sexual	Severo	NA	NA	NA	NA	28/01/2014	No	Si	NA	Logro M?mo	Logro Mínimo	Logro Mínimo	Logro Mínimo	Logro Mínimo	Logro Mínimo	Logro Mínimo	Logro Mínimo	NA	NA	NA	NA	NA	NA	Opazo y Compañia Ltda.	NA	Mujer	No	1,982
53,553	2,014	86abe004a5c39f635f562c49d2329f7d	JODI2**021993	2014-01-06	2014-01-06	Otro	Programa Específico Mujeres	219	Alta Admnistrativa	Comunidad Terapeutica Romeral (Dianova Mujeres Maule)	privado	DEL MAULE	Maule	Programa Específico Mujeres	Otro	Si	0	26	OCULTO	TALCA	Otros de la Red de Salud General Público	Chile	Chile	No pertenece	Soltero	último 6 meses	Marihuana	13	No	BASICA INCOMPLETA	Únicamente con padres o familia de origen	Casa	Allegado	Pasta Base	Marihuana	NA	NA	Todos los días	15	Fumada o Pulmonar (aspiración de gases o vapores)	Dependencia	NA	NA	NA	NA	NA	NA	En estudio	NA	NA	NA	NA	NA	ETS	Explotaci?omercial Sexual	Moderado	NA	NA	NA	NA	06/01/2014	No	Si	NA	Logro M?mo	Logro Mínimo	Logro Mínimo	Logro Mínimo	Logro Mínimo	Logro Mínimo	Logro Mínimo	Logro Mínimo	NA	NA	NA	NA	NA	NA	ONG. Dianova Chile	NA	Mujer	No	1,993
53,402	2,014	ac3c9557b7a403430b8058e3099fa221	PAAL1**071976	2014-01-27	2014-01-27	Otro	Programa Población General	303	Alta Admnistrativa	CTR Nehuen (Residencial Melipilla)	publico	METROPOLITANA	Metropolitano Occidente	Programa Población General	Otro	Si	0	43	OCULTO	MELIPILLA	Consulta Espontánea	Chile	Chile	No pertenece	Casado	último 12 meses	Alcohol	19	NA	BASICA COMPLETA	Únicamente con la pareja e hijos	Mediagua	Propia	Pasta Base	NA	NA	NA	2-3 días - semana	24	Fumada o Pulmonar (aspiración de gases o vapores)	Dependencia	NA	NA	NA	NA	NA	NA	En estudio	NA	NA	NA	NA	NA	En estudio	Sin otros problemas de salud mental	Severo	NA	NA	NA	NA	27/01/2014	No	Si	NA	Logro M?mo	Logro Mínimo	Logro Mínimo	Logro Mínimo	Logro Mínimo	Logro Mínimo	Logro Mínimo	Logro Mínimo	NA	NA	NA	NA	NA	NA	SERVICIO DE SALUD METROPOLITANO OCCIDENTE	NA	Hombre	NA	1,976
36,706	2,013	b24c50c72c85669ea416ec86b3b11681	ALAC1**031981	2013-02-04	2013-02-04	PG-PAI	Programa Población General	148	Abandono	COSAM Quilicura	publico	METROPOLITANA	Metropolitano Norte	Programa Población General	PG-PAI	No	NA	38	OCULTO	QUILICURA	Consulta Espontánea	Chile	Chile	No pertenece	Soltero	NA	Marihuana	17	NA	MEDIA COMPLETA	Únicamente con pareja	NA	NA	Pasta Base	NA	NA	NA	4-6 días - semana	28	Fumada o Pulmonar (aspiración de gases o vapores)	Dependencia	NA	NA	NA	NA	NA	NA	En estudio	NA	NA	NA	NA	NA	Sin trastorno	Otros	Severo	NA	NA	NA	NA	NA	No	Si	NA	Logro M?mo	Logro Mínimo	Logro Mínimo	Logro Mínimo	Logro Mínimo	Logro Mínimo	Logro Mínimo	Logro Mínimo	NA	NA	NA	NA	NA	NA	SERVICIO DE SALUD METROPOLITANO NORTE	NA	Hombre	NA	1,981
36,713	2,013	c4f4e254f4e5224db6b81d5f0fdddb6e	ALAG1**021994	2013-02-06	2013-02-06	PG-PAI	Programa Población General	NA	Derivación	NA	NA	NA	NA	Programa Población General	PG-PAI	Si	0	25	OCULTO	SANTIAGO CENTRO	Otros de la Red de Salud General Privado	Chile	Chile	Mapuche	Casado	NA	Alcohol	0	NA	BASICA COMPLETA	Solo	Casa	Arrienda	Alcohol	Marihuana	Cocaína	Pasta Base	Todos los días	12	Fumada o Pulmonar (aspiración de gases o vapores)	Dependencia	NA	NA	NA	NA	NA	NA	En estudio	NA	NA	NA	NA	NA	Otras Enfermedades o Condiciones de Riesgo Vital	NA	Severo	NA	NA	NA	NA	06/02/2013	No	Si	Otro Centro	Logro M?mo	Logro Alto	Logro Alto	Logro Alto	Logro Alto	Logro Alto	Logro Intermedio	Logro Alto	NA	NA	NA	NA	NA	NA	NA	NA	Hombre	NA	1,994
45,004	2,013	ace460ae4527071c9039dc91dfd58092	ALCA1**121956	2013-10-14	2013-10-14	PG-PAB	Programa Población General	241	Alta Admnistrativa	CRS La Florida	publico	METROPOLITANA	Metropolitano Sur Oriente	Programa Población General	PG-PAB	No	NA	62	OCULTO	LA FLORIDA	Estab. de APS	Chile	Chile	No pertenece	Casado	NA	Alcohol	16	NA	MEDIA INCOMPLETA	Únicamente con la pareja e hijos	Departamento	Propia	Alcohol	Cocaína	NA	NA	2-3 días - semana	16	Oral (bebida o comida)	Consumo Perjudicial	En estudio	NA	NA	NA	NA	NA	En estudio	NA	NA	NA	NA	NA	En estudio	Sin otros problemas de salud mental	Moderado	NA	NA	NA	NA	NA	No	No	NA	Logro M?mo	Logro Mínimo	Logro Mínimo	Logro Mínimo	Logro Mínimo	Logro Mínimo	Logro Mínimo	Logro Mínimo	NA	NA	NA	NA	NA	NA	SERVICIO DE SALUD METROPOLITANO SUR ORIENTE	NA	Hombre	NA	1,956

15. Age of Consumption Onset and Current Age

Consider the age of consumption onset as missing data if it is greater than the current age.

In 780 cases, the age of consumption onset is greater than the current age. This will be corrected once ages are normalized.

# Table 12: show the first 50 records whose reported age of consumption onset
# is greater than the person's current age (flagged in the text as values to
# be treated as missing).
CONS_C1_df %>%
  # Keep only the inconsistent rows; comparisons that evaluate to NA
  # (either age missing) are dropped by filter().
  dplyr::filter(Edad.Inicio.Consumo > Edad) %>%
  dplyr::select(row, HASH_KEY, ano_bd, id_mod, Edad.Inicio.Consumo, Edad) %>%
  head(50) %>%
  knitr::kable(
    format = "html",
    format.args = list(decimal.mark = ".", big.mark = ","),
    caption = "Table 12. Cases with age of consumption begin greater than the actual age",
    align = rep("c", 101)
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), font_size = 8) %>%
  scroll_box(width = "100%", height = "350px")

Table 12. Cases with age of consumption begin greater than the actual age
row	HASH_KEY	ano_bd	id_mod	Edad.Inicio.Consumo	Edad
155,268	fb74493f56becc488867fe53c5573a08	2,019	ALAL2**012001	25	18
161,209	2a33829bf3464ee50d4c8c5087a81093	2,019	CORU2**021992	32	27
150,494	69c9e351e5132a9b487eaeaea44c0c77	2,019	GOMI1**062000	25	19
158,818	d6d3a38f49580254e4a9a876b2209b29	2,019	ISAM1**062001	20	18
151,947	910ad076dd1e17a321519f65c5feaa99	2,019	JALI1**061979	52	40
160,567	4935ae99848d64148eb2f1a0230c4ebb	2,019	KATE2**072001	24	18
159,445	a0eee7839dc689824f91b86db156f82f	2,019	MAES2**072001	22	18
153,734	7d92e6e8d55a12c1b308c78dbad0f5b9	2,019	MAPE1**031995	29	24
148,287	e586f94b6247f0e208a906f97f2420f2	2,019	NECU1**052000	21	19
146,598	1d91d6b2b75fe7a4d8e2a8dfebcfa16c	2,019	NELA1**121999	54	19
159,705	a6929b6c7987cce64b0ca1b3f3299bea	2,019	ROCO2**072001	20	18
160,788	f90dd528fbe2005a3a3cf88e4f873c83	2,019	RUTR2**051997	24	22
131,749	9018b21f72370bdeca005ab540d7c7be	2,018	ALNE1**101999	25	20
127,314	fc87d7f86b4e9f568fb50eccb29f3891	2,018	ANAR1**041999	24	20
136,733	07402f400b254ee4662b8a831365612b	2,018	DIJI1**051992	34	27
138,430	445bfb8e194056cb920c163d851c3f92	2,018	ELCA1**052000	30	19
125,808	578ae693be289de023125ddd2777dd1c	2,018	GARO1**012015	15	4
144,205	69c9e351e5132a9b487eaeaea44c0c77	2,018	GOMI1**062000	25	19
125,800	8d2dfe24c7a24d541a13b4e82b82eede	2,018	HEAV1**112014	13	4
131,633	e0a8c5bbde98b9ed30a52230a7a8b0cb	2,018	JACE2**061999	50	20
142,720	0c9be3e89c1a7576227d2204aada9c24	2,018	JUVA1**111993	26	25
126,276	9b2836d1a04bffaf371d93392573f692	2,018	MAGU2**051996	43	23
129,532	b24908c527faa1b7bd5a267d5dcabd45	2,018	MAHE2**091999	62	20
131,012	eaf995cb33d947ef6b637e0f89907f96	2,018	MIPE1**091999	25	20
127,827	6fc3007c689e78c3a6af220449514b4e	2,018	MYLO2**051986	46	33
140,084	e586f94b6247f0e208a906f97f2420f2	2,018	NECU1**052000	21	19
133,368	1d91d6b2b75fe7a4d8e2a8dfebcfa16c	2,018	NELA1**121999	54	19
130,850	865d8750fff6ca0711f52527bcd70b96	2,018	PAAV2**101997	25	22
129,149	ac8234f0a33399c702103d076a9a8de5	2,018	REVA2**081999	30	20
132,807	2174b5058b56e4471af11248a73f215b	2,018	ROVE1**121999	25	19
125,944	711029aaa5f478f2c55223d285eb84fe	2,018	SHMA1**082013	15	6
127,280	45fce40da4dfc892ba5d4154f8d6b5b6	2,018	SOVA2**121998	46	20
125,778	ad16844e62fd0a0db5ddd59ccffccbf6	2,018	SUUB2**102013	26	6
127,320	5ca64697dd399116abdd351906e52ab1	2,018	YESA1**031999	21	20
136,457	744991c3ca5e12e87f2a7925d254e5f6	2,018	YOBO2**022000	23	19
125,184	9018b21f72370bdeca005ab540d7c7be	2,017	ALNE1**101999	25	20
116,020	fc87d7f86b4e9f568fb50eccb29f3891	2,017	ANAR1**041999	24	20
106,706	c51ce03a395b620b21a70b0d028a77e6	2,017	ANMA1**031996	27	23
120,750	bde31338de6a454e11a31da13e9aa8ee	2,017	BAGA2**041996	25	23
106,134	68a1abebf175673e62b97b65fde63b50	2,017	BRHE1**092015	15	4
119,326	4908c125c91bed71becbd7cf1077e5b2	2,017	CALU2**071999	24	20
105,458	2cdd842e8b1102992ff2c060e7b46178	2,017	CASA1**122014	18	4
105,847	87afef689812b1ad6f576f0e83e5c963	2,017	DABU1**032015	14	4
107,001	31d3a088c8730a02c4ef7239a02e511f	2,017	DIBU2**041996	30	23
105,661	0d5782868d35d58892a0f938813f7608	2,017	EDHO1**052015	17	4
106,644	2817347e37b21d9ff166af0685b417ff	2,017	ENSA1**031996	25	23
108,082	d9e020a08875ef763ce0c3e06cbf1b9d	2,017	ERVA1**081995	25	24
124,962	99246ebba3341f26ce047e32d9c3742b	2,017	FEJA1**121999	36	19
105,599	e89eb1528cbb255f7a94ade170a93768	2,017	FRJA1**042015	16	4
124,952	4e11314413f0172ea97c6926639bff6f	2,017	FRMU1**011996	25	23

Around 666 of these cases could change once dates are resolved, because they involve users recorded as being under 18 years of age.