From bbb65242f1af5f601def1c0b971ed601d459b4f3 Mon Sep 17 00:00:00 2001
From: Mauricio 'Pacha' Vargas Sepulveda <m.sepulveda@mail.utoronto.ca>
Date: Thu, 14 Nov 2024 00:41:54 -0500
Subject: [PATCH] fix CRAN ubsan errors?

---
 README.md                                     |  36 ++-
 dev/04-test-equivalence.r                     | 267 +++++++++++++++++-
 rpkg/DESCRIPTION                              |   4 +-
 rpkg/NAMESPACE                                |   5 +-
 rpkg/R/redatam-package.R                      |   4 +-
 rpkg/R/utils.R                                | 144 +++++-----
 .../redatamlib/readers/FuzzyEntityParser.cpp  |   6 +
 .../readers/FuzzyVariableParser.cpp           |  12 +-
 .../readers/FuzzyVariableParser.hpp           |   2 +-
 9 files changed, 356 insertions(+), 124 deletions(-)

diff --git a/README.md b/README.md
index 0c60e1f..2508afd 100644
--- a/README.md
+++ b/README.md
@@ -10,37 +10,33 @@
 
 ## About
 
-Open Redatam is an open source software for extracting raw information from REDATAM databases.
+Open Redatam is open-source software for extracting raw information from REDATAM databases. It was created to recover information from REDATAM databases for statistical analysis using standard tools such as SPSS, STATA, R, etc.
 
-For a given census, such as the [Chilean Census 2017](https://redatam.org/cdr/descargas/censos/poblacion/CP2017CHL.zip), run the following command:
+This software is a full C++ ground-up rewrite of the original [Redatam Converter](https://github.com/discontinuos/open-redatam/blob/master/README-EN.md) created by Pablo de Grande and written in C#. Rewriting the original C# code in C++ allows for better portability and the ability to use the program within R, Python, and other languages.
 
-```bash
-redatam input-dir/dictionary.dicx output-dir
-```
+## For R and Python users (otherwise skip this section)
 
-Or use the desktop app:
+**If you use R**: We have an R [package](rpkg) 📦 that allows you to read REDATAM databases directly in R.
 
-![Open Redatam GUI](gui-demo.png)
+**If you use Python**: We have a Python [package](pypkg) 📦 that allows you to read REDATAM databases directly in Python.
 
-The REDATAM database will be exported to CSV files and an XML summary of the tables and variables. It was created to recover information of REDATAM databases for statistical analysis using standard tools such as SPSS, STATA, R, etc.
+**If you only need the processed data**: We provide tidied [microdata](https://github.com/pachadotdev/redatam-microdata/releases) 📊 for R in RDS format.
 
-This software is a full C++ ground-up rewrite of the original [Redatam Converter](https://github.com/discontinuos/open-redatam/blob/master/README-EN.md) created by Pablo de Grande and written in C#. Rewriting the original C# code in C++ allows for better portability and the ability to use the program within R, Python, and other languages.
+## Usage
 
-**For the R package that allows to directly read REDATAM databases in R, see the [rpkg](rpkg) directory.**
+For a given census, such as the [Chilean Census 2017](https://redatam.org/cdr/descargas/censos/poblacion/CP2017CHL.zip), the following options are equivalent.
 
-**For the Python package that allows to directly read REDATAM databases in Python, see the [pypkg](pypkg) directory.**
+### Desktop app
 
-**If you only need the processed data, you can download the [microdata repository](https://github.com/pachadotdev/redatam-microdata/releases). It is available in RDS format for easy loading into R.**
+![Open Redatam GUI](gui-demo.png)
+
+### Command line
 
-**Available datasets:**
+```bash
+redatam input-dir/dictionary.dicx output-dir
+```
 
-- **Argentina: 1991, 2001, 2010**
-- **Bolivia: 2001, 2012**
-- **Chile: 2017**
-- **Ecuador: 2010**
-- **El Salvador: 2007**
-- **Guatemala: 2018**
-- **Mexico: 2000**
+The REDATAM database will be exported to CSV files and an XML summary of the tables and variables.
 
 ## Installation
 
diff --git a/dev/04-test-equivalence.r b/dev/04-test-equivalence.r
index ab9a33d..de5a80f 100644
--- a/dev/04-test-equivalence.r
+++ b/dev/04-test-equivalence.r
@@ -48,7 +48,8 @@ generate_tables <- function(d_control, file_name, country_code, census_year, var
     ungroup() %>%
     mutate(pct_rdtm = n_rdtm / sum(n_rdtm) * 100)
 
-  if (file_name == "downloads/redatam/CP2012BOL/open-redatam-dicx-to-csv/PERSONA.csv.gz") {
+  if (file_name == "downloads/redatam/CP2012BOL/open-redatam-dic-to-csv/PERSONA.csv.gz" |
+      file_name == "downloads/redatam/CP2012BOL/open-redatam-dicx-to-csv/PERSONA.csv.gz") {
     d_sex$sex <- c(2L,1L)
   }
 
@@ -100,9 +101,7 @@ generate_tables <- function(d_control, file_name, country_code, census_year, var
   return(list(d_control_sex = d_control_sex, d_control_age = d_control_age))
 }
 
-generate_tables_r <- function(d_control, country_code, census_year, var_sex, var_age) {
-  # just Chile, it is identical to the other validation function
-  file_name <- "downloads/CP2017CHL/BaseOrg16/CPV2017-16.dic"
+generate_tables_r <- function(d_control, file_name, country_code, census_year, var_sex, var_age) {
   # Process d_control for sex
   d_control_sex <- d_control %>%
     filter(country == country_code, year == census_year) %>%
@@ -124,12 +123,23 @@ generate_tables_r <- function(d_control, country_code, census_year, var_sex, var
 
   names(d_external)
 
+  if (any(file_name %in% c("downloads/redatam/CP2007PER/CP2007PER/BasePub/CPV2007PER_PUB.dic",
+                           "downloads/redatam/CP2007PER/CP2007PER/BasePub/CPV2007PER_PUB.dicx"))) {
+    # rename d_external$poblacio as d_external$persona
+    names(d_external)[which(names(d_external) == "poblacio")] <- "persona"
+  }
+  
   d_sex <- d_external$persona %>%
     group_by(sex = !!sym(var_sex)) %>%
     count(name = "n_rdtm") %>%
     ungroup() %>%
     mutate(pct_rdtm = n_rdtm / sum(n_rdtm) * 100)
 
+  if (file_name == "downloads/redatam/CP2012BOL/BaseMunicipio_V3/CPV2012Municipio.dic" |
+      file_name == "downloads/redatam/CP2012BOL/BaseMunicipio_V3/CPV2012Comunidad.dicx") {
+    d_sex$sex <- c(2L, 1L)
+  }
+
   # Bind columns and calculate differences for sex
   d_control_sex <- d_control_sex %>%
     left_join(d_sex) %>%
@@ -184,8 +194,28 @@ generate_tables_r <- function(d_control, country_code, census_year, var_sex, var
 # p08 = sex
 # p09 = age
 
+result <- generate_tables(
+  d_control,
+  "downloads/redatam/CP2017CHL/open-redatam-dic-to-csv/PERSONA.csv.gz",
+  152L,
+  2017L,
+  "p08",
+  "p09"
+)
+
+result2 <- generate_tables_r(
+  d_control,
+  "downloads/redatam/CP2017CHL/BaseOrg16/CPV2017-16.dic",
+  152L,
+  2017L,
+  "p08",
+  "p09"
+)
+
+all.equal(result$d_control_sex, result2$d_control_sex)
+all.equal(result$d_control_age, result2$d_control_age)
+
 result <- generate_tables(d_control,
-  # "downloads/redatam/CP2017CHL/open-redatam-dic-to-csv/PERSONA.csv.gz",
   "downloads/redatam/CP2017CHL/open-redatam-dicx-to-csv/PERSONA.csv.gz",
   152L,
   2017L,
@@ -193,19 +223,43 @@ result <- generate_tables(d_control,
   "p09"
 )
 
-result <- generate_tables_r(
+result2 <- generate_tables_r(
   d_control,
+  "downloads/redatam/CP2017CHL/BaseOrg16/CPV2017-16.dicx",
   152L,
   2017L,
   "p08",
   "p09"
 )
 
+all.equal(result$d_control_sex, result2$d_control_sex)
+all.equal(result$d_control_age, result2$d_control_age)
+
 # Bolivia 2001 ----
 
 result <- generate_tables(
   d_control,
-  # "downloads/redatam/CP2001BOL/open-redatam-dic-to-csv/PERSONA.csv.gz",
+  "downloads/redatam/CP2001BOL/open-redatam-dic-to-csv/PERSONA.csv.gz",
+  68L,
+  2001L,
+  "sexo",
+  "edad"
+)
+
+result2 <- generate_tables_r(
+  d_control,
+  "downloads/redatam/CP2001BOL/Cp2001BOL/BaseOriginal/CPV2001.dic",
+  68L,
+  2001L,
+  "sexo",
+  "edad"
+)
+
+all.equal(result$d_control_sex, result2$d_control_sex)
+all.equal(result$d_control_age, result2$d_control_age)
+
+result <- generate_tables(
+  d_control,
   "downloads/redatam/CP2001BOL/open-redatam-dicx-to-csv/PERSONA.csv.gz",
   68L,
   2001L,
@@ -213,11 +267,43 @@ result <- generate_tables(
   "edad"
 )
 
+result2 <- generate_tables_r(
+  d_control,
+  "downloads/redatam/CP2001BOL/Cp2001BOL/BaseOriginal/CPV2001.dicx",
+  68L,
+  2001L,
+  "sexo",
+  "edad"
+)
+
+all.equal(result$d_control_sex, result2$d_control_sex)
+all.equal(result$d_control_age, result2$d_control_age)
+
 # Bolivia 2012 ----
 
 result <- generate_tables(
   d_control,
-  # "downloads/redatam/CP2012BOL/open-redatam-dic-to-csv/PERSONA.csv.gz",
+  "downloads/redatam/CP2012BOL/open-redatam-dic-to-csv/PERSONA.csv.gz",
+  68L,
+  2012L,
+  "p24",
+  "p25"
+)
+
+result2 <- generate_tables_r(
+  d_control,
+  "downloads/redatam/CP2012BOL/BaseMunicipio_V3/CPV2012Municipio.dic",
+  68L,
+  2012L,
+  "p24",
+  "p25"
+)
+
+all.equal(result$d_control_sex, result2$d_control_sex)
+all.equal(result$d_control_age, result2$d_control_age)
+
+result <- generate_tables(
+  d_control,
   "downloads/redatam/CP2012BOL/open-redatam-dicx-to-csv/PERSONA.csv.gz",
   68L,
   2012L,
@@ -225,11 +311,43 @@ result <- generate_tables(
   "p25"
 )
 
+result2 <- generate_tables_r(
+  d_control,
+  "downloads/redatam/CP2012BOL/BaseMunicipio_V3/CPV2012Comunidad.dicx",
+  68L,
+  2012L,
+  "p24",
+  "p25"
+)
+
+all.equal(result$d_control_sex, result2$d_control_sex)
+all.equal(result$d_control_age, result2$d_control_age)
+
 # Dominican Republic 2002 ----
 
 result <- generate_tables(
   d_control,
-  # "downloads/redatam/CP2002DOM/open-redatam-dic-to-csv/PERSONA.csv.gz",
+  "downloads/redatam/CP2002DOM/open-redatam-dic-to-csv/PERSONA.csv.gz",
+  214L,
+  2002L,
+  "p28",
+  "p29d"
+)
+
+result2 <- generate_tables_r(
+  d_control,
+  "downloads/redatam/CP2002DOM/Cp2002DOM/BaseOriginal/CPV2002DOM.dic",
+  214L,
+  2002L,
+  "p28",
+  "p29d"
+)
+
+all.equal(result$d_control_sex, result2$d_control_sex)
+all.equal(result$d_control_age, result2$d_control_age)
+
+result <- generate_tables(
+  d_control,
   "downloads/redatam/CP2002DOM/open-redatam-dicx-to-csv/PERSONA.csv.gz",
   214L,
   2002L,
@@ -237,11 +355,43 @@ result <- generate_tables(
   "p29d"
 )
 
+result2 <- generate_tables_r(
+  d_control,
+  "downloads/redatam/CP2002DOM/Cp2002DOM/BaseOriginal/CPV2002DOM.dicx",
+  214L,
+  2002L,
+  "p28",
+  "p29d"
+)
+
+all.equal(result$d_control_sex, result2$d_control_sex)
+all.equal(result$d_control_age, result2$d_control_age)
+
 # Ecuador 2010 DIC ----
 
 result <- generate_tables(
   d_control,
-  # "downloads/redatam/CP2010ECU/open-redatam-dic-to-csv/PERSONA.csv.gz",
+  "downloads/redatam/CP2010ECU/open-redatam-dic-to-csv/PERSONA.csv.gz",
+  218L,
+  2010L,
+  "p01",
+  "p03"
+)
+
+result2 <- generate_tables_r(
+  d_control,
+  "downloads/redatam/CP2010ECU/Base/CE11.dic",
+  218L,
+  2010L,
+  "p01",
+  "p03"
+)
+
+all.equal(result$d_control_sex, result2$d_control_sex)
+all.equal(result$d_control_age, result2$d_control_age)
+
+result <- generate_tables(
+  d_control,
   "downloads/redatam/CP2010ECU/open-redatam-dicx-to-csv/PERSONA.csv.gz",
   218L,
   2010L,
@@ -249,23 +399,87 @@ result <- generate_tables(
   "p03"
 )
 
+result2 <- generate_tables_r(
+  d_control,
+  "downloads/redatam/CP2010ECU/Base/CE11.dicX",
+  218L,
+  2010L,
+  "p01",
+  "p03"
+)
+
+all.equal(result$d_control_sex, result2$d_control_sex)
+all.equal(result$d_control_age, result2$d_control_age)
+
 # El Salvador 2007 ----
 
 result <- generate_tables(
   d_control,
   "downloads/redatam/CP2007SLV/open-redatam-dic-to-csv/PERSONA.csv.gz",
-  # "downloads/redatam/CP2007SLV/open-redatam-dicx-to-csv/PERSONA.csv.gz",
   222L,
   2007L,
   "p02",
   "p03a"
 )
 
+result2 <- generate_tables_r(
+  d_control,
+  "downloads/redatam/CP2007SLV/CP2007SLV/BaseTotal/CPV2007ES.dic",
+  222L,
+  2007L,
+  "p02",
+  "p03a"
+)
+
+all.equal(result$d_control_sex, result2$d_control_sex)
+all.equal(result$d_control_age, result2$d_control_age)
+
+result <- generate_tables(
+  d_control,
+  "downloads/redatam/CP2007SLV/open-redatam-dicx-to-csv/PERSONA.csv.gz",
+  222L,
+  2007L,
+  "p02",
+  "p03a"
+)
+
+result2 <- generate_tables_r(
+  d_control,
+  "downloads/redatam/CP2007SLV/CP2007SLV/BaseTotal/CPV2007ES.dicx",
+  222L,
+  2007L,
+  "p02",
+  "p03a"
+)
+
+all.equal(result$d_control_sex, result2$d_control_sex)
+all.equal(result$d_control_age, result2$d_control_age)
+
 # Peru 2007 ----
 
 result <- generate_tables(
   d_control,
-  # "downloads/redatam/CP2007PER/open-redatam-dic-to-csv/Poblacio.csv.gz",
+  "downloads/redatam/CP2007PER/open-redatam-dic-to-csv/Poblacio.csv.gz",
+  604L,
+  2007L,
+  "p02sexo",
+  "p03aanio"
+)
+
+result2 <- generate_tables_r(
+  d_control,
+  "downloads/redatam/CP2007PER/CP2007PER/BasePub/CPV2007PER_PUB.dic",
+  604L,
+  2007L,
+  "p02sexo",
+  "p03aanio"
+)
+
+all.equal(result$d_control_sex, result2$d_control_sex)
+all.equal(result$d_control_age, result2$d_control_age)
+
+result <- generate_tables(
+  d_control,
   "downloads/redatam/CP2007PER/open-redatam-dicx-to-csv/Poblacio.csv.gz",
   604L,
   2007L,
@@ -273,14 +487,37 @@ result <- generate_tables(
   "p03aanio"
 )
 
-# Uruguay 2011 DIC ----
+result2 <- generate_tables_r(
+  d_control,
+  "downloads/redatam/CP2007PER/CP2007PER/BasePub/CPV2007PER_PUB.dicx",
+  604L,
+  2007L,
+  "p02sexo",
+  "p03aanio"
+)
+
+all.equal(result$d_control_sex, result2$d_control_sex)
+all.equal(result$d_control_age, result2$d_control_age)
+
+# Uruguay 2011 ----
 
 result <- generate_tables(
   d_control,
-  # "downloads/redatam/CP2011URY/open-redatam-dic-to-csv/PERSONA.csv.gz",
-  "downloads/redatam/CP2011URY/open-redatam-dicx-to-csv/PERSONA.csv.gz",
+  "downloads/redatam/CP2011URY/open-redatam-dic-to-csv/PERSONA.csv.gz",
+  858L,
+  2011L,
+  "ph02",
+  "na01"
+)
+
+result2 <- generate_tables_r(
+  d_control,
+  "downloads/redatam/CP2011URY/BaseRPub/CPV2011_uruguay_publica.dic",
   858L,
   2011L,
   "ph02",
   "na01"
 )
+
+all.equal(result$d_control_sex, result2$d_control_sex)
+all.equal(result$d_control_age, result2$d_control_age)
diff --git a/rpkg/DESCRIPTION b/rpkg/DESCRIPTION
index 81c6010..ce67977 100644
--- a/rpkg/DESCRIPTION
+++ b/rpkg/DESCRIPTION
@@ -26,7 +26,9 @@ Authors@R: c(
 Imports:
     data.table,
     janitor,
-    stringi
+    stringi,
+    stringr,
+    tibble
 Suggests: 
     knitr,
     rmarkdown,
diff --git a/rpkg/NAMESPACE b/rpkg/NAMESPACE
index 5c408df..0f24e21 100644
--- a/rpkg/NAMESPACE
+++ b/rpkg/NAMESPACE
@@ -7,5 +7,8 @@ importFrom(data.table,as.data.table)
 importFrom(data.table,is.data.table)
 importFrom(data.table,setnames)
 importFrom(janitor,make_clean_names)
-importFrom(stringi,stri_enc_detect)
+importFrom(stringi,stri_enc_toutf8)
+importFrom(stringr,str_replace_all)
+importFrom(stringr,str_trim)
+importFrom(tibble,as_tibble)
 useDynLib(redatam, .registration = TRUE)
diff --git a/rpkg/R/redatam-package.R b/rpkg/R/redatam-package.R
index 259de5d..14a3619 100644
--- a/rpkg/R/redatam-package.R
+++ b/rpkg/R/redatam-package.R
@@ -2,5 +2,7 @@
 #' @keywords internal
 #' @importFrom data.table as.data.table is.data.table setnames `:=` `.SD`
 #' @importFrom janitor make_clean_names
-#' @importFrom stringi stri_enc_detect
+#' @importFrom stringi stri_enc_toutf8
+#' @importFrom stringr str_trim str_replace_all
+#' @importFrom tibble as_tibble
 "_PACKAGE"
diff --git a/rpkg/R/utils.R b/rpkg/R/utils.R
index 1ef0350..54cffef 100644
--- a/rpkg/R/utils.R
+++ b/rpkg/R/utils.R
@@ -1,76 +1,58 @@
-detect_encoding_ <- function(strings) {
-  valid_strings <- paste(strings[!is.na(strings) & strings != ""],
-    collapse = " ")
-  if (length(valid_strings) > 0) {
-    detected_encoding <- stri_enc_detect(valid_strings)
-    if (length(detected_encoding) > 0) {
-      return(detected_encoding[[1]]$Encoding[1])
-    }
-  }
-  return("UTF-8")
-}
-
 list_to_datatable_ <- function(x) {
-  for (y in names(x)) {
-    if (is.list(x[[y]])) {
-      x[[y]] <- as.data.table(x[[y]])
-    }
-  }
-  return(x)
+  lapply(x, function(y) {
+    if (is.list(y)) as.data.table(y) else y
+  })
 }
 
-fix_encoding_recursive_ <- function(x, encoding) {
+# Encoding fix ----
+
+fix_encoding_recursive_ <- function(x) {
   if (is.list(x)) {
-    return(lapply(x, fix_encoding_recursive_, encoding))
+    return(lapply(x, fix_encoding_recursive_))
   } else if (is.character(x)) {
-    y <- iconv(x, from = encoding, to = "UTF-8", sub = "")
-    y <- gsub("\u00c3\u00a1", "\u00e1", y) # a with acute accent
-    y <- gsub("\u00c3\u00a9", "\u00e9", y) # e with acute accent
-    y <- gsub("\u00c3\u00b3", "\u00f3", y) # o with acute accent
-    y <- gsub("\u00c3\u00ba", "\u00fa", y) # u with acute accent
-    y <- gsub("\u00c3\u00b1", "\u00f1", y) # n with tilde
-    y <- gsub("\u00b1", "\u00f1", y) # n with tilde
-    y <- gsub("\u00c3", "\u00ed", y) # i with acute accent
-    return(y)
+    replacements <- c(
+      "\u00c3\u00a1" = "\u00e1", # a with acute accent
+      "\u00c3\u00a9" = "\u00e9", # e with acute accent
+      "\u00c3\u00b3" = "\u00f3", # o with acute accent
+      "\u00c3\u00ba" = "\u00fa", # u with acute accent
+      "\u00c3\u00b1" = "\u00f1", # n with tilde
+      "\u00b1" = "\u00f1", # n with tilde
+      "\u00c3" = "\u00ed" # i with acute accent
+    )
+    return(str_replace_all(stri_enc_toutf8(x), replacements))
+  }
+  x
+}
+
+fix_encoding_in_data_table <- function(y) { # re-encode the character columns of data.table y
+  char_cols <- names(y)[vapply(y, is.character, logical(1))]
+  if (length(char_cols) > 0L) y[, (char_cols) := lapply(.SD, fix_encoding_recursive_), .SDcols = char_cols]
+  y # `:=` is skipped above when there are no character columns (data.table warns on an empty LHS)
+}
+
+# Collect all character values from y: the character columns of a data.table, otherwise the character elements of y
+extract_strings <- function(y) {
+  if (is.data.table(y)) {
+    return(unlist(y[, lapply(.SD, function(col) if (is.character(col)) col else NULL)]))
   } else {
-    return(x)
+    return(unlist(y[vapply(y, is.character, logical(1))]))
   }
 }
 
 fix_encoding_ <- function(x) {
-  # Extract the unique values of the "chr" vectors in the lists
-  strings <- unique(unlist(lapply(x, function(y) {
-    if (is.data.table(y)) {
-      return(unlist(y[, lapply(
-        .SD,
-        function(col) {
-          if (is.character(col)) col else NULL
-        }
-      )]))
-    } else {
-      return(y[vapply(y, is.character, logical(1))])
-    }
-  })))
+  # No up-front scan of the strings is needed: every element of x is converted to UTF-8 below.
 
-  # Detect the encoding of the strings
-  encoding <- detect_encoding_(strings)
-
-  # Apply the encoding fix recursively
-  return(lapply(x, function(y) {
+  lapply(x, function(y) {
     if (is.data.table(y)) {
-      char_cols <- names(y)[vapply(y, is.character, logical(1))]
-      y[, (char_cols) := lapply(.SD,
-        function(col) {
-          fix_encoding_recursive_(col, encoding)
-        }),
-        .SDcols = char_cols]
-      return(y)
+      return(fix_encoding_in_data_table(y))
     } else {
-      return(fix_encoding_recursive_(y, encoding))
+      return(fix_encoding_recursive_(y))
     }
-  }))
+  })
 }
 
+# Tidy names ----
+
 tidy_names_ <- function(x) {
   if (is.list(x)) {
     element_names <- names(x)
@@ -81,32 +63,34 @@ tidy_names_ <- function(x) {
     }
 
     # apply janitor to the non-empty names
-    cleaned_names <- ifelse(element_names == "", element_names,
-      make_clean_names(element_names))
-
+    cleaned_names <- ifelse(element_names == "", element_names, make_clean_names(element_names))
     names(x) <- cleaned_names
 
     # apply tidy_names_ recursively
-    return(lapply(x, function(y) {
-      if (is.data.table(y)) {
-        setnames(y, make_clean_names(names(y)))
-        return(y)
-      } else {
-        return(tidy_names_(y))
-      }
-    }))
+    return(
+      lapply(x, function(y) {
+        if (is.data.table(y)) {
+          setnames(y, make_clean_names(names(y)))
+          return(y)
+        } else if (is.list(y)) {
+          return(tidy_names_(y))
+        } else {
+          return(y)
+        }
+      })
+    )
   } else {
     return(x)
   }
 }
 
+# Fix multiple spaces ----
+
 # trim leading/trailing spaces and replace multiple spaces with a single space
 # +
 # convert empty strings to NA
 trim_and_clean_internal_ <- function(x) {
-  x <- trimws(gsub("^\\s+|\\s+$", "", gsub("\\s+", " ", x)))
-  x[x == ""] <- NA
-  return(x)
+  str_replace_all(str_trim(str_replace_all(x, "\\s+", " ")), "^$", NA_character_)
 }
 
 trim_and_clean_ <- function(x) {
@@ -128,9 +112,11 @@ harmonize_types_ <- function(x) {
     # if the data type of the column in the entity is different from the
     # description, convert it in the description
     for (col in common_col) {
-      target_class <- class(x[[entity]][[col]])
-      if (target_class != class(x[[element]][[col]])) {
-        x[[element]][, (col) := switch(target_class,
+      entity_col_class <- class(x[[entity]][[col]])
+      element_col_class <- class(x[[element]][[col]])
+
+      if (entity_col_class != element_col_class) {
+        x[[element]][, (col) := switch(entity_col_class,
           character = as.character(.SD[[col]]),
           factor = as.factor(.SD[[col]]),
           numeric = as.numeric(.SD[[col]]),
@@ -140,7 +126,7 @@ harmonize_types_ <- function(x) {
     }
   }
 
-  return(x)
+  x
 }
 
 merge_descriptions_ <- function(x) {
@@ -166,13 +152,9 @@ merge_descriptions_ <- function(x) {
     x[[entity]][, (char_cols) := lapply(.SD, as.factor), .SDcols = char_cols]
   }
 
-  return(x)
+  x
 }
 
 datatable_to_tibble_ <- function(x) {
-  for (y in names(x)) {
-    x[[y]] <- as.data.frame(x[[y]])
-    class(x[[y]]) <- c("tbl_df", "tbl", "data.frame")
-  }
-  return(x)
+  lapply(x, as_tibble)
 }
diff --git a/rpkg/src/redatamlib/readers/FuzzyEntityParser.cpp b/rpkg/src/redatamlib/readers/FuzzyEntityParser.cpp
index c943585..1603fe0 100644
--- a/rpkg/src/redatamlib/readers/FuzzyEntityParser.cpp
+++ b/rpkg/src/redatamlib/readers/FuzzyEntityParser.cpp
@@ -1,5 +1,7 @@
 #include <algorithm> //  std::replace
 
+#include <cpp11/function.hpp> //  cpp11::stop
+
 #include "FuzzyEntityParser.hpp"
 #include "FuzzyVariableParser.hpp"
 #include "utils.hpp" //  GetFileExtension, ThrowIfBad
@@ -25,6 +27,10 @@ vector<Entity> FuzzyEntityParser::ParseEntities() {
   } catch (const std::out_of_range &) {
   }
 
+  if (ret.empty()) {
+    cpp11::stop("Error: No entities found.");
+  }
+
   for (Entity &e : ret) {
     entities[e.GetName()] = &e;
   }
diff --git a/rpkg/src/redatamlib/readers/FuzzyVariableParser.cpp b/rpkg/src/redatamlib/readers/FuzzyVariableParser.cpp
index 18b12fe..a5e3037 100644
--- a/rpkg/src/redatamlib/readers/FuzzyVariableParser.cpp
+++ b/rpkg/src/redatamlib/readers/FuzzyVariableParser.cpp
@@ -16,6 +16,10 @@ FuzzyVariableParser::FuzzyVariableParser(ByteArrayReader reader,
     : m_reader(reader), m_rootPath(rootPath) {}
 
 void FuzzyVariableParser::ParseAllVariables(vector<Entity> &entities) {
+  if (entities.empty()) {
+    cpp11::stop("Error: The entities vector is empty.");
+  }
+
   vector<pair<size_t, size_t>> searchBounds = GetSearchBounds(entities);
 
   // R-devel suggestion: Default to using all available hardware concurrency
@@ -29,7 +33,7 @@ void FuzzyVariableParser::ParseAllVariables(vector<Entity> &entities) {
   }
 
   size_t numThreads = std::min(entities.size(), maxThreads);
-  
+
   if (numThreads == 0) {
     numThreads = 1;
   }
@@ -51,12 +55,12 @@ void FuzzyVariableParser::ParseAllVariables(vector<Entity> &entities) {
   }
 }
 
-vector<pair<size_t, size_t>>
-FuzzyVariableParser::GetSearchBounds(vector<Entity> entities) {
+vector<pair<size_t, size_t>> FuzzyVariableParser::GetSearchBounds(
+    vector<Entity> &entities) {
   vector<pair<size_t, size_t>> ret;
 
   if (entities.empty()) {
-    return ret;
+    cpp11::stop("Error: The entities vector is empty.");
   }
 
   for (size_t i = 0; i < entities.size() - 1; ++i) {
diff --git a/rpkg/src/redatamlib/readers/FuzzyVariableParser.hpp b/rpkg/src/redatamlib/readers/FuzzyVariableParser.hpp
index 61d94a3..7f23343 100644
--- a/rpkg/src/redatamlib/readers/FuzzyVariableParser.hpp
+++ b/rpkg/src/redatamlib/readers/FuzzyVariableParser.hpp
@@ -33,7 +33,7 @@ class FuzzyVariableParser {
   string m_rootPath;
   std::mutex m_mtx;
 
-  vector<pair<size_t, size_t>> GetSearchBounds(vector<Entity> entities);
+  vector<pair<size_t, size_t>> GetSearchBounds(vector<Entity> &entities);
 
   static VarType ParseType(ByteArrayReader *reader);
   static string ParseIdxFileName(const string &rootPath,