Add utf16 and utf32 file load (#66)
* Add utf16 and utf32 file load
Fixes #65

* update tests

* update int vs list

* update

* added strip mixin

* removed too many positional

* rename to positional

* update int time

* update flake8

* update text

* update format

* fixed white space

* corrected dict

* update to replace instead of strip

* updated prints
Gorkowski authored Oct 1, 2024
1 parent f1a3136 commit c2dd8a0
Showing 32 changed files with 863 additions and 675 deletions.
60 changes: 36 additions & 24 deletions particula/activity/species_density.py
@@ -10,11 +10,11 @@


def organic_density_estimate(
molar_mass,
oxygen2carbon,
hydrogen2carbon=None,
nitrogen2carbon=None,
mass_ratio_convert=False
molar_mass,
oxygen2carbon,
hydrogen2carbon=None,
nitrogen2carbon=None,
mass_ratio_convert=False,
):
"""
Function to estimate the density of organic compounds based on the simple
@@ -51,8 +51,9 @@ def organic_density_estimate(
# Assuming an aliphatic compound with hydrogen2carbon = 2.0 in the absence
# of functional groups, then correct for oxygen content assuming a linear
# -1 slope (Van Krevelen diagram for typical SOA)
hydrogen2carbon_est = 2.0 - oxygen2carbon \
if hydrogen2carbon < 0.1 else hydrogen2carbon
hydrogen2carbon_est = (
2.0 - oxygen2carbon if hydrogen2carbon < 0.1 else hydrogen2carbon
)

# 2) Compute the approximate number of carbon atoms per organic molecule
number_carbons = molar_mass / (
@@ -66,40 +67,51 @@
# Here no correction is applied for rings and aromatic compounds
# (due to limited info at input)
rho1 = molar_mass / (
5.0 * number_carbons
* (2.0
+ hydrogen2carbon_est
+ oxygen2carbon * 2.0
+ nitrogen2carbon * 2.0
)
5.0
* number_carbons
* (
2.0
+ hydrogen2carbon_est
+ oxygen2carbon * 2.0
+ nitrogen2carbon * 2.0
)
)

# the returned density is in [g/cm^3]; scaled assuming that most
# of the oxygen atoms are able to make H-bonds
# (donor or acceptor)
return rho1 * (
1.0 + min(number_carbons * oxygen2carbon * 0.1 +
number_carbons * nitrogen2carbon * 0.1, 0.3))
1.0
+ min(
number_carbons * oxygen2carbon * 0.1
+ number_carbons * nitrogen2carbon * 0.1,
0.3,
)
)


def organic_array(
molar_mass,
oxygen2carbon,
hydrogen2carbon=None,
nitrogen2carbon=None,
mass_ratio_convert=False
molar_mass,
oxygen2carbon,
hydrogen2carbon=None,
nitrogen2carbon=None,
mass_ratio_convert=False,
):
# pylint: disable=too-many-arguments
# pylint: disable=too-many-positional-arguments, too-many-arguments
"""Get densities for an array."""
density = np.empty([len(molar_mass), 1], dtype=float)
for i, molar in enumerate(molar_mass):
hydrogen2carbon_run = None if hydrogen2carbon is None else hydrogen2carbon[i]
nitrogen2carbon_run = None if nitrogen2carbon is None else nitrogen2carbon[i]
hydrogen2carbon_run = (
None if hydrogen2carbon is None else hydrogen2carbon[i]
)
nitrogen2carbon_run = (
None if nitrogen2carbon is None else nitrogen2carbon[i]
)
density[i] = organic_density_estimate(
molar_mass=molar,
oxygen2carbon=oxygen2carbon[i],
hydrogen2carbon=hydrogen2carbon_run,
nitrogen2carbon=nitrogen2carbon_run,
mass_ratio_convert=mass_ratio_convert
mass_ratio_convert=mass_ratio_convert,
)
return density
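
A minimal usage sketch for the reformatted functions above, assuming the import path matches this file and that the hidden lines of the hunk handle defaults; the input values are illustrative only:

```python
# Hedged sketch: inputs are illustrative, not from the commit.
from particula.activity.species_density import organic_density_estimate

rho = organic_density_estimate(
    molar_mass=200.0,      # g/mol, illustrative
    oxygen2carbon=0.5,     # O:C atomic ratio
    hydrogen2carbon=1.8,   # H:C >= 0.1, so used as-is by the estimate
    nitrogen2carbon=0.0,   # keep the nitrogen term inactive
)
print(f"estimated density: {rho:.3f} g/cm^3")
```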
86 changes: 80 additions & 6 deletions particula/data/loader.py
@@ -4,6 +4,7 @@
from typing import List, Union, Tuple, Dict, Any, Optional
from datetime import datetime, timezone
import warnings
import codecs
import glob
import os
import pickle
@@ -23,7 +24,8 @@ def data_raw_loader(file_path: str) -> list:
"""Loads raw data from file.
Load raw data from a file at the specified file path and return it as a
list of strings.
list of strings. Attempts to handle UTF-8, UTF-16, and UTF-32 encodings.
Defaults to UTF-8 if no byte order mark (BOM) is found.
Args:
file_path (str): The file path of the file to read.
@@ -40,11 +42,30 @@
```
"""
try:
with open(file_path, "r", encoding="utf8", errors="replace") as file:
# Read a small part of the file to detect BOM (byte order mark)
with open(file_path, "rb") as f:
raw_bytes = f.read(4)

# Determine encoding based on BOM
if raw_bytes.startswith(codecs.BOM_UTF16_LE) or raw_bytes.startswith(
codecs.BOM_UTF16_BE
):
encoding = "utf-16"
elif raw_bytes.startswith(codecs.BOM_UTF32_LE) or raw_bytes.startswith(
codecs.BOM_UTF32_BE
):
encoding = "utf-32"
else:
encoding = "utf8" # Default to utf-8 if no BOM is found

# Read file with the detected encoding
with open(file_path, "r", encoding=encoding, errors="replace") as file:
data = [line.rstrip() for line in file]

except FileNotFoundError:
print(f"File not found: {file_path}")
data = []

return data
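
A quick round-trip check of the new BOM handling; this is a sketch that assumes only what the hunk above shows, plus the fact that Python's `utf-16` codec writes a byte order mark:

```python
import os
import tempfile

from particula.data.loader import data_raw_loader

# Write a UTF-16 file; str.encode("utf-16") prepends a BOM in native order.
with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as tmp:
    tmp.write("time,value\n1,2.5\n".encode("utf-16"))
    path = tmp.name

print(data_raw_loader(path))  # ['time,value', '1,2.5']
os.remove(path)
```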


@@ -95,6 +116,41 @@ def filter_list(data: List[str], char_counts: dict) -> List[str]:
return filtered_data


def replace_list(data: List[str], replace_dict: Dict[str, str]) -> List[str]:
"""
Replace characters in each string of a list based on a replacement
dictionary.
Each character specified in the `replace_dict` will be replaced with the
corresponding value in every string in the input list.
Arguments:
data: A list of strings in which the characters will be replaced.
replace_dict: A dictionary specifying character replacements.
The keys are the characters to be replaced, and the values are the
replacement characters or strings.
Returns:
A new list of strings with the replacements applied.
Examples:
``` py title="Replace characters in a list of strings"
data = ['apple[banana]orange', '[pear] kiwi plum']
replace_dict = {'[': '', ']': ''}
replaced_data = replace_list(data, replace_dict)
print(replaced_data)
['applebananaorange', 'pear kiwi plum']
```
"""
replaced_data = []
for row in data:
modified_row = row
for old_char, new_char in replace_dict.items():
modified_row = modified_row.replace(old_char, new_char)
replaced_data.append(modified_row)
return replaced_data


def data_format_checks(data: List[str], data_checks: dict) -> List[str]:
"""
Validate and format raw data according to specified checks.
@@ -155,6 +211,9 @@ def data_format_checks(data: List[str], data_checks: dict) -> List[str]:
if "char_counts" in data_checks:
char_counts = data_checks.get("char_counts", {})
data = filter_list(data, char_counts)
if "replace_chars" in data_checks:
replace_dict = data_checks.get("replace_chars", {})
data = replace_list(data, replace_dict)
if data := [x.strip() for x in data]:
return data
else:
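
A sketch of the new `replace_chars` hook in isolation; the other check keys are omitted here on the assumption, suggested by the conditional lookups above, that each check is optional:

```python
from particula.data.loader import data_format_checks

raw = ["[2021-01-01] , 1.0", "[2021-01-02] , 2.0"]
checks = {"replace_chars": {"[": "", "]": ""}}  # drop the brackets
print(data_format_checks(raw, checks))
# ['2021-01-01 , 1.0', '2021-01-02 , 2.0']
```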
@@ -194,7 +253,10 @@ def parse_time_column(
return float(line[time_column]) + seconds_shift
if date_offset:
# if the time is in one column, and the date is fixed
time_str = f"{date_offset} {line[time_column]}"
if isinstance(time_column, int):
time_str = f"{date_offset} {line[time_column]}"
else:
time_str = f"{date_offset} {line[time_column[0]]}"
return (
time_str_to_epoch(time_str, time_format, timezone_identifier)
+ seconds_shift
@@ -206,6 +268,13 @@
)
+ seconds_shift
)
if isinstance(time_column, list) and len(time_column) == 1:
return (
time_str_to_epoch(
line[time_column[0]], time_format, timezone_identifier
)
+ seconds_shift
)
if isinstance(time_column, list) and len(time_column) == 2:
# if the time and date are in two column
time_str = f"{line[time_column[0]]} {line[time_column[1]]}"
@@ -215,6 +284,7 @@
)
raise ValueError(
f"Invalid time column or format: {time_column}, {time_format}"
f"{line}"
)
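
The full signature of `parse_time_column` is not visible in this hunk, so here is a standalone sketch of just the new dispatch: an `int` indexes a single column, a one-element list is unwrapped, and a two-element list joins separate date and time columns:

```python
from typing import List, Union

def pick_time_str(line: List[str], time_column: Union[int, List[int]]) -> str:
    # mirrors the branches added above
    if isinstance(time_column, int):
        return line[time_column]
    if isinstance(time_column, list) and len(time_column) == 1:
        return line[time_column[0]]
    if isinstance(time_column, list) and len(time_column) == 2:
        return f"{line[time_column[0]]} {line[time_column[1]]}"
    raise ValueError(f"Invalid time column: {time_column}")

row = ["2024-10-01", "12:30:00", "5.2"]
print(pick_time_str(row, 1))       # '12:30:00'
print(pick_time_str(row, [0, 1]))  # '2024-10-01 12:30:00'
```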


@@ -256,7 +326,7 @@ def sample_data(
if no matching data value is found.
"""
# flake8: noqa
# pylint disable: too-many-arguments
# pylint disable: too-many-positional-arguments
epoch_time = np.zeros(len(data))
data_array = np.zeros((len(data), len(data_columns)))
@@ -318,6 +388,10 @@ def sample_data(
"yES",
"y",
"Y",
"OK",
"ok",
"Ok",
"Okay",
]
false_match = [
"OFF",
@@ -494,9 +568,9 @@ def sizer_data_formatter(
Arguments:
data: List of raw data strings to be formatted.
data_checks: Dictionary specifying validation rules for the data.
data_sizer_reader: Dictionary containing mappings for interpreting
the sizer data format.
time_column: Index or list of indices indicating the position of
the time column(s) in the data.
time_format: Format string for parsing time information in the data.
delimiter: Delimiter used to separate values in the data.
5 changes: 5 additions & 0 deletions particula/data/loader_setting_builders.py
@@ -21,6 +21,7 @@
ChecksCharCountsMixin,
ChecksSkipRowsMixin,
ChecksSkipEndMixin,
ChecksReplaceCharsMixin,
SizerConcentrationConvertFromMixin,
SizerStartKeywordMixin,
SizerEndKeywordMixin,
@@ -100,6 +101,7 @@ class DataChecksBuilder(
BuilderABC,
ChecksCharactersMixin,
ChecksCharCountsMixin,
ChecksReplaceCharsMixin,
ChecksSkipRowsMixin,
ChecksSkipEndMixin,
):
@@ -109,12 +111,14 @@ def __init__(self):
required_parameters = [
"characters",
"char_counts",
"strip_chars",
"skip_rows",
"skip_end",
]
BuilderABC.__init__(self, required_parameters)
ChecksCharactersMixin.__init__(self)
ChecksCharCountsMixin.__init__(self)
ChecksReplaceCharsMixin.__init__(self)
ChecksSkipRowsMixin.__init__(self)
ChecksSkipEndMixin.__init__(self)

@@ -123,6 +127,7 @@ def build(self) -> Dict[str, Any]:
return {
"characters": self.characters,
"char_counts": self.char_counts,
"replace_chars": self.replace_chars,
"skip_rows": self.skip_rows,
"skip_end": self.skip_end,
}
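
Hypothetical builder usage with the new field; the `set_*` method names below are assumptions inferred from the mixin names and are not shown in this diff:

```python
# Hypothetical: setter names are guessed from the Checks* mixin names.
from particula.data.loader_setting_builders import DataChecksBuilder

checks = (
    DataChecksBuilder()
    .set_characters([10, 100])
    .set_char_counts({",": 4})
    .set_replace_chars({"[": "", "]": ""})
    .set_skip_rows(0)
    .set_skip_end(0)
    .build()
)
```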
21 changes: 9 additions & 12 deletions particula/data/merger.py
@@ -20,8 +20,8 @@ def combine_data(
time_new: np.ndarray,
header_new: list,
) -> Tuple[np.ndarray, list]:
# pylint: disable=too-many-arguments
""""
# pylint: disable=too-many-positional-arguments, too-many-arguments
""" "
Merges or adds processed data together. Accounts for data shape
mismatches and duplicate timestamps. If the data is a different shape
than
@@ -52,9 +52,8 @@
"""

data_new = convert.data_shape_check(
time=time_new,
data=data_new,
header=header_new)
time=time_new, data=data_new, header=header_new
)

# Check if time_new matches the dimensions of data_new
if np.array_equal(time, time_new):
@@ -102,7 +101,7 @@ def stream_add_data(
time_new: np.ndarray,
data_new: np.ndarray,
header_check: Optional[bool] = False,
header_new: Optional[list] = None
header_new: Optional[list] = None,
) -> Stream:
"""
Adds a new data stream and corresponding time stream to the
@@ -155,13 +154,14 @@
elif header_check:
header_new = stream.header if header_new is None else header_new

stream.data, stream.header, data_new, header_new = \
stream.data, stream.header, data_new, header_new = (
stats.merge_formatting(
data_current=stream.data,
header_current=stream.header,
data_new=data_new,
header_new=header_new
header_new=header_new,
)
)
# updates stream
stream.data = np.vstack((stream.data, data_new))
stream.time = np.concatenate((stream.time, time_new))
@@ -170,10 +170,7 @@
stream.time = np.concatenate((stream.time, time_new))

# check if the time stream added is increasing
increasing_time = np.all(
stream.time[1:] >= stream.time[:-1],
axis=0
)
increasing_time = np.all(stream.time[1:] >= stream.time[:-1], axis=0)

if not increasing_time:
# sort the time stream
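
A minimal sketch of the monotonic-time guard reformatted above, using plain NumPy arrays; the sorting step that follows is truncated in this hunk, so the sort below is an assumption about its behavior:

```python
import numpy as np

time = np.array([0.0, 1.0, 2.0, 1.5])  # last appended sample out of order
increasing_time = np.all(time[1:] >= time[:-1])

if not increasing_time:
    order = np.argsort(time)  # assumed: stream is re-sorted by time
    time = time[order]

print(time)  # [0.  1.  1.5 2. ]
```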