Add utf16 and utf32 file load (#66)
* Add utf16 and utf32 file load
Fixes #65

* update tests

* update int vs list

* update

* added strip mixin

* removed too many positional

* rename to positional

* update int time

* update flake8

* update text

* update format

* fixed white space

* corrected dict

* update to replace instead of strip

* updated prints
Gorkowski authored Oct 1, 2024
1 parent f1a3136 commit c2dd8a0
Showing 32 changed files with 863 additions and 675 deletions.
60 changes: 36 additions & 24 deletions particula/activity/species_density.py
@@ -10,11 +10,11 @@


def organic_density_estimate(
molar_mass,
oxygen2carbon,
hydrogen2carbon=None,
nitrogen2carbon=None,
mass_ratio_convert=False
molar_mass,
oxygen2carbon,
hydrogen2carbon=None,
nitrogen2carbon=None,
mass_ratio_convert=False,
):
"""
Function to estimate the density of organic compounds based on the simple
@@ -51,8 +51,9 @@ def organic_density_estimate(
# Assuming an aliphatic compound with hydrogen2carbon = 2.0 in the absence
# of functional groups, then correct for oxygen content assuming a linear
# -1 slope (Van Krevelen diagram for typical SOA)
hydrogen2carbon_est = 2.0 - oxygen2carbon \
if hydrogen2carbon < 0.1 else hydrogen2carbon
hydrogen2carbon_est = (
2.0 - oxygen2carbon if hydrogen2carbon < 0.1 else hydrogen2carbon
)

# 2) Compute the approximate number of carbon atoms per organic molecule
number_carbons = molar_mass / (
@@ -66,40 +67,51 @@
# Here no correction is applied for rings and aromatic compounds
# (due to limited info at input)
rho1 = molar_mass / (
5.0 * number_carbons
* (2.0
+ hydrogen2carbon_est
+ oxygen2carbon * 2.0
+ nitrogen2carbon * 2.0
)
5.0
* number_carbons
* (
2.0
+ hydrogen2carbon_est
+ oxygen2carbon * 2.0
+ nitrogen2carbon * 2.0
)
)

# the returned density is in [g/cm^3]; scaled assuming that most
# of the oxygen atoms are able to make H-bonds
# (donor or acceptor)
return rho1 * (
1.0 + min(number_carbons * oxygen2carbon * 0.1 +
number_carbons * nitrogen2carbon * 0.1, 0.3))
1.0
+ min(
number_carbons * oxygen2carbon * 0.1
+ number_carbons * nitrogen2carbon * 0.1,
0.3,
)
)


def organic_array(
molar_mass,
oxygen2carbon,
hydrogen2carbon=None,
nitrogen2carbon=None,
mass_ratio_convert=False
molar_mass,
oxygen2carbon,
hydrogen2carbon=None,
nitrogen2carbon=None,
mass_ratio_convert=False,
):
# pylint: disable=too-many-arguments
# pylint: disable=too-many-positional-arguments, too-many-arguments
"""Get densities for an array."""
density = np.empty([len(molar_mass), 1], dtype=float)
for i, molar in enumerate(molar_mass):
hydrogen2carbon_run = None if hydrogen2carbon is None else hydrogen2carbon[i]
nitrogen2carbon_run = None if nitrogen2carbon is None else nitrogen2carbon[i]
hydrogen2carbon_run = (
None if hydrogen2carbon is None else hydrogen2carbon[i]
)
nitrogen2carbon_run = (
None if nitrogen2carbon is None else nitrogen2carbon[i]
)
density[i] = organic_density_estimate(
molar_mass=molar,
oxygen2carbon=oxygen2carbon[i],
hydrogen2carbon=hydrogen2carbon_run,
nitrogen2carbon=nitrogen2carbon_run,
mass_ratio_convert=mass_ratio_convert
mass_ratio_convert=mass_ratio_convert,
)
return density
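
A minimal usage sketch for the reformatted functions above, assuming the import path matches this file and that the hidden lines of the hunk handle defaults; the input values are illustrative only:

```python
# Hedged sketch: inputs are illustrative, not from the commit.
from particula.activity.species_density import organic_density_estimate

rho = organic_density_estimate(
    molar_mass=200.0,      # g/mol, illustrative
    oxygen2carbon=0.5,     # O:C atomic ratio
    hydrogen2carbon=1.8,   # H:C >= 0.1, so used as-is by the estimate
    nitrogen2carbon=0.0,   # keep the nitrogen term inactive
)
print(f"estimated density: {rho:.3f} g/cm^3")
```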
86 changes: 80 additions & 6 deletions particula/data/loader.py
@@ -4,6 +4,7 @@
from typing import List, Union, Tuple, Dict, Any, Optional
from datetime import datetime, timezone
import warnings
import codecs
import glob
import os
import pickle
@@ -23,7 +24,8 @@ def data_raw_loader(file_path: str) -> list:
"""Loads raw data from file.
Load raw data from a file at the specified file path and return it as a
list of strings.
list of strings. Attempts to handle UTF-8, UTF-16, and UTF-32 encodings.
Defaults to UTF-8 if no byte order mark (BOM) is found.
Args:
file_path (str): The file path of the file to read.
@@ -40,11 +42,30 @@
```
"""
try:
with open(file_path, "r", encoding="utf8", errors="replace") as file:
# Read a small part of the file to detect BOM (byte order mark)
with open(file_path, "rb") as f:
raw_bytes = f.read(4)

# Determine encoding based on BOM
if raw_bytes.startswith(codecs.BOM_UTF16_LE) or raw_bytes.startswith(
codecs.BOM_UTF16_BE
):
encoding = "utf-16"
elif raw_bytes.startswith(codecs.BOM_UTF32_LE) or raw_bytes.startswith(
codecs.BOM_UTF32_BE
):
encoding = "utf-32"
else:
encoding = "utf8" # Default to utf-8 if no BOM is found

# Read file with the detected encoding
with open(file_path, "r", encoding=encoding, errors="replace") as file:
data = [line.rstrip() for line in file]

except FileNotFoundError:
print(f"File not found: {file_path}")
data = []

return data
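
A quick round-trip check of the new BOM handling; this is a sketch that assumes only what the hunk above shows, plus the fact that Python's `utf-16` codec writes a byte order mark:

```python
import os
import tempfile

from particula.data.loader import data_raw_loader

# Write a UTF-16 file; str.encode("utf-16") prepends a BOM in native order.
with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as tmp:
    tmp.write("time,value\n1,2.5\n".encode("utf-16"))
    path = tmp.name

print(data_raw_loader(path))  # ['time,value', '1,2.5']
os.remove(path)
```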


@@ -95,6 +116,41 @@ def filter_list(data: List[str], char_counts: dict) -> List[str]:
return filtered_data


def replace_list(data: List[str], replace_dict: Dict[str, str]) -> List[str]:
"""
Replace characters in each string of a list based on a replacement
dictionary.
Each character specified in the `replace_dict` will be replaced with the
corresponding value in every string in the input list.
Arguments:
data: A list of strings in which the characters will be replaced.
replace_dict: A dictionary specifying character replacements.
The keys are the characters to be replaced, and the values are the
replacement characters or strings.
Returns:
A new list of strings with the replacements applied.
Examples:
``` py title="Replace characters in a list of strings"
data = ['apple[banana]orange', '[pear] kiwi plum']
replace_dict = {'[': '', ']': ''}
replaced_data = replace_list(data, replace_dict)
print(replaced_data)
['applebananaorange', 'pear kiwi plum']
```
"""
replaced_data = []
for row in data:
modified_row = row
for old_char, new_char in replace_dict.items():
modified_row = modified_row.replace(old_char, new_char)
replaced_data.append(modified_row)
return replaced_data


def data_format_checks(data: List[str], data_checks: dict) -> List[str]:
"""
Validate and format raw data according to specified checks.
@@ -155,6 +211,9 @@ def data_format_checks(data: List[str], data_checks: dict) -> List[str]:
if "char_counts" in data_checks:
char_counts = data_checks.get("char_counts", {})
data = filter_list(data, char_counts)
if "replace_chars" in data_checks:
replace_dict = data_checks.get("replace_chars", {})
data = replace_list(data, replace_dict)
if data := [x.strip() for x in data]:
return data
else:
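
A sketch of the new `replace_chars` hook in isolation; the other check keys are omitted here on the assumption, suggested by the conditional lookups above, that each check is optional:

```python
from particula.data.loader import data_format_checks

raw = ["[2021-01-01] , 1.0", "[2021-01-02] , 2.0"]
checks = {"replace_chars": {"[": "", "]": ""}}  # drop the brackets
print(data_format_checks(raw, checks))
# ['2021-01-01 , 1.0', '2021-01-02 , 2.0']
```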
@@ -194,7 +253,10 @@ def parse_time_column(
return float(line[time_column]) + seconds_shift
if date_offset:
# if the time is in one column, and the date is fixed
time_str = f"{date_offset} {line[time_column]}"
if isinstance(time_column, int):
time_str = f"{date_offset} {line[time_column]}"
else:
time_str = f"{date_offset} {line[time_column[0]]}"
return (
time_str_to_epoch(time_str, time_format, timezone_identifier)
+ seconds_shift
@@ -206,6 +268,13 @@
)
+ seconds_shift
)
if isinstance(time_column, list) and len(time_column) == 1:
return (
time_str_to_epoch(
line[time_column[0]], time_format, timezone_identifier
)
+ seconds_shift
)
if isinstance(time_column, list) and len(time_column) == 2:
# if the time and date are in two column
time_str = f"{line[time_column[0]]} {line[time_column[1]]}"
@@ -215,6 +284,7 @@
)
raise ValueError(
f"Invalid time column or format: {time_column}, {time_format}"
f"{line}"
)
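
The full signature of `parse_time_column` is not visible in this hunk, so here is a standalone sketch of just the new dispatch: an `int` indexes a single column, a one-element list is unwrapped, and a two-element list joins separate date and time columns:

```python
from typing import List, Union

def pick_time_str(line: List[str], time_column: Union[int, List[int]]) -> str:
    # mirrors the branches added above
    if isinstance(time_column, int):
        return line[time_column]
    if isinstance(time_column, list) and len(time_column) == 1:
        return line[time_column[0]]
    if isinstance(time_column, list) and len(time_column) == 2:
        return f"{line[time_column[0]]} {line[time_column[1]]}"
    raise ValueError(f"Invalid time column: {time_column}")

row = ["2024-10-01", "12:30:00", "5.2"]
print(pick_time_str(row, 1))       # '12:30:00'
print(pick_time_str(row, [0, 1]))  # '2024-10-01 12:30:00'
```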


@@ -256,7 +326,7 @@ def sample_data(
if no matching data value is found.
"""
# flake8: noqa
# pylint disable: too-many-arguments
# pylint disable: too-many-positional-arguments
epoch_time = np.zeros(len(data))
data_array = np.zeros((len(data), len(data_columns)))
@@ -318,6 +388,10 @@ def sample_data(
"yES",
"y",
"Y",
"OK",
"ok",
"Ok",
"Okay",
]
false_match = [
"OFF",
@@ -494,9 +568,9 @@ def sizer_data_formatter(
Arguments:
data: List of raw data strings to be formatted.
data_checks: Dictionary specifying validation rules for the data.
data_sizer_reader: Dictionary containing mappings for interpreting
the sizer data format.
time_column: Index or list of indices indicating the position of
the time column(s) in the data.
time_format: Format string for parsing time information in the data.
delimiter: Delimiter used to separate values in the data.
5 changes: 5 additions & 0 deletions particula/data/loader_setting_builders.py
@@ -21,6 +21,7 @@
ChecksCharCountsMixin,
ChecksSkipRowsMixin,
ChecksSkipEndMixin,
ChecksReplaceCharsMixin,
SizerConcentrationConvertFromMixin,
SizerStartKeywordMixin,
SizerEndKeywordMixin,
@@ -100,6 +101,7 @@ class DataChecksBuilder(
BuilderABC,
ChecksCharactersMixin,
ChecksCharCountsMixin,
ChecksReplaceCharsMixin,
ChecksSkipRowsMixin,
ChecksSkipEndMixin,
):
@@ -109,12 +111,14 @@ def __init__(self):
required_parameters = [
"characters",
"char_counts",
"strip_chars",
"skip_rows",
"skip_end",
]
BuilderABC.__init__(self, required_parameters)
ChecksCharactersMixin.__init__(self)
ChecksCharCountsMixin.__init__(self)
ChecksReplaceCharsMixin.__init__(self)
ChecksSkipRowsMixin.__init__(self)
ChecksSkipEndMixin.__init__(self)

@@ -123,6 +127,7 @@ def build(self) -> Dict[str, Any]:
return {
"characters": self.characters,
"char_counts": self.char_counts,
"replace_chars": self.replace_chars,
"skip_rows": self.skip_rows,
"skip_end": self.skip_end,
}
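
Hypothetical builder usage with the new field; the `set_*` method names below are assumptions inferred from the mixin names and are not shown in this diff:

```python
# Hypothetical: setter names are guessed from the Checks* mixin names.
from particula.data.loader_setting_builders import DataChecksBuilder

checks = (
    DataChecksBuilder()
    .set_characters([10, 100])
    .set_char_counts({",": 4})
    .set_replace_chars({"[": "", "]": ""})
    .set_skip_rows(0)
    .set_skip_end(0)
    .build()
)
```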
21 changes: 9 additions & 12 deletions particula/data/merger.py
@@ -20,8 +20,8 @@ def combine_data(
time_new: np.ndarray,
header_new: list,
) -> Tuple[np.ndarray, list]:
# pylint: disable=too-many-arguments
""""
# pylint: disable=too-many-positional-arguments, too-many-arguments
""" "
Merges or adds processed data together. Accounts for data shape
mismatches and duplicate timestamps. If the data is a different shape
than
@@ -52,9 +52,8 @@
"""

data_new = convert.data_shape_check(
time=time_new,
data=data_new,
header=header_new)
time=time_new, data=data_new, header=header_new
)

# Check if time_new matches the dimensions of data_new
if np.array_equal(time, time_new):
@@ -102,7 +101,7 @@ def stream_add_data(
time_new: np.ndarray,
data_new: np.ndarray,
header_check: Optional[bool] = False,
header_new: Optional[list] = None
header_new: Optional[list] = None,
) -> Stream:
"""
Adds a new data stream and corresponding time stream to the
@@ -155,13 +154,14 @@
elif header_check:
header_new = stream.header if header_new is None else header_new

stream.data, stream.header, data_new, header_new = \
stream.data, stream.header, data_new, header_new = (
stats.merge_formatting(
data_current=stream.data,
header_current=stream.header,
data_new=data_new,
header_new=header_new
header_new=header_new,
)
)
# updates stream
stream.data = np.vstack((stream.data, data_new))
stream.time = np.concatenate((stream.time, time_new))
@@ -170,10 +170,7 @@
stream.time = np.concatenate((stream.time, time_new))

# check if the time stream added is increasing
increasing_time = np.all(
stream.time[1:] >= stream.time[:-1],
axis=0
)
increasing_time = np.all(stream.time[1:] >= stream.time[:-1], axis=0)

if not increasing_time:
# sort the time stream
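
A minimal sketch of the monotonic-time guard reformatted above, using plain NumPy arrays; the sorting step that follows is truncated in this hunk, so the sort below is an assumption about its behavior:

```python
import numpy as np

time = np.array([0.0, 1.0, 2.0, 1.5])  # last appended sample out of order
increasing_time = np.all(time[1:] >= time[:-1])

if not increasing_time:
    order = np.argsort(time)  # assumed: stream is re-sorted by time
    time = time[order]

print(time)  # [0.  1.  1.5 2. ]
```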