Lint: Utils module #350

Merged: 43 commits, Nov 27, 2024
Changes from 19 commits

Commits
3b5578b
matplotlib >=3.8
ssolson Aug 9, 2024
6c62bfd
Merge branch 'develop' of https://github.com/MHKiT-Software/MHKiT-Pyt…
ssolson Aug 19, 2024
738e3d0
Merge branch 'develop' of https://github.com/MHKiT-Software/MHKiT-Pyt…
ssolson Aug 19, 2024
f91ef77
Merge branch 'develop' of https://github.com/MHKiT-Software/MHKiT-Pyt…
ssolson Aug 28, 2024
8b8d54b
lint utils
ssolson Sep 4, 2024
e1196e1
10 lint coverage
ssolson Sep 4, 2024
f4ec009
reduce handle_caching to 5 inputs
ssolson Sep 4, 2024
88810cf
index is now "t"
ssolson Sep 5, 2024
71974b3
10/10 lint
ssolson Sep 5, 2024
8d0263f
10/10 lint
ssolson Sep 5, 2024
769a26f
add test__calculate_statistics
ssolson Sep 5, 2024
2f2655b
10/10 lint
ssolson Sep 5, 2024
e6da2ed
10/10 pylint
ssolson Sep 5, 2024
1c3602a
handle cache returns None now
ssolson Sep 6, 2024
0e343d5
fix logic around None passed to handle_cache
ssolson Sep 6, 2024
486708d
back to index
ssolson Sep 6, 2024
bdf74b3
data no longer returned as list
ssolson Sep 6, 2024
dfad8dc
remove old cache_utils function
ssolson Sep 6, 2024
8d551e4
clean up
ssolson Sep 6, 2024
cf54e12
type hints
ssolson Sep 9, 2024
7720670
Merge branch 'develop' of https://github.com/MHKiT-Software/MHKiT-Pyt…
ssolson Sep 11, 2024
92ad905
Merge branch 'develop' of https://github.com/MHKiT-Software/MHKiT-Pyt…
ssolson Sep 11, 2024
65ad2e4
Merge branch 'develop' of https://github.com/MHKiT-Software/MHKiT-Pyt…
ssolson Nov 12, 2024
44416f5
fix pylint issues
ssolson Nov 12, 2024
ef04cc2
clean up package installation
ssolson Nov 13, 2024
7064645
change env name to mhkit-env
ssolson Nov 13, 2024
7aaedea
clean up installation
ssolson Nov 13, 2024
9e0d63d
add cf-staging label
ssolson Nov 13, 2024
03c9552
Use conda env file in all tests
ssolson Nov 13, 2024
ddfd14f
add configs and debug
ssolson Nov 15, 2024
9f5e427
use legacy solver
ssolson Nov 15, 2024
e5f1b5c
Ensure compatibility with modern packages
ssolson Nov 15, 2024
fd9646b
Ensure compatibility with modern packages
ssolson Nov 15, 2024
b721f03
add pecos
ssolson Nov 15, 2024
e7fc584
netcdf4 from pip to conda
ssolson Nov 15, 2024
24fa8a5
py 3.11, relax hdf5 & netCDF4
ssolson Nov 15, 2024
259e7e5
relax python constraints
ssolson Nov 15, 2024
af3800d
no pip
ssolson Nov 18, 2024
a06ad1a
update folium maps calls
ssolson Nov 18, 2024
f616e1c
remove debug flag
ssolson Nov 18, 2024
ff35897
actual folium updates
ssolson Nov 18, 2024
18fc9ff
remove channels and legacy solver
ssolson Nov 19, 2024
c5244a2
re-add handle_cache docstring
ssolson Nov 26, 2024
4 changes: 4 additions & 0 deletions .github/workflows/pylint.yml
@@ -28,3 +28,7 @@ jobs:
- name: Run Pylint on mhkit/power/
run: |
pylint mhkit/power/

- name: Run Pylint on mhkit/utils/
run: |
pylint mhkit/utils/
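
Contributors can reproduce this check locally before pushing. A minimal sketch using pylint's Python entry point (assuming pylint is installed and the working directory is the repository root; the shell equivalent is simply pylint mhkit/utils/):

    # Local equivalent of the CI step above (assumes pylint is installed).
    from pylint.lint import Run

    # exit=False returns control to the caller instead of calling sys.exit().
    Run(["mhkit/utils/"], exit=False)
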
12 changes: 10 additions & 2 deletions mhkit/river/io/usgs.py
@@ -120,7 +120,10 @@ def request_usgs_data(

# Use handle_caching to manage cache
cached_data, metadata, cache_filepath = handle_caching(
hash_params, cache_dir, write_json, clear_cache
hash_params,
cache_dir,
cache_content={"data": None, "metadata": None, "write_json": write_json},
clear_cache_file=clear_cache,
)

if cached_data is not None:
@@ -165,7 +168,12 @@ def request_usgs_data(

# After making the API request and processing the response, write the
# response to a cache file
handle_caching(hash_params, cache_dir, data=data, clear_cache_file=clear_cache)
handle_caching(
hash_params,
cache_dir,
cache_content={"data": data, "metadata": None, "write_json": None},
clear_cache_file=clear_cache,
)

if write_json:
shutil.copy(cache_filepath, write_json)
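
Note: every call site now passes a single cache_content dict in place of the former data, metadata, and write_json keyword arguments. A minimal sketch of the two resulting patterns, mirroring the diff above (the hash_params string, cache directory, and DataFrame are hypothetical stand-ins):

    import pandas as pd
    from mhkit.utils.cache import handle_caching

    hash_params = "station=15515500&parameter=00060&start=2018-01-01"  # hypothetical
    cache_dir = "/tmp/mhkit_cache"  # hypothetical

    # Read path: "data" is None, so handle_caching tries to load from cache.
    cached_data, metadata, cache_filepath = handle_caching(
        hash_params,
        cache_dir,
        cache_content={"data": None, "metadata": None, "write_json": None},
        clear_cache_file=False,
    )

    # Write path: "data" is set, so handle_caching stores it in the cache file.
    if cached_data is None:
        data = pd.DataFrame({"discharge": [1.0, 2.0]})  # stand-in for the API response
        handle_caching(
            hash_params,
            cache_dir,
            cache_content={"data": data, "metadata": None, "write_json": None},
            clear_cache_file=False,
        )
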
26 changes: 22 additions & 4 deletions mhkit/tests/utils/test_cache.py
@@ -93,7 +93,11 @@ def test_handle_caching_creates_cache(self):
Asserts:
- The cache file is successfully created at the expected file path.
"""
handle_caching(self.hash_params, self.cache_dir, data=self.data)
handle_caching(
self.hash_params,
self.cache_dir,
cache_content={"data": self.data, "metadata": None, "write_json": None},
)

cache_filename = (
hashlib.md5(self.hash_params.encode("utf-8")).hexdigest() + ".json"
@@ -114,8 +118,18 @@ def test_handle_caching_retrieves_data(self):
Asserts:
- The retrieved data matches the original sample DataFrame.
"""
handle_caching(self.hash_params, self.cache_dir, data=self.data)
retrieved_data, _, _ = handle_caching(self.hash_params, self.cache_dir)
handle_caching(
self.hash_params,
self.cache_dir,
cache_content={"data": self.data, "metadata": None, "write_json": None},
)

retrieved_data, _, _ = handle_caching(
self.hash_params,
self.cache_dir,
cache_content={"data": None, "metadata": None, "write_json": None},
)

pd.testing.assert_frame_equal(self.data, retrieved_data, check_freq=False)

def test_handle_caching_cdip_file_extension(self):
@@ -132,7 +146,11 @@
- The cache file with a ".pkl" extension is successfully created at the expected file path.
"""
cache_dir = os.path.join(self.cache_dir, "cdip")
handle_caching(self.hash_params, cache_dir, data=self.data)
handle_caching(
self.hash_params,
cache_dir,
cache_content={"data": self.data, "metadata": None, "write_json": None},
)

cache_filename = (
hashlib.md5(self.hash_params.encode("utf-8")).hexdigest() + ".pkl"
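
The expected file names in these tests are derived the same way handle_caching derives them: an MD5 digest of hash_params plus an extension keyed off the cache directory name. A standalone sketch of that derivation (the hash_params string is hypothetical):

    import hashlib
    import os

    hash_params = "station=9442396&start=20200101"  # hypothetical
    cache_dir = os.path.join("mhkit_cache", "cdip")

    # .pkl for cdip/hindcast/ndbc caches, .json for everything else
    extension = (
        ".pkl"
        if any(name in cache_dir for name in ("cdip", "hindcast", "ndbc"))
        else ".json"
    )
    cache_filename = hashlib.md5(hash_params.encode("utf-8")).hexdigest() + extension
    print(os.path.join(cache_dir, cache_filename))
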
35 changes: 33 additions & 2 deletions mhkit/tests/utils/test_utils.py
@@ -29,10 +29,10 @@ def test_get_statistics(self):
# load in file
df = self.data["loads"]
df.Timestamp = pd.to_datetime(df.Timestamp)
df.set_index("Timestamp", inplace=True)
test_df = df.set_index("Timestamp")
Author comment (ssolson): inplace=True was overwriting the original dataset and prevented self.data["loads"] from being reused consistently in new tests, e.g. test__calculate_statistics.

# run function
means, maxs, mins, stdevs = utils.get_statistics(
df,
test_df,
self.freq,
period=self.period,
vector_channels=["WD_Nacelle", "WD_NacelleMod"],
@@ -57,6 +57,37 @@
time = pd.to_datetime(string_time)
self.assertTrue(means.index[0] == time)

def test__calculate_statistics(self):
Author comment (ssolson): New test for the new utils local function _calculate_statistics.

# load in file
df = self.data["loads"]
df.Timestamp = pd.to_datetime(df.Timestamp)
test_df = df.set_index("Timestamp")

# Select a specific data chunk (the first 10 rows)
datachunk = test_df.iloc[:10]

# Run the calculate_statistics function
stats = utils._calculate_statistics(
datachunk, vector_channels=["WD_Nacelle", "WD_NacelleMod"]
)

means = stats["means"]
maxs = stats["maxs"]
mins = stats["mins"]
stdevs = stats["stdevs"]

# check statistics for a specific column ('uWind_80m')
self.assertAlmostEqual(means["uWind_80m"], 3.226, 2) # mean
self.assertAlmostEqual(maxs["uWind_80m"], 3.234, 2) # max
self.assertAlmostEqual(mins["uWind_80m"], 3.221, 2) # min
self.assertAlmostEqual(stdevs["uWind_80m"], 0.005049, 2) # standard deviation

# check vector statistics for 'WD_Nacelle'
self.assertAlmostEqual(means["WD_Nacelle"], 157.302, 2) # vector mean
self.assertAlmostEqual(
stdevs["WD_Nacelle"], 0.000, 2
) # vector standard deviation

def test_vector_statistics(self):
# load in vector variable
df = self.data["loads"]
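
The vector_channels argument flags direction-like columns (here the nacelle wind directions) whose statistics must be computed on the circle: a plain arithmetic mean of headings that straddle 0/360 degrees would be badly wrong. A sketch of the standard vector mean and the Yamartino spread estimate, the usual approach for such channels (not necessarily MHKiT's exact implementation):

    import numpy as np

    def vector_mean_deg(directions_deg):
        """Circular mean of compass directions, returned in [0, 360)."""
        rad = np.deg2rad(directions_deg)
        mean = np.arctan2(np.mean(np.sin(rad)), np.mean(np.cos(rad)))
        return np.rad2deg(mean) % 360.0

    def vector_std_deg(directions_deg):
        """Yamartino estimate of the circular standard deviation, in degrees."""
        rad = np.deg2rad(directions_deg)
        s, c = np.mean(np.sin(rad)), np.mean(np.cos(rad))
        eps = np.sqrt(max(0.0, 1.0 - (s * s + c * c)))
        return np.rad2deg(np.arcsin(eps) * (1.0 + (2.0 / np.sqrt(3.0) - 1.0) * eps**3))

    # Headings straddling north average to ~0 degrees, not ~180.
    print(vector_mean_deg([350.0, 10.0]))  # ~0.0
    print(vector_std_deg([350.0, 10.0]))   # ~10.0
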
8 changes: 5 additions & 3 deletions mhkit/tidal/io/noaa.py
@@ -124,7 +124,10 @@ def request_noaa_data(

# Use handle_caching to manage cache
cached_data, cached_metadata, cache_filepath = handle_caching(
hash_params, cache_dir, write_json=write_json, clear_cache_file=clear_cache
hash_params,
cache_dir,
cache_content={"data": None, "metadata": None, "write_json": write_json},
clear_cache_file=clear_cache,
)

if cached_data is not None:
@@ -205,8 +208,7 @@ def request_noaa_data(
handle_caching(
hash_params,
cache_dir,
data=data,
metadata=metadata,
cache_content={"data": data, "metadata": metadata, "write_json": None},
clear_cache_file=clear_cache,
)

8 changes: 8 additions & 0 deletions mhkit/utils/__init__.py
@@ -1,5 +1,12 @@
"""
This module initializes and imports the essential utility functions for data
conversion, statistical analysis, caching, and event detection for the
MHKiT library.
"""

from .time_utils import matlab_to_datetime, excel_to_datetime, index_to_datetime
from .stat_utils import (
_calculate_statistics,
Author comment (ssolson): New local function created to reduce the number of local variables in get_statistics.

get_statistics,
vector_statistics,
unwrap_vector,
@@ -15,4 +22,5 @@
convert_nested_dict_and_pandas,
)

# pylint: disable=invalid-name
Author comment (ssolson): Did not want to impact or change any MATLAB wrapping functionality.

_matlab = False # Private variable indicating if mhkit is run through matlab
182 changes: 68 additions & 114 deletions mhkit/utils/cache.py
@@ -42,7 +42,6 @@
import hashlib
import json
import os
import re
import shutil
import pickle
import pandas as pd
@@ -51,146 +50,101 @@
def handle_caching(
hash_params,
cache_dir,
data=None,
metadata=None,
write_json=None,
cache_content=None,
clear_cache_file=False,
):
"""
Handles caching of data to avoid redundant network requests or
computations.

The function checks if a cache file exists for the given parameters.
If it does, the function will load data from the cache file, unless
the `clear_cache_file` parameter is set to `True`, in which case the
cache file is cleared. If the cache file does not exist and the
`data` parameter is not `None`, the function will store the
provided data in a cache file.

Parameters
----------
hash_params : str
The parameters to be hashed and used as the filename for the cache file.
cache_dir : str
The directory where the cache files are stored.
data : pandas DataFrame or None
The data to be stored in the cache file. If `None`, the function
will attempt to load data from the cache file.
metadata : dict or None
Metadata associated with the data. This will be stored in the
cache file along with the data.
write_json : str or None
If specified, the cache file will be copied to a file with this name.
clear_cache_file : bool
If `True`, the cache file for the given parameters will be cleared.

Returns
-------
data : pandas DataFrame or None
The data loaded from the cache file. If data was provided as a
parameter, the same data will be returned. If the cache file
does not exist and no data was provided, `None` will be returned.
metadata : dict or None
The metadata loaded from the cache file. If metadata was provided
as a parameter, the same metadata will be returned. If the cache
file does not exist and no metadata was provided, `None` will be
returned.
cache_filepath : str
The path to the cache file.
"""

# Check if 'cdip' is in cache_dir, then use .pkl instead of .json
file_extension = (
".pkl"
if "cdip" in cache_dir or "hindcast" in cache_dir or "ndbc" in cache_dir
else ".json"
)

# Make cache directory if it doesn't exist
if not os.path.isdir(cache_dir):
os.makedirs(cache_dir)

# Create a unique filename based on the function parameters
cache_filename = (
hashlib.md5(hash_params.encode("utf-8")).hexdigest() + file_extension
)
cache_filepath = os.path.join(cache_dir, cache_filename)

# If clear_cache_file is True, remove the cache file for this request
if clear_cache_file and os.path.isfile(cache_filepath):
os.remove(cache_filepath)
print(f"Cleared cache for {cache_filepath}")

# If a cached file exists, load and return the data from the file
if os.path.isfile(cache_filepath) and data is None:
data = None
metadata = None

def _generate_cache_filepath():
"""Generates the cache file path based on the hashed parameters."""
file_extension = (
".pkl"
if "cdip" in cache_dir or "hindcast" in cache_dir or "ndbc" in cache_dir
else ".json"
)
cache_filename = (
hashlib.md5(hash_params.encode("utf-8")).hexdigest() + file_extension
)
return os.path.join(cache_dir, cache_filename), file_extension

def _clear_cache(cache_filepath):
"""Clear the cache file if requested."""
if clear_cache_file and os.path.isfile(cache_filepath):
os.remove(cache_filepath)
print(f"Cleared cache for {cache_filepath}")

def _load_cache(file_extension, cache_filepath):
"""Load data from the cache file based on its extension."""
nonlocal data, metadata # Specify that these are outer variables
if file_extension == ".json":
with open(cache_filepath, encoding="utf-8") as f:
jsonData = json.load(f)

# Extract metadata if it exists
if "metadata" in jsonData:
metadata = jsonData.pop("metadata", None)

# Check if index is datetime formatted
if all(
re.match(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}", str(dt))
for dt in jsonData["index"]
):
data = pd.DataFrame(
jsonData["data"],
index=pd.to_datetime(jsonData["index"]),
columns=jsonData["columns"],
)
else:
data = pd.DataFrame(
jsonData["data"],
index=jsonData["index"],
columns=jsonData["columns"],
)
json_data = json.load(f)

metadata = json_data.pop("metadata", None)

# Convert the rest to DataFrame
data = pd.DataFrame(
jsonData["data"],
index=pd.to_datetime(jsonData["index"]),
columns=jsonData["columns"],
json_data["data"],
index=pd.to_datetime(json_data["index"]),
columns=json_data["columns"],
)

elif file_extension == ".pkl":
with open(cache_filepath, "rb") as f:
data, metadata = pickle.load(f)

if write_json:
shutil.copy(cache_filepath, write_json)
return data, metadata

return data, metadata, cache_filepath

# If a cached file does not exist and data is provided,
# store the data in a cache file
elif data is not None:
def _write_cache(data, metadata, file_extension, cache_filepath):
"""Store data in the cache file based on the extension."""
if file_extension == ".json":
# Convert DataFrame to python dict
pyData = data.to_dict(orient="split")
# Add metadata to pyData
pyData["metadata"] = metadata
# Check if index is datetime indexed
py_data = data.to_dict(orient="split")
Author comment (ssolson): renamed to snake_case.

py_data["metadata"] = metadata
if isinstance(data.index, pd.DatetimeIndex):
pyData["index"] = [
dt.strftime("%Y-%m-%d %H:%M:%S") for dt in pyData["index"]
py_data["index"] = [
dt.strftime("%Y-%m-%d %H:%M:%S") for dt in py_data["index"]
]
else:
pyData["index"] = list(data.index)
py_data["index"] = list(data.index)
with open(cache_filepath, "w", encoding="utf-8") as f:
json.dump(pyData, f)

json.dump(py_data, f)
elif file_extension == ".pkl":
with open(cache_filepath, "wb") as f:
pickle.dump((data, metadata), f)

if write_json:
shutil.copy(cache_filepath, write_json)
# Create the cache directory if it doesn't exist
if not os.path.isdir(cache_dir):
os.makedirs(cache_dir)

# Generate cache filepath and extension
cache_filepath, file_extension = _generate_cache_filepath()

# Clear cache if requested
_clear_cache(cache_filepath)

# If cache file exists and cache_content["data"] is None, load from cache
if os.path.isfile(cache_filepath) and (
cache_content is None or cache_content["data"] is None
):
return _load_cache(file_extension, cache_filepath) + (cache_filepath,)

# Store data in cache if provided
if cache_content and cache_content["data"] is not None:
_write_cache(
cache_content["data"],
cache_content["metadata"],
file_extension,
cache_filepath,
)
if cache_content["write_json"]:
shutil.copy(cache_filepath, cache_content["write_json"])

return cache_content["data"], cache_content["metadata"], cache_filepath

return data, metadata, cache_filepath
# If data is not provided and the cache file doesn't exist, return cache_filepath
return None, None, cache_filepath
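
For reference, the JSON cache format written by _write_cache is pandas' "split" orientation with the metadata dict attached and any datetime index flattened to strings. A minimal round-trip sketch of that format (file name and values hypothetical):

    import json
    import pandas as pd

    df = pd.DataFrame(
        {"speed": [1.2, 1.4]},
        index=pd.to_datetime(["2024-01-01 00:00:00", "2024-01-01 01:00:00"]),
    )

    # Write: "split" orientation plus metadata; datetime index saved as strings.
    py_data = df.to_dict(orient="split")
    py_data["metadata"] = {"station": "12345"}  # hypothetical metadata
    py_data["index"] = [dt.strftime("%Y-%m-%d %H:%M:%S") for dt in df.index]
    with open("example_cache.json", "w", encoding="utf-8") as f:
        json.dump(py_data, f)

    # Read: pop the metadata, rebuild the DataFrame with a parsed datetime index.
    with open("example_cache.json", encoding="utf-8") as f:
        json_data = json.load(f)
    metadata = json_data.pop("metadata", None)
    restored = pd.DataFrame(
        json_data["data"],
        index=pd.to_datetime(json_data["index"]),
        columns=json_data["columns"],
    )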

