Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: integrate polars into quipus #47

Merged
merged 3 commits into from
Nov 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
172 changes: 171 additions & 1 deletion poetry.lock

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,12 @@ weasyprint = "62.3"
paramiko = "3.4.1,<3.5.0"
boto3 = "^1.35.34"
pandas = "^2.2.3"
polars = "^1.12.0"
psycopg = "^3.2.3"
psycopg-pool = "^3.2.3"
openpyxl = "^3.1.5"
XlsxWriter = "^3.2.0"
fastexcel = "^0.12.0"

[tool.poetry.group.dev.dependencies]
pylint = "^3.3.1"
Expand Down
59 changes: 35 additions & 24 deletions quipus/data_sources/csv_data_source.py
Original file line number Diff line number Diff line change
@@ -1,60 +1,71 @@
from typing import Optional, List
from pathlib import Path
from typing import Union, Optional, List

import pandas as pd
import polars as pl


class CSVDataSource:
"""
CSV DataSource class to manage data retrieval from CSV files.

Attributes:
file_path (str): Path to the CSV file.
file_path (Union[Path, str]): Path to the CSV file.
delimiter (str): Delimiter used in the CSV file.
encoding (str): Encoding of the CSV file.
dataframe (Optional[pd.DataFrame]): Loaded data as a pandas DataFrame.
dataframe (Optional[pl.DataFrame]): Loaded data as a polars DataFrame.
"""

def __init__(self, file_path: str, delimiter: str = ",", encoding: str = "utf-8"):
def __init__(
self,
file_path: Union[Path, str],
delimiter: str = ",",
encoding: str = "utf8"
):
self.file_path = file_path
self.delimiter = delimiter
self.encoding = encoding
self.dataframe: Optional[pd.DataFrame] = None
self.dataframe: Optional[pl.DataFrame] = None
self.__load_data()

def __load_data(self) -> None:
"""
Load data from the CSV file into a pandas DataFrame.
Load data from the CSV file into a polars DataFrame.
"""
self.dataframe = pd.read_csv(
self.file_path, delimiter=self.delimiter, encoding=self.encoding
self.dataframe = pl.read_csv(
source=self.file_path,
separator=self.delimiter,
encoding=self.encoding
)

@property
def file_path(self) -> str:
def file_path(self) -> Union[Path, str]:
"""
Get the path to the CSV file.

Returns:
str: Path to the CSV file.
Union[Path, str]: Path to the CSV file.
"""
return self.__file_path

@file_path.setter
def file_path(self, file_path: str) -> None:
def file_path(self, file_path: Union[Path, str]) -> None:
"""
Set the path to the CSV file.

Args:
file_path (str): Path to the CSV file.
file_path (Union[Path, str]): Path to the CSV file.

Raises:
TypeError: If 'file_path' is not a string or 'Path' object.
FileNotFoundError: If 'file_path' does not exist or is a directory.
"""
if not isinstance(file_path, str):
raise TypeError("'file_path' must be a string.")
if not file_path.strip():
raise ValueError("'file_path' cannot be an empty string.")
if not isinstance(file_path, (Path, str)):
raise TypeError("'file_path' must be either a string or 'Path' object.")

# Ensure that the path exists and is not a directory
file_path = Path(file_path) if isinstance(file_path, str) else file_path
if not file_path.exists() or file_path.is_dir():
raise FileNotFoundError(f"'{file_path}' does not exist.")
self.__file_path = file_path

@property
Expand Down Expand Up @@ -98,12 +109,12 @@ def encoding(self, encoding: str) -> None:
raise TypeError("'encoding' must be a string.")
self.__encoding = encoding

def fetch_data(self) -> pd.DataFrame:
def fetch_data(self) -> pl.DataFrame:
"""
Fetch all data from the CSV file as a pandas DataFrame.
Fetch all data from the CSV file as a polars DataFrame.

Returns:
pd.DataFrame: Data loaded from the CSV file.
pl.DataFrame: Data loaded from the CSV file.
"""
if self.dataframe is None:
raise RuntimeError("No data loaded from the CSV file.")
Expand All @@ -120,15 +131,15 @@ def get_columns(self) -> List[str]:
raise RuntimeError("No data loaded from the CSV file.")
return list(self.dataframe.columns)

def filter_data(self, query: str) -> pd.DataFrame:
def filter_data(self, query: str) -> pl.DataFrame:
"""
Filter the CSV data using a pandas query string.
Filter the CSV data using a polars SQL query string.

Args:
query (str): Query string to filter the data.

Returns:
pd.DataFrame: Filtered data based on the query.
pl.DataFrame: Filtered data based on the query.

Raises:
RuntimeError: If no data is loaded.
Expand All @@ -138,7 +149,7 @@ def filter_data(self, query: str) -> pd.DataFrame:
raise RuntimeError("No data loaded from the CSV file.")

try:
return self.dataframe.query(query)
return self.dataframe.sql(query=query)
except Exception as e:
raise ValueError(f"Invalid query: {query}") from e

Expand Down
32 changes: 16 additions & 16 deletions quipus/data_sources/dataframe_data_source.py
Original file line number Diff line number Diff line change
@@ -1,50 +1,50 @@
from typing import List

import pandas as pd
import polars as pl


class DataFrameDataSource:
"""
Pandas DataFrame DataSource to manage data retrieval from DataFrames.
Polars DataFrame DataSource to manage data retrieval from DataFrames.

Attributes:
dataframe (pd.DataFrame): DataFrame containing the data.
dataframe (pl.DataFrame): DataFrame containing the data.
"""

def __init__(self, dataframe: pd.DataFrame):
def __init__(self, dataframe: pl.DataFrame):
self.dataframe = dataframe

@property
def dataframe(self) -> pd.DataFrame:
def dataframe(self) -> pl.DataFrame:
"""
Get the DataFrame containing the data.

Returns:
pd.DataFrame: DataFrame containing the data.
pl.DataFrame: DataFrame containing the data.
"""
return self.__dataframe

@dataframe.setter
def dataframe(self, dataframe: pd.DataFrame) -> None:
def dataframe(self, dataframe: pl.DataFrame) -> None:
"""
Set the DataFrame containing the data.

Args:
dataframe (pd.DataFrame): DataFrame containing the data.
dataframe (pl.DataFrame): DataFrame containing the data.

Raises:
TypeError: If 'dataframe' is not a pandas DataFrame.
TypeError: If 'dataframe' is not a polars DataFrame.
"""
if not isinstance(dataframe, pd.DataFrame):
raise TypeError("'dataframe' must be a pandas DataFrame.")
if not isinstance(dataframe, pl.DataFrame):
raise TypeError("'dataframe' must be a polars DataFrame.")
self.__dataframe = dataframe

def fetch_data(self) -> pd.DataFrame:
def fetch_data(self) -> pl.DataFrame:
"""
Fetch data from the DataFrame.

Returns:
pd.DataFrame: DataFrame containing the data.
pl.DataFrame: DataFrame containing the data.
"""
if self.dataframe is None:
raise RuntimeError("No data loaded in the DataFrame.")
Expand All @@ -61,15 +61,15 @@ def get_columns(self) -> List[str]:
raise RuntimeError("No data loaded in the DataFrame.")
return list(self.dataframe.columns)

def filter_data(self, query: str) -> pd.DataFrame:
def filter_data(self, query: str) -> pl.DataFrame:
"""
Filter the data in the DataFrame using a query.

Args:
query (str): Query to filter the data.

Returns:
pd.DataFrame: Filtered DataFrame.
pl.DataFrame: Filtered DataFrame.

Raises:
RuntimeError: If no data is loaded in the DataFrame.
Expand All @@ -86,7 +86,7 @@ def filter_data(self, query: str) -> pd.DataFrame:
if query.strip() == "":
raise ValueError("Query cannot be an empty string.")

return self.dataframe.query(query)
return self.dataframe.sql(query)

def __str__(self) -> str:
"""
Expand Down
50 changes: 27 additions & 23 deletions quipus/data_sources/xlsx_data_source.py
Original file line number Diff line number Diff line change
@@ -1,42 +1,43 @@
from typing import Optional, List
from pathlib import Path
from typing import Union, Optional, List

import pandas as pd
import polars as pl


class XLSXDataSource:
"""
XLSX DataSource class to manage data retrieval from Excel (.xlsx) files.

Attributes:
file_path (str): Path to the Excel file.
file_path (Union[Path, str]): Path to the Excel file.
sheet_name (str): Name of the sheet to load from the Excel file.
dataframe (Optional[pd.DataFrame]): Loaded data as a pandas DataFrame.
dataframe (Optional[pl.DataFrame]): Loaded data as a polars DataFrame.
"""

def __init__(self, file_path: str, sheet_name: str):
def __init__(self, file_path: Union[Path, str], sheet_name: str):
self.file_path = file_path
self.sheet_name = sheet_name
self.dataframe: Optional[pd.DataFrame] = None
self.dataframe: Optional[pl.DataFrame] = None
self.__load_data()

def __load_data(self) -> None:
"""
Load data from the Excel file into a pandas DataFrame.
Load data from the Excel file into a polars DataFrame.
"""
self.dataframe = pd.read_excel(self.file_path, sheet_name=self.sheet_name)
self.dataframe = pl.read_excel(self.file_path, sheet_name=self.sheet_name)

@property
def file_path(self) -> str:
def file_path(self) -> Union[Path, str]:
"""
Get the path to the Excel file.

Returns:
str: Path to the Excel file.
Union[Path, str]: Path to the Excel file.
"""
return self.__file_path

@file_path.setter
def file_path(self, file_path: str) -> None:
def file_path(self, file_path: Union[Path, str]) -> None:
"""
Set the path to the Excel file.

Expand All @@ -47,11 +48,14 @@ def file_path(self, file_path: str) -> None:
TypeError: If 'file_path' is not a string or 'Path' object.
FileNotFoundError: If 'file_path' does not exist or is a directory.
"""
if not isinstance(file_path, str):
raise TypeError("'file_path' must be a string.")
if not file_path.strip():
raise ValueError("'file_path' cannot be an empty string.")
self.__file_path = file_path
if not isinstance(file_path, (Path, str)):
raise TypeError("'file_path' must be either a string or 'Path' object.")

# Ensure that the path exists and is not a directory
path = Path(file_path) if isinstance(file_path, str) else file_path
if not path.exists() or path.is_dir():
raise FileNotFoundError(f"'{file_path}' does not exist.")
self.__file_path = path

@property
def sheet_name(self) -> str:
Expand All @@ -77,12 +81,12 @@ def sheet_name(self, sheet_name: str) -> None:
raise TypeError("'sheet_name' must be a string.")
self.__sheet_name = sheet_name

def fetch_data(self) -> pd.DataFrame:
def fetch_data(self) -> pl.DataFrame:
"""
Fetch all data from the Excel sheet as a pandas DataFrame.
Fetch all data from the Excel sheet as a polars DataFrame.

Returns:
pd.DataFrame: Data loaded from the Excel sheet.
pl.DataFrame: Data loaded from the Excel sheet.
"""
if self.dataframe is None:
raise RuntimeError("No data loaded from the Excel file.")
Expand All @@ -99,15 +103,15 @@ def get_columns(self) -> List[str]:
raise RuntimeError("No data loaded from the Excel file.")
return list(self.dataframe.columns)

def filter_data(self, query: str) -> pd.DataFrame:
def filter_data(self, query: str) -> pl.DataFrame:
"""
Filter the Excel data using a pandas query string.
Filter the Excel data using a polars SQL query string.

Args:
query (str): Query string to filter the data.

Returns:
pd.DataFrame: Filtered data based on the query.
pl.DataFrame: Filtered data based on the query.

Raises:
RuntimeError: If no data is loaded.
Expand All @@ -117,7 +121,7 @@ def filter_data(self, query: str) -> pd.DataFrame:
raise RuntimeError("No data loaded from the Excel file.")

try:
return self.dataframe.query(query)
return self.dataframe.sql(query)
except Exception:
raise ValueError("Invalid query provided.")

Expand Down
Loading
Loading