Skip to content

Commit

Permalink
feat: more hash, sizeof and eq implementations (#609)
Browse files Browse the repository at this point in the history
### Summary of Changes

- add specific hash implementations to classical classification and
regression models
- use more properties of the table transformer to calculate its hash
- add `__eq__` to imputer strategies and SVM kernels
- add general, reusable structural hashing infrastructure for future
use
  • Loading branch information
WinPlay02 authored Apr 10, 2024
1 parent 8b9c7a9 commit 2bc0b0a
Show file tree
Hide file tree
Showing 37 changed files with 772 additions and 52 deletions.
7 changes: 7 additions & 0 deletions src/safeds/_utils/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""Utilities for Safe-DS."""

from ._hashing import _structural_hash

__all__ = [
"_structural_hash",
]
69 changes: 69 additions & 0 deletions src/safeds/_utils/_hashing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import functools
import operator
import struct
from typing import Any

import xxhash


def _structural_hash(*value: Any) -> int:
    """
    Compute a structural hash over all provided values.

    The positional arguments are gathered into one tuple, serialized with `_value_to_bytes` and the resulting bytes
    are digested with `xxhash.xxh3_64`. The result is only as deterministic as the byte representation of the
    individual values.

    Parameters
    ----------
    value
        Any number of values that should contribute to the hash.

    Returns
    -------
    hash
        Integer digest of the serialized values.
    """
    serialized = _value_to_bytes(value)
    digest = xxhash.xxh3_64(serialized)
    return digest.intdigest()


def _value_to_bytes(value: Any) -> bytes:
    """
    Convert a value to a byte representation that can be used for structural hashing.

    Containers are serialized recursively and prefixed with their length (8 bytes, big-endian). The entries of sets
    and the key/value pairs of dicts are sorted by their byte representation, so the iteration order of these
    containers does not influence the result.

    Notes
    -----
    The representation is not injective: distinct values may map to the same bytes (e.g. `None` and `False` both
    yield `b"\\0"`). This is acceptable for hashing but the result must not be used to reconstruct a value.

    Parameters
    ----------
    value
        Object to convert to a byte representation.

    Returns
    -------
    bytes
        Byte representation of the provided value.
    """
    if value is None:
        return b"\0"
    if isinstance(value, bytes):
        return value
    # bool must be handled before int, because bool is a subclass of int.
    if isinstance(value, bool):
        return b"\1" if value else b"\0"
    if isinstance(value, int):
        is_negative = value < 0
        try:
            # Fixed width of 8 bytes for every integer that fits.
            return value.to_bytes(8, "big", signed=is_negative)
        except OverflowError:
            # Arbitrarily large integers would otherwise make hashing crash; `bit_length() // 8 + 1` bytes are
            # always enough, including the sign bit.
            return value.to_bytes(value.bit_length() // 8 + 1, "big", signed=is_negative)
    if isinstance(value, str):
        return value.encode("utf-8")
    if isinstance(value, float):
        # "d" packs in native byte order, so the bytes depend on the endianness of the platform.
        return struct.pack("d", value)
    if isinstance(value, (list, tuple)):
        return len(value).to_bytes(8, "big") + b"".join(_value_to_bytes(entry) for entry in value)
    if isinstance(value, (frozenset, set)):
        return len(value).to_bytes(8, "big") + b"".join(sorted(_value_to_bytes(entry) for entry in value))
    if isinstance(value, dict):
        pairs = sorted(_value_to_bytes(key) + _value_to_bytes(entry) for key, entry in value.items())
        return len(value).to_bytes(8, "big") + b"".join(pairs)
    # Any other object contributes its own `__hash__`; how stable that is depends on the object's implementation.
    return _value_to_bytes(hash(value))
44 changes: 26 additions & 18 deletions src/safeds/data/image/containers/_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@

import torch
import torch.nn.functional as func
import xxhash
from PIL.Image import open as pil_image_open
from torch import Tensor

from safeds._config import _get_device
from safeds._utils import _structural_hash

if TYPE_CHECKING:
from torch.types import Device
Expand Down Expand Up @@ -119,7 +119,7 @@ def __hash__(self) -> int:
hash : int
The hash value.
"""
return xxhash.xxh3_64(self.width.to_bytes(8) + self.height.to_bytes(8) + self.channel.to_bytes(8)).intdigest()
return _structural_hash(self.width, self.height, self.channel)

def __sizeof__(self) -> int:
"""
Expand Down Expand Up @@ -301,10 +301,12 @@ def convert_to_grayscale(self) -> Image:
"""
if self.channel == 4:
return Image(
torch.cat([
func2.rgb_to_grayscale(self._image_tensor[0:3], num_output_channels=3),
self._image_tensor[3].unsqueeze(dim=0),
]),
torch.cat(
[
func2.rgb_to_grayscale(self._image_tensor[0:3], num_output_channels=3),
self._image_tensor[3].unsqueeze(dim=0),
],
),
device=self.device,
)
else:
Expand Down Expand Up @@ -391,10 +393,12 @@ def adjust_brightness(self, factor: float) -> Image:
)
if self.channel == 4:
return Image(
torch.cat([
func2.adjust_brightness(self._image_tensor[0:3], factor * 1.0),
self._image_tensor[3].unsqueeze(dim=0),
]),
torch.cat(
[
func2.adjust_brightness(self._image_tensor[0:3], factor * 1.0),
self._image_tensor[3].unsqueeze(dim=0),
],
),
device=self.device,
)
else:
Expand Down Expand Up @@ -462,10 +466,12 @@ def adjust_contrast(self, factor: float) -> Image:
)
if self.channel == 4:
return Image(
torch.cat([
func2.adjust_contrast(self._image_tensor[0:3], factor * 1.0),
self._image_tensor[3].unsqueeze(dim=0),
]),
torch.cat(
[
func2.adjust_contrast(self._image_tensor[0:3], factor * 1.0),
self._image_tensor[3].unsqueeze(dim=0),
],
),
device=self.device,
)
else:
Expand Down Expand Up @@ -562,10 +568,12 @@ def sharpen(self, factor: float) -> Image:
)
if self.channel == 4:
return Image(
torch.cat([
func2.adjust_sharpness(self._image_tensor[0:3], factor * 1.0),
self._image_tensor[3].unsqueeze(dim=0),
]),
torch.cat(
[
func2.adjust_sharpness(self._image_tensor[0:3], factor * 1.0),
self._image_tensor[3].unsqueeze(dim=0),
],
),
device=self.device,
)
else:
Expand Down
4 changes: 2 additions & 2 deletions src/safeds/data/tabular/containers/_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
import numpy as np
import pandas as pd
import seaborn as sns
import xxhash

from safeds._utils import _structural_hash
from safeds.data.image.containers import Image
from safeds.data.tabular.typing import ColumnType
from safeds.exceptions import (
Expand Down Expand Up @@ -201,7 +201,7 @@ def __hash__(self) -> int:
hash : int
The hash value.
"""
return xxhash.xxh3_64(self.name.encode("utf-8") + self.type.__repr__().encode("utf-8") + self.number_of_rows.to_bytes(8)).intdigest()
return _structural_hash(self.name, self.type.__repr__(), self.number_of_rows)

def __iter__(self) -> Iterator[T]:
r"""
Expand Down
7 changes: 3 additions & 4 deletions src/safeds/data/tabular/containers/_row.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
from __future__ import annotations

import sys
import functools
import operator
import sys
from collections.abc import Callable, Mapping
from typing import TYPE_CHECKING, Any

import pandas as pd
import xxhash

from safeds._utils import _structural_hash
from safeds.data.tabular.typing import ColumnType, Schema
from safeds.exceptions import UnknownColumnNameError

Expand Down Expand Up @@ -227,7 +226,7 @@ def __hash__(self) -> int:
hash : int
The hash value.
"""
return xxhash.xxh3_64(hash(self._schema).to_bytes(8) + functools.reduce(operator.add, [xxhash.xxh3_64(str(self.get_value(value))).intdigest().to_bytes(8) for value in self], b"\0")).intdigest()
return _structural_hash(self._schema, [str(self.get_value(value)) for value in self])

def __iter__(self) -> Iterator[Any]:
"""
Expand Down
4 changes: 2 additions & 2 deletions src/safeds/data/tabular/containers/_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@
import pandas as pd
import seaborn as sns
import torch
import xxhash
from pandas import DataFrame
from scipy import stats
from torch.utils.data import DataLoader, Dataset

from safeds._utils import _structural_hash
from safeds.data.image.containers import Image
from safeds.data.tabular.typing import ColumnType, Schema
from safeds.exceptions import (
Expand Down Expand Up @@ -469,7 +469,7 @@ def __hash__(self) -> int:
hash : int
The hash value.
"""
return xxhash.xxh3_64(hash(self._schema).to_bytes(8) + self.number_of_rows.to_bytes(8)).intdigest()
return _structural_hash(self._schema, self.number_of_rows)

def __repr__(self) -> str:
r"""
Expand Down
6 changes: 2 additions & 4 deletions src/safeds/data/tabular/containers/_tagged_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@

import numpy as np
import torch
import xxhash
from torch.utils.data import DataLoader, Dataset

from safeds._utils import _structural_hash
from safeds.data.tabular.containers import Column, Row, Table
from safeds.exceptions import (
ColumnIsTargetError,
Expand Down Expand Up @@ -193,9 +193,7 @@ def __hash__(self) -> int:
hash : int
The hash value.
"""
return xxhash.xxh3_64(
hash(self.target).to_bytes(8) + hash(self.features).to_bytes(8) + Table.__hash__(self).to_bytes(8),
).intdigest()
return _structural_hash(self.target, self.features, Table.__hash__(self))

def __sizeof__(self) -> int:
"""
Expand Down
9 changes: 2 additions & 7 deletions src/safeds/data/tabular/containers/_time_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import xxhash

from safeds._utils import _structural_hash
from safeds.data.image.containers import Image
from safeds.data.tabular.containers import Column, Row, Table, TaggedTable
from safeds.exceptions import (
Expand Down Expand Up @@ -295,12 +295,7 @@ def __hash__(self) -> int:
hash : int
The hash value.
"""
return xxhash.xxh3_64(
hash(self.time).to_bytes(8)
+ hash(self.target).to_bytes(8)
+ hash(self.features).to_bytes(8)
+ Table.__hash__(self).to_bytes(8),
).intdigest()
return _structural_hash(self.time, self.target, self.features, Table.__hash__(self))

def __sizeof__(self) -> int:
"""
Expand Down
41 changes: 41 additions & 0 deletions src/safeds/data/tabular/transformation/_imputer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import sys
import warnings
from typing import Any

Expand Down Expand Up @@ -47,9 +48,28 @@ class Constant(ImputerStrategy):
The given value to impute missing values.
"""

def __eq__(self, other: object) -> bool:
    """
    Compare this constant strategy with another object.

    Returns `NotImplemented` for anything that is not a `Constant` strategy. Otherwise, the strategies are equal
    if they are the same object or if their stored values compare equal.
    """
    if isinstance(other, Imputer.Strategy.Constant):
        return self is other or self._value == other._value
    return NotImplemented

__hash__ = ImputerStrategy.__hash__

def __init__(self, value: Any):
    """Store the value that is used to replace missing values."""
    self._value = value

def __sizeof__(self) -> int:
    """
    Return the size of this object.

    Only the stored value is measured, using `sys.getsizeof`.

    Returns
    -------
    Size of the stored value in bytes.
    """
    stored_value = self._value
    return sys.getsizeof(stored_value)

def __str__(self) -> str:
    """Return the strategy name together with the stored value."""
    description = f"Constant({self._value})"
    return description

Expand All @@ -60,6 +80,13 @@ def _augment_imputer(self, imputer: sk_SimpleImputer) -> None:
class Mean(ImputerStrategy):
"""An imputation strategy for imputing missing data with mean values."""

def __eq__(self, other: object) -> bool:
    """
    Compare this mean strategy with another object.

    All `Mean` strategies are equal to each other; for any other object `NotImplemented` is returned.
    """
    if isinstance(other, Imputer.Strategy.Mean):
        return True
    return NotImplemented

__hash__ = ImputerStrategy.__hash__

def __str__(self) -> str:
    """Return the name of this strategy."""
    strategy_name = "Mean"
    return strategy_name

Expand All @@ -69,6 +96,13 @@ def _augment_imputer(self, imputer: sk_SimpleImputer) -> None:
class Median(ImputerStrategy):
"""An imputation strategy for imputing missing data with median values."""

def __eq__(self, other: object) -> bool:
    """
    Compare this median strategy with another object.

    All `Median` strategies are equal to each other; for any other object `NotImplemented` is returned.
    """
    if isinstance(other, Imputer.Strategy.Median):
        return True
    return NotImplemented

__hash__ = ImputerStrategy.__hash__

def __str__(self) -> str:
    """Return the name of this strategy."""
    strategy_name = "Median"
    return strategy_name

Expand All @@ -78,6 +112,13 @@ def _augment_imputer(self, imputer: sk_SimpleImputer) -> None:
class Mode(ImputerStrategy):
"""An imputation strategy for imputing missing data with mode values. The lowest value will be used if there are multiple values with the same highest count."""

def __eq__(self, other: object) -> bool:
    """
    Compare this mode strategy with another object.

    All `Mode` strategies are equal to each other; for any other object `NotImplemented` is returned.
    """
    if isinstance(other, Imputer.Strategy.Mode):
        return True
    return NotImplemented

__hash__ = ImputerStrategy.__hash__

def __str__(self) -> str:
    """Return the name of this strategy."""
    strategy_name = "Mode"
    return strategy_name

Expand Down
7 changes: 5 additions & 2 deletions src/safeds/data/tabular/transformation/_table_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING

import xxhash
from safeds._utils import _structural_hash

if TYPE_CHECKING:
from safeds.data.tabular.containers import Table
Expand All @@ -21,7 +21,10 @@ def __hash__(self) -> int:
hash : int
The hash value.
"""
return xxhash.xxh3_64(self.__class__.__qualname__.encode("utf-8") + (1 if self.is_fitted() else 0).to_bytes(1)).intdigest()
added = self.get_names_of_added_columns() if self.is_fitted() else []
changed = self.get_names_of_changed_columns() if self.is_fitted() else []
removed = self.get_names_of_removed_columns() if self.is_fitted() else []
return _structural_hash(self.__class__.__qualname__, self.is_fitted(), added, changed, removed)

@abstractmethod
def fit(self, table: Table, column_names: list[str] | None) -> TableTransformer:
Expand Down
Loading

0 comments on commit 2bc0b0a

Please sign in to comment.