Skip to content

Commit

Permalink
Add is_like to Schema (#129)
Browse files Browse the repository at this point in the history
* Add is_like to Schema

* update

* update

* update
  • Loading branch information
goodwanghan authored Mar 18, 2024
1 parent e7b4a40 commit 61eed0a
Show file tree
Hide file tree
Showing 6 changed files with 246 additions and 14 deletions.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ pip install triad

## Release History

### 0.9.6

* Add `is_like` to Schema to compare similar schemas

### 0.9.5

* Add parse json column function to pyarrow utils
Expand Down
4 changes: 4 additions & 0 deletions tests/collections/test_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,10 @@ def test_schema_eq():
assert s == [("a", "int"), ("b", str)]
assert s == OrderedDict([("a", "int"), ("b", str)])

assert s.is_like(s)
assert not s.is_like("a:long,b:str")
assert s.is_like("a:long,b:str", equal_groups=[(pa.types.is_integer,)])


def test_schema_contains():
s = Schema("a:int,b:str,``:str")
Expand Down
73 changes: 70 additions & 3 deletions tests/utils/test_pyarrow.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import json
import warnings
from datetime import date, datetime
import json

import numpy as np
import pandas as pd
import pyarrow as pa
Expand All @@ -21,7 +22,11 @@
get_alter_func,
get_eq_func,
is_supported,
pa_batch_to_dicts,
pa_datatypes_equal,
pa_schemas_equal,
pa_table_to_pandas,
parse_json_columns,
replace_type,
replace_types_in_schema,
replace_types_in_table,
Expand All @@ -30,8 +35,6 @@
to_pa_datatype,
to_pandas_dtype,
to_single_pandas_dtype,
pa_batch_to_dicts,
parse_json_columns,
)


Expand Down Expand Up @@ -160,6 +163,70 @@ def test_to_pa_datatype():
raises(TypeError, lambda: to_pa_datatype(None))


def test_pa_datatypes_equal():
    """Exercise ``pa_datatypes_equal`` on primitive, list, struct and map types."""
    ints_alike = [(pa.types.is_integer,)]
    numbers_alike = [(pa.types.is_integer, pa.types.is_floating)]

    # primitives: identity, plain equality, and equality through groups
    i32 = pa.int32()
    assert pa_datatypes_equal(i32, i32)
    assert pa_datatypes_equal(pa.int32(), pa.int32())
    assert not pa_datatypes_equal(pa.int32(), pa.int64())
    assert pa_datatypes_equal(pa.int32(), pa.int64(), equal_groups=ints_alike)
    assert pa_datatypes_equal(pa.int32(), pa.float64(), equal_groups=numbers_alike)

    # lists: the item name only matters when explicitly requested
    named_list = pa.list_(pa.field("a", pa.int32()))
    plain_list = pa.list_(pa.int32())
    assert pa_datatypes_equal(named_list, plain_list)
    assert not pa_datatypes_equal(named_list, plain_list, ignore_list_item_name=False)

    # structs: field count, field names and field types must all agree
    struct_a32 = pa.struct([pa.field("a", pa.int32())])
    struct_a64 = pa.struct([("a", pa.int64())])
    struct_ab32 = pa.struct([pa.field("a", pa.int32()), pa.field("b", pa.int32())])
    struct_b64 = pa.struct([("b", pa.int64())])
    assert not pa_datatypes_equal(struct_a32, struct_a64)
    assert not pa_datatypes_equal(struct_ab32, struct_a64)
    assert pa_datatypes_equal(struct_a32, struct_a64, equal_groups=ints_alike)
    assert not pa_datatypes_equal(struct_a32, struct_b64, equal_groups=ints_alike)

    # maps: key and item types are compared with the same rules
    map_i32 = pa.map_(pa.int32(), pa.string())
    map_i64 = pa.map_(pa.int64(), pa.string())
    assert not pa_datatypes_equal(map_i32, map_i64)
    assert pa_datatypes_equal(map_i32, map_i64, equal_groups=ints_alike)


def test_pa_schema_eq():
    """Exercise ``pa_schemas_equal`` on equal, widened, renamed and relabeled schemas."""
    base = expression_to_schema("a:int,b:[str]")
    twin = expression_to_schema("a:int,b:[str]")
    widened = expression_to_schema("a:long,b:[str]")

    # the same object is equal in both list-item-name modes
    assert pa_schemas_equal(base, base)
    assert pa_schemas_equal(base, base, ignore_list_item_name=False)

    # equal content, then a widened integer column with and without groups
    assert pa_schemas_equal(base, twin)
    assert not pa_schemas_equal(base, widened)
    assert pa_schemas_equal(base, widened, equal_groups=[(pa.types.is_integer,)])

    # same columns, but the list item of ``b`` is named "a"
    relabeled = pa.schema(
        [
            pa.field("a", pa.int32()),
            pa.field("b", pa.list_(pa.field("a", pa.string()))),
        ]
    )
    assert pa_schemas_equal(base, relabeled)
    assert not pa_schemas_equal(base, relabeled, ignore_list_item_name=False)

    # a different column name is never equal
    renamed = expression_to_schema("aa:int,b:[str]")
    assert not pa_schemas_equal(base, renamed)


def test_to_single_pandas_dtype():
assert np.bool_ == to_single_pandas_dtype(pa.bool_(), False)
assert np.int16 == to_single_pandas_dtype(pa.int16(), False)
Expand Down
51 changes: 41 additions & 10 deletions triad/collections/schema.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from collections import OrderedDict
from typing import Any, Dict, List, Optional, Set, Tuple
from typing import Any, Callable, Dict, List, Optional, Set, Tuple

import numpy as np
import pandas as pd
Expand All @@ -12,6 +12,7 @@
expression_to_schema,
is_supported,
schema_to_expression,
pa_schemas_equal,
to_pa_datatype,
to_pandas_dtype,
)
Expand Down Expand Up @@ -267,15 +268,7 @@ def __eq__(self, other: Any) -> bool:
:param other: a schema like object
:return: True if the two schemas are equal
"""
if other is None:
return False
if other is self:
return True
if isinstance(other, Schema):
return super().__eq__(other)
if isinstance(other, str):
return self.__repr__() == other
return self == Schema(other)
return self.is_like(other)

def __contains__(self, key: Any) -> bool: # noqa: C901
"""Check if the schema contains the key.
Expand Down Expand Up @@ -306,6 +299,44 @@ def __contains__(self, key: Any) -> bool: # noqa: C901
return True
return Schema(key) in self

def is_like(
    self,
    other: Any,
    equal_groups: Optional[List[List[Callable[[pa.DataType], bool]]]] = None,
) -> bool:
    """Tell whether ``other`` describes the same, or a similar, schema.

    :param other: a schema like object
    :param equal_groups: groups of pyarrow type predicates forwarded to
        ``pa_schemas_equal`` to relax the type comparison, default None
    :return: True if the two schemas are considered equal

    .. note::

        When ``other`` is a string and ``equal_groups`` is None, the string
        is compared verbatim with ``self.__repr__()`` and is not parsed.

    .. admonition:: Examples

        .. code-block:: python

            s = Schema("a:int,b:str")
            assert s.is_like("a:int,b:str")
            assert not s.is_like("a:long,b:str")
            assert s.is_like("a:long,b:str", equal_groups=[(pa.types.is_integer,)])
    """
    if other is None:
        return False
    if other is self:
        return True
    if not isinstance(other, Schema):
        if isinstance(other, str) and equal_groups is None:
            # strict string comparison: no need to build a Schema
            return self.__repr__() == other
        # any other schema like object is normalized first
        other = Schema(other)
    return pa_schemas_equal(
        self.pa_schema, other.pa_schema, equal_groups=equal_groups
    )

def append(self, obj: Any) -> "Schema": # noqa: C901
"""Append schema like object to the current schema. Only new columns
are allowed.
Expand Down
126 changes: 126 additions & 0 deletions triad/utils/pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,132 @@ def to_pa_datatype(obj: Any) -> pa.DataType: # noqa: C901
return pa.from_numpy_dtype(np.dtype(obj))


def pa_datatypes_equal(  # noqa: C901
    t1: pa.DataType,
    t2: pa.DataType,
    ignore_list_item_name: bool = True,
    equal_groups: Optional[List[List[Callable[[pa.DataType], bool]]]] = None,
) -> bool:
    """Check if two pyarrow data types are equal

    :param t1: the first pyarrow data type
    :param t2: the second pyarrow data type
    :param ignore_list_item_name: whether to ignore list item name,
        defaults to True. When False, list item names must match at every
        nesting level (lists inside lists, structs and maps included)
    :param equal_groups: a list of groups of functions to check equality,
        defaults to None. Two types are equal if, for some group, each of
        them satisfies at least one function of that group
    :return: if the two data types are equal

    .. note::

        In the latest version of pyarrow, in the default comparison logic,
        list field names are not compared.

    .. admonition:: Examples

        .. code-block:: python

            assert not pa_datatypes_equal(pa.int32(), pa.int64())
            assert pa_datatypes_equal(
                pa.int32(),
                pa.int64(),
                equal_groups=[[pa.types.is_integer]],
            )
    """
    if t1 is t2:
        return True
    if (
        not ignore_list_item_name
        and pa.types.is_list(t1)
        and pa.types.is_list(t2)
        and t1.value_field.name != t2.value_field.name
    ):
        return False
    # pyarrow's ``==`` may ignore list item names at any depth, so in strict
    # mode it is trusted only when no nested list item name differs;
    # otherwise fall through to the structural comparison below
    if t1 == t2 and (ignore_list_item_name or _pa_list_item_names_match(t1, t2)):
        return True
    if equal_groups is not None:
        for group in equal_groups:
            if any(f(t1) for f in group) and any(f(t2) for f in group):
                return True
    params: Dict[str, Any] = dict(  # noqa: C408
        ignore_list_item_name=ignore_list_item_name,
        equal_groups=equal_groups,
    )
    if pa.types.is_list(t1) and pa.types.is_list(t2):  # pragma: no cover
        # for lower version of pyarrow, list field names are compared
        # for higher version of pyarrow, list field names are ignored
        return pa_datatypes_equal(t1.value_type, t2.value_type, **params)
    if pa.types.is_struct(t1) and pa.types.is_struct(t2):
        if len(t1) != len(t2):
            return False
        for f1, f2 in zip(t1, t2):
            if f1.name != f2.name:
                return False
            if not pa_datatypes_equal(f1.type, f2.type, **params):
                return False
        return True
    if pa.types.is_map(t1) and pa.types.is_map(t2):
        return pa_datatypes_equal(
            t1.key_type, t2.key_type, **params
        ) and pa_datatypes_equal(t1.item_type, t2.item_type, **params)

    return False


def _pa_list_item_names_match(t1: pa.DataType, t2: pa.DataType) -> bool:
    """Check that corresponding list item names agree at every nesting level.

    Only list, struct and map types are walked; any other pair of types is
    treated as a leaf and considered matching.
    """
    if pa.types.is_list(t1) and pa.types.is_list(t2):
        return t1.value_field.name == t2.value_field.name and (
            _pa_list_item_names_match(t1.value_type, t2.value_type)
        )
    if pa.types.is_struct(t1) and pa.types.is_struct(t2):
        return all(
            _pa_list_item_names_match(f1.type, f2.type) for f1, f2 in zip(t1, t2)
        )
    if pa.types.is_map(t1) and pa.types.is_map(t2):
        return _pa_list_item_names_match(
            t1.key_type, t2.key_type
        ) and _pa_list_item_names_match(t1.item_type, t2.item_type)
    return True


def pa_schemas_equal(
    s1: pa.Schema,
    s2: pa.Schema,
    ignore_list_item_name: bool = True,
    equal_groups: Optional[List[List[Callable[[pa.DataType], bool]]]] = None,
) -> bool:
    """Compare two pyarrow schemas column by column

    :param s1: the first pyarrow schema
    :param s2: the second pyarrow schema
    :param ignore_list_item_name: whether to ignore list item name,
        defaults to True
    :param equal_groups: a list of groups of functions to check equality,
        defaults to None
    :return: if the two schemas are equal

    .. note::

        In the latest version of pyarrow, in the default comparison logic,
        list field names are not compared.

    .. admonition:: Examples

        .. code-block:: python

            s1 = pa.schema([("a", pa.int32()), ("b", pa.string())])
            s2 = pa.schema([("a", pa.int64()), ("b", pa.string())])
            assert not pa_schemas_equal(s1, s2)
            assert pa_schemas_equal(
                s1,
                s2,
                equal_groups=[[pa.types.is_integer]],
            )
    """
    if s1 is s2:
        return True
    # the native comparison is used as a fast path only when list item
    # names are allowed to differ
    if ignore_list_item_name and s1.equals(s2):
        return True
    if s1.names != s2.names:
        return False
    return all(
        pa_datatypes_equal(
            left.type,
            right.type,
            ignore_list_item_name=ignore_list_item_name,
            equal_groups=equal_groups,
        )
        for left, right in zip(s1, s2)
    )


def cast_pa_array(col: pa.Array, new_type: pa.DataType) -> pa.Array: # noqa: C901
old_type = col.type
if new_type.equals(old_type):
Expand Down
2 changes: 1 addition & 1 deletion triad_version/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
# flake8: noqa
__version__ = "0.9.5"
__version__ = "0.9.6"

0 comments on commit 61eed0a

Please sign in to comment.