You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
import functools
import operator
from typing import Union, List, Optional, Tuple
import pyarrow.compute as pc
import pyarrow as pa
def _filters_to_expression(
    filters: Optional[Union[pc.Expression, List[Tuple], List[List[Tuple]]]] = None
) -> Optional[pc.Expression]:
    """Build a pyarrow compute expression from DNF filters and/or expressions.

    See `pyarrow_read_table <https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html>`_ .

    :param filters: the filter specification (Default value = None). One of:

        * ``None`` or an empty sequence -- nothing to filter on;
        * a ``pc.Expression`` -- returned unchanged;
        * a ``(field, op, value)`` tuple -- a single predicate;
        * a list whose items are predicates, expressions or nested lists --
          the items are combined with AND;
        * a list whose items are all lists -- every inner list is converted
          on its own and the results are combined with OR.

        Supported ``op`` values: ``=``, ``==``, ``is``, ``!=``, ``is not``,
        ``in``, ``not in``, ``<``, ``<=``, ``>``, ``>=``, ``between``,
        ``not between``, ``like``, ``not like``, ``match_like``,
        ``starts_with``, ``ends_with`` and the ``not``-prefixed forms of the
        last three. ``=``/``is`` and ``!=``/``is not`` with a ``None`` value
        become null checks.
    :returns: the combined pyarrow expression, or ``None`` when ``filters``
        is empty
    :raises ValueError:
        raised when a predicate is not a 3-item tuple or when the operation
        requested is not valid or supported
    """
    # Expressions have to be recognised before any truthiness test: pyarrow
    # expressions refuse to be evaluated as a Python bool.
    if isinstance(filters, pc.Expression):
        return filters

    # None, [] and () all mean "no filter". Handling this before the list
    # branch keeps reduce() from being called on an empty sequence.
    if not filters:
        return None

    if isinstance(filters, list):
        # Convert every item first; items may be tuples, expressions or lists.
        parts = [_filters_to_expression(item) for item in filters]

        if all(isinstance(item, list) for item in filters):
            # List of lists: OR of the converted groups. A group that
            # converted to None imposes no restriction, so the OR as a whole
            # imposes none either.
            if any(part is None for part in parts):
                return None
            return functools.reduce(operator.or_, parts)

        # Flat (or mixed) list: AND of the items. Empty items impose no
        # restriction and are skipped.
        parts = [part for part in parts if part is not None]
        if not parts:
            return None
        return functools.reduce(operator.and_, parts)

    # Anything left must be a single (field, op, value) predicate. Raise
    # explicitly rather than assert so the check survives `python -O`.
    if not isinstance(filters, tuple) or len(filters) != 3:
        raise ValueError(
            f"Expected a (field, op, value) tuple, got {filters!r}"
        )

    name, op, val = filters
    field = pc.field(name)

    if op in ("=", "==", "is"):
        if val is None:
            return pc.is_null(field)
        return field == val
    if op in ("!=", "is not"):
        if val is None:
            return pc.invert(pc.is_null(field))
        return field != val
    if op == "in":
        return pc.is_in(field, pa.array(val))
    if op == "not in":
        return pc.invert(pc.is_in(field, pa.array(val)))
    if op == "<":
        return field < val
    if op == "<=":
        return field <= val
    if op == ">":
        return field > val
    if op == ">=":
        return field >= val
    if op == "between":
        # Sort so the bounds may be given in any order; the range is
        # inclusive of the smallest and largest value.
        bounds = pa.array(val).sort()
        return (field >= bounds[0]) & (field <= bounds[-1])
    if op == "not between":
        bounds = pa.array(val).sort()
        return (field < bounds[0]) | (field > bounds[-1])
    if op == "like":
        return pc.match_like(field, val)
    if op == "not like":
        return pc.invert(pc.match_like(field, val))
    if op in ("match_like", "starts_with", "ends_with"):
        # The operator name is the name of the pyarrow.compute function.
        func = getattr(pc, op)
        return func(field, val)
    if op in ("not match_like", "not starts_with", "not ends_with"):
        # Strip the leading "not " to get the compute function, then negate.
        func = getattr(pc, op[len("not "):])
        return pc.invert(func(field, val))

    raise ValueError(f"Not supported operator {op}")
Component(s)
C++, Python
The text was updated successfully, but these errors were encountered:
Describe the enhancement requested
pyarrow.parquet supports DNF filtering. Add enhanced DNF filtering to pyarrow.dataset.
https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html
The samples below demonstrate how you can mix pyarrow compute expressions with DNF expressions.
Additional question: Is there an easy way to map pyarrow.compute functions as DNF operators using pyarrow.compute.get_function()?
Component(s)
C++, Python
The text was updated successfully, but these errors were encountered: