From eb63209debca919c5464d72676d1d88d3ed4e6a0 Mon Sep 17 00:00:00 2001 From: Ratchet Date: Mon, 8 Apr 2024 12:46:31 +0300 Subject: [PATCH] fix timedeltas add back pythonic filter because we made assertions that noone uses non-primitive types and adding generic type support for non-primitive types non-trivial --- nimlite.nimble | 2 +- nimlite/funcs/filter.nim | 1 + nimlite/numpy.nim | 79 ++++++++++++----- tablite/redux.py | 181 ++++++++++++++++++++++++++++++++++++++- tablite/version.py | 2 +- 5 files changed, 238 insertions(+), 27 deletions(-) diff --git a/nimlite.nimble b/nimlite.nimble index ef6ac9e7..b7514d90 100644 --- a/nimlite.nimble +++ b/nimlite.nimble @@ -1,6 +1,6 @@ # Package -version = "0.3.0" +version = "0.3.1" author = "Ratchet" description = "Utilities for tablite to work with nim" license = "MIT" diff --git a/nimlite/funcs/filter.nim b/nimlite/funcs/filter.nim index 64f27cb4..b3a9d5a1 100644 --- a/nimlite/funcs/filter.nim +++ b/nimlite/funcs/filter.nim @@ -147,6 +147,7 @@ proc filter*(table: nimpy.PyObject, pyExpressions: seq[nimpy.PyObject], filterTy let pyType = builtins.getTypeName(pyVal) let obj: PY_ObjectND = ( case pyType + of "NoneType": PY_None of "int": newPY_Object(pyVal.to(int)) of "float": newPY_Object(pyVal.to(float)) of "bool": newPY_Object(pyVal.to(bool)) diff --git a/nimlite/numpy.nim b/nimlite/numpy.nim index 015d7c34..883e6d0e 100644 --- a/nimlite/numpy.nim +++ b/nimlite/numpy.nim @@ -22,8 +22,10 @@ type NDArrayTypeDescriptor = enum D_BOOLEAN D_INT D_FLOAT - D_TIME D_DATE_DAYS + D_TIME_SECONDS + D_TIME_MILISECONDS + D_TIME_MICROSECONDS D_DATETIME_SECONDS D_DATETIME_MILISECONDS D_DATETIME_MICROSECONDS @@ -523,16 +525,20 @@ proc consumeDescr(header: var string, header_len: int, offset: var int): NDArray descriptor = NDArrayTypeDescriptor.D_OBJECT of 'm': case dt_descriptor: + of "us": NDArrayTypeDescriptor.D_TIME_MICROSECONDS + of "ms": NDArrayTypeDescriptor.D_TIME_MILISECONDS + of "s": NDArrayTypeDescriptor.D_TIME_SECONDS else: implement(descr) of 'M': - case dt_descriptor: - of "D": - size = 8 - descriptor = NDArrayTypeDescriptor.D_DATE_DAYS - of "us": - size = 8 - descriptor = NDArrayTypeDescriptor.D_DATETIME_MICROSECONDS - else: implement(descr) + size = 8 + descriptor = ( + case dt_descriptor: + of "D": NDArrayTypeDescriptor.D_DATE_DAYS + of "us": NDArrayTypeDescriptor.D_DATETIME_MICROSECONDS + of "ms": NDArrayTypeDescriptor.D_DATETIME_MILISECONDS + of "s": NDArrayTypeDescriptor.D_DATETIME_SECONDS + else: implement(descr) + ) else: size = parseInt(descr[type_offset+1..descr.len-1]) @@ -659,6 +665,33 @@ proc newDateTimeArray_Microseconds(fh: var File, endianness: Endianness, shape: return DateTimeNDArray(buf: buf, shape: shape) +proc newTimeArray_Seconds(fh: var File, endianness: Endianness, shape: var Shape): ObjectNDArray {.inline.} = + let data = readPrimitiveBuffer[int64](fh, shape) + let dtypes = {K_TIME: data.len}.toTable + let buf = collect: + for v in data: + newPY_Object(seconds2Duration(float v)) + + return ObjectNDArray(buf: buf, shape: shape, dtypes: dtypes) + +proc newTimeArray_Miliseconds(fh: var File, endianness: Endianness, shape: var Shape): ObjectNDArray {.inline.} = + let data = readPrimitiveBuffer[int64](fh, shape) + let dtypes = {K_TIME: data.len}.toTable + let buf = collect: + for v in data: + newPY_Object(seconds2Duration(float v * 1_000)) + + return ObjectNDArray(buf: buf, shape: shape, dtypes: dtypes) + +proc newTimeArray_Microseconds(fh: var File, endianness: Endianness, shape: var Shape): ObjectNDArray {.inline.} = + let data = readPrimitiveBuffer[int64](fh, shape) + let dtypes = {K_TIME: data.len}.toTable + let buf = collect: + for v in data: + newPY_Object(seconds2Duration(float v * 1_000_000)) + + return ObjectNDArray(buf: buf, shape: shape, dtypes: dtypes) + template newFloatNDArray(fh: var File, endianness: Endianness, size: int, shape: var Shape) = case size: of 4: Float32NDArray(buf: readPrimitiveBuffer[float32](fh, shape), shape: shape) @@ -711,20 +744,22 @@ proc readPageInfo(fh: var File): (NDArrayDescriptor, bool, Shape) = proc readNumpy(fh: var File): BaseNDArray = var ((descrEndianness, descrType, descrSize), _, shape) = readPageInfo(fh) - var page: BaseNDArray - - case descrType: - of D_BOOLEAN: page = newBooleanNDArray(fh, shape) - of D_INT: page = newIntNDArray(fh, descrEndianness, descrSize, shape) - of D_FLOAT: page = newFloatNDArray(fh, descrEndianness, descrSize, shape) - of D_UNICODE: page = newUnicodeNDArray(fh, descrEndianness, descrSize, shape) - of D_OBJECT: page = newObjectNDArray(fh, descrEndianness, shape) - of D_DATE_DAYS: page = newDateArray_Days(fh, descrEndianness, shape) - of D_DATETIME_SECONDS: page = newDateTimeArray_Seconds(fh, descrEndianness, shape) - of D_DATETIME_MILISECONDS: page = newDateTimeArray_Miliseconds(fh, descrEndianness, shape) - of D_DATETIME_MICROSECONDS: page = newDateTimeArray_Microseconds(fh, descrEndianness, shape) - else: implement($descrType) + let page = ( + case descrType: + of D_BOOLEAN: newBooleanNDArray(fh, shape) + of D_INT: newIntNDArray(fh, descrEndianness, descrSize, shape) + of D_FLOAT: newFloatNDArray(fh, descrEndianness, descrSize, shape) + of D_UNICODE: newUnicodeNDArray(fh, descrEndianness, descrSize, shape) + of D_OBJECT: newObjectNDArray(fh, descrEndianness, shape) + of D_DATE_DAYS: newDateArray_Days(fh, descrEndianness, shape) + of D_DATETIME_SECONDS: newDateTimeArray_Seconds(fh, descrEndianness, shape) + of D_DATETIME_MILISECONDS: newDateTimeArray_Miliseconds(fh, descrEndianness, shape) + of D_DATETIME_MICROSECONDS: newDateTimeArray_Microseconds(fh, descrEndianness, shape) + of D_TIME_SECONDS: newTimeArray_Seconds(fh, descrEndianness, shape) + of D_TIME_MILISECONDS: newTimeArray_Miliseconds(fh, descrEndianness, shape) + of D_TIME_MICROSECONDS: newTimeArray_Microseconds(fh, descrEndianness, shape) + ) return page proc readNumpy*(path: string): BaseNDArray = diff --git a/tablite/redux.py b/tablite/redux.py index 04aa9ca5..b922367e 100644 --- a/tablite/redux.py +++ b/tablite/redux.py @@ -1,10 +1,11 @@ from tablite.base import BaseTable import numpy as np +import warnings from tablite.utils import sub_cls_check, type_check, expression_interpreter from tablite.mp_utils import filter_ops from tablite.datatypes import list_to_np_array from tablite.config import Config -from tablite.nimlite import filter as _filter_using_list_of_dicts +from tablite.nimlite import filter as _filter_using_list_of_dicts_native from tqdm import tqdm as _tqdm @@ -163,10 +164,184 @@ def _compress_both(T, mask, pbar: _tqdm): pbar.update(pbar_step) return true, false +def _filter_using_list_of_dicts(T, expressions, filter_type, pbar: _tqdm): + """ + enables filtering across columns for multiple criteria. + + expressions: + + str: Expression that can be compiled and executed row by row. + exampLe: "all((A==B and C!=4 and 200": + result = dset_A > dset_B + elif expr == ">=": + result = dset_A >= dset_B + elif expr == "==": + result = dset_A == dset_B + elif expr == "<": + result = dset_A < dset_B + elif expr == "<=": + result = dset_A <= dset_B + elif expr == "!=": + result = dset_A != dset_B + else: # it's a python evaluations (slow) + f = filter_ops.get(expr) + assert callable(f) + result = list_to_np_array([f(a, b) for a, b in zip(dset_A, dset_B)]) + except TypeError: + def safe_test(f, a, b): + try: + return f(a, b) + except TypeError: + return False + f = filter_ops.get(expr) + assert callable(f) + result = list_to_np_array([safe_test(f, a, b) for a, b in zip(dset_A, dset_B)]) + bitmap[bit_index, start:end] = result + pbar.update(pbar_step) + + f = np.all if filter_type == "all" else np.any + mask = f(bitmap, axis=0) + # 4. The mask is now created and is no longer needed. + pbar.update(10 - pbar.n) + return mask + +def filter_non_primitive(T, expressions, filter_type="all", tqdm=_tqdm): + """ + OBSOLETE + filters table + + + Args: + T (Table subclass): Table. + expressions (list or str): + str: + filters based on an expression, such as: + "all((A==B, C!=4, 200