Merge pull request #14 from visualfabriq/preprod
Patch release 0.3.3
jverstraaten authored Aug 1, 2022
2 parents 1cb21be + addaa73 commit 6fc497e
Showing 6 changed files with 70 additions and 20 deletions.
21 changes: 20 additions & 1 deletion .circleci/config.yml
@@ -37,7 +37,7 @@ jobs:
name: Install dependencies
command: |
~/venv/bin/pip install .[test]

# Save the cached dependencies
- save_cache:
@@ -81,13 +81,22 @@ jobs:

build-and-push:
executor: codeartifact/default
parameters:
is-pre-release:
description: if true, the version tag will contain the branch
type: boolean
default: false
steps:
- checkout
- codeartifact/init:
tool: pip
- run:
name: Build
command: |
if [[ "<< parameters.is-pre-release >>" == true ]]; then
DEV_VERSION=$((16#${CIRCLE_SHA1:0:6}))
export PRE_RELEASE_VERSION=".dev${DEV_VERSION}"
fi
pip install build
python -m build
- codeartifact/push:
@@ -115,6 +124,16 @@ workflows:
only:
- master
- main
- build-and-push:
context: codeartifact-dev
requires:
- py2
- py3
is-pre-release: true
filters:
branches:
only:
- uat

daily:
triggers:
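A note on the pre-release arithmetic in the build step above: $((16#${CIRCLE_SHA1:0:6})) interprets the first six hex characters of the commit SHA as a base-16 integer, and that number becomes the .devN suffix. A minimal Python illustration of the same computation (the SHA below is hypothetical apart from the 6fc497e prefix of this commit):

# Python equivalent of the shell line DEV_VERSION=$((16#${CIRCLE_SHA1:0:6}))
circle_sha1 = "6fc497e" + "0" * 33          # hypothetical 40-character commit SHA
dev_version = int(circle_sha1[:6], 16)      # "6fc497" in base 16 -> 7324823
pre_release_version = ".dev{}".format(dev_version)
print(pre_release_version)                  # .dev7324823

This gives every pre-release build a version suffix that is tied to the commit it was built from.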
6 changes: 6 additions & 0 deletions RELEASE_NOTES.rst
@@ -2,6 +2,12 @@
Release notes for parquery
==========================

Release 0.3.3
==============
- Skip row_group if output is empty
- Fixes a bug where min and max could not be calculated in an empty row group
- Refactored rowgroup_metadata_filter

Release 0.3.2
==============
- Add handling of missing columns in a parquet file that is used in a filter. This happens when new dimensions are created but existing parquet files do not have them yet. Previously such a query raised an error; the new behaviour returns an empty result instead. This is better because, as the real value for the dimension is unknown for the file, the result should also be zero. It also greatly helps with issues where old files break reporting because they have not been updated yet.
4 changes: 4 additions & 0 deletions parquery/__init__.py
@@ -1,3 +1,7 @@
from parquery.aggregate import aggregate_pq, aggregate_pa
from parquery.transport import serialize_df, deserialize_df, serialize_pa_table, deserialize_pa_table
from parquery.write import df_to_parquet
import os

pre_release_version = os.getenv('PRE_RELEASE_VERSION', '')
__version__ = '0.3.3{}'.format(pre_release_version)
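The effect of the new lines in parquery/__init__.py: the version string is assembled when the package is imported, so it stays 0.3.3 for a regular build and gains the dev suffix when the CI job exported PRE_RELEASE_VERSION beforehand. A small sketch of both cases, reusing the hypothetical suffix from the previous example:

import os

# Regular build: the variable is unset, so the version is plain 0.3.3
os.environ.pop('PRE_RELEASE_VERSION', None)
print('0.3.3{}'.format(os.getenv('PRE_RELEASE_VERSION', '')))    # 0.3.3

# Pre-release build: the CI job exported the suffix before building
os.environ['PRE_RELEASE_VERSION'] = '.dev7324823'
print('0.3.3{}'.format(os.getenv('PRE_RELEASE_VERSION', '')))    # 0.3.3.dev7324823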
39 changes: 21 additions & 18 deletions parquery/aggregate.py
@@ -270,8 +270,19 @@ def convert_data_filter(data_filter):


def rowgroup_metadata_filter(metadata_filter, pq_file, row_group):
"""
Check if the filter applies, if the filter does not apply skip the row_group.
Args:
metadata_filter (list of list): e.g. [[0, '>', 10000]]
pq_file (pyarrow.parquet.ParquetFile): file to be checked
row_group:
Returns (bool): True if row_group should be skipped otherwise False
"""
rg_meta = pq_file.metadata.row_group(row_group)
skip = False
if rg_meta.num_rows == 0:
return True
for col_nr, sign, values in metadata_filter:
rg_col = rg_meta.column(col_nr)
min_val = rg_col.statistics.min
@@ -280,37 +291,29 @@ def rowgroup_metadata_filter(metadata_filter, pq_file, row_group):
# if the filter is not in the boundary of the range, then skip the rowgroup
if sign == 'in':
if not any(min_val <= val <= max_val for val in values):
skip = True
break
return True
elif sign == 'not in':
if any(min_val <= val <= max_val for val in values):
skip = True
break
return True
elif sign in ['=', '==']:
if not min_val <= values <= max_val:
skip = True
break
return True
elif sign == '!=':
if min_val <= values <= max_val:
skip = True
break
return True
elif sign == '>':
if max_val <= values:
skip = True
break
return True
elif sign == '>=':
if max_val < values:
skip = True
break
return True
elif sign == '<':
if min_val >= values:
skip = True
break
return True
elif sign == '<=':
if min_val > values:
skip = True
break
return skip
return True
return False


def check_measure_cols(measure_cols):
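Because the diff interleaves the removed skip/break lines with the new early returns, here is the refactored function pieced together from the hunks above as a standalone sketch. The max_val assignment sits in the collapsed part of the diff and is assumed to mirror the min line; everything else follows the added lines.

def rowgroup_metadata_filter(metadata_filter, pq_file, row_group):
    """
    Check whether the filter can still match this row group; if it cannot,
    the row group should be skipped.

    Args:
        metadata_filter (list of list): e.g. [[0, '>', 10000]]
        pq_file (pyarrow.parquet.ParquetFile): file to be checked
        row_group (int): index of the row group to check
    Returns (bool): True if the row_group should be skipped, otherwise False
    """
    rg_meta = pq_file.metadata.row_group(row_group)
    # New in 0.3.3: an empty row group can never contribute rows, and its
    # column statistics (min/max) may be missing, so skip it immediately.
    if rg_meta.num_rows == 0:
        return True
    for col_nr, sign, values in metadata_filter:
        rg_col = rg_meta.column(col_nr)
        min_val = rg_col.statistics.min
        max_val = rg_col.statistics.max  # assumed: this line is collapsed in the diff
        # if the filter is not in the boundary of the range, then skip the rowgroup
        if sign == 'in':
            if not any(min_val <= val <= max_val for val in values):
                return True
        elif sign == 'not in':
            if any(min_val <= val <= max_val for val in values):
                return True
        elif sign in ['=', '==']:
            if not min_val <= values <= max_val:
                return True
        elif sign == '!=':
            if min_val <= values <= max_val:
                return True
        elif sign == '>':
            if max_val <= values:
                return True
        elif sign == '>=':
            if max_val < values:
                return True
        elif sign == '<':
            if min_val >= values:
                return True
        elif sign == '<=':
            if min_val > values:
                return True
    return False

Returning as soon as one condition rules the row group out replaces the old skip flag plus break, and the num_rows check at the top covers empty row groups whose column statistics have no usable min and max.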
18 changes: 18 additions & 0 deletions parquery/tests/test_parquery.py
@@ -1086,3 +1086,21 @@ def test_pa_serialization(self):
data_table_2 = deserialize_pa_table(buf)

assert data_table == data_table_2

def test_empty_file(self):
"""
When a file is empty (does not contain any rows), it should still work.
"""
self.filename = tempfile.mkstemp(prefix='test-')[-1]

data_table = pa.Table.from_pandas(pd.DataFrame(columns=['f0', 'f1']), preserve_index=False)
with pa.parquet.ParquetWriter(self.filename, data_table.schema, version='2.0', compression='ZSTD') as writer:
writer.write_table(data_table)

terms_filter = [('f0', '>', 10000)]
result_parquery = aggregate_pq(self.filename, ['f0'], ['f1'],
data_filter=terms_filter,
aggregate=False)

assert len(result_parquery) == 0
assert set(result_parquery.columns) == {'f0', 'f1'}
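The test above exercises the new num_rows == 0 early return end to end: the empty file yields an empty result instead of failing on missing statistics. The row-group loop inside aggregate_pq is not shown in this diff, so the following is only a hypothetical sketch of how a caller might consult rowgroup_metadata_filter; the helper read_filtered_row_groups and its body are assumptions, not code from the repository. The metadata_filter argument uses column indices, as in the docstring example above.

import pyarrow.parquet as pq

from parquery.aggregate import rowgroup_metadata_filter

def read_filtered_row_groups(path, metadata_filter):
    # Hypothetical caller: collect only the row groups the metadata filter keeps.
    pq_file = pq.ParquetFile(path)
    tables = []
    for row_group in range(pq_file.num_row_groups):
        # True means the row group is empty or cannot match the filter, so skip it
        if rowgroup_metadata_filter(metadata_filter, pq_file, row_group):
            continue
        tables.append(pq_file.read_row_group(row_group))
    return tables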
2 changes: 1 addition & 1 deletion setup.cfg
@@ -4,4 +4,4 @@
[metadata]
# ensure LICENSE is included in wheel metadata
license_file = LICENSE
version = 0.3.2
version = attr: parquery.__version__
