Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Resampling MVP #1495

Merged
merged 34 commits into from
May 30, 2024
Merged
Changes from 1 commit
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
b256370
Enhancement 1010: Resampling MVP
alexowens90 Apr 3, 2024
2e67393
Revert to C++17
alexowens90 May 9, 2024
e752b65
Comment changes
alexowens90 May 10, 2024
03df99c
Revert change to lmdb_version_store_tiny_segment
alexowens90 May 10, 2024
6c3739c
Remove check that input has initial expected get calls in split_by_ro…
alexowens90 May 10, 2024
519e56c
Use Bucket class in aggregation as well
alexowens90 May 10, 2024
37d9810
Remove Pandas date_range timing/logging, and modified some formatting
alexowens90 May 10, 2024
8ac1c3b
Move all sorted aggregation stuff to own files
alexowens90 May 10, 2024
cac141e
Renaming refactor
alexowens90 May 13, 2024
28404a8
Renaming refactor
alexowens90 May 13, 2024
6464068
Remove unused function
alexowens90 May 13, 2024
9e79d0f
Remove summing timestamps from supported aggregations
alexowens90 May 13, 2024
76e0baf
Started refactoring aggregation
alexowens90 May 13, 2024
b4e6aa7
Added push_to_aggregator method
alexowens90 May 13, 2024
39ab291
Use push_to_aggregator in the other relevant place
alexowens90 May 14, 2024
91080f6
Fixed test_resampling_unsupported_aggregation_type_combos
alexowens90 May 14, 2024
d2369eb
Factor out finalize_aggregator
alexowens90 May 14, 2024
32547a2
Presize output index column in blocks, and trim unused blocks at the end
alexowens90 May 14, 2024
868ffa0
Use constexpr where possible
alexowens90 May 14, 2024
108867a
Reinstate all tests, reorder source files
alexowens90 May 14, 2024
91e02f4
Comment changes
alexowens90 May 14, 2024
de3bd8e
Use ColumnDataIterator in copy_frame_data_to_buffer
alexowens90 May 14, 2024
7f1d178
Revert accidentally committed change to task scheduler
alexowens90 May 14, 2024
e04b124
Comment updates
alexowens90 May 14, 2024
bb3639c
Move profile_resample.py out of tests directory
alexowens90 May 14, 2024
d9c0506
Resample docstring
alexowens90 May 14, 2024
1d5ab87
Fix mac build?
alexowens90 May 14, 2024
13f598d
Fix tests
alexowens90 May 15, 2024
b7c7a1d
Make resample.py in ASV benchmarks dir
alexowens90 May 15, 2024
05b92ca
Dummy commit
alexowens90 May 16, 2024
4ecf145
Resampling ASV benchmarks
alexowens90 May 16, 2024
0264dc4
Update benchmarks.json file too
alexowens90 May 16, 2024
cf6772f
Remove ASV features added in 0.6.0
alexowens90 May 17, 2024
c2d994c
Address review comments
alexowens90 May 30, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Make resample.py in ASV benchmarks dir
alexowens90 committed May 15, 2024

Verified

This commit was signed with the committer’s verified signature.
snyk-bot Snyk bot
commit b7c7a1d1b9ddb0cac65473f0d3d5651b456e94d4
121 changes: 121 additions & 0 deletions python/benchmarks/resample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
"""
Copyright 2024 Man Group Operations Limited

Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt.

As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
"""
from arcticdb import Arctic
from arcticdb.version_store.processing import QueryBuilder

from .common import *


class LocalQueryBuilderFunctions:
    """ASV benchmark suite for QueryBuilder operations (numeric/string
    filtering, column projection, and group-by aggregations) run against a
    local LMDB-backed Arctic library.

    Each benchmark is parametrised over the number of rows in the stored
    symbol; symbols are written once in ``setup_cache`` and re-read per run.
    """

    # ASV settings: repetitions per benchmark and overall timeout (seconds).
    number = 5
    timeout = 6000
    LIB_NAME = "query_builder"
    CONNECTION_STRING = "lmdb://query_builder?map_size=5GB"

    params = [1_000_000, 10_000_000]
    param_names = ["num_rows"]

    def setup_cache(self):
        """Write one benchmark symbol per row count into a fresh library."""
        cfg = LocalQueryBuilderFunctions
        self.ac = Arctic(cfg.CONNECTION_STRING)

        row_counts = cfg.params
        self.lib_name = cfg.LIB_NAME
        # Recreate the library from scratch so stale data cannot skew results.
        self.ac.delete_library(self.lib_name)
        fresh_lib = self.ac.create_library(self.lib_name)
        for count in row_counts:
            fresh_lib.write(f"{count}_rows", generate_benchmark_df(count))

    def teardown(self, num_rows):
        # Drop the per-run handles so every benchmark gets a clean connection.
        del self.lib
        del self.ac

    def setup(self, num_rows):
        self.ac = Arctic(LocalQueryBuilderFunctions.CONNECTION_STRING)
        self.lib = self.ac[LocalQueryBuilderFunctions.LIB_NAME]

    # String columns are deliberately excluded from the filtering/projection
    # benchmarks so that Python string allocation does not dominate the
    # measured time/memory.
    def time_filtering_numeric(self, num_rows):
        query = QueryBuilder()
        # v3 is random floats between 0 and 100
        query = query[query["v3"] < 1.0]
        self.lib.read(f"{num_rows}_rows", columns=["v3"], query_builder=query)

    def peakmem_filtering_numeric(self, num_rows):
        query = QueryBuilder()
        # v3 is random floats between 0 and 100
        query = query[query["v3"] < 10.0]
        self.lib.read(f"{num_rows}_rows", columns=["v3"], query_builder=query)

    def time_filtering_string_isin(self, num_rows):
        # Membership set sized to match roughly 1% of the rows.
        matching = num_rows // 1000
        ids = [f"id{str(idx).zfill(3)}" for idx in range(1, matching + 1)]
        query = QueryBuilder()
        query = query[query["id1"].isin(ids)]
        self.lib.read(f"{num_rows}_rows", columns=["v3"], query_builder=query)

    def peakmem_filtering_string_isin(self, num_rows):
        # Membership set sized to match roughly 1% of the rows.
        matching = num_rows // 1000
        ids = [f"id{str(idx).zfill(3)}" for idx in range(1, matching + 1)]
        query = QueryBuilder()
        query = query[query["id1"].isin(ids)]
        self.lib.read(f"{num_rows}_rows", columns=["v3"], query_builder=query)

    def time_projection(self, num_rows):
        query = QueryBuilder()
        query = query.apply("new_col", query["v2"] * query["v3"])
        self.lib.read(f"{num_rows}_rows", columns=["new_col"], query_builder=query)

    def peakmem_projection(self, num_rows):
        query = QueryBuilder()
        query = query.apply("new_col", query["v2"] * query["v3"])
        self.lib.read(f"{num_rows}_rows", columns=["new_col"], query_builder=query)

    # Query numbering follows https://duckdblabs.github.io/db-benchmark/ .
    # The names are intentionally left unchanged: renaming an ASV benchmark
    # makes it a new benchmark and discards its historic results.
    def time_query_1(self, num_rows):
        query = QueryBuilder()
        query = query.groupby("id1").agg({"v1": "sum"})
        self.lib.read(f"{num_rows}_rows", query_builder=query)

    def peakmem_query_1(self, num_rows):
        query = QueryBuilder()
        query = query.groupby("id1").agg({"v1": "sum"})
        self.lib.read(f"{num_rows}_rows", query_builder=query)

    def time_query_3(self, num_rows):
        query = QueryBuilder()
        query = query.groupby("id3").agg({"v1": "sum", "v3": "sum"})
        self.lib.read(f"{num_rows}_rows", query_builder=query)

    def peakmem_query_3(self, num_rows):
        query = QueryBuilder()
        query = query.groupby("id3").agg({"v1": "sum", "v3": "sum"})
        self.lib.read(f"{num_rows}_rows", query_builder=query)

    def time_query_4(self, num_rows):
        query = QueryBuilder()
        query = query.groupby("id6").agg({"v1": "sum", "v2": "sum"})
        self.lib.read(f"{num_rows}_rows", query_builder=query)

    def peakmem_query_4(self, num_rows):
        query = QueryBuilder()
        query = query.groupby("id6").agg({"v1": "sum", "v2": "sum"})
        self.lib.read(f"{num_rows}_rows", query_builder=query)

    def time_query_adv_query_2(self, num_rows):
        query = QueryBuilder()
        query = query.groupby("id3").agg({"v1": "max", "v2": "min"})
        self.lib.read(f"{num_rows}_rows", query_builder=query)

    def peakmem_query_adv_query_2(self, num_rows):
        query = QueryBuilder()
        query = query.groupby("id3").agg({"v1": "max", "v2": "min"})
        self.lib.read(f"{num_rows}_rows", query_builder=query)