From 524afbf7c91e81a85422f74db21cbc4b2c1694a2 Mon Sep 17 00:00:00 2001 From: riazanov Date: Thu, 26 Aug 2021 00:12:31 -0400 Subject: [PATCH 01/30] Add moving quantile Implementation of moving quantile instead of moving median, which is a partial case of quantile. For now using the fixed constant in #define --- bottleneck/src/move_median/move_median.c | 49 +++++++++++++++++------- 1 file changed, 36 insertions(+), 13 deletions(-) diff --git a/bottleneck/src/move_median/move_median.c b/bottleneck/src/move_median/move_median.c index 3f0709abe4..e93ca2a9a4 100644 --- a/bottleneck/src/move_median/move_median.c +++ b/bottleneck/src/move_median/move_median.c @@ -19,6 +19,8 @@ node2->idx = idx1; \ idx1 = idx2 +#define QUANTILE 0.25 + /* ----------------------------------------------------------------------------- Prototypes @@ -48,6 +50,17 @@ static inline void mm_swap_heap_heads(mm_node **s_heap, idx_t n_s, mm_node *s_node, mm_node *l_node); +/* function to find current number of statistic */ +idx_t mm_k_stat(idx_t n_total_nonnan) { + return (idx_t) floor((n_total_nonnan - 1) * QUANTILE) + 1; +} + +/* function to check if */ +int mm_stat_exact(idx_t n_total_nonnan) { + return ((n_total_nonnan - 1) * QUANTILE) == floor((n_total_nonnan - 1) * QUANTILE); +} + + /* ----------------------------------------------------------------------------- Top-level non-nan functions @@ -64,7 +77,8 @@ mm_new(const idx_t window, idx_t min_count) { mm->node_data = malloc(window * sizeof(mm_node)); mm->s_heap = mm->nodes; - mm->l_heap = &mm->nodes[window / 2 + window % 2]; + idx_t k_stat = mm_k_stat(window); + mm->l_heap = &mm->nodes[k_stat]; mm->window = window; mm->odd = window % 2; @@ -98,8 +112,11 @@ mm_update_init(mm_handle *mm, ai_t ai) { mm->s_first_leaf = 0; } else { /* at least one node already exists in the heaps */ + + idx_t k_stat = mm_k_stat(n_s + n_l + 1); + mm->newest->next = node; - if (n_s > n_l) { + if (n_s >= k_stat) { /* add new node to large heap */ mm->l_heap[n_l] = node; node->region = LH; @@ -147,11 +164,7 @@ mm_update(mm_handle *mm, ai_t ai) { } /* return the median */ - if (mm->odd) { - return mm->s_heap[0]->ai; - } else { - return (mm->s_heap[0]->ai + mm->l_heap[0]->ai) / 2.0; - } + return mm_get_median(mm); } @@ -172,7 +185,8 @@ mm_new_nan(const idx_t window, idx_t min_count) { mm->node_data = malloc(window * sizeof(mm_node)); mm->s_heap = mm->nodes; - mm->l_heap = &mm->nodes[window / 2 + window % 2]; + idx_t k_stat = mm_k_stat(window); + mm->l_heap = &mm->nodes[k_stat]; mm->n_array = &mm->nodes[window]; mm->window = window; @@ -225,8 +239,11 @@ mm_update_init_nan(mm_handle *mm, ai_t ai) { mm->s_first_leaf = 0; } else { /* at least one node already exists in the heaps */ + /* number of non-nan nodes including the new node is n_s + n_l + 1 */ + idx_t k_stat = mm_k_stat(n_s + n_l + 1); + mm->newest->next = node; - if (n_s > n_l) { + if (n_s >= k_stat) { /* add new node to large heap */ mm->l_heap[n_l] = node; node->region = LH; @@ -281,12 +298,15 @@ mm_update_nan(mm_handle *mm, ai_t ai) { n_l = mm->n_l; n_n = mm->n_n; + idx_t k_stat; + if (ai != ai) { if (node->region == SH) { /* Oldest node is in the small heap and needs to be moved * to the nan array. Resulting hole in the small heap will be * filled with the rightmost leaf of the last row of the small * heap. */ + k_stat = mm_k_stat(n_s + n_l - 1); /* insert node into nan array */ node->region = NA; @@ -324,7 +344,7 @@ mm_update_nan(mm_handle *mm, ai_t ai) { s_heap[idx]->idx = idx; heapify_small_node(mm, idx); } - if (mm->n_s < mm->n_l) { + if (mm->n_s < k_stat) { /* move head node from the large heap to the small heap */ node2 = mm->l_heap[0]; node2->idx = mm->n_s; @@ -357,6 +377,7 @@ mm_update_nan(mm_handle *mm, ai_t ai) { * filled with the rightmost leaf of the last row of the large * heap. */ + k_stat = mm_k_stat(n_s + n_l - 1); /* insert node into nan array */ node->region = NA; node->idx = n_n; @@ -375,7 +396,7 @@ mm_update_nan(mm_handle *mm, ai_t ai) { } else { mm->l_first_leaf = FIRST_LEAF(mm->n_l); } - if (mm->n_l < mm->n_s - 1) { + if (mm->n_s > k_stat) { /* move head node from the small heap to the large heap */ node2 = mm->s_heap[0]; node2->idx = mm->n_l; @@ -412,7 +433,9 @@ mm_update_nan(mm_handle *mm, ai_t ai) { heapify_large_node(mm, idx); } else { /* ai is not NaN but oldest node is in nan array */ - if (n_s > n_l) { + k_stat = mm_k_stat(n_s + n_l + 1); + + if (n_s == k_stat) { /* insert into large heap */ node->region = LH; node->idx = n_l; @@ -481,7 +504,7 @@ mm_get_median(mm_handle *mm) { idx_t n_total = mm->n_l + mm->n_s; if (n_total < mm->min_count) return MM_NAN(); - if (min(mm->window, n_total) % 2 == 1) + if (mm_stat_exact(n_total)) return mm->s_heap[0]->ai; return (mm->s_heap[0]->ai + mm->l_heap[0]->ai) / 2.0; } From 4f98aa4d117fd5a17206b507538f87760b63b214 Mon Sep 17 00:00:00 2001 From: riazanov Date: Wed, 14 Sep 2022 19:44:31 -0400 Subject: [PATCH 02/30] initial changes from median to quantile --- bottleneck/src/move_median/move_median.c | 44 ++++++++++++------- bottleneck/src/move_median/move_median.h | 5 ++- .../src/move_median/move_median_debug.c | 9 ++-- bottleneck/src/move_template.c | 20 ++++++--- 4 files changed, 50 insertions(+), 28 deletions(-) diff --git a/bottleneck/src/move_median/move_median.c b/bottleneck/src/move_median/move_median.c index e93ca2a9a4..6bc91f6038 100644 --- a/bottleneck/src/move_median/move_median.c +++ b/bottleneck/src/move_median/move_median.c @@ -28,7 +28,9 @@ idx1 = idx2 */ /* helper functions */ -static inline ai_t mm_get_median(mm_handle *mm); +static inline ai_t mm_get_quantile(mm_handle *mm); +idx_t mm_k_stat(mm_handle *mm, idx_t idx); +int mm_stat_exact(mm_handle *mm, idx_t idx); static inline void heapify_small_node(mm_handle *mm, idx_t idx); static inline void heapify_large_node(mm_handle *mm, idx_t idx); static inline idx_t mm_get_smallest_child(mm_node **heap, idx_t window, @@ -50,15 +52,7 @@ static inline void mm_swap_heap_heads(mm_node **s_heap, idx_t n_s, mm_node *s_node, mm_node *l_node); -/* function to find current number of statistic */ -idx_t mm_k_stat(idx_t n_total_nonnan) { - return (idx_t) floor((n_total_nonnan - 1) * QUANTILE) + 1; -} -/* function to check if */ -int mm_stat_exact(idx_t n_total_nonnan) { - return ((n_total_nonnan - 1) * QUANTILE) == floor((n_total_nonnan - 1) * QUANTILE); -} /* @@ -71,7 +65,7 @@ int mm_stat_exact(idx_t n_total_nonnan) { * small values (a max heap); the other heap contains the large values (a min * heap). The handle, containing information about the heaps, is returned. */ mm_handle * -mm_new(const idx_t window, idx_t min_count) { +mm_new(const idx_t window, idx_t min_count, double quantile) { mm_handle *mm = malloc(sizeof(mm_handle)); mm->nodes = malloc(window * sizeof(mm_node*)); mm->node_data = malloc(window * sizeof(mm_node)); @@ -84,6 +78,8 @@ mm_new(const idx_t window, idx_t min_count) { mm->odd = window % 2; mm->min_count = min_count; + mm->quantile = quantile; + mm_reset(mm); return mm; @@ -137,7 +133,7 @@ mm_update_init(mm_handle *mm, ai_t ai) { mm->newest = node; - return mm_get_median(mm); + return mm_get_quantile(mm); } @@ -164,7 +160,7 @@ mm_update(mm_handle *mm, ai_t ai) { } /* return the median */ - return mm_get_median(mm); + return mm_get_quantile(mm); } @@ -179,7 +175,7 @@ mm_update(mm_handle *mm, ai_t ai) { * large values (a min heap); the nan array contains the NaNs. The handle, * containing information about the heaps and the nan array is returned. */ mm_handle * -mm_new_nan(const idx_t window, idx_t min_count) { +mm_new_nan(const idx_t window, idx_t min_count, double quantile) { mm_handle *mm = malloc(sizeof(mm_handle)); mm->nodes = malloc(2 * window * sizeof(mm_node*)); mm->node_data = malloc(window * sizeof(mm_node)); @@ -192,6 +188,8 @@ mm_new_nan(const idx_t window, idx_t min_count) { mm->window = window; mm->min_count = min_count; + mm->quantile = quantile; + mm_reset(mm); return mm; @@ -264,7 +262,7 @@ mm_update_init_nan(mm_handle *mm, ai_t ai) { } mm->newest = node; - return mm_get_median(mm); + return mm_get_quantile(mm); } @@ -460,7 +458,7 @@ mm_update_nan(mm_handle *mm, ai_t ai) { --mm->n_n; } } - return mm_get_median(mm); + return mm_get_quantile(mm); } @@ -498,9 +496,21 @@ mm_free(mm_handle *mm) { ----------------------------------------------------------------------------- */ -/* Return the current median */ +/* function to find the current index of element correspodning to the quantile */ +idx_t mm_k_stat(mm_handle *mm, idx_t idx) { + return (idx_t) floor((idx - 1) * mm->quantile) + 1; +} + +/* function to check if the current index of the quantile is integer, and so + * the quantile is the element at the top of the heap. Otherwise take midpoint */ +int mm_stat_exact(mm_handle *mm, idx_t idx) { + return ((idx - 1) * mm->quantile) == floor((idx - 1) * mm->quantile); +} + + +/* Return the current quantile */ static inline ai_t -mm_get_median(mm_handle *mm) { +mm_get_quantile(mm_handle *mm) { idx_t n_total = mm->n_l + mm->n_s; if (n_total < mm->min_count) return MM_NAN(); diff --git a/bottleneck/src/move_median/move_median.h b/bottleneck/src/move_median/move_median.h index ccc3176096..eb02f96c48 100644 --- a/bottleneck/src/move_median/move_median.h +++ b/bottleneck/src/move_median/move_median.h @@ -51,16 +51,17 @@ struct _mm_handle { mm_node *newest; /* The newest node (most recent insert) */ idx_t s_first_leaf; /* All nodes this index or greater are leaf nodes */ idx_t l_first_leaf; /* All nodes this index or greater are leaf nodes */ + double quantile; /* Value between 0 <= quantile <= 1, the quantile to compute */ }; typedef struct _mm_handle mm_handle; /* non-nan functions */ -mm_handle *mm_new(const idx_t window, idx_t min_count); +mm_handle *mm_new(const idx_t window, idx_t min_count, double quantile); ai_t mm_update_init(mm_handle *mm, ai_t ai); ai_t mm_update(mm_handle *mm, ai_t ai); /* nan functions */ -mm_handle *mm_new_nan(const idx_t window, idx_t min_count); +mm_handle *mm_new_nan(const idx_t window, idx_t min_count, double quantile); ai_t mm_update_init_nan(mm_handle *mm, ai_t ai); ai_t mm_update_nan(mm_handle *mm, ai_t ai); diff --git a/bottleneck/src/move_median/move_median_debug.c b/bottleneck/src/move_median/move_median_debug.c index 13871d2e6e..dc1017f683 100644 --- a/bottleneck/src/move_median/move_median_debug.c +++ b/bottleneck/src/move_median/move_median_debug.c @@ -1,6 +1,6 @@ #include "move_median.h" -ai_t *mm_move_median(ai_t *a, idx_t length, idx_t window, idx_t min_count); +ai_t *mm_move_median(ai_t *a, idx_t length, idx_t window, idx_t min_count, double quantile); int mm_assert_equal(ai_t *actual, ai_t *desired, ai_t *input, idx_t length, char *err_msg); int mm_unit_test(void); @@ -19,13 +19,13 @@ int main(void) { /* moving window median of 1d arrays returns output array */ -ai_t *mm_move_median(ai_t *a, idx_t length, idx_t window, idx_t min_count) { +ai_t *mm_move_median(ai_t *a, idx_t length, idx_t window, idx_t min_count, double quantile) { mm_handle *mm; ai_t *out; idx_t i; out = malloc(length * sizeof(ai_t)); - mm = mm_new_nan(window, min_count); + mm = mm_new_nan(window, min_count, quantile); for (i=0; i < length; i++) { if (i < window) { out[i] = mm_update_init_nan(mm, a[i]); @@ -84,13 +84,14 @@ int mm_unit_test(void) { int length; char *err_msg; int failed; + double quantile = 0.5; length = sizeof(arr_input) / sizeof(*arr_input); err_msg = malloc(1024 * sizeof *err_msg); sprintf(err_msg, "move_median failed with window=%d, min_count=%d", window, min_count); - actual = mm_move_median(arr_input, length, window, min_count); + actual = mm_move_median(arr_input, length, window, min_count, quantile); failed = mm_assert_equal(actual, desired, arr_input, length, err_msg); free(actual); diff --git a/bottleneck/src/move_template.c b/bottleneck/src/move_template.c index 8a589703d1..b104e7f1c4 100644 --- a/bottleneck/src/move_template.c +++ b/bottleneck/src/move_template.c @@ -32,6 +32,16 @@ int axis, \ int ddof) +/* low-level functions with double argument for move_quantile */ +#define MOVE_QUANTILE(name, dtype) \ + static PyObject * \ + name##_##dtype(PyArrayObject *a, \ + int window, \ + int min_count, \ + int axis, \ + int ddof, \ + double quantile) + /* top-level functions such as move_sum */ #define MOVE_MAIN(name, ddof) \ static PyObject * \ @@ -541,9 +551,9 @@ MOVE_MAIN(NAME, 0) /* move_median ----------------------------------------------------------- */ /* dtype = [['float64'], ['float32']] */ -MOVE(move_median, DTYPE0) { +MOVE_QUANTILE(move_quantile, DTYPE0) { npy_DTYPE0 ai; - mm_handle *mm = mm_new_nan(window, min_count); + mm_handle *mm = mm_new_nan(window, min_count, quantile); INIT(NPY_DTYPE0) if (window == 1) { mm_free(mm); @@ -576,9 +586,9 @@ MOVE(move_median, DTYPE0) { /* dtype end */ /* dtype = [['int64', 'float64'], ['int32', 'float64']] */ -MOVE(move_median, DTYPE0) { +MOVE_QUANTILE(move_quantile, DTYPE0) { npy_DTYPE0 ai; - mm_handle *mm = mm_new(window, min_count); + mm_handle *mm = mm_new(window, min_count, quantile); INIT(NPY_DTYPE1) if (window == 1) { return PyArray_CastToType(a, @@ -612,7 +622,7 @@ MOVE(move_median, DTYPE0) { /* dtype end */ -MOVE_MAIN(move_median, 0) +MOVE_MAIN(move_quantile, 0) /* move_rank-------------------------------------------------------------- */ From e470fa2927e9d58065cbeac92d1c361783b7e7c2 Mon Sep 17 00:00:00 2001 From: riazanov Date: Wed, 14 Sep 2022 19:44:31 -0400 Subject: [PATCH 03/30] initial changes from median to quantile --- bottleneck/__init__.py | 2 +- bottleneck/src/move_median/move_median.c | 44 ++++++++++++------- bottleneck/src/move_median/move_median.h | 5 ++- .../src/move_median/move_median_debug.c | 9 ++-- bottleneck/src/move_template.c | 30 ++++++++----- 5 files changed, 56 insertions(+), 34 deletions(-) diff --git a/bottleneck/__init__.py b/bottleneck/__init__.py index c5495b3337..0aad5268f6 100644 --- a/bottleneck/__init__.py +++ b/bottleneck/__init__.py @@ -4,7 +4,7 @@ from . import slow from ._pytesttester import PytestTester -from .move import (move_argmax, move_argmin, move_max, move_mean, move_median, +from .move import (move_argmax, move_argmin, move_max, move_mean, move_quantile, move_min, move_rank, move_std, move_sum, move_var) from .nonreduce import replace from .nonreduce_axis import (argpartition, nanrankdata, partition, push, diff --git a/bottleneck/src/move_median/move_median.c b/bottleneck/src/move_median/move_median.c index e93ca2a9a4..6bc91f6038 100644 --- a/bottleneck/src/move_median/move_median.c +++ b/bottleneck/src/move_median/move_median.c @@ -28,7 +28,9 @@ idx1 = idx2 */ /* helper functions */ -static inline ai_t mm_get_median(mm_handle *mm); +static inline ai_t mm_get_quantile(mm_handle *mm); +idx_t mm_k_stat(mm_handle *mm, idx_t idx); +int mm_stat_exact(mm_handle *mm, idx_t idx); static inline void heapify_small_node(mm_handle *mm, idx_t idx); static inline void heapify_large_node(mm_handle *mm, idx_t idx); static inline idx_t mm_get_smallest_child(mm_node **heap, idx_t window, @@ -50,15 +52,7 @@ static inline void mm_swap_heap_heads(mm_node **s_heap, idx_t n_s, mm_node *s_node, mm_node *l_node); -/* function to find current number of statistic */ -idx_t mm_k_stat(idx_t n_total_nonnan) { - return (idx_t) floor((n_total_nonnan - 1) * QUANTILE) + 1; -} -/* function to check if */ -int mm_stat_exact(idx_t n_total_nonnan) { - return ((n_total_nonnan - 1) * QUANTILE) == floor((n_total_nonnan - 1) * QUANTILE); -} /* @@ -71,7 +65,7 @@ int mm_stat_exact(idx_t n_total_nonnan) { * small values (a max heap); the other heap contains the large values (a min * heap). The handle, containing information about the heaps, is returned. */ mm_handle * -mm_new(const idx_t window, idx_t min_count) { +mm_new(const idx_t window, idx_t min_count, double quantile) { mm_handle *mm = malloc(sizeof(mm_handle)); mm->nodes = malloc(window * sizeof(mm_node*)); mm->node_data = malloc(window * sizeof(mm_node)); @@ -84,6 +78,8 @@ mm_new(const idx_t window, idx_t min_count) { mm->odd = window % 2; mm->min_count = min_count; + mm->quantile = quantile; + mm_reset(mm); return mm; @@ -137,7 +133,7 @@ mm_update_init(mm_handle *mm, ai_t ai) { mm->newest = node; - return mm_get_median(mm); + return mm_get_quantile(mm); } @@ -164,7 +160,7 @@ mm_update(mm_handle *mm, ai_t ai) { } /* return the median */ - return mm_get_median(mm); + return mm_get_quantile(mm); } @@ -179,7 +175,7 @@ mm_update(mm_handle *mm, ai_t ai) { * large values (a min heap); the nan array contains the NaNs. The handle, * containing information about the heaps and the nan array is returned. */ mm_handle * -mm_new_nan(const idx_t window, idx_t min_count) { +mm_new_nan(const idx_t window, idx_t min_count, double quantile) { mm_handle *mm = malloc(sizeof(mm_handle)); mm->nodes = malloc(2 * window * sizeof(mm_node*)); mm->node_data = malloc(window * sizeof(mm_node)); @@ -192,6 +188,8 @@ mm_new_nan(const idx_t window, idx_t min_count) { mm->window = window; mm->min_count = min_count; + mm->quantile = quantile; + mm_reset(mm); return mm; @@ -264,7 +262,7 @@ mm_update_init_nan(mm_handle *mm, ai_t ai) { } mm->newest = node; - return mm_get_median(mm); + return mm_get_quantile(mm); } @@ -460,7 +458,7 @@ mm_update_nan(mm_handle *mm, ai_t ai) { --mm->n_n; } } - return mm_get_median(mm); + return mm_get_quantile(mm); } @@ -498,9 +496,21 @@ mm_free(mm_handle *mm) { ----------------------------------------------------------------------------- */ -/* Return the current median */ +/* function to find the current index of element correspodning to the quantile */ +idx_t mm_k_stat(mm_handle *mm, idx_t idx) { + return (idx_t) floor((idx - 1) * mm->quantile) + 1; +} + +/* function to check if the current index of the quantile is integer, and so + * the quantile is the element at the top of the heap. Otherwise take midpoint */ +int mm_stat_exact(mm_handle *mm, idx_t idx) { + return ((idx - 1) * mm->quantile) == floor((idx - 1) * mm->quantile); +} + + +/* Return the current quantile */ static inline ai_t -mm_get_median(mm_handle *mm) { +mm_get_quantile(mm_handle *mm) { idx_t n_total = mm->n_l + mm->n_s; if (n_total < mm->min_count) return MM_NAN(); diff --git a/bottleneck/src/move_median/move_median.h b/bottleneck/src/move_median/move_median.h index ccc3176096..eb02f96c48 100644 --- a/bottleneck/src/move_median/move_median.h +++ b/bottleneck/src/move_median/move_median.h @@ -51,16 +51,17 @@ struct _mm_handle { mm_node *newest; /* The newest node (most recent insert) */ idx_t s_first_leaf; /* All nodes this index or greater are leaf nodes */ idx_t l_first_leaf; /* All nodes this index or greater are leaf nodes */ + double quantile; /* Value between 0 <= quantile <= 1, the quantile to compute */ }; typedef struct _mm_handle mm_handle; /* non-nan functions */ -mm_handle *mm_new(const idx_t window, idx_t min_count); +mm_handle *mm_new(const idx_t window, idx_t min_count, double quantile); ai_t mm_update_init(mm_handle *mm, ai_t ai); ai_t mm_update(mm_handle *mm, ai_t ai); /* nan functions */ -mm_handle *mm_new_nan(const idx_t window, idx_t min_count); +mm_handle *mm_new_nan(const idx_t window, idx_t min_count, double quantile); ai_t mm_update_init_nan(mm_handle *mm, ai_t ai); ai_t mm_update_nan(mm_handle *mm, ai_t ai); diff --git a/bottleneck/src/move_median/move_median_debug.c b/bottleneck/src/move_median/move_median_debug.c index 13871d2e6e..dc1017f683 100644 --- a/bottleneck/src/move_median/move_median_debug.c +++ b/bottleneck/src/move_median/move_median_debug.c @@ -1,6 +1,6 @@ #include "move_median.h" -ai_t *mm_move_median(ai_t *a, idx_t length, idx_t window, idx_t min_count); +ai_t *mm_move_median(ai_t *a, idx_t length, idx_t window, idx_t min_count, double quantile); int mm_assert_equal(ai_t *actual, ai_t *desired, ai_t *input, idx_t length, char *err_msg); int mm_unit_test(void); @@ -19,13 +19,13 @@ int main(void) { /* moving window median of 1d arrays returns output array */ -ai_t *mm_move_median(ai_t *a, idx_t length, idx_t window, idx_t min_count) { +ai_t *mm_move_median(ai_t *a, idx_t length, idx_t window, idx_t min_count, double quantile) { mm_handle *mm; ai_t *out; idx_t i; out = malloc(length * sizeof(ai_t)); - mm = mm_new_nan(window, min_count); + mm = mm_new_nan(window, min_count, quantile); for (i=0; i < length; i++) { if (i < window) { out[i] = mm_update_init_nan(mm, a[i]); @@ -84,13 +84,14 @@ int mm_unit_test(void) { int length; char *err_msg; int failed; + double quantile = 0.5; length = sizeof(arr_input) / sizeof(*arr_input); err_msg = malloc(1024 * sizeof *err_msg); sprintf(err_msg, "move_median failed with window=%d, min_count=%d", window, min_count); - actual = mm_move_median(arr_input, length, window, min_count); + actual = mm_move_median(arr_input, length, window, min_count, quantile); failed = mm_assert_equal(actual, desired, arr_input, length, err_msg); free(actual); diff --git a/bottleneck/src/move_template.c b/bottleneck/src/move_template.c index 8a589703d1..b34fc33a11 100644 --- a/bottleneck/src/move_template.c +++ b/bottleneck/src/move_template.c @@ -32,6 +32,16 @@ int axis, \ int ddof) +/* low-level functions with double argument for move_quantile */ +#define MOVE_QUANTILE(name, dtype) \ + static PyObject * \ + name##_##dtype(PyArrayObject *a, \ + int window, \ + int min_count, \ + int axis, \ + int ddof, \ + double quantile) + /* top-level functions such as move_sum */ #define MOVE_MAIN(name, ddof) \ static PyObject * \ @@ -538,19 +548,19 @@ MOVE(NAME, DTYPE0) { MOVE_MAIN(NAME, 0) /* repeat end */ -/* move_median ----------------------------------------------------------- */ +/* move_quantile ----------------------------------------------------------- */ /* dtype = [['float64'], ['float32']] */ -MOVE(move_median, DTYPE0) { +MOVE_QUANTILE(move_quantile, DTYPE0) { npy_DTYPE0 ai; - mm_handle *mm = mm_new_nan(window, min_count); + mm_handle *mm = mm_new_nan(window, min_count, quantile); INIT(NPY_DTYPE0) if (window == 1) { mm_free(mm); return PyArray_Copy(a); } if (mm == NULL) { - MEMORY_ERR("Could not allocate memory for move_median"); + MEMORY_ERR("Could not allocate memory for move_quantile"); } BN_BEGIN_ALLOW_THREADS WHILE { @@ -576,9 +586,9 @@ MOVE(move_median, DTYPE0) { /* dtype end */ /* dtype = [['int64', 'float64'], ['int32', 'float64']] */ -MOVE(move_median, DTYPE0) { +MOVE_QUANTILE(move_quantile, DTYPE0) { npy_DTYPE0 ai; - mm_handle *mm = mm_new(window, min_count); + mm_handle *mm = mm_new(window, min_count, quantile); INIT(NPY_DTYPE1) if (window == 1) { return PyArray_CastToType(a, @@ -586,7 +596,7 @@ MOVE(move_median, DTYPE0) { PyArray_CHKFLAGS(a, NPY_ARRAY_F_CONTIGUOUS)); } if (mm == NULL) { - MEMORY_ERR("Could not allocate memory for move_median"); + MEMORY_ERR("Could not allocate memory for move_quantile"); } BN_BEGIN_ALLOW_THREADS WHILE { @@ -612,7 +622,7 @@ MOVE(move_median, DTYPE0) { /* dtype end */ -MOVE_MAIN(move_median, 0) +MOVE_MAIN(move_quantile, 0) /* move_rank-------------------------------------------------------------- */ @@ -1389,7 +1399,7 @@ array([ nan, nan, 0., 1., 0., 1., 2.]) MULTILINE STRING END */ -static char move_median_doc[] = +static char move_quantile_doc[] = /* MULTILINE STRING BEGIN move_median(a, window, min_count=None, axis=-1) @@ -1498,7 +1508,7 @@ move_methods[] = { {"move_max", (PyCFunction)move_max, VARKEY, move_max_doc}, {"move_argmin", (PyCFunction)move_argmin, VARKEY, move_argmin_doc}, {"move_argmax", (PyCFunction)move_argmax, VARKEY, move_argmax_doc}, - {"move_median", (PyCFunction)move_median, VARKEY, move_median_doc}, + {"move_quantile", (PyCFunction)move_quantile, VARKEY, move_quantile_doc}, {"move_rank", (PyCFunction)move_rank, VARKEY, move_rank_doc}, {NULL, NULL, 0, NULL} }; From 10b88241cb9318601214c93b0d8b3f63b5a2d2ab Mon Sep 17 00:00:00 2001 From: riazanov Date: Thu, 15 Sep 2022 02:37:17 -0400 Subject: [PATCH 04/30] Change all move_median to move_quantile Add quantile and has_quantile parameters to the template --- .gitignore | 2 + asv_bench/benchmarks/move.py | 4 +- bottleneck/src/move_median/move_median.c | 16 ++--- bottleneck/src/move_template.c | 82 +++++++++++++++--------- bottleneck/tests/move_test.py | 4 +- bottleneck/tests/util.py | 2 +- 6 files changed, 65 insertions(+), 45 deletions(-) diff --git a/.gitignore b/.gitignore index 81808d667d..4161403fbe 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,5 @@ MANIFEST *~ \#*# bottleneck/src/bn_config.h +# TODO : change gitignore back +.vscode/c_cpp_properties.json diff --git a/asv_bench/benchmarks/move.py b/asv_bench/benchmarks/move.py index 9e24560d89..41ad95fb40 100644 --- a/asv_bench/benchmarks/move.py +++ b/asv_bench/benchmarks/move.py @@ -38,7 +38,7 @@ def time_move_argmax(self, dtype, shape, window): bn.move_argmax(self.arr, window) def time_move_median(self, dtype, shape, window): - bn.move_median(self.arr, window) + bn.move_quantile(self.arr, window) def time_move_rank(self, dtype, shape, window): bn.move_rank(self.arr, window) @@ -82,7 +82,7 @@ def time_move_argmax(self, dtype, shape, order, axis, window): bn.move_argmax(self.arr, window, axis=axis) def time_move_median(self, dtype, shape, order, axis, window): - bn.move_median(self.arr, window, axis=axis) + bn.move_quantile(self.arr, window, axis=axis) def time_move_rank(self, dtype, shape, order, axis, window): bn.move_rank(self.arr, window, axis=axis) diff --git a/bottleneck/src/move_median/move_median.c b/bottleneck/src/move_median/move_median.c index 6bc91f6038..7acec61b7f 100644 --- a/bottleneck/src/move_median/move_median.c +++ b/bottleneck/src/move_median/move_median.c @@ -71,7 +71,7 @@ mm_new(const idx_t window, idx_t min_count, double quantile) { mm->node_data = malloc(window * sizeof(mm_node)); mm->s_heap = mm->nodes; - idx_t k_stat = mm_k_stat(window); + idx_t k_stat = mm_k_stat(mm, window); mm->l_heap = &mm->nodes[k_stat]; mm->window = window; @@ -109,7 +109,7 @@ mm_update_init(mm_handle *mm, ai_t ai) { } else { /* at least one node already exists in the heaps */ - idx_t k_stat = mm_k_stat(n_s + n_l + 1); + idx_t k_stat = mm_k_stat(mm, n_s + n_l + 1); mm->newest->next = node; if (n_s >= k_stat) { @@ -181,7 +181,7 @@ mm_new_nan(const idx_t window, idx_t min_count, double quantile) { mm->node_data = malloc(window * sizeof(mm_node)); mm->s_heap = mm->nodes; - idx_t k_stat = mm_k_stat(window); + idx_t k_stat = mm_k_stat(mm, window); mm->l_heap = &mm->nodes[k_stat]; mm->n_array = &mm->nodes[window]; @@ -238,7 +238,7 @@ mm_update_init_nan(mm_handle *mm, ai_t ai) { } else { /* at least one node already exists in the heaps */ /* number of non-nan nodes including the new node is n_s + n_l + 1 */ - idx_t k_stat = mm_k_stat(n_s + n_l + 1); + idx_t k_stat = mm_k_stat(mm, n_s + n_l + 1); mm->newest->next = node; if (n_s >= k_stat) { @@ -304,7 +304,7 @@ mm_update_nan(mm_handle *mm, ai_t ai) { * to the nan array. Resulting hole in the small heap will be * filled with the rightmost leaf of the last row of the small * heap. */ - k_stat = mm_k_stat(n_s + n_l - 1); + k_stat = mm_k_stat(mm, n_s + n_l - 1); /* insert node into nan array */ node->region = NA; @@ -375,7 +375,7 @@ mm_update_nan(mm_handle *mm, ai_t ai) { * filled with the rightmost leaf of the last row of the large * heap. */ - k_stat = mm_k_stat(n_s + n_l - 1); + k_stat = mm_k_stat(mm, n_s + n_l - 1); /* insert node into nan array */ node->region = NA; node->idx = n_n; @@ -431,7 +431,7 @@ mm_update_nan(mm_handle *mm, ai_t ai) { heapify_large_node(mm, idx); } else { /* ai is not NaN but oldest node is in nan array */ - k_stat = mm_k_stat(n_s + n_l + 1); + k_stat = mm_k_stat(mm, n_s + n_l + 1); if (n_s == k_stat) { /* insert into large heap */ @@ -514,7 +514,7 @@ mm_get_quantile(mm_handle *mm) { idx_t n_total = mm->n_l + mm->n_s; if (n_total < mm->min_count) return MM_NAN(); - if (mm_stat_exact(n_total)) + if (mm_stat_exact(mm, n_total)) return mm->s_heap[0]->ai; return (mm->s_heap[0]->ai + mm->l_heap[0]->ai) / 2.0; } diff --git a/bottleneck/src/move_template.c b/bottleneck/src/move_template.c index b34fc33a11..16da2cd265 100644 --- a/bottleneck/src/move_template.c +++ b/bottleneck/src/move_template.c @@ -25,15 +25,6 @@ /* low-level functions such as move_sum_float64 */ #define MOVE(name, dtype) \ - static PyObject * \ - name##_##dtype(PyArrayObject *a, \ - int window, \ - int min_count, \ - int axis, \ - int ddof) - -/* low-level functions with double argument for move_quantile */ -#define MOVE_QUANTILE(name, dtype) \ static PyObject * \ name##_##dtype(PyArrayObject *a, \ int window, \ @@ -43,7 +34,7 @@ double quantile) /* top-level functions such as move_sum */ -#define MOVE_MAIN(name, ddof) \ +#define MOVE_MAIN(name, has_ddof, has_quantile) \ static PyObject * \ name(PyObject *self, PyObject *args, PyObject *kwds) \ { \ @@ -54,7 +45,8 @@ name##_float32, \ name##_int64, \ name##_int32, \ - ddof); \ + has_ddof, \ + has_quantile); \ } /* typedefs and prototypes ----------------------------------------------- */ @@ -67,7 +59,7 @@ struct _pairs { typedef struct _pairs pairs; /* function pointer for functions passed to mover */ -typedef PyObject *(*move_t)(PyArrayObject *, int, int, int, int); +typedef PyObject *(*move_t)(PyArrayObject *, int, int, int, int, double); static PyObject * mover(char *name, @@ -77,7 +69,8 @@ mover(char *name, move_t, move_t, move_t, - int has_ddof); + int has_ddof, + int has_quantile); /* move_sum -------------------------------------------------------------- */ @@ -158,7 +151,7 @@ MOVE(move_sum, DTYPE0) { /* dtype end */ -MOVE_MAIN(move_sum, 0) +MOVE_MAIN(move_sum, 0, 0) /* move_mean -------------------------------------------------------------- */ @@ -244,7 +237,7 @@ MOVE(move_mean, DTYPE0) { /* dtype end */ -MOVE_MAIN(move_mean, 0) +MOVE_MAIN(move_mean, 0, 0) /* move_std, move_var ---------------------------------------------------- */ @@ -383,7 +376,7 @@ MOVE(NAME, DTYPE0) { } /* dtype end */ -MOVE_MAIN(NAME, 1) +MOVE_MAIN(NAME, 1, 0) /* repeat end */ @@ -545,13 +538,13 @@ MOVE(NAME, DTYPE0) { } /* dtype end */ -MOVE_MAIN(NAME, 0) +MOVE_MAIN(NAME, 0, 0) /* repeat end */ /* move_quantile ----------------------------------------------------------- */ /* dtype = [['float64'], ['float32']] */ -MOVE_QUANTILE(move_quantile, DTYPE0) { +MOVE(move_quantile, DTYPE0) { npy_DTYPE0 ai; mm_handle *mm = mm_new_nan(window, min_count, quantile); INIT(NPY_DTYPE0) @@ -586,7 +579,7 @@ MOVE_QUANTILE(move_quantile, DTYPE0) { /* dtype end */ /* dtype = [['int64', 'float64'], ['int32', 'float64']] */ -MOVE_QUANTILE(move_quantile, DTYPE0) { +MOVE(move_quantile, DTYPE0) { npy_DTYPE0 ai; mm_handle *mm = mm_new(window, min_count, quantile); INIT(NPY_DTYPE1) @@ -622,7 +615,7 @@ MOVE_QUANTILE(move_quantile, DTYPE0) { /* dtype end */ -MOVE_MAIN(move_quantile, 0) +MOVE_MAIN(move_quantile, 0, 1) /* move_rank-------------------------------------------------------------- */ @@ -748,7 +741,7 @@ MOVE(move_rank, DTYPE0) { /* dtype end */ -MOVE_MAIN(move_rank, 0) +MOVE_MAIN(move_rank, 0, 0) /* python strings -------------------------------------------------------- */ @@ -758,6 +751,7 @@ PyObject *pystr_window = NULL; PyObject *pystr_min_count = NULL; PyObject *pystr_axis = NULL; PyObject *pystr_ddof = NULL; +PyObject *pystr_quantile = NULL; static int intern_strings(void) { @@ -766,8 +760,9 @@ intern_strings(void) { pystr_min_count = PyString_InternFromString("min_count"); pystr_axis = PyString_InternFromString("axis"); pystr_ddof = PyString_InternFromString("ddof"); + pystr_quantile = PyString_InternFromString("quantile"); return pystr_a && pystr_window && pystr_min_count && - pystr_axis && pystr_ddof; + pystr_axis && pystr_ddof && pystr_quantile; } /* mover ----------------------------------------------------------------- */ @@ -776,11 +771,13 @@ static inline int parse_args(PyObject *args, PyObject *kwds, int has_ddof, + int has_quantile, PyObject **a, PyObject **window, PyObject **min_count, PyObject **axis, - PyObject **ddof) { + PyObject **ddof, + PyObject **quantile) { const Py_ssize_t nargs = PyTuple_GET_SIZE(args); const Py_ssize_t nkwds = kwds == NULL ? 0 : PyDict_Size(kwds); if (nkwds) { @@ -788,7 +785,7 @@ parse_args(PyObject *args, PyObject *tmp; switch (nargs) { case 4: - if (has_ddof) { + if (has_ddof | has_quantile) { *axis = PyTuple_GET_ITEM(args, 3); } else { TYPE_ERR("wrong number of arguments"); @@ -837,6 +834,13 @@ parse_args(PyObject *args, nkwds_found++; } break; + } else if (has_quantile) { + tmp = PyDict_GetItem(kwds, pystr_quantile); + if (tmp != NULL) { + *quantile = tmp; + nkwds_found++; + } + break; } break; default: @@ -847,7 +851,7 @@ parse_args(PyObject *args, TYPE_ERR("wrong number of keyword arguments"); return 0; } - if (nargs + nkwds_found > 4 + has_ddof) { + if (nargs + nkwds_found > 4 + has_ddof + has_quantile) { TYPE_ERR("too many arguments"); return 0; } @@ -856,6 +860,8 @@ parse_args(PyObject *args, case 5: if (has_ddof) { *ddof = PyTuple_GET_ITEM(args, 4); + } else if (has_quantile) { + *quantile = PyTuple_GET_ITEM(args, 4); } else { TYPE_ERR("wrong number of arguments"); return 0; @@ -887,10 +893,12 @@ mover(char *name, move_t move_float32, move_t move_int64, move_t move_int32, - int has_ddof) { + int has_ddof, + int has_quantile) { int mc; int window; + double quantile; int axis; int ddof; int dtype; @@ -904,11 +912,12 @@ mover(char *name, PyObject *a_obj = NULL; PyObject *window_obj = NULL; PyObject *min_count_obj = Py_None; + PyObject *quantile_obj = Py_None; PyObject *axis_obj = NULL; PyObject *ddof_obj = NULL; - if (!parse_args(args, kwds, has_ddof, &a_obj, &window_obj, - &min_count_obj, &axis_obj, &ddof_obj)) { + if (!parse_args(args, kwds, has_ddof, has_quantile, &a_obj, &window_obj, + &min_count_obj, &axis_obj, &ddof_obj, &quantile_obj)) { return NULL; } @@ -956,6 +965,15 @@ mover(char *name, } } + /* quantile */ + if (quantile_obj == Py_None){ + /* take median by default */ + quantile = (double) 0.5; + } else { + /* QUANTILE TODO: add checks here*/ + quantile = PyFloat_AsDouble(quantile_obj); + } + ndim = PyArray_NDIM(a); /* defend against 0d beings */ @@ -1008,13 +1026,13 @@ mover(char *name, dtype = PyArray_TYPE(a); if (dtype == NPY_float64) { - y = move_float64(a, window, mc, axis, ddof); + y = move_float64(a, window, mc, axis, ddof, quantile); } else if (dtype == NPY_float32) { - y = move_float32(a, window, mc, axis, ddof); + y = move_float32(a, window, mc, axis, ddof, quantile); } else if (dtype == NPY_int64) { - y = move_int64(a, window, mc, axis, ddof); + y = move_int64(a, window, mc, axis, ddof, quantile); } else if (dtype == NPY_int32) { - y = move_int32(a, window, mc, axis, ddof); + y = move_int32(a, window, mc, axis, ddof, quantile); } else { y = slow(name, args, kwds); } diff --git a/bottleneck/tests/move_test.py b/bottleneck/tests/move_test.py index 26fe36f843..0bca5f7bcc 100644 --- a/bottleneck/tests/move_test.py +++ b/bottleneck/tests/move_test.py @@ -149,7 +149,7 @@ def test_move_median_with_nans(): aaae = assert_array_almost_equal min_count = 1 size = 10 - func = bn.move_median + func = bn.move_quantile func0 = bn.slow.move_median rs = np.random.RandomState([1, 2, 3]) for i in range(100): @@ -172,7 +172,7 @@ def test_move_median_without_nans(): aaae = assert_array_almost_equal min_count = 1 size = 10 - func = bn.move_median + func = bn.move_quantile func0 = bn.slow.move_median rs = np.random.RandomState([1, 2, 3]) for i in range(100): diff --git a/bottleneck/tests/util.py b/bottleneck/tests/util.py index cd19d63042..70d5ad95db 100644 --- a/bottleneck/tests/util.py +++ b/bottleneck/tests/util.py @@ -47,7 +47,7 @@ def func_dict(): bn.move_max, bn.move_argmin, bn.move_argmax, - bn.move_median, + bn.move_quantile, bn.move_rank, ] d["nonreduce"] = [bn.replace] From c718a188dbbe371629d8a32a110416b8ede001d7 Mon Sep 17 00:00:00 2001 From: riazanov Date: Thu, 15 Sep 2022 03:27:06 -0400 Subject: [PATCH 05/30] Add move_median as move_quantile without q argument at C level --- bottleneck/__init__.py | 4 +-- bottleneck/src/move_template.c | 60 +++++++++++++++++++++++++++++++++- bottleneck/tests/move_test.py | 2 +- bottleneck/tests/util.py | 2 +- 4 files changed, 63 insertions(+), 5 deletions(-) diff --git a/bottleneck/__init__.py b/bottleneck/__init__.py index 0aad5268f6..850da0cc21 100644 --- a/bottleneck/__init__.py +++ b/bottleneck/__init__.py @@ -4,8 +4,8 @@ from . import slow from ._pytesttester import PytestTester -from .move import (move_argmax, move_argmin, move_max, move_mean, move_quantile, - move_min, move_rank, move_std, move_sum, move_var) +from .move import (move_argmax, move_argmin, move_max, move_mean, move_median, + move_quantile, move_min, move_rank, move_std, move_sum, move_var) from .nonreduce import replace from .nonreduce_axis import (argpartition, nanrankdata, partition, push, rankdata) diff --git a/bottleneck/src/move_template.c b/bottleneck/src/move_template.c index 16da2cd265..c754932c71 100644 --- a/bottleneck/src/move_template.c +++ b/bottleneck/src/move_template.c @@ -49,6 +49,21 @@ has_quantile); \ } +#define MOVE_MAIN_SUBSTITUTE(name, func_name, has_ddof, has_quantile) \ + static PyObject * \ + name(PyObject *self, PyObject *args, PyObject *kwds) \ + { \ + return mover(#name, \ + args, \ + kwds, \ + func_name##_float64, \ + func_name##_float32, \ + func_name##_int64, \ + func_name##_int32, \ + has_ddof, \ + has_quantile); \ + } + /* typedefs and prototypes ----------------------------------------------- */ /* used by move_min and move_max */ @@ -616,6 +631,8 @@ MOVE(move_quantile, DTYPE0) { MOVE_MAIN(move_quantile, 0, 1) +MOVE_MAIN_SUBSTITUTE(move_median, move_quantile, 0, 0) + /* move_rank-------------------------------------------------------------- */ @@ -1417,7 +1434,7 @@ array([ nan, nan, 0., 1., 0., 1., 2.]) MULTILINE STRING END */ -static char move_quantile_doc[] = +static char move_median_doc[] = /* MULTILINE STRING BEGIN move_median(a, window, min_count=None, axis=-1) @@ -1455,6 +1472,46 @@ array([ 1. , 1.5, 2.5, 3.5]) MULTILINE STRING END */ +static char move_quantile_doc[] = +/* MULTILINE STRING BEGIN +move_quantile(a, window, min_count=None, axis=-1, quantile=0.5) + +TODO: right docs, change var name to q, might need to change order as well, so that quantile can be passes as non-keyword + +Moving window median along the specified axis, optionally ignoring NaNs. + +float64 output is returned for all input data types. + +Parameters +---------- +a : ndarray + Input array. If `a` is not an array, a conversion is attempted. +window : int + The number of elements in the moving window. +min_count: {int, None}, optional + If the number of non-NaN values in a window is less than `min_count`, + then a value of NaN is assigned to the window. By default `min_count` + is None, which is equivalent to setting `min_count` equal to `window`. +axis : int, optional + The axis over which the window is moved. By default the last axis + (axis=-1) is used. An axis of None is not allowed. + +Returns +------- +y : ndarray + The moving median of the input array along the specified axis. The + output has the same shape as the input. + +Examples +-------- +>>> a = np.array([1.0, 2.0, 3.0, 4.0]) +>>> bn.move_median(a, window=2) +array([ nan, 1.5, 2.5, 3.5]) +>>> bn.move_median(a, window=2, min_count=1) +array([ 1. , 1.5, 2.5, 3.5]) + +MULTILINE STRING END */ + static char move_rank_doc[] = /* MULTILINE STRING BEGIN move_rank(a, window, min_count=None, axis=-1) @@ -1526,6 +1583,7 @@ move_methods[] = { {"move_max", (PyCFunction)move_max, VARKEY, move_max_doc}, {"move_argmin", (PyCFunction)move_argmin, VARKEY, move_argmin_doc}, {"move_argmax", (PyCFunction)move_argmax, VARKEY, move_argmax_doc}, + {"move_median", (PyCFunction)move_median, VARKEY, move_median_doc}, {"move_quantile", (PyCFunction)move_quantile, VARKEY, move_quantile_doc}, {"move_rank", (PyCFunction)move_rank, VARKEY, move_rank_doc}, {NULL, NULL, 0, NULL} diff --git a/bottleneck/tests/move_test.py b/bottleneck/tests/move_test.py index 0bca5f7bcc..d724fbe23b 100644 --- a/bottleneck/tests/move_test.py +++ b/bottleneck/tests/move_test.py @@ -149,7 +149,7 @@ def test_move_median_with_nans(): aaae = assert_array_almost_equal min_count = 1 size = 10 - func = bn.move_quantile + func = bn.move_median func0 = bn.slow.move_median rs = np.random.RandomState([1, 2, 3]) for i in range(100): diff --git a/bottleneck/tests/util.py b/bottleneck/tests/util.py index 70d5ad95db..cd19d63042 100644 --- a/bottleneck/tests/util.py +++ b/bottleneck/tests/util.py @@ -47,7 +47,7 @@ def func_dict(): bn.move_max, bn.move_argmin, bn.move_argmax, - bn.move_quantile, + bn.move_median, bn.move_rank, ] d["nonreduce"] = [bn.replace] From 009c83554b73cbe46fd855bc33a6866cff9b1611 Mon Sep 17 00:00:00 2001 From: andrii-riazanov Date: Thu, 15 Sep 2022 23:04:20 -0400 Subject: [PATCH 06/30] Fix bug with addressing quantile before assignment --- .gitignore | 3 +++ bottleneck/src/move_median/move_median.c | 14 ++++---------- bottleneck/src/move_template.c | 3 +++ 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index 4161403fbe..d2d9ea2ba6 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,6 @@ MANIFEST bottleneck/src/bn_config.h # TODO : change gitignore back .vscode/c_cpp_properties.json +c.o +checks.c +checks.py diff --git a/bottleneck/src/move_median/move_median.c b/bottleneck/src/move_median/move_median.c index 7acec61b7f..021b2ffe61 100644 --- a/bottleneck/src/move_median/move_median.c +++ b/bottleneck/src/move_median/move_median.c @@ -18,9 +18,6 @@ node1->idx = idx2; \ node2->idx = idx1; \ idx1 = idx2 - -#define QUANTILE 0.25 - /* ----------------------------------------------------------------------------- Prototypes @@ -52,9 +49,6 @@ static inline void mm_swap_heap_heads(mm_node **s_heap, idx_t n_s, mm_node *s_node, mm_node *l_node); - - - /* ----------------------------------------------------------------------------- Top-level non-nan functions @@ -70,6 +64,8 @@ mm_new(const idx_t window, idx_t min_count, double quantile) { mm->nodes = malloc(window * sizeof(mm_node*)); mm->node_data = malloc(window * sizeof(mm_node)); + mm->quantile = quantile; + mm->s_heap = mm->nodes; idx_t k_stat = mm_k_stat(mm, window); mm->l_heap = &mm->nodes[k_stat]; @@ -78,8 +74,6 @@ mm_new(const idx_t window, idx_t min_count, double quantile) { mm->odd = window % 2; mm->min_count = min_count; - mm->quantile = quantile; - mm_reset(mm); return mm; @@ -180,6 +174,8 @@ mm_new_nan(const idx_t window, idx_t min_count, double quantile) { mm->nodes = malloc(2 * window * sizeof(mm_node*)); mm->node_data = malloc(window * sizeof(mm_node)); + mm->quantile = quantile; + mm->s_heap = mm->nodes; idx_t k_stat = mm_k_stat(mm, window); mm->l_heap = &mm->nodes[k_stat]; @@ -188,8 +184,6 @@ mm_new_nan(const idx_t window, idx_t min_count, double quantile) { mm->window = window; mm->min_count = min_count; - mm->quantile = quantile; - mm_reset(mm); return mm; diff --git a/bottleneck/src/move_template.c b/bottleneck/src/move_template.c index c754932c71..094299bd85 100644 --- a/bottleneck/src/move_template.c +++ b/bottleneck/src/move_template.c @@ -631,6 +631,8 @@ MOVE(move_quantile, DTYPE0) { MOVE_MAIN(move_quantile, 0, 1) +/* move_median is move_quantile but doesn't take the quantile argument, + * which defaults to 0.5, giving the desired median */ MOVE_MAIN_SUBSTITUTE(move_median, move_quantile, 0, 0) @@ -1051,6 +1053,7 @@ mover(char *name, } else if (dtype == NPY_int32) { y = move_int32(a, window, mc, axis, ddof, quantile); } else { + /* TODO: add slow for quantile */ y = slow(name, args, kwds); } From 832035ad8fd136a363528ec9cee02e99d2e300a0 Mon Sep 17 00:00:00 2001 From: andrii-riazanov Date: Sat, 17 Sep 2022 01:47:24 -0400 Subject: [PATCH 07/30] Initial tests and some fixes Added all imports Fixed a bug when q=1. Call move_max for this case (on python layer) Added a lot of tests for move_median and move_quantile --- bottleneck/__init__.py | 5 +- bottleneck/slow/move.py | 10 ++ bottleneck/src/move_median/move_median.c | 10 +- bottleneck/src/move_quantile.py | 14 ++ bottleneck/src/move_template.c | 30 ++-- bottleneck/tests/move_test.py | 167 ++++++++++++++++++++--- bottleneck/tests/util.py | 1 + 7 files changed, 195 insertions(+), 42 deletions(-) create mode 100644 bottleneck/src/move_quantile.py diff --git a/bottleneck/__init__.py b/bottleneck/__init__.py index 850da0cc21..63b7c764d9 100644 --- a/bottleneck/__init__.py +++ b/bottleneck/__init__.py @@ -5,7 +5,10 @@ from . import slow from ._pytesttester import PytestTester from .move import (move_argmax, move_argmin, move_max, move_mean, move_median, - move_quantile, move_min, move_rank, move_std, move_sum, move_var) + move_min, move_rank, move_std, move_sum, move_var) + +from .src.move_quantile import move_quantile + from .nonreduce import replace from .nonreduce_axis import (argpartition, nanrankdata, partition, push, rankdata) diff --git a/bottleneck/slow/move.py b/bottleneck/slow/move.py index 222b6ca840..64beb0a570 100644 --- a/bottleneck/slow/move.py +++ b/bottleneck/slow/move.py @@ -14,6 +14,7 @@ "move_argmin", "move_argmax", "move_median", + "move_quantile", "move_rank", ] @@ -104,11 +105,20 @@ def move_median(a, window, min_count=None, axis=-1): "Slow move_median for unaccelerated dtype" return move_func(np.nanmedian, a, window, min_count, axis=axis) +def move_quantile(a, window, min_count=None, axis=-1, q=0.5): + "Slow move_quantile for unaccelerated dtype" + return move_func(np_nanquantile_infs, a, window, min_count, axis=axis, q=q) def move_rank(a, window, min_count=None, axis=-1): "Slow move_rank for unaccelerated dtype" return move_func(lastrank, a, window, min_count, axis=axis) +# function for handling infs in np.nanquantile +def np_nanquantile_infs(a, **kwargs): + if not np.isinf(a).any(): + return np.nanquantile(a, method='midpoint', **kwargs) + else: + return ((np.nanquantile(a, method='lower', **kwargs) + np.nanquantile(a, method='higher', **kwargs)) / 2) # magic utility functions --------------------------------------------------- diff --git a/bottleneck/src/move_median/move_median.c b/bottleneck/src/move_median/move_median.c index 021b2ffe61..e0f31ee09f 100644 --- a/bottleneck/src/move_median/move_median.c +++ b/bottleneck/src/move_median/move_median.c @@ -26,8 +26,8 @@ idx1 = idx2 /* helper functions */ static inline ai_t mm_get_quantile(mm_handle *mm); -idx_t mm_k_stat(mm_handle *mm, idx_t idx); -int mm_stat_exact(mm_handle *mm, idx_t idx); +static inline idx_t mm_k_stat(mm_handle *mm, idx_t idx); +static inline short mm_stat_exact(mm_handle *mm, idx_t idx); static inline void heapify_small_node(mm_handle *mm, idx_t idx); static inline void heapify_large_node(mm_handle *mm, idx_t idx); static inline idx_t mm_get_smallest_child(mm_node **heap, idx_t window, @@ -61,7 +61,7 @@ static inline void mm_swap_heap_heads(mm_node **s_heap, idx_t n_s, mm_handle * mm_new(const idx_t window, idx_t min_count, double quantile) { mm_handle *mm = malloc(sizeof(mm_handle)); - mm->nodes = malloc(window * sizeof(mm_node*)); + mm->nodes = malloc(window * sizeof(mm_node*)); mm->node_data = malloc(window * sizeof(mm_node)); mm->quantile = quantile; @@ -491,13 +491,13 @@ mm_free(mm_handle *mm) { */ /* function to find the current index of element correspodning to the quantile */ -idx_t mm_k_stat(mm_handle *mm, idx_t idx) { +static inline idx_t mm_k_stat(mm_handle *mm, idx_t idx) { return (idx_t) floor((idx - 1) * mm->quantile) + 1; } /* function to check if the current index of the quantile is integer, and so * the quantile is the element at the top of the heap. Otherwise take midpoint */ -int mm_stat_exact(mm_handle *mm, idx_t idx) { +static inline short mm_stat_exact(mm_handle *mm, idx_t idx) { return ((idx - 1) * mm->quantile) == floor((idx - 1) * mm->quantile); } diff --git a/bottleneck/src/move_quantile.py b/bottleneck/src/move_quantile.py new file mode 100644 index 0000000000..e5c8e6e79c --- /dev/null +++ b/bottleneck/src/move_quantile.py @@ -0,0 +1,14 @@ +from ..move import move_max, move_min +from ..move import move_quantile as move_quantile_c + +all = ["move_quantile"] + +def move_quantile(*args, **kwargs): + if ('q' not in kwargs) or ((kwargs['q'] != 1.) and (kwargs['q'] != 0.)): + return move_quantile_c(*args, **kwargs) + elif (kwargs['q'] == 1.): + del kwargs['q'] + return move_max(*args, **kwargs) + else: + del kwargs['q'] + return move_min(*args, **kwargs) \ No newline at end of file diff --git a/bottleneck/src/move_template.c b/bottleneck/src/move_template.c index 094299bd85..3e96974733 100644 --- a/bottleneck/src/move_template.c +++ b/bottleneck/src/move_template.c @@ -631,7 +631,7 @@ MOVE(move_quantile, DTYPE0) { MOVE_MAIN(move_quantile, 0, 1) -/* move_median is move_quantile but doesn't take the quantile argument, +/* move_median uses move_quantile but doesn't take the quantile argument, * which defaults to 0.5, giving the desired median */ MOVE_MAIN_SUBSTITUTE(move_median, move_quantile, 0, 0) @@ -779,7 +779,7 @@ intern_strings(void) { pystr_min_count = PyString_InternFromString("min_count"); pystr_axis = PyString_InternFromString("axis"); pystr_ddof = PyString_InternFromString("ddof"); - pystr_quantile = PyString_InternFromString("quantile"); + pystr_quantile = PyString_InternFromString("q"); return pystr_a && pystr_window && pystr_min_count && pystr_axis && pystr_ddof && pystr_quantile; } @@ -1477,9 +1477,9 @@ MULTILINE STRING END */ static char move_quantile_doc[] = /* MULTILINE STRING BEGIN -move_quantile(a, window, min_count=None, axis=-1, quantile=0.5) +move_quantile(a, window, min_count=None, axis=-1, q=0.5) -TODO: right docs, change var name to q, might need to change order as well, so that quantile can be passes as non-keyword +TODO: right docs, change var name to q, might need to change order as well, so that quantile can be passes as non-keyword third argument (hard) Moving window median along the specified axis, optionally ignoring NaNs. @@ -1578,17 +1578,17 @@ MULTILINE STRING END */ static PyMethodDef move_methods[] = { - {"move_sum", (PyCFunction)move_sum, VARKEY, move_sum_doc}, - {"move_mean", (PyCFunction)move_mean, VARKEY, move_mean_doc}, - {"move_std", (PyCFunction)move_std, VARKEY, move_std_doc}, - {"move_var", (PyCFunction)move_var, VARKEY, move_var_doc}, - {"move_min", (PyCFunction)move_min, VARKEY, move_min_doc}, - {"move_max", (PyCFunction)move_max, VARKEY, move_max_doc}, - {"move_argmin", (PyCFunction)move_argmin, VARKEY, move_argmin_doc}, - {"move_argmax", (PyCFunction)move_argmax, VARKEY, move_argmax_doc}, - {"move_median", (PyCFunction)move_median, VARKEY, move_median_doc}, - {"move_quantile", (PyCFunction)move_quantile, VARKEY, move_quantile_doc}, - {"move_rank", (PyCFunction)move_rank, VARKEY, move_rank_doc}, + {"move_sum", (PyCFunction)move_sum, VARKEY, move_sum_doc}, + {"move_mean", (PyCFunction)move_mean, VARKEY, move_mean_doc}, + {"move_std", (PyCFunction)move_std, VARKEY, move_std_doc}, + {"move_var", (PyCFunction)move_var, VARKEY, move_var_doc}, + {"move_min", (PyCFunction)move_min, VARKEY, move_min_doc}, + {"move_max", (PyCFunction)move_max, VARKEY, move_max_doc}, + {"move_argmin", (PyCFunction)move_argmin, VARKEY, move_argmin_doc}, + {"move_argmax", (PyCFunction)move_argmax, VARKEY, move_argmax_doc}, + {"move_median", (PyCFunction)move_median, VARKEY, move_median_doc}, + {"move_quantile", (PyCFunction)move_quantile, VARKEY, move_quantile_doc}, + {"move_rank", (PyCFunction)move_rank, VARKEY, move_rank_doc}, {NULL, NULL, 0, NULL} }; diff --git a/bottleneck/tests/move_test.py b/bottleneck/tests/move_test.py index d724fbe23b..d73c86e520 100644 --- a/bottleneck/tests/move_test.py +++ b/bottleneck/tests/move_test.py @@ -5,6 +5,7 @@ import bottleneck as bn from .util import arrays, array_order import pytest +import itertools @pytest.mark.parametrize("func", bn.get_functions("move"), ids=lambda x: x.__name__) @@ -142,6 +143,7 @@ def test_arg_parse_raises(func): # increase size to 30. With those two changes the unit tests will take a # LONG time to run. +REPEAT = 20 def test_move_median_with_nans(): """test move_median.c with nans""" @@ -152,18 +154,19 @@ def test_move_median_with_nans(): func = bn.move_median func0 = bn.slow.move_median rs = np.random.RandomState([1, 2, 3]) - for i in range(100): - a = np.arange(size, dtype=np.float64) - idx = rs.rand(*a.shape) < 0.1 - a[idx] = np.inf - idx = rs.rand(*a.shape) < 0.2 - a[idx] = np.nan - rs.shuffle(a) - for window in range(2, size + 1): - actual = func(a, window=window, min_count=min_count) - desired = func0(a, window=window, min_count=min_count) - err_msg = fmt % (func.__name__, window, min_count, a) - aaae(actual, desired, decimal=5, err_msg=err_msg) + for size in [1, 2, 3, 4, 5, 9, 10, 19, 20, 21]: + for i in range(REPEAT): + a = np.arange(size, dtype=np.float64) + idx = rs.rand(*a.shape) < 0.1 + a[idx] = np.inf + idx = rs.rand(*a.shape) < 0.2 + a[idx] = np.nan + rs.shuffle(a) + for window in range(2, size + 1): + actual = func(a, window=window, min_count=min_count) + desired = func0(a, window=window, min_count=min_count) + err_msg = fmt % (func.__name__, window, min_count, a) + aaae(actual, desired, decimal=5, err_msg=err_msg) def test_move_median_without_nans(): @@ -172,17 +175,139 @@ def test_move_median_without_nans(): aaae = assert_array_almost_equal min_count = 1 size = 10 - func = bn.move_quantile + func = bn.move_median func0 = bn.slow.move_median rs = np.random.RandomState([1, 2, 3]) - for i in range(100): - a = np.arange(size, dtype=np.int64) - rs.shuffle(a) - for window in range(2, size + 1): - actual = func(a, window=window, min_count=min_count) - desired = func0(a, window=window, min_count=min_count) - err_msg = fmt % (func.__name__, window, min_count, a) - aaae(actual, desired, decimal=5, err_msg=err_msg) + for size in [1, 2, 3, 4, 5, 9, 10, 19, 20, 21]: + for i in range(REPEAT): + a = np.arange(size, dtype=np.int64) + rs.shuffle(a) + for window in range(2, size + 1): + actual = func(a, window=window, min_count=min_count) + desired = func0(a, window=window, min_count=min_count) + err_msg = fmt % (func.__name__, window, min_count, a) + aaae(actual, desired, decimal=5, err_msg=err_msg) + + +# --------------------------------------------------------------------------- +# move_quantile.c is newly added. So let's do extensive testing +# +# Unfortunately, np.nanmedian(a) and np.nanquantile(a, q=0.5) don't always agree +# when a contains inf or -inf values. See for instance: +# https://github.com/numpy/numpy/issues/21932 +# https://github.com/numpy/numpy/issues/21091 +# +# Let's first test without inf and -inf + +def test_move_quantile_with_nans(): + """test move_quantile.c with nans""" + fmt = "\nfunc %s | window %d | min_count %s | q %f\n\nInput array:\n%s\n" + aaae = assert_array_almost_equal + min_count = 1 + size = 10 + func = bn.move_quantile + func0 = bn.slow.move_quantile + rs = np.random.RandomState([1, 2, 3]) + for size in [1, 2, 3, 5, 9, 10, 20, 21]: + for _ in range(REPEAT): + # 0 and 1 are important edge cases + for q in [0., 1., rs.rand()]: + for inf_frac in [0.2, 0.5, 0.7]: + a = np.arange(size, dtype=np.float64) + idx = rs.rand(*a.shape) < inf_frac + a[idx] = np.nan + rs.shuffle(a) + for window in range(2, size + 1): + actual = func(a, window=window, min_count=min_count, q=q) + desired = func0(a, window=window, min_count=min_count, q=q) + err_msg = fmt % (func.__name__, window, min_count, q, a) + aaae(actual, desired, decimal=5, err_msg=err_msg) + +def test_move_quantile_without_nans(): + """test move_quantile.c without nans""" + fmt = "\nfunc %s | window %d | min_count %s | q %f\n\nInput array:\n%s\n" + aaae = assert_array_almost_equal + min_count = 1 + size = 10 + func = bn.move_quantile + func0 = bn.slow.move_quantile_no_infs + rs = np.random.RandomState([1, 2, 3]) + for size in [1, 2, 3, 5, 9, 10, 20, 21]: + for _ in range(REPEAT): + for q in [0., 1., rs.rand()]: + a = np.arange(size, dtype=np.float64) + rs.shuffle(a) + for window in range(2, size + 1): + actual = func(a, window=window, min_count=min_count, q=q) + desired = func0(a, window=window, min_count=min_count, q=q) + err_msg = fmt % (func.__name__, window, min_count, q, a) + aaae(actual, desired, decimal=5, err_msg=err_msg) + + +# Now let's deal with inf ans -infs +# TODO explain what's happening there + +def test_numpy_nanquantile_infs(): + """test move_median.c with nans""" + fmt = "\nfunc %s \n\nInput array:\n%s\n" + aaae = assert_array_almost_equal + min_count = 1 + func = np.nanmedian + func0 = bn.slow.np_nanquantile_infs + rs = np.random.RandomState([1, 2, 3]) + sizes = [1, 2, 3, 4, 5, 9, 10, 20, 31, 50] + fracs = [0., 0.2, 0.4, 0.6, 0.8, 1.] + for size in sizes: + for _ in range(REPEAT): + for inf_frac, minus_inf_frac, nan_frac in itertools.product(fracs, fracs, fracs): + a = np.arange(size, dtype=np.float64) + idx = rs.rand(*a.shape) < inf_frac + a[idx] = np.inf + idx = rs.rand(*a.shape) < minus_inf_frac + a[idx] = -np.inf + idx = rs.rand(*a.shape) < nan_frac + a[idx] = np.nan + rs.shuffle(a) + actual = func(a) + desired = func0(a, q=0.5) + err_msg = fmt % (func.__name__, a) + aaae(actual, desired, decimal=5, err_msg=err_msg) + +# This should convince us TODO finish + + +def test_move_quantile_with_infs_and_nans(): + """test move_quantile.c with infs and nans""" + fmt = "\nfunc %s | window %d | min_count %s | q %f\n\nInput array:\n%s\n" + aaae = assert_array_almost_equal + min_count = 1 + func = bn.move_quantile + func0 = bn.slow.move_quantile + rs = np.random.RandomState([1, 2, 3]) + fracs = [0., 0.2, 0.4, 0.6, 0.8, 1.] + for size in [1, 2, 3, 5, 9, 10, 20, 21, 47, 48]: + print(size) + for min_count in [1, 2, 3, size//2, size - 1, size]: + if min_count < 1 or min_count > size: + continue + for _ in range(REPEAT): + for q in [0., 1., rs.rand()]: + for inf_frac, minus_inf_frac, nan_frac in itertools.product(fracs, fracs, fracs): + a = np.arange(size, dtype=np.float64) + idx = rs.rand(*a.shape) < inf_frac + a[idx] = np.inf + idx = rs.rand(*a.shape) < minus_inf_frac + a[idx] = -np.inf + idx = rs.rand(*a.shape) < nan_frac + a[idx] = np.nan + rs.shuffle(a) + for window in range(min_count, size + 1): + actual = func(a, window=window, min_count=min_count, q=q) + desired = func0(a, window=window, min_count=min_count, q=q) + err_msg = fmt % (func.__name__, window, min_count, q, a) + aaae(actual, desired, decimal=5, err_msg=err_msg) + + # ---------------------------------------------------------------------------- diff --git a/bottleneck/tests/util.py b/bottleneck/tests/util.py index cd19d63042..c94faa2e28 100644 --- a/bottleneck/tests/util.py +++ b/bottleneck/tests/util.py @@ -48,6 +48,7 @@ def func_dict(): bn.move_argmin, bn.move_argmax, bn.move_median, + bn.move_quantile, bn.move_rank, ] d["nonreduce"] = [bn.replace] From 7af716847e1c1892d6251dc2098a6dc03b06819b Mon Sep 17 00:00:00 2001 From: riazanov Date: Sat, 17 Sep 2022 18:15:04 -0400 Subject: [PATCH 08/30] Finish extensive testing of move_quantile Fix keyword argument "method"/"interpolation" for different numpy verisons (keyword was changed after 1.22.0) Copy over doc string for move_quantile from C layer to Python layer --- asv_bench/benchmarks/move.py | 6 ++++ bottleneck/__init__.py | 5 +-- bottleneck/slow/move.py | 17 ++++++++-- bottleneck/src/move_quantile.py | 4 ++- bottleneck/tests/move_test.py | 57 ++++++++++++++++++++------------- 5 files changed, 62 insertions(+), 27 deletions(-) diff --git a/asv_bench/benchmarks/move.py b/asv_bench/benchmarks/move.py index 41ad95fb40..5b229c46fb 100644 --- a/asv_bench/benchmarks/move.py +++ b/asv_bench/benchmarks/move.py @@ -38,6 +38,9 @@ def time_move_argmax(self, dtype, shape, window): bn.move_argmax(self.arr, window) def time_move_median(self, dtype, shape, window): + bn.move_median(self.arr, window) + + def time_move_quantile(self, dtype, shape, window): bn.move_quantile(self.arr, window) def time_move_rank(self, dtype, shape, window): @@ -82,6 +85,9 @@ def time_move_argmax(self, dtype, shape, order, axis, window): bn.move_argmax(self.arr, window, axis=axis) def time_move_median(self, dtype, shape, order, axis, window): + bn.move_median(self.arr, window, axis=axis) + + def time_move_quantile(self, dtype, shape, order, axis, window): bn.move_quantile(self.arr, window, axis=axis) def time_move_rank(self, dtype, shape, order, axis, window): diff --git a/bottleneck/__init__.py b/bottleneck/__init__.py index 63b7c764d9..e80b7cc135 100644 --- a/bottleneck/__init__.py +++ b/bottleneck/__init__.py @@ -4,10 +4,11 @@ from . import slow from ._pytesttester import PytestTester -from .move import (move_argmax, move_argmin, move_max, move_mean, move_median, +from .move import (move_argmax, move_argmin, move_max, move_mean, move_median, move_min, move_rank, move_std, move_sum, move_var) -from .src.move_quantile import move_quantile +from .move import move_quantile as move_quantile_c +from .src.move_quantile import move_quantile as move_quantile from .nonreduce import replace from .nonreduce_axis import (argpartition, nanrankdata, partition, push, diff --git a/bottleneck/slow/move.py b/bottleneck/slow/move.py index 64beb0a570..0cb97e2cb9 100644 --- a/bottleneck/slow/move.py +++ b/bottleneck/slow/move.py @@ -114,11 +114,24 @@ def move_rank(a, window, min_count=None, axis=-1): return move_func(lastrank, a, window, min_count, axis=axis) # function for handling infs in np.nanquantile +from packaging import version +if version.parse(np.__version__) > version.parse("1.22.0"): + METHOD_KEYWORD = "method" +else: + METHOD_KEYWORD = "interpolation" + def np_nanquantile_infs(a, **kwargs): if not np.isinf(a).any(): - return np.nanquantile(a, method='midpoint', **kwargs) + kwargs[METHOD_KEYWORD] = 'midpoint' + return np.nanquantile(a, **kwargs) else: - return ((np.nanquantile(a, method='lower', **kwargs) + np.nanquantile(a, method='higher', **kwargs)) / 2) + kwargs[METHOD_KEYWORD] = 'lower' + lower_nanquantile = np.nanquantile(a, **kwargs) + kwargs[METHOD_KEYWORD] = 'higher' + higher_nanquantile = np.nanquantile(a, **kwargs) + + midpoint_nanquantile = (lower_nanquantile + higher_nanquantile) / 2 + return midpoint_nanquantile # magic utility functions --------------------------------------------------- diff --git a/bottleneck/src/move_quantile.py b/bottleneck/src/move_quantile.py index e5c8e6e79c..5a303ad830 100644 --- a/bottleneck/src/move_quantile.py +++ b/bottleneck/src/move_quantile.py @@ -11,4 +11,6 @@ def move_quantile(*args, **kwargs): return move_max(*args, **kwargs) else: del kwargs['q'] - return move_min(*args, **kwargs) \ No newline at end of file + return move_min(*args, **kwargs) + +move_quantile.__doc__ = move_quantile_c.__doc__ \ No newline at end of file diff --git a/bottleneck/tests/move_test.py b/bottleneck/tests/move_test.py index d73c86e520..550ef1f0f0 100644 --- a/bottleneck/tests/move_test.py +++ b/bottleneck/tests/move_test.py @@ -143,7 +143,7 @@ def test_arg_parse_raises(func): # increase size to 30. With those two changes the unit tests will take a # LONG time to run. -REPEAT = 20 +REPEAT_MEDIAN = 10 def test_move_median_with_nans(): """test move_median.c with nans""" @@ -155,7 +155,7 @@ def test_move_median_with_nans(): func0 = bn.slow.move_median rs = np.random.RandomState([1, 2, 3]) for size in [1, 2, 3, 4, 5, 9, 10, 19, 20, 21]: - for i in range(REPEAT): + for i in range(REPEAT_MEDIAN): a = np.arange(size, dtype=np.float64) idx = rs.rand(*a.shape) < 0.1 a[idx] = np.inf @@ -179,7 +179,7 @@ def test_move_median_without_nans(): func0 = bn.slow.move_median rs = np.random.RandomState([1, 2, 3]) for size in [1, 2, 3, 4, 5, 9, 10, 19, 20, 21]: - for i in range(REPEAT): + for i in range(REPEAT_MEDIAN): a = np.arange(size, dtype=np.int64) rs.shuffle(a) for window in range(2, size + 1): @@ -199,6 +199,8 @@ def test_move_median_without_nans(): # # Let's first test without inf and -inf +REPEAT_QUANTILE = 10 + def test_move_quantile_with_nans(): """test move_quantile.c with nans""" fmt = "\nfunc %s | window %d | min_count %s | q %f\n\nInput array:\n%s\n" @@ -209,7 +211,7 @@ def test_move_quantile_with_nans(): func0 = bn.slow.move_quantile rs = np.random.RandomState([1, 2, 3]) for size in [1, 2, 3, 5, 9, 10, 20, 21]: - for _ in range(REPEAT): + for _ in range(REPEAT_QUANTILE): # 0 and 1 are important edge cases for q in [0., 1., rs.rand()]: for inf_frac in [0.2, 0.5, 0.7]: @@ -230,10 +232,10 @@ def test_move_quantile_without_nans(): min_count = 1 size = 10 func = bn.move_quantile - func0 = bn.slow.move_quantile_no_infs + func0 = bn.slow.move_quantile rs = np.random.RandomState([1, 2, 3]) for size in [1, 2, 3, 5, 9, 10, 20, 21]: - for _ in range(REPEAT): + for _ in range(REPEAT_QUANTILE): for q in [0., 1., rs.rand()]: a = np.arange(size, dtype=np.float64) rs.shuffle(a) @@ -247,18 +249,23 @@ def test_move_quantile_without_nans(): # Now let's deal with inf ans -infs # TODO explain what's happening there + +from ..slow.move import np_nanquantile_infs + +REPEAT_NUMPY_QUANTILE = 10 + def test_numpy_nanquantile_infs(): """test move_median.c with nans""" fmt = "\nfunc %s \n\nInput array:\n%s\n" aaae = assert_array_almost_equal min_count = 1 func = np.nanmedian - func0 = bn.slow.np_nanquantile_infs + func0 = np_nanquantile_infs rs = np.random.RandomState([1, 2, 3]) sizes = [1, 2, 3, 4, 5, 9, 10, 20, 31, 50] fracs = [0., 0.2, 0.4, 0.6, 0.8, 1.] for size in sizes: - for _ in range(REPEAT): + for _ in range(REPEAT_NUMPY_QUANTILE): for inf_frac, minus_inf_frac, nan_frac in itertools.product(fracs, fracs, fracs): a = np.arange(size, dtype=np.float64) idx = rs.rand(*a.shape) < inf_frac @@ -275,33 +282,39 @@ def test_numpy_nanquantile_infs(): # This should convince us TODO finish +REPEAT_FULL_QUANTILE = 1 def test_move_quantile_with_infs_and_nans(): """test move_quantile.c with infs and nans""" fmt = "\nfunc %s | window %d | min_count %s | q %f\n\nInput array:\n%s\n" aaae = assert_array_almost_equal - min_count = 1 func = bn.move_quantile func0 = bn.slow.move_quantile rs = np.random.RandomState([1, 2, 3]) fracs = [0., 0.2, 0.4, 0.6, 0.8, 1.] - for size in [1, 2, 3, 5, 9, 10, 20, 21, 47, 48]: + fracs = [0., 0.3, 0.65, 1.] + inf_minf_nan_fracs = [triple for triple in itertools.product(fracs, fracs, fracs) if np.sum(triple) <= 1] + total = 0 + # for size in [1, 2, 3, 5, 9, 10, 20, 21, 47, 48]: + for size in [1, 2, 3, 5, 9, 10, 20, 21]: print(size) for min_count in [1, 2, 3, size//2, size - 1, size]: if min_count < 1 or min_count > size: continue - for _ in range(REPEAT): - for q in [0., 1., rs.rand()]: - for inf_frac, minus_inf_frac, nan_frac in itertools.product(fracs, fracs, fracs): - a = np.arange(size, dtype=np.float64) - idx = rs.rand(*a.shape) < inf_frac - a[idx] = np.inf - idx = rs.rand(*a.shape) < minus_inf_frac - a[idx] = -np.inf - idx = rs.rand(*a.shape) < nan_frac - a[idx] = np.nan - rs.shuffle(a) - for window in range(min_count, size + 1): + for (inf_frac, minus_inf_frac, nan_frac) in inf_minf_nan_fracs: + for window in range(min_count, size + 1): + for _ in range(REPEAT_FULL_QUANTILE): + # for q in [0., 1., 0.25, 0.75, rs.rand()]: + for q in [0.25, 0.75, rs.rand()]: + a = np.arange(size, dtype=np.float64) + randoms = rs.rand(*a.shape) + idx_nans = randoms < inf_frac + minus_inf_frac + nan_frac + a[idx_nans] = np.nan + idx_minfs = randoms < inf_frac + minus_inf_frac + a[idx_minfs] = -np.inf + idx_infs = randoms < inf_frac + a[idx_infs] = np.inf + rs.shuffle(a) actual = func(a, window=window, min_count=min_count, q=q) desired = func0(a, window=window, min_count=min_count, q=q) err_msg = fmt % (func.__name__, window, min_count, q, a) From 42eddad90dae65994fee6468bade47ca67ccfefc Mon Sep 17 00:00:00 2001 From: riazanov Date: Sat, 17 Sep 2022 18:37:32 -0400 Subject: [PATCH 09/30] Ignore warnings from numpy about infs and NaNs --- bottleneck/slow/move.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/bottleneck/slow/move.py b/bottleneck/slow/move.py index 0cb97e2cb9..30bdcdd815 100644 --- a/bottleneck/slow/move.py +++ b/bottleneck/slow/move.py @@ -120,18 +120,20 @@ def move_rank(a, window, min_count=None, axis=-1): else: METHOD_KEYWORD = "interpolation" -def np_nanquantile_infs(a, **kwargs): - if not np.isinf(a).any(): - kwargs[METHOD_KEYWORD] = 'midpoint' - return np.nanquantile(a, **kwargs) - else: - kwargs[METHOD_KEYWORD] = 'lower' - lower_nanquantile = np.nanquantile(a, **kwargs) - kwargs[METHOD_KEYWORD] = 'higher' - higher_nanquantile = np.nanquantile(a, **kwargs) - - midpoint_nanquantile = (lower_nanquantile + higher_nanquantile) / 2 - return midpoint_nanquantile +def np_nanquantile_infs(a, **kwargs): + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + if not np.isinf(a).any(): + kwargs[METHOD_KEYWORD] = 'midpoint' + return np.nanquantile(a, **kwargs) + else: + kwargs[METHOD_KEYWORD] = 'lower' + lower_nanquantile = np.nanquantile(a, **kwargs) + kwargs[METHOD_KEYWORD] = 'higher' + higher_nanquantile = np.nanquantile(a, **kwargs) + + midpoint_nanquantile = (lower_nanquantile + higher_nanquantile) / 2 + return midpoint_nanquantile # magic utility functions --------------------------------------------------- From c2a2ae3e2a77299f7fe6f22a2e77094fefd6d8e1 Mon Sep 17 00:00:00 2001 From: riazanov Date: Tue, 20 Sep 2022 21:01:43 -0400 Subject: [PATCH 10/30] move_quantile(q=0) vs move_min benching --- bottleneck/__init__.py | 2 ++ bottleneck/benchmark/bench_detailed.py | 5 +++-- bottleneck/src/move_quantile.py | 8 +++++++- bottleneck/tests/util.py | 1 + 4 files changed, 13 insertions(+), 3 deletions(-) diff --git a/bottleneck/__init__.py b/bottleneck/__init__.py index e80b7cc135..a676c890c9 100644 --- a/bottleneck/__init__.py +++ b/bottleneck/__init__.py @@ -9,6 +9,8 @@ from .move import move_quantile as move_quantile_c from .src.move_quantile import move_quantile as move_quantile +from .src.move_quantile import move_min_via_quantile + from .nonreduce import replace from .nonreduce_axis import (argpartition, nanrankdata, partition, push, diff --git a/bottleneck/benchmark/bench_detailed.py b/bottleneck/benchmark/bench_detailed.py index 4907446414..9a647ff68e 100644 --- a/bottleneck/benchmark/bench_detailed.py +++ b/bottleneck/benchmark/bench_detailed.py @@ -74,7 +74,8 @@ def benchsuite(function, fraction_nan): setup = """ from bottleneck import %s as bn_fn try: from numpy import %s as sl_fn - except ImportError: from bottleneck.slow import %s as sl_fn + except ImportError: from bottleneck.slow import move_min as sl_fn + if "%s" != "move_min_via_quantile": from bottleneck.slow import %s as sl_fn # avoid all-nan slice warnings from np.median and np.nanmedian if "%s" == "median": from bottleneck.slow import median as sl_fn @@ -116,7 +117,7 @@ def benchsuite(function, fraction_nan): run = {} run["name"] = [f + signature, array] run["statements"] = ["bn_fn" + signature, "sl_fn" + signature] - run["setup"] = setup % (f, f, f, f, f, array, fraction_nan, fraction_nan) + run["setup"] = setup % (f, f, f, f, f, f, array, fraction_nan, fraction_nan) run["repeat"] = repeat suite.append(run) diff --git a/bottleneck/src/move_quantile.py b/bottleneck/src/move_quantile.py index 5a303ad830..430aac1322 100644 --- a/bottleneck/src/move_quantile.py +++ b/bottleneck/src/move_quantile.py @@ -13,4 +13,10 @@ def move_quantile(*args, **kwargs): del kwargs['q'] return move_min(*args, **kwargs) -move_quantile.__doc__ = move_quantile_c.__doc__ \ No newline at end of file +move_quantile.__doc__ = move_quantile_c.__doc__ + + + +def move_min_via_quantile(*args, **kwargs): + kwargs['q'] = 0. + return move_quantile_c(*args, **kwargs) \ No newline at end of file diff --git a/bottleneck/tests/util.py b/bottleneck/tests/util.py index c94faa2e28..8a0cd793bd 100644 --- a/bottleneck/tests/util.py +++ b/bottleneck/tests/util.py @@ -49,6 +49,7 @@ def func_dict(): bn.move_argmax, bn.move_median, bn.move_quantile, + bn.move_min_via_quantile, bn.move_rank, ] d["nonreduce"] = [bn.replace] From 9f4c5dc59efd8313ac65bb1aad4058c7bc1390eb Mon Sep 17 00:00:00 2001 From: andrii-riazanov Date: Tue, 20 Sep 2022 21:25:08 -0400 Subject: [PATCH 11/30] Revert "move_quantile(q=0) vs move_min benching" This reverts commit c2a2ae3e2a77299f7fe6f22a2e77094fefd6d8e1. move_min is significanlty faster than move_quantile with q = 0. So in case of q=0 apply move_min instead. Same for q=1 and move_max. --- bottleneck/__init__.py | 2 -- bottleneck/benchmark/bench_detailed.py | 5 ++--- bottleneck/src/move_quantile.py | 8 +------- bottleneck/tests/util.py | 1 - 4 files changed, 3 insertions(+), 13 deletions(-) diff --git a/bottleneck/__init__.py b/bottleneck/__init__.py index a676c890c9..e80b7cc135 100644 --- a/bottleneck/__init__.py +++ b/bottleneck/__init__.py @@ -9,8 +9,6 @@ from .move import move_quantile as move_quantile_c from .src.move_quantile import move_quantile as move_quantile -from .src.move_quantile import move_min_via_quantile - from .nonreduce import replace from .nonreduce_axis import (argpartition, nanrankdata, partition, push, diff --git a/bottleneck/benchmark/bench_detailed.py b/bottleneck/benchmark/bench_detailed.py index 9a647ff68e..4907446414 100644 --- a/bottleneck/benchmark/bench_detailed.py +++ b/bottleneck/benchmark/bench_detailed.py @@ -74,8 +74,7 @@ def benchsuite(function, fraction_nan): setup = """ from bottleneck import %s as bn_fn try: from numpy import %s as sl_fn - except ImportError: from bottleneck.slow import move_min as sl_fn - if "%s" != "move_min_via_quantile": from bottleneck.slow import %s as sl_fn + except ImportError: from bottleneck.slow import %s as sl_fn # avoid all-nan slice warnings from np.median and np.nanmedian if "%s" == "median": from bottleneck.slow import median as sl_fn @@ -117,7 +116,7 @@ def benchsuite(function, fraction_nan): run = {} run["name"] = [f + signature, array] run["statements"] = ["bn_fn" + signature, "sl_fn" + signature] - run["setup"] = setup % (f, f, f, f, f, f, array, fraction_nan, fraction_nan) + run["setup"] = setup % (f, f, f, f, f, array, fraction_nan, fraction_nan) run["repeat"] = repeat suite.append(run) diff --git a/bottleneck/src/move_quantile.py b/bottleneck/src/move_quantile.py index 430aac1322..5a303ad830 100644 --- a/bottleneck/src/move_quantile.py +++ b/bottleneck/src/move_quantile.py @@ -13,10 +13,4 @@ def move_quantile(*args, **kwargs): del kwargs['q'] return move_min(*args, **kwargs) -move_quantile.__doc__ = move_quantile_c.__doc__ - - - -def move_min_via_quantile(*args, **kwargs): - kwargs['q'] = 0. - return move_quantile_c(*args, **kwargs) \ No newline at end of file +move_quantile.__doc__ = move_quantile_c.__doc__ \ No newline at end of file diff --git a/bottleneck/tests/util.py b/bottleneck/tests/util.py index 8a0cd793bd..c94faa2e28 100644 --- a/bottleneck/tests/util.py +++ b/bottleneck/tests/util.py @@ -49,7 +49,6 @@ def func_dict(): bn.move_argmax, bn.move_median, bn.move_quantile, - bn.move_min_via_quantile, bn.move_rank, ] d["nonreduce"] = [bn.replace] From 94476975a0807ede484a5b31f60b93b3c01f865d Mon Sep 17 00:00:00 2001 From: andrii-riazanov Date: Tue, 20 Sep 2022 21:27:13 -0400 Subject: [PATCH 12/30] Some changes (to ammend later) --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index d2d9ea2ba6..361857bbb9 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,4 @@ bottleneck/src/bn_config.h c.o checks.c checks.py +results.txt From de181daf28d0c1617058c62846fd330b6f772e97 Mon Sep 17 00:00:00 2001 From: andrii-riazanov Date: Tue, 20 Sep 2022 21:56:16 -0400 Subject: [PATCH 13/30] Bench move_quantile(q=0.5) with slow.move_median --- bottleneck/benchmark/bench_detailed.py | 3 ++- bottleneck/slow/move.py | 1 + bottleneck/src/move_quantile.py | 4 ++-- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/bottleneck/benchmark/bench_detailed.py b/bottleneck/benchmark/bench_detailed.py index 4907446414..e3e56a7ce8 100644 --- a/bottleneck/benchmark/bench_detailed.py +++ b/bottleneck/benchmark/bench_detailed.py @@ -79,6 +79,7 @@ def benchsuite(function, fraction_nan): # avoid all-nan slice warnings from np.median and np.nanmedian if "%s" == "median": from bottleneck.slow import median as sl_fn if "%s" == "nanmedian": from bottleneck.slow import nanmedian as sl_fn + if "%s" == "move_quantile": from bottleneck.slow import move_median as sl_fn from numpy import array, nan from numpy.random import RandomState @@ -116,7 +117,7 @@ def benchsuite(function, fraction_nan): run = {} run["name"] = [f + signature, array] run["statements"] = ["bn_fn" + signature, "sl_fn" + signature] - run["setup"] = setup % (f, f, f, f, f, array, fraction_nan, fraction_nan) + run["setup"] = setup % (f, f, f, f, f, f, array, fraction_nan, fraction_nan) run["repeat"] = repeat suite.append(run) diff --git a/bottleneck/slow/move.py b/bottleneck/slow/move.py index 30bdcdd815..22a1b03e75 100644 --- a/bottleneck/slow/move.py +++ b/bottleneck/slow/move.py @@ -114,6 +114,7 @@ def move_rank(a, window, min_count=None, axis=-1): return move_func(lastrank, a, window, min_count, axis=axis) # function for handling infs in np.nanquantile +# keyword argument for interpolation method in np.nanquantile changed in 1.22.0 from packaging import version if version.parse(np.__version__) > version.parse("1.22.0"): METHOD_KEYWORD = "method" diff --git a/bottleneck/src/move_quantile.py b/bottleneck/src/move_quantile.py index 5a303ad830..53d2a1b86f 100644 --- a/bottleneck/src/move_quantile.py +++ b/bottleneck/src/move_quantile.py @@ -4,12 +4,12 @@ all = ["move_quantile"] def move_quantile(*args, **kwargs): - if ('q' not in kwargs) or ((kwargs['q'] != 1.) and (kwargs['q'] != 0.)): + if ('q' not in kwargs) or ((kwargs['q'] > 0.) and (kwargs['q'] < 1.)): return move_quantile_c(*args, **kwargs) elif (kwargs['q'] == 1.): del kwargs['q'] return move_max(*args, **kwargs) - else: + elif (kwargs['q'] == 0.): del kwargs['q'] return move_min(*args, **kwargs) From 02a0ce101690130caffa441d89ff02f2af897bfc Mon Sep 17 00:00:00 2001 From: andrii-riazanov Date: Tue, 20 Sep 2022 23:15:50 -0400 Subject: [PATCH 14/30] Bring old move_median, add move_quantile separately Instead of move_quantile substituting move_median completely, have both move_median and move_quantile implemented separately. --- bottleneck/src/move_median/move_median.c | 684 ++++++++++++++++++----- bottleneck/src/move_median/move_median.h | 36 +- bottleneck/src/move_template.c | 109 +++- 3 files changed, 649 insertions(+), 180 deletions(-) diff --git a/bottleneck/src/move_median/move_median.c b/bottleneck/src/move_median/move_median.c index e0f31ee09f..3e431bbd5e 100644 --- a/bottleneck/src/move_median/move_median.c +++ b/bottleneck/src/move_median/move_median.c @@ -25,11 +25,14 @@ idx1 = idx2 */ /* helper functions */ -static inline ai_t mm_get_quantile(mm_handle *mm); -static inline idx_t mm_k_stat(mm_handle *mm, idx_t idx); -static inline short mm_stat_exact(mm_handle *mm, idx_t idx); -static inline void heapify_small_node(mm_handle *mm, idx_t idx); -static inline void heapify_large_node(mm_handle *mm, idx_t idx); +static inline ai_t mm_get_median(mm_handle *mm); +static inline ai_t mq_get_quantile(mq_handle *mq); +static inline idx_t mq_k_stat(mq_handle *mq, idx_t idx); +static inline short mq_stat_exact(mq_handle *mq, idx_t idx); +static inline void heapify_small_node_mm(mm_handle *mm, idx_t idx); +static inline void heapify_large_node_mm(mm_handle *mm, idx_t idx); +static inline void heapify_small_node_mq(mq_handle *mq, idx_t idx); +static inline void heapify_large_node_mq(mq_handle *mq, idx_t idx); static inline idx_t mm_get_smallest_child(mm_node **heap, idx_t window, idx_t idx, mm_node **child); static inline idx_t mm_get_largest_child(mm_node **heap, idx_t window, @@ -59,16 +62,13 @@ static inline void mm_swap_heap_heads(mm_node **s_heap, idx_t n_s, * small values (a max heap); the other heap contains the large values (a min * heap). The handle, containing information about the heaps, is returned. */ mm_handle * -mm_new(const idx_t window, idx_t min_count, double quantile) { +mm_new(const idx_t window, idx_t min_count) { mm_handle *mm = malloc(sizeof(mm_handle)); - mm->nodes = malloc(window * sizeof(mm_node*)); + mm->nodes = malloc(window * sizeof(mm_node*)); mm->node_data = malloc(window * sizeof(mm_node)); - mm->quantile = quantile; - mm->s_heap = mm->nodes; - idx_t k_stat = mm_k_stat(mm, window); - mm->l_heap = &mm->nodes[k_stat]; + mm->l_heap = &mm->nodes[window / 2 + window % 2]; mm->window = window; mm->odd = window % 2; @@ -79,6 +79,27 @@ mm_new(const idx_t window, idx_t min_count, double quantile) { return mm; } +mq_handle * +mq_new(const idx_t window, idx_t min_count, double quantile) { + mq_handle *mq = malloc(sizeof(mq_handle)); + mq->nodes = malloc(window * sizeof(mm_node*)); + mq->node_data = malloc(window * sizeof(mm_node)); + + mq->quantile = quantile; + + mq->s_heap = mq->nodes; + idx_t k_stat = mq_k_stat(mq, window); + mq->l_heap = &mq->nodes[k_stat]; + + mq->window = window; + mq->odd = window % 2; + mq->min_count = min_count; + + mq_reset(mq); + + return mq; +} + /* Insert a new value, ai, into one of the heaps. Use this function when * the heaps contain less than window-1 nodes. Returns the median value. @@ -102,11 +123,8 @@ mm_update_init(mm_handle *mm, ai_t ai) { mm->s_first_leaf = 0; } else { /* at least one node already exists in the heaps */ - - idx_t k_stat = mm_k_stat(mm, n_s + n_l + 1); - mm->newest->next = node; - if (n_s >= k_stat) { + if (n_s > n_l) { /* add new node to large heap */ mm->l_heap[n_l] = node; node->region = LH; @@ -121,13 +139,61 @@ mm_update_init(mm_handle *mm, ai_t ai) { node->idx = n_s; ++mm->n_s; mm->s_first_leaf = FIRST_LEAF(mm->n_s); - heapify_small_node(mm, n_s); + heapify_small_node_mm(mm, n_s); } } mm->newest = node; - return mm_get_quantile(mm); + return mm_get_median(mm); +} + + +ai_t +mq_update_init(mq_handle *mq, ai_t ai) { + mm_node *node; + idx_t n_s = mq->n_s; + idx_t n_l = mq->n_l; + + node = &mq->node_data[n_s + n_l]; + node->ai = ai; + + if (n_s == 0) { + /* the first node to appear in a heap */ + mq->s_heap[0] = node; + node->region = SH; + node->idx = 0; + mq->oldest = node; /* only need to set the oldest node once */ + mq->n_s = 1; + mq->s_first_leaf = 0; + } else { + /* at least one node already exists in the heaps */ + + idx_t k_stat = mq_k_stat(mq, n_s + n_l + 1); + + mq->newest->next = node; + if (n_s >= k_stat) { + /* add new node to large heap */ + mq->l_heap[n_l] = node; + node->region = LH; + node->idx = n_l; + ++mq->n_l; + mq->l_first_leaf = FIRST_LEAF(mq->n_l); + heapify_large_node(mq, n_l); + } else { + /* add new node to small heap */ + mq->s_heap[n_s] = node; + node->region = SH; + node->idx = n_s; + ++mq->n_s; + mq->s_first_leaf = FIRST_LEAF(mq->n_s); + heapify_small_node_mq(mq, n_s); + } + } + + mq->newest = node; + + return mq_get_quantile(mq); } @@ -148,13 +214,39 @@ mm_update(mm_handle *mm, ai_t ai) { /* adjust position of new node in heap if needed */ if (node->region == SH) { - heapify_small_node(mm, node->idx); + heapify_small_node_mm(mm, node->idx); } else { heapify_large_node(mm, node->idx); } /* return the median */ - return mm_get_quantile(mm); + if (mm->odd) { + return mm->s_heap[0]->ai; + } else { + return (mm->s_heap[0]->ai + mm->l_heap[0]->ai) / 2.0; + } +} + +ai_t +mq_update(mq_handle *mq, ai_t ai) { + /* node is oldest node with ai of newest node */ + mm_node *node = mq->oldest; + node->ai = ai; + + /* update oldest, newest */ + mq->oldest = mq->oldest->next; + mq->newest->next = node; + mq->newest = node; + + /* adjust position of new node in heap if needed */ + if (node->region == SH) { + heapify_small_node_mq(mq, node->idx); + } else { + heapify_large_node(mq, node->idx); + } + + /* return the median */ + return mq_get_quantile(mq); } @@ -169,16 +261,13 @@ mm_update(mm_handle *mm, ai_t ai) { * large values (a min heap); the nan array contains the NaNs. The handle, * containing information about the heaps and the nan array is returned. */ mm_handle * -mm_new_nan(const idx_t window, idx_t min_count, double quantile) { +mm_new_nan(const idx_t window, idx_t min_count) { mm_handle *mm = malloc(sizeof(mm_handle)); mm->nodes = malloc(2 * window * sizeof(mm_node*)); mm->node_data = malloc(window * sizeof(mm_node)); - mm->quantile = quantile; - mm->s_heap = mm->nodes; - idx_t k_stat = mm_k_stat(mm, window); - mm->l_heap = &mm->nodes[k_stat]; + mm->l_heap = &mm->nodes[window / 2 + window % 2]; mm->n_array = &mm->nodes[window]; mm->window = window; @@ -190,6 +279,28 @@ mm_new_nan(const idx_t window, idx_t min_count, double quantile) { } +mq_handle * +mq_new_nan(const idx_t window, idx_t min_count, double quantile) { + mq_handle *mq = malloc(sizeof(mq_handle)); + mq->nodes = malloc(2 * window * sizeof(mm_node*)); + mq->node_data = malloc(window * sizeof(mm_node)); + + mq->quantile = quantile; + + mq->s_heap = mq->nodes; + idx_t k_stat = mq_k_stat(mq, window); + mq->l_heap = &mq->nodes[k_stat]; + mq->n_array = &mq->nodes[window]; + + mq->window = window; + mq->min_count = min_count; + + mq_reset(mq); + + return mq; +} + + /* Insert a new value, ai, into one of the heaps or the nan array. Use this * function when there are less than window-1 nodes. Returns the median * value. Once there are window-1 nodes in the heap, switch to using @@ -231,11 +342,8 @@ mm_update_init_nan(mm_handle *mm, ai_t ai) { mm->s_first_leaf = 0; } else { /* at least one node already exists in the heaps */ - /* number of non-nan nodes including the new node is n_s + n_l + 1 */ - idx_t k_stat = mm_k_stat(mm, n_s + n_l + 1); - mm->newest->next = node; - if (n_s >= k_stat) { + if (n_s > n_l) { /* add new node to large heap */ mm->l_heap[n_l] = node; node->region = LH; @@ -250,13 +358,79 @@ mm_update_init_nan(mm_handle *mm, ai_t ai) { node->idx = n_s; ++mm->n_s; mm->s_first_leaf = FIRST_LEAF(mm->n_s); - heapify_small_node(mm, n_s); + heapify_small_node_mm(mm, n_s); } } } mm->newest = node; - return mm_get_quantile(mm); + return mm_get_median(mm); +} + + +ai_t +mq_update_init_nan(mq_handle *mq, ai_t ai) { + mm_node *node; + idx_t n_s = mq->n_s; + idx_t n_l = mq->n_l; + idx_t n_n = mq->n_n; + + node = &mq->node_data[n_s + n_l + n_n]; + node->ai = ai; + + if (ai != ai) { + mq->n_array[n_n] = node; + node->region = NA; + node->idx = n_n; + if (n_s + n_l + n_n == 0) { + /* only need to set the oldest node once */ + mq->oldest = node; + } else { + mq->newest->next = node; + } + ++mq->n_n; + } else { + if (n_s == 0) { + /* the first node to appear in a heap */ + mq->s_heap[0] = node; + node->region = SH; + node->idx = 0; + if (n_s + n_l + n_n == 0) { + /* only need to set the oldest node once */ + mq->oldest = node; + } else { + mq->newest->next = node; + } + mq->n_s = 1; + mq->s_first_leaf = 0; + } else { + /* at least one node already exists in the heaps */ + /* number of non-nan nodes including the new node is n_s + n_l + 1 */ + idx_t k_stat = mq_k_stat(mq, n_s + n_l + 1); + + mq->newest->next = node; + if (n_s >= k_stat) { + /* add new node to large heap */ + mq->l_heap[n_l] = node; + node->region = LH; + node->idx = n_l; + ++mq->n_l; + mq->l_first_leaf = FIRST_LEAF(mq->n_l); + heapify_large_node(mq, n_l); + } else { + /* add new node to small heap */ + mq->s_heap[n_s] = node; + node->region = SH; + node->idx = n_s; + ++mq->n_s; + mq->s_first_leaf = FIRST_LEAF(mq->n_s); + heapify_small_node_mq(mq, n_s); + } + } + } + mq->newest = node; + + return mq_get_quantile(mq); } @@ -290,15 +464,12 @@ mm_update_nan(mm_handle *mm, ai_t ai) { n_l = mm->n_l; n_n = mm->n_n; - idx_t k_stat; - if (ai != ai) { if (node->region == SH) { /* Oldest node is in the small heap and needs to be moved * to the nan array. Resulting hole in the small heap will be * filled with the rightmost leaf of the last row of the small * heap. */ - k_stat = mm_k_stat(mm, n_s + n_l - 1); /* insert node into nan array */ node->region = NA; @@ -334,9 +505,9 @@ mm_update_nan(mm_handle *mm, ai_t ai) { if (idx != n_s - 1) { s_heap[idx] = s_heap[n_s - 1]; s_heap[idx]->idx = idx; - heapify_small_node(mm, idx); + heapify_small_node_mm(mm, idx); } - if (mm->n_s < k_stat) { + if (mm->n_s < mm->n_l) { /* move head node from the large heap to the small heap */ node2 = mm->l_heap[0]; node2->idx = mm->n_s; @@ -344,7 +515,7 @@ mm_update_nan(mm_handle *mm, ai_t ai) { s_heap[mm->n_s] = node2; ++mm->n_s; mm->l_first_leaf = FIRST_LEAF(mm->n_s); - heapify_small_node(mm, node2->idx); + heapify_small_node_mm(mm, node2->idx); /* plug hole in large heap */ node2 = mm->l_heap[mm->n_l - 1]; @@ -360,7 +531,7 @@ mm_update_nan(mm_handle *mm, ai_t ai) { } else { mm->s_first_leaf = FIRST_LEAF(mm->n_s); - heapify_small_node(mm, idx); + heapify_small_node_mm(mm, idx); } } } else if (node->region == LH) { @@ -369,7 +540,6 @@ mm_update_nan(mm_handle *mm, ai_t ai) { * filled with the rightmost leaf of the last row of the large * heap. */ - k_stat = mm_k_stat(mm, n_s + n_l - 1); /* insert node into nan array */ node->region = NA; node->idx = n_n; @@ -388,7 +558,7 @@ mm_update_nan(mm_handle *mm, ai_t ai) { } else { mm->l_first_leaf = FIRST_LEAF(mm->n_l); } - if (mm->n_s > k_stat) { + if (mm->n_l < mm->n_s - 1) { /* move head node from the small heap to the large heap */ node2 = mm->s_heap[0]; node2->idx = mm->n_l; @@ -410,7 +580,7 @@ mm_update_nan(mm_handle *mm, ai_t ai) { } else { mm->s_first_leaf = FIRST_LEAF(mm->n_s); } - heapify_small_node(mm, 0); + heapify_small_node_mm(mm, 0); } /* reorder large heap if needed */ heapify_large_node(mm, idx); @@ -420,14 +590,12 @@ mm_update_nan(mm_handle *mm, ai_t ai) { } } else { if (node->region == SH) { - heapify_small_node(mm, idx); + heapify_small_node_mm(mm, idx); } else if (node->region == LH) { heapify_large_node(mm, idx); } else { /* ai is not NaN but oldest node is in nan array */ - k_stat = mm_k_stat(mm, n_s + n_l + 1); - - if (n_s == k_stat) { + if (n_s > n_l) { /* insert into large heap */ node->region = LH; node->idx = n_l; @@ -442,7 +610,7 @@ mm_update_nan(mm_handle *mm, ai_t ai) { s_heap[n_s] = node; ++mm->n_s; mm->s_first_leaf = FIRST_LEAF(mm->n_s); - heapify_small_node(mm, n_s); + heapify_small_node_mm(mm, n_s); } /* plug nan array hole */ if (idx != n_n - 1) { @@ -452,7 +620,200 @@ mm_update_nan(mm_handle *mm, ai_t ai) { --mm->n_n; } } - return mm_get_quantile(mm); + return mm_get_median(mm); +} + + +ai_t +mq_update_nan(mq_handle *mq, ai_t ai) { + idx_t n_s, n_l, n_n; + + mm_node **l_heap; + mm_node **s_heap; + mm_node **n_array; + mm_node *node2; + + /* node is oldest node with ai of newest node */ + mm_node *node = mq->oldest; + idx_t idx = node->idx; + node->ai = ai; + + /* update oldest, newest */ + mq->oldest = mq->oldest->next; + mq->newest->next = node; + mq->newest = node; + + l_heap = mq->l_heap; + s_heap = mq->s_heap; + n_array = mq->n_array; + + n_s = mq->n_s; + n_l = mq->n_l; + n_n = mq->n_n; + + idx_t k_stat; + + if (ai != ai) { + if (node->region == SH) { + /* Oldest node is in the small heap and needs to be moved + * to the nan array. Resulting hole in the small heap will be + * filled with the rightmost leaf of the last row of the small + * heap. */ + k_stat = mq_k_stat(mq, n_s + n_l - 1); + + /* insert node into nan array */ + node->region = NA; + node->idx = n_n; + n_array[n_n] = node; + ++mq->n_n; + + /* plug small heap hole */ + --mq->n_s; + if (mq->n_s == 0) { + mq->s_first_leaf = 0; + if (n_l > 0) { + /* move head node from the large heap to the small heap */ + node2 = mq->l_heap[0]; + node2->region = SH; + s_heap[0] = node2; + mq->n_s = 1; + mq->s_first_leaf = 0; + + /* plug hole in large heap */ + node2 = mq->l_heap[mq->n_l - 1]; + node2->idx = 0; + l_heap[0] = node2; + --mq->n_l; + if (mq->n_l == 0) { + mq->l_first_leaf = 0; + } else { + mq->l_first_leaf = FIRST_LEAF(mq->n_l); + } + heapify_large_node(mq, 0); + } + } else { + if (idx != n_s - 1) { + s_heap[idx] = s_heap[n_s - 1]; + s_heap[idx]->idx = idx; + heapify_small_node_mq(mq, idx); + } + if (mq->n_s < k_stat) { + /* move head node from the large heap to the small heap */ + node2 = mq->l_heap[0]; + node2->idx = mq->n_s; + node2->region = SH; + s_heap[mq->n_s] = node2; + ++mq->n_s; + mq->l_first_leaf = FIRST_LEAF(mq->n_s); + heapify_small_node_mq(mq, node2->idx); + + /* plug hole in large heap */ + node2 = mq->l_heap[mq->n_l - 1]; + node2->idx = 0; + l_heap[0] = node2; + --mq->n_l; + if (mq->n_l == 0) { + mq->l_first_leaf = 0; + } else { + mq->l_first_leaf = FIRST_LEAF(mq->n_l); + } + heapify_large_node(mq, 0); + + } else { + mq->s_first_leaf = FIRST_LEAF(mq->n_s); + heapify_small_node_mq(mq, idx); + } + } + } else if (node->region == LH) { + /* Oldest node is in the large heap and needs to be moved + * to the nan array. Resulting hole in the large heap will be + * filled with the rightmost leaf of the last row of the large + * heap. */ + + k_stat = mq_k_stat(mq, n_s + n_l - 1); + /* insert node into nan array */ + node->region = NA; + node->idx = n_n; + n_array[n_n] = node; + ++mq->n_n; + + /* plug large heap hole */ + if (idx != n_l - 1) { + l_heap[idx] = l_heap[n_l - 1]; + l_heap[idx]->idx = idx; + heapify_large_node(mq, idx); + } + --mq->n_l; + if (mq->n_l == 0) { + mq->l_first_leaf = 0; + } else { + mq->l_first_leaf = FIRST_LEAF(mq->n_l); + } + if (mq->n_s > k_stat) { + /* move head node from the small heap to the large heap */ + node2 = mq->s_heap[0]; + node2->idx = mq->n_l; + node2->region = LH; + l_heap[mq->n_l] = node2; + ++mq->n_l; + mq->l_first_leaf = FIRST_LEAF(mq->n_l); + heapify_large_node(mq, node2->idx); + + /* plug hole in small heap */ + if (n_s != 1) { + node2 = mq->s_heap[mq->n_s - 1]; + node2->idx = 0; + s_heap[0] = node2; + } + --mq->n_s; + if (mq->n_s == 0) { + mq->s_first_leaf = 0; + } else { + mq->s_first_leaf = FIRST_LEAF(mq->n_s); + } + heapify_small_node_mq(mq, 0); + } + /* reorder large heap if needed */ + heapify_large_node(mq, idx); + } else if (node->region == NA) { + /* insert node into nan heap */ + n_array[idx] = node; + } + } else { + if (node->region == SH) { + heapify_small_node_mq(mq, idx); + } else if (node->region == LH) { + heapify_large_node(mq, idx); + } else { + /* ai is not NaN but oldest node is in nan array */ + k_stat = mq_k_stat(mq, n_s + n_l + 1); + + if (n_s == k_stat) { + /* insert into large heap */ + node->region = LH; + node->idx = n_l; + l_heap[n_l] = node; + ++mq->n_l; + mq->l_first_leaf = FIRST_LEAF(mq->n_l); + heapify_large_node(mq, n_l); + } else { + /* insert into small heap */ + node->region = SH; + node->idx = n_s; + s_heap[n_s] = node; + ++mq->n_s; + mq->s_first_leaf = FIRST_LEAF(mq->n_s); + heapify_small_node_mq(mq, n_s); + } + /* plug nan array hole */ + if (idx != n_n - 1) { + n_array[idx] = n_array[n_n - 1]; + n_array[idx]->idx = idx; + } + --mq->n_n; + } + } + return mq_get_quantile(mq); } @@ -474,6 +835,15 @@ mm_reset(mm_handle *mm) { mm->l_first_leaf = 0; } +void +mq_reset(mq_handle *mq) { + mq->n_l = 0; + mq->n_s = 0; + mq->n_n = 0; + mq->s_first_leaf = 0; + mq->l_first_leaf = 0; +} + /* After bn.move_median is done, free the memory */ void @@ -483,6 +853,12 @@ mm_free(mm_handle *mm) { free(mm); } +void +mq_free(mq_handle *mq) { + free(mq->node_data); + free(mq->nodes); + free(mq); +} /* ----------------------------------------------------------------------------- @@ -491,123 +867,129 @@ mm_free(mm_handle *mm) { */ /* function to find the current index of element correspodning to the quantile */ -static inline idx_t mm_k_stat(mm_handle *mm, idx_t idx) { - return (idx_t) floor((idx - 1) * mm->quantile) + 1; +static inline idx_t mq_k_stat(mq_handle *mq, idx_t idx) { + return (idx_t) floor((idx - 1) * mq->quantile) + 1; } /* function to check if the current index of the quantile is integer, and so * the quantile is the element at the top of the heap. Otherwise take midpoint */ -static inline short mm_stat_exact(mm_handle *mm, idx_t idx) { - return ((idx - 1) * mm->quantile) == floor((idx - 1) * mm->quantile); +static inline short mq_stat_exact(mq_handle *mq, idx_t idx) { + return ((idx - 1) * mq->quantile) == floor((idx - 1) * mq->quantile); } /* Return the current quantile */ static inline ai_t -mm_get_quantile(mm_handle *mm) { - idx_t n_total = mm->n_l + mm->n_s; - if (n_total < mm->min_count) +mq_get_quantile(mq_handle *mq) { + idx_t n_total = mq->n_l + mq->n_s; + if (n_total < mq->min_count) return MM_NAN(); - if (mm_stat_exact(mm, n_total)) - return mm->s_heap[0]->ai; - return (mm->s_heap[0]->ai + mm->l_heap[0]->ai) / 2.0; -} - - -static inline void -heapify_small_node(mm_handle *mm, idx_t idx) { - idx_t idx2; - mm_node *node; - mm_node *node2; - mm_node **s_heap; - mm_node **l_heap; - idx_t n_s, n_l; - ai_t ai; - - s_heap = mm->s_heap; - l_heap = mm->l_heap; - node = s_heap[idx]; - n_s = mm->n_s; - n_l = mm->n_l; - ai = node->ai; - - /* Internal or leaf node */ - if (idx > 0) { - idx2 = P_IDX(idx); - node2 = s_heap[idx2]; - - /* Move up */ - if (ai > node2->ai) { - mm_move_up_small(s_heap, idx, node, idx2, node2); - - /* Maybe swap between heaps */ - node2 = l_heap[0]; - if (ai > node2->ai) { - mm_swap_heap_heads(s_heap, n_s, l_heap, n_l, node, node2); - } - } else if (idx < mm->s_first_leaf) { - /* Move down */ - mm_move_down_small(s_heap, n_s, idx, node); - } - } else { - /* Head node */ - node2 = l_heap[0]; - if (n_l > 0 && ai > node2->ai) { - mm_swap_heap_heads(s_heap, n_s, l_heap, n_l, node, node2); - } else { - mm_move_down_small(s_heap, n_s, idx, node); - } - } -} - - -static inline void -heapify_large_node(mm_handle *mm, idx_t idx) { - idx_t idx2; - mm_node *node; - mm_node *node2; - mm_node **s_heap; - mm_node **l_heap; - idx_t n_s, n_l; - ai_t ai; - - s_heap = mm->s_heap; - l_heap = mm->l_heap; - node = l_heap[idx]; - n_s = mm->n_s; - n_l = mm->n_l; - ai = node->ai; - - /* Internal or leaf node */ - if (idx > 0) { - idx2 = P_IDX(idx); - node2 = l_heap[idx2]; - - /* Move down */ - if (ai < node2->ai) { - mm_move_down_large(l_heap, idx, node, idx2, node2); - - /* Maybe swap between heaps */ - node2 = s_heap[0]; - if (ai < node2->ai) { - mm_swap_heap_heads(s_heap, n_s, l_heap, n_l, node2, node); - } - } else if (idx < mm->l_first_leaf) { - /* Move up */ - mm_move_up_large(l_heap, n_l, idx, node); - } - } else { - /* Head node */ - node2 = s_heap[0]; - if (n_s > 0 && ai < node2->ai) { - mm_swap_heap_heads(s_heap, n_s, l_heap, n_l, node2, node); - } else { - mm_move_up_large(l_heap, n_l, idx, node); - } - } - + if (mq_stat_exact(mq, n_total)) + return mq->s_heap[0]->ai; + return (mq->s_heap[0]->ai + mq->l_heap[0]->ai) / 2.0; } +/* mtype is mm (move_median) or mq (move_quantile) */ +#define HEAPIFY_SMALL_NODE(mtype) \ + static inline void \ + heapify_small_node_##mtype(mtype##_handle *mtype, idx_t idx) { \ + idx_t idx2; \ + mm_node *node; \ + mm_node *node2; \ + mm_node **s_heap; \ + mm_node **l_heap; \ + idx_t n_s, n_l; \ + ai_t ai; \ + \ + s_heap = mtype->s_heap; \ + l_heap = mtype->l_heap; \ + node = s_heap[idx]; \ + n_s = mtype->n_s; \ + n_l = mtype->n_l; \ + ai = node->ai; \ + \ + /* Internal or leaf node */ \ + if (idx > 0) { \ + idx2 = P_IDX(idx); \ + node2 = s_heap[idx2]; \ + \ + /* Move up */ \ + if (ai > node2->ai) { \ + mm_move_up_small(s_heap, idx, node, idx2, node2); \ + \ + /* Maybe swap between heaps */ \ + node2 = l_heap[0]; \ + if (ai > node2->ai) { \ + mm_swap_heap_heads(s_heap, n_s, l_heap, n_l, node, node2); \ + } \ + } else if (idx < mtype->s_first_leaf) { \ + /* Move down */ \ + mm_move_down_small(s_heap, n_s, idx, node); \ + } \ + } else { \ + /* Head node */ \ + node2 = l_heap[0]; \ + if (n_l > 0 && ai > node2->ai) { \ + mm_swap_heap_heads(s_heap, n_s, l_heap, n_l, node, node2); \ + } else { \ + mm_move_down_small(s_heap, n_s, idx, node); \ + } \ + } \ + } \ + +HEAPIFY_SMALL_NODE(mm) +HEAPIFY_SMALL_NODE(mq) + +/* mtype is mm (move_median) or mq (move_quantile) */ +#define HEAPIFY_LARGE_NODE(mtype) \ + static inline void \ + heapify_large_node##mtype(mtype##_handle *mtype, idx_t idx) { \ + idx_t idx2; \ + mm_node *node; \ + mm_node *node2; \ + mm_node **s_heap; \ + mm_node **l_heap; \ + idx_t n_s, n_l; \ + ai_t ai; \ + \ + s_heap = mtype->s_heap; \ + l_heap = mtype->l_heap; \ + node = l_heap[idx]; \ + n_s = mtype->n_s; \ + n_l = mtype->n_l; \ + ai = node->ai; \ + \ + /* Internal or leaf node */ \ + if (idx > 0) { \ + idx2 = P_IDX(idx); \ + node2 = l_heap[idx2]; \ + \ + /* Move down */ \ + if (ai < node2->ai) { \ + mm_move_down_large(l_heap, idx, node, idx2, node2); \ + \ + /* Maybe swap between heaps */ \ + node2 = s_heap[0]; \ + if (ai < node2->ai) { \ + mm_swap_heap_heads(s_heap, n_s, l_heap, n_l, node2, node); \ + } \ + } else if (idx < mtype->l_first_leaf) { \ + /* Move up */ \ + mm_move_up_large(l_heap, n_l, idx, node); \ + } \ + } else { \ + /* Head node */ \ + node2 = s_heap[0]; \ + if (n_s > 0 && ai < node2->ai) { \ + mm_swap_heap_heads(s_heap, n_s, l_heap, n_l, node2, node); \ + } else { \ + mm_move_up_large(l_heap, n_l, idx, node); \ + } \ + } \ + } \ + +HEAPIFY_LARGE_NODE(mm) +HEAPIFY_LARGE_NODE(mq) /* Return the index of the smallest child of the node. The pointer * child will also be set. */ diff --git a/bottleneck/src/move_median/move_median.h b/bottleneck/src/move_median/move_median.h index eb02f96c48..33a654f0ab 100644 --- a/bottleneck/src/move_median/move_median.h +++ b/bottleneck/src/move_median/move_median.h @@ -51,24 +51,54 @@ struct _mm_handle { mm_node *newest; /* The newest node (most recent insert) */ idx_t s_first_leaf; /* All nodes this index or greater are leaf nodes */ idx_t l_first_leaf; /* All nodes this index or greater are leaf nodes */ - double quantile; /* Value between 0 <= quantile <= 1, the quantile to compute */ }; typedef struct _mm_handle mm_handle; +struct _mq_handle { + idx_t window; /* window size */ + int odd; /* is window even (0) or odd (1) */ + idx_t min_count; /* Same meaning as in bn.move_median */ + idx_t n_s; /* Number of nodes in the small heap */ + idx_t n_l; /* Number of nodes in the large heap */ + idx_t n_n; /* Number of nodes in the nan array */ + mm_node **s_heap; /* The max heap of small ai */ + mm_node **l_heap; /* The min heap of large ai */ + mm_node **n_array; /* The nan array */ + mm_node **nodes; /* s_heap and l_heap point into this array */ + mm_node *node_data; /* Pointer to memory location where nodes live */ + mm_node *oldest; /* The oldest node */ + mm_node *newest; /* The newest node (most recent insert) */ + idx_t s_first_leaf; /* All nodes this index or greater are leaf nodes */ + idx_t l_first_leaf; /* All nodes this index or greater are leaf nodes */ + double quantile; /* Value between 0 <= quantile <= 1, the quantile to compute */ +}; +typedef struct _mq_handle mq_handle; + /* non-nan functions */ -mm_handle *mm_new(const idx_t window, idx_t min_count, double quantile); +mm_handle *mm_new(const idx_t window, idx_t min_count); ai_t mm_update_init(mm_handle *mm, ai_t ai); ai_t mm_update(mm_handle *mm, ai_t ai); +mq_handle *mq_new(const idx_t window, idx_t min_count, double quantile); +ai_t mq_update_init(mq_handle *mq, ai_t ai); +ai_t mq_update(mq_handle *mq, ai_t ai); + /* nan functions */ -mm_handle *mm_new_nan(const idx_t window, idx_t min_count, double quantile); +mm_handle *mm_new_nan(const idx_t window, idx_t min_count); ai_t mm_update_init_nan(mm_handle *mm, ai_t ai); ai_t mm_update_nan(mm_handle *mm, ai_t ai); +mq_handle *mq_new_nan(const idx_t window, idx_t min_count, double quantile); +ai_t mq_update_init_nan(mq_handle *mq, ai_t ai); +ai_t mq_update_nan(mq_handle *mq, ai_t ai); + /* functions common to non-nan and nan cases */ void mm_reset(mm_handle *mm); void mm_free(mm_handle *mm); +void mq_reset(mq_handle *mq); +void mq_free(mq_handle *mq); + /* Copied from Cython ---------------------------------------------------- */ /* NaN */ diff --git a/bottleneck/src/move_template.c b/bottleneck/src/move_template.c index 3e96974733..1d0a2585b8 100644 --- a/bottleneck/src/move_template.c +++ b/bottleneck/src/move_template.c @@ -49,21 +49,6 @@ has_quantile); \ } -#define MOVE_MAIN_SUBSTITUTE(name, func_name, has_ddof, has_quantile) \ - static PyObject * \ - name(PyObject *self, PyObject *args, PyObject *kwds) \ - { \ - return mover(#name, \ - args, \ - kwds, \ - func_name##_float64, \ - func_name##_float32, \ - func_name##_int64, \ - func_name##_int32, \ - has_ddof, \ - has_quantile); \ - } - /* typedefs and prototypes ----------------------------------------------- */ /* used by move_min and move_max */ @@ -556,19 +541,19 @@ MOVE(NAME, DTYPE0) { MOVE_MAIN(NAME, 0, 0) /* repeat end */ -/* move_quantile ----------------------------------------------------------- */ +/* move_median ----------------------------------------------------------- */ /* dtype = [['float64'], ['float32']] */ -MOVE(move_quantile, DTYPE0) { +MOVE(move_median, DTYPE0) { npy_DTYPE0 ai; - mm_handle *mm = mm_new_nan(window, min_count, quantile); + mm_handle *mm = mm_new_nan(window, min_count); INIT(NPY_DTYPE0) if (window == 1) { mm_free(mm); return PyArray_Copy(a); } if (mm == NULL) { - MEMORY_ERR("Could not allocate memory for move_quantile"); + MEMORY_ERR("Could not allocate memory for move_median"); } BN_BEGIN_ALLOW_THREADS WHILE { @@ -594,9 +579,9 @@ MOVE(move_quantile, DTYPE0) { /* dtype end */ /* dtype = [['int64', 'float64'], ['int32', 'float64']] */ -MOVE(move_quantile, DTYPE0) { +MOVE(move_median, DTYPE0) { npy_DTYPE0 ai; - mm_handle *mm = mm_new(window, min_count, quantile); + mm_handle *mm = mm_new(window, min_count); INIT(NPY_DTYPE1) if (window == 1) { return PyArray_CastToType(a, @@ -604,7 +589,7 @@ MOVE(move_quantile, DTYPE0) { PyArray_CHKFLAGS(a, NPY_ARRAY_F_CONTIGUOUS)); } if (mm == NULL) { - MEMORY_ERR("Could not allocate memory for move_quantile"); + MEMORY_ERR("Could not allocate memory for move_median"); } BN_BEGIN_ALLOW_THREADS WHILE { @@ -630,12 +615,84 @@ MOVE(move_quantile, DTYPE0) { /* dtype end */ -MOVE_MAIN(move_quantile, 0, 1) -/* move_median uses move_quantile but doesn't take the quantile argument, - * which defaults to 0.5, giving the desired median */ -MOVE_MAIN_SUBSTITUTE(move_median, move_quantile, 0, 0) +MOVE_MAIN(move_median, 0, 0) + +/* move_quantile ----------------------------------------------------------- */ + +/* dtype = [['float64'], ['float32']] */ +MOVE(move_quantile, DTYPE0) { + npy_DTYPE0 ai; + mq_handle *mq = mq_new_nan(window, min_count, quantile); + INIT(NPY_DTYPE0) + if (window == 1) { + mq_free(mq); + return PyArray_Copy(a); + } + if (mq == NULL) { + MEMORY_ERR("Could not allocate memory for move_quantile"); + } + BN_BEGIN_ALLOW_THREADS + WHILE { + WHILE0 { + ai = AI(DTYPE0); + YI(DTYPE0) = mq_update_init_nan(mq, ai); + } + WHILE1 { + ai = AI(DTYPE0); + YI(DTYPE0) = mq_update_init_nan(mq, ai); + } + WHILE2 { + ai = AI(DTYPE0); + YI(DTYPE0) = mq_update_nan(mq, ai); + } + mq_reset(mq); + NEXT2 + } + mq_free(mq); + BN_END_ALLOW_THREADS + return y; +} +/* dtype end */ + +/* dtype = [['int64', 'float64'], ['int32', 'float64']] */ +MOVE(move_quantile, DTYPE0) { + npy_DTYPE0 ai; + mq_handle *mq = mq_new(window, min_count, quantile); + INIT(NPY_DTYPE1) + if (window == 1) { + return PyArray_CastToType(a, + PyArray_DescrFromType(NPY_DTYPE1), + PyArray_CHKFLAGS(a, NPY_ARRAY_F_CONTIGUOUS)); + } + if (mq == NULL) { + MEMORY_ERR("Could not allocate memory for move_quantile"); + } + BN_BEGIN_ALLOW_THREADS + WHILE { + WHILE0 { + ai = AI(DTYPE0); + YI(DTYPE1) = mq_update_init(mq, ai); + } + WHILE1 { + ai = AI(DTYPE0); + YI(DTYPE1) = mq_update_init(mq, ai); + } + WHILE2 { + ai = AI(DTYPE0); + YI(DTYPE1) = mq_update(mq, ai); + } + mq_reset(mq); + NEXT2 + } + mq_free(mq); + BN_END_ALLOW_THREADS + return y; +} +/* dtype end */ +MOVE_MAIN(move_quantile, 0, 1) + /* move_rank-------------------------------------------------------------- */ From cd49b4f9630a5d60115c8847509be8d38305e5a8 Mon Sep 17 00:00:00 2001 From: riazanov Date: Tue, 20 Sep 2022 23:39:28 -0400 Subject: [PATCH 15/30] Finish bringing move_median back --- bottleneck/src/move_median/move_median.c | 54 +++++++++++-------- .../src/move_median/move_median_debug.c | 2 +- bottleneck/tests/move_test.py | 9 ++-- 3 files changed, 40 insertions(+), 25 deletions(-) diff --git a/bottleneck/src/move_median/move_median.c b/bottleneck/src/move_median/move_median.c index 3e431bbd5e..5566290a1c 100644 --- a/bottleneck/src/move_median/move_median.c +++ b/bottleneck/src/move_median/move_median.c @@ -131,7 +131,7 @@ mm_update_init(mm_handle *mm, ai_t ai) { node->idx = n_l; ++mm->n_l; mm->l_first_leaf = FIRST_LEAF(mm->n_l); - heapify_large_node(mm, n_l); + heapify_large_node_mm(mm, n_l); } else { /* add new node to small heap */ mm->s_heap[n_s] = node; @@ -179,7 +179,7 @@ mq_update_init(mq_handle *mq, ai_t ai) { node->idx = n_l; ++mq->n_l; mq->l_first_leaf = FIRST_LEAF(mq->n_l); - heapify_large_node(mq, n_l); + heapify_large_node_mq(mq, n_l); } else { /* add new node to small heap */ mq->s_heap[n_s] = node; @@ -216,7 +216,7 @@ mm_update(mm_handle *mm, ai_t ai) { if (node->region == SH) { heapify_small_node_mm(mm, node->idx); } else { - heapify_large_node(mm, node->idx); + heapify_large_node_mm(mm, node->idx); } /* return the median */ @@ -242,7 +242,7 @@ mq_update(mq_handle *mq, ai_t ai) { if (node->region == SH) { heapify_small_node_mq(mq, node->idx); } else { - heapify_large_node(mq, node->idx); + heapify_large_node_mq(mq, node->idx); } /* return the median */ @@ -350,7 +350,7 @@ mm_update_init_nan(mm_handle *mm, ai_t ai) { node->idx = n_l; ++mm->n_l; mm->l_first_leaf = FIRST_LEAF(mm->n_l); - heapify_large_node(mm, n_l); + heapify_large_node_mm(mm, n_l); } else { /* add new node to small heap */ mm->s_heap[n_s] = node; @@ -416,7 +416,7 @@ mq_update_init_nan(mq_handle *mq, ai_t ai) { node->idx = n_l; ++mq->n_l; mq->l_first_leaf = FIRST_LEAF(mq->n_l); - heapify_large_node(mq, n_l); + heapify_large_node_mq(mq, n_l); } else { /* add new node to small heap */ mq->s_heap[n_s] = node; @@ -499,7 +499,7 @@ mm_update_nan(mm_handle *mm, ai_t ai) { } else { mm->l_first_leaf = FIRST_LEAF(mm->n_l); } - heapify_large_node(mm, 0); + heapify_large_node_mm(mm, 0); } } else { if (idx != n_s - 1) { @@ -527,7 +527,7 @@ mm_update_nan(mm_handle *mm, ai_t ai) { } else { mm->l_first_leaf = FIRST_LEAF(mm->n_l); } - heapify_large_node(mm, 0); + heapify_large_node_mm(mm, 0); } else { mm->s_first_leaf = FIRST_LEAF(mm->n_s); @@ -550,7 +550,7 @@ mm_update_nan(mm_handle *mm, ai_t ai) { if (idx != n_l - 1) { l_heap[idx] = l_heap[n_l - 1]; l_heap[idx]->idx = idx; - heapify_large_node(mm, idx); + heapify_large_node_mm(mm, idx); } --mm->n_l; if (mm->n_l == 0) { @@ -566,7 +566,7 @@ mm_update_nan(mm_handle *mm, ai_t ai) { l_heap[mm->n_l] = node2; ++mm->n_l; mm->l_first_leaf = FIRST_LEAF(mm->n_l); - heapify_large_node(mm, node2->idx); + heapify_large_node_mm(mm, node2->idx); /* plug hole in small heap */ if (n_s != 1) { @@ -583,7 +583,7 @@ mm_update_nan(mm_handle *mm, ai_t ai) { heapify_small_node_mm(mm, 0); } /* reorder large heap if needed */ - heapify_large_node(mm, idx); + heapify_large_node_mm(mm, idx); } else if (node->region == NA) { /* insert node into nan heap */ n_array[idx] = node; @@ -592,7 +592,7 @@ mm_update_nan(mm_handle *mm, ai_t ai) { if (node->region == SH) { heapify_small_node_mm(mm, idx); } else if (node->region == LH) { - heapify_large_node(mm, idx); + heapify_large_node_mm(mm, idx); } else { /* ai is not NaN but oldest node is in nan array */ if (n_s > n_l) { @@ -602,7 +602,7 @@ mm_update_nan(mm_handle *mm, ai_t ai) { l_heap[n_l] = node; ++mm->n_l; mm->l_first_leaf = FIRST_LEAF(mm->n_l); - heapify_large_node(mm, n_l); + heapify_large_node_mm(mm, n_l); } else { /* insert into small heap */ node->region = SH; @@ -689,7 +689,7 @@ mq_update_nan(mq_handle *mq, ai_t ai) { } else { mq->l_first_leaf = FIRST_LEAF(mq->n_l); } - heapify_large_node(mq, 0); + heapify_large_node_mq(mq, 0); } } else { if (idx != n_s - 1) { @@ -717,7 +717,7 @@ mq_update_nan(mq_handle *mq, ai_t ai) { } else { mq->l_first_leaf = FIRST_LEAF(mq->n_l); } - heapify_large_node(mq, 0); + heapify_large_node_mq(mq, 0); } else { mq->s_first_leaf = FIRST_LEAF(mq->n_s); @@ -741,7 +741,7 @@ mq_update_nan(mq_handle *mq, ai_t ai) { if (idx != n_l - 1) { l_heap[idx] = l_heap[n_l - 1]; l_heap[idx]->idx = idx; - heapify_large_node(mq, idx); + heapify_large_node_mq(mq, idx); } --mq->n_l; if (mq->n_l == 0) { @@ -757,7 +757,7 @@ mq_update_nan(mq_handle *mq, ai_t ai) { l_heap[mq->n_l] = node2; ++mq->n_l; mq->l_first_leaf = FIRST_LEAF(mq->n_l); - heapify_large_node(mq, node2->idx); + heapify_large_node_mq(mq, node2->idx); /* plug hole in small heap */ if (n_s != 1) { @@ -774,7 +774,7 @@ mq_update_nan(mq_handle *mq, ai_t ai) { heapify_small_node_mq(mq, 0); } /* reorder large heap if needed */ - heapify_large_node(mq, idx); + heapify_large_node_mq(mq, idx); } else if (node->region == NA) { /* insert node into nan heap */ n_array[idx] = node; @@ -783,7 +783,7 @@ mq_update_nan(mq_handle *mq, ai_t ai) { if (node->region == SH) { heapify_small_node_mq(mq, idx); } else if (node->region == LH) { - heapify_large_node(mq, idx); + heapify_large_node_mq(mq, idx); } else { /* ai is not NaN but oldest node is in nan array */ k_stat = mq_k_stat(mq, n_s + n_l + 1); @@ -795,7 +795,7 @@ mq_update_nan(mq_handle *mq, ai_t ai) { l_heap[n_l] = node; ++mq->n_l; mq->l_first_leaf = FIRST_LEAF(mq->n_l); - heapify_large_node(mq, n_l); + heapify_large_node_mq(mq, n_l); } else { /* insert into small heap */ node->region = SH; @@ -866,6 +866,18 @@ mq_free(mq_handle *mq) { ----------------------------------------------------------------------------- */ +/* Return the current median */ +static inline ai_t +mm_get_median(mm_handle *mm) { + idx_t n_total = mm->n_l + mm->n_s; + if (n_total < mm->min_count) + return MM_NAN(); + if (min(mm->window, n_total) % 2 == 1) + return mm->s_heap[0]->ai; + return (mm->s_heap[0]->ai + mm->l_heap[0]->ai) / 2.0; +} + + /* function to find the current index of element correspodning to the quantile */ static inline idx_t mq_k_stat(mq_handle *mq, idx_t idx) { return (idx_t) floor((idx - 1) * mq->quantile) + 1; @@ -943,7 +955,7 @@ HEAPIFY_SMALL_NODE(mq) /* mtype is mm (move_median) or mq (move_quantile) */ #define HEAPIFY_LARGE_NODE(mtype) \ static inline void \ - heapify_large_node##mtype(mtype##_handle *mtype, idx_t idx) { \ + heapify_large_node_##mtype(mtype##_handle *mtype, idx_t idx) { \ idx_t idx2; \ mm_node *node; \ mm_node *node2; \ diff --git a/bottleneck/src/move_median/move_median_debug.c b/bottleneck/src/move_median/move_median_debug.c index dc1017f683..ce8791eff8 100644 --- a/bottleneck/src/move_median/move_median_debug.c +++ b/bottleneck/src/move_median/move_median_debug.c @@ -25,7 +25,7 @@ ai_t *mm_move_median(ai_t *a, idx_t length, idx_t window, idx_t min_count, doubl idx_t i; out = malloc(length * sizeof(ai_t)); - mm = mm_new_nan(window, min_count, quantile); + mm = mm_new_nan(window, min_count); for (i=0; i < length; i++) { if (i < window) { out[i] = mm_update_init_nan(mm, a[i]); diff --git a/bottleneck/tests/move_test.py b/bottleneck/tests/move_test.py index 550ef1f0f0..9c3a19d1a3 100644 --- a/bottleneck/tests/move_test.py +++ b/bottleneck/tests/move_test.py @@ -210,7 +210,8 @@ def test_move_quantile_with_nans(): func = bn.move_quantile func0 = bn.slow.move_quantile rs = np.random.RandomState([1, 2, 3]) - for size in [1, 2, 3, 5, 9, 10, 20, 21]: + for size in [1, 2, 3, 5, 9]: + # for size in [1, 2, 3, 5, 9, 10, 20, 21]: for _ in range(REPEAT_QUANTILE): # 0 and 1 are important edge cases for q in [0., 1., rs.rand()]: @@ -234,7 +235,8 @@ def test_move_quantile_without_nans(): func = bn.move_quantile func0 = bn.slow.move_quantile rs = np.random.RandomState([1, 2, 3]) - for size in [1, 2, 3, 5, 9, 10, 20, 21]: + for size in [1, 2, 3, 5, 9]: + # for size in [1, 2, 3, 5, 9, 10, 20, 21]: for _ in range(REPEAT_QUANTILE): for q in [0., 1., rs.rand()]: a = np.arange(size, dtype=np.float64) @@ -296,7 +298,8 @@ def test_move_quantile_with_infs_and_nans(): inf_minf_nan_fracs = [triple for triple in itertools.product(fracs, fracs, fracs) if np.sum(triple) <= 1] total = 0 # for size in [1, 2, 3, 5, 9, 10, 20, 21, 47, 48]: - for size in [1, 2, 3, 5, 9, 10, 20, 21]: + # for size in [1, 2, 3, 5, 9, 10, 20, 21]: + for size in [1, 2, 3, 5, 9]: print(size) for min_count in [1, 2, 3, size//2, size - 1, size]: if min_count < 1 or min_count > size: From fcaefdec9f02c8adaf360e0efce993655efb5bd3 Mon Sep 17 00:00:00 2001 From: riazanov Date: Wed, 21 Sep 2022 02:02:54 -0400 Subject: [PATCH 16/30] Move move_quantile to C level fully Remove the wrapper for move_quantile on python level which checked for q = 0 or 1. Now it's fully in C. Also check for q=0.5 as we checked it's 3-4% faster to call move_median --- bottleneck/__init__.py | 5 +-- bottleneck/src/move_quantile.py | 16 -------- bottleneck/src/move_template.c | 73 +++++++++++++++++++++++++-------- bottleneck/tests/move_test.py | 44 ++++++++++---------- 4 files changed, 79 insertions(+), 59 deletions(-) delete mode 100644 bottleneck/src/move_quantile.py diff --git a/bottleneck/__init__.py b/bottleneck/__init__.py index e80b7cc135..b2c01de453 100644 --- a/bottleneck/__init__.py +++ b/bottleneck/__init__.py @@ -5,10 +5,7 @@ from . import slow from ._pytesttester import PytestTester from .move import (move_argmax, move_argmin, move_max, move_mean, move_median, - move_min, move_rank, move_std, move_sum, move_var) - -from .move import move_quantile as move_quantile_c -from .src.move_quantile import move_quantile as move_quantile + move_quantile, move_min, move_rank, move_std, move_sum, move_var) from .nonreduce import replace from .nonreduce_axis import (argpartition, nanrankdata, partition, push, diff --git a/bottleneck/src/move_quantile.py b/bottleneck/src/move_quantile.py deleted file mode 100644 index 53d2a1b86f..0000000000 --- a/bottleneck/src/move_quantile.py +++ /dev/null @@ -1,16 +0,0 @@ -from ..move import move_max, move_min -from ..move import move_quantile as move_quantile_c - -all = ["move_quantile"] - -def move_quantile(*args, **kwargs): - if ('q' not in kwargs) or ((kwargs['q'] > 0.) and (kwargs['q'] < 1.)): - return move_quantile_c(*args, **kwargs) - elif (kwargs['q'] == 1.): - del kwargs['q'] - return move_max(*args, **kwargs) - elif (kwargs['q'] == 0.): - del kwargs['q'] - return move_min(*args, **kwargs) - -move_quantile.__doc__ = move_quantile_c.__doc__ \ No newline at end of file diff --git a/bottleneck/src/move_template.c b/bottleneck/src/move_template.c index 1d0a2585b8..551c6d7733 100644 --- a/bottleneck/src/move_template.c +++ b/bottleneck/src/move_template.c @@ -39,16 +39,29 @@ name(PyObject *self, PyObject *args, PyObject *kwds) \ { \ return mover(#name, \ - args, \ - kwds, \ - name##_float64, \ - name##_float32, \ - name##_int64, \ - name##_int32, \ - has_ddof, \ - has_quantile); \ + args, \ + kwds, \ + name##_float64, \ + name##_float32, \ + name##_int64, \ + name##_int32, \ + has_ddof, \ + has_quantile); \ } +/* Mover function */ +#define MOVER(name, args, kwds, has_ddof, has_quantile) \ + return mover(#name, \ + args, \ + kwds, \ + name##_float64, \ + name##_float32, \ + name##_int64, \ + name##_int32, \ + has_ddof, \ + has_quantile); + + /* typedefs and prototypes ----------------------------------------------- */ /* used by move_min and move_max */ @@ -997,6 +1010,40 @@ mover(char *name, return NULL; } + /* quantile + * Checking quantile first because if q in {0, 0.5, 1} then + * another `mover` function is called. Check the quantile + * first to avoid converting (if needed) `a` to an array twice + */ + + quantile = (double) 0.5; + + if ((has_quantile) && (!strcmp(name, "move_quantile"))) { + if (quantile_obj != Py_None) { + quantile = PyFloat_AsDouble(quantile_obj); + if (error_converting(quantile)) { + TYPE_ERR("`q` must be a float"); + return NULL; + } + if ((quantile < 0.0) || (quantile > 1.0)) { + /* Float/double specifiers %f and %lf don't work here for some reason*/ + PyErr_Format(PyExc_ValueError, + "`q` must be between 0. and 1."); + return NULL; + } + + if (quantile == 1.0) { + MOVER(move_max, args, kwds, has_ddof, 1) + } else if (quantile == 0.0) { + MOVER(move_min, args, kwds, has_ddof, 1) + } + } + + if (quantile == 0.5) { + MOVER(move_median, args, kwds, has_ddof, 1) + } + } + /* convert to array if necessary */ if (PyArray_Check(a_obj)) { a = (PyArrayObject *)a_obj; @@ -1041,15 +1088,6 @@ mover(char *name, } } - /* quantile */ - if (quantile_obj == Py_None){ - /* take median by default */ - quantile = (double) 0.5; - } else { - /* QUANTILE TODO: add checks here*/ - quantile = PyFloat_AsDouble(quantile_obj); - } - ndim = PyArray_NDIM(a); /* defend against 0d beings */ @@ -1110,7 +1148,6 @@ mover(char *name, } else if (dtype == NPY_int32) { y = move_int32(a, window, mc, axis, ddof, quantile); } else { - /* TODO: add slow for quantile */ y = slow(name, args, kwds); } diff --git a/bottleneck/tests/move_test.py b/bottleneck/tests/move_test.py index 9c3a19d1a3..a8e0e79208 100644 --- a/bottleneck/tests/move_test.py +++ b/bottleneck/tests/move_test.py @@ -6,6 +6,7 @@ from .util import arrays, array_order import pytest import itertools +import warnings @pytest.mark.parametrize("func", bn.get_functions("move"), ids=lambda x: x.__name__) @@ -210,8 +211,8 @@ def test_move_quantile_with_nans(): func = bn.move_quantile func0 = bn.slow.move_quantile rs = np.random.RandomState([1, 2, 3]) - for size in [1, 2, 3, 5, 9]: - # for size in [1, 2, 3, 5, 9, 10, 20, 21]: + # for size in [1, 2, 3, 5, 9]: + for size in [1, 2, 3, 5, 9, 10, 20, 21]: for _ in range(REPEAT_QUANTILE): # 0 and 1 are important edge cases for q in [0., 1., rs.rand()]: @@ -235,8 +236,8 @@ def test_move_quantile_without_nans(): func = bn.move_quantile func0 = bn.slow.move_quantile rs = np.random.RandomState([1, 2, 3]) - for size in [1, 2, 3, 5, 9]: - # for size in [1, 2, 3, 5, 9, 10, 20, 21]: + # for size in [1, 2, 3, 5, 9]: + for size in [1, 2, 3, 5, 9, 10, 20, 21]: for _ in range(REPEAT_QUANTILE): for q in [0., 1., rs.rand()]: a = np.arange(size, dtype=np.float64) @@ -266,21 +267,23 @@ def test_numpy_nanquantile_infs(): rs = np.random.RandomState([1, 2, 3]) sizes = [1, 2, 3, 4, 5, 9, 10, 20, 31, 50] fracs = [0., 0.2, 0.4, 0.6, 0.8, 1.] - for size in sizes: - for _ in range(REPEAT_NUMPY_QUANTILE): - for inf_frac, minus_inf_frac, nan_frac in itertools.product(fracs, fracs, fracs): - a = np.arange(size, dtype=np.float64) - idx = rs.rand(*a.shape) < inf_frac - a[idx] = np.inf - idx = rs.rand(*a.shape) < minus_inf_frac - a[idx] = -np.inf - idx = rs.rand(*a.shape) < nan_frac - a[idx] = np.nan - rs.shuffle(a) - actual = func(a) - desired = func0(a, q=0.5) - err_msg = fmt % (func.__name__, a) - aaae(actual, desired, decimal=5, err_msg=err_msg) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + for size in sizes: + for _ in range(REPEAT_NUMPY_QUANTILE): + for inf_frac, minus_inf_frac, nan_frac in itertools.product(fracs, fracs, fracs): + a = np.arange(size, dtype=np.float64) + idx = rs.rand(*a.shape) < inf_frac + a[idx] = np.inf + idx = rs.rand(*a.shape) < minus_inf_frac + a[idx] = -np.inf + idx = rs.rand(*a.shape) < nan_frac + a[idx] = np.nan + rs.shuffle(a) + actual = func(a) + desired = func0(a, q=0.5) + err_msg = fmt % (func.__name__, a) + aaae(actual, desired, decimal=5, err_msg=err_msg) # This should convince us TODO finish @@ -298,8 +301,7 @@ def test_move_quantile_with_infs_and_nans(): inf_minf_nan_fracs = [triple for triple in itertools.product(fracs, fracs, fracs) if np.sum(triple) <= 1] total = 0 # for size in [1, 2, 3, 5, 9, 10, 20, 21, 47, 48]: - # for size in [1, 2, 3, 5, 9, 10, 20, 21]: - for size in [1, 2, 3, 5, 9]: + for size in [1, 2, 3, 5, 9, 10, 20, 21]: print(size) for min_count in [1, 2, 3, size//2, size - 1, size]: if min_count < 1 or min_count > size: From 1bddeddc1aa4365bb936bc08bc6241e206cf9548 Mon Sep 17 00:00:00 2001 From: andrii-riazanov Date: Wed, 21 Sep 2022 23:24:10 -0400 Subject: [PATCH 17/30] Refactor parse_args function in move_template Add some more tests --- bottleneck/slow/move.py | 12 +-- bottleneck/src/move_template.c | 158 ++++++++++++--------------------- bottleneck/tests/move_test.py | 77 ++++++++++------ 3 files changed, 116 insertions(+), 131 deletions(-) diff --git a/bottleneck/slow/move.py b/bottleneck/slow/move.py index 22a1b03e75..cd2ccfabab 100644 --- a/bottleneck/slow/move.py +++ b/bottleneck/slow/move.py @@ -38,13 +38,15 @@ def move_var(a, window, min_count=None, axis=-1, ddof=0): "Slow move_var for unaccelerated dtype" return move_func(np.nanvar, a, window, min_count, axis=axis, ddof=ddof) - -def move_min(a, window, min_count=None, axis=-1): +# move_min, move_max, and move_median from bn.slow can be called +# from bn.move_quantile in case of byte swapped input array, +# and so can take `q` argument, hence add **kwargs to these functions +def move_min(a, window, min_count=None, axis=-1, **kwargs): "Slow move_min for unaccelerated dtype" return move_func(np.nanmin, a, window, min_count, axis=axis) -def move_max(a, window, min_count=None, axis=-1): +def move_max(a, window, min_count=None, axis=-1, **kwargs): "Slow move_max for unaccelerated dtype" return move_func(np.nanmax, a, window, min_count, axis=axis) @@ -101,7 +103,7 @@ def argmax(a, axis): return move_func(argmax, a, window, min_count, axis=axis) -def move_median(a, window, min_count=None, axis=-1): +def move_median(a, window, min_count=None, axis=-1, **kwargs): "Slow move_median for unaccelerated dtype" return move_func(np.nanmedian, a, window, min_count, axis=axis) @@ -114,7 +116,7 @@ def move_rank(a, window, min_count=None, axis=-1): return move_func(lastrank, a, window, min_count, axis=axis) # function for handling infs in np.nanquantile -# keyword argument for interpolation method in np.nanquantile changed in 1.22.0 +# keyword argument for interpolation method in np.nanquantile was changed in 1.22.0 from packaging import version if version.parse(np.__version__) > version.parse("1.22.0"): METHOD_KEYWORD = "method" diff --git a/bottleneck/src/move_template.c b/bottleneck/src/move_template.c index 551c6d7733..b5b563079f 100644 --- a/bottleneck/src/move_template.c +++ b/bottleneck/src/move_template.c @@ -840,7 +840,7 @@ PyObject *pystr_window = NULL; PyObject *pystr_min_count = NULL; PyObject *pystr_axis = NULL; PyObject *pystr_ddof = NULL; -PyObject *pystr_quantile = NULL; +PyObject *pystr_q = NULL; static int intern_strings(void) { @@ -849,12 +849,24 @@ intern_strings(void) { pystr_min_count = PyString_InternFromString("min_count"); pystr_axis = PyString_InternFromString("axis"); pystr_ddof = PyString_InternFromString("ddof"); - pystr_quantile = PyString_InternFromString("q"); + pystr_q = PyString_InternFromString("q"); return pystr_a && pystr_window && pystr_min_count && - pystr_axis && pystr_ddof && pystr_quantile; + pystr_axis && pystr_ddof && pystr_q; } /* mover ----------------------------------------------------------------- */ +#define COMPARE_AND_SET(var_name, key, value) \ + if (PyObject_RichCompareBool(key, pystr_##var_name, Py_EQ)) { \ + *var_name = value; \ + continue; \ + } + +static inline void get_argument(PyObject **var, PyObject *args, short condition, int nargs, short *counter) { + if (*counter && condition) { + *var = PyTuple_GET_ITEM(args, nargs - *counter); + *counter -= 1; + } +} static inline int parse_args(PyObject *args, @@ -866,111 +878,57 @@ parse_args(PyObject *args, PyObject **min_count, PyObject **axis, PyObject **ddof, - PyObject **quantile) { + PyObject **q) { const Py_ssize_t nargs = PyTuple_GET_SIZE(args); const Py_ssize_t nkwds = kwds == NULL ? 0 : PyDict_Size(kwds); + + if (nargs + nkwds > 4 + has_ddof + has_quantile) { + TYPE_ERR("Too many arguments"); + return 0; + } + + PyObject *key; + PyObject *value; + Py_ssize_t pos = 0; + if (nkwds) { - int nkwds_found = 0; - PyObject *tmp; - switch (nargs) { - case 4: - if (has_ddof | has_quantile) { - *axis = PyTuple_GET_ITEM(args, 3); - } else { - TYPE_ERR("wrong number of arguments"); - return 0; - } - case 3: *min_count = PyTuple_GET_ITEM(args, 2); - case 2: *window = PyTuple_GET_ITEM(args, 1); - case 1: *a = PyTuple_GET_ITEM(args, 0); - case 0: break; - default: - TYPE_ERR("wrong number of arguments"); - return 0; - } - switch (nargs) { - case 0: - *a = PyDict_GetItem(kwds, pystr_a); - if (*a == NULL) { - TYPE_ERR("Cannot find `a` keyword input"); - return 0; - } - nkwds_found += 1; - case 1: - *window = PyDict_GetItem(kwds, pystr_window); - if (*window == NULL) { - TYPE_ERR("Cannot find `window` keyword input"); - return 0; - } - nkwds_found++; - case 2: - tmp = PyDict_GetItem(kwds, pystr_min_count); - if (tmp != NULL) { - *min_count = tmp; - nkwds_found++; - } - case 3: - tmp = PyDict_GetItem(kwds, pystr_axis); - if (tmp != NULL) { - *axis = tmp; - nkwds_found++; - } - case 4: - if (has_ddof) { - tmp = PyDict_GetItem(kwds, pystr_ddof); - if (tmp != NULL) { - *ddof = tmp; - nkwds_found++; - } - break; - } else if (has_quantile) { - tmp = PyDict_GetItem(kwds, pystr_quantile); - if (tmp != NULL) { - *quantile = tmp; - nkwds_found++; - } - break; - } - break; - default: - TYPE_ERR("wrong number of arguments"); - return 0; - } - if (nkwds_found != nkwds) { - TYPE_ERR("wrong number of keyword arguments"); + while (PyDict_Next(kwds, &pos, &key, &value)) { + COMPARE_AND_SET(a, key, value) + COMPARE_AND_SET(window, key, value) + COMPARE_AND_SET(min_count, key, value) + COMPARE_AND_SET(axis, key, value) + COMPARE_AND_SET(ddof, key, value) + COMPARE_AND_SET(q, key, value) + TYPE_ERR("Unsupported keyword argument"); return 0; } - if (nargs + nkwds_found > 4 + has_ddof + has_quantile) { - TYPE_ERR("too many arguments"); - return 0; - } - } else { - switch (nargs) { - case 5: - if (has_ddof) { - *ddof = PyTuple_GET_ITEM(args, 4); - } else if (has_quantile) { - *quantile = PyTuple_GET_ITEM(args, 4); - } else { - TYPE_ERR("wrong number of arguments"); - return 0; - } - case 4: - *axis = PyTuple_GET_ITEM(args, 3); - case 3: - *min_count = PyTuple_GET_ITEM(args, 2); - case 2: - *window = PyTuple_GET_ITEM(args, 1); - *a = PyTuple_GET_ITEM(args, 0); - break; - default: - TYPE_ERR("wrong number of arguments"); - return 0; - } } - return 1; + short args_not_found = nargs; + get_argument(a, args, 1, nargs, &args_not_found); + get_argument(window, args, 1, nargs, &args_not_found); + get_argument(min_count, args, 1, nargs, &args_not_found); + get_argument(axis, args, 1, nargs, &args_not_found); + get_argument(ddof, args, has_ddof, nargs, &args_not_found); + get_argument(q, args, has_quantile, nargs, &args_not_found); + + if (args_not_found) { + TYPE_ERR("wrong number of arguments"); + return 0; + } + + if (*a == NULL) { + TYPE_ERR("Cannot find `a` argument"); + return 0; + } + + if (*window == NULL) { + TYPE_ERR("Cannot find `window` argument"); + return 0; + } + + return 1; } diff --git a/bottleneck/tests/move_test.py b/bottleneck/tests/move_test.py index a8e0e79208..bb82b86159 100644 --- a/bottleneck/tests/move_test.py +++ b/bottleneck/tests/move_test.py @@ -1,5 +1,6 @@ """Test moving window functions.""" +from errno import EUNATCH import numpy as np from numpy.testing import assert_equal, assert_array_almost_equal, assert_raises import bottleneck as bn @@ -24,6 +25,9 @@ def test_move(func): decimal = 3 else: decimal = 5 + quantiles = [1.] + if func_name == "move_quantile": + quantiles = [0., 0.25, 0.5, 0.75, 1.] for i, a in enumerate(arrays(func_name)): axes = range(-1, a.ndim) for axis in axes: @@ -31,25 +35,29 @@ def test_move(func): for window in windows: min_counts = list(range(1, window + 1)) + [None] for min_count in min_counts: - actual = func(a, window, min_count, axis=axis) - desired = func0(a, window, min_count, axis=axis) - tup = ( - func_name, - window, - str(min_count), - "a" + str(i), - str(a.dtype), - str(a.shape), - str(axis), - array_order(a), - a, - ) - err_msg = fmt % tup - aaae(actual, desired, decimal, err_msg) - err_msg += "\n dtype mismatch %s %s" - da = actual.dtype - dd = desired.dtype - assert_equal(da, dd, err_msg % (da, dd)) + for q in quantiles: + kwargs = {} + if func_name == "move_quantile": + kwargs = {"q" : q} + actual = func(a, window, min_count, axis=axis, **kwargs) + desired = func0(a, window, min_count, axis=axis, **kwargs) + tup = ( + func_name, + window, + str(min_count), + "a" + str(i), + str(a.dtype), + str(a.shape), + str(axis), + array_order(a), + a, + ) + err_msg = fmt % tup + aaae(actual, desired, decimal, err_msg) + err_msg += "\n dtype mismatch %s %s" + da = actual.dtype + dd = desired.dtype + assert_equal(da, dd, err_msg % (da, dd)) # --------------------------------------------------------------------------- @@ -109,12 +117,34 @@ def test_arg_parsing(func, decimal=5): err_msg = fmt % "(a=a, axis=-1, min_count=None, window=2)" assert_array_almost_equal(actual, desired, decimal, err_msg) + actual = func(a=a, axis=-1, min_count=None, window=2) + desired = func0(a=a, axis=-1, min_count=None, window=2) + err_msg = fmt % "(a=a, axis=-1, min_count=None, window=2)" + assert_array_almost_equal(actual, desired, decimal, err_msg) + if name in ("move_std", "move_var"): actual = func(a, 2, 1, -1, ddof=1) desired = func0(a, 2, 1, -1, ddof=1) err_msg = fmt % "(a, 2, 1, -1, ddof=1)" assert_array_almost_equal(actual, desired, decimal, err_msg) + if name == "move_quantile": + q = 0.3 + actual = func(q=q, axis=-1, a=a, min_count=None, window=2) + desired = func0(q=q, axis=-1, a=a, min_count=None, window=2) + err_msg = fmt % "(q=q, axis=-1, a=a, min_count=None, window=2)" + assert_array_almost_equal(actual, desired, decimal, err_msg) + + actual = func(a, axis=-1, q=q, window=2, min_count=None) + desired = func0(a, axis=-1, q=q, window=2, min_count=None) + err_msg = fmt % "(a, axis=-1, q=q, window=2, min_count=None)" + assert_array_almost_equal(actual, desired, decimal, err_msg) + + actual = func(axis=-1, a=a, q=q, window=2) + desired = func0(axis=-1, a=a, q=q, window=2) + err_msg = fmt % "(axis=-1, a=a, q=q, window=2)" + assert_array_almost_equal(actual, desired, decimal, err_msg) + # regression test: make sure len(kwargs) == 0 doesn't raise args = (a, 1, 1, -1) kwargs = {} @@ -151,12 +181,11 @@ def test_move_median_with_nans(): fmt = "\nfunc %s | window %d | min_count %s\n\nInput array:\n%s\n" aaae = assert_array_almost_equal min_count = 1 - size = 10 func = bn.move_median func0 = bn.slow.move_median rs = np.random.RandomState([1, 2, 3]) for size in [1, 2, 3, 4, 5, 9, 10, 19, 20, 21]: - for i in range(REPEAT_MEDIAN): + for _ in range(REPEAT_MEDIAN): a = np.arange(size, dtype=np.float64) idx = rs.rand(*a.shape) < 0.1 a[idx] = np.inf @@ -175,12 +204,11 @@ def test_move_median_without_nans(): fmt = "\nfunc %s | window %d | min_count %s\n\nInput array:\n%s\n" aaae = assert_array_almost_equal min_count = 1 - size = 10 func = bn.move_median func0 = bn.slow.move_median rs = np.random.RandomState([1, 2, 3]) for size in [1, 2, 3, 4, 5, 9, 10, 19, 20, 21]: - for i in range(REPEAT_MEDIAN): + for _ in range(REPEAT_MEDIAN): a = np.arange(size, dtype=np.int64) rs.shuffle(a) for window in range(2, size + 1): @@ -211,7 +239,6 @@ def test_move_quantile_with_nans(): func = bn.move_quantile func0 = bn.slow.move_quantile rs = np.random.RandomState([1, 2, 3]) - # for size in [1, 2, 3, 5, 9]: for size in [1, 2, 3, 5, 9, 10, 20, 21]: for _ in range(REPEAT_QUANTILE): # 0 and 1 are important edge cases @@ -236,7 +263,6 @@ def test_move_quantile_without_nans(): func = bn.move_quantile func0 = bn.slow.move_quantile rs = np.random.RandomState([1, 2, 3]) - # for size in [1, 2, 3, 5, 9]: for size in [1, 2, 3, 5, 9, 10, 20, 21]: for _ in range(REPEAT_QUANTILE): for q in [0., 1., rs.rand()]: @@ -302,7 +328,6 @@ def test_move_quantile_with_infs_and_nans(): total = 0 # for size in [1, 2, 3, 5, 9, 10, 20, 21, 47, 48]: for size in [1, 2, 3, 5, 9, 10, 20, 21]: - print(size) for min_count in [1, 2, 3, size//2, size - 1, size]: if min_count < 1 or min_count > size: continue From 7a413cb720fe57736c366e10a94a04799a4c7ecd Mon Sep 17 00:00:00 2001 From: andrii-riazanov Date: Wed, 21 Sep 2022 23:55:35 -0400 Subject: [PATCH 18/30] Add docs and comments --- bottleneck/tests/move_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bottleneck/tests/move_test.py b/bottleneck/tests/move_test.py index bb82b86159..b2b47d3366 100644 --- a/bottleneck/tests/move_test.py +++ b/bottleneck/tests/move_test.py @@ -284,7 +284,7 @@ def test_move_quantile_without_nans(): REPEAT_NUMPY_QUANTILE = 10 def test_numpy_nanquantile_infs(): - """test move_median.c with nans""" + """test move_quantile.c with nans""" fmt = "\nfunc %s \n\nInput array:\n%s\n" aaae = assert_array_almost_equal min_count = 1 From 6851ed280a693fc0ad2569186b2c2c6aac0d77d2 Mon Sep 17 00:00:00 2001 From: andrii-riazanov <59578540+andrii-riazanov@users.noreply.github.com> Date: Thu, 22 Sep 2022 00:22:42 -0400 Subject: [PATCH 19/30] Update move_test.py Remove redundant import --- bottleneck/tests/move_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bottleneck/tests/move_test.py b/bottleneck/tests/move_test.py index b2b47d3366..d2db15a0c3 100644 --- a/bottleneck/tests/move_test.py +++ b/bottleneck/tests/move_test.py @@ -1,6 +1,5 @@ """Test moving window functions.""" -from errno import EUNATCH import numpy as np from numpy.testing import assert_equal, assert_array_almost_equal, assert_raises import bottleneck as bn From 97ecd15bb6d473c4b1758171a2ea9e46a6dc8bd8 Mon Sep 17 00:00:00 2001 From: andrii-riazanov Date: Thu, 22 Sep 2022 21:01:07 -0400 Subject: [PATCH 20/30] Actually add docs and comments --- bottleneck/src/move_median/move_median.c | 17 +++++++++-------- bottleneck/src/move_median/move_median.h | 2 +- bottleneck/src/move_template.c | 21 ++++++++++++--------- doc/source/reference.rst | 7 ++++++- 4 files changed, 28 insertions(+), 19 deletions(-) diff --git a/bottleneck/src/move_median/move_median.c b/bottleneck/src/move_median/move_median.c index 5566290a1c..7b68fad250 100644 --- a/bottleneck/src/move_median/move_median.c +++ b/bottleneck/src/move_median/move_median.c @@ -79,6 +79,7 @@ mm_new(const idx_t window, idx_t min_count) { return mm; } +/* Same as mm_new, but the heaps have different sizes */ mq_handle * mq_new(const idx_t window, idx_t min_count, double quantile) { mq_handle *mq = malloc(sizeof(mq_handle)); @@ -148,7 +149,7 @@ mm_update_init(mm_handle *mm, ai_t ai) { return mm_get_median(mm); } - +/* Same as mm_update_init, but returns the current quantile. */ ai_t mq_update_init(mq_handle *mq, ai_t ai) { mm_node *node; @@ -227,6 +228,7 @@ mm_update(mm_handle *mm, ai_t ai) { } } +/* Same as mm_update, but returns the current quantile. */ ai_t mq_update(mq_handle *mq, ai_t ai) { /* node is oldest node with ai of newest node */ @@ -278,7 +280,7 @@ mm_new_nan(const idx_t window, idx_t min_count) { return mm; } - +/* Same as mm_new_nan, but the heaps have different sizes */ mq_handle * mq_new_nan(const idx_t window, idx_t min_count, double quantile) { mq_handle *mq = malloc(sizeof(mq_handle)); @@ -367,7 +369,7 @@ mm_update_init_nan(mm_handle *mm, ai_t ai) { return mm_get_median(mm); } - +/* Same as mm_update_init_nan, but returns the current quantile. */ ai_t mq_update_init_nan(mq_handle *mq, ai_t ai) { mm_node *node; @@ -623,7 +625,7 @@ mm_update_nan(mm_handle *mm, ai_t ai) { return mm_get_median(mm); } - +/* Same as mm_update_nan, but returns the current quantile. */ ai_t mq_update_nan(mq_handle *mq, ai_t ai) { idx_t n_s, n_l, n_n; @@ -878,18 +880,17 @@ mm_get_median(mm_handle *mm) { } -/* function to find the current index of element correspodning to the quantile */ +/* function to find the index of element correspodning to the current quantile */ static inline idx_t mq_k_stat(mq_handle *mq, idx_t idx) { return (idx_t) floor((idx - 1) * mq->quantile) + 1; } -/* function to check if the current index of the quantile is integer, and so - * the quantile is the element at the top of the heap. Otherwise take midpoint */ +/* function to check if the current index of the quantile is integer. If so, + * then the quantile is the element at the top of the heap. Otherwise take a midpoint */ static inline short mq_stat_exact(mq_handle *mq, idx_t idx) { return ((idx - 1) * mq->quantile) == floor((idx - 1) * mq->quantile); } - /* Return the current quantile */ static inline ai_t mq_get_quantile(mq_handle *mq) { diff --git a/bottleneck/src/move_median/move_median.h b/bottleneck/src/move_median/move_median.h index 33a654f0ab..68e612f435 100644 --- a/bottleneck/src/move_median/move_median.h +++ b/bottleneck/src/move_median/move_median.h @@ -57,7 +57,7 @@ typedef struct _mm_handle mm_handle; struct _mq_handle { idx_t window; /* window size */ int odd; /* is window even (0) or odd (1) */ - idx_t min_count; /* Same meaning as in bn.move_median */ + idx_t min_count; /* Same meaning as in bn.move_quantile */ idx_t n_s; /* Number of nodes in the small heap */ idx_t n_l; /* Number of nodes in the large heap */ idx_t n_n; /* Number of nodes in the nan array */ diff --git a/bottleneck/src/move_template.c b/bottleneck/src/move_template.c index b5b563079f..9cc09005f5 100644 --- a/bottleneck/src/move_template.c +++ b/bottleneck/src/move_template.c @@ -1531,12 +1531,12 @@ static char move_quantile_doc[] = /* MULTILINE STRING BEGIN move_quantile(a, window, min_count=None, axis=-1, q=0.5) -TODO: right docs, change var name to q, might need to change order as well, so that quantile can be passes as non-keyword third argument (hard) - -Moving window median along the specified axis, optionally ignoring NaNs. +Moving window quantile along the specified axis, ignoring NaNs. float64 output is returned for all input data types. +Interpolation method for the quantile is "midpoint". + Parameters ---------- a : ndarray @@ -1550,20 +1550,23 @@ min_count: {int, None}, optional axis : int, optional The axis over which the window is moved. By default the last axis (axis=-1) is used. An axis of None is not allowed. +q : float, optional + Quantile to compute, which must be between 0 and 1 inclusive. + By default q=0.5 is used, computing a moving median. Returns ------- y : ndarray - The moving median of the input array along the specified axis. The + The moving quantile of the input array along the specified axis. The output has the same shape as the input. Examples -------- ->>> a = np.array([1.0, 2.0, 3.0, 4.0]) ->>> bn.move_median(a, window=2) -array([ nan, 1.5, 2.5, 3.5]) ->>> bn.move_median(a, window=2, min_count=1) -array([ 1. , 1.5, 2.5, 3.5]) +>>> a = np.array([1.0, np.nan, 3.0, 4.0, 5.0, 6.0, 7.0]) +>>> bn.move_quantile(a, window=4, q=0.3) +array([nan, nan, nan, nan, nan, 3.5, 4.5]) +>>> bn.move_quantile(a, window=4, q=0.3, min_count=3) +array([nan, nan, nan, 2. , 3.5, 3.5, 4.5]) MULTILINE STRING END */ diff --git a/doc/source/reference.rst b/doc/source/reference.rst index eae6d79070..4991e7459e 100644 --- a/doc/source/reference.rst +++ b/doc/source/reference.rst @@ -23,7 +23,8 @@ moving window :meth:`move_sum `, :meth :meth:`move_std `, :meth:`move_var `, :meth:`move_min `, :meth:`move_max `, :meth:`move_argmin `, :meth:`move_argmax `, - :meth:`move_median `, :meth:`move_rank ` + :meth:`move_median `, :meth:`move_quantile `, + :meth:`move_rank ` ================================= ============================================================================================== @@ -167,5 +168,9 @@ Functions that operate along a (1d) moving window. ------------ +.. autofunction:: bottleneck.move_quantile + +------------ + .. autofunction:: bottleneck.move_rank From a7d5c220e2b24ead4e79544596f3df6456d256a7 Mon Sep 17 00:00:00 2001 From: andrii-riazanov Date: Thu, 22 Sep 2022 23:49:22 -0400 Subject: [PATCH 21/30] Add comments, modify tests, change back gitignore --- .gitignore | 8 +- bottleneck/src/move_template.c | 2 +- bottleneck/tests/move_test.py | 157 ++++++++++++++++++++------------- 3 files changed, 99 insertions(+), 68 deletions(-) diff --git a/.gitignore b/.gitignore index 361857bbb9..b6a1242cb0 100644 --- a/.gitignore +++ b/.gitignore @@ -9,10 +9,4 @@ MANIFEST .*.swp *~ \#*# -bottleneck/src/bn_config.h -# TODO : change gitignore back -.vscode/c_cpp_properties.json -c.o -checks.c -checks.py -results.txt +bottleneck/src/bn_config.h \ No newline at end of file diff --git a/bottleneck/src/move_template.c b/bottleneck/src/move_template.c index 9cc09005f5..8899f3eed9 100644 --- a/bottleneck/src/move_template.c +++ b/bottleneck/src/move_template.c @@ -1535,7 +1535,7 @@ Moving window quantile along the specified axis, ignoring NaNs. float64 output is returned for all input data types. -Interpolation method for the quantile is "midpoint". +Interpolation method for the quantile is `midpoint`. Parameters ---------- diff --git a/bottleneck/tests/move_test.py b/bottleneck/tests/move_test.py index d2db15a0c3..c82b234d43 100644 --- a/bottleneck/tests/move_test.py +++ b/bottleneck/tests/move_test.py @@ -218,14 +218,17 @@ def test_move_median_without_nans(): # --------------------------------------------------------------------------- -# move_quantile.c is newly added. So let's do extensive testing +# move_quantile is newly added. So let's do (very) extensive testing # # Unfortunately, np.nanmedian(a) and np.nanquantile(a, q=0.5) don't always agree # when a contains inf or -inf values. See for instance: # https://github.com/numpy/numpy/issues/21932 # https://github.com/numpy/numpy/issues/21091 -# -# Let's first test without inf and -inf +# +# Let's first test without inf and -inf. +# When there are no infs in data, bn.slow.move_quantile calls +# move_func for np_nanquantile_infs, which just runs +# np.nanquantile with interpolation="midpoint" REPEAT_QUANTILE = 10 @@ -240,18 +243,24 @@ def test_move_quantile_with_nans(): rs = np.random.RandomState([1, 2, 3]) for size in [1, 2, 3, 5, 9, 10, 20, 21]: for _ in range(REPEAT_QUANTILE): - # 0 and 1 are important edge cases - for q in [0., 1., rs.rand()]: - for inf_frac in [0.2, 0.5, 0.7]: - a = np.arange(size, dtype=np.float64) - idx = rs.rand(*a.shape) < inf_frac - a[idx] = np.nan - rs.shuffle(a) - for window in range(2, size + 1): - actual = func(a, window=window, min_count=min_count, q=q) - desired = func0(a, window=window, min_count=min_count, q=q) - err_msg = fmt % (func.__name__, window, min_count, q, a) - aaae(actual, desired, decimal=5, err_msg=err_msg) + for nan_frac in [0.2, 0.5, 0.7, 1.]: + # test more variants of arrays (ints, floats, diff values) + arrays = [np.arange(size, dtype=np.float64), + (rs.random(size) - 0.5) * rs.randint(0, 100_000), + (rs.random(size) - 0.5) / rs.randint(0, 100_000)] + for a in arrays: + # q = 0. and 1. are important edge cases. We call existing + # move_min and move_max for these, but still must check + # that it works properly + for q in [0., 1., rs.rand()]: + idx = rs.rand(*a.shape) < nan_frac + a[idx] = np.nan + rs.shuffle(a) + for window in range(2, size + 1): + actual = func(a, window=window, min_count=min_count, q=q) + desired = func0(a, window=window, min_count=min_count, q=q) + err_msg = fmt % (func.__name__, window, min_count, q, a) + aaae(actual, desired, decimal=5, err_msg=err_msg) def test_move_quantile_without_nans(): """test move_quantile.c without nans""" @@ -264,19 +273,36 @@ def test_move_quantile_without_nans(): rs = np.random.RandomState([1, 2, 3]) for size in [1, 2, 3, 5, 9, 10, 20, 21]: for _ in range(REPEAT_QUANTILE): - for q in [0., 1., rs.rand()]: - a = np.arange(size, dtype=np.float64) - rs.shuffle(a) - for window in range(2, size + 1): - actual = func(a, window=window, min_count=min_count, q=q) - desired = func0(a, window=window, min_count=min_count, q=q) - err_msg = fmt % (func.__name__, window, min_count, q, a) - aaae(actual, desired, decimal=5, err_msg=err_msg) + # test more variants of arrays (ints, floats, diff values) + arrays = [np.arange(size, dtype=np.float64), + (rs.random(size) - 0.5) * rs.randint(0, 100_000), + (rs.random(size) - 0.5) / rs.randint(0, 100_000)] + for a in arrays: + # q = 0. and 1. are important edge cases. We call existing + # move_min and move_max for these, but still must check + # that it works properly + for q in [0., 1., rs.rand()]: + rs.shuffle(a) + for window in range(2, size + 1): + actual = func(a, window=window, min_count=min_count, q=q) + desired = func0(a, window=window, min_count=min_count, q=q) + err_msg = fmt % (func.__name__, window, min_count, q, a) + aaae(actual, desired, decimal=5, err_msg=err_msg) # Now let's deal with inf ans -infs -# TODO explain what's happening there - +# np.nanquantile doesn't give desired results when infs are present, +# due to abmiguities with arithmetic operations with infs. +# For instance, +# np.nanquantile([np.inf, np.inf], q=0.5, method="midpoint") returns np.nan, +# while +# np.nanmedian([np.inf, np.inf]) returns np.inf, +# although arguably these should give the same result. The behaviour of +# np.nanmedian is also agruably more expected. +# We check that the following will always give the same result as np.nanmedian(a): +# (np.nanquantile(a, q=0.5, method="lower") + np.nanquantile(a, q=0.5, method="higher")) / 2 +# It is also clear that this essentially is the same as method="midpoint". +# The next test verifies this. from ..slow.move import np_nanquantile_infs @@ -292,25 +318,33 @@ def test_numpy_nanquantile_infs(): rs = np.random.RandomState([1, 2, 3]) sizes = [1, 2, 3, 4, 5, 9, 10, 20, 31, 50] fracs = [0., 0.2, 0.4, 0.6, 0.8, 1.] + inf_minf_nan_fracs = [triple for triple in itertools.product(fracs, fracs, fracs) if np.sum(triple) <= 1] with warnings.catch_warnings(): warnings.simplefilter("ignore") for size in sizes: - for _ in range(REPEAT_NUMPY_QUANTILE): - for inf_frac, minus_inf_frac, nan_frac in itertools.product(fracs, fracs, fracs): - a = np.arange(size, dtype=np.float64) - idx = rs.rand(*a.shape) < inf_frac - a[idx] = np.inf - idx = rs.rand(*a.shape) < minus_inf_frac - a[idx] = -np.inf - idx = rs.rand(*a.shape) < nan_frac - a[idx] = np.nan - rs.shuffle(a) - actual = func(a) - desired = func0(a, q=0.5) - err_msg = fmt % (func.__name__, a) - aaae(actual, desired, decimal=5, err_msg=err_msg) - -# This should convince us TODO finish + for _ in range(REPEAT_NUMPY_QUANTILE): + for (inf_frac, minus_inf_frac, nan_frac) in inf_minf_nan_fracs: + arrays = [np.arange(size, dtype=np.float64), + (rs.random(size) - 0.5) * rs.randint(0, 100_000), + (rs.random(size) - 0.5) / rs.randint(0, 100_000)] + for a in arrays: + randoms = rs.rand(*a.shape) + idx_nans = randoms < inf_frac + minus_inf_frac + nan_frac + a[idx_nans] = np.nan + idx_minfs = randoms < inf_frac + minus_inf_frac + a[idx_minfs] = -np.inf + idx_infs = randoms < inf_frac + a[idx_infs] = np.inf + rs.shuffle(a) + actual = func(a) + desired = func0(a, q=0.5) + err_msg = fmt % (func.__name__, a) + aaae(actual, desired, decimal=5, err_msg=err_msg) + +# This shows that np_nanquantile_infs indeed replicates the +# behaviour of np.nanquantile, while correclty handling infs in data. +# So we use np_nanquantile_infs in our bn.slow.move_quantile for testing + REPEAT_FULL_QUANTILE = 1 @@ -322,33 +356,36 @@ def test_move_quantile_with_infs_and_nans(): func0 = bn.slow.move_quantile rs = np.random.RandomState([1, 2, 3]) fracs = [0., 0.2, 0.4, 0.6, 0.8, 1.] - fracs = [0., 0.3, 0.65, 1.] inf_minf_nan_fracs = [triple for triple in itertools.product(fracs, fracs, fracs) if np.sum(triple) <= 1] total = 0 - # for size in [1, 2, 3, 5, 9, 10, 20, 21, 47, 48]: - for size in [1, 2, 3, 5, 9, 10, 20, 21]: + for size in [1, 2, 3, 5, 9, 10, 20, 21, 47, 48]: + # for size in [1, 2, 3, 5, 9, 10, 20, 21]: for min_count in [1, 2, 3, size//2, size - 1, size]: if min_count < 1 or min_count > size: continue for (inf_frac, minus_inf_frac, nan_frac) in inf_minf_nan_fracs: for window in range(min_count, size + 1): for _ in range(REPEAT_FULL_QUANTILE): - # for q in [0., 1., 0.25, 0.75, rs.rand()]: - for q in [0.25, 0.75, rs.rand()]: - a = np.arange(size, dtype=np.float64) - randoms = rs.rand(*a.shape) - idx_nans = randoms < inf_frac + minus_inf_frac + nan_frac - a[idx_nans] = np.nan - idx_minfs = randoms < inf_frac + minus_inf_frac - a[idx_minfs] = -np.inf - idx_infs = randoms < inf_frac - a[idx_infs] = np.inf - rs.shuffle(a) - actual = func(a, window=window, min_count=min_count, q=q) - desired = func0(a, window=window, min_count=min_count, q=q) - err_msg = fmt % (func.__name__, window, min_count, q, a) - aaae(actual, desired, decimal=5, err_msg=err_msg) - + for q in [0., 1., 0.25, 0.75, rs.rand()]: + arrays = [np.arange(size, dtype=np.float64), + (rs.random(size) - 0.5) * rs.randint(0, 100_000), + (rs.random(size) - 0.5) / rs.randint(0, 100_000)] + for a in arrays: + total += 1 + a = np.arange(size, dtype=np.float64) + randoms = rs.rand(*a.shape) + idx_nans = randoms < inf_frac + minus_inf_frac + nan_frac + a[idx_nans] = np.nan + idx_minfs = randoms < inf_frac + minus_inf_frac + a[idx_minfs] = -np.inf + idx_infs = randoms < inf_frac + a[idx_infs] = np.inf + rs.shuffle(a) + actual = func(a, window=window, min_count=min_count, q=q) + desired = func0(a, window=window, min_count=min_count, q=q) + err_msg = fmt % (func.__name__, window, min_count, q, a) + aaae(actual, desired, decimal=5, err_msg=err_msg) + print(total) From 5a2bcac50343e1e17a6da29e9bce03cf3cda48fc Mon Sep 17 00:00:00 2001 From: andrii-riazanov Date: Fri, 23 Sep 2022 00:42:24 -0400 Subject: [PATCH 22/30] Refactor parse_args again to actually work Mostly get rid of macros --- bottleneck/src/move_template.c | 86 +++++++++++++++++++++++++++------- 1 file changed, 68 insertions(+), 18 deletions(-) diff --git a/bottleneck/src/move_template.c b/bottleneck/src/move_template.c index 8899f3eed9..a3aaef13d9 100644 --- a/bottleneck/src/move_template.c +++ b/bottleneck/src/move_template.c @@ -855,17 +855,53 @@ intern_strings(void) { } /* mover ----------------------------------------------------------------- */ -#define COMPARE_AND_SET(var_name, key, value) \ - if (PyObject_RichCompareBool(key, pystr_##var_name, Py_EQ)) { \ - *var_name = value; \ - continue; \ - } -static inline void get_argument(PyObject **var, PyObject *args, short condition, int nargs, short *counter) { +/* helper function to set a keyword argument */ +static inline short +set_kw_argument(PyObject **var, + PyObject **key_ptr, + PyObject **value_ptr, + PyObject **string_ptr, + short *set_var, + short condition) { + if (PyObject_RichCompareBool(*key_ptr, *string_ptr, Py_EQ)) { + if (*set_var) { + TYPE_ERR("Repeated argument was passed!"); + return 0; + } else if (!condition) { + TYPE_ERR("Keyword argument not supported!"); + return 0; + } + *var = *value_ptr; + *set_var = 1; + return 1; + } + return 2; + +} + +#define CHECK_STATUS(status) \ + if (!status) { return 0; } else if (status == 1) { continue; } + +/* helper function to set a non-keyword argument */ +static inline short +set_argument(PyObject **var, + PyObject *args, + short *set_var, + short condition, + int nargs, + short *counter) { if (*counter && condition) { + if (*set_var) { + TYPE_ERR("Repeated argument was passed!"); + return 0; + } *var = PyTuple_GET_ITEM(args, nargs - *counter); *counter -= 1; + *set_var = 1; + return 1; } + return 1; } static inline int @@ -891,14 +927,28 @@ parse_args(PyObject *args, PyObject *value; Py_ssize_t pos = 0; + short set_a = 0, + set_window = 0, + set_min_count = 0, + set_axis = 0, + set_ddof = 0, + set_q = 0; + if (nkwds) { + short status; while (PyDict_Next(kwds, &pos, &key, &value)) { - COMPARE_AND_SET(a, key, value) - COMPARE_AND_SET(window, key, value) - COMPARE_AND_SET(min_count, key, value) - COMPARE_AND_SET(axis, key, value) - COMPARE_AND_SET(ddof, key, value) - COMPARE_AND_SET(q, key, value) + status = set_kw_argument(a, &key, &value, &pystr_a, &set_a, 1); + CHECK_STATUS(status) + status = set_kw_argument(window, &key, &value, &pystr_window, &set_window, 1); + CHECK_STATUS(status) + status = set_kw_argument(min_count, &key, &value, &pystr_min_count, &set_min_count, 1); + CHECK_STATUS(status) + status = set_kw_argument(axis, &key, &value, &pystr_axis, &set_axis, 1); + CHECK_STATUS(status) + status = set_kw_argument(ddof, &key, &value, &pystr_ddof, &set_ddof, has_ddof); + CHECK_STATUS(status) + status = set_kw_argument(q, &key, &value, &pystr_q, &set_q, has_quantile); + CHECK_STATUS(status) TYPE_ERR("Unsupported keyword argument"); return 0; } @@ -906,12 +956,12 @@ parse_args(PyObject *args, short args_not_found = nargs; - get_argument(a, args, 1, nargs, &args_not_found); - get_argument(window, args, 1, nargs, &args_not_found); - get_argument(min_count, args, 1, nargs, &args_not_found); - get_argument(axis, args, 1, nargs, &args_not_found); - get_argument(ddof, args, has_ddof, nargs, &args_not_found); - get_argument(q, args, has_quantile, nargs, &args_not_found); + if (!set_argument(a, args, &set_a, 1, nargs, &args_not_found)) return 0; + if (!set_argument(window, args, &set_window, 1, nargs, &args_not_found)) return 0; + if (!set_argument(min_count, args, &set_min_count, 1, nargs, &args_not_found)) return 0; + if (!set_argument(axis, args, &set_axis, 1, nargs, &args_not_found)) return 0; + if (!set_argument(ddof, args, &set_ddof, has_ddof, nargs, &args_not_found)) return 0; + if (!set_argument(q, args, &set_q, has_quantile, nargs, &args_not_found)) return 0; if (args_not_found) { TYPE_ERR("wrong number of arguments"); From c390863f63aadba8c2d0153eeacf476bfc4586c9 Mon Sep 17 00:00:00 2001 From: riazanov Date: Fri, 23 Sep 2022 01:39:41 -0400 Subject: [PATCH 23/30] Dial tests back a little to run reasonable time --- bottleneck/tests/move_test.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/bottleneck/tests/move_test.py b/bottleneck/tests/move_test.py index c82b234d43..01d3d6e841 100644 --- a/bottleneck/tests/move_test.py +++ b/bottleneck/tests/move_test.py @@ -26,7 +26,7 @@ def test_move(func): decimal = 5 quantiles = [1.] if func_name == "move_quantile": - quantiles = [0., 0.25, 0.5, 0.75, 1.] + quantiles = [0.33, 0.67] for i, a in enumerate(arrays(func_name)): axes = range(-1, a.ndim) for axis in axes: @@ -241,7 +241,7 @@ def test_move_quantile_with_nans(): func = bn.move_quantile func0 = bn.slow.move_quantile rs = np.random.RandomState([1, 2, 3]) - for size in [1, 2, 3, 5, 9, 10, 20, 21]: + for size in [1, 2, 3, 5, 9, 10, 13, 16]: for _ in range(REPEAT_QUANTILE): for nan_frac in [0.2, 0.5, 0.7, 1.]: # test more variants of arrays (ints, floats, diff values) @@ -271,7 +271,7 @@ def test_move_quantile_without_nans(): func = bn.move_quantile func0 = bn.slow.move_quantile rs = np.random.RandomState([1, 2, 3]) - for size in [1, 2, 3, 5, 9, 10, 20, 21]: + for size in [1, 2, 3, 5, 9, 10, 13, 16]: for _ in range(REPEAT_QUANTILE): # test more variants of arrays (ints, floats, diff values) arrays = [np.arange(size, dtype=np.float64), @@ -316,7 +316,7 @@ def test_numpy_nanquantile_infs(): func = np.nanmedian func0 = np_nanquantile_infs rs = np.random.RandomState([1, 2, 3]) - sizes = [1, 2, 3, 4, 5, 9, 10, 20, 31, 50] + sizes = [1, 2, 3, 4, 5, 9, 10, 20, 31] fracs = [0., 0.2, 0.4, 0.6, 0.8, 1.] inf_minf_nan_fracs = [triple for triple in itertools.product(fracs, fracs, fracs) if np.sum(triple) <= 1] with warnings.catch_warnings(): @@ -345,7 +345,6 @@ def test_numpy_nanquantile_infs(): # behaviour of np.nanquantile, while correclty handling infs in data. # So we use np_nanquantile_infs in our bn.slow.move_quantile for testing - REPEAT_FULL_QUANTILE = 1 def test_move_quantile_with_infs_and_nans(): @@ -357,9 +356,7 @@ def test_move_quantile_with_infs_and_nans(): rs = np.random.RandomState([1, 2, 3]) fracs = [0., 0.2, 0.4, 0.6, 0.8, 1.] inf_minf_nan_fracs = [triple for triple in itertools.product(fracs, fracs, fracs) if np.sum(triple) <= 1] - total = 0 - for size in [1, 2, 3, 5, 9, 10, 20, 21, 47, 48]: - # for size in [1, 2, 3, 5, 9, 10, 20, 21]: + for size in [1, 2, 3, 5, 9, 10, 17, 20, 31]: for min_count in [1, 2, 3, size//2, size - 1, size]: if min_count < 1 or min_count > size: continue @@ -371,7 +368,6 @@ def test_move_quantile_with_infs_and_nans(): (rs.random(size) - 0.5) * rs.randint(0, 100_000), (rs.random(size) - 0.5) / rs.randint(0, 100_000)] for a in arrays: - total += 1 a = np.arange(size, dtype=np.float64) randoms = rs.rand(*a.shape) idx_nans = randoms < inf_frac + minus_inf_frac + nan_frac @@ -385,7 +381,6 @@ def test_move_quantile_with_infs_and_nans(): desired = func0(a, window=window, min_count=min_count, q=q) err_msg = fmt % (func.__name__, window, min_count, q, a) aaae(actual, desired, decimal=5, err_msg=err_msg) - print(total) From 6f1e5d4b66bccf3f6d0af9db8ae943e756b7d7d0 Mon Sep 17 00:00:00 2001 From: riazanov Date: Fri, 23 Sep 2022 04:05:53 -0400 Subject: [PATCH 24/30] Modify benches, restore old files --- .gitignore | 2 +- bottleneck/benchmark/bench.py | 2 + bottleneck/benchmark/bench_detailed.py | 41 +++++++++++-------- bottleneck/slow/move.py | 36 ++++++++-------- .../src/move_median/move_median_debug.c | 7 ++-- 5 files changed, 48 insertions(+), 40 deletions(-) diff --git a/.gitignore b/.gitignore index b6a1242cb0..81808d667d 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,4 @@ MANIFEST .*.swp *~ \#*# -bottleneck/src/bn_config.h \ No newline at end of file +bottleneck/src/bn_config.h diff --git a/bottleneck/benchmark/bench.py b/bottleneck/benchmark/bench.py index f18ec5a19c..408a332028 100644 --- a/bottleneck/benchmark/bench.py +++ b/bottleneck/benchmark/bench.py @@ -198,6 +198,8 @@ def getsetups(setup, shapes, nans, axes, dtype, order): run = {} run["name"] = func run["statements"] = ["bn_func(a, w, 1, axis)", "sw_func(a, w, 1, axis)"] + if func == "move_quantile": + run["statements"] = ["bn_func(a, w, 1, axis, q=0.25)", "sw_func(a, w, 1, axis, q=0.25)"] setup = """ from bottleneck.slow.move import %s as sw_func from bottleneck import %s as bn_func diff --git a/bottleneck/benchmark/bench_detailed.py b/bottleneck/benchmark/bench_detailed.py index e3e56a7ce8..da30ba1ec0 100644 --- a/bottleneck/benchmark/bench_detailed.py +++ b/bottleneck/benchmark/bench_detailed.py @@ -79,7 +79,6 @@ def benchsuite(function, fraction_nan): # avoid all-nan slice warnings from np.median and np.nanmedian if "%s" == "median": from bottleneck.slow import median as sl_fn if "%s" == "nanmedian": from bottleneck.slow import nanmedian as sl_fn - if "%s" == "move_quantile": from bottleneck.slow import move_median as sl_fn from numpy import array, nan from numpy.random import RandomState @@ -95,12 +94,14 @@ def benchsuite(function, fraction_nan): index = 0 elif function in ["rankdata", "nanrankdata"]: index = 0 - elif function in bn.get_functions("move", as_string=True): + elif function in bn.get_functions("move", as_string=True) and function != "move_quantile": index = 1 elif function in ["partition", "argpartition", "push"]: index = 2 elif function == "replace": index = 3 + elif function == "move_quantile": + index = 4 else: raise ValueError("`function` (%s) not recognized" % function) @@ -117,7 +118,7 @@ def benchsuite(function, fraction_nan): run = {} run["name"] = [f + signature, array] run["statements"] = ["bn_fn" + signature, "sl_fn" + signature] - run["setup"] = setup % (f, f, f, f, f, f, array, fraction_nan, fraction_nan) + run["setup"] = setup % (f, f, f, f, f, array, fraction_nan, fraction_nan) run["repeat"] = repeat suite.append(run) @@ -134,23 +135,24 @@ def get_instructions(): "(a, 1)", # move "(a, 0)", # (arg)partition "(a, np.nan, 0)", # replace + "(a, 1, q=0.25)", # move_quantile 10, ), - ("rand(10)", "(a)", "(a, 2)", "(a, 2)", "(a, np.nan, 0)", 10), - ("rand(100)", "(a)", "(a, 20)", "(a, 20)", "(a, np.nan, 0)", 6), - ("rand(1000)", "(a)", "(a, 200)", "(a, 200)", "(a, np.nan, 0)", 3), - ("rand(1000000)", "(a)", "(a, 200)", "(a, 200)", "(a, np.nan, 0)", 2), + ("rand(10)", "(a)", "(a, 2)", "(a, 2)", "(a, np.nan, 0)", "(a, 2, q=0.25)", 10), + ("rand(100)", "(a)", "(a, 20)", "(a, 20)", "(a, np.nan, 0)", "(a, 20, q=0.25)", 6), + ("rand(1000)", "(a)", "(a, 200)", "(a, 200)", "(a, np.nan, 0)", "(a, 200, q=0.25)", 3), + ("rand(1000000)", "(a)", "(a, 200)", "(a, 200)", "(a, np.nan, 0)", None, 2), # 2d input array - ("rand(10, 10)", "(a)", "(a, 2)", "(a, 2)", "(a, np.nan, 0)", 6), - ("rand(100, 100)", "(a)", "(a, 20)", "(a, 20)", "(a, np.nan, 0)", 3), - ("rand(1000, 1000)", "(a)", "(a, 200)", "(a, 200)", "(a, np.nan, 0)", 2), - ("rand(10, 10)", "(a, 1)", None, None, None, 6), - ("rand(100, 100)", "(a, 1)", None, None, None, 3), - ("rand(1000, 1000)", "(a, 1)", None, None, None, 2), - ("rand(100000, 2)", "(a, 1)", "(a, 1)", "(a, 1)", None, 2), - ("rand(10, 10)", "(a, 0)", None, None, None, 6), - ("rand(100, 100)", "(a, 0)", "(a, 20, axis=0)", None, None, 3), - ("rand(1000, 1000)", "(a, 0)", "(a, 200, axis=0)", None, None, 2), + ("rand(10, 10)", "(a)", "(a, 2)", "(a, 2)", "(a, np.nan, 0)", "(a, 2, q=0.25)", 6), + ("rand(100, 100)", "(a)", "(a, 20)", "(a, 20)", "(a, np.nan, 0)", "(a, 20, q=0.25)", 3), + ("rand(1000, 1000)", "(a)", "(a, 200)", "(a, 200)", "(a, np.nan, 0)", None ,2), + ("rand(10, 10)", "(a, 1)", None, None, None, None, 6), + ("rand(100, 100)", "(a, 1)", None, None, None, None, 3), + ("rand(1000, 1000)", "(a, 1)", None, None, None, None, 2), + ("rand(100000, 2)", "(a, 1)", "(a, 1)", "(a, 1)", None, None, 2), + ("rand(10, 10)", "(a, 0)", None, None, None, None, 6), + ("rand(100, 100)", "(a, 0)", "(a, 20, axis=0)", None, None, None, 3), + ("rand(1000, 1000)", "(a, 0)", "(a, 200, axis=0)", None, None, None, 2), # 3d input array ( "rand(100, 100, 100)", @@ -158,6 +160,7 @@ def get_instructions(): "(a, 20, axis=0)", "(a, 20, axis=0)", None, + "(a, 20, axis=0, q=0.25)", 2, ), ( @@ -166,6 +169,7 @@ def get_instructions(): "(a, 20, axis=1)", "(a, 20, axis=1)", None, + "(a, 20, axis=1, q=0.25)", 2, ), ( @@ -174,10 +178,11 @@ def get_instructions(): "(a, 20, axis=2)", "(a, 20, axis=2)", "(a, np.nan, 0)", + "(a, 20, axis=2, q=0.25)", 2, ), # 0d input array - ("array(1.0)", "(a)", None, None, "(a, 0, 2)", 10), + ("array(1.0)", "(a)", None, None, "(a, 0, 2)", None, 10), ] return instructions diff --git a/bottleneck/slow/move.py b/bottleneck/slow/move.py index cd2ccfabab..25130b95bb 100644 --- a/bottleneck/slow/move.py +++ b/bottleneck/slow/move.py @@ -107,15 +107,7 @@ def move_median(a, window, min_count=None, axis=-1, **kwargs): "Slow move_median for unaccelerated dtype" return move_func(np.nanmedian, a, window, min_count, axis=axis) -def move_quantile(a, window, min_count=None, axis=-1, q=0.5): - "Slow move_quantile for unaccelerated dtype" - return move_func(np_nanquantile_infs, a, window, min_count, axis=axis, q=q) - -def move_rank(a, window, min_count=None, axis=-1): - "Slow move_rank for unaccelerated dtype" - return move_func(lastrank, a, window, min_count, axis=axis) -# function for handling infs in np.nanquantile # keyword argument for interpolation method in np.nanquantile was changed in 1.22.0 from packaging import version if version.parse(np.__version__) > version.parse("1.22.0"): @@ -123,20 +115,30 @@ def move_rank(a, window, min_count=None, axis=-1): else: METHOD_KEYWORD = "interpolation" -def np_nanquantile_infs(a, **kwargs): +def move_quantile(a, window, min_count=None, axis=-1, q=0.5, **kwargs): + "Slow move_quantile for unaccelerated dtype" with warnings.catch_warnings(): warnings.simplefilter("ignore") if not np.isinf(a).any(): kwargs[METHOD_KEYWORD] = 'midpoint' - return np.nanquantile(a, **kwargs) + return move_func(np.nanquantile, a, window, min_count, axis=axis, q=q, **kwargs) else: - kwargs[METHOD_KEYWORD] = 'lower' - lower_nanquantile = np.nanquantile(a, **kwargs) - kwargs[METHOD_KEYWORD] = 'higher' - higher_nanquantile = np.nanquantile(a, **kwargs) - - midpoint_nanquantile = (lower_nanquantile + higher_nanquantile) / 2 - return midpoint_nanquantile + return move_func(np_nanquantile_infs, a, window, min_count, axis=axis, q=q, **kwargs) + +def move_rank(a, window, min_count=None, axis=-1): + "Slow move_rank for unaccelerated dtype" + return move_func(lastrank, a, window, min_count, axis=axis) + + +# function for handling infs in np.nanquantile +def np_nanquantile_infs(a, **kwargs): + kwargs[METHOD_KEYWORD] = 'lower' + lower_nanquantile = np.nanquantile(a, **kwargs) + kwargs[METHOD_KEYWORD] = 'higher' + higher_nanquantile = np.nanquantile(a, **kwargs) + + midpoint_nanquantile = (lower_nanquantile + higher_nanquantile) / 2 + return midpoint_nanquantile # magic utility functions --------------------------------------------------- diff --git a/bottleneck/src/move_median/move_median_debug.c b/bottleneck/src/move_median/move_median_debug.c index ce8791eff8..13871d2e6e 100644 --- a/bottleneck/src/move_median/move_median_debug.c +++ b/bottleneck/src/move_median/move_median_debug.c @@ -1,6 +1,6 @@ #include "move_median.h" -ai_t *mm_move_median(ai_t *a, idx_t length, idx_t window, idx_t min_count, double quantile); +ai_t *mm_move_median(ai_t *a, idx_t length, idx_t window, idx_t min_count); int mm_assert_equal(ai_t *actual, ai_t *desired, ai_t *input, idx_t length, char *err_msg); int mm_unit_test(void); @@ -19,7 +19,7 @@ int main(void) { /* moving window median of 1d arrays returns output array */ -ai_t *mm_move_median(ai_t *a, idx_t length, idx_t window, idx_t min_count, double quantile) { +ai_t *mm_move_median(ai_t *a, idx_t length, idx_t window, idx_t min_count) { mm_handle *mm; ai_t *out; idx_t i; @@ -84,14 +84,13 @@ int mm_unit_test(void) { int length; char *err_msg; int failed; - double quantile = 0.5; length = sizeof(arr_input) / sizeof(*arr_input); err_msg = malloc(1024 * sizeof *err_msg); sprintf(err_msg, "move_median failed with window=%d, min_count=%d", window, min_count); - actual = mm_move_median(arr_input, length, window, min_count, quantile); + actual = mm_move_median(arr_input, length, window, min_count); failed = mm_assert_equal(actual, desired, arr_input, length, err_msg); free(actual); From 7f1c3af476fd1eae7af143eae2a40f1c2ad47aa2 Mon Sep 17 00:00:00 2001 From: riazanov Date: Mon, 26 Sep 2022 01:07:58 -0400 Subject: [PATCH 25/30] Change `packaging` module to `pkg_resources` for versions comparison --- bottleneck/slow/move.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bottleneck/slow/move.py b/bottleneck/slow/move.py index 25130b95bb..ff5cd153a2 100644 --- a/bottleneck/slow/move.py +++ b/bottleneck/slow/move.py @@ -109,8 +109,8 @@ def move_median(a, window, min_count=None, axis=-1, **kwargs): # keyword argument for interpolation method in np.nanquantile was changed in 1.22.0 -from packaging import version -if version.parse(np.__version__) > version.parse("1.22.0"): +import pkg_resources +if pkg_resources.parse_version(np.__version__) >= pkg_resources.parse_version("1.22.0"): METHOD_KEYWORD = "method" else: METHOD_KEYWORD = "interpolation" From 4dadfe4533d4dc0cd579a7c4515bd5cb919616e2 Mon Sep 17 00:00:00 2001 From: andrii-riazanov <59578540+andrii-riazanov@users.noreply.github.com> Date: Tue, 27 Sep 2022 01:36:30 -0400 Subject: [PATCH 26/30] Update move_quantile benches in asv with q=0.25 --- asv_bench/benchmarks/move.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/move.py b/asv_bench/benchmarks/move.py index 5b229c46fb..47056d9bac 100644 --- a/asv_bench/benchmarks/move.py +++ b/asv_bench/benchmarks/move.py @@ -41,7 +41,7 @@ def time_move_median(self, dtype, shape, window): bn.move_median(self.arr, window) def time_move_quantile(self, dtype, shape, window): - bn.move_quantile(self.arr, window) + bn.move_quantile(self.arr, window, q=0.25) def time_move_rank(self, dtype, shape, window): bn.move_rank(self.arr, window) @@ -88,7 +88,7 @@ def time_move_median(self, dtype, shape, order, axis, window): bn.move_median(self.arr, window, axis=axis) def time_move_quantile(self, dtype, shape, order, axis, window): - bn.move_quantile(self.arr, window, axis=axis) + bn.move_quantile(self.arr, window, axis=axis, q=0.25) def time_move_rank(self, dtype, shape, order, axis, window): bn.move_rank(self.arr, window, axis=axis) From 9012e24a4bd28ee8e3e13706bc10245785c09ee7 Mon Sep 17 00:00:00 2001 From: andrii-riazanov Date: Tue, 27 Sep 2022 21:15:27 -0400 Subject: [PATCH 27/30] Make mm_handle and mq_handle the same This eliminate the need for macro in move_median.c mm_handle will just have an unused membet "quanitle" for the case of move_median. --- bottleneck/src/move_median/move_median.c | 330 +++++++++++------------ bottleneck/src/move_median/move_median.h | 36 +-- bottleneck/src/move_template.c | 14 +- bottleneck/tests/move_test.py | 7 +- 4 files changed, 172 insertions(+), 215 deletions(-) diff --git a/bottleneck/src/move_median/move_median.c b/bottleneck/src/move_median/move_median.c index 7b68fad250..5bd4380714 100644 --- a/bottleneck/src/move_median/move_median.c +++ b/bottleneck/src/move_median/move_median.c @@ -18,6 +18,7 @@ node1->idx = idx2; \ node2->idx = idx1; \ idx1 = idx2 + /* ----------------------------------------------------------------------------- Prototypes @@ -26,13 +27,11 @@ idx1 = idx2 /* helper functions */ static inline ai_t mm_get_median(mm_handle *mm); -static inline ai_t mq_get_quantile(mq_handle *mq); -static inline idx_t mq_k_stat(mq_handle *mq, idx_t idx); -static inline short mq_stat_exact(mq_handle *mq, idx_t idx); -static inline void heapify_small_node_mm(mm_handle *mm, idx_t idx); -static inline void heapify_large_node_mm(mm_handle *mm, idx_t idx); -static inline void heapify_small_node_mq(mq_handle *mq, idx_t idx); -static inline void heapify_large_node_mq(mq_handle *mq, idx_t idx); +static inline ai_t mq_get_quantile(mm_handle *mq); +static inline idx_t mq_k_stat(mm_handle *mq, idx_t idx); +static inline short mq_stat_exact(mm_handle *mq, idx_t idx); +static inline void heapify_small_node(mm_handle *mm, idx_t idx); +static inline void heapify_large_node(mm_handle *mm, idx_t idx); static inline idx_t mm_get_smallest_child(mm_node **heap, idx_t window, idx_t idx, mm_node **child); static inline idx_t mm_get_largest_child(mm_node **heap, idx_t window, @@ -80,9 +79,9 @@ mm_new(const idx_t window, idx_t min_count) { } /* Same as mm_new, but the heaps have different sizes */ -mq_handle * +mm_handle * mq_new(const idx_t window, idx_t min_count, double quantile) { - mq_handle *mq = malloc(sizeof(mq_handle)); + mm_handle *mq = malloc(sizeof(mm_handle)); mq->nodes = malloc(window * sizeof(mm_node*)); mq->node_data = malloc(window * sizeof(mm_node)); @@ -96,7 +95,7 @@ mq_new(const idx_t window, idx_t min_count, double quantile) { mq->odd = window % 2; mq->min_count = min_count; - mq_reset(mq); + mm_reset(mq); return mq; } @@ -132,7 +131,7 @@ mm_update_init(mm_handle *mm, ai_t ai) { node->idx = n_l; ++mm->n_l; mm->l_first_leaf = FIRST_LEAF(mm->n_l); - heapify_large_node_mm(mm, n_l); + heapify_large_node(mm, n_l); } else { /* add new node to small heap */ mm->s_heap[n_s] = node; @@ -140,7 +139,7 @@ mm_update_init(mm_handle *mm, ai_t ai) { node->idx = n_s; ++mm->n_s; mm->s_first_leaf = FIRST_LEAF(mm->n_s); - heapify_small_node_mm(mm, n_s); + heapify_small_node(mm, n_s); } } @@ -151,7 +150,7 @@ mm_update_init(mm_handle *mm, ai_t ai) { /* Same as mm_update_init, but returns the current quantile. */ ai_t -mq_update_init(mq_handle *mq, ai_t ai) { +mq_update_init(mm_handle *mq, ai_t ai) { mm_node *node; idx_t n_s = mq->n_s; idx_t n_l = mq->n_l; @@ -169,10 +168,8 @@ mq_update_init(mq_handle *mq, ai_t ai) { mq->s_first_leaf = 0; } else { /* at least one node already exists in the heaps */ - - idx_t k_stat = mq_k_stat(mq, n_s + n_l + 1); - mq->newest->next = node; + idx_t k_stat = mq_k_stat(mq, n_s + n_l + 1); if (n_s >= k_stat) { /* add new node to large heap */ mq->l_heap[n_l] = node; @@ -180,7 +177,7 @@ mq_update_init(mq_handle *mq, ai_t ai) { node->idx = n_l; ++mq->n_l; mq->l_first_leaf = FIRST_LEAF(mq->n_l); - heapify_large_node_mq(mq, n_l); + heapify_large_node(mq, n_l); } else { /* add new node to small heap */ mq->s_heap[n_s] = node; @@ -188,7 +185,7 @@ mq_update_init(mq_handle *mq, ai_t ai) { node->idx = n_s; ++mq->n_s; mq->s_first_leaf = FIRST_LEAF(mq->n_s); - heapify_small_node_mq(mq, n_s); + heapify_small_node(mq, n_s); } } @@ -215,9 +212,9 @@ mm_update(mm_handle *mm, ai_t ai) { /* adjust position of new node in heap if needed */ if (node->region == SH) { - heapify_small_node_mm(mm, node->idx); + heapify_small_node(mm, node->idx); } else { - heapify_large_node_mm(mm, node->idx); + heapify_large_node(mm, node->idx); } /* return the median */ @@ -230,7 +227,7 @@ mm_update(mm_handle *mm, ai_t ai) { /* Same as mm_update, but returns the current quantile. */ ai_t -mq_update(mq_handle *mq, ai_t ai) { +mq_update(mm_handle *mq, ai_t ai) { /* node is oldest node with ai of newest node */ mm_node *node = mq->oldest; node->ai = ai; @@ -242,9 +239,9 @@ mq_update(mq_handle *mq, ai_t ai) { /* adjust position of new node in heap if needed */ if (node->region == SH) { - heapify_small_node_mq(mq, node->idx); + heapify_small_node(mq, node->idx); } else { - heapify_large_node_mq(mq, node->idx); + heapify_large_node(mq, node->idx); } /* return the median */ @@ -281,9 +278,9 @@ mm_new_nan(const idx_t window, idx_t min_count) { } /* Same as mm_new_nan, but the heaps have different sizes */ -mq_handle * +mm_handle * mq_new_nan(const idx_t window, idx_t min_count, double quantile) { - mq_handle *mq = malloc(sizeof(mq_handle)); + mm_handle *mq = malloc(sizeof(mm_handle)); mq->nodes = malloc(2 * window * sizeof(mm_node*)); mq->node_data = malloc(window * sizeof(mm_node)); @@ -297,7 +294,7 @@ mq_new_nan(const idx_t window, idx_t min_count, double quantile) { mq->window = window; mq->min_count = min_count; - mq_reset(mq); + mm_reset(mq); return mq; } @@ -352,7 +349,7 @@ mm_update_init_nan(mm_handle *mm, ai_t ai) { node->idx = n_l; ++mm->n_l; mm->l_first_leaf = FIRST_LEAF(mm->n_l); - heapify_large_node_mm(mm, n_l); + heapify_large_node(mm, n_l); } else { /* add new node to small heap */ mm->s_heap[n_s] = node; @@ -360,7 +357,7 @@ mm_update_init_nan(mm_handle *mm, ai_t ai) { node->idx = n_s; ++mm->n_s; mm->s_first_leaf = FIRST_LEAF(mm->n_s); - heapify_small_node_mm(mm, n_s); + heapify_small_node(mm, n_s); } } } @@ -371,7 +368,7 @@ mm_update_init_nan(mm_handle *mm, ai_t ai) { /* Same as mm_update_init_nan, but returns the current quantile. */ ai_t -mq_update_init_nan(mq_handle *mq, ai_t ai) { +mq_update_init_nan(mm_handle *mq, ai_t ai) { mm_node *node; idx_t n_s = mq->n_s; idx_t n_l = mq->n_l; @@ -418,7 +415,7 @@ mq_update_init_nan(mq_handle *mq, ai_t ai) { node->idx = n_l; ++mq->n_l; mq->l_first_leaf = FIRST_LEAF(mq->n_l); - heapify_large_node_mq(mq, n_l); + heapify_large_node(mq, n_l); } else { /* add new node to small heap */ mq->s_heap[n_s] = node; @@ -426,7 +423,7 @@ mq_update_init_nan(mq_handle *mq, ai_t ai) { node->idx = n_s; ++mq->n_s; mq->s_first_leaf = FIRST_LEAF(mq->n_s); - heapify_small_node_mq(mq, n_s); + heapify_small_node(mq, n_s); } } } @@ -501,13 +498,13 @@ mm_update_nan(mm_handle *mm, ai_t ai) { } else { mm->l_first_leaf = FIRST_LEAF(mm->n_l); } - heapify_large_node_mm(mm, 0); + heapify_large_node(mm, 0); } } else { if (idx != n_s - 1) { s_heap[idx] = s_heap[n_s - 1]; s_heap[idx]->idx = idx; - heapify_small_node_mm(mm, idx); + heapify_small_node(mm, idx); } if (mm->n_s < mm->n_l) { /* move head node from the large heap to the small heap */ @@ -517,7 +514,7 @@ mm_update_nan(mm_handle *mm, ai_t ai) { s_heap[mm->n_s] = node2; ++mm->n_s; mm->l_first_leaf = FIRST_LEAF(mm->n_s); - heapify_small_node_mm(mm, node2->idx); + heapify_small_node(mm, node2->idx); /* plug hole in large heap */ node2 = mm->l_heap[mm->n_l - 1]; @@ -529,11 +526,11 @@ mm_update_nan(mm_handle *mm, ai_t ai) { } else { mm->l_first_leaf = FIRST_LEAF(mm->n_l); } - heapify_large_node_mm(mm, 0); + heapify_large_node(mm, 0); } else { mm->s_first_leaf = FIRST_LEAF(mm->n_s); - heapify_small_node_mm(mm, idx); + heapify_small_node(mm, idx); } } } else if (node->region == LH) { @@ -552,7 +549,7 @@ mm_update_nan(mm_handle *mm, ai_t ai) { if (idx != n_l - 1) { l_heap[idx] = l_heap[n_l - 1]; l_heap[idx]->idx = idx; - heapify_large_node_mm(mm, idx); + heapify_large_node(mm, idx); } --mm->n_l; if (mm->n_l == 0) { @@ -568,7 +565,7 @@ mm_update_nan(mm_handle *mm, ai_t ai) { l_heap[mm->n_l] = node2; ++mm->n_l; mm->l_first_leaf = FIRST_LEAF(mm->n_l); - heapify_large_node_mm(mm, node2->idx); + heapify_large_node(mm, node2->idx); /* plug hole in small heap */ if (n_s != 1) { @@ -582,19 +579,19 @@ mm_update_nan(mm_handle *mm, ai_t ai) { } else { mm->s_first_leaf = FIRST_LEAF(mm->n_s); } - heapify_small_node_mm(mm, 0); + heapify_small_node(mm, 0); } /* reorder large heap if needed */ - heapify_large_node_mm(mm, idx); + heapify_large_node(mm, idx); } else if (node->region == NA) { /* insert node into nan heap */ n_array[idx] = node; } } else { if (node->region == SH) { - heapify_small_node_mm(mm, idx); + heapify_small_node(mm, idx); } else if (node->region == LH) { - heapify_large_node_mm(mm, idx); + heapify_large_node(mm, idx); } else { /* ai is not NaN but oldest node is in nan array */ if (n_s > n_l) { @@ -604,7 +601,7 @@ mm_update_nan(mm_handle *mm, ai_t ai) { l_heap[n_l] = node; ++mm->n_l; mm->l_first_leaf = FIRST_LEAF(mm->n_l); - heapify_large_node_mm(mm, n_l); + heapify_large_node(mm, n_l); } else { /* insert into small heap */ node->region = SH; @@ -612,7 +609,7 @@ mm_update_nan(mm_handle *mm, ai_t ai) { s_heap[n_s] = node; ++mm->n_s; mm->s_first_leaf = FIRST_LEAF(mm->n_s); - heapify_small_node_mm(mm, n_s); + heapify_small_node(mm, n_s); } /* plug nan array hole */ if (idx != n_n - 1) { @@ -627,7 +624,7 @@ mm_update_nan(mm_handle *mm, ai_t ai) { /* Same as mm_update_nan, but returns the current quantile. */ ai_t -mq_update_nan(mq_handle *mq, ai_t ai) { +mq_update_nan(mm_handle *mq, ai_t ai) { idx_t n_s, n_l, n_n; mm_node **l_heap; @@ -691,13 +688,13 @@ mq_update_nan(mq_handle *mq, ai_t ai) { } else { mq->l_first_leaf = FIRST_LEAF(mq->n_l); } - heapify_large_node_mq(mq, 0); + heapify_large_node(mq, 0); } } else { if (idx != n_s - 1) { s_heap[idx] = s_heap[n_s - 1]; s_heap[idx]->idx = idx; - heapify_small_node_mq(mq, idx); + heapify_small_node(mq, idx); } if (mq->n_s < k_stat) { /* move head node from the large heap to the small heap */ @@ -707,7 +704,7 @@ mq_update_nan(mq_handle *mq, ai_t ai) { s_heap[mq->n_s] = node2; ++mq->n_s; mq->l_first_leaf = FIRST_LEAF(mq->n_s); - heapify_small_node_mq(mq, node2->idx); + heapify_small_node(mq, node2->idx); /* plug hole in large heap */ node2 = mq->l_heap[mq->n_l - 1]; @@ -719,11 +716,11 @@ mq_update_nan(mq_handle *mq, ai_t ai) { } else { mq->l_first_leaf = FIRST_LEAF(mq->n_l); } - heapify_large_node_mq(mq, 0); + heapify_large_node(mq, 0); } else { mq->s_first_leaf = FIRST_LEAF(mq->n_s); - heapify_small_node_mq(mq, idx); + heapify_small_node(mq, idx); } } } else if (node->region == LH) { @@ -743,7 +740,7 @@ mq_update_nan(mq_handle *mq, ai_t ai) { if (idx != n_l - 1) { l_heap[idx] = l_heap[n_l - 1]; l_heap[idx]->idx = idx; - heapify_large_node_mq(mq, idx); + heapify_large_node(mq, idx); } --mq->n_l; if (mq->n_l == 0) { @@ -759,7 +756,7 @@ mq_update_nan(mq_handle *mq, ai_t ai) { l_heap[mq->n_l] = node2; ++mq->n_l; mq->l_first_leaf = FIRST_LEAF(mq->n_l); - heapify_large_node_mq(mq, node2->idx); + heapify_large_node(mq, node2->idx); /* plug hole in small heap */ if (n_s != 1) { @@ -773,19 +770,19 @@ mq_update_nan(mq_handle *mq, ai_t ai) { } else { mq->s_first_leaf = FIRST_LEAF(mq->n_s); } - heapify_small_node_mq(mq, 0); + heapify_small_node(mq, 0); } /* reorder large heap if needed */ - heapify_large_node_mq(mq, idx); + heapify_large_node(mq, idx); } else if (node->region == NA) { /* insert node into nan heap */ n_array[idx] = node; } } else { if (node->region == SH) { - heapify_small_node_mq(mq, idx); + heapify_small_node(mq, idx); } else if (node->region == LH) { - heapify_large_node_mq(mq, idx); + heapify_large_node(mq, idx); } else { /* ai is not NaN but oldest node is in nan array */ k_stat = mq_k_stat(mq, n_s + n_l + 1); @@ -797,7 +794,7 @@ mq_update_nan(mq_handle *mq, ai_t ai) { l_heap[n_l] = node; ++mq->n_l; mq->l_first_leaf = FIRST_LEAF(mq->n_l); - heapify_large_node_mq(mq, n_l); + heapify_large_node(mq, n_l); } else { /* insert into small heap */ node->region = SH; @@ -805,7 +802,7 @@ mq_update_nan(mq_handle *mq, ai_t ai) { s_heap[n_s] = node; ++mq->n_s; mq->s_first_leaf = FIRST_LEAF(mq->n_s); - heapify_small_node_mq(mq, n_s); + heapify_small_node(mq, n_s); } /* plug nan array hole */ if (idx != n_n - 1) { @@ -837,15 +834,6 @@ mm_reset(mm_handle *mm) { mm->l_first_leaf = 0; } -void -mq_reset(mq_handle *mq) { - mq->n_l = 0; - mq->n_s = 0; - mq->n_n = 0; - mq->s_first_leaf = 0; - mq->l_first_leaf = 0; -} - /* After bn.move_median is done, free the memory */ void @@ -855,12 +843,6 @@ mm_free(mm_handle *mm) { free(mm); } -void -mq_free(mq_handle *mq) { - free(mq->node_data); - free(mq->nodes); - free(mq); -} /* ----------------------------------------------------------------------------- @@ -881,19 +863,19 @@ mm_get_median(mm_handle *mm) { /* function to find the index of element correspodning to the current quantile */ -static inline idx_t mq_k_stat(mq_handle *mq, idx_t idx) { +static inline idx_t mq_k_stat(mm_handle *mq, idx_t idx) { return (idx_t) floor((idx - 1) * mq->quantile) + 1; } /* function to check if the current index of the quantile is integer. If so, * then the quantile is the element at the top of the heap. Otherwise take a midpoint */ -static inline short mq_stat_exact(mq_handle *mq, idx_t idx) { +static inline short mq_stat_exact(mm_handle *mq, idx_t idx) { return ((idx - 1) * mq->quantile) == floor((idx - 1) * mq->quantile); } /* Return the current quantile */ static inline ai_t -mq_get_quantile(mq_handle *mq) { +mq_get_quantile(mm_handle *mq) { idx_t n_total = mq->n_l + mq->n_s; if (n_total < mq->min_count) return MM_NAN(); @@ -902,107 +884,101 @@ mq_get_quantile(mq_handle *mq) { return (mq->s_heap[0]->ai + mq->l_heap[0]->ai) / 2.0; } -/* mtype is mm (move_median) or mq (move_quantile) */ -#define HEAPIFY_SMALL_NODE(mtype) \ - static inline void \ - heapify_small_node_##mtype(mtype##_handle *mtype, idx_t idx) { \ - idx_t idx2; \ - mm_node *node; \ - mm_node *node2; \ - mm_node **s_heap; \ - mm_node **l_heap; \ - idx_t n_s, n_l; \ - ai_t ai; \ - \ - s_heap = mtype->s_heap; \ - l_heap = mtype->l_heap; \ - node = s_heap[idx]; \ - n_s = mtype->n_s; \ - n_l = mtype->n_l; \ - ai = node->ai; \ - \ - /* Internal or leaf node */ \ - if (idx > 0) { \ - idx2 = P_IDX(idx); \ - node2 = s_heap[idx2]; \ - \ - /* Move up */ \ - if (ai > node2->ai) { \ - mm_move_up_small(s_heap, idx, node, idx2, node2); \ - \ - /* Maybe swap between heaps */ \ - node2 = l_heap[0]; \ - if (ai > node2->ai) { \ - mm_swap_heap_heads(s_heap, n_s, l_heap, n_l, node, node2); \ - } \ - } else if (idx < mtype->s_first_leaf) { \ - /* Move down */ \ - mm_move_down_small(s_heap, n_s, idx, node); \ - } \ - } else { \ - /* Head node */ \ - node2 = l_heap[0]; \ - if (n_l > 0 && ai > node2->ai) { \ - mm_swap_heap_heads(s_heap, n_s, l_heap, n_l, node, node2); \ - } else { \ - mm_move_down_small(s_heap, n_s, idx, node); \ - } \ - } \ - } \ - -HEAPIFY_SMALL_NODE(mm) -HEAPIFY_SMALL_NODE(mq) - -/* mtype is mm (move_median) or mq (move_quantile) */ -#define HEAPIFY_LARGE_NODE(mtype) \ - static inline void \ - heapify_large_node_##mtype(mtype##_handle *mtype, idx_t idx) { \ - idx_t idx2; \ - mm_node *node; \ - mm_node *node2; \ - mm_node **s_heap; \ - mm_node **l_heap; \ - idx_t n_s, n_l; \ - ai_t ai; \ - \ - s_heap = mtype->s_heap; \ - l_heap = mtype->l_heap; \ - node = l_heap[idx]; \ - n_s = mtype->n_s; \ - n_l = mtype->n_l; \ - ai = node->ai; \ - \ - /* Internal or leaf node */ \ - if (idx > 0) { \ - idx2 = P_IDX(idx); \ - node2 = l_heap[idx2]; \ - \ - /* Move down */ \ - if (ai < node2->ai) { \ - mm_move_down_large(l_heap, idx, node, idx2, node2); \ - \ - /* Maybe swap between heaps */ \ - node2 = s_heap[0]; \ - if (ai < node2->ai) { \ - mm_swap_heap_heads(s_heap, n_s, l_heap, n_l, node2, node); \ - } \ - } else if (idx < mtype->l_first_leaf) { \ - /* Move up */ \ - mm_move_up_large(l_heap, n_l, idx, node); \ - } \ - } else { \ - /* Head node */ \ - node2 = s_heap[0]; \ - if (n_s > 0 && ai < node2->ai) { \ - mm_swap_heap_heads(s_heap, n_s, l_heap, n_l, node2, node); \ - } else { \ - mm_move_up_large(l_heap, n_l, idx, node); \ - } \ - } \ - } \ - -HEAPIFY_LARGE_NODE(mm) -HEAPIFY_LARGE_NODE(mq) +/*heapify_small/large_node is same for mm and mq */ +static inline void +heapify_small_node(mm_handle *mm, idx_t idx) { + idx_t idx2; + mm_node *node; + mm_node *node2; + mm_node **s_heap; + mm_node **l_heap; + idx_t n_s, n_l; + ai_t ai; + + s_heap = mm->s_heap; + l_heap = mm->l_heap; + node = s_heap[idx]; + n_s = mm->n_s; + n_l = mm->n_l; + ai = node->ai; + + /* Internal or leaf node */ + if (idx > 0) { + idx2 = P_IDX(idx); + node2 = s_heap[idx2]; + + /* Move up */ + if (ai > node2->ai) { + mm_move_up_small(s_heap, idx, node, idx2, node2); + + /* Maybe swap between heaps */ + node2 = l_heap[0]; + if (ai > node2->ai) { + mm_swap_heap_heads(s_heap, n_s, l_heap, n_l, node, node2); + } + } else if (idx < mm->s_first_leaf) { + /* Move down */ + mm_move_down_small(s_heap, n_s, idx, node); + } + } else { + /* Head node */ + node2 = l_heap[0]; + if (n_l > 0 && ai > node2->ai) { + mm_swap_heap_heads(s_heap, n_s, l_heap, n_l, node, node2); + } else { + mm_move_down_small(s_heap, n_s, idx, node); + } + } +} + + +static inline void +heapify_large_node(mm_handle *mm, idx_t idx) { + idx_t idx2; + mm_node *node; + mm_node *node2; + mm_node **s_heap; + mm_node **l_heap; + idx_t n_s, n_l; + ai_t ai; + + s_heap = mm->s_heap; + l_heap = mm->l_heap; + node = l_heap[idx]; + n_s = mm->n_s; + n_l = mm->n_l; + ai = node->ai; + + /* Internal or leaf node */ + if (idx > 0) { + idx2 = P_IDX(idx); + node2 = l_heap[idx2]; + + /* Move down */ + if (ai < node2->ai) { + mm_move_down_large(l_heap, idx, node, idx2, node2); + + /* Maybe swap between heaps */ + node2 = s_heap[0]; + if (ai < node2->ai) { + mm_swap_heap_heads(s_heap, n_s, l_heap, n_l, node2, node); + } + } else if (idx < mm->l_first_leaf) { + /* Move up */ + mm_move_up_large(l_heap, n_l, idx, node); + } + } else { + /* Head node */ + node2 = s_heap[0]; + if (n_s > 0 && ai < node2->ai) { + mm_swap_heap_heads(s_heap, n_s, l_heap, n_l, node2, node); + } else { + mm_move_up_large(l_heap, n_l, idx, node); + } + } + +} + /* Return the index of the smallest child of the node. The pointer * child will also be set. */ diff --git a/bottleneck/src/move_median/move_median.h b/bottleneck/src/move_median/move_median.h index 68e612f435..9ed271349b 100644 --- a/bottleneck/src/move_median/move_median.h +++ b/bottleneck/src/move_median/move_median.h @@ -51,53 +51,33 @@ struct _mm_handle { mm_node *newest; /* The newest node (most recent insert) */ idx_t s_first_leaf; /* All nodes this index or greater are leaf nodes */ idx_t l_first_leaf; /* All nodes this index or greater are leaf nodes */ -}; -typedef struct _mm_handle mm_handle; - -struct _mq_handle { - idx_t window; /* window size */ - int odd; /* is window even (0) or odd (1) */ - idx_t min_count; /* Same meaning as in bn.move_quantile */ - idx_t n_s; /* Number of nodes in the small heap */ - idx_t n_l; /* Number of nodes in the large heap */ - idx_t n_n; /* Number of nodes in the nan array */ - mm_node **s_heap; /* The max heap of small ai */ - mm_node **l_heap; /* The min heap of large ai */ - mm_node **n_array; /* The nan array */ - mm_node **nodes; /* s_heap and l_heap point into this array */ - mm_node *node_data; /* Pointer to memory location where nodes live */ - mm_node *oldest; /* The oldest node */ - mm_node *newest; /* The newest node (most recent insert) */ - idx_t s_first_leaf; /* All nodes this index or greater are leaf nodes */ - idx_t l_first_leaf; /* All nodes this index or greater are leaf nodes */ double quantile; /* Value between 0 <= quantile <= 1, the quantile to compute */ }; -typedef struct _mq_handle mq_handle; +typedef struct _mm_handle mm_handle; /* non-nan functions */ mm_handle *mm_new(const idx_t window, idx_t min_count); ai_t mm_update_init(mm_handle *mm, ai_t ai); ai_t mm_update(mm_handle *mm, ai_t ai); -mq_handle *mq_new(const idx_t window, idx_t min_count, double quantile); -ai_t mq_update_init(mq_handle *mq, ai_t ai); -ai_t mq_update(mq_handle *mq, ai_t ai); +mm_handle *mq_new(const idx_t window, idx_t min_count, double quantile); +ai_t mq_update_init(mm_handle *mq, ai_t ai); +ai_t mq_update(mm_handle *mq, ai_t ai); /* nan functions */ mm_handle *mm_new_nan(const idx_t window, idx_t min_count); ai_t mm_update_init_nan(mm_handle *mm, ai_t ai); ai_t mm_update_nan(mm_handle *mm, ai_t ai); -mq_handle *mq_new_nan(const idx_t window, idx_t min_count, double quantile); -ai_t mq_update_init_nan(mq_handle *mq, ai_t ai); -ai_t mq_update_nan(mq_handle *mq, ai_t ai); +mm_handle *mq_new_nan(const idx_t window, idx_t min_count, double quantile); +ai_t mq_update_init_nan(mm_handle *mq, ai_t ai); +ai_t mq_update_nan(mm_handle *mq, ai_t ai); /* functions common to non-nan and nan cases */ +/* same for mm and mq */ void mm_reset(mm_handle *mm); void mm_free(mm_handle *mm); -void mq_reset(mq_handle *mq); -void mq_free(mq_handle *mq); /* Copied from Cython ---------------------------------------------------- */ diff --git a/bottleneck/src/move_template.c b/bottleneck/src/move_template.c index a3aaef13d9..2f1d933b36 100644 --- a/bottleneck/src/move_template.c +++ b/bottleneck/src/move_template.c @@ -635,10 +635,10 @@ MOVE_MAIN(move_median, 0, 0) /* dtype = [['float64'], ['float32']] */ MOVE(move_quantile, DTYPE0) { npy_DTYPE0 ai; - mq_handle *mq = mq_new_nan(window, min_count, quantile); + mm_handle *mq = mq_new_nan(window, min_count, quantile); INIT(NPY_DTYPE0) if (window == 1) { - mq_free(mq); + mm_free(mq); return PyArray_Copy(a); } if (mq == NULL) { @@ -658,10 +658,10 @@ MOVE(move_quantile, DTYPE0) { ai = AI(DTYPE0); YI(DTYPE0) = mq_update_nan(mq, ai); } - mq_reset(mq); + mm_reset(mq); NEXT2 } - mq_free(mq); + mm_free(mq); BN_END_ALLOW_THREADS return y; } @@ -670,7 +670,7 @@ MOVE(move_quantile, DTYPE0) { /* dtype = [['int64', 'float64'], ['int32', 'float64']] */ MOVE(move_quantile, DTYPE0) { npy_DTYPE0 ai; - mq_handle *mq = mq_new(window, min_count, quantile); + mm_handle *mq = mq_new(window, min_count, quantile); INIT(NPY_DTYPE1) if (window == 1) { return PyArray_CastToType(a, @@ -694,10 +694,10 @@ MOVE(move_quantile, DTYPE0) { ai = AI(DTYPE0); YI(DTYPE1) = mq_update(mq, ai); } - mq_reset(mq); + mm_reset(mq); NEXT2 } - mq_free(mq); + mm_free(mq); BN_END_ALLOW_THREADS return y; } diff --git a/bottleneck/tests/move_test.py b/bottleneck/tests/move_test.py index 01d3d6e841..7b7ae3d5fa 100644 --- a/bottleneck/tests/move_test.py +++ b/bottleneck/tests/move_test.py @@ -356,7 +356,8 @@ def test_move_quantile_with_infs_and_nans(): rs = np.random.RandomState([1, 2, 3]) fracs = [0., 0.2, 0.4, 0.6, 0.8, 1.] inf_minf_nan_fracs = [triple for triple in itertools.product(fracs, fracs, fracs) if np.sum(triple) <= 1] - for size in [1, 2, 3, 5, 9, 10, 17, 20, 31]: + # for size in [1, 2, 3, 5, 9, 10, 17, 20, 31]: + for size in [1, 2, 3, 5, 9, 10]: for min_count in [1, 2, 3, size//2, size - 1, size]: if min_count < 1 or min_count > size: continue @@ -365,8 +366,8 @@ def test_move_quantile_with_infs_and_nans(): for _ in range(REPEAT_FULL_QUANTILE): for q in [0., 1., 0.25, 0.75, rs.rand()]: arrays = [np.arange(size, dtype=np.float64), - (rs.random(size) - 0.5) * rs.randint(0, 100_000), - (rs.random(size) - 0.5) / rs.randint(0, 100_000)] + (rs.random(size) - 0.5) * rs.randint(1, 100_000), + (rs.random(size) - 0.5) / rs.randint(1, 100_000)] for a in arrays: a = np.arange(size, dtype=np.float64) randoms = rs.rand(*a.shape) From 72677f8f2ba96090a51cccecc66a6b844945a3e0 Mon Sep 17 00:00:00 2001 From: andrii-riazanov Date: Wed, 28 Sep 2022 00:19:39 -0400 Subject: [PATCH 28/30] Median and quantile with function pointers move_median and move_quantile now have all the same functions except for the construction of mm/mq. --- bottleneck/src/move_median/move_median.c | 431 +++++------------------ bottleneck/src/move_median/move_median.h | 27 +- bottleneck/src/move_template.c | 16 +- 3 files changed, 103 insertions(+), 371 deletions(-) diff --git a/bottleneck/src/move_median/move_median.c b/bottleneck/src/move_median/move_median.c index 5bd4380714..7c49ac72e4 100644 --- a/bottleneck/src/move_median/move_median.c +++ b/bottleneck/src/move_median/move_median.c @@ -27,8 +27,11 @@ idx1 = idx2 /* helper functions */ static inline ai_t mm_get_median(mm_handle *mm); +static inline ai_t mm_get_median_no_nans_full_window_odd(mm_handle *mm); +static inline ai_t mm_get_median_no_nans_full_window_even(mm_handle *mm); +static inline idx_t mm_get_nl(mm_handle *mm, idx_t idx, short adjustment); static inline ai_t mq_get_quantile(mm_handle *mq); -static inline idx_t mq_k_stat(mm_handle *mq, idx_t idx); +static inline idx_t mq_k_stat(mm_handle *mq, idx_t idx, short adjustment); static inline short mq_stat_exact(mm_handle *mq, idx_t idx); static inline void heapify_small_node(mm_handle *mm, idx_t idx); static inline void heapify_large_node(mm_handle *mm, idx_t idx); @@ -72,6 +75,8 @@ mm_new(const idx_t window, idx_t min_count) { mm->window = window; mm->odd = window % 2; mm->min_count = min_count; + mm->get_statistic = mm_get_median; + mm->get_index = mm_get_nl; mm_reset(mm); @@ -88,12 +93,14 @@ mq_new(const idx_t window, idx_t min_count, double quantile) { mq->quantile = quantile; mq->s_heap = mq->nodes; - idx_t k_stat = mq_k_stat(mq, window); + idx_t k_stat = mq_k_stat(mq, window, 0) + 1; mq->l_heap = &mq->nodes[k_stat]; mq->window = window; mq->odd = window % 2; mq->min_count = min_count; + mq->get_statistic = mq_get_quantile; + mq->get_index = mq_k_stat; mm_reset(mq); @@ -103,7 +110,8 @@ mq_new(const idx_t window, idx_t min_count, double quantile) { /* Insert a new value, ai, into one of the heaps. Use this function when * the heaps contain less than window-1 nodes. Returns the median value. - * Once there are window-1 nodes in the heap, switch to using mm_update. */ + * Once there are window-1 nodes in the heap, switch to using mm_update. + * Used by both mm and mq */ ai_t mm_update_init(mm_handle *mm, ai_t ai) { mm_node *node; @@ -124,7 +132,8 @@ mm_update_init(mm_handle *mm, ai_t ai) { } else { /* at least one node already exists in the heaps */ mm->newest->next = node; - if (n_s > n_l) { + idx_t k_stat = mm->get_index(mm, n_s + n_l + 1, 0); + if (n_s > k_stat) { /* add new node to large heap */ mm->l_heap[n_l] = node; node->region = LH; @@ -145,60 +154,27 @@ mm_update_init(mm_handle *mm, ai_t ai) { mm->newest = node; - return mm_get_median(mm); + return mm->get_statistic(mm); } -/* Same as mm_update_init, but returns the current quantile. */ -ai_t -mq_update_init(mm_handle *mq, ai_t ai) { - mm_node *node; - idx_t n_s = mq->n_s; - idx_t n_l = mq->n_l; - - node = &mq->node_data[n_s + n_l]; - node->ai = ai; - - if (n_s == 0) { - /* the first node to appear in a heap */ - mq->s_heap[0] = node; - node->region = SH; - node->idx = 0; - mq->oldest = node; /* only need to set the oldest node once */ - mq->n_s = 1; - mq->s_first_leaf = 0; +/* For non-nan move_median, as soon as heap has window nodes, switch to + * more explicit median functions, to avoid redundant calculations*/ +void +mm_update_statistic_function(mm_handle *mm) { + if (mm->odd) { + mm->get_statistic = mm_get_median_no_nans_full_window_odd; } else { - /* at least one node already exists in the heaps */ - mq->newest->next = node; - idx_t k_stat = mq_k_stat(mq, n_s + n_l + 1); - if (n_s >= k_stat) { - /* add new node to large heap */ - mq->l_heap[n_l] = node; - node->region = LH; - node->idx = n_l; - ++mq->n_l; - mq->l_first_leaf = FIRST_LEAF(mq->n_l); - heapify_large_node(mq, n_l); - } else { - /* add new node to small heap */ - mq->s_heap[n_s] = node; - node->region = SH; - node->idx = n_s; - ++mq->n_s; - mq->s_first_leaf = FIRST_LEAF(mq->n_s); - heapify_small_node(mq, n_s); - } + mm->get_statistic = mm_get_median_no_nans_full_window_even; } - - mq->newest = node; - - return mq_get_quantile(mq); + return; } /* Insert a new value, ai, into the double heap structure. Use this function * when the double heap contains at least window-1 nodes. Returns the median * value. If there are less than window-1 nodes in the heap, use - * mm_update_init. */ + * mm_update_init. + * Used by both mm and mq */ ai_t mm_update(mm_handle *mm, ai_t ai) { /* node is oldest node with ai of newest node */ @@ -218,34 +194,7 @@ mm_update(mm_handle *mm, ai_t ai) { } /* return the median */ - if (mm->odd) { - return mm->s_heap[0]->ai; - } else { - return (mm->s_heap[0]->ai + mm->l_heap[0]->ai) / 2.0; - } -} - -/* Same as mm_update, but returns the current quantile. */ -ai_t -mq_update(mm_handle *mq, ai_t ai) { - /* node is oldest node with ai of newest node */ - mm_node *node = mq->oldest; - node->ai = ai; - - /* update oldest, newest */ - mq->oldest = mq->oldest->next; - mq->newest->next = node; - mq->newest = node; - - /* adjust position of new node in heap if needed */ - if (node->region == SH) { - heapify_small_node(mq, node->idx); - } else { - heapify_large_node(mq, node->idx); - } - - /* return the median */ - return mq_get_quantile(mq); + return mm->get_statistic(mm); } @@ -271,6 +220,8 @@ mm_new_nan(const idx_t window, idx_t min_count) { mm->window = window; mm->min_count = min_count; + mm->get_statistic = mm_get_median; + mm->get_index = mm_get_nl; mm_reset(mm); @@ -287,12 +238,14 @@ mq_new_nan(const idx_t window, idx_t min_count, double quantile) { mq->quantile = quantile; mq->s_heap = mq->nodes; - idx_t k_stat = mq_k_stat(mq, window); + idx_t k_stat = mq_k_stat(mq, window, 0) + 1; mq->l_heap = &mq->nodes[k_stat]; mq->n_array = &mq->nodes[window]; mq->window = window; mq->min_count = min_count; + mq->get_statistic = mq_get_quantile; + mq->get_index = mq_k_stat; mm_reset(mq); @@ -303,7 +256,8 @@ mq_new_nan(const idx_t window, idx_t min_count, double quantile) { /* Insert a new value, ai, into one of the heaps or the nan array. Use this * function when there are less than window-1 nodes. Returns the median * value. Once there are window-1 nodes in the heap, switch to using - * mm_update_nan. */ + * mm_update_nan. + * Used by both mm and mq*/ ai_t mm_update_init_nan(mm_handle *mm, ai_t ai) { mm_node *node; @@ -342,7 +296,9 @@ mm_update_init_nan(mm_handle *mm, ai_t ai) { } else { /* at least one node already exists in the heaps */ mm->newest->next = node; - if (n_s > n_l) { + /* number of non-nan nodes including the new node is n_s + n_l + 1 */ + idx_t k_stat = mm->get_index(mm, n_s + n_l + 1, 0); + if (n_s > k_stat) { /* add new node to large heap */ mm->l_heap[n_l] = node; node->region = LH; @@ -363,79 +319,14 @@ mm_update_init_nan(mm_handle *mm, ai_t ai) { } mm->newest = node; - return mm_get_median(mm); -} - -/* Same as mm_update_init_nan, but returns the current quantile. */ -ai_t -mq_update_init_nan(mm_handle *mq, ai_t ai) { - mm_node *node; - idx_t n_s = mq->n_s; - idx_t n_l = mq->n_l; - idx_t n_n = mq->n_n; - - node = &mq->node_data[n_s + n_l + n_n]; - node->ai = ai; - - if (ai != ai) { - mq->n_array[n_n] = node; - node->region = NA; - node->idx = n_n; - if (n_s + n_l + n_n == 0) { - /* only need to set the oldest node once */ - mq->oldest = node; - } else { - mq->newest->next = node; - } - ++mq->n_n; - } else { - if (n_s == 0) { - /* the first node to appear in a heap */ - mq->s_heap[0] = node; - node->region = SH; - node->idx = 0; - if (n_s + n_l + n_n == 0) { - /* only need to set the oldest node once */ - mq->oldest = node; - } else { - mq->newest->next = node; - } - mq->n_s = 1; - mq->s_first_leaf = 0; - } else { - /* at least one node already exists in the heaps */ - /* number of non-nan nodes including the new node is n_s + n_l + 1 */ - idx_t k_stat = mq_k_stat(mq, n_s + n_l + 1); - - mq->newest->next = node; - if (n_s >= k_stat) { - /* add new node to large heap */ - mq->l_heap[n_l] = node; - node->region = LH; - node->idx = n_l; - ++mq->n_l; - mq->l_first_leaf = FIRST_LEAF(mq->n_l); - heapify_large_node(mq, n_l); - } else { - /* add new node to small heap */ - mq->s_heap[n_s] = node; - node->region = SH; - node->idx = n_s; - ++mq->n_s; - mq->s_first_leaf = FIRST_LEAF(mq->n_s); - heapify_small_node(mq, n_s); - } - } - } - mq->newest = node; - - return mq_get_quantile(mq); + return mm->get_statistic(mm); } /* Insert a new value, ai, into one of the heaps or the nan array. Use this * function when there are at least window-1 nodes. Returns the median value. - * If there are less than window-1 nodes, use mm_update_init_nan. */ + * If there are less than window-1 nodes, use mm_update_init_nan. + * Used by both mm and mq */ ai_t mm_update_nan(mm_handle *mm, ai_t ai) { idx_t n_s, n_l, n_n; @@ -506,7 +397,8 @@ mm_update_nan(mm_handle *mm, ai_t ai) { s_heap[idx]->idx = idx; heapify_small_node(mm, idx); } - if (mm->n_s < mm->n_l) { + idx_t k_stat = mm->get_index(mm, mm->n_s + mm->n_l, 1); + if (mm->n_s < k_stat) { /* move head node from the large heap to the small heap */ node2 = mm->l_heap[0]; node2->idx = mm->n_s; @@ -557,7 +449,8 @@ mm_update_nan(mm_handle *mm, ai_t ai) { } else { mm->l_first_leaf = FIRST_LEAF(mm->n_l); } - if (mm->n_l < mm->n_s - 1) { + idx_t k_stat = mm->get_index(mm, mm->n_s + mm->n_l, 0); + if (mm->n_s > k_stat + 1) { /* move head node from the small heap to the large heap */ node2 = mm->s_heap[0]; node2->idx = mm->n_l; @@ -594,7 +487,9 @@ mm_update_nan(mm_handle *mm, ai_t ai) { heapify_large_node(mm, idx); } else { /* ai is not NaN but oldest node is in nan array */ - if (n_s > n_l) { + // k_stat = mm_k_stat(mm, n_s + n_l + 1); + idx_t k_stat = mm->get_index(mm, n_s + n_l + 1, 0); + if (n_s > k_stat) { /* insert into large heap */ node->region = LH; node->idx = n_l; @@ -619,200 +514,7 @@ mm_update_nan(mm_handle *mm, ai_t ai) { --mm->n_n; } } - return mm_get_median(mm); -} - -/* Same as mm_update_nan, but returns the current quantile. */ -ai_t -mq_update_nan(mm_handle *mq, ai_t ai) { - idx_t n_s, n_l, n_n; - - mm_node **l_heap; - mm_node **s_heap; - mm_node **n_array; - mm_node *node2; - - /* node is oldest node with ai of newest node */ - mm_node *node = mq->oldest; - idx_t idx = node->idx; - node->ai = ai; - - /* update oldest, newest */ - mq->oldest = mq->oldest->next; - mq->newest->next = node; - mq->newest = node; - - l_heap = mq->l_heap; - s_heap = mq->s_heap; - n_array = mq->n_array; - - n_s = mq->n_s; - n_l = mq->n_l; - n_n = mq->n_n; - - idx_t k_stat; - - if (ai != ai) { - if (node->region == SH) { - /* Oldest node is in the small heap and needs to be moved - * to the nan array. Resulting hole in the small heap will be - * filled with the rightmost leaf of the last row of the small - * heap. */ - k_stat = mq_k_stat(mq, n_s + n_l - 1); - - /* insert node into nan array */ - node->region = NA; - node->idx = n_n; - n_array[n_n] = node; - ++mq->n_n; - - /* plug small heap hole */ - --mq->n_s; - if (mq->n_s == 0) { - mq->s_first_leaf = 0; - if (n_l > 0) { - /* move head node from the large heap to the small heap */ - node2 = mq->l_heap[0]; - node2->region = SH; - s_heap[0] = node2; - mq->n_s = 1; - mq->s_first_leaf = 0; - - /* plug hole in large heap */ - node2 = mq->l_heap[mq->n_l - 1]; - node2->idx = 0; - l_heap[0] = node2; - --mq->n_l; - if (mq->n_l == 0) { - mq->l_first_leaf = 0; - } else { - mq->l_first_leaf = FIRST_LEAF(mq->n_l); - } - heapify_large_node(mq, 0); - } - } else { - if (idx != n_s - 1) { - s_heap[idx] = s_heap[n_s - 1]; - s_heap[idx]->idx = idx; - heapify_small_node(mq, idx); - } - if (mq->n_s < k_stat) { - /* move head node from the large heap to the small heap */ - node2 = mq->l_heap[0]; - node2->idx = mq->n_s; - node2->region = SH; - s_heap[mq->n_s] = node2; - ++mq->n_s; - mq->l_first_leaf = FIRST_LEAF(mq->n_s); - heapify_small_node(mq, node2->idx); - - /* plug hole in large heap */ - node2 = mq->l_heap[mq->n_l - 1]; - node2->idx = 0; - l_heap[0] = node2; - --mq->n_l; - if (mq->n_l == 0) { - mq->l_first_leaf = 0; - } else { - mq->l_first_leaf = FIRST_LEAF(mq->n_l); - } - heapify_large_node(mq, 0); - - } else { - mq->s_first_leaf = FIRST_LEAF(mq->n_s); - heapify_small_node(mq, idx); - } - } - } else if (node->region == LH) { - /* Oldest node is in the large heap and needs to be moved - * to the nan array. Resulting hole in the large heap will be - * filled with the rightmost leaf of the last row of the large - * heap. */ - - k_stat = mq_k_stat(mq, n_s + n_l - 1); - /* insert node into nan array */ - node->region = NA; - node->idx = n_n; - n_array[n_n] = node; - ++mq->n_n; - - /* plug large heap hole */ - if (idx != n_l - 1) { - l_heap[idx] = l_heap[n_l - 1]; - l_heap[idx]->idx = idx; - heapify_large_node(mq, idx); - } - --mq->n_l; - if (mq->n_l == 0) { - mq->l_first_leaf = 0; - } else { - mq->l_first_leaf = FIRST_LEAF(mq->n_l); - } - if (mq->n_s > k_stat) { - /* move head node from the small heap to the large heap */ - node2 = mq->s_heap[0]; - node2->idx = mq->n_l; - node2->region = LH; - l_heap[mq->n_l] = node2; - ++mq->n_l; - mq->l_first_leaf = FIRST_LEAF(mq->n_l); - heapify_large_node(mq, node2->idx); - - /* plug hole in small heap */ - if (n_s != 1) { - node2 = mq->s_heap[mq->n_s - 1]; - node2->idx = 0; - s_heap[0] = node2; - } - --mq->n_s; - if (mq->n_s == 0) { - mq->s_first_leaf = 0; - } else { - mq->s_first_leaf = FIRST_LEAF(mq->n_s); - } - heapify_small_node(mq, 0); - } - /* reorder large heap if needed */ - heapify_large_node(mq, idx); - } else if (node->region == NA) { - /* insert node into nan heap */ - n_array[idx] = node; - } - } else { - if (node->region == SH) { - heapify_small_node(mq, idx); - } else if (node->region == LH) { - heapify_large_node(mq, idx); - } else { - /* ai is not NaN but oldest node is in nan array */ - k_stat = mq_k_stat(mq, n_s + n_l + 1); - - if (n_s == k_stat) { - /* insert into large heap */ - node->region = LH; - node->idx = n_l; - l_heap[n_l] = node; - ++mq->n_l; - mq->l_first_leaf = FIRST_LEAF(mq->n_l); - heapify_large_node(mq, n_l); - } else { - /* insert into small heap */ - node->region = SH; - node->idx = n_s; - s_heap[n_s] = node; - ++mq->n_s; - mq->s_first_leaf = FIRST_LEAF(mq->n_s); - heapify_small_node(mq, n_s); - } - /* plug nan array hole */ - if (idx != n_n - 1) { - n_array[idx] = n_array[n_n - 1]; - n_array[idx]->idx = idx; - } - --mq->n_n; - } - } - return mq_get_quantile(mq); + return mm->get_statistic(mm); } @@ -834,6 +536,12 @@ mm_reset(mm_handle *mm) { mm->l_first_leaf = 0; } +/* For non-nan move_median, need to reset the statistic function as well */ +void +mm_reset_statistic_function(mm_handle *mm) { + mm->get_statistic = mm_get_median; + return; +} /* After bn.move_median is done, free the memory */ void @@ -861,16 +569,14 @@ mm_get_median(mm_handle *mm) { return (mm->s_heap[0]->ai + mm->l_heap[0]->ai) / 2.0; } - -/* function to find the index of element correspodning to the current quantile */ -static inline idx_t mq_k_stat(mm_handle *mq, idx_t idx) { - return (idx_t) floor((idx - 1) * mq->quantile) + 1; +static inline ai_t +mm_get_median_no_nans_full_window_odd(mm_handle *mm) { + return mm->s_heap[0]->ai; } -/* function to check if the current index of the quantile is integer. If so, - * then the quantile is the element at the top of the heap. Otherwise take a midpoint */ -static inline short mq_stat_exact(mm_handle *mq, idx_t idx) { - return ((idx - 1) * mq->quantile) == floor((idx - 1) * mq->quantile); +static inline ai_t +mm_get_median_no_nans_full_window_even(mm_handle *mm) { + return (mm->s_heap[0]->ai + mm->l_heap[0]->ai) / 2.0; } /* Return the current quantile */ @@ -884,6 +590,27 @@ mq_get_quantile(mm_handle *mq) { return (mq->s_heap[0]->ai + mq->l_heap[0]->ai) / 2.0; } +/* function to check if the current index of the quantile is integer. If so, + * then the quantile is the element at the top of the heap. Otherwise take a midpoint */ +static inline short mq_stat_exact(mm_handle *mq, idx_t idx) { + return ((idx - 1) * mq->quantile) == floor((idx - 1) * mq->quantile); +} + +/* helper function to get mm->n_l while having same functions for mm and mq + * This will be assigned to mm->get_index */ +static inline idx_t mm_get_nl(mm_handle *mm, idx_t idx, short adjustment) { + return mm->n_l; +} + +/* function to find the index of element correspodning to the current quantile + * Due some some off-by-one nuances, one place in mm_update_nan requires + * this to return index + 1 (to be consistent with mm). Hence use adjustment arg + * This will be assigned to mq->get_index */ +static inline idx_t mq_k_stat(mm_handle *mq, idx_t idx, short adjustment) { + return (idx_t) (floor((idx - 1) * mq->quantile) + adjustment); +} + + /*heapify_small/large_node is same for mm and mq */ static inline void heapify_small_node(mm_handle *mm, idx_t idx) { diff --git a/bottleneck/src/move_median/move_median.h b/bottleneck/src/move_median/move_median.h index 9ed271349b..5ca39acd44 100644 --- a/bottleneck/src/move_median/move_median.h +++ b/bottleneck/src/move_median/move_median.h @@ -35,6 +35,8 @@ struct _mm_node { }; typedef struct _mm_node mm_node; +/* this struct is used by both move_median(mm) and move_quantile(mq) */ +typedef struct _mm_handle mm_handle; struct _mm_handle { idx_t window; /* window size */ int odd; /* is window even (0) or odd (1) */ @@ -51,34 +53,37 @@ struct _mm_handle { mm_node *newest; /* The newest node (most recent insert) */ idx_t s_first_leaf; /* All nodes this index or greater are leaf nodes */ idx_t l_first_leaf; /* All nodes this index or greater are leaf nodes */ - double quantile; /* Value between 0 <= quantile <= 1, the quantile to compute */ + double quantile; /* Value between 0 and 1, the quantile to compute */ + + /* get_median for move_median, get_quantile for move_quantile */ + ai_t (*get_statistic)(mm_handle *); + /* returns n_l for median, returns index before quantile for quantile */ + idx_t (*get_index)(mm_handle *, idx_t, short); }; -typedef struct _mm_handle mm_handle; /* non-nan functions */ mm_handle *mm_new(const idx_t window, idx_t min_count); +mm_handle *mq_new(const idx_t window, idx_t min_count, double quantile); +/* used by both mm and mq */ ai_t mm_update_init(mm_handle *mm, ai_t ai); ai_t mm_update(mm_handle *mm, ai_t ai); -mm_handle *mq_new(const idx_t window, idx_t min_count, double quantile); -ai_t mq_update_init(mm_handle *mq, ai_t ai); -ai_t mq_update(mm_handle *mq, ai_t ai); +/* helper functions for non-nan move_median */ +void mm_update_statistic_function(mm_handle *mm); +void mm_reset_statistic_function(mm_handle *mm); /* nan functions */ mm_handle *mm_new_nan(const idx_t window, idx_t min_count); +mm_handle *mq_new_nan(const idx_t window, idx_t min_count, double quantile); +/* used by both mm and mq */ ai_t mm_update_init_nan(mm_handle *mm, ai_t ai); ai_t mm_update_nan(mm_handle *mm, ai_t ai); -mm_handle *mq_new_nan(const idx_t window, idx_t min_count, double quantile); -ai_t mq_update_init_nan(mm_handle *mq, ai_t ai); -ai_t mq_update_nan(mm_handle *mq, ai_t ai); - /* functions common to non-nan and nan cases */ -/* same for mm and mq */ +/* used by both mm and mq */ void mm_reset(mm_handle *mm); void mm_free(mm_handle *mm); - /* Copied from Cython ---------------------------------------------------- */ /* NaN */ diff --git a/bottleneck/src/move_template.c b/bottleneck/src/move_template.c index 2f1d933b36..8eeb260a71 100644 --- a/bottleneck/src/move_template.c +++ b/bottleneck/src/move_template.c @@ -614,11 +614,13 @@ MOVE(move_median, DTYPE0) { ai = AI(DTYPE0); YI(DTYPE1) = mm_update_init(mm, ai); } + mm_update_statistic_function(mm); WHILE2 { ai = AI(DTYPE0); YI(DTYPE1) = mm_update(mm, ai); } mm_reset(mm); + mm_reset_statistic_function(mm); NEXT2 } mm_free(mm); @@ -648,15 +650,15 @@ MOVE(move_quantile, DTYPE0) { WHILE { WHILE0 { ai = AI(DTYPE0); - YI(DTYPE0) = mq_update_init_nan(mq, ai); + YI(DTYPE0) = mm_update_init_nan(mq, ai); } WHILE1 { ai = AI(DTYPE0); - YI(DTYPE0) = mq_update_init_nan(mq, ai); + YI(DTYPE0) = mm_update_init_nan(mq, ai); } WHILE2 { ai = AI(DTYPE0); - YI(DTYPE0) = mq_update_nan(mq, ai); + YI(DTYPE0) = mm_update_nan(mq, ai); } mm_reset(mq); NEXT2 @@ -684,15 +686,15 @@ MOVE(move_quantile, DTYPE0) { WHILE { WHILE0 { ai = AI(DTYPE0); - YI(DTYPE1) = mq_update_init(mq, ai); + YI(DTYPE1) = mm_update_init(mq, ai); } WHILE1 { ai = AI(DTYPE0); - YI(DTYPE1) = mq_update_init(mq, ai); + YI(DTYPE1) = mm_update_init(mq, ai); } WHILE2 { ai = AI(DTYPE0); - YI(DTYPE1) = mq_update(mq, ai); + YI(DTYPE1) = mm_update(mq, ai); } mm_reset(mq); NEXT2 @@ -967,12 +969,10 @@ parse_args(PyObject *args, TYPE_ERR("wrong number of arguments"); return 0; } - if (*a == NULL) { TYPE_ERR("Cannot find `a` argument"); return 0; } - if (*window == NULL) { TYPE_ERR("Cannot find `window` argument"); return 0; From 2c892db1d0d7f3caf13fe5c45d329027e174ba5f Mon Sep 17 00:00:00 2001 From: andrii-riazanov Date: Sat, 1 Oct 2022 23:12:36 -0400 Subject: [PATCH 29/30] Support of itrable q argument for move_quantile --- bottleneck/__init__.py | 4 +++- bottleneck/src/move_quantile.py | 13 +++++++++++++ bottleneck/src/move_template.c | 11 +++++------ 3 files changed, 21 insertions(+), 7 deletions(-) create mode 100644 bottleneck/src/move_quantile.py diff --git a/bottleneck/__init__.py b/bottleneck/__init__.py index b2c01de453..d99fc782ed 100644 --- a/bottleneck/__init__.py +++ b/bottleneck/__init__.py @@ -5,7 +5,9 @@ from . import slow from ._pytesttester import PytestTester from .move import (move_argmax, move_argmin, move_max, move_mean, move_median, - move_quantile, move_min, move_rank, move_std, move_sum, move_var) + move_min, move_rank, move_std, move_sum, move_var) + +from .src.move_quantile import move_quantile from .nonreduce import replace from .nonreduce_axis import (argpartition, nanrankdata, partition, push, diff --git a/bottleneck/src/move_quantile.py b/bottleneck/src/move_quantile.py new file mode 100644 index 0000000000..1013c7fc4a --- /dev/null +++ b/bottleneck/src/move_quantile.py @@ -0,0 +1,13 @@ +from ..move import move_quantile as move_quantile_c +from collections.abc import Iterable +import numpy as np + +def move_quantile(a, window, q, min_count=None, axis=-1): + if not isinstance(q, Iterable): + return move_quantile_c(a, window=window, min_count=min_count, axis=axis, q=q) + result = np.asarray( + [move_quantile_c(a=a, window=window, min_count=min_count, axis=axis, q=quantile) for quantile in q] + ) + return result + +move_quantile.__doc__ = move_quantile_c.__doc__ \ No newline at end of file diff --git a/bottleneck/src/move_template.c b/bottleneck/src/move_template.c index 8eeb260a71..bdd4fc0b95 100644 --- a/bottleneck/src/move_template.c +++ b/bottleneck/src/move_template.c @@ -1030,13 +1030,13 @@ mover(char *name, if (quantile_obj != Py_None) { quantile = PyFloat_AsDouble(quantile_obj); if (error_converting(quantile)) { - TYPE_ERR("`q` must be a float"); + TYPE_ERR("Value(s) in `q` must be float"); return NULL; } if ((quantile < 0.0) || (quantile > 1.0)) { /* Float/double specifiers %f and %lf don't work here for some reason*/ PyErr_Format(PyExc_ValueError, - "`q` must be between 0. and 1."); + "Value(s) in `q` must be between 0. and 1."); return NULL; } @@ -1579,7 +1579,7 @@ MULTILINE STRING END */ static char move_quantile_doc[] = /* MULTILINE STRING BEGIN -move_quantile(a, window, min_count=None, axis=-1, q=0.5) +move_quantile(a, window, q, min_count=None, axis=-1) Moving window quantile along the specified axis, ignoring NaNs. @@ -1593,6 +1593,8 @@ a : ndarray Input array. If `a` is not an array, a conversion is attempted. window : int The number of elements in the moving window. +q : float or list of floats + Quantile(s) to compute, all values must be between 0 and 1 inclusive. min_count: {int, None}, optional If the number of non-NaN values in a window is less than `min_count`, then a value of NaN is assigned to the window. By default `min_count` @@ -1600,9 +1602,6 @@ min_count: {int, None}, optional axis : int, optional The axis over which the window is moved. By default the last axis (axis=-1) is used. An axis of None is not allowed. -q : float, optional - Quantile to compute, which must be between 0 and 1 inclusive. - By default q=0.5 is used, computing a moving median. Returns ------- From 654ab1452d888cf623baf044579859017ab9c287 Mon Sep 17 00:00:00 2001 From: andrii-riazanov Date: Sun, 2 Oct 2022 03:37:55 -0400 Subject: [PATCH 30/30] Make tests work with posiitonal q in move_quantile --- bottleneck/tests/input_modification_test.py | 6 +++++- bottleneck/tests/list_input_test.py | 7 +++++-- bottleneck/tests/move_test.py | 9 +++++++-- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/bottleneck/tests/input_modification_test.py b/bottleneck/tests/input_modification_test.py index 1546d81828..3ae33907e9 100644 --- a/bottleneck/tests/input_modification_test.py +++ b/bottleneck/tests/input_modification_test.py @@ -50,6 +50,10 @@ def test_modification(func): if "partition" in name: second_arg = 0 + kwargs = {} + if name == "move_quantile": + kwargs['q'] = 0.5 + for axis in axes: with np.errstate(invalid="ignore"): a1 = a.copy() @@ -57,7 +61,7 @@ def test_modification(func): if any(x in name for x in ["move", "sort", "partition"]): with warnings.catch_warnings(): warnings.simplefilter("ignore") - func(a1, second_arg, axis=axis) + func(a1, second_arg, axis=axis, **kwargs) else: try: with warnings.catch_warnings(): diff --git a/bottleneck/tests/list_input_test.py b/bottleneck/tests/list_input_test.py index 4306a23990..3fd570b299 100644 --- a/bottleneck/tests/list_input_test.py +++ b/bottleneck/tests/list_input_test.py @@ -34,6 +34,9 @@ def test_list_input(func): name = func.__name__ if name == "replace": return + kwargs = {} + if name == "move_quantile": + kwargs['q'] = 0.5 func0 = eval("bn.slow.%s" % name) for i, a in enumerate(lists()): with warnings.catch_warnings(): @@ -42,8 +45,8 @@ def test_list_input(func): actual = func(a) desired = func0(a) except TypeError: - actual = func(a, 2) - desired = func0(a, 2) + actual = func(a, 2, **kwargs) + desired = func0(a, 2, **kwargs) a = np.array(a) tup = (name, "a" + str(i), str(a.dtype), str(a.shape), a) err_msg = msg % tup diff --git a/bottleneck/tests/move_test.py b/bottleneck/tests/move_test.py index 7b7ae3d5fa..6bd8ac2e64 100644 --- a/bottleneck/tests/move_test.py +++ b/bottleneck/tests/move_test.py @@ -38,8 +38,8 @@ def test_move(func): kwargs = {} if func_name == "move_quantile": kwargs = {"q" : q} - actual = func(a, window, min_count, axis=axis, **kwargs) - desired = func0(a, window, min_count, axis=axis, **kwargs) + actual = func(a, window=window, min_count=min_count, axis=axis, **kwargs) + desired = func0(a, window=window, min_count=min_count, axis=axis, **kwargs) tup = ( func_name, window, @@ -68,6 +68,11 @@ def test_arg_parsing(func, decimal=5): """test argument parsing.""" name = func.__name__ + + if name == "move_quantile": + from ..move import move_quantile as move_quantile_c + func = move_quantile_c + func0 = eval("bn.slow.%s" % name) a = np.array([1.0, 2, 3])