diff --git a/pandas/parser.pyx b/pandas/parser.pyx index b6e5ad0d73b7c..23aee860b3108 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -716,11 +716,10 @@ cdef class TextReader: # header is now a list of lists, so field_count should use header[0] cdef: - size_t i, start, data_line, field_count, passed_count, hr, unnamed_count # noqa + Py_ssize_t i, start, field_count, passed_count, unnamed_count # noqa char *word object name - int status - Py_ssize_t size + int status, hr, data_line char *errors = "strict" cdef StringPath path = _string_path(self.c_encoding) @@ -1416,8 +1415,7 @@ cdef _string_box_factorize(parser_t *parser, int col, bint na_filter, kh_str_t *na_hashset): cdef: int error, na_count = 0 - Py_ssize_t i - size_t lines + Py_ssize_t i, lines coliter_t it const char *word = NULL ndarray[object] result @@ -1470,8 +1468,7 @@ cdef _string_box_utf8(parser_t *parser, int col, bint na_filter, kh_str_t *na_hashset): cdef: int error, na_count = 0 - Py_ssize_t i - size_t lines + Py_ssize_t i, lines coliter_t it const char *word = NULL ndarray[object] result @@ -1525,8 +1522,7 @@ cdef _string_box_decode(parser_t *parser, int col, char *encoding): cdef: int error, na_count = 0 - Py_ssize_t i, size - size_t lines + Py_ssize_t i, size, lines coliter_t it const char *word = NULL ndarray[object] result @@ -1586,8 +1582,7 @@ cdef _categorical_convert(parser_t *parser, int col, "Convert column data into codes, categories" cdef: int error, na_count = 0 - Py_ssize_t i, size - size_t lines + Py_ssize_t i, size, lines coliter_t it const char *word = NULL @@ -1691,7 +1686,7 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end, bint na_filter, kh_str_t *na_hashset, object na_flist): cdef: int error, na_count = 0 - size_t i, lines + Py_ssize_t i, lines coliter_t it const char *word = NULL char *p_end @@ -1738,8 +1733,7 @@ cdef inline int _try_double_nogil(parser_t *parser, int *na_count) nogil: cdef: int error, - size_t i - size_t lines = line_end - line_start + Py_ssize_t i, lines = line_end - line_start coliter_t it const char *word = NULL char *p_end @@ -1801,7 +1795,7 @@ cdef _try_uint64(parser_t *parser, int col, int line_start, int line_end, bint na_filter, kh_str_t *na_hashset): cdef: int error - size_t i, lines + Py_ssize_t i, lines coliter_t it uint64_t *data ndarray result @@ -1837,8 +1831,7 @@ cdef inline int _try_uint64_nogil(parser_t *parser, int col, int line_start, uint64_t *data, uint_state *state) nogil: cdef: int error - size_t i - size_t lines = line_end - line_start + Py_ssize_t i, lines = line_end - line_start coliter_t it const char *word = NULL khiter_t k @@ -1873,7 +1866,7 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, bint na_filter, kh_str_t *na_hashset): cdef: int error, na_count = 0 - size_t i, lines + Py_ssize_t i, lines coliter_t it int64_t *data ndarray result @@ -1902,8 +1895,7 @@ cdef inline int _try_int64_nogil(parser_t *parser, int col, int line_start, int64_t *data, int *na_count) nogil: cdef: int error - size_t i - size_t lines = line_end - line_start + Py_ssize_t i, lines = line_end - line_start coliter_t it const char *word = NULL khiter_t k @@ -1939,7 +1931,7 @@ cdef _try_bool(parser_t *parser, int col, int line_start, int line_end, bint na_filter, kh_str_t *na_hashset): cdef: int na_count - size_t lines = line_end - line_start + Py_ssize_t lines = line_end - line_start uint8_t *data cnp.ndarray[cnp.uint8_t, ndim=1] result @@ -1963,8 +1955,7 @@ cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start, uint8_t *data, int *na_count) nogil: cdef: int error - size_t lines = line_end - line_start - size_t i + Py_ssize_t i, lines = line_end - line_start coliter_t it const char *word = NULL khiter_t k @@ -2004,7 +1995,7 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end, const kh_str_t *false_hashset): cdef: int error, na_count = 0 - size_t i, lines + Py_ssize_t i, lines coliter_t it const char *word = NULL uint8_t *data @@ -2033,8 +2024,7 @@ cdef inline int _try_bool_flex_nogil(parser_t *parser, int col, int line_start, int *na_count) nogil: cdef: int error = 0 - size_t i - size_t lines = line_end - line_start + Py_ssize_t i, lines = line_end - line_start coliter_t it const char *word = NULL khiter_t k @@ -2249,8 +2239,7 @@ cdef _apply_converter(object f, parser_t *parser, int col, char* c_encoding): cdef: int error - Py_ssize_t i - size_t lines + Py_ssize_t i, lines coliter_t it const char *word = NULL char *errors = "strict" @@ -2341,7 +2330,7 @@ def _to_structured_array(dict columns, object names, object usecols): cdef _fill_structured_column(char *dst, char* src, int elsize, int stride, int length, bint incref): cdef: - size_t i + Py_ssize_t i if incref: util.transfer_object_column(dst, src, stride, length) diff --git a/pandas/src/algos_groupby_helper.pxi.in b/pandas/src/algos_groupby_helper.pxi.in index fda1e51bd2b1f..9552b4299fe6a 100644 --- a/pandas/src/algos_groupby_helper.pxi.in +++ b/pandas/src/algos_groupby_helper.pxi.in @@ -361,7 +361,11 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, val = values[i, j] # not nan + {{if name == 'int64'}} + if val != {{nan_val}}: + {{else}} if val == val and val != {{nan_val}}: + {{endif}} nobs[lab, j] += 1 resx[lab, j] = val @@ -407,7 +411,11 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, val = values[i, j] # not nan + {{if name == 'int64'}} + if val != {{nan_val}}: + {{else}} if val == val and val != {{nan_val}}: + {{endif}} nobs[lab, j] += 1 if nobs[lab, j] == rank: resx[lab, j] = val @@ -478,7 +486,11 @@ def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, val = values[i, j] # not nan + {{if name == 'int64'}} + if val != {{nan_val}}: + {{else}} if val == val and val != {{nan_val}}: + {{endif}} nobs[lab, j] += 1 if val > maxx[lab, j]: maxx[lab, j] = val @@ -492,7 +504,11 @@ def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, val = values[i, 0] # not nan + {{if name == 'int64'}} + if val != {{nan_val}}: + {{else}} if val == val and val != {{nan_val}}: + {{endif}} nobs[lab, 0] += 1 if val > maxx[lab, 0]: maxx[lab, 0] = val @@ -541,8 +557,11 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, val = values[i, j] # not nan + {{if name == 'int64'}} + if val != {{nan_val}}: + {{else}} if val == val and val != {{nan_val}}: - + {{endif}} nobs[lab, j] += 1 if val < minx[lab, j]: minx[lab, j] = val @@ -556,7 +575,11 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, val = values[i, 0] # not nan + {{if name == 'int64'}} + if val != {{nan_val}}: + {{else}} if val == val and val != {{nan_val}}: + {{endif}} nobs[lab, 0] += 1 if val < minx[lab, 0]: minx[lab, 0] = val @@ -596,14 +619,19 @@ def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, continue for j in range(K): val = values[i, j] + + # val = nan + {{if name == 'int64'}} + if is_datetimelike and val == {{nan_val}}: + out[i, j] = {{nan_val}} + else: + {{else}} if val == val: + {{endif}} if val < accum[lab, j]: min_val = val accum[lab, j] = min_val out[i, j] = accum[lab, j] - # val = nan - elif is_datetimelike: - out[i, j] = {{nan_val}} @cython.boundscheck(False) @@ -633,14 +661,18 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, continue for j in range(K): val = values[i, j] + + {{if name == 'int64'}} + if is_datetimelike and val == {{nan_val}}: + out[i, j] = {{nan_val}} + else: + {{else}} if val == val: + {{endif}} if val > accum[lab, j]: max_val = val accum[lab, j] = max_val out[i, j] = accum[lab, j] - # val = nan - elif is_datetimelike: - out[i, j] = {{nan_val}} {{endfor}} @@ -738,7 +770,12 @@ def group_cumsum(numeric[:, :] out, continue for j in range(K): val = values[i, j] - if val == val: + + if numeric == float32_t or numeric == float64_t: + if val == val: + accum[lab, j] += val + out[i, j] = accum[lab, j] + else: accum[lab, j] += val out[i, j] = accum[lab, j] diff --git a/pandas/src/algos_rank_helper.pxi.in b/pandas/src/algos_rank_helper.pxi.in index 7e7f819c7515f..aafffbf60f638 100644 --- a/pandas/src/algos_rank_helper.pxi.in +++ b/pandas/src/algos_rank_helper.pxi.in @@ -175,11 +175,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True, count += 1.0 - {{if dtype == 'float64'}} if i == n - 1 or sorted_data[i + 1] != val: - {{else}} - if i == n - 1 or fabs(sorted_data[i + 1] - val) > 0: - {{endif}} if tiebreak == TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = sum_ranks / dups @@ -345,10 +341,8 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average', {{if dtype == 'object'}} if j == k - 1 or are_diff(values[i, j + 1], val): - {{elif dtype == 'float64'}} - if j == k - 1 or values[i, j + 1] != val: {{else}} - if j == k - 1 or fabs(values[i, j + 1] - val) > FP_ERR: + if j == k - 1 or values[i, j + 1] != val: {{endif}} if tiebreak == TIEBREAK_AVERAGE: for z in range(j - dups + 1, j + 1): diff --git a/pandas/src/hashtable_class_helper.pxi.in b/pandas/src/hashtable_class_helper.pxi.in index 93f9411dbc8a1..74c38dfdb393e 100644 --- a/pandas/src/hashtable_class_helper.pxi.in +++ b/pandas/src/hashtable_class_helper.pxi.in @@ -386,9 +386,11 @@ cdef class {{name}}HashTable(HashTable): val = values[i] # specific for groupby + {{if dtype != 'uint64'}} if val < 0: labels[i] = -1 continue + {{endif}} k = kh_get_{{dtype}}(self.table, val) if k != self.table.n_buckets: diff --git a/pandas/src/hashtable_func_helper.pxi.in b/pandas/src/hashtable_func_helper.pxi.in index c292256767315..fa373905ef08a 100644 --- a/pandas/src/hashtable_func_helper.pxi.in +++ b/pandas/src/hashtable_func_helper.pxi.in @@ -59,7 +59,12 @@ cdef build_count_table_{{dtype}}({{dtype}}_t[:] values, for i in range(n): val = values[i] + + {{if dtype == 'float64'}} if val == val or not dropna: + {{else}} + if True: + {{endif}} k = kh_get_{{ttype}}(table, val) if k != table.n_buckets: table.vals[k] += 1 @@ -85,7 +90,7 @@ cpdef value_count_{{dtype}}({{dtype}}_t[:] values, bint dropna): int64_t[:] result_counts {{endif}} - int k + Py_ssize_t k table = kh_init_{{ttype}}() {{if dtype == 'object'}} @@ -133,11 +138,11 @@ def duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first'): def duplicated_{{dtype}}({{dtype}}_t[:] values, object keep='first'): {{endif}} cdef: - int ret = 0, k + int ret = 0 {{if dtype != 'object'}} {{dtype}}_t value {{endif}} - Py_ssize_t i, n = len(values) + Py_ssize_t k, i, n = len(values) kh_{{ttype}}_t * table = kh_init_{{ttype}}() ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') @@ -230,7 +235,7 @@ def mode_{{dtype}}({{ctype}}[:] values): cdef: int count, max_count = 2 int j = -1 # so you can do += - int k + Py_ssize_t k kh_{{table_type}}_t *table ndarray[{{ctype}}] modes diff --git a/pandas/src/parser/io.c b/pandas/src/parser/io.c index 562d6033ce3eb..4381ef19e991b 100644 --- a/pandas/src/parser/io.c +++ b/pandas/src/parser/io.c @@ -215,7 +215,7 @@ void *buffer_mmap_bytes(void *source, size_t nbytes, size_t *bytes_read, retval = src->memmap + src->position; - if (src->position + nbytes > src->last_pos) { + if (src->position + (off_t)nbytes > src->last_pos) { // fewer than nbytes remaining *bytes_read = src->last_pos - src->position; } else { diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index b6428c8b76743..916f06d357473 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -622,7 +622,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { stream = self->stream + self->stream_len; \ slen = self->stream_len; \ self->state = STATE; \ - if (line_limit > 0 && self->lines == start_lines + line_limit) { \ + if (line_limit > 0 && self->lines == start_lines + (int)line_limit) { \ goto linelimit; \ } @@ -637,7 +637,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { stream = self->stream + self->stream_len; \ slen = self->stream_len; \ self->state = STATE; \ - if (line_limit > 0 && self->lines == start_lines + line_limit) { \ + if (line_limit > 0 && self->lines == start_lines + (int)line_limit) { \ goto linelimit; \ } @@ -1072,7 +1072,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) { --i; buf--; // let's try this character again (HACK!) if (line_limit > 0 && - self->lines == start_lines + line_limit) { + self->lines == start_lines + (int)line_limit) { goto linelimit; } } @@ -1160,7 +1160,7 @@ static int parser_handle_eof(parser_t *self) { int parser_consume_rows(parser_t *self, size_t nrows) { int i, offset, word_deletions, char_count; - if (nrows > self->lines) { + if ((int)nrows > self->lines) { nrows = self->lines; } @@ -1197,7 +1197,7 @@ int parser_consume_rows(parser_t *self, size_t nrows) { self->word_start -= char_count; /* move line metadata */ - for (i = 0; i < self->lines - nrows + 1; ++i) { + for (i = 0; i < self->lines - (int)nrows + 1; ++i) { offset = i + nrows; self->line_start[i] = self->line_start[offset] - word_deletions; self->line_fields[i] = self->line_fields[offset]; @@ -1224,7 +1224,7 @@ int parser_trim_buffers(parser_t *self) { /* trim words, word_starts */ new_cap = _next_pow2(self->words_len) + 1; - if (new_cap < self->words_cap) { + if ((int)new_cap < self->words_cap) { TRACE(("parser_trim_buffers: new_cap < self->words_cap\n")); newptr = safe_realloc((void *)self->words, new_cap * sizeof(char *)); if (newptr == NULL) { @@ -1247,7 +1247,7 @@ int parser_trim_buffers(parser_t *self) { ("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = " "%zu\n", new_cap, self->stream_cap, self->lines_cap)); - if (new_cap < self->stream_cap) { + if ((int)new_cap < self->stream_cap) { TRACE( ("parser_trim_buffers: new_cap < self->stream_cap, calling " "safe_realloc\n")); @@ -1275,7 +1275,7 @@ int parser_trim_buffers(parser_t *self) { /* trim line_start, line_fields */ new_cap = _next_pow2(self->lines) + 1; - if (new_cap < self->lines_cap) { + if ((int)new_cap < self->lines_cap) { TRACE(("parser_trim_buffers: new_cap < self->lines_cap\n")); newptr = safe_realloc((void *)self->line_start, new_cap * sizeof(int)); if (newptr == NULL) { @@ -1328,7 +1328,7 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) { (int)nrows, self->datapos, self->datalen)); while (1) { - if (!all && self->lines - start_lines >= nrows) break; + if (!all && self->lines - start_lines >= (int)nrows) break; if (self->datapos == self->datalen) { status = parser_buffer_bytes(self, self->chunksize); @@ -1986,7 +1986,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, return 0; } - if (number > int_max) { + if (number > (uint64_t)int_max) { state->seen_uint = 1; }