BUG: Use size_t to avoid array index overflow; add missing malloc of error_msg #17040
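In short, the tokenizer's index and length fields move from C `int` to 64-bit integers so that offsets into very large files cannot overflow a 32-bit index, and the `error_msg` buffer gets the `malloc` it was previously missing. Below is a minimal Cython sketch of the two failure modes this guards against; it is illustrative only, not the pandas code, and the function names are made up.

```cython
# Illustrative sketch only; not the pandas tokenizer code.
from libc.limits cimport INT_MAX
from libc.stdint cimport int64_t
from libc.stdlib cimport malloc, free
from libc.string cimport strcpy


def demo_index_width():
    # One element past what a 32-bit `int` index can address.
    cdef int64_t offset = <int64_t>INT_MAX + 1
    # A field declared as plain `int` cannot represent this offset,
    # which is why the struct members in the diff switch to int64_t.
    assert offset > INT_MAX
    return offset


def demo_error_msg():
    # The second fix: allocate the error-message buffer before
    # anything writes into it.
    cdef bytes msg = b"buffer possibly overflowed"
    cdef char *error_msg = <char *>malloc(len(msg) + 1)
    if error_msg == NULL:
        raise MemoryError()
    strcpy(error_msg, msg)
    out = (<bytes>error_msg).decode("utf-8")
    free(error_msg)
    return out
```

The actual change simply widens the relevant struct fields and function arguments, as the diff below shows.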
@@ -121,30 +121,30 @@ cdef extern from "parser/tokenizer.h":
         io_callback cb_io
         io_cleanup cb_cleanup

-        int chunksize  # Number of bytes to prepare for each chunk
-        char *data     # pointer to data to be processed
-        int datalen    # amount of data available
-        int datapos
+        int64_t chunksize  # Number of bytes to prepare for each chunk
+        char *data         # pointer to data to be processed
Inline review comment: pls line up
+        int64_t datalen    # amount of data available
+        int64_t datapos

         # where to write out tokenized data
         char *stream
-        int stream_len
-        int stream_cap
+        int64_t stream_len
+        int64_t stream_cap

         # Store words in (potentially ragged) matrix for now, hmm
         char **words
-        int *word_starts  # where we are in the stream
-        int words_len
-        int words_cap
+        int64_t *word_starts  # where we are in the stream
+        int64_t words_len
+        int64_t words_cap

-        char *pword_start  # pointer to stream start of current field
-        int word_start     # position start of current field
+        char *pword_start    # pointer to stream start of current field
+        int64_t word_start   # position start of current field

-        int *line_start    # position in words for start of line
-        int *line_fields   # Number of fields in each line
-        int lines          # Number of lines observed
-        int file_lines     # Number of file lines observed (with bad/skipped)
-        int lines_cap      # Vector capacity
+        int64_t *line_start    # position in words for start of line
+        int64_t *line_fields   # Number of fields in each line
+        int64_t lines          # Number of lines observed
+        int64_t file_lines     # Number of lines observed (with bad/skipped)
+        int64_t lines_cap      # Vector capacity

         # Tokenizing stuff
         ParserState state

@@ -177,14 +177,14 @@ cdef extern from "parser/tokenizer.h":
         # thousands separator (comma, period)
         char thousands

-        int header            # Boolean: 1: has header, 0: no header
-        int header_start      # header row start
-        int header_end        # header row end
+        int header                # Boolean: 1: has header, 0: no header
+        int64_t header_start      # header row start
+        int64_t header_end        # header row end

         void *skipset
         PyObject *skipfunc
         int64_t skip_first_N_rows
-        int skipfooter
+        int64_t skipfooter
         # pick one, depending on whether the converter requires GIL
         double (*double_converter_nogil)(const char *, char **,
                                          char, char, char, int) nogil

@@ -195,12 +195,12 @@ cdef extern from "parser/tokenizer.h":
         char *warn_msg
         char *error_msg

-        int skip_empty_lines
+        int64_t skip_empty_lines

     ctypedef struct coliter_t:
         char **words
-        int *line_start
-        int col
+        int64_t *line_start
+        int64_t col

     ctypedef struct uint_state:
         int seen_sint

@@ -210,7 +210,8 @@ cdef extern from "parser/tokenizer.h":
     void uint_state_init(uint_state *self)
     int uint64_conflict(uint_state *self)

-    void coliter_setup(coliter_t *it, parser_t *parser, int i, int start) nogil
+    void coliter_setup(coliter_t *it, parser_t *parser,
+                       int64_t i, int64_t start) nogil
     void COLITER_NEXT(coliter_t, const char *) nogil

     parser_t* parser_new()

@@ -289,14 +290,14 @@ cdef class TextReader:
         object true_values, false_values
         object handle
         bint na_filter, verbose, has_usecols, has_mi_columns
-        int parser_start
+        int64_t parser_start
         list clocks
         char *c_encoding
         kh_str_t *false_set
         kh_str_t *true_set

     cdef public:
-        int leading_cols, table_width, skipfooter, buffer_lines
+        int64_t leading_cols, table_width, skipfooter, buffer_lines
         object allow_leading_cols
         object delimiter, converters, delim_whitespace
         object na_values

@@ -730,7 +731,8 @@ cdef class TextReader:
             Py_ssize_t i, start, field_count, passed_count, unnamed_count  # noqa
             char *word
             object name
-            int status, hr, data_line
+            int status
+            int64_t hr, data_line
             char *errors = "strict"
         cdef StringPath path = _string_path(self.c_encoding)

@@ -949,8 +951,8 @@ cdef class TextReader:
     cdef _read_rows(self, rows, bint trim):
         cdef:
-            int buffered_lines
-            int irows, footer = 0
+            int64_t buffered_lines
+            int64_t irows, footer = 0

         self._start_clock()

@@ -1018,12 +1020,13 @@ cdef class TextReader:
     def _convert_column_data(self, rows=None, upcast_na=False, footer=0):
         cdef:
-            Py_ssize_t i, nused
+            int64_t i
+            int nused
             kh_str_t *na_hashset = NULL
-            int start, end
+            int64_t start, end
             object name, na_flist, col_dtype = None
             bint na_filter = 0
-            Py_ssize_t num_cols
+            int64_t num_cols

         start = self.parser_start

@@ -1195,7 +1198,7 @@ cdef class TextReader:
         return col_res, na_count

     cdef _convert_with_dtype(self, object dtype, Py_ssize_t i,
-                             int start, int end,
+                             int64_t start, int64_t end,
                              bint na_filter,
                              bint user_dtype,
                              kh_str_t *na_hashset,

@@ -1275,7 +1278,7 @@ cdef class TextReader:
             raise TypeError("the dtype %s is not "
                             "supported for parsing" % dtype)

-    cdef _string_convert(self, Py_ssize_t i, int start, int end,
+    cdef _string_convert(self, Py_ssize_t i, int64_t start, int64_t end,
                          bint na_filter, kh_str_t *na_hashset):

         cdef StringPath path = _string_path(self.c_encoding)

@@ -1336,6 +1339,7 @@ cdef class TextReader:
         kh_destroy_str(table)

     cdef _get_column_name(self, Py_ssize_t i, Py_ssize_t nused):
+        cdef int64_t j
Reviewer comment: was this not defined before?

Author reply: No, and for the life of me I couldn't figure out how it was working, given that all other instances seem to require it (left as an exercise for the reader as to why it worked; it should be defined with the proper type here regardless).
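For context on the exchange above: in Cython, a name that is never declared with `cdef` is treated as an ordinary Python variable, which is presumably why the untyped `j` still worked before; the declaration added here just gives it a proper C type. A hypothetical sketch of the difference (not the pandas code):

```cython
from libc.stdint cimport int64_t


def untyped_index():
    # `j` is never declared, so Cython treats it as a Python object;
    # this compiles and runs, just without C-level typing.
    for j in range(3):
        pass
    return j


def typed_index():
    # With the explicit declaration, `j` is a C int64_t instead.
    cdef int64_t j
    for j in range(3):
        pass
    return j
```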
         if self.has_usecols and self.names is not None:
             if (not callable(self.usecols) and
                     len(self.names) == len(self.usecols)):

@@ -1427,8 +1431,8 @@ cdef inline StringPath _string_path(char *encoding):
 # ----------------------------------------------------------------------
 # Type conversions / inference support code

-cdef _string_box_factorize(parser_t *parser, int col,
-                           int line_start, int line_end,
+cdef _string_box_factorize(parser_t *parser, int64_t col,
+                           int64_t line_start, int64_t line_end,
                            bint na_filter, kh_str_t *na_hashset):
     cdef:
         int error, na_count = 0

@@ -1480,8 +1484,8 @@ cdef _string_box_factorize(parser_t *parser, int col,
     return result, na_count

-cdef _string_box_utf8(parser_t *parser, int col,
-                      int line_start, int line_end,
+cdef _string_box_utf8(parser_t *parser, int64_t col,
+                      int64_t line_start, int64_t line_end,
                       bint na_filter, kh_str_t *na_hashset):
     cdef:
         int error, na_count = 0

@@ -1533,8 +1537,8 @@ cdef _string_box_utf8(parser_t *parser, int col,
     return result, na_count

-cdef _string_box_decode(parser_t *parser, int col,
-                        int line_start, int line_end,
+cdef _string_box_decode(parser_t *parser, int64_t col,
+                        int64_t line_start, int64_t line_end,
                         bint na_filter, kh_str_t *na_hashset,
                         char *encoding):
     cdef:

@@ -1592,8 +1596,8 @@ cdef _string_box_decode(parser_t *parser, int col,

 @cython.boundscheck(False)
-cdef _categorical_convert(parser_t *parser, int col,
-                          int line_start, int line_end,
+cdef _categorical_convert(parser_t *parser, int64_t col,
+                          int64_t line_start, int64_t line_end,
                           bint na_filter, kh_str_t *na_hashset,
                           char *encoding):
     "Convert column data into codes, categories"

@@ -1663,8 +1667,8 @@ cdef _categorical_convert(parser_t *parser, int col,
     kh_destroy_str(table)
     return np.asarray(codes), result, na_count

-cdef _to_fw_string(parser_t *parser, int col, int line_start,
-                   int line_end, size_t width):
+cdef _to_fw_string(parser_t *parser, int64_t col, int64_t line_start,
+                   int64_t line_end, int64_t width):
     cdef:
         Py_ssize_t i
         coliter_t it

@@ -1680,11 +1684,11 @@ cdef _to_fw_string(parser_t *parser, int col, int line_start,
     return result

-cdef inline void _to_fw_string_nogil(parser_t *parser, int col,
-                                     int line_start, int line_end,
+cdef inline void _to_fw_string_nogil(parser_t *parser, int64_t col,
+                                     int64_t line_start, int64_t line_end,
                                      size_t width, char *data) nogil:
     cdef:
-        Py_ssize_t i
+        int64_t i
         coliter_t it
         const char *word = NULL

@@ -1699,7 +1703,8 @@ cdef char* cinf = b'inf'
 cdef char* cposinf = b'+inf'
 cdef char* cneginf = b'-inf'

-cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
+cdef _try_double(parser_t *parser, int64_t col,
+                 int64_t line_start, int64_t line_end,
                  bint na_filter, kh_str_t *na_hashset, object na_flist):
     cdef:
         int error, na_count = 0

@@ -1808,7 +1813,8 @@ cdef inline int _try_double_nogil(parser_t *parser,
     return 0

-cdef _try_uint64(parser_t *parser, int col, int line_start, int line_end,
+cdef _try_uint64(parser_t *parser, int64_t col,
+                 int64_t line_start, int64_t line_end,
                  bint na_filter, kh_str_t *na_hashset):
     cdef:
         int error

@@ -1842,8 +1848,9 @@ cdef _try_uint64(parser_t *parser, int col, int line_start, int line_end,
     return result

-cdef inline int _try_uint64_nogil(parser_t *parser, int col, int line_start,
-                                  int line_end, bint na_filter,
+cdef inline int _try_uint64_nogil(parser_t *parser, int64_t col,
+                                  int64_t line_start,
+                                  int64_t line_end, bint na_filter,
                                   const kh_str_t *na_hashset,
                                   uint64_t *data, uint_state *state) nogil:
     cdef:

@@ -1879,7 +1886,8 @@ cdef inline int _try_uint64_nogil(parser_t *parser, int col, int line_start,
     return 0

-cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,
+cdef _try_int64(parser_t *parser, int64_t col,
+                int64_t line_start, int64_t line_end,
                 bint na_filter, kh_str_t *na_hashset):
     cdef:
         int error, na_count = 0

@@ -1906,8 +1914,9 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,
     return result, na_count

-cdef inline int _try_int64_nogil(parser_t *parser, int col, int line_start,
-                                 int line_end, bint na_filter,
+cdef inline int _try_int64_nogil(parser_t *parser, int64_t col,
+                                 int64_t line_start,
+                                 int64_t line_end, bint na_filter,
                                  const kh_str_t *na_hashset, int64_t NA,
                                  int64_t *data, int *na_count) nogil:
     cdef:

@@ -1944,7 +1953,8 @@ cdef inline int _try_int64_nogil(parser_t *parser, int col, int line_start,
     return 0

-cdef _try_bool(parser_t *parser, int col, int line_start, int line_end,
+cdef _try_bool(parser_t *parser, int64_t col,
+               int64_t line_start, int64_t line_end,
                bint na_filter, kh_str_t *na_hashset):
     cdef:
         int na_count

@@ -1966,8 +1976,9 @@ cdef _try_bool(parser_t *parser, int col, int line_start, int line_end,
         return None, None
     return result.view(np.bool_), na_count

-cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start,
-                                int line_end, bint na_filter,
+cdef inline int _try_bool_nogil(parser_t *parser, int64_t col,
+                                int64_t line_start,
+                                int64_t line_end, bint na_filter,
                                 const kh_str_t *na_hashset, uint8_t NA,
                                 uint8_t *data, int *na_count) nogil:
     cdef:

@@ -2006,7 +2017,8 @@ cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start,
         data += 1
     return 0

-cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end,
+cdef _try_bool_flex(parser_t *parser, int64_t col,
+                    int64_t line_start, int64_t line_end,
                     bint na_filter, const kh_str_t *na_hashset,
                     const kh_str_t *true_hashset,
                     const kh_str_t *false_hashset):

@@ -2032,8 +2044,9 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end,
         return None, None
     return result.view(np.bool_), na_count

-cdef inline int _try_bool_flex_nogil(parser_t *parser, int col, int line_start,
-                                     int line_end, bint na_filter,
+cdef inline int _try_bool_flex_nogil(parser_t *parser, int64_t col,
+                                     int64_t line_start,
+                                     int64_t line_end, bint na_filter,
                                      const kh_str_t *na_hashset,
                                      const kh_str_t *true_hashset,
                                      const kh_str_t *false_hashset,

@@ -2251,8 +2264,8 @@ for k in list(na_values):
     na_values[np.dtype(k)] = na_values[k]


-cdef _apply_converter(object f, parser_t *parser, int col,
-                      int line_start, int line_end,
+cdef _apply_converter(object f, parser_t *parser, int64_t col,
+                      int64_t line_start, int64_t line_end,
                       char* c_encoding):
     cdef:
         int error

@@ -2296,7 +2309,7 @@ def _to_structured_array(dict columns, object names, object usecols):

         object name, fnames, field_type
         Py_ssize_t i, offset, nfields, length
-        int stride, elsize
+        int64_t stride, elsize
         char *buf

     if names is None:

@@ -2344,10 +2357,10 @@ def _to_structured_array(dict columns, object names, object usecols):
     return recs

-cdef _fill_structured_column(char *dst, char* src, int elsize,
-                             int stride, int length, bint incref):
+cdef _fill_structured_column(char *dst, char* src, int64_t elsize,
+                             int64_t stride, int64_t length, bint incref):
     cdef:
-        Py_ssize_t i
+        int64_t i

     if incref:
         util.transfer_object_column(dst, src, stride, length)
Reviewer comment: I wonder if we can make these descriptions less technical. Our less technical audience might not understand everything written here. For example, you could write: …
True, it doesn't capture the entire error description as before, but I hope it gets the general idea across. The ellipses mean I took your entire sentence from what follows "would cause".

Author reply: Sure. I was a bit hesitant to write these entries for this exact reason (I wasn't sure who the audience was and what the expectations were for entries). I've removed a lot of the detail while still giving the reader enough information to determine whether the bug they just saw is, in fact, one of the two listed.