From d5c75e8e8456c6e84bf6905eeedba92b1d369330 Mon Sep 17 00:00:00 2001
From: Jeff Knupp
Date: Wed, 19 Jul 2017 21:59:00 -0400
Subject: [PATCH 01/14] BUG: Use size_t to avoid array index overflow; add
 missing malloc of error_msg

Fix a few locations where a parser's `error_msg` buffer is written to
without having been previously allocated. This manifested as a double free
when exception-handling code later made use of the `error_msg`.

Additionally, use `size_t`/`ssize_t` where array indices or lengths will be
stored. Previously, `int32_t` was used and would overflow on columns holding
very large amounts of data (i.e. more than `INT_MAX` bytes).
---
 pandas/_libs/parsers.pyx            | 131 +++++++++++++-------------
 pandas/_libs/src/parser/tokenizer.c | 138 ++++++++++++++++------------
 pandas/_libs/src/parser/tokenizer.h |  36 ++++----
 3 files changed, 166 insertions(+), 139 deletions(-)

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 7375a2197c6b7..d6f87344bb28c 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -121,30 +121,30 @@ cdef extern from "parser/tokenizer.h":
         io_callback cb_io
         io_cleanup cb_cleanup

-        int chunksize       # Number of bytes to prepare for each chunk
+        size_t chunksize    # Number of bytes to prepare for each chunk
         char *data          # pointer to data to be processed
-        int datalen         # amount of data available
-        int datapos
+        size_t datalen      # amount of data available
+        size_t datapos

         # where to write out tokenized data
         char *stream
-        int stream_len
-        int stream_cap
+        size_t stream_len
+        size_t stream_cap

         # Store words in (potentially ragged) matrix for now, hmm
         char **words
-        int *word_starts    # where we are in the stream
-        int words_len
-        int words_cap
+        size_t *word_starts  # where we are in the stream
+        size_t words_len
+        size_t words_cap

         char *pword_start   # pointer to stream start of current field
-        int word_start      # position start of current field
+        size_t word_start   # position start of current field

-        int *line_start     # position in words for start of line
-        int *line_fields    # Number of fields in each line
-        int lines           # Number of lines observed
-        int file_lines      # Number of file lines observed (with bad/skipped)
-        int lines_cap       # Vector capacity
+        size_t *line_start   # position in words for start of line
+        size_t *line_fields  # Number of fields in each line
+        size_t lines         # Number of lines observed
+        size_t file_lines    # Number of file lines observed (with bad/skipped)
+        size_t lines_cap     # Vector capacity

         # Tokenizing stuff
         ParserState state
@@ -178,13 +178,13 @@ cdef extern from "parser/tokenizer.h":
         char thousands

         int header           # Boolean: 1: has header, 0: no header
-        int header_start     # header row start
-        int header_end       # header row end
+        ssize_t header_start  # header row start
+        ssize_t header_end    # header row end

         void *skipset
         PyObject *skipfunc
         int64_t skip_first_N_rows
-        int skipfooter
+        size_t skipfooter
         # pick one, depending on whether the converter requires GIL
         double (*double_converter_nogil)(const char *, char **,
                                          char, char, char, int) nogil
@@ -195,12 +195,12 @@ cdef extern from "parser/tokenizer.h":
         char *warn_msg
         char *error_msg

-        int skip_empty_lines
+        size_t skip_empty_lines

     ctypedef struct coliter_t:
         char **words
-        int *line_start
-        int col
+        size_t *line_start
+        size_t col

     ctypedef struct uint_state:
         int seen_sint
         int seen_uint
         int seen_null

     void uint_state_init(uint_state *self)
     int uint64_conflict(uint_state *self)

-    void coliter_setup(coliter_t *it, parser_t *parser, int i, int start) nogil
+    void 
coliter_setup(coliter_t *it, parser_t *parser, size_t i, size_t start) nogil void COLITER_NEXT(coliter_t, const char *) nogil parser_t* parser_new() @@ -289,14 +289,14 @@ cdef class TextReader: object true_values, false_values object handle bint na_filter, verbose, has_usecols, has_mi_columns - int parser_start + size_t parser_start list clocks char *c_encoding kh_str_t *false_set kh_str_t *true_set cdef public: - int leading_cols, table_width, skipfooter, buffer_lines + size_t leading_cols, table_width, skipfooter, buffer_lines object allow_leading_cols object delimiter, converters, delim_whitespace object na_values @@ -730,7 +730,8 @@ cdef class TextReader: Py_ssize_t i, start, field_count, passed_count, unnamed_count # noqa char *word object name - int status, hr, data_line + int status + size_t hr, data_line char *errors = "strict" cdef StringPath path = _string_path(self.c_encoding) @@ -949,8 +950,8 @@ cdef class TextReader: cdef _read_rows(self, rows, bint trim): cdef: - int buffered_lines - int irows, footer = 0 + size_t buffered_lines + size_t irows, footer = 0 self._start_clock() @@ -1018,12 +1019,13 @@ cdef class TextReader: def _convert_column_data(self, rows=None, upcast_na=False, footer=0): cdef: - Py_ssize_t i, nused + size_t i + int nused kh_str_t *na_hashset = NULL - int start, end + size_t start, end object name, na_flist, col_dtype = None bint na_filter = 0 - Py_ssize_t num_cols + size_t num_cols start = self.parser_start @@ -1036,7 +1038,7 @@ cdef class TextReader: # if footer > 0: # end -= footer - num_cols = -1 + num_cols = 0 for i in range(self.parser.lines): num_cols = (num_cols < self.parser.line_fields[i]) * \ self.parser.line_fields[i] + \ @@ -1195,7 +1197,7 @@ cdef class TextReader: return col_res, na_count cdef _convert_with_dtype(self, object dtype, Py_ssize_t i, - int start, int end, + size_t start, size_t end, bint na_filter, bint user_dtype, kh_str_t *na_hashset, @@ -1275,7 +1277,7 @@ cdef class TextReader: raise TypeError("the dtype %s is not " "supported for parsing" % dtype) - cdef _string_convert(self, Py_ssize_t i, int start, int end, + cdef _string_convert(self, Py_ssize_t i, size_t start, size_t end, bint na_filter, kh_str_t *na_hashset): cdef StringPath path = _string_path(self.c_encoding) @@ -1336,6 +1338,7 @@ cdef class TextReader: kh_destroy_str(table) cdef _get_column_name(self, Py_ssize_t i, Py_ssize_t nused): + cdef int j if self.has_usecols and self.names is not None: if (not callable(self.usecols) and len(self.names) == len(self.usecols)): @@ -1427,8 +1430,8 @@ cdef inline StringPath _string_path(char *encoding): # ---------------------------------------------------------------------- # Type conversions / inference support code -cdef _string_box_factorize(parser_t *parser, int col, - int line_start, int line_end, +cdef _string_box_factorize(parser_t *parser, size_t col, + size_t line_start, size_t line_end, bint na_filter, kh_str_t *na_hashset): cdef: int error, na_count = 0 @@ -1480,8 +1483,8 @@ cdef _string_box_factorize(parser_t *parser, int col, return result, na_count -cdef _string_box_utf8(parser_t *parser, int col, - int line_start, int line_end, +cdef _string_box_utf8(parser_t *parser, size_t col, + size_t line_start, size_t line_end, bint na_filter, kh_str_t *na_hashset): cdef: int error, na_count = 0 @@ -1533,8 +1536,8 @@ cdef _string_box_utf8(parser_t *parser, int col, return result, na_count -cdef _string_box_decode(parser_t *parser, int col, - int line_start, int line_end, +cdef _string_box_decode(parser_t *parser, size_t col, + 
size_t line_start, size_t line_end, bint na_filter, kh_str_t *na_hashset, char *encoding): cdef: @@ -1592,8 +1595,8 @@ cdef _string_box_decode(parser_t *parser, int col, @cython.boundscheck(False) -cdef _categorical_convert(parser_t *parser, int col, - int line_start, int line_end, +cdef _categorical_convert(parser_t *parser, size_t col, + size_t line_start, size_t line_end, bint na_filter, kh_str_t *na_hashset, char *encoding): "Convert column data into codes, categories" @@ -1663,8 +1666,8 @@ cdef _categorical_convert(parser_t *parser, int col, kh_destroy_str(table) return np.asarray(codes), result, na_count -cdef _to_fw_string(parser_t *parser, int col, int line_start, - int line_end, size_t width): +cdef _to_fw_string(parser_t *parser, size_t col, size_t line_start, + size_t line_end, size_t width): cdef: Py_ssize_t i coliter_t it @@ -1680,11 +1683,11 @@ cdef _to_fw_string(parser_t *parser, int col, int line_start, return result -cdef inline void _to_fw_string_nogil(parser_t *parser, int col, - int line_start, int line_end, +cdef inline void _to_fw_string_nogil(parser_t *parser, size_t col, + size_t line_start, size_t line_end, size_t width, char *data) nogil: cdef: - Py_ssize_t i + size_t i coliter_t it const char *word = NULL @@ -1699,7 +1702,7 @@ cdef char* cinf = b'inf' cdef char* cposinf = b'+inf' cdef char* cneginf = b'-inf' -cdef _try_double(parser_t *parser, int col, int line_start, int line_end, +cdef _try_double(parser_t *parser, size_t col, size_t line_start, size_t line_end, bint na_filter, kh_str_t *na_hashset, object na_flist): cdef: int error, na_count = 0 @@ -1808,7 +1811,7 @@ cdef inline int _try_double_nogil(parser_t *parser, return 0 -cdef _try_uint64(parser_t *parser, int col, int line_start, int line_end, +cdef _try_uint64(parser_t *parser, size_t col, size_t line_start, size_t line_end, bint na_filter, kh_str_t *na_hashset): cdef: int error @@ -1842,8 +1845,8 @@ cdef _try_uint64(parser_t *parser, int col, int line_start, int line_end, return result -cdef inline int _try_uint64_nogil(parser_t *parser, int col, int line_start, - int line_end, bint na_filter, +cdef inline int _try_uint64_nogil(parser_t *parser, size_t col, size_t line_start, + size_t line_end, bint na_filter, const kh_str_t *na_hashset, uint64_t *data, uint_state *state) nogil: cdef: @@ -1879,7 +1882,7 @@ cdef inline int _try_uint64_nogil(parser_t *parser, int col, int line_start, return 0 -cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, +cdef _try_int64(parser_t *parser, size_t col, size_t line_start, size_t line_end, bint na_filter, kh_str_t *na_hashset): cdef: int error, na_count = 0 @@ -1906,8 +1909,8 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, return result, na_count -cdef inline int _try_int64_nogil(parser_t *parser, int col, int line_start, - int line_end, bint na_filter, +cdef inline int _try_int64_nogil(parser_t *parser, size_t col, size_t line_start, + size_t line_end, bint na_filter, const kh_str_t *na_hashset, int64_t NA, int64_t *data, int *na_count) nogil: cdef: @@ -1944,7 +1947,7 @@ cdef inline int _try_int64_nogil(parser_t *parser, int col, int line_start, return 0 -cdef _try_bool(parser_t *parser, int col, int line_start, int line_end, +cdef _try_bool(parser_t *parser, size_t col, size_t line_start, size_t line_end, bint na_filter, kh_str_t *na_hashset): cdef: int na_count @@ -1966,8 +1969,8 @@ cdef _try_bool(parser_t *parser, int col, int line_start, int line_end, return None, None return result.view(np.bool_), na_count -cdef 
inline int _try_bool_nogil(parser_t *parser, int col, int line_start, - int line_end, bint na_filter, +cdef inline int _try_bool_nogil(parser_t *parser, size_t col, size_t line_start, + size_t line_end, bint na_filter, const kh_str_t *na_hashset, uint8_t NA, uint8_t *data, int *na_count) nogil: cdef: @@ -2006,7 +2009,7 @@ cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start, data += 1 return 0 -cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end, +cdef _try_bool_flex(parser_t *parser, size_t col, size_t line_start, size_t line_end, bint na_filter, const kh_str_t *na_hashset, const kh_str_t *true_hashset, const kh_str_t *false_hashset): @@ -2032,8 +2035,8 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end, return None, None return result.view(np.bool_), na_count -cdef inline int _try_bool_flex_nogil(parser_t *parser, int col, int line_start, - int line_end, bint na_filter, +cdef inline int _try_bool_flex_nogil(parser_t *parser, size_t col, size_t line_start, + size_t line_end, bint na_filter, const kh_str_t *na_hashset, const kh_str_t *true_hashset, const kh_str_t *false_hashset, @@ -2251,8 +2254,8 @@ for k in list(na_values): na_values[np.dtype(k)] = na_values[k] -cdef _apply_converter(object f, parser_t *parser, int col, - int line_start, int line_end, +cdef _apply_converter(object f, parser_t *parser, size_t col, + size_t line_start, size_t line_end, char* c_encoding): cdef: int error @@ -2296,7 +2299,7 @@ def _to_structured_array(dict columns, object names, object usecols): object name, fnames, field_type Py_ssize_t i, offset, nfields, length - int stride, elsize + size_t stride, elsize char *buf if names is None: @@ -2344,10 +2347,10 @@ def _to_structured_array(dict columns, object names, object usecols): return recs -cdef _fill_structured_column(char *dst, char* src, int elsize, - int stride, int length, bint incref): +cdef _fill_structured_column(char *dst, char* src, size_t elsize, + size_t stride, size_t length, bint incref): cdef: - Py_ssize_t i + size_t i if incref: util.transfer_object_column(dst, src, stride, length) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index be23ebb023383..186babc2a4720 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -69,14 +69,18 @@ static void free_if_not_null(void **ptr) { */ -static void *grow_buffer(void *buffer, int length, int *capacity, int space, - int elsize, int *error) { - int cap = *capacity; +static void *grow_buffer(void *buffer, size_t length, size_t *capacity, + size_t space, size_t elsize, int *error) { + size_t cap = *capacity; void *newbuffer = buffer; // Can we fit potentially nbytes tokens (+ null terminators) in the stream? while ((length + space >= cap) && (newbuffer != NULL)) { - cap = cap ? cap << 1 : 2; + if (cap < 1024 * 1024 * 1024) { + cap = cap ? cap << 1 : 2; + } else { + cap *= 2; + } buffer = newbuffer; newbuffer = safe_realloc(newbuffer, elsize * cap); } @@ -169,7 +173,7 @@ int parser_cleanup(parser_t *self) { } int parser_init(parser_t *self) { - int sz; + size_t sz; /* Initialize data buffers @@ -196,14 +200,14 @@ int parser_init(parser_t *self) { sz = STREAM_INIT_SIZE / 10; sz = sz ? 
sz : 1; self->words = (char **)malloc(sz * sizeof(char *)); - self->word_starts = (int *)malloc(sz * sizeof(int)); + self->word_starts = (size_t *)malloc(sz * sizeof(size_t)); self->words_cap = sz; self->words_len = 0; // line pointers and metadata - self->line_start = (int *)malloc(sz * sizeof(int)); + self->line_start = (size_t *)malloc(sz * sizeof(size_t)); - self->line_fields = (int *)malloc(sz * sizeof(int)); + self->line_fields = (size_t *)malloc(sz * sizeof(size_t)); self->lines_cap = sz; self->lines = 0; @@ -247,7 +251,8 @@ void parser_del(parser_t *self) { } static int make_stream_space(parser_t *self, size_t nbytes) { - int i, status, cap; + size_t i, cap; + int status; void *orig_ptr, *newptr; // Can we fit potentially nbytes tokens (+ null terminators) in the stream? @@ -304,11 +309,11 @@ static int make_stream_space(parser_t *self, size_t nbytes) { "self->words_cap=%d\n", nbytes, self->words_cap)) newptr = safe_realloc((void *)self->word_starts, - sizeof(int) * self->words_cap); + sizeof(size_t) * self->words_cap); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->word_starts = (int *)newptr; + self->word_starts = (size_t *)newptr; } } @@ -317,8 +322,8 @@ static int make_stream_space(parser_t *self, size_t nbytes) { */ cap = self->lines_cap; self->line_start = - (int *)grow_buffer((void *)self->line_start, self->lines + 1, - &self->lines_cap, nbytes, sizeof(int), &status); + (size_t *)grow_buffer((void *)self->line_start, self->lines + 1, + &self->lines_cap, nbytes, sizeof(size_t), &status); TRACE(( "make_stream_space: grow_buffer(self->line_start, %zu, %zu, %zu, %d)\n", self->lines + 1, self->lines_cap, nbytes, status)) @@ -331,11 +336,11 @@ static int make_stream_space(parser_t *self, size_t nbytes) { TRACE(("make_stream_space: cap != self->lines_cap, nbytes = %d\n", nbytes)) newptr = safe_realloc((void *)self->line_fields, - sizeof(int) * self->lines_cap); + sizeof(size_t) * self->lines_cap); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->line_fields = (int *)newptr; + self->line_fields = (size_t *)newptr; } } @@ -350,7 +355,7 @@ static int push_char(parser_t *self, char c) { ("push_char: ERROR!!! self->stream_len(%d) >= " "self->stream_cap(%d)\n", self->stream_len, self->stream_cap)) - int bufsize = 100; + size_t bufsize = 100; self->error_msg = (char *)malloc(bufsize); snprintf(self->error_msg, bufsize, "Buffer overflow caught - possible malformed input file.\n"); @@ -367,7 +372,7 @@ int P_INLINE end_field(parser_t *self) { ("end_field: ERROR!!! 
self->words_len(%zu) >= " "self->words_cap(%zu)\n", self->words_len, self->words_cap)) - int bufsize = 100; + size_t bufsize = 100; self->error_msg = (char *)malloc(bufsize); snprintf(self->error_msg, bufsize, "Buffer overflow caught - possible malformed input file.\n"); @@ -399,8 +404,8 @@ int P_INLINE end_field(parser_t *self) { } static void append_warning(parser_t *self, const char *msg) { - int ex_length; - int length = strlen(msg); + size_t ex_length; + size_t length = strlen(msg); void *newptr; if (self->warn_msg == NULL) { @@ -420,12 +425,13 @@ static int end_line(parser_t *self) { char *msg; int fields; int ex_fields = self->expected_fields; - int bufsize = 100; // for error or warning messages + size_t bufsize = 100; // for error or warning messages fields = self->line_fields[self->lines]; TRACE(("end_line: Line end, nfields: %d\n", fields)); + TRACE(("end_line: lines: %d\n", self->lines)); if (self->lines > 0) { if (self->expected_fields >= 0) { ex_fields = self->expected_fields; @@ -433,6 +439,7 @@ static int end_line(parser_t *self) { ex_fields = self->line_fields[self->lines - 1]; } } + TRACE(("end_line: ex_fields: %d\n", ex_fields)); if (self->state == START_FIELD_IN_SKIP_LINE || self->state == IN_FIELD_IN_SKIP_LINE || @@ -450,7 +457,7 @@ static int end_line(parser_t *self) { return 0; } - if (!(self->lines <= self->header_end + 1) && + if (!(self->lines <= (unsigned long) self->header_end + 1) && (self->expected_fields < 0 && fields > ex_fields) && !(self->usecols)) { // increment file line count self->file_lines++; @@ -465,7 +472,7 @@ static int end_line(parser_t *self) { if (self->error_bad_lines) { self->error_msg = (char *)malloc(bufsize); snprintf(self->error_msg, bufsize, - "Expected %d fields in line %d, saw %d\n", + "Expected %d fields in line %zu, saw %d\n", ex_fields, self->file_lines, fields); TRACE(("Error at line %d, %d fields\n", self->file_lines, fields)); @@ -477,7 +484,7 @@ static int end_line(parser_t *self) { // pass up error message msg = (char *)malloc(bufsize); snprintf(msg, bufsize, - "Skipping line %d: expected %d fields, saw %d\n", + "Skipping line %zu: expected %d fields, saw %d\n", self->file_lines, ex_fields, fields); append_warning(self, msg); free(msg); @@ -485,10 +492,13 @@ static int end_line(parser_t *self) { } } else { // missing trailing delimiters - if ((self->lines >= self->header_end + 1) && fields < ex_fields) { + if ((self->lines >= (unsigned long) self->header_end + 1) && + fields < ex_fields) { // might overrun the buffer when closing fields if (make_stream_space(self, ex_fields - fields) < 0) { - self->error_msg = "out of memory"; + size_t bufsize = 100; + self->error_msg = (char *)malloc(bufsize); + snprintf(self->error_msg, bufsize, "out of memory"); return -1; } @@ -507,7 +517,7 @@ static int end_line(parser_t *self) { TRACE(( "end_line: ERROR!!! self->lines(%zu) >= self->lines_cap(%zu)\n", self->lines, self->lines_cap)) - int bufsize = 100; + size_t bufsize = 100; self->error_msg = (char *)malloc(bufsize); snprintf(self->error_msg, bufsize, "Buffer overflow caught - " @@ -568,7 +578,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { self->datalen = bytes_read; if (status != REACHED_EOF && self->data == NULL) { - int bufsize = 200; + size_t bufsize = 200; self->error_msg = (char *)malloc(bufsize); if (status == CALLING_READ_FAILED) { @@ -599,7 +609,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { if (slen >= self->stream_cap) { \ TRACE(("PUSH_CHAR: ERROR!!! 
slen(%d) >= stream_cap(%d)\n", slen, \ self->stream_cap)) \ - int bufsize = 100; \ + size_t bufsize = 100; \ self->error_msg = (char *)malloc(bufsize); \ snprintf(self->error_msg, bufsize, \ "Buffer overflow caught - possible malformed input file.\n");\ @@ -626,7 +636,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { stream = self->stream + self->stream_len; \ slen = self->stream_len; \ self->state = STATE; \ - if (line_limit > 0 && self->lines == start_lines + (int)line_limit) { \ + if (line_limit > 0 && self->lines == start_lines + (size_t)line_limit) { \ goto linelimit; \ } @@ -641,7 +651,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { stream = self->stream + self->stream_len; \ slen = self->stream_len; \ self->state = STATE; \ - if (line_limit > 0 && self->lines == start_lines + (int)line_limit) { \ + if (line_limit > 0 && self->lines == start_lines + (size_t)line_limit) { \ goto linelimit; \ } @@ -712,15 +722,17 @@ int skip_this_line(parser_t *self, int64_t rownum) { } } -int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) { - int i, slen; +int tokenize_bytes(parser_t *self, size_t line_limit, size_t start_lines) { + size_t i, slen; int should_skip; char c; char *stream; char *buf = self->data + self->datapos; if (make_stream_space(self, self->datalen - self->datapos) < 0) { - self->error_msg = "out of memory"; + size_t bufsize = 100; + self->error_msg = (char *)malloc(bufsize); + snprintf(self->error_msg, bufsize, "out of memory"); return -1; } @@ -1025,7 +1037,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) { PUSH_CHAR(c); self->state = IN_FIELD; } else { - int bufsize = 100; + size_t bufsize = 100; self->error_msg = (char *)malloc(bufsize); snprintf(self->error_msg, bufsize, "delimiter expected after quote in quote"); @@ -1079,7 +1091,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) { --i; buf--; // let's try this character again (HACK!) 
if (line_limit > 0 && - self->lines == start_lines + (int)line_limit) { + self->lines == start_lines + line_limit) { goto linelimit; } } @@ -1121,7 +1133,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) { } static int parser_handle_eof(parser_t *self) { - int bufsize = 100; + size_t bufsize = 100; TRACE( ("handling eof, datalen: %d, pstate: %d\n", self->datalen, self->state)) @@ -1139,7 +1151,7 @@ static int parser_handle_eof(parser_t *self) { case IN_QUOTED_FIELD: self->error_msg = (char *)malloc(bufsize); snprintf(self->error_msg, bufsize, - "EOF inside string starting at line %d", self->file_lines); + "EOF inside string starting at line %zu", self->file_lines); return -1; case ESCAPED_CHAR: @@ -1165,9 +1177,9 @@ static int parser_handle_eof(parser_t *self) { } int parser_consume_rows(parser_t *self, size_t nrows) { - int i, offset, word_deletions, char_count; + size_t i, offset, word_deletions, char_count; - if ((int)nrows > self->lines) { + if (nrows > self->lines) { nrows = self->lines; } @@ -1204,7 +1216,7 @@ int parser_consume_rows(parser_t *self, size_t nrows) { self->word_start -= char_count; /* move line metadata */ - for (i = 0; i < self->lines - (int)nrows + 1; ++i) { + for (i = 0; i < self->lines - nrows + 1; ++i) { offset = i + nrows; self->line_start[i] = self->line_start[offset] - word_deletions; self->line_fields[i] = self->line_fields[offset]; @@ -1227,11 +1239,11 @@ int parser_trim_buffers(parser_t *self) { size_t new_cap; void *newptr; - int i; + size_t i; /* trim words, word_starts */ new_cap = _next_pow2(self->words_len) + 1; - if ((int)new_cap < self->words_cap) { + if (new_cap < self->words_cap) { TRACE(("parser_trim_buffers: new_cap < self->words_cap\n")); newptr = safe_realloc((void *)self->words, new_cap * sizeof(char *)); if (newptr == NULL) { @@ -1239,22 +1251,28 @@ int parser_trim_buffers(parser_t *self) { } else { self->words = (char **)newptr; } - newptr = safe_realloc((void *)self->word_starts, new_cap * sizeof(int)); + newptr = safe_realloc((void *)self->word_starts, + new_cap * sizeof(size_t)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->word_starts = (int *)newptr; + self->word_starts = (size_t *)newptr; self->words_cap = new_cap; } } /* trim stream */ - new_cap = _next_pow2(self->stream_len) + 1; + if (new_cap < INT32_MAX) { + new_cap = _next_pow2(self->stream_len) + 1; + } else { + new_cap *= 2; + } + TRACE( ("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = " "%zu\n", new_cap, self->stream_cap, self->lines_cap)); - if ((int)new_cap < self->stream_cap) { + if (new_cap < self->stream_cap) { TRACE( ("parser_trim_buffers: new_cap < self->stream_cap, calling " "safe_realloc\n")); @@ -1281,20 +1299,26 @@ int parser_trim_buffers(parser_t *self) { } /* trim line_start, line_fields */ - new_cap = _next_pow2(self->lines) + 1; - if ((int)new_cap < self->lines_cap) { + if (new_cap < 1024 * 1024 * 1024) { + new_cap = _next_pow2(self->lines) + 1; + } else { + new_cap *= 2; + } + if (new_cap < self->lines_cap) { TRACE(("parser_trim_buffers: new_cap < self->lines_cap\n")); - newptr = safe_realloc((void *)self->line_start, new_cap * sizeof(int)); + newptr = safe_realloc((void *)self->line_start, + new_cap * sizeof(size_t)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->line_start = (int *)newptr; + self->line_start = (size_t *)newptr; } - newptr = safe_realloc((void *)self->line_fields, new_cap * sizeof(int)); + newptr = safe_realloc((void *)self->line_fields, + new_cap * 
sizeof(size_t)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->line_fields = (int *)newptr; + self->line_fields = (size_t *)newptr; self->lines_cap = new_cap; } } @@ -1303,11 +1327,11 @@ int parser_trim_buffers(parser_t *self) { } void debug_print_parser(parser_t *self) { - int j, line; + size_t j, line; char *token; for (line = 0; line < self->lines; ++line) { - printf("(Parsed) Line %d: ", line); + printf("(Parsed) Line %zu: ", line); for (j = 0; j < self->line_fields[j]; ++j) { token = self->words[j + self->line_start[line]]; @@ -1324,7 +1348,7 @@ void debug_print_parser(parser_t *self) { int _tokenize_helper(parser_t *self, size_t nrows, int all) { int status = 0; - int start_lines = self->lines; + size_t start_lines = self->lines; if (self->state == FINISHED) { return 0; @@ -1332,10 +1356,10 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) { TRACE(( "_tokenize_helper: Asked to tokenize %d rows, datapos=%d, datalen=%d\n", - (int)nrows, self->datapos, self->datalen)); + nrows, self->datapos, self->datalen)); while (1) { - if (!all && self->lines - start_lines >= (int)nrows) break; + if (!all && self->lines - start_lines >= nrows) break; if (self->datapos == self->datalen) { status = parser_buffer_bytes(self, self->chunksize); diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index b4344e8a6c070..a1341b37952eb 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -137,30 +137,30 @@ typedef struct parser_t { io_callback cb_io; io_cleanup cb_cleanup; - int chunksize; // Number of bytes to prepare for each chunk + size_t chunksize; // Number of bytes to prepare for each chunk char *data; // pointer to data to be processed - int datalen; // amount of data available - int datapos; + size_t datalen; // amount of data available + size_t datapos; // where to write out tokenized data char *stream; - int stream_len; - int stream_cap; + size_t stream_len; + size_t stream_cap; // Store words in (potentially ragged) matrix for now, hmm char **words; - int *word_starts; // where we are in the stream - int words_len; - int words_cap; + size_t *word_starts; // where we are in the stream + size_t words_len; + size_t words_cap; char *pword_start; // pointer to stream start of current field - int word_start; // position start of current field + size_t word_start; // position start of current field - int *line_start; // position in words for start of line - int *line_fields; // Number of fields in each line - int lines; // Number of (good) lines observed - int file_lines; // Number of file lines observed (including bad or skipped) - int lines_cap; // Vector capacity + size_t *line_start; // position in words for start of line + size_t *line_fields; // Number of fields in each line + size_t lines; // Number of (good) lines observed + size_t file_lines; // Number of lines observed (including bad or skipped) + size_t lines_cap; // Vector capacity // Tokenizing stuff ParserState state; @@ -194,8 +194,8 @@ typedef struct parser_t { char thousands; int header; // Boolean: 1: has header, 0: no header - int header_start; // header row start - int header_end; // header row end + ssize_t header_start; // header row start + ssize_t header_end; // header row end void *skipset; PyObject *skipfunc; @@ -216,7 +216,7 @@ typedef struct parser_t { typedef struct coliter_t { char **words; - int *line_start; + size_t *line_start; int col; } coliter_t; @@ -225,7 +225,7 @@ coliter_t *coliter_new(parser_t *self, int 
i); #define COLITER_NEXT(iter, word) \ do { \ - const int i = *iter.line_start++ + iter.col; \ + const size_t i = *iter.line_start++ + iter.col; \ word = i < *iter.line_start ? iter.words[i] : ""; \ } while (0) From e04d12a9735707927edebf20501c1176ef5a2859 Mon Sep 17 00:00:00 2001 From: Jeff Knupp Date: Thu, 20 Jul 2017 18:21:17 -0400 Subject: [PATCH 02/14] Switch to use int64_t rather than size_t due to portability concerns. --- pandas/_libs/parsers.pyx | 134 ++++++++++++++-------------- pandas/_libs/src/parser/tokenizer.h | 36 ++++---- 2 files changed, 85 insertions(+), 85 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index d6f87344bb28c..88c695a3faf27 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -121,30 +121,30 @@ cdef extern from "parser/tokenizer.h": io_callback cb_io io_cleanup cb_cleanup - size_t chunksize # Number of bytes to prepare for each chunk - char *data # pointer to data to be processed - size_t datalen # amount of data available - size_t datapos + int64_t chunksize # Number of bytes to prepare for each chunk + char *data # pointer to data to be processed + int64_t datalen # amount of data available + int64_t datapos # where to write out tokenized data char *stream - size_t stream_len - size_t stream_cap + int64_t stream_len + int64_t stream_cap # Store words in (potentially ragged) matrix for now, hmm char **words - size_t *word_starts # where we are in the stream - size_t words_len - size_t words_cap + int64_t *word_starts # where we are in the stream + int64_t words_len + int64_t words_cap - char *pword_start # pointer to stream start of current field - size_t word_start # position start of current field + char *pword_start # pointer to stream start of current field + int64_t word_start # position start of current field - size_t *line_start # position in words for start of line - size_t *line_fields # Number of fields in each line - size_t lines # Number of lines observed - size_t file_lines # Number of file lines observed (with bad/skipped) - size_t lines_cap # Vector capacity + int64_t *line_start # position in words for start of line + int64_t *line_fields # Number of fields in each line + int64_t lines # Number of lines observed + int64_t file_lines # Number of file lines observed (with bad/skipped) + int64_t lines_cap # Vector capacity # Tokenizing stuff ParserState state @@ -178,13 +178,13 @@ cdef extern from "parser/tokenizer.h": char thousands int header # Boolean: 1: has header, 0: no header - ssize_t header_start # header row start - ssize_t header_end # header row end + int64_t header_start # header row start + int64_t header_end # header row end void *skipset PyObject *skipfunc int64_t skip_first_N_rows - size_t skipfooter + int64_t skipfooter # pick one, depending on whether the converter requires GIL double (*double_converter_nogil)(const char *, char **, char, char, char, int) nogil @@ -195,12 +195,12 @@ cdef extern from "parser/tokenizer.h": char *warn_msg char *error_msg - size_t skip_empty_lines + int64_t skip_empty_lines ctypedef struct coliter_t: char **words - size_t *line_start - size_t col + int64_t *line_start + int64_t col ctypedef struct uint_state: int seen_sint @@ -210,7 +210,7 @@ cdef extern from "parser/tokenizer.h": void uint_state_init(uint_state *self) int uint64_conflict(uint_state *self) - void coliter_setup(coliter_t *it, parser_t *parser, size_t i, size_t start) nogil + void coliter_setup(coliter_t *it, parser_t *parser, int64_t i, int64_t start) nogil void 
COLITER_NEXT(coliter_t, const char *) nogil parser_t* parser_new() @@ -289,14 +289,14 @@ cdef class TextReader: object true_values, false_values object handle bint na_filter, verbose, has_usecols, has_mi_columns - size_t parser_start + int64_t parser_start list clocks char *c_encoding kh_str_t *false_set kh_str_t *true_set cdef public: - size_t leading_cols, table_width, skipfooter, buffer_lines + int64_t leading_cols, table_width, skipfooter, buffer_lines object allow_leading_cols object delimiter, converters, delim_whitespace object na_values @@ -731,7 +731,7 @@ cdef class TextReader: char *word object name int status - size_t hr, data_line + int64_t hr, data_line char *errors = "strict" cdef StringPath path = _string_path(self.c_encoding) @@ -950,8 +950,8 @@ cdef class TextReader: cdef _read_rows(self, rows, bint trim): cdef: - size_t buffered_lines - size_t irows, footer = 0 + int64_t buffered_lines + int64_t irows, footer = 0 self._start_clock() @@ -1019,13 +1019,13 @@ cdef class TextReader: def _convert_column_data(self, rows=None, upcast_na=False, footer=0): cdef: - size_t i + int64_t i int nused kh_str_t *na_hashset = NULL - size_t start, end + int64_t start, end object name, na_flist, col_dtype = None bint na_filter = 0 - size_t num_cols + int64_t num_cols start = self.parser_start @@ -1038,7 +1038,7 @@ cdef class TextReader: # if footer > 0: # end -= footer - num_cols = 0 + num_cols = -1 for i in range(self.parser.lines): num_cols = (num_cols < self.parser.line_fields[i]) * \ self.parser.line_fields[i] + \ @@ -1197,7 +1197,7 @@ cdef class TextReader: return col_res, na_count cdef _convert_with_dtype(self, object dtype, Py_ssize_t i, - size_t start, size_t end, + int64_t start, int64_t end, bint na_filter, bint user_dtype, kh_str_t *na_hashset, @@ -1277,7 +1277,7 @@ cdef class TextReader: raise TypeError("the dtype %s is not " "supported for parsing" % dtype) - cdef _string_convert(self, Py_ssize_t i, size_t start, size_t end, + cdef _string_convert(self, Py_ssize_t i, int64_t start, int64_t end, bint na_filter, kh_str_t *na_hashset): cdef StringPath path = _string_path(self.c_encoding) @@ -1338,7 +1338,7 @@ cdef class TextReader: kh_destroy_str(table) cdef _get_column_name(self, Py_ssize_t i, Py_ssize_t nused): - cdef int j + cdef int64_t j if self.has_usecols and self.names is not None: if (not callable(self.usecols) and len(self.names) == len(self.usecols)): @@ -1430,8 +1430,8 @@ cdef inline StringPath _string_path(char *encoding): # ---------------------------------------------------------------------- # Type conversions / inference support code -cdef _string_box_factorize(parser_t *parser, size_t col, - size_t line_start, size_t line_end, +cdef _string_box_factorize(parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset): cdef: int error, na_count = 0 @@ -1483,8 +1483,8 @@ cdef _string_box_factorize(parser_t *parser, size_t col, return result, na_count -cdef _string_box_utf8(parser_t *parser, size_t col, - size_t line_start, size_t line_end, +cdef _string_box_utf8(parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset): cdef: int error, na_count = 0 @@ -1536,8 +1536,8 @@ cdef _string_box_utf8(parser_t *parser, size_t col, return result, na_count -cdef _string_box_decode(parser_t *parser, size_t col, - size_t line_start, size_t line_end, +cdef _string_box_decode(parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset, 
char *encoding): cdef: @@ -1595,8 +1595,8 @@ cdef _string_box_decode(parser_t *parser, size_t col, @cython.boundscheck(False) -cdef _categorical_convert(parser_t *parser, size_t col, - size_t line_start, size_t line_end, +cdef _categorical_convert(parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset, char *encoding): "Convert column data into codes, categories" @@ -1666,8 +1666,8 @@ cdef _categorical_convert(parser_t *parser, size_t col, kh_destroy_str(table) return np.asarray(codes), result, na_count -cdef _to_fw_string(parser_t *parser, size_t col, size_t line_start, - size_t line_end, size_t width): +cdef _to_fw_string(parser_t *parser, int64_t col, int64_t line_start, + int64_t line_end, int64_t width): cdef: Py_ssize_t i coliter_t it @@ -1683,11 +1683,11 @@ cdef _to_fw_string(parser_t *parser, size_t col, size_t line_start, return result -cdef inline void _to_fw_string_nogil(parser_t *parser, size_t col, - size_t line_start, size_t line_end, +cdef inline void _to_fw_string_nogil(parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, size_t width, char *data) nogil: cdef: - size_t i + int64_t i coliter_t it const char *word = NULL @@ -1702,7 +1702,7 @@ cdef char* cinf = b'inf' cdef char* cposinf = b'+inf' cdef char* cneginf = b'-inf' -cdef _try_double(parser_t *parser, size_t col, size_t line_start, size_t line_end, +cdef _try_double(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset, object na_flist): cdef: int error, na_count = 0 @@ -1811,7 +1811,7 @@ cdef inline int _try_double_nogil(parser_t *parser, return 0 -cdef _try_uint64(parser_t *parser, size_t col, size_t line_start, size_t line_end, +cdef _try_uint64(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset): cdef: int error @@ -1845,8 +1845,8 @@ cdef _try_uint64(parser_t *parser, size_t col, size_t line_start, size_t line_en return result -cdef inline int _try_uint64_nogil(parser_t *parser, size_t col, size_t line_start, - size_t line_end, bint na_filter, +cdef inline int _try_uint64_nogil(parser_t *parser, int64_t col, int64_t line_start, + int64_t line_end, bint na_filter, const kh_str_t *na_hashset, uint64_t *data, uint_state *state) nogil: cdef: @@ -1882,7 +1882,7 @@ cdef inline int _try_uint64_nogil(parser_t *parser, size_t col, size_t line_star return 0 -cdef _try_int64(parser_t *parser, size_t col, size_t line_start, size_t line_end, +cdef _try_int64(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset): cdef: int error, na_count = 0 @@ -1909,8 +1909,8 @@ cdef _try_int64(parser_t *parser, size_t col, size_t line_start, size_t line_end return result, na_count -cdef inline int _try_int64_nogil(parser_t *parser, size_t col, size_t line_start, - size_t line_end, bint na_filter, +cdef inline int _try_int64_nogil(parser_t *parser, int64_t col, int64_t line_start, + int64_t line_end, bint na_filter, const kh_str_t *na_hashset, int64_t NA, int64_t *data, int *na_count) nogil: cdef: @@ -1947,7 +1947,7 @@ cdef inline int _try_int64_nogil(parser_t *parser, size_t col, size_t line_start return 0 -cdef _try_bool(parser_t *parser, size_t col, size_t line_start, size_t line_end, +cdef _try_bool(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset): cdef: int na_count @@ -1969,8 +1969,8 @@ cdef _try_bool(parser_t *parser, size_t col, size_t line_start, 
size_t line_end, return None, None return result.view(np.bool_), na_count -cdef inline int _try_bool_nogil(parser_t *parser, size_t col, size_t line_start, - size_t line_end, bint na_filter, +cdef inline int _try_bool_nogil(parser_t *parser, int64_t col, int64_t line_start, + int64_t line_end, bint na_filter, const kh_str_t *na_hashset, uint8_t NA, uint8_t *data, int *na_count) nogil: cdef: @@ -2009,7 +2009,7 @@ cdef inline int _try_bool_nogil(parser_t *parser, size_t col, size_t line_start, data += 1 return 0 -cdef _try_bool_flex(parser_t *parser, size_t col, size_t line_start, size_t line_end, +cdef _try_bool_flex(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, const kh_str_t *na_hashset, const kh_str_t *true_hashset, const kh_str_t *false_hashset): @@ -2035,8 +2035,8 @@ cdef _try_bool_flex(parser_t *parser, size_t col, size_t line_start, size_t line return None, None return result.view(np.bool_), na_count -cdef inline int _try_bool_flex_nogil(parser_t *parser, size_t col, size_t line_start, - size_t line_end, bint na_filter, +cdef inline int _try_bool_flex_nogil(parser_t *parser, int64_t col, int64_t line_start, + int64_t line_end, bint na_filter, const kh_str_t *na_hashset, const kh_str_t *true_hashset, const kh_str_t *false_hashset, @@ -2254,8 +2254,8 @@ for k in list(na_values): na_values[np.dtype(k)] = na_values[k] -cdef _apply_converter(object f, parser_t *parser, size_t col, - size_t line_start, size_t line_end, +cdef _apply_converter(object f, parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, char* c_encoding): cdef: int error @@ -2299,7 +2299,7 @@ def _to_structured_array(dict columns, object names, object usecols): object name, fnames, field_type Py_ssize_t i, offset, nfields, length - size_t stride, elsize + int64_t stride, elsize char *buf if names is None: @@ -2347,10 +2347,10 @@ def _to_structured_array(dict columns, object names, object usecols): return recs -cdef _fill_structured_column(char *dst, char* src, size_t elsize, - size_t stride, size_t length, bint incref): +cdef _fill_structured_column(char *dst, char* src, int64_t elsize, + int64_t stride, int64_t length, bint incref): cdef: - size_t i + int64_t i if incref: util.transfer_object_column(dst, src, stride, length) diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index a1341b37952eb..f293baa3cda12 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -137,30 +137,30 @@ typedef struct parser_t { io_callback cb_io; io_cleanup cb_cleanup; - size_t chunksize; // Number of bytes to prepare for each chunk + int64_t chunksize; // Number of bytes to prepare for each chunk char *data; // pointer to data to be processed - size_t datalen; // amount of data available - size_t datapos; + int64_t datalen; // amount of data available + int64_t datapos; // where to write out tokenized data char *stream; - size_t stream_len; - size_t stream_cap; + int64_t stream_len; + int64_t stream_cap; // Store words in (potentially ragged) matrix for now, hmm char **words; - size_t *word_starts; // where we are in the stream - size_t words_len; - size_t words_cap; + int64_t *word_starts; // where we are in the stream + int64_t words_len; + int64_t words_cap; char *pword_start; // pointer to stream start of current field - size_t word_start; // position start of current field + int64_t word_start; // position start of current field - size_t *line_start; // position in words for start of line - size_t 
*line_fields;  // Number of fields in each line
-    size_t lines;        // Number of (good) lines observed
-    size_t file_lines;   // Number of lines observed (including bad or skipped)
-    size_t lines_cap;    // Vector capacity
+    int64_t *line_start;   // position in words for start of line
+    int64_t *line_fields;  // Number of fields in each line
+    int64_t lines;         // Number of (good) lines observed
+    int64_t file_lines;    // Number of lines observed (including bad or skipped)
+    int64_t lines_cap;     // Vector capacity

     // Tokenizing stuff
     ParserState state;
@@ -194,8 +194,8 @@ typedef struct parser_t {
     char thousands;

     int header;            // Boolean: 1: has header, 0: no header
-    ssize_t header_start;  // header row start
-    ssize_t header_end;    // header row end
+    int64_t header_start;  // header row start
+    int64_t header_end;    // header row end

     void *skipset;
     PyObject *skipfunc;
@@ -216,7 +216,7 @@ typedef struct parser_t {

 typedef struct coliter_t {
     char **words;
-    size_t *line_start;
+    int64_t *line_start;
     int col;
 } coliter_t;

@@ -225,7 +225,7 @@ coliter_t *coliter_new(parser_t *self, int i);

 #define COLITER_NEXT(iter, word)                           \
     do {                                                   \
-        const size_t i = *iter.line_start++ + iter.col;    \
+        const int64_t i = *iter.line_start++ + iter.col;   \
         word = i < *iter.line_start ? iter.words[i] : "";  \
     } while (0)

From 1f24847effd169b127d64c32f9dfa9a36e6bc1f2 Mon Sep 17 00:00:00 2001
From: Jeff Knupp
Date: Thu, 20 Jul 2017 19:47:03 -0400
Subject: [PATCH 03/14] Fix comment alignment; add whatsnew entry

---
 doc/source/whatsnew/v0.21.0.txt | 3 ++-
 pandas/_libs/parsers.pyx        | 8 ++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
index 5146bd35dff30..1c4effc31833f 100644
--- a/doc/source/whatsnew/v0.21.0.txt
+++ b/doc/source/whatsnew/v0.21.0.txt
@@ -201,7 +201,8 @@ I/O
 ^^^

 - Bug in :func:`read_csv` in which non integer values for the header argument generated an unhelpful / unrelated error message (:issue:`16338`)
-
+- Bug in :func:`read_csv` in which passing a CSV with at least one very large (i.e. more than 2^31 - 1 bytes) column along with ``low_memory=False`` would cause an integer overflow. The result was an always-failing attempt to allocate an enormous buffer, reported as "Out of memory." (:issue:`16798`).
+- Bug in :func:`read_csv` in which some error paths assigned messages to the internal tokenizer's ``error_msg`` field without first allocating that buffer. When this happened during exception handling, it resulted in a double ``free`` and the program crashed with a ``SIGSEGV`` (:issue:`16798`).
- Bug in :func:`read_stata` where value labels could not be read when using an iterator (:issue:`16923`) Plotting diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 88c695a3faf27..72e5fb7f12c7f 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -137,7 +137,7 @@ cdef extern from "parser/tokenizer.h": int64_t words_len int64_t words_cap - char *pword_start # pointer to stream start of current field + char *pword_start # pointer to stream start of current field int64_t word_start # position start of current field int64_t *line_start # position in words for start of line @@ -177,9 +177,9 @@ cdef extern from "parser/tokenizer.h": # thousands separator (comma, period) char thousands - int header # Boolean: 1: has header, 0: no header - int64_t header_start # header row start - int64_t header_end # header row end + int header # Boolean: 1: has header, 0: no header + int64_t header_start # header row start + int64_t header_end # header row end void *skipset PyObject *skipfunc From 669d99bffb89dec193ef3040a2805f5bae947d91 Mon Sep 17 00:00:00 2001 From: Jeff Knupp Date: Thu, 20 Jul 2017 20:49:30 -0400 Subject: [PATCH 04/14] Fix linting errors re: line length --- pandas/_libs/parsers.pyx | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 72e5fb7f12c7f..e7f559cc150fe 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -143,7 +143,7 @@ cdef extern from "parser/tokenizer.h": int64_t *line_start # position in words for start of line int64_t *line_fields # Number of fields in each line int64_t lines # Number of lines observed - int64_t file_lines # Number of file lines observed (with bad/skipped) + int64_t file_lines # Number of lines observed (with bad/skipped) int64_t lines_cap # Vector capacity # Tokenizing stuff @@ -210,7 +210,8 @@ cdef extern from "parser/tokenizer.h": void uint_state_init(uint_state *self) int uint64_conflict(uint_state *self) - void coliter_setup(coliter_t *it, parser_t *parser, int64_t i, int64_t start) nogil + void coliter_setup(coliter_t *it, parser_t *parser, + int64_t i, int64_t start) nogil void COLITER_NEXT(coliter_t, const char *) nogil parser_t* parser_new() @@ -1702,7 +1703,8 @@ cdef char* cinf = b'inf' cdef char* cposinf = b'+inf' cdef char* cneginf = b'-inf' -cdef _try_double(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, +cdef _try_double(parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset, object na_flist): cdef: int error, na_count = 0 @@ -1811,7 +1813,8 @@ cdef inline int _try_double_nogil(parser_t *parser, return 0 -cdef _try_uint64(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, +cdef _try_uint64(parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset): cdef: int error @@ -1845,7 +1848,8 @@ cdef _try_uint64(parser_t *parser, int64_t col, int64_t line_start, int64_t line return result -cdef inline int _try_uint64_nogil(parser_t *parser, int64_t col, int64_t line_start, +cdef inline int _try_uint64_nogil(parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, bint na_filter, const kh_str_t *na_hashset, uint64_t *data, uint_state *state) nogil: @@ -1882,7 +1886,8 @@ cdef inline int _try_uint64_nogil(parser_t *parser, int64_t col, int64_t line_st return 0 -cdef _try_int64(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, 
+cdef _try_int64(parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset): cdef: int error, na_count = 0 @@ -1909,7 +1914,8 @@ cdef _try_int64(parser_t *parser, int64_t col, int64_t line_start, int64_t line_ return result, na_count -cdef inline int _try_int64_nogil(parser_t *parser, int64_t col, int64_t line_start, +cdef inline int _try_int64_nogil(parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, bint na_filter, const kh_str_t *na_hashset, int64_t NA, int64_t *data, int *na_count) nogil: @@ -1947,7 +1953,8 @@ cdef inline int _try_int64_nogil(parser_t *parser, int64_t col, int64_t line_sta return 0 -cdef _try_bool(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, +cdef _try_bool(parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset): cdef: int na_count @@ -1969,7 +1976,8 @@ cdef _try_bool(parser_t *parser, int64_t col, int64_t line_start, int64_t line_e return None, None return result.view(np.bool_), na_count -cdef inline int _try_bool_nogil(parser_t *parser, int64_t col, int64_t line_start, +cdef inline int _try_bool_nogil(parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, bint na_filter, const kh_str_t *na_hashset, uint8_t NA, uint8_t *data, int *na_count) nogil: @@ -2009,7 +2017,8 @@ cdef inline int _try_bool_nogil(parser_t *parser, int64_t col, int64_t line_star data += 1 return 0 -cdef _try_bool_flex(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, +cdef _try_bool_flex(parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, bint na_filter, const kh_str_t *na_hashset, const kh_str_t *true_hashset, const kh_str_t *false_hashset): @@ -2035,7 +2044,8 @@ cdef _try_bool_flex(parser_t *parser, int64_t col, int64_t line_start, int64_t l return None, None return result.view(np.bool_), na_count -cdef inline int _try_bool_flex_nogil(parser_t *parser, int64_t col, int64_t line_start, +cdef inline int _try_bool_flex_nogil(parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, bint na_filter, const kh_str_t *na_hashset, const kh_str_t *true_hashset, From 0985cf387f806a2c753534881a4c98fa3bc8cd73 Mon Sep 17 00:00:00 2001 From: Jeff Knupp Date: Fri, 21 Jul 2017 11:52:54 -0400 Subject: [PATCH 05/14] Remove debugging code; fix type cast --- pandas/_libs/src/parser/tokenizer.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 186babc2a4720..9a31a6356ee08 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -76,11 +76,7 @@ static void *grow_buffer(void *buffer, size_t length, size_t *capacity, // Can we fit potentially nbytes tokens (+ null terminators) in the stream? while ((length + space >= cap) && (newbuffer != NULL)) { - if (cap < 1024 * 1024 * 1024) { - cap = cap ? cap << 1 : 2; - } else { - cap *= 2; - } + cap = cap ? 
cap << 1 : 2; buffer = newbuffer; newbuffer = safe_realloc(newbuffer, elsize * cap); } @@ -457,7 +453,7 @@ static int end_line(parser_t *self) { return 0; } - if (!(self->lines <= (unsigned long) self->header_end + 1) && + if (!(self->lines <= (int64_t) self->header_end + 1) && (self->expected_fields < 0 && fields > ex_fields) && !(self->usecols)) { // increment file line count self->file_lines++; @@ -492,7 +488,7 @@ static int end_line(parser_t *self) { } } else { // missing trailing delimiters - if ((self->lines >= (unsigned long) self->header_end + 1) && + if ((self->lines >= (int64_t) self->header_end + 1) && fields < ex_fields) { // might overrun the buffer when closing fields if (make_stream_space(self, ex_fields - fields) < 0) { @@ -1299,11 +1295,7 @@ int parser_trim_buffers(parser_t *self) { } /* trim line_start, line_fields */ - if (new_cap < 1024 * 1024 * 1024) { - new_cap = _next_pow2(self->lines) + 1; - } else { - new_cap *= 2; - } + new_cap = _next_pow2(self->lines) + 1; if (new_cap < self->lines_cap) { TRACE(("parser_trim_buffers: new_cap < self->lines_cap\n")); newptr = safe_realloc((void *)self->line_start, From 3171674567c90aaaf7b723e2cc72c418ce374d18 Mon Sep 17 00:00:00 2001 From: Jeff Knupp Date: Fri, 21 Jul 2017 12:08:42 -0400 Subject: [PATCH 06/14] Fix some leftover size_t references --- pandas/_libs/src/parser/tokenizer.c | 34 ++++++++++++++--------------- pandas/_libs/src/parser/tokenizer.h | 22 +++++++++---------- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 9a31a6356ee08..fc1462b7a2b03 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -305,11 +305,11 @@ static int make_stream_space(parser_t *self, size_t nbytes) { "self->words_cap=%d\n", nbytes, self->words_cap)) newptr = safe_realloc((void *)self->word_starts, - sizeof(size_t) * self->words_cap); + sizeof(int64_t) * self->words_cap); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->word_starts = (size_t *)newptr; + self->word_starts = (int64_t *)newptr; } } @@ -318,8 +318,8 @@ static int make_stream_space(parser_t *self, size_t nbytes) { */ cap = self->lines_cap; self->line_start = - (size_t *)grow_buffer((void *)self->line_start, self->lines + 1, - &self->lines_cap, nbytes, sizeof(size_t), &status); + (int64_t *)grow_buffer((void *)self->line_start, self->lines + 1, + &self->lines_cap, nbytes, sizeof(int64_t), &status); TRACE(( "make_stream_space: grow_buffer(self->line_start, %zu, %zu, %zu, %d)\n", self->lines + 1, self->lines_cap, nbytes, status)) @@ -332,11 +332,11 @@ static int make_stream_space(parser_t *self, size_t nbytes) { TRACE(("make_stream_space: cap != self->lines_cap, nbytes = %d\n", nbytes)) newptr = safe_realloc((void *)self->line_fields, - sizeof(size_t) * self->lines_cap); + sizeof(int64_t) * self->lines_cap); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->line_fields = (size_t *)newptr; + self->line_fields = (int64_t *)newptr; } } @@ -718,8 +718,8 @@ int skip_this_line(parser_t *self, int64_t rownum) { } } -int tokenize_bytes(parser_t *self, size_t line_limit, size_t start_lines) { - size_t i, slen; +int tokenize_bytes(parser_t *self, size_t line_limit, int64_t start_lines) { + int64_t i, slen; int should_skip; char c; char *stream; @@ -1235,7 +1235,7 @@ int parser_trim_buffers(parser_t *self) { size_t new_cap; void *newptr; - size_t i; + int64_t i; /* trim words, word_starts */ new_cap = 
_next_pow2(self->words_len) + 1; @@ -1248,11 +1248,11 @@ int parser_trim_buffers(parser_t *self) { self->words = (char **)newptr; } newptr = safe_realloc((void *)self->word_starts, - new_cap * sizeof(size_t)); + new_cap * sizeof(int64_t)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->word_starts = (size_t *)newptr; + self->word_starts = (int64_t *)newptr; self->words_cap = new_cap; } } @@ -1299,18 +1299,18 @@ int parser_trim_buffers(parser_t *self) { if (new_cap < self->lines_cap) { TRACE(("parser_trim_buffers: new_cap < self->lines_cap\n")); newptr = safe_realloc((void *)self->line_start, - new_cap * sizeof(size_t)); + new_cap * sizeof(int64_t)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->line_start = (size_t *)newptr; + self->line_start = (int64_t *)newptr; } newptr = safe_realloc((void *)self->line_fields, - new_cap * sizeof(size_t)); + new_cap * sizeof(int64_t)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->line_fields = (size_t *)newptr; + self->line_fields = (int64_t *)newptr; self->lines_cap = new_cap; } } @@ -1319,7 +1319,7 @@ int parser_trim_buffers(parser_t *self) { } void debug_print_parser(parser_t *self) { - size_t j, line; + int64_t j, line; char *token; for (line = 0; line < self->lines; ++line) { @@ -1340,7 +1340,7 @@ void debug_print_parser(parser_t *self) { int _tokenize_helper(parser_t *self, size_t nrows, int all) { int status = 0; - size_t start_lines = self->lines; + int64_t start_lines = self->lines; if (self->state == FINISHED) { return 0; diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index f293baa3cda12..1cd391aef68a4 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -137,9 +137,9 @@ typedef struct parser_t { io_callback cb_io; io_cleanup cb_cleanup; - int64_t chunksize; // Number of bytes to prepare for each chunk - char *data; // pointer to data to be processed - int64_t datalen; // amount of data available + int64_t chunksize; // Number of bytes to prepare for each chunk + char *data; // pointer to data to be processed + int64_t datalen; // amount of data available int64_t datapos; // where to write out tokenized data @@ -149,18 +149,18 @@ typedef struct parser_t { // Store words in (potentially ragged) matrix for now, hmm char **words; - int64_t *word_starts; // where we are in the stream + int64_t *word_starts; // where we are in the stream int64_t words_len; int64_t words_cap; - char *pword_start; // pointer to stream start of current field + char *pword_start; // pointer to stream start of current field int64_t word_start; // position start of current field - int64_t *line_start; // position in words for start of line - int64_t *line_fields; // Number of fields in each line - int64_t lines; // Number of (good) lines observed - int64_t file_lines; // Number of lines observed (including bad or skipped) - int64_t lines_cap; // Vector capacity + int64_t *line_start; // position in words for start of line + int64_t *line_fields; // Number of fields in each line + int64_t lines; // Number of (good) lines observed + int64_t file_lines; // Number of lines observed (including bad or skipped) + int64_t lines_cap; // Vector capacity // Tokenizing stuff ParserState state; @@ -193,7 +193,7 @@ typedef struct parser_t { // thousands separator (comma, period) char thousands; - int header; // Boolean: 1: has header, 0: no header + int header; // Boolean: 1: has header, 0: no header int64_t header_start; // header row 
    int64_t header_end;    // header row end

From e4dfd19b6d53a58ca1a4f2679454cbcc14238b2f Mon Sep 17 00:00:00 2001
From: Jeff Knupp
Date: Fri, 21 Jul 2017 15:05:58 -0400
Subject: [PATCH 07/14] Use %lld for int64_t values in printf format strings;
 fix more comment alignment

---
 pandas/_libs/parsers.pyx            | 2 +-
 pandas/_libs/src/parser/tokenizer.c | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index e7f559cc150fe..c512a9fd39e95 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -122,7 +122,7 @@ cdef extern from "parser/tokenizer.h":
         io_cleanup cb_cleanup
 
         int64_t chunksize  # Number of bytes to prepare for each chunk
-        char *data  # pointer to data to be processed
+        char *data         # pointer to data to be processed
         int64_t datalen    # amount of data available
         int64_t datapos
 
diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index fc1462b7a2b03..d5bb1a2fbc136 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -468,7 +468,7 @@ static int end_line(parser_t *self) {
         if (self->error_bad_lines) {
             self->error_msg = (char *)malloc(bufsize);
             snprintf(self->error_msg, bufsize,
-                     "Expected %d fields in line %zu, saw %d\n",
+                     "Expected %d fields in line %lld, saw %d\n",
                      ex_fields, self->file_lines, fields);
 
             TRACE(("Error at line %d, %d fields\n", self->file_lines, fields));
@@ -480,7 +480,7 @@ static int end_line(parser_t *self) {
             // pass up error message
             msg = (char *)malloc(bufsize);
             snprintf(msg, bufsize,
-                     "Skipping line %zu: expected %d fields, saw %d\n",
+                     "Skipping line %lld: expected %d fields, saw %d\n",
                      self->file_lines, ex_fields, fields);
             append_warning(self, msg);
             free(msg);
@@ -1147,7 +1147,7 @@ static int parser_handle_eof(parser_t *self) {
         case IN_QUOTED_FIELD:
             self->error_msg = (char *)malloc(bufsize);
             snprintf(self->error_msg, bufsize,
-                     "EOF inside string starting at line %zu", self->file_lines);
+                     "EOF inside string starting at line %lld", self->file_lines);
             return -1;
 
         case ESCAPED_CHAR:
@@ -1323,7 +1323,7 @@ void debug_print_parser(parser_t *self) {
     char *token;
 
     for (line = 0; line < self->lines; ++line) {
-        printf("(Parsed) Line %zu: ", line);
+        printf("(Parsed) Line %lld: ", line);
 
         for (j = 0; j < self->line_fields[j]; ++j) {
             token = self->words[j + self->line_start[line]];

From 2930eaa3845d5c62ae7a5971b04727a3bfe8a763 Mon Sep 17 00:00:00 2001
From: Jeff Knupp
Date: Fri, 21 Jul 2017 16:30:07 -0400
Subject: [PATCH 08/14] Fix line length to conform to linter rules

---
 pandas/_libs/src/parser/tokenizer.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h
index 1cd391aef68a4..9462608a26814 100644
--- a/pandas/_libs/src/parser/tokenizer.h
+++ b/pandas/_libs/src/parser/tokenizer.h
@@ -159,7 +159,7 @@ typedef struct parser_t {
     int64_t *line_start;  // position in words for start of line
     int64_t *line_fields; // Number of fields in each line
     int64_t lines;        // Number of (good) lines observed
-    int64_t file_lines;   // Number of lines observed (including bad or skipped)
+    int64_t file_lines;   // Number of lines (including bad or skipped)
     int64_t lines_cap;    // Vector capacity
 
     // Tokenizing stuff

From 2ab4971449cd13b0cfc95fbe737f4f5204b07e5a Mon Sep 17 00:00:00 2001
From: Jeff Knupp
Date: Sat, 22 Jul 2017 23:14:03 -0400
Subject: [PATCH 09/14] Remove debugging code

---
 pandas/_libs/src/parser/tokenizer.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)
diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index d5bb1a2fbc136..ab92290f87719 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1258,12 +1258,7 @@ int parser_trim_buffers(parser_t *self) {
     }
 
     /* trim stream */
-    if (new_cap < INT32_MAX) {
-        new_cap = _next_pow2(self->stream_len) + 1;
-    } else {
-        new_cap *= 2;
-    }
-
+    new_cap = _next_pow2(self->stream_len) + 1;
     TRACE(
         ("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = "
          "%zu\n",

From e3cb9c1d8d37c91af6156ff41e4413f617082dcd Mon Sep 17 00:00:00 2001
From: Jeff Knupp
Date: Sun, 23 Jul 2017 00:22:03 -0400
Subject: [PATCH 10/14] Add unit test plus '--run-highmemory' option, *off by
 default*.

---
 pandas/conftest.py                     |  7 ++++++-
 pandas/tests/io/parser/test_parsers.py | 13 +++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/pandas/conftest.py b/pandas/conftest.py
index 8a3ffe22242ac..049756a0680f6 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -9,7 +9,9 @@ def pytest_addoption(parser):
     parser.addoption("--skip-slow", action="store_true",
                      help="skip slow tests")
     parser.addoption("--skip-network", action="store_true",
-                     help="run network tests")
+                     help="skip network tests")
+    parser.addoption("--run-highmemory", action="store_true",
+                     help="run high memory tests")
     parser.addoption("--only-slow", action="store_true",
                      help="run only slow tests")
 
@@ -24,6 +26,9 @@ def pytest_runtest_setup(item):
     if 'network' in item.keywords and item.config.getoption("--skip-network"):
         pytest.skip("skipping due to --skip-network")
 
+    if 'high_memory' in item.keywords and not item.config.getoption("--run-highmemory"):
+        pytest.skip("skipping high memory test since --run-highmemory was not set")
+
 
 # Configurations for all tests and all test modules
 
diff --git a/pandas/tests/io/parser/test_parsers.py b/pandas/tests/io/parser/test_parsers.py
index 8d59e3acb3230..1a53dce37bcd7 100644
--- a/pandas/tests/io/parser/test_parsers.py
+++ b/pandas/tests/io/parser/test_parsers.py
@@ -1,6 +1,9 @@
 # -*- coding: utf-8 -*-
 
 import os
+from io import StringIO
+
+import pytest
 
 import pandas.util.testing as tm
 
@@ -24,6 +27,16 @@ from .python_parser_only import PythonParserTests
 from .dtypes import DtypeTests
 
 
+@pytest.mark.high_memory
+def test_bytes_exceed_2gb():
+    """Read from a "CSV" that has a column larger than 2GB.
+
+    GH 16798
+    """
+    csv = StringIO('strings\n' + '\n'.join(['x' * (1 << 20) for _ in range(2100)]))
+    df = read_csv(csv, low_memory=False)
+    assert not df.empty
+
 class BaseParser(CommentTests, CompressionTests,
                  ConverterTests, DialectTests,

From 7b1cd8d99cd84e7e1e0614485e21183d750ff1c8 Mon Sep 17 00:00:00 2001
From: Jeff Knupp
Date: Sun, 23 Jul 2017 00:23:54 -0400
Subject: [PATCH 11/14] Fix linting issues

---
 pandas/conftest.py                     | 6 ++++--
 pandas/tests/io/parser/test_parsers.py | 3 ++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/pandas/conftest.py b/pandas/conftest.py
index 049756a0680f6..0e6472966d616 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -26,8 +26,10 @@ def pytest_runtest_setup(item):
     if 'network' in item.keywords and item.config.getoption("--skip-network"):
         pytest.skip("skipping due to --skip-network")
 
-    if 'high_memory' in item.keywords and not item.config.getoption("--run-highmemory"):
-        pytest.skip("skipping high memory test since --run-highmemory was not set")
+    if 'high_memory' in item.keywords and not item.config.getoption(
+        "--run-highmemory"):
+        pytest.skip(
+            "skipping high memory test since --run-highmemory was not set")
 
 
 # Configurations for all tests and all test modules
diff --git a/pandas/tests/io/parser/test_parsers.py b/pandas/tests/io/parser/test_parsers.py
index 1a53dce37bcd7..466cafd85a515 100644
--- a/pandas/tests/io/parser/test_parsers.py
+++ b/pandas/tests/io/parser/test_parsers.py
@@ -33,7 +33,8 @@ def test_bytes_exceed_2gb():
 
     GH 16798
     """
-    csv = StringIO('strings\n' + '\n'.join(['x' * (1 << 20) for _ in range(2100)]))
+    csv = StringIO('strings\n' + '\n'.join(
+        ['x' * (1 << 20) for _ in range(2100)]))
     df = read_csv(csv, low_memory=False)
     assert not df.empty

From 4380c5340e714ae121a1d6313776b769fae98359 Mon Sep 17 00:00:00 2001
From: Jeff Knupp
Date: Sun, 23 Jul 2017 02:28:26 -0400
Subject: [PATCH 12/14] Fix linting issues

---
 pandas/conftest.py                     | 4 ++--
 pandas/tests/io/parser/test_parsers.py | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/pandas/conftest.py b/pandas/conftest.py
index 0e6472966d616..ab097b79dcd4e 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -28,8 +28,8 @@ def pytest_runtest_setup(item):
 
     if 'high_memory' in item.keywords and not item.config.getoption(
         "--run-highmemory"):
-        pytest.skip(
-            "skipping high memory test since --run-highmemory was not set")
+    pytest.skip(
+        "skipping high memory test since --run-highmemory was not set")
 
 
 # Configurations for all tests and all test modules
diff --git a/pandas/tests/io/parser/test_parsers.py b/pandas/tests/io/parser/test_parsers.py
index 466cafd85a515..f23bd24f5cbe3 100644
--- a/pandas/tests/io/parser/test_parsers.py
+++ b/pandas/tests/io/parser/test_parsers.py
@@ -27,6 +27,7 @@ from .python_parser_only import PythonParserTests
 from .dtypes import DtypeTests
 
 
+
 @pytest.mark.high_memory
 def test_bytes_exceed_2gb():
     """Read from a "CSV" that has a column larger than 2GB.
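Aside for reviewers: the `high_memory` marker and `--run-highmemory` flag wired up in PATCH 10-12 follow pytest's standard opt-in pattern: marked tests are always collected, but `pytest_runtest_setup` in pandas/conftest.py skips them unless the flag is passed on the command line. A minimal sketch of the pattern from a test author's side (the test name and body below are illustrative only, not part of the patch):

    import pytest

    # Skipped by pandas/conftest.py's pytest_runtest_setup unless the
    # run is started as `pytest --run-highmemory`.
    @pytest.mark.high_memory
    def test_needs_several_gb_of_ram():
        # Illustrative body: hold ~2 GiB so the test is only meaningful
        # on machines (and CI workers) that opt in explicitly.
        buf = bytearray(2 * 1024 ** 3)
        assert len(buf) == 2 * 1024 ** 3

Without the flag such tests are reported as skipped, so the default suite stays runnable on memory-constrained CI workers.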
From a5d567716e0b221b0a13da60f98a3e0c7c26a354 Mon Sep 17 00:00:00 2001
From: Jeff Knupp
Date: Sun, 23 Jul 2017 02:31:37 -0400
Subject: [PATCH 13/14] Fix linting issues

---
 pandas/conftest.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pandas/conftest.py b/pandas/conftest.py
index ab097b79dcd4e..bae45743bbcfb 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -27,9 +27,9 @@ def pytest_runtest_setup(item):
         pytest.skip("skipping due to --skip-network")
 
     if 'high_memory' in item.keywords and not item.config.getoption(
-        "--run-highmemory"):
-    pytest.skip(
-        "skipping high memory test since --run-highmemory was not set")
+            "--run-highmemory"):
+        pytest.skip(
+            "skipping high memory test since --run-highmemory was not set")
 
 
 # Configurations for all tests and all test modules

From 6a1ba230d14f06ef71494d943dcc8be809da7278 Mon Sep 17 00:00:00 2001
From: Jeff Knupp
Date: Sun, 23 Jul 2017 10:07:29 -0400
Subject: [PATCH 14/14] Clear up prose

---
 doc/source/whatsnew/v0.21.0.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
index 1c4effc31833f..520634fa2504a 100644
--- a/doc/source/whatsnew/v0.21.0.txt
+++ b/doc/source/whatsnew/v0.21.0.txt
@@ -201,8 +201,8 @@ I/O
 ^^^
 
 - Bug in :func:`read_csv` in which non integer values for the header argument generated an unhelpful / unrelated error message (:issue:`16338`)
-- Bug in :func:`read_csv` in which passing a CSV with at least one very large (i.e. more than 2^31 - 1 bytes) column along with ``low_memory=False`` would cause an integer overflow. The result was an always unsuccessful attempt to allocate an enourmous buffer and then reporting "Out of memory." (:issue:`16798`).
-- Bug in :func:`read_csv` in which some errors paths were assigning error messages to the internal tokenizer's ``error_msg`` field without first allocating the memory. When this happened as part of exception handling, it resulted in a double ``free`` and the program halted due to a ``SIGSEGV`` (:issue:`16798`).
+- Bug in :func:`read_csv` in which memory management issues in exception handling, under certain conditions, would cause the Python interpreter to crash (:issue:`16798`).
+- Bug in :func:`read_csv` when called with ``low_memory=False`` in which a CSV with at least one column > 2GB in size caused Pandas to grossly overestimate memory requirements and preemptively raise an Exception with the message "out of memory" (:issue:`16798`).
 - Bug in :func:`read_stata` where value labels could not be read when using an iterator (:issue:`16923`)
 
 Plotting
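Context for the whatsnew entries above: the second bullet's failure mode is easy to state as code, namely a single column whose total size exceeds INT32_MAX (2**31 - 1) bytes. A minimal reproduction sketch mirroring the unit test from PATCH 10/14, assuming roughly 4GB of free RAM; on a pre-fix build this fails with "out of memory", on a patched build it succeeds:

    from io import StringIO

    import pandas as pd

    # ~2.1GB in one column: 2100 rows of 1 MiB each, plus the header.
    csv = StringIO('strings\n' + '\n'.join(
        ['x' * (1 << 20) for _ in range(2100)]))

    # low_memory=False makes the C tokenizer buffer the whole column at
    # once, which is what pushed the old 32-bit indices past their limit.
    df = pd.read_csv(csv, low_memory=False)
    assert len(df) == 2100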