From d5c75e8e8456c6e84bf6905eeedba92b1d369330 Mon Sep 17 00:00:00 2001
From: Jeff Knupp
Date: Wed, 19 Jul 2017 21:59:00 -0400
Subject: [PATCH 01/14] BUG: Use size_t to avoid array index overflow; add
 missing malloc of error_msg

Fix a few locations where a parser's `error_msg` buffer is written to
without having been previously allocated. This manifested as a double free
when exception-handling code later made use of the `error_msg`.

Additionally, use `size_t`/`ssize_t` where array indices or lengths will be
stored. Previously, `int32_t` was used and would overflow on columns holding
very large amounts of data (i.e. more than `INT_MAX` bytes).
---
 pandas/_libs/parsers.pyx            | 131 +++++++++++++-------------
 pandas/_libs/src/parser/tokenizer.c | 138 ++++++++++++++++------------
 pandas/_libs/src/parser/tokenizer.h |  36 ++++----
 3 files changed, 166 insertions(+), 139 deletions(-)

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 7375a2197c6b7..d6f87344bb28c 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -121,30 +121,30 @@ cdef extern from "parser/tokenizer.h":
         io_callback cb_io
         io_cleanup cb_cleanup

-        int chunksize       # Number of bytes to prepare for each chunk
+        size_t chunksize    # Number of bytes to prepare for each chunk
         char *data          # pointer to data to be processed
-        int datalen         # amount of data available
-        int datapos
+        size_t datalen      # amount of data available
+        size_t datapos

         # where to write out tokenized data
         char *stream
-        int stream_len
-        int stream_cap
+        size_t stream_len
+        size_t stream_cap

         # Store words in (potentially ragged) matrix for now, hmm
         char **words
-        int *word_starts    # where we are in the stream
-        int words_len
-        int words_cap
+        size_t *word_starts  # where we are in the stream
+        size_t words_len
+        size_t words_cap

         char *pword_start   # pointer to stream start of current field
-        int word_start      # position start of current field
+        size_t word_start   # position start of current field

-        int *line_start     # position in words for start of line
-        int *line_fields    # Number of fields in each line
-        int lines           # Number of lines observed
-        int file_lines      # Number of file lines observed (with bad/skipped)
-        int lines_cap       # Vector capacity
+        size_t *line_start   # position in words for start of line
+        size_t *line_fields  # Number of fields in each line
+        size_t lines         # Number of lines observed
+        size_t file_lines    # Number of file lines observed (with bad/skipped)
+        size_t lines_cap     # Vector capacity

         # Tokenizing stuff
         ParserState state
@@ -178,13 +178,13 @@ cdef extern from "parser/tokenizer.h":
         char thousands

         int header           # Boolean: 1: has header, 0: no header
-        int header_start     # header row start
-        int header_end       # header row end
+        ssize_t header_start  # header row start
+        ssize_t header_end    # header row end

         void *skipset
         PyObject *skipfunc
         int64_t skip_first_N_rows
-        int skipfooter
+        size_t skipfooter
         # pick one, depending on whether the converter requires GIL
         double (*double_converter_nogil)(const char *, char **,
                                          char, char, char, int) nogil
@@ -195,12 +195,12 @@ cdef extern from "parser/tokenizer.h":
         char *warn_msg
         char *error_msg

-        int skip_empty_lines
+        size_t skip_empty_lines

     ctypedef struct coliter_t:
         char **words
-        int *line_start
-        int col
+        size_t *line_start
+        size_t col

     ctypedef struct uint_state:
         int seen_sint
         int seen_uint
         int seen_null

     void uint_state_init(uint_state *self)
     int uint64_conflict(uint_state *self)

-    void coliter_setup(coliter_t *it, parser_t *parser, int i, int start) nogil
+    void 
coliter_setup(coliter_t *it, parser_t *parser, size_t i, size_t start) nogil void COLITER_NEXT(coliter_t, const char *) nogil parser_t* parser_new() @@ -289,14 +289,14 @@ cdef class TextReader: object true_values, false_values object handle bint na_filter, verbose, has_usecols, has_mi_columns - int parser_start + size_t parser_start list clocks char *c_encoding kh_str_t *false_set kh_str_t *true_set cdef public: - int leading_cols, table_width, skipfooter, buffer_lines + size_t leading_cols, table_width, skipfooter, buffer_lines object allow_leading_cols object delimiter, converters, delim_whitespace object na_values @@ -730,7 +730,8 @@ cdef class TextReader: Py_ssize_t i, start, field_count, passed_count, unnamed_count # noqa char *word object name - int status, hr, data_line + int status + size_t hr, data_line char *errors = "strict" cdef StringPath path = _string_path(self.c_encoding) @@ -949,8 +950,8 @@ cdef class TextReader: cdef _read_rows(self, rows, bint trim): cdef: - int buffered_lines - int irows, footer = 0 + size_t buffered_lines + size_t irows, footer = 0 self._start_clock() @@ -1018,12 +1019,13 @@ cdef class TextReader: def _convert_column_data(self, rows=None, upcast_na=False, footer=0): cdef: - Py_ssize_t i, nused + size_t i + int nused kh_str_t *na_hashset = NULL - int start, end + size_t start, end object name, na_flist, col_dtype = None bint na_filter = 0 - Py_ssize_t num_cols + size_t num_cols start = self.parser_start @@ -1036,7 +1038,7 @@ cdef class TextReader: # if footer > 0: # end -= footer - num_cols = -1 + num_cols = 0 for i in range(self.parser.lines): num_cols = (num_cols < self.parser.line_fields[i]) * \ self.parser.line_fields[i] + \ @@ -1195,7 +1197,7 @@ cdef class TextReader: return col_res, na_count cdef _convert_with_dtype(self, object dtype, Py_ssize_t i, - int start, int end, + size_t start, size_t end, bint na_filter, bint user_dtype, kh_str_t *na_hashset, @@ -1275,7 +1277,7 @@ cdef class TextReader: raise TypeError("the dtype %s is not " "supported for parsing" % dtype) - cdef _string_convert(self, Py_ssize_t i, int start, int end, + cdef _string_convert(self, Py_ssize_t i, size_t start, size_t end, bint na_filter, kh_str_t *na_hashset): cdef StringPath path = _string_path(self.c_encoding) @@ -1336,6 +1338,7 @@ cdef class TextReader: kh_destroy_str(table) cdef _get_column_name(self, Py_ssize_t i, Py_ssize_t nused): + cdef int j if self.has_usecols and self.names is not None: if (not callable(self.usecols) and len(self.names) == len(self.usecols)): @@ -1427,8 +1430,8 @@ cdef inline StringPath _string_path(char *encoding): # ---------------------------------------------------------------------- # Type conversions / inference support code -cdef _string_box_factorize(parser_t *parser, int col, - int line_start, int line_end, +cdef _string_box_factorize(parser_t *parser, size_t col, + size_t line_start, size_t line_end, bint na_filter, kh_str_t *na_hashset): cdef: int error, na_count = 0 @@ -1480,8 +1483,8 @@ cdef _string_box_factorize(parser_t *parser, int col, return result, na_count -cdef _string_box_utf8(parser_t *parser, int col, - int line_start, int line_end, +cdef _string_box_utf8(parser_t *parser, size_t col, + size_t line_start, size_t line_end, bint na_filter, kh_str_t *na_hashset): cdef: int error, na_count = 0 @@ -1533,8 +1536,8 @@ cdef _string_box_utf8(parser_t *parser, int col, return result, na_count -cdef _string_box_decode(parser_t *parser, int col, - int line_start, int line_end, +cdef _string_box_decode(parser_t *parser, size_t col, + 
size_t line_start, size_t line_end, bint na_filter, kh_str_t *na_hashset, char *encoding): cdef: @@ -1592,8 +1595,8 @@ cdef _string_box_decode(parser_t *parser, int col, @cython.boundscheck(False) -cdef _categorical_convert(parser_t *parser, int col, - int line_start, int line_end, +cdef _categorical_convert(parser_t *parser, size_t col, + size_t line_start, size_t line_end, bint na_filter, kh_str_t *na_hashset, char *encoding): "Convert column data into codes, categories" @@ -1663,8 +1666,8 @@ cdef _categorical_convert(parser_t *parser, int col, kh_destroy_str(table) return np.asarray(codes), result, na_count -cdef _to_fw_string(parser_t *parser, int col, int line_start, - int line_end, size_t width): +cdef _to_fw_string(parser_t *parser, size_t col, size_t line_start, + size_t line_end, size_t width): cdef: Py_ssize_t i coliter_t it @@ -1680,11 +1683,11 @@ cdef _to_fw_string(parser_t *parser, int col, int line_start, return result -cdef inline void _to_fw_string_nogil(parser_t *parser, int col, - int line_start, int line_end, +cdef inline void _to_fw_string_nogil(parser_t *parser, size_t col, + size_t line_start, size_t line_end, size_t width, char *data) nogil: cdef: - Py_ssize_t i + size_t i coliter_t it const char *word = NULL @@ -1699,7 +1702,7 @@ cdef char* cinf = b'inf' cdef char* cposinf = b'+inf' cdef char* cneginf = b'-inf' -cdef _try_double(parser_t *parser, int col, int line_start, int line_end, +cdef _try_double(parser_t *parser, size_t col, size_t line_start, size_t line_end, bint na_filter, kh_str_t *na_hashset, object na_flist): cdef: int error, na_count = 0 @@ -1808,7 +1811,7 @@ cdef inline int _try_double_nogil(parser_t *parser, return 0 -cdef _try_uint64(parser_t *parser, int col, int line_start, int line_end, +cdef _try_uint64(parser_t *parser, size_t col, size_t line_start, size_t line_end, bint na_filter, kh_str_t *na_hashset): cdef: int error @@ -1842,8 +1845,8 @@ cdef _try_uint64(parser_t *parser, int col, int line_start, int line_end, return result -cdef inline int _try_uint64_nogil(parser_t *parser, int col, int line_start, - int line_end, bint na_filter, +cdef inline int _try_uint64_nogil(parser_t *parser, size_t col, size_t line_start, + size_t line_end, bint na_filter, const kh_str_t *na_hashset, uint64_t *data, uint_state *state) nogil: cdef: @@ -1879,7 +1882,7 @@ cdef inline int _try_uint64_nogil(parser_t *parser, int col, int line_start, return 0 -cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, +cdef _try_int64(parser_t *parser, size_t col, size_t line_start, size_t line_end, bint na_filter, kh_str_t *na_hashset): cdef: int error, na_count = 0 @@ -1906,8 +1909,8 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, return result, na_count -cdef inline int _try_int64_nogil(parser_t *parser, int col, int line_start, - int line_end, bint na_filter, +cdef inline int _try_int64_nogil(parser_t *parser, size_t col, size_t line_start, + size_t line_end, bint na_filter, const kh_str_t *na_hashset, int64_t NA, int64_t *data, int *na_count) nogil: cdef: @@ -1944,7 +1947,7 @@ cdef inline int _try_int64_nogil(parser_t *parser, int col, int line_start, return 0 -cdef _try_bool(parser_t *parser, int col, int line_start, int line_end, +cdef _try_bool(parser_t *parser, size_t col, size_t line_start, size_t line_end, bint na_filter, kh_str_t *na_hashset): cdef: int na_count @@ -1966,8 +1969,8 @@ cdef _try_bool(parser_t *parser, int col, int line_start, int line_end, return None, None return result.view(np.bool_), na_count -cdef 
inline int _try_bool_nogil(parser_t *parser, int col, int line_start, - int line_end, bint na_filter, +cdef inline int _try_bool_nogil(parser_t *parser, size_t col, size_t line_start, + size_t line_end, bint na_filter, const kh_str_t *na_hashset, uint8_t NA, uint8_t *data, int *na_count) nogil: cdef: @@ -2006,7 +2009,7 @@ cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start, data += 1 return 0 -cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end, +cdef _try_bool_flex(parser_t *parser, size_t col, size_t line_start, size_t line_end, bint na_filter, const kh_str_t *na_hashset, const kh_str_t *true_hashset, const kh_str_t *false_hashset): @@ -2032,8 +2035,8 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end, return None, None return result.view(np.bool_), na_count -cdef inline int _try_bool_flex_nogil(parser_t *parser, int col, int line_start, - int line_end, bint na_filter, +cdef inline int _try_bool_flex_nogil(parser_t *parser, size_t col, size_t line_start, + size_t line_end, bint na_filter, const kh_str_t *na_hashset, const kh_str_t *true_hashset, const kh_str_t *false_hashset, @@ -2251,8 +2254,8 @@ for k in list(na_values): na_values[np.dtype(k)] = na_values[k] -cdef _apply_converter(object f, parser_t *parser, int col, - int line_start, int line_end, +cdef _apply_converter(object f, parser_t *parser, size_t col, + size_t line_start, size_t line_end, char* c_encoding): cdef: int error @@ -2296,7 +2299,7 @@ def _to_structured_array(dict columns, object names, object usecols): object name, fnames, field_type Py_ssize_t i, offset, nfields, length - int stride, elsize + size_t stride, elsize char *buf if names is None: @@ -2344,10 +2347,10 @@ def _to_structured_array(dict columns, object names, object usecols): return recs -cdef _fill_structured_column(char *dst, char* src, int elsize, - int stride, int length, bint incref): +cdef _fill_structured_column(char *dst, char* src, size_t elsize, + size_t stride, size_t length, bint incref): cdef: - Py_ssize_t i + size_t i if incref: util.transfer_object_column(dst, src, stride, length) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index be23ebb023383..186babc2a4720 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -69,14 +69,18 @@ static void free_if_not_null(void **ptr) { */ -static void *grow_buffer(void *buffer, int length, int *capacity, int space, - int elsize, int *error) { - int cap = *capacity; +static void *grow_buffer(void *buffer, size_t length, size_t *capacity, + size_t space, size_t elsize, int *error) { + size_t cap = *capacity; void *newbuffer = buffer; // Can we fit potentially nbytes tokens (+ null terminators) in the stream? while ((length + space >= cap) && (newbuffer != NULL)) { - cap = cap ? cap << 1 : 2; + if (cap < 1024 * 1024 * 1024) { + cap = cap ? cap << 1 : 2; + } else { + cap *= 2; + } buffer = newbuffer; newbuffer = safe_realloc(newbuffer, elsize * cap); } @@ -169,7 +173,7 @@ int parser_cleanup(parser_t *self) { } int parser_init(parser_t *self) { - int sz; + size_t sz; /* Initialize data buffers @@ -196,14 +200,14 @@ int parser_init(parser_t *self) { sz = STREAM_INIT_SIZE / 10; sz = sz ? 
sz : 1; self->words = (char **)malloc(sz * sizeof(char *)); - self->word_starts = (int *)malloc(sz * sizeof(int)); + self->word_starts = (size_t *)malloc(sz * sizeof(size_t)); self->words_cap = sz; self->words_len = 0; // line pointers and metadata - self->line_start = (int *)malloc(sz * sizeof(int)); + self->line_start = (size_t *)malloc(sz * sizeof(size_t)); - self->line_fields = (int *)malloc(sz * sizeof(int)); + self->line_fields = (size_t *)malloc(sz * sizeof(size_t)); self->lines_cap = sz; self->lines = 0; @@ -247,7 +251,8 @@ void parser_del(parser_t *self) { } static int make_stream_space(parser_t *self, size_t nbytes) { - int i, status, cap; + size_t i, cap; + int status; void *orig_ptr, *newptr; // Can we fit potentially nbytes tokens (+ null terminators) in the stream? @@ -304,11 +309,11 @@ static int make_stream_space(parser_t *self, size_t nbytes) { "self->words_cap=%d\n", nbytes, self->words_cap)) newptr = safe_realloc((void *)self->word_starts, - sizeof(int) * self->words_cap); + sizeof(size_t) * self->words_cap); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->word_starts = (int *)newptr; + self->word_starts = (size_t *)newptr; } } @@ -317,8 +322,8 @@ static int make_stream_space(parser_t *self, size_t nbytes) { */ cap = self->lines_cap; self->line_start = - (int *)grow_buffer((void *)self->line_start, self->lines + 1, - &self->lines_cap, nbytes, sizeof(int), &status); + (size_t *)grow_buffer((void *)self->line_start, self->lines + 1, + &self->lines_cap, nbytes, sizeof(size_t), &status); TRACE(( "make_stream_space: grow_buffer(self->line_start, %zu, %zu, %zu, %d)\n", self->lines + 1, self->lines_cap, nbytes, status)) @@ -331,11 +336,11 @@ static int make_stream_space(parser_t *self, size_t nbytes) { TRACE(("make_stream_space: cap != self->lines_cap, nbytes = %d\n", nbytes)) newptr = safe_realloc((void *)self->line_fields, - sizeof(int) * self->lines_cap); + sizeof(size_t) * self->lines_cap); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->line_fields = (int *)newptr; + self->line_fields = (size_t *)newptr; } } @@ -350,7 +355,7 @@ static int push_char(parser_t *self, char c) { ("push_char: ERROR!!! self->stream_len(%d) >= " "self->stream_cap(%d)\n", self->stream_len, self->stream_cap)) - int bufsize = 100; + size_t bufsize = 100; self->error_msg = (char *)malloc(bufsize); snprintf(self->error_msg, bufsize, "Buffer overflow caught - possible malformed input file.\n"); @@ -367,7 +372,7 @@ int P_INLINE end_field(parser_t *self) { ("end_field: ERROR!!! 
self->words_len(%zu) >= " "self->words_cap(%zu)\n", self->words_len, self->words_cap)) - int bufsize = 100; + size_t bufsize = 100; self->error_msg = (char *)malloc(bufsize); snprintf(self->error_msg, bufsize, "Buffer overflow caught - possible malformed input file.\n"); @@ -399,8 +404,8 @@ int P_INLINE end_field(parser_t *self) { } static void append_warning(parser_t *self, const char *msg) { - int ex_length; - int length = strlen(msg); + size_t ex_length; + size_t length = strlen(msg); void *newptr; if (self->warn_msg == NULL) { @@ -420,12 +425,13 @@ static int end_line(parser_t *self) { char *msg; int fields; int ex_fields = self->expected_fields; - int bufsize = 100; // for error or warning messages + size_t bufsize = 100; // for error or warning messages fields = self->line_fields[self->lines]; TRACE(("end_line: Line end, nfields: %d\n", fields)); + TRACE(("end_line: lines: %d\n", self->lines)); if (self->lines > 0) { if (self->expected_fields >= 0) { ex_fields = self->expected_fields; @@ -433,6 +439,7 @@ static int end_line(parser_t *self) { ex_fields = self->line_fields[self->lines - 1]; } } + TRACE(("end_line: ex_fields: %d\n", ex_fields)); if (self->state == START_FIELD_IN_SKIP_LINE || self->state == IN_FIELD_IN_SKIP_LINE || @@ -450,7 +457,7 @@ static int end_line(parser_t *self) { return 0; } - if (!(self->lines <= self->header_end + 1) && + if (!(self->lines <= (unsigned long) self->header_end + 1) && (self->expected_fields < 0 && fields > ex_fields) && !(self->usecols)) { // increment file line count self->file_lines++; @@ -465,7 +472,7 @@ static int end_line(parser_t *self) { if (self->error_bad_lines) { self->error_msg = (char *)malloc(bufsize); snprintf(self->error_msg, bufsize, - "Expected %d fields in line %d, saw %d\n", + "Expected %d fields in line %zu, saw %d\n", ex_fields, self->file_lines, fields); TRACE(("Error at line %d, %d fields\n", self->file_lines, fields)); @@ -477,7 +484,7 @@ static int end_line(parser_t *self) { // pass up error message msg = (char *)malloc(bufsize); snprintf(msg, bufsize, - "Skipping line %d: expected %d fields, saw %d\n", + "Skipping line %zu: expected %d fields, saw %d\n", self->file_lines, ex_fields, fields); append_warning(self, msg); free(msg); @@ -485,10 +492,13 @@ static int end_line(parser_t *self) { } } else { // missing trailing delimiters - if ((self->lines >= self->header_end + 1) && fields < ex_fields) { + if ((self->lines >= (unsigned long) self->header_end + 1) && + fields < ex_fields) { // might overrun the buffer when closing fields if (make_stream_space(self, ex_fields - fields) < 0) { - self->error_msg = "out of memory"; + size_t bufsize = 100; + self->error_msg = (char *)malloc(bufsize); + snprintf(self->error_msg, bufsize, "out of memory"); return -1; } @@ -507,7 +517,7 @@ static int end_line(parser_t *self) { TRACE(( "end_line: ERROR!!! self->lines(%zu) >= self->lines_cap(%zu)\n", self->lines, self->lines_cap)) - int bufsize = 100; + size_t bufsize = 100; self->error_msg = (char *)malloc(bufsize); snprintf(self->error_msg, bufsize, "Buffer overflow caught - " @@ -568,7 +578,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { self->datalen = bytes_read; if (status != REACHED_EOF && self->data == NULL) { - int bufsize = 200; + size_t bufsize = 200; self->error_msg = (char *)malloc(bufsize); if (status == CALLING_READ_FAILED) { @@ -599,7 +609,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { if (slen >= self->stream_cap) { \ TRACE(("PUSH_CHAR: ERROR!!! 
slen(%d) >= stream_cap(%d)\n", slen, \ self->stream_cap)) \ - int bufsize = 100; \ + size_t bufsize = 100; \ self->error_msg = (char *)malloc(bufsize); \ snprintf(self->error_msg, bufsize, \ "Buffer overflow caught - possible malformed input file.\n");\ @@ -626,7 +636,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { stream = self->stream + self->stream_len; \ slen = self->stream_len; \ self->state = STATE; \ - if (line_limit > 0 && self->lines == start_lines + (int)line_limit) { \ + if (line_limit > 0 && self->lines == start_lines + (size_t)line_limit) { \ goto linelimit; \ } @@ -641,7 +651,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { stream = self->stream + self->stream_len; \ slen = self->stream_len; \ self->state = STATE; \ - if (line_limit > 0 && self->lines == start_lines + (int)line_limit) { \ + if (line_limit > 0 && self->lines == start_lines + (size_t)line_limit) { \ goto linelimit; \ } @@ -712,15 +722,17 @@ int skip_this_line(parser_t *self, int64_t rownum) { } } -int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) { - int i, slen; +int tokenize_bytes(parser_t *self, size_t line_limit, size_t start_lines) { + size_t i, slen; int should_skip; char c; char *stream; char *buf = self->data + self->datapos; if (make_stream_space(self, self->datalen - self->datapos) < 0) { - self->error_msg = "out of memory"; + size_t bufsize = 100; + self->error_msg = (char *)malloc(bufsize); + snprintf(self->error_msg, bufsize, "out of memory"); return -1; } @@ -1025,7 +1037,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) { PUSH_CHAR(c); self->state = IN_FIELD; } else { - int bufsize = 100; + size_t bufsize = 100; self->error_msg = (char *)malloc(bufsize); snprintf(self->error_msg, bufsize, "delimiter expected after quote in quote"); @@ -1079,7 +1091,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) { --i; buf--; // let's try this character again (HACK!) 
if (line_limit > 0 && - self->lines == start_lines + (int)line_limit) { + self->lines == start_lines + line_limit) { goto linelimit; } } @@ -1121,7 +1133,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) { } static int parser_handle_eof(parser_t *self) { - int bufsize = 100; + size_t bufsize = 100; TRACE( ("handling eof, datalen: %d, pstate: %d\n", self->datalen, self->state)) @@ -1139,7 +1151,7 @@ static int parser_handle_eof(parser_t *self) { case IN_QUOTED_FIELD: self->error_msg = (char *)malloc(bufsize); snprintf(self->error_msg, bufsize, - "EOF inside string starting at line %d", self->file_lines); + "EOF inside string starting at line %zu", self->file_lines); return -1; case ESCAPED_CHAR: @@ -1165,9 +1177,9 @@ static int parser_handle_eof(parser_t *self) { } int parser_consume_rows(parser_t *self, size_t nrows) { - int i, offset, word_deletions, char_count; + size_t i, offset, word_deletions, char_count; - if ((int)nrows > self->lines) { + if (nrows > self->lines) { nrows = self->lines; } @@ -1204,7 +1216,7 @@ int parser_consume_rows(parser_t *self, size_t nrows) { self->word_start -= char_count; /* move line metadata */ - for (i = 0; i < self->lines - (int)nrows + 1; ++i) { + for (i = 0; i < self->lines - nrows + 1; ++i) { offset = i + nrows; self->line_start[i] = self->line_start[offset] - word_deletions; self->line_fields[i] = self->line_fields[offset]; @@ -1227,11 +1239,11 @@ int parser_trim_buffers(parser_t *self) { size_t new_cap; void *newptr; - int i; + size_t i; /* trim words, word_starts */ new_cap = _next_pow2(self->words_len) + 1; - if ((int)new_cap < self->words_cap) { + if (new_cap < self->words_cap) { TRACE(("parser_trim_buffers: new_cap < self->words_cap\n")); newptr = safe_realloc((void *)self->words, new_cap * sizeof(char *)); if (newptr == NULL) { @@ -1239,22 +1251,28 @@ int parser_trim_buffers(parser_t *self) { } else { self->words = (char **)newptr; } - newptr = safe_realloc((void *)self->word_starts, new_cap * sizeof(int)); + newptr = safe_realloc((void *)self->word_starts, + new_cap * sizeof(size_t)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->word_starts = (int *)newptr; + self->word_starts = (size_t *)newptr; self->words_cap = new_cap; } } /* trim stream */ - new_cap = _next_pow2(self->stream_len) + 1; + if (new_cap < INT32_MAX) { + new_cap = _next_pow2(self->stream_len) + 1; + } else { + new_cap *= 2; + } + TRACE( ("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = " "%zu\n", new_cap, self->stream_cap, self->lines_cap)); - if ((int)new_cap < self->stream_cap) { + if (new_cap < self->stream_cap) { TRACE( ("parser_trim_buffers: new_cap < self->stream_cap, calling " "safe_realloc\n")); @@ -1281,20 +1299,26 @@ int parser_trim_buffers(parser_t *self) { } /* trim line_start, line_fields */ - new_cap = _next_pow2(self->lines) + 1; - if ((int)new_cap < self->lines_cap) { + if (new_cap < 1024 * 1024 * 1024) { + new_cap = _next_pow2(self->lines) + 1; + } else { + new_cap *= 2; + } + if (new_cap < self->lines_cap) { TRACE(("parser_trim_buffers: new_cap < self->lines_cap\n")); - newptr = safe_realloc((void *)self->line_start, new_cap * sizeof(int)); + newptr = safe_realloc((void *)self->line_start, + new_cap * sizeof(size_t)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->line_start = (int *)newptr; + self->line_start = (size_t *)newptr; } - newptr = safe_realloc((void *)self->line_fields, new_cap * sizeof(int)); + newptr = safe_realloc((void *)self->line_fields, + new_cap * 
sizeof(size_t)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->line_fields = (int *)newptr; + self->line_fields = (size_t *)newptr; self->lines_cap = new_cap; } } @@ -1303,11 +1327,11 @@ int parser_trim_buffers(parser_t *self) { } void debug_print_parser(parser_t *self) { - int j, line; + size_t j, line; char *token; for (line = 0; line < self->lines; ++line) { - printf("(Parsed) Line %d: ", line); + printf("(Parsed) Line %zu: ", line); for (j = 0; j < self->line_fields[j]; ++j) { token = self->words[j + self->line_start[line]]; @@ -1324,7 +1348,7 @@ void debug_print_parser(parser_t *self) { int _tokenize_helper(parser_t *self, size_t nrows, int all) { int status = 0; - int start_lines = self->lines; + size_t start_lines = self->lines; if (self->state == FINISHED) { return 0; @@ -1332,10 +1356,10 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) { TRACE(( "_tokenize_helper: Asked to tokenize %d rows, datapos=%d, datalen=%d\n", - (int)nrows, self->datapos, self->datalen)); + nrows, self->datapos, self->datalen)); while (1) { - if (!all && self->lines - start_lines >= (int)nrows) break; + if (!all && self->lines - start_lines >= nrows) break; if (self->datapos == self->datalen) { status = parser_buffer_bytes(self, self->chunksize); diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index b4344e8a6c070..a1341b37952eb 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -137,30 +137,30 @@ typedef struct parser_t { io_callback cb_io; io_cleanup cb_cleanup; - int chunksize; // Number of bytes to prepare for each chunk + size_t chunksize; // Number of bytes to prepare for each chunk char *data; // pointer to data to be processed - int datalen; // amount of data available - int datapos; + size_t datalen; // amount of data available + size_t datapos; // where to write out tokenized data char *stream; - int stream_len; - int stream_cap; + size_t stream_len; + size_t stream_cap; // Store words in (potentially ragged) matrix for now, hmm char **words; - int *word_starts; // where we are in the stream - int words_len; - int words_cap; + size_t *word_starts; // where we are in the stream + size_t words_len; + size_t words_cap; char *pword_start; // pointer to stream start of current field - int word_start; // position start of current field + size_t word_start; // position start of current field - int *line_start; // position in words for start of line - int *line_fields; // Number of fields in each line - int lines; // Number of (good) lines observed - int file_lines; // Number of file lines observed (including bad or skipped) - int lines_cap; // Vector capacity + size_t *line_start; // position in words for start of line + size_t *line_fields; // Number of fields in each line + size_t lines; // Number of (good) lines observed + size_t file_lines; // Number of lines observed (including bad or skipped) + size_t lines_cap; // Vector capacity // Tokenizing stuff ParserState state; @@ -194,8 +194,8 @@ typedef struct parser_t { char thousands; int header; // Boolean: 1: has header, 0: no header - int header_start; // header row start - int header_end; // header row end + ssize_t header_start; // header row start + ssize_t header_end; // header row end void *skipset; PyObject *skipfunc; @@ -216,7 +216,7 @@ typedef struct parser_t { typedef struct coliter_t { char **words; - int *line_start; + size_t *line_start; int col; } coliter_t; @@ -225,7 +225,7 @@ coliter_t *coliter_new(parser_t *self, int 
i); #define COLITER_NEXT(iter, word) \ do { \ - const int i = *iter.line_start++ + iter.col; \ + const size_t i = *iter.line_start++ + iter.col; \ word = i < *iter.line_start ? iter.words[i] : ""; \ } while (0) From e04d12a9735707927edebf20501c1176ef5a2859 Mon Sep 17 00:00:00 2001 From: Jeff Knupp Date: Thu, 20 Jul 2017 18:21:17 -0400 Subject: [PATCH 02/14] Switch to use int64_t rather than size_t due to portability concerns. --- pandas/_libs/parsers.pyx | 134 ++++++++++++++-------------- pandas/_libs/src/parser/tokenizer.h | 36 ++++---- 2 files changed, 85 insertions(+), 85 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index d6f87344bb28c..88c695a3faf27 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -121,30 +121,30 @@ cdef extern from "parser/tokenizer.h": io_callback cb_io io_cleanup cb_cleanup - size_t chunksize # Number of bytes to prepare for each chunk - char *data # pointer to data to be processed - size_t datalen # amount of data available - size_t datapos + int64_t chunksize # Number of bytes to prepare for each chunk + char *data # pointer to data to be processed + int64_t datalen # amount of data available + int64_t datapos # where to write out tokenized data char *stream - size_t stream_len - size_t stream_cap + int64_t stream_len + int64_t stream_cap # Store words in (potentially ragged) matrix for now, hmm char **words - size_t *word_starts # where we are in the stream - size_t words_len - size_t words_cap + int64_t *word_starts # where we are in the stream + int64_t words_len + int64_t words_cap - char *pword_start # pointer to stream start of current field - size_t word_start # position start of current field + char *pword_start # pointer to stream start of current field + int64_t word_start # position start of current field - size_t *line_start # position in words for start of line - size_t *line_fields # Number of fields in each line - size_t lines # Number of lines observed - size_t file_lines # Number of file lines observed (with bad/skipped) - size_t lines_cap # Vector capacity + int64_t *line_start # position in words for start of line + int64_t *line_fields # Number of fields in each line + int64_t lines # Number of lines observed + int64_t file_lines # Number of file lines observed (with bad/skipped) + int64_t lines_cap # Vector capacity # Tokenizing stuff ParserState state @@ -178,13 +178,13 @@ cdef extern from "parser/tokenizer.h": char thousands int header # Boolean: 1: has header, 0: no header - ssize_t header_start # header row start - ssize_t header_end # header row end + int64_t header_start # header row start + int64_t header_end # header row end void *skipset PyObject *skipfunc int64_t skip_first_N_rows - size_t skipfooter + int64_t skipfooter # pick one, depending on whether the converter requires GIL double (*double_converter_nogil)(const char *, char **, char, char, char, int) nogil @@ -195,12 +195,12 @@ cdef extern from "parser/tokenizer.h": char *warn_msg char *error_msg - size_t skip_empty_lines + int64_t skip_empty_lines ctypedef struct coliter_t: char **words - size_t *line_start - size_t col + int64_t *line_start + int64_t col ctypedef struct uint_state: int seen_sint @@ -210,7 +210,7 @@ cdef extern from "parser/tokenizer.h": void uint_state_init(uint_state *self) int uint64_conflict(uint_state *self) - void coliter_setup(coliter_t *it, parser_t *parser, size_t i, size_t start) nogil + void coliter_setup(coliter_t *it, parser_t *parser, int64_t i, int64_t start) nogil void 
COLITER_NEXT(coliter_t, const char *) nogil parser_t* parser_new() @@ -289,14 +289,14 @@ cdef class TextReader: object true_values, false_values object handle bint na_filter, verbose, has_usecols, has_mi_columns - size_t parser_start + int64_t parser_start list clocks char *c_encoding kh_str_t *false_set kh_str_t *true_set cdef public: - size_t leading_cols, table_width, skipfooter, buffer_lines + int64_t leading_cols, table_width, skipfooter, buffer_lines object allow_leading_cols object delimiter, converters, delim_whitespace object na_values @@ -731,7 +731,7 @@ cdef class TextReader: char *word object name int status - size_t hr, data_line + int64_t hr, data_line char *errors = "strict" cdef StringPath path = _string_path(self.c_encoding) @@ -950,8 +950,8 @@ cdef class TextReader: cdef _read_rows(self, rows, bint trim): cdef: - size_t buffered_lines - size_t irows, footer = 0 + int64_t buffered_lines + int64_t irows, footer = 0 self._start_clock() @@ -1019,13 +1019,13 @@ cdef class TextReader: def _convert_column_data(self, rows=None, upcast_na=False, footer=0): cdef: - size_t i + int64_t i int nused kh_str_t *na_hashset = NULL - size_t start, end + int64_t start, end object name, na_flist, col_dtype = None bint na_filter = 0 - size_t num_cols + int64_t num_cols start = self.parser_start @@ -1038,7 +1038,7 @@ cdef class TextReader: # if footer > 0: # end -= footer - num_cols = 0 + num_cols = -1 for i in range(self.parser.lines): num_cols = (num_cols < self.parser.line_fields[i]) * \ self.parser.line_fields[i] + \ @@ -1197,7 +1197,7 @@ cdef class TextReader: return col_res, na_count cdef _convert_with_dtype(self, object dtype, Py_ssize_t i, - size_t start, size_t end, + int64_t start, int64_t end, bint na_filter, bint user_dtype, kh_str_t *na_hashset, @@ -1277,7 +1277,7 @@ cdef class TextReader: raise TypeError("the dtype %s is not " "supported for parsing" % dtype) - cdef _string_convert(self, Py_ssize_t i, size_t start, size_t end, + cdef _string_convert(self, Py_ssize_t i, int64_t start, int64_t end, bint na_filter, kh_str_t *na_hashset): cdef StringPath path = _string_path(self.c_encoding) @@ -1338,7 +1338,7 @@ cdef class TextReader: kh_destroy_str(table) cdef _get_column_name(self, Py_ssize_t i, Py_ssize_t nused): - cdef int j + cdef int64_t j if self.has_usecols and self.names is not None: if (not callable(self.usecols) and len(self.names) == len(self.usecols)): @@ -1430,8 +1430,8 @@ cdef inline StringPath _string_path(char *encoding): # ---------------------------------------------------------------------- # Type conversions / inference support code -cdef _string_box_factorize(parser_t *parser, size_t col, - size_t line_start, size_t line_end, +cdef _string_box_factorize(parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset): cdef: int error, na_count = 0 @@ -1483,8 +1483,8 @@ cdef _string_box_factorize(parser_t *parser, size_t col, return result, na_count -cdef _string_box_utf8(parser_t *parser, size_t col, - size_t line_start, size_t line_end, +cdef _string_box_utf8(parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset): cdef: int error, na_count = 0 @@ -1536,8 +1536,8 @@ cdef _string_box_utf8(parser_t *parser, size_t col, return result, na_count -cdef _string_box_decode(parser_t *parser, size_t col, - size_t line_start, size_t line_end, +cdef _string_box_decode(parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset, 
char *encoding): cdef: @@ -1595,8 +1595,8 @@ cdef _string_box_decode(parser_t *parser, size_t col, @cython.boundscheck(False) -cdef _categorical_convert(parser_t *parser, size_t col, - size_t line_start, size_t line_end, +cdef _categorical_convert(parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset, char *encoding): "Convert column data into codes, categories" @@ -1666,8 +1666,8 @@ cdef _categorical_convert(parser_t *parser, size_t col, kh_destroy_str(table) return np.asarray(codes), result, na_count -cdef _to_fw_string(parser_t *parser, size_t col, size_t line_start, - size_t line_end, size_t width): +cdef _to_fw_string(parser_t *parser, int64_t col, int64_t line_start, + int64_t line_end, int64_t width): cdef: Py_ssize_t i coliter_t it @@ -1683,11 +1683,11 @@ cdef _to_fw_string(parser_t *parser, size_t col, size_t line_start, return result -cdef inline void _to_fw_string_nogil(parser_t *parser, size_t col, - size_t line_start, size_t line_end, +cdef inline void _to_fw_string_nogil(parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, size_t width, char *data) nogil: cdef: - size_t i + int64_t i coliter_t it const char *word = NULL @@ -1702,7 +1702,7 @@ cdef char* cinf = b'inf' cdef char* cposinf = b'+inf' cdef char* cneginf = b'-inf' -cdef _try_double(parser_t *parser, size_t col, size_t line_start, size_t line_end, +cdef _try_double(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset, object na_flist): cdef: int error, na_count = 0 @@ -1811,7 +1811,7 @@ cdef inline int _try_double_nogil(parser_t *parser, return 0 -cdef _try_uint64(parser_t *parser, size_t col, size_t line_start, size_t line_end, +cdef _try_uint64(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset): cdef: int error @@ -1845,8 +1845,8 @@ cdef _try_uint64(parser_t *parser, size_t col, size_t line_start, size_t line_en return result -cdef inline int _try_uint64_nogil(parser_t *parser, size_t col, size_t line_start, - size_t line_end, bint na_filter, +cdef inline int _try_uint64_nogil(parser_t *parser, int64_t col, int64_t line_start, + int64_t line_end, bint na_filter, const kh_str_t *na_hashset, uint64_t *data, uint_state *state) nogil: cdef: @@ -1882,7 +1882,7 @@ cdef inline int _try_uint64_nogil(parser_t *parser, size_t col, size_t line_star return 0 -cdef _try_int64(parser_t *parser, size_t col, size_t line_start, size_t line_end, +cdef _try_int64(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset): cdef: int error, na_count = 0 @@ -1909,8 +1909,8 @@ cdef _try_int64(parser_t *parser, size_t col, size_t line_start, size_t line_end return result, na_count -cdef inline int _try_int64_nogil(parser_t *parser, size_t col, size_t line_start, - size_t line_end, bint na_filter, +cdef inline int _try_int64_nogil(parser_t *parser, int64_t col, int64_t line_start, + int64_t line_end, bint na_filter, const kh_str_t *na_hashset, int64_t NA, int64_t *data, int *na_count) nogil: cdef: @@ -1947,7 +1947,7 @@ cdef inline int _try_int64_nogil(parser_t *parser, size_t col, size_t line_start return 0 -cdef _try_bool(parser_t *parser, size_t col, size_t line_start, size_t line_end, +cdef _try_bool(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset): cdef: int na_count @@ -1969,8 +1969,8 @@ cdef _try_bool(parser_t *parser, size_t col, size_t line_start, 
size_t line_end, return None, None return result.view(np.bool_), na_count -cdef inline int _try_bool_nogil(parser_t *parser, size_t col, size_t line_start, - size_t line_end, bint na_filter, +cdef inline int _try_bool_nogil(parser_t *parser, int64_t col, int64_t line_start, + int64_t line_end, bint na_filter, const kh_str_t *na_hashset, uint8_t NA, uint8_t *data, int *na_count) nogil: cdef: @@ -2009,7 +2009,7 @@ cdef inline int _try_bool_nogil(parser_t *parser, size_t col, size_t line_start, data += 1 return 0 -cdef _try_bool_flex(parser_t *parser, size_t col, size_t line_start, size_t line_end, +cdef _try_bool_flex(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, const kh_str_t *na_hashset, const kh_str_t *true_hashset, const kh_str_t *false_hashset): @@ -2035,8 +2035,8 @@ cdef _try_bool_flex(parser_t *parser, size_t col, size_t line_start, size_t line return None, None return result.view(np.bool_), na_count -cdef inline int _try_bool_flex_nogil(parser_t *parser, size_t col, size_t line_start, - size_t line_end, bint na_filter, +cdef inline int _try_bool_flex_nogil(parser_t *parser, int64_t col, int64_t line_start, + int64_t line_end, bint na_filter, const kh_str_t *na_hashset, const kh_str_t *true_hashset, const kh_str_t *false_hashset, @@ -2254,8 +2254,8 @@ for k in list(na_values): na_values[np.dtype(k)] = na_values[k] -cdef _apply_converter(object f, parser_t *parser, size_t col, - size_t line_start, size_t line_end, +cdef _apply_converter(object f, parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, char* c_encoding): cdef: int error @@ -2299,7 +2299,7 @@ def _to_structured_array(dict columns, object names, object usecols): object name, fnames, field_type Py_ssize_t i, offset, nfields, length - size_t stride, elsize + int64_t stride, elsize char *buf if names is None: @@ -2347,10 +2347,10 @@ def _to_structured_array(dict columns, object names, object usecols): return recs -cdef _fill_structured_column(char *dst, char* src, size_t elsize, - size_t stride, size_t length, bint incref): +cdef _fill_structured_column(char *dst, char* src, int64_t elsize, + int64_t stride, int64_t length, bint incref): cdef: - size_t i + int64_t i if incref: util.transfer_object_column(dst, src, stride, length) diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index a1341b37952eb..f293baa3cda12 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -137,30 +137,30 @@ typedef struct parser_t { io_callback cb_io; io_cleanup cb_cleanup; - size_t chunksize; // Number of bytes to prepare for each chunk + int64_t chunksize; // Number of bytes to prepare for each chunk char *data; // pointer to data to be processed - size_t datalen; // amount of data available - size_t datapos; + int64_t datalen; // amount of data available + int64_t datapos; // where to write out tokenized data char *stream; - size_t stream_len; - size_t stream_cap; + int64_t stream_len; + int64_t stream_cap; // Store words in (potentially ragged) matrix for now, hmm char **words; - size_t *word_starts; // where we are in the stream - size_t words_len; - size_t words_cap; + int64_t *word_starts; // where we are in the stream + int64_t words_len; + int64_t words_cap; char *pword_start; // pointer to stream start of current field - size_t word_start; // position start of current field + int64_t word_start; // position start of current field - size_t *line_start; // position in words for start of line - size_t 
*line_fields;  // Number of fields in each line
-    size_t lines;        // Number of (good) lines observed
-    size_t file_lines;   // Number of lines observed (including bad or skipped)
-    size_t lines_cap;    // Vector capacity
+    int64_t *line_start;   // position in words for start of line
+    int64_t *line_fields;  // Number of fields in each line
+    int64_t lines;         // Number of (good) lines observed
+    int64_t file_lines;    // Number of lines observed (including bad or skipped)
+    int64_t lines_cap;     // Vector capacity

     // Tokenizing stuff
     ParserState state;
@@ -194,8 +194,8 @@ typedef struct parser_t {
     char thousands;

     int header;            // Boolean: 1: has header, 0: no header
-    ssize_t header_start;  // header row start
-    ssize_t header_end;    // header row end
+    int64_t header_start;  // header row start
+    int64_t header_end;    // header row end

     void *skipset;
     PyObject *skipfunc;
@@ -216,7 +216,7 @@ typedef struct parser_t {

 typedef struct coliter_t {
     char **words;
-    size_t *line_start;
+    int64_t *line_start;
     int col;
 } coliter_t;

@@ -225,7 +225,7 @@ coliter_t *coliter_new(parser_t *self, int i);

 #define COLITER_NEXT(iter, word)                           \
     do {                                                   \
-        const size_t i = *iter.line_start++ + iter.col;    \
+        const int64_t i = *iter.line_start++ + iter.col;   \
         word = i < *iter.line_start ? iter.words[i] : "";  \
     } while (0)

From 1f24847effd169b127d64c32f9dfa9a36e6bc1f2 Mon Sep 17 00:00:00 2001
From: Jeff Knupp
Date: Thu, 20 Jul 2017 19:47:03 -0400
Subject: [PATCH 03/14] Fix comment alignment; add whatsnew entry

---
 doc/source/whatsnew/v0.21.0.txt | 3 ++-
 pandas/_libs/parsers.pyx        | 8 ++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
index 5146bd35dff30..1c4effc31833f 100644
--- a/doc/source/whatsnew/v0.21.0.txt
+++ b/doc/source/whatsnew/v0.21.0.txt
@@ -201,7 +201,8 @@ I/O
 ^^^

 - Bug in :func:`read_csv` in which non integer values for the header argument generated an unhelpful / unrelated error message (:issue:`16338`)
-
+- Bug in :func:`read_csv` in which passing a CSV with at least one very large (i.e. more than 2^31 - 1 bytes) column along with ``low_memory=False`` would cause an integer overflow. The result was an always-failing attempt to allocate an enormous buffer, reported as "Out of memory." (:issue:`16798`).
+- Bug in :func:`read_csv` in which some error paths assigned messages to the internal tokenizer's ``error_msg`` field without first allocating that buffer. When this happened during exception handling, it resulted in a double ``free`` and the program crashed with a ``SIGSEGV`` (:issue:`16798`).
- Bug in :func:`read_stata` where value labels could not be read when using an iterator (:issue:`16923`) Plotting diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 88c695a3faf27..72e5fb7f12c7f 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -137,7 +137,7 @@ cdef extern from "parser/tokenizer.h": int64_t words_len int64_t words_cap - char *pword_start # pointer to stream start of current field + char *pword_start # pointer to stream start of current field int64_t word_start # position start of current field int64_t *line_start # position in words for start of line @@ -177,9 +177,9 @@ cdef extern from "parser/tokenizer.h": # thousands separator (comma, period) char thousands - int header # Boolean: 1: has header, 0: no header - int64_t header_start # header row start - int64_t header_end # header row end + int header # Boolean: 1: has header, 0: no header + int64_t header_start # header row start + int64_t header_end # header row end void *skipset PyObject *skipfunc From 669d99bffb89dec193ef3040a2805f5bae947d91 Mon Sep 17 00:00:00 2001 From: Jeff Knupp Date: Thu, 20 Jul 2017 20:49:30 -0400 Subject: [PATCH 04/14] Fix linting errors re: line length --- pandas/_libs/parsers.pyx | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 72e5fb7f12c7f..e7f559cc150fe 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -143,7 +143,7 @@ cdef extern from "parser/tokenizer.h": int64_t *line_start # position in words for start of line int64_t *line_fields # Number of fields in each line int64_t lines # Number of lines observed - int64_t file_lines # Number of file lines observed (with bad/skipped) + int64_t file_lines # Number of lines observed (with bad/skipped) int64_t lines_cap # Vector capacity # Tokenizing stuff @@ -210,7 +210,8 @@ cdef extern from "parser/tokenizer.h": void uint_state_init(uint_state *self) int uint64_conflict(uint_state *self) - void coliter_setup(coliter_t *it, parser_t *parser, int64_t i, int64_t start) nogil + void coliter_setup(coliter_t *it, parser_t *parser, + int64_t i, int64_t start) nogil void COLITER_NEXT(coliter_t, const char *) nogil parser_t* parser_new() @@ -1702,7 +1703,8 @@ cdef char* cinf = b'inf' cdef char* cposinf = b'+inf' cdef char* cneginf = b'-inf' -cdef _try_double(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, +cdef _try_double(parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset, object na_flist): cdef: int error, na_count = 0 @@ -1811,7 +1813,8 @@ cdef inline int _try_double_nogil(parser_t *parser, return 0 -cdef _try_uint64(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, +cdef _try_uint64(parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset): cdef: int error @@ -1845,7 +1848,8 @@ cdef _try_uint64(parser_t *parser, int64_t col, int64_t line_start, int64_t line return result -cdef inline int _try_uint64_nogil(parser_t *parser, int64_t col, int64_t line_start, +cdef inline int _try_uint64_nogil(parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, bint na_filter, const kh_str_t *na_hashset, uint64_t *data, uint_state *state) nogil: @@ -1882,7 +1886,8 @@ cdef inline int _try_uint64_nogil(parser_t *parser, int64_t col, int64_t line_st return 0 -cdef _try_int64(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, 
+cdef _try_int64(parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset): cdef: int error, na_count = 0 @@ -1909,7 +1914,8 @@ cdef _try_int64(parser_t *parser, int64_t col, int64_t line_start, int64_t line_ return result, na_count -cdef inline int _try_int64_nogil(parser_t *parser, int64_t col, int64_t line_start, +cdef inline int _try_int64_nogil(parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, bint na_filter, const kh_str_t *na_hashset, int64_t NA, int64_t *data, int *na_count) nogil: @@ -1947,7 +1953,8 @@ cdef inline int _try_int64_nogil(parser_t *parser, int64_t col, int64_t line_sta return 0 -cdef _try_bool(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, +cdef _try_bool(parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset): cdef: int na_count @@ -1969,7 +1976,8 @@ cdef _try_bool(parser_t *parser, int64_t col, int64_t line_start, int64_t line_e return None, None return result.view(np.bool_), na_count -cdef inline int _try_bool_nogil(parser_t *parser, int64_t col, int64_t line_start, +cdef inline int _try_bool_nogil(parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, bint na_filter, const kh_str_t *na_hashset, uint8_t NA, uint8_t *data, int *na_count) nogil: @@ -2009,7 +2017,8 @@ cdef inline int _try_bool_nogil(parser_t *parser, int64_t col, int64_t line_star data += 1 return 0 -cdef _try_bool_flex(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, +cdef _try_bool_flex(parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, bint na_filter, const kh_str_t *na_hashset, const kh_str_t *true_hashset, const kh_str_t *false_hashset): @@ -2035,7 +2044,8 @@ cdef _try_bool_flex(parser_t *parser, int64_t col, int64_t line_start, int64_t l return None, None return result.view(np.bool_), na_count -cdef inline int _try_bool_flex_nogil(parser_t *parser, int64_t col, int64_t line_start, +cdef inline int _try_bool_flex_nogil(parser_t *parser, int64_t col, + int64_t line_start, int64_t line_end, bint na_filter, const kh_str_t *na_hashset, const kh_str_t *true_hashset, From 0985cf387f806a2c753534881a4c98fa3bc8cd73 Mon Sep 17 00:00:00 2001 From: Jeff Knupp Date: Fri, 21 Jul 2017 11:52:54 -0400 Subject: [PATCH 05/14] Remove debugging code; fix type cast --- pandas/_libs/src/parser/tokenizer.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 186babc2a4720..9a31a6356ee08 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -76,11 +76,7 @@ static void *grow_buffer(void *buffer, size_t length, size_t *capacity, // Can we fit potentially nbytes tokens (+ null terminators) in the stream? while ((length + space >= cap) && (newbuffer != NULL)) { - if (cap < 1024 * 1024 * 1024) { - cap = cap ? cap << 1 : 2; - } else { - cap *= 2; - } + cap = cap ? 
cap << 1 : 2; buffer = newbuffer; newbuffer = safe_realloc(newbuffer, elsize * cap); } @@ -457,7 +453,7 @@ static int end_line(parser_t *self) { return 0; } - if (!(self->lines <= (unsigned long) self->header_end + 1) && + if (!(self->lines <= (int64_t) self->header_end + 1) && (self->expected_fields < 0 && fields > ex_fields) && !(self->usecols)) { // increment file line count self->file_lines++; @@ -492,7 +488,7 @@ static int end_line(parser_t *self) { } } else { // missing trailing delimiters - if ((self->lines >= (unsigned long) self->header_end + 1) && + if ((self->lines >= (int64_t) self->header_end + 1) && fields < ex_fields) { // might overrun the buffer when closing fields if (make_stream_space(self, ex_fields - fields) < 0) { @@ -1299,11 +1295,7 @@ int parser_trim_buffers(parser_t *self) { } /* trim line_start, line_fields */ - if (new_cap < 1024 * 1024 * 1024) { - new_cap = _next_pow2(self->lines) + 1; - } else { - new_cap *= 2; - } + new_cap = _next_pow2(self->lines) + 1; if (new_cap < self->lines_cap) { TRACE(("parser_trim_buffers: new_cap < self->lines_cap\n")); newptr = safe_realloc((void *)self->line_start, From 3171674567c90aaaf7b723e2cc72c418ce374d18 Mon Sep 17 00:00:00 2001 From: Jeff Knupp Date: Fri, 21 Jul 2017 12:08:42 -0400 Subject: [PATCH 06/14] Fix some leftover size_t references --- pandas/_libs/src/parser/tokenizer.c | 34 ++++++++++++++--------------- pandas/_libs/src/parser/tokenizer.h | 22 +++++++++---------- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 9a31a6356ee08..fc1462b7a2b03 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -305,11 +305,11 @@ static int make_stream_space(parser_t *self, size_t nbytes) { "self->words_cap=%d\n", nbytes, self->words_cap)) newptr = safe_realloc((void *)self->word_starts, - sizeof(size_t) * self->words_cap); + sizeof(int64_t) * self->words_cap); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->word_starts = (size_t *)newptr; + self->word_starts = (int64_t *)newptr; } } @@ -318,8 +318,8 @@ static int make_stream_space(parser_t *self, size_t nbytes) { */ cap = self->lines_cap; self->line_start = - (size_t *)grow_buffer((void *)self->line_start, self->lines + 1, - &self->lines_cap, nbytes, sizeof(size_t), &status); + (int64_t *)grow_buffer((void *)self->line_start, self->lines + 1, + &self->lines_cap, nbytes, sizeof(int64_t), &status); TRACE(( "make_stream_space: grow_buffer(self->line_start, %zu, %zu, %zu, %d)\n", self->lines + 1, self->lines_cap, nbytes, status)) @@ -332,11 +332,11 @@ static int make_stream_space(parser_t *self, size_t nbytes) { TRACE(("make_stream_space: cap != self->lines_cap, nbytes = %d\n", nbytes)) newptr = safe_realloc((void *)self->line_fields, - sizeof(size_t) * self->lines_cap); + sizeof(int64_t) * self->lines_cap); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->line_fields = (size_t *)newptr; + self->line_fields = (int64_t *)newptr; } } @@ -718,8 +718,8 @@ int skip_this_line(parser_t *self, int64_t rownum) { } } -int tokenize_bytes(parser_t *self, size_t line_limit, size_t start_lines) { - size_t i, slen; +int tokenize_bytes(parser_t *self, size_t line_limit, int64_t start_lines) { + int64_t i, slen; int should_skip; char c; char *stream; @@ -1235,7 +1235,7 @@ int parser_trim_buffers(parser_t *self) { size_t new_cap; void *newptr; - size_t i; + int64_t i; /* trim words, word_starts */ new_cap = 
_next_pow2(self->words_len) + 1; @@ -1248,11 +1248,11 @@ int parser_trim_buffers(parser_t *self) { self->words = (char **)newptr; } newptr = safe_realloc((void *)self->word_starts, - new_cap * sizeof(size_t)); + new_cap * sizeof(int64_t)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->word_starts = (size_t *)newptr; + self->word_starts = (int64_t *)newptr; self->words_cap = new_cap; } } @@ -1299,18 +1299,18 @@ int parser_trim_buffers(parser_t *self) { if (new_cap < self->lines_cap) { TRACE(("parser_trim_buffers: new_cap < self->lines_cap\n")); newptr = safe_realloc((void *)self->line_start, - new_cap * sizeof(size_t)); + new_cap * sizeof(int64_t)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->line_start = (size_t *)newptr; + self->line_start = (int64_t *)newptr; } newptr = safe_realloc((void *)self->line_fields, - new_cap * sizeof(size_t)); + new_cap * sizeof(int64_t)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->line_fields = (size_t *)newptr; + self->line_fields = (int64_t *)newptr; self->lines_cap = new_cap; } } @@ -1319,7 +1319,7 @@ int parser_trim_buffers(parser_t *self) { } void debug_print_parser(parser_t *self) { - size_t j, line; + int64_t j, line; char *token; for (line = 0; line < self->lines; ++line) { @@ -1340,7 +1340,7 @@ void debug_print_parser(parser_t *self) { int _tokenize_helper(parser_t *self, size_t nrows, int all) { int status = 0; - size_t start_lines = self->lines; + int64_t start_lines = self->lines; if (self->state == FINISHED) { return 0; diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index f293baa3cda12..1cd391aef68a4 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -137,9 +137,9 @@ typedef struct parser_t { io_callback cb_io; io_cleanup cb_cleanup; - int64_t chunksize; // Number of bytes to prepare for each chunk - char *data; // pointer to data to be processed - int64_t datalen; // amount of data available + int64_t chunksize; // Number of bytes to prepare for each chunk + char *data; // pointer to data to be processed + int64_t datalen; // amount of data available int64_t datapos; // where to write out tokenized data @@ -149,18 +149,18 @@ typedef struct parser_t { // Store words in (potentially ragged) matrix for now, hmm char **words; - int64_t *word_starts; // where we are in the stream + int64_t *word_starts; // where we are in the stream int64_t words_len; int64_t words_cap; - char *pword_start; // pointer to stream start of current field + char *pword_start; // pointer to stream start of current field int64_t word_start; // position start of current field - int64_t *line_start; // position in words for start of line - int64_t *line_fields; // Number of fields in each line - int64_t lines; // Number of (good) lines observed - int64_t file_lines; // Number of lines observed (including bad or skipped) - int64_t lines_cap; // Vector capacity + int64_t *line_start; // position in words for start of line + int64_t *line_fields; // Number of fields in each line + int64_t lines; // Number of (good) lines observed + int64_t file_lines; // Number of lines observed (including bad or skipped) + int64_t lines_cap; // Vector capacity // Tokenizing stuff ParserState state; @@ -193,7 +193,7 @@ typedef struct parser_t { // thousands separator (comma, period) char thousands; - int header; // Boolean: 1: has header, 0: no header + int header; // Boolean: 1: has header, 0: no header int64_t header_start; // header row 
    int64_t header_end;    // header row end

From e4dfd19b6d53a58ca1a4f2679454cbcc14238b2f Mon Sep 17 00:00:00 2001
From: Jeff Knupp
Date: Fri, 21 Jul 2017 15:05:58 -0400
Subject: [PATCH 07/14] Use %lld for int64_t values in printf format strings;
 fix more comment alignment

---
 pandas/_libs/parsers.pyx            | 2 +-
 pandas/_libs/src/parser/tokenizer.c | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index e7f559cc150fe..c512a9fd39e95 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -122,7 +122,7 @@ cdef extern from "parser/tokenizer.h":
         io_cleanup cb_cleanup
 
         int64_t chunksize  # Number of bytes to prepare for each chunk
-        char *data  # pointer to data to be processed
+        char *data         # pointer to data to be processed
         int64_t datalen    # amount of data available
         int64_t datapos
 
diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index fc1462b7a2b03..d5bb1a2fbc136 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -468,7 +468,7 @@ static int end_line(parser_t *self) {
         if (self->error_bad_lines) {
             self->error_msg = (char *)malloc(bufsize);
             snprintf(self->error_msg, bufsize,
-                     "Expected %d fields in line %zu, saw %d\n",
+                     "Expected %d fields in line %lld, saw %d\n",
                      ex_fields, self->file_lines, fields);
 
             TRACE(("Error at line %d, %d fields\n", self->file_lines, fields));
@@ -480,7 +480,7 @@ static int end_line(parser_t *self) {
             // pass up error message
             msg = (char *)malloc(bufsize);
             snprintf(msg, bufsize,
-                     "Skipping line %zu: expected %d fields, saw %d\n",
+                     "Skipping line %lld: expected %d fields, saw %d\n",
                      self->file_lines, ex_fields, fields);
             append_warning(self, msg);
             free(msg);
@@ -1147,7 +1147,7 @@ static int parser_handle_eof(parser_t *self) {
         case IN_QUOTED_FIELD:
             self->error_msg = (char *)malloc(bufsize);
             snprintf(self->error_msg, bufsize,
-                     "EOF inside string starting at line %zu", self->file_lines);
+                     "EOF inside string starting at line %lld", self->file_lines);
             return -1;
 
         case ESCAPED_CHAR:
@@ -1323,7 +1323,7 @@ void debug_print_parser(parser_t *self) {
     char *token;
 
     for (line = 0; line < self->lines; ++line) {
-        printf("(Parsed) Line %zu: ", line);
+        printf("(Parsed) Line %lld: ", line);
 
         for (j = 0; j < self->line_fields[j]; ++j) {
             token = self->words[j + self->line_start[line]];

From 2930eaa3845d5c62ae7a5971b04727a3bfe8a763 Mon Sep 17 00:00:00 2001
From: Jeff Knupp
Date: Fri, 21 Jul 2017 16:30:07 -0400
Subject: [PATCH 08/14] Fix line length to conform to linter rules

---
 pandas/_libs/src/parser/tokenizer.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h
index 1cd391aef68a4..9462608a26814 100644
--- a/pandas/_libs/src/parser/tokenizer.h
+++ b/pandas/_libs/src/parser/tokenizer.h
@@ -159,7 +159,7 @@ typedef struct parser_t {
     int64_t *line_start;  // position in words for start of line
     int64_t *line_fields; // Number of fields in each line
     int64_t lines;        // Number of (good) lines observed
-    int64_t file_lines;   // Number of lines observed (including bad or skipped)
+    int64_t file_lines;   // Number of lines (including bad or skipped)
     int64_t lines_cap;    // Vector capacity
 
     // Tokenizing stuff

From 2ab4971449cd13b0cfc95fbe737f4f5204b07e5a Mon Sep 17 00:00:00 2001
From: Jeff Knupp
Date: Sat, 22 Jul 2017 23:14:03 -0400
Subject: [PATCH 09/14] Remove debugging code

---
 pandas/_libs/src/parser/tokenizer.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)
diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index d5bb1a2fbc136..ab92290f87719 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1258,12 +1258,7 @@ int parser_trim_buffers(parser_t *self) {
     }
 
     /* trim stream */
-    if (new_cap < INT32_MAX) {
-        new_cap = _next_pow2(self->stream_len) + 1;
-    } else {
-        new_cap *= 2;
-    }
-
+    new_cap = _next_pow2(self->stream_len) + 1;
     TRACE(
         ("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = "
          "%zu\n",

From e3cb9c1d8d37c91af6156ff41e4413f617082dcd Mon Sep 17 00:00:00 2001
From: Jeff Knupp
Date: Sun, 23 Jul 2017 00:22:03 -0400
Subject: [PATCH 10/14] Add unit test plus '--run-highmemory' option, *off by
 default*.

---
 pandas/conftest.py                     |  7 ++++++-
 pandas/tests/io/parser/test_parsers.py | 13 +++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/pandas/conftest.py b/pandas/conftest.py
index 8a3ffe22242ac..049756a0680f6 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -9,7 +9,9 @@ def pytest_addoption(parser):
     parser.addoption("--skip-slow", action="store_true",
                      help="skip slow tests")
     parser.addoption("--skip-network", action="store_true",
-                     help="run network tests")
+                     help="skip network tests")
+    parser.addoption("--run-highmemory", action="store_true",
+                     help="run high memory tests")
     parser.addoption("--only-slow", action="store_true",
                      help="run only slow tests")
 
@@ -24,6 +26,9 @@ def pytest_runtest_setup(item):
     if 'network' in item.keywords and item.config.getoption("--skip-network"):
         pytest.skip("skipping due to --skip-network")
 
+    if 'high_memory' in item.keywords and not item.config.getoption("--run-highmemory"):
+        pytest.skip("skipping high memory test since --run-highmemory was not set")
+
 
 # Configurations for all tests and all test modules
 
diff --git a/pandas/tests/io/parser/test_parsers.py b/pandas/tests/io/parser/test_parsers.py
index 8d59e3acb3230..1a53dce37bcd7 100644
--- a/pandas/tests/io/parser/test_parsers.py
+++ b/pandas/tests/io/parser/test_parsers.py
@@ -1,6 +1,9 @@
 # -*- coding: utf-8 -*-
 
 import os
+from io import StringIO
+
+import pytest
 
 import pandas.util.testing as tm
 
@@ -24,6 +27,16 @@ from .python_parser_only import PythonParserTests
 from .dtypes import DtypeTests
 
 
+@pytest.mark.high_memory
+def test_bytes_exceed_2gb():
+    """Read from a "CSV" that has a column larger than 2GB.
+
+    GH 16798
+    """
+    csv = StringIO('strings\n' + '\n'.join(['x' * (1 << 20) for _ in range(2100)]))
+    df = read_csv(csv, low_memory=False)
+    assert not df.empty
+
 class BaseParser(CommentTests, CompressionTests,
                  ConverterTests, DialectTests,

From 7b1cd8d99cd84e7e1e0614485e21183d750ff1c8 Mon Sep 17 00:00:00 2001
From: Jeff Knupp
Date: Sun, 23 Jul 2017 00:23:54 -0400
Subject: [PATCH 11/14] Fix linting issues

---
 pandas/conftest.py                     | 6 ++++--
 pandas/tests/io/parser/test_parsers.py | 3 ++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/pandas/conftest.py b/pandas/conftest.py
index 049756a0680f6..0e6472966d616 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -26,8 +26,10 @@ def pytest_runtest_setup(item):
     if 'network' in item.keywords and item.config.getoption("--skip-network"):
         pytest.skip("skipping due to --skip-network")
 
-    if 'high_memory' in item.keywords and not item.config.getoption("--run-highmemory"):
-        pytest.skip("skipping high memory test since --run-highmemory was not set")
+    if 'high_memory' in item.keywords and not item.config.getoption(
+        "--run-highmemory"):
+        pytest.skip(
+            "skipping high memory test since --run-highmemory was not set")
 
 
 # Configurations for all tests and all test modules
diff --git a/pandas/tests/io/parser/test_parsers.py b/pandas/tests/io/parser/test_parsers.py
index 1a53dce37bcd7..466cafd85a515 100644
--- a/pandas/tests/io/parser/test_parsers.py
+++ b/pandas/tests/io/parser/test_parsers.py
@@ -33,7 +33,8 @@ def test_bytes_exceed_2gb():
 
     GH 16798
     """
-    csv = StringIO('strings\n' + '\n'.join(['x' * (1 << 20) for _ in range(2100)]))
+    csv = StringIO('strings\n' + '\n'.join(
+        ['x' * (1 << 20) for _ in range(2100)]))
     df = read_csv(csv, low_memory=False)
     assert not df.empty

From 4380c5340e714ae121a1d6313776b769fae98359 Mon Sep 17 00:00:00 2001
From: Jeff Knupp
Date: Sun, 23 Jul 2017 02:28:26 -0400
Subject: [PATCH 12/14] Fix linting issues

---
 pandas/conftest.py                     | 4 ++--
 pandas/tests/io/parser/test_parsers.py | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/pandas/conftest.py b/pandas/conftest.py
index 0e6472966d616..ab097b79dcd4e 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -28,8 +28,8 @@ def pytest_runtest_setup(item):
 
     if 'high_memory' in item.keywords and not item.config.getoption(
         "--run-highmemory"):
-        pytest.skip(
-            "skipping high memory test since --run-highmemory was not set")
+    pytest.skip(
+        "skipping high memory test since --run-highmemory was not set")
 
 
 # Configurations for all tests and all test modules
diff --git a/pandas/tests/io/parser/test_parsers.py b/pandas/tests/io/parser/test_parsers.py
index 466cafd85a515..f23bd24f5cbe3 100644
--- a/pandas/tests/io/parser/test_parsers.py
+++ b/pandas/tests/io/parser/test_parsers.py
@@ -27,6 +27,7 @@ from .python_parser_only import PythonParserTests
 from .dtypes import DtypeTests
 
 
+
 @pytest.mark.high_memory
 def test_bytes_exceed_2gb():
     """Read from a "CSV" that has a column larger than 2GB.
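Aside for reviewers: the `high_memory` marker and `--run-highmemory` flag wired up in PATCH 10-12 follow pytest's standard opt-in pattern: marked tests are always collected, but `pytest_runtest_setup` in pandas/conftest.py skips them unless the flag is passed on the command line. A minimal sketch of the pattern from a test author's side (the test name and body below are illustrative only, not part of the patch):

    import pytest

    # Skipped by pandas/conftest.py's pytest_runtest_setup unless the
    # run is started as `pytest --run-highmemory`.
    @pytest.mark.high_memory
    def test_needs_several_gb_of_ram():
        # Illustrative body: hold ~2 GiB so the test is only meaningful
        # on machines (and CI workers) that opt in explicitly.
        buf = bytearray(2 * 1024 ** 3)
        assert len(buf) == 2 * 1024 ** 3

Without the flag such tests are reported as skipped, so the default suite stays runnable on memory-constrained CI workers.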
From a5d567716e0b221b0a13da60f98a3e0c7c26a354 Mon Sep 17 00:00:00 2001
From: Jeff Knupp
Date: Sun, 23 Jul 2017 02:31:37 -0400
Subject: [PATCH 13/14] Fix linting issues

---
 pandas/conftest.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pandas/conftest.py b/pandas/conftest.py
index ab097b79dcd4e..bae45743bbcfb 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -27,9 +27,9 @@ def pytest_runtest_setup(item):
         pytest.skip("skipping due to --skip-network")
 
     if 'high_memory' in item.keywords and not item.config.getoption(
-        "--run-highmemory"):
-    pytest.skip(
-        "skipping high memory test since --run-highmemory was not set")
+            "--run-highmemory"):
+        pytest.skip(
+            "skipping high memory test since --run-highmemory was not set")
 
 
 # Configurations for all tests and all test modules

From 6a1ba230d14f06ef71494d943dcc8be809da7278 Mon Sep 17 00:00:00 2001
From: Jeff Knupp
Date: Sun, 23 Jul 2017 10:07:29 -0400
Subject: [PATCH 14/14] Clear up prose

---
 doc/source/whatsnew/v0.21.0.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
index 1c4effc31833f..520634fa2504a 100644
--- a/doc/source/whatsnew/v0.21.0.txt
+++ b/doc/source/whatsnew/v0.21.0.txt
@@ -201,8 +201,8 @@ I/O
 ^^^
 
 - Bug in :func:`read_csv` in which non integer values for the header argument generated an unhelpful / unrelated error message (:issue:`16338`)
-- Bug in :func:`read_csv` in which passing a CSV with at least one very large (i.e. more than 2^31 - 1 bytes) column along with ``low_memory=False`` would cause an integer overflow. The result was an always unsuccessful attempt to allocate an enourmous buffer and then reporting "Out of memory." (:issue:`16798`).
-- Bug in :func:`read_csv` in which some errors paths were assigning error messages to the internal tokenizer's ``error_msg`` field without first allocating the memory. When this happened as part of exception handling, it resulted in a double ``free`` and the program halted due to a ``SIGSEGV`` (:issue:`16798`).
+- Bug in :func:`read_csv` in which memory management issues in exception handling, under certain conditions, would cause the Python interpreter to crash (:issue:`16798`).
+- Bug in :func:`read_csv` when called with ``low_memory=False`` in which a CSV with at least one column > 2GB in size caused Pandas to grossly overestimate memory requirements and preemptively raise an Exception with the message "out of memory" (:issue:`16798`).
 - Bug in :func:`read_stata` where value labels could not be read when using an iterator (:issue:`16923`)
 
 Plotting
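Context for the whatsnew entries above: the second bullet's failure mode is easy to state as code, namely a single column whose total size exceeds INT32_MAX (2**31 - 1) bytes. A minimal reproduction sketch mirroring the unit test from PATCH 10/14, assuming roughly 4GB of free RAM; on a pre-fix build this fails with "out of memory", on a patched build it succeeds:

    from io import StringIO

    import pandas as pd

    # ~2.1GB in one column: 2100 rows of 1 MiB each, plus the header.
    csv = StringIO('strings\n' + '\n'.join(
        ['x' * (1 << 20) for _ in range(2100)]))

    # low_memory=False makes the C tokenizer buffer the whole column at
    # once, which is what pushed the old 32-bit indices past their limit.
    df = pd.read_csv(csv, low_memory=False)
    assert len(df) == 2100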